diff --git a/sft_pretrain/Full_smoe_sharev3/added_tokens.json b/sft_pretrain/Full_smoe_sharev3/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/added_tokens.json b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/config.json b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/config.json new file mode 100644 index 0000000000000000000000000000000000000000..7929d4cdbe9bb7ee3537b93d161990a8caa422ec --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/config.json @@ -0,0 +1,200 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bal_comp_loss_coef": 0.01, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": false, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": false, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "smoe_sharev3", + "norm_softmax": false, + "normalization": false, + "num_attention_heads": 32, + "num_experts": 8, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 4, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": null, + "rate_compete": 0.2, + "rate_flip": 0.05, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": false, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": false, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/generation_config.json b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/global_step1040/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/global_step1040/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8c5f8dbc7405a7c0ce011bde8b4f8b403f296ff0 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/global_step1040/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76c792d5a62bb0dcafcb44db5c9b5a40f932d597e5951e4e4aa31bcbc3425365 +size 396575120 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/global_step1040/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/global_step1040/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fadd9d1b20c694c98581c0ffc7a9b5d01759e38d --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/global_step1040/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2fec3bfd8133383a94c23b88e8f169da52083e51e36a9c12dd8de3dbe9d83e0 +size 396575120 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/global_step1040/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/global_step1040/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b130f23e48bd4ddee95322e985de0f05ef5c1de --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/global_step1040/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a84f19f1b7c92d001ca752fcd13f1a0dea2dde4e775259709be4575d8317354c +size 396575120 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/global_step1040/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/global_step1040/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ab7e7134cb7ca5844ccb1d1b130c29f918f8b51a --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/global_step1040/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c02bae8df29b5bd22b440c210c31aa6736c5b17d598d1074766e771728e9e88 +size 396575120 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/global_step1040/zero_pp_rank_0_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/global_step1040/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4ae4093fc2eecc3f338358334b179aba19228e8a --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/global_step1040/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59b56214bfda3eea6cae756c9a5b2d634dde1241d5d74aa58e5d5eba03771090 +size 2117321480 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/global_step1040/zero_pp_rank_1_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/global_step1040/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..72986bfd5f3445dd4b82d128644c1866dd2d2036 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/global_step1040/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5a6c57ca3e62ad32d871a090476e7c71186005d82772dcff3eaf268757ecd72 +size 2117321480 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/global_step1040/zero_pp_rank_2_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/global_step1040/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..19e13b9c89ca5673cc9e2a20d57f3d8709838265 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/global_step1040/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71d7adc0bd3a72af0f6d694658be8a354d35702d19ae7800d02055fc810e067a +size 2117321480 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/global_step1040/zero_pp_rank_3_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/global_step1040/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2f953af8e2a4c854bc694023b8eae903d1d24095 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/global_step1040/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ec532c783e35b553a9381599a075e232975372e107376e74f0ad1c882c68006 +size 2117321480 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/latest b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/latest new file mode 100644 index 0000000000000000000000000000000000000000..f37da78e3c7eee26ebe5f06b54d6621716edb6b9 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/latest @@ -0,0 +1 @@ +global_step1040 \ No newline at end of file diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/model-00001-of-00002.safetensors b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29d76f5d80605301aab2bba59b53a5e2582094c4 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe6c4f6ef38e8993629091331e0bbf23484cc88bdfd038f0dd17b6ec2800d855 +size 4972489328 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/model-00002-of-00002.safetensors b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a00a68ae9873fb4abba2d08ea1cafa6b115858aa --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6b2010e05dda51fdc4abde63b6f8f66017311c1a6c300c35998cee4cd59d425 +size 3759020544 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/model.safetensors.index.json b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..90a56cf0153ed9062ff35ee2b372f7ab9e796c6a --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/model.safetensors.index.json @@ -0,0 +1,672 @@ +{ + "metadata": { + "total_size": 8731420128 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.mm_projector.layer_norm.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.layer_norm.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00002-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/rng_state_0.pth b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ef4849062bcdc8ffd2246c07673ba196a8d61a6d --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fae2114fffe9b1eea30e28bbdb4ce59046b0079ea5b8dc4682079f609d49d787 +size 14960 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/rng_state_1.pth b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..2fcb2b640bc236c26aa841680d34a91240247970 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4ff5f3a53530ac868291e2667c8f824bfa1f4fa1ce880df8223a7165ef38e11 +size 14960 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/rng_state_2.pth b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..00c3f989de00e6d58ca7345ae6f65fee0afcbdcd --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91f80a7779b0034e70106ba6cb0e3e686052334c20ce54453ee3977cc0219d15 +size 14960 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/rng_state_3.pth b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..f289913854ee3fa52a86e282421da07d85b8a4c4 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ece3bc0d0e16c43ef245cc787cbd0d63d08d460f489c4cd52adf6501b9281a18 +size 14960 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/special_tokens_map.json b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/tokenizer.model b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/tokenizer_config.json b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/trainer_state.json b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1b451aa639415d4cfd60e5bd01d7e45ab3d4e867 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/trainer_state.json @@ -0,0 +1,15633 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.2000769526741054, + "eval_steps": 500, + "global_step": 1040, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03964023, + "balance_loss_mlp": 3.01339984, + "epoch": 0.00019238168526356292, + "flos": 470464353792.0, + "grad_norm": 27.10233905437441, + "language_loss": 3.72295761, + "learning_rate": 0.0, + "loss": 2.48840261, + "num_input_tokens_seen": 67104, + "router_z_loss_mlp": 9.5, + "step": 1, + "time_per_iteration": 29.606513023376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01906453, + "balance_loss_mlp": 1.25642872, + "epoch": 0.00038476337052712584, + "flos": 504311436288.0, + "grad_norm": 2.874173750989579, + "language_loss": 1.79264998, + "learning_rate": 0.00013726078121135892, + "loss": 1.81171465, + "num_input_tokens_seen": 134080, + "router_z_loss_mlp": 6.5, + "step": 2, + "time_per_iteration": 2.7078208923339844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01915611, + "balance_loss_mlp": 1.2667315, + "epoch": 0.0005771450557906887, + "flos": 598869282816.0, + "grad_norm": 2.1141296462778643, + "language_loss": 1.61429811, + "learning_rate": 0.00021755319103969496, + "loss": 1.63345432, + "num_input_tokens_seen": 205152, + "router_z_loss_mlp": 6.48828125, + "step": 3, + "time_per_iteration": 3.010409116744995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01917319, + "balance_loss_mlp": 1.26309848, + "epoch": 0.0007695267410542517, + "flos": 580133491200.0, + "grad_norm": 1.255159247360545, + "language_loss": 1.49202251, + "learning_rate": 0.00027452156242271784, + "loss": 1.51119578, + "num_input_tokens_seen": 269664, + "router_z_loss_mlp": 6.54296875, + "step": 4, + "time_per_iteration": 2.7161622047424316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0185163, + "balance_loss_mlp": 1.22144234, + "epoch": 0.0009619084263178145, + "flos": 485857861632.0, + "grad_norm": 4.267520959606063, + "language_loss": 1.57359505, + "learning_rate": 0.0003187096642208417, + "loss": 1.59211147, + "num_input_tokens_seen": 338560, + "router_z_loss_mlp": 6.296875, + "step": 5, + "time_per_iteration": 2.718417167663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01828185, + "balance_loss_mlp": 1.21211123, + "epoch": 0.0011542901115813775, + "flos": 559744791552.0, + "grad_norm": 1.225349312557607, + "language_loss": 1.4752574, + "learning_rate": 0.0003548139722510539, + "loss": 1.49353933, + "num_input_tokens_seen": 410112, + "router_z_loss_mlp": 6.15234375, + "step": 6, + "time_per_iteration": 2.6827874183654785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01821666, + "balance_loss_mlp": 1.22428453, + "epoch": 0.0013466717968449403, + "flos": 533721014784.0, + "grad_norm": 0.5025899606895544, + "language_loss": 1.33846116, + "learning_rate": 0.00038533972973918044, + "loss": 1.35667801, + "num_input_tokens_seen": 477552, + "router_z_loss_mlp": 5.96875, + "step": 7, + "time_per_iteration": 2.6889517307281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01776667, + "balance_loss_mlp": 1.2090404, + "epoch": 0.0015390534821085034, + "flos": 492037175808.0, + "grad_norm": 0.1719820928967348, + "language_loss": 1.2814672, + "learning_rate": 0.0004117823436340768, + "loss": 1.29923391, + "num_input_tokens_seen": 549184, + "router_z_loss_mlp": 5.6875, + "step": 8, + "time_per_iteration": 2.7207248210906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0177577, + "balance_loss_mlp": 1.23217535, + "epoch": 0.0017314351673720662, + "flos": 564402207744.0, + "grad_norm": 0.6128716609675008, + "language_loss": 1.39861906, + "learning_rate": 0.00043510638207938993, + "loss": 1.41637683, + "num_input_tokens_seen": 622880, + "router_z_loss_mlp": 5.44140625, + "step": 9, + "time_per_iteration": 2.887538194656372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01823371, + "balance_loss_mlp": 1.31334615, + "epoch": 0.001923816852635629, + "flos": 593132308992.0, + "grad_norm": 0.480897383035181, + "language_loss": 1.25963569, + "learning_rate": 0.00045597044543220066, + "loss": 1.27786922, + "num_input_tokens_seen": 693584, + "router_z_loss_mlp": 5.09765625, + "step": 10, + "time_per_iteration": 2.7672832012176514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01930298, + "balance_loss_mlp": 1.44621277, + "epoch": 0.002116198537899192, + "flos": 609308752896.0, + "grad_norm": 0.21803247425844502, + "language_loss": 1.22959518, + "learning_rate": 0.00047484428652143135, + "loss": 1.24889803, + "num_input_tokens_seen": 774432, + "router_z_loss_mlp": 4.83203125, + "step": 11, + "time_per_iteration": 2.9771082401275635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02130152, + "balance_loss_mlp": 1.67772901, + "epoch": 0.002308580223162755, + "flos": 544869075456.0, + "grad_norm": 0.19847359144835577, + "language_loss": 1.28057694, + "learning_rate": 0.0004920747534624128, + "loss": 1.30187845, + "num_input_tokens_seen": 844304, + "router_z_loss_mlp": 4.52734375, + "step": 12, + "time_per_iteration": 2.6094090938568115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02177014, + "balance_loss_mlp": 1.7512939, + "epoch": 0.002500961908426318, + "flos": 644458277376.0, + "grad_norm": 0.3126355826019607, + "language_loss": 1.29235363, + "learning_rate": 0.0005079252465375872, + "loss": 1.31412375, + "num_input_tokens_seen": 915104, + "router_z_loss_mlp": 4.265625, + "step": 13, + "time_per_iteration": 2.841792345046997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02141221, + "balance_loss_mlp": 1.74411082, + "epoch": 0.0026933435936898806, + "flos": 487605312000.0, + "grad_norm": 0.282411779716686, + "language_loss": 1.17459798, + "learning_rate": 0.0005226005109505393, + "loss": 1.19601011, + "num_input_tokens_seen": 982720, + "router_z_loss_mlp": 3.96875, + "step": 14, + "time_per_iteration": 2.597313165664673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02024541, + "balance_loss_mlp": 1.65890288, + "epoch": 0.0028857252789534437, + "flos": 434368949760.0, + "grad_norm": 0.2583476739022616, + "language_loss": 1.22957516, + "learning_rate": 0.0005362628552605367, + "loss": 1.24982059, + "num_input_tokens_seen": 1050528, + "router_z_loss_mlp": 3.65234375, + "step": 15, + "time_per_iteration": 2.6388704776763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01790575, + "balance_loss_mlp": 1.44687057, + "epoch": 0.0030781069642170067, + "flos": 596465676288.0, + "grad_norm": 0.18613747071639053, + "language_loss": 1.27631426, + "learning_rate": 0.0005490431248454357, + "loss": 1.29421997, + "num_input_tokens_seen": 1116512, + "router_z_loss_mlp": 3.44140625, + "step": 16, + "time_per_iteration": 2.708346128463745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01779165, + "balance_loss_mlp": 1.46941185, + "epoch": 0.0032704886494805694, + "flos": 1537360432128.0, + "grad_norm": 0.2733785965407311, + "language_loss": 0.75705111, + "learning_rate": 0.0005610483427624225, + "loss": 0.77484274, + "num_input_tokens_seen": 1351216, + "router_z_loss_mlp": 3.09375, + "step": 17, + "time_per_iteration": 6.916250705718994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01553778, + "balance_loss_mlp": 1.24955583, + "epoch": 0.0034628703347441324, + "flos": 473720403456.0, + "grad_norm": 0.11658431553946913, + "language_loss": 1.14468098, + "learning_rate": 0.0005723671632907488, + "loss": 1.16021872, + "num_input_tokens_seen": 1420512, + "router_z_loss_mlp": 3.03710938, + "step": 18, + "time_per_iteration": 2.7716212272644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01490625, + "balance_loss_mlp": 1.21005416, + "epoch": 0.0036552520200076955, + "flos": 448303320576.0, + "grad_norm": 0.11552730485963776, + "language_loss": 1.19723654, + "learning_rate": 0.0005830738490244919, + "loss": 1.21214283, + "num_input_tokens_seen": 1484976, + "router_z_loss_mlp": 2.80859375, + "step": 19, + "time_per_iteration": 2.6067557334899902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0141948, + "balance_loss_mlp": 1.16103387, + "epoch": 0.003847633705271258, + "flos": 635881148928.0, + "grad_norm": 0.11977740619668105, + "language_loss": 1.21676993, + "learning_rate": 0.0005932312266435596, + "loss": 1.23096466, + "num_input_tokens_seen": 1557392, + "router_z_loss_mlp": 2.58398438, + "step": 20, + "time_per_iteration": 2.8545703887939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01364308, + "balance_loss_mlp": 1.13084817, + "epoch": 0.004040015390534821, + "flos": 589222771200.0, + "grad_norm": 0.09935322828728523, + "language_loss": 1.16681409, + "learning_rate": 0.0006028929207788754, + "loss": 1.18045723, + "num_input_tokens_seen": 1626064, + "router_z_loss_mlp": 2.33203125, + "step": 21, + "time_per_iteration": 2.7119524478912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319718, + "balance_loss_mlp": 1.11038613, + "epoch": 0.004232397075798384, + "flos": 756253338624.0, + "grad_norm": 0.09023283304690737, + "language_loss": 1.20250762, + "learning_rate": 0.0006121050677327902, + "loss": 1.21570492, + "num_input_tokens_seen": 1696528, + "router_z_loss_mlp": 2.09667969, + "step": 22, + "time_per_iteration": 2.884739398956299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01304467, + "balance_loss_mlp": 1.1184051, + "epoch": 0.004424778761061947, + "flos": 526434439680.0, + "grad_norm": 0.08559602389751407, + "language_loss": 1.10067201, + "learning_rate": 0.0006209076479463684, + "loss": 1.1137166, + "num_input_tokens_seen": 1765936, + "router_z_loss_mlp": 1.85839844, + "step": 23, + "time_per_iteration": 2.6616718769073486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01275434, + "balance_loss_mlp": 1.10787356, + "epoch": 0.00461716044632551, + "flos": 547907079168.0, + "grad_norm": 0.07141137445072718, + "language_loss": 1.2012924, + "learning_rate": 0.0006293355346737718, + "loss": 1.21404672, + "num_input_tokens_seen": 1841632, + "router_z_loss_mlp": 1.67675781, + "step": 24, + "time_per_iteration": 2.7025952339172363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252583, + "balance_loss_mlp": 1.10476315, + "epoch": 0.004809542131589073, + "flos": 567293234688.0, + "grad_norm": 0.08524381015789384, + "language_loss": 1.16738653, + "learning_rate": 0.0006374193284416834, + "loss": 1.17991233, + "num_input_tokens_seen": 1920256, + "router_z_loss_mlp": 1.47753906, + "step": 25, + "time_per_iteration": 2.827439069747925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223638, + "balance_loss_mlp": 1.0984205, + "epoch": 0.005001923816852636, + "flos": 470391418368.0, + "grad_norm": 0.08512374611478205, + "language_loss": 1.15399337, + "learning_rate": 0.0006451860277489461, + "loss": 1.16622972, + "num_input_tokens_seen": 1986528, + "router_z_loss_mlp": 1.25097656, + "step": 26, + "time_per_iteration": 2.6214864253997803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206141, + "balance_loss_mlp": 1.10009253, + "epoch": 0.005194305502116198, + "flos": 415283950080.0, + "grad_norm": 0.07774032731783902, + "language_loss": 1.23061514, + "learning_rate": 0.0006526595731190848, + "loss": 1.2426765, + "num_input_tokens_seen": 2048016, + "router_z_loss_mlp": 1.0625, + "step": 27, + "time_per_iteration": 2.5637125968933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117904, + "balance_loss_mlp": 1.09192181, + "epoch": 0.005386687187379761, + "flos": 628466535936.0, + "grad_norm": 0.05524077436438855, + "language_loss": 1.1626848, + "learning_rate": 0.0006598612921618983, + "loss": 1.17447519, + "num_input_tokens_seen": 2127664, + "router_z_loss_mlp": 0.87158203, + "step": 28, + "time_per_iteration": 2.8202784061431885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159441, + "balance_loss_mlp": 1.08772469, + "epoch": 0.005579068872643324, + "flos": 886100332032.0, + "grad_norm": 0.07386109802626846, + "language_loss": 1.08505416, + "learning_rate": 0.0006668102665011454, + "loss": 1.09664845, + "num_input_tokens_seen": 2213952, + "router_z_loss_mlp": 0.71728516, + "step": 29, + "time_per_iteration": 3.2254040241241455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154142, + "balance_loss_mlp": 1.09520459, + "epoch": 0.005771450557906887, + "flos": 547287238656.0, + "grad_norm": 0.0797557646441396, + "language_loss": 1.18077409, + "learning_rate": 0.0006735236364718957, + "loss": 1.19231534, + "num_input_tokens_seen": 2284736, + "router_z_loss_mlp": 0.58886719, + "step": 30, + "time_per_iteration": 2.6730945110321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140737, + "balance_loss_mlp": 1.09384, + "epoch": 0.00596383224317045, + "flos": 531766950912.0, + "grad_norm": 0.060827451674393726, + "language_loss": 1.1687839, + "learning_rate": 0.0006800168558381346, + "loss": 1.18019128, + "num_input_tokens_seen": 2354384, + "router_z_loss_mlp": 0.46875, + "step": 31, + "time_per_iteration": 2.649216651916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148736, + "balance_loss_mlp": 1.11166239, + "epoch": 0.0061562139284340135, + "flos": 588813926400.0, + "grad_norm": 0.10592463777190406, + "language_loss": 1.19211543, + "learning_rate": 0.0006863039060567947, + "loss": 1.20360279, + "num_input_tokens_seen": 2419440, + "router_z_loss_mlp": 0.37084961, + "step": 32, + "time_per_iteration": 2.6697018146514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132499, + "balance_loss_mlp": 1.10136151, + "epoch": 0.006348595613697576, + "flos": 617929551360.0, + "grad_norm": 0.09812744917576391, + "language_loss": 1.1217525, + "learning_rate": 0.0006923974775611263, + "loss": 1.13307738, + "num_input_tokens_seen": 2496368, + "router_z_loss_mlp": 0.3112793, + "step": 33, + "time_per_iteration": 2.770225763320923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137532, + "balance_loss_mlp": 1.11146092, + "epoch": 0.006540977298961139, + "flos": 777564444672.0, + "grad_norm": 0.06513543096417564, + "language_loss": 1.08375585, + "learning_rate": 0.0006983091239737814, + "loss": 1.09513116, + "num_input_tokens_seen": 2573280, + "router_z_loss_mlp": 0.26086426, + "step": 34, + "time_per_iteration": 2.99418306350708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128276, + "balance_loss_mlp": 1.10578084, + "epoch": 0.006733358984224702, + "flos": 666837356544.0, + "grad_norm": 0.06344935516817307, + "language_loss": 1.07062221, + "learning_rate": 0.0007040493939600222, + "loss": 1.08190489, + "num_input_tokens_seen": 2647248, + "router_z_loss_mlp": 0.22497559, + "step": 35, + "time_per_iteration": 2.9126057624816895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119708, + "balance_loss_mlp": 1.09892988, + "epoch": 0.006925740669488265, + "flos": 564092287488.0, + "grad_norm": 0.06579143759664555, + "language_loss": 1.07960629, + "learning_rate": 0.0007096279445021078, + "loss": 1.09080338, + "num_input_tokens_seen": 2720736, + "router_z_loss_mlp": 0.20788574, + "step": 36, + "time_per_iteration": 2.7079102993011475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114855, + "balance_loss_mlp": 1.09574544, + "epoch": 0.007118122354751828, + "flos": 549583156224.0, + "grad_norm": 0.14799474820221378, + "language_loss": 1.14634764, + "learning_rate": 0.0007150536386503726, + "loss": 1.15749621, + "num_input_tokens_seen": 2800336, + "router_z_loss_mlp": 0.19104004, + "step": 37, + "time_per_iteration": 2.8290467262268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104011, + "balance_loss_mlp": 1.08533084, + "epoch": 0.007310504040015391, + "flos": 702161409024.0, + "grad_norm": 0.2513092385422617, + "language_loss": 1.08396375, + "learning_rate": 0.0007203346302358509, + "loss": 1.09500384, + "num_input_tokens_seen": 2883184, + "router_z_loss_mlp": 0.18688965, + "step": 38, + "time_per_iteration": 2.961430311203003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121274, + "balance_loss_mlp": 1.10231924, + "epoch": 0.007502885725278953, + "flos": 599022051840.0, + "grad_norm": 0.0999674626629785, + "language_loss": 1.11391926, + "learning_rate": 0.000725478437577282, + "loss": 1.12513208, + "num_input_tokens_seen": 2960736, + "router_z_loss_mlp": 0.18945312, + "step": 39, + "time_per_iteration": 2.742088556289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146989, + "balance_loss_mlp": 1.12810588, + "epoch": 0.007695267410542516, + "flos": 560000867328.0, + "grad_norm": 0.3323772184023467, + "language_loss": 1.08355689, + "learning_rate": 0.0007304920078549186, + "loss": 1.09502685, + "num_input_tokens_seen": 3033472, + "router_z_loss_mlp": 0.18884277, + "step": 40, + "time_per_iteration": 2.66943621635437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116486, + "balance_loss_mlp": 1.1452378, + "epoch": 0.007887649095806078, + "flos": 507906671616.0, + "grad_norm": 0.11539272036457353, + "language_loss": 1.09356606, + "learning_rate": 0.0007353817735343603, + "loss": 1.1052146, + "num_input_tokens_seen": 3107824, + "router_z_loss_mlp": 0.19604492, + "step": 41, + "time_per_iteration": 2.7052595615386963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132998, + "balance_loss_mlp": 1.11293542, + "epoch": 0.008080030781069641, + "flos": 503642133504.0, + "grad_norm": 0.12251683576194117, + "language_loss": 1.04851842, + "learning_rate": 0.0007401537019902344, + "loss": 1.05984843, + "num_input_tokens_seen": 3176528, + "router_z_loss_mlp": 0.20056152, + "step": 42, + "time_per_iteration": 2.590432643890381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124507, + "balance_loss_mlp": 1.10198867, + "epoch": 0.008272412466333205, + "flos": 517764178944.0, + "grad_norm": 0.09393858903586973, + "language_loss": 1.08539796, + "learning_rate": 0.0007448133392900729, + "loss": 1.09664297, + "num_input_tokens_seen": 3254256, + "router_z_loss_mlp": 0.22521973, + "step": 43, + "time_per_iteration": 2.6619081497192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112544, + "balance_loss_mlp": 1.10156202, + "epoch": 0.008464794151596768, + "flos": 607673373696.0, + "grad_norm": 0.06822323064374927, + "language_loss": 1.03845203, + "learning_rate": 0.0007493658489441491, + "loss": 1.04970646, + "num_input_tokens_seen": 3340224, + "router_z_loss_mlp": 0.23864746, + "step": 44, + "time_per_iteration": 2.861008644104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128905, + "balance_loss_mlp": 1.10477662, + "epoch": 0.00865717583686033, + "flos": 537661075968.0, + "grad_norm": 0.1413166066405165, + "language_loss": 1.08820629, + "learning_rate": 0.0007538160463002316, + "loss": 1.09949529, + "num_input_tokens_seen": 3409216, + "router_z_loss_mlp": 0.2409668, + "step": 45, + "time_per_iteration": 2.643458604812622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115676, + "balance_loss_mlp": 1.13258433, + "epoch": 0.008849557522123894, + "flos": 507758284800.0, + "grad_norm": 0.08570115972640321, + "language_loss": 1.10720444, + "learning_rate": 0.0007581684291577274, + "loss": 1.11877203, + "num_input_tokens_seen": 3478352, + "router_z_loss_mlp": 0.24157715, + "step": 46, + "time_per_iteration": 2.5904788970947266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145761, + "balance_loss_mlp": 1.12085772, + "epoch": 0.009041939207387457, + "flos": 625048800768.0, + "grad_norm": 0.06636849455276843, + "language_loss": 1.14156199, + "learning_rate": 0.0007624272050891776, + "loss": 1.15301955, + "num_input_tokens_seen": 3555616, + "router_z_loss_mlp": 0.24902344, + "step": 47, + "time_per_iteration": 2.782179594039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154374, + "balance_loss_mlp": 1.12759995, + "epoch": 0.00923432089265102, + "flos": 549124849152.0, + "grad_norm": 0.09356522507451794, + "language_loss": 1.04615343, + "learning_rate": 0.0007665963158851307, + "loss": 1.05769718, + "num_input_tokens_seen": 3634512, + "router_z_loss_mlp": 0.26806641, + "step": 48, + "time_per_iteration": 2.824540138244629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174661, + "balance_loss_mlp": 1.14738548, + "epoch": 0.009426702577914583, + "flos": 562202242560.0, + "grad_norm": 0.059100241584136314, + "language_loss": 1.12381458, + "learning_rate": 0.0007706794594783609, + "loss": 1.13556111, + "num_input_tokens_seen": 3708480, + "router_z_loss_mlp": 0.27270508, + "step": 49, + "time_per_iteration": 2.790757894515991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192673, + "balance_loss_mlp": 1.16604137, + "epoch": 0.009619084263178146, + "flos": 616486228992.0, + "grad_norm": 0.08074806779925832, + "language_loss": 1.11280799, + "learning_rate": 0.0007746801096530423, + "loss": 1.12473488, + "num_input_tokens_seen": 3783472, + "router_z_loss_mlp": 0.2668457, + "step": 50, + "time_per_iteration": 2.7235305309295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116178, + "balance_loss_mlp": 1.135149, + "epoch": 0.009811465948441709, + "flos": 541176325632.0, + "grad_norm": 0.06558886342971224, + "language_loss": 1.16576111, + "learning_rate": 0.0007786015338021173, + "loss": 1.17737889, + "num_input_tokens_seen": 3851360, + "router_z_loss_mlp": 0.26672363, + "step": 51, + "time_per_iteration": 2.6817519664764404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134628, + "balance_loss_mlp": 1.1085453, + "epoch": 0.010003847633705272, + "flos": 535608087552.0, + "grad_norm": 0.06210449580458492, + "language_loss": 1.08870959, + "learning_rate": 0.0007824468089603051, + "loss": 1.10005593, + "num_input_tokens_seen": 3923056, + "router_z_loss_mlp": 0.26098633, + "step": 52, + "time_per_iteration": 2.644577980041504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125522, + "balance_loss_mlp": 1.09910512, + "epoch": 0.010196229318968833, + "flos": 908867907072.0, + "grad_norm": 0.05864822926220488, + "language_loss": 1.07807887, + "learning_rate": 0.0007862188363098669, + "loss": 1.08933413, + "num_input_tokens_seen": 4004528, + "router_z_loss_mlp": 0.26428223, + "step": 53, + "time_per_iteration": 3.1450047492980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126237, + "balance_loss_mlp": 1.10084558, + "epoch": 0.010388611004232396, + "flos": 585594040320.0, + "grad_norm": 0.07974065634267835, + "language_loss": 1.08295977, + "learning_rate": 0.0007899203543304438, + "loss": 1.09422219, + "num_input_tokens_seen": 4078704, + "router_z_loss_mlp": 0.25390625, + "step": 54, + "time_per_iteration": 2.6822280883789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155972, + "balance_loss_mlp": 1.13315582, + "epoch": 0.01058099268949596, + "flos": 502233716736.0, + "grad_norm": 0.07014139109577967, + "language_loss": 1.22212756, + "learning_rate": 0.0007935539507422731, + "loss": 1.23368728, + "num_input_tokens_seen": 4143600, + "router_z_loss_mlp": 0.22814941, + "step": 55, + "time_per_iteration": 2.5841405391693115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117516, + "balance_loss_mlp": 1.153512, + "epoch": 0.010773374374759523, + "flos": 544170659328.0, + "grad_norm": 0.07006342440897594, + "language_loss": 1.13914931, + "learning_rate": 0.0007971220733732573, + "loss": 1.15090084, + "num_input_tokens_seen": 4217904, + "router_z_loss_mlp": 0.21643066, + "step": 56, + "time_per_iteration": 2.697427988052368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193099, + "balance_loss_mlp": 1.17267895, + "epoch": 0.010965756060023086, + "flos": 525874235904.0, + "grad_norm": 0.08125896119424647, + "language_loss": 1.0764755, + "learning_rate": 0.0008006270400641869, + "loss": 1.08840656, + "num_input_tokens_seen": 4293920, + "router_z_loss_mlp": 0.2043457, + "step": 57, + "time_per_iteration": 2.723154306411743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174019, + "balance_loss_mlp": 1.15412247, + "epoch": 0.011158137745286649, + "flos": 576653147136.0, + "grad_norm": 0.07485866075688756, + "language_loss": 1.09104013, + "learning_rate": 0.0008040710477125043, + "loss": 1.10278034, + "num_input_tokens_seen": 4370080, + "router_z_loss_mlp": 0.19897461, + "step": 58, + "time_per_iteration": 2.703186273574829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153983, + "balance_loss_mlp": 1.13440859, + "epoch": 0.011350519430550212, + "flos": 529024310784.0, + "grad_norm": 0.06764829366941465, + "language_loss": 1.09780312, + "learning_rate": 0.0008074561805429771, + "loss": 1.10934305, + "num_input_tokens_seen": 4439792, + "router_z_loss_mlp": 0.19567871, + "step": 59, + "time_per_iteration": 2.6111674308776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136624, + "balance_loss_mlp": 1.11676335, + "epoch": 0.011542901115813775, + "flos": 555608291328.0, + "grad_norm": 0.06986870516034673, + "language_loss": 1.08079648, + "learning_rate": 0.0008107844176832545, + "loss": 1.09216261, + "num_input_tokens_seen": 4510800, + "router_z_loss_mlp": 0.19848633, + "step": 60, + "time_per_iteration": 2.682687997817993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125651, + "balance_loss_mlp": 1.1056236, + "epoch": 0.011735282801077338, + "flos": 571826995200.0, + "grad_norm": 0.061548073586970495, + "language_loss": 1.09071934, + "learning_rate": 0.0008140576401132568, + "loss": 1.10197592, + "num_input_tokens_seen": 4581136, + "router_z_loss_mlp": 0.20019531, + "step": 61, + "time_per_iteration": 2.639394760131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111743, + "balance_loss_mlp": 1.09838021, + "epoch": 0.0119276644863409, + "flos": 615309156864.0, + "grad_norm": 0.06273761556608791, + "language_loss": 1.10558033, + "learning_rate": 0.0008172776370494935, + "loss": 1.11675453, + "num_input_tokens_seen": 4650352, + "router_z_loss_mlp": 0.19030762, + "step": 62, + "time_per_iteration": 2.7110230922698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134294, + "balance_loss_mlp": 1.11483955, + "epoch": 0.012120046171604464, + "flos": 500835474432.0, + "grad_norm": 0.07391589684249159, + "language_loss": 1.17346644, + "learning_rate": 0.0008204461118185703, + "loss": 1.18480933, + "num_input_tokens_seen": 4716336, + "router_z_loss_mlp": 0.19445801, + "step": 63, + "time_per_iteration": 2.5490689277648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142708, + "balance_loss_mlp": 1.12420678, + "epoch": 0.012312427856868027, + "flos": 473109327360.0, + "grad_norm": 0.05825974543220343, + "language_loss": 1.06081367, + "learning_rate": 0.0008235646872681536, + "loss": 1.07224083, + "num_input_tokens_seen": 4781648, + "router_z_loss_mlp": 0.18505859, + "step": 64, + "time_per_iteration": 2.5874247550964355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139504, + "balance_loss_mlp": 1.12069249, + "epoch": 0.012504809542131588, + "flos": 538094651904.0, + "grad_norm": 0.1040066778051144, + "language_loss": 1.06503749, + "learning_rate": 0.0008266349107584288, + "loss": 1.07643247, + "num_input_tokens_seen": 4852320, + "router_z_loss_mlp": 0.18823242, + "step": 65, + "time_per_iteration": 2.678736925125122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123492, + "balance_loss_mlp": 1.10500288, + "epoch": 0.012697191227395151, + "flos": 608450365440.0, + "grad_norm": 0.09066354406474254, + "language_loss": 1.09410381, + "learning_rate": 0.0008296582587724851, + "loss": 1.10533869, + "num_input_tokens_seen": 4922016, + "router_z_loss_mlp": 0.18481445, + "step": 66, + "time_per_iteration": 2.6937255859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121105, + "balance_loss_mlp": 1.10255599, + "epoch": 0.012889572912658714, + "flos": 767750607360.0, + "grad_norm": 0.11790618145169461, + "language_loss": 1.07982886, + "learning_rate": 0.0008326361411800136, + "loss": 1.0910399, + "num_input_tokens_seen": 5000128, + "router_z_loss_mlp": 0.1854248, + "step": 67, + "time_per_iteration": 2.9377663135528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096346, + "balance_loss_mlp": 1.07871521, + "epoch": 0.013081954597922277, + "flos": 533604561408.0, + "grad_norm": 0.09153807632987658, + "language_loss": 1.08335972, + "learning_rate": 0.0008355699051851403, + "loss": 1.09432316, + "num_input_tokens_seen": 5074512, + "router_z_loss_mlp": 0.17651367, + "step": 68, + "time_per_iteration": 2.7278473377227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104023, + "balance_loss_mlp": 1.0865227, + "epoch": 0.01327433628318584, + "flos": 572826567168.0, + "grad_norm": 0.08317322449907456, + "language_loss": 1.14837921, + "learning_rate": 0.0008384608389860635, + "loss": 1.15941942, + "num_input_tokens_seen": 5141856, + "router_z_loss_mlp": 0.1751709, + "step": 69, + "time_per_iteration": 2.7238211631774902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111418, + "balance_loss_mlp": 1.09424019, + "epoch": 0.013466717968449404, + "flos": 497029243392.0, + "grad_norm": 0.08213812906773327, + "language_loss": 1.04970825, + "learning_rate": 0.000841310175171381, + "loss": 1.06082237, + "num_input_tokens_seen": 5209280, + "router_z_loss_mlp": 0.17199707, + "step": 70, + "time_per_iteration": 2.578726291656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101987, + "balance_loss_mlp": 1.08526158, + "epoch": 0.013659099653712967, + "flos": 565234454016.0, + "grad_norm": 0.06358988870017376, + "language_loss": 1.03380442, + "learning_rate": 0.000844119093875517, + "loss": 1.04482436, + "num_input_tokens_seen": 5285424, + "router_z_loss_mlp": 0.16723633, + "step": 71, + "time_per_iteration": 2.692791223526001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103533, + "balance_loss_mlp": 1.08689094, + "epoch": 0.01385148133897653, + "flos": 573540950016.0, + "grad_norm": 0.07461407963015444, + "language_loss": 1.08098376, + "learning_rate": 0.0008468887257134666, + "loss": 1.09201908, + "num_input_tokens_seen": 5358624, + "router_z_loss_mlp": 0.16650391, + "step": 72, + "time_per_iteration": 2.6599459648132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122889, + "balance_loss_mlp": 1.10587776, + "epoch": 0.014043863024240093, + "flos": 576539665920.0, + "grad_norm": 0.05931650266846123, + "language_loss": 1.10316896, + "learning_rate": 0.0008496201545131264, + "loss": 1.11439776, + "num_input_tokens_seen": 5429792, + "router_z_loss_mlp": 0.17028809, + "step": 73, + "time_per_iteration": 2.7093684673309326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126213, + "balance_loss_mlp": 1.10950017, + "epoch": 0.014236244709503656, + "flos": 938287660032.0, + "grad_norm": 0.060718352480344094, + "language_loss": 1.08902812, + "learning_rate": 0.0008523144198617317, + "loss": 1.1002903, + "num_input_tokens_seen": 5518608, + "router_z_loss_mlp": 0.16711426, + "step": 74, + "time_per_iteration": 3.1743276119232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125614, + "balance_loss_mlp": 1.10876918, + "epoch": 0.014428626394767219, + "flos": 528231352320.0, + "grad_norm": 0.07198154728214846, + "language_loss": 1.08249164, + "learning_rate": 0.0008549725194813783, + "loss": 1.09374774, + "num_input_tokens_seen": 5590576, + "router_z_loss_mlp": 0.1685791, + "step": 75, + "time_per_iteration": 2.630387783050537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106727, + "balance_loss_mlp": 1.09047866, + "epoch": 0.014621008080030782, + "flos": 803371433472.0, + "grad_norm": 0.07553700512989577, + "language_loss": 1.06998253, + "learning_rate": 0.0008575954114472099, + "loss": 1.0810498, + "num_input_tokens_seen": 5674224, + "router_z_loss_mlp": 0.16247559, + "step": 76, + "time_per_iteration": 3.134385347366333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109532, + "balance_loss_mlp": 1.0933075, + "epoch": 0.014813389765294343, + "flos": 696588788736.0, + "grad_norm": 0.053440596513601155, + "language_loss": 1.05069363, + "learning_rate": 0.0008601840162606118, + "loss": 1.06178904, + "num_input_tokens_seen": 5757648, + "router_z_loss_mlp": 0.16223145, + "step": 77, + "time_per_iteration": 3.039991855621338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123171, + "balance_loss_mlp": 1.10660076, + "epoch": 0.015005771450557906, + "flos": 596702813184.0, + "grad_norm": 0.07894951514499118, + "language_loss": 1.1143651, + "learning_rate": 0.000862739218788641, + "loss": 1.12559676, + "num_input_tokens_seen": 5837600, + "router_z_loss_mlp": 0.16577148, + "step": 78, + "time_per_iteration": 2.867741346359253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113228, + "balance_loss_mlp": 1.11553121, + "epoch": 0.01519815313582147, + "flos": 549148170240.0, + "grad_norm": 0.0893413961860561, + "language_loss": 1.07743871, + "learning_rate": 0.0008652618700799138, + "loss": 1.08876157, + "num_input_tokens_seen": 5907248, + "router_z_loss_mlp": 0.16760254, + "step": 79, + "time_per_iteration": 2.675795555114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160152, + "balance_loss_mlp": 1.14348662, + "epoch": 0.015390534821085032, + "flos": 430306642944.0, + "grad_norm": 0.06679936706529424, + "language_loss": 1.07125092, + "learning_rate": 0.0008677527890662774, + "loss": 1.08285248, + "num_input_tokens_seen": 5970864, + "router_z_loss_mlp": 0.16662598, + "step": 80, + "time_per_iteration": 2.4765963554382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196819, + "balance_loss_mlp": 1.17889023, + "epoch": 0.015582916506348595, + "flos": 523854743040.0, + "grad_norm": 0.12362960542988827, + "language_loss": 1.09903598, + "learning_rate": 0.0008702127641587799, + "loss": 1.11100423, + "num_input_tokens_seen": 6040800, + "router_z_loss_mlp": 0.17932129, + "step": 81, + "time_per_iteration": 2.636688470840454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180455, + "balance_loss_mlp": 1.16288388, + "epoch": 0.015775298191612157, + "flos": 575151598080.0, + "grad_norm": 0.08274533442322421, + "language_loss": 1.04032063, + "learning_rate": 0.0008726425547457192, + "loss": 1.05212522, + "num_input_tokens_seen": 6111840, + "router_z_loss_mlp": 0.17565918, + "step": 82, + "time_per_iteration": 2.765179395675659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157804, + "balance_loss_mlp": 1.14051914, + "epoch": 0.01596767987687572, + "flos": 610040664576.0, + "grad_norm": 0.07618339381967684, + "language_loss": 1.03921247, + "learning_rate": 0.0008750428925998964, + "loss": 1.05079055, + "num_input_tokens_seen": 6183872, + "router_z_loss_mlp": 0.1730957, + "step": 83, + "time_per_iteration": 2.7615418434143066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159673, + "balance_loss_mlp": 1.14280462, + "epoch": 0.016160061562139283, + "flos": 566864040960.0, + "grad_norm": 0.0706757922791228, + "language_loss": 1.09743476, + "learning_rate": 0.0008774144832015932, + "loss": 1.10903156, + "num_input_tokens_seen": 6255760, + "router_z_loss_mlp": 0.16882324, + "step": 84, + "time_per_iteration": 2.694364070892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01699218, + "balance_loss_mlp": 1.68252861, + "epoch": 0.016352443247402846, + "flos": 1410557234688.0, + "grad_norm": 0.23342967148410274, + "language_loss": 0.74774313, + "learning_rate": 0.0008797580069832641, + "loss": 0.76473522, + "num_input_tokens_seen": 6472960, + "router_z_loss_mlp": 0.16699219, + "step": 85, + "time_per_iteration": 4.599137306213379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212855, + "balance_loss_mlp": 1.19580793, + "epoch": 0.01654482493266641, + "flos": 730177127424.0, + "grad_norm": 0.09253845479208671, + "language_loss": 1.04518116, + "learning_rate": 0.0008820741205014318, + "loss": 1.05730963, + "num_input_tokens_seen": 6548912, + "router_z_loss_mlp": 0.1706543, + "step": 86, + "time_per_iteration": 2.8595266342163086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01246652, + "balance_loss_mlp": 1.22939014, + "epoch": 0.016737206617929972, + "flos": 536016932352.0, + "grad_norm": 0.10044068584300966, + "language_loss": 1.06437612, + "learning_rate": 0.0008843634575408404, + "loss": 1.07684278, + "num_input_tokens_seen": 6621520, + "router_z_loss_mlp": 0.17248535, + "step": 87, + "time_per_iteration": 2.690492630004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215448, + "balance_loss_mlp": 1.19887805, + "epoch": 0.016929588303193535, + "flos": 536706584064.0, + "grad_norm": 0.0661610487366718, + "language_loss": 1.07674646, + "learning_rate": 0.0008866266301555082, + "loss": 1.08890104, + "num_input_tokens_seen": 6698432, + "router_z_loss_mlp": 0.16577148, + "step": 88, + "time_per_iteration": 2.737339496612549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203027, + "balance_loss_mlp": 1.18706512, + "epoch": 0.017121969988457098, + "flos": 526498458624.0, + "grad_norm": 0.07897226836222233, + "language_loss": 1.08543992, + "learning_rate": 0.0008888642296509615, + "loss": 1.09747016, + "num_input_tokens_seen": 6764336, + "router_z_loss_mlp": 0.1595459, + "step": 89, + "time_per_iteration": 2.576819658279419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187655, + "balance_loss_mlp": 1.17131162, + "epoch": 0.01731435167372066, + "flos": 625304876544.0, + "grad_norm": 0.0740353605135553, + "language_loss": 1.13367987, + "learning_rate": 0.0008910768275115906, + "loss": 1.14555645, + "num_input_tokens_seen": 6839392, + "router_z_loss_mlp": 0.16345215, + "step": 90, + "time_per_iteration": 2.778571128845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173459, + "balance_loss_mlp": 1.15692425, + "epoch": 0.017506733358984224, + "flos": 496157709312.0, + "grad_norm": 0.07518713147028631, + "language_loss": 1.08794332, + "learning_rate": 0.0008932649762767675, + "loss": 1.0996778, + "num_input_tokens_seen": 6907344, + "router_z_loss_mlp": 0.16540527, + "step": 91, + "time_per_iteration": 2.5931665897369385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185544, + "balance_loss_mlp": 1.16881919, + "epoch": 0.017699115044247787, + "flos": 745613047296.0, + "grad_norm": 0.07711429280558382, + "language_loss": 1.11576343, + "learning_rate": 0.0008954292103690864, + "loss": 1.12761879, + "num_input_tokens_seen": 6982464, + "router_z_loss_mlp": 0.1673584, + "step": 92, + "time_per_iteration": 2.9129488468170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194769, + "balance_loss_mlp": 1.17854476, + "epoch": 0.01789149672951135, + "flos": 515257265664.0, + "grad_norm": 0.0669718610224715, + "language_loss": 1.1343056, + "learning_rate": 0.0008975700468778296, + "loss": 1.14625335, + "num_input_tokens_seen": 7049712, + "router_z_loss_mlp": 0.16223145, + "step": 93, + "time_per_iteration": 2.576620101928711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216953, + "balance_loss_mlp": 1.20076382, + "epoch": 0.018083878414774913, + "flos": 585850116096.0, + "grad_norm": 0.11698648494194364, + "language_loss": 1.0652318, + "learning_rate": 0.0008996879863005366, + "loss": 1.07740128, + "num_input_tokens_seen": 7120288, + "router_z_loss_mlp": 0.16186523, + "step": 94, + "time_per_iteration": 2.6751108169555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217025, + "balance_loss_mlp": 1.2013253, + "epoch": 0.018276260100038477, + "flos": 497103436800.0, + "grad_norm": 0.08327491501556071, + "language_loss": 1.06208014, + "learning_rate": 0.0009017835132453337, + "loss": 1.07425046, + "num_input_tokens_seen": 7188896, + "router_z_loss_mlp": 0.15686035, + "step": 95, + "time_per_iteration": 2.5971803665161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196463, + "balance_loss_mlp": 1.1804409, + "epoch": 0.01846864178530204, + "flos": 639765955584.0, + "grad_norm": 0.09756000368948786, + "language_loss": 1.06920743, + "learning_rate": 0.0009038570970964896, + "loss": 1.08117199, + "num_input_tokens_seen": 7259536, + "router_z_loss_mlp": 0.16027832, + "step": 96, + "time_per_iteration": 2.7428832054138184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173361, + "balance_loss_mlp": 1.15723228, + "epoch": 0.018661023470565603, + "flos": 511411746816.0, + "grad_norm": 0.07053433913024812, + "language_loss": 1.04343212, + "learning_rate": 0.0009059091926454854, + "loss": 1.05516577, + "num_input_tokens_seen": 7326752, + "router_z_loss_mlp": 0.16125488, + "step": 97, + "time_per_iteration": 2.570509433746338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178324, + "balance_loss_mlp": 1.16246903, + "epoch": 0.018853405155829166, + "flos": 930710103552.0, + "grad_norm": 0.08767892767743933, + "language_loss": 1.03389072, + "learning_rate": 0.0009079402406897198, + "loss": 1.04567385, + "num_input_tokens_seen": 7417488, + "router_z_loss_mlp": 0.15844727, + "step": 98, + "time_per_iteration": 3.202298164367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179075, + "balance_loss_mlp": 1.16296983, + "epoch": 0.01904578684109273, + "flos": 576209396736.0, + "grad_norm": 0.2639136557883628, + "language_loss": 1.0596242, + "learning_rate": 0.0009099506686008212, + "loss": 1.07141495, + "num_input_tokens_seen": 7493136, + "router_z_loss_mlp": 0.16101074, + "step": 99, + "time_per_iteration": 2.812368869781494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139923, + "balance_loss_mlp": 1.12423468, + "epoch": 0.019238168526356292, + "flos": 558173431296.0, + "grad_norm": 0.12311670746354397, + "language_loss": 1.08180976, + "learning_rate": 0.0009119408908644013, + "loss": 1.09320903, + "num_input_tokens_seen": 7560896, + "router_z_loss_mlp": 0.15673828, + "step": 100, + "time_per_iteration": 2.7063775062561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150815, + "balance_loss_mlp": 1.13574743, + "epoch": 0.019430550211619855, + "flos": 723539506176.0, + "grad_norm": 0.12127606313133317, + "language_loss": 1.14121008, + "learning_rate": 0.0009139113095929519, + "loss": 1.15271831, + "num_input_tokens_seen": 7629040, + "router_z_loss_mlp": 0.15039062, + "step": 101, + "time_per_iteration": 2.840913772583008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218173, + "balance_loss_mlp": 1.20243776, + "epoch": 0.019622931896883418, + "flos": 499235000832.0, + "grad_norm": 0.1104247345061639, + "language_loss": 1.0836457, + "learning_rate": 0.0009158623150134762, + "loss": 1.09582746, + "num_input_tokens_seen": 7694256, + "router_z_loss_mlp": 0.15722656, + "step": 102, + "time_per_iteration": 2.560464859008789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01357908, + "balance_loss_mlp": 1.34204173, + "epoch": 0.01981531358214698, + "flos": 508916418048.0, + "grad_norm": 0.15164768975642337, + "language_loss": 1.07661259, + "learning_rate": 0.000917794285931332, + "loss": 1.0901916, + "num_input_tokens_seen": 7762256, + "router_z_loss_mlp": 0.15856934, + "step": 103, + "time_per_iteration": 2.6684353351593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01381196, + "balance_loss_mlp": 1.36572242, + "epoch": 0.020007695267410544, + "flos": 521087371776.0, + "grad_norm": 0.10342928287682196, + "language_loss": 0.9971087, + "learning_rate": 0.0009197075901716639, + "loss": 1.01092052, + "num_input_tokens_seen": 7834400, + "router_z_loss_mlp": 0.15454102, + "step": 104, + "time_per_iteration": 2.7250871658325195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01356986, + "balance_loss_mlp": 1.34017777, + "epoch": 0.020200076952674107, + "flos": 533013834240.0, + "grad_norm": 0.1824265866479698, + "language_loss": 1.09647703, + "learning_rate": 0.0009216025849997171, + "loss": 1.11004686, + "num_input_tokens_seen": 7911184, + "router_z_loss_mlp": 0.16809082, + "step": 105, + "time_per_iteration": 2.776764154434204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01261961, + "balance_loss_mlp": 1.24583197, + "epoch": 0.020392458637937667, + "flos": 684430981632.0, + "grad_norm": 0.06376163654280764, + "language_loss": 1.0425086, + "learning_rate": 0.0009234796175212258, + "loss": 1.05512834, + "num_input_tokens_seen": 7985280, + "router_z_loss_mlp": 0.16125488, + "step": 106, + "time_per_iteration": 2.9174978733062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01269614, + "balance_loss_mlp": 1.25201869, + "epoch": 0.02058484032320123, + "flos": 701791852032.0, + "grad_norm": 0.060044663360548714, + "language_loss": 1.08808422, + "learning_rate": 0.000925339025064007, + "loss": 1.10078037, + "num_input_tokens_seen": 8068320, + "router_z_loss_mlp": 0.17590332, + "step": 107, + "time_per_iteration": 2.975735902786255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01324579, + "balance_loss_mlp": 1.30547023, + "epoch": 0.020777222008464793, + "flos": 638772175872.0, + "grad_norm": 0.12680512225677842, + "language_loss": 1.01262307, + "learning_rate": 0.0009271811355418027, + "loss": 1.02586877, + "num_input_tokens_seen": 8148144, + "router_z_loss_mlp": 0.19128418, + "step": 108, + "time_per_iteration": 2.8408150672912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01306621, + "balance_loss_mlp": 1.28755951, + "epoch": 0.020969603693728356, + "flos": 681785856000.0, + "grad_norm": 0.06997483982989385, + "language_loss": 1.08551693, + "learning_rate": 0.0009290062678013548, + "loss": 1.09858322, + "num_input_tokens_seen": 8222256, + "router_z_loss_mlp": 0.19055176, + "step": 109, + "time_per_iteration": 2.869980812072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01309468, + "balance_loss_mlp": 1.29159832, + "epoch": 0.02116198537899192, + "flos": 533140462080.0, + "grad_norm": 0.13190855435004306, + "language_loss": 1.06647623, + "learning_rate": 0.0009308147319536321, + "loss": 1.07957077, + "num_input_tokens_seen": 8292432, + "router_z_loss_mlp": 0.17895508, + "step": 110, + "time_per_iteration": 2.6270735263824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130688, + "balance_loss_mlp": 1.29067969, + "epoch": 0.021354367064255482, + "flos": 717168135168.0, + "grad_norm": 0.10963649287068344, + "language_loss": 1.1282903, + "learning_rate": 0.0009326068296900676, + "loss": 1.14135909, + "num_input_tokens_seen": 8365024, + "router_z_loss_mlp": 0.1619873, + "step": 111, + "time_per_iteration": 2.8845341205596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01388527, + "balance_loss_mlp": 1.37200487, + "epoch": 0.021546748749519045, + "flos": 519290459136.0, + "grad_norm": 0.12406482447985402, + "language_loss": 1.03902006, + "learning_rate": 0.0009343828545846161, + "loss": 1.05290532, + "num_input_tokens_seen": 8442448, + "router_z_loss_mlp": 0.16516113, + "step": 112, + "time_per_iteration": 2.8167102336883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01548404, + "balance_loss_mlp": 1.53109479, + "epoch": 0.021739130434782608, + "flos": 504912337920.0, + "grad_norm": 0.2528517188051562, + "language_loss": 1.0722419, + "learning_rate": 0.0009361430923823841, + "loss": 1.08772588, + "num_input_tokens_seen": 8508992, + "router_z_loss_mlp": 0.1730957, + "step": 113, + "time_per_iteration": 2.664581060409546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01441472, + "balance_loss_mlp": 1.42576015, + "epoch": 0.02193151212004617, + "flos": 463251820032.0, + "grad_norm": 0.1910881492312462, + "language_loss": 1.11420846, + "learning_rate": 0.0009378878212755459, + "loss": 1.12862325, + "num_input_tokens_seen": 8574048, + "router_z_loss_mlp": 0.15710449, + "step": 114, + "time_per_iteration": 2.4851133823394775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262203, + "balance_loss_mlp": 1.24767148, + "epoch": 0.022123893805309734, + "flos": 552008673792.0, + "grad_norm": 0.09004287588953173, + "language_loss": 1.0099957, + "learning_rate": 0.0009396173121672103, + "loss": 1.0226177, + "num_input_tokens_seen": 8647808, + "router_z_loss_mlp": 0.14538574, + "step": 115, + "time_per_iteration": 2.6535162925720215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215709, + "balance_loss_mlp": 1.20165396, + "epoch": 0.022316275490573297, + "flos": 635920436736.0, + "grad_norm": 0.07849561533847389, + "language_loss": 1.07122314, + "learning_rate": 0.0009413318289238633, + "loss": 1.08338022, + "num_input_tokens_seen": 8719760, + "router_z_loss_mlp": 0.14050293, + "step": 116, + "time_per_iteration": 2.7836899757385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203544, + "balance_loss_mlp": 1.18965602, + "epoch": 0.02250865717583686, + "flos": 798535107072.0, + "grad_norm": 0.07099947506123377, + "language_loss": 0.98912275, + "learning_rate": 0.0009430316286169771, + "loss": 1.00115824, + "num_input_tokens_seen": 8798752, + "router_z_loss_mlp": 0.13891602, + "step": 117, + "time_per_iteration": 3.049468517303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01263206, + "balance_loss_mlp": 1.24786401, + "epoch": 0.022701038861100423, + "flos": 455851763712.0, + "grad_norm": 0.18808502465815918, + "language_loss": 1.04843259, + "learning_rate": 0.0009447169617543361, + "loss": 1.06106472, + "num_input_tokens_seen": 8866848, + "router_z_loss_mlp": 0.15319824, + "step": 118, + "time_per_iteration": 2.5886504650115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121023, + "balance_loss_mlp": 1.19557953, + "epoch": 0.022893420546363986, + "flos": 582812112384.0, + "grad_norm": 0.09179634719817005, + "language_loss": 1.11139297, + "learning_rate": 0.0009463880725016029, + "loss": 1.12349522, + "num_input_tokens_seen": 8935488, + "router_z_loss_mlp": 0.14648438, + "step": 119, + "time_per_iteration": 2.6861259937286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169264, + "balance_loss_mlp": 1.15572226, + "epoch": 0.02308580223162755, + "flos": 561010613760.0, + "grad_norm": 0.09164108943144146, + "language_loss": 1.05675769, + "learning_rate": 0.0009480451988946134, + "loss": 1.06845045, + "num_input_tokens_seen": 9015344, + "router_z_loss_mlp": 0.13549805, + "step": 120, + "time_per_iteration": 2.8075129985809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217336, + "balance_loss_mlp": 1.2034359, + "epoch": 0.023278183916891113, + "flos": 770966111232.0, + "grad_norm": 0.1019945076921087, + "language_loss": 1.07486713, + "learning_rate": 0.0009496885730428627, + "loss": 1.08704054, + "num_input_tokens_seen": 9094672, + "router_z_loss_mlp": 0.13903809, + "step": 121, + "time_per_iteration": 3.0081264972686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291544, + "balance_loss_mlp": 1.27698815, + "epoch": 0.023470565602154676, + "flos": 553111552512.0, + "grad_norm": 0.08478902086488087, + "language_loss": 1.05369067, + "learning_rate": 0.0009513184213246156, + "loss": 1.06660616, + "num_input_tokens_seen": 9160608, + "router_z_loss_mlp": 0.14550781, + "step": 122, + "time_per_iteration": 2.654902696609497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01406128, + "balance_loss_mlp": 1.39054775, + "epoch": 0.02366294728741824, + "flos": 559744791552.0, + "grad_norm": 0.09837859270685317, + "language_loss": 1.09463692, + "learning_rate": 0.0009529349645740552, + "loss": 1.10869825, + "num_input_tokens_seen": 9228704, + "router_z_loss_mlp": 0.15563965, + "step": 123, + "time_per_iteration": 2.6837081909179688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01484693, + "balance_loss_mlp": 1.46961284, + "epoch": 0.0238553289726818, + "flos": 468313698816.0, + "grad_norm": 0.11388616458843728, + "language_loss": 1.07573724, + "learning_rate": 0.0009545384182608524, + "loss": 1.09058416, + "num_input_tokens_seen": 9294288, + "router_z_loss_mlp": 0.1505127, + "step": 124, + "time_per_iteration": 2.5069937705993652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01411359, + "balance_loss_mlp": 1.39688659, + "epoch": 0.024047710657945365, + "flos": 559763730432.0, + "grad_norm": 0.3429043048666504, + "language_loss": 1.05057025, + "learning_rate": 0.0009561289926625252, + "loss": 1.06468379, + "num_input_tokens_seen": 9368048, + "router_z_loss_mlp": 0.14465332, + "step": 125, + "time_per_iteration": 2.6802117824554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011688, + "balance_loss_mlp": 1.15507352, + "epoch": 0.024240092343208928, + "flos": 504528224256.0, + "grad_norm": 0.18048320350440872, + "language_loss": 1.09737623, + "learning_rate": 0.0009577068930299292, + "loss": 1.10906434, + "num_input_tokens_seen": 9434848, + "router_z_loss_mlp": 0.13739014, + "step": 126, + "time_per_iteration": 2.6096670627593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163735, + "balance_loss_mlp": 1.15040147, + "epoch": 0.02443247402847249, + "flos": 435516908544.0, + "grad_norm": 0.07278748671530755, + "language_loss": 1.05931616, + "learning_rate": 0.0009592723197462087, + "loss": 1.07095349, + "num_input_tokens_seen": 9504112, + "router_z_loss_mlp": 0.13360596, + "step": 127, + "time_per_iteration": 2.6409482955932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01248107, + "balance_loss_mlp": 1.23239577, + "epoch": 0.024624855713736054, + "flos": 683445966336.0, + "grad_norm": 0.0813490266373729, + "language_loss": 1.02871299, + "learning_rate": 0.0009608254684795125, + "loss": 1.04119396, + "num_input_tokens_seen": 9590032, + "router_z_loss_mlp": 0.15710449, + "step": 128, + "time_per_iteration": 2.940600872039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01265693, + "balance_loss_mlp": 1.24772859, + "epoch": 0.024817237398999614, + "flos": 524721894912.0, + "grad_norm": 0.0804185451989367, + "language_loss": 1.06161952, + "learning_rate": 0.0009623665303297678, + "loss": 1.07427645, + "num_input_tokens_seen": 9663040, + "router_z_loss_mlp": 0.1796875, + "step": 129, + "time_per_iteration": 2.7088472843170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01256284, + "balance_loss_mlp": 1.23668599, + "epoch": 0.025009619084263177, + "flos": 655350262272.0, + "grad_norm": 0.12369480901617341, + "language_loss": 1.10218048, + "learning_rate": 0.0009638956919697878, + "loss": 1.11474347, + "num_input_tokens_seen": 9736544, + "router_z_loss_mlp": 0.19592285, + "step": 130, + "time_per_iteration": 2.857571840286255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266475, + "balance_loss_mlp": 1.24420691, + "epoch": 0.02520200076952674, + "flos": 454187271168.0, + "grad_norm": 0.08293639348197612, + "language_loss": 1.02638018, + "learning_rate": 0.0009654131357809714, + "loss": 1.03904486, + "num_input_tokens_seen": 9804656, + "router_z_loss_mlp": 0.22253418, + "step": 131, + "time_per_iteration": 2.641470432281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0128644, + "balance_loss_mlp": 1.26142943, + "epoch": 0.025394382454790303, + "flos": 839427397632.0, + "grad_norm": 0.05741461740254168, + "language_loss": 1.11002767, + "learning_rate": 0.0009669190399838441, + "loss": 1.12289214, + "num_input_tokens_seen": 9888864, + "router_z_loss_mlp": 0.25036621, + "step": 132, + "time_per_iteration": 3.133596420288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01302533, + "balance_loss_mlp": 1.27633083, + "epoch": 0.025586764140053866, + "flos": 580725628416.0, + "grad_norm": 0.06987664196058198, + "language_loss": 1.0413487, + "learning_rate": 0.0009684135787636724, + "loss": 1.05437398, + "num_input_tokens_seen": 9968208, + "router_z_loss_mlp": 0.26208496, + "step": 133, + "time_per_iteration": 2.7968075275421143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01325396, + "balance_loss_mlp": 1.29710746, + "epoch": 0.02577914582531743, + "flos": 789893959680.0, + "grad_norm": 0.07551411578012862, + "language_loss": 1.07757604, + "learning_rate": 0.0009698969223913726, + "loss": 1.09083009, + "num_input_tokens_seen": 10049664, + "router_z_loss_mlp": 0.28283691, + "step": 134, + "time_per_iteration": 3.0058987140655518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0131126, + "balance_loss_mlp": 1.28212547, + "epoch": 0.025971527510580992, + "flos": 594683320320.0, + "grad_norm": 0.0731546450398535, + "language_loss": 1.10457921, + "learning_rate": 0.0009713692373399265, + "loss": 1.11769176, + "num_input_tokens_seen": 10120096, + "router_z_loss_mlp": 0.29125977, + "step": 135, + "time_per_iteration": 2.6654229164123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02152319, + "balance_loss_mlp": 1.95700705, + "epoch": 0.026163909195844555, + "flos": 1576771522560.0, + "grad_norm": 0.26755932757436196, + "language_loss": 0.79456228, + "learning_rate": 0.0009728306863964993, + "loss": 0.81608546, + "num_input_tokens_seen": 10348976, + "router_z_loss_mlp": 1.953125, + "step": 136, + "time_per_iteration": 6.531313896179199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01724331, + "balance_loss_mlp": 1.55266988, + "epoch": 0.026356290881108118, + "flos": 1501306030080.0, + "grad_norm": 0.1444935793983717, + "language_loss": 0.77811038, + "learning_rate": 0.0009742814287704512, + "loss": 0.79535371, + "num_input_tokens_seen": 10576512, + "router_z_loss_mlp": 1.71875, + "step": 137, + "time_per_iteration": 4.966995716094971 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01371776, + "balance_loss_mlp": 1.34284425, + "epoch": 0.02654867256637168, + "flos": 596841025536.0, + "grad_norm": 0.06823267419395149, + "language_loss": 1.03539467, + "learning_rate": 0.0009757216201974225, + "loss": 1.04911256, + "num_input_tokens_seen": 10659168, + "router_z_loss_mlp": 0.28918457, + "step": 138, + "time_per_iteration": 2.7901663780212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01396345, + "balance_loss_mlp": 1.36752045, + "epoch": 0.026741054251635244, + "flos": 544761386496.0, + "grad_norm": 0.08904352821745645, + "language_loss": 1.08793342, + "learning_rate": 0.0009771514130396581, + "loss": 1.10189688, + "num_input_tokens_seen": 10731584, + "router_z_loss_mlp": 0.28833008, + "step": 139, + "time_per_iteration": 2.664384603500366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01410566, + "balance_loss_mlp": 1.38171697, + "epoch": 0.026933435936898807, + "flos": 506591387136.0, + "grad_norm": 0.09467843708761726, + "language_loss": 1.08393478, + "learning_rate": 0.00097857095638274, + "loss": 1.09804034, + "num_input_tokens_seen": 10799456, + "router_z_loss_mlp": 0.28833008, + "step": 140, + "time_per_iteration": 2.5600626468658447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01399161, + "balance_loss_mlp": 1.37263703, + "epoch": 0.02712581762216237, + "flos": 740513290752.0, + "grad_norm": 0.06303030428856128, + "language_loss": 0.99670362, + "learning_rate": 0.0009799803961288726, + "loss": 1.01069522, + "num_input_tokens_seen": 10886416, + "router_z_loss_mlp": 0.26538086, + "step": 141, + "time_per_iteration": 2.984253168106079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01354082, + "balance_loss_mlp": 1.33143175, + "epoch": 0.027318199307425933, + "flos": 848023464960.0, + "grad_norm": 0.06264638149228761, + "language_loss": 1.05898559, + "learning_rate": 0.000981379875086876, + "loss": 1.07252645, + "num_input_tokens_seen": 10966064, + "router_z_loss_mlp": 0.22644043, + "step": 142, + "time_per_iteration": 3.032597064971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01323808, + "balance_loss_mlp": 1.30553341, + "epoch": 0.027510580992689496, + "flos": 575288400384.0, + "grad_norm": 0.07028220907739285, + "language_loss": 1.01752293, + "learning_rate": 0.0009827695330590185, + "loss": 1.030761, + "num_input_tokens_seen": 11039712, + "router_z_loss_mlp": 0.18273926, + "step": 143, + "time_per_iteration": 2.626483678817749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01303402, + "balance_loss_mlp": 1.28557992, + "epoch": 0.02770296267795306, + "flos": 772079164416.0, + "grad_norm": 0.05744811954937285, + "language_loss": 1.00619161, + "learning_rate": 0.0009841495069248256, + "loss": 1.0192256, + "num_input_tokens_seen": 11123984, + "router_z_loss_mlp": 0.17822266, + "step": 144, + "time_per_iteration": 2.9495198726654053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01316023, + "balance_loss_mlp": 1.29916632, + "epoch": 0.027895344363216622, + "flos": 569123642880.0, + "grad_norm": 0.04968902291069247, + "language_loss": 0.9920603, + "learning_rate": 0.0009855199307219871, + "loss": 1.00522041, + "num_input_tokens_seen": 11192864, + "router_z_loss_mlp": 0.1685791, + "step": 145, + "time_per_iteration": 2.6407721042633057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130391, + "balance_loss_mlp": 1.28731608, + "epoch": 0.028087726048480186, + "flos": 547099564032.0, + "grad_norm": 0.10723696528856613, + "language_loss": 1.01566505, + "learning_rate": 0.0009868809357244854, + "loss": 1.02870417, + "num_input_tokens_seen": 11261760, + "router_z_loss_mlp": 0.16589355, + "step": 146, + "time_per_iteration": 2.6262452602386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01287507, + "balance_loss_mlp": 1.27153277, + "epoch": 0.02828010773374375, + "flos": 524519663616.0, + "grad_norm": 0.06991830692152445, + "language_loss": 1.05632663, + "learning_rate": 0.0009882326505180556, + "loss": 1.06920183, + "num_input_tokens_seen": 11334736, + "router_z_loss_mlp": 0.15966797, + "step": 147, + "time_per_iteration": 2.6469435691833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01270213, + "balance_loss_mlp": 1.2534517, + "epoch": 0.02847248941900731, + "flos": 772108277760.0, + "grad_norm": 0.07309095407736986, + "language_loss": 1.04486537, + "learning_rate": 0.0009895752010730906, + "loss": 1.0575676, + "num_input_tokens_seen": 11409872, + "router_z_loss_mlp": 0.16748047, + "step": 148, + "time_per_iteration": 2.9457786083221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012724, + "balance_loss_mlp": 1.25667655, + "epoch": 0.028664871104270875, + "flos": 534150208512.0, + "grad_norm": 0.048334696317449924, + "language_loss": 1.10088921, + "learning_rate": 0.0009909087108150867, + "loss": 1.11361325, + "num_input_tokens_seen": 11481024, + "router_z_loss_mlp": 0.15710449, + "step": 149, + "time_per_iteration": 2.712559700012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01309133, + "balance_loss_mlp": 1.29286051, + "epoch": 0.028857252789534438, + "flos": 367557599232.0, + "grad_norm": 0.13115053493636905, + "language_loss": 1.11238122, + "learning_rate": 0.0009922333006927371, + "loss": 1.12547255, + "num_input_tokens_seen": 11544240, + "router_z_loss_mlp": 0.16247559, + "step": 150, + "time_per_iteration": 2.4607067108154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01329212, + "balance_loss_mlp": 1.31257081, + "epoch": 0.029049634474798, + "flos": 515232534528.0, + "grad_norm": 0.06948512606819708, + "language_loss": 1.04613614, + "learning_rate": 0.0009935490892437632, + "loss": 1.05942833, + "num_input_tokens_seen": 11610416, + "router_z_loss_mlp": 0.16650391, + "step": 151, + "time_per_iteration": 2.5460238456726074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01309109, + "balance_loss_mlp": 1.29317045, + "epoch": 0.029242016160061564, + "flos": 587840495616.0, + "grad_norm": 0.11257287432569656, + "language_loss": 1.03097093, + "learning_rate": 0.0009948561926585687, + "loss": 1.04406202, + "num_input_tokens_seen": 11687488, + "router_z_loss_mlp": 0.15930176, + "step": 152, + "time_per_iteration": 2.753009557723999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01300362, + "balance_loss_mlp": 1.28555596, + "epoch": 0.029434397845325123, + "flos": 551816616960.0, + "grad_norm": 0.062223246716750634, + "language_loss": 1.06524086, + "learning_rate": 0.0009961547248418122, + "loss": 1.07824445, + "num_input_tokens_seen": 11754576, + "router_z_loss_mlp": 0.14807129, + "step": 153, + "time_per_iteration": 2.630092144012451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01308008, + "balance_loss_mlp": 1.29357219, + "epoch": 0.029626779530588686, + "flos": 603221160960.0, + "grad_norm": 0.09420536563091944, + "language_loss": 1.03062868, + "learning_rate": 0.0009974447974719707, + "loss": 1.04370856, + "num_input_tokens_seen": 11831360, + "router_z_loss_mlp": 0.14440918, + "step": 154, + "time_per_iteration": 2.6962759494781494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01312448, + "balance_loss_mlp": 1.29745138, + "epoch": 0.02981916121585225, + "flos": 620808993792.0, + "grad_norm": 0.08558703297148447, + "language_loss": 1.04985213, + "learning_rate": 0.0009987265200589763, + "loss": 1.0629766, + "num_input_tokens_seen": 11902192, + "router_z_loss_mlp": 0.15002441, + "step": 155, + "time_per_iteration": 2.7059414386749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295882, + "balance_loss_mlp": 1.28057528, + "epoch": 0.030011542901115813, + "flos": 661322962944.0, + "grad_norm": 0.09731995783752632, + "language_loss": 1.04436159, + "learning_rate": 0.001, + "loss": 1.05732036, + "num_input_tokens_seen": 11979088, + "router_z_loss_mlp": 0.1529541, + "step": 156, + "time_per_iteration": 2.856968641281128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262708, + "balance_loss_mlp": 1.24682927, + "epoch": 0.030203924586379376, + "flos": 651258842112.0, + "grad_norm": 0.05966927829613408, + "language_loss": 1.02520585, + "learning_rate": 0.0009999999029413921, + "loss": 1.03783274, + "num_input_tokens_seen": 12059200, + "router_z_loss_mlp": 0.15856934, + "step": 157, + "time_per_iteration": 2.851480722427368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01268181, + "balance_loss_mlp": 1.25150406, + "epoch": 0.03039630627164294, + "flos": 531083091456.0, + "grad_norm": 0.1034311415514979, + "language_loss": 1.04085183, + "learning_rate": 0.0009999996117656068, + "loss": 1.05353379, + "num_input_tokens_seen": 12134944, + "router_z_loss_mlp": 0.16674805, + "step": 158, + "time_per_iteration": 2.707646369934082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262524, + "balance_loss_mlp": 1.24747968, + "epoch": 0.030588687956906502, + "flos": 585914135040.0, + "grad_norm": 0.12050944658187299, + "language_loss": 0.97824669, + "learning_rate": 0.0009999991264727564, + "loss": 0.99087203, + "num_input_tokens_seen": 12207936, + "router_z_loss_mlp": 0.15039062, + "step": 159, + "time_per_iteration": 2.7575390338897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272116, + "balance_loss_mlp": 1.25716722, + "epoch": 0.030781069642170065, + "flos": 513026777088.0, + "grad_norm": 0.07020206521781955, + "language_loss": 1.08316755, + "learning_rate": 0.0009999984470630296, + "loss": 1.09588861, + "num_input_tokens_seen": 12273200, + "router_z_loss_mlp": 0.14929199, + "step": 160, + "time_per_iteration": 2.62310528755188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0128559, + "balance_loss_mlp": 1.27058172, + "epoch": 0.030973451327433628, + "flos": 717766064640.0, + "grad_norm": 0.06068839125924313, + "language_loss": 0.96528012, + "learning_rate": 0.0009999975735366902, + "loss": 0.978136, + "num_input_tokens_seen": 12359600, + "router_z_loss_mlp": 0.15002441, + "step": 161, + "time_per_iteration": 3.0823376178741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01305752, + "balance_loss_mlp": 1.29055238, + "epoch": 0.03116583301269719, + "flos": 1109312133120.0, + "grad_norm": 0.09428930343360856, + "language_loss": 0.98546314, + "learning_rate": 0.0009999965058940775, + "loss": 0.99852067, + "num_input_tokens_seen": 12443936, + "router_z_loss_mlp": 0.1517334, + "step": 162, + "time_per_iteration": 3.486618995666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01315996, + "balance_loss_mlp": 1.3010118, + "epoch": 0.031358214697960754, + "flos": 450676403712.0, + "grad_norm": 0.09976775191278689, + "language_loss": 1.04580116, + "learning_rate": 0.0009999952441356057, + "loss": 1.05896115, + "num_input_tokens_seen": 12507488, + "router_z_loss_mlp": 0.1496582, + "step": 163, + "time_per_iteration": 2.537173271179199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01300744, + "balance_loss_mlp": 1.28654623, + "epoch": 0.031550596383224314, + "flos": 1254701325312.0, + "grad_norm": 0.0838197011845512, + "language_loss": 1.05903006, + "learning_rate": 0.000999993788261765, + "loss": 1.07203746, + "num_input_tokens_seen": 12594096, + "router_z_loss_mlp": 0.14196777, + "step": 164, + "time_per_iteration": 3.5638957023620605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01270584, + "balance_loss_mlp": 1.25625503, + "epoch": 0.03174297806848788, + "flos": 667841310720.0, + "grad_norm": 0.068717417443618, + "language_loss": 1.0642612, + "learning_rate": 0.00099999213827312, + "loss": 1.076967, + "num_input_tokens_seen": 12669424, + "router_z_loss_mlp": 0.14343262, + "step": 165, + "time_per_iteration": 2.8084213733673096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01255587, + "balance_loss_mlp": 1.24152076, + "epoch": 0.03193535975375144, + "flos": 551033832960.0, + "grad_norm": 0.06892139424853191, + "language_loss": 1.0208962, + "learning_rate": 0.000999990294170312, + "loss": 1.03345203, + "num_input_tokens_seen": 12740080, + "router_z_loss_mlp": 0.14074707, + "step": 166, + "time_per_iteration": 2.6247787475585938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01259954, + "balance_loss_mlp": 1.24549401, + "epoch": 0.032127741439015006, + "flos": 543377700864.0, + "grad_norm": 0.08292396830811857, + "language_loss": 1.05774951, + "learning_rate": 0.0009999882559540566, + "loss": 1.07034898, + "num_input_tokens_seen": 12810576, + "router_z_loss_mlp": 0.14465332, + "step": 167, + "time_per_iteration": 2.654036283493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291491, + "balance_loss_mlp": 1.27790117, + "epoch": 0.032320123124278566, + "flos": 548104928256.0, + "grad_norm": 0.07217909902530589, + "language_loss": 1.02104354, + "learning_rate": 0.000999986023625145, + "loss": 1.03395844, + "num_input_tokens_seen": 12887904, + "router_z_loss_mlp": 0.13598633, + "step": 168, + "time_per_iteration": 2.696866750717163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03738194, + "balance_loss_mlp": 3.61993837, + "epoch": 0.03251250480954213, + "flos": 1305156865536.0, + "grad_norm": 0.563981464368737, + "language_loss": 0.78924417, + "learning_rate": 0.0009999835971844441, + "loss": 0.82662606, + "num_input_tokens_seen": 13107344, + "router_z_loss_mlp": 1.1796875, + "step": 169, + "time_per_iteration": 4.971506834030151 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0134723, + "balance_loss_mlp": 1.33386648, + "epoch": 0.03270488649480569, + "flos": 560866609152.0, + "grad_norm": 0.12141219581883538, + "language_loss": 1.02540469, + "learning_rate": 0.0009999809766328958, + "loss": 1.03887701, + "num_input_tokens_seen": 13175552, + "router_z_loss_mlp": 0.13391113, + "step": 170, + "time_per_iteration": 2.646425724029541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01355192, + "balance_loss_mlp": 1.34039843, + "epoch": 0.03289726818006926, + "flos": 482120031744.0, + "grad_norm": 0.08046017426621577, + "language_loss": 1.05186188, + "learning_rate": 0.0009999781619715177, + "loss": 1.06541371, + "num_input_tokens_seen": 13242384, + "router_z_loss_mlp": 0.14770508, + "step": 171, + "time_per_iteration": 2.535360336303711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01381569, + "balance_loss_mlp": 1.36640596, + "epoch": 0.03308964986533282, + "flos": 674355276288.0, + "grad_norm": 0.08789680193074563, + "language_loss": 1.04250002, + "learning_rate": 0.000999975153201402, + "loss": 1.05631578, + "num_input_tokens_seen": 13316160, + "router_z_loss_mlp": 0.15161133, + "step": 172, + "time_per_iteration": 2.8205513954162598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01433883, + "balance_loss_mlp": 1.41711044, + "epoch": 0.033282031550596385, + "flos": 608937785856.0, + "grad_norm": 0.07610360898370483, + "language_loss": 1.02505267, + "learning_rate": 0.0009999719503237174, + "loss": 1.03939152, + "num_input_tokens_seen": 13387664, + "router_z_loss_mlp": 0.16760254, + "step": 173, + "time_per_iteration": 2.738676071166992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01451195, + "balance_loss_mlp": 1.43315864, + "epoch": 0.033474413235859944, + "flos": 467801547264.0, + "grad_norm": 0.07270846083900323, + "language_loss": 1.111094, + "learning_rate": 0.0009999685533397073, + "loss": 1.12560594, + "num_input_tokens_seen": 13454528, + "router_z_loss_mlp": 0.18029785, + "step": 174, + "time_per_iteration": 2.5556905269622803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01429898, + "balance_loss_mlp": 1.41368508, + "epoch": 0.03366679492112351, + "flos": 579365263872.0, + "grad_norm": 0.09196642879711979, + "language_loss": 1.03199494, + "learning_rate": 0.00099996496225069, + "loss": 1.04629397, + "num_input_tokens_seen": 13522528, + "router_z_loss_mlp": 0.16210938, + "step": 175, + "time_per_iteration": 2.6806485652923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01432234, + "balance_loss_mlp": 1.41513896, + "epoch": 0.03385917660638707, + "flos": 637378315776.0, + "grad_norm": 0.08705990667808558, + "language_loss": 1.05897307, + "learning_rate": 0.0009999611770580604, + "loss": 1.07329535, + "num_input_tokens_seen": 13601120, + "router_z_loss_mlp": 0.17102051, + "step": 176, + "time_per_iteration": 2.830826759338379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01415158, + "balance_loss_mlp": 1.39910054, + "epoch": 0.03405155829165064, + "flos": 441587123712.0, + "grad_norm": 0.08054669051038237, + "language_loss": 1.03868258, + "learning_rate": 0.0009999571977632876, + "loss": 1.05283427, + "num_input_tokens_seen": 13666384, + "router_z_loss_mlp": 0.16052246, + "step": 177, + "time_per_iteration": 2.623309850692749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01463141, + "balance_loss_mlp": 1.44573641, + "epoch": 0.034243939976914196, + "flos": 466097766912.0, + "grad_norm": 0.08089290506220445, + "language_loss": 1.06928194, + "learning_rate": 0.0009999530243679166, + "loss": 1.08391333, + "num_input_tokens_seen": 13733968, + "router_z_loss_mlp": 0.17407227, + "step": 178, + "time_per_iteration": 2.545133113861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01451423, + "balance_loss_mlp": 1.43560433, + "epoch": 0.03443632166217776, + "flos": 778919016960.0, + "grad_norm": 0.08468734735068614, + "language_loss": 1.01505899, + "learning_rate": 0.0009999486568735675, + "loss": 1.0295732, + "num_input_tokens_seen": 13818960, + "router_z_loss_mlp": 0.15808105, + "step": 179, + "time_per_iteration": 3.0384457111358643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01433641, + "balance_loss_mlp": 1.41778612, + "epoch": 0.03462870334744132, + "flos": 1263284246016.0, + "grad_norm": 0.06997324880309466, + "language_loss": 1.01388979, + "learning_rate": 0.0009999440952819362, + "loss": 1.02822614, + "num_input_tokens_seen": 13912448, + "router_z_loss_mlp": 0.15856934, + "step": 180, + "time_per_iteration": 3.6892786026000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01401308, + "balance_loss_mlp": 1.38610911, + "epoch": 0.03482108503270489, + "flos": 606899354112.0, + "grad_norm": 0.057831512038439566, + "language_loss": 1.02027512, + "learning_rate": 0.0009999393395947935, + "loss": 1.03428817, + "num_input_tokens_seen": 13990752, + "router_z_loss_mlp": 0.15185547, + "step": 181, + "time_per_iteration": 2.826353073120117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01381551, + "balance_loss_mlp": 1.36612535, + "epoch": 0.03501346671796845, + "flos": 538010284032.0, + "grad_norm": 0.05913415109875365, + "language_loss": 1.05361927, + "learning_rate": 0.0009999343898139858, + "loss": 1.06743479, + "num_input_tokens_seen": 14058608, + "router_z_loss_mlp": 0.1541748, + "step": 182, + "time_per_iteration": 2.593250036239624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01359754, + "balance_loss_mlp": 1.33988214, + "epoch": 0.035205848403232015, + "flos": 518231250432.0, + "grad_norm": 0.05898920665253376, + "language_loss": 1.04308426, + "learning_rate": 0.0009999292459414348, + "loss": 1.05668187, + "num_input_tokens_seen": 14126656, + "router_z_loss_mlp": 0.19909668, + "step": 183, + "time_per_iteration": 2.565936326980591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01311064, + "balance_loss_mlp": 1.296103, + "epoch": 0.035398230088495575, + "flos": 472134486528.0, + "grad_norm": 0.06373248491183749, + "language_loss": 1.08499169, + "learning_rate": 0.0009999239079791374, + "loss": 1.09810233, + "num_input_tokens_seen": 14195840, + "router_z_loss_mlp": 0.14953613, + "step": 184, + "time_per_iteration": 2.552949905395508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130912, + "balance_loss_mlp": 1.29237127, + "epoch": 0.03559061177375914, + "flos": 511820591616.0, + "grad_norm": 0.056329932736213485, + "language_loss": 1.01337337, + "learning_rate": 0.0009999183759291659, + "loss": 1.02646446, + "num_input_tokens_seen": 14269936, + "router_z_loss_mlp": 0.16748047, + "step": 185, + "time_per_iteration": 2.741727113723755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291511, + "balance_loss_mlp": 1.27575147, + "epoch": 0.0357829934590227, + "flos": 477146903040.0, + "grad_norm": 0.11224085577532149, + "language_loss": 1.03738213, + "learning_rate": 0.0009999126497936682, + "loss": 1.05029726, + "num_input_tokens_seen": 14334848, + "router_z_loss_mlp": 0.1574707, + "step": 186, + "time_per_iteration": 2.4901957511901855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291515, + "balance_loss_mlp": 1.27446783, + "epoch": 0.03597537514428627, + "flos": 644350588416.0, + "grad_norm": 0.06537030709871235, + "language_loss": 1.06735992, + "learning_rate": 0.0009999067295748676, + "loss": 1.08027506, + "num_input_tokens_seen": 14407888, + "router_z_loss_mlp": 0.1706543, + "step": 187, + "time_per_iteration": 2.7923052310943604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01327575, + "balance_loss_mlp": 1.30966997, + "epoch": 0.03616775682954983, + "flos": 580916275200.0, + "grad_norm": 0.06523062893181024, + "language_loss": 1.04418302, + "learning_rate": 0.000999900615275062, + "loss": 1.05745876, + "num_input_tokens_seen": 14479072, + "router_z_loss_mlp": 0.17919922, + "step": 188, + "time_per_iteration": 2.7248637676239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295421, + "balance_loss_mlp": 1.27722955, + "epoch": 0.03636013851481339, + "flos": 382210735104.0, + "grad_norm": 0.08035209765807474, + "language_loss": 1.10347509, + "learning_rate": 0.0009998943068966256, + "loss": 1.11642933, + "num_input_tokens_seen": 14540944, + "router_z_loss_mlp": 0.18188477, + "step": 189, + "time_per_iteration": 2.429497480392456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01279097, + "balance_loss_mlp": 1.26120377, + "epoch": 0.03655252020007695, + "flos": 582954706944.0, + "grad_norm": 0.07380481555246936, + "language_loss": 1.0506779, + "learning_rate": 0.0009998878044420072, + "loss": 1.06346881, + "num_input_tokens_seen": 14611392, + "router_z_loss_mlp": 0.17907715, + "step": 190, + "time_per_iteration": 2.6878626346588135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012863, + "balance_loss_mlp": 1.26773953, + "epoch": 0.03674490188534051, + "flos": 471376433664.0, + "grad_norm": 0.10484442400689244, + "language_loss": 1.01223493, + "learning_rate": 0.0009998811079137318, + "loss": 1.02509785, + "num_input_tokens_seen": 14679776, + "router_z_loss_mlp": 0.18566895, + "step": 191, + "time_per_iteration": 2.561494827270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281775, + "balance_loss_mlp": 1.26645625, + "epoch": 0.03693728357060408, + "flos": 528113488896.0, + "grad_norm": 0.0609431296621874, + "language_loss": 1.01984763, + "learning_rate": 0.0009998742173143987, + "loss": 1.03266537, + "num_input_tokens_seen": 14749712, + "router_z_loss_mlp": 0.1529541, + "step": 192, + "time_per_iteration": 2.59798264503479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01336751, + "balance_loss_mlp": 1.32157528, + "epoch": 0.03712966525586764, + "flos": 798657352704.0, + "grad_norm": 0.10186248006293357, + "language_loss": 1.02005363, + "learning_rate": 0.0009998671326466833, + "loss": 1.03342128, + "num_input_tokens_seen": 14827136, + "router_z_loss_mlp": 0.15185547, + "step": 193, + "time_per_iteration": 2.9510865211486816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01331057, + "balance_loss_mlp": 1.3157624, + "epoch": 0.037322046941131205, + "flos": 829628116992.0, + "grad_norm": 0.06375125184008373, + "language_loss": 1.02914846, + "learning_rate": 0.0009998598539133362, + "loss": 1.04245901, + "num_input_tokens_seen": 14902880, + "router_z_loss_mlp": 0.1529541, + "step": 194, + "time_per_iteration": 2.9981300830841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01337882, + "balance_loss_mlp": 1.3235296, + "epoch": 0.037514428626394765, + "flos": 437460797952.0, + "grad_norm": 0.10181133305516413, + "language_loss": 1.03936744, + "learning_rate": 0.0009998523811171828, + "loss": 1.0527463, + "num_input_tokens_seen": 14967264, + "router_z_loss_mlp": 0.14379883, + "step": 195, + "time_per_iteration": 2.501542568206787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01296215, + "balance_loss_mlp": 1.28125429, + "epoch": 0.03770681031165833, + "flos": 511372459008.0, + "grad_norm": 0.09414845611868274, + "language_loss": 1.04584992, + "learning_rate": 0.0009998447142611248, + "loss": 1.05881214, + "num_input_tokens_seen": 15039104, + "router_z_loss_mlp": 0.14941406, + "step": 196, + "time_per_iteration": 2.6247317790985107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0128144, + "balance_loss_mlp": 1.26702762, + "epoch": 0.03789919199692189, + "flos": 807102061056.0, + "grad_norm": 0.05831249070889761, + "language_loss": 0.97701526, + "learning_rate": 0.0009998368533481387, + "loss": 0.9898296, + "num_input_tokens_seen": 15124864, + "router_z_loss_mlp": 0.14422607, + "step": 197, + "time_per_iteration": 3.01912784576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01294999, + "balance_loss_mlp": 1.27945375, + "epoch": 0.03809157368218546, + "flos": 690274234368.0, + "grad_norm": 0.06656848410147823, + "language_loss": 1.00630498, + "learning_rate": 0.0009998287983812762, + "loss": 1.01925504, + "num_input_tokens_seen": 15199680, + "router_z_loss_mlp": 0.15551758, + "step": 198, + "time_per_iteration": 2.8252804279327393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0135387, + "balance_loss_mlp": 1.33592904, + "epoch": 0.03828395536744902, + "flos": 517675428864.0, + "grad_norm": 0.06988401379713739, + "language_loss": 1.06386423, + "learning_rate": 0.0009998205493636646, + "loss": 1.07740283, + "num_input_tokens_seen": 15270176, + "router_z_loss_mlp": 0.17944336, + "step": 199, + "time_per_iteration": 2.649765729904175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01339461, + "balance_loss_mlp": 1.32242572, + "epoch": 0.038476337052712584, + "flos": 581389138944.0, + "grad_norm": 0.07184113921580974, + "language_loss": 0.9925406, + "learning_rate": 0.0009998121062985063, + "loss": 1.00593519, + "num_input_tokens_seen": 15343168, + "router_z_loss_mlp": 0.17053223, + "step": 200, + "time_per_iteration": 2.6788320541381836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01328142, + "balance_loss_mlp": 1.31167912, + "epoch": 0.03866871873797614, + "flos": 576791359488.0, + "grad_norm": 0.059667024197710104, + "language_loss": 1.01260698, + "learning_rate": 0.0009998034691890794, + "loss": 1.02588844, + "num_input_tokens_seen": 15417328, + "router_z_loss_mlp": 0.16455078, + "step": 201, + "time_per_iteration": 2.753265380859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01297644, + "balance_loss_mlp": 1.28249288, + "epoch": 0.03886110042323971, + "flos": 540472117248.0, + "grad_norm": 0.07302515973387386, + "language_loss": 1.05948424, + "learning_rate": 0.0009997946380387369, + "loss": 1.07246065, + "num_input_tokens_seen": 15489488, + "router_z_loss_mlp": 0.15136719, + "step": 202, + "time_per_iteration": 2.618546485900879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262023, + "balance_loss_mlp": 1.24746776, + "epoch": 0.03905348210850327, + "flos": 717694843392.0, + "grad_norm": 0.0775452329378228, + "language_loss": 1.08266401, + "learning_rate": 0.0009997856128509076, + "loss": 1.09528422, + "num_input_tokens_seen": 15558944, + "router_z_loss_mlp": 0.14550781, + "step": 203, + "time_per_iteration": 2.8284859657287598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267878, + "balance_loss_mlp": 1.25321579, + "epoch": 0.039245863793766836, + "flos": 427268639232.0, + "grad_norm": 0.06664318613050589, + "language_loss": 1.02886617, + "learning_rate": 0.0009997763936290952, + "loss": 1.04154491, + "num_input_tokens_seen": 15625024, + "router_z_loss_mlp": 0.14660645, + "step": 204, + "time_per_iteration": 2.516263246536255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0129892, + "balance_loss_mlp": 1.28264785, + "epoch": 0.039438245479030395, + "flos": 662804163072.0, + "grad_norm": 0.07463685050771204, + "language_loss": 1.0815413, + "learning_rate": 0.0009997669803768789, + "loss": 1.09453046, + "num_input_tokens_seen": 15697120, + "router_z_loss_mlp": 0.16271973, + "step": 205, + "time_per_iteration": 2.7576606273651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291456, + "balance_loss_mlp": 1.27812803, + "epoch": 0.03963062716429396, + "flos": 635063459328.0, + "grad_norm": 0.055878982250893716, + "language_loss": 1.03253651, + "learning_rate": 0.0009997573730979134, + "loss": 1.04545116, + "num_input_tokens_seen": 15768752, + "router_z_loss_mlp": 0.13342285, + "step": 206, + "time_per_iteration": 2.716325521469116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.07279634, + "balance_loss_mlp": 4.65512276, + "epoch": 0.03982300884955752, + "flos": 1417813286400.0, + "grad_norm": 0.533603848118922, + "language_loss": 0.79193199, + "learning_rate": 0.0009997475717959284, + "loss": 0.86472833, + "num_input_tokens_seen": 15980624, + "router_z_loss_mlp": 26.25, + "step": 207, + "time_per_iteration": 4.635821342468262 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0137482, + "balance_loss_mlp": 1.35964513, + "epoch": 0.04001539053482109, + "flos": 688769713152.0, + "grad_norm": 0.1040721574676452, + "language_loss": 1.02094078, + "learning_rate": 0.0009997375764747294, + "loss": 1.03468895, + "num_input_tokens_seen": 16067232, + "router_z_loss_mlp": 0.1517334, + "step": 208, + "time_per_iteration": 2.974442481994629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01415285, + "balance_loss_mlp": 1.40052748, + "epoch": 0.04020777222008465, + "flos": 533363042304.0, + "grad_norm": 0.08111266742266361, + "language_loss": 0.99458027, + "learning_rate": 0.0009997273871381967, + "loss": 1.00873303, + "num_input_tokens_seen": 16139808, + "router_z_loss_mlp": 0.14758301, + "step": 209, + "time_per_iteration": 2.6802144050598145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01466201, + "balance_loss_mlp": 1.44989347, + "epoch": 0.040400153905348214, + "flos": 567661381632.0, + "grad_norm": 0.06875741115436663, + "language_loss": 1.05031717, + "learning_rate": 0.0009997170037902862, + "loss": 1.0649792, + "num_input_tokens_seen": 16210848, + "router_z_loss_mlp": 0.16308594, + "step": 210, + "time_per_iteration": 2.6975836753845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01531768, + "balance_loss_mlp": 1.51399446, + "epoch": 0.040592535590611774, + "flos": 713130559488.0, + "grad_norm": 0.07197690318934227, + "language_loss": 1.07202697, + "learning_rate": 0.0009997064264350292, + "loss": 1.08734465, + "num_input_tokens_seen": 16283984, + "router_z_loss_mlp": 0.17785645, + "step": 211, + "time_per_iteration": 2.836771011352539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01531925, + "balance_loss_mlp": 1.5154984, + "epoch": 0.04078491727587533, + "flos": 577824427008.0, + "grad_norm": 0.09120436996840299, + "language_loss": 1.0146966, + "learning_rate": 0.0009996956550765317, + "loss": 1.03001595, + "num_input_tokens_seen": 16353904, + "router_z_loss_mlp": 0.16430664, + "step": 212, + "time_per_iteration": 2.671292781829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01499293, + "balance_loss_mlp": 1.4817214, + "epoch": 0.0409772989611389, + "flos": 552033404928.0, + "grad_norm": 0.11449485477945152, + "language_loss": 0.96278083, + "learning_rate": 0.0009996846897189762, + "loss": 0.97777379, + "num_input_tokens_seen": 16425488, + "router_z_loss_mlp": 0.17565918, + "step": 213, + "time_per_iteration": 2.6231424808502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.014653, + "balance_loss_mlp": 1.44753814, + "epoch": 0.04116968064640246, + "flos": 555347833344.0, + "grad_norm": 0.09512793115916172, + "language_loss": 1.02356708, + "learning_rate": 0.0009996735303666193, + "loss": 1.03822017, + "num_input_tokens_seen": 16498016, + "router_z_loss_mlp": 0.1776123, + "step": 214, + "time_per_iteration": 2.6930177211761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01477298, + "balance_loss_mlp": 1.46134758, + "epoch": 0.041362062331666026, + "flos": 578204158464.0, + "grad_norm": 0.09141123477091552, + "language_loss": 1.04750729, + "learning_rate": 0.0009996621770237937, + "loss": 1.06228042, + "num_input_tokens_seen": 16573744, + "router_z_loss_mlp": 0.15942383, + "step": 215, + "time_per_iteration": 2.7448923587799072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01578462, + "balance_loss_mlp": 1.56013966, + "epoch": 0.041554444016929586, + "flos": 611130396672.0, + "grad_norm": 0.10233552268903827, + "language_loss": 0.99822551, + "learning_rate": 0.0009996506296949073, + "loss": 1.01401007, + "num_input_tokens_seen": 16655344, + "router_z_loss_mlp": 0.18334961, + "step": 216, + "time_per_iteration": 2.8548526763916016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01609008, + "balance_loss_mlp": 1.59156775, + "epoch": 0.04174682570219315, + "flos": 527857413120.0, + "grad_norm": 0.10522858499680945, + "language_loss": 0.99888742, + "learning_rate": 0.0009996388883844428, + "loss": 1.01497757, + "num_input_tokens_seen": 16726480, + "router_z_loss_mlp": 0.17456055, + "step": 217, + "time_per_iteration": 2.618546724319458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01557164, + "balance_loss_mlp": 1.54124999, + "epoch": 0.04193920738745671, + "flos": 511258977792.0, + "grad_norm": 0.09341741551851517, + "language_loss": 1.03841758, + "learning_rate": 0.0009996269530969588, + "loss": 1.05398929, + "num_input_tokens_seen": 16792112, + "router_z_loss_mlp": 0.15905762, + "step": 218, + "time_per_iteration": 2.6204636096954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01525903, + "balance_loss_mlp": 1.50927377, + "epoch": 0.04213158907272028, + "flos": 571226093568.0, + "grad_norm": 0.09609660813155754, + "language_loss": 1.02944803, + "learning_rate": 0.0009996148238370888, + "loss": 1.04470706, + "num_input_tokens_seen": 16862960, + "router_z_loss_mlp": 0.16625977, + "step": 219, + "time_per_iteration": 2.7071943283081055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0150071, + "balance_loss_mlp": 1.48340106, + "epoch": 0.04232397075798384, + "flos": 963803667456.0, + "grad_norm": 0.05454565212769997, + "language_loss": 0.9941752, + "learning_rate": 0.0009996025006095421, + "loss": 1.00918233, + "num_input_tokens_seen": 16950416, + "router_z_loss_mlp": 0.1730957, + "step": 220, + "time_per_iteration": 3.3006374835968018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.10944285, + "balance_loss_mlp": 6.84272289, + "epoch": 0.042516352443247404, + "flos": 1468814777856.0, + "grad_norm": 0.48497418398004566, + "language_loss": 0.77783144, + "learning_rate": 0.0009995899834191028, + "loss": 0.88727427, + "num_input_tokens_seen": 17180944, + "router_z_loss_mlp": 41.0, + "step": 221, + "time_per_iteration": 5.7136383056640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01601015, + "balance_loss_mlp": 1.58291924, + "epoch": 0.042708734128510964, + "flos": 654419091456.0, + "grad_norm": 0.10763646442297387, + "language_loss": 0.99765503, + "learning_rate": 0.0009995772722706307, + "loss": 1.0136652, + "num_input_tokens_seen": 17257792, + "router_z_loss_mlp": 0.1809082, + "step": 222, + "time_per_iteration": 2.8322792053222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01658843, + "balance_loss_mlp": 1.63811278, + "epoch": 0.04290111581377453, + "flos": 431601578496.0, + "grad_norm": 0.16393394652444138, + "language_loss": 1.13557565, + "learning_rate": 0.0009995643671690604, + "loss": 1.1521641, + "num_input_tokens_seen": 17320288, + "router_z_loss_mlp": 0.20739746, + "step": 223, + "time_per_iteration": 2.470729351043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0163871, + "balance_loss_mlp": 1.61504686, + "epoch": 0.04309349749903809, + "flos": 644379701760.0, + "grad_norm": 0.08733094203359489, + "language_loss": 1.00837708, + "learning_rate": 0.0009995512681194023, + "loss": 1.02476418, + "num_input_tokens_seen": 17396672, + "router_z_loss_mlp": 0.23632812, + "step": 224, + "time_per_iteration": 2.8274452686309814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01615568, + "balance_loss_mlp": 1.58755326, + "epoch": 0.04328587918430166, + "flos": 830861853696.0, + "grad_norm": 0.12001676841435771, + "language_loss": 0.98664522, + "learning_rate": 0.0009995379751267417, + "loss": 1.00280082, + "num_input_tokens_seen": 17488096, + "router_z_loss_mlp": 0.28027344, + "step": 225, + "time_per_iteration": 3.275660991668701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01617416, + "balance_loss_mlp": 1.58639741, + "epoch": 0.043478260869565216, + "flos": 524804852736.0, + "grad_norm": 0.1467276253632499, + "language_loss": 1.0007726, + "learning_rate": 0.0009995244881962398, + "loss": 1.01694679, + "num_input_tokens_seen": 17557632, + "router_z_loss_mlp": 0.30981445, + "step": 226, + "time_per_iteration": 2.6300203800201416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01601732, + "balance_loss_mlp": 1.56787658, + "epoch": 0.04367064255482878, + "flos": 439253328384.0, + "grad_norm": 0.095918638324787, + "language_loss": 1.01389623, + "learning_rate": 0.0009995108073331323, + "loss": 1.02991343, + "num_input_tokens_seen": 17626672, + "router_z_loss_mlp": 0.33862305, + "step": 227, + "time_per_iteration": 2.667628765106201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0158134, + "balance_loss_mlp": 1.5462923, + "epoch": 0.04386302424009234, + "flos": 507109330944.0, + "grad_norm": 0.08564981186298011, + "language_loss": 1.04024279, + "learning_rate": 0.0009994969325427309, + "loss": 1.05605614, + "num_input_tokens_seen": 17698624, + "router_z_loss_mlp": 0.35058594, + "step": 228, + "time_per_iteration": 2.6454501152038574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01558795, + "balance_loss_mlp": 1.52224541, + "epoch": 0.04405540592535591, + "flos": 540432829440.0, + "grad_norm": 0.07744391701114339, + "language_loss": 1.00052619, + "learning_rate": 0.0009994828638304218, + "loss": 1.016114, + "num_input_tokens_seen": 17767760, + "router_z_loss_mlp": 0.36547852, + "step": 229, + "time_per_iteration": 2.6468071937561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01516137, + "balance_loss_mlp": 1.47794271, + "epoch": 0.04424778761061947, + "flos": 446136850944.0, + "grad_norm": 0.08052263902742013, + "language_loss": 1.06763554, + "learning_rate": 0.0009994686012016675, + "loss": 1.08279693, + "num_input_tokens_seen": 17833664, + "router_z_loss_mlp": 0.3815918, + "step": 230, + "time_per_iteration": 2.5467634201049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01483037, + "balance_loss_mlp": 1.44515228, + "epoch": 0.044440169295883035, + "flos": 700383435264.0, + "grad_norm": 0.05918307184238542, + "language_loss": 1.0518043, + "learning_rate": 0.000999454144662005, + "loss": 1.06663465, + "num_input_tokens_seen": 17908880, + "router_z_loss_mlp": 0.37866211, + "step": 231, + "time_per_iteration": 2.8704099655151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01473358, + "balance_loss_mlp": 1.43549716, + "epoch": 0.044632550981146595, + "flos": 588055873536.0, + "grad_norm": 0.08626514264815018, + "language_loss": 0.99676436, + "learning_rate": 0.0009994394942170468, + "loss": 1.01149797, + "num_input_tokens_seen": 17978208, + "router_z_loss_mlp": 0.37866211, + "step": 232, + "time_per_iteration": 2.6578898429870605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01461415, + "balance_loss_mlp": 1.4258194, + "epoch": 0.04482493266641016, + "flos": 554534525952.0, + "grad_norm": 0.07124765242066121, + "language_loss": 0.96965969, + "learning_rate": 0.0009994246498724808, + "loss": 0.98427379, + "num_input_tokens_seen": 18049296, + "router_z_loss_mlp": 0.35620117, + "step": 233, + "time_per_iteration": 2.7764015197753906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01463645, + "balance_loss_mlp": 1.42790616, + "epoch": 0.04501731435167372, + "flos": 722500646400.0, + "grad_norm": 0.07759597622956232, + "language_loss": 0.99069166, + "learning_rate": 0.00099940961163407, + "loss": 1.00532806, + "num_input_tokens_seen": 18123296, + "router_z_loss_mlp": 0.35766602, + "step": 234, + "time_per_iteration": 2.8431143760681152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01454599, + "balance_loss_mlp": 1.42098188, + "epoch": 0.04520969603693728, + "flos": 511539784704.0, + "grad_norm": 0.05931413709293958, + "language_loss": 1.02564597, + "learning_rate": 0.0009993943795076528, + "loss": 1.04019189, + "num_input_tokens_seen": 18192784, + "router_z_loss_mlp": 0.33642578, + "step": 235, + "time_per_iteration": 2.645988702774048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01444244, + "balance_loss_mlp": 1.40936303, + "epoch": 0.04540207772220085, + "flos": 364854246912.0, + "grad_norm": 0.07280953320994132, + "language_loss": 1.04776168, + "learning_rate": 0.0009993789534991427, + "loss": 1.062204, + "num_input_tokens_seen": 18254064, + "router_z_loss_mlp": 0.34912109, + "step": 236, + "time_per_iteration": 2.4084837436676025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01418385, + "balance_loss_mlp": 1.38390946, + "epoch": 0.045594459407464406, + "flos": 522407038464.0, + "grad_norm": 0.060943880380569936, + "language_loss": 0.99500269, + "learning_rate": 0.0009993633336145287, + "loss": 1.00918651, + "num_input_tokens_seen": 18325728, + "router_z_loss_mlp": 0.34472656, + "step": 237, + "time_per_iteration": 2.6044533252716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01406135, + "balance_loss_mlp": 1.3730185, + "epoch": 0.04578684109272797, + "flos": 671442338304.0, + "grad_norm": 0.06747057459653658, + "language_loss": 1.03573179, + "learning_rate": 0.0009993475198598752, + "loss": 1.04979324, + "num_input_tokens_seen": 18408608, + "router_z_loss_mlp": 0.33129883, + "step": 238, + "time_per_iteration": 2.9948084354400635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01383668, + "balance_loss_mlp": 1.35164809, + "epoch": 0.04597922277799153, + "flos": 541387321344.0, + "grad_norm": 0.07135856148897902, + "language_loss": 0.99909985, + "learning_rate": 0.0009993315122413212, + "loss": 1.01293659, + "num_input_tokens_seen": 18471920, + "router_z_loss_mlp": 0.32006836, + "step": 239, + "time_per_iteration": 2.5848827362060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01369111, + "balance_loss_mlp": 1.33773541, + "epoch": 0.0461716044632551, + "flos": 458732616192.0, + "grad_norm": 0.056000088810755834, + "language_loss": 1.0008105, + "learning_rate": 0.0009993153107650818, + "loss": 1.01450157, + "num_input_tokens_seen": 18540496, + "router_z_loss_mlp": 0.31347656, + "step": 240, + "time_per_iteration": 2.5492687225341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01338815, + "balance_loss_mlp": 1.31015706, + "epoch": 0.04636398614851866, + "flos": 455009342976.0, + "grad_norm": 0.06491754001609312, + "language_loss": 0.99534512, + "learning_rate": 0.0009992989154374468, + "loss": 1.00873327, + "num_input_tokens_seen": 18606944, + "router_z_loss_mlp": 0.28662109, + "step": 241, + "time_per_iteration": 2.511237621307373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01294622, + "balance_loss_mlp": 1.26833653, + "epoch": 0.046556367833782225, + "flos": 556558401024.0, + "grad_norm": 0.07592069792168304, + "language_loss": 1.06626534, + "learning_rate": 0.0009992823262647817, + "loss": 1.07921147, + "num_input_tokens_seen": 18679520, + "router_z_loss_mlp": 0.26293945, + "step": 242, + "time_per_iteration": 2.7618701457977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249282, + "balance_loss_mlp": 1.22240043, + "epoch": 0.046748749519045785, + "flos": 592625949696.0, + "grad_norm": 0.0687662987323222, + "language_loss": 1.00893593, + "learning_rate": 0.0009992655432535264, + "loss": 1.0214287, + "num_input_tokens_seen": 18756656, + "router_z_loss_mlp": 0.26879883, + "step": 243, + "time_per_iteration": 2.7471935749053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01255015, + "balance_loss_mlp": 1.23083937, + "epoch": 0.04694113120430935, + "flos": 569596506624.0, + "grad_norm": 0.07373455055845594, + "language_loss": 1.0054853, + "learning_rate": 0.0009992485664101973, + "loss": 1.01803541, + "num_input_tokens_seen": 18829792, + "router_z_loss_mlp": 0.24169922, + "step": 244, + "time_per_iteration": 2.635344982147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295554, + "balance_loss_mlp": 1.27291572, + "epoch": 0.04713351288957291, + "flos": 863401158144.0, + "grad_norm": 0.10584905626659928, + "language_loss": 1.03312445, + "learning_rate": 0.000999231395741385, + "loss": 1.04607987, + "num_input_tokens_seen": 18906864, + "router_z_loss_mlp": 0.22631836, + "step": 245, + "time_per_iteration": 3.093386173248291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0128706, + "balance_loss_mlp": 1.26464868, + "epoch": 0.04732589457483648, + "flos": 536961249792.0, + "grad_norm": 0.08844420521863233, + "language_loss": 1.01371169, + "learning_rate": 0.0009992140312537557, + "loss": 1.02658224, + "num_input_tokens_seen": 18973632, + "router_z_loss_mlp": 0.22412109, + "step": 246, + "time_per_iteration": 2.667579412460327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01256359, + "balance_loss_mlp": 1.23515141, + "epoch": 0.04751827626010004, + "flos": 761566910976.0, + "grad_norm": 0.052835972446563725, + "language_loss": 0.9609164, + "learning_rate": 0.000999196472954051, + "loss": 0.97347999, + "num_input_tokens_seen": 19052944, + "router_z_loss_mlp": 0.2121582, + "step": 247, + "time_per_iteration": 2.9537084102630615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02369813, + "balance_loss_mlp": 2.16687083, + "epoch": 0.0477106579453636, + "flos": 1578961313280.0, + "grad_norm": 0.2151482568863758, + "language_loss": 0.79424852, + "learning_rate": 0.0009991787208490878, + "loss": 0.81794667, + "num_input_tokens_seen": 19286288, + "router_z_loss_mlp": 2.03125, + "step": 248, + "time_per_iteration": 5.758621454238892 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01289622, + "balance_loss_mlp": 1.27137113, + "epoch": 0.04790303963062716, + "flos": 457535195136.0, + "grad_norm": 0.10849969336884063, + "language_loss": 1.03316629, + "learning_rate": 0.0009991607749457578, + "loss": 1.04606247, + "num_input_tokens_seen": 19349296, + "router_z_loss_mlp": 0.18261719, + "step": 249, + "time_per_iteration": 2.5432913303375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01334566, + "balance_loss_mlp": 1.31724536, + "epoch": 0.04809542131589073, + "flos": 782079266304.0, + "grad_norm": 0.08264534697846654, + "language_loss": 1.01180637, + "learning_rate": 0.0009991426352510286, + "loss": 1.02515209, + "num_input_tokens_seen": 19428416, + "router_z_loss_mlp": 0.17321777, + "step": 250, + "time_per_iteration": 3.1542766094207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01351096, + "balance_loss_mlp": 1.33215368, + "epoch": 0.04828780300115429, + "flos": 558995503104.0, + "grad_norm": 0.06435857362074206, + "language_loss": 1.03307557, + "learning_rate": 0.0009991243017719422, + "loss": 1.04658651, + "num_input_tokens_seen": 19498688, + "router_z_loss_mlp": 0.18933105, + "step": 251, + "time_per_iteration": 2.693882942199707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01333217, + "balance_loss_mlp": 1.31485844, + "epoch": 0.048480184686417856, + "flos": 501682277376.0, + "grad_norm": 0.09276508096019526, + "language_loss": 0.97794825, + "learning_rate": 0.0009991057745156165, + "loss": 0.99128038, + "num_input_tokens_seen": 19567568, + "router_z_loss_mlp": 0.18347168, + "step": 252, + "time_per_iteration": 2.628873109817505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01867297, + "balance_loss_mlp": 1.75514495, + "epoch": 0.048672566371681415, + "flos": 1535585430528.0, + "grad_norm": 0.16359674361847032, + "language_loss": 0.81910986, + "learning_rate": 0.0009990870534892446, + "loss": 0.8377828, + "num_input_tokens_seen": 19796368, + "router_z_loss_mlp": 1.125, + "step": 253, + "time_per_iteration": 5.060615062713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01285031, + "balance_loss_mlp": 1.26567185, + "epoch": 0.04886494805694498, + "flos": 537665458176.0, + "grad_norm": 0.07164286827098729, + "language_loss": 1.06546187, + "learning_rate": 0.0009990681387000943, + "loss": 1.07831216, + "num_input_tokens_seen": 19870480, + "router_z_loss_mlp": 0.19384766, + "step": 254, + "time_per_iteration": 2.783367395401001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01275754, + "balance_loss_mlp": 1.25606036, + "epoch": 0.04905732974220854, + "flos": 679841966592.0, + "grad_norm": 0.06618046133348403, + "language_loss": 1.01404011, + "learning_rate": 0.0009990490301555093, + "loss": 1.02679765, + "num_input_tokens_seen": 19956288, + "router_z_loss_mlp": 0.19689941, + "step": 255, + "time_per_iteration": 2.9520761966705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01936632, + "balance_loss_mlp": 1.86796737, + "epoch": 0.04924971142747211, + "flos": 1420408949760.0, + "grad_norm": 0.31562964738653715, + "language_loss": 0.79215157, + "learning_rate": 0.0009990297278629078, + "loss": 0.81151783, + "num_input_tokens_seen": 20180080, + "router_z_loss_mlp": 0.6875, + "step": 256, + "time_per_iteration": 4.825209856033325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01615246, + "balance_loss_mlp": 1.55344784, + "epoch": 0.04944209311273567, + "flos": 1557202074624.0, + "grad_norm": 0.16937574338078817, + "language_loss": 0.79242742, + "learning_rate": 0.000999010231829784, + "loss": 0.80857986, + "num_input_tokens_seen": 20413456, + "router_z_loss_mlp": 0.6171875, + "step": 257, + "time_per_iteration": 4.995501518249512 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0163115, + "balance_loss_mlp": 1.58422887, + "epoch": 0.04963447479799923, + "flos": 1569985514496.0, + "grad_norm": 0.13524925240989144, + "language_loss": 0.69975883, + "learning_rate": 0.0009989905420637066, + "loss": 0.71607035, + "num_input_tokens_seen": 20644736, + "router_z_loss_mlp": 0.46875, + "step": 258, + "time_per_iteration": 4.841471910476685 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272668, + "balance_loss_mlp": 1.24463022, + "epoch": 0.049826856483262794, + "flos": 625063357440.0, + "grad_norm": 0.06365504505971183, + "language_loss": 0.95603192, + "learning_rate": 0.0009989706585723202, + "loss": 0.96875864, + "num_input_tokens_seen": 20719040, + "router_z_loss_mlp": 0.28076172, + "step": 259, + "time_per_iteration": 2.7635786533355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130022, + "balance_loss_mlp": 1.27020288, + "epoch": 0.05001923816852635, + "flos": 503912765952.0, + "grad_norm": 0.062257698278494894, + "language_loss": 1.01846027, + "learning_rate": 0.0009989505813633442, + "loss": 1.03146255, + "num_input_tokens_seen": 20789376, + "router_z_loss_mlp": 0.29980469, + "step": 260, + "time_per_iteration": 2.6451833248138428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0131611, + "balance_loss_mlp": 1.28101516, + "epoch": 0.05021161985378992, + "flos": 587066476032.0, + "grad_norm": 0.06290514068599455, + "language_loss": 1.01911807, + "learning_rate": 0.000998930310444573, + "loss": 1.03227913, + "num_input_tokens_seen": 20857856, + "router_z_loss_mlp": 0.35083008, + "step": 261, + "time_per_iteration": 2.6989662647247314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01324978, + "balance_loss_mlp": 1.2880708, + "epoch": 0.05040400153905348, + "flos": 633029409792.0, + "grad_norm": 0.0625839964239575, + "language_loss": 1.00387836, + "learning_rate": 0.0009989098458238765, + "loss": 1.01712811, + "num_input_tokens_seen": 20931232, + "router_z_loss_mlp": 0.36914062, + "step": 262, + "time_per_iteration": 2.7581043243408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319841, + "balance_loss_mlp": 1.28395867, + "epoch": 0.050596383224317046, + "flos": 553344307200.0, + "grad_norm": 0.06067150197267865, + "language_loss": 0.99905968, + "learning_rate": 0.0009988891875091998, + "loss": 1.01225805, + "num_input_tokens_seen": 21012672, + "router_z_loss_mlp": 0.35913086, + "step": 263, + "time_per_iteration": 2.7601842880249023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0131413, + "balance_loss_mlp": 1.27793837, + "epoch": 0.050788764909580605, + "flos": 549389689344.0, + "grad_norm": 0.07440292928735547, + "language_loss": 0.94277728, + "learning_rate": 0.0009988683355085636, + "loss": 0.95591855, + "num_input_tokens_seen": 21088592, + "router_z_loss_mlp": 0.36206055, + "step": 264, + "time_per_iteration": 2.7262909412384033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01277315, + "balance_loss_mlp": 1.24248254, + "epoch": 0.05098114659484417, + "flos": 604812870144.0, + "grad_norm": 0.06984595792035174, + "language_loss": 1.02861905, + "learning_rate": 0.000998847289830063, + "loss": 1.04139221, + "num_input_tokens_seen": 21169840, + "router_z_loss_mlp": 0.34838867, + "step": 265, + "time_per_iteration": 2.8318397998809814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01256298, + "balance_loss_mlp": 1.22272849, + "epoch": 0.05117352828010773, + "flos": 438317775360.0, + "grad_norm": 0.08677906198544101, + "language_loss": 0.95779377, + "learning_rate": 0.0009988260504818682, + "loss": 0.9703567, + "num_input_tokens_seen": 21236144, + "router_z_loss_mlp": 0.3359375, + "step": 266, + "time_per_iteration": 2.5212388038635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220367, + "balance_loss_mlp": 1.19046903, + "epoch": 0.0513659099653713, + "flos": 504784300032.0, + "grad_norm": 0.09456939977029206, + "language_loss": 1.01958096, + "learning_rate": 0.000998804617472226, + "loss": 1.03178465, + "num_input_tokens_seen": 21304864, + "router_z_loss_mlp": 0.29858398, + "step": 267, + "time_per_iteration": 2.649739980697632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169139, + "balance_loss_mlp": 1.14131606, + "epoch": 0.05155829165063486, + "flos": 695183344128.0, + "grad_norm": 0.07125411147685125, + "language_loss": 0.97574937, + "learning_rate": 0.0009987829908094568, + "loss": 0.98744082, + "num_input_tokens_seen": 21377504, + "router_z_loss_mlp": 0.27856445, + "step": 268, + "time_per_iteration": 2.816098690032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119703, + "balance_loss_mlp": 1.09379935, + "epoch": 0.051750673335898424, + "flos": 1347751830528.0, + "grad_norm": 0.06583247177587333, + "language_loss": 1.04151332, + "learning_rate": 0.0009987611705019569, + "loss": 1.05271029, + "num_input_tokens_seen": 21463840, + "router_z_loss_mlp": 0.25927734, + "step": 269, + "time_per_iteration": 4.478148460388184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103433, + "balance_loss_mlp": 1.08008027, + "epoch": 0.051943055021161984, + "flos": 489362936832.0, + "grad_norm": 0.06787757239342199, + "language_loss": 1.02481639, + "learning_rate": 0.0009987391565581978, + "loss": 1.03585076, + "num_input_tokens_seen": 21531184, + "router_z_loss_mlp": 0.23364258, + "step": 270, + "time_per_iteration": 2.5662009716033936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111004, + "balance_loss_mlp": 1.08859241, + "epoch": 0.05213543670642555, + "flos": 545504882688.0, + "grad_norm": 0.08198896814085149, + "language_loss": 0.9504528, + "learning_rate": 0.000998716948986726, + "loss": 0.96156287, + "num_input_tokens_seen": 21612224, + "router_z_loss_mlp": 0.22424316, + "step": 271, + "time_per_iteration": 2.7815349102020264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158552, + "balance_loss_mlp": 1.13697529, + "epoch": 0.05232781839168911, + "flos": 603285179904.0, + "grad_norm": 0.07646156534985457, + "language_loss": 0.97641921, + "learning_rate": 0.0009986945477961633, + "loss": 0.9880048, + "num_input_tokens_seen": 21681024, + "router_z_loss_mlp": 0.21569824, + "step": 272, + "time_per_iteration": 2.694547414779663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188724, + "balance_loss_mlp": 1.16735017, + "epoch": 0.052520200076952676, + "flos": 538218307584.0, + "grad_norm": 0.07381807258867126, + "language_loss": 1.02498066, + "learning_rate": 0.0009986719529952066, + "loss": 1.03686786, + "num_input_tokens_seen": 21761616, + "router_z_loss_mlp": 0.21386719, + "step": 273, + "time_per_iteration": 2.8339192867279053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121752, + "balance_loss_mlp": 1.19785035, + "epoch": 0.052712581762216236, + "flos": 463148513280.0, + "grad_norm": 0.0738352941440963, + "language_loss": 1.01808548, + "learning_rate": 0.000998649164592628, + "loss": 1.03026068, + "num_input_tokens_seen": 21828416, + "router_z_loss_mlp": 0.19677734, + "step": 274, + "time_per_iteration": 2.60577130317688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236713, + "balance_loss_mlp": 1.21763909, + "epoch": 0.0529049634474798, + "flos": 547749927936.0, + "grad_norm": 0.08134169766286939, + "language_loss": 0.99272913, + "learning_rate": 0.0009986261825972748, + "loss": 1.00509632, + "num_input_tokens_seen": 21901600, + "router_z_loss_mlp": 0.19055176, + "step": 275, + "time_per_iteration": 2.652561664581299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196689, + "balance_loss_mlp": 1.17834246, + "epoch": 0.05309734513274336, + "flos": 617727320064.0, + "grad_norm": 0.09111845604121613, + "language_loss": 1.01860571, + "learning_rate": 0.000998603007018069, + "loss": 1.03057253, + "num_input_tokens_seen": 21979312, + "router_z_loss_mlp": 0.18334961, + "step": 276, + "time_per_iteration": 2.8293774127960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011443, + "balance_loss_mlp": 1.1273365, + "epoch": 0.05328972681800693, + "flos": 605220304896.0, + "grad_norm": 0.07377841396756965, + "language_loss": 0.99345076, + "learning_rate": 0.0009985796378640089, + "loss": 1.00489378, + "num_input_tokens_seen": 22053776, + "router_z_loss_mlp": 0.16955566, + "step": 277, + "time_per_iteration": 2.694716215133667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098185, + "balance_loss_mlp": 1.08067346, + "epoch": 0.05348210850327049, + "flos": 604197411840.0, + "grad_norm": 0.07074934963985437, + "language_loss": 0.99532163, + "learning_rate": 0.0009985560751441665, + "loss": 1.00630355, + "num_input_tokens_seen": 22134304, + "router_z_loss_mlp": 0.1751709, + "step": 278, + "time_per_iteration": 2.798563241958618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095446, + "balance_loss_mlp": 1.07736206, + "epoch": 0.053674490188534055, + "flos": 630480236544.0, + "grad_norm": 0.054749659326078955, + "language_loss": 1.01733184, + "learning_rate": 0.00099853231886769, + "loss": 1.02828622, + "num_input_tokens_seen": 22212896, + "router_z_loss_mlp": 0.1809082, + "step": 279, + "time_per_iteration": 2.780940532684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134885, + "balance_loss_mlp": 1.11744475, + "epoch": 0.053866871873797614, + "flos": 478939433472.0, + "grad_norm": 0.06375435082524677, + "language_loss": 1.01461124, + "learning_rate": 0.0009985083690438024, + "loss": 1.02595997, + "num_input_tokens_seen": 22287216, + "router_z_loss_mlp": 0.17443848, + "step": 280, + "time_per_iteration": 2.68762469291687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145965, + "balance_loss_mlp": 1.12913251, + "epoch": 0.054059253559061174, + "flos": 787673645568.0, + "grad_norm": 0.07384801764192533, + "language_loss": 0.92380941, + "learning_rate": 0.0009984842256818016, + "loss": 0.93526906, + "num_input_tokens_seen": 22370864, + "router_z_loss_mlp": 0.16845703, + "step": 281, + "time_per_iteration": 3.054032325744629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114791, + "balance_loss_mlp": 1.13080359, + "epoch": 0.05425163524432474, + "flos": 628076630016.0, + "grad_norm": 0.082175996598207, + "language_loss": 1.0314945, + "learning_rate": 0.0009984598887910613, + "loss": 1.04297376, + "num_input_tokens_seen": 22440080, + "router_z_loss_mlp": 0.17114258, + "step": 282, + "time_per_iteration": 2.7095611095428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144379, + "balance_loss_mlp": 1.12627149, + "epoch": 0.0544440169295883, + "flos": 615453161472.0, + "grad_norm": 0.06813866095032944, + "language_loss": 0.9902432, + "learning_rate": 0.0009984353583810297, + "loss": 1.00168693, + "num_input_tokens_seen": 22517936, + "router_z_loss_mlp": 0.18103027, + "step": 283, + "time_per_iteration": 2.804438829421997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124262, + "balance_loss_mlp": 1.10624945, + "epoch": 0.05463639861485187, + "flos": 647471549952.0, + "grad_norm": 0.10003204141391345, + "language_loss": 1.01340103, + "learning_rate": 0.0009984106344612302, + "loss": 1.02464366, + "num_input_tokens_seen": 22590480, + "router_z_loss_mlp": 0.18017578, + "step": 284, + "time_per_iteration": 2.7521376609802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109552, + "balance_loss_mlp": 1.07819879, + "epoch": 0.054828780300115426, + "flos": 796845883392.0, + "grad_norm": 0.07143310654982075, + "language_loss": 0.96421391, + "learning_rate": 0.0009983857170412615, + "loss": 0.97516906, + "num_input_tokens_seen": 22668144, + "router_z_loss_mlp": 0.17321777, + "step": 285, + "time_per_iteration": 2.9796621799468994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089942, + "balance_loss_mlp": 1.07363439, + "epoch": 0.05502116198537899, + "flos": 549414420480.0, + "grad_norm": 0.05224422052371224, + "language_loss": 0.95713383, + "learning_rate": 0.000998360606130798, + "loss": 0.96803325, + "num_input_tokens_seen": 22749648, + "router_z_loss_mlp": 0.16308594, + "step": 286, + "time_per_iteration": 2.7950801849365234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02908189, + "balance_loss_mlp": 2.83799911, + "epoch": 0.05521354367064255, + "flos": 1406967791616.0, + "grad_norm": 0.233188183772104, + "language_loss": 0.69073117, + "learning_rate": 0.0009983353017395877, + "loss": 0.71981305, + "num_input_tokens_seen": 22982752, + "router_z_loss_mlp": 0.703125, + "step": 287, + "time_per_iteration": 4.876653432846069 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179574, + "balance_loss_mlp": 1.1627655, + "epoch": 0.05540592535590612, + "flos": 645123197952.0, + "grad_norm": 0.17417830683261867, + "language_loss": 1.0204829, + "learning_rate": 0.0009983098038774552, + "loss": 1.03227878, + "num_input_tokens_seen": 23053584, + "router_z_loss_mlp": 0.16821289, + "step": 288, + "time_per_iteration": 2.7781550884246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02101154, + "balance_loss_mlp": 2.07540464, + "epoch": 0.05559830704116968, + "flos": 1510293413376.0, + "grad_norm": 0.1730100464590254, + "language_loss": 0.78170228, + "learning_rate": 0.0009982841125542993, + "loss": 0.80271375, + "num_input_tokens_seen": 23280256, + "router_z_loss_mlp": 0.2578125, + "step": 289, + "time_per_iteration": 4.801970481872559 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01338926, + "balance_loss_mlp": 1.32332134, + "epoch": 0.055790688726433245, + "flos": 508078379520.0, + "grad_norm": 0.11288123874753296, + "language_loss": 0.99586821, + "learning_rate": 0.0009982582277800948, + "loss": 1.00925756, + "num_input_tokens_seen": 23345760, + "router_z_loss_mlp": 0.15588379, + "step": 290, + "time_per_iteration": 2.6019012928009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01376714, + "balance_loss_mlp": 1.36076403, + "epoch": 0.055983070411696804, + "flos": 657570576384.0, + "grad_norm": 0.11158393407579077, + "language_loss": 1.06464982, + "learning_rate": 0.0009982321495648908, + "loss": 1.07841706, + "num_input_tokens_seen": 23420720, + "router_z_loss_mlp": 0.15942383, + "step": 291, + "time_per_iteration": 2.7833075523376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281101, + "balance_loss_mlp": 1.26441216, + "epoch": 0.05617545209696037, + "flos": 587051919360.0, + "grad_norm": 0.091490024999748, + "language_loss": 0.97375935, + "learning_rate": 0.0009982058779188115, + "loss": 0.98657036, + "num_input_tokens_seen": 23492576, + "router_z_loss_mlp": 0.16699219, + "step": 292, + "time_per_iteration": 2.700998067855835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223751, + "balance_loss_mlp": 1.20634639, + "epoch": 0.05636783378222393, + "flos": 611331217920.0, + "grad_norm": 0.09093545163733599, + "language_loss": 1.05090272, + "learning_rate": 0.0009981794128520567, + "loss": 1.06314015, + "num_input_tokens_seen": 23569824, + "router_z_loss_mlp": 0.17431641, + "step": 293, + "time_per_iteration": 2.769562244415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172918, + "balance_loss_mlp": 1.15501237, + "epoch": 0.0565602154674875, + "flos": 667847102976.0, + "grad_norm": 0.08200667246549262, + "language_loss": 1.02219713, + "learning_rate": 0.000998152754374901, + "loss": 1.03392649, + "num_input_tokens_seen": 23649984, + "router_z_loss_mlp": 0.17919922, + "step": 294, + "time_per_iteration": 2.8421483039855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140496, + "balance_loss_mlp": 1.12121987, + "epoch": 0.05675259715275106, + "flos": 616963474944.0, + "grad_norm": 0.06298459153201627, + "language_loss": 0.97706711, + "learning_rate": 0.0009981259024976943, + "loss": 0.98847204, + "num_input_tokens_seen": 23722032, + "router_z_loss_mlp": 0.19250488, + "step": 295, + "time_per_iteration": 2.709536552429199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131247, + "balance_loss_mlp": 1.11139894, + "epoch": 0.05694497883801462, + "flos": 751424214528.0, + "grad_norm": 0.13011693222478776, + "language_loss": 0.96307456, + "learning_rate": 0.0009980988572308612, + "loss": 0.97438705, + "num_input_tokens_seen": 23797376, + "router_z_loss_mlp": 0.19848633, + "step": 296, + "time_per_iteration": 2.9606993198394775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125313, + "balance_loss_mlp": 1.10492802, + "epoch": 0.05713736052327818, + "flos": 711669708288.0, + "grad_norm": 0.06808560063607492, + "language_loss": 0.9959082, + "learning_rate": 0.0009980716185849015, + "loss": 1.00716126, + "num_input_tokens_seen": 23880496, + "router_z_loss_mlp": 0.20385742, + "step": 297, + "time_per_iteration": 2.952467203140259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133548, + "balance_loss_mlp": 1.11424804, + "epoch": 0.05732974220854175, + "flos": 468737100288.0, + "grad_norm": 0.05570922928007862, + "language_loss": 0.95103967, + "learning_rate": 0.0009980441865703904, + "loss": 0.9623751, + "num_input_tokens_seen": 23950016, + "router_z_loss_mlp": 0.19299316, + "step": 298, + "time_per_iteration": 2.6629996299743652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125947, + "balance_loss_mlp": 1.10630131, + "epoch": 0.05752212389380531, + "flos": 601143441408.0, + "grad_norm": 0.06175770353433084, + "language_loss": 1.038656, + "learning_rate": 0.000998016561197978, + "loss": 1.04991555, + "num_input_tokens_seen": 24020064, + "router_z_loss_mlp": 0.19628906, + "step": 299, + "time_per_iteration": 2.7027034759521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122899, + "balance_loss_mlp": 1.10499382, + "epoch": 0.057714505579068875, + "flos": 678344799744.0, + "grad_norm": 0.07709513760197055, + "language_loss": 0.95715761, + "learning_rate": 0.0009979887424783895, + "loss": 0.96838653, + "num_input_tokens_seen": 24095360, + "router_z_loss_mlp": 0.17907715, + "step": 300, + "time_per_iteration": 2.8467562198638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122592, + "balance_loss_mlp": 1.10369694, + "epoch": 0.057906887264332435, + "flos": 595604316672.0, + "grad_norm": 0.05754387138467597, + "language_loss": 0.94804943, + "learning_rate": 0.0009979607304224248, + "loss": 0.95927536, + "num_input_tokens_seen": 24164608, + "router_z_loss_mlp": 0.18908691, + "step": 301, + "time_per_iteration": 2.7457566261291504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135958, + "balance_loss_mlp": 1.11577594, + "epoch": 0.058099268949596, + "flos": 551855904768.0, + "grad_norm": 0.06951393564289957, + "language_loss": 1.02452385, + "learning_rate": 0.000997932525040959, + "loss": 1.03588343, + "num_input_tokens_seen": 24233840, + "router_z_loss_mlp": 0.20166016, + "step": 302, + "time_per_iteration": 2.670464038848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123814, + "balance_loss_mlp": 1.10513425, + "epoch": 0.05829165063485956, + "flos": 507906671616.0, + "grad_norm": 0.06408930588753382, + "language_loss": 1.04041958, + "learning_rate": 0.000997904126344943, + "loss": 1.05165768, + "num_input_tokens_seen": 24302928, + "router_z_loss_mlp": 0.18676758, + "step": 303, + "time_per_iteration": 2.654275417327881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122557, + "balance_loss_mlp": 1.10432982, + "epoch": 0.05848403232012313, + "flos": 614949774336.0, + "grad_norm": 0.10902949066110783, + "language_loss": 1.00108004, + "learning_rate": 0.0009978755343454018, + "loss": 1.0123055, + "num_input_tokens_seen": 24377024, + "router_z_loss_mlp": 0.18212891, + "step": 304, + "time_per_iteration": 2.7061922550201416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118016, + "balance_loss_mlp": 1.10034943, + "epoch": 0.05867641400538669, + "flos": 499835902464.0, + "grad_norm": 0.07196511907519268, + "language_loss": 1.01183403, + "learning_rate": 0.0009978467490534355, + "loss": 1.02301419, + "num_input_tokens_seen": 24442736, + "router_z_loss_mlp": 0.17663574, + "step": 305, + "time_per_iteration": 2.5658843517303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118418, + "balance_loss_mlp": 1.09971452, + "epoch": 0.05886879569065025, + "flos": 531019072512.0, + "grad_norm": 0.05577021807863236, + "language_loss": 0.98775607, + "learning_rate": 0.00099781777048022, + "loss": 0.99894023, + "num_input_tokens_seen": 24514800, + "router_z_loss_mlp": 0.18713379, + "step": 306, + "time_per_iteration": 2.688661813735962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112614, + "balance_loss_mlp": 1.10866416, + "epoch": 0.05906117737591381, + "flos": 488811497472.0, + "grad_norm": 0.06489613907432343, + "language_loss": 0.99682212, + "learning_rate": 0.0009977885986370057, + "loss": 1.00808358, + "num_input_tokens_seen": 24581648, + "router_z_loss_mlp": 0.17480469, + "step": 307, + "time_per_iteration": 2.527008056640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129188, + "balance_loss_mlp": 1.11242771, + "epoch": 0.05925355906117737, + "flos": 591213150720.0, + "grad_norm": 0.060579194597163814, + "language_loss": 0.94911426, + "learning_rate": 0.000997759233535118, + "loss": 0.96040612, + "num_input_tokens_seen": 24658864, + "router_z_loss_mlp": 0.16772461, + "step": 308, + "time_per_iteration": 2.768683433532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108276, + "balance_loss_mlp": 1.09052539, + "epoch": 0.05944594074644094, + "flos": 563373522432.0, + "grad_norm": 0.074144120767366, + "language_loss": 1.01706028, + "learning_rate": 0.0009977296751859576, + "loss": 1.02814317, + "num_input_tokens_seen": 24735808, + "router_z_loss_mlp": 0.17749023, + "step": 309, + "time_per_iteration": 2.710550308227539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109964, + "balance_loss_mlp": 1.0817585, + "epoch": 0.0596383224317045, + "flos": 538483147776.0, + "grad_norm": 0.1012520362466171, + "language_loss": 1.03562367, + "learning_rate": 0.0009976999236009998, + "loss": 1.04662001, + "num_input_tokens_seen": 24807744, + "router_z_loss_mlp": 0.17895508, + "step": 310, + "time_per_iteration": 2.7346065044403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095396, + "balance_loss_mlp": 1.07697809, + "epoch": 0.059830704116968066, + "flos": 560684726784.0, + "grad_norm": 0.05903807060939984, + "language_loss": 1.05193245, + "learning_rate": 0.0009976699787917955, + "loss": 1.06288636, + "num_input_tokens_seen": 24876640, + "router_z_loss_mlp": 0.18408203, + "step": 311, + "time_per_iteration": 2.737165689468384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04018029, + "balance_loss_mlp": 3.94440532, + "epoch": 0.060023085802231625, + "flos": 1569759962112.0, + "grad_norm": 0.34396821433057967, + "language_loss": 0.73442996, + "learning_rate": 0.00099763984076997, + "loss": 0.77461016, + "num_input_tokens_seen": 25110864, + "router_z_loss_mlp": 0.734375, + "step": 312, + "time_per_iteration": 4.990010976791382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130575, + "balance_loss_mlp": 1.11010623, + "epoch": 0.06021546748749519, + "flos": 482415395328.0, + "grad_norm": 0.18656347991450223, + "language_loss": 0.97164261, + "learning_rate": 0.0009976095095472243, + "loss": 0.98294836, + "num_input_tokens_seen": 25179328, + "router_z_loss_mlp": 0.20458984, + "step": 313, + "time_per_iteration": 2.5596373081207275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143867, + "balance_loss_mlp": 1.12198031, + "epoch": 0.06040784917275875, + "flos": 619889407488.0, + "grad_norm": 0.10017394493353984, + "language_loss": 0.98154747, + "learning_rate": 0.0009975789851353334, + "loss": 0.9929862, + "num_input_tokens_seen": 25254128, + "router_z_loss_mlp": 0.21911621, + "step": 314, + "time_per_iteration": 2.783092498779297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113993, + "balance_loss_mlp": 1.11832976, + "epoch": 0.06060023085802232, + "flos": 483292721664.0, + "grad_norm": 0.12837029886330253, + "language_loss": 1.00706339, + "learning_rate": 0.0009975482675461487, + "loss": 1.01846266, + "num_input_tokens_seen": 25324624, + "router_z_loss_mlp": 0.21594238, + "step": 315, + "time_per_iteration": 2.6375765800476074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128184, + "balance_loss_mlp": 1.10697675, + "epoch": 0.06079261254328588, + "flos": 581620483584.0, + "grad_norm": 0.07139597701291463, + "language_loss": 0.9800331, + "learning_rate": 0.0009975173567915952, + "loss": 0.99131489, + "num_input_tokens_seen": 25393648, + "router_z_loss_mlp": 0.21228027, + "step": 316, + "time_per_iteration": 2.680223226547241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116284, + "balance_loss_mlp": 1.09438515, + "epoch": 0.060984994228549444, + "flos": 687492306432.0, + "grad_norm": 0.12898022133672052, + "language_loss": 0.92624593, + "learning_rate": 0.000997486252883674, + "loss": 0.93740869, + "num_input_tokens_seen": 25469152, + "router_z_loss_mlp": 0.21887207, + "step": 317, + "time_per_iteration": 2.835162878036499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104615, + "balance_loss_mlp": 1.08325243, + "epoch": 0.061177375913813004, + "flos": 1314284327424.0, + "grad_norm": 0.06442728945451602, + "language_loss": 0.97186124, + "learning_rate": 0.0009974549558344602, + "loss": 0.98290741, + "num_input_tokens_seen": 25560944, + "router_z_loss_mlp": 0.21350098, + "step": 318, + "time_per_iteration": 3.6293551921844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105129, + "balance_loss_mlp": 1.08439815, + "epoch": 0.06136975759907657, + "flos": 574072040448.0, + "grad_norm": 0.08131052095693254, + "language_loss": 1.07145, + "learning_rate": 0.000997423465656105, + "loss": 1.08250129, + "num_input_tokens_seen": 25631424, + "router_z_loss_mlp": 0.20715332, + "step": 319, + "time_per_iteration": 2.7070071697235107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101786, + "balance_loss_mlp": 1.08168781, + "epoch": 0.06156213928434013, + "flos": 527281242624.0, + "grad_norm": 0.059301156484267634, + "language_loss": 1.04424822, + "learning_rate": 0.0009973917823608335, + "loss": 1.0552659, + "num_input_tokens_seen": 25698176, + "router_z_loss_mlp": 0.20092773, + "step": 320, + "time_per_iteration": 2.6225128173828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110531, + "balance_loss_mlp": 1.0897882, + "epoch": 0.061754520969603696, + "flos": 495238123008.0, + "grad_norm": 0.05387649814829365, + "language_loss": 0.98383266, + "learning_rate": 0.0009973599059609462, + "loss": 0.9949379, + "num_input_tokens_seen": 25773472, + "router_z_loss_mlp": 0.20739746, + "step": 321, + "time_per_iteration": 2.692152261734009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107693, + "balance_loss_mlp": 1.08798778, + "epoch": 0.061946902654867256, + "flos": 439839673344.0, + "grad_norm": 0.06112812680296507, + "language_loss": 0.9711749, + "learning_rate": 0.000997327836468819, + "loss": 0.98225188, + "num_input_tokens_seen": 25841088, + "router_z_loss_mlp": 0.19702148, + "step": 322, + "time_per_iteration": 2.5772383213043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110285, + "balance_loss_mlp": 1.0900557, + "epoch": 0.06213928434013082, + "flos": 598490961408.0, + "grad_norm": 0.0645434874295678, + "language_loss": 0.9942351, + "learning_rate": 0.000997295573896902, + "loss": 1.00533807, + "num_input_tokens_seen": 25919424, + "router_z_loss_mlp": 0.20239258, + "step": 323, + "time_per_iteration": 2.839282274246216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02259253, + "balance_loss_mlp": 2.20088792, + "epoch": 0.06233166602539438, + "flos": 1449393716736.0, + "grad_norm": 0.19547826226404627, + "language_loss": 0.8119604, + "learning_rate": 0.000997263118257721, + "loss": 0.83455294, + "num_input_tokens_seen": 26135504, + "router_z_loss_mlp": 0.58203125, + "step": 324, + "time_per_iteration": 4.67440938949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01888161, + "balance_loss_mlp": 1.83246601, + "epoch": 0.06252404771065795, + "flos": 1462504453632.0, + "grad_norm": 0.11962022052509429, + "language_loss": 0.78571939, + "learning_rate": 0.0009972304695638763, + "loss": 0.80460101, + "num_input_tokens_seen": 26358880, + "router_z_loss_mlp": 0.55859375, + "step": 325, + "time_per_iteration": 4.860283136367798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177486, + "balance_loss_mlp": 1.15595722, + "epoch": 0.06271642939592151, + "flos": 464059335168.0, + "grad_norm": 0.06272096910143152, + "language_loss": 0.93621421, + "learning_rate": 0.000997197627828043, + "loss": 0.94798911, + "num_input_tokens_seen": 26425888, + "router_z_loss_mlp": 0.2154541, + "step": 326, + "time_per_iteration": 2.5594961643218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205877, + "balance_loss_mlp": 1.18165386, + "epoch": 0.06290881108118507, + "flos": 532111776768.0, + "grad_norm": 0.08849931028565244, + "language_loss": 0.89414704, + "learning_rate": 0.0009971645930629716, + "loss": 0.90620589, + "num_input_tokens_seen": 26500656, + "router_z_loss_mlp": 0.2421875, + "step": 327, + "time_per_iteration": 2.7163310050964355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223238, + "balance_loss_mlp": 1.19748878, + "epoch": 0.06310119276644863, + "flos": 673262572032.0, + "grad_norm": 0.09892100413683627, + "language_loss": 1.02883804, + "learning_rate": 0.0009971313652814872, + "loss": 1.04107046, + "num_input_tokens_seen": 26577408, + "router_z_loss_mlp": 0.25769043, + "step": 328, + "time_per_iteration": 2.7786266803741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228803, + "balance_loss_mlp": 1.20175433, + "epoch": 0.0632935744517122, + "flos": 770404497408.0, + "grad_norm": 0.06852265531332852, + "language_loss": 0.99799907, + "learning_rate": 0.0009970979444964903, + "loss": 1.01028717, + "num_input_tokens_seen": 26652048, + "router_z_loss_mlp": 0.27050781, + "step": 329, + "time_per_iteration": 2.952498197555542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235649, + "balance_loss_mlp": 1.2062993, + "epoch": 0.06348595613697576, + "flos": 561649393152.0, + "grad_norm": 0.09680127661829774, + "language_loss": 1.0121367, + "learning_rate": 0.0009970643307209556, + "loss": 1.02449322, + "num_input_tokens_seen": 26728192, + "router_z_loss_mlp": 0.29296875, + "step": 330, + "time_per_iteration": 2.78190541267395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240935, + "balance_loss_mlp": 1.20970178, + "epoch": 0.06367833782223932, + "flos": 675891730944.0, + "grad_norm": 0.08786526055569537, + "language_loss": 0.9788332, + "learning_rate": 0.0009970305239679334, + "loss": 0.99124253, + "num_input_tokens_seen": 26798016, + "router_z_loss_mlp": 0.31201172, + "step": 331, + "time_per_iteration": 2.805845022201538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228576, + "balance_loss_mlp": 1.19891691, + "epoch": 0.06387071950750288, + "flos": 495035891712.0, + "grad_norm": 0.10390832636325384, + "language_loss": 1.03124022, + "learning_rate": 0.0009969965242505483, + "loss": 1.04352593, + "num_input_tokens_seen": 26867536, + "router_z_loss_mlp": 0.29614258, + "step": 332, + "time_per_iteration": 2.676711082458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207199, + "balance_loss_mlp": 1.1777302, + "epoch": 0.06406310119276645, + "flos": 533170985472.0, + "grad_norm": 0.07105898063788767, + "language_loss": 0.98331362, + "learning_rate": 0.0009969623315820007, + "loss": 0.99538565, + "num_input_tokens_seen": 26941216, + "router_z_loss_mlp": 0.29418945, + "step": 333, + "time_per_iteration": 2.6556739807128906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118815, + "balance_loss_mlp": 1.16106582, + "epoch": 0.06425548287803001, + "flos": 455940513792.0, + "grad_norm": 0.08067516684621483, + "language_loss": 0.99160993, + "learning_rate": 0.000996927945975565, + "loss": 1.0034914, + "num_input_tokens_seen": 27006560, + "router_z_loss_mlp": 0.27124023, + "step": 334, + "time_per_iteration": 2.5398526191711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147495, + "balance_loss_mlp": 1.1214596, + "epoch": 0.06444786456329357, + "flos": 559817574912.0, + "grad_norm": 0.08169715789363684, + "language_loss": 0.96174645, + "learning_rate": 0.0009968933674445906, + "loss": 0.97322142, + "num_input_tokens_seen": 27076400, + "router_z_loss_mlp": 0.26062012, + "step": 335, + "time_per_iteration": 2.6592860221862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112932, + "balance_loss_mlp": 1.0879097, + "epoch": 0.06464024624855713, + "flos": 665769383424.0, + "grad_norm": 0.07104021966044574, + "language_loss": 0.97756392, + "learning_rate": 0.0009968585960025028, + "loss": 0.98869324, + "num_input_tokens_seen": 27158672, + "router_z_loss_mlp": 0.25036621, + "step": 336, + "time_per_iteration": 2.9279658794403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01860024, + "balance_loss_mlp": 1.84323907, + "epoch": 0.0648326279338207, + "flos": 1520578704384.0, + "grad_norm": 0.14426901756633248, + "language_loss": 0.77653188, + "learning_rate": 0.0009968236316628006, + "loss": 0.7951321, + "num_input_tokens_seen": 27380592, + "router_z_loss_mlp": 0.16796875, + "step": 337, + "time_per_iteration": 4.810914993286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101739, + "balance_loss_mlp": 1.07948256, + "epoch": 0.06502500961908426, + "flos": 1142872768512.0, + "grad_norm": 0.058812216055980165, + "language_loss": 0.95864177, + "learning_rate": 0.0009967884744390583, + "loss": 0.96965921, + "num_input_tokens_seen": 27469984, + "router_z_loss_mlp": 0.22265625, + "step": 338, + "time_per_iteration": 3.512282371520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146504, + "balance_loss_mlp": 1.12267399, + "epoch": 0.06521739130434782, + "flos": 582339248640.0, + "grad_norm": 0.10793578588091769, + "language_loss": 0.97449529, + "learning_rate": 0.0009967531243449256, + "loss": 0.98596036, + "num_input_tokens_seen": 27543904, + "router_z_loss_mlp": 0.23828125, + "step": 339, + "time_per_iteration": 2.712907075881958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154087, + "balance_loss_mlp": 1.12950587, + "epoch": 0.06540977298961138, + "flos": 497398800384.0, + "grad_norm": 0.06396927661276222, + "language_loss": 1.04641414, + "learning_rate": 0.000996717581394126, + "loss": 1.05795503, + "num_input_tokens_seen": 27609888, + "router_z_loss_mlp": 0.24584961, + "step": 340, + "time_per_iteration": 2.5783133506774902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168509, + "balance_loss_mlp": 1.14584756, + "epoch": 0.06560215467487496, + "flos": 542613855744.0, + "grad_norm": 0.07568553531769329, + "language_loss": 1.05092287, + "learning_rate": 0.000996681845600459, + "loss": 1.062608, + "num_input_tokens_seen": 27683936, + "router_z_loss_mlp": 0.2265625, + "step": 341, + "time_per_iteration": 2.6543757915496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115357, + "balance_loss_mlp": 1.13118291, + "epoch": 0.06579453636013852, + "flos": 413230961664.0, + "grad_norm": 0.06593832485574395, + "language_loss": 0.97027373, + "learning_rate": 0.0009966459169777982, + "loss": 0.9818095, + "num_input_tokens_seen": 27747840, + "router_z_loss_mlp": 0.22387695, + "step": 342, + "time_per_iteration": 2.5120761394500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141132, + "balance_loss_mlp": 1.11848283, + "epoch": 0.06598691804540208, + "flos": 560354457600.0, + "grad_norm": 0.055115078659976495, + "language_loss": 1.05281377, + "learning_rate": 0.0009966097955400924, + "loss": 1.0642252, + "num_input_tokens_seen": 27819728, + "router_z_loss_mlp": 0.22644043, + "step": 343, + "time_per_iteration": 2.6954751014709473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111133, + "balance_loss_mlp": 1.08904982, + "epoch": 0.06617929973066564, + "flos": 571789117440.0, + "grad_norm": 0.06176008438986438, + "language_loss": 0.99064481, + "learning_rate": 0.0009965734813013652, + "loss": 1.00175822, + "num_input_tokens_seen": 27893536, + "router_z_loss_mlp": 0.22277832, + "step": 344, + "time_per_iteration": 2.8235929012298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090293, + "balance_loss_mlp": 1.06726193, + "epoch": 0.06637168141592921, + "flos": 490234470912.0, + "grad_norm": 0.05365164831273283, + "language_loss": 1.01308548, + "learning_rate": 0.0009965369742757151, + "loss": 1.02398837, + "num_input_tokens_seen": 27960976, + "router_z_loss_mlp": 0.23022461, + "step": 345, + "time_per_iteration": 2.5708556175231934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078086, + "balance_loss_mlp": 1.05727243, + "epoch": 0.06656406310119277, + "flos": 1078735656960.0, + "grad_norm": 0.04968829319439664, + "language_loss": 0.97902787, + "learning_rate": 0.0009965002744773152, + "loss": 0.98980874, + "num_input_tokens_seen": 28050864, + "router_z_loss_mlp": 0.20812988, + "step": 346, + "time_per_iteration": 3.4984121322631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086945, + "balance_loss_mlp": 1.06450987, + "epoch": 0.06675644478645633, + "flos": 513421065216.0, + "grad_norm": 0.06258978415695335, + "language_loss": 0.95138037, + "learning_rate": 0.0009964633819204139, + "loss": 0.96224982, + "num_input_tokens_seen": 28122448, + "router_z_loss_mlp": 0.22436523, + "step": 347, + "time_per_iteration": 2.6866109371185303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01752926, + "balance_loss_mlp": 1.73108697, + "epoch": 0.06694882647171989, + "flos": 1446359943168.0, + "grad_norm": 0.11694655230354783, + "language_loss": 0.81801116, + "learning_rate": 0.0009964262966193338, + "loss": 0.83554041, + "num_input_tokens_seen": 28350352, + "router_z_loss_mlp": 0.21875, + "step": 348, + "time_per_iteration": 4.935550928115845 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01583198, + "balance_loss_mlp": 1.56116796, + "epoch": 0.06714120815698346, + "flos": 1551230784000.0, + "grad_norm": 0.0989027294649474, + "language_loss": 0.75153887, + "learning_rate": 0.000996389018588473, + "loss": 0.76737082, + "num_input_tokens_seen": 28585584, + "router_z_loss_mlp": 0.22070312, + "step": 349, + "time_per_iteration": 4.891008615493774 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149858, + "balance_loss_mlp": 1.12826955, + "epoch": 0.06733358984224702, + "flos": 879689673216.0, + "grad_norm": 0.07075764146586616, + "language_loss": 0.94920838, + "learning_rate": 0.000996351547842304, + "loss": 0.96070701, + "num_input_tokens_seen": 28672512, + "router_z_loss_mlp": 0.21582031, + "step": 350, + "time_per_iteration": 3.156322717666626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192552, + "balance_loss_mlp": 1.17055774, + "epoch": 0.06752597152751058, + "flos": 518654651904.0, + "grad_norm": 0.09040238598346795, + "language_loss": 0.93423587, + "learning_rate": 0.0009963138843953744, + "loss": 0.94616139, + "num_input_tokens_seen": 28741520, + "router_z_loss_mlp": 0.2199707, + "step": 351, + "time_per_iteration": 2.610987663269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206077, + "balance_loss_mlp": 1.18405879, + "epoch": 0.06771835321277414, + "flos": 539366266368.0, + "grad_norm": 0.08658544591035036, + "language_loss": 0.97852194, + "learning_rate": 0.000996276028262306, + "loss": 0.9905827, + "num_input_tokens_seen": 28814912, + "router_z_loss_mlp": 0.22021484, + "step": 352, + "time_per_iteration": 2.8413686752319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166048, + "balance_loss_mlp": 1.14382768, + "epoch": 0.0679107348980377, + "flos": 460430604288.0, + "grad_norm": 0.09117082479319542, + "language_loss": 1.04269946, + "learning_rate": 0.0009962379794577964, + "loss": 1.05435991, + "num_input_tokens_seen": 28882192, + "router_z_loss_mlp": 0.22216797, + "step": 353, + "time_per_iteration": 2.591372489929199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114388, + "balance_loss_mlp": 1.12227976, + "epoch": 0.06810311658330127, + "flos": 635601752064.0, + "grad_norm": 0.05781909345233015, + "language_loss": 0.94169199, + "learning_rate": 0.000996199737996617, + "loss": 0.95313084, + "num_input_tokens_seen": 28968576, + "router_z_loss_mlp": 0.21630859, + "step": 354, + "time_per_iteration": 2.9088492393493652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125411, + "balance_loss_mlp": 1.10420346, + "epoch": 0.06829549826856483, + "flos": 464443448832.0, + "grad_norm": 0.06770201052263504, + "language_loss": 1.03043509, + "learning_rate": 0.0009961613038936149, + "loss": 1.04168916, + "num_input_tokens_seen": 29036160, + "router_z_loss_mlp": 0.2121582, + "step": 355, + "time_per_iteration": 2.571904420852661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110613, + "balance_loss_mlp": 1.08917904, + "epoch": 0.06848787995382839, + "flos": 634335929856.0, + "grad_norm": 0.06097004840688574, + "language_loss": 0.95565176, + "learning_rate": 0.000996122677163711, + "loss": 0.96675789, + "num_input_tokens_seen": 29112048, + "router_z_loss_mlp": 0.21435547, + "step": 356, + "time_per_iteration": 2.794982671737671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107296, + "balance_loss_mlp": 1.08667266, + "epoch": 0.06868026163909195, + "flos": 806023913472.0, + "grad_norm": 0.08020973782133771, + "language_loss": 1.01095176, + "learning_rate": 0.000996083857821902, + "loss": 1.02202487, + "num_input_tokens_seen": 29190960, + "router_z_loss_mlp": 0.20629883, + "step": 357, + "time_per_iteration": 3.007086753845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101637, + "balance_loss_mlp": 1.08076346, + "epoch": 0.06887264332435553, + "flos": 438997252608.0, + "grad_norm": 0.08125476198078858, + "language_loss": 0.99797714, + "learning_rate": 0.0009960448458832588, + "loss": 1.00899351, + "num_input_tokens_seen": 29262832, + "router_z_loss_mlp": 0.2088623, + "step": 358, + "time_per_iteration": 2.699530601501465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098146, + "balance_loss_mlp": 1.07872701, + "epoch": 0.06906502500961909, + "flos": 484513463808.0, + "grad_norm": 0.06827746260367892, + "language_loss": 0.99188638, + "learning_rate": 0.000996005641362927, + "loss": 1.00286782, + "num_input_tokens_seen": 29329552, + "router_z_loss_mlp": 0.1940918, + "step": 359, + "time_per_iteration": 2.5541014671325684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103345, + "balance_loss_mlp": 1.0841639, + "epoch": 0.06925740669488265, + "flos": 733293706752.0, + "grad_norm": 0.08731085845928575, + "language_loss": 1.02303529, + "learning_rate": 0.0009959662442761274, + "loss": 1.0340687, + "num_input_tokens_seen": 29410784, + "router_z_loss_mlp": 0.19189453, + "step": 360, + "time_per_iteration": 2.906623363494873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093844, + "balance_loss_mlp": 1.07268476, + "epoch": 0.0694497883801462, + "flos": 552127947264.0, + "grad_norm": 0.06697663210144707, + "language_loss": 0.9595629, + "learning_rate": 0.000995926654638155, + "loss": 0.97050136, + "num_input_tokens_seen": 29486992, + "router_z_loss_mlp": 0.21179199, + "step": 361, + "time_per_iteration": 2.793663501739502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082773, + "balance_loss_mlp": 1.06236482, + "epoch": 0.06964217006540978, + "flos": 677708992512.0, + "grad_norm": 0.06860924301964295, + "language_loss": 0.98198265, + "learning_rate": 0.00099588687246438, + "loss": 0.99281037, + "num_input_tokens_seen": 29557232, + "router_z_loss_mlp": 0.20410156, + "step": 362, + "time_per_iteration": 2.828139305114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085274, + "balance_loss_mlp": 1.06330371, + "epoch": 0.06983455175067334, + "flos": 523987163136.0, + "grad_norm": 0.08747541291209461, + "language_loss": 1.04803789, + "learning_rate": 0.0009958468977702471, + "loss": 1.0588907, + "num_input_tokens_seen": 29625344, + "router_z_loss_mlp": 0.21972656, + "step": 363, + "time_per_iteration": 2.5759966373443604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02224374, + "balance_loss_mlp": 2.20682669, + "epoch": 0.0700269334359369, + "flos": 1575943658496.0, + "grad_norm": 0.2746548069890379, + "language_loss": 0.79734707, + "learning_rate": 0.0009958067305712761, + "loss": 0.81959081, + "num_input_tokens_seen": 29843664, + "router_z_loss_mlp": 0.17578125, + "step": 364, + "time_per_iteration": 4.782835245132446 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134514, + "balance_loss_mlp": 1.11340213, + "epoch": 0.07021931512120046, + "flos": 1012848274944.0, + "grad_norm": 0.08586169827549085, + "language_loss": 0.93286598, + "learning_rate": 0.0009957663708830612, + "loss": 0.94421113, + "num_input_tokens_seen": 29927152, + "router_z_loss_mlp": 0.21105957, + "step": 365, + "time_per_iteration": 3.2484283447265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116189, + "balance_loss_mlp": 1.13884652, + "epoch": 0.07041169680646403, + "flos": 822622348800.0, + "grad_norm": 0.09941073368395695, + "language_loss": 0.97043455, + "learning_rate": 0.0009957258187212714, + "loss": 0.98205346, + "num_input_tokens_seen": 30004928, + "router_z_loss_mlp": 0.23034668, + "step": 366, + "time_per_iteration": 3.009479522705078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01756688, + "balance_loss_mlp": 1.7255981, + "epoch": 0.07060407849172759, + "flos": 1413670993920.0, + "grad_norm": 0.12374795181042475, + "language_loss": 0.79194862, + "learning_rate": 0.0009956850741016502, + "loss": 0.80951542, + "num_input_tokens_seen": 30230256, + "router_z_loss_mlp": 0.31054688, + "step": 367, + "time_per_iteration": 4.82874608039856 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152073, + "balance_loss_mlp": 1.13087749, + "epoch": 0.07079646017699115, + "flos": 512652837888.0, + "grad_norm": 0.06786716904588838, + "language_loss": 0.93450886, + "learning_rate": 0.0009956441370400167, + "loss": 0.94602954, + "num_input_tokens_seen": 30301200, + "router_z_loss_mlp": 0.21191406, + "step": 368, + "time_per_iteration": 2.6226603984832764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153965, + "balance_loss_mlp": 1.13158989, + "epoch": 0.07098884186225471, + "flos": 540240772608.0, + "grad_norm": 0.08343626294497461, + "language_loss": 0.99467343, + "learning_rate": 0.0009956030075522636, + "loss": 1.00621307, + "num_input_tokens_seen": 30377024, + "router_z_loss_mlp": 0.22375488, + "step": 369, + "time_per_iteration": 2.7128794193267822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142856, + "balance_loss_mlp": 1.12137485, + "epoch": 0.07118122354751828, + "flos": 548419230720.0, + "grad_norm": 0.07464528715750075, + "language_loss": 0.98955953, + "learning_rate": 0.0009955616856543587, + "loss": 1.00098813, + "num_input_tokens_seen": 30448896, + "router_z_loss_mlp": 0.21472168, + "step": 370, + "time_per_iteration": 2.613138198852539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118555, + "balance_loss_mlp": 1.0958215, + "epoch": 0.07137360523278184, + "flos": 620612554752.0, + "grad_norm": 0.056434914921328155, + "language_loss": 0.91880834, + "learning_rate": 0.0009955201713623448, + "loss": 0.92999387, + "num_input_tokens_seen": 30523584, + "router_z_loss_mlp": 0.22717285, + "step": 371, + "time_per_iteration": 2.747133255004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01746336, + "balance_loss_mlp": 1.72154021, + "epoch": 0.0715659869180454, + "flos": 1501850115072.0, + "grad_norm": 0.08669176596007007, + "language_loss": 0.76672721, + "learning_rate": 0.000995478464692339, + "loss": 0.78419054, + "num_input_tokens_seen": 30757920, + "router_z_loss_mlp": 0.24707031, + "step": 372, + "time_per_iteration": 4.931428670883179 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102855, + "balance_loss_mlp": 1.08040774, + "epoch": 0.07175836860330896, + "flos": 495246887424.0, + "grad_norm": 0.07044890130803105, + "language_loss": 1.05121827, + "learning_rate": 0.0009954365656605333, + "loss": 1.06224692, + "num_input_tokens_seen": 30824960, + "router_z_loss_mlp": 0.22436523, + "step": 373, + "time_per_iteration": 2.550243616104126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118244, + "balance_loss_mlp": 1.09438992, + "epoch": 0.07195075028857253, + "flos": 785387902464.0, + "grad_norm": 0.05415547127036835, + "language_loss": 0.98150015, + "learning_rate": 0.0009953944742831947, + "loss": 0.99268264, + "num_input_tokens_seen": 30902224, + "router_z_loss_mlp": 0.23864746, + "step": 374, + "time_per_iteration": 2.9659459590911865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125507, + "balance_loss_mlp": 1.10202336, + "epoch": 0.0721431319738361, + "flos": 592799067648.0, + "grad_norm": 0.07003669353380264, + "language_loss": 1.01441097, + "learning_rate": 0.0009953521905766642, + "loss": 1.02566612, + "num_input_tokens_seen": 30984784, + "router_z_loss_mlp": 0.23486328, + "step": 375, + "time_per_iteration": 2.942763566970825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117119, + "balance_loss_mlp": 1.09393334, + "epoch": 0.07233551365909965, + "flos": 547981272576.0, + "grad_norm": 0.06343477824222313, + "language_loss": 0.99901861, + "learning_rate": 0.0009953097145573577, + "loss": 1.01018989, + "num_input_tokens_seen": 31055376, + "router_z_loss_mlp": 0.23193359, + "step": 376, + "time_per_iteration": 2.6275272369384766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113711, + "balance_loss_mlp": 1.09023869, + "epoch": 0.07252789534436321, + "flos": 957170428416.0, + "grad_norm": 0.0678891965164594, + "language_loss": 0.97798675, + "learning_rate": 0.000995267046241766, + "loss": 0.98912394, + "num_input_tokens_seen": 31144944, + "router_z_loss_mlp": 0.23474121, + "step": 377, + "time_per_iteration": 3.2014975547790527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096997, + "balance_loss_mlp": 1.07496762, + "epoch": 0.07272027702962677, + "flos": 507398902272.0, + "grad_norm": 0.0806519998399971, + "language_loss": 0.97275257, + "learning_rate": 0.0009952241856464547, + "loss": 0.98372257, + "num_input_tokens_seen": 31213392, + "router_z_loss_mlp": 0.22045898, + "step": 378, + "time_per_iteration": 2.6189732551574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109641, + "balance_loss_mlp": 1.0746069, + "epoch": 0.07291265871489035, + "flos": 612128558592.0, + "grad_norm": 0.0691049335661606, + "language_loss": 1.04592681, + "learning_rate": 0.0009951811327880632, + "loss": 1.05689096, + "num_input_tokens_seen": 31289840, + "router_z_loss_mlp": 0.21826172, + "step": 379, + "time_per_iteration": 2.7411558628082275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092071, + "balance_loss_mlp": 1.07025611, + "epoch": 0.0731050404001539, + "flos": 495502963200.0, + "grad_norm": 0.05765504670581196, + "language_loss": 0.97682816, + "learning_rate": 0.0009951378876833063, + "loss": 0.98774892, + "num_input_tokens_seen": 31357600, + "router_z_loss_mlp": 0.21813965, + "step": 380, + "time_per_iteration": 2.6211278438568115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081575, + "balance_loss_mlp": 1.06068945, + "epoch": 0.07329742208541747, + "flos": 639677205504.0, + "grad_norm": 0.06809750593205881, + "language_loss": 1.04190159, + "learning_rate": 0.0009950944503489736, + "loss": 1.05271733, + "num_input_tokens_seen": 31428896, + "router_z_loss_mlp": 0.20898438, + "step": 381, + "time_per_iteration": 2.7533762454986572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081401, + "balance_loss_mlp": 1.0607307, + "epoch": 0.07348980377068103, + "flos": 815999284224.0, + "grad_norm": 0.06607035824886899, + "language_loss": 0.98459697, + "learning_rate": 0.0009950508208019285, + "loss": 0.99541104, + "num_input_tokens_seen": 31507424, + "router_z_loss_mlp": 0.20678711, + "step": 382, + "time_per_iteration": 2.9885637760162354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073667, + "balance_loss_mlp": 1.05369973, + "epoch": 0.0736821854559446, + "flos": 508383917568.0, + "grad_norm": 0.05970909775769663, + "language_loss": 1.02745128, + "learning_rate": 0.0009950069990591096, + "loss": 1.03818798, + "num_input_tokens_seen": 31576768, + "router_z_loss_mlp": 0.19958496, + "step": 383, + "time_per_iteration": 2.6111788749694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01835936, + "balance_loss_mlp": 1.8101871, + "epoch": 0.07387456714120816, + "flos": 1553801716224.0, + "grad_norm": 0.167122487372618, + "language_loss": 0.76401371, + "learning_rate": 0.0009949629851375302, + "loss": 0.78237301, + "num_input_tokens_seen": 31797312, + "router_z_loss_mlp": 0.2578125, + "step": 384, + "time_per_iteration": 4.859915494918823 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116619, + "balance_loss_mlp": 1.09575748, + "epoch": 0.07406694882647172, + "flos": 525219489792.0, + "grad_norm": 0.0799084124695288, + "language_loss": 0.96017051, + "learning_rate": 0.0009949187790542777, + "loss": 0.97133672, + "num_input_tokens_seen": 31869568, + "router_z_loss_mlp": 0.20861816, + "step": 385, + "time_per_iteration": 2.6976191997528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124239, + "balance_loss_mlp": 1.10322285, + "epoch": 0.07425933051173528, + "flos": 497468611584.0, + "grad_norm": 0.08753491640442414, + "language_loss": 0.91745877, + "learning_rate": 0.0009948743808265148, + "loss": 0.92870116, + "num_input_tokens_seen": 31941712, + "router_z_loss_mlp": 0.21020508, + "step": 386, + "time_per_iteration": 2.6870572566986084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113476, + "balance_loss_mlp": 1.09249496, + "epoch": 0.07445171219699885, + "flos": 504740630016.0, + "grad_norm": 0.05063210924529089, + "language_loss": 1.0156467, + "learning_rate": 0.0009948297904714782, + "loss": 1.02678132, + "num_input_tokens_seen": 32015232, + "router_z_loss_mlp": 0.20996094, + "step": 387, + "time_per_iteration": 2.668027639389038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097529, + "balance_loss_mlp": 1.07642913, + "epoch": 0.07464409388226241, + "flos": 553693515264.0, + "grad_norm": 0.06830922509793466, + "language_loss": 0.93493366, + "learning_rate": 0.0009947850080064796, + "loss": 0.9459089, + "num_input_tokens_seen": 32094640, + "router_z_loss_mlp": 0.21105957, + "step": 388, + "time_per_iteration": 2.79836106300354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098078, + "balance_loss_mlp": 1.07695365, + "epoch": 0.07483647556752597, + "flos": 776511028224.0, + "grad_norm": 0.06471398355705121, + "language_loss": 0.98276728, + "learning_rate": 0.0009947400334489047, + "loss": 0.99374807, + "num_input_tokens_seen": 32176640, + "router_z_loss_mlp": 0.21130371, + "step": 389, + "time_per_iteration": 3.0046355724334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095267, + "balance_loss_mlp": 1.07513261, + "epoch": 0.07502885725278953, + "flos": 612256596480.0, + "grad_norm": 0.0754939105077014, + "language_loss": 0.90272582, + "learning_rate": 0.0009946948668162145, + "loss": 0.91367853, + "num_input_tokens_seen": 32246704, + "router_z_loss_mlp": 0.20141602, + "step": 390, + "time_per_iteration": 2.724792003631592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091157, + "balance_loss_mlp": 1.06946135, + "epoch": 0.0752212389380531, + "flos": 688324552704.0, + "grad_norm": 0.05626120625508035, + "language_loss": 0.9463594, + "learning_rate": 0.0009946495081259441, + "loss": 0.95727098, + "num_input_tokens_seen": 32320032, + "router_z_loss_mlp": 0.21704102, + "step": 391, + "time_per_iteration": 2.8221397399902344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101684, + "balance_loss_mlp": 1.08008361, + "epoch": 0.07541362062331666, + "flos": 765362967552.0, + "grad_norm": 0.09729902751759628, + "language_loss": 0.97468722, + "learning_rate": 0.0009946039573957035, + "loss": 0.98570406, + "num_input_tokens_seen": 32398144, + "router_z_loss_mlp": 0.21606445, + "step": 392, + "time_per_iteration": 2.958655595779419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095785, + "balance_loss_mlp": 1.07572174, + "epoch": 0.07560600230858022, + "flos": 588460336128.0, + "grad_norm": 0.06468718689622391, + "language_loss": 0.94257009, + "learning_rate": 0.000994558214643177, + "loss": 0.95352793, + "num_input_tokens_seen": 32471984, + "router_z_loss_mlp": 0.20056152, + "step": 393, + "time_per_iteration": 2.752979040145874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101382, + "balance_loss_mlp": 1.08086586, + "epoch": 0.07579838399384378, + "flos": 749508028416.0, + "grad_norm": 0.06635223139616171, + "language_loss": 0.961483, + "learning_rate": 0.000994512279886123, + "loss": 0.97249681, + "num_input_tokens_seen": 32550176, + "router_z_loss_mlp": 0.20532227, + "step": 394, + "time_per_iteration": 3.055225133895874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104661, + "balance_loss_mlp": 1.08346581, + "epoch": 0.07599076567910736, + "flos": 523185440256.0, + "grad_norm": 0.06901630142642712, + "language_loss": 0.96749192, + "learning_rate": 0.0009944661531423758, + "loss": 0.97853857, + "num_input_tokens_seen": 32620768, + "router_z_loss_mlp": 0.2121582, + "step": 395, + "time_per_iteration": 2.6922085285186768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093271, + "balance_loss_mlp": 1.07248056, + "epoch": 0.07618314736437092, + "flos": 550812662784.0, + "grad_norm": 0.07064334209039194, + "language_loss": 0.95375401, + "learning_rate": 0.000994419834429843, + "loss": 0.96468663, + "num_input_tokens_seen": 32693472, + "router_z_loss_mlp": 0.20788574, + "step": 396, + "time_per_iteration": 2.6657333374023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092352, + "balance_loss_mlp": 1.0716933, + "epoch": 0.07637552904963447, + "flos": 697901253120.0, + "grad_norm": 0.07324881108467876, + "language_loss": 0.99580455, + "learning_rate": 0.0009943733237665069, + "loss": 1.00672793, + "num_input_tokens_seen": 32764976, + "router_z_loss_mlp": 0.20654297, + "step": 397, + "time_per_iteration": 2.8662500381469727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085112, + "balance_loss_mlp": 1.06454849, + "epoch": 0.07656791073489803, + "flos": 579066928128.0, + "grad_norm": 0.04790317238997088, + "language_loss": 0.98118353, + "learning_rate": 0.0009943266211704248, + "loss": 0.99203461, + "num_input_tokens_seen": 32853104, + "router_z_loss_mlp": 0.20568848, + "step": 398, + "time_per_iteration": 2.930741786956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094784, + "balance_loss_mlp": 1.07348132, + "epoch": 0.0767602924201616, + "flos": 416923711488.0, + "grad_norm": 0.09980331544781734, + "language_loss": 1.00422275, + "learning_rate": 0.000994279726659728, + "loss": 1.01517057, + "num_input_tokens_seen": 32919376, + "router_z_loss_mlp": 0.21325684, + "step": 399, + "time_per_iteration": 2.533738851547241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109036, + "balance_loss_mlp": 1.06970143, + "epoch": 0.07695267410542517, + "flos": 482671471104.0, + "grad_norm": 0.06967700921129397, + "language_loss": 0.97985041, + "learning_rate": 0.0009942326402526231, + "loss": 0.99075395, + "num_input_tokens_seen": 32988064, + "router_z_loss_mlp": 0.20666504, + "step": 400, + "time_per_iteration": 2.51460337638855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096542, + "balance_loss_mlp": 1.07526302, + "epoch": 0.07714505579068873, + "flos": 530742647808.0, + "grad_norm": 0.052652305799428985, + "language_loss": 0.96639109, + "learning_rate": 0.0009941853619673902, + "loss": 0.97735649, + "num_input_tokens_seen": 33059024, + "router_z_loss_mlp": 0.2130127, + "step": 401, + "time_per_iteration": 2.620939016342163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101036, + "balance_loss_mlp": 1.08012676, + "epoch": 0.07733743747595229, + "flos": 804635845632.0, + "grad_norm": 0.07273299487754427, + "language_loss": 0.99959278, + "learning_rate": 0.0009941378918223844, + "loss": 1.01060319, + "num_input_tokens_seen": 33137712, + "router_z_loss_mlp": 0.20910645, + "step": 402, + "time_per_iteration": 3.036839008331299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110477, + "balance_loss_mlp": 1.08423018, + "epoch": 0.07752981916121585, + "flos": 622192679424.0, + "grad_norm": 0.05767312217272775, + "language_loss": 0.93044209, + "learning_rate": 0.0009940902298360354, + "loss": 0.94148982, + "num_input_tokens_seen": 33211296, + "router_z_loss_mlp": 0.20544434, + "step": 403, + "time_per_iteration": 2.7703943252563477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097477, + "balance_loss_mlp": 1.07694876, + "epoch": 0.07772220084647942, + "flos": 727961195520.0, + "grad_norm": 0.0686344305115436, + "language_loss": 1.02037048, + "learning_rate": 0.0009940423760268473, + "loss": 1.03134525, + "num_input_tokens_seen": 33283632, + "router_z_loss_mlp": 0.2052002, + "step": 404, + "time_per_iteration": 2.8823602199554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085621, + "balance_loss_mlp": 1.06497431, + "epoch": 0.07791458253174298, + "flos": 555149984256.0, + "grad_norm": 0.10727031409308073, + "language_loss": 0.96142864, + "learning_rate": 0.0009939943304133982, + "loss": 0.97228479, + "num_input_tokens_seen": 33350704, + "router_z_loss_mlp": 0.20654297, + "step": 405, + "time_per_iteration": 2.63908314704895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078944, + "balance_loss_mlp": 1.05944133, + "epoch": 0.07810696421700654, + "flos": 552919495680.0, + "grad_norm": 0.08981509362846728, + "language_loss": 1.0302707, + "learning_rate": 0.0009939460930143416, + "loss": 1.04106021, + "num_input_tokens_seen": 33416272, + "router_z_loss_mlp": 0.19482422, + "step": 406, + "time_per_iteration": 2.63259220123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079269, + "balance_loss_mlp": 1.05927801, + "epoch": 0.0782993459022701, + "flos": 650323289088.0, + "grad_norm": 0.07212254231156982, + "language_loss": 0.96910775, + "learning_rate": 0.0009938976638484043, + "loss": 0.97990054, + "num_input_tokens_seen": 33501824, + "router_z_loss_mlp": 0.1998291, + "step": 407, + "time_per_iteration": 2.9489452838897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073065, + "balance_loss_mlp": 1.05239439, + "epoch": 0.07849172758753367, + "flos": 495926364672.0, + "grad_norm": 0.07302041560946317, + "language_loss": 0.9619081, + "learning_rate": 0.0009938490429343887, + "loss": 0.97263873, + "num_input_tokens_seen": 33571456, + "router_z_loss_mlp": 0.20666504, + "step": 408, + "time_per_iteration": 2.541293144226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078297, + "balance_loss_mlp": 1.05823374, + "epoch": 0.07868410927279723, + "flos": 577696389120.0, + "grad_norm": 0.06961121210328268, + "language_loss": 0.96404505, + "learning_rate": 0.0009938002302911709, + "loss": 0.974828, + "num_input_tokens_seen": 33646320, + "router_z_loss_mlp": 0.20056152, + "step": 409, + "time_per_iteration": 2.7890634536743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078628, + "balance_loss_mlp": 1.05869615, + "epoch": 0.07887649095806079, + "flos": 522698019840.0, + "grad_norm": 0.10283598941623227, + "language_loss": 0.99080813, + "learning_rate": 0.0009937512259377015, + "loss": 1.00159442, + "num_input_tokens_seen": 33717664, + "router_z_loss_mlp": 0.19921875, + "step": 410, + "time_per_iteration": 2.6631360054016113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076374, + "balance_loss_mlp": 1.05739617, + "epoch": 0.07906887264332435, + "flos": 556958481408.0, + "grad_norm": 0.07518465865945036, + "language_loss": 0.97744381, + "learning_rate": 0.000993702029893006, + "loss": 0.98820746, + "num_input_tokens_seen": 33794720, + "router_z_loss_mlp": 0.18981934, + "step": 411, + "time_per_iteration": 2.762937068939209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070708, + "balance_loss_mlp": 1.0512886, + "epoch": 0.07926125432858792, + "flos": 821641715712.0, + "grad_norm": 0.06547583340109177, + "language_loss": 0.97466588, + "learning_rate": 0.0009936526421761838, + "loss": 0.98537302, + "num_input_tokens_seen": 33868304, + "router_z_loss_mlp": 0.1940918, + "step": 412, + "time_per_iteration": 3.019529342651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070741, + "balance_loss_mlp": 1.05210841, + "epoch": 0.07945363601385148, + "flos": 562072794624.0, + "grad_norm": 0.06412617323579047, + "language_loss": 0.9993977, + "learning_rate": 0.000993603062806409, + "loss": 1.01010513, + "num_input_tokens_seen": 33937424, + "router_z_loss_mlp": 0.18615723, + "step": 413, + "time_per_iteration": 2.667893409729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078833, + "balance_loss_mlp": 1.05879402, + "epoch": 0.07964601769911504, + "flos": 517615792128.0, + "grad_norm": 0.0777298152120257, + "language_loss": 1.03187037, + "learning_rate": 0.0009935532918029298, + "loss": 1.04265857, + "num_input_tokens_seen": 34003984, + "router_z_loss_mlp": 0.20031738, + "step": 414, + "time_per_iteration": 2.628847122192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079604, + "balance_loss_mlp": 1.06020916, + "epoch": 0.0798383993843786, + "flos": 538956011520.0, + "grad_norm": 0.0762846382616791, + "language_loss": 0.96381676, + "learning_rate": 0.0009935033291850694, + "loss": 0.97461283, + "num_input_tokens_seen": 34072400, + "router_z_loss_mlp": 0.19384766, + "step": 415, + "time_per_iteration": 2.6874804496765137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078311, + "balance_loss_mlp": 1.05915451, + "epoch": 0.08003078106964218, + "flos": 484901959680.0, + "grad_norm": 0.07548152614126195, + "language_loss": 0.9874112, + "learning_rate": 0.0009934531749722247, + "loss": 0.9981944, + "num_input_tokens_seen": 34142448, + "router_z_loss_mlp": 0.19177246, + "step": 416, + "time_per_iteration": 2.5752930641174316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077721, + "balance_loss_mlp": 1.0581702, + "epoch": 0.08022316275490574, + "flos": 517999905792.0, + "grad_norm": 0.07373378819853486, + "language_loss": 0.97326815, + "learning_rate": 0.0009934028291838672, + "loss": 0.98404539, + "num_input_tokens_seen": 34214080, + "router_z_loss_mlp": 0.1953125, + "step": 417, + "time_per_iteration": 2.715142011642456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078885, + "balance_loss_mlp": 1.0593344, + "epoch": 0.0804155444401693, + "flos": 493755512832.0, + "grad_norm": 0.06878732968267398, + "language_loss": 0.9290086, + "learning_rate": 0.0009933522918395433, + "loss": 0.93979746, + "num_input_tokens_seen": 34288448, + "router_z_loss_mlp": 0.19555664, + "step": 418, + "time_per_iteration": 2.7008063793182373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01673141, + "balance_loss_mlp": 1.6505394, + "epoch": 0.08060792612543285, + "flos": 1580567579136.0, + "grad_norm": 0.10865535097535944, + "language_loss": 0.782511, + "learning_rate": 0.0009933015629588731, + "loss": 0.7992425, + "num_input_tokens_seen": 34521632, + "router_z_loss_mlp": 0.22558594, + "step": 419, + "time_per_iteration": 4.854820728302002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092516, + "balance_loss_mlp": 1.07238102, + "epoch": 0.08080030781069643, + "flos": 525090041856.0, + "grad_norm": 0.07888672823303539, + "language_loss": 1.11010027, + "learning_rate": 0.000993250642561551, + "loss": 1.12102532, + "num_input_tokens_seen": 34590080, + "router_z_loss_mlp": 0.20129395, + "step": 420, + "time_per_iteration": 2.6152822971343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102725, + "balance_loss_mlp": 1.08251905, + "epoch": 0.08099268949595999, + "flos": 546459374592.0, + "grad_norm": 0.06927423279576624, + "language_loss": 0.96781242, + "learning_rate": 0.0009931995306673466, + "loss": 0.97883964, + "num_input_tokens_seen": 34660512, + "router_z_loss_mlp": 0.20202637, + "step": 421, + "time_per_iteration": 2.8378820419311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107938, + "balance_loss_mlp": 1.08725524, + "epoch": 0.08118507118122355, + "flos": 510116811264.0, + "grad_norm": 0.07245841989657228, + "language_loss": 1.01691484, + "learning_rate": 0.000993148227296103, + "loss": 1.02799416, + "num_input_tokens_seen": 34732016, + "router_z_loss_mlp": 0.20678711, + "step": 422, + "time_per_iteration": 2.6234657764434814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109153, + "balance_loss_mlp": 1.08827925, + "epoch": 0.08137745286648711, + "flos": 720339969024.0, + "grad_norm": 0.06440268991377437, + "language_loss": 0.90059143, + "learning_rate": 0.000993096732467738, + "loss": 0.91168296, + "num_input_tokens_seen": 34810416, + "router_z_loss_mlp": 0.2088623, + "step": 423, + "time_per_iteration": 2.9789979457855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107042, + "balance_loss_mlp": 1.08620405, + "epoch": 0.08156983455175067, + "flos": 679313848320.0, + "grad_norm": 0.09430690436493987, + "language_loss": 0.97591221, + "learning_rate": 0.0009930450462022435, + "loss": 0.9869827, + "num_input_tokens_seen": 34879504, + "router_z_loss_mlp": 0.20837402, + "step": 424, + "time_per_iteration": 2.7870407104492188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01731933, + "balance_loss_mlp": 1.70847309, + "epoch": 0.08176221623701424, + "flos": 1452577135104.0, + "grad_norm": 0.13164555017172178, + "language_loss": 0.79189807, + "learning_rate": 0.0009929931685196862, + "loss": 0.80921739, + "num_input_tokens_seen": 35111584, + "router_z_loss_mlp": 0.234375, + "step": 425, + "time_per_iteration": 4.870323181152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095108, + "balance_loss_mlp": 1.07456827, + "epoch": 0.0819545979222778, + "flos": 1556034071040.0, + "grad_norm": 0.10298759083167684, + "language_loss": 0.95328236, + "learning_rate": 0.0009929410994402065, + "loss": 0.9642334, + "num_input_tokens_seen": 35205664, + "router_z_loss_mlp": 0.20544434, + "step": 426, + "time_per_iteration": 3.7942585945129395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093366, + "balance_loss_mlp": 1.07214665, + "epoch": 0.08214697960754136, + "flos": 512456398848.0, + "grad_norm": 0.069672302328133, + "language_loss": 0.99507213, + "learning_rate": 0.0009928888389840196, + "loss": 1.00600576, + "num_input_tokens_seen": 35280144, + "router_z_loss_mlp": 0.21240234, + "step": 427, + "time_per_iteration": 2.684760093688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073876, + "balance_loss_mlp": 1.05376494, + "epoch": 0.08233936129280492, + "flos": 594850646016.0, + "grad_norm": 0.07796900075206671, + "language_loss": 1.01749206, + "learning_rate": 0.0009928363871714147, + "loss": 1.02823079, + "num_input_tokens_seen": 35344768, + "router_z_loss_mlp": 0.20092773, + "step": 428, + "time_per_iteration": 2.6608195304870605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078126, + "balance_loss_mlp": 1.05796742, + "epoch": 0.08253174297806849, + "flos": 571758594048.0, + "grad_norm": 0.07341701057973313, + "language_loss": 0.95524251, + "learning_rate": 0.0009927837440227556, + "loss": 0.96602374, + "num_input_tokens_seen": 35425536, + "router_z_loss_mlp": 0.20153809, + "step": 429, + "time_per_iteration": 2.824958324432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083273, + "balance_loss_mlp": 1.06413972, + "epoch": 0.08272412466333205, + "flos": 623065623552.0, + "grad_norm": 0.06194570532237157, + "language_loss": 0.90308964, + "learning_rate": 0.0009927309095584798, + "loss": 0.91392243, + "num_input_tokens_seen": 35515440, + "router_z_loss_mlp": 0.19128418, + "step": 430, + "time_per_iteration": 2.9565205574035645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105878, + "balance_loss_mlp": 1.08643484, + "epoch": 0.08291650634859561, + "flos": 513745542144.0, + "grad_norm": 0.09375416706629437, + "language_loss": 1.0225904, + "learning_rate": 0.0009926778837991, + "loss": 1.03364921, + "num_input_tokens_seen": 35580192, + "router_z_loss_mlp": 0.19433594, + "step": 431, + "time_per_iteration": 2.5606777667999268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104802, + "balance_loss_mlp": 1.08521628, + "epoch": 0.08310888803385917, + "flos": 667073083392.0, + "grad_norm": 0.09022222071598751, + "language_loss": 1.00445497, + "learning_rate": 0.000992624666765202, + "loss": 1.01550293, + "num_input_tokens_seen": 35649472, + "router_z_loss_mlp": 0.19580078, + "step": 432, + "time_per_iteration": 2.763514995574951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112312, + "balance_loss_mlp": 1.09166527, + "epoch": 0.08330126971912274, + "flos": 582995404800.0, + "grad_norm": 0.07142121215748316, + "language_loss": 0.98131895, + "learning_rate": 0.000992571258477447, + "loss": 0.99244213, + "num_input_tokens_seen": 35722848, + "router_z_loss_mlp": 0.20654297, + "step": 433, + "time_per_iteration": 2.7823588848114014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086622, + "balance_loss_mlp": 1.06731021, + "epoch": 0.0834936514043863, + "flos": 561064458240.0, + "grad_norm": 0.06618743000622296, + "language_loss": 0.92206728, + "learning_rate": 0.0009925176589565695, + "loss": 0.93293345, + "num_input_tokens_seen": 35800944, + "router_z_loss_mlp": 0.1932373, + "step": 434, + "time_per_iteration": 2.7774362564086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109069, + "balance_loss_mlp": 1.07043648, + "epoch": 0.08368603308964986, + "flos": 494272046592.0, + "grad_norm": 0.07800081613857189, + "language_loss": 1.01949787, + "learning_rate": 0.0009924638682233791, + "loss": 1.03040481, + "num_input_tokens_seen": 35866288, + "router_z_loss_mlp": 0.20251465, + "step": 435, + "time_per_iteration": 2.574716091156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236801, + "balance_loss_mlp": 1.21505737, + "epoch": 0.08387841477491342, + "flos": 1388322312192.0, + "grad_norm": 0.08820287098199171, + "language_loss": 0.79564589, + "learning_rate": 0.0009924098862987589, + "loss": 0.80801398, + "num_input_tokens_seen": 36083040, + "router_z_loss_mlp": 0.21777344, + "step": 436, + "time_per_iteration": 4.521069049835205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087939, + "balance_loss_mlp": 1.06750691, + "epoch": 0.084070796460177, + "flos": 798642796032.0, + "grad_norm": 0.09737991847895365, + "language_loss": 0.92070073, + "learning_rate": 0.0009923557132036668, + "loss": 0.93158013, + "num_input_tokens_seen": 36158816, + "router_z_loss_mlp": 0.2043457, + "step": 437, + "time_per_iteration": 3.0401971340179443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106896, + "balance_loss_mlp": 1.08635592, + "epoch": 0.08426317814544056, + "flos": 558681200640.0, + "grad_norm": 0.07082709395687636, + "language_loss": 0.96077365, + "learning_rate": 0.0009923013489591345, + "loss": 0.97184265, + "num_input_tokens_seen": 36236432, + "router_z_loss_mlp": 0.20532227, + "step": 438, + "time_per_iteration": 2.7388038635253906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138911, + "balance_loss_mlp": 1.11965871, + "epoch": 0.08445555983070412, + "flos": 810057106944.0, + "grad_norm": 0.09946092642967543, + "language_loss": 0.94659293, + "learning_rate": 0.0009922467935862681, + "loss": 0.95798206, + "num_input_tokens_seen": 36327952, + "router_z_loss_mlp": 0.19250488, + "step": 439, + "time_per_iteration": 3.0827929973602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153103, + "balance_loss_mlp": 1.13278937, + "epoch": 0.08464794151596768, + "flos": 509939311104.0, + "grad_norm": 0.08658230076015333, + "language_loss": 0.97196984, + "learning_rate": 0.0009921920471062478, + "loss": 0.9835009, + "num_input_tokens_seen": 36394896, + "router_z_loss_mlp": 0.203125, + "step": 440, + "time_per_iteration": 2.5667247772216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109293, + "balance_loss_mlp": 1.08952785, + "epoch": 0.08484032320123125, + "flos": 556149556224.0, + "grad_norm": 0.0779492699350581, + "language_loss": 0.95526892, + "learning_rate": 0.0009921371095403281, + "loss": 0.96636182, + "num_input_tokens_seen": 36464656, + "router_z_loss_mlp": 0.19763184, + "step": 441, + "time_per_iteration": 2.6504476070404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081558, + "balance_loss_mlp": 1.06137586, + "epoch": 0.08503270488649481, + "flos": 527103742464.0, + "grad_norm": 0.0823758421396894, + "language_loss": 0.98291612, + "learning_rate": 0.0009920819809098379, + "loss": 0.99373174, + "num_input_tokens_seen": 36532208, + "router_z_loss_mlp": 0.20166016, + "step": 442, + "time_per_iteration": 2.5884947776794434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076633, + "balance_loss_mlp": 1.05612862, + "epoch": 0.08522508657175837, + "flos": 613989490176.0, + "grad_norm": 0.07828377396362728, + "language_loss": 0.94043314, + "learning_rate": 0.0009920266612361798, + "loss": 0.95119947, + "num_input_tokens_seen": 36607360, + "router_z_loss_mlp": 0.20507812, + "step": 443, + "time_per_iteration": 2.7464845180511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077144, + "balance_loss_mlp": 1.05650926, + "epoch": 0.08541746825702193, + "flos": 619495119360.0, + "grad_norm": 0.07442656272719532, + "language_loss": 0.94335687, + "learning_rate": 0.0009919711505408308, + "loss": 0.95412827, + "num_input_tokens_seen": 36680688, + "router_z_loss_mlp": 0.2064209, + "step": 444, + "time_per_iteration": 2.7615623474121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092391, + "balance_loss_mlp": 1.07126665, + "epoch": 0.08560984994228549, + "flos": 482671471104.0, + "grad_norm": 0.08601843511227286, + "language_loss": 0.92049706, + "learning_rate": 0.000991915448845342, + "loss": 0.93142092, + "num_input_tokens_seen": 36746288, + "router_z_loss_mlp": 0.21130371, + "step": 445, + "time_per_iteration": 2.519644260406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103035, + "balance_loss_mlp": 1.08145857, + "epoch": 0.08580223162754906, + "flos": 516897027072.0, + "grad_norm": 0.07781715705073443, + "language_loss": 1.01207459, + "learning_rate": 0.000991859556171339, + "loss": 1.02310491, + "num_input_tokens_seen": 36812528, + "router_z_loss_mlp": 0.21569824, + "step": 446, + "time_per_iteration": 2.5678694248199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116922, + "balance_loss_mlp": 1.09462976, + "epoch": 0.08599461331281262, + "flos": 531215511552.0, + "grad_norm": 0.11213971543052093, + "language_loss": 1.02931881, + "learning_rate": 0.000991803472540521, + "loss": 1.040488, + "num_input_tokens_seen": 36879248, + "router_z_loss_mlp": 0.22302246, + "step": 447, + "time_per_iteration": 2.6309196949005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124555, + "balance_loss_mlp": 1.10302639, + "epoch": 0.08618699499807618, + "flos": 789966743040.0, + "grad_norm": 0.07287006723198586, + "language_loss": 0.97443926, + "learning_rate": 0.0009917471979746615, + "loss": 0.98568487, + "num_input_tokens_seen": 36951376, + "router_z_loss_mlp": 0.21533203, + "step": 448, + "time_per_iteration": 2.9742491245269775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134564, + "balance_loss_mlp": 1.11266506, + "epoch": 0.08637937668333974, + "flos": 565707317760.0, + "grad_norm": 0.08202115093309782, + "language_loss": 0.97199845, + "learning_rate": 0.0009916907324956086, + "loss": 0.98334408, + "num_input_tokens_seen": 37025936, + "router_z_loss_mlp": 0.21923828, + "step": 449, + "time_per_iteration": 2.704089641571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151497, + "balance_loss_mlp": 1.12693954, + "epoch": 0.08657175836860331, + "flos": 444930665472.0, + "grad_norm": 0.09325215593581063, + "language_loss": 0.93441564, + "learning_rate": 0.0009916340761252837, + "loss": 0.9459306, + "num_input_tokens_seen": 37095872, + "router_z_loss_mlp": 0.24536133, + "step": 450, + "time_per_iteration": 2.5866575241088867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158359, + "balance_loss_mlp": 1.13567328, + "epoch": 0.08676414005386687, + "flos": 843789450240.0, + "grad_norm": 0.23711660967347972, + "language_loss": 0.90976942, + "learning_rate": 0.0009915772288856832, + "loss": 0.92135304, + "num_input_tokens_seen": 37179072, + "router_z_loss_mlp": 0.22668457, + "step": 451, + "time_per_iteration": 3.109010696411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118071, + "balance_loss_mlp": 1.15827537, + "epoch": 0.08695652173913043, + "flos": 602995608576.0, + "grad_norm": 0.08699490701012727, + "language_loss": 0.92036849, + "learning_rate": 0.000991520190798877, + "loss": 0.93217564, + "num_input_tokens_seen": 37260288, + "router_z_loss_mlp": 0.22424316, + "step": 452, + "time_per_iteration": 2.8523812294006348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181191, + "balance_loss_mlp": 1.15807629, + "epoch": 0.08714890342439399, + "flos": 730423028736.0, + "grad_norm": 0.09293440668835976, + "language_loss": 1.01637089, + "learning_rate": 0.0009914629618870089, + "loss": 1.02818286, + "num_input_tokens_seen": 37331136, + "router_z_loss_mlp": 0.23095703, + "step": 453, + "time_per_iteration": 2.882887125015259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142362, + "balance_loss_mlp": 1.12891519, + "epoch": 0.08734128510965757, + "flos": 1481518232064.0, + "grad_norm": 0.0645312523276542, + "language_loss": 0.78675872, + "learning_rate": 0.0009914055421722976, + "loss": 0.79818237, + "num_input_tokens_seen": 37559040, + "router_z_loss_mlp": 0.13476562, + "step": 454, + "time_per_iteration": 4.717878103256226 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103219, + "balance_loss_mlp": 1.09034455, + "epoch": 0.08753366679492113, + "flos": 1522214083584.0, + "grad_norm": 0.04274098512475534, + "language_loss": 0.81427962, + "learning_rate": 0.0009913479316770353, + "loss": 0.82531178, + "num_input_tokens_seen": 37785136, + "router_z_loss_mlp": 0.12890625, + "step": 455, + "time_per_iteration": 4.838243246078491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171645, + "balance_loss_mlp": 1.14951944, + "epoch": 0.08772604848018468, + "flos": 720935078400.0, + "grad_norm": 0.10543082910841049, + "language_loss": 0.94423014, + "learning_rate": 0.0009912901304235883, + "loss": 0.95594656, + "num_input_tokens_seen": 37858832, + "router_z_loss_mlp": 0.22131348, + "step": 456, + "time_per_iteration": 2.9432015419006348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150762, + "balance_loss_mlp": 1.12861252, + "epoch": 0.08791843016544824, + "flos": 707926086144.0, + "grad_norm": 0.10980567381029156, + "language_loss": 0.91300154, + "learning_rate": 0.000991232138434397, + "loss": 0.92450917, + "num_input_tokens_seen": 37931856, + "router_z_loss_mlp": 0.22143555, + "step": 457, + "time_per_iteration": 2.832761526107788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113929, + "balance_loss_mlp": 1.09195828, + "epoch": 0.08811081185071182, + "flos": 472799407104.0, + "grad_norm": 0.1324680836731367, + "language_loss": 0.97845554, + "learning_rate": 0.000991173955731976, + "loss": 0.98959482, + "num_input_tokens_seen": 38002432, + "router_z_loss_mlp": 0.21960449, + "step": 458, + "time_per_iteration": 2.660696506500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100778, + "balance_loss_mlp": 1.07958269, + "epoch": 0.08830319353597538, + "flos": 684647769600.0, + "grad_norm": 0.07138233575581546, + "language_loss": 1.0178268, + "learning_rate": 0.0009911155823389137, + "loss": 1.02883458, + "num_input_tokens_seen": 38081648, + "router_z_loss_mlp": 0.21203613, + "step": 459, + "time_per_iteration": 2.9878122806549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105128, + "balance_loss_mlp": 1.08344412, + "epoch": 0.08849557522123894, + "flos": 573235411968.0, + "grad_norm": 0.0735053314112025, + "language_loss": 0.9764787, + "learning_rate": 0.000991057018277873, + "loss": 0.98752999, + "num_input_tokens_seen": 38153424, + "router_z_loss_mlp": 0.21679688, + "step": 460, + "time_per_iteration": 2.707247018814087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116963, + "balance_loss_mlp": 1.09422946, + "epoch": 0.0886879569065025, + "flos": 564303283200.0, + "grad_norm": 0.10552034142073316, + "language_loss": 0.9759655, + "learning_rate": 0.0009909982635715898, + "loss": 0.98713505, + "num_input_tokens_seen": 38223008, + "router_z_loss_mlp": 0.22729492, + "step": 461, + "time_per_iteration": 2.609016180038452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120097, + "balance_loss_mlp": 1.09760189, + "epoch": 0.08888033859176607, + "flos": 563609249280.0, + "grad_norm": 0.09185893532484944, + "language_loss": 0.96625364, + "learning_rate": 0.0009909393182428751, + "loss": 0.97745454, + "num_input_tokens_seen": 38294592, + "router_z_loss_mlp": 0.22497559, + "step": 462, + "time_per_iteration": 2.682616949081421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116703, + "balance_loss_mlp": 1.09437466, + "epoch": 0.08907272027702963, + "flos": 465517214208.0, + "grad_norm": 0.08888403374641002, + "language_loss": 0.91300213, + "learning_rate": 0.000990880182314614, + "loss": 0.92416912, + "num_input_tokens_seen": 38365792, + "router_z_loss_mlp": 0.22314453, + "step": 463, + "time_per_iteration": 2.732579469680786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122985, + "balance_loss_mlp": 1.10014486, + "epoch": 0.08926510196229319, + "flos": 681200921088.0, + "grad_norm": 0.07408309604525525, + "language_loss": 0.92294347, + "learning_rate": 0.0009908208558097643, + "loss": 0.93417335, + "num_input_tokens_seen": 38447776, + "router_z_loss_mlp": 0.22839355, + "step": 464, + "time_per_iteration": 2.910313606262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137827, + "balance_loss_mlp": 1.115273, + "epoch": 0.08945748364755675, + "flos": 596411831808.0, + "grad_norm": 0.08673846989427919, + "language_loss": 0.93827909, + "learning_rate": 0.000990761338751359, + "loss": 0.94965738, + "num_input_tokens_seen": 38521632, + "router_z_loss_mlp": 0.22546387, + "step": 465, + "time_per_iteration": 2.827570676803589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133815, + "balance_loss_mlp": 1.12222791, + "epoch": 0.08964986533282032, + "flos": 1585082400768.0, + "grad_norm": 0.06082202694548154, + "language_loss": 0.73659623, + "learning_rate": 0.0009907016311625045, + "loss": 0.74793446, + "num_input_tokens_seen": 38760528, + "router_z_loss_mlp": 0.11572266, + "step": 466, + "time_per_iteration": 4.960917234420776 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177765, + "balance_loss_mlp": 1.15419745, + "epoch": 0.08984224701808388, + "flos": 533268499968.0, + "grad_norm": 0.4900596090566038, + "language_loss": 0.96587038, + "learning_rate": 0.0009906417330663815, + "loss": 0.97764802, + "num_input_tokens_seen": 38827200, + "router_z_loss_mlp": 0.23571777, + "step": 467, + "time_per_iteration": 2.5937299728393555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157084, + "balance_loss_mlp": 1.13383865, + "epoch": 0.09003462870334744, + "flos": 478702296576.0, + "grad_norm": 0.08613132202477504, + "language_loss": 0.92798859, + "learning_rate": 0.0009905816444862442, + "loss": 0.93955946, + "num_input_tokens_seen": 38891984, + "router_z_loss_mlp": 0.23217773, + "step": 468, + "time_per_iteration": 2.6012237071990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164868, + "balance_loss_mlp": 1.14150274, + "epoch": 0.090227010388611, + "flos": 653307448320.0, + "grad_norm": 0.08218040805372613, + "language_loss": 0.90769458, + "learning_rate": 0.0009905213654454216, + "loss": 0.91934329, + "num_input_tokens_seen": 38977136, + "router_z_loss_mlp": 0.23364258, + "step": 469, + "time_per_iteration": 2.8727760314941406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176439, + "balance_loss_mlp": 1.15152478, + "epoch": 0.09041939207387456, + "flos": 617894645760.0, + "grad_norm": 0.09256259391525869, + "language_loss": 0.97864139, + "learning_rate": 0.0009904608959673158, + "loss": 0.9904058, + "num_input_tokens_seen": 39052224, + "router_z_loss_mlp": 0.24938965, + "step": 470, + "time_per_iteration": 2.7991952896118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151805, + "balance_loss_mlp": 1.12671185, + "epoch": 0.09061177375913813, + "flos": 454137808896.0, + "grad_norm": 0.09693984756275055, + "language_loss": 0.97988749, + "learning_rate": 0.000990400236075403, + "loss": 0.99140555, + "num_input_tokens_seen": 39116832, + "router_z_loss_mlp": 0.25109863, + "step": 471, + "time_per_iteration": 2.523508310317993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125394, + "balance_loss_mlp": 1.10119498, + "epoch": 0.0908041554444017, + "flos": 543982984704.0, + "grad_norm": 0.09250187628709369, + "language_loss": 0.9490509, + "learning_rate": 0.0009903393857932338, + "loss": 0.96030486, + "num_input_tokens_seen": 39190528, + "router_z_loss_mlp": 0.24194336, + "step": 472, + "time_per_iteration": 2.7065584659576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124084, + "balance_loss_mlp": 1.09912193, + "epoch": 0.09099653712966525, + "flos": 564052999680.0, + "grad_norm": 0.10897832311722938, + "language_loss": 0.93660218, + "learning_rate": 0.0009902783451444317, + "loss": 0.94784307, + "num_input_tokens_seen": 39263168, + "router_z_loss_mlp": 0.24963379, + "step": 473, + "time_per_iteration": 2.7067277431488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108649, + "balance_loss_mlp": 1.08496177, + "epoch": 0.09118891881492881, + "flos": 474300956160.0, + "grad_norm": 0.09402902414949979, + "language_loss": 0.97273493, + "learning_rate": 0.0009902171141526956, + "loss": 0.98382139, + "num_input_tokens_seen": 39330784, + "router_z_loss_mlp": 0.23693848, + "step": 474, + "time_per_iteration": 2.5281569957733154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087186, + "balance_loss_mlp": 1.06240201, + "epoch": 0.09138130050019239, + "flos": 545579076096.0, + "grad_norm": 0.06728788346792411, + "language_loss": 0.85273343, + "learning_rate": 0.000990155692841797, + "loss": 0.86360526, + "num_input_tokens_seen": 39417472, + "router_z_loss_mlp": 0.2479248, + "step": 475, + "time_per_iteration": 2.970107316970825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010835, + "balance_loss_mlp": 1.0587163, + "epoch": 0.09157368218545595, + "flos": 732397441536.0, + "grad_norm": 0.07226189405033341, + "language_loss": 0.97062063, + "learning_rate": 0.0009900940812355818, + "loss": 0.98145562, + "num_input_tokens_seen": 39488656, + "router_z_loss_mlp": 0.24768066, + "step": 476, + "time_per_iteration": 2.959184169769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096233, + "balance_loss_mlp": 1.07208097, + "epoch": 0.0917660638707195, + "flos": 610709967360.0, + "grad_norm": 0.09034653129128065, + "language_loss": 0.92824447, + "learning_rate": 0.00099003227935797, + "loss": 0.93920678, + "num_input_tokens_seen": 39558224, + "router_z_loss_mlp": 0.24157715, + "step": 477, + "time_per_iteration": 2.7553765773773193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113263, + "balance_loss_mlp": 1.08839583, + "epoch": 0.09195844555598306, + "flos": 655561257984.0, + "grad_norm": 0.09830094540804109, + "language_loss": 0.95358098, + "learning_rate": 0.000989970287232955, + "loss": 0.96471357, + "num_input_tokens_seen": 39629856, + "router_z_loss_mlp": 0.2487793, + "step": 478, + "time_per_iteration": 2.7916457653045654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112064, + "balance_loss_mlp": 1.09633327, + "epoch": 0.09215082724124664, + "flos": 476339387904.0, + "grad_norm": 0.08054303064285366, + "language_loss": 0.93560576, + "learning_rate": 0.0009899081048846043, + "loss": 0.94681215, + "num_input_tokens_seen": 39695984, + "router_z_loss_mlp": 0.24267578, + "step": 479, + "time_per_iteration": 2.554161787033081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114732, + "balance_loss_mlp": 1.12177348, + "epoch": 0.0923432089265102, + "flos": 524051182080.0, + "grad_norm": 0.1186512856896222, + "language_loss": 0.97593725, + "learning_rate": 0.0009898457323370593, + "loss": 0.98741049, + "num_input_tokens_seen": 39760256, + "router_z_loss_mlp": 0.25549316, + "step": 480, + "time_per_iteration": 2.5794191360473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131558, + "balance_loss_mlp": 1.10608315, + "epoch": 0.09253559061177376, + "flos": 545302651392.0, + "grad_norm": 0.10688941209840569, + "language_loss": 0.96892118, + "learning_rate": 0.000989783169614535, + "loss": 0.98023689, + "num_input_tokens_seen": 39827984, + "router_z_loss_mlp": 0.25512695, + "step": 481, + "time_per_iteration": 2.6676101684570312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01336494, + "balance_loss_mlp": 1.32304764, + "epoch": 0.09272797229703732, + "flos": 1537222219776.0, + "grad_norm": 0.112558059824644, + "language_loss": 0.78752756, + "learning_rate": 0.0009897204167413206, + "loss": 0.80089253, + "num_input_tokens_seen": 40056688, + "router_z_loss_mlp": 0.13476562, + "step": 482, + "time_per_iteration": 4.910710096359253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121205, + "balance_loss_mlp": 1.09537172, + "epoch": 0.09292035398230089, + "flos": 689501624832.0, + "grad_norm": 0.08905484371867754, + "language_loss": 0.93989253, + "learning_rate": 0.000989657473741779, + "loss": 0.95110452, + "num_input_tokens_seen": 40133120, + "router_z_loss_mlp": 0.25866699, + "step": 483, + "time_per_iteration": 2.8736467361450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120092, + "balance_loss_mlp": 1.09219658, + "epoch": 0.09311273566756445, + "flos": 509482414080.0, + "grad_norm": 0.10011855628381364, + "language_loss": 0.94861096, + "learning_rate": 0.0009895943406403465, + "loss": 0.95981193, + "num_input_tokens_seen": 40206464, + "router_z_loss_mlp": 0.27905273, + "step": 484, + "time_per_iteration": 2.7233312129974365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114409, + "balance_loss_mlp": 1.08641887, + "epoch": 0.09330511735282801, + "flos": 659111413248.0, + "grad_norm": 0.10884122740481975, + "language_loss": 0.87602448, + "learning_rate": 0.0009895310174615338, + "loss": 0.88716859, + "num_input_tokens_seen": 40277744, + "router_z_loss_mlp": 0.2800293, + "step": 485, + "time_per_iteration": 2.7538061141967773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098211, + "balance_loss_mlp": 1.08533621, + "epoch": 0.09349749903809157, + "flos": 1452054809088.0, + "grad_norm": 0.04867374252302138, + "language_loss": 0.75718516, + "learning_rate": 0.0009894675042299251, + "loss": 0.76816726, + "num_input_tokens_seen": 40503664, + "router_z_loss_mlp": 0.12890625, + "step": 486, + "time_per_iteration": 4.681119441986084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121456, + "balance_loss_mlp": 1.09291732, + "epoch": 0.09368988072335514, + "flos": 520614508032.0, + "grad_norm": 0.07858969791005947, + "language_loss": 0.92458618, + "learning_rate": 0.0009894038009701782, + "loss": 0.93580067, + "num_input_tokens_seen": 40571376, + "router_z_loss_mlp": 0.28515625, + "step": 487, + "time_per_iteration": 2.6114649772644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153128, + "balance_loss_mlp": 1.12148952, + "epoch": 0.0938822624086187, + "flos": 497502107136.0, + "grad_norm": 0.11959755259003642, + "language_loss": 0.91595036, + "learning_rate": 0.0009893399077070253, + "loss": 0.92748165, + "num_input_tokens_seen": 40638096, + "router_z_loss_mlp": 0.31616211, + "step": 488, + "time_per_iteration": 2.5603692531585693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127952, + "balance_loss_mlp": 1.09845996, + "epoch": 0.09407464409388226, + "flos": 532948405248.0, + "grad_norm": 0.09098963794592498, + "language_loss": 0.89760649, + "learning_rate": 0.0009892758244652718, + "loss": 0.90888608, + "num_input_tokens_seen": 40710992, + "router_z_loss_mlp": 0.29516602, + "step": 489, + "time_per_iteration": 2.65938401222229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127724, + "balance_loss_mlp": 1.09568012, + "epoch": 0.09426702577914582, + "flos": 585736634880.0, + "grad_norm": 0.09102778373185845, + "language_loss": 0.94519842, + "learning_rate": 0.0009892115512697968, + "loss": 0.95647562, + "num_input_tokens_seen": 40778896, + "router_z_loss_mlp": 0.3203125, + "step": 490, + "time_per_iteration": 2.6538186073303223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120065, + "balance_loss_mlp": 1.08926105, + "epoch": 0.0944594074644094, + "flos": 503081929728.0, + "grad_norm": 0.07724049493821064, + "language_loss": 0.96624851, + "learning_rate": 0.0009891470881455537, + "loss": 0.97744912, + "num_input_tokens_seen": 40853376, + "router_z_loss_mlp": 0.30810547, + "step": 491, + "time_per_iteration": 2.699535608291626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122711, + "balance_loss_mlp": 1.09145451, + "epoch": 0.09465178914967295, + "flos": 570748847616.0, + "grad_norm": 0.0816499633869022, + "language_loss": 0.94510269, + "learning_rate": 0.0009890824351175692, + "loss": 0.95632982, + "num_input_tokens_seen": 40923776, + "router_z_loss_mlp": 0.31225586, + "step": 492, + "time_per_iteration": 2.678191661834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125893, + "balance_loss_mlp": 1.09418344, + "epoch": 0.09484417083493651, + "flos": 549098707968.0, + "grad_norm": 0.07977284094064935, + "language_loss": 0.98609412, + "learning_rate": 0.0009890175922109435, + "loss": 0.99735302, + "num_input_tokens_seen": 40996848, + "router_z_loss_mlp": 0.31689453, + "step": 493, + "time_per_iteration": 2.6466987133026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138627, + "balance_loss_mlp": 1.10534418, + "epoch": 0.09503655252020007, + "flos": 823552109568.0, + "grad_norm": 0.09331424233507904, + "language_loss": 0.96939492, + "learning_rate": 0.0009889525594508513, + "loss": 0.9807812, + "num_input_tokens_seen": 41071280, + "router_z_loss_mlp": 0.33300781, + "step": 494, + "time_per_iteration": 3.009894371032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153225, + "balance_loss_mlp": 1.12218332, + "epoch": 0.09522893420546363, + "flos": 404397757440.0, + "grad_norm": 0.08141129996203125, + "language_loss": 0.91043431, + "learning_rate": 0.0009888873368625404, + "loss": 0.92196655, + "num_input_tokens_seen": 41136304, + "router_z_loss_mlp": 0.31030273, + "step": 495, + "time_per_iteration": 2.4904890060424805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171726, + "balance_loss_mlp": 1.14025438, + "epoch": 0.0954213158907272, + "flos": 690707810304.0, + "grad_norm": 0.08256479818708104, + "language_loss": 0.94339681, + "learning_rate": 0.0009888219244713326, + "loss": 0.95511413, + "num_input_tokens_seen": 41212384, + "router_z_loss_mlp": 0.31445312, + "step": 496, + "time_per_iteration": 2.8060483932495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181664, + "balance_loss_mlp": 1.15033531, + "epoch": 0.09561369757599077, + "flos": 518739019776.0, + "grad_norm": 0.10472312979641793, + "language_loss": 0.94370055, + "learning_rate": 0.0009887563223026229, + "loss": 0.95551717, + "num_input_tokens_seen": 41282528, + "router_z_loss_mlp": 0.31323242, + "step": 497, + "time_per_iteration": 2.6536803245544434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228939, + "balance_loss_mlp": 1.21549225, + "epoch": 0.09580607926125433, + "flos": 1384825849344.0, + "grad_norm": 0.04877985805939708, + "language_loss": 0.7906816, + "learning_rate": 0.0009886905303818805, + "loss": 0.80297101, + "num_input_tokens_seen": 41512256, + "router_z_loss_mlp": 0.13476562, + "step": 498, + "time_per_iteration": 4.874605178833008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197245, + "balance_loss_mlp": 1.16455829, + "epoch": 0.09599846094651789, + "flos": 717090969600.0, + "grad_norm": 0.08863465655244346, + "language_loss": 0.93284124, + "learning_rate": 0.0009886245487346482, + "loss": 0.94481373, + "num_input_tokens_seen": 41596816, + "router_z_loss_mlp": 0.3269043, + "step": 499, + "time_per_iteration": 3.047938108444214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011865, + "balance_loss_mlp": 1.15474319, + "epoch": 0.09619084263178146, + "flos": 385824909312.0, + "grad_norm": 0.09673466805801513, + "language_loss": 0.96238041, + "learning_rate": 0.0009885583773865422, + "loss": 0.97424543, + "num_input_tokens_seen": 41658544, + "router_z_loss_mlp": 0.31762695, + "step": 500, + "time_per_iteration": 2.402763843536377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140705, + "balance_loss_mlp": 1.1099968, + "epoch": 0.09638322431704502, + "flos": 533869401600.0, + "grad_norm": 0.08556524095898377, + "language_loss": 0.93457472, + "learning_rate": 0.0009884920163632524, + "loss": 0.94598186, + "num_input_tokens_seen": 41730736, + "router_z_loss_mlp": 0.30688477, + "step": 501, + "time_per_iteration": 2.7420296669006348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155853, + "balance_loss_mlp": 1.12373805, + "epoch": 0.09657560600230858, + "flos": 500426629632.0, + "grad_norm": 0.08462195742795481, + "language_loss": 0.95688182, + "learning_rate": 0.000988425465690543, + "loss": 0.96844035, + "num_input_tokens_seen": 41797824, + "router_z_loss_mlp": 0.32104492, + "step": 502, + "time_per_iteration": 2.5425736904144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163304, + "balance_loss_mlp": 1.13099861, + "epoch": 0.09676798768757214, + "flos": 528995197440.0, + "grad_norm": 0.07192036847451248, + "language_loss": 0.92721838, + "learning_rate": 0.0009883587253942505, + "loss": 0.93885148, + "num_input_tokens_seen": 41875520, + "router_z_loss_mlp": 0.32324219, + "step": 503, + "time_per_iteration": 2.8340742588043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188959, + "balance_loss_mlp": 1.15598607, + "epoch": 0.09696036937283571, + "flos": 463379857920.0, + "grad_norm": 0.0888689340699796, + "language_loss": 0.99166393, + "learning_rate": 0.0009882917955002862, + "loss": 1.00355351, + "num_input_tokens_seen": 41942224, + "router_z_loss_mlp": 0.32983398, + "step": 504, + "time_per_iteration": 2.560448169708252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147535, + "balance_loss_mlp": 1.11606395, + "epoch": 0.09715275105809927, + "flos": 534716204544.0, + "grad_norm": 0.07251663236407552, + "language_loss": 0.9150176, + "learning_rate": 0.0009882246760346343, + "loss": 0.92649293, + "num_input_tokens_seen": 42007552, + "router_z_loss_mlp": 0.31420898, + "step": 505, + "time_per_iteration": 2.6460299491882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114081, + "balance_loss_mlp": 1.10714495, + "epoch": 0.09734513274336283, + "flos": 454713979392.0, + "grad_norm": 0.10061537251918176, + "language_loss": 0.96100289, + "learning_rate": 0.0009881573670233533, + "loss": 0.97241098, + "num_input_tokens_seen": 42071760, + "router_z_loss_mlp": 0.33666992, + "step": 506, + "time_per_iteration": 2.5137040615081787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109977, + "balance_loss_mlp": 1.08029366, + "epoch": 0.09753751442862639, + "flos": 508551243264.0, + "grad_norm": 0.0762964042901656, + "language_loss": 0.91185808, + "learning_rate": 0.0009880898684925747, + "loss": 0.92295784, + "num_input_tokens_seen": 42140688, + "router_z_loss_mlp": 0.29663086, + "step": 507, + "time_per_iteration": 2.6571738719940186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110119, + "balance_loss_mlp": 1.07133985, + "epoch": 0.09772989611388996, + "flos": 484030425600.0, + "grad_norm": 0.07531505250568626, + "language_loss": 0.89554358, + "learning_rate": 0.0009880221804685037, + "loss": 0.90655547, + "num_input_tokens_seen": 42208544, + "router_z_loss_mlp": 0.29882812, + "step": 508, + "time_per_iteration": 2.596289873123169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01404721, + "balance_loss_mlp": 1.39136958, + "epoch": 0.09792227779915352, + "flos": 1565306339328.0, + "grad_norm": 0.10151454340945995, + "language_loss": 0.79344422, + "learning_rate": 0.000987954302977419, + "loss": 0.80749142, + "num_input_tokens_seen": 42426624, + "router_z_loss_mlp": 0.13378906, + "step": 509, + "time_per_iteration": 4.724441051483154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116621, + "balance_loss_mlp": 1.08655643, + "epoch": 0.09811465948441708, + "flos": 587529165312.0, + "grad_norm": 0.08257009801201759, + "language_loss": 0.94708043, + "learning_rate": 0.0009878862360456733, + "loss": 0.95824659, + "num_input_tokens_seen": 42494592, + "router_z_loss_mlp": 0.30029297, + "step": 510, + "time_per_iteration": 2.703011989593506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122701, + "balance_loss_mlp": 1.09406662, + "epoch": 0.09830704116968064, + "flos": 612719285760.0, + "grad_norm": 0.06191460590209878, + "language_loss": 0.88457662, + "learning_rate": 0.0009878179796996922, + "loss": 0.89580369, + "num_input_tokens_seen": 42564944, + "router_z_loss_mlp": 0.28637695, + "step": 511, + "time_per_iteration": 2.7212226390838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128587, + "balance_loss_mlp": 1.09885597, + "epoch": 0.09849942285494422, + "flos": 538528227840.0, + "grad_norm": 0.06874751685339883, + "language_loss": 0.9199326, + "learning_rate": 0.0009877495339659754, + "loss": 0.9312185, + "num_input_tokens_seen": 42645616, + "router_z_loss_mlp": 0.29724121, + "step": 512, + "time_per_iteration": 2.7520575523376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111609, + "balance_loss_mlp": 1.08826661, + "epoch": 0.09869180454020778, + "flos": 620193535488.0, + "grad_norm": 0.06953003964378547, + "language_loss": 0.87301105, + "learning_rate": 0.000987680898871096, + "loss": 0.88417196, + "num_input_tokens_seen": 42713632, + "router_z_loss_mlp": 0.27832031, + "step": 513, + "time_per_iteration": 2.7121992111206055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134292, + "balance_loss_mlp": 1.10401261, + "epoch": 0.09888418622547133, + "flos": 811375363584.0, + "grad_norm": 0.1024184057853134, + "language_loss": 0.87763435, + "learning_rate": 0.0009876120744417, + "loss": 0.88897729, + "num_input_tokens_seen": 42789088, + "router_z_loss_mlp": 0.30273438, + "step": 514, + "time_per_iteration": 2.971573829650879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123143, + "balance_loss_mlp": 1.09267306, + "epoch": 0.0990765679107349, + "flos": 535548450816.0, + "grad_norm": 0.06764912074049458, + "language_loss": 0.95588082, + "learning_rate": 0.0009875430607045078, + "loss": 0.9671123, + "num_input_tokens_seen": 42861168, + "router_z_loss_mlp": 0.3046875, + "step": 515, + "time_per_iteration": 2.6630361080169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108813, + "balance_loss_mlp": 1.08072746, + "epoch": 0.09926894959599845, + "flos": 587607740928.0, + "grad_norm": 0.06593749006245919, + "language_loss": 0.92788792, + "learning_rate": 0.000987473857686313, + "loss": 0.93897605, + "num_input_tokens_seen": 42934112, + "router_z_loss_mlp": 0.28076172, + "step": 516, + "time_per_iteration": 2.710068702697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121556, + "balance_loss_mlp": 1.09039485, + "epoch": 0.09946133128126203, + "flos": 640947409920.0, + "grad_norm": 0.08862761474564218, + "language_loss": 0.9451825, + "learning_rate": 0.0009874044654139824, + "loss": 0.95639801, + "num_input_tokens_seen": 43005248, + "router_z_loss_mlp": 0.3112793, + "step": 517, + "time_per_iteration": 2.729975461959839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117034, + "balance_loss_mlp": 1.08520555, + "epoch": 0.09965371296652559, + "flos": 465546327552.0, + "grad_norm": 0.09157938746936445, + "language_loss": 0.9250825, + "learning_rate": 0.0009873348839144563, + "loss": 0.93625283, + "num_input_tokens_seen": 43070576, + "router_z_loss_mlp": 0.31811523, + "step": 518, + "time_per_iteration": 2.5117127895355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112516, + "balance_loss_mlp": 1.09540534, + "epoch": 0.09984609465178915, + "flos": 483365505024.0, + "grad_norm": 0.07736257304557469, + "language_loss": 0.9674046, + "learning_rate": 0.000987265113214749, + "loss": 0.97865617, + "num_input_tokens_seen": 43138048, + "router_z_loss_mlp": 0.29711914, + "step": 519, + "time_per_iteration": 2.5816774368286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147544, + "balance_loss_mlp": 1.11421299, + "epoch": 0.1000384763370527, + "flos": 568764260352.0, + "grad_norm": 0.08763817133734854, + "language_loss": 0.96583092, + "learning_rate": 0.0009871951533419476, + "loss": 0.97730637, + "num_input_tokens_seen": 43207600, + "router_z_loss_mlp": 0.33325195, + "step": 520, + "time_per_iteration": 2.638664484024048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140597, + "balance_loss_mlp": 1.108482, + "epoch": 0.10023085802231628, + "flos": 545515057152.0, + "grad_norm": 0.10925869968591369, + "language_loss": 0.88377398, + "learning_rate": 0.0009871250043232132, + "loss": 0.89517999, + "num_input_tokens_seen": 43285104, + "router_z_loss_mlp": 0.32104492, + "step": 521, + "time_per_iteration": 2.70491886138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136934, + "balance_loss_mlp": 1.10555792, + "epoch": 0.10042323970757984, + "flos": 503208557568.0, + "grad_norm": 0.07694864026409119, + "language_loss": 0.87725985, + "learning_rate": 0.0009870546661857797, + "loss": 0.8886292, + "num_input_tokens_seen": 43353312, + "router_z_loss_mlp": 0.31347656, + "step": 522, + "time_per_iteration": 2.653456211090088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126678, + "balance_loss_mlp": 1.09380031, + "epoch": 0.1006156213928434, + "flos": 770084402688.0, + "grad_norm": 0.08414569380370593, + "language_loss": 0.95787346, + "learning_rate": 0.0009869841389569553, + "loss": 0.96914017, + "num_input_tokens_seen": 43427680, + "router_z_loss_mlp": 0.32885742, + "step": 523, + "time_per_iteration": 2.9442663192749023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116557, + "balance_loss_mlp": 1.08625388, + "epoch": 0.10080800307810696, + "flos": 489786338304.0, + "grad_norm": 0.06587351152736676, + "language_loss": 0.88897854, + "learning_rate": 0.0009869134226641206, + "loss": 0.90014416, + "num_input_tokens_seen": 43495200, + "router_z_loss_mlp": 0.30297852, + "step": 524, + "time_per_iteration": 2.5559868812561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110225, + "balance_loss_mlp": 1.07746601, + "epoch": 0.10100038476337053, + "flos": 454478252544.0, + "grad_norm": 0.09167866019985617, + "language_loss": 0.88383424, + "learning_rate": 0.0009868425173347303, + "loss": 0.89493656, + "num_input_tokens_seen": 43566256, + "router_z_loss_mlp": 0.32788086, + "step": 525, + "time_per_iteration": 2.645116090774536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111349, + "balance_loss_mlp": 1.08216143, + "epoch": 0.10119276644863409, + "flos": 556155348480.0, + "grad_norm": 0.07288604326691553, + "language_loss": 0.96749896, + "learning_rate": 0.0009867714229963125, + "loss": 0.97863394, + "num_input_tokens_seen": 43639696, + "router_z_loss_mlp": 0.31323242, + "step": 526, + "time_per_iteration": 2.730703592300415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106354, + "balance_loss_mlp": 1.07540703, + "epoch": 0.10138514813389765, + "flos": 515990587392.0, + "grad_norm": 0.07095113284061857, + "language_loss": 0.93916923, + "learning_rate": 0.000986700139676468, + "loss": 0.95023274, + "num_input_tokens_seen": 43703872, + "router_z_loss_mlp": 0.30932617, + "step": 527, + "time_per_iteration": 2.5836338996887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110446, + "balance_loss_mlp": 1.07833052, + "epoch": 0.10157752981916121, + "flos": 500323322880.0, + "grad_norm": 0.06933811905919615, + "language_loss": 0.91673893, + "learning_rate": 0.0009866286674028717, + "loss": 0.92784333, + "num_input_tokens_seen": 43774416, + "router_z_loss_mlp": 0.32104492, + "step": 528, + "time_per_iteration": 2.7084739208221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101831, + "balance_loss_mlp": 1.07100391, + "epoch": 0.10176991150442478, + "flos": 656444376576.0, + "grad_norm": 0.07189407365130172, + "language_loss": 0.88586026, + "learning_rate": 0.0009865570062032717, + "loss": 0.8968786, + "num_input_tokens_seen": 43853376, + "router_z_loss_mlp": 0.30810547, + "step": 529, + "time_per_iteration": 2.9141628742218018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103952, + "balance_loss_mlp": 1.07443571, + "epoch": 0.10196229318968834, + "flos": 572974953984.0, + "grad_norm": 0.06841647032337263, + "language_loss": 0.93659967, + "learning_rate": 0.0009864851561054893, + "loss": 0.94763923, + "num_input_tokens_seen": 43929632, + "router_z_loss_mlp": 0.29516602, + "step": 530, + "time_per_iteration": 2.7539894580841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090977, + "balance_loss_mlp": 1.06110358, + "epoch": 0.1021546748749519, + "flos": 517946061312.0, + "grad_norm": 0.07340246055426732, + "language_loss": 0.91722125, + "learning_rate": 0.0009864131171374191, + "loss": 0.92813098, + "num_input_tokens_seen": 44002144, + "router_z_loss_mlp": 0.29882812, + "step": 531, + "time_per_iteration": 2.6921956539154053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109944, + "balance_loss_mlp": 1.06749225, + "epoch": 0.10234705656021546, + "flos": 609470286336.0, + "grad_norm": 0.07867637119915549, + "language_loss": 0.91107762, + "learning_rate": 0.0009863408893270292, + "loss": 0.92207205, + "num_input_tokens_seen": 44078272, + "router_z_loss_mlp": 0.31933594, + "step": 532, + "time_per_iteration": 2.7911570072174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106913, + "balance_loss_mlp": 1.07396317, + "epoch": 0.10253943824547904, + "flos": 601473710592.0, + "grad_norm": 0.08191923529880715, + "language_loss": 0.86522454, + "learning_rate": 0.0009862684727023605, + "loss": 0.87629366, + "num_input_tokens_seen": 44152304, + "router_z_loss_mlp": 0.3293457, + "step": 533, + "time_per_iteration": 2.7452800273895264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105875, + "balance_loss_mlp": 1.07466602, + "epoch": 0.1027318199307426, + "flos": 662647011840.0, + "grad_norm": 0.07282647554851075, + "language_loss": 0.90315968, + "learning_rate": 0.0009861958672915283, + "loss": 0.91421843, + "num_input_tokens_seen": 44226720, + "router_z_loss_mlp": 0.31201172, + "step": 534, + "time_per_iteration": 2.8041269779205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096602, + "balance_loss_mlp": 1.0673244, + "epoch": 0.10292420161600616, + "flos": 682962928128.0, + "grad_norm": 0.058349855756870184, + "language_loss": 0.90126884, + "learning_rate": 0.0009861230731227201, + "loss": 0.9122349, + "num_input_tokens_seen": 44303600, + "router_z_loss_mlp": 0.29248047, + "step": 535, + "time_per_iteration": 2.8627805709838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108033, + "balance_loss_mlp": 1.07615674, + "epoch": 0.10311658330126972, + "flos": 490042414080.0, + "grad_norm": 0.091555564896082, + "language_loss": 0.91954774, + "learning_rate": 0.0009860500902241973, + "loss": 0.93062806, + "num_input_tokens_seen": 44370960, + "router_z_loss_mlp": 0.31884766, + "step": 536, + "time_per_iteration": 2.6052157878875732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120335, + "balance_loss_mlp": 1.08800602, + "epoch": 0.10330896498653329, + "flos": 431508446208.0, + "grad_norm": 0.0585767653270487, + "language_loss": 0.96574026, + "learning_rate": 0.0009859769186242942, + "loss": 0.97694361, + "num_input_tokens_seen": 44435584, + "router_z_loss_mlp": 0.32324219, + "step": 537, + "time_per_iteration": 2.51180362701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116517, + "balance_loss_mlp": 1.08571362, + "epoch": 0.10350134667179685, + "flos": 549330052608.0, + "grad_norm": 0.0744119924563098, + "language_loss": 0.8926785, + "learning_rate": 0.0009859035583514187, + "loss": 0.90384364, + "num_input_tokens_seen": 44505456, + "router_z_loss_mlp": 0.30834961, + "step": 538, + "time_per_iteration": 2.6369993686676025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146613, + "balance_loss_mlp": 1.11380613, + "epoch": 0.10369372835706041, + "flos": 640327569408.0, + "grad_norm": 0.09976070350989504, + "language_loss": 0.90389431, + "learning_rate": 0.0009858300094340517, + "loss": 0.91536051, + "num_input_tokens_seen": 44580208, + "router_z_loss_mlp": 0.328125, + "step": 539, + "time_per_iteration": 2.7695086002349854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150737, + "balance_loss_mlp": 1.11838388, + "epoch": 0.10388611004232397, + "flos": 521500598784.0, + "grad_norm": 0.08771902350159133, + "language_loss": 0.85304511, + "learning_rate": 0.0009857562719007473, + "loss": 0.8645525, + "num_input_tokens_seen": 44646576, + "router_z_loss_mlp": 0.32324219, + "step": 540, + "time_per_iteration": 2.59881329536438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144681, + "balance_loss_mlp": 1.11320961, + "epoch": 0.10407849172758753, + "flos": 702111946752.0, + "grad_norm": 0.07496368213999542, + "language_loss": 0.88249481, + "learning_rate": 0.0009856823457801331, + "loss": 0.89394164, + "num_input_tokens_seen": 44726752, + "router_z_loss_mlp": 0.31494141, + "step": 541, + "time_per_iteration": 2.873481035232544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119735, + "balance_loss_mlp": 1.08738184, + "epoch": 0.1042708734128511, + "flos": 502652736000.0, + "grad_norm": 0.06973546911765124, + "language_loss": 0.94998306, + "learning_rate": 0.00098560823110091, + "loss": 0.96118045, + "num_input_tokens_seen": 44795824, + "router_z_loss_mlp": 0.32373047, + "step": 542, + "time_per_iteration": 2.661374807357788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118044, + "balance_loss_mlp": 1.08757377, + "epoch": 0.10446325509811466, + "flos": 485331153408.0, + "grad_norm": 0.0792045331206184, + "language_loss": 0.95517921, + "learning_rate": 0.000985533927891851, + "loss": 0.96635967, + "num_input_tokens_seen": 44868496, + "router_z_loss_mlp": 0.30419922, + "step": 543, + "time_per_iteration": 2.7264697551727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096256, + "balance_loss_mlp": 1.06502366, + "epoch": 0.10465563678337822, + "flos": 568365590016.0, + "grad_norm": 0.0919664039836503, + "language_loss": 0.93718112, + "learning_rate": 0.0009854594361818044, + "loss": 0.94814372, + "num_input_tokens_seen": 44939888, + "router_z_loss_mlp": 0.31201172, + "step": 544, + "time_per_iteration": 2.6869821548461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099422, + "balance_loss_mlp": 1.0683322, + "epoch": 0.10484801846864178, + "flos": 625806853632.0, + "grad_norm": 0.1054615502202609, + "language_loss": 0.927598, + "learning_rate": 0.0009853847559996897, + "loss": 0.9385922, + "num_input_tokens_seen": 45012720, + "router_z_loss_mlp": 0.31103516, + "step": 545, + "time_per_iteration": 2.7953526973724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100313, + "balance_loss_mlp": 1.06772113, + "epoch": 0.10504040015390535, + "flos": 743063874048.0, + "grad_norm": 0.0768702593450629, + "language_loss": 0.92008656, + "learning_rate": 0.0009853098873745, + "loss": 0.93108964, + "num_input_tokens_seen": 45093744, + "router_z_loss_mlp": 0.32592773, + "step": 546, + "time_per_iteration": 3.0344293117523193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106321, + "balance_loss_mlp": 1.07430172, + "epoch": 0.10523278183916891, + "flos": 586382616576.0, + "grad_norm": 0.072035501246702, + "language_loss": 0.90983582, + "learning_rate": 0.0009852348303353027, + "loss": 0.92089903, + "num_input_tokens_seen": 45172784, + "router_z_loss_mlp": 0.32006836, + "step": 547, + "time_per_iteration": 2.7647972106933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110403, + "balance_loss_mlp": 1.07100892, + "epoch": 0.10542516352443247, + "flos": 869270552064.0, + "grad_norm": 0.07817580313906373, + "language_loss": 0.84611928, + "learning_rate": 0.000985159584911237, + "loss": 0.85715961, + "num_input_tokens_seen": 45255600, + "router_z_loss_mlp": 0.33007812, + "step": 548, + "time_per_iteration": 3.143122434616089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104478, + "balance_loss_mlp": 1.07212472, + "epoch": 0.10561754520969603, + "flos": 505182970368.0, + "grad_norm": 0.08898596974063745, + "language_loss": 0.91126573, + "learning_rate": 0.0009850841511315162, + "loss": 0.92231047, + "num_input_tokens_seen": 45325072, + "router_z_loss_mlp": 0.32348633, + "step": 549, + "time_per_iteration": 2.6164846420288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112982, + "balance_loss_mlp": 1.07946038, + "epoch": 0.1058099268949596, + "flos": 559690947072.0, + "grad_norm": 0.06224197989448247, + "language_loss": 0.92054999, + "learning_rate": 0.0009850085290254256, + "loss": 0.93167984, + "num_input_tokens_seen": 45401440, + "router_z_loss_mlp": 0.33520508, + "step": 550, + "time_per_iteration": 2.7473480701446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110676, + "balance_loss_mlp": 1.07431078, + "epoch": 0.10600230858022316, + "flos": 561773048832.0, + "grad_norm": 0.05678957528127819, + "language_loss": 0.88957977, + "learning_rate": 0.0009849327186223246, + "loss": 0.90064728, + "num_input_tokens_seen": 45479264, + "router_z_loss_mlp": 0.32446289, + "step": 551, + "time_per_iteration": 2.805126905441284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094878, + "balance_loss_mlp": 1.06297779, + "epoch": 0.10619469026548672, + "flos": 494079989760.0, + "grad_norm": 0.07906939671673464, + "language_loss": 0.95596325, + "learning_rate": 0.000984856719951646, + "loss": 0.96691203, + "num_input_tokens_seen": 45547328, + "router_z_loss_mlp": 0.31860352, + "step": 552, + "time_per_iteration": 2.5688273906707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105536, + "balance_loss_mlp": 1.07370734, + "epoch": 0.10638707195075028, + "flos": 675843678720.0, + "grad_norm": 0.06469368191660979, + "language_loss": 0.93170857, + "learning_rate": 0.0009847805330428943, + "loss": 0.94276392, + "num_input_tokens_seen": 45631152, + "router_z_loss_mlp": 0.31811523, + "step": 553, + "time_per_iteration": 2.8858227729797363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105116, + "balance_loss_mlp": 1.07080746, + "epoch": 0.10657945363601386, + "flos": 487811925504.0, + "grad_norm": 0.07365688544553677, + "language_loss": 0.94454086, + "learning_rate": 0.0009847041579256481, + "loss": 0.95559192, + "num_input_tokens_seen": 45698208, + "router_z_loss_mlp": 0.34326172, + "step": 554, + "time_per_iteration": 2.5912039279937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114154, + "balance_loss_mlp": 1.08158636, + "epoch": 0.10677183532127742, + "flos": 482706376704.0, + "grad_norm": 0.06731486395760358, + "language_loss": 0.95310724, + "learning_rate": 0.0009846275946295592, + "loss": 0.96424878, + "num_input_tokens_seen": 45766640, + "router_z_loss_mlp": 0.32568359, + "step": 555, + "time_per_iteration": 2.6071619987487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120557, + "balance_loss_mlp": 1.08755958, + "epoch": 0.10696421700654098, + "flos": 655917668352.0, + "grad_norm": 0.06239681935918944, + "language_loss": 0.88169777, + "learning_rate": 0.0009845508431843518, + "loss": 0.89290333, + "num_input_tokens_seen": 45851408, + "router_z_loss_mlp": 0.32983398, + "step": 556, + "time_per_iteration": 2.9906973838806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122479, + "balance_loss_mlp": 1.08986306, + "epoch": 0.10715659869180454, + "flos": 567483881472.0, + "grad_norm": 0.06803394611182671, + "language_loss": 0.89010829, + "learning_rate": 0.0009844739036198233, + "loss": 0.90133309, + "num_input_tokens_seen": 45919824, + "router_z_loss_mlp": 0.32592773, + "step": 557, + "time_per_iteration": 2.6462793350219727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113246, + "balance_loss_mlp": 1.09927225, + "epoch": 0.10734898037706811, + "flos": 540432829440.0, + "grad_norm": 0.0683091886411484, + "language_loss": 0.96000761, + "learning_rate": 0.0009843967759658448, + "loss": 0.97133219, + "num_input_tokens_seen": 45991024, + "router_z_loss_mlp": 0.33203125, + "step": 558, + "time_per_iteration": 2.664320707321167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01369087, + "balance_loss_mlp": 1.3546865, + "epoch": 0.10754136206233167, + "flos": 1475870008320.0, + "grad_norm": 0.12144998025248735, + "language_loss": 0.72767758, + "learning_rate": 0.0009843194602523592, + "loss": 0.74136841, + "num_input_tokens_seen": 46212736, + "router_z_loss_mlp": 0.14355469, + "step": 559, + "time_per_iteration": 4.836310148239136 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124853, + "balance_loss_mlp": 1.0925231, + "epoch": 0.10773374374759523, + "flos": 512155243008.0, + "grad_norm": 0.06725764235558847, + "language_loss": 0.96045369, + "learning_rate": 0.000984241956509384, + "loss": 0.97170222, + "num_input_tokens_seen": 46283920, + "router_z_loss_mlp": 0.32324219, + "step": 560, + "time_per_iteration": 2.7409372329711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134795, + "balance_loss_mlp": 1.10005689, + "epoch": 0.10792612543285879, + "flos": 496261016064.0, + "grad_norm": 0.08502468521942065, + "language_loss": 0.91520619, + "learning_rate": 0.0009841642647670078, + "loss": 0.92655414, + "num_input_tokens_seen": 46349664, + "router_z_loss_mlp": 0.34741211, + "step": 561, + "time_per_iteration": 2.5360167026519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134435, + "balance_loss_mlp": 1.10050821, + "epoch": 0.10811850711812235, + "flos": 735131317248.0, + "grad_norm": 0.08550854990342285, + "language_loss": 0.86122006, + "learning_rate": 0.0009840863850553944, + "loss": 0.87256444, + "num_input_tokens_seen": 46432688, + "router_z_loss_mlp": 0.33911133, + "step": 562, + "time_per_iteration": 3.0013930797576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118751, + "balance_loss_mlp": 1.08604038, + "epoch": 0.10831088880338592, + "flos": 611257024512.0, + "grad_norm": 0.07414056330929218, + "language_loss": 0.92513216, + "learning_rate": 0.0009840083174047782, + "loss": 0.93631971, + "num_input_tokens_seen": 46507216, + "router_z_loss_mlp": 0.3269043, + "step": 563, + "time_per_iteration": 2.761746883392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125103, + "balance_loss_mlp": 1.09353685, + "epoch": 0.10850327048864948, + "flos": 556022928384.0, + "grad_norm": 0.06849160846851732, + "language_loss": 0.86520386, + "learning_rate": 0.0009839300618454685, + "loss": 0.87645483, + "num_input_tokens_seen": 46590464, + "router_z_loss_mlp": 0.31518555, + "step": 564, + "time_per_iteration": 2.833545684814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124691, + "balance_loss_mlp": 1.09291005, + "epoch": 0.10869565217391304, + "flos": 602902476288.0, + "grad_norm": 0.06688991061359367, + "language_loss": 0.92471159, + "learning_rate": 0.0009838516184078466, + "loss": 0.9359585, + "num_input_tokens_seen": 46666240, + "router_z_loss_mlp": 0.31762695, + "step": 565, + "time_per_iteration": 2.838482618331909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112559, + "balance_loss_mlp": 1.09345102, + "epoch": 0.1088880338591766, + "flos": 525922288128.0, + "grad_norm": 0.08266802783800845, + "language_loss": 0.89073956, + "learning_rate": 0.0009837729871223669, + "loss": 0.90199542, + "num_input_tokens_seen": 46734288, + "router_z_loss_mlp": 0.3215332, + "step": 566, + "time_per_iteration": 2.6670589447021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134729, + "balance_loss_mlp": 1.10073042, + "epoch": 0.10908041554444017, + "flos": 619986921984.0, + "grad_norm": 0.06816497946354988, + "language_loss": 0.89503658, + "learning_rate": 0.0009836941680195568, + "loss": 0.90638387, + "num_input_tokens_seen": 46809920, + "router_z_loss_mlp": 0.34033203, + "step": 567, + "time_per_iteration": 2.7894582748413086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131677, + "balance_loss_mlp": 1.09691525, + "epoch": 0.10927279722970373, + "flos": 897740195328.0, + "grad_norm": 0.07371226629870802, + "language_loss": 0.8534497, + "learning_rate": 0.0009836151611300166, + "loss": 0.86476642, + "num_input_tokens_seen": 46889984, + "router_z_loss_mlp": 0.34765625, + "step": 568, + "time_per_iteration": 3.204500913619995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116051, + "balance_loss_mlp": 1.08467555, + "epoch": 0.10946517891496729, + "flos": 528408852480.0, + "grad_norm": 0.061952855977424344, + "language_loss": 0.96103537, + "learning_rate": 0.0009835359664844194, + "loss": 0.97219586, + "num_input_tokens_seen": 46959536, + "router_z_loss_mlp": 0.3137207, + "step": 569, + "time_per_iteration": 2.6154720783233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124163, + "balance_loss_mlp": 1.11014414, + "epoch": 0.10965756060023085, + "flos": 1559944714752.0, + "grad_norm": 0.03358522647050957, + "language_loss": 0.81036806, + "learning_rate": 0.0009834565841135114, + "loss": 0.82160974, + "num_input_tokens_seen": 47196960, + "router_z_loss_mlp": 0.140625, + "step": 570, + "time_per_iteration": 4.907090187072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112487, + "balance_loss_mlp": 1.09406638, + "epoch": 0.10984994228549443, + "flos": 512820163584.0, + "grad_norm": 0.08674533322611513, + "language_loss": 0.9339065, + "learning_rate": 0.0009833770140481118, + "loss": 0.9451552, + "num_input_tokens_seen": 47266560, + "router_z_loss_mlp": 0.30786133, + "step": 571, + "time_per_iteration": 2.694821357727051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121358, + "balance_loss_mlp": 1.09072113, + "epoch": 0.11004232397075799, + "flos": 954314307072.0, + "grad_norm": 0.07582699316256973, + "language_loss": 0.84126109, + "learning_rate": 0.000983297256319112, + "loss": 0.85247469, + "num_input_tokens_seen": 47348512, + "router_z_loss_mlp": 0.30664062, + "step": 572, + "time_per_iteration": 3.208728313446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144326, + "balance_loss_mlp": 1.11097169, + "epoch": 0.11023470565602154, + "flos": 487921024512.0, + "grad_norm": 0.07530566153242002, + "language_loss": 0.8789041, + "learning_rate": 0.000983217310957477, + "loss": 0.89034736, + "num_input_tokens_seen": 47425392, + "router_z_loss_mlp": 0.33349609, + "step": 573, + "time_per_iteration": 2.7521331310272217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113474, + "balance_loss_mlp": 1.1014812, + "epoch": 0.1104270873412851, + "flos": 655521970176.0, + "grad_norm": 0.08427122985019045, + "language_loss": 0.91161472, + "learning_rate": 0.000983137177994244, + "loss": 0.92296207, + "num_input_tokens_seen": 47502336, + "router_z_loss_mlp": 0.33300781, + "step": 574, + "time_per_iteration": 2.869795083999634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105984, + "balance_loss_mlp": 1.0752039, + "epoch": 0.11061946902654868, + "flos": 723097165824.0, + "grad_norm": 0.0803000190442887, + "language_loss": 0.87202144, + "learning_rate": 0.0009830568574605235, + "loss": 0.88308132, + "num_input_tokens_seen": 47583552, + "router_z_loss_mlp": 0.30737305, + "step": 575, + "time_per_iteration": 2.952505111694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111674, + "balance_loss_mlp": 1.07963109, + "epoch": 0.11081185071181224, + "flos": 835113397248.0, + "grad_norm": 0.07764025760375837, + "language_loss": 0.89234924, + "learning_rate": 0.0009829763493874992, + "loss": 0.90346599, + "num_input_tokens_seen": 47663440, + "router_z_loss_mlp": 0.3203125, + "step": 576, + "time_per_iteration": 3.0367727279663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110641, + "balance_loss_mlp": 1.07508206, + "epoch": 0.1110042323970758, + "flos": 608776252416.0, + "grad_norm": 0.06795308301133055, + "language_loss": 0.94366598, + "learning_rate": 0.0009828956538064264, + "loss": 0.95473009, + "num_input_tokens_seen": 47741920, + "router_z_loss_mlp": 0.31347656, + "step": 577, + "time_per_iteration": 2.783268928527832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091394, + "balance_loss_mlp": 1.0610671, + "epoch": 0.11119661408233936, + "flos": 595643604480.0, + "grad_norm": 0.0662915232098912, + "language_loss": 0.9183138, + "learning_rate": 0.0009828147707486344, + "loss": 0.92922771, + "num_input_tokens_seen": 47815136, + "router_z_loss_mlp": 0.30297852, + "step": 578, + "time_per_iteration": 2.6628670692443848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092993, + "balance_loss_mlp": 1.06109214, + "epoch": 0.11138899576760293, + "flos": 555573385728.0, + "grad_norm": 0.07355059798421615, + "language_loss": 0.87444091, + "learning_rate": 0.0009827337002455245, + "loss": 0.88537085, + "num_input_tokens_seen": 47881360, + "router_z_loss_mlp": 0.31884766, + "step": 579, + "time_per_iteration": 2.616842031478882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087398, + "balance_loss_mlp": 1.05857313, + "epoch": 0.11158137745286649, + "flos": 689418667008.0, + "grad_norm": 0.05531737995895799, + "language_loss": 0.89474124, + "learning_rate": 0.0009826524423285712, + "loss": 0.90561521, + "num_input_tokens_seen": 47962720, + "router_z_loss_mlp": 0.28808594, + "step": 580, + "time_per_iteration": 2.896409749984741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093471, + "balance_loss_mlp": 1.06393051, + "epoch": 0.11177375913813005, + "flos": 762688728576.0, + "grad_norm": 0.06807232662928764, + "language_loss": 0.9046967, + "learning_rate": 0.0009825709970293218, + "loss": 0.91563141, + "num_input_tokens_seen": 48035472, + "router_z_loss_mlp": 0.2956543, + "step": 581, + "time_per_iteration": 2.8843319416046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096208, + "balance_loss_mlp": 1.0669775, + "epoch": 0.11196614082339361, + "flos": 806211588096.0, + "grad_norm": 0.07053725402235117, + "language_loss": 0.96166003, + "learning_rate": 0.0009824893643793956, + "loss": 0.9726221, + "num_input_tokens_seen": 48116944, + "router_z_loss_mlp": 0.29248047, + "step": 582, + "time_per_iteration": 3.04577898979187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104715, + "balance_loss_mlp": 1.07305288, + "epoch": 0.11215852250865718, + "flos": 558350931456.0, + "grad_norm": 0.10752491555358674, + "language_loss": 0.89033759, + "learning_rate": 0.0009824075444104857, + "loss": 0.90138471, + "num_input_tokens_seen": 48187808, + "router_z_loss_mlp": 0.31689453, + "step": 583, + "time_per_iteration": 2.682020902633667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125233, + "balance_loss_mlp": 1.09497714, + "epoch": 0.11235090419392074, + "flos": 513322140672.0, + "grad_norm": 0.06606619546840543, + "language_loss": 0.94941097, + "learning_rate": 0.000982325537154357, + "loss": 0.9606632, + "num_input_tokens_seen": 48254464, + "router_z_loss_mlp": 0.30224609, + "step": 584, + "time_per_iteration": 2.577632427215576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122311, + "balance_loss_mlp": 1.09045827, + "epoch": 0.1125432858791843, + "flos": 491209311744.0, + "grad_norm": 0.07452844115700766, + "language_loss": 0.95190644, + "learning_rate": 0.0009822433426428484, + "loss": 0.96312958, + "num_input_tokens_seen": 48318784, + "router_z_loss_mlp": 0.31860352, + "step": 585, + "time_per_iteration": 2.560591220855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126565, + "balance_loss_mlp": 1.09280539, + "epoch": 0.11273566756444786, + "flos": 510476193792.0, + "grad_norm": 0.11434848401200806, + "language_loss": 0.87964213, + "learning_rate": 0.0009821609609078697, + "loss": 0.89090776, + "num_input_tokens_seen": 48389248, + "router_z_loss_mlp": 0.3371582, + "step": 586, + "time_per_iteration": 2.633925437927246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118464, + "balance_loss_mlp": 1.08785152, + "epoch": 0.11292804924971142, + "flos": 622149009408.0, + "grad_norm": 0.08000190427267627, + "language_loss": 0.905334, + "learning_rate": 0.0009820783919814045, + "loss": 0.91651857, + "num_input_tokens_seen": 48463312, + "router_z_loss_mlp": 0.3059082, + "step": 587, + "time_per_iteration": 2.806704044342041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111289, + "balance_loss_mlp": 1.07857847, + "epoch": 0.113120430934975, + "flos": 477811823616.0, + "grad_norm": 0.09357252991594707, + "language_loss": 0.83955467, + "learning_rate": 0.0009819956358955095, + "loss": 0.8506676, + "num_input_tokens_seen": 48531856, + "router_z_loss_mlp": 0.32714844, + "step": 588, + "time_per_iteration": 2.5903711318969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109097, + "balance_loss_mlp": 1.07455039, + "epoch": 0.11331281262023855, + "flos": 466801975296.0, + "grad_norm": 0.06610764616840299, + "language_loss": 0.85348701, + "learning_rate": 0.0009819126926823127, + "loss": 0.86457801, + "num_input_tokens_seen": 48596640, + "router_z_loss_mlp": 0.34570312, + "step": 589, + "time_per_iteration": 2.5726494789123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108183, + "balance_loss_mlp": 1.07535291, + "epoch": 0.11350519430550211, + "flos": 650164727808.0, + "grad_norm": 0.06035980490561805, + "language_loss": 0.87806922, + "learning_rate": 0.000981829562374016, + "loss": 0.8891511, + "num_input_tokens_seen": 48669648, + "router_z_loss_mlp": 0.328125, + "step": 590, + "time_per_iteration": 2.7960643768310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112987, + "balance_loss_mlp": 1.08041859, + "epoch": 0.11369757599076567, + "flos": 557547798528.0, + "grad_norm": 0.08830164474684658, + "language_loss": 0.98550045, + "learning_rate": 0.0009817462450028933, + "loss": 0.99663031, + "num_input_tokens_seen": 48737392, + "router_z_loss_mlp": 0.32568359, + "step": 591, + "time_per_iteration": 2.654860734939575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107596, + "balance_loss_mlp": 1.07526684, + "epoch": 0.11388995767602925, + "flos": 570774988800.0, + "grad_norm": 0.06245390963608315, + "language_loss": 0.86587834, + "learning_rate": 0.0009816627406012916, + "loss": 0.87695432, + "num_input_tokens_seen": 48817136, + "router_z_loss_mlp": 0.32348633, + "step": 592, + "time_per_iteration": 2.8017733097076416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101808, + "balance_loss_mlp": 1.07074225, + "epoch": 0.1140823393612928, + "flos": 740069540352.0, + "grad_norm": 0.06581053360364857, + "language_loss": 0.8595314, + "learning_rate": 0.0009815790492016295, + "loss": 0.87054944, + "num_input_tokens_seen": 48895808, + "router_z_loss_mlp": 0.31030273, + "step": 593, + "time_per_iteration": 2.9602174758911133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097875, + "balance_loss_mlp": 1.06666636, + "epoch": 0.11427472104655637, + "flos": 698694211584.0, + "grad_norm": 0.07124053574400792, + "language_loss": 0.87982339, + "learning_rate": 0.0009814951708363993, + "loss": 0.89080215, + "num_input_tokens_seen": 48967456, + "router_z_loss_mlp": 0.31201172, + "step": 594, + "time_per_iteration": 2.818460702896118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167391, + "balance_loss_mlp": 1.15413451, + "epoch": 0.11446710273181993, + "flos": 1476387952128.0, + "grad_norm": 0.04038129773095179, + "language_loss": 0.77990985, + "learning_rate": 0.0009814111055381654, + "loss": 0.79158378, + "num_input_tokens_seen": 49193152, + "router_z_loss_mlp": 0.1328125, + "step": 595, + "time_per_iteration": 4.776912450790405 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110386, + "balance_loss_mlp": 1.07250798, + "epoch": 0.1146594844170835, + "flos": 494641603584.0, + "grad_norm": 0.1404346857169784, + "language_loss": 0.89489102, + "learning_rate": 0.0009813268533395648, + "loss": 0.90592968, + "num_input_tokens_seen": 49260960, + "router_z_loss_mlp": 0.3137207, + "step": 596, + "time_per_iteration": 2.562816858291626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115324, + "balance_loss_mlp": 1.08344746, + "epoch": 0.11485186610234706, + "flos": 474596319744.0, + "grad_norm": 0.07456374098915484, + "language_loss": 0.89145029, + "learning_rate": 0.0009812424142733073, + "loss": 0.90260351, + "num_input_tokens_seen": 49327616, + "router_z_loss_mlp": 0.31884766, + "step": 597, + "time_per_iteration": 2.5198655128479004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123971, + "balance_loss_mlp": 1.0946219, + "epoch": 0.11504424778761062, + "flos": 730858014720.0, + "grad_norm": 0.05033183127205697, + "language_loss": 0.86898923, + "learning_rate": 0.000981157788372175, + "loss": 0.88022888, + "num_input_tokens_seen": 49412864, + "router_z_loss_mlp": 0.29345703, + "step": 598, + "time_per_iteration": 3.004558563232422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155063, + "balance_loss_mlp": 1.12290049, + "epoch": 0.11523662947287418, + "flos": 545539788288.0, + "grad_norm": 0.07554757352201513, + "language_loss": 0.90216064, + "learning_rate": 0.0009810729756690223, + "loss": 0.91371131, + "num_input_tokens_seen": 49483584, + "router_z_loss_mlp": 0.3215332, + "step": 599, + "time_per_iteration": 2.7165520191192627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149643, + "balance_loss_mlp": 1.11790919, + "epoch": 0.11542901115813775, + "flos": 774737436672.0, + "grad_norm": 0.08801397326806587, + "language_loss": 0.92855275, + "learning_rate": 0.0009809879761967766, + "loss": 0.94004917, + "num_input_tokens_seen": 49563568, + "router_z_loss_mlp": 0.31738281, + "step": 600, + "time_per_iteration": 2.9548492431640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115619, + "balance_loss_mlp": 1.12004542, + "epoch": 0.11562139284340131, + "flos": 730585972224.0, + "grad_norm": 0.08285308963026158, + "language_loss": 0.87716347, + "learning_rate": 0.0009809027899884378, + "loss": 0.8887254, + "num_input_tokens_seen": 49640800, + "router_z_loss_mlp": 0.36157227, + "step": 601, + "time_per_iteration": 2.9107346534729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131924, + "balance_loss_mlp": 1.10085821, + "epoch": 0.11581377452866487, + "flos": 535589148672.0, + "grad_norm": 0.07059046613839054, + "language_loss": 0.89834028, + "learning_rate": 0.0009808174170770779, + "loss": 0.90965956, + "num_input_tokens_seen": 49721872, + "router_z_loss_mlp": 0.31079102, + "step": 602, + "time_per_iteration": 2.79127836227417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217718, + "balance_loss_mlp": 1.20541608, + "epoch": 0.11600615621392843, + "flos": 1554968613888.0, + "grad_norm": 0.07528653751738872, + "language_loss": 0.84898245, + "learning_rate": 0.0009807318574958418, + "loss": 0.86115962, + "num_input_tokens_seen": 49951472, + "router_z_loss_mlp": 0.12304688, + "step": 603, + "time_per_iteration": 4.862261772155762 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103115, + "balance_loss_mlp": 1.07238269, + "epoch": 0.116198537899192, + "flos": 537178037760.0, + "grad_norm": 0.08106568577848162, + "language_loss": 0.94434869, + "learning_rate": 0.0009806461112779462, + "loss": 0.95537978, + "num_input_tokens_seen": 50021136, + "router_z_loss_mlp": 0.30737305, + "step": 604, + "time_per_iteration": 2.600008249282837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097342, + "balance_loss_mlp": 1.06427336, + "epoch": 0.11639091958445556, + "flos": 453970483200.0, + "grad_norm": 0.09761910402267754, + "language_loss": 0.89590895, + "learning_rate": 0.0009805601784566814, + "loss": 0.90688241, + "num_input_tokens_seen": 50083888, + "router_z_loss_mlp": 0.33056641, + "step": 605, + "time_per_iteration": 2.4687013626098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097807, + "balance_loss_mlp": 1.06635928, + "epoch": 0.11658330126971912, + "flos": 554815332864.0, + "grad_norm": 0.0628453025897625, + "language_loss": 0.96235836, + "learning_rate": 0.0009804740590654089, + "loss": 0.97333646, + "num_input_tokens_seen": 50151744, + "router_z_loss_mlp": 0.31469727, + "step": 606, + "time_per_iteration": 2.654134750366211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109435, + "balance_loss_mlp": 1.0789417, + "epoch": 0.11677568295498268, + "flos": 716025968640.0, + "grad_norm": 0.07837472156111998, + "language_loss": 0.90884066, + "learning_rate": 0.0009803877531375635, + "loss": 0.91993499, + "num_input_tokens_seen": 50221248, + "router_z_loss_mlp": 0.30493164, + "step": 607, + "time_per_iteration": 2.825778007507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112529, + "balance_loss_mlp": 1.08074808, + "epoch": 0.11696806464024626, + "flos": 609474668544.0, + "grad_norm": 0.07263848878870109, + "language_loss": 0.91923869, + "learning_rate": 0.0009803012607066523, + "loss": 0.93036401, + "num_input_tokens_seen": 50293792, + "router_z_loss_mlp": 0.31787109, + "step": 608, + "time_per_iteration": 2.721005916595459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101062, + "balance_loss_mlp": 1.06980491, + "epoch": 0.11716044632550981, + "flos": 520127087616.0, + "grad_norm": 0.06980646294906427, + "language_loss": 0.9077962, + "learning_rate": 0.0009802145818062543, + "loss": 0.91880679, + "num_input_tokens_seen": 50367760, + "router_z_loss_mlp": 0.31225586, + "step": 609, + "time_per_iteration": 2.707643985748291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102401, + "balance_loss_mlp": 1.07035792, + "epoch": 0.11735282801077337, + "flos": 507246133248.0, + "grad_norm": 0.07162886221417876, + "language_loss": 0.9293434, + "learning_rate": 0.0009801277164700212, + "loss": 0.9403674, + "num_input_tokens_seen": 50435664, + "router_z_loss_mlp": 0.3203125, + "step": 610, + "time_per_iteration": 2.6389639377593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094537, + "balance_loss_mlp": 1.06323278, + "epoch": 0.11754520969603693, + "flos": 686339965440.0, + "grad_norm": 0.07220465483683103, + "language_loss": 0.90727574, + "learning_rate": 0.0009800406647316776, + "loss": 0.91822106, + "num_input_tokens_seen": 50514144, + "router_z_loss_mlp": 0.31274414, + "step": 611, + "time_per_iteration": 2.8033382892608643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066854, + "balance_loss_mlp": 1.05369329, + "epoch": 0.1177375913813005, + "flos": 1541673022464.0, + "grad_norm": 0.030783707978337852, + "language_loss": 0.76914459, + "learning_rate": 0.0009799534266250196, + "loss": 0.77981311, + "num_input_tokens_seen": 50738448, + "router_z_loss_mlp": 0.13183594, + "step": 612, + "time_per_iteration": 4.777275562286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116404, + "balance_loss_mlp": 1.08307314, + "epoch": 0.11792997306656407, + "flos": 520269682176.0, + "grad_norm": 0.07589987368124408, + "language_loss": 0.8961159, + "learning_rate": 0.000979866002183916, + "loss": 0.90727997, + "num_input_tokens_seen": 50809328, + "router_z_loss_mlp": 0.33325195, + "step": 613, + "time_per_iteration": 2.6848883628845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109453, + "balance_loss_mlp": 1.07719529, + "epoch": 0.11812235475182763, + "flos": 665980379136.0, + "grad_norm": 0.08667718058784188, + "language_loss": 0.91197205, + "learning_rate": 0.0009797783914423082, + "loss": 0.92306662, + "num_input_tokens_seen": 50887728, + "router_z_loss_mlp": 0.32250977, + "step": 614, + "time_per_iteration": 2.832414388656616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105189, + "balance_loss_mlp": 1.07140493, + "epoch": 0.11831473643709119, + "flos": 621021399552.0, + "grad_norm": 0.06050640051516142, + "language_loss": 0.85425436, + "learning_rate": 0.0009796905944342094, + "loss": 0.86530626, + "num_input_tokens_seen": 50966160, + "router_z_loss_mlp": 0.33813477, + "step": 615, + "time_per_iteration": 2.8220455646514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112849, + "balance_loss_mlp": 1.07913685, + "epoch": 0.11850711812235475, + "flos": 456438108672.0, + "grad_norm": 0.0714748534502384, + "language_loss": 0.893188, + "learning_rate": 0.0009796026111937057, + "loss": 0.90431643, + "num_input_tokens_seen": 51035712, + "router_z_loss_mlp": 0.3371582, + "step": 616, + "time_per_iteration": 2.590566873550415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102524, + "balance_loss_mlp": 1.07005119, + "epoch": 0.11869949980761832, + "flos": 513598565376.0, + "grad_norm": 0.06492309219220607, + "language_loss": 0.89778733, + "learning_rate": 0.0009795144417549552, + "loss": 0.90881252, + "num_input_tokens_seen": 51108656, + "router_z_loss_mlp": 0.32470703, + "step": 617, + "time_per_iteration": 2.672914505004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109626, + "balance_loss_mlp": 1.0773685, + "epoch": 0.11889188149288188, + "flos": 534732171264.0, + "grad_norm": 0.057544425945024125, + "language_loss": 0.90660846, + "learning_rate": 0.0009794260861521883, + "loss": 0.9177047, + "num_input_tokens_seen": 51185552, + "router_z_loss_mlp": 0.32250977, + "step": 618, + "time_per_iteration": 2.817354202270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102516, + "balance_loss_mlp": 1.07009149, + "epoch": 0.11908426317814544, + "flos": 498344527872.0, + "grad_norm": 0.0773697745436404, + "language_loss": 0.87738883, + "learning_rate": 0.0009793375444197075, + "loss": 0.88841403, + "num_input_tokens_seen": 51255808, + "router_z_loss_mlp": 0.32397461, + "step": 619, + "time_per_iteration": 2.607475996017456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011109, + "balance_loss_mlp": 1.07697332, + "epoch": 0.119276644863409, + "flos": 659598833664.0, + "grad_norm": 0.06767977381214116, + "language_loss": 0.86337721, + "learning_rate": 0.000979248816591888, + "loss": 0.87448615, + "num_input_tokens_seen": 51329408, + "router_z_loss_mlp": 0.33935547, + "step": 620, + "time_per_iteration": 2.758866548538208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098794, + "balance_loss_mlp": 1.06667948, + "epoch": 0.11946902654867257, + "flos": 758396487168.0, + "grad_norm": 0.06819106164994826, + "language_loss": 0.87032986, + "learning_rate": 0.0009791599027031766, + "loss": 0.88131785, + "num_input_tokens_seen": 51408784, + "router_z_loss_mlp": 0.32128906, + "step": 621, + "time_per_iteration": 3.029431104660034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088156, + "balance_loss_mlp": 1.05611241, + "epoch": 0.11966140823393613, + "flos": 680697533952.0, + "grad_norm": 0.0732554324646167, + "language_loss": 0.87112588, + "learning_rate": 0.0009790708027880932, + "loss": 0.88200748, + "num_input_tokens_seen": 51482592, + "router_z_loss_mlp": 0.32055664, + "step": 622, + "time_per_iteration": 2.855576992034912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056461, + "balance_loss_mlp": 1.04444504, + "epoch": 0.11985378991919969, + "flos": 1450268070912.0, + "grad_norm": 0.03732324883573809, + "language_loss": 0.77427292, + "learning_rate": 0.0009789815168812293, + "loss": 0.78483754, + "num_input_tokens_seen": 51712240, + "router_z_loss_mlp": 0.12011719, + "step": 623, + "time_per_iteration": 4.840993165969849 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108671, + "balance_loss_mlp": 1.0551914, + "epoch": 0.12004617160446325, + "flos": 527586780672.0, + "grad_norm": 0.07309096746678648, + "language_loss": 0.94236648, + "learning_rate": 0.0009788920450172487, + "loss": 0.9532336, + "num_input_tokens_seen": 51781440, + "router_z_loss_mlp": 0.31518555, + "step": 624, + "time_per_iteration": 2.6301677227020264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102663, + "balance_loss_mlp": 1.07023823, + "epoch": 0.12023855328972682, + "flos": 473980861440.0, + "grad_norm": 0.15739190650861204, + "language_loss": 0.91515559, + "learning_rate": 0.0009788023872308875, + "loss": 0.92618221, + "num_input_tokens_seen": 51845424, + "router_z_loss_mlp": 0.32421875, + "step": 625, + "time_per_iteration": 2.506446361541748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014454, + "balance_loss_mlp": 1.0033915, + "epoch": 0.12043093497499038, + "flos": 1530954155520.0, + "grad_norm": 0.02216054665264375, + "language_loss": 0.75428998, + "learning_rate": 0.0009787125435569539, + "loss": 0.76443458, + "num_input_tokens_seen": 52076496, + "router_z_loss_mlp": 0.11083984, + "step": 626, + "time_per_iteration": 4.713289260864258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114644, + "balance_loss_mlp": 1.08391225, + "epoch": 0.12062331666025394, + "flos": 539571469824.0, + "grad_norm": 0.0672242080300053, + "language_loss": 0.94766486, + "learning_rate": 0.0009786225140303285, + "loss": 0.95881128, + "num_input_tokens_seen": 52143072, + "router_z_loss_mlp": 0.30761719, + "step": 627, + "time_per_iteration": 2.61875057220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011267, + "balance_loss_mlp": 1.09503818, + "epoch": 0.1208156983455175, + "flos": 511634327040.0, + "grad_norm": 0.06510849521455, + "language_loss": 0.925771, + "learning_rate": 0.0009785322986859634, + "loss": 0.93703806, + "num_input_tokens_seen": 52211888, + "router_z_loss_mlp": 0.31640625, + "step": 628, + "time_per_iteration": 2.6567625999450684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141777, + "balance_loss_mlp": 1.11059177, + "epoch": 0.12100808003078108, + "flos": 596195043840.0, + "grad_norm": 0.06735600063735754, + "language_loss": 0.93719506, + "learning_rate": 0.0009784418975588838, + "loss": 0.94861281, + "num_input_tokens_seen": 52283696, + "router_z_loss_mlp": 0.31152344, + "step": 629, + "time_per_iteration": 2.697376012802124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122983, + "balance_loss_mlp": 1.09222674, + "epoch": 0.12120046171604464, + "flos": 522698019840.0, + "grad_norm": 0.47103484407124013, + "language_loss": 0.93927598, + "learning_rate": 0.0009783513106841862, + "loss": 0.95050573, + "num_input_tokens_seen": 52358624, + "router_z_loss_mlp": 0.30761719, + "step": 630, + "time_per_iteration": 2.7226808071136475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143332, + "balance_loss_mlp": 1.13179243, + "epoch": 0.1213928434013082, + "flos": 1553605277184.0, + "grad_norm": 0.056788624646596834, + "language_loss": 0.76732707, + "learning_rate": 0.00097826053809704, + "loss": 0.77876031, + "num_input_tokens_seen": 52591248, + "router_z_loss_mlp": 0.11523438, + "step": 631, + "time_per_iteration": 4.948111295700073 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228128, + "balance_loss_mlp": 1.19219875, + "epoch": 0.12158522508657175, + "flos": 495143580672.0, + "grad_norm": 0.06834333100250278, + "language_loss": 0.88515621, + "learning_rate": 0.0009781695798326854, + "loss": 0.89743745, + "num_input_tokens_seen": 52659920, + "router_z_loss_mlp": 0.35961914, + "step": 632, + "time_per_iteration": 2.5616555213928223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267845, + "balance_loss_mlp": 1.23050833, + "epoch": 0.12177760677183531, + "flos": 475335433728.0, + "grad_norm": 0.1009303482431908, + "language_loss": 0.88543177, + "learning_rate": 0.0009780784359264365, + "loss": 0.89811015, + "num_input_tokens_seen": 52728832, + "router_z_loss_mlp": 0.37329102, + "step": 633, + "time_per_iteration": 2.597935438156128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01265484, + "balance_loss_mlp": 1.25370574, + "epoch": 0.12196998845709889, + "flos": 1467630351360.0, + "grad_norm": 0.08843071113371018, + "language_loss": 0.74188697, + "learning_rate": 0.0009779871064136778, + "loss": 0.75454181, + "num_input_tokens_seen": 52949776, + "router_z_loss_mlp": 0.11767578, + "step": 634, + "time_per_iteration": 4.768415451049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235432, + "balance_loss_mlp": 1.19976473, + "epoch": 0.12216237014236245, + "flos": 586279309824.0, + "grad_norm": 0.0829698455775257, + "language_loss": 0.88074899, + "learning_rate": 0.000977895591329867, + "loss": 0.89310336, + "num_input_tokens_seen": 53027184, + "router_z_loss_mlp": 0.35668945, + "step": 635, + "time_per_iteration": 2.7918457984924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214994, + "balance_loss_mlp": 1.17720437, + "epoch": 0.12235475182762601, + "flos": 597721324032.0, + "grad_norm": 0.0916527997361875, + "language_loss": 0.87791145, + "learning_rate": 0.000977803890710533, + "loss": 0.89006138, + "num_input_tokens_seen": 53101072, + "router_z_loss_mlp": 0.37792969, + "step": 636, + "time_per_iteration": 2.7248313426971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186705, + "balance_loss_mlp": 1.1509428, + "epoch": 0.12254713351288957, + "flos": 497487550464.0, + "grad_norm": 0.0702522126388857, + "language_loss": 0.93856937, + "learning_rate": 0.0009777120045912774, + "loss": 0.95043641, + "num_input_tokens_seen": 53172992, + "router_z_loss_mlp": 0.35766602, + "step": 637, + "time_per_iteration": 2.6079726219177246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180704, + "balance_loss_mlp": 1.14236617, + "epoch": 0.12273951519815314, + "flos": 605565130752.0, + "grad_norm": 0.06645311005239844, + "language_loss": 0.90599251, + "learning_rate": 0.0009776199330077736, + "loss": 0.91779959, + "num_input_tokens_seen": 53248256, + "router_z_loss_mlp": 0.38330078, + "step": 638, + "time_per_iteration": 2.7671282291412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196025, + "balance_loss_mlp": 1.15940344, + "epoch": 0.1229318968834167, + "flos": 597578729472.0, + "grad_norm": 0.09015200479441979, + "language_loss": 0.93140519, + "learning_rate": 0.0009775276759957667, + "loss": 0.94336545, + "num_input_tokens_seen": 53318960, + "router_z_loss_mlp": 0.36621094, + "step": 639, + "time_per_iteration": 2.6990442276000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179898, + "balance_loss_mlp": 1.14265716, + "epoch": 0.12312427856868026, + "flos": 678082931712.0, + "grad_norm": 0.08188642922116089, + "language_loss": 0.90714514, + "learning_rate": 0.0009774352335910745, + "loss": 0.91894412, + "num_input_tokens_seen": 53389120, + "router_z_loss_mlp": 0.37280273, + "step": 640, + "time_per_iteration": 2.7950265407562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115004, + "balance_loss_mlp": 1.11658967, + "epoch": 0.12331666025394382, + "flos": 608656978944.0, + "grad_norm": 0.07361380744806716, + "language_loss": 0.95549798, + "learning_rate": 0.000977342605829586, + "loss": 0.96699834, + "num_input_tokens_seen": 53459056, + "router_z_loss_mlp": 0.3347168, + "step": 641, + "time_per_iteration": 2.6966538429260254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140018, + "balance_loss_mlp": 1.10497069, + "epoch": 0.12350904193920739, + "flos": 762172194816.0, + "grad_norm": 0.08211004604029591, + "language_loss": 0.86708105, + "learning_rate": 0.0009772497927472623, + "loss": 0.87848121, + "num_input_tokens_seen": 53541552, + "router_z_loss_mlp": 0.35083008, + "step": 642, + "time_per_iteration": 3.050595998764038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121507, + "balance_loss_mlp": 1.0852437, + "epoch": 0.12370142362447095, + "flos": 540699079680.0, + "grad_norm": 0.0716743258864478, + "language_loss": 0.85363436, + "learning_rate": 0.0009771567943801368, + "loss": 0.86484945, + "num_input_tokens_seen": 53611520, + "router_z_loss_mlp": 0.36254883, + "step": 643, + "time_per_iteration": 2.627019166946411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112067, + "balance_loss_mlp": 1.07744884, + "epoch": 0.12389380530973451, + "flos": 547848852480.0, + "grad_norm": 0.06992166814052157, + "language_loss": 0.89936745, + "learning_rate": 0.0009770636107643152, + "loss": 0.91048813, + "num_input_tokens_seen": 53683888, + "router_z_loss_mlp": 0.34643555, + "step": 644, + "time_per_iteration": 2.696233034133911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102963, + "balance_loss_mlp": 1.06846356, + "epoch": 0.12408618699499807, + "flos": 540048715776.0, + "grad_norm": 0.06268128655507912, + "language_loss": 0.88181639, + "learning_rate": 0.0009769702419359738, + "loss": 0.89284605, + "num_input_tokens_seen": 53751888, + "router_z_loss_mlp": 0.3449707, + "step": 645, + "time_per_iteration": 2.61401104927063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116268, + "balance_loss_mlp": 1.0810535, + "epoch": 0.12427856868026164, + "flos": 745451513856.0, + "grad_norm": 0.07610574883038115, + "language_loss": 0.89730537, + "learning_rate": 0.000976876687931362, + "loss": 0.90846807, + "num_input_tokens_seen": 53827648, + "router_z_loss_mlp": 0.35229492, + "step": 646, + "time_per_iteration": 2.999408721923828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131315, + "balance_loss_mlp": 1.09622002, + "epoch": 0.1244709503655252, + "flos": 533460556800.0, + "grad_norm": 0.19449531308307466, + "language_loss": 0.85410094, + "learning_rate": 0.0009767829487868005, + "loss": 0.86541414, + "num_input_tokens_seen": 53896400, + "router_z_loss_mlp": 0.35107422, + "step": 647, + "time_per_iteration": 2.617666721343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138117, + "balance_loss_mlp": 1.10159075, + "epoch": 0.12466333205078876, + "flos": 507847034880.0, + "grad_norm": 0.07509451505155453, + "language_loss": 0.89358151, + "learning_rate": 0.000976689024538682, + "loss": 0.90496266, + "num_input_tokens_seen": 53965904, + "router_z_loss_mlp": 0.36499023, + "step": 648, + "time_per_iteration": 2.5929009914398193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138062, + "balance_loss_mlp": 1.10110736, + "epoch": 0.12485571373605232, + "flos": 681023420928.0, + "grad_norm": 0.07057439208121223, + "language_loss": 0.87662494, + "learning_rate": 0.0009765949152234716, + "loss": 0.8880055, + "num_input_tokens_seen": 54049792, + "router_z_loss_mlp": 0.36962891, + "step": 649, + "time_per_iteration": 2.874701976776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147728, + "balance_loss_mlp": 1.13504386, + "epoch": 0.1250480954213159, + "flos": 1329402668544.0, + "grad_norm": 0.04527818124304351, + "language_loss": 0.78686082, + "learning_rate": 0.0009765006208777055, + "loss": 0.79833812, + "num_input_tokens_seen": 54262432, + "router_z_loss_mlp": 0.12695312, + "step": 650, + "time_per_iteration": 4.680933713912964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138039, + "balance_loss_mlp": 1.10287213, + "epoch": 0.12524047710657946, + "flos": 938140683264.0, + "grad_norm": 0.08375968037938068, + "language_loss": 0.82443976, + "learning_rate": 0.0009764061415379919, + "loss": 0.83582014, + "num_input_tokens_seen": 54351568, + "router_z_loss_mlp": 0.35205078, + "step": 651, + "time_per_iteration": 3.2550604343414307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135369, + "balance_loss_mlp": 1.09774697, + "epoch": 0.12543285879184302, + "flos": 513642235392.0, + "grad_norm": 0.07146085627000143, + "language_loss": 0.89363486, + "learning_rate": 0.0009763114772410109, + "loss": 0.90498853, + "num_input_tokens_seen": 54418944, + "router_z_loss_mlp": 0.3762207, + "step": 652, + "time_per_iteration": 2.5937142372131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139745, + "balance_loss_mlp": 1.10419679, + "epoch": 0.12562524047710658, + "flos": 717991617024.0, + "grad_norm": 0.07913079577836896, + "language_loss": 0.87230957, + "learning_rate": 0.0009762166280235146, + "loss": 0.88370705, + "num_input_tokens_seen": 54495312, + "router_z_loss_mlp": 0.35571289, + "step": 653, + "time_per_iteration": 2.96162748336792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147653, + "balance_loss_mlp": 1.10974443, + "epoch": 0.12581762216237014, + "flos": 563441923584.0, + "grad_norm": 0.06492259826928527, + "language_loss": 0.87890899, + "learning_rate": 0.0009761215939223267, + "loss": 0.89038551, + "num_input_tokens_seen": 54566832, + "router_z_loss_mlp": 0.37890625, + "step": 654, + "time_per_iteration": 2.714641809463501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145182, + "balance_loss_mlp": 1.1077261, + "epoch": 0.1260100038476337, + "flos": 481642785792.0, + "grad_norm": 0.07920721431290144, + "language_loss": 0.86875665, + "learning_rate": 0.0009760263749743428, + "loss": 0.88020849, + "num_input_tokens_seen": 54632128, + "router_z_loss_mlp": 0.37426758, + "step": 655, + "time_per_iteration": 2.547499179840088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145343, + "balance_loss_mlp": 1.11074805, + "epoch": 0.12620238553289725, + "flos": 575269461504.0, + "grad_norm": 0.06357383816141966, + "language_loss": 0.90176344, + "learning_rate": 0.0009759309712165299, + "loss": 0.91321695, + "num_input_tokens_seen": 54707600, + "router_z_loss_mlp": 0.34570312, + "step": 656, + "time_per_iteration": 2.693922996520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137509, + "balance_loss_mlp": 1.103248, + "epoch": 0.12639476721816084, + "flos": 530909973504.0, + "grad_norm": 0.07169490366111804, + "language_loss": 0.93258119, + "learning_rate": 0.0009758353826859272, + "loss": 0.94395626, + "num_input_tokens_seen": 54776704, + "router_z_loss_mlp": 0.34277344, + "step": 657, + "time_per_iteration": 2.5744612216949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139269, + "balance_loss_mlp": 1.10314822, + "epoch": 0.1265871489034244, + "flos": 689654393856.0, + "grad_norm": 0.06860158128637554, + "language_loss": 0.89679217, + "learning_rate": 0.0009757396094196456, + "loss": 0.90818477, + "num_input_tokens_seen": 54851744, + "router_z_loss_mlp": 0.36132812, + "step": 658, + "time_per_iteration": 2.851700782775879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143308, + "balance_loss_mlp": 1.10675859, + "epoch": 0.12677953058868796, + "flos": 536863735296.0, + "grad_norm": 0.0696485175834739, + "language_loss": 0.84555894, + "learning_rate": 0.0009756436514548673, + "loss": 0.85699201, + "num_input_tokens_seen": 54932576, + "router_z_loss_mlp": 0.36523438, + "step": 659, + "time_per_iteration": 2.7971351146698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122244, + "balance_loss_mlp": 1.08800757, + "epoch": 0.12697191227395152, + "flos": 518749194240.0, + "grad_norm": 0.05327633329409036, + "language_loss": 0.88343394, + "learning_rate": 0.0009755475088288466, + "loss": 0.89465636, + "num_input_tokens_seen": 55007296, + "router_z_loss_mlp": 0.34228516, + "step": 660, + "time_per_iteration": 2.670555353164673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103127, + "balance_loss_mlp": 1.06903291, + "epoch": 0.12716429395921508, + "flos": 566341714944.0, + "grad_norm": 0.06801254087798507, + "language_loss": 0.90210187, + "learning_rate": 0.0009754511815789095, + "loss": 0.91313314, + "num_input_tokens_seen": 55079312, + "router_z_loss_mlp": 0.34106445, + "step": 661, + "time_per_iteration": 2.748224973678589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102552, + "balance_loss_mlp": 1.06798172, + "epoch": 0.12735667564447864, + "flos": 513844466688.0, + "grad_norm": 0.06975204014846512, + "language_loss": 0.86245489, + "learning_rate": 0.0009753546697424533, + "loss": 0.87348044, + "num_input_tokens_seen": 55151824, + "router_z_loss_mlp": 0.34594727, + "step": 662, + "time_per_iteration": 2.664799213409424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092108, + "balance_loss_mlp": 1.05863369, + "epoch": 0.1275490573297422, + "flos": 541023556608.0, + "grad_norm": 0.05485824904298714, + "language_loss": 0.90572149, + "learning_rate": 0.0009752579733569475, + "loss": 0.91664255, + "num_input_tokens_seen": 55224368, + "router_z_loss_mlp": 0.3347168, + "step": 663, + "time_per_iteration": 2.679975748062134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267369, + "balance_loss_mlp": 1.2515384, + "epoch": 0.12774143901500576, + "flos": 1557872787456.0, + "grad_norm": 0.0685532780556388, + "language_loss": 0.74881387, + "learning_rate": 0.0009751610924599328, + "loss": 0.7614876, + "num_input_tokens_seen": 55453584, + "router_z_loss_mlp": 0.15820312, + "step": 664, + "time_per_iteration": 4.938101053237915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096151, + "balance_loss_mlp": 1.06177139, + "epoch": 0.12793382070026935, + "flos": 613462781952.0, + "grad_norm": 0.06920677464457729, + "language_loss": 0.90523887, + "learning_rate": 0.0009750640270890217, + "loss": 0.9162004, + "num_input_tokens_seen": 55528000, + "router_z_loss_mlp": 0.34375, + "step": 665, + "time_per_iteration": 2.6939845085144043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099083, + "balance_loss_mlp": 1.06563258, + "epoch": 0.1281262023855329, + "flos": 707386231296.0, + "grad_norm": 0.06773970450457005, + "language_loss": 0.96531481, + "learning_rate": 0.0009749667772818983, + "loss": 0.9763056, + "num_input_tokens_seen": 55612416, + "router_z_loss_mlp": 0.33447266, + "step": 666, + "time_per_iteration": 2.967853307723999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164762, + "balance_loss_mlp": 1.15131497, + "epoch": 0.12831858407079647, + "flos": 1424250086400.0, + "grad_norm": 0.045177828452490555, + "language_loss": 0.76935941, + "learning_rate": 0.0009748693430763185, + "loss": 0.78100705, + "num_input_tokens_seen": 55843664, + "router_z_loss_mlp": 0.13476562, + "step": 667, + "time_per_iteration": 4.85069465637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093582, + "balance_loss_mlp": 1.05958366, + "epoch": 0.12851096575606002, + "flos": 448869316608.0, + "grad_norm": 0.07778909975942494, + "language_loss": 0.95426726, + "learning_rate": 0.0009747717245101093, + "loss": 0.96520311, + "num_input_tokens_seen": 55909072, + "router_z_loss_mlp": 0.34008789, + "step": 668, + "time_per_iteration": 2.5234692096710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098998, + "balance_loss_mlp": 1.06519032, + "epoch": 0.12870334744132358, + "flos": 479697486336.0, + "grad_norm": 0.05465485885236262, + "language_loss": 0.84969366, + "learning_rate": 0.00097467392162117, + "loss": 0.86068368, + "num_input_tokens_seen": 55978544, + "router_z_loss_mlp": 0.33789062, + "step": 669, + "time_per_iteration": 2.601684808731079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096385, + "balance_loss_mlp": 1.06341171, + "epoch": 0.12889572912658714, + "flos": 638633963520.0, + "grad_norm": 0.05954757179165737, + "language_loss": 0.91292465, + "learning_rate": 0.0009745759344474708, + "loss": 0.92388856, + "num_input_tokens_seen": 56054144, + "router_z_loss_mlp": 0.32983398, + "step": 670, + "time_per_iteration": 2.8225347995758057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098806, + "balance_loss_mlp": 1.06411648, + "epoch": 0.1290881108118507, + "flos": 509693409792.0, + "grad_norm": 0.06976130099981656, + "language_loss": 0.89229816, + "learning_rate": 0.0009744777630270536, + "loss": 0.90328622, + "num_input_tokens_seen": 56120960, + "router_z_loss_mlp": 0.34692383, + "step": 671, + "time_per_iteration": 2.633571147918701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109875, + "balance_loss_mlp": 1.07435024, + "epoch": 0.12928049249711426, + "flos": 670746894336.0, + "grad_norm": 0.08011077975608555, + "language_loss": 0.93749923, + "learning_rate": 0.000974379407398032, + "loss": 0.94859791, + "num_input_tokens_seen": 56202560, + "router_z_loss_mlp": 0.35546875, + "step": 672, + "time_per_iteration": 2.875609874725342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093721, + "balance_loss_mlp": 1.06065273, + "epoch": 0.12947287418237785, + "flos": 793158925824.0, + "grad_norm": 0.05850057523774312, + "language_loss": 0.82016242, + "learning_rate": 0.0009742808675985913, + "loss": 0.83109969, + "num_input_tokens_seen": 56289456, + "router_z_loss_mlp": 0.33056641, + "step": 673, + "time_per_iteration": 3.087738275527954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101029, + "balance_loss_mlp": 1.0646224, + "epoch": 0.1296652558676414, + "flos": 485222054400.0, + "grad_norm": 0.08954381825883409, + "language_loss": 0.9153564, + "learning_rate": 0.0009741821436669876, + "loss": 0.92636657, + "num_input_tokens_seen": 56354480, + "router_z_loss_mlp": 0.36450195, + "step": 674, + "time_per_iteration": 2.539849281311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101326, + "balance_loss_mlp": 1.06673169, + "epoch": 0.12985763755290497, + "flos": 453226987008.0, + "grad_norm": 0.0648114016490977, + "language_loss": 0.9288274, + "learning_rate": 0.0009740832356415492, + "loss": 0.93984067, + "num_input_tokens_seen": 56418944, + "router_z_loss_mlp": 0.34619141, + "step": 675, + "time_per_iteration": 2.467801094055176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097673, + "balance_loss_mlp": 1.06315041, + "epoch": 0.13005001923816853, + "flos": 824719007232.0, + "grad_norm": 0.0735546441878898, + "language_loss": 0.8857609, + "learning_rate": 0.0009739841435606756, + "loss": 0.89673769, + "num_input_tokens_seen": 56492368, + "router_z_loss_mlp": 0.34545898, + "step": 676, + "time_per_iteration": 3.008781909942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109741, + "balance_loss_mlp": 1.06457949, + "epoch": 0.1302424009234321, + "flos": 531107822592.0, + "grad_norm": 0.07312926894822828, + "language_loss": 0.90675485, + "learning_rate": 0.0009738848674628377, + "loss": 0.9177289, + "num_input_tokens_seen": 56568128, + "router_z_loss_mlp": 0.328125, + "step": 677, + "time_per_iteration": 2.695338010787964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104607, + "balance_loss_mlp": 1.06955981, + "epoch": 0.13043478260869565, + "flos": 525626924544.0, + "grad_norm": 0.06033597827839572, + "language_loss": 0.89643902, + "learning_rate": 0.000973785407386578, + "loss": 0.90748513, + "num_input_tokens_seen": 56646448, + "router_z_loss_mlp": 0.35058594, + "step": 678, + "time_per_iteration": 2.7727599143981934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101976, + "balance_loss_mlp": 1.06714272, + "epoch": 0.1306271642939592, + "flos": 625862108160.0, + "grad_norm": 0.05570081952525763, + "language_loss": 0.87361526, + "learning_rate": 0.0009736857633705103, + "loss": 0.88463503, + "num_input_tokens_seen": 56732080, + "router_z_loss_mlp": 0.34814453, + "step": 679, + "time_per_iteration": 2.843129873275757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110176, + "balance_loss_mlp": 1.06630766, + "epoch": 0.13081954597922277, + "flos": 550438723584.0, + "grad_norm": 0.06405817655948583, + "language_loss": 0.93204647, + "learning_rate": 0.0009735859354533196, + "loss": 0.94306409, + "num_input_tokens_seen": 56804432, + "router_z_loss_mlp": 0.35473633, + "step": 680, + "time_per_iteration": 2.7122464179992676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093118, + "balance_loss_mlp": 1.05914354, + "epoch": 0.13101192766448633, + "flos": 536651329536.0, + "grad_norm": 0.06779912020183775, + "language_loss": 0.91948998, + "learning_rate": 0.0009734859236737628, + "loss": 0.93042123, + "num_input_tokens_seen": 56872512, + "router_z_loss_mlp": 0.33984375, + "step": 681, + "time_per_iteration": 2.594881296157837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093326, + "balance_loss_mlp": 1.0593034, + "epoch": 0.13120430934974991, + "flos": 503258019840.0, + "grad_norm": 0.06413082246497326, + "language_loss": 0.93904501, + "learning_rate": 0.0009733857280706678, + "loss": 0.94997829, + "num_input_tokens_seen": 56940928, + "router_z_loss_mlp": 0.34033203, + "step": 682, + "time_per_iteration": 2.5831425189971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010929, + "balance_loss_mlp": 1.05992687, + "epoch": 0.13139669103501347, + "flos": 614014221312.0, + "grad_norm": 0.06246118190021366, + "language_loss": 0.85051745, + "learning_rate": 0.000973285348682934, + "loss": 0.86144638, + "num_input_tokens_seen": 57012736, + "router_z_loss_mlp": 0.33007812, + "step": 683, + "time_per_iteration": 2.7236225605010986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226892, + "balance_loss_mlp": 1.21096563, + "epoch": 0.13158907272027703, + "flos": 1484163357696.0, + "grad_norm": 0.08359566880013784, + "language_loss": 0.77898371, + "learning_rate": 0.0009731847855495323, + "loss": 0.79125261, + "num_input_tokens_seen": 57243136, + "router_z_loss_mlp": 0.15917969, + "step": 684, + "time_per_iteration": 4.87854790687561 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087932, + "balance_loss_mlp": 1.05488706, + "epoch": 0.1317814544055406, + "flos": 985049344512.0, + "grad_norm": 0.07039095593234826, + "language_loss": 0.85449159, + "learning_rate": 0.0009730840387095046, + "loss": 0.86537099, + "num_input_tokens_seen": 57336160, + "router_z_loss_mlp": 0.33056641, + "step": 685, + "time_per_iteration": 3.30759596824646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096943, + "balance_loss_mlp": 1.06156158, + "epoch": 0.13197383609080415, + "flos": 611163892224.0, + "grad_norm": 0.05759402546544749, + "language_loss": 0.912597, + "learning_rate": 0.0009729831082019642, + "loss": 0.92356646, + "num_input_tokens_seen": 57418976, + "router_z_loss_mlp": 0.35351562, + "step": 686, + "time_per_iteration": 2.7965087890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093388, + "balance_loss_mlp": 1.0589608, + "epoch": 0.1321662177760677, + "flos": 494116305408.0, + "grad_norm": 0.058033147986452156, + "language_loss": 0.89668858, + "learning_rate": 0.0009728819940660958, + "loss": 0.90762246, + "num_input_tokens_seen": 57490288, + "router_z_loss_mlp": 0.34399414, + "step": 687, + "time_per_iteration": 2.7347469329833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110102, + "balance_loss_mlp": 1.0653528, + "epoch": 0.13235859946133127, + "flos": 495591713280.0, + "grad_norm": 0.07548862234195632, + "language_loss": 0.86088693, + "learning_rate": 0.0009727806963411557, + "loss": 0.87189722, + "num_input_tokens_seen": 57556064, + "router_z_loss_mlp": 0.35668945, + "step": 688, + "time_per_iteration": 2.621638774871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098222, + "balance_loss_mlp": 1.06279302, + "epoch": 0.13255098114659483, + "flos": 511417539072.0, + "grad_norm": 0.08656773393569435, + "language_loss": 0.88000298, + "learning_rate": 0.000972679215066471, + "loss": 0.89098513, + "num_input_tokens_seen": 57627248, + "router_z_loss_mlp": 0.35449219, + "step": 689, + "time_per_iteration": 2.6806418895721436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103388, + "balance_loss_mlp": 1.06900764, + "epoch": 0.13274336283185842, + "flos": 547114120704.0, + "grad_norm": 0.07064056682134613, + "language_loss": 0.99675226, + "learning_rate": 0.0009725775502814401, + "loss": 1.00778604, + "num_input_tokens_seen": 57694832, + "router_z_loss_mlp": 0.34350586, + "step": 690, + "time_per_iteration": 2.607179641723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121046, + "balance_loss_mlp": 1.08397222, + "epoch": 0.13293574451712198, + "flos": 640465781760.0, + "grad_norm": 0.08777481913975324, + "language_loss": 0.85673726, + "learning_rate": 0.0009724757020255327, + "loss": 0.86794776, + "num_input_tokens_seen": 57771776, + "router_z_loss_mlp": 0.37084961, + "step": 691, + "time_per_iteration": 2.81113338470459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111244, + "balance_loss_mlp": 1.07726967, + "epoch": 0.13312812620238554, + "flos": 491234042880.0, + "grad_norm": 0.09165524457583717, + "language_loss": 0.87811983, + "learning_rate": 0.0009723736703382902, + "loss": 0.88923222, + "num_input_tokens_seen": 57836272, + "router_z_loss_mlp": 0.33984375, + "step": 692, + "time_per_iteration": 2.548689603805542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110186, + "balance_loss_mlp": 1.0692203, + "epoch": 0.1333205078876491, + "flos": 508693837824.0, + "grad_norm": 0.061462060991887495, + "language_loss": 0.83746743, + "learning_rate": 0.0009722714552593244, + "loss": 0.84848601, + "num_input_tokens_seen": 57907232, + "router_z_loss_mlp": 0.32641602, + "step": 693, + "time_per_iteration": 2.6584513187408447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099112, + "balance_loss_mlp": 1.06358743, + "epoch": 0.13351288957291266, + "flos": 418474722816.0, + "grad_norm": 0.07144638741394425, + "language_loss": 0.94810003, + "learning_rate": 0.000972169056828319, + "loss": 0.95909119, + "num_input_tokens_seen": 57969808, + "router_z_loss_mlp": 0.35522461, + "step": 694, + "time_per_iteration": 2.461437702178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100772, + "balance_loss_mlp": 1.06751275, + "epoch": 0.13370527125817622, + "flos": 615614694912.0, + "grad_norm": 0.05672506947017021, + "language_loss": 0.87834966, + "learning_rate": 0.0009720664750850283, + "loss": 0.88935745, + "num_input_tokens_seen": 58042944, + "router_z_loss_mlp": 0.33251953, + "step": 695, + "time_per_iteration": 2.7716193199157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103519, + "balance_loss_mlp": 1.07085609, + "epoch": 0.13389765294343978, + "flos": 625757391360.0, + "grad_norm": 0.07304651625724701, + "language_loss": 0.93482703, + "learning_rate": 0.0009719637100692784, + "loss": 0.94586229, + "num_input_tokens_seen": 58116080, + "router_z_loss_mlp": 0.32666016, + "step": 696, + "time_per_iteration": 2.7741310596466064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111476, + "balance_loss_mlp": 1.08090401, + "epoch": 0.13409003462870334, + "flos": 609391710720.0, + "grad_norm": 0.06235589965882817, + "language_loss": 0.83759153, + "learning_rate": 0.0009718607618209661, + "loss": 0.84873915, + "num_input_tokens_seen": 58197616, + "router_z_loss_mlp": 0.33862305, + "step": 697, + "time_per_iteration": 2.869180202484131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128671, + "balance_loss_mlp": 1.09488726, + "epoch": 0.13428241631396692, + "flos": 683499810816.0, + "grad_norm": 0.0709058406100417, + "language_loss": 0.88053036, + "learning_rate": 0.0009717576303800595, + "loss": 0.89181709, + "num_input_tokens_seen": 58280480, + "router_z_loss_mlp": 0.33789062, + "step": 698, + "time_per_iteration": 3.007253408432007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122193, + "balance_loss_mlp": 1.08716917, + "epoch": 0.13447479799923048, + "flos": 508565799936.0, + "grad_norm": 0.07060238478807088, + "language_loss": 0.86057615, + "learning_rate": 0.0009716543157865975, + "loss": 0.87179804, + "num_input_tokens_seen": 58352464, + "router_z_loss_mlp": 0.35083008, + "step": 699, + "time_per_iteration": 2.6622114181518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112811, + "balance_loss_mlp": 1.07812154, + "epoch": 0.13466717968449404, + "flos": 897124737024.0, + "grad_norm": 0.06896685381510245, + "language_loss": 0.84149206, + "learning_rate": 0.0009715508180806907, + "loss": 0.85262012, + "num_input_tokens_seen": 58437216, + "router_z_loss_mlp": 0.34716797, + "step": 700, + "time_per_iteration": 3.175494909286499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106112, + "balance_loss_mlp": 1.07054055, + "epoch": 0.1348595613697576, + "flos": 989501557248.0, + "grad_norm": 0.07388845252403331, + "language_loss": 0.90260321, + "learning_rate": 0.0009714471373025202, + "loss": 0.91366434, + "num_input_tokens_seen": 58533152, + "router_z_loss_mlp": 0.35546875, + "step": 701, + "time_per_iteration": 3.3912835121154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090254, + "balance_loss_mlp": 1.05499172, + "epoch": 0.13505194305502116, + "flos": 487580580864.0, + "grad_norm": 0.07959074518459132, + "language_loss": 0.89355272, + "learning_rate": 0.0009713432734923386, + "loss": 0.9044553, + "num_input_tokens_seen": 58601376, + "router_z_loss_mlp": 0.35253906, + "step": 702, + "time_per_iteration": 2.6718733310699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090728, + "balance_loss_mlp": 1.05572796, + "epoch": 0.13524432474028472, + "flos": 613103399424.0, + "grad_norm": 0.06387437846302528, + "language_loss": 0.875036, + "learning_rate": 0.0009712392266904696, + "loss": 0.88594317, + "num_input_tokens_seen": 58676608, + "router_z_loss_mlp": 0.34985352, + "step": 703, + "time_per_iteration": 2.6985831260681152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010863, + "balance_loss_mlp": 1.0524683, + "epoch": 0.13543670642554828, + "flos": 904425868800.0, + "grad_norm": 0.06666466963859687, + "language_loss": 0.86250496, + "learning_rate": 0.0009711349969373076, + "loss": 0.87336791, + "num_input_tokens_seen": 58759264, + "router_z_loss_mlp": 0.33862305, + "step": 704, + "time_per_iteration": 3.1328465938568115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095762, + "balance_loss_mlp": 1.0610956, + "epoch": 0.13562908811081184, + "flos": 550335416832.0, + "grad_norm": 0.0628446006314887, + "language_loss": 0.80944061, + "learning_rate": 0.0009710305842733178, + "loss": 0.82039821, + "num_input_tokens_seen": 58834800, + "router_z_loss_mlp": 0.34667969, + "step": 705, + "time_per_iteration": 2.7668187618255615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093976, + "balance_loss_mlp": 1.06147909, + "epoch": 0.1358214697960754, + "flos": 507797572608.0, + "grad_norm": 0.06635154625105166, + "language_loss": 0.90133065, + "learning_rate": 0.0009709259887390373, + "loss": 0.91227043, + "num_input_tokens_seen": 58901712, + "router_z_loss_mlp": 0.32519531, + "step": 706, + "time_per_iteration": 2.656233072280884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096924, + "balance_loss_mlp": 1.06390333, + "epoch": 0.136013851481339, + "flos": 528640197120.0, + "grad_norm": 0.09290535615143355, + "language_loss": 0.91425377, + "learning_rate": 0.0009708212103750737, + "loss": 0.92522299, + "num_input_tokens_seen": 58967824, + "router_z_loss_mlp": 0.33007812, + "step": 707, + "time_per_iteration": 2.569655656814575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101147, + "balance_loss_mlp": 1.06812644, + "epoch": 0.13620623316660255, + "flos": 658772379648.0, + "grad_norm": 0.06731423560591156, + "language_loss": 0.87756282, + "learning_rate": 0.0009707162492221051, + "loss": 0.88857424, + "num_input_tokens_seen": 59045040, + "router_z_loss_mlp": 0.33007812, + "step": 708, + "time_per_iteration": 2.880669593811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103707, + "balance_loss_mlp": 1.07009029, + "epoch": 0.1363986148518661, + "flos": 671583522816.0, + "grad_norm": 0.07312175328849302, + "language_loss": 0.88322687, + "learning_rate": 0.0009706111053208815, + "loss": 0.89426386, + "num_input_tokens_seen": 59117216, + "router_z_loss_mlp": 0.33642578, + "step": 709, + "time_per_iteration": 2.7878787517547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097257, + "balance_loss_mlp": 1.06342554, + "epoch": 0.13659099653712967, + "flos": 472828520448.0, + "grad_norm": 0.06741688104713542, + "language_loss": 0.86067665, + "learning_rate": 0.0009705057787122232, + "loss": 0.87164921, + "num_input_tokens_seen": 59183056, + "router_z_loss_mlp": 0.33862305, + "step": 710, + "time_per_iteration": 2.528298854827881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105446, + "balance_loss_mlp": 1.07190061, + "epoch": 0.13678337822239323, + "flos": 452483490816.0, + "grad_norm": 0.05706590332145298, + "language_loss": 0.91653168, + "learning_rate": 0.0009704002694370216, + "loss": 0.92758614, + "num_input_tokens_seen": 59247312, + "router_z_loss_mlp": 0.33569336, + "step": 711, + "time_per_iteration": 2.5201761722564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114394, + "balance_loss_mlp": 1.0794661, + "epoch": 0.13697575990765679, + "flos": 519373416960.0, + "grad_norm": 0.06387130477766731, + "language_loss": 0.86892813, + "learning_rate": 0.0009702945775362388, + "loss": 0.88007212, + "num_input_tokens_seen": 59317968, + "router_z_loss_mlp": 0.34960938, + "step": 712, + "time_per_iteration": 2.661848783493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130562, + "balance_loss_mlp": 1.0947994, + "epoch": 0.13716814159292035, + "flos": 480145618944.0, + "grad_norm": 0.06038249383316015, + "language_loss": 0.87339497, + "learning_rate": 0.0009701887030509086, + "loss": 0.8847006, + "num_input_tokens_seen": 59387936, + "router_z_loss_mlp": 0.35766602, + "step": 713, + "time_per_iteration": 2.6068434715270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125148, + "balance_loss_mlp": 1.0908401, + "epoch": 0.1373605232781839, + "flos": 545376844800.0, + "grad_norm": 0.06924339631343991, + "language_loss": 0.92127877, + "learning_rate": 0.0009700826460221346, + "loss": 0.93253028, + "num_input_tokens_seen": 59460624, + "router_z_loss_mlp": 0.34301758, + "step": 714, + "time_per_iteration": 2.653224468231201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145818, + "balance_loss_mlp": 1.11050797, + "epoch": 0.1375529049634475, + "flos": 708473143296.0, + "grad_norm": 0.0682346884445605, + "language_loss": 0.93435562, + "learning_rate": 0.0009699764064910921, + "loss": 0.94581378, + "num_input_tokens_seen": 59536752, + "router_z_loss_mlp": 0.35302734, + "step": 715, + "time_per_iteration": 2.878445625305176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130305, + "balance_loss_mlp": 1.09542441, + "epoch": 0.13774528664871105, + "flos": 486452971008.0, + "grad_norm": 0.07091873756636237, + "language_loss": 0.87931371, + "learning_rate": 0.0009698699844990268, + "loss": 0.89061677, + "num_input_tokens_seen": 59608128, + "router_z_loss_mlp": 0.34863281, + "step": 716, + "time_per_iteration": 2.6278092861175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124133, + "balance_loss_mlp": 1.09070659, + "epoch": 0.1379376683339746, + "flos": 679885636608.0, + "grad_norm": 0.0686032560828043, + "language_loss": 0.88731855, + "learning_rate": 0.0009697633800872555, + "loss": 0.89855987, + "num_input_tokens_seen": 59685120, + "router_z_loss_mlp": 0.33422852, + "step": 717, + "time_per_iteration": 2.888576030731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112997, + "balance_loss_mlp": 1.07825947, + "epoch": 0.13813005001923817, + "flos": 610628419584.0, + "grad_norm": 0.07907714555147631, + "language_loss": 0.9128629, + "learning_rate": 0.0009696565932971655, + "loss": 0.92399287, + "num_input_tokens_seen": 59763376, + "router_z_loss_mlp": 0.34741211, + "step": 718, + "time_per_iteration": 2.8937225341796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110249, + "balance_loss_mlp": 1.06837237, + "epoch": 0.13832243170450173, + "flos": 588431222784.0, + "grad_norm": 0.05947825646897862, + "language_loss": 0.9001984, + "learning_rate": 0.0009695496241702153, + "loss": 0.91122329, + "num_input_tokens_seen": 59836800, + "router_z_loss_mlp": 0.34155273, + "step": 719, + "time_per_iteration": 2.791111469268799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094313, + "balance_loss_mlp": 1.06093454, + "epoch": 0.1385148133897653, + "flos": 699674844672.0, + "grad_norm": 0.07440757355955382, + "language_loss": 0.86308432, + "learning_rate": 0.0009694424727479339, + "loss": 0.87402749, + "num_input_tokens_seen": 59914720, + "router_z_loss_mlp": 0.33398438, + "step": 720, + "time_per_iteration": 2.8781325817108154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088445, + "balance_loss_mlp": 1.05475688, + "epoch": 0.13870719507502885, + "flos": 597977399808.0, + "grad_norm": 0.059872525751604476, + "language_loss": 0.90073895, + "learning_rate": 0.0009693351390719213, + "loss": 0.91162348, + "num_input_tokens_seen": 59984544, + "router_z_loss_mlp": 0.3371582, + "step": 721, + "time_per_iteration": 2.691493272781372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095999, + "balance_loss_mlp": 1.06242967, + "epoch": 0.1388995767602924, + "flos": 586279309824.0, + "grad_norm": 0.07792099406652078, + "language_loss": 0.91640067, + "learning_rate": 0.000969227623183848, + "loss": 0.92736065, + "num_input_tokens_seen": 60057056, + "router_z_loss_mlp": 0.33569336, + "step": 722, + "time_per_iteration": 2.768209218978882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086676, + "balance_loss_mlp": 1.05475235, + "epoch": 0.139091958445556, + "flos": 650810709504.0, + "grad_norm": 0.07717859695455091, + "language_loss": 0.91485119, + "learning_rate": 0.0009691199251254554, + "loss": 0.92571795, + "num_input_tokens_seen": 60133232, + "router_z_loss_mlp": 0.3190918, + "step": 723, + "time_per_iteration": 2.813594102859497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093708, + "balance_loss_mlp": 1.06159282, + "epoch": 0.13928434013081956, + "flos": 575446961664.0, + "grad_norm": 0.06414169604653322, + "language_loss": 0.8718468, + "learning_rate": 0.0009690120449385555, + "loss": 0.88278389, + "num_input_tokens_seen": 60207104, + "router_z_loss_mlp": 0.32104492, + "step": 724, + "time_per_iteration": 2.732372999191284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110008, + "balance_loss_mlp": 1.06574821, + "epoch": 0.13947672181608312, + "flos": 562954503168.0, + "grad_norm": 0.07538454681544235, + "language_loss": 0.93399024, + "learning_rate": 0.0009689039826650312, + "loss": 0.94499099, + "num_input_tokens_seen": 60277920, + "router_z_loss_mlp": 0.34375, + "step": 725, + "time_per_iteration": 2.769481658935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111743, + "balance_loss_mlp": 1.09967864, + "epoch": 0.13966910350134668, + "flos": 1520699387904.0, + "grad_norm": 0.042030956775344956, + "language_loss": 0.76523066, + "learning_rate": 0.000968795738346836, + "loss": 0.77634799, + "num_input_tokens_seen": 60494224, + "router_z_loss_mlp": 0.12060547, + "step": 726, + "time_per_iteration": 4.903716802597046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101674, + "balance_loss_mlp": 1.06619751, + "epoch": 0.13986148518661023, + "flos": 499604557824.0, + "grad_norm": 0.07361028590256702, + "language_loss": 0.88265646, + "learning_rate": 0.0009686873120259941, + "loss": 0.89367324, + "num_input_tokens_seen": 60562176, + "router_z_loss_mlp": 0.35522461, + "step": 727, + "time_per_iteration": 2.639673948287964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099007, + "balance_loss_mlp": 1.06612897, + "epoch": 0.1400538668718738, + "flos": 598381862400.0, + "grad_norm": 0.053177263225715844, + "language_loss": 0.87612498, + "learning_rate": 0.0009685787037446004, + "loss": 0.88711506, + "num_input_tokens_seen": 60631472, + "router_z_loss_mlp": 0.32885742, + "step": 728, + "time_per_iteration": 2.7457332611083984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095755, + "balance_loss_mlp": 1.06135106, + "epoch": 0.14024624855713735, + "flos": 593757941760.0, + "grad_norm": 0.0730266030670127, + "language_loss": 0.88032103, + "learning_rate": 0.0009684699135448201, + "loss": 0.89127851, + "num_input_tokens_seen": 60703488, + "router_z_loss_mlp": 0.34423828, + "step": 729, + "time_per_iteration": 2.6995558738708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091636, + "balance_loss_mlp": 1.05940139, + "epoch": 0.1404386302424009, + "flos": 506335311360.0, + "grad_norm": 0.06378774069808751, + "language_loss": 0.93033969, + "learning_rate": 0.0009683609414688895, + "loss": 0.94125605, + "num_input_tokens_seen": 60773936, + "router_z_loss_mlp": 0.32226562, + "step": 730, + "time_per_iteration": 2.6648926734924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097348, + "balance_loss_mlp": 1.06175184, + "epoch": 0.14063101192766447, + "flos": 573132105216.0, + "grad_norm": 0.05452232030629634, + "language_loss": 0.86945236, + "learning_rate": 0.0009682517875591154, + "loss": 0.88042581, + "num_input_tokens_seen": 60851120, + "router_z_loss_mlp": 0.35620117, + "step": 731, + "time_per_iteration": 2.7333967685699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099629, + "balance_loss_mlp": 1.06656027, + "epoch": 0.14082339361292806, + "flos": 564333806592.0, + "grad_norm": 0.06482276791137384, + "language_loss": 0.87207299, + "learning_rate": 0.0009681424518578749, + "loss": 0.88306928, + "num_input_tokens_seen": 60924896, + "router_z_loss_mlp": 0.33081055, + "step": 732, + "time_per_iteration": 2.706704616546631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100389, + "balance_loss_mlp": 1.06734443, + "epoch": 0.14101577529819162, + "flos": 463336187904.0, + "grad_norm": 0.05411989278901109, + "language_loss": 0.88122302, + "learning_rate": 0.000968032934407616, + "loss": 0.89222693, + "num_input_tokens_seen": 60996016, + "router_z_loss_mlp": 0.33056641, + "step": 733, + "time_per_iteration": 2.5904436111450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100997, + "balance_loss_mlp": 1.06766593, + "epoch": 0.14120815698345518, + "flos": 595791991296.0, + "grad_norm": 0.06321555834593343, + "language_loss": 0.82077157, + "learning_rate": 0.0009679232352508571, + "loss": 0.83178151, + "num_input_tokens_seen": 61072016, + "router_z_loss_mlp": 0.33349609, + "step": 734, + "time_per_iteration": 2.758493423461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102299, + "balance_loss_mlp": 1.06992185, + "epoch": 0.14140053866871874, + "flos": 534864591360.0, + "grad_norm": 0.05697576898708014, + "language_loss": 0.81442666, + "learning_rate": 0.0009678133544301871, + "loss": 0.82544965, + "num_input_tokens_seen": 61144528, + "router_z_loss_mlp": 0.32373047, + "step": 735, + "time_per_iteration": 2.6508195400238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092208, + "balance_loss_mlp": 1.06006956, + "epoch": 0.1415929203539823, + "flos": 520013606400.0, + "grad_norm": 0.0400187761209974, + "language_loss": 0.91843486, + "learning_rate": 0.0009677032919882658, + "loss": 0.92935699, + "num_input_tokens_seen": 61216960, + "router_z_loss_mlp": 0.32128906, + "step": 736, + "time_per_iteration": 2.705019474029541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100492, + "balance_loss_mlp": 1.06975937, + "epoch": 0.14178530203924586, + "flos": 482095300608.0, + "grad_norm": 0.07179339183341249, + "language_loss": 0.92199683, + "learning_rate": 0.000967593047967823, + "loss": 0.93300164, + "num_input_tokens_seen": 61281312, + "router_z_loss_mlp": 0.30712891, + "step": 737, + "time_per_iteration": 2.55415415763855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109678, + "balance_loss_mlp": 1.07577443, + "epoch": 0.14197768372450942, + "flos": 676339863552.0, + "grad_norm": 0.08640894081958116, + "language_loss": 0.87084705, + "learning_rate": 0.0009674826224116593, + "loss": 0.88194382, + "num_input_tokens_seen": 61355888, + "router_z_loss_mlp": 0.33911133, + "step": 738, + "time_per_iteration": 2.819878101348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097633, + "balance_loss_mlp": 1.06544614, + "epoch": 0.14217006540977298, + "flos": 445802199552.0, + "grad_norm": 0.06953952980021996, + "language_loss": 0.8713401, + "learning_rate": 0.0009673720153626455, + "loss": 0.88231641, + "num_input_tokens_seen": 61424288, + "router_z_loss_mlp": 0.32177734, + "step": 739, + "time_per_iteration": 2.5987422466278076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096281, + "balance_loss_mlp": 1.06385565, + "epoch": 0.14236244709503657, + "flos": 496261016064.0, + "grad_norm": 0.08400230511878481, + "language_loss": 0.87465405, + "learning_rate": 0.0009672612268637235, + "loss": 0.88561684, + "num_input_tokens_seen": 61493344, + "router_z_loss_mlp": 0.32421875, + "step": 740, + "time_per_iteration": 2.6148736476898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098472, + "balance_loss_mlp": 1.06669128, + "epoch": 0.14255482878030012, + "flos": 648022989312.0, + "grad_norm": 0.0806935070673247, + "language_loss": 0.846753, + "learning_rate": 0.0009671502569579048, + "loss": 0.85773772, + "num_input_tokens_seen": 61565216, + "router_z_loss_mlp": 0.31762695, + "step": 741, + "time_per_iteration": 2.7533769607543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089599, + "balance_loss_mlp": 1.05774641, + "epoch": 0.14274721046556368, + "flos": 535888894464.0, + "grad_norm": 0.06572551706098649, + "language_loss": 0.90748239, + "learning_rate": 0.0009670391056882719, + "loss": 0.91837835, + "num_input_tokens_seen": 61640928, + "router_z_loss_mlp": 0.31835938, + "step": 742, + "time_per_iteration": 2.698690176010132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088511, + "balance_loss_mlp": 1.0565629, + "epoch": 0.14293959215082724, + "flos": 956677215744.0, + "grad_norm": 0.07291469749344824, + "language_loss": 0.89417249, + "learning_rate": 0.0009669277730979776, + "loss": 0.90505755, + "num_input_tokens_seen": 61717552, + "router_z_loss_mlp": 0.31958008, + "step": 743, + "time_per_iteration": 3.1728732585906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108649, + "balance_loss_mlp": 1.05408931, + "epoch": 0.1431319738360908, + "flos": 692766590976.0, + "grad_norm": 0.06693583917292938, + "language_loss": 0.85588205, + "learning_rate": 0.0009668162592302449, + "loss": 0.86674696, + "num_input_tokens_seen": 61800016, + "router_z_loss_mlp": 0.32397461, + "step": 744, + "time_per_iteration": 2.896467685699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099426, + "balance_loss_mlp": 1.06673896, + "epoch": 0.14332435552135436, + "flos": 565174817280.0, + "grad_norm": 0.0717564206721674, + "language_loss": 0.86683381, + "learning_rate": 0.0009667045641283676, + "loss": 0.877828, + "num_input_tokens_seen": 61865904, + "router_z_loss_mlp": 0.3269043, + "step": 745, + "time_per_iteration": 2.6326427459716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095955, + "balance_loss_mlp": 1.06336319, + "epoch": 0.14351673720661792, + "flos": 738045665280.0, + "grad_norm": 0.07083856064802352, + "language_loss": 0.95545924, + "learning_rate": 0.0009665926878357092, + "loss": 0.96641874, + "num_input_tokens_seen": 61945728, + "router_z_loss_mlp": 0.32592773, + "step": 746, + "time_per_iteration": 2.902628183364868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108393, + "balance_loss_mlp": 1.07565856, + "epoch": 0.14370911889188148, + "flos": 548951731200.0, + "grad_norm": 0.08672542857876225, + "language_loss": 0.91510898, + "learning_rate": 0.0009664806303957043, + "loss": 0.92619288, + "num_input_tokens_seen": 62016288, + "router_z_loss_mlp": 0.32714844, + "step": 747, + "time_per_iteration": 2.678656578063965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107271, + "balance_loss_mlp": 1.07448816, + "epoch": 0.14390150057714507, + "flos": 589973469696.0, + "grad_norm": 0.06575006445724518, + "language_loss": 0.87633115, + "learning_rate": 0.0009663683918518571, + "loss": 0.88740385, + "num_input_tokens_seen": 62097904, + "router_z_loss_mlp": 0.32788086, + "step": 748, + "time_per_iteration": 2.894339084625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116744, + "balance_loss_mlp": 1.08226848, + "epoch": 0.14409388226240863, + "flos": 590773782528.0, + "grad_norm": 0.06412555003569581, + "language_loss": 0.86334193, + "learning_rate": 0.0009662559722477428, + "loss": 0.87450933, + "num_input_tokens_seen": 62166736, + "router_z_loss_mlp": 0.3449707, + "step": 749, + "time_per_iteration": 2.6673357486724854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116866, + "balance_loss_mlp": 1.15397346, + "epoch": 0.1442862639476722, + "flos": 1510418479104.0, + "grad_norm": 0.05654081816866197, + "language_loss": 0.7616297, + "learning_rate": 0.0009661433716270062, + "loss": 0.77331638, + "num_input_tokens_seen": 62402512, + "router_z_loss_mlp": 0.14648438, + "step": 750, + "time_per_iteration": 4.97744607925415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111461, + "balance_loss_mlp": 1.0782733, + "epoch": 0.14447864563293575, + "flos": 496493770752.0, + "grad_norm": 0.05840496998451829, + "language_loss": 0.89989787, + "learning_rate": 0.0009660305900333632, + "loss": 0.91101241, + "num_input_tokens_seen": 62473408, + "router_z_loss_mlp": 0.33203125, + "step": 751, + "time_per_iteration": 2.6919631958007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108129, + "balance_loss_mlp": 1.07513142, + "epoch": 0.1446710273181993, + "flos": 589400271360.0, + "grad_norm": 0.0663289310880325, + "language_loss": 0.83084202, + "learning_rate": 0.0009659176275105992, + "loss": 0.8419233, + "num_input_tokens_seen": 62547440, + "router_z_loss_mlp": 0.33007812, + "step": 752, + "time_per_iteration": 2.702003240585327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097403, + "balance_loss_mlp": 1.0634284, + "epoch": 0.14486340900346287, + "flos": 585521256960.0, + "grad_norm": 0.05748666507804042, + "language_loss": 0.86628646, + "learning_rate": 0.0009658044841025701, + "loss": 0.87726045, + "num_input_tokens_seen": 62620224, + "router_z_loss_mlp": 0.34008789, + "step": 753, + "time_per_iteration": 2.7666702270507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106626, + "balance_loss_mlp": 1.07114923, + "epoch": 0.14505579068872643, + "flos": 504405978624.0, + "grad_norm": 0.07320865998852653, + "language_loss": 0.81996346, + "learning_rate": 0.0009656911598532021, + "loss": 0.83102977, + "num_input_tokens_seen": 62690464, + "router_z_loss_mlp": 0.35498047, + "step": 754, + "time_per_iteration": 2.6273839473724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094053, + "balance_loss_mlp": 1.05936301, + "epoch": 0.14524817237399, + "flos": 486566452224.0, + "grad_norm": 0.05776902712696923, + "language_loss": 0.90229332, + "learning_rate": 0.0009655776548064917, + "loss": 0.91323388, + "num_input_tokens_seen": 62762240, + "router_z_loss_mlp": 0.34667969, + "step": 755, + "time_per_iteration": 2.6639461517333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092368, + "balance_loss_mlp": 1.05867922, + "epoch": 0.14544055405925355, + "flos": 727857888768.0, + "grad_norm": 0.059694446461720084, + "language_loss": 0.88762641, + "learning_rate": 0.0009654639690065054, + "loss": 0.89855003, + "num_input_tokens_seen": 62839760, + "router_z_loss_mlp": 0.33691406, + "step": 756, + "time_per_iteration": 2.881164789199829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092737, + "balance_loss_mlp": 1.05981112, + "epoch": 0.14563293574451713, + "flos": 593359271424.0, + "grad_norm": 0.0719411984245977, + "language_loss": 0.88362074, + "learning_rate": 0.00096535010249738, + "loss": 0.89454818, + "num_input_tokens_seen": 62910336, + "router_z_loss_mlp": 0.3293457, + "step": 757, + "time_per_iteration": 2.703355312347412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092325, + "balance_loss_mlp": 1.05925632, + "epoch": 0.1458253174297807, + "flos": 560192924160.0, + "grad_norm": 0.09095988428785044, + "language_loss": 0.8300786, + "learning_rate": 0.0009652360553233224, + "loss": 0.84100187, + "num_input_tokens_seen": 62988160, + "router_z_loss_mlp": 0.33081055, + "step": 758, + "time_per_iteration": 2.7321062088012695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062711, + "balance_loss_mlp": 1.04821551, + "epoch": 0.14601769911504425, + "flos": 1557025984512.0, + "grad_norm": 0.03493248396843453, + "language_loss": 0.73773748, + "learning_rate": 0.0009651218275286093, + "loss": 0.74836457, + "num_input_tokens_seen": 63224704, + "router_z_loss_mlp": 0.14453125, + "step": 759, + "time_per_iteration": 4.917184591293335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099151, + "balance_loss_mlp": 1.06605887, + "epoch": 0.1462100808003078, + "flos": 865922628096.0, + "grad_norm": 0.05465610046720203, + "language_loss": 0.8166393, + "learning_rate": 0.0009650074191575883, + "loss": 0.82763088, + "num_input_tokens_seen": 63312400, + "router_z_loss_mlp": 0.33105469, + "step": 760, + "time_per_iteration": 3.2009472846984863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097005, + "balance_loss_mlp": 1.06341171, + "epoch": 0.14640246248557137, + "flos": 522673288704.0, + "grad_norm": 0.07890258703475667, + "language_loss": 0.86329532, + "learning_rate": 0.0009648928302546766, + "loss": 0.87426543, + "num_input_tokens_seen": 63387792, + "router_z_loss_mlp": 0.3359375, + "step": 761, + "time_per_iteration": 2.6858482360839844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087335, + "balance_loss_mlp": 1.05340791, + "epoch": 0.14659484417083493, + "flos": 1030121805312.0, + "grad_norm": 0.05505233607608704, + "language_loss": 0.8584463, + "learning_rate": 0.0009647780608643613, + "loss": 0.86931968, + "num_input_tokens_seen": 63475632, + "router_z_loss_mlp": 0.33935547, + "step": 762, + "time_per_iteration": 3.3784618377685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087006, + "balance_loss_mlp": 1.05365133, + "epoch": 0.1467872258560985, + "flos": 500426629632.0, + "grad_norm": 0.083565321416964, + "language_loss": 0.88299912, + "learning_rate": 0.0009646631110312001, + "loss": 0.89386916, + "num_input_tokens_seen": 63546080, + "router_z_loss_mlp": 0.33349609, + "step": 763, + "time_per_iteration": 2.642038345336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096574, + "balance_loss_mlp": 1.06465006, + "epoch": 0.14697960754136205, + "flos": 547514201088.0, + "grad_norm": 0.05646167170610495, + "language_loss": 0.88908124, + "learning_rate": 0.0009645479807998203, + "loss": 0.900047, + "num_input_tokens_seen": 63622464, + "router_z_loss_mlp": 0.3190918, + "step": 764, + "time_per_iteration": 2.7709102630615234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093321, + "balance_loss_mlp": 1.0614922, + "epoch": 0.14717198922662564, + "flos": 517586678784.0, + "grad_norm": 0.06731397985108602, + "language_loss": 0.93233657, + "learning_rate": 0.0009644326702149196, + "loss": 0.94326979, + "num_input_tokens_seen": 63694736, + "router_z_loss_mlp": 0.31811523, + "step": 765, + "time_per_iteration": 2.691761016845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098194, + "balance_loss_mlp": 1.06472015, + "epoch": 0.1473643709118892, + "flos": 731661147648.0, + "grad_norm": 0.08664060064789567, + "language_loss": 0.85604531, + "learning_rate": 0.0009643171793212653, + "loss": 0.86702728, + "num_input_tokens_seen": 63779072, + "router_z_loss_mlp": 0.33496094, + "step": 766, + "time_per_iteration": 3.0578510761260986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095374, + "balance_loss_mlp": 1.06190002, + "epoch": 0.14755675259715276, + "flos": 620257554432.0, + "grad_norm": 0.06875066800131625, + "language_loss": 0.90379435, + "learning_rate": 0.0009642015081636952, + "loss": 0.91474807, + "num_input_tokens_seen": 63847472, + "router_z_loss_mlp": 0.33496094, + "step": 767, + "time_per_iteration": 2.690892219543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091039, + "balance_loss_mlp": 1.05830407, + "epoch": 0.14774913428241632, + "flos": 451981513728.0, + "grad_norm": 0.06617868208271054, + "language_loss": 0.88812423, + "learning_rate": 0.0009640856567871166, + "loss": 0.89903462, + "num_input_tokens_seen": 63912496, + "router_z_loss_mlp": 0.32714844, + "step": 768, + "time_per_iteration": 2.5108768939971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086849, + "balance_loss_mlp": 1.05316067, + "epoch": 0.14794151596767988, + "flos": 836881196544.0, + "grad_norm": 0.06813910901976611, + "language_loss": 0.89643073, + "learning_rate": 0.0009639696252365072, + "loss": 0.90729922, + "num_input_tokens_seen": 63990832, + "router_z_loss_mlp": 0.33691406, + "step": 769, + "time_per_iteration": 3.036872386932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087914, + "balance_loss_mlp": 1.05546558, + "epoch": 0.14813389765294344, + "flos": 685765204992.0, + "grad_norm": 0.06952898718112278, + "language_loss": 0.82433641, + "learning_rate": 0.0009638534135569144, + "loss": 0.83521557, + "num_input_tokens_seen": 64067552, + "router_z_loss_mlp": 0.32446289, + "step": 770, + "time_per_iteration": 2.920228958129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096521, + "balance_loss_mlp": 1.06395316, + "epoch": 0.148326279338207, + "flos": 509625008640.0, + "grad_norm": 0.05850145176667806, + "language_loss": 0.90417981, + "learning_rate": 0.0009637370217934554, + "loss": 0.91514498, + "num_input_tokens_seen": 64140336, + "router_z_loss_mlp": 0.32568359, + "step": 771, + "time_per_iteration": 2.6692943572998047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094476, + "balance_loss_mlp": 1.0624088, + "epoch": 0.14851866102347056, + "flos": 587869608960.0, + "grad_norm": 0.06374792966079154, + "language_loss": 0.83362675, + "learning_rate": 0.0009636204499913175, + "loss": 0.84457153, + "num_input_tokens_seen": 64223472, + "router_z_loss_mlp": 0.32055664, + "step": 772, + "time_per_iteration": 2.9103784561157227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101595, + "balance_loss_mlp": 1.07129157, + "epoch": 0.14871104270873411, + "flos": 690722366976.0, + "grad_norm": 0.05784692032564958, + "language_loss": 0.8891257, + "learning_rate": 0.0009635036981957581, + "loss": 0.90014172, + "num_input_tokens_seen": 64299872, + "router_z_loss_mlp": 0.30273438, + "step": 773, + "time_per_iteration": 2.840233087539673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109331, + "balance_loss_mlp": 1.06112361, + "epoch": 0.1489034243939977, + "flos": 654803205120.0, + "grad_norm": 0.06091674471201955, + "language_loss": 0.9126395, + "learning_rate": 0.0009633867664521043, + "loss": 0.9235726, + "num_input_tokens_seen": 64377152, + "router_z_loss_mlp": 0.32202148, + "step": 774, + "time_per_iteration": 2.8467912673950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098219, + "balance_loss_mlp": 1.0643878, + "epoch": 0.14909580607926126, + "flos": 475595891712.0, + "grad_norm": 0.06395321005815084, + "language_loss": 0.87366414, + "learning_rate": 0.0009632696548057527, + "loss": 0.8846463, + "num_input_tokens_seen": 64443008, + "router_z_loss_mlp": 0.33862305, + "step": 775, + "time_per_iteration": 2.55267596244812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090534, + "balance_loss_mlp": 1.05729866, + "epoch": 0.14928818776452482, + "flos": 610789953024.0, + "grad_norm": 0.07257335679926562, + "language_loss": 0.85489643, + "learning_rate": 0.0009631523633021704, + "loss": 0.86580181, + "num_input_tokens_seen": 64519776, + "router_z_loss_mlp": 0.33251953, + "step": 776, + "time_per_iteration": 2.800656795501709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090063, + "balance_loss_mlp": 1.05694628, + "epoch": 0.14948056944978838, + "flos": 561487859712.0, + "grad_norm": 0.058446141184189525, + "language_loss": 0.88943005, + "learning_rate": 0.0009630348919868936, + "loss": 0.90033066, + "num_input_tokens_seen": 64593712, + "router_z_loss_mlp": 0.33129883, + "step": 777, + "time_per_iteration": 2.7306644916534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088502, + "balance_loss_mlp": 1.05397916, + "epoch": 0.14967295113505194, + "flos": 448972623360.0, + "grad_norm": 0.08136314957760014, + "language_loss": 0.81536144, + "learning_rate": 0.0009629172409055293, + "loss": 0.82624644, + "num_input_tokens_seen": 64658448, + "router_z_loss_mlp": 0.34545898, + "step": 778, + "time_per_iteration": 2.532480239868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091534, + "balance_loss_mlp": 1.05937171, + "epoch": 0.1498653328203155, + "flos": 571000541184.0, + "grad_norm": 0.06865521140792329, + "language_loss": 0.88039231, + "learning_rate": 0.0009627994101037531, + "loss": 0.89130771, + "num_input_tokens_seen": 64734144, + "router_z_loss_mlp": 0.3215332, + "step": 779, + "time_per_iteration": 2.7336056232452393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091586, + "balance_loss_mlp": 1.05811191, + "epoch": 0.15005771450557906, + "flos": 630918194688.0, + "grad_norm": 0.06277485509918372, + "language_loss": 0.8981787, + "learning_rate": 0.0009626813996273114, + "loss": 0.90909451, + "num_input_tokens_seen": 64813456, + "router_z_loss_mlp": 0.3347168, + "step": 780, + "time_per_iteration": 2.8651859760284424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092654, + "balance_loss_mlp": 1.06018162, + "epoch": 0.15025009619084262, + "flos": 577633780224.0, + "grad_norm": 0.06737111741199381, + "language_loss": 0.89359641, + "learning_rate": 0.0009625632095220198, + "loss": 0.90452296, + "num_input_tokens_seen": 64896816, + "router_z_loss_mlp": 0.32470703, + "step": 781, + "time_per_iteration": 2.910163640975952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093984, + "balance_loss_mlp": 1.06041455, + "epoch": 0.1504424778761062, + "flos": 483646311936.0, + "grad_norm": 0.06188715182302237, + "language_loss": 0.87568116, + "learning_rate": 0.0009624448398337637, + "loss": 0.88662094, + "num_input_tokens_seen": 64964176, + "router_z_loss_mlp": 0.3359375, + "step": 782, + "time_per_iteration": 2.532055616378784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100397, + "balance_loss_mlp": 1.06751907, + "epoch": 0.15063485956136977, + "flos": 762167812608.0, + "grad_norm": 0.06229794960735175, + "language_loss": 0.89905757, + "learning_rate": 0.0009623262906084984, + "loss": 0.91006154, + "num_input_tokens_seen": 65042592, + "router_z_loss_mlp": 0.32861328, + "step": 783, + "time_per_iteration": 2.9851605892181396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104855, + "balance_loss_mlp": 1.0712378, + "epoch": 0.15082724124663333, + "flos": 497369687040.0, + "grad_norm": 0.060596744514248076, + "language_loss": 0.90796679, + "learning_rate": 0.0009622075618922486, + "loss": 0.91901541, + "num_input_tokens_seen": 65114576, + "router_z_loss_mlp": 0.33642578, + "step": 784, + "time_per_iteration": 2.6796786785125732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102349, + "balance_loss_mlp": 1.06928015, + "epoch": 0.15101962293189689, + "flos": 509476621824.0, + "grad_norm": 0.06389342174673626, + "language_loss": 0.87423813, + "learning_rate": 0.0009620886537311091, + "loss": 0.8852616, + "num_input_tokens_seen": 65186640, + "router_z_loss_mlp": 0.33081055, + "step": 785, + "time_per_iteration": 2.6153056621551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113163, + "balance_loss_mlp": 1.07685184, + "epoch": 0.15121200461716044, + "flos": 457520638464.0, + "grad_norm": 0.06793935281312648, + "language_loss": 0.85492945, + "learning_rate": 0.000961969566171244, + "loss": 0.86606109, + "num_input_tokens_seen": 65252112, + "router_z_loss_mlp": 0.36303711, + "step": 786, + "time_per_iteration": 2.506267786026001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122272, + "balance_loss_mlp": 1.08703363, + "epoch": 0.151404386302424, + "flos": 537729477120.0, + "grad_norm": 0.0670602351843582, + "language_loss": 0.90370345, + "learning_rate": 0.0009618502992588873, + "loss": 0.91492617, + "num_input_tokens_seen": 65318912, + "router_z_loss_mlp": 0.35253906, + "step": 787, + "time_per_iteration": 2.623457670211792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141844, + "balance_loss_mlp": 1.10658193, + "epoch": 0.15159676798768756, + "flos": 687858891264.0, + "grad_norm": 0.06543467559167064, + "language_loss": 0.88581872, + "learning_rate": 0.0009617308530403424, + "loss": 0.89723718, + "num_input_tokens_seen": 65395424, + "router_z_loss_mlp": 0.35302734, + "step": 788, + "time_per_iteration": 2.975861072540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149381, + "balance_loss_mlp": 1.11371326, + "epoch": 0.15178914967295112, + "flos": 545042193408.0, + "grad_norm": 0.059566397417978756, + "language_loss": 0.87806541, + "learning_rate": 0.0009616112275619825, + "loss": 0.88955921, + "num_input_tokens_seen": 65470480, + "router_z_loss_mlp": 0.35668945, + "step": 789, + "time_per_iteration": 2.683262348175049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152452, + "balance_loss_mlp": 1.1169517, + "epoch": 0.1519815313582147, + "flos": 511510671360.0, + "grad_norm": 0.05728483560240697, + "language_loss": 0.84466863, + "learning_rate": 0.0009614914228702503, + "loss": 0.85619313, + "num_input_tokens_seen": 65544720, + "router_z_loss_mlp": 0.35498047, + "step": 790, + "time_per_iteration": 2.6616339683532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142719, + "balance_loss_mlp": 1.10850596, + "epoch": 0.15217391304347827, + "flos": 683747122176.0, + "grad_norm": 0.057799273493116435, + "language_loss": 0.89279461, + "learning_rate": 0.0009613714390116581, + "loss": 0.90422177, + "num_input_tokens_seen": 65627872, + "router_z_loss_mlp": 0.34204102, + "step": 791, + "time_per_iteration": 2.947608470916748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133841, + "balance_loss_mlp": 1.0997231, + "epoch": 0.15236629472874183, + "flos": 643873342464.0, + "grad_norm": 0.06413295296627212, + "language_loss": 0.86589968, + "learning_rate": 0.0009612512760327879, + "loss": 0.87723809, + "num_input_tokens_seen": 65705264, + "router_z_loss_mlp": 0.34155273, + "step": 792, + "time_per_iteration": 2.8261189460754395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124449, + "balance_loss_mlp": 1.08727932, + "epoch": 0.1525586764140054, + "flos": 412654791168.0, + "grad_norm": 0.06095846853214657, + "language_loss": 0.85749042, + "learning_rate": 0.0009611309339802909, + "loss": 0.86873484, + "num_input_tokens_seen": 65768592, + "router_z_loss_mlp": 0.37182617, + "step": 793, + "time_per_iteration": 2.438474178314209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113041, + "balance_loss_mlp": 1.07811236, + "epoch": 0.15275105809926895, + "flos": 802444644864.0, + "grad_norm": 0.04691390558901254, + "language_loss": 0.84620011, + "learning_rate": 0.0009610104129008881, + "loss": 0.85733056, + "num_input_tokens_seen": 65852432, + "router_z_loss_mlp": 0.34985352, + "step": 794, + "time_per_iteration": 3.1149892807006836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092099, + "balance_loss_mlp": 1.05786228, + "epoch": 0.1529434397845325, + "flos": 612143115264.0, + "grad_norm": 0.06446455819394356, + "language_loss": 0.88995111, + "learning_rate": 0.0009608897128413701, + "loss": 0.90087205, + "num_input_tokens_seen": 65927904, + "router_z_loss_mlp": 0.3425293, + "step": 795, + "time_per_iteration": 2.7310965061187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085841, + "balance_loss_mlp": 1.05160367, + "epoch": 0.15313582146979607, + "flos": 614941009920.0, + "grad_norm": 0.04580320827636504, + "language_loss": 0.8595438, + "learning_rate": 0.0009607688338485965, + "loss": 0.87040222, + "num_input_tokens_seen": 66006800, + "router_z_loss_mlp": 0.3425293, + "step": 796, + "time_per_iteration": 2.8534584045410156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088916, + "balance_loss_mlp": 1.05427384, + "epoch": 0.15332820315505963, + "flos": 793256440320.0, + "grad_norm": 0.053101967265095064, + "language_loss": 0.91128695, + "learning_rate": 0.0009606477759694969, + "loss": 0.92217612, + "num_input_tokens_seen": 66088608, + "router_z_loss_mlp": 0.34643555, + "step": 797, + "time_per_iteration": 3.0544466972351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098724, + "balance_loss_mlp": 1.06441545, + "epoch": 0.1535205848403232, + "flos": 549945510912.0, + "grad_norm": 0.0662794157411924, + "language_loss": 0.87591946, + "learning_rate": 0.0009605265392510703, + "loss": 0.88690674, + "num_input_tokens_seen": 66153616, + "router_z_loss_mlp": 0.34350586, + "step": 798, + "time_per_iteration": 2.6120660305023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011091, + "balance_loss_mlp": 1.07417202, + "epoch": 0.15371296652558677, + "flos": 535691045376.0, + "grad_norm": 0.07220239734969772, + "language_loss": 0.92342889, + "learning_rate": 0.0009604051237403846, + "loss": 0.93451989, + "num_input_tokens_seen": 66219472, + "router_z_loss_mlp": 0.34960938, + "step": 799, + "time_per_iteration": 2.640749216079712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111728, + "balance_loss_mlp": 1.07808757, + "epoch": 0.15390534821085033, + "flos": 395002939392.0, + "grad_norm": 0.06314402273456009, + "language_loss": 0.86126584, + "learning_rate": 0.0009602835294845776, + "loss": 0.87238312, + "num_input_tokens_seen": 66281456, + "router_z_loss_mlp": 0.33666992, + "step": 800, + "time_per_iteration": 2.44914174079895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117351, + "balance_loss_mlp": 1.08254242, + "epoch": 0.1540977298961139, + "flos": 535587738624.0, + "grad_norm": 0.057636094576239, + "language_loss": 0.91100746, + "learning_rate": 0.0009601617565308565, + "loss": 0.92218101, + "num_input_tokens_seen": 66348160, + "router_z_loss_mlp": 0.34790039, + "step": 801, + "time_per_iteration": 2.599679470062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119062, + "balance_loss_mlp": 1.08511138, + "epoch": 0.15429011158137745, + "flos": 723388147200.0, + "grad_norm": 0.05961266019354579, + "language_loss": 0.86783326, + "learning_rate": 0.0009600398049264977, + "loss": 0.87902391, + "num_input_tokens_seen": 66430576, + "router_z_loss_mlp": 0.33935547, + "step": 802, + "time_per_iteration": 2.9514007568359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121787, + "balance_loss_mlp": 1.08735943, + "epoch": 0.154482493266641, + "flos": 620209502208.0, + "grad_norm": 0.06366105456569557, + "language_loss": 0.92098475, + "learning_rate": 0.0009599176747188469, + "loss": 0.9322027, + "num_input_tokens_seen": 66506480, + "router_z_loss_mlp": 0.34448242, + "step": 803, + "time_per_iteration": 2.8068411350250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114024, + "balance_loss_mlp": 1.08012128, + "epoch": 0.15467487495190457, + "flos": 525351909888.0, + "grad_norm": 0.08101366702111423, + "language_loss": 0.83651662, + "learning_rate": 0.0009597953659553196, + "loss": 0.84765685, + "num_input_tokens_seen": 66577680, + "router_z_loss_mlp": 0.33911133, + "step": 804, + "time_per_iteration": 2.7075448036193848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098377, + "balance_loss_mlp": 1.06616712, + "epoch": 0.15486725663716813, + "flos": 527473299456.0, + "grad_norm": 0.07377431927286832, + "language_loss": 0.89624304, + "learning_rate": 0.0009596728786833997, + "loss": 0.90722686, + "num_input_tokens_seen": 66648496, + "router_z_loss_mlp": 0.32202148, + "step": 805, + "time_per_iteration": 2.6376051902770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088085, + "balance_loss_mlp": 1.05420554, + "epoch": 0.1550596383224317, + "flos": 1048118482944.0, + "grad_norm": 0.06708822771662253, + "language_loss": 0.90018022, + "learning_rate": 0.0009595502129506415, + "loss": 0.91106105, + "num_input_tokens_seen": 66735216, + "router_z_loss_mlp": 0.33911133, + "step": 806, + "time_per_iteration": 3.3391284942626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092582, + "balance_loss_mlp": 1.05903625, + "epoch": 0.15525202000769528, + "flos": 613438050816.0, + "grad_norm": 0.06052700763637142, + "language_loss": 0.83084035, + "learning_rate": 0.0009594273688046678, + "loss": 0.84176612, + "num_input_tokens_seen": 66810672, + "router_z_loss_mlp": 0.33544922, + "step": 807, + "time_per_iteration": 2.7136006355285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088184, + "balance_loss_mlp": 1.05273128, + "epoch": 0.15544440169295884, + "flos": 532805810688.0, + "grad_norm": 0.07048562468234597, + "language_loss": 0.86048424, + "learning_rate": 0.000959304346293171, + "loss": 0.87136608, + "num_input_tokens_seen": 66879824, + "router_z_loss_mlp": 0.35473633, + "step": 808, + "time_per_iteration": 2.6744906902313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097573, + "balance_loss_mlp": 1.06254935, + "epoch": 0.1556367833782224, + "flos": 644433546240.0, + "grad_norm": 0.06803397985071584, + "language_loss": 0.88331544, + "learning_rate": 0.0009591811454639125, + "loss": 0.89429116, + "num_input_tokens_seen": 66949424, + "router_z_loss_mlp": 0.3503418, + "step": 809, + "time_per_iteration": 2.730431079864502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094476, + "balance_loss_mlp": 1.0610261, + "epoch": 0.15582916506348596, + "flos": 543540644352.0, + "grad_norm": 0.06204685505428811, + "language_loss": 0.88227659, + "learning_rate": 0.0009590577663647234, + "loss": 0.89322132, + "num_input_tokens_seen": 67024000, + "router_z_loss_mlp": 0.3347168, + "step": 810, + "time_per_iteration": 2.71469783782959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104684, + "balance_loss_mlp": 1.07078123, + "epoch": 0.15602154674874952, + "flos": 579740613120.0, + "grad_norm": 0.05672341894910533, + "language_loss": 0.86610442, + "learning_rate": 0.0009589342090435036, + "loss": 0.87715125, + "num_input_tokens_seen": 67100672, + "router_z_loss_mlp": 0.33935547, + "step": 811, + "time_per_iteration": 2.799246072769165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110387, + "balance_loss_mlp": 1.06918025, + "epoch": 0.15621392843401308, + "flos": 534982454784.0, + "grad_norm": 0.0647852675732537, + "language_loss": 0.87778354, + "learning_rate": 0.0009588104735482223, + "loss": 0.8888222, + "num_input_tokens_seen": 67171584, + "router_z_loss_mlp": 0.34692383, + "step": 812, + "time_per_iteration": 2.6684510707855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126921, + "balance_loss_mlp": 1.09106326, + "epoch": 0.15640631011927664, + "flos": 550635162624.0, + "grad_norm": 0.08222618986335321, + "language_loss": 0.84280443, + "learning_rate": 0.0009586865599269177, + "loss": 0.85407358, + "num_input_tokens_seen": 67240640, + "router_z_loss_mlp": 0.35864258, + "step": 813, + "time_per_iteration": 2.6293816566467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131277, + "balance_loss_mlp": 1.09651566, + "epoch": 0.1565986918045402, + "flos": 637190641152.0, + "grad_norm": 0.05945515562529824, + "language_loss": 0.88725412, + "learning_rate": 0.0009585624682276977, + "loss": 0.89856684, + "num_input_tokens_seen": 67312976, + "router_z_loss_mlp": 0.34814453, + "step": 814, + "time_per_iteration": 2.744253158569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137563, + "balance_loss_mlp": 1.10113239, + "epoch": 0.15679107348980378, + "flos": 490569122304.0, + "grad_norm": 0.09591637295165127, + "language_loss": 0.87945771, + "learning_rate": 0.0009584381984987386, + "loss": 0.89083332, + "num_input_tokens_seen": 67378528, + "router_z_loss_mlp": 0.36474609, + "step": 815, + "time_per_iteration": 2.5264036655426025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124613, + "balance_loss_mlp": 1.0911386, + "epoch": 0.15698345517506734, + "flos": 529689231360.0, + "grad_norm": 0.05838460881618622, + "language_loss": 0.90277314, + "learning_rate": 0.0009583137507882864, + "loss": 0.91401929, + "num_input_tokens_seen": 67449728, + "router_z_loss_mlp": 0.3347168, + "step": 816, + "time_per_iteration": 2.6488330364227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109898, + "balance_loss_mlp": 1.07418323, + "epoch": 0.1571758368603309, + "flos": 545779897344.0, + "grad_norm": 0.07313796537718548, + "language_loss": 0.81262791, + "learning_rate": 0.000958189125144656, + "loss": 0.82372689, + "num_input_tokens_seen": 67520512, + "router_z_loss_mlp": 0.35766602, + "step": 817, + "time_per_iteration": 2.7040657997131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101746, + "balance_loss_mlp": 1.06672239, + "epoch": 0.15736821854559446, + "flos": 565377048576.0, + "grad_norm": 0.067694528538076, + "language_loss": 0.88558215, + "learning_rate": 0.0009580643216162313, + "loss": 0.89659959, + "num_input_tokens_seen": 67592464, + "router_z_loss_mlp": 0.3503418, + "step": 818, + "time_per_iteration": 2.6538634300231934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096156, + "balance_loss_mlp": 1.06110835, + "epoch": 0.15756060023085802, + "flos": 500707436544.0, + "grad_norm": 0.05957146674366314, + "language_loss": 0.79884583, + "learning_rate": 0.0009579393402514652, + "loss": 0.80980736, + "num_input_tokens_seen": 67658928, + "router_z_loss_mlp": 0.35107422, + "step": 819, + "time_per_iteration": 2.5606625080108643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082975, + "balance_loss_mlp": 1.048738, + "epoch": 0.15775298191612158, + "flos": 519014034432.0, + "grad_norm": 0.06194437160070725, + "language_loss": 0.91126758, + "learning_rate": 0.0009578141810988801, + "loss": 0.92209733, + "num_input_tokens_seen": 67727936, + "router_z_loss_mlp": 0.34228516, + "step": 820, + "time_per_iteration": 2.55538010597229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082194, + "balance_loss_mlp": 1.04712272, + "epoch": 0.15794536360138514, + "flos": 465891153408.0, + "grad_norm": 0.060184436438788555, + "language_loss": 0.91010749, + "learning_rate": 0.0009576888442070668, + "loss": 0.92092943, + "num_input_tokens_seen": 67795488, + "router_z_loss_mlp": 0.35083008, + "step": 821, + "time_per_iteration": 2.6139276027679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094225, + "balance_loss_mlp": 1.05982161, + "epoch": 0.1581377452866487, + "flos": 516911583744.0, + "grad_norm": 0.06832586535724347, + "language_loss": 0.92820144, + "learning_rate": 0.0009575633296246854, + "loss": 0.93914366, + "num_input_tokens_seen": 67858896, + "router_z_loss_mlp": 0.34423828, + "step": 822, + "time_per_iteration": 2.557404041290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096972, + "balance_loss_mlp": 1.06242526, + "epoch": 0.15833012697191226, + "flos": 549522109440.0, + "grad_norm": 0.06257557491721027, + "language_loss": 0.83520567, + "learning_rate": 0.0009574376374004652, + "loss": 0.84617537, + "num_input_tokens_seen": 67924864, + "router_z_loss_mlp": 0.34570312, + "step": 823, + "time_per_iteration": 2.673220157623291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100077, + "balance_loss_mlp": 1.06395626, + "epoch": 0.15852250865717585, + "flos": 487206641664.0, + "grad_norm": 0.07116075590187526, + "language_loss": 0.81073487, + "learning_rate": 0.000957311767583204, + "loss": 0.82173562, + "num_input_tokens_seen": 67992912, + "router_z_loss_mlp": 0.36132812, + "step": 824, + "time_per_iteration": 2.605074882507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159484, + "balance_loss_mlp": 1.14126849, + "epoch": 0.1587148903424394, + "flos": 1309041672192.0, + "grad_norm": 0.051809649393169656, + "language_loss": 0.8207159, + "learning_rate": 0.0009571857202217691, + "loss": 0.83231074, + "num_input_tokens_seen": 68207408, + "router_z_loss_mlp": 0.18261719, + "step": 825, + "time_per_iteration": 4.726073265075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111019, + "balance_loss_mlp": 1.07349157, + "epoch": 0.15890727202770297, + "flos": 466634649600.0, + "grad_norm": 0.07947222616221912, + "language_loss": 0.92132723, + "learning_rate": 0.0009570594953650961, + "loss": 0.93243748, + "num_input_tokens_seen": 68270864, + "router_z_loss_mlp": 0.37524414, + "step": 826, + "time_per_iteration": 2.5146830081939697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109067, + "balance_loss_mlp": 1.07208848, + "epoch": 0.15909965371296653, + "flos": 776733608448.0, + "grad_norm": 0.06013225990958685, + "language_loss": 0.80852252, + "learning_rate": 0.00095693309306219, + "loss": 0.81961316, + "num_input_tokens_seen": 68355408, + "router_z_loss_mlp": 0.36962891, + "step": 827, + "time_per_iteration": 3.095632553100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117102, + "balance_loss_mlp": 1.07945621, + "epoch": 0.1592920353982301, + "flos": 1077852538368.0, + "grad_norm": 0.05984978885312211, + "language_loss": 0.88600951, + "learning_rate": 0.0009568065133621244, + "loss": 0.89718056, + "num_input_tokens_seen": 68437072, + "router_z_loss_mlp": 0.37646484, + "step": 828, + "time_per_iteration": 3.3153574466705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111269, + "balance_loss_mlp": 1.07584, + "epoch": 0.15948441708349365, + "flos": 725307305472.0, + "grad_norm": 0.0632864692280333, + "language_loss": 0.85493571, + "learning_rate": 0.0009566797563140422, + "loss": 0.86604846, + "num_input_tokens_seen": 68511696, + "router_z_loss_mlp": 0.35449219, + "step": 829, + "time_per_iteration": 2.8705785274505615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121543, + "balance_loss_mlp": 1.08470702, + "epoch": 0.1596767987687572, + "flos": 578447087616.0, + "grad_norm": 0.06433687205870958, + "language_loss": 0.88630873, + "learning_rate": 0.0009565528219671547, + "loss": 0.89752412, + "num_input_tokens_seen": 68587488, + "router_z_loss_mlp": 0.36816406, + "step": 830, + "time_per_iteration": 2.8890771865844727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137333, + "balance_loss_mlp": 1.10049748, + "epoch": 0.15986918045402077, + "flos": 528728947200.0, + "grad_norm": 0.04994246668943954, + "language_loss": 0.85232639, + "learning_rate": 0.0009564257103707418, + "loss": 0.86369967, + "num_input_tokens_seen": 68655760, + "router_z_loss_mlp": 0.36816406, + "step": 831, + "time_per_iteration": 2.5870308876037598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133852, + "balance_loss_mlp": 1.09632492, + "epoch": 0.16006156213928435, + "flos": 574313559552.0, + "grad_norm": 0.0648316290803925, + "language_loss": 0.91675746, + "learning_rate": 0.0009562984215741533, + "loss": 0.92809594, + "num_input_tokens_seen": 68724560, + "router_z_loss_mlp": 0.37524414, + "step": 832, + "time_per_iteration": 2.655066967010498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117496, + "balance_loss_mlp": 1.08170903, + "epoch": 0.1602539438245479, + "flos": 515258675712.0, + "grad_norm": 0.14271195523272245, + "language_loss": 0.82911491, + "learning_rate": 0.0009561709556268065, + "loss": 0.84028995, + "num_input_tokens_seen": 68795440, + "router_z_loss_mlp": 0.35839844, + "step": 833, + "time_per_iteration": 2.69999098777771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119914, + "balance_loss_mlp": 1.08419931, + "epoch": 0.16044632550981147, + "flos": 620730418176.0, + "grad_norm": 0.05962773238435596, + "language_loss": 0.95060706, + "learning_rate": 0.0009560433125781884, + "loss": 0.96180618, + "num_input_tokens_seen": 68868176, + "router_z_loss_mlp": 0.35693359, + "step": 834, + "time_per_iteration": 2.711109161376953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126421, + "balance_loss_mlp": 1.08977628, + "epoch": 0.16063870719507503, + "flos": 560817146880.0, + "grad_norm": 0.06388697234939344, + "language_loss": 0.92829657, + "learning_rate": 0.0009559154924778544, + "loss": 0.93956077, + "num_input_tokens_seen": 68939616, + "router_z_loss_mlp": 0.36621094, + "step": 835, + "time_per_iteration": 2.695260763168335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121916, + "balance_loss_mlp": 1.08789361, + "epoch": 0.1608310888803386, + "flos": 804778440192.0, + "grad_norm": 0.05750453212643973, + "language_loss": 0.85217482, + "learning_rate": 0.0009557874953754284, + "loss": 0.86339402, + "num_input_tokens_seen": 69016192, + "router_z_loss_mlp": 0.34057617, + "step": 836, + "time_per_iteration": 3.002013921737671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126166, + "balance_loss_mlp": 1.09204817, + "epoch": 0.16102347056560215, + "flos": 600311195136.0, + "grad_norm": 0.06332628409766573, + "language_loss": 0.84060842, + "learning_rate": 0.0009556593213206038, + "loss": 0.85187006, + "num_input_tokens_seen": 69089360, + "router_z_loss_mlp": 0.34130859, + "step": 837, + "time_per_iteration": 2.698716163635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125003, + "balance_loss_mlp": 1.09102869, + "epoch": 0.1612158522508657, + "flos": 553235208192.0, + "grad_norm": 0.07524747482874264, + "language_loss": 0.87844718, + "learning_rate": 0.0009555309703631414, + "loss": 0.88969719, + "num_input_tokens_seen": 69161952, + "router_z_loss_mlp": 0.33984375, + "step": 838, + "time_per_iteration": 2.669588327407837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133813, + "balance_loss_mlp": 1.09752607, + "epoch": 0.16140823393612927, + "flos": 555701423616.0, + "grad_norm": 0.07144746672945328, + "language_loss": 0.87685311, + "learning_rate": 0.0009554024425528722, + "loss": 0.88819122, + "num_input_tokens_seen": 69232432, + "router_z_loss_mlp": 0.36279297, + "step": 839, + "time_per_iteration": 2.6809709072113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112028, + "balance_loss_mlp": 1.08737814, + "epoch": 0.16160061562139286, + "flos": 543613427712.0, + "grad_norm": 0.06970106087394082, + "language_loss": 0.8929134, + "learning_rate": 0.0009552737379396948, + "loss": 0.90411627, + "num_input_tokens_seen": 69297696, + "router_z_loss_mlp": 0.32885742, + "step": 840, + "time_per_iteration": 2.6100995540618896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102587, + "balance_loss_mlp": 1.06920815, + "epoch": 0.16179299730665642, + "flos": 603590717952.0, + "grad_norm": 0.06131687325166246, + "language_loss": 0.87945604, + "learning_rate": 0.0009551448565735767, + "loss": 0.89048195, + "num_input_tokens_seen": 69373888, + "router_z_loss_mlp": 0.33398438, + "step": 841, + "time_per_iteration": 2.7360360622406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095736, + "balance_loss_mlp": 1.06168985, + "epoch": 0.16198537899191998, + "flos": 786821050368.0, + "grad_norm": 0.07162496841720159, + "language_loss": 0.8519845, + "learning_rate": 0.0009550157985045543, + "loss": 0.86294186, + "num_input_tokens_seen": 69449984, + "router_z_loss_mlp": 0.34082031, + "step": 842, + "time_per_iteration": 3.0436456203460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087632, + "balance_loss_mlp": 1.05413389, + "epoch": 0.16217776067718354, + "flos": 519550917120.0, + "grad_norm": 0.060562390499230526, + "language_loss": 0.89622426, + "learning_rate": 0.0009548865637827321, + "loss": 0.90710062, + "num_input_tokens_seen": 69522736, + "router_z_loss_mlp": 0.33496094, + "step": 843, + "time_per_iteration": 2.6422221660614014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086448, + "balance_loss_mlp": 1.05342698, + "epoch": 0.1623701423624471, + "flos": 505015644672.0, + "grad_norm": 0.07097995853224412, + "language_loss": 0.90216166, + "learning_rate": 0.0009547571524582838, + "loss": 0.91302609, + "num_input_tokens_seen": 69587184, + "router_z_loss_mlp": 0.33032227, + "step": 844, + "time_per_iteration": 2.6082894802093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095057, + "balance_loss_mlp": 1.06031966, + "epoch": 0.16256252404771065, + "flos": 496940493312.0, + "grad_norm": 0.06932052947515681, + "language_loss": 0.92511153, + "learning_rate": 0.0009546275645814512, + "loss": 0.9360621, + "num_input_tokens_seen": 69656560, + "router_z_loss_mlp": 0.34765625, + "step": 845, + "time_per_iteration": 2.5985872745513916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100013, + "balance_loss_mlp": 1.065418, + "epoch": 0.16275490573297421, + "flos": 502110061056.0, + "grad_norm": 0.07540183512891604, + "language_loss": 0.90294898, + "learning_rate": 0.0009544978002025446, + "loss": 0.91394913, + "num_input_tokens_seen": 69723872, + "router_z_loss_mlp": 0.34619141, + "step": 846, + "time_per_iteration": 2.5778391361236572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096289, + "balance_loss_mlp": 1.06174231, + "epoch": 0.16294728741823777, + "flos": 506952179712.0, + "grad_norm": 0.06018935314915502, + "language_loss": 0.87532055, + "learning_rate": 0.0009543678593719434, + "loss": 0.8862834, + "num_input_tokens_seen": 69795504, + "router_z_loss_mlp": 0.34570312, + "step": 847, + "time_per_iteration": 2.697566270828247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102624, + "balance_loss_mlp": 1.06824434, + "epoch": 0.16313966910350133, + "flos": 509418395136.0, + "grad_norm": 0.054217985504269955, + "language_loss": 0.8754853, + "learning_rate": 0.0009542377421400945, + "loss": 0.88651162, + "num_input_tokens_seen": 69873408, + "router_z_loss_mlp": 0.34375, + "step": 848, + "time_per_iteration": 2.786766290664673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104457, + "balance_loss_mlp": 1.06847942, + "epoch": 0.16333205078876492, + "flos": 543712352256.0, + "grad_norm": 0.06122856356214084, + "language_loss": 0.83524954, + "learning_rate": 0.0009541074485575145, + "loss": 0.84629411, + "num_input_tokens_seen": 69944112, + "router_z_loss_mlp": 0.35986328, + "step": 849, + "time_per_iteration": 2.713759183883667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098701, + "balance_loss_mlp": 1.06346297, + "epoch": 0.16352443247402848, + "flos": 507477477888.0, + "grad_norm": 0.06331477383231503, + "language_loss": 0.92240757, + "learning_rate": 0.0009539769786747874, + "loss": 0.93339461, + "num_input_tokens_seen": 70012288, + "router_z_loss_mlp": 0.35253906, + "step": 850, + "time_per_iteration": 2.589945077896118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100894, + "balance_loss_mlp": 1.06584692, + "epoch": 0.16371681415929204, + "flos": 541851420672.0, + "grad_norm": 0.06704648725492578, + "language_loss": 0.81567919, + "learning_rate": 0.0009538463325425665, + "loss": 0.82668811, + "num_input_tokens_seen": 70086560, + "router_z_loss_mlp": 0.35083008, + "step": 851, + "time_per_iteration": 2.6779844760894775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105544, + "balance_loss_mlp": 1.07042515, + "epoch": 0.1639091958445556, + "flos": 520501026816.0, + "grad_norm": 0.058426853420895056, + "language_loss": 0.8673842, + "learning_rate": 0.0009537155102115728, + "loss": 0.87843966, + "num_input_tokens_seen": 70153968, + "router_z_loss_mlp": 0.35131836, + "step": 852, + "time_per_iteration": 2.5614206790924072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106136, + "balance_loss_mlp": 1.07175565, + "epoch": 0.16410157752981916, + "flos": 547149026304.0, + "grad_norm": 0.06460558975646845, + "language_loss": 0.83482397, + "learning_rate": 0.0009535845117325961, + "loss": 0.84588534, + "num_input_tokens_seen": 70222496, + "router_z_loss_mlp": 0.34423828, + "step": 853, + "time_per_iteration": 2.6453073024749756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098632, + "balance_loss_mlp": 1.06470513, + "epoch": 0.16429395921508272, + "flos": 582561828864.0, + "grad_norm": 0.052152281018199936, + "language_loss": 0.93584174, + "learning_rate": 0.0009534533371564946, + "loss": 0.94682807, + "num_input_tokens_seen": 70301680, + "router_z_loss_mlp": 0.33959961, + "step": 854, + "time_per_iteration": 2.75186824798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111206, + "balance_loss_mlp": 1.07670665, + "epoch": 0.16448634090034628, + "flos": 530678628864.0, + "grad_norm": 0.06475772966833339, + "language_loss": 0.8907218, + "learning_rate": 0.0009533219865341949, + "loss": 0.90183383, + "num_input_tokens_seen": 70371152, + "router_z_loss_mlp": 0.3449707, + "step": 855, + "time_per_iteration": 2.581479787826538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108852, + "balance_loss_mlp": 1.07285094, + "epoch": 0.16467872258560984, + "flos": 491623948800.0, + "grad_norm": 0.06378602693040462, + "language_loss": 0.87287533, + "learning_rate": 0.0009531904599166916, + "loss": 0.88396388, + "num_input_tokens_seen": 70440832, + "router_z_loss_mlp": 0.36035156, + "step": 856, + "time_per_iteration": 2.6429831981658936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107008, + "balance_loss_mlp": 1.07141232, + "epoch": 0.16487110427087343, + "flos": 506015216640.0, + "grad_norm": 0.07162133431974482, + "language_loss": 0.85139728, + "learning_rate": 0.0009530587573550478, + "loss": 0.86246729, + "num_input_tokens_seen": 70507424, + "router_z_loss_mlp": 0.35620117, + "step": 857, + "time_per_iteration": 2.5667338371276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071319, + "balance_loss_mlp": 1.05434394, + "epoch": 0.16506348595613698, + "flos": 1432006553088.0, + "grad_norm": 0.02717136097410494, + "language_loss": 0.74319386, + "learning_rate": 0.0009529268789003953, + "loss": 0.75390708, + "num_input_tokens_seen": 70742320, + "router_z_loss_mlp": 0.16992188, + "step": 858, + "time_per_iteration": 5.02930474281311 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107487, + "balance_loss_mlp": 1.0740366, + "epoch": 0.16525586764140054, + "flos": 476890827264.0, + "grad_norm": 0.06438670275364486, + "language_loss": 0.90481895, + "learning_rate": 0.0009527948246039337, + "loss": 0.91589379, + "num_input_tokens_seen": 70808400, + "router_z_loss_mlp": 0.33447266, + "step": 859, + "time_per_iteration": 2.5222055912017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109782, + "balance_loss_mlp": 1.07618856, + "epoch": 0.1654482493266641, + "flos": 880737297408.0, + "grad_norm": 0.058857893791213665, + "language_loss": 0.88361865, + "learning_rate": 0.000952662594516931, + "loss": 0.8947165, + "num_input_tokens_seen": 70886192, + "router_z_loss_mlp": 0.33618164, + "step": 860, + "time_per_iteration": 3.065053701400757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109979, + "balance_loss_mlp": 1.07497942, + "epoch": 0.16564063101192766, + "flos": 626527028736.0, + "grad_norm": 0.058557043780191484, + "language_loss": 0.86803752, + "learning_rate": 0.0009525301886907234, + "loss": 0.87913728, + "num_input_tokens_seen": 70964816, + "router_z_loss_mlp": 0.34985352, + "step": 861, + "time_per_iteration": 2.873415470123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121945, + "balance_loss_mlp": 1.08537149, + "epoch": 0.16583301269719122, + "flos": 561250722816.0, + "grad_norm": 0.0761086770239273, + "language_loss": 0.8825953, + "learning_rate": 0.0009523976071767155, + "loss": 0.8938148, + "num_input_tokens_seen": 71037456, + "router_z_loss_mlp": 0.36572266, + "step": 862, + "time_per_iteration": 2.71508526802063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115561, + "balance_loss_mlp": 1.07994115, + "epoch": 0.16602539438245478, + "flos": 567510022656.0, + "grad_norm": 0.05388299317844869, + "language_loss": 0.88433009, + "learning_rate": 0.00095226485002638, + "loss": 0.8954857, + "num_input_tokens_seen": 71111872, + "router_z_loss_mlp": 0.35620117, + "step": 863, + "time_per_iteration": 2.7524497509002686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111173, + "balance_loss_mlp": 1.07617354, + "epoch": 0.16621777606771834, + "flos": 574589984256.0, + "grad_norm": 0.05833582522103205, + "language_loss": 0.89311475, + "learning_rate": 0.0009521319172912576, + "loss": 0.90422642, + "num_input_tokens_seen": 71187808, + "router_z_loss_mlp": 0.35009766, + "step": 864, + "time_per_iteration": 2.717493772506714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134798, + "balance_loss_mlp": 1.09846306, + "epoch": 0.16641015775298193, + "flos": 514292599296.0, + "grad_norm": 0.05644176285984134, + "language_loss": 0.94990546, + "learning_rate": 0.0009519988090229579, + "loss": 0.96125346, + "num_input_tokens_seen": 71261728, + "router_z_loss_mlp": 0.36352539, + "step": 865, + "time_per_iteration": 2.6850624084472656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133116, + "balance_loss_mlp": 1.09668565, + "epoch": 0.1666025394382455, + "flos": 621395338752.0, + "grad_norm": 0.05816643645022503, + "language_loss": 0.88684535, + "learning_rate": 0.0009518655252731576, + "loss": 0.89817655, + "num_input_tokens_seen": 71338352, + "router_z_loss_mlp": 0.36450195, + "step": 866, + "time_per_iteration": 2.7240021228790283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124082, + "balance_loss_mlp": 1.0882715, + "epoch": 0.16679492112350905, + "flos": 548528329728.0, + "grad_norm": 0.06128727898968579, + "language_loss": 0.9070124, + "learning_rate": 0.0009517320660936022, + "loss": 0.91825324, + "num_input_tokens_seen": 71416544, + "router_z_loss_mlp": 0.35839844, + "step": 867, + "time_per_iteration": 2.6959731578826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118134, + "balance_loss_mlp": 1.08260965, + "epoch": 0.1669873028087726, + "flos": 665379477504.0, + "grad_norm": 0.05857722537468161, + "language_loss": 0.83557463, + "learning_rate": 0.0009515984315361051, + "loss": 0.84675598, + "num_input_tokens_seen": 71494080, + "router_z_loss_mlp": 0.35546875, + "step": 868, + "time_per_iteration": 2.813674211502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122458, + "balance_loss_mlp": 1.08638549, + "epoch": 0.16717968449403617, + "flos": 538305647616.0, + "grad_norm": 0.06553445455365839, + "language_loss": 0.87103701, + "learning_rate": 0.000951464621652548, + "loss": 0.88226151, + "num_input_tokens_seen": 71562672, + "router_z_loss_mlp": 0.36083984, + "step": 869, + "time_per_iteration": 2.674333333969116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111253, + "balance_loss_mlp": 1.07757819, + "epoch": 0.16737206617929973, + "flos": 529833235968.0, + "grad_norm": 0.059309523866322815, + "language_loss": 0.78951609, + "learning_rate": 0.0009513306364948804, + "loss": 0.80064136, + "num_input_tokens_seen": 71641904, + "router_z_loss_mlp": 0.34985352, + "step": 870, + "time_per_iteration": 2.7519431114196777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011143, + "balance_loss_mlp": 1.07953846, + "epoch": 0.1675644478645633, + "flos": 480529732608.0, + "grad_norm": 0.06711563999134491, + "language_loss": 0.89559376, + "learning_rate": 0.0009511964761151197, + "loss": 0.90673673, + "num_input_tokens_seen": 71709616, + "router_z_loss_mlp": 0.34814453, + "step": 871, + "time_per_iteration": 2.544520854949951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113298, + "balance_loss_mlp": 1.07820272, + "epoch": 0.16775682954982685, + "flos": 494311334400.0, + "grad_norm": 0.06484202096701225, + "language_loss": 0.9050945, + "learning_rate": 0.0009510621405653521, + "loss": 0.91622752, + "num_input_tokens_seen": 71776592, + "router_z_loss_mlp": 0.35131836, + "step": 872, + "time_per_iteration": 2.594224452972412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106918, + "balance_loss_mlp": 1.07265687, + "epoch": 0.1679492112350904, + "flos": 751694846976.0, + "grad_norm": 0.060317450015561574, + "language_loss": 0.847211, + "learning_rate": 0.0009509276298977309, + "loss": 0.85828018, + "num_input_tokens_seen": 71856352, + "router_z_loss_mlp": 0.34277344, + "step": 873, + "time_per_iteration": 2.9428915977478027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110568, + "balance_loss_mlp": 1.07261181, + "epoch": 0.168141592920354, + "flos": 1135413075456.0, + "grad_norm": 0.05441785661992682, + "language_loss": 0.81867516, + "learning_rate": 0.0009507929441644778, + "loss": 0.82978088, + "num_input_tokens_seen": 71948480, + "router_z_loss_mlp": 0.37939453, + "step": 874, + "time_per_iteration": 3.52008318901062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101336, + "balance_loss_mlp": 1.06640816, + "epoch": 0.16833397460561755, + "flos": 632114205696.0, + "grad_norm": 0.06557720885733571, + "language_loss": 0.86201179, + "learning_rate": 0.0009506580834178826, + "loss": 0.87302518, + "num_input_tokens_seen": 72019200, + "router_z_loss_mlp": 0.34936523, + "step": 875, + "time_per_iteration": 2.7744014263153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110472, + "balance_loss_mlp": 1.06817079, + "epoch": 0.1685263562908811, + "flos": 541171943424.0, + "grad_norm": 0.06007828909359903, + "language_loss": 0.91612709, + "learning_rate": 0.0009505230477103028, + "loss": 0.92717427, + "num_input_tokens_seen": 72088672, + "router_z_loss_mlp": 0.36547852, + "step": 876, + "time_per_iteration": 2.6593635082244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103279, + "balance_loss_mlp": 1.06703997, + "epoch": 0.16871873797614467, + "flos": 619036812288.0, + "grad_norm": 0.08702038824672748, + "language_loss": 0.81312418, + "learning_rate": 0.0009503878370941641, + "loss": 0.824157, + "num_input_tokens_seen": 72159952, + "router_z_loss_mlp": 0.36206055, + "step": 877, + "time_per_iteration": 2.7511024475097656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094199, + "balance_loss_mlp": 1.05986643, + "epoch": 0.16891111966140823, + "flos": 606067107840.0, + "grad_norm": 0.06953183101172467, + "language_loss": 0.88841844, + "learning_rate": 0.0009502524516219595, + "loss": 0.89936042, + "num_input_tokens_seen": 72231648, + "router_z_loss_mlp": 0.34375, + "step": 878, + "time_per_iteration": 2.697455406188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091575, + "balance_loss_mlp": 1.05757689, + "epoch": 0.1691035013466718, + "flos": 552058136064.0, + "grad_norm": 0.0721678347454753, + "language_loss": 0.89980447, + "learning_rate": 0.0009501168913462506, + "loss": 0.91072023, + "num_input_tokens_seen": 72298608, + "router_z_loss_mlp": 0.34008789, + "step": 879, + "time_per_iteration": 2.6825287342071533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080465, + "balance_loss_mlp": 1.06263125, + "epoch": 0.16929588303193535, + "flos": 1475544121344.0, + "grad_norm": 0.044515803528062385, + "language_loss": 0.79121923, + "learning_rate": 0.0009499811563196665, + "loss": 0.80202389, + "num_input_tokens_seen": 72525312, + "router_z_loss_mlp": 0.17871094, + "step": 880, + "time_per_iteration": 4.825777769088745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081319, + "balance_loss_mlp": 1.0464623, + "epoch": 0.1694882647171989, + "flos": 925850456064.0, + "grad_norm": 0.06491790696384477, + "language_loss": 0.85360616, + "learning_rate": 0.0009498452465949042, + "loss": 0.86441934, + "num_input_tokens_seen": 72612976, + "router_z_loss_mlp": 0.34887695, + "step": 881, + "time_per_iteration": 3.2700376510620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086319, + "balance_loss_mlp": 1.05227244, + "epoch": 0.1696806464024625, + "flos": 545829359616.0, + "grad_norm": 0.057533624801199786, + "language_loss": 0.916857, + "learning_rate": 0.0009497091622247285, + "loss": 0.92772019, + "num_input_tokens_seen": 72686800, + "router_z_loss_mlp": 0.34082031, + "step": 882, + "time_per_iteration": 2.711721181869507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085559, + "balance_loss_mlp": 1.05184698, + "epoch": 0.16987302808772606, + "flos": 528970466304.0, + "grad_norm": 0.08384615451013337, + "language_loss": 0.93744707, + "learning_rate": 0.0009495729032619723, + "loss": 0.94830269, + "num_input_tokens_seen": 72759360, + "router_z_loss_mlp": 0.33740234, + "step": 883, + "time_per_iteration": 2.688525438308716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084621, + "balance_loss_mlp": 1.05062199, + "epoch": 0.17006540977298962, + "flos": 754855096320.0, + "grad_norm": 0.06073677328113264, + "language_loss": 0.84419179, + "learning_rate": 0.0009494364697595354, + "loss": 0.85503805, + "num_input_tokens_seen": 72831424, + "router_z_loss_mlp": 0.34033203, + "step": 884, + "time_per_iteration": 2.9112000465393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092434, + "balance_loss_mlp": 1.05750597, + "epoch": 0.17025779145825318, + "flos": 558532813824.0, + "grad_norm": 0.06728326387015754, + "language_loss": 0.89818925, + "learning_rate": 0.0009492998617703867, + "loss": 0.90911365, + "num_input_tokens_seen": 72901536, + "router_z_loss_mlp": 0.34936523, + "step": 885, + "time_per_iteration": 2.6760926246643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093981, + "balance_loss_mlp": 1.06045985, + "epoch": 0.17045017314351674, + "flos": 511963186176.0, + "grad_norm": 0.0687386743468794, + "language_loss": 0.87971282, + "learning_rate": 0.0009491630793475619, + "loss": 0.89065266, + "num_input_tokens_seen": 72970480, + "router_z_loss_mlp": 0.33520508, + "step": 886, + "time_per_iteration": 2.59726619720459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094476, + "balance_loss_mlp": 1.06011951, + "epoch": 0.1706425548287803, + "flos": 508674898944.0, + "grad_norm": 0.058204707286146434, + "language_loss": 0.85722017, + "learning_rate": 0.0009490261225441643, + "loss": 0.8681649, + "num_input_tokens_seen": 73053376, + "router_z_loss_mlp": 0.34350586, + "step": 887, + "time_per_iteration": 2.900501012802124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087689, + "balance_loss_mlp": 1.0545013, + "epoch": 0.17083493651404386, + "flos": 717016776192.0, + "grad_norm": 0.05310353290702558, + "language_loss": 0.90775931, + "learning_rate": 0.0009488889914133656, + "loss": 0.9186362, + "num_input_tokens_seen": 73136032, + "router_z_loss_mlp": 0.33203125, + "step": 888, + "time_per_iteration": 2.992532968521118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089684, + "balance_loss_mlp": 1.05520868, + "epoch": 0.17102731819930742, + "flos": 558852908544.0, + "grad_norm": 0.047287767355612194, + "language_loss": 0.88680297, + "learning_rate": 0.0009487516860084047, + "loss": 0.89769983, + "num_input_tokens_seen": 73208544, + "router_z_loss_mlp": 0.34472656, + "step": 889, + "time_per_iteration": 2.7029643058776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082798, + "balance_loss_mlp": 1.04858518, + "epoch": 0.17121969988457098, + "flos": 494542679040.0, + "grad_norm": 0.0765590367769256, + "language_loss": 0.88680983, + "learning_rate": 0.0009486142063825884, + "loss": 0.89763772, + "num_input_tokens_seen": 73274336, + "router_z_loss_mlp": 0.34228516, + "step": 890, + "time_per_iteration": 2.5640931129455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038716, + "balance_loss_mlp": 1.02402985, + "epoch": 0.17141208156983456, + "flos": 1548088063488.0, + "grad_norm": 0.02832238153451814, + "language_loss": 0.72426212, + "learning_rate": 0.0009484765525892909, + "loss": 0.7346493, + "num_input_tokens_seen": 73506320, + "router_z_loss_mlp": 0.14648438, + "step": 891, + "time_per_iteration": 4.979609251022339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108867, + "balance_loss_mlp": 1.0540278, + "epoch": 0.17160446325509812, + "flos": 619282713600.0, + "grad_norm": 0.06449268303648867, + "language_loss": 0.90758598, + "learning_rate": 0.0009483387246819542, + "loss": 0.91847265, + "num_input_tokens_seen": 73578048, + "router_z_loss_mlp": 0.34667969, + "step": 892, + "time_per_iteration": 2.731332540512085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032556, + "balance_loss_mlp": 1.01767898, + "epoch": 0.17179684494036168, + "flos": 1381026972672.0, + "grad_norm": 0.016720826063608682, + "language_loss": 0.82285583, + "learning_rate": 0.0009482007227140877, + "loss": 0.83318138, + "num_input_tokens_seen": 73798640, + "router_z_loss_mlp": 0.1484375, + "step": 893, + "time_per_iteration": 4.641844987869263 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097484, + "balance_loss_mlp": 1.06386662, + "epoch": 0.17198922662562524, + "flos": 492386383872.0, + "grad_norm": 0.05711411270468228, + "language_loss": 0.89587665, + "learning_rate": 0.0009480625467392688, + "loss": 0.90685147, + "num_input_tokens_seen": 73867328, + "router_z_loss_mlp": 0.33642578, + "step": 894, + "time_per_iteration": 2.6310250759124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033123, + "balance_loss_mlp": 1.01795936, + "epoch": 0.1721816083108888, + "flos": 1457529914880.0, + "grad_norm": 0.013728573618451478, + "language_loss": 0.77994668, + "learning_rate": 0.0009479241968111421, + "loss": 0.79027796, + "num_input_tokens_seen": 74093376, + "router_z_loss_mlp": 0.15136719, + "step": 895, + "time_per_iteration": 4.7525646686553955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132534, + "balance_loss_mlp": 1.09834456, + "epoch": 0.17237398999615236, + "flos": 527853030912.0, + "grad_norm": 0.05821127752563967, + "language_loss": 0.87793648, + "learning_rate": 0.0009477856729834196, + "loss": 0.88926184, + "num_input_tokens_seen": 74169136, + "router_z_loss_mlp": 0.34228516, + "step": 896, + "time_per_iteration": 2.7015438079833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132108, + "balance_loss_mlp": 1.09901524, + "epoch": 0.17256637168141592, + "flos": 603644562432.0, + "grad_norm": 0.08337200045302615, + "language_loss": 0.9056648, + "learning_rate": 0.0009476469753098809, + "loss": 0.91698587, + "num_input_tokens_seen": 74236912, + "router_z_loss_mlp": 0.33105469, + "step": 897, + "time_per_iteration": 2.7035813331604004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125108, + "balance_loss_mlp": 1.08922589, + "epoch": 0.17275875336667948, + "flos": 509437334016.0, + "grad_norm": 0.05742024530278536, + "language_loss": 0.874506, + "learning_rate": 0.0009475081038443738, + "loss": 0.88575709, + "num_input_tokens_seen": 74305968, + "router_z_loss_mlp": 0.35913086, + "step": 898, + "time_per_iteration": 2.584437370300293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115669, + "balance_loss_mlp": 1.07971573, + "epoch": 0.17295113505194307, + "flos": 664951693824.0, + "grad_norm": 0.06535241228499304, + "language_loss": 0.85809892, + "learning_rate": 0.0009473690586408124, + "loss": 0.86925566, + "num_input_tokens_seen": 74384144, + "router_z_loss_mlp": 0.35986328, + "step": 899, + "time_per_iteration": 2.83156418800354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116393, + "balance_loss_mlp": 1.08084452, + "epoch": 0.17314351673720663, + "flos": 555125253120.0, + "grad_norm": 0.0683413775827569, + "language_loss": 0.86639923, + "learning_rate": 0.0009472298397531792, + "loss": 0.87756318, + "num_input_tokens_seen": 74455040, + "router_z_loss_mlp": 0.35546875, + "step": 900, + "time_per_iteration": 2.6944193840026855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123337, + "balance_loss_mlp": 1.08635855, + "epoch": 0.17333589842247019, + "flos": 503361326592.0, + "grad_norm": 0.09670394547256775, + "language_loss": 0.87118709, + "learning_rate": 0.0009470904472355235, + "loss": 0.88242042, + "num_input_tokens_seen": 74525248, + "router_z_loss_mlp": 0.36987305, + "step": 901, + "time_per_iteration": 2.637882709503174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114178, + "balance_loss_mlp": 1.07982159, + "epoch": 0.17352828010773375, + "flos": 555924003840.0, + "grad_norm": 0.06358596699153923, + "language_loss": 0.79912066, + "learning_rate": 0.0009469508811419626, + "loss": 0.81026244, + "num_input_tokens_seen": 74597328, + "router_z_loss_mlp": 0.34399414, + "step": 902, + "time_per_iteration": 2.726072311401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077408, + "balance_loss_mlp": 1.06453359, + "epoch": 0.1737206617929973, + "flos": 1553711556096.0, + "grad_norm": 0.030950293127884103, + "language_loss": 0.7161383, + "learning_rate": 0.0009468111415266806, + "loss": 0.72691238, + "num_input_tokens_seen": 74819664, + "router_z_loss_mlp": 0.12890625, + "step": 903, + "time_per_iteration": 4.791790723800659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109546, + "balance_loss_mlp": 1.07445073, + "epoch": 0.17391304347826086, + "flos": 516390667776.0, + "grad_norm": 0.06883251868009001, + "language_loss": 0.84220147, + "learning_rate": 0.0009466712284439292, + "loss": 0.85329694, + "num_input_tokens_seen": 74896224, + "router_z_loss_mlp": 0.35131836, + "step": 904, + "time_per_iteration": 2.7648017406463623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104504, + "balance_loss_mlp": 1.06995738, + "epoch": 0.17410542516352442, + "flos": 540773273088.0, + "grad_norm": 0.06988851938988141, + "language_loss": 0.8903957, + "learning_rate": 0.0009465311419480276, + "loss": 0.90144074, + "num_input_tokens_seen": 74966560, + "router_z_loss_mlp": 0.34545898, + "step": 905, + "time_per_iteration": 2.725829601287842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098399, + "balance_loss_mlp": 1.0637325, + "epoch": 0.17429780684878798, + "flos": 623542869504.0, + "grad_norm": 0.06312030659776342, + "language_loss": 0.88624233, + "learning_rate": 0.0009463908820933622, + "loss": 0.89722633, + "num_input_tokens_seen": 75045248, + "router_z_loss_mlp": 0.34692383, + "step": 906, + "time_per_iteration": 2.8389482498168945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093859, + "balance_loss_mlp": 1.05900264, + "epoch": 0.17449018853405157, + "flos": 575368386048.0, + "grad_norm": 0.056066721215551084, + "language_loss": 0.83138871, + "learning_rate": 0.0009462504489343868, + "loss": 0.84232736, + "num_input_tokens_seen": 75123952, + "router_z_loss_mlp": 0.34863281, + "step": 907, + "time_per_iteration": 2.863349437713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086469, + "balance_loss_mlp": 1.05199337, + "epoch": 0.17468257021931513, + "flos": 533499844608.0, + "grad_norm": 0.07604190500703253, + "language_loss": 0.894853, + "learning_rate": 0.0009461098425256222, + "loss": 0.90571761, + "num_input_tokens_seen": 75191728, + "router_z_loss_mlp": 0.3449707, + "step": 908, + "time_per_iteration": 2.5941011905670166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108834, + "balance_loss_mlp": 1.05345941, + "epoch": 0.1748749519045787, + "flos": 540496848384.0, + "grad_norm": 0.050694136543679796, + "language_loss": 0.85873353, + "learning_rate": 0.0009459690629216567, + "loss": 0.86961693, + "num_input_tokens_seen": 75262224, + "router_z_loss_mlp": 0.34887695, + "step": 909, + "time_per_iteration": 2.6097571849823 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109032, + "balance_loss_mlp": 1.0570612, + "epoch": 0.17506733358984225, + "flos": 498373641216.0, + "grad_norm": 0.0569262349138849, + "language_loss": 0.88138729, + "learning_rate": 0.0009458281101771457, + "loss": 0.89229047, + "num_input_tokens_seen": 75329760, + "router_z_loss_mlp": 0.33276367, + "step": 910, + "time_per_iteration": 2.5904784202575684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091588, + "balance_loss_mlp": 1.05744696, + "epoch": 0.1752597152751058, + "flos": 622621873152.0, + "grad_norm": 0.06350455217589325, + "language_loss": 0.8266046, + "learning_rate": 0.0009456869843468122, + "loss": 0.83752048, + "num_input_tokens_seen": 75407920, + "router_z_loss_mlp": 0.34179688, + "step": 911, + "time_per_iteration": 2.8930556774139404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090023, + "balance_loss_mlp": 1.05476046, + "epoch": 0.17545209696036937, + "flos": 520717814784.0, + "grad_norm": 0.07844481886152296, + "language_loss": 0.79097009, + "learning_rate": 0.0009455456854854459, + "loss": 0.80187035, + "num_input_tokens_seen": 75476752, + "router_z_loss_mlp": 0.35302734, + "step": 912, + "time_per_iteration": 2.5984511375427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096028, + "balance_loss_mlp": 1.0631026, + "epoch": 0.17564447864563293, + "flos": 461750270976.0, + "grad_norm": 0.05516798292623818, + "language_loss": 0.84505737, + "learning_rate": 0.0009454042136479039, + "loss": 0.85601771, + "num_input_tokens_seen": 75542944, + "router_z_loss_mlp": 0.3293457, + "step": 913, + "time_per_iteration": 2.586195945739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085981, + "balance_loss_mlp": 1.05286503, + "epoch": 0.1758368603308965, + "flos": 480416251392.0, + "grad_norm": 0.05301404729603274, + "language_loss": 0.83308446, + "learning_rate": 0.0009452625688891103, + "loss": 0.84394431, + "num_input_tokens_seen": 75609840, + "router_z_loss_mlp": 0.33129883, + "step": 914, + "time_per_iteration": 2.5374929904937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052517, + "balance_loss_mlp": 1.038975, + "epoch": 0.17602924201616005, + "flos": 1478160133632.0, + "grad_norm": 0.03507986977902886, + "language_loss": 0.78734738, + "learning_rate": 0.0009451207512640567, + "loss": 0.79787254, + "num_input_tokens_seen": 75819312, + "router_z_loss_mlp": 0.13574219, + "step": 915, + "time_per_iteration": 4.5561583042144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097389, + "balance_loss_mlp": 1.06226993, + "epoch": 0.17622162370142364, + "flos": 602010593280.0, + "grad_norm": 0.06815502965849334, + "language_loss": 0.93451321, + "learning_rate": 0.0009449787608278015, + "loss": 0.94548714, + "num_input_tokens_seen": 75893984, + "router_z_loss_mlp": 0.35131836, + "step": 916, + "time_per_iteration": 2.7807908058166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092918, + "balance_loss_mlp": 1.0588007, + "epoch": 0.1764140053866872, + "flos": 442473214464.0, + "grad_norm": 0.0637680644109211, + "language_loss": 0.92700857, + "learning_rate": 0.0009448365976354704, + "loss": 0.93793774, + "num_input_tokens_seen": 75958944, + "router_z_loss_mlp": 0.34130859, + "step": 917, + "time_per_iteration": 2.4800124168395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105643, + "balance_loss_mlp": 1.06909323, + "epoch": 0.17660638707195075, + "flos": 500362610688.0, + "grad_norm": 0.07080486598597346, + "language_loss": 0.90158784, + "learning_rate": 0.0009446942617422558, + "loss": 0.91264427, + "num_input_tokens_seen": 76024240, + "router_z_loss_mlp": 0.36547852, + "step": 918, + "time_per_iteration": 2.5415430068969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101469, + "balance_loss_mlp": 1.06766129, + "epoch": 0.17679876875721431, + "flos": 538621360128.0, + "grad_norm": 0.060000223973742446, + "language_loss": 0.86201262, + "learning_rate": 0.0009445517532034176, + "loss": 0.87302732, + "num_input_tokens_seen": 76095264, + "router_z_loss_mlp": 0.33789062, + "step": 919, + "time_per_iteration": 2.6849868297576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121669, + "balance_loss_mlp": 1.08569145, + "epoch": 0.17699115044247787, + "flos": 497477376000.0, + "grad_norm": 0.08221632690768264, + "language_loss": 0.89522099, + "learning_rate": 0.0009444090720742824, + "loss": 0.9064377, + "num_input_tokens_seen": 76163520, + "router_z_loss_mlp": 0.35986328, + "step": 920, + "time_per_iteration": 2.6034600734710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113572, + "balance_loss_mlp": 1.07883418, + "epoch": 0.17718353212774143, + "flos": 662444780544.0, + "grad_norm": 0.08029288241638204, + "language_loss": 0.88040781, + "learning_rate": 0.0009442662184102439, + "loss": 0.89154357, + "num_input_tokens_seen": 76233760, + "router_z_loss_mlp": 0.34741211, + "step": 921, + "time_per_iteration": 2.767352342605591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105947, + "balance_loss_mlp": 1.07309294, + "epoch": 0.177375913813005, + "flos": 582340658688.0, + "grad_norm": 0.0705507668945597, + "language_loss": 0.87951338, + "learning_rate": 0.000944123192266763, + "loss": 0.89057279, + "num_input_tokens_seen": 76310704, + "router_z_loss_mlp": 0.32836914, + "step": 922, + "time_per_iteration": 2.789315700531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108289, + "balance_loss_mlp": 1.0727644, + "epoch": 0.17756829549826855, + "flos": 552285098496.0, + "grad_norm": 0.06115562628552814, + "language_loss": 0.83835006, + "learning_rate": 0.0009439799936993671, + "loss": 0.84943295, + "num_input_tokens_seen": 76386992, + "router_z_loss_mlp": 0.35546875, + "step": 923, + "time_per_iteration": 2.7160987854003906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090669, + "balance_loss_mlp": 1.05733824, + "epoch": 0.17776067718353214, + "flos": 556060806144.0, + "grad_norm": 0.07059184324253498, + "language_loss": 0.88508618, + "learning_rate": 0.0009438366227636511, + "loss": 0.89599288, + "num_input_tokens_seen": 76453328, + "router_z_loss_mlp": 0.33349609, + "step": 924, + "time_per_iteration": 2.6319191455841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084897, + "balance_loss_mlp": 1.05163789, + "epoch": 0.1779530588687957, + "flos": 658161303552.0, + "grad_norm": 0.06263940487075517, + "language_loss": 0.86677843, + "learning_rate": 0.0009436930795152763, + "loss": 0.87762737, + "num_input_tokens_seen": 76529040, + "router_z_loss_mlp": 0.33276367, + "step": 925, + "time_per_iteration": 2.8063783645629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084326, + "balance_loss_mlp": 1.05159163, + "epoch": 0.17814544055405926, + "flos": 644187644928.0, + "grad_norm": 0.06448697412821461, + "language_loss": 0.8710525, + "learning_rate": 0.0009435493640099713, + "loss": 0.88189578, + "num_input_tokens_seen": 76604080, + "router_z_loss_mlp": 0.32739258, + "step": 926, + "time_per_iteration": 2.7599081993103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080787, + "balance_loss_mlp": 1.04664516, + "epoch": 0.17833782223932282, + "flos": 460672123392.0, + "grad_norm": 0.06497730431564504, + "language_loss": 0.84328961, + "learning_rate": 0.0009434054763035314, + "loss": 0.85409749, + "num_input_tokens_seen": 76674096, + "router_z_loss_mlp": 0.34155273, + "step": 927, + "time_per_iteration": 2.612910032272339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081381, + "balance_loss_mlp": 1.04740596, + "epoch": 0.17853020392458638, + "flos": 759212766720.0, + "grad_norm": 0.04594292129212818, + "language_loss": 0.85898727, + "learning_rate": 0.0009432614164518185, + "loss": 0.8698011, + "num_input_tokens_seen": 76752144, + "router_z_loss_mlp": 0.33984375, + "step": 928, + "time_per_iteration": 2.926981210708618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086846, + "balance_loss_mlp": 1.05153632, + "epoch": 0.17872258560984994, + "flos": 782320785408.0, + "grad_norm": 0.055185850673896385, + "language_loss": 0.84792197, + "learning_rate": 0.000943117184510762, + "loss": 0.85879046, + "num_input_tokens_seen": 76830240, + "router_z_loss_mlp": 0.35327148, + "step": 929, + "time_per_iteration": 2.995514154434204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039861, + "balance_loss_mlp": 1.02660513, + "epoch": 0.1789149672951135, + "flos": 1459095482880.0, + "grad_norm": 0.021362691821678215, + "language_loss": 0.78789961, + "learning_rate": 0.0009429727805363575, + "loss": 0.79829824, + "num_input_tokens_seen": 77062464, + "router_z_loss_mlp": 0.1328125, + "step": 930, + "time_per_iteration": 4.99839448928833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091678, + "balance_loss_mlp": 1.05739331, + "epoch": 0.17910734898037706, + "flos": 503598463488.0, + "grad_norm": 0.05761618473313655, + "language_loss": 0.88773429, + "learning_rate": 0.0009428282045846674, + "loss": 0.89865112, + "num_input_tokens_seen": 77136672, + "router_z_loss_mlp": 0.34301758, + "step": 931, + "time_per_iteration": 2.6966652870178223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087828, + "balance_loss_mlp": 1.05452061, + "epoch": 0.17929973066564064, + "flos": 745895264256.0, + "grad_norm": 0.05798282919409206, + "language_loss": 0.89983928, + "learning_rate": 0.0009426834567118214, + "loss": 0.91071755, + "num_input_tokens_seen": 77227040, + "router_z_loss_mlp": 0.33300781, + "step": 932, + "time_per_iteration": 3.072160482406616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092765, + "balance_loss_mlp": 1.05907631, + "epoch": 0.1794921123509042, + "flos": 712875893760.0, + "grad_norm": 0.055390897890994044, + "language_loss": 0.80879378, + "learning_rate": 0.0009425385369740155, + "loss": 0.81972146, + "num_input_tokens_seen": 77319392, + "router_z_loss_mlp": 0.3371582, + "step": 933, + "time_per_iteration": 3.0337042808532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092731, + "balance_loss_mlp": 1.05825567, + "epoch": 0.17968449403616776, + "flos": 632838763008.0, + "grad_norm": 0.0687685702394307, + "language_loss": 0.87443584, + "learning_rate": 0.0009423934454275125, + "loss": 0.8853631, + "num_input_tokens_seen": 77394688, + "router_z_loss_mlp": 0.3449707, + "step": 934, + "time_per_iteration": 2.7970879077911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089195, + "balance_loss_mlp": 1.05526757, + "epoch": 0.17987687572143132, + "flos": 536060602368.0, + "grad_norm": 0.08214865293258214, + "language_loss": 0.92215371, + "learning_rate": 0.0009422481821286418, + "loss": 0.93304563, + "num_input_tokens_seen": 77468288, + "router_z_loss_mlp": 0.33959961, + "step": 935, + "time_per_iteration": 2.7134642601013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091708, + "balance_loss_mlp": 1.05914021, + "epoch": 0.18006925740669488, + "flos": 537818227200.0, + "grad_norm": 0.0718764173736199, + "language_loss": 0.87967253, + "learning_rate": 0.0009421027471337998, + "loss": 0.89058959, + "num_input_tokens_seen": 77535840, + "router_z_loss_mlp": 0.32568359, + "step": 936, + "time_per_iteration": 2.608764171600342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098474, + "balance_loss_mlp": 1.06333113, + "epoch": 0.18026163909195844, + "flos": 539255757312.0, + "grad_norm": 0.06697051800305152, + "language_loss": 0.82882118, + "learning_rate": 0.0009419571404994493, + "loss": 0.83980596, + "num_input_tokens_seen": 77604000, + "router_z_loss_mlp": 0.3515625, + "step": 937, + "time_per_iteration": 2.620296001434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096306, + "balance_loss_mlp": 1.06240284, + "epoch": 0.180454020777222, + "flos": 500382959616.0, + "grad_norm": 0.08555714620461663, + "language_loss": 0.90948844, + "learning_rate": 0.00094181136228212, + "loss": 0.92045152, + "num_input_tokens_seen": 77671488, + "router_z_loss_mlp": 0.33935547, + "step": 938, + "time_per_iteration": 2.62837290763855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109732, + "balance_loss_mlp": 1.06415629, + "epoch": 0.18064640246248556, + "flos": 498689353728.0, + "grad_norm": 0.06983123921060745, + "language_loss": 0.86323059, + "learning_rate": 0.0009416654125384077, + "loss": 0.8742038, + "num_input_tokens_seen": 77746240, + "router_z_loss_mlp": 0.33154297, + "step": 939, + "time_per_iteration": 2.715686321258545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054723, + "balance_loss_mlp": 1.04242051, + "epoch": 0.18083878414774912, + "flos": 1518572358144.0, + "grad_norm": 0.027679884047562747, + "language_loss": 0.79772377, + "learning_rate": 0.0009415192913249752, + "loss": 0.80827093, + "num_input_tokens_seen": 77966080, + "router_z_loss_mlp": 0.12304688, + "step": 940, + "time_per_iteration": 4.941875219345093 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090642, + "balance_loss_mlp": 1.05728722, + "epoch": 0.1810311658330127, + "flos": 727006703616.0, + "grad_norm": 0.07011009980003599, + "language_loss": 0.84326053, + "learning_rate": 0.000941372998698552, + "loss": 0.85416698, + "num_input_tokens_seen": 78049200, + "router_z_loss_mlp": 0.33374023, + "step": 941, + "time_per_iteration": 2.931520938873291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094264, + "balance_loss_mlp": 1.0597409, + "epoch": 0.18122354751827627, + "flos": 564643726848.0, + "grad_norm": 0.08254502738164117, + "language_loss": 0.8207435, + "learning_rate": 0.0009412265347159336, + "loss": 0.8316862, + "num_input_tokens_seen": 78122752, + "router_z_loss_mlp": 0.34570312, + "step": 942, + "time_per_iteration": 2.696354627609253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091238, + "balance_loss_mlp": 1.05869377, + "epoch": 0.18141592920353983, + "flos": 519024208896.0, + "grad_norm": 0.05729066672306875, + "language_loss": 0.85217965, + "learning_rate": 0.0009410798994339829, + "loss": 0.86309201, + "num_input_tokens_seen": 78194064, + "router_z_loss_mlp": 0.32543945, + "step": 943, + "time_per_iteration": 2.6009600162506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088013, + "balance_loss_mlp": 1.0545156, + "epoch": 0.1816083108888034, + "flos": 512219261952.0, + "grad_norm": 0.05342615519744699, + "language_loss": 0.88234782, + "learning_rate": 0.000940933092909628, + "loss": 0.89322793, + "num_input_tokens_seen": 78262048, + "router_z_loss_mlp": 0.33520508, + "step": 944, + "time_per_iteration": 2.618419647216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095526, + "balance_loss_mlp": 1.06286263, + "epoch": 0.18180069257406695, + "flos": 492144864768.0, + "grad_norm": 0.053227732023653135, + "language_loss": 0.8393383, + "learning_rate": 0.0009407861151998649, + "loss": 0.85029352, + "num_input_tokens_seen": 78330624, + "router_z_loss_mlp": 0.32666016, + "step": 945, + "time_per_iteration": 2.5718705654144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097774, + "balance_loss_mlp": 1.06406188, + "epoch": 0.1819930742593305, + "flos": 569891870208.0, + "grad_norm": 0.05775782434029923, + "language_loss": 0.86156505, + "learning_rate": 0.0009406389663617552, + "loss": 0.87254274, + "num_input_tokens_seen": 78400672, + "router_z_loss_mlp": 0.33740234, + "step": 946, + "time_per_iteration": 2.66513729095459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097522, + "balance_loss_mlp": 1.06433463, + "epoch": 0.18218545594459407, + "flos": 605693168640.0, + "grad_norm": 0.06350431386522506, + "language_loss": 0.85736459, + "learning_rate": 0.000940491646452427, + "loss": 0.86833978, + "num_input_tokens_seen": 78467952, + "router_z_loss_mlp": 0.33203125, + "step": 947, + "time_per_iteration": 2.715071201324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103916, + "balance_loss_mlp": 1.07010818, + "epoch": 0.18237783762985763, + "flos": 548419230720.0, + "grad_norm": 0.06277969821047595, + "language_loss": 0.91195452, + "learning_rate": 0.000940344155529075, + "loss": 0.92299366, + "num_input_tokens_seen": 78538928, + "router_z_loss_mlp": 0.33837891, + "step": 948, + "time_per_iteration": 2.6502938270568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099574, + "balance_loss_mlp": 1.06550407, + "epoch": 0.1825702193151212, + "flos": 450509078016.0, + "grad_norm": 0.06933176029299125, + "language_loss": 0.87683523, + "learning_rate": 0.0009401964936489605, + "loss": 0.88783091, + "num_input_tokens_seen": 78602144, + "router_z_loss_mlp": 0.34106445, + "step": 949, + "time_per_iteration": 2.5181798934936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084135, + "balance_loss_mlp": 1.05247355, + "epoch": 0.18276260100038477, + "flos": 588962313216.0, + "grad_norm": 0.07980064544074586, + "language_loss": 0.85422635, + "learning_rate": 0.0009400486608694108, + "loss": 0.86506772, + "num_input_tokens_seen": 78673152, + "router_z_loss_mlp": 0.31640625, + "step": 950, + "time_per_iteration": 2.7189955711364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087871, + "balance_loss_mlp": 1.05384839, + "epoch": 0.18295498268564833, + "flos": 786988376064.0, + "grad_norm": 0.05265351460276348, + "language_loss": 0.87225658, + "learning_rate": 0.0009399006572478195, + "loss": 0.88313532, + "num_input_tokens_seen": 78753872, + "router_z_loss_mlp": 0.34033203, + "step": 951, + "time_per_iteration": 3.0805773735046387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086089, + "balance_loss_mlp": 1.05218577, + "epoch": 0.1831473643709119, + "flos": 577878271488.0, + "grad_norm": 0.059447924131550096, + "language_loss": 0.91242015, + "learning_rate": 0.0009397524828416468, + "loss": 0.92328107, + "num_input_tokens_seen": 78822640, + "router_z_loss_mlp": 0.33935547, + "step": 952, + "time_per_iteration": 2.6567108631134033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082155, + "balance_loss_mlp": 1.04801321, + "epoch": 0.18333974605617545, + "flos": 566622521856.0, + "grad_norm": 0.05513512337372911, + "language_loss": 0.96184212, + "learning_rate": 0.0009396041377084192, + "loss": 0.97266364, + "num_input_tokens_seen": 78893792, + "router_z_loss_mlp": 0.34179688, + "step": 953, + "time_per_iteration": 2.6937921047210693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079403, + "balance_loss_mlp": 1.04478431, + "epoch": 0.183532127741439, + "flos": 526725421056.0, + "grad_norm": 0.07204875194033089, + "language_loss": 0.87840325, + "learning_rate": 0.0009394556219057295, + "loss": 0.88919723, + "num_input_tokens_seen": 78964752, + "router_z_loss_mlp": 0.34667969, + "step": 954, + "time_per_iteration": 2.6962215900421143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107777, + "balance_loss_mlp": 1.04272258, + "epoch": 0.18372450942670257, + "flos": 594259918848.0, + "grad_norm": 0.07227161235955501, + "language_loss": 0.83883446, + "learning_rate": 0.0009393069354912362, + "loss": 0.84961212, + "num_input_tokens_seen": 79034400, + "router_z_loss_mlp": 0.35058594, + "step": 955, + "time_per_iteration": 2.718308925628662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081248, + "balance_loss_mlp": 1.04677236, + "epoch": 0.18391689111196613, + "flos": 644720145408.0, + "grad_norm": 0.07091738302891186, + "language_loss": 0.82511717, + "learning_rate": 0.0009391580785226649, + "loss": 0.83592963, + "num_input_tokens_seen": 79109488, + "router_z_loss_mlp": 0.34521484, + "step": 956, + "time_per_iteration": 2.907367467880249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077991, + "balance_loss_mlp": 1.06216049, + "epoch": 0.18410927279722972, + "flos": 1456246563840.0, + "grad_norm": 0.048423099914415325, + "language_loss": 0.79340446, + "learning_rate": 0.0009390090510578067, + "loss": 0.80418444, + "num_input_tokens_seen": 79327712, + "router_z_loss_mlp": 0.15820312, + "step": 957, + "time_per_iteration": 4.78663969039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091525, + "balance_loss_mlp": 1.05702567, + "epoch": 0.18430165448249328, + "flos": 658437728256.0, + "grad_norm": 0.09319397884021513, + "language_loss": 0.86484683, + "learning_rate": 0.0009388598531545196, + "loss": 0.8757621, + "num_input_tokens_seen": 79401504, + "router_z_loss_mlp": 0.34545898, + "step": 958, + "time_per_iteration": 2.8470118045806885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087326, + "balance_loss_mlp": 1.05285025, + "epoch": 0.18449403616775684, + "flos": 517679811072.0, + "grad_norm": 0.07377492103556435, + "language_loss": 0.86076611, + "learning_rate": 0.000938710484870727, + "loss": 0.87163937, + "num_input_tokens_seen": 79466688, + "router_z_loss_mlp": 0.3449707, + "step": 959, + "time_per_iteration": 2.5930731296539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090986, + "balance_loss_mlp": 1.05672574, + "epoch": 0.1846864178530204, + "flos": 552481537536.0, + "grad_norm": 0.06589557505977534, + "language_loss": 0.86379164, + "learning_rate": 0.0009385609462644189, + "loss": 0.8747015, + "num_input_tokens_seen": 79540288, + "router_z_loss_mlp": 0.34277344, + "step": 960, + "time_per_iteration": 2.688706636428833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096456, + "balance_loss_mlp": 1.06212378, + "epoch": 0.18487879953828396, + "flos": 465930441216.0, + "grad_norm": 0.0643439417949763, + "language_loss": 0.86035949, + "learning_rate": 0.0009384112373936514, + "loss": 0.871324, + "num_input_tokens_seen": 79611872, + "router_z_loss_mlp": 0.34326172, + "step": 961, + "time_per_iteration": 4.0801496505737305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095895, + "balance_loss_mlp": 1.06132412, + "epoch": 0.18507118122354752, + "flos": 648200489472.0, + "grad_norm": 0.0614591664996872, + "language_loss": 0.91820455, + "learning_rate": 0.0009382613583165467, + "loss": 0.92916346, + "num_input_tokens_seen": 79689504, + "router_z_loss_mlp": 0.34594727, + "step": 962, + "time_per_iteration": 2.790069341659546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113098, + "balance_loss_mlp": 1.07921863, + "epoch": 0.18526356290881107, + "flos": 626486330880.0, + "grad_norm": 0.06374556186760763, + "language_loss": 0.89594233, + "learning_rate": 0.0009381113090912928, + "loss": 0.90707326, + "num_input_tokens_seen": 79759264, + "router_z_loss_mlp": 0.33886719, + "step": 963, + "time_per_iteration": 2.6891098022460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117951, + "balance_loss_mlp": 1.08559799, + "epoch": 0.18545594459407463, + "flos": 432497843712.0, + "grad_norm": 0.06491910119233056, + "language_loss": 0.90103394, + "learning_rate": 0.000937961089776144, + "loss": 0.91221344, + "num_input_tokens_seen": 79824464, + "router_z_loss_mlp": 0.32348633, + "step": 964, + "time_per_iteration": 2.5821962356567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124554, + "balance_loss_mlp": 1.08926833, + "epoch": 0.1856483262793382, + "flos": 748720862208.0, + "grad_norm": 0.06849062336391444, + "language_loss": 0.829036, + "learning_rate": 0.0009378107004294208, + "loss": 0.84028149, + "num_input_tokens_seen": 79907152, + "router_z_loss_mlp": 0.35302734, + "step": 965, + "time_per_iteration": 2.9898061752319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115255, + "balance_loss_mlp": 1.081972, + "epoch": 0.18584070796460178, + "flos": 530058788352.0, + "grad_norm": 0.08647217477609576, + "language_loss": 0.91352308, + "learning_rate": 0.0009376601411095096, + "loss": 0.92467564, + "num_input_tokens_seen": 79976944, + "router_z_loss_mlp": 0.33300781, + "step": 966, + "time_per_iteration": 2.6415059566497803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093436, + "balance_loss_mlp": 1.06196475, + "epoch": 0.18603308964986534, + "flos": 482863527936.0, + "grad_norm": 0.05783783242438048, + "language_loss": 0.8708145, + "learning_rate": 0.0009375094118748622, + "loss": 0.88174886, + "num_input_tokens_seen": 80042112, + "router_z_loss_mlp": 0.31445312, + "step": 967, + "time_per_iteration": 2.5149550437927246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089094, + "balance_loss_mlp": 1.05650234, + "epoch": 0.1862254713351289, + "flos": 800976591360.0, + "grad_norm": 0.0756042683078202, + "language_loss": 0.9083451, + "learning_rate": 0.0009373585127839976, + "loss": 0.91923606, + "num_input_tokens_seen": 80118896, + "router_z_loss_mlp": 0.32592773, + "step": 968, + "time_per_iteration": 2.9569485187530518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084434, + "balance_loss_mlp": 1.05250978, + "epoch": 0.18641785302039246, + "flos": 478082456064.0, + "grad_norm": 0.06160067145414361, + "language_loss": 0.91074634, + "learning_rate": 0.0009372074438954994, + "loss": 0.92159069, + "num_input_tokens_seen": 80183360, + "router_z_loss_mlp": 0.3190918, + "step": 969, + "time_per_iteration": 2.508530378341675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083542, + "balance_loss_mlp": 1.05040169, + "epoch": 0.18661023470565602, + "flos": 388695587328.0, + "grad_norm": 0.07517959095695621, + "language_loss": 0.91676056, + "learning_rate": 0.0009370562052680181, + "loss": 0.92759597, + "num_input_tokens_seen": 80247024, + "router_z_loss_mlp": 0.33154297, + "step": 970, + "time_per_iteration": 2.4572672843933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087332, + "balance_loss_mlp": 1.05400109, + "epoch": 0.18680261639091958, + "flos": 564402207744.0, + "grad_norm": 0.052448577146131624, + "language_loss": 0.89610398, + "learning_rate": 0.0009369047969602695, + "loss": 0.90697736, + "num_input_tokens_seen": 80318256, + "router_z_loss_mlp": 0.33349609, + "step": 971, + "time_per_iteration": 2.714704751968384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101064, + "balance_loss_mlp": 1.06556404, + "epoch": 0.18699499807618314, + "flos": 479018009088.0, + "grad_norm": 0.06595213007116614, + "language_loss": 0.8674072, + "learning_rate": 0.0009367532190310357, + "loss": 0.87841785, + "num_input_tokens_seen": 80384848, + "router_z_loss_mlp": 0.35498047, + "step": 972, + "time_per_iteration": 2.589667558670044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111914, + "balance_loss_mlp": 1.07660413, + "epoch": 0.1871873797614467, + "flos": 553022802432.0, + "grad_norm": 0.0720295199384638, + "language_loss": 0.88701892, + "learning_rate": 0.0009366014715391644, + "loss": 0.89813805, + "num_input_tokens_seen": 80453088, + "router_z_loss_mlp": 0.35327148, + "step": 973, + "time_per_iteration": 2.634023904800415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107724, + "balance_loss_mlp": 1.07389259, + "epoch": 0.18737976144671029, + "flos": 552526617600.0, + "grad_norm": 0.05153911900793568, + "language_loss": 0.8432554, + "learning_rate": 0.0009364495545435693, + "loss": 0.85433269, + "num_input_tokens_seen": 80528608, + "router_z_loss_mlp": 0.33837891, + "step": 974, + "time_per_iteration": 2.7729458808898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107099, + "balance_loss_mlp": 1.07281494, + "epoch": 0.18757214313197385, + "flos": 502002372096.0, + "grad_norm": 0.05815108638233015, + "language_loss": 0.88620323, + "learning_rate": 0.0009362974681032297, + "loss": 0.8972742, + "num_input_tokens_seen": 80599600, + "router_z_loss_mlp": 0.34326172, + "step": 975, + "time_per_iteration": 2.631744623184204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105498, + "balance_loss_mlp": 1.07130909, + "epoch": 0.1877645248172374, + "flos": 674691337728.0, + "grad_norm": 0.06841603134690444, + "language_loss": 0.88265896, + "learning_rate": 0.0009361452122771907, + "loss": 0.89371395, + "num_input_tokens_seen": 80677264, + "router_z_loss_mlp": 0.34204102, + "step": 976, + "time_per_iteration": 2.8427281379699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094167, + "balance_loss_mlp": 1.06012082, + "epoch": 0.18795690650250096, + "flos": 404771696640.0, + "grad_norm": 0.07319435948671522, + "language_loss": 0.8377496, + "learning_rate": 0.0009359927871245635, + "loss": 0.84869128, + "num_input_tokens_seen": 80739776, + "router_z_loss_mlp": 0.34057617, + "step": 977, + "time_per_iteration": 2.4665186405181885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090091, + "balance_loss_mlp": 1.0565697, + "epoch": 0.18814928818776452, + "flos": 637599485952.0, + "grad_norm": 0.05986452276683665, + "language_loss": 0.86337954, + "learning_rate": 0.0009358401927045246, + "loss": 0.87428045, + "num_input_tokens_seen": 80815200, + "router_z_loss_mlp": 0.33520508, + "step": 978, + "time_per_iteration": 2.8037781715393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090672, + "balance_loss_mlp": 1.05707908, + "epoch": 0.18834166987302808, + "flos": 1137825446400.0, + "grad_norm": 0.054509582003230646, + "language_loss": 0.88314402, + "learning_rate": 0.0009356874290763166, + "loss": 0.89405078, + "num_input_tokens_seen": 80905024, + "router_z_loss_mlp": 0.33618164, + "step": 979, + "time_per_iteration": 3.456723213195801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097981, + "balance_loss_mlp": 1.06481671, + "epoch": 0.18853405155829164, + "flos": 504538398720.0, + "grad_norm": 0.06366920756378494, + "language_loss": 0.8866874, + "learning_rate": 0.0009355344962992474, + "loss": 0.89766723, + "num_input_tokens_seen": 80976704, + "router_z_loss_mlp": 0.33154297, + "step": 980, + "time_per_iteration": 2.6105339527130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109825, + "balance_loss_mlp": 1.06494308, + "epoch": 0.1887264332435552, + "flos": 607879987200.0, + "grad_norm": 0.05130215804193928, + "language_loss": 0.88147485, + "learning_rate": 0.0009353813944326908, + "loss": 0.89245737, + "num_input_tokens_seen": 81057152, + "router_z_loss_mlp": 0.33325195, + "step": 981, + "time_per_iteration": 2.882836103439331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109809, + "balance_loss_mlp": 1.0758822, + "epoch": 0.1889188149288188, + "flos": 552264749568.0, + "grad_norm": 0.07032712681879846, + "language_loss": 0.83146608, + "learning_rate": 0.0009352281235360863, + "loss": 0.84256417, + "num_input_tokens_seen": 81131520, + "router_z_loss_mlp": 0.33959961, + "step": 982, + "time_per_iteration": 2.695748805999756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120418, + "balance_loss_mlp": 1.08775461, + "epoch": 0.18911119661408235, + "flos": 418332128256.0, + "grad_norm": 0.06033753714629359, + "language_loss": 0.84987485, + "learning_rate": 0.0009350746836689389, + "loss": 0.86107904, + "num_input_tokens_seen": 81195952, + "router_z_loss_mlp": 0.32666016, + "step": 983, + "time_per_iteration": 2.5073440074920654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260435, + "balance_loss_mlp": 1.23916793, + "epoch": 0.1893035782993459, + "flos": 1481141320704.0, + "grad_norm": 0.0731593378732656, + "language_loss": 0.81439221, + "learning_rate": 0.0009349210748908193, + "loss": 0.82699656, + "num_input_tokens_seen": 81427312, + "router_z_loss_mlp": 0.21289062, + "step": 984, + "time_per_iteration": 5.065609931945801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133244, + "balance_loss_mlp": 1.09831583, + "epoch": 0.18949595998460947, + "flos": 508220974080.0, + "grad_norm": 0.09166419018528392, + "language_loss": 0.83211792, + "learning_rate": 0.0009347672972613634, + "loss": 0.84345031, + "num_input_tokens_seen": 81494256, + "router_z_loss_mlp": 0.34936523, + "step": 985, + "time_per_iteration": 2.580009937286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115864, + "balance_loss_mlp": 1.08270001, + "epoch": 0.18968834166987303, + "flos": 530812459008.0, + "grad_norm": 0.0668772854373454, + "language_loss": 0.85875785, + "learning_rate": 0.0009346133508402735, + "loss": 0.8699165, + "num_input_tokens_seen": 81569312, + "router_z_loss_mlp": 0.33178711, + "step": 986, + "time_per_iteration": 2.6872711181640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111031, + "balance_loss_mlp": 1.07724667, + "epoch": 0.1898807233551366, + "flos": 499515807744.0, + "grad_norm": 0.11088649382938841, + "language_loss": 0.8420769, + "learning_rate": 0.0009344592356873166, + "loss": 0.8531872, + "num_input_tokens_seen": 81637024, + "router_z_loss_mlp": 0.33813477, + "step": 987, + "time_per_iteration": 2.6347994804382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098007, + "balance_loss_mlp": 1.06462848, + "epoch": 0.19007310504040015, + "flos": 601936399872.0, + "grad_norm": 0.05765681888892058, + "language_loss": 0.78527796, + "learning_rate": 0.0009343049518623255, + "loss": 0.79625803, + "num_input_tokens_seen": 81709488, + "router_z_loss_mlp": 0.33398438, + "step": 988, + "time_per_iteration": 2.696929693222046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082914, + "balance_loss_mlp": 1.05029869, + "epoch": 0.1902654867256637, + "flos": 601374786048.0, + "grad_norm": 0.05732720380572914, + "language_loss": 0.83250153, + "learning_rate": 0.0009341504994251985, + "loss": 0.84333068, + "num_input_tokens_seen": 81787152, + "router_z_loss_mlp": 0.32617188, + "step": 989, + "time_per_iteration": 2.8399016857147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095582, + "balance_loss_mlp": 1.07841623, + "epoch": 0.19045786841092727, + "flos": 1574925147648.0, + "grad_norm": 0.03888561388969961, + "language_loss": 0.73520499, + "learning_rate": 0.0009339958784358994, + "loss": 0.74616081, + "num_input_tokens_seen": 82030608, + "router_z_loss_mlp": 0.171875, + "step": 990, + "time_per_iteration": 5.072636842727661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109747, + "balance_loss_mlp": 1.06394839, + "epoch": 0.19065025009619085, + "flos": 681280906752.0, + "grad_norm": 0.135211113906906, + "language_loss": 0.818295, + "learning_rate": 0.0009338410889544574, + "loss": 0.82926977, + "num_input_tokens_seen": 82119872, + "router_z_loss_mlp": 0.33544922, + "step": 991, + "time_per_iteration": 3.050665855407715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101867, + "balance_loss_mlp": 1.06786811, + "epoch": 0.1908426317814544, + "flos": 601971305472.0, + "grad_norm": 0.06286082016671143, + "language_loss": 0.87738663, + "learning_rate": 0.000933686131040967, + "loss": 0.88840532, + "num_input_tokens_seen": 82195552, + "router_z_loss_mlp": 0.34033203, + "step": 992, + "time_per_iteration": 2.7589659690856934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089273, + "balance_loss_mlp": 1.05672884, + "epoch": 0.19103501346671797, + "flos": 586027616256.0, + "grad_norm": 0.0561482479745879, + "language_loss": 0.90427077, + "learning_rate": 0.0009335310047555883, + "loss": 0.91516346, + "num_input_tokens_seen": 82267040, + "router_z_loss_mlp": 0.32543945, + "step": 993, + "time_per_iteration": 2.7133467197418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108709, + "balance_loss_mlp": 1.0532825, + "epoch": 0.19122739515198153, + "flos": 545494708224.0, + "grad_norm": 0.06221036652136981, + "language_loss": 0.88114065, + "learning_rate": 0.0009333757101585467, + "loss": 0.89201152, + "num_input_tokens_seen": 82337680, + "router_z_loss_mlp": 0.33837891, + "step": 994, + "time_per_iteration": 2.6733241081237793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083527, + "balance_loss_mlp": 1.05105424, + "epoch": 0.1914197768372451, + "flos": 521171739648.0, + "grad_norm": 0.05606370206634765, + "language_loss": 0.93617988, + "learning_rate": 0.0009332202473101329, + "loss": 0.94701517, + "num_input_tokens_seen": 82409600, + "router_z_loss_mlp": 0.32470703, + "step": 995, + "time_per_iteration": 2.6689558029174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079312, + "balance_loss_mlp": 1.04536152, + "epoch": 0.19161215852250865, + "flos": 610961660928.0, + "grad_norm": 0.05986652691328414, + "language_loss": 0.83121806, + "learning_rate": 0.0009330646162707028, + "loss": 0.84201121, + "num_input_tokens_seen": 82480288, + "router_z_loss_mlp": 0.33984375, + "step": 996, + "time_per_iteration": 2.7264511585235596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081823, + "balance_loss_mlp": 1.04849207, + "epoch": 0.1918045402077722, + "flos": 846281806848.0, + "grad_norm": 0.05485586532204223, + "language_loss": 0.84800065, + "learning_rate": 0.0009329088171006779, + "loss": 0.85881883, + "num_input_tokens_seen": 82568960, + "router_z_loss_mlp": 0.33349609, + "step": 997, + "time_per_iteration": 3.1486315727233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097705, + "balance_loss_mlp": 1.06220424, + "epoch": 0.19199692189303577, + "flos": 465699096576.0, + "grad_norm": 0.06540772430376247, + "language_loss": 0.84963006, + "learning_rate": 0.0009327528498605446, + "loss": 0.86060709, + "num_input_tokens_seen": 82634128, + "router_z_loss_mlp": 0.35522461, + "step": 998, + "time_per_iteration": 2.532460927963257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087828, + "balance_loss_mlp": 1.0542109, + "epoch": 0.19218930357829936, + "flos": 531318818304.0, + "grad_norm": 0.06065225266474605, + "language_loss": 0.89716202, + "learning_rate": 0.0009325967146108548, + "loss": 0.90804029, + "num_input_tokens_seen": 82707472, + "router_z_loss_mlp": 0.33642578, + "step": 999, + "time_per_iteration": 2.6381072998046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108792, + "balance_loss_mlp": 1.05334902, + "epoch": 0.19238168526356292, + "flos": 601350054912.0, + "grad_norm": 0.06318510310852068, + "language_loss": 0.87984866, + "learning_rate": 0.0009324404114122258, + "loss": 0.89072788, + "num_input_tokens_seen": 82775232, + "router_z_loss_mlp": 0.34594727, + "step": 1000, + "time_per_iteration": 2.7017252445220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088105, + "balance_loss_mlp": 1.0544883, + "epoch": 0.19257406694882648, + "flos": 571690192896.0, + "grad_norm": 0.05361295189234855, + "language_loss": 0.87132722, + "learning_rate": 0.0009322839403253397, + "loss": 0.88220823, + "num_input_tokens_seen": 82850032, + "router_z_loss_mlp": 0.33642578, + "step": 1001, + "time_per_iteration": 2.7725350856781006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091645, + "balance_loss_mlp": 1.05759907, + "epoch": 0.19276644863409004, + "flos": 801478568448.0, + "grad_norm": 0.0661765462165054, + "language_loss": 0.84038174, + "learning_rate": 0.0009321273014109439, + "loss": 0.85129815, + "num_input_tokens_seen": 82926080, + "router_z_loss_mlp": 0.34082031, + "step": 1002, + "time_per_iteration": 2.9275383949279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089997, + "balance_loss_mlp": 1.05676103, + "epoch": 0.1929588303193536, + "flos": 563024314368.0, + "grad_norm": 0.05133430998282463, + "language_loss": 0.85232604, + "learning_rate": 0.0009319704947298513, + "loss": 0.863226, + "num_input_tokens_seen": 83005200, + "router_z_loss_mlp": 0.33251953, + "step": 1003, + "time_per_iteration": 2.9198272228240967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083204, + "balance_loss_mlp": 1.05120838, + "epoch": 0.19315121200461716, + "flos": 626550349824.0, + "grad_norm": 0.04652496586479965, + "language_loss": 0.88737059, + "learning_rate": 0.0009318135203429393, + "loss": 0.8982026, + "num_input_tokens_seen": 83077280, + "router_z_loss_mlp": 0.31982422, + "step": 1004, + "time_per_iteration": 2.7145965099334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094807, + "balance_loss_mlp": 1.06116605, + "epoch": 0.19334359368988072, + "flos": 517169069568.0, + "grad_norm": 0.06711221272981459, + "language_loss": 0.88228458, + "learning_rate": 0.0009316563783111511, + "loss": 0.8932327, + "num_input_tokens_seen": 83145456, + "router_z_loss_mlp": 0.33642578, + "step": 1005, + "time_per_iteration": 2.68135404586792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095243, + "balance_loss_mlp": 1.06050563, + "epoch": 0.19353597537514428, + "flos": 693751606272.0, + "grad_norm": 0.04947727679523619, + "language_loss": 0.82323831, + "learning_rate": 0.0009314990686954943, + "loss": 0.83419079, + "num_input_tokens_seen": 83225392, + "router_z_loss_mlp": 0.34765625, + "step": 1006, + "time_per_iteration": 2.9068872928619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098932, + "balance_loss_mlp": 1.06495738, + "epoch": 0.19372835706040784, + "flos": 1209665180160.0, + "grad_norm": 0.05336104081377929, + "language_loss": 0.80917025, + "learning_rate": 0.000931341591557042, + "loss": 0.82015955, + "num_input_tokens_seen": 83331296, + "router_z_loss_mlp": 0.34008789, + "step": 1007, + "time_per_iteration": 3.759119749069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098415, + "balance_loss_mlp": 1.06291509, + "epoch": 0.19392073874567142, + "flos": 520368606720.0, + "grad_norm": 0.06549831272650784, + "language_loss": 0.87757689, + "learning_rate": 0.0009311839469569325, + "loss": 0.88856107, + "num_input_tokens_seen": 83399952, + "router_z_loss_mlp": 0.35522461, + "step": 1008, + "time_per_iteration": 2.6298930644989014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100893, + "balance_loss_mlp": 1.06620264, + "epoch": 0.19411312043093498, + "flos": 588543293952.0, + "grad_norm": 0.06763315162421418, + "language_loss": 0.8732397, + "learning_rate": 0.0009310261349563687, + "loss": 0.88424855, + "num_input_tokens_seen": 83468384, + "router_z_loss_mlp": 0.34692383, + "step": 1009, + "time_per_iteration": 2.6843061447143555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110066, + "balance_loss_mlp": 1.06718588, + "epoch": 0.19430550211619854, + "flos": 579085867008.0, + "grad_norm": 0.05371296475785438, + "language_loss": 0.8534441, + "learning_rate": 0.0009308681556166186, + "loss": 0.86445075, + "num_input_tokens_seen": 83547952, + "router_z_loss_mlp": 0.33496094, + "step": 1010, + "time_per_iteration": 2.8197336196899414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107606, + "balance_loss_mlp": 1.07291579, + "epoch": 0.1944978838014621, + "flos": 620848281600.0, + "grad_norm": 0.08312668477716535, + "language_loss": 0.87206143, + "learning_rate": 0.0009307100089990152, + "loss": 0.88313752, + "num_input_tokens_seen": 83615712, + "router_z_loss_mlp": 0.34716797, + "step": 1011, + "time_per_iteration": 2.7118990421295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101924, + "balance_loss_mlp": 1.0672822, + "epoch": 0.19469026548672566, + "flos": 598440089088.0, + "grad_norm": 0.061832865854500894, + "language_loss": 0.83946323, + "learning_rate": 0.0009305516951649568, + "loss": 0.85048252, + "num_input_tokens_seen": 83687296, + "router_z_loss_mlp": 0.34667969, + "step": 1012, + "time_per_iteration": 2.667672872543335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096764, + "balance_loss_mlp": 1.06314659, + "epoch": 0.19488264717198922, + "flos": 551890810368.0, + "grad_norm": 0.04827143175142062, + "language_loss": 0.87187612, + "learning_rate": 0.0009303932141759057, + "loss": 0.88284373, + "num_input_tokens_seen": 83763168, + "router_z_loss_mlp": 0.33642578, + "step": 1013, + "time_per_iteration": 2.7321088314056396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101502, + "balance_loss_mlp": 1.06705046, + "epoch": 0.19507502885725278, + "flos": 665842166784.0, + "grad_norm": 0.05715794205563071, + "language_loss": 0.84201366, + "learning_rate": 0.0009302345660933902, + "loss": 0.85302866, + "num_input_tokens_seen": 83837312, + "router_z_loss_mlp": 0.3449707, + "step": 1014, + "time_per_iteration": 2.7699263095855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109904, + "balance_loss_mlp": 1.07616735, + "epoch": 0.19526741054251634, + "flos": 670771625472.0, + "grad_norm": 0.05949834877265084, + "language_loss": 0.84866655, + "learning_rate": 0.0009300757509790026, + "loss": 0.85976553, + "num_input_tokens_seen": 83917120, + "router_z_loss_mlp": 0.33764648, + "step": 1015, + "time_per_iteration": 2.8250515460968018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110813, + "balance_loss_mlp": 1.0766474, + "epoch": 0.19545979222777993, + "flos": 446983653888.0, + "grad_norm": 0.0671511226198219, + "language_loss": 0.90974069, + "learning_rate": 0.0009299167688944005, + "loss": 0.92084885, + "num_input_tokens_seen": 83982992, + "router_z_loss_mlp": 0.34204102, + "step": 1016, + "time_per_iteration": 2.545133590698242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111157, + "balance_loss_mlp": 1.07778645, + "epoch": 0.1956521739130435, + "flos": 568813722624.0, + "grad_norm": 0.06338586690579641, + "language_loss": 0.85958129, + "learning_rate": 0.0009297576199013063, + "loss": 0.87069696, + "num_input_tokens_seen": 84057296, + "router_z_loss_mlp": 0.33813477, + "step": 1017, + "time_per_iteration": 2.668503761291504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148218, + "balance_loss_mlp": 1.13295972, + "epoch": 0.19584455559830705, + "flos": 1454969157120.0, + "grad_norm": 0.047651466398381144, + "language_loss": 0.73002136, + "learning_rate": 0.0009295983040615071, + "loss": 0.74150348, + "num_input_tokens_seen": 84292640, + "router_z_loss_mlp": 0.15234375, + "step": 1018, + "time_per_iteration": 4.920944929122925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104842, + "balance_loss_mlp": 1.09015501, + "epoch": 0.1960369372835706, + "flos": 1590320369664.0, + "grad_norm": 0.036993279908541045, + "language_loss": 0.79426301, + "learning_rate": 0.0009294388214368547, + "loss": 0.80531144, + "num_input_tokens_seen": 84524448, + "router_z_loss_mlp": 0.14648438, + "step": 1019, + "time_per_iteration": 6.0059425830841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118502, + "balance_loss_mlp": 1.08505166, + "epoch": 0.19622931896883417, + "flos": 615709237248.0, + "grad_norm": 0.05240041234704895, + "language_loss": 0.86600977, + "learning_rate": 0.0009292791720892659, + "loss": 0.87719476, + "num_input_tokens_seen": 84600208, + "router_z_loss_mlp": 0.3347168, + "step": 1020, + "time_per_iteration": 2.995192527770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113873, + "balance_loss_mlp": 1.07930255, + "epoch": 0.19642170065409773, + "flos": 465950790144.0, + "grad_norm": 0.0657036282835547, + "language_loss": 0.88724279, + "learning_rate": 0.0009291193560807218, + "loss": 0.89838147, + "num_input_tokens_seen": 84668032, + "router_z_loss_mlp": 0.34594727, + "step": 1021, + "time_per_iteration": 2.633256196975708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114293, + "balance_loss_mlp": 1.07962656, + "epoch": 0.19661408233936128, + "flos": 515040477696.0, + "grad_norm": 0.054836200403870924, + "language_loss": 0.87439638, + "learning_rate": 0.0009289593734732688, + "loss": 0.88553929, + "num_input_tokens_seen": 84738176, + "router_z_loss_mlp": 0.34716797, + "step": 1022, + "time_per_iteration": 2.622284173965454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107262, + "balance_loss_mlp": 1.0736922, + "epoch": 0.19680646402462484, + "flos": 392427624960.0, + "grad_norm": 0.053036961045345866, + "language_loss": 0.94139373, + "learning_rate": 0.0009287992243290175, + "loss": 0.95246631, + "num_input_tokens_seen": 84799936, + "router_z_loss_mlp": 0.3359375, + "step": 1023, + "time_per_iteration": 2.4402668476104736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108975, + "balance_loss_mlp": 1.07247353, + "epoch": 0.19699884570988843, + "flos": 626122566144.0, + "grad_norm": 0.056904835680118435, + "language_loss": 0.90850759, + "learning_rate": 0.0009286389087101435, + "loss": 0.91959733, + "num_input_tokens_seen": 84877216, + "router_z_loss_mlp": 0.36523438, + "step": 1024, + "time_per_iteration": 2.762068271636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110519, + "balance_loss_mlp": 1.06957078, + "epoch": 0.197191227395152, + "flos": 557710742016.0, + "grad_norm": 0.05298833269370499, + "language_loss": 0.88575542, + "learning_rate": 0.0009284784266788864, + "loss": 0.89680731, + "num_input_tokens_seen": 84952464, + "router_z_loss_mlp": 0.35668945, + "step": 1025, + "time_per_iteration": 4.087035417556763 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109606, + "balance_loss_mlp": 1.07565546, + "epoch": 0.19738360908041555, + "flos": 664681061376.0, + "grad_norm": 0.0565537913278748, + "language_loss": 0.92494339, + "learning_rate": 0.0009283177782975512, + "loss": 0.93603945, + "num_input_tokens_seen": 85031488, + "router_z_loss_mlp": 0.33984375, + "step": 1026, + "time_per_iteration": 2.948167562484741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095626, + "balance_loss_mlp": 1.06117415, + "epoch": 0.1975759907656791, + "flos": 522244094976.0, + "grad_norm": 0.06218898027866582, + "language_loss": 0.88052273, + "learning_rate": 0.000928156963628507, + "loss": 0.89147896, + "num_input_tokens_seen": 85098384, + "router_z_loss_mlp": 0.3449707, + "step": 1027, + "time_per_iteration": 2.564019203186035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091019, + "balance_loss_mlp": 1.05694866, + "epoch": 0.19776837245094267, + "flos": 462233309184.0, + "grad_norm": 0.056114928823487176, + "language_loss": 0.8826099, + "learning_rate": 0.0009279959827341877, + "loss": 0.89352006, + "num_input_tokens_seen": 85172944, + "router_z_loss_mlp": 0.34082031, + "step": 1028, + "time_per_iteration": 2.7226340770721436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090946, + "balance_loss_mlp": 1.05699515, + "epoch": 0.19796075413620623, + "flos": 502809887232.0, + "grad_norm": 0.05507551359640612, + "language_loss": 0.88204837, + "learning_rate": 0.0009278348356770915, + "loss": 0.89295781, + "num_input_tokens_seen": 85241632, + "router_z_loss_mlp": 0.33984375, + "step": 1029, + "time_per_iteration": 2.592756748199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085455, + "balance_loss_mlp": 1.05093157, + "epoch": 0.1981531358214698, + "flos": 507281038848.0, + "grad_norm": 0.061172366255401664, + "language_loss": 0.85939109, + "learning_rate": 0.0009276735225197814, + "loss": 0.87024558, + "num_input_tokens_seen": 85308992, + "router_z_loss_mlp": 0.34570312, + "step": 1030, + "time_per_iteration": 2.598607063293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088832, + "balance_loss_mlp": 1.05495238, + "epoch": 0.19834551750673335, + "flos": 531275148288.0, + "grad_norm": 0.0802549423316463, + "language_loss": 0.86293721, + "learning_rate": 0.0009275120433248847, + "loss": 0.87382561, + "num_input_tokens_seen": 85381936, + "router_z_loss_mlp": 0.33886719, + "step": 1031, + "time_per_iteration": 2.7143311500549316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090216, + "balance_loss_mlp": 1.05726683, + "epoch": 0.1985378991919969, + "flos": 775147691520.0, + "grad_norm": 0.05308511447166053, + "language_loss": 0.86272347, + "learning_rate": 0.0009273503981550931, + "loss": 0.87362564, + "num_input_tokens_seen": 85474352, + "router_z_loss_mlp": 0.32958984, + "step": 1032, + "time_per_iteration": 3.0616648197174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087082, + "balance_loss_mlp": 1.05351269, + "epoch": 0.1987302808772605, + "flos": 434063411712.0, + "grad_norm": 0.059916166081832097, + "language_loss": 0.8703599, + "learning_rate": 0.0009271885870731626, + "loss": 0.88123071, + "num_input_tokens_seen": 85538416, + "router_z_loss_mlp": 0.3359375, + "step": 1033, + "time_per_iteration": 2.487316131591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092715, + "balance_loss_mlp": 1.05921745, + "epoch": 0.19892266256252406, + "flos": 553342897152.0, + "grad_norm": 0.06168947094446192, + "language_loss": 0.88599998, + "learning_rate": 0.0009270266101419143, + "loss": 0.89692712, + "num_input_tokens_seen": 85604416, + "router_z_loss_mlp": 0.33520508, + "step": 1034, + "time_per_iteration": 2.5978119373321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085912, + "balance_loss_mlp": 1.05403578, + "epoch": 0.19911504424778761, + "flos": 549596302848.0, + "grad_norm": 0.06019117447906982, + "language_loss": 0.85564321, + "learning_rate": 0.0009268644674242328, + "loss": 0.86650234, + "num_input_tokens_seen": 85677008, + "router_z_loss_mlp": 0.31860352, + "step": 1035, + "time_per_iteration": 2.7259163856506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097892, + "balance_loss_mlp": 1.0645138, + "epoch": 0.19930742593305117, + "flos": 518024636928.0, + "grad_norm": 0.05869793462101787, + "language_loss": 0.81141233, + "learning_rate": 0.0009267021589830678, + "loss": 0.82239127, + "num_input_tokens_seen": 85745200, + "router_z_loss_mlp": 0.33398438, + "step": 1036, + "time_per_iteration": 2.597724199295044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161292, + "balance_loss_mlp": 1.14507985, + "epoch": 0.19949980761831473, + "flos": 1508516849664.0, + "grad_norm": 0.04621309141147155, + "language_loss": 0.77627081, + "learning_rate": 0.0009265396848814328, + "loss": 0.78788376, + "num_input_tokens_seen": 85980608, + "router_z_loss_mlp": 0.16210938, + "step": 1037, + "time_per_iteration": 4.918612241744995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093993, + "balance_loss_mlp": 1.06044722, + "epoch": 0.1996921893035783, + "flos": 697803738624.0, + "grad_norm": 0.061892224045152405, + "language_loss": 0.93283784, + "learning_rate": 0.000926377045182406, + "loss": 0.94377768, + "num_input_tokens_seen": 86055952, + "router_z_loss_mlp": 0.33569336, + "step": 1038, + "time_per_iteration": 2.8800160884857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096412, + "balance_loss_mlp": 1.06334293, + "epoch": 0.19988457098884185, + "flos": 726682226688.0, + "grad_norm": 0.0613562398808313, + "language_loss": 0.87972045, + "learning_rate": 0.0009262142399491296, + "loss": 0.89068449, + "num_input_tokens_seen": 86145536, + "router_z_loss_mlp": 0.33081055, + "step": 1039, + "time_per_iteration": 3.0561435222625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097284, + "balance_loss_mlp": 1.06345224, + "epoch": 0.2000769526741054, + "flos": 560275881984.0, + "grad_norm": 0.06364175085873486, + "language_loss": 0.87837642, + "learning_rate": 0.0009260512692448105, + "loss": 0.88934934, + "num_input_tokens_seen": 86214480, + "router_z_loss_mlp": 0.33862305, + "step": 1040, + "time_per_iteration": 2.7037088871002197 + } + ], + "logging_steps": 1.0, + "max_steps": 5198, + "num_input_tokens_seen": 86214480, + "num_train_epochs": 1, + "save_steps": 1040, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2342240041697280.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/training_args.bin b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..459663e238ea62a90da439e633388cc1e16cedb6 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63f07a99639c8908760dc7ac65f4d34d749c1861fc4b5a1f91cbdcc73581ce9e +size 7992 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/zero_to_fp32.py b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/added_tokens.json b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/config.json b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/config.json new file mode 100644 index 0000000000000000000000000000000000000000..7929d4cdbe9bb7ee3537b93d161990a8caa422ec --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/config.json @@ -0,0 +1,200 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bal_comp_loss_coef": 0.01, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": false, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": false, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "smoe_sharev3", + "norm_softmax": false, + "normalization": false, + "num_attention_heads": 32, + "num_experts": 8, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 4, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": null, + "rate_compete": 0.2, + "rate_flip": 0.05, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": false, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": false, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/generation_config.json b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/global_step2080/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/global_step2080/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3a2b29de5d24fe25e10985dd07d1a281b8f2b6af --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/global_step2080/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f1ab49cf1a090c1571101f4e5b8b5c90df25e17be2a90f88e591f20ade2cf4b +size 396575120 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/global_step2080/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/global_step2080/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3d499ef45258f41d0c4a0558796ff73bbbcf4599 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/global_step2080/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b397b454c6054ff34a858498a1c096ac06d706348639a0af4f15bae693253911 +size 396575120 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/global_step2080/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/global_step2080/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3a3d259ef97400f35bcc44c831aeb01f62de29e6 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/global_step2080/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56963530c675483188da9f17ac11e09f596bb5f56b52d1f3e42734ed2f8058b4 +size 396575120 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/global_step2080/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/global_step2080/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..621a1874a703165cf54fec337ab11fde3551915b --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/global_step2080/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a36679a2ff80e5b0538bd64c9e6e6a397dc2bba2534fafe585f2b875c513aeb9 +size 396575120 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/global_step2080/zero_pp_rank_0_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/global_step2080/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..da6484c5a14088cbc1644f958880c678891a426b --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/global_step2080/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be544caad4f7d502ba56d2238246b839d184c91a92818fcfd4071d17abd38f26 +size 2117321480 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/global_step2080/zero_pp_rank_1_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/global_step2080/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8240c57edba033c8d96a84e80a9e75330dfaa2a4 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/global_step2080/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ca9fa4b23f64c1a7a0cd31a9fb3be7ba760fbdf95afbe68ae87fa5d04b6ce1d +size 2117321480 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/global_step2080/zero_pp_rank_2_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/global_step2080/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cfe2b67cfb0eba6d8b256dc452def88966aae00b --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/global_step2080/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d445413db3703f2e7f442094f09165e8e623f40059a8ba003a884d09a7b193d +size 2117321480 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/global_step2080/zero_pp_rank_3_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/global_step2080/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e5a8cb5d08a4ee00c27a3c42c0b46c2690a349a3 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/global_step2080/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b3797f5d7708193efa50ce2434c0d8658eabd07e9919b2304807ec6fb4dc989 +size 2117321480 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/latest b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/latest new file mode 100644 index 0000000000000000000000000000000000000000..306b989cc55bbad3d1661dff0bcd6923a752cb0a --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/latest @@ -0,0 +1 @@ +global_step2080 \ No newline at end of file diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/model-00001-of-00002.safetensors b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29d76f5d80605301aab2bba59b53a5e2582094c4 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe6c4f6ef38e8993629091331e0bbf23484cc88bdfd038f0dd17b6ec2800d855 +size 4972489328 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/model-00002-of-00002.safetensors b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f3fb402ba8daad7aab4d19177b401948734208da --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a1fcd3184a794a41e9a266da20a6dc9692a5156ce39e754072b62e8cc4b37de +size 3759020544 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/model.safetensors.index.json b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..90a56cf0153ed9062ff35ee2b372f7ab9e796c6a --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/model.safetensors.index.json @@ -0,0 +1,672 @@ +{ + "metadata": { + "total_size": 8731420128 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.mm_projector.layer_norm.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.layer_norm.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00002-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/rng_state_0.pth b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ef4849062bcdc8ffd2246c07673ba196a8d61a6d --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fae2114fffe9b1eea30e28bbdb4ce59046b0079ea5b8dc4682079f609d49d787 +size 14960 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/rng_state_1.pth b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..2fcb2b640bc236c26aa841680d34a91240247970 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4ff5f3a53530ac868291e2667c8f824bfa1f4fa1ce880df8223a7165ef38e11 +size 14960 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/rng_state_2.pth b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..00c3f989de00e6d58ca7345ae6f65fee0afcbdcd --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91f80a7779b0034e70106ba6cb0e3e686052334c20ce54453ee3977cc0219d15 +size 14960 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/rng_state_3.pth b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..f289913854ee3fa52a86e282421da07d85b8a4c4 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ece3bc0d0e16c43ef245cc787cbd0d63d08d460f489c4cd52adf6501b9281a18 +size 14960 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/special_tokens_map.json b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/tokenizer.model b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/tokenizer_config.json b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/trainer_state.json b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ae78c18d443846ff19d3467cbbf481ff608524a4 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/trainer_state.json @@ -0,0 +1,31233 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.4001539053482108, + "eval_steps": 500, + "global_step": 2080, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03964023, + "balance_loss_mlp": 3.01339984, + "epoch": 0.00019238168526356292, + "flos": 470464353792.0, + "grad_norm": 27.10233905437441, + "language_loss": 3.72295761, + "learning_rate": 0.0, + "loss": 2.48840261, + "num_input_tokens_seen": 67104, + "router_z_loss_mlp": 9.5, + "step": 1, + "time_per_iteration": 29.606513023376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01906453, + "balance_loss_mlp": 1.25642872, + "epoch": 0.00038476337052712584, + "flos": 504311436288.0, + "grad_norm": 2.874173750989579, + "language_loss": 1.79264998, + "learning_rate": 0.00013726078121135892, + "loss": 1.81171465, + "num_input_tokens_seen": 134080, + "router_z_loss_mlp": 6.5, + "step": 2, + "time_per_iteration": 2.7078208923339844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01915611, + "balance_loss_mlp": 1.2667315, + "epoch": 0.0005771450557906887, + "flos": 598869282816.0, + "grad_norm": 2.1141296462778643, + "language_loss": 1.61429811, + "learning_rate": 0.00021755319103969496, + "loss": 1.63345432, + "num_input_tokens_seen": 205152, + "router_z_loss_mlp": 6.48828125, + "step": 3, + "time_per_iteration": 3.010409116744995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01917319, + "balance_loss_mlp": 1.26309848, + "epoch": 0.0007695267410542517, + "flos": 580133491200.0, + "grad_norm": 1.255159247360545, + "language_loss": 1.49202251, + "learning_rate": 0.00027452156242271784, + "loss": 1.51119578, + "num_input_tokens_seen": 269664, + "router_z_loss_mlp": 6.54296875, + "step": 4, + "time_per_iteration": 2.7161622047424316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0185163, + "balance_loss_mlp": 1.22144234, + "epoch": 0.0009619084263178145, + "flos": 485857861632.0, + "grad_norm": 4.267520959606063, + "language_loss": 1.57359505, + "learning_rate": 0.0003187096642208417, + "loss": 1.59211147, + "num_input_tokens_seen": 338560, + "router_z_loss_mlp": 6.296875, + "step": 5, + "time_per_iteration": 2.718417167663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01828185, + "balance_loss_mlp": 1.21211123, + "epoch": 0.0011542901115813775, + "flos": 559744791552.0, + "grad_norm": 1.225349312557607, + "language_loss": 1.4752574, + "learning_rate": 0.0003548139722510539, + "loss": 1.49353933, + "num_input_tokens_seen": 410112, + "router_z_loss_mlp": 6.15234375, + "step": 6, + "time_per_iteration": 2.6827874183654785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01821666, + "balance_loss_mlp": 1.22428453, + "epoch": 0.0013466717968449403, + "flos": 533721014784.0, + "grad_norm": 0.5025899606895544, + "language_loss": 1.33846116, + "learning_rate": 0.00038533972973918044, + "loss": 1.35667801, + "num_input_tokens_seen": 477552, + "router_z_loss_mlp": 5.96875, + "step": 7, + "time_per_iteration": 2.6889517307281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01776667, + "balance_loss_mlp": 1.2090404, + "epoch": 0.0015390534821085034, + "flos": 492037175808.0, + "grad_norm": 0.1719820928967348, + "language_loss": 1.2814672, + "learning_rate": 0.0004117823436340768, + "loss": 1.29923391, + "num_input_tokens_seen": 549184, + "router_z_loss_mlp": 5.6875, + "step": 8, + "time_per_iteration": 2.7207248210906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0177577, + "balance_loss_mlp": 1.23217535, + "epoch": 0.0017314351673720662, + "flos": 564402207744.0, + "grad_norm": 0.6128716609675008, + "language_loss": 1.39861906, + "learning_rate": 0.00043510638207938993, + "loss": 1.41637683, + "num_input_tokens_seen": 622880, + "router_z_loss_mlp": 5.44140625, + "step": 9, + "time_per_iteration": 2.887538194656372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01823371, + "balance_loss_mlp": 1.31334615, + "epoch": 0.001923816852635629, + "flos": 593132308992.0, + "grad_norm": 0.480897383035181, + "language_loss": 1.25963569, + "learning_rate": 0.00045597044543220066, + "loss": 1.27786922, + "num_input_tokens_seen": 693584, + "router_z_loss_mlp": 5.09765625, + "step": 10, + "time_per_iteration": 2.7672832012176514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01930298, + "balance_loss_mlp": 1.44621277, + "epoch": 0.002116198537899192, + "flos": 609308752896.0, + "grad_norm": 0.21803247425844502, + "language_loss": 1.22959518, + "learning_rate": 0.00047484428652143135, + "loss": 1.24889803, + "num_input_tokens_seen": 774432, + "router_z_loss_mlp": 4.83203125, + "step": 11, + "time_per_iteration": 2.9771082401275635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02130152, + "balance_loss_mlp": 1.67772901, + "epoch": 0.002308580223162755, + "flos": 544869075456.0, + "grad_norm": 0.19847359144835577, + "language_loss": 1.28057694, + "learning_rate": 0.0004920747534624128, + "loss": 1.30187845, + "num_input_tokens_seen": 844304, + "router_z_loss_mlp": 4.52734375, + "step": 12, + "time_per_iteration": 2.6094090938568115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02177014, + "balance_loss_mlp": 1.7512939, + "epoch": 0.002500961908426318, + "flos": 644458277376.0, + "grad_norm": 0.3126355826019607, + "language_loss": 1.29235363, + "learning_rate": 0.0005079252465375872, + "loss": 1.31412375, + "num_input_tokens_seen": 915104, + "router_z_loss_mlp": 4.265625, + "step": 13, + "time_per_iteration": 2.841792345046997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02141221, + "balance_loss_mlp": 1.74411082, + "epoch": 0.0026933435936898806, + "flos": 487605312000.0, + "grad_norm": 0.282411779716686, + "language_loss": 1.17459798, + "learning_rate": 0.0005226005109505393, + "loss": 1.19601011, + "num_input_tokens_seen": 982720, + "router_z_loss_mlp": 3.96875, + "step": 14, + "time_per_iteration": 2.597313165664673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02024541, + "balance_loss_mlp": 1.65890288, + "epoch": 0.0028857252789534437, + "flos": 434368949760.0, + "grad_norm": 0.2583476739022616, + "language_loss": 1.22957516, + "learning_rate": 0.0005362628552605367, + "loss": 1.24982059, + "num_input_tokens_seen": 1050528, + "router_z_loss_mlp": 3.65234375, + "step": 15, + "time_per_iteration": 2.6388704776763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01790575, + "balance_loss_mlp": 1.44687057, + "epoch": 0.0030781069642170067, + "flos": 596465676288.0, + "grad_norm": 0.18613747071639053, + "language_loss": 1.27631426, + "learning_rate": 0.0005490431248454357, + "loss": 1.29421997, + "num_input_tokens_seen": 1116512, + "router_z_loss_mlp": 3.44140625, + "step": 16, + "time_per_iteration": 2.708346128463745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01779165, + "balance_loss_mlp": 1.46941185, + "epoch": 0.0032704886494805694, + "flos": 1537360432128.0, + "grad_norm": 0.2733785965407311, + "language_loss": 0.75705111, + "learning_rate": 0.0005610483427624225, + "loss": 0.77484274, + "num_input_tokens_seen": 1351216, + "router_z_loss_mlp": 3.09375, + "step": 17, + "time_per_iteration": 6.916250705718994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01553778, + "balance_loss_mlp": 1.24955583, + "epoch": 0.0034628703347441324, + "flos": 473720403456.0, + "grad_norm": 0.11658431553946913, + "language_loss": 1.14468098, + "learning_rate": 0.0005723671632907488, + "loss": 1.16021872, + "num_input_tokens_seen": 1420512, + "router_z_loss_mlp": 3.03710938, + "step": 18, + "time_per_iteration": 2.7716212272644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01490625, + "balance_loss_mlp": 1.21005416, + "epoch": 0.0036552520200076955, + "flos": 448303320576.0, + "grad_norm": 0.11552730485963776, + "language_loss": 1.19723654, + "learning_rate": 0.0005830738490244919, + "loss": 1.21214283, + "num_input_tokens_seen": 1484976, + "router_z_loss_mlp": 2.80859375, + "step": 19, + "time_per_iteration": 2.6067557334899902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0141948, + "balance_loss_mlp": 1.16103387, + "epoch": 0.003847633705271258, + "flos": 635881148928.0, + "grad_norm": 0.11977740619668105, + "language_loss": 1.21676993, + "learning_rate": 0.0005932312266435596, + "loss": 1.23096466, + "num_input_tokens_seen": 1557392, + "router_z_loss_mlp": 2.58398438, + "step": 20, + "time_per_iteration": 2.8545703887939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01364308, + "balance_loss_mlp": 1.13084817, + "epoch": 0.004040015390534821, + "flos": 589222771200.0, + "grad_norm": 0.09935322828728523, + "language_loss": 1.16681409, + "learning_rate": 0.0006028929207788754, + "loss": 1.18045723, + "num_input_tokens_seen": 1626064, + "router_z_loss_mlp": 2.33203125, + "step": 21, + "time_per_iteration": 2.7119524478912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319718, + "balance_loss_mlp": 1.11038613, + "epoch": 0.004232397075798384, + "flos": 756253338624.0, + "grad_norm": 0.09023283304690737, + "language_loss": 1.20250762, + "learning_rate": 0.0006121050677327902, + "loss": 1.21570492, + "num_input_tokens_seen": 1696528, + "router_z_loss_mlp": 2.09667969, + "step": 22, + "time_per_iteration": 2.884739398956299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01304467, + "balance_loss_mlp": 1.1184051, + "epoch": 0.004424778761061947, + "flos": 526434439680.0, + "grad_norm": 0.08559602389751407, + "language_loss": 1.10067201, + "learning_rate": 0.0006209076479463684, + "loss": 1.1137166, + "num_input_tokens_seen": 1765936, + "router_z_loss_mlp": 1.85839844, + "step": 23, + "time_per_iteration": 2.6616718769073486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01275434, + "balance_loss_mlp": 1.10787356, + "epoch": 0.00461716044632551, + "flos": 547907079168.0, + "grad_norm": 0.07141137445072718, + "language_loss": 1.2012924, + "learning_rate": 0.0006293355346737718, + "loss": 1.21404672, + "num_input_tokens_seen": 1841632, + "router_z_loss_mlp": 1.67675781, + "step": 24, + "time_per_iteration": 2.7025952339172363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252583, + "balance_loss_mlp": 1.10476315, + "epoch": 0.004809542131589073, + "flos": 567293234688.0, + "grad_norm": 0.08524381015789384, + "language_loss": 1.16738653, + "learning_rate": 0.0006374193284416834, + "loss": 1.17991233, + "num_input_tokens_seen": 1920256, + "router_z_loss_mlp": 1.47753906, + "step": 25, + "time_per_iteration": 2.827439069747925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223638, + "balance_loss_mlp": 1.0984205, + "epoch": 0.005001923816852636, + "flos": 470391418368.0, + "grad_norm": 0.08512374611478205, + "language_loss": 1.15399337, + "learning_rate": 0.0006451860277489461, + "loss": 1.16622972, + "num_input_tokens_seen": 1986528, + "router_z_loss_mlp": 1.25097656, + "step": 26, + "time_per_iteration": 2.6214864253997803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206141, + "balance_loss_mlp": 1.10009253, + "epoch": 0.005194305502116198, + "flos": 415283950080.0, + "grad_norm": 0.07774032731783902, + "language_loss": 1.23061514, + "learning_rate": 0.0006526595731190848, + "loss": 1.2426765, + "num_input_tokens_seen": 2048016, + "router_z_loss_mlp": 1.0625, + "step": 27, + "time_per_iteration": 2.5637125968933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117904, + "balance_loss_mlp": 1.09192181, + "epoch": 0.005386687187379761, + "flos": 628466535936.0, + "grad_norm": 0.05524077436438855, + "language_loss": 1.1626848, + "learning_rate": 0.0006598612921618983, + "loss": 1.17447519, + "num_input_tokens_seen": 2127664, + "router_z_loss_mlp": 0.87158203, + "step": 28, + "time_per_iteration": 2.8202784061431885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159441, + "balance_loss_mlp": 1.08772469, + "epoch": 0.005579068872643324, + "flos": 886100332032.0, + "grad_norm": 0.07386109802626846, + "language_loss": 1.08505416, + "learning_rate": 0.0006668102665011454, + "loss": 1.09664845, + "num_input_tokens_seen": 2213952, + "router_z_loss_mlp": 0.71728516, + "step": 29, + "time_per_iteration": 3.2254040241241455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154142, + "balance_loss_mlp": 1.09520459, + "epoch": 0.005771450557906887, + "flos": 547287238656.0, + "grad_norm": 0.0797557646441396, + "language_loss": 1.18077409, + "learning_rate": 0.0006735236364718957, + "loss": 1.19231534, + "num_input_tokens_seen": 2284736, + "router_z_loss_mlp": 0.58886719, + "step": 30, + "time_per_iteration": 2.6730945110321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140737, + "balance_loss_mlp": 1.09384, + "epoch": 0.00596383224317045, + "flos": 531766950912.0, + "grad_norm": 0.060827451674393726, + "language_loss": 1.1687839, + "learning_rate": 0.0006800168558381346, + "loss": 1.18019128, + "num_input_tokens_seen": 2354384, + "router_z_loss_mlp": 0.46875, + "step": 31, + "time_per_iteration": 2.649216651916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148736, + "balance_loss_mlp": 1.11166239, + "epoch": 0.0061562139284340135, + "flos": 588813926400.0, + "grad_norm": 0.10592463777190406, + "language_loss": 1.19211543, + "learning_rate": 0.0006863039060567947, + "loss": 1.20360279, + "num_input_tokens_seen": 2419440, + "router_z_loss_mlp": 0.37084961, + "step": 32, + "time_per_iteration": 2.6697018146514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132499, + "balance_loss_mlp": 1.10136151, + "epoch": 0.006348595613697576, + "flos": 617929551360.0, + "grad_norm": 0.09812744917576391, + "language_loss": 1.1217525, + "learning_rate": 0.0006923974775611263, + "loss": 1.13307738, + "num_input_tokens_seen": 2496368, + "router_z_loss_mlp": 0.3112793, + "step": 33, + "time_per_iteration": 2.770225763320923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137532, + "balance_loss_mlp": 1.11146092, + "epoch": 0.006540977298961139, + "flos": 777564444672.0, + "grad_norm": 0.06513543096417564, + "language_loss": 1.08375585, + "learning_rate": 0.0006983091239737814, + "loss": 1.09513116, + "num_input_tokens_seen": 2573280, + "router_z_loss_mlp": 0.26086426, + "step": 34, + "time_per_iteration": 2.99418306350708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128276, + "balance_loss_mlp": 1.10578084, + "epoch": 0.006733358984224702, + "flos": 666837356544.0, + "grad_norm": 0.06344935516817307, + "language_loss": 1.07062221, + "learning_rate": 0.0007040493939600222, + "loss": 1.08190489, + "num_input_tokens_seen": 2647248, + "router_z_loss_mlp": 0.22497559, + "step": 35, + "time_per_iteration": 2.9126057624816895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119708, + "balance_loss_mlp": 1.09892988, + "epoch": 0.006925740669488265, + "flos": 564092287488.0, + "grad_norm": 0.06579143759664555, + "language_loss": 1.07960629, + "learning_rate": 0.0007096279445021078, + "loss": 1.09080338, + "num_input_tokens_seen": 2720736, + "router_z_loss_mlp": 0.20788574, + "step": 36, + "time_per_iteration": 2.7079102993011475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114855, + "balance_loss_mlp": 1.09574544, + "epoch": 0.007118122354751828, + "flos": 549583156224.0, + "grad_norm": 0.14799474820221378, + "language_loss": 1.14634764, + "learning_rate": 0.0007150536386503726, + "loss": 1.15749621, + "num_input_tokens_seen": 2800336, + "router_z_loss_mlp": 0.19104004, + "step": 37, + "time_per_iteration": 2.8290467262268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104011, + "balance_loss_mlp": 1.08533084, + "epoch": 0.007310504040015391, + "flos": 702161409024.0, + "grad_norm": 0.2513092385422617, + "language_loss": 1.08396375, + "learning_rate": 0.0007203346302358509, + "loss": 1.09500384, + "num_input_tokens_seen": 2883184, + "router_z_loss_mlp": 0.18688965, + "step": 38, + "time_per_iteration": 2.961430311203003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121274, + "balance_loss_mlp": 1.10231924, + "epoch": 0.007502885725278953, + "flos": 599022051840.0, + "grad_norm": 0.0999674626629785, + "language_loss": 1.11391926, + "learning_rate": 0.000725478437577282, + "loss": 1.12513208, + "num_input_tokens_seen": 2960736, + "router_z_loss_mlp": 0.18945312, + "step": 39, + "time_per_iteration": 2.742088556289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146989, + "balance_loss_mlp": 1.12810588, + "epoch": 0.007695267410542516, + "flos": 560000867328.0, + "grad_norm": 0.3323772184023467, + "language_loss": 1.08355689, + "learning_rate": 0.0007304920078549186, + "loss": 1.09502685, + "num_input_tokens_seen": 3033472, + "router_z_loss_mlp": 0.18884277, + "step": 40, + "time_per_iteration": 2.66943621635437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116486, + "balance_loss_mlp": 1.1452378, + "epoch": 0.007887649095806078, + "flos": 507906671616.0, + "grad_norm": 0.11539272036457353, + "language_loss": 1.09356606, + "learning_rate": 0.0007353817735343603, + "loss": 1.1052146, + "num_input_tokens_seen": 3107824, + "router_z_loss_mlp": 0.19604492, + "step": 41, + "time_per_iteration": 2.7052595615386963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132998, + "balance_loss_mlp": 1.11293542, + "epoch": 0.008080030781069641, + "flos": 503642133504.0, + "grad_norm": 0.12251683576194117, + "language_loss": 1.04851842, + "learning_rate": 0.0007401537019902344, + "loss": 1.05984843, + "num_input_tokens_seen": 3176528, + "router_z_loss_mlp": 0.20056152, + "step": 42, + "time_per_iteration": 2.590432643890381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124507, + "balance_loss_mlp": 1.10198867, + "epoch": 0.008272412466333205, + "flos": 517764178944.0, + "grad_norm": 0.09393858903586973, + "language_loss": 1.08539796, + "learning_rate": 0.0007448133392900729, + "loss": 1.09664297, + "num_input_tokens_seen": 3254256, + "router_z_loss_mlp": 0.22521973, + "step": 43, + "time_per_iteration": 2.6619081497192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112544, + "balance_loss_mlp": 1.10156202, + "epoch": 0.008464794151596768, + "flos": 607673373696.0, + "grad_norm": 0.06822323064374927, + "language_loss": 1.03845203, + "learning_rate": 0.0007493658489441491, + "loss": 1.04970646, + "num_input_tokens_seen": 3340224, + "router_z_loss_mlp": 0.23864746, + "step": 44, + "time_per_iteration": 2.861008644104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128905, + "balance_loss_mlp": 1.10477662, + "epoch": 0.00865717583686033, + "flos": 537661075968.0, + "grad_norm": 0.1413166066405165, + "language_loss": 1.08820629, + "learning_rate": 0.0007538160463002316, + "loss": 1.09949529, + "num_input_tokens_seen": 3409216, + "router_z_loss_mlp": 0.2409668, + "step": 45, + "time_per_iteration": 2.643458604812622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115676, + "balance_loss_mlp": 1.13258433, + "epoch": 0.008849557522123894, + "flos": 507758284800.0, + "grad_norm": 0.08570115972640321, + "language_loss": 1.10720444, + "learning_rate": 0.0007581684291577274, + "loss": 1.11877203, + "num_input_tokens_seen": 3478352, + "router_z_loss_mlp": 0.24157715, + "step": 46, + "time_per_iteration": 2.5904788970947266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145761, + "balance_loss_mlp": 1.12085772, + "epoch": 0.009041939207387457, + "flos": 625048800768.0, + "grad_norm": 0.06636849455276843, + "language_loss": 1.14156199, + "learning_rate": 0.0007624272050891776, + "loss": 1.15301955, + "num_input_tokens_seen": 3555616, + "router_z_loss_mlp": 0.24902344, + "step": 47, + "time_per_iteration": 2.782179594039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154374, + "balance_loss_mlp": 1.12759995, + "epoch": 0.00923432089265102, + "flos": 549124849152.0, + "grad_norm": 0.09356522507451794, + "language_loss": 1.04615343, + "learning_rate": 0.0007665963158851307, + "loss": 1.05769718, + "num_input_tokens_seen": 3634512, + "router_z_loss_mlp": 0.26806641, + "step": 48, + "time_per_iteration": 2.824540138244629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174661, + "balance_loss_mlp": 1.14738548, + "epoch": 0.009426702577914583, + "flos": 562202242560.0, + "grad_norm": 0.059100241584136314, + "language_loss": 1.12381458, + "learning_rate": 0.0007706794594783609, + "loss": 1.13556111, + "num_input_tokens_seen": 3708480, + "router_z_loss_mlp": 0.27270508, + "step": 49, + "time_per_iteration": 2.790757894515991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192673, + "balance_loss_mlp": 1.16604137, + "epoch": 0.009619084263178146, + "flos": 616486228992.0, + "grad_norm": 0.08074806779925832, + "language_loss": 1.11280799, + "learning_rate": 0.0007746801096530423, + "loss": 1.12473488, + "num_input_tokens_seen": 3783472, + "router_z_loss_mlp": 0.2668457, + "step": 50, + "time_per_iteration": 2.7235305309295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116178, + "balance_loss_mlp": 1.135149, + "epoch": 0.009811465948441709, + "flos": 541176325632.0, + "grad_norm": 0.06558886342971224, + "language_loss": 1.16576111, + "learning_rate": 0.0007786015338021173, + "loss": 1.17737889, + "num_input_tokens_seen": 3851360, + "router_z_loss_mlp": 0.26672363, + "step": 51, + "time_per_iteration": 2.6817519664764404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134628, + "balance_loss_mlp": 1.1085453, + "epoch": 0.010003847633705272, + "flos": 535608087552.0, + "grad_norm": 0.06210449580458492, + "language_loss": 1.08870959, + "learning_rate": 0.0007824468089603051, + "loss": 1.10005593, + "num_input_tokens_seen": 3923056, + "router_z_loss_mlp": 0.26098633, + "step": 52, + "time_per_iteration": 2.644577980041504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125522, + "balance_loss_mlp": 1.09910512, + "epoch": 0.010196229318968833, + "flos": 908867907072.0, + "grad_norm": 0.05864822926220488, + "language_loss": 1.07807887, + "learning_rate": 0.0007862188363098669, + "loss": 1.08933413, + "num_input_tokens_seen": 4004528, + "router_z_loss_mlp": 0.26428223, + "step": 53, + "time_per_iteration": 3.1450047492980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126237, + "balance_loss_mlp": 1.10084558, + "epoch": 0.010388611004232396, + "flos": 585594040320.0, + "grad_norm": 0.07974065634267835, + "language_loss": 1.08295977, + "learning_rate": 0.0007899203543304438, + "loss": 1.09422219, + "num_input_tokens_seen": 4078704, + "router_z_loss_mlp": 0.25390625, + "step": 54, + "time_per_iteration": 2.6822280883789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155972, + "balance_loss_mlp": 1.13315582, + "epoch": 0.01058099268949596, + "flos": 502233716736.0, + "grad_norm": 0.07014139109577967, + "language_loss": 1.22212756, + "learning_rate": 0.0007935539507422731, + "loss": 1.23368728, + "num_input_tokens_seen": 4143600, + "router_z_loss_mlp": 0.22814941, + "step": 55, + "time_per_iteration": 2.5841405391693115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117516, + "balance_loss_mlp": 1.153512, + "epoch": 0.010773374374759523, + "flos": 544170659328.0, + "grad_norm": 0.07006342440897594, + "language_loss": 1.13914931, + "learning_rate": 0.0007971220733732573, + "loss": 1.15090084, + "num_input_tokens_seen": 4217904, + "router_z_loss_mlp": 0.21643066, + "step": 56, + "time_per_iteration": 2.697427988052368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193099, + "balance_loss_mlp": 1.17267895, + "epoch": 0.010965756060023086, + "flos": 525874235904.0, + "grad_norm": 0.08125896119424647, + "language_loss": 1.0764755, + "learning_rate": 0.0008006270400641869, + "loss": 1.08840656, + "num_input_tokens_seen": 4293920, + "router_z_loss_mlp": 0.2043457, + "step": 57, + "time_per_iteration": 2.723154306411743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174019, + "balance_loss_mlp": 1.15412247, + "epoch": 0.011158137745286649, + "flos": 576653147136.0, + "grad_norm": 0.07485866075688756, + "language_loss": 1.09104013, + "learning_rate": 0.0008040710477125043, + "loss": 1.10278034, + "num_input_tokens_seen": 4370080, + "router_z_loss_mlp": 0.19897461, + "step": 58, + "time_per_iteration": 2.703186273574829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153983, + "balance_loss_mlp": 1.13440859, + "epoch": 0.011350519430550212, + "flos": 529024310784.0, + "grad_norm": 0.06764829366941465, + "language_loss": 1.09780312, + "learning_rate": 0.0008074561805429771, + "loss": 1.10934305, + "num_input_tokens_seen": 4439792, + "router_z_loss_mlp": 0.19567871, + "step": 59, + "time_per_iteration": 2.6111674308776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136624, + "balance_loss_mlp": 1.11676335, + "epoch": 0.011542901115813775, + "flos": 555608291328.0, + "grad_norm": 0.06986870516034673, + "language_loss": 1.08079648, + "learning_rate": 0.0008107844176832545, + "loss": 1.09216261, + "num_input_tokens_seen": 4510800, + "router_z_loss_mlp": 0.19848633, + "step": 60, + "time_per_iteration": 2.682687997817993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125651, + "balance_loss_mlp": 1.1056236, + "epoch": 0.011735282801077338, + "flos": 571826995200.0, + "grad_norm": 0.061548073586970495, + "language_loss": 1.09071934, + "learning_rate": 0.0008140576401132568, + "loss": 1.10197592, + "num_input_tokens_seen": 4581136, + "router_z_loss_mlp": 0.20019531, + "step": 61, + "time_per_iteration": 2.639394760131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111743, + "balance_loss_mlp": 1.09838021, + "epoch": 0.0119276644863409, + "flos": 615309156864.0, + "grad_norm": 0.06273761556608791, + "language_loss": 1.10558033, + "learning_rate": 0.0008172776370494935, + "loss": 1.11675453, + "num_input_tokens_seen": 4650352, + "router_z_loss_mlp": 0.19030762, + "step": 62, + "time_per_iteration": 2.7110230922698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134294, + "balance_loss_mlp": 1.11483955, + "epoch": 0.012120046171604464, + "flos": 500835474432.0, + "grad_norm": 0.07391589684249159, + "language_loss": 1.17346644, + "learning_rate": 0.0008204461118185703, + "loss": 1.18480933, + "num_input_tokens_seen": 4716336, + "router_z_loss_mlp": 0.19445801, + "step": 63, + "time_per_iteration": 2.5490689277648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142708, + "balance_loss_mlp": 1.12420678, + "epoch": 0.012312427856868027, + "flos": 473109327360.0, + "grad_norm": 0.05825974543220343, + "language_loss": 1.06081367, + "learning_rate": 0.0008235646872681536, + "loss": 1.07224083, + "num_input_tokens_seen": 4781648, + "router_z_loss_mlp": 0.18505859, + "step": 64, + "time_per_iteration": 2.5874247550964355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139504, + "balance_loss_mlp": 1.12069249, + "epoch": 0.012504809542131588, + "flos": 538094651904.0, + "grad_norm": 0.1040066778051144, + "language_loss": 1.06503749, + "learning_rate": 0.0008266349107584288, + "loss": 1.07643247, + "num_input_tokens_seen": 4852320, + "router_z_loss_mlp": 0.18823242, + "step": 65, + "time_per_iteration": 2.678736925125122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123492, + "balance_loss_mlp": 1.10500288, + "epoch": 0.012697191227395151, + "flos": 608450365440.0, + "grad_norm": 0.09066354406474254, + "language_loss": 1.09410381, + "learning_rate": 0.0008296582587724851, + "loss": 1.10533869, + "num_input_tokens_seen": 4922016, + "router_z_loss_mlp": 0.18481445, + "step": 66, + "time_per_iteration": 2.6937255859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121105, + "balance_loss_mlp": 1.10255599, + "epoch": 0.012889572912658714, + "flos": 767750607360.0, + "grad_norm": 0.11790618145169461, + "language_loss": 1.07982886, + "learning_rate": 0.0008326361411800136, + "loss": 1.0910399, + "num_input_tokens_seen": 5000128, + "router_z_loss_mlp": 0.1854248, + "step": 67, + "time_per_iteration": 2.9377663135528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096346, + "balance_loss_mlp": 1.07871521, + "epoch": 0.013081954597922277, + "flos": 533604561408.0, + "grad_norm": 0.09153807632987658, + "language_loss": 1.08335972, + "learning_rate": 0.0008355699051851403, + "loss": 1.09432316, + "num_input_tokens_seen": 5074512, + "router_z_loss_mlp": 0.17651367, + "step": 68, + "time_per_iteration": 2.7278473377227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104023, + "balance_loss_mlp": 1.0865227, + "epoch": 0.01327433628318584, + "flos": 572826567168.0, + "grad_norm": 0.08317322449907456, + "language_loss": 1.14837921, + "learning_rate": 0.0008384608389860635, + "loss": 1.15941942, + "num_input_tokens_seen": 5141856, + "router_z_loss_mlp": 0.1751709, + "step": 69, + "time_per_iteration": 2.7238211631774902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111418, + "balance_loss_mlp": 1.09424019, + "epoch": 0.013466717968449404, + "flos": 497029243392.0, + "grad_norm": 0.08213812906773327, + "language_loss": 1.04970825, + "learning_rate": 0.000841310175171381, + "loss": 1.06082237, + "num_input_tokens_seen": 5209280, + "router_z_loss_mlp": 0.17199707, + "step": 70, + "time_per_iteration": 2.578726291656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101987, + "balance_loss_mlp": 1.08526158, + "epoch": 0.013659099653712967, + "flos": 565234454016.0, + "grad_norm": 0.06358988870017376, + "language_loss": 1.03380442, + "learning_rate": 0.000844119093875517, + "loss": 1.04482436, + "num_input_tokens_seen": 5285424, + "router_z_loss_mlp": 0.16723633, + "step": 71, + "time_per_iteration": 2.692791223526001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103533, + "balance_loss_mlp": 1.08689094, + "epoch": 0.01385148133897653, + "flos": 573540950016.0, + "grad_norm": 0.07461407963015444, + "language_loss": 1.08098376, + "learning_rate": 0.0008468887257134666, + "loss": 1.09201908, + "num_input_tokens_seen": 5358624, + "router_z_loss_mlp": 0.16650391, + "step": 72, + "time_per_iteration": 2.6599459648132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122889, + "balance_loss_mlp": 1.10587776, + "epoch": 0.014043863024240093, + "flos": 576539665920.0, + "grad_norm": 0.05931650266846123, + "language_loss": 1.10316896, + "learning_rate": 0.0008496201545131264, + "loss": 1.11439776, + "num_input_tokens_seen": 5429792, + "router_z_loss_mlp": 0.17028809, + "step": 73, + "time_per_iteration": 2.7093684673309326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126213, + "balance_loss_mlp": 1.10950017, + "epoch": 0.014236244709503656, + "flos": 938287660032.0, + "grad_norm": 0.060718352480344094, + "language_loss": 1.08902812, + "learning_rate": 0.0008523144198617317, + "loss": 1.1002903, + "num_input_tokens_seen": 5518608, + "router_z_loss_mlp": 0.16711426, + "step": 74, + "time_per_iteration": 3.1743276119232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125614, + "balance_loss_mlp": 1.10876918, + "epoch": 0.014428626394767219, + "flos": 528231352320.0, + "grad_norm": 0.07198154728214846, + "language_loss": 1.08249164, + "learning_rate": 0.0008549725194813783, + "loss": 1.09374774, + "num_input_tokens_seen": 5590576, + "router_z_loss_mlp": 0.1685791, + "step": 75, + "time_per_iteration": 2.630387783050537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106727, + "balance_loss_mlp": 1.09047866, + "epoch": 0.014621008080030782, + "flos": 803371433472.0, + "grad_norm": 0.07553700512989577, + "language_loss": 1.06998253, + "learning_rate": 0.0008575954114472099, + "loss": 1.0810498, + "num_input_tokens_seen": 5674224, + "router_z_loss_mlp": 0.16247559, + "step": 76, + "time_per_iteration": 3.134385347366333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109532, + "balance_loss_mlp": 1.0933075, + "epoch": 0.014813389765294343, + "flos": 696588788736.0, + "grad_norm": 0.053440596513601155, + "language_loss": 1.05069363, + "learning_rate": 0.0008601840162606118, + "loss": 1.06178904, + "num_input_tokens_seen": 5757648, + "router_z_loss_mlp": 0.16223145, + "step": 77, + "time_per_iteration": 3.039991855621338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123171, + "balance_loss_mlp": 1.10660076, + "epoch": 0.015005771450557906, + "flos": 596702813184.0, + "grad_norm": 0.07894951514499118, + "language_loss": 1.1143651, + "learning_rate": 0.000862739218788641, + "loss": 1.12559676, + "num_input_tokens_seen": 5837600, + "router_z_loss_mlp": 0.16577148, + "step": 78, + "time_per_iteration": 2.867741346359253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113228, + "balance_loss_mlp": 1.11553121, + "epoch": 0.01519815313582147, + "flos": 549148170240.0, + "grad_norm": 0.0893413961860561, + "language_loss": 1.07743871, + "learning_rate": 0.0008652618700799138, + "loss": 1.08876157, + "num_input_tokens_seen": 5907248, + "router_z_loss_mlp": 0.16760254, + "step": 79, + "time_per_iteration": 2.675795555114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160152, + "balance_loss_mlp": 1.14348662, + "epoch": 0.015390534821085032, + "flos": 430306642944.0, + "grad_norm": 0.06679936706529424, + "language_loss": 1.07125092, + "learning_rate": 0.0008677527890662774, + "loss": 1.08285248, + "num_input_tokens_seen": 5970864, + "router_z_loss_mlp": 0.16662598, + "step": 80, + "time_per_iteration": 2.4765963554382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196819, + "balance_loss_mlp": 1.17889023, + "epoch": 0.015582916506348595, + "flos": 523854743040.0, + "grad_norm": 0.12362960542988827, + "language_loss": 1.09903598, + "learning_rate": 0.0008702127641587799, + "loss": 1.11100423, + "num_input_tokens_seen": 6040800, + "router_z_loss_mlp": 0.17932129, + "step": 81, + "time_per_iteration": 2.636688470840454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180455, + "balance_loss_mlp": 1.16288388, + "epoch": 0.015775298191612157, + "flos": 575151598080.0, + "grad_norm": 0.08274533442322421, + "language_loss": 1.04032063, + "learning_rate": 0.0008726425547457192, + "loss": 1.05212522, + "num_input_tokens_seen": 6111840, + "router_z_loss_mlp": 0.17565918, + "step": 82, + "time_per_iteration": 2.765179395675659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157804, + "balance_loss_mlp": 1.14051914, + "epoch": 0.01596767987687572, + "flos": 610040664576.0, + "grad_norm": 0.07618339381967684, + "language_loss": 1.03921247, + "learning_rate": 0.0008750428925998964, + "loss": 1.05079055, + "num_input_tokens_seen": 6183872, + "router_z_loss_mlp": 0.1730957, + "step": 83, + "time_per_iteration": 2.7615418434143066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159673, + "balance_loss_mlp": 1.14280462, + "epoch": 0.016160061562139283, + "flos": 566864040960.0, + "grad_norm": 0.0706757922791228, + "language_loss": 1.09743476, + "learning_rate": 0.0008774144832015932, + "loss": 1.10903156, + "num_input_tokens_seen": 6255760, + "router_z_loss_mlp": 0.16882324, + "step": 84, + "time_per_iteration": 2.694364070892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01699218, + "balance_loss_mlp": 1.68252861, + "epoch": 0.016352443247402846, + "flos": 1410557234688.0, + "grad_norm": 0.23342967148410274, + "language_loss": 0.74774313, + "learning_rate": 0.0008797580069832641, + "loss": 0.76473522, + "num_input_tokens_seen": 6472960, + "router_z_loss_mlp": 0.16699219, + "step": 85, + "time_per_iteration": 4.599137306213379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212855, + "balance_loss_mlp": 1.19580793, + "epoch": 0.01654482493266641, + "flos": 730177127424.0, + "grad_norm": 0.09253845479208671, + "language_loss": 1.04518116, + "learning_rate": 0.0008820741205014318, + "loss": 1.05730963, + "num_input_tokens_seen": 6548912, + "router_z_loss_mlp": 0.1706543, + "step": 86, + "time_per_iteration": 2.8595266342163086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01246652, + "balance_loss_mlp": 1.22939014, + "epoch": 0.016737206617929972, + "flos": 536016932352.0, + "grad_norm": 0.10044068584300966, + "language_loss": 1.06437612, + "learning_rate": 0.0008843634575408404, + "loss": 1.07684278, + "num_input_tokens_seen": 6621520, + "router_z_loss_mlp": 0.17248535, + "step": 87, + "time_per_iteration": 2.690492630004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215448, + "balance_loss_mlp": 1.19887805, + "epoch": 0.016929588303193535, + "flos": 536706584064.0, + "grad_norm": 0.0661610487366718, + "language_loss": 1.07674646, + "learning_rate": 0.0008866266301555082, + "loss": 1.08890104, + "num_input_tokens_seen": 6698432, + "router_z_loss_mlp": 0.16577148, + "step": 88, + "time_per_iteration": 2.737339496612549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203027, + "balance_loss_mlp": 1.18706512, + "epoch": 0.017121969988457098, + "flos": 526498458624.0, + "grad_norm": 0.07897226836222233, + "language_loss": 1.08543992, + "learning_rate": 0.0008888642296509615, + "loss": 1.09747016, + "num_input_tokens_seen": 6764336, + "router_z_loss_mlp": 0.1595459, + "step": 89, + "time_per_iteration": 2.576819658279419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187655, + "balance_loss_mlp": 1.17131162, + "epoch": 0.01731435167372066, + "flos": 625304876544.0, + "grad_norm": 0.0740353605135553, + "language_loss": 1.13367987, + "learning_rate": 0.0008910768275115906, + "loss": 1.14555645, + "num_input_tokens_seen": 6839392, + "router_z_loss_mlp": 0.16345215, + "step": 90, + "time_per_iteration": 2.778571128845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173459, + "balance_loss_mlp": 1.15692425, + "epoch": 0.017506733358984224, + "flos": 496157709312.0, + "grad_norm": 0.07518713147028631, + "language_loss": 1.08794332, + "learning_rate": 0.0008932649762767675, + "loss": 1.0996778, + "num_input_tokens_seen": 6907344, + "router_z_loss_mlp": 0.16540527, + "step": 91, + "time_per_iteration": 2.5931665897369385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185544, + "balance_loss_mlp": 1.16881919, + "epoch": 0.017699115044247787, + "flos": 745613047296.0, + "grad_norm": 0.07711429280558382, + "language_loss": 1.11576343, + "learning_rate": 0.0008954292103690864, + "loss": 1.12761879, + "num_input_tokens_seen": 6982464, + "router_z_loss_mlp": 0.1673584, + "step": 92, + "time_per_iteration": 2.9129488468170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194769, + "balance_loss_mlp": 1.17854476, + "epoch": 0.01789149672951135, + "flos": 515257265664.0, + "grad_norm": 0.0669718610224715, + "language_loss": 1.1343056, + "learning_rate": 0.0008975700468778296, + "loss": 1.14625335, + "num_input_tokens_seen": 7049712, + "router_z_loss_mlp": 0.16223145, + "step": 93, + "time_per_iteration": 2.576620101928711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216953, + "balance_loss_mlp": 1.20076382, + "epoch": 0.018083878414774913, + "flos": 585850116096.0, + "grad_norm": 0.11698648494194364, + "language_loss": 1.0652318, + "learning_rate": 0.0008996879863005366, + "loss": 1.07740128, + "num_input_tokens_seen": 7120288, + "router_z_loss_mlp": 0.16186523, + "step": 94, + "time_per_iteration": 2.6751108169555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217025, + "balance_loss_mlp": 1.2013253, + "epoch": 0.018276260100038477, + "flos": 497103436800.0, + "grad_norm": 0.08327491501556071, + "language_loss": 1.06208014, + "learning_rate": 0.0009017835132453337, + "loss": 1.07425046, + "num_input_tokens_seen": 7188896, + "router_z_loss_mlp": 0.15686035, + "step": 95, + "time_per_iteration": 2.5971803665161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196463, + "balance_loss_mlp": 1.1804409, + "epoch": 0.01846864178530204, + "flos": 639765955584.0, + "grad_norm": 0.09756000368948786, + "language_loss": 1.06920743, + "learning_rate": 0.0009038570970964896, + "loss": 1.08117199, + "num_input_tokens_seen": 7259536, + "router_z_loss_mlp": 0.16027832, + "step": 96, + "time_per_iteration": 2.7428832054138184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173361, + "balance_loss_mlp": 1.15723228, + "epoch": 0.018661023470565603, + "flos": 511411746816.0, + "grad_norm": 0.07053433913024812, + "language_loss": 1.04343212, + "learning_rate": 0.0009059091926454854, + "loss": 1.05516577, + "num_input_tokens_seen": 7326752, + "router_z_loss_mlp": 0.16125488, + "step": 97, + "time_per_iteration": 2.570509433746338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178324, + "balance_loss_mlp": 1.16246903, + "epoch": 0.018853405155829166, + "flos": 930710103552.0, + "grad_norm": 0.08767892767743933, + "language_loss": 1.03389072, + "learning_rate": 0.0009079402406897198, + "loss": 1.04567385, + "num_input_tokens_seen": 7417488, + "router_z_loss_mlp": 0.15844727, + "step": 98, + "time_per_iteration": 3.202298164367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179075, + "balance_loss_mlp": 1.16296983, + "epoch": 0.01904578684109273, + "flos": 576209396736.0, + "grad_norm": 0.2639136557883628, + "language_loss": 1.0596242, + "learning_rate": 0.0009099506686008212, + "loss": 1.07141495, + "num_input_tokens_seen": 7493136, + "router_z_loss_mlp": 0.16101074, + "step": 99, + "time_per_iteration": 2.812368869781494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139923, + "balance_loss_mlp": 1.12423468, + "epoch": 0.019238168526356292, + "flos": 558173431296.0, + "grad_norm": 0.12311670746354397, + "language_loss": 1.08180976, + "learning_rate": 0.0009119408908644013, + "loss": 1.09320903, + "num_input_tokens_seen": 7560896, + "router_z_loss_mlp": 0.15673828, + "step": 100, + "time_per_iteration": 2.7063775062561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150815, + "balance_loss_mlp": 1.13574743, + "epoch": 0.019430550211619855, + "flos": 723539506176.0, + "grad_norm": 0.12127606313133317, + "language_loss": 1.14121008, + "learning_rate": 0.0009139113095929519, + "loss": 1.15271831, + "num_input_tokens_seen": 7629040, + "router_z_loss_mlp": 0.15039062, + "step": 101, + "time_per_iteration": 2.840913772583008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218173, + "balance_loss_mlp": 1.20243776, + "epoch": 0.019622931896883418, + "flos": 499235000832.0, + "grad_norm": 0.1104247345061639, + "language_loss": 1.0836457, + "learning_rate": 0.0009158623150134762, + "loss": 1.09582746, + "num_input_tokens_seen": 7694256, + "router_z_loss_mlp": 0.15722656, + "step": 102, + "time_per_iteration": 2.560464859008789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01357908, + "balance_loss_mlp": 1.34204173, + "epoch": 0.01981531358214698, + "flos": 508916418048.0, + "grad_norm": 0.15164768975642337, + "language_loss": 1.07661259, + "learning_rate": 0.000917794285931332, + "loss": 1.0901916, + "num_input_tokens_seen": 7762256, + "router_z_loss_mlp": 0.15856934, + "step": 103, + "time_per_iteration": 2.6684353351593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01381196, + "balance_loss_mlp": 1.36572242, + "epoch": 0.020007695267410544, + "flos": 521087371776.0, + "grad_norm": 0.10342928287682196, + "language_loss": 0.9971087, + "learning_rate": 0.0009197075901716639, + "loss": 1.01092052, + "num_input_tokens_seen": 7834400, + "router_z_loss_mlp": 0.15454102, + "step": 104, + "time_per_iteration": 2.7250871658325195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01356986, + "balance_loss_mlp": 1.34017777, + "epoch": 0.020200076952674107, + "flos": 533013834240.0, + "grad_norm": 0.1824265866479698, + "language_loss": 1.09647703, + "learning_rate": 0.0009216025849997171, + "loss": 1.11004686, + "num_input_tokens_seen": 7911184, + "router_z_loss_mlp": 0.16809082, + "step": 105, + "time_per_iteration": 2.776764154434204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01261961, + "balance_loss_mlp": 1.24583197, + "epoch": 0.020392458637937667, + "flos": 684430981632.0, + "grad_norm": 0.06376163654280764, + "language_loss": 1.0425086, + "learning_rate": 0.0009234796175212258, + "loss": 1.05512834, + "num_input_tokens_seen": 7985280, + "router_z_loss_mlp": 0.16125488, + "step": 106, + "time_per_iteration": 2.9174978733062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01269614, + "balance_loss_mlp": 1.25201869, + "epoch": 0.02058484032320123, + "flos": 701791852032.0, + "grad_norm": 0.060044663360548714, + "language_loss": 1.08808422, + "learning_rate": 0.000925339025064007, + "loss": 1.10078037, + "num_input_tokens_seen": 8068320, + "router_z_loss_mlp": 0.17590332, + "step": 107, + "time_per_iteration": 2.975735902786255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01324579, + "balance_loss_mlp": 1.30547023, + "epoch": 0.020777222008464793, + "flos": 638772175872.0, + "grad_norm": 0.12680512225677842, + "language_loss": 1.01262307, + "learning_rate": 0.0009271811355418027, + "loss": 1.02586877, + "num_input_tokens_seen": 8148144, + "router_z_loss_mlp": 0.19128418, + "step": 108, + "time_per_iteration": 2.8408150672912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01306621, + "balance_loss_mlp": 1.28755951, + "epoch": 0.020969603693728356, + "flos": 681785856000.0, + "grad_norm": 0.06997483982989385, + "language_loss": 1.08551693, + "learning_rate": 0.0009290062678013548, + "loss": 1.09858322, + "num_input_tokens_seen": 8222256, + "router_z_loss_mlp": 0.19055176, + "step": 109, + "time_per_iteration": 2.869980812072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01309468, + "balance_loss_mlp": 1.29159832, + "epoch": 0.02116198537899192, + "flos": 533140462080.0, + "grad_norm": 0.13190855435004306, + "language_loss": 1.06647623, + "learning_rate": 0.0009308147319536321, + "loss": 1.07957077, + "num_input_tokens_seen": 8292432, + "router_z_loss_mlp": 0.17895508, + "step": 110, + "time_per_iteration": 2.6270735263824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130688, + "balance_loss_mlp": 1.29067969, + "epoch": 0.021354367064255482, + "flos": 717168135168.0, + "grad_norm": 0.10963649287068344, + "language_loss": 1.1282903, + "learning_rate": 0.0009326068296900676, + "loss": 1.14135909, + "num_input_tokens_seen": 8365024, + "router_z_loss_mlp": 0.1619873, + "step": 111, + "time_per_iteration": 2.8845341205596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01388527, + "balance_loss_mlp": 1.37200487, + "epoch": 0.021546748749519045, + "flos": 519290459136.0, + "grad_norm": 0.12406482447985402, + "language_loss": 1.03902006, + "learning_rate": 0.0009343828545846161, + "loss": 1.05290532, + "num_input_tokens_seen": 8442448, + "router_z_loss_mlp": 0.16516113, + "step": 112, + "time_per_iteration": 2.8167102336883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01548404, + "balance_loss_mlp": 1.53109479, + "epoch": 0.021739130434782608, + "flos": 504912337920.0, + "grad_norm": 0.2528517188051562, + "language_loss": 1.0722419, + "learning_rate": 0.0009361430923823841, + "loss": 1.08772588, + "num_input_tokens_seen": 8508992, + "router_z_loss_mlp": 0.1730957, + "step": 113, + "time_per_iteration": 2.664581060409546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01441472, + "balance_loss_mlp": 1.42576015, + "epoch": 0.02193151212004617, + "flos": 463251820032.0, + "grad_norm": 0.1910881492312462, + "language_loss": 1.11420846, + "learning_rate": 0.0009378878212755459, + "loss": 1.12862325, + "num_input_tokens_seen": 8574048, + "router_z_loss_mlp": 0.15710449, + "step": 114, + "time_per_iteration": 2.4851133823394775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262203, + "balance_loss_mlp": 1.24767148, + "epoch": 0.022123893805309734, + "flos": 552008673792.0, + "grad_norm": 0.09004287588953173, + "language_loss": 1.0099957, + "learning_rate": 0.0009396173121672103, + "loss": 1.0226177, + "num_input_tokens_seen": 8647808, + "router_z_loss_mlp": 0.14538574, + "step": 115, + "time_per_iteration": 2.6535162925720215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215709, + "balance_loss_mlp": 1.20165396, + "epoch": 0.022316275490573297, + "flos": 635920436736.0, + "grad_norm": 0.07849561533847389, + "language_loss": 1.07122314, + "learning_rate": 0.0009413318289238633, + "loss": 1.08338022, + "num_input_tokens_seen": 8719760, + "router_z_loss_mlp": 0.14050293, + "step": 116, + "time_per_iteration": 2.7836899757385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203544, + "balance_loss_mlp": 1.18965602, + "epoch": 0.02250865717583686, + "flos": 798535107072.0, + "grad_norm": 0.07099947506123377, + "language_loss": 0.98912275, + "learning_rate": 0.0009430316286169771, + "loss": 1.00115824, + "num_input_tokens_seen": 8798752, + "router_z_loss_mlp": 0.13891602, + "step": 117, + "time_per_iteration": 3.049468517303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01263206, + "balance_loss_mlp": 1.24786401, + "epoch": 0.022701038861100423, + "flos": 455851763712.0, + "grad_norm": 0.18808502465815918, + "language_loss": 1.04843259, + "learning_rate": 0.0009447169617543361, + "loss": 1.06106472, + "num_input_tokens_seen": 8866848, + "router_z_loss_mlp": 0.15319824, + "step": 118, + "time_per_iteration": 2.5886504650115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121023, + "balance_loss_mlp": 1.19557953, + "epoch": 0.022893420546363986, + "flos": 582812112384.0, + "grad_norm": 0.09179634719817005, + "language_loss": 1.11139297, + "learning_rate": 0.0009463880725016029, + "loss": 1.12349522, + "num_input_tokens_seen": 8935488, + "router_z_loss_mlp": 0.14648438, + "step": 119, + "time_per_iteration": 2.6861259937286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169264, + "balance_loss_mlp": 1.15572226, + "epoch": 0.02308580223162755, + "flos": 561010613760.0, + "grad_norm": 0.09164108943144146, + "language_loss": 1.05675769, + "learning_rate": 0.0009480451988946134, + "loss": 1.06845045, + "num_input_tokens_seen": 9015344, + "router_z_loss_mlp": 0.13549805, + "step": 120, + "time_per_iteration": 2.8075129985809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217336, + "balance_loss_mlp": 1.2034359, + "epoch": 0.023278183916891113, + "flos": 770966111232.0, + "grad_norm": 0.1019945076921087, + "language_loss": 1.07486713, + "learning_rate": 0.0009496885730428627, + "loss": 1.08704054, + "num_input_tokens_seen": 9094672, + "router_z_loss_mlp": 0.13903809, + "step": 121, + "time_per_iteration": 3.0081264972686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291544, + "balance_loss_mlp": 1.27698815, + "epoch": 0.023470565602154676, + "flos": 553111552512.0, + "grad_norm": 0.08478902086488087, + "language_loss": 1.05369067, + "learning_rate": 0.0009513184213246156, + "loss": 1.06660616, + "num_input_tokens_seen": 9160608, + "router_z_loss_mlp": 0.14550781, + "step": 122, + "time_per_iteration": 2.654902696609497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01406128, + "balance_loss_mlp": 1.39054775, + "epoch": 0.02366294728741824, + "flos": 559744791552.0, + "grad_norm": 0.09837859270685317, + "language_loss": 1.09463692, + "learning_rate": 0.0009529349645740552, + "loss": 1.10869825, + "num_input_tokens_seen": 9228704, + "router_z_loss_mlp": 0.15563965, + "step": 123, + "time_per_iteration": 2.6837081909179688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01484693, + "balance_loss_mlp": 1.46961284, + "epoch": 0.0238553289726818, + "flos": 468313698816.0, + "grad_norm": 0.11388616458843728, + "language_loss": 1.07573724, + "learning_rate": 0.0009545384182608524, + "loss": 1.09058416, + "num_input_tokens_seen": 9294288, + "router_z_loss_mlp": 0.1505127, + "step": 124, + "time_per_iteration": 2.5069937705993652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01411359, + "balance_loss_mlp": 1.39688659, + "epoch": 0.024047710657945365, + "flos": 559763730432.0, + "grad_norm": 0.3429043048666504, + "language_loss": 1.05057025, + "learning_rate": 0.0009561289926625252, + "loss": 1.06468379, + "num_input_tokens_seen": 9368048, + "router_z_loss_mlp": 0.14465332, + "step": 125, + "time_per_iteration": 2.6802117824554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011688, + "balance_loss_mlp": 1.15507352, + "epoch": 0.024240092343208928, + "flos": 504528224256.0, + "grad_norm": 0.18048320350440872, + "language_loss": 1.09737623, + "learning_rate": 0.0009577068930299292, + "loss": 1.10906434, + "num_input_tokens_seen": 9434848, + "router_z_loss_mlp": 0.13739014, + "step": 126, + "time_per_iteration": 2.6096670627593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163735, + "balance_loss_mlp": 1.15040147, + "epoch": 0.02443247402847249, + "flos": 435516908544.0, + "grad_norm": 0.07278748671530755, + "language_loss": 1.05931616, + "learning_rate": 0.0009592723197462087, + "loss": 1.07095349, + "num_input_tokens_seen": 9504112, + "router_z_loss_mlp": 0.13360596, + "step": 127, + "time_per_iteration": 2.6409482955932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01248107, + "balance_loss_mlp": 1.23239577, + "epoch": 0.024624855713736054, + "flos": 683445966336.0, + "grad_norm": 0.0813490266373729, + "language_loss": 1.02871299, + "learning_rate": 0.0009608254684795125, + "loss": 1.04119396, + "num_input_tokens_seen": 9590032, + "router_z_loss_mlp": 0.15710449, + "step": 128, + "time_per_iteration": 2.940600872039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01265693, + "balance_loss_mlp": 1.24772859, + "epoch": 0.024817237398999614, + "flos": 524721894912.0, + "grad_norm": 0.0804185451989367, + "language_loss": 1.06161952, + "learning_rate": 0.0009623665303297678, + "loss": 1.07427645, + "num_input_tokens_seen": 9663040, + "router_z_loss_mlp": 0.1796875, + "step": 129, + "time_per_iteration": 2.7088472843170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01256284, + "balance_loss_mlp": 1.23668599, + "epoch": 0.025009619084263177, + "flos": 655350262272.0, + "grad_norm": 0.12369480901617341, + "language_loss": 1.10218048, + "learning_rate": 0.0009638956919697878, + "loss": 1.11474347, + "num_input_tokens_seen": 9736544, + "router_z_loss_mlp": 0.19592285, + "step": 130, + "time_per_iteration": 2.857571840286255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266475, + "balance_loss_mlp": 1.24420691, + "epoch": 0.02520200076952674, + "flos": 454187271168.0, + "grad_norm": 0.08293639348197612, + "language_loss": 1.02638018, + "learning_rate": 0.0009654131357809714, + "loss": 1.03904486, + "num_input_tokens_seen": 9804656, + "router_z_loss_mlp": 0.22253418, + "step": 131, + "time_per_iteration": 2.641470432281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0128644, + "balance_loss_mlp": 1.26142943, + "epoch": 0.025394382454790303, + "flos": 839427397632.0, + "grad_norm": 0.05741461740254168, + "language_loss": 1.11002767, + "learning_rate": 0.0009669190399838441, + "loss": 1.12289214, + "num_input_tokens_seen": 9888864, + "router_z_loss_mlp": 0.25036621, + "step": 132, + "time_per_iteration": 3.133596420288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01302533, + "balance_loss_mlp": 1.27633083, + "epoch": 0.025586764140053866, + "flos": 580725628416.0, + "grad_norm": 0.06987664196058198, + "language_loss": 1.0413487, + "learning_rate": 0.0009684135787636724, + "loss": 1.05437398, + "num_input_tokens_seen": 9968208, + "router_z_loss_mlp": 0.26208496, + "step": 133, + "time_per_iteration": 2.7968075275421143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01325396, + "balance_loss_mlp": 1.29710746, + "epoch": 0.02577914582531743, + "flos": 789893959680.0, + "grad_norm": 0.07551411578012862, + "language_loss": 1.07757604, + "learning_rate": 0.0009698969223913726, + "loss": 1.09083009, + "num_input_tokens_seen": 10049664, + "router_z_loss_mlp": 0.28283691, + "step": 134, + "time_per_iteration": 3.0058987140655518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0131126, + "balance_loss_mlp": 1.28212547, + "epoch": 0.025971527510580992, + "flos": 594683320320.0, + "grad_norm": 0.0731546450398535, + "language_loss": 1.10457921, + "learning_rate": 0.0009713692373399265, + "loss": 1.11769176, + "num_input_tokens_seen": 10120096, + "router_z_loss_mlp": 0.29125977, + "step": 135, + "time_per_iteration": 2.6654229164123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02152319, + "balance_loss_mlp": 1.95700705, + "epoch": 0.026163909195844555, + "flos": 1576771522560.0, + "grad_norm": 0.26755932757436196, + "language_loss": 0.79456228, + "learning_rate": 0.0009728306863964993, + "loss": 0.81608546, + "num_input_tokens_seen": 10348976, + "router_z_loss_mlp": 1.953125, + "step": 136, + "time_per_iteration": 6.531313896179199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01724331, + "balance_loss_mlp": 1.55266988, + "epoch": 0.026356290881108118, + "flos": 1501306030080.0, + "grad_norm": 0.1444935793983717, + "language_loss": 0.77811038, + "learning_rate": 0.0009742814287704512, + "loss": 0.79535371, + "num_input_tokens_seen": 10576512, + "router_z_loss_mlp": 1.71875, + "step": 137, + "time_per_iteration": 4.966995716094971 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01371776, + "balance_loss_mlp": 1.34284425, + "epoch": 0.02654867256637168, + "flos": 596841025536.0, + "grad_norm": 0.06823267419395149, + "language_loss": 1.03539467, + "learning_rate": 0.0009757216201974225, + "loss": 1.04911256, + "num_input_tokens_seen": 10659168, + "router_z_loss_mlp": 0.28918457, + "step": 138, + "time_per_iteration": 2.7901663780212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01396345, + "balance_loss_mlp": 1.36752045, + "epoch": 0.026741054251635244, + "flos": 544761386496.0, + "grad_norm": 0.08904352821745645, + "language_loss": 1.08793342, + "learning_rate": 0.0009771514130396581, + "loss": 1.10189688, + "num_input_tokens_seen": 10731584, + "router_z_loss_mlp": 0.28833008, + "step": 139, + "time_per_iteration": 2.664384603500366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01410566, + "balance_loss_mlp": 1.38171697, + "epoch": 0.026933435936898807, + "flos": 506591387136.0, + "grad_norm": 0.09467843708761726, + "language_loss": 1.08393478, + "learning_rate": 0.00097857095638274, + "loss": 1.09804034, + "num_input_tokens_seen": 10799456, + "router_z_loss_mlp": 0.28833008, + "step": 140, + "time_per_iteration": 2.5600626468658447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01399161, + "balance_loss_mlp": 1.37263703, + "epoch": 0.02712581762216237, + "flos": 740513290752.0, + "grad_norm": 0.06303030428856128, + "language_loss": 0.99670362, + "learning_rate": 0.0009799803961288726, + "loss": 1.01069522, + "num_input_tokens_seen": 10886416, + "router_z_loss_mlp": 0.26538086, + "step": 141, + "time_per_iteration": 2.984253168106079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01354082, + "balance_loss_mlp": 1.33143175, + "epoch": 0.027318199307425933, + "flos": 848023464960.0, + "grad_norm": 0.06264638149228761, + "language_loss": 1.05898559, + "learning_rate": 0.000981379875086876, + "loss": 1.07252645, + "num_input_tokens_seen": 10966064, + "router_z_loss_mlp": 0.22644043, + "step": 142, + "time_per_iteration": 3.032597064971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01323808, + "balance_loss_mlp": 1.30553341, + "epoch": 0.027510580992689496, + "flos": 575288400384.0, + "grad_norm": 0.07028220907739285, + "language_loss": 1.01752293, + "learning_rate": 0.0009827695330590185, + "loss": 1.030761, + "num_input_tokens_seen": 11039712, + "router_z_loss_mlp": 0.18273926, + "step": 143, + "time_per_iteration": 2.626483678817749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01303402, + "balance_loss_mlp": 1.28557992, + "epoch": 0.02770296267795306, + "flos": 772079164416.0, + "grad_norm": 0.05744811954937285, + "language_loss": 1.00619161, + "learning_rate": 0.0009841495069248256, + "loss": 1.0192256, + "num_input_tokens_seen": 11123984, + "router_z_loss_mlp": 0.17822266, + "step": 144, + "time_per_iteration": 2.9495198726654053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01316023, + "balance_loss_mlp": 1.29916632, + "epoch": 0.027895344363216622, + "flos": 569123642880.0, + "grad_norm": 0.04968902291069247, + "language_loss": 0.9920603, + "learning_rate": 0.0009855199307219871, + "loss": 1.00522041, + "num_input_tokens_seen": 11192864, + "router_z_loss_mlp": 0.1685791, + "step": 145, + "time_per_iteration": 2.6407721042633057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130391, + "balance_loss_mlp": 1.28731608, + "epoch": 0.028087726048480186, + "flos": 547099564032.0, + "grad_norm": 0.10723696528856613, + "language_loss": 1.01566505, + "learning_rate": 0.0009868809357244854, + "loss": 1.02870417, + "num_input_tokens_seen": 11261760, + "router_z_loss_mlp": 0.16589355, + "step": 146, + "time_per_iteration": 2.6262452602386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01287507, + "balance_loss_mlp": 1.27153277, + "epoch": 0.02828010773374375, + "flos": 524519663616.0, + "grad_norm": 0.06991830692152445, + "language_loss": 1.05632663, + "learning_rate": 0.0009882326505180556, + "loss": 1.06920183, + "num_input_tokens_seen": 11334736, + "router_z_loss_mlp": 0.15966797, + "step": 147, + "time_per_iteration": 2.6469435691833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01270213, + "balance_loss_mlp": 1.2534517, + "epoch": 0.02847248941900731, + "flos": 772108277760.0, + "grad_norm": 0.07309095407736986, + "language_loss": 1.04486537, + "learning_rate": 0.0009895752010730906, + "loss": 1.0575676, + "num_input_tokens_seen": 11409872, + "router_z_loss_mlp": 0.16748047, + "step": 148, + "time_per_iteration": 2.9457786083221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012724, + "balance_loss_mlp": 1.25667655, + "epoch": 0.028664871104270875, + "flos": 534150208512.0, + "grad_norm": 0.048334696317449924, + "language_loss": 1.10088921, + "learning_rate": 0.0009909087108150867, + "loss": 1.11361325, + "num_input_tokens_seen": 11481024, + "router_z_loss_mlp": 0.15710449, + "step": 149, + "time_per_iteration": 2.712559700012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01309133, + "balance_loss_mlp": 1.29286051, + "epoch": 0.028857252789534438, + "flos": 367557599232.0, + "grad_norm": 0.13115053493636905, + "language_loss": 1.11238122, + "learning_rate": 0.0009922333006927371, + "loss": 1.12547255, + "num_input_tokens_seen": 11544240, + "router_z_loss_mlp": 0.16247559, + "step": 150, + "time_per_iteration": 2.4607067108154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01329212, + "balance_loss_mlp": 1.31257081, + "epoch": 0.029049634474798, + "flos": 515232534528.0, + "grad_norm": 0.06948512606819708, + "language_loss": 1.04613614, + "learning_rate": 0.0009935490892437632, + "loss": 1.05942833, + "num_input_tokens_seen": 11610416, + "router_z_loss_mlp": 0.16650391, + "step": 151, + "time_per_iteration": 2.5460238456726074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01309109, + "balance_loss_mlp": 1.29317045, + "epoch": 0.029242016160061564, + "flos": 587840495616.0, + "grad_norm": 0.11257287432569656, + "language_loss": 1.03097093, + "learning_rate": 0.0009948561926585687, + "loss": 1.04406202, + "num_input_tokens_seen": 11687488, + "router_z_loss_mlp": 0.15930176, + "step": 152, + "time_per_iteration": 2.753009557723999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01300362, + "balance_loss_mlp": 1.28555596, + "epoch": 0.029434397845325123, + "flos": 551816616960.0, + "grad_norm": 0.062223246716750634, + "language_loss": 1.06524086, + "learning_rate": 0.0009961547248418122, + "loss": 1.07824445, + "num_input_tokens_seen": 11754576, + "router_z_loss_mlp": 0.14807129, + "step": 153, + "time_per_iteration": 2.630092144012451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01308008, + "balance_loss_mlp": 1.29357219, + "epoch": 0.029626779530588686, + "flos": 603221160960.0, + "grad_norm": 0.09420536563091944, + "language_loss": 1.03062868, + "learning_rate": 0.0009974447974719707, + "loss": 1.04370856, + "num_input_tokens_seen": 11831360, + "router_z_loss_mlp": 0.14440918, + "step": 154, + "time_per_iteration": 2.6962759494781494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01312448, + "balance_loss_mlp": 1.29745138, + "epoch": 0.02981916121585225, + "flos": 620808993792.0, + "grad_norm": 0.08558703297148447, + "language_loss": 1.04985213, + "learning_rate": 0.0009987265200589763, + "loss": 1.0629766, + "num_input_tokens_seen": 11902192, + "router_z_loss_mlp": 0.15002441, + "step": 155, + "time_per_iteration": 2.7059414386749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295882, + "balance_loss_mlp": 1.28057528, + "epoch": 0.030011542901115813, + "flos": 661322962944.0, + "grad_norm": 0.09731995783752632, + "language_loss": 1.04436159, + "learning_rate": 0.001, + "loss": 1.05732036, + "num_input_tokens_seen": 11979088, + "router_z_loss_mlp": 0.1529541, + "step": 156, + "time_per_iteration": 2.856968641281128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262708, + "balance_loss_mlp": 1.24682927, + "epoch": 0.030203924586379376, + "flos": 651258842112.0, + "grad_norm": 0.05966927829613408, + "language_loss": 1.02520585, + "learning_rate": 0.0009999999029413921, + "loss": 1.03783274, + "num_input_tokens_seen": 12059200, + "router_z_loss_mlp": 0.15856934, + "step": 157, + "time_per_iteration": 2.851480722427368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01268181, + "balance_loss_mlp": 1.25150406, + "epoch": 0.03039630627164294, + "flos": 531083091456.0, + "grad_norm": 0.1034311415514979, + "language_loss": 1.04085183, + "learning_rate": 0.0009999996117656068, + "loss": 1.05353379, + "num_input_tokens_seen": 12134944, + "router_z_loss_mlp": 0.16674805, + "step": 158, + "time_per_iteration": 2.707646369934082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262524, + "balance_loss_mlp": 1.24747968, + "epoch": 0.030588687956906502, + "flos": 585914135040.0, + "grad_norm": 0.12050944658187299, + "language_loss": 0.97824669, + "learning_rate": 0.0009999991264727564, + "loss": 0.99087203, + "num_input_tokens_seen": 12207936, + "router_z_loss_mlp": 0.15039062, + "step": 159, + "time_per_iteration": 2.7575390338897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272116, + "balance_loss_mlp": 1.25716722, + "epoch": 0.030781069642170065, + "flos": 513026777088.0, + "grad_norm": 0.07020206521781955, + "language_loss": 1.08316755, + "learning_rate": 0.0009999984470630296, + "loss": 1.09588861, + "num_input_tokens_seen": 12273200, + "router_z_loss_mlp": 0.14929199, + "step": 160, + "time_per_iteration": 2.62310528755188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0128559, + "balance_loss_mlp": 1.27058172, + "epoch": 0.030973451327433628, + "flos": 717766064640.0, + "grad_norm": 0.06068839125924313, + "language_loss": 0.96528012, + "learning_rate": 0.0009999975735366902, + "loss": 0.978136, + "num_input_tokens_seen": 12359600, + "router_z_loss_mlp": 0.15002441, + "step": 161, + "time_per_iteration": 3.0823376178741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01305752, + "balance_loss_mlp": 1.29055238, + "epoch": 0.03116583301269719, + "flos": 1109312133120.0, + "grad_norm": 0.09428930343360856, + "language_loss": 0.98546314, + "learning_rate": 0.0009999965058940775, + "loss": 0.99852067, + "num_input_tokens_seen": 12443936, + "router_z_loss_mlp": 0.1517334, + "step": 162, + "time_per_iteration": 3.486618995666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01315996, + "balance_loss_mlp": 1.3010118, + "epoch": 0.031358214697960754, + "flos": 450676403712.0, + "grad_norm": 0.09976775191278689, + "language_loss": 1.04580116, + "learning_rate": 0.0009999952441356057, + "loss": 1.05896115, + "num_input_tokens_seen": 12507488, + "router_z_loss_mlp": 0.1496582, + "step": 163, + "time_per_iteration": 2.537173271179199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01300744, + "balance_loss_mlp": 1.28654623, + "epoch": 0.031550596383224314, + "flos": 1254701325312.0, + "grad_norm": 0.0838197011845512, + "language_loss": 1.05903006, + "learning_rate": 0.000999993788261765, + "loss": 1.07203746, + "num_input_tokens_seen": 12594096, + "router_z_loss_mlp": 0.14196777, + "step": 164, + "time_per_iteration": 3.5638957023620605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01270584, + "balance_loss_mlp": 1.25625503, + "epoch": 0.03174297806848788, + "flos": 667841310720.0, + "grad_norm": 0.068717417443618, + "language_loss": 1.0642612, + "learning_rate": 0.00099999213827312, + "loss": 1.076967, + "num_input_tokens_seen": 12669424, + "router_z_loss_mlp": 0.14343262, + "step": 165, + "time_per_iteration": 2.8084213733673096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01255587, + "balance_loss_mlp": 1.24152076, + "epoch": 0.03193535975375144, + "flos": 551033832960.0, + "grad_norm": 0.06892139424853191, + "language_loss": 1.0208962, + "learning_rate": 0.000999990294170312, + "loss": 1.03345203, + "num_input_tokens_seen": 12740080, + "router_z_loss_mlp": 0.14074707, + "step": 166, + "time_per_iteration": 2.6247787475585938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01259954, + "balance_loss_mlp": 1.24549401, + "epoch": 0.032127741439015006, + "flos": 543377700864.0, + "grad_norm": 0.08292396830811857, + "language_loss": 1.05774951, + "learning_rate": 0.0009999882559540566, + "loss": 1.07034898, + "num_input_tokens_seen": 12810576, + "router_z_loss_mlp": 0.14465332, + "step": 167, + "time_per_iteration": 2.654036283493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291491, + "balance_loss_mlp": 1.27790117, + "epoch": 0.032320123124278566, + "flos": 548104928256.0, + "grad_norm": 0.07217909902530589, + "language_loss": 1.02104354, + "learning_rate": 0.000999986023625145, + "loss": 1.03395844, + "num_input_tokens_seen": 12887904, + "router_z_loss_mlp": 0.13598633, + "step": 168, + "time_per_iteration": 2.696866750717163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03738194, + "balance_loss_mlp": 3.61993837, + "epoch": 0.03251250480954213, + "flos": 1305156865536.0, + "grad_norm": 0.563981464368737, + "language_loss": 0.78924417, + "learning_rate": 0.0009999835971844441, + "loss": 0.82662606, + "num_input_tokens_seen": 13107344, + "router_z_loss_mlp": 1.1796875, + "step": 169, + "time_per_iteration": 4.971506834030151 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0134723, + "balance_loss_mlp": 1.33386648, + "epoch": 0.03270488649480569, + "flos": 560866609152.0, + "grad_norm": 0.12141219581883538, + "language_loss": 1.02540469, + "learning_rate": 0.0009999809766328958, + "loss": 1.03887701, + "num_input_tokens_seen": 13175552, + "router_z_loss_mlp": 0.13391113, + "step": 170, + "time_per_iteration": 2.646425724029541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01355192, + "balance_loss_mlp": 1.34039843, + "epoch": 0.03289726818006926, + "flos": 482120031744.0, + "grad_norm": 0.08046017426621577, + "language_loss": 1.05186188, + "learning_rate": 0.0009999781619715177, + "loss": 1.06541371, + "num_input_tokens_seen": 13242384, + "router_z_loss_mlp": 0.14770508, + "step": 171, + "time_per_iteration": 2.535360336303711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01381569, + "balance_loss_mlp": 1.36640596, + "epoch": 0.03308964986533282, + "flos": 674355276288.0, + "grad_norm": 0.08789680193074563, + "language_loss": 1.04250002, + "learning_rate": 0.000999975153201402, + "loss": 1.05631578, + "num_input_tokens_seen": 13316160, + "router_z_loss_mlp": 0.15161133, + "step": 172, + "time_per_iteration": 2.8205513954162598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01433883, + "balance_loss_mlp": 1.41711044, + "epoch": 0.033282031550596385, + "flos": 608937785856.0, + "grad_norm": 0.07610360898370483, + "language_loss": 1.02505267, + "learning_rate": 0.0009999719503237174, + "loss": 1.03939152, + "num_input_tokens_seen": 13387664, + "router_z_loss_mlp": 0.16760254, + "step": 173, + "time_per_iteration": 2.738676071166992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01451195, + "balance_loss_mlp": 1.43315864, + "epoch": 0.033474413235859944, + "flos": 467801547264.0, + "grad_norm": 0.07270846083900323, + "language_loss": 1.111094, + "learning_rate": 0.0009999685533397073, + "loss": 1.12560594, + "num_input_tokens_seen": 13454528, + "router_z_loss_mlp": 0.18029785, + "step": 174, + "time_per_iteration": 2.5556905269622803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01429898, + "balance_loss_mlp": 1.41368508, + "epoch": 0.03366679492112351, + "flos": 579365263872.0, + "grad_norm": 0.09196642879711979, + "language_loss": 1.03199494, + "learning_rate": 0.00099996496225069, + "loss": 1.04629397, + "num_input_tokens_seen": 13522528, + "router_z_loss_mlp": 0.16210938, + "step": 175, + "time_per_iteration": 2.6806485652923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01432234, + "balance_loss_mlp": 1.41513896, + "epoch": 0.03385917660638707, + "flos": 637378315776.0, + "grad_norm": 0.08705990667808558, + "language_loss": 1.05897307, + "learning_rate": 0.0009999611770580604, + "loss": 1.07329535, + "num_input_tokens_seen": 13601120, + "router_z_loss_mlp": 0.17102051, + "step": 176, + "time_per_iteration": 2.830826759338379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01415158, + "balance_loss_mlp": 1.39910054, + "epoch": 0.03405155829165064, + "flos": 441587123712.0, + "grad_norm": 0.08054669051038237, + "language_loss": 1.03868258, + "learning_rate": 0.0009999571977632876, + "loss": 1.05283427, + "num_input_tokens_seen": 13666384, + "router_z_loss_mlp": 0.16052246, + "step": 177, + "time_per_iteration": 2.623309850692749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01463141, + "balance_loss_mlp": 1.44573641, + "epoch": 0.034243939976914196, + "flos": 466097766912.0, + "grad_norm": 0.08089290506220445, + "language_loss": 1.06928194, + "learning_rate": 0.0009999530243679166, + "loss": 1.08391333, + "num_input_tokens_seen": 13733968, + "router_z_loss_mlp": 0.17407227, + "step": 178, + "time_per_iteration": 2.545133113861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01451423, + "balance_loss_mlp": 1.43560433, + "epoch": 0.03443632166217776, + "flos": 778919016960.0, + "grad_norm": 0.08468734735068614, + "language_loss": 1.01505899, + "learning_rate": 0.0009999486568735675, + "loss": 1.0295732, + "num_input_tokens_seen": 13818960, + "router_z_loss_mlp": 0.15808105, + "step": 179, + "time_per_iteration": 3.0384457111358643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01433641, + "balance_loss_mlp": 1.41778612, + "epoch": 0.03462870334744132, + "flos": 1263284246016.0, + "grad_norm": 0.06997324880309466, + "language_loss": 1.01388979, + "learning_rate": 0.0009999440952819362, + "loss": 1.02822614, + "num_input_tokens_seen": 13912448, + "router_z_loss_mlp": 0.15856934, + "step": 180, + "time_per_iteration": 3.6892786026000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01401308, + "balance_loss_mlp": 1.38610911, + "epoch": 0.03482108503270489, + "flos": 606899354112.0, + "grad_norm": 0.057831512038439566, + "language_loss": 1.02027512, + "learning_rate": 0.0009999393395947935, + "loss": 1.03428817, + "num_input_tokens_seen": 13990752, + "router_z_loss_mlp": 0.15185547, + "step": 181, + "time_per_iteration": 2.826353073120117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01381551, + "balance_loss_mlp": 1.36612535, + "epoch": 0.03501346671796845, + "flos": 538010284032.0, + "grad_norm": 0.05913415109875365, + "language_loss": 1.05361927, + "learning_rate": 0.0009999343898139858, + "loss": 1.06743479, + "num_input_tokens_seen": 14058608, + "router_z_loss_mlp": 0.1541748, + "step": 182, + "time_per_iteration": 2.593250036239624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01359754, + "balance_loss_mlp": 1.33988214, + "epoch": 0.035205848403232015, + "flos": 518231250432.0, + "grad_norm": 0.05898920665253376, + "language_loss": 1.04308426, + "learning_rate": 0.0009999292459414348, + "loss": 1.05668187, + "num_input_tokens_seen": 14126656, + "router_z_loss_mlp": 0.19909668, + "step": 183, + "time_per_iteration": 2.565936326980591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01311064, + "balance_loss_mlp": 1.296103, + "epoch": 0.035398230088495575, + "flos": 472134486528.0, + "grad_norm": 0.06373248491183749, + "language_loss": 1.08499169, + "learning_rate": 0.0009999239079791374, + "loss": 1.09810233, + "num_input_tokens_seen": 14195840, + "router_z_loss_mlp": 0.14953613, + "step": 184, + "time_per_iteration": 2.552949905395508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130912, + "balance_loss_mlp": 1.29237127, + "epoch": 0.03559061177375914, + "flos": 511820591616.0, + "grad_norm": 0.056329932736213485, + "language_loss": 1.01337337, + "learning_rate": 0.0009999183759291659, + "loss": 1.02646446, + "num_input_tokens_seen": 14269936, + "router_z_loss_mlp": 0.16748047, + "step": 185, + "time_per_iteration": 2.741727113723755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291511, + "balance_loss_mlp": 1.27575147, + "epoch": 0.0357829934590227, + "flos": 477146903040.0, + "grad_norm": 0.11224085577532149, + "language_loss": 1.03738213, + "learning_rate": 0.0009999126497936682, + "loss": 1.05029726, + "num_input_tokens_seen": 14334848, + "router_z_loss_mlp": 0.1574707, + "step": 186, + "time_per_iteration": 2.4901957511901855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291515, + "balance_loss_mlp": 1.27446783, + "epoch": 0.03597537514428627, + "flos": 644350588416.0, + "grad_norm": 0.06537030709871235, + "language_loss": 1.06735992, + "learning_rate": 0.0009999067295748676, + "loss": 1.08027506, + "num_input_tokens_seen": 14407888, + "router_z_loss_mlp": 0.1706543, + "step": 187, + "time_per_iteration": 2.7923052310943604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01327575, + "balance_loss_mlp": 1.30966997, + "epoch": 0.03616775682954983, + "flos": 580916275200.0, + "grad_norm": 0.06523062893181024, + "language_loss": 1.04418302, + "learning_rate": 0.000999900615275062, + "loss": 1.05745876, + "num_input_tokens_seen": 14479072, + "router_z_loss_mlp": 0.17919922, + "step": 188, + "time_per_iteration": 2.7248637676239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295421, + "balance_loss_mlp": 1.27722955, + "epoch": 0.03636013851481339, + "flos": 382210735104.0, + "grad_norm": 0.08035209765807474, + "language_loss": 1.10347509, + "learning_rate": 0.0009998943068966256, + "loss": 1.11642933, + "num_input_tokens_seen": 14540944, + "router_z_loss_mlp": 0.18188477, + "step": 189, + "time_per_iteration": 2.429497480392456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01279097, + "balance_loss_mlp": 1.26120377, + "epoch": 0.03655252020007695, + "flos": 582954706944.0, + "grad_norm": 0.07380481555246936, + "language_loss": 1.0506779, + "learning_rate": 0.0009998878044420072, + "loss": 1.06346881, + "num_input_tokens_seen": 14611392, + "router_z_loss_mlp": 0.17907715, + "step": 190, + "time_per_iteration": 2.6878626346588135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012863, + "balance_loss_mlp": 1.26773953, + "epoch": 0.03674490188534051, + "flos": 471376433664.0, + "grad_norm": 0.10484442400689244, + "language_loss": 1.01223493, + "learning_rate": 0.0009998811079137318, + "loss": 1.02509785, + "num_input_tokens_seen": 14679776, + "router_z_loss_mlp": 0.18566895, + "step": 191, + "time_per_iteration": 2.561494827270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281775, + "balance_loss_mlp": 1.26645625, + "epoch": 0.03693728357060408, + "flos": 528113488896.0, + "grad_norm": 0.0609431296621874, + "language_loss": 1.01984763, + "learning_rate": 0.0009998742173143987, + "loss": 1.03266537, + "num_input_tokens_seen": 14749712, + "router_z_loss_mlp": 0.1529541, + "step": 192, + "time_per_iteration": 2.59798264503479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01336751, + "balance_loss_mlp": 1.32157528, + "epoch": 0.03712966525586764, + "flos": 798657352704.0, + "grad_norm": 0.10186248006293357, + "language_loss": 1.02005363, + "learning_rate": 0.0009998671326466833, + "loss": 1.03342128, + "num_input_tokens_seen": 14827136, + "router_z_loss_mlp": 0.15185547, + "step": 193, + "time_per_iteration": 2.9510865211486816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01331057, + "balance_loss_mlp": 1.3157624, + "epoch": 0.037322046941131205, + "flos": 829628116992.0, + "grad_norm": 0.06375125184008373, + "language_loss": 1.02914846, + "learning_rate": 0.0009998598539133362, + "loss": 1.04245901, + "num_input_tokens_seen": 14902880, + "router_z_loss_mlp": 0.1529541, + "step": 194, + "time_per_iteration": 2.9981300830841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01337882, + "balance_loss_mlp": 1.3235296, + "epoch": 0.037514428626394765, + "flos": 437460797952.0, + "grad_norm": 0.10181133305516413, + "language_loss": 1.03936744, + "learning_rate": 0.0009998523811171828, + "loss": 1.0527463, + "num_input_tokens_seen": 14967264, + "router_z_loss_mlp": 0.14379883, + "step": 195, + "time_per_iteration": 2.501542568206787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01296215, + "balance_loss_mlp": 1.28125429, + "epoch": 0.03770681031165833, + "flos": 511372459008.0, + "grad_norm": 0.09414845611868274, + "language_loss": 1.04584992, + "learning_rate": 0.0009998447142611248, + "loss": 1.05881214, + "num_input_tokens_seen": 15039104, + "router_z_loss_mlp": 0.14941406, + "step": 196, + "time_per_iteration": 2.6247317790985107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0128144, + "balance_loss_mlp": 1.26702762, + "epoch": 0.03789919199692189, + "flos": 807102061056.0, + "grad_norm": 0.05831249070889761, + "language_loss": 0.97701526, + "learning_rate": 0.0009998368533481387, + "loss": 0.9898296, + "num_input_tokens_seen": 15124864, + "router_z_loss_mlp": 0.14422607, + "step": 197, + "time_per_iteration": 3.01912784576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01294999, + "balance_loss_mlp": 1.27945375, + "epoch": 0.03809157368218546, + "flos": 690274234368.0, + "grad_norm": 0.06656848410147823, + "language_loss": 1.00630498, + "learning_rate": 0.0009998287983812762, + "loss": 1.01925504, + "num_input_tokens_seen": 15199680, + "router_z_loss_mlp": 0.15551758, + "step": 198, + "time_per_iteration": 2.8252804279327393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0135387, + "balance_loss_mlp": 1.33592904, + "epoch": 0.03828395536744902, + "flos": 517675428864.0, + "grad_norm": 0.06988401379713739, + "language_loss": 1.06386423, + "learning_rate": 0.0009998205493636646, + "loss": 1.07740283, + "num_input_tokens_seen": 15270176, + "router_z_loss_mlp": 0.17944336, + "step": 199, + "time_per_iteration": 2.649765729904175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01339461, + "balance_loss_mlp": 1.32242572, + "epoch": 0.038476337052712584, + "flos": 581389138944.0, + "grad_norm": 0.07184113921580974, + "language_loss": 0.9925406, + "learning_rate": 0.0009998121062985063, + "loss": 1.00593519, + "num_input_tokens_seen": 15343168, + "router_z_loss_mlp": 0.17053223, + "step": 200, + "time_per_iteration": 2.6788320541381836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01328142, + "balance_loss_mlp": 1.31167912, + "epoch": 0.03866871873797614, + "flos": 576791359488.0, + "grad_norm": 0.059667024197710104, + "language_loss": 1.01260698, + "learning_rate": 0.0009998034691890794, + "loss": 1.02588844, + "num_input_tokens_seen": 15417328, + "router_z_loss_mlp": 0.16455078, + "step": 201, + "time_per_iteration": 2.753265380859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01297644, + "balance_loss_mlp": 1.28249288, + "epoch": 0.03886110042323971, + "flos": 540472117248.0, + "grad_norm": 0.07302515973387386, + "language_loss": 1.05948424, + "learning_rate": 0.0009997946380387369, + "loss": 1.07246065, + "num_input_tokens_seen": 15489488, + "router_z_loss_mlp": 0.15136719, + "step": 202, + "time_per_iteration": 2.618546485900879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262023, + "balance_loss_mlp": 1.24746776, + "epoch": 0.03905348210850327, + "flos": 717694843392.0, + "grad_norm": 0.0775452329378228, + "language_loss": 1.08266401, + "learning_rate": 0.0009997856128509076, + "loss": 1.09528422, + "num_input_tokens_seen": 15558944, + "router_z_loss_mlp": 0.14550781, + "step": 203, + "time_per_iteration": 2.8284859657287598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267878, + "balance_loss_mlp": 1.25321579, + "epoch": 0.039245863793766836, + "flos": 427268639232.0, + "grad_norm": 0.06664318613050589, + "language_loss": 1.02886617, + "learning_rate": 0.0009997763936290952, + "loss": 1.04154491, + "num_input_tokens_seen": 15625024, + "router_z_loss_mlp": 0.14660645, + "step": 204, + "time_per_iteration": 2.516263246536255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0129892, + "balance_loss_mlp": 1.28264785, + "epoch": 0.039438245479030395, + "flos": 662804163072.0, + "grad_norm": 0.07463685050771204, + "language_loss": 1.0815413, + "learning_rate": 0.0009997669803768789, + "loss": 1.09453046, + "num_input_tokens_seen": 15697120, + "router_z_loss_mlp": 0.16271973, + "step": 205, + "time_per_iteration": 2.7576606273651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291456, + "balance_loss_mlp": 1.27812803, + "epoch": 0.03963062716429396, + "flos": 635063459328.0, + "grad_norm": 0.055878982250893716, + "language_loss": 1.03253651, + "learning_rate": 0.0009997573730979134, + "loss": 1.04545116, + "num_input_tokens_seen": 15768752, + "router_z_loss_mlp": 0.13342285, + "step": 206, + "time_per_iteration": 2.716325521469116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.07279634, + "balance_loss_mlp": 4.65512276, + "epoch": 0.03982300884955752, + "flos": 1417813286400.0, + "grad_norm": 0.533603848118922, + "language_loss": 0.79193199, + "learning_rate": 0.0009997475717959284, + "loss": 0.86472833, + "num_input_tokens_seen": 15980624, + "router_z_loss_mlp": 26.25, + "step": 207, + "time_per_iteration": 4.635821342468262 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0137482, + "balance_loss_mlp": 1.35964513, + "epoch": 0.04001539053482109, + "flos": 688769713152.0, + "grad_norm": 0.1040721574676452, + "language_loss": 1.02094078, + "learning_rate": 0.0009997375764747294, + "loss": 1.03468895, + "num_input_tokens_seen": 16067232, + "router_z_loss_mlp": 0.1517334, + "step": 208, + "time_per_iteration": 2.974442481994629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01415285, + "balance_loss_mlp": 1.40052748, + "epoch": 0.04020777222008465, + "flos": 533363042304.0, + "grad_norm": 0.08111266742266361, + "language_loss": 0.99458027, + "learning_rate": 0.0009997273871381967, + "loss": 1.00873303, + "num_input_tokens_seen": 16139808, + "router_z_loss_mlp": 0.14758301, + "step": 209, + "time_per_iteration": 2.6802144050598145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01466201, + "balance_loss_mlp": 1.44989347, + "epoch": 0.040400153905348214, + "flos": 567661381632.0, + "grad_norm": 0.06875741115436663, + "language_loss": 1.05031717, + "learning_rate": 0.0009997170037902862, + "loss": 1.0649792, + "num_input_tokens_seen": 16210848, + "router_z_loss_mlp": 0.16308594, + "step": 210, + "time_per_iteration": 2.6975836753845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01531768, + "balance_loss_mlp": 1.51399446, + "epoch": 0.040592535590611774, + "flos": 713130559488.0, + "grad_norm": 0.07197690318934227, + "language_loss": 1.07202697, + "learning_rate": 0.0009997064264350292, + "loss": 1.08734465, + "num_input_tokens_seen": 16283984, + "router_z_loss_mlp": 0.17785645, + "step": 211, + "time_per_iteration": 2.836771011352539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01531925, + "balance_loss_mlp": 1.5154984, + "epoch": 0.04078491727587533, + "flos": 577824427008.0, + "grad_norm": 0.09120436996840299, + "language_loss": 1.0146966, + "learning_rate": 0.0009996956550765317, + "loss": 1.03001595, + "num_input_tokens_seen": 16353904, + "router_z_loss_mlp": 0.16430664, + "step": 212, + "time_per_iteration": 2.671292781829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01499293, + "balance_loss_mlp": 1.4817214, + "epoch": 0.0409772989611389, + "flos": 552033404928.0, + "grad_norm": 0.11449485477945152, + "language_loss": 0.96278083, + "learning_rate": 0.0009996846897189762, + "loss": 0.97777379, + "num_input_tokens_seen": 16425488, + "router_z_loss_mlp": 0.17565918, + "step": 213, + "time_per_iteration": 2.6231424808502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.014653, + "balance_loss_mlp": 1.44753814, + "epoch": 0.04116968064640246, + "flos": 555347833344.0, + "grad_norm": 0.09512793115916172, + "language_loss": 1.02356708, + "learning_rate": 0.0009996735303666193, + "loss": 1.03822017, + "num_input_tokens_seen": 16498016, + "router_z_loss_mlp": 0.1776123, + "step": 214, + "time_per_iteration": 2.6930177211761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01477298, + "balance_loss_mlp": 1.46134758, + "epoch": 0.041362062331666026, + "flos": 578204158464.0, + "grad_norm": 0.09141123477091552, + "language_loss": 1.04750729, + "learning_rate": 0.0009996621770237937, + "loss": 1.06228042, + "num_input_tokens_seen": 16573744, + "router_z_loss_mlp": 0.15942383, + "step": 215, + "time_per_iteration": 2.7448923587799072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01578462, + "balance_loss_mlp": 1.56013966, + "epoch": 0.041554444016929586, + "flos": 611130396672.0, + "grad_norm": 0.10233552268903827, + "language_loss": 0.99822551, + "learning_rate": 0.0009996506296949073, + "loss": 1.01401007, + "num_input_tokens_seen": 16655344, + "router_z_loss_mlp": 0.18334961, + "step": 216, + "time_per_iteration": 2.8548526763916016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01609008, + "balance_loss_mlp": 1.59156775, + "epoch": 0.04174682570219315, + "flos": 527857413120.0, + "grad_norm": 0.10522858499680945, + "language_loss": 0.99888742, + "learning_rate": 0.0009996388883844428, + "loss": 1.01497757, + "num_input_tokens_seen": 16726480, + "router_z_loss_mlp": 0.17456055, + "step": 217, + "time_per_iteration": 2.618546724319458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01557164, + "balance_loss_mlp": 1.54124999, + "epoch": 0.04193920738745671, + "flos": 511258977792.0, + "grad_norm": 0.09341741551851517, + "language_loss": 1.03841758, + "learning_rate": 0.0009996269530969588, + "loss": 1.05398929, + "num_input_tokens_seen": 16792112, + "router_z_loss_mlp": 0.15905762, + "step": 218, + "time_per_iteration": 2.6204636096954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01525903, + "balance_loss_mlp": 1.50927377, + "epoch": 0.04213158907272028, + "flos": 571226093568.0, + "grad_norm": 0.09609660813155754, + "language_loss": 1.02944803, + "learning_rate": 0.0009996148238370888, + "loss": 1.04470706, + "num_input_tokens_seen": 16862960, + "router_z_loss_mlp": 0.16625977, + "step": 219, + "time_per_iteration": 2.7071943283081055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0150071, + "balance_loss_mlp": 1.48340106, + "epoch": 0.04232397075798384, + "flos": 963803667456.0, + "grad_norm": 0.05454565212769997, + "language_loss": 0.9941752, + "learning_rate": 0.0009996025006095421, + "loss": 1.00918233, + "num_input_tokens_seen": 16950416, + "router_z_loss_mlp": 0.1730957, + "step": 220, + "time_per_iteration": 3.3006374835968018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.10944285, + "balance_loss_mlp": 6.84272289, + "epoch": 0.042516352443247404, + "flos": 1468814777856.0, + "grad_norm": 0.48497418398004566, + "language_loss": 0.77783144, + "learning_rate": 0.0009995899834191028, + "loss": 0.88727427, + "num_input_tokens_seen": 17180944, + "router_z_loss_mlp": 41.0, + "step": 221, + "time_per_iteration": 5.7136383056640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01601015, + "balance_loss_mlp": 1.58291924, + "epoch": 0.042708734128510964, + "flos": 654419091456.0, + "grad_norm": 0.10763646442297387, + "language_loss": 0.99765503, + "learning_rate": 0.0009995772722706307, + "loss": 1.0136652, + "num_input_tokens_seen": 17257792, + "router_z_loss_mlp": 0.1809082, + "step": 222, + "time_per_iteration": 2.8322792053222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01658843, + "balance_loss_mlp": 1.63811278, + "epoch": 0.04290111581377453, + "flos": 431601578496.0, + "grad_norm": 0.16393394652444138, + "language_loss": 1.13557565, + "learning_rate": 0.0009995643671690604, + "loss": 1.1521641, + "num_input_tokens_seen": 17320288, + "router_z_loss_mlp": 0.20739746, + "step": 223, + "time_per_iteration": 2.470729351043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0163871, + "balance_loss_mlp": 1.61504686, + "epoch": 0.04309349749903809, + "flos": 644379701760.0, + "grad_norm": 0.08733094203359489, + "language_loss": 1.00837708, + "learning_rate": 0.0009995512681194023, + "loss": 1.02476418, + "num_input_tokens_seen": 17396672, + "router_z_loss_mlp": 0.23632812, + "step": 224, + "time_per_iteration": 2.8274452686309814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01615568, + "balance_loss_mlp": 1.58755326, + "epoch": 0.04328587918430166, + "flos": 830861853696.0, + "grad_norm": 0.12001676841435771, + "language_loss": 0.98664522, + "learning_rate": 0.0009995379751267417, + "loss": 1.00280082, + "num_input_tokens_seen": 17488096, + "router_z_loss_mlp": 0.28027344, + "step": 225, + "time_per_iteration": 3.275660991668701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01617416, + "balance_loss_mlp": 1.58639741, + "epoch": 0.043478260869565216, + "flos": 524804852736.0, + "grad_norm": 0.1467276253632499, + "language_loss": 1.0007726, + "learning_rate": 0.0009995244881962398, + "loss": 1.01694679, + "num_input_tokens_seen": 17557632, + "router_z_loss_mlp": 0.30981445, + "step": 226, + "time_per_iteration": 2.6300203800201416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01601732, + "balance_loss_mlp": 1.56787658, + "epoch": 0.04367064255482878, + "flos": 439253328384.0, + "grad_norm": 0.095918638324787, + "language_loss": 1.01389623, + "learning_rate": 0.0009995108073331323, + "loss": 1.02991343, + "num_input_tokens_seen": 17626672, + "router_z_loss_mlp": 0.33862305, + "step": 227, + "time_per_iteration": 2.667628765106201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0158134, + "balance_loss_mlp": 1.5462923, + "epoch": 0.04386302424009234, + "flos": 507109330944.0, + "grad_norm": 0.08564981186298011, + "language_loss": 1.04024279, + "learning_rate": 0.0009994969325427309, + "loss": 1.05605614, + "num_input_tokens_seen": 17698624, + "router_z_loss_mlp": 0.35058594, + "step": 228, + "time_per_iteration": 2.6454501152038574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01558795, + "balance_loss_mlp": 1.52224541, + "epoch": 0.04405540592535591, + "flos": 540432829440.0, + "grad_norm": 0.07744391701114339, + "language_loss": 1.00052619, + "learning_rate": 0.0009994828638304218, + "loss": 1.016114, + "num_input_tokens_seen": 17767760, + "router_z_loss_mlp": 0.36547852, + "step": 229, + "time_per_iteration": 2.6468071937561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01516137, + "balance_loss_mlp": 1.47794271, + "epoch": 0.04424778761061947, + "flos": 446136850944.0, + "grad_norm": 0.08052263902742013, + "language_loss": 1.06763554, + "learning_rate": 0.0009994686012016675, + "loss": 1.08279693, + "num_input_tokens_seen": 17833664, + "router_z_loss_mlp": 0.3815918, + "step": 230, + "time_per_iteration": 2.5467634201049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01483037, + "balance_loss_mlp": 1.44515228, + "epoch": 0.044440169295883035, + "flos": 700383435264.0, + "grad_norm": 0.05918307184238542, + "language_loss": 1.0518043, + "learning_rate": 0.000999454144662005, + "loss": 1.06663465, + "num_input_tokens_seen": 17908880, + "router_z_loss_mlp": 0.37866211, + "step": 231, + "time_per_iteration": 2.8704099655151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01473358, + "balance_loss_mlp": 1.43549716, + "epoch": 0.044632550981146595, + "flos": 588055873536.0, + "grad_norm": 0.08626514264815018, + "language_loss": 0.99676436, + "learning_rate": 0.0009994394942170468, + "loss": 1.01149797, + "num_input_tokens_seen": 17978208, + "router_z_loss_mlp": 0.37866211, + "step": 232, + "time_per_iteration": 2.6578898429870605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01461415, + "balance_loss_mlp": 1.4258194, + "epoch": 0.04482493266641016, + "flos": 554534525952.0, + "grad_norm": 0.07124765242066121, + "language_loss": 0.96965969, + "learning_rate": 0.0009994246498724808, + "loss": 0.98427379, + "num_input_tokens_seen": 18049296, + "router_z_loss_mlp": 0.35620117, + "step": 233, + "time_per_iteration": 2.7764015197753906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01463645, + "balance_loss_mlp": 1.42790616, + "epoch": 0.04501731435167372, + "flos": 722500646400.0, + "grad_norm": 0.07759597622956232, + "language_loss": 0.99069166, + "learning_rate": 0.00099940961163407, + "loss": 1.00532806, + "num_input_tokens_seen": 18123296, + "router_z_loss_mlp": 0.35766602, + "step": 234, + "time_per_iteration": 2.8431143760681152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01454599, + "balance_loss_mlp": 1.42098188, + "epoch": 0.04520969603693728, + "flos": 511539784704.0, + "grad_norm": 0.05931413709293958, + "language_loss": 1.02564597, + "learning_rate": 0.0009993943795076528, + "loss": 1.04019189, + "num_input_tokens_seen": 18192784, + "router_z_loss_mlp": 0.33642578, + "step": 235, + "time_per_iteration": 2.645988702774048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01444244, + "balance_loss_mlp": 1.40936303, + "epoch": 0.04540207772220085, + "flos": 364854246912.0, + "grad_norm": 0.07280953320994132, + "language_loss": 1.04776168, + "learning_rate": 0.0009993789534991427, + "loss": 1.062204, + "num_input_tokens_seen": 18254064, + "router_z_loss_mlp": 0.34912109, + "step": 236, + "time_per_iteration": 2.4084837436676025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01418385, + "balance_loss_mlp": 1.38390946, + "epoch": 0.045594459407464406, + "flos": 522407038464.0, + "grad_norm": 0.060943880380569936, + "language_loss": 0.99500269, + "learning_rate": 0.0009993633336145287, + "loss": 1.00918651, + "num_input_tokens_seen": 18325728, + "router_z_loss_mlp": 0.34472656, + "step": 237, + "time_per_iteration": 2.6044533252716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01406135, + "balance_loss_mlp": 1.3730185, + "epoch": 0.04578684109272797, + "flos": 671442338304.0, + "grad_norm": 0.06747057459653658, + "language_loss": 1.03573179, + "learning_rate": 0.0009993475198598752, + "loss": 1.04979324, + "num_input_tokens_seen": 18408608, + "router_z_loss_mlp": 0.33129883, + "step": 238, + "time_per_iteration": 2.9948084354400635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01383668, + "balance_loss_mlp": 1.35164809, + "epoch": 0.04597922277799153, + "flos": 541387321344.0, + "grad_norm": 0.07135856148897902, + "language_loss": 0.99909985, + "learning_rate": 0.0009993315122413212, + "loss": 1.01293659, + "num_input_tokens_seen": 18471920, + "router_z_loss_mlp": 0.32006836, + "step": 239, + "time_per_iteration": 2.5848827362060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01369111, + "balance_loss_mlp": 1.33773541, + "epoch": 0.0461716044632551, + "flos": 458732616192.0, + "grad_norm": 0.056000088810755834, + "language_loss": 1.0008105, + "learning_rate": 0.0009993153107650818, + "loss": 1.01450157, + "num_input_tokens_seen": 18540496, + "router_z_loss_mlp": 0.31347656, + "step": 240, + "time_per_iteration": 2.5492687225341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01338815, + "balance_loss_mlp": 1.31015706, + "epoch": 0.04636398614851866, + "flos": 455009342976.0, + "grad_norm": 0.06491754001609312, + "language_loss": 0.99534512, + "learning_rate": 0.0009992989154374468, + "loss": 1.00873327, + "num_input_tokens_seen": 18606944, + "router_z_loss_mlp": 0.28662109, + "step": 241, + "time_per_iteration": 2.511237621307373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01294622, + "balance_loss_mlp": 1.26833653, + "epoch": 0.046556367833782225, + "flos": 556558401024.0, + "grad_norm": 0.07592069792168304, + "language_loss": 1.06626534, + "learning_rate": 0.0009992823262647817, + "loss": 1.07921147, + "num_input_tokens_seen": 18679520, + "router_z_loss_mlp": 0.26293945, + "step": 242, + "time_per_iteration": 2.7618701457977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249282, + "balance_loss_mlp": 1.22240043, + "epoch": 0.046748749519045785, + "flos": 592625949696.0, + "grad_norm": 0.0687662987323222, + "language_loss": 1.00893593, + "learning_rate": 0.0009992655432535264, + "loss": 1.0214287, + "num_input_tokens_seen": 18756656, + "router_z_loss_mlp": 0.26879883, + "step": 243, + "time_per_iteration": 2.7471935749053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01255015, + "balance_loss_mlp": 1.23083937, + "epoch": 0.04694113120430935, + "flos": 569596506624.0, + "grad_norm": 0.07373455055845594, + "language_loss": 1.0054853, + "learning_rate": 0.0009992485664101973, + "loss": 1.01803541, + "num_input_tokens_seen": 18829792, + "router_z_loss_mlp": 0.24169922, + "step": 244, + "time_per_iteration": 2.635344982147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295554, + "balance_loss_mlp": 1.27291572, + "epoch": 0.04713351288957291, + "flos": 863401158144.0, + "grad_norm": 0.10584905626659928, + "language_loss": 1.03312445, + "learning_rate": 0.000999231395741385, + "loss": 1.04607987, + "num_input_tokens_seen": 18906864, + "router_z_loss_mlp": 0.22631836, + "step": 245, + "time_per_iteration": 3.093386173248291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0128706, + "balance_loss_mlp": 1.26464868, + "epoch": 0.04732589457483648, + "flos": 536961249792.0, + "grad_norm": 0.08844420521863233, + "language_loss": 1.01371169, + "learning_rate": 0.0009992140312537557, + "loss": 1.02658224, + "num_input_tokens_seen": 18973632, + "router_z_loss_mlp": 0.22412109, + "step": 246, + "time_per_iteration": 2.667579412460327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01256359, + "balance_loss_mlp": 1.23515141, + "epoch": 0.04751827626010004, + "flos": 761566910976.0, + "grad_norm": 0.052835972446563725, + "language_loss": 0.9609164, + "learning_rate": 0.000999196472954051, + "loss": 0.97347999, + "num_input_tokens_seen": 19052944, + "router_z_loss_mlp": 0.2121582, + "step": 247, + "time_per_iteration": 2.9537084102630615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02369813, + "balance_loss_mlp": 2.16687083, + "epoch": 0.0477106579453636, + "flos": 1578961313280.0, + "grad_norm": 0.2151482568863758, + "language_loss": 0.79424852, + "learning_rate": 0.0009991787208490878, + "loss": 0.81794667, + "num_input_tokens_seen": 19286288, + "router_z_loss_mlp": 2.03125, + "step": 248, + "time_per_iteration": 5.758621454238892 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01289622, + "balance_loss_mlp": 1.27137113, + "epoch": 0.04790303963062716, + "flos": 457535195136.0, + "grad_norm": 0.10849969336884063, + "language_loss": 1.03316629, + "learning_rate": 0.0009991607749457578, + "loss": 1.04606247, + "num_input_tokens_seen": 19349296, + "router_z_loss_mlp": 0.18261719, + "step": 249, + "time_per_iteration": 2.5432913303375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01334566, + "balance_loss_mlp": 1.31724536, + "epoch": 0.04809542131589073, + "flos": 782079266304.0, + "grad_norm": 0.08264534697846654, + "language_loss": 1.01180637, + "learning_rate": 0.0009991426352510286, + "loss": 1.02515209, + "num_input_tokens_seen": 19428416, + "router_z_loss_mlp": 0.17321777, + "step": 250, + "time_per_iteration": 3.1542766094207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01351096, + "balance_loss_mlp": 1.33215368, + "epoch": 0.04828780300115429, + "flos": 558995503104.0, + "grad_norm": 0.06435857362074206, + "language_loss": 1.03307557, + "learning_rate": 0.0009991243017719422, + "loss": 1.04658651, + "num_input_tokens_seen": 19498688, + "router_z_loss_mlp": 0.18933105, + "step": 251, + "time_per_iteration": 2.693882942199707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01333217, + "balance_loss_mlp": 1.31485844, + "epoch": 0.048480184686417856, + "flos": 501682277376.0, + "grad_norm": 0.09276508096019526, + "language_loss": 0.97794825, + "learning_rate": 0.0009991057745156165, + "loss": 0.99128038, + "num_input_tokens_seen": 19567568, + "router_z_loss_mlp": 0.18347168, + "step": 252, + "time_per_iteration": 2.628873109817505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01867297, + "balance_loss_mlp": 1.75514495, + "epoch": 0.048672566371681415, + "flos": 1535585430528.0, + "grad_norm": 0.16359674361847032, + "language_loss": 0.81910986, + "learning_rate": 0.0009990870534892446, + "loss": 0.8377828, + "num_input_tokens_seen": 19796368, + "router_z_loss_mlp": 1.125, + "step": 253, + "time_per_iteration": 5.060615062713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01285031, + "balance_loss_mlp": 1.26567185, + "epoch": 0.04886494805694498, + "flos": 537665458176.0, + "grad_norm": 0.07164286827098729, + "language_loss": 1.06546187, + "learning_rate": 0.0009990681387000943, + "loss": 1.07831216, + "num_input_tokens_seen": 19870480, + "router_z_loss_mlp": 0.19384766, + "step": 254, + "time_per_iteration": 2.783367395401001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01275754, + "balance_loss_mlp": 1.25606036, + "epoch": 0.04905732974220854, + "flos": 679841966592.0, + "grad_norm": 0.06618046133348403, + "language_loss": 1.01404011, + "learning_rate": 0.0009990490301555093, + "loss": 1.02679765, + "num_input_tokens_seen": 19956288, + "router_z_loss_mlp": 0.19689941, + "step": 255, + "time_per_iteration": 2.9520761966705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01936632, + "balance_loss_mlp": 1.86796737, + "epoch": 0.04924971142747211, + "flos": 1420408949760.0, + "grad_norm": 0.31562964738653715, + "language_loss": 0.79215157, + "learning_rate": 0.0009990297278629078, + "loss": 0.81151783, + "num_input_tokens_seen": 20180080, + "router_z_loss_mlp": 0.6875, + "step": 256, + "time_per_iteration": 4.825209856033325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01615246, + "balance_loss_mlp": 1.55344784, + "epoch": 0.04944209311273567, + "flos": 1557202074624.0, + "grad_norm": 0.16937574338078817, + "language_loss": 0.79242742, + "learning_rate": 0.000999010231829784, + "loss": 0.80857986, + "num_input_tokens_seen": 20413456, + "router_z_loss_mlp": 0.6171875, + "step": 257, + "time_per_iteration": 4.995501518249512 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0163115, + "balance_loss_mlp": 1.58422887, + "epoch": 0.04963447479799923, + "flos": 1569985514496.0, + "grad_norm": 0.13524925240989144, + "language_loss": 0.69975883, + "learning_rate": 0.0009989905420637066, + "loss": 0.71607035, + "num_input_tokens_seen": 20644736, + "router_z_loss_mlp": 0.46875, + "step": 258, + "time_per_iteration": 4.841471910476685 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272668, + "balance_loss_mlp": 1.24463022, + "epoch": 0.049826856483262794, + "flos": 625063357440.0, + "grad_norm": 0.06365504505971183, + "language_loss": 0.95603192, + "learning_rate": 0.0009989706585723202, + "loss": 0.96875864, + "num_input_tokens_seen": 20719040, + "router_z_loss_mlp": 0.28076172, + "step": 259, + "time_per_iteration": 2.7635786533355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130022, + "balance_loss_mlp": 1.27020288, + "epoch": 0.05001923816852635, + "flos": 503912765952.0, + "grad_norm": 0.062257698278494894, + "language_loss": 1.01846027, + "learning_rate": 0.0009989505813633442, + "loss": 1.03146255, + "num_input_tokens_seen": 20789376, + "router_z_loss_mlp": 0.29980469, + "step": 260, + "time_per_iteration": 2.6451833248138428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0131611, + "balance_loss_mlp": 1.28101516, + "epoch": 0.05021161985378992, + "flos": 587066476032.0, + "grad_norm": 0.06290514068599455, + "language_loss": 1.01911807, + "learning_rate": 0.000998930310444573, + "loss": 1.03227913, + "num_input_tokens_seen": 20857856, + "router_z_loss_mlp": 0.35083008, + "step": 261, + "time_per_iteration": 2.6989662647247314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01324978, + "balance_loss_mlp": 1.2880708, + "epoch": 0.05040400153905348, + "flos": 633029409792.0, + "grad_norm": 0.0625839964239575, + "language_loss": 1.00387836, + "learning_rate": 0.0009989098458238765, + "loss": 1.01712811, + "num_input_tokens_seen": 20931232, + "router_z_loss_mlp": 0.36914062, + "step": 262, + "time_per_iteration": 2.7581043243408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319841, + "balance_loss_mlp": 1.28395867, + "epoch": 0.050596383224317046, + "flos": 553344307200.0, + "grad_norm": 0.06067150197267865, + "language_loss": 0.99905968, + "learning_rate": 0.0009988891875091998, + "loss": 1.01225805, + "num_input_tokens_seen": 21012672, + "router_z_loss_mlp": 0.35913086, + "step": 263, + "time_per_iteration": 2.7601842880249023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0131413, + "balance_loss_mlp": 1.27793837, + "epoch": 0.050788764909580605, + "flos": 549389689344.0, + "grad_norm": 0.07440292928735547, + "language_loss": 0.94277728, + "learning_rate": 0.0009988683355085636, + "loss": 0.95591855, + "num_input_tokens_seen": 21088592, + "router_z_loss_mlp": 0.36206055, + "step": 264, + "time_per_iteration": 2.7262909412384033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01277315, + "balance_loss_mlp": 1.24248254, + "epoch": 0.05098114659484417, + "flos": 604812870144.0, + "grad_norm": 0.06984595792035174, + "language_loss": 1.02861905, + "learning_rate": 0.000998847289830063, + "loss": 1.04139221, + "num_input_tokens_seen": 21169840, + "router_z_loss_mlp": 0.34838867, + "step": 265, + "time_per_iteration": 2.8318397998809814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01256298, + "balance_loss_mlp": 1.22272849, + "epoch": 0.05117352828010773, + "flos": 438317775360.0, + "grad_norm": 0.08677906198544101, + "language_loss": 0.95779377, + "learning_rate": 0.0009988260504818682, + "loss": 0.9703567, + "num_input_tokens_seen": 21236144, + "router_z_loss_mlp": 0.3359375, + "step": 266, + "time_per_iteration": 2.5212388038635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220367, + "balance_loss_mlp": 1.19046903, + "epoch": 0.0513659099653713, + "flos": 504784300032.0, + "grad_norm": 0.09456939977029206, + "language_loss": 1.01958096, + "learning_rate": 0.000998804617472226, + "loss": 1.03178465, + "num_input_tokens_seen": 21304864, + "router_z_loss_mlp": 0.29858398, + "step": 267, + "time_per_iteration": 2.649739980697632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169139, + "balance_loss_mlp": 1.14131606, + "epoch": 0.05155829165063486, + "flos": 695183344128.0, + "grad_norm": 0.07125411147685125, + "language_loss": 0.97574937, + "learning_rate": 0.0009987829908094568, + "loss": 0.98744082, + "num_input_tokens_seen": 21377504, + "router_z_loss_mlp": 0.27856445, + "step": 268, + "time_per_iteration": 2.816098690032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119703, + "balance_loss_mlp": 1.09379935, + "epoch": 0.051750673335898424, + "flos": 1347751830528.0, + "grad_norm": 0.06583247177587333, + "language_loss": 1.04151332, + "learning_rate": 0.0009987611705019569, + "loss": 1.05271029, + "num_input_tokens_seen": 21463840, + "router_z_loss_mlp": 0.25927734, + "step": 269, + "time_per_iteration": 4.478148460388184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103433, + "balance_loss_mlp": 1.08008027, + "epoch": 0.051943055021161984, + "flos": 489362936832.0, + "grad_norm": 0.06787757239342199, + "language_loss": 1.02481639, + "learning_rate": 0.0009987391565581978, + "loss": 1.03585076, + "num_input_tokens_seen": 21531184, + "router_z_loss_mlp": 0.23364258, + "step": 270, + "time_per_iteration": 2.5662009716033936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111004, + "balance_loss_mlp": 1.08859241, + "epoch": 0.05213543670642555, + "flos": 545504882688.0, + "grad_norm": 0.08198896814085149, + "language_loss": 0.9504528, + "learning_rate": 0.000998716948986726, + "loss": 0.96156287, + "num_input_tokens_seen": 21612224, + "router_z_loss_mlp": 0.22424316, + "step": 271, + "time_per_iteration": 2.7815349102020264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158552, + "balance_loss_mlp": 1.13697529, + "epoch": 0.05232781839168911, + "flos": 603285179904.0, + "grad_norm": 0.07646156534985457, + "language_loss": 0.97641921, + "learning_rate": 0.0009986945477961633, + "loss": 0.9880048, + "num_input_tokens_seen": 21681024, + "router_z_loss_mlp": 0.21569824, + "step": 272, + "time_per_iteration": 2.694547414779663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188724, + "balance_loss_mlp": 1.16735017, + "epoch": 0.052520200076952676, + "flos": 538218307584.0, + "grad_norm": 0.07381807258867126, + "language_loss": 1.02498066, + "learning_rate": 0.0009986719529952066, + "loss": 1.03686786, + "num_input_tokens_seen": 21761616, + "router_z_loss_mlp": 0.21386719, + "step": 273, + "time_per_iteration": 2.8339192867279053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121752, + "balance_loss_mlp": 1.19785035, + "epoch": 0.052712581762216236, + "flos": 463148513280.0, + "grad_norm": 0.0738352941440963, + "language_loss": 1.01808548, + "learning_rate": 0.000998649164592628, + "loss": 1.03026068, + "num_input_tokens_seen": 21828416, + "router_z_loss_mlp": 0.19677734, + "step": 274, + "time_per_iteration": 2.60577130317688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236713, + "balance_loss_mlp": 1.21763909, + "epoch": 0.0529049634474798, + "flos": 547749927936.0, + "grad_norm": 0.08134169766286939, + "language_loss": 0.99272913, + "learning_rate": 0.0009986261825972748, + "loss": 1.00509632, + "num_input_tokens_seen": 21901600, + "router_z_loss_mlp": 0.19055176, + "step": 275, + "time_per_iteration": 2.652561664581299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196689, + "balance_loss_mlp": 1.17834246, + "epoch": 0.05309734513274336, + "flos": 617727320064.0, + "grad_norm": 0.09111845604121613, + "language_loss": 1.01860571, + "learning_rate": 0.000998603007018069, + "loss": 1.03057253, + "num_input_tokens_seen": 21979312, + "router_z_loss_mlp": 0.18334961, + "step": 276, + "time_per_iteration": 2.8293774127960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011443, + "balance_loss_mlp": 1.1273365, + "epoch": 0.05328972681800693, + "flos": 605220304896.0, + "grad_norm": 0.07377841396756965, + "language_loss": 0.99345076, + "learning_rate": 0.0009985796378640089, + "loss": 1.00489378, + "num_input_tokens_seen": 22053776, + "router_z_loss_mlp": 0.16955566, + "step": 277, + "time_per_iteration": 2.694716215133667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098185, + "balance_loss_mlp": 1.08067346, + "epoch": 0.05348210850327049, + "flos": 604197411840.0, + "grad_norm": 0.07074934963985437, + "language_loss": 0.99532163, + "learning_rate": 0.0009985560751441665, + "loss": 1.00630355, + "num_input_tokens_seen": 22134304, + "router_z_loss_mlp": 0.1751709, + "step": 278, + "time_per_iteration": 2.798563241958618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095446, + "balance_loss_mlp": 1.07736206, + "epoch": 0.053674490188534055, + "flos": 630480236544.0, + "grad_norm": 0.054749659326078955, + "language_loss": 1.01733184, + "learning_rate": 0.00099853231886769, + "loss": 1.02828622, + "num_input_tokens_seen": 22212896, + "router_z_loss_mlp": 0.1809082, + "step": 279, + "time_per_iteration": 2.780940532684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134885, + "balance_loss_mlp": 1.11744475, + "epoch": 0.053866871873797614, + "flos": 478939433472.0, + "grad_norm": 0.06375435082524677, + "language_loss": 1.01461124, + "learning_rate": 0.0009985083690438024, + "loss": 1.02595997, + "num_input_tokens_seen": 22287216, + "router_z_loss_mlp": 0.17443848, + "step": 280, + "time_per_iteration": 2.68762469291687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145965, + "balance_loss_mlp": 1.12913251, + "epoch": 0.054059253559061174, + "flos": 787673645568.0, + "grad_norm": 0.07384801764192533, + "language_loss": 0.92380941, + "learning_rate": 0.0009984842256818016, + "loss": 0.93526906, + "num_input_tokens_seen": 22370864, + "router_z_loss_mlp": 0.16845703, + "step": 281, + "time_per_iteration": 3.054032325744629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114791, + "balance_loss_mlp": 1.13080359, + "epoch": 0.05425163524432474, + "flos": 628076630016.0, + "grad_norm": 0.082175996598207, + "language_loss": 1.0314945, + "learning_rate": 0.0009984598887910613, + "loss": 1.04297376, + "num_input_tokens_seen": 22440080, + "router_z_loss_mlp": 0.17114258, + "step": 282, + "time_per_iteration": 2.7095611095428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144379, + "balance_loss_mlp": 1.12627149, + "epoch": 0.0544440169295883, + "flos": 615453161472.0, + "grad_norm": 0.06813866095032944, + "language_loss": 0.9902432, + "learning_rate": 0.0009984353583810297, + "loss": 1.00168693, + "num_input_tokens_seen": 22517936, + "router_z_loss_mlp": 0.18103027, + "step": 283, + "time_per_iteration": 2.804438829421997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124262, + "balance_loss_mlp": 1.10624945, + "epoch": 0.05463639861485187, + "flos": 647471549952.0, + "grad_norm": 0.10003204141391345, + "language_loss": 1.01340103, + "learning_rate": 0.0009984106344612302, + "loss": 1.02464366, + "num_input_tokens_seen": 22590480, + "router_z_loss_mlp": 0.18017578, + "step": 284, + "time_per_iteration": 2.7521376609802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109552, + "balance_loss_mlp": 1.07819879, + "epoch": 0.054828780300115426, + "flos": 796845883392.0, + "grad_norm": 0.07143310654982075, + "language_loss": 0.96421391, + "learning_rate": 0.0009983857170412615, + "loss": 0.97516906, + "num_input_tokens_seen": 22668144, + "router_z_loss_mlp": 0.17321777, + "step": 285, + "time_per_iteration": 2.9796621799468994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089942, + "balance_loss_mlp": 1.07363439, + "epoch": 0.05502116198537899, + "flos": 549414420480.0, + "grad_norm": 0.05224422052371224, + "language_loss": 0.95713383, + "learning_rate": 0.000998360606130798, + "loss": 0.96803325, + "num_input_tokens_seen": 22749648, + "router_z_loss_mlp": 0.16308594, + "step": 286, + "time_per_iteration": 2.7950801849365234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02908189, + "balance_loss_mlp": 2.83799911, + "epoch": 0.05521354367064255, + "flos": 1406967791616.0, + "grad_norm": 0.233188183772104, + "language_loss": 0.69073117, + "learning_rate": 0.0009983353017395877, + "loss": 0.71981305, + "num_input_tokens_seen": 22982752, + "router_z_loss_mlp": 0.703125, + "step": 287, + "time_per_iteration": 4.876653432846069 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179574, + "balance_loss_mlp": 1.1627655, + "epoch": 0.05540592535590612, + "flos": 645123197952.0, + "grad_norm": 0.17417830683261867, + "language_loss": 1.0204829, + "learning_rate": 0.0009983098038774552, + "loss": 1.03227878, + "num_input_tokens_seen": 23053584, + "router_z_loss_mlp": 0.16821289, + "step": 288, + "time_per_iteration": 2.7781550884246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02101154, + "balance_loss_mlp": 2.07540464, + "epoch": 0.05559830704116968, + "flos": 1510293413376.0, + "grad_norm": 0.1730100464590254, + "language_loss": 0.78170228, + "learning_rate": 0.0009982841125542993, + "loss": 0.80271375, + "num_input_tokens_seen": 23280256, + "router_z_loss_mlp": 0.2578125, + "step": 289, + "time_per_iteration": 4.801970481872559 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01338926, + "balance_loss_mlp": 1.32332134, + "epoch": 0.055790688726433245, + "flos": 508078379520.0, + "grad_norm": 0.11288123874753296, + "language_loss": 0.99586821, + "learning_rate": 0.0009982582277800948, + "loss": 1.00925756, + "num_input_tokens_seen": 23345760, + "router_z_loss_mlp": 0.15588379, + "step": 290, + "time_per_iteration": 2.6019012928009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01376714, + "balance_loss_mlp": 1.36076403, + "epoch": 0.055983070411696804, + "flos": 657570576384.0, + "grad_norm": 0.11158393407579077, + "language_loss": 1.06464982, + "learning_rate": 0.0009982321495648908, + "loss": 1.07841706, + "num_input_tokens_seen": 23420720, + "router_z_loss_mlp": 0.15942383, + "step": 291, + "time_per_iteration": 2.7833075523376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281101, + "balance_loss_mlp": 1.26441216, + "epoch": 0.05617545209696037, + "flos": 587051919360.0, + "grad_norm": 0.091490024999748, + "language_loss": 0.97375935, + "learning_rate": 0.0009982058779188115, + "loss": 0.98657036, + "num_input_tokens_seen": 23492576, + "router_z_loss_mlp": 0.16699219, + "step": 292, + "time_per_iteration": 2.700998067855835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223751, + "balance_loss_mlp": 1.20634639, + "epoch": 0.05636783378222393, + "flos": 611331217920.0, + "grad_norm": 0.09093545163733599, + "language_loss": 1.05090272, + "learning_rate": 0.0009981794128520567, + "loss": 1.06314015, + "num_input_tokens_seen": 23569824, + "router_z_loss_mlp": 0.17431641, + "step": 293, + "time_per_iteration": 2.769562244415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172918, + "balance_loss_mlp": 1.15501237, + "epoch": 0.0565602154674875, + "flos": 667847102976.0, + "grad_norm": 0.08200667246549262, + "language_loss": 1.02219713, + "learning_rate": 0.000998152754374901, + "loss": 1.03392649, + "num_input_tokens_seen": 23649984, + "router_z_loss_mlp": 0.17919922, + "step": 294, + "time_per_iteration": 2.8421483039855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140496, + "balance_loss_mlp": 1.12121987, + "epoch": 0.05675259715275106, + "flos": 616963474944.0, + "grad_norm": 0.06298459153201627, + "language_loss": 0.97706711, + "learning_rate": 0.0009981259024976943, + "loss": 0.98847204, + "num_input_tokens_seen": 23722032, + "router_z_loss_mlp": 0.19250488, + "step": 295, + "time_per_iteration": 2.709536552429199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131247, + "balance_loss_mlp": 1.11139894, + "epoch": 0.05694497883801462, + "flos": 751424214528.0, + "grad_norm": 0.13011693222478776, + "language_loss": 0.96307456, + "learning_rate": 0.0009980988572308612, + "loss": 0.97438705, + "num_input_tokens_seen": 23797376, + "router_z_loss_mlp": 0.19848633, + "step": 296, + "time_per_iteration": 2.9606993198394775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125313, + "balance_loss_mlp": 1.10492802, + "epoch": 0.05713736052327818, + "flos": 711669708288.0, + "grad_norm": 0.06808560063607492, + "language_loss": 0.9959082, + "learning_rate": 0.0009980716185849015, + "loss": 1.00716126, + "num_input_tokens_seen": 23880496, + "router_z_loss_mlp": 0.20385742, + "step": 297, + "time_per_iteration": 2.952467203140259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133548, + "balance_loss_mlp": 1.11424804, + "epoch": 0.05732974220854175, + "flos": 468737100288.0, + "grad_norm": 0.05570922928007862, + "language_loss": 0.95103967, + "learning_rate": 0.0009980441865703904, + "loss": 0.9623751, + "num_input_tokens_seen": 23950016, + "router_z_loss_mlp": 0.19299316, + "step": 298, + "time_per_iteration": 2.6629996299743652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125947, + "balance_loss_mlp": 1.10630131, + "epoch": 0.05752212389380531, + "flos": 601143441408.0, + "grad_norm": 0.06175770353433084, + "language_loss": 1.038656, + "learning_rate": 0.000998016561197978, + "loss": 1.04991555, + "num_input_tokens_seen": 24020064, + "router_z_loss_mlp": 0.19628906, + "step": 299, + "time_per_iteration": 2.7027034759521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122899, + "balance_loss_mlp": 1.10499382, + "epoch": 0.057714505579068875, + "flos": 678344799744.0, + "grad_norm": 0.07709513760197055, + "language_loss": 0.95715761, + "learning_rate": 0.0009979887424783895, + "loss": 0.96838653, + "num_input_tokens_seen": 24095360, + "router_z_loss_mlp": 0.17907715, + "step": 300, + "time_per_iteration": 2.8467562198638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122592, + "balance_loss_mlp": 1.10369694, + "epoch": 0.057906887264332435, + "flos": 595604316672.0, + "grad_norm": 0.05754387138467597, + "language_loss": 0.94804943, + "learning_rate": 0.0009979607304224248, + "loss": 0.95927536, + "num_input_tokens_seen": 24164608, + "router_z_loss_mlp": 0.18908691, + "step": 301, + "time_per_iteration": 2.7457566261291504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135958, + "balance_loss_mlp": 1.11577594, + "epoch": 0.058099268949596, + "flos": 551855904768.0, + "grad_norm": 0.06951393564289957, + "language_loss": 1.02452385, + "learning_rate": 0.000997932525040959, + "loss": 1.03588343, + "num_input_tokens_seen": 24233840, + "router_z_loss_mlp": 0.20166016, + "step": 302, + "time_per_iteration": 2.670464038848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123814, + "balance_loss_mlp": 1.10513425, + "epoch": 0.05829165063485956, + "flos": 507906671616.0, + "grad_norm": 0.06408930588753382, + "language_loss": 1.04041958, + "learning_rate": 0.000997904126344943, + "loss": 1.05165768, + "num_input_tokens_seen": 24302928, + "router_z_loss_mlp": 0.18676758, + "step": 303, + "time_per_iteration": 2.654275417327881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122557, + "balance_loss_mlp": 1.10432982, + "epoch": 0.05848403232012313, + "flos": 614949774336.0, + "grad_norm": 0.10902949066110783, + "language_loss": 1.00108004, + "learning_rate": 0.0009978755343454018, + "loss": 1.0123055, + "num_input_tokens_seen": 24377024, + "router_z_loss_mlp": 0.18212891, + "step": 304, + "time_per_iteration": 2.7061922550201416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118016, + "balance_loss_mlp": 1.10034943, + "epoch": 0.05867641400538669, + "flos": 499835902464.0, + "grad_norm": 0.07196511907519268, + "language_loss": 1.01183403, + "learning_rate": 0.0009978467490534355, + "loss": 1.02301419, + "num_input_tokens_seen": 24442736, + "router_z_loss_mlp": 0.17663574, + "step": 305, + "time_per_iteration": 2.5658843517303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118418, + "balance_loss_mlp": 1.09971452, + "epoch": 0.05886879569065025, + "flos": 531019072512.0, + "grad_norm": 0.05577021807863236, + "language_loss": 0.98775607, + "learning_rate": 0.00099781777048022, + "loss": 0.99894023, + "num_input_tokens_seen": 24514800, + "router_z_loss_mlp": 0.18713379, + "step": 306, + "time_per_iteration": 2.688661813735962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112614, + "balance_loss_mlp": 1.10866416, + "epoch": 0.05906117737591381, + "flos": 488811497472.0, + "grad_norm": 0.06489613907432343, + "language_loss": 0.99682212, + "learning_rate": 0.0009977885986370057, + "loss": 1.00808358, + "num_input_tokens_seen": 24581648, + "router_z_loss_mlp": 0.17480469, + "step": 307, + "time_per_iteration": 2.527008056640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129188, + "balance_loss_mlp": 1.11242771, + "epoch": 0.05925355906117737, + "flos": 591213150720.0, + "grad_norm": 0.060579194597163814, + "language_loss": 0.94911426, + "learning_rate": 0.000997759233535118, + "loss": 0.96040612, + "num_input_tokens_seen": 24658864, + "router_z_loss_mlp": 0.16772461, + "step": 308, + "time_per_iteration": 2.768683433532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108276, + "balance_loss_mlp": 1.09052539, + "epoch": 0.05944594074644094, + "flos": 563373522432.0, + "grad_norm": 0.074144120767366, + "language_loss": 1.01706028, + "learning_rate": 0.0009977296751859576, + "loss": 1.02814317, + "num_input_tokens_seen": 24735808, + "router_z_loss_mlp": 0.17749023, + "step": 309, + "time_per_iteration": 2.710550308227539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109964, + "balance_loss_mlp": 1.0817585, + "epoch": 0.0596383224317045, + "flos": 538483147776.0, + "grad_norm": 0.1012520362466171, + "language_loss": 1.03562367, + "learning_rate": 0.0009976999236009998, + "loss": 1.04662001, + "num_input_tokens_seen": 24807744, + "router_z_loss_mlp": 0.17895508, + "step": 310, + "time_per_iteration": 2.7346065044403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095396, + "balance_loss_mlp": 1.07697809, + "epoch": 0.059830704116968066, + "flos": 560684726784.0, + "grad_norm": 0.05903807060939984, + "language_loss": 1.05193245, + "learning_rate": 0.0009976699787917955, + "loss": 1.06288636, + "num_input_tokens_seen": 24876640, + "router_z_loss_mlp": 0.18408203, + "step": 311, + "time_per_iteration": 2.737165689468384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04018029, + "balance_loss_mlp": 3.94440532, + "epoch": 0.060023085802231625, + "flos": 1569759962112.0, + "grad_norm": 0.34396821433057967, + "language_loss": 0.73442996, + "learning_rate": 0.00099763984076997, + "loss": 0.77461016, + "num_input_tokens_seen": 25110864, + "router_z_loss_mlp": 0.734375, + "step": 312, + "time_per_iteration": 4.990010976791382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130575, + "balance_loss_mlp": 1.11010623, + "epoch": 0.06021546748749519, + "flos": 482415395328.0, + "grad_norm": 0.18656347991450223, + "language_loss": 0.97164261, + "learning_rate": 0.0009976095095472243, + "loss": 0.98294836, + "num_input_tokens_seen": 25179328, + "router_z_loss_mlp": 0.20458984, + "step": 313, + "time_per_iteration": 2.5596373081207275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143867, + "balance_loss_mlp": 1.12198031, + "epoch": 0.06040784917275875, + "flos": 619889407488.0, + "grad_norm": 0.10017394493353984, + "language_loss": 0.98154747, + "learning_rate": 0.0009975789851353334, + "loss": 0.9929862, + "num_input_tokens_seen": 25254128, + "router_z_loss_mlp": 0.21911621, + "step": 314, + "time_per_iteration": 2.783092498779297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113993, + "balance_loss_mlp": 1.11832976, + "epoch": 0.06060023085802232, + "flos": 483292721664.0, + "grad_norm": 0.12837029886330253, + "language_loss": 1.00706339, + "learning_rate": 0.0009975482675461487, + "loss": 1.01846266, + "num_input_tokens_seen": 25324624, + "router_z_loss_mlp": 0.21594238, + "step": 315, + "time_per_iteration": 2.6375765800476074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128184, + "balance_loss_mlp": 1.10697675, + "epoch": 0.06079261254328588, + "flos": 581620483584.0, + "grad_norm": 0.07139597701291463, + "language_loss": 0.9800331, + "learning_rate": 0.0009975173567915952, + "loss": 0.99131489, + "num_input_tokens_seen": 25393648, + "router_z_loss_mlp": 0.21228027, + "step": 316, + "time_per_iteration": 2.680223226547241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116284, + "balance_loss_mlp": 1.09438515, + "epoch": 0.060984994228549444, + "flos": 687492306432.0, + "grad_norm": 0.12898022133672052, + "language_loss": 0.92624593, + "learning_rate": 0.000997486252883674, + "loss": 0.93740869, + "num_input_tokens_seen": 25469152, + "router_z_loss_mlp": 0.21887207, + "step": 317, + "time_per_iteration": 2.835162878036499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104615, + "balance_loss_mlp": 1.08325243, + "epoch": 0.061177375913813004, + "flos": 1314284327424.0, + "grad_norm": 0.06442728945451602, + "language_loss": 0.97186124, + "learning_rate": 0.0009974549558344602, + "loss": 0.98290741, + "num_input_tokens_seen": 25560944, + "router_z_loss_mlp": 0.21350098, + "step": 318, + "time_per_iteration": 3.6293551921844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105129, + "balance_loss_mlp": 1.08439815, + "epoch": 0.06136975759907657, + "flos": 574072040448.0, + "grad_norm": 0.08131052095693254, + "language_loss": 1.07145, + "learning_rate": 0.000997423465656105, + "loss": 1.08250129, + "num_input_tokens_seen": 25631424, + "router_z_loss_mlp": 0.20715332, + "step": 319, + "time_per_iteration": 2.7070071697235107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101786, + "balance_loss_mlp": 1.08168781, + "epoch": 0.06156213928434013, + "flos": 527281242624.0, + "grad_norm": 0.059301156484267634, + "language_loss": 1.04424822, + "learning_rate": 0.0009973917823608335, + "loss": 1.0552659, + "num_input_tokens_seen": 25698176, + "router_z_loss_mlp": 0.20092773, + "step": 320, + "time_per_iteration": 2.6225128173828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110531, + "balance_loss_mlp": 1.0897882, + "epoch": 0.061754520969603696, + "flos": 495238123008.0, + "grad_norm": 0.05387649814829365, + "language_loss": 0.98383266, + "learning_rate": 0.0009973599059609462, + "loss": 0.9949379, + "num_input_tokens_seen": 25773472, + "router_z_loss_mlp": 0.20739746, + "step": 321, + "time_per_iteration": 2.692152261734009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107693, + "balance_loss_mlp": 1.08798778, + "epoch": 0.061946902654867256, + "flos": 439839673344.0, + "grad_norm": 0.06112812680296507, + "language_loss": 0.9711749, + "learning_rate": 0.000997327836468819, + "loss": 0.98225188, + "num_input_tokens_seen": 25841088, + "router_z_loss_mlp": 0.19702148, + "step": 322, + "time_per_iteration": 2.5772383213043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110285, + "balance_loss_mlp": 1.0900557, + "epoch": 0.06213928434013082, + "flos": 598490961408.0, + "grad_norm": 0.0645434874295678, + "language_loss": 0.9942351, + "learning_rate": 0.000997295573896902, + "loss": 1.00533807, + "num_input_tokens_seen": 25919424, + "router_z_loss_mlp": 0.20239258, + "step": 323, + "time_per_iteration": 2.839282274246216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02259253, + "balance_loss_mlp": 2.20088792, + "epoch": 0.06233166602539438, + "flos": 1449393716736.0, + "grad_norm": 0.19547826226404627, + "language_loss": 0.8119604, + "learning_rate": 0.000997263118257721, + "loss": 0.83455294, + "num_input_tokens_seen": 26135504, + "router_z_loss_mlp": 0.58203125, + "step": 324, + "time_per_iteration": 4.67440938949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01888161, + "balance_loss_mlp": 1.83246601, + "epoch": 0.06252404771065795, + "flos": 1462504453632.0, + "grad_norm": 0.11962022052509429, + "language_loss": 0.78571939, + "learning_rate": 0.0009972304695638763, + "loss": 0.80460101, + "num_input_tokens_seen": 26358880, + "router_z_loss_mlp": 0.55859375, + "step": 325, + "time_per_iteration": 4.860283136367798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177486, + "balance_loss_mlp": 1.15595722, + "epoch": 0.06271642939592151, + "flos": 464059335168.0, + "grad_norm": 0.06272096910143152, + "language_loss": 0.93621421, + "learning_rate": 0.000997197627828043, + "loss": 0.94798911, + "num_input_tokens_seen": 26425888, + "router_z_loss_mlp": 0.2154541, + "step": 326, + "time_per_iteration": 2.5594961643218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205877, + "balance_loss_mlp": 1.18165386, + "epoch": 0.06290881108118507, + "flos": 532111776768.0, + "grad_norm": 0.08849931028565244, + "language_loss": 0.89414704, + "learning_rate": 0.0009971645930629716, + "loss": 0.90620589, + "num_input_tokens_seen": 26500656, + "router_z_loss_mlp": 0.2421875, + "step": 327, + "time_per_iteration": 2.7163310050964355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223238, + "balance_loss_mlp": 1.19748878, + "epoch": 0.06310119276644863, + "flos": 673262572032.0, + "grad_norm": 0.09892100413683627, + "language_loss": 1.02883804, + "learning_rate": 0.0009971313652814872, + "loss": 1.04107046, + "num_input_tokens_seen": 26577408, + "router_z_loss_mlp": 0.25769043, + "step": 328, + "time_per_iteration": 2.7786266803741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228803, + "balance_loss_mlp": 1.20175433, + "epoch": 0.0632935744517122, + "flos": 770404497408.0, + "grad_norm": 0.06852265531332852, + "language_loss": 0.99799907, + "learning_rate": 0.0009970979444964903, + "loss": 1.01028717, + "num_input_tokens_seen": 26652048, + "router_z_loss_mlp": 0.27050781, + "step": 329, + "time_per_iteration": 2.952498197555542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235649, + "balance_loss_mlp": 1.2062993, + "epoch": 0.06348595613697576, + "flos": 561649393152.0, + "grad_norm": 0.09680127661829774, + "language_loss": 1.0121367, + "learning_rate": 0.0009970643307209556, + "loss": 1.02449322, + "num_input_tokens_seen": 26728192, + "router_z_loss_mlp": 0.29296875, + "step": 330, + "time_per_iteration": 2.78190541267395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240935, + "balance_loss_mlp": 1.20970178, + "epoch": 0.06367833782223932, + "flos": 675891730944.0, + "grad_norm": 0.08786526055569537, + "language_loss": 0.9788332, + "learning_rate": 0.0009970305239679334, + "loss": 0.99124253, + "num_input_tokens_seen": 26798016, + "router_z_loss_mlp": 0.31201172, + "step": 331, + "time_per_iteration": 2.805845022201538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228576, + "balance_loss_mlp": 1.19891691, + "epoch": 0.06387071950750288, + "flos": 495035891712.0, + "grad_norm": 0.10390832636325384, + "language_loss": 1.03124022, + "learning_rate": 0.0009969965242505483, + "loss": 1.04352593, + "num_input_tokens_seen": 26867536, + "router_z_loss_mlp": 0.29614258, + "step": 332, + "time_per_iteration": 2.676711082458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207199, + "balance_loss_mlp": 1.1777302, + "epoch": 0.06406310119276645, + "flos": 533170985472.0, + "grad_norm": 0.07105898063788767, + "language_loss": 0.98331362, + "learning_rate": 0.0009969623315820007, + "loss": 0.99538565, + "num_input_tokens_seen": 26941216, + "router_z_loss_mlp": 0.29418945, + "step": 333, + "time_per_iteration": 2.6556739807128906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118815, + "balance_loss_mlp": 1.16106582, + "epoch": 0.06425548287803001, + "flos": 455940513792.0, + "grad_norm": 0.08067516684621483, + "language_loss": 0.99160993, + "learning_rate": 0.000996927945975565, + "loss": 1.0034914, + "num_input_tokens_seen": 27006560, + "router_z_loss_mlp": 0.27124023, + "step": 334, + "time_per_iteration": 2.5398526191711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147495, + "balance_loss_mlp": 1.1214596, + "epoch": 0.06444786456329357, + "flos": 559817574912.0, + "grad_norm": 0.08169715789363684, + "language_loss": 0.96174645, + "learning_rate": 0.0009968933674445906, + "loss": 0.97322142, + "num_input_tokens_seen": 27076400, + "router_z_loss_mlp": 0.26062012, + "step": 335, + "time_per_iteration": 2.6592860221862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112932, + "balance_loss_mlp": 1.0879097, + "epoch": 0.06464024624855713, + "flos": 665769383424.0, + "grad_norm": 0.07104021966044574, + "language_loss": 0.97756392, + "learning_rate": 0.0009968585960025028, + "loss": 0.98869324, + "num_input_tokens_seen": 27158672, + "router_z_loss_mlp": 0.25036621, + "step": 336, + "time_per_iteration": 2.9279658794403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01860024, + "balance_loss_mlp": 1.84323907, + "epoch": 0.0648326279338207, + "flos": 1520578704384.0, + "grad_norm": 0.14426901756633248, + "language_loss": 0.77653188, + "learning_rate": 0.0009968236316628006, + "loss": 0.7951321, + "num_input_tokens_seen": 27380592, + "router_z_loss_mlp": 0.16796875, + "step": 337, + "time_per_iteration": 4.810914993286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101739, + "balance_loss_mlp": 1.07948256, + "epoch": 0.06502500961908426, + "flos": 1142872768512.0, + "grad_norm": 0.058812216055980165, + "language_loss": 0.95864177, + "learning_rate": 0.0009967884744390583, + "loss": 0.96965921, + "num_input_tokens_seen": 27469984, + "router_z_loss_mlp": 0.22265625, + "step": 338, + "time_per_iteration": 3.512282371520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146504, + "balance_loss_mlp": 1.12267399, + "epoch": 0.06521739130434782, + "flos": 582339248640.0, + "grad_norm": 0.10793578588091769, + "language_loss": 0.97449529, + "learning_rate": 0.0009967531243449256, + "loss": 0.98596036, + "num_input_tokens_seen": 27543904, + "router_z_loss_mlp": 0.23828125, + "step": 339, + "time_per_iteration": 2.712907075881958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154087, + "balance_loss_mlp": 1.12950587, + "epoch": 0.06540977298961138, + "flos": 497398800384.0, + "grad_norm": 0.06396927661276222, + "language_loss": 1.04641414, + "learning_rate": 0.000996717581394126, + "loss": 1.05795503, + "num_input_tokens_seen": 27609888, + "router_z_loss_mlp": 0.24584961, + "step": 340, + "time_per_iteration": 2.5783133506774902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168509, + "balance_loss_mlp": 1.14584756, + "epoch": 0.06560215467487496, + "flos": 542613855744.0, + "grad_norm": 0.07568553531769329, + "language_loss": 1.05092287, + "learning_rate": 0.000996681845600459, + "loss": 1.062608, + "num_input_tokens_seen": 27683936, + "router_z_loss_mlp": 0.2265625, + "step": 341, + "time_per_iteration": 2.6543757915496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115357, + "balance_loss_mlp": 1.13118291, + "epoch": 0.06579453636013852, + "flos": 413230961664.0, + "grad_norm": 0.06593832485574395, + "language_loss": 0.97027373, + "learning_rate": 0.0009966459169777982, + "loss": 0.9818095, + "num_input_tokens_seen": 27747840, + "router_z_loss_mlp": 0.22387695, + "step": 342, + "time_per_iteration": 2.5120761394500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141132, + "balance_loss_mlp": 1.11848283, + "epoch": 0.06598691804540208, + "flos": 560354457600.0, + "grad_norm": 0.055115078659976495, + "language_loss": 1.05281377, + "learning_rate": 0.0009966097955400924, + "loss": 1.0642252, + "num_input_tokens_seen": 27819728, + "router_z_loss_mlp": 0.22644043, + "step": 343, + "time_per_iteration": 2.6954751014709473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111133, + "balance_loss_mlp": 1.08904982, + "epoch": 0.06617929973066564, + "flos": 571789117440.0, + "grad_norm": 0.06176008438986438, + "language_loss": 0.99064481, + "learning_rate": 0.0009965734813013652, + "loss": 1.00175822, + "num_input_tokens_seen": 27893536, + "router_z_loss_mlp": 0.22277832, + "step": 344, + "time_per_iteration": 2.8235929012298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090293, + "balance_loss_mlp": 1.06726193, + "epoch": 0.06637168141592921, + "flos": 490234470912.0, + "grad_norm": 0.05365164831273283, + "language_loss": 1.01308548, + "learning_rate": 0.0009965369742757151, + "loss": 1.02398837, + "num_input_tokens_seen": 27960976, + "router_z_loss_mlp": 0.23022461, + "step": 345, + "time_per_iteration": 2.5708556175231934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078086, + "balance_loss_mlp": 1.05727243, + "epoch": 0.06656406310119277, + "flos": 1078735656960.0, + "grad_norm": 0.04968829319439664, + "language_loss": 0.97902787, + "learning_rate": 0.0009965002744773152, + "loss": 0.98980874, + "num_input_tokens_seen": 28050864, + "router_z_loss_mlp": 0.20812988, + "step": 346, + "time_per_iteration": 3.4984121322631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086945, + "balance_loss_mlp": 1.06450987, + "epoch": 0.06675644478645633, + "flos": 513421065216.0, + "grad_norm": 0.06258978415695335, + "language_loss": 0.95138037, + "learning_rate": 0.0009964633819204139, + "loss": 0.96224982, + "num_input_tokens_seen": 28122448, + "router_z_loss_mlp": 0.22436523, + "step": 347, + "time_per_iteration": 2.6866109371185303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01752926, + "balance_loss_mlp": 1.73108697, + "epoch": 0.06694882647171989, + "flos": 1446359943168.0, + "grad_norm": 0.11694655230354783, + "language_loss": 0.81801116, + "learning_rate": 0.0009964262966193338, + "loss": 0.83554041, + "num_input_tokens_seen": 28350352, + "router_z_loss_mlp": 0.21875, + "step": 348, + "time_per_iteration": 4.935550928115845 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01583198, + "balance_loss_mlp": 1.56116796, + "epoch": 0.06714120815698346, + "flos": 1551230784000.0, + "grad_norm": 0.0989027294649474, + "language_loss": 0.75153887, + "learning_rate": 0.000996389018588473, + "loss": 0.76737082, + "num_input_tokens_seen": 28585584, + "router_z_loss_mlp": 0.22070312, + "step": 349, + "time_per_iteration": 4.891008615493774 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149858, + "balance_loss_mlp": 1.12826955, + "epoch": 0.06733358984224702, + "flos": 879689673216.0, + "grad_norm": 0.07075764146586616, + "language_loss": 0.94920838, + "learning_rate": 0.000996351547842304, + "loss": 0.96070701, + "num_input_tokens_seen": 28672512, + "router_z_loss_mlp": 0.21582031, + "step": 350, + "time_per_iteration": 3.156322717666626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192552, + "balance_loss_mlp": 1.17055774, + "epoch": 0.06752597152751058, + "flos": 518654651904.0, + "grad_norm": 0.09040238598346795, + "language_loss": 0.93423587, + "learning_rate": 0.0009963138843953744, + "loss": 0.94616139, + "num_input_tokens_seen": 28741520, + "router_z_loss_mlp": 0.2199707, + "step": 351, + "time_per_iteration": 2.610987663269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206077, + "balance_loss_mlp": 1.18405879, + "epoch": 0.06771835321277414, + "flos": 539366266368.0, + "grad_norm": 0.08658544591035036, + "language_loss": 0.97852194, + "learning_rate": 0.000996276028262306, + "loss": 0.9905827, + "num_input_tokens_seen": 28814912, + "router_z_loss_mlp": 0.22021484, + "step": 352, + "time_per_iteration": 2.8413686752319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166048, + "balance_loss_mlp": 1.14382768, + "epoch": 0.0679107348980377, + "flos": 460430604288.0, + "grad_norm": 0.09117082479319542, + "language_loss": 1.04269946, + "learning_rate": 0.0009962379794577964, + "loss": 1.05435991, + "num_input_tokens_seen": 28882192, + "router_z_loss_mlp": 0.22216797, + "step": 353, + "time_per_iteration": 2.591372489929199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114388, + "balance_loss_mlp": 1.12227976, + "epoch": 0.06810311658330127, + "flos": 635601752064.0, + "grad_norm": 0.05781909345233015, + "language_loss": 0.94169199, + "learning_rate": 0.000996199737996617, + "loss": 0.95313084, + "num_input_tokens_seen": 28968576, + "router_z_loss_mlp": 0.21630859, + "step": 354, + "time_per_iteration": 2.9088492393493652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125411, + "balance_loss_mlp": 1.10420346, + "epoch": 0.06829549826856483, + "flos": 464443448832.0, + "grad_norm": 0.06770201052263504, + "language_loss": 1.03043509, + "learning_rate": 0.0009961613038936149, + "loss": 1.04168916, + "num_input_tokens_seen": 29036160, + "router_z_loss_mlp": 0.2121582, + "step": 355, + "time_per_iteration": 2.571904420852661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110613, + "balance_loss_mlp": 1.08917904, + "epoch": 0.06848787995382839, + "flos": 634335929856.0, + "grad_norm": 0.06097004840688574, + "language_loss": 0.95565176, + "learning_rate": 0.000996122677163711, + "loss": 0.96675789, + "num_input_tokens_seen": 29112048, + "router_z_loss_mlp": 0.21435547, + "step": 356, + "time_per_iteration": 2.794982671737671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107296, + "balance_loss_mlp": 1.08667266, + "epoch": 0.06868026163909195, + "flos": 806023913472.0, + "grad_norm": 0.08020973782133771, + "language_loss": 1.01095176, + "learning_rate": 0.000996083857821902, + "loss": 1.02202487, + "num_input_tokens_seen": 29190960, + "router_z_loss_mlp": 0.20629883, + "step": 357, + "time_per_iteration": 3.007086753845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101637, + "balance_loss_mlp": 1.08076346, + "epoch": 0.06887264332435553, + "flos": 438997252608.0, + "grad_norm": 0.08125476198078858, + "language_loss": 0.99797714, + "learning_rate": 0.0009960448458832588, + "loss": 1.00899351, + "num_input_tokens_seen": 29262832, + "router_z_loss_mlp": 0.2088623, + "step": 358, + "time_per_iteration": 2.699530601501465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098146, + "balance_loss_mlp": 1.07872701, + "epoch": 0.06906502500961909, + "flos": 484513463808.0, + "grad_norm": 0.06827746260367892, + "language_loss": 0.99188638, + "learning_rate": 0.000996005641362927, + "loss": 1.00286782, + "num_input_tokens_seen": 29329552, + "router_z_loss_mlp": 0.1940918, + "step": 359, + "time_per_iteration": 2.5541014671325684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103345, + "balance_loss_mlp": 1.0841639, + "epoch": 0.06925740669488265, + "flos": 733293706752.0, + "grad_norm": 0.08731085845928575, + "language_loss": 1.02303529, + "learning_rate": 0.0009959662442761274, + "loss": 1.0340687, + "num_input_tokens_seen": 29410784, + "router_z_loss_mlp": 0.19189453, + "step": 360, + "time_per_iteration": 2.906623363494873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093844, + "balance_loss_mlp": 1.07268476, + "epoch": 0.0694497883801462, + "flos": 552127947264.0, + "grad_norm": 0.06697663210144707, + "language_loss": 0.9595629, + "learning_rate": 0.000995926654638155, + "loss": 0.97050136, + "num_input_tokens_seen": 29486992, + "router_z_loss_mlp": 0.21179199, + "step": 361, + "time_per_iteration": 2.793663501739502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082773, + "balance_loss_mlp": 1.06236482, + "epoch": 0.06964217006540978, + "flos": 677708992512.0, + "grad_norm": 0.06860924301964295, + "language_loss": 0.98198265, + "learning_rate": 0.00099588687246438, + "loss": 0.99281037, + "num_input_tokens_seen": 29557232, + "router_z_loss_mlp": 0.20410156, + "step": 362, + "time_per_iteration": 2.828139305114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085274, + "balance_loss_mlp": 1.06330371, + "epoch": 0.06983455175067334, + "flos": 523987163136.0, + "grad_norm": 0.08747541291209461, + "language_loss": 1.04803789, + "learning_rate": 0.0009958468977702471, + "loss": 1.0588907, + "num_input_tokens_seen": 29625344, + "router_z_loss_mlp": 0.21972656, + "step": 363, + "time_per_iteration": 2.5759966373443604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02224374, + "balance_loss_mlp": 2.20682669, + "epoch": 0.0700269334359369, + "flos": 1575943658496.0, + "grad_norm": 0.2746548069890379, + "language_loss": 0.79734707, + "learning_rate": 0.0009958067305712761, + "loss": 0.81959081, + "num_input_tokens_seen": 29843664, + "router_z_loss_mlp": 0.17578125, + "step": 364, + "time_per_iteration": 4.782835245132446 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134514, + "balance_loss_mlp": 1.11340213, + "epoch": 0.07021931512120046, + "flos": 1012848274944.0, + "grad_norm": 0.08586169827549085, + "language_loss": 0.93286598, + "learning_rate": 0.0009957663708830612, + "loss": 0.94421113, + "num_input_tokens_seen": 29927152, + "router_z_loss_mlp": 0.21105957, + "step": 365, + "time_per_iteration": 3.2484283447265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116189, + "balance_loss_mlp": 1.13884652, + "epoch": 0.07041169680646403, + "flos": 822622348800.0, + "grad_norm": 0.09941073368395695, + "language_loss": 0.97043455, + "learning_rate": 0.0009957258187212714, + "loss": 0.98205346, + "num_input_tokens_seen": 30004928, + "router_z_loss_mlp": 0.23034668, + "step": 366, + "time_per_iteration": 3.009479522705078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01756688, + "balance_loss_mlp": 1.7255981, + "epoch": 0.07060407849172759, + "flos": 1413670993920.0, + "grad_norm": 0.12374795181042475, + "language_loss": 0.79194862, + "learning_rate": 0.0009956850741016502, + "loss": 0.80951542, + "num_input_tokens_seen": 30230256, + "router_z_loss_mlp": 0.31054688, + "step": 367, + "time_per_iteration": 4.82874608039856 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152073, + "balance_loss_mlp": 1.13087749, + "epoch": 0.07079646017699115, + "flos": 512652837888.0, + "grad_norm": 0.06786716904588838, + "language_loss": 0.93450886, + "learning_rate": 0.0009956441370400167, + "loss": 0.94602954, + "num_input_tokens_seen": 30301200, + "router_z_loss_mlp": 0.21191406, + "step": 368, + "time_per_iteration": 2.6226603984832764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153965, + "balance_loss_mlp": 1.13158989, + "epoch": 0.07098884186225471, + "flos": 540240772608.0, + "grad_norm": 0.08343626294497461, + "language_loss": 0.99467343, + "learning_rate": 0.0009956030075522636, + "loss": 1.00621307, + "num_input_tokens_seen": 30377024, + "router_z_loss_mlp": 0.22375488, + "step": 369, + "time_per_iteration": 2.7128794193267822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142856, + "balance_loss_mlp": 1.12137485, + "epoch": 0.07118122354751828, + "flos": 548419230720.0, + "grad_norm": 0.07464528715750075, + "language_loss": 0.98955953, + "learning_rate": 0.0009955616856543587, + "loss": 1.00098813, + "num_input_tokens_seen": 30448896, + "router_z_loss_mlp": 0.21472168, + "step": 370, + "time_per_iteration": 2.613138198852539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118555, + "balance_loss_mlp": 1.0958215, + "epoch": 0.07137360523278184, + "flos": 620612554752.0, + "grad_norm": 0.056434914921328155, + "language_loss": 0.91880834, + "learning_rate": 0.0009955201713623448, + "loss": 0.92999387, + "num_input_tokens_seen": 30523584, + "router_z_loss_mlp": 0.22717285, + "step": 371, + "time_per_iteration": 2.747133255004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01746336, + "balance_loss_mlp": 1.72154021, + "epoch": 0.0715659869180454, + "flos": 1501850115072.0, + "grad_norm": 0.08669176596007007, + "language_loss": 0.76672721, + "learning_rate": 0.000995478464692339, + "loss": 0.78419054, + "num_input_tokens_seen": 30757920, + "router_z_loss_mlp": 0.24707031, + "step": 372, + "time_per_iteration": 4.931428670883179 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102855, + "balance_loss_mlp": 1.08040774, + "epoch": 0.07175836860330896, + "flos": 495246887424.0, + "grad_norm": 0.07044890130803105, + "language_loss": 1.05121827, + "learning_rate": 0.0009954365656605333, + "loss": 1.06224692, + "num_input_tokens_seen": 30824960, + "router_z_loss_mlp": 0.22436523, + "step": 373, + "time_per_iteration": 2.550243616104126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118244, + "balance_loss_mlp": 1.09438992, + "epoch": 0.07195075028857253, + "flos": 785387902464.0, + "grad_norm": 0.05415547127036835, + "language_loss": 0.98150015, + "learning_rate": 0.0009953944742831947, + "loss": 0.99268264, + "num_input_tokens_seen": 30902224, + "router_z_loss_mlp": 0.23864746, + "step": 374, + "time_per_iteration": 2.9659459590911865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125507, + "balance_loss_mlp": 1.10202336, + "epoch": 0.0721431319738361, + "flos": 592799067648.0, + "grad_norm": 0.07003669353380264, + "language_loss": 1.01441097, + "learning_rate": 0.0009953521905766642, + "loss": 1.02566612, + "num_input_tokens_seen": 30984784, + "router_z_loss_mlp": 0.23486328, + "step": 375, + "time_per_iteration": 2.942763566970825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117119, + "balance_loss_mlp": 1.09393334, + "epoch": 0.07233551365909965, + "flos": 547981272576.0, + "grad_norm": 0.06343477824222313, + "language_loss": 0.99901861, + "learning_rate": 0.0009953097145573577, + "loss": 1.01018989, + "num_input_tokens_seen": 31055376, + "router_z_loss_mlp": 0.23193359, + "step": 376, + "time_per_iteration": 2.6275272369384766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113711, + "balance_loss_mlp": 1.09023869, + "epoch": 0.07252789534436321, + "flos": 957170428416.0, + "grad_norm": 0.0678891965164594, + "language_loss": 0.97798675, + "learning_rate": 0.000995267046241766, + "loss": 0.98912394, + "num_input_tokens_seen": 31144944, + "router_z_loss_mlp": 0.23474121, + "step": 377, + "time_per_iteration": 3.2014975547790527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096997, + "balance_loss_mlp": 1.07496762, + "epoch": 0.07272027702962677, + "flos": 507398902272.0, + "grad_norm": 0.0806519998399971, + "language_loss": 0.97275257, + "learning_rate": 0.0009952241856464547, + "loss": 0.98372257, + "num_input_tokens_seen": 31213392, + "router_z_loss_mlp": 0.22045898, + "step": 378, + "time_per_iteration": 2.6189732551574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109641, + "balance_loss_mlp": 1.0746069, + "epoch": 0.07291265871489035, + "flos": 612128558592.0, + "grad_norm": 0.0691049335661606, + "language_loss": 1.04592681, + "learning_rate": 0.0009951811327880632, + "loss": 1.05689096, + "num_input_tokens_seen": 31289840, + "router_z_loss_mlp": 0.21826172, + "step": 379, + "time_per_iteration": 2.7411558628082275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092071, + "balance_loss_mlp": 1.07025611, + "epoch": 0.0731050404001539, + "flos": 495502963200.0, + "grad_norm": 0.05765504670581196, + "language_loss": 0.97682816, + "learning_rate": 0.0009951378876833063, + "loss": 0.98774892, + "num_input_tokens_seen": 31357600, + "router_z_loss_mlp": 0.21813965, + "step": 380, + "time_per_iteration": 2.6211278438568115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081575, + "balance_loss_mlp": 1.06068945, + "epoch": 0.07329742208541747, + "flos": 639677205504.0, + "grad_norm": 0.06809750593205881, + "language_loss": 1.04190159, + "learning_rate": 0.0009950944503489736, + "loss": 1.05271733, + "num_input_tokens_seen": 31428896, + "router_z_loss_mlp": 0.20898438, + "step": 381, + "time_per_iteration": 2.7533762454986572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081401, + "balance_loss_mlp": 1.0607307, + "epoch": 0.07348980377068103, + "flos": 815999284224.0, + "grad_norm": 0.06607035824886899, + "language_loss": 0.98459697, + "learning_rate": 0.0009950508208019285, + "loss": 0.99541104, + "num_input_tokens_seen": 31507424, + "router_z_loss_mlp": 0.20678711, + "step": 382, + "time_per_iteration": 2.9885637760162354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073667, + "balance_loss_mlp": 1.05369973, + "epoch": 0.0736821854559446, + "flos": 508383917568.0, + "grad_norm": 0.05970909775769663, + "language_loss": 1.02745128, + "learning_rate": 0.0009950069990591096, + "loss": 1.03818798, + "num_input_tokens_seen": 31576768, + "router_z_loss_mlp": 0.19958496, + "step": 383, + "time_per_iteration": 2.6111788749694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01835936, + "balance_loss_mlp": 1.8101871, + "epoch": 0.07387456714120816, + "flos": 1553801716224.0, + "grad_norm": 0.167122487372618, + "language_loss": 0.76401371, + "learning_rate": 0.0009949629851375302, + "loss": 0.78237301, + "num_input_tokens_seen": 31797312, + "router_z_loss_mlp": 0.2578125, + "step": 384, + "time_per_iteration": 4.859915494918823 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116619, + "balance_loss_mlp": 1.09575748, + "epoch": 0.07406694882647172, + "flos": 525219489792.0, + "grad_norm": 0.0799084124695288, + "language_loss": 0.96017051, + "learning_rate": 0.0009949187790542777, + "loss": 0.97133672, + "num_input_tokens_seen": 31869568, + "router_z_loss_mlp": 0.20861816, + "step": 385, + "time_per_iteration": 2.6976191997528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124239, + "balance_loss_mlp": 1.10322285, + "epoch": 0.07425933051173528, + "flos": 497468611584.0, + "grad_norm": 0.08753491640442414, + "language_loss": 0.91745877, + "learning_rate": 0.0009948743808265148, + "loss": 0.92870116, + "num_input_tokens_seen": 31941712, + "router_z_loss_mlp": 0.21020508, + "step": 386, + "time_per_iteration": 2.6870572566986084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113476, + "balance_loss_mlp": 1.09249496, + "epoch": 0.07445171219699885, + "flos": 504740630016.0, + "grad_norm": 0.05063210924529089, + "language_loss": 1.0156467, + "learning_rate": 0.0009948297904714782, + "loss": 1.02678132, + "num_input_tokens_seen": 32015232, + "router_z_loss_mlp": 0.20996094, + "step": 387, + "time_per_iteration": 2.668027639389038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097529, + "balance_loss_mlp": 1.07642913, + "epoch": 0.07464409388226241, + "flos": 553693515264.0, + "grad_norm": 0.06830922509793466, + "language_loss": 0.93493366, + "learning_rate": 0.0009947850080064796, + "loss": 0.9459089, + "num_input_tokens_seen": 32094640, + "router_z_loss_mlp": 0.21105957, + "step": 388, + "time_per_iteration": 2.79836106300354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098078, + "balance_loss_mlp": 1.07695365, + "epoch": 0.07483647556752597, + "flos": 776511028224.0, + "grad_norm": 0.06471398355705121, + "language_loss": 0.98276728, + "learning_rate": 0.0009947400334489047, + "loss": 0.99374807, + "num_input_tokens_seen": 32176640, + "router_z_loss_mlp": 0.21130371, + "step": 389, + "time_per_iteration": 3.0046355724334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095267, + "balance_loss_mlp": 1.07513261, + "epoch": 0.07502885725278953, + "flos": 612256596480.0, + "grad_norm": 0.0754939105077014, + "language_loss": 0.90272582, + "learning_rate": 0.0009946948668162145, + "loss": 0.91367853, + "num_input_tokens_seen": 32246704, + "router_z_loss_mlp": 0.20141602, + "step": 390, + "time_per_iteration": 2.724792003631592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091157, + "balance_loss_mlp": 1.06946135, + "epoch": 0.0752212389380531, + "flos": 688324552704.0, + "grad_norm": 0.05626120625508035, + "language_loss": 0.9463594, + "learning_rate": 0.0009946495081259441, + "loss": 0.95727098, + "num_input_tokens_seen": 32320032, + "router_z_loss_mlp": 0.21704102, + "step": 391, + "time_per_iteration": 2.8221397399902344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101684, + "balance_loss_mlp": 1.08008361, + "epoch": 0.07541362062331666, + "flos": 765362967552.0, + "grad_norm": 0.09729902751759628, + "language_loss": 0.97468722, + "learning_rate": 0.0009946039573957035, + "loss": 0.98570406, + "num_input_tokens_seen": 32398144, + "router_z_loss_mlp": 0.21606445, + "step": 392, + "time_per_iteration": 2.958655595779419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095785, + "balance_loss_mlp": 1.07572174, + "epoch": 0.07560600230858022, + "flos": 588460336128.0, + "grad_norm": 0.06468718689622391, + "language_loss": 0.94257009, + "learning_rate": 0.000994558214643177, + "loss": 0.95352793, + "num_input_tokens_seen": 32471984, + "router_z_loss_mlp": 0.20056152, + "step": 393, + "time_per_iteration": 2.752979040145874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101382, + "balance_loss_mlp": 1.08086586, + "epoch": 0.07579838399384378, + "flos": 749508028416.0, + "grad_norm": 0.06635223139616171, + "language_loss": 0.961483, + "learning_rate": 0.000994512279886123, + "loss": 0.97249681, + "num_input_tokens_seen": 32550176, + "router_z_loss_mlp": 0.20532227, + "step": 394, + "time_per_iteration": 3.055225133895874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104661, + "balance_loss_mlp": 1.08346581, + "epoch": 0.07599076567910736, + "flos": 523185440256.0, + "grad_norm": 0.06901630142642712, + "language_loss": 0.96749192, + "learning_rate": 0.0009944661531423758, + "loss": 0.97853857, + "num_input_tokens_seen": 32620768, + "router_z_loss_mlp": 0.2121582, + "step": 395, + "time_per_iteration": 2.6922085285186768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093271, + "balance_loss_mlp": 1.07248056, + "epoch": 0.07618314736437092, + "flos": 550812662784.0, + "grad_norm": 0.07064334209039194, + "language_loss": 0.95375401, + "learning_rate": 0.000994419834429843, + "loss": 0.96468663, + "num_input_tokens_seen": 32693472, + "router_z_loss_mlp": 0.20788574, + "step": 396, + "time_per_iteration": 2.6657333374023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092352, + "balance_loss_mlp": 1.0716933, + "epoch": 0.07637552904963447, + "flos": 697901253120.0, + "grad_norm": 0.07324881108467876, + "language_loss": 0.99580455, + "learning_rate": 0.0009943733237665069, + "loss": 1.00672793, + "num_input_tokens_seen": 32764976, + "router_z_loss_mlp": 0.20654297, + "step": 397, + "time_per_iteration": 2.8662500381469727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085112, + "balance_loss_mlp": 1.06454849, + "epoch": 0.07656791073489803, + "flos": 579066928128.0, + "grad_norm": 0.04790317238997088, + "language_loss": 0.98118353, + "learning_rate": 0.0009943266211704248, + "loss": 0.99203461, + "num_input_tokens_seen": 32853104, + "router_z_loss_mlp": 0.20568848, + "step": 398, + "time_per_iteration": 2.930741786956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094784, + "balance_loss_mlp": 1.07348132, + "epoch": 0.0767602924201616, + "flos": 416923711488.0, + "grad_norm": 0.09980331544781734, + "language_loss": 1.00422275, + "learning_rate": 0.000994279726659728, + "loss": 1.01517057, + "num_input_tokens_seen": 32919376, + "router_z_loss_mlp": 0.21325684, + "step": 399, + "time_per_iteration": 2.533738851547241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109036, + "balance_loss_mlp": 1.06970143, + "epoch": 0.07695267410542517, + "flos": 482671471104.0, + "grad_norm": 0.06967700921129397, + "language_loss": 0.97985041, + "learning_rate": 0.0009942326402526231, + "loss": 0.99075395, + "num_input_tokens_seen": 32988064, + "router_z_loss_mlp": 0.20666504, + "step": 400, + "time_per_iteration": 2.51460337638855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096542, + "balance_loss_mlp": 1.07526302, + "epoch": 0.07714505579068873, + "flos": 530742647808.0, + "grad_norm": 0.052652305799428985, + "language_loss": 0.96639109, + "learning_rate": 0.0009941853619673902, + "loss": 0.97735649, + "num_input_tokens_seen": 33059024, + "router_z_loss_mlp": 0.2130127, + "step": 401, + "time_per_iteration": 2.620939016342163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101036, + "balance_loss_mlp": 1.08012676, + "epoch": 0.07733743747595229, + "flos": 804635845632.0, + "grad_norm": 0.07273299487754427, + "language_loss": 0.99959278, + "learning_rate": 0.0009941378918223844, + "loss": 1.01060319, + "num_input_tokens_seen": 33137712, + "router_z_loss_mlp": 0.20910645, + "step": 402, + "time_per_iteration": 3.036839008331299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110477, + "balance_loss_mlp": 1.08423018, + "epoch": 0.07752981916121585, + "flos": 622192679424.0, + "grad_norm": 0.05767312217272775, + "language_loss": 0.93044209, + "learning_rate": 0.0009940902298360354, + "loss": 0.94148982, + "num_input_tokens_seen": 33211296, + "router_z_loss_mlp": 0.20544434, + "step": 403, + "time_per_iteration": 2.7703943252563477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097477, + "balance_loss_mlp": 1.07694876, + "epoch": 0.07772220084647942, + "flos": 727961195520.0, + "grad_norm": 0.0686344305115436, + "language_loss": 1.02037048, + "learning_rate": 0.0009940423760268473, + "loss": 1.03134525, + "num_input_tokens_seen": 33283632, + "router_z_loss_mlp": 0.2052002, + "step": 404, + "time_per_iteration": 2.8823602199554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085621, + "balance_loss_mlp": 1.06497431, + "epoch": 0.07791458253174298, + "flos": 555149984256.0, + "grad_norm": 0.10727031409308073, + "language_loss": 0.96142864, + "learning_rate": 0.0009939943304133982, + "loss": 0.97228479, + "num_input_tokens_seen": 33350704, + "router_z_loss_mlp": 0.20654297, + "step": 405, + "time_per_iteration": 2.63908314704895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078944, + "balance_loss_mlp": 1.05944133, + "epoch": 0.07810696421700654, + "flos": 552919495680.0, + "grad_norm": 0.08981509362846728, + "language_loss": 1.0302707, + "learning_rate": 0.0009939460930143416, + "loss": 1.04106021, + "num_input_tokens_seen": 33416272, + "router_z_loss_mlp": 0.19482422, + "step": 406, + "time_per_iteration": 2.63259220123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079269, + "balance_loss_mlp": 1.05927801, + "epoch": 0.0782993459022701, + "flos": 650323289088.0, + "grad_norm": 0.07212254231156982, + "language_loss": 0.96910775, + "learning_rate": 0.0009938976638484043, + "loss": 0.97990054, + "num_input_tokens_seen": 33501824, + "router_z_loss_mlp": 0.1998291, + "step": 407, + "time_per_iteration": 2.9489452838897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073065, + "balance_loss_mlp": 1.05239439, + "epoch": 0.07849172758753367, + "flos": 495926364672.0, + "grad_norm": 0.07302041560946317, + "language_loss": 0.9619081, + "learning_rate": 0.0009938490429343887, + "loss": 0.97263873, + "num_input_tokens_seen": 33571456, + "router_z_loss_mlp": 0.20666504, + "step": 408, + "time_per_iteration": 2.541293144226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078297, + "balance_loss_mlp": 1.05823374, + "epoch": 0.07868410927279723, + "flos": 577696389120.0, + "grad_norm": 0.06961121210328268, + "language_loss": 0.96404505, + "learning_rate": 0.0009938002302911709, + "loss": 0.974828, + "num_input_tokens_seen": 33646320, + "router_z_loss_mlp": 0.20056152, + "step": 409, + "time_per_iteration": 2.7890634536743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078628, + "balance_loss_mlp": 1.05869615, + "epoch": 0.07887649095806079, + "flos": 522698019840.0, + "grad_norm": 0.10283598941623227, + "language_loss": 0.99080813, + "learning_rate": 0.0009937512259377015, + "loss": 1.00159442, + "num_input_tokens_seen": 33717664, + "router_z_loss_mlp": 0.19921875, + "step": 410, + "time_per_iteration": 2.6631360054016113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076374, + "balance_loss_mlp": 1.05739617, + "epoch": 0.07906887264332435, + "flos": 556958481408.0, + "grad_norm": 0.07518465865945036, + "language_loss": 0.97744381, + "learning_rate": 0.000993702029893006, + "loss": 0.98820746, + "num_input_tokens_seen": 33794720, + "router_z_loss_mlp": 0.18981934, + "step": 411, + "time_per_iteration": 2.762937068939209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070708, + "balance_loss_mlp": 1.0512886, + "epoch": 0.07926125432858792, + "flos": 821641715712.0, + "grad_norm": 0.06547583340109177, + "language_loss": 0.97466588, + "learning_rate": 0.0009936526421761838, + "loss": 0.98537302, + "num_input_tokens_seen": 33868304, + "router_z_loss_mlp": 0.1940918, + "step": 412, + "time_per_iteration": 3.019529342651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070741, + "balance_loss_mlp": 1.05210841, + "epoch": 0.07945363601385148, + "flos": 562072794624.0, + "grad_norm": 0.06412617323579047, + "language_loss": 0.9993977, + "learning_rate": 0.000993603062806409, + "loss": 1.01010513, + "num_input_tokens_seen": 33937424, + "router_z_loss_mlp": 0.18615723, + "step": 413, + "time_per_iteration": 2.667893409729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078833, + "balance_loss_mlp": 1.05879402, + "epoch": 0.07964601769911504, + "flos": 517615792128.0, + "grad_norm": 0.0777298152120257, + "language_loss": 1.03187037, + "learning_rate": 0.0009935532918029298, + "loss": 1.04265857, + "num_input_tokens_seen": 34003984, + "router_z_loss_mlp": 0.20031738, + "step": 414, + "time_per_iteration": 2.628847122192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079604, + "balance_loss_mlp": 1.06020916, + "epoch": 0.0798383993843786, + "flos": 538956011520.0, + "grad_norm": 0.0762846382616791, + "language_loss": 0.96381676, + "learning_rate": 0.0009935033291850694, + "loss": 0.97461283, + "num_input_tokens_seen": 34072400, + "router_z_loss_mlp": 0.19384766, + "step": 415, + "time_per_iteration": 2.6874804496765137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078311, + "balance_loss_mlp": 1.05915451, + "epoch": 0.08003078106964218, + "flos": 484901959680.0, + "grad_norm": 0.07548152614126195, + "language_loss": 0.9874112, + "learning_rate": 0.0009934531749722247, + "loss": 0.9981944, + "num_input_tokens_seen": 34142448, + "router_z_loss_mlp": 0.19177246, + "step": 416, + "time_per_iteration": 2.5752930641174316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077721, + "balance_loss_mlp": 1.0581702, + "epoch": 0.08022316275490574, + "flos": 517999905792.0, + "grad_norm": 0.07373378819853486, + "language_loss": 0.97326815, + "learning_rate": 0.0009934028291838672, + "loss": 0.98404539, + "num_input_tokens_seen": 34214080, + "router_z_loss_mlp": 0.1953125, + "step": 417, + "time_per_iteration": 2.715142011642456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078885, + "balance_loss_mlp": 1.0593344, + "epoch": 0.0804155444401693, + "flos": 493755512832.0, + "grad_norm": 0.06878732968267398, + "language_loss": 0.9290086, + "learning_rate": 0.0009933522918395433, + "loss": 0.93979746, + "num_input_tokens_seen": 34288448, + "router_z_loss_mlp": 0.19555664, + "step": 418, + "time_per_iteration": 2.7008063793182373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01673141, + "balance_loss_mlp": 1.6505394, + "epoch": 0.08060792612543285, + "flos": 1580567579136.0, + "grad_norm": 0.10865535097535944, + "language_loss": 0.782511, + "learning_rate": 0.0009933015629588731, + "loss": 0.7992425, + "num_input_tokens_seen": 34521632, + "router_z_loss_mlp": 0.22558594, + "step": 419, + "time_per_iteration": 4.854820728302002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092516, + "balance_loss_mlp": 1.07238102, + "epoch": 0.08080030781069643, + "flos": 525090041856.0, + "grad_norm": 0.07888672823303539, + "language_loss": 1.11010027, + "learning_rate": 0.000993250642561551, + "loss": 1.12102532, + "num_input_tokens_seen": 34590080, + "router_z_loss_mlp": 0.20129395, + "step": 420, + "time_per_iteration": 2.6152822971343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102725, + "balance_loss_mlp": 1.08251905, + "epoch": 0.08099268949595999, + "flos": 546459374592.0, + "grad_norm": 0.06927423279576624, + "language_loss": 0.96781242, + "learning_rate": 0.0009931995306673466, + "loss": 0.97883964, + "num_input_tokens_seen": 34660512, + "router_z_loss_mlp": 0.20202637, + "step": 421, + "time_per_iteration": 2.8378820419311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107938, + "balance_loss_mlp": 1.08725524, + "epoch": 0.08118507118122355, + "flos": 510116811264.0, + "grad_norm": 0.07245841989657228, + "language_loss": 1.01691484, + "learning_rate": 0.000993148227296103, + "loss": 1.02799416, + "num_input_tokens_seen": 34732016, + "router_z_loss_mlp": 0.20678711, + "step": 422, + "time_per_iteration": 2.6234657764434814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109153, + "balance_loss_mlp": 1.08827925, + "epoch": 0.08137745286648711, + "flos": 720339969024.0, + "grad_norm": 0.06440268991377437, + "language_loss": 0.90059143, + "learning_rate": 0.000993096732467738, + "loss": 0.91168296, + "num_input_tokens_seen": 34810416, + "router_z_loss_mlp": 0.2088623, + "step": 423, + "time_per_iteration": 2.9789979457855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107042, + "balance_loss_mlp": 1.08620405, + "epoch": 0.08156983455175067, + "flos": 679313848320.0, + "grad_norm": 0.09430690436493987, + "language_loss": 0.97591221, + "learning_rate": 0.0009930450462022435, + "loss": 0.9869827, + "num_input_tokens_seen": 34879504, + "router_z_loss_mlp": 0.20837402, + "step": 424, + "time_per_iteration": 2.7870407104492188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01731933, + "balance_loss_mlp": 1.70847309, + "epoch": 0.08176221623701424, + "flos": 1452577135104.0, + "grad_norm": 0.13164555017172178, + "language_loss": 0.79189807, + "learning_rate": 0.0009929931685196862, + "loss": 0.80921739, + "num_input_tokens_seen": 35111584, + "router_z_loss_mlp": 0.234375, + "step": 425, + "time_per_iteration": 4.870323181152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095108, + "balance_loss_mlp": 1.07456827, + "epoch": 0.0819545979222778, + "flos": 1556034071040.0, + "grad_norm": 0.10298759083167684, + "language_loss": 0.95328236, + "learning_rate": 0.0009929410994402065, + "loss": 0.9642334, + "num_input_tokens_seen": 35205664, + "router_z_loss_mlp": 0.20544434, + "step": 426, + "time_per_iteration": 3.7942585945129395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093366, + "balance_loss_mlp": 1.07214665, + "epoch": 0.08214697960754136, + "flos": 512456398848.0, + "grad_norm": 0.069672302328133, + "language_loss": 0.99507213, + "learning_rate": 0.0009928888389840196, + "loss": 1.00600576, + "num_input_tokens_seen": 35280144, + "router_z_loss_mlp": 0.21240234, + "step": 427, + "time_per_iteration": 2.684760093688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073876, + "balance_loss_mlp": 1.05376494, + "epoch": 0.08233936129280492, + "flos": 594850646016.0, + "grad_norm": 0.07796900075206671, + "language_loss": 1.01749206, + "learning_rate": 0.0009928363871714147, + "loss": 1.02823079, + "num_input_tokens_seen": 35344768, + "router_z_loss_mlp": 0.20092773, + "step": 428, + "time_per_iteration": 2.6608195304870605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078126, + "balance_loss_mlp": 1.05796742, + "epoch": 0.08253174297806849, + "flos": 571758594048.0, + "grad_norm": 0.07341701057973313, + "language_loss": 0.95524251, + "learning_rate": 0.0009927837440227556, + "loss": 0.96602374, + "num_input_tokens_seen": 35425536, + "router_z_loss_mlp": 0.20153809, + "step": 429, + "time_per_iteration": 2.824958324432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083273, + "balance_loss_mlp": 1.06413972, + "epoch": 0.08272412466333205, + "flos": 623065623552.0, + "grad_norm": 0.06194570532237157, + "language_loss": 0.90308964, + "learning_rate": 0.0009927309095584798, + "loss": 0.91392243, + "num_input_tokens_seen": 35515440, + "router_z_loss_mlp": 0.19128418, + "step": 430, + "time_per_iteration": 2.9565205574035645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105878, + "balance_loss_mlp": 1.08643484, + "epoch": 0.08291650634859561, + "flos": 513745542144.0, + "grad_norm": 0.09375416706629437, + "language_loss": 1.0225904, + "learning_rate": 0.0009926778837991, + "loss": 1.03364921, + "num_input_tokens_seen": 35580192, + "router_z_loss_mlp": 0.19433594, + "step": 431, + "time_per_iteration": 2.5606777667999268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104802, + "balance_loss_mlp": 1.08521628, + "epoch": 0.08310888803385917, + "flos": 667073083392.0, + "grad_norm": 0.09022222071598751, + "language_loss": 1.00445497, + "learning_rate": 0.000992624666765202, + "loss": 1.01550293, + "num_input_tokens_seen": 35649472, + "router_z_loss_mlp": 0.19580078, + "step": 432, + "time_per_iteration": 2.763514995574951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112312, + "balance_loss_mlp": 1.09166527, + "epoch": 0.08330126971912274, + "flos": 582995404800.0, + "grad_norm": 0.07142121215748316, + "language_loss": 0.98131895, + "learning_rate": 0.000992571258477447, + "loss": 0.99244213, + "num_input_tokens_seen": 35722848, + "router_z_loss_mlp": 0.20654297, + "step": 433, + "time_per_iteration": 2.7823588848114014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086622, + "balance_loss_mlp": 1.06731021, + "epoch": 0.0834936514043863, + "flos": 561064458240.0, + "grad_norm": 0.06618743000622296, + "language_loss": 0.92206728, + "learning_rate": 0.0009925176589565695, + "loss": 0.93293345, + "num_input_tokens_seen": 35800944, + "router_z_loss_mlp": 0.1932373, + "step": 434, + "time_per_iteration": 2.7774362564086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109069, + "balance_loss_mlp": 1.07043648, + "epoch": 0.08368603308964986, + "flos": 494272046592.0, + "grad_norm": 0.07800081613857189, + "language_loss": 1.01949787, + "learning_rate": 0.0009924638682233791, + "loss": 1.03040481, + "num_input_tokens_seen": 35866288, + "router_z_loss_mlp": 0.20251465, + "step": 435, + "time_per_iteration": 2.574716091156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236801, + "balance_loss_mlp": 1.21505737, + "epoch": 0.08387841477491342, + "flos": 1388322312192.0, + "grad_norm": 0.08820287098199171, + "language_loss": 0.79564589, + "learning_rate": 0.0009924098862987589, + "loss": 0.80801398, + "num_input_tokens_seen": 36083040, + "router_z_loss_mlp": 0.21777344, + "step": 436, + "time_per_iteration": 4.521069049835205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087939, + "balance_loss_mlp": 1.06750691, + "epoch": 0.084070796460177, + "flos": 798642796032.0, + "grad_norm": 0.09737991847895365, + "language_loss": 0.92070073, + "learning_rate": 0.0009923557132036668, + "loss": 0.93158013, + "num_input_tokens_seen": 36158816, + "router_z_loss_mlp": 0.2043457, + "step": 437, + "time_per_iteration": 3.0401971340179443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106896, + "balance_loss_mlp": 1.08635592, + "epoch": 0.08426317814544056, + "flos": 558681200640.0, + "grad_norm": 0.07082709395687636, + "language_loss": 0.96077365, + "learning_rate": 0.0009923013489591345, + "loss": 0.97184265, + "num_input_tokens_seen": 36236432, + "router_z_loss_mlp": 0.20532227, + "step": 438, + "time_per_iteration": 2.7388038635253906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138911, + "balance_loss_mlp": 1.11965871, + "epoch": 0.08445555983070412, + "flos": 810057106944.0, + "grad_norm": 0.09946092642967543, + "language_loss": 0.94659293, + "learning_rate": 0.0009922467935862681, + "loss": 0.95798206, + "num_input_tokens_seen": 36327952, + "router_z_loss_mlp": 0.19250488, + "step": 439, + "time_per_iteration": 3.0827929973602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153103, + "balance_loss_mlp": 1.13278937, + "epoch": 0.08464794151596768, + "flos": 509939311104.0, + "grad_norm": 0.08658230076015333, + "language_loss": 0.97196984, + "learning_rate": 0.0009921920471062478, + "loss": 0.9835009, + "num_input_tokens_seen": 36394896, + "router_z_loss_mlp": 0.203125, + "step": 440, + "time_per_iteration": 2.5667247772216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109293, + "balance_loss_mlp": 1.08952785, + "epoch": 0.08484032320123125, + "flos": 556149556224.0, + "grad_norm": 0.0779492699350581, + "language_loss": 0.95526892, + "learning_rate": 0.0009921371095403281, + "loss": 0.96636182, + "num_input_tokens_seen": 36464656, + "router_z_loss_mlp": 0.19763184, + "step": 441, + "time_per_iteration": 2.6504476070404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081558, + "balance_loss_mlp": 1.06137586, + "epoch": 0.08503270488649481, + "flos": 527103742464.0, + "grad_norm": 0.0823758421396894, + "language_loss": 0.98291612, + "learning_rate": 0.0009920819809098379, + "loss": 0.99373174, + "num_input_tokens_seen": 36532208, + "router_z_loss_mlp": 0.20166016, + "step": 442, + "time_per_iteration": 2.5884947776794434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076633, + "balance_loss_mlp": 1.05612862, + "epoch": 0.08522508657175837, + "flos": 613989490176.0, + "grad_norm": 0.07828377396362728, + "language_loss": 0.94043314, + "learning_rate": 0.0009920266612361798, + "loss": 0.95119947, + "num_input_tokens_seen": 36607360, + "router_z_loss_mlp": 0.20507812, + "step": 443, + "time_per_iteration": 2.7464845180511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077144, + "balance_loss_mlp": 1.05650926, + "epoch": 0.08541746825702193, + "flos": 619495119360.0, + "grad_norm": 0.07442656272719532, + "language_loss": 0.94335687, + "learning_rate": 0.0009919711505408308, + "loss": 0.95412827, + "num_input_tokens_seen": 36680688, + "router_z_loss_mlp": 0.2064209, + "step": 444, + "time_per_iteration": 2.7615623474121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092391, + "balance_loss_mlp": 1.07126665, + "epoch": 0.08560984994228549, + "flos": 482671471104.0, + "grad_norm": 0.08601843511227286, + "language_loss": 0.92049706, + "learning_rate": 0.000991915448845342, + "loss": 0.93142092, + "num_input_tokens_seen": 36746288, + "router_z_loss_mlp": 0.21130371, + "step": 445, + "time_per_iteration": 2.519644260406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103035, + "balance_loss_mlp": 1.08145857, + "epoch": 0.08580223162754906, + "flos": 516897027072.0, + "grad_norm": 0.07781715705073443, + "language_loss": 1.01207459, + "learning_rate": 0.000991859556171339, + "loss": 1.02310491, + "num_input_tokens_seen": 36812528, + "router_z_loss_mlp": 0.21569824, + "step": 446, + "time_per_iteration": 2.5678694248199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116922, + "balance_loss_mlp": 1.09462976, + "epoch": 0.08599461331281262, + "flos": 531215511552.0, + "grad_norm": 0.11213971543052093, + "language_loss": 1.02931881, + "learning_rate": 0.000991803472540521, + "loss": 1.040488, + "num_input_tokens_seen": 36879248, + "router_z_loss_mlp": 0.22302246, + "step": 447, + "time_per_iteration": 2.6309196949005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124555, + "balance_loss_mlp": 1.10302639, + "epoch": 0.08618699499807618, + "flos": 789966743040.0, + "grad_norm": 0.07287006723198586, + "language_loss": 0.97443926, + "learning_rate": 0.0009917471979746615, + "loss": 0.98568487, + "num_input_tokens_seen": 36951376, + "router_z_loss_mlp": 0.21533203, + "step": 448, + "time_per_iteration": 2.9742491245269775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134564, + "balance_loss_mlp": 1.11266506, + "epoch": 0.08637937668333974, + "flos": 565707317760.0, + "grad_norm": 0.08202115093309782, + "language_loss": 0.97199845, + "learning_rate": 0.0009916907324956086, + "loss": 0.98334408, + "num_input_tokens_seen": 37025936, + "router_z_loss_mlp": 0.21923828, + "step": 449, + "time_per_iteration": 2.704089641571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151497, + "balance_loss_mlp": 1.12693954, + "epoch": 0.08657175836860331, + "flos": 444930665472.0, + "grad_norm": 0.09325215593581063, + "language_loss": 0.93441564, + "learning_rate": 0.0009916340761252837, + "loss": 0.9459306, + "num_input_tokens_seen": 37095872, + "router_z_loss_mlp": 0.24536133, + "step": 450, + "time_per_iteration": 2.5866575241088867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158359, + "balance_loss_mlp": 1.13567328, + "epoch": 0.08676414005386687, + "flos": 843789450240.0, + "grad_norm": 0.23711660967347972, + "language_loss": 0.90976942, + "learning_rate": 0.0009915772288856832, + "loss": 0.92135304, + "num_input_tokens_seen": 37179072, + "router_z_loss_mlp": 0.22668457, + "step": 451, + "time_per_iteration": 3.109010696411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118071, + "balance_loss_mlp": 1.15827537, + "epoch": 0.08695652173913043, + "flos": 602995608576.0, + "grad_norm": 0.08699490701012727, + "language_loss": 0.92036849, + "learning_rate": 0.000991520190798877, + "loss": 0.93217564, + "num_input_tokens_seen": 37260288, + "router_z_loss_mlp": 0.22424316, + "step": 452, + "time_per_iteration": 2.8523812294006348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181191, + "balance_loss_mlp": 1.15807629, + "epoch": 0.08714890342439399, + "flos": 730423028736.0, + "grad_norm": 0.09293440668835976, + "language_loss": 1.01637089, + "learning_rate": 0.0009914629618870089, + "loss": 1.02818286, + "num_input_tokens_seen": 37331136, + "router_z_loss_mlp": 0.23095703, + "step": 453, + "time_per_iteration": 2.882887125015259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142362, + "balance_loss_mlp": 1.12891519, + "epoch": 0.08734128510965757, + "flos": 1481518232064.0, + "grad_norm": 0.0645312523276542, + "language_loss": 0.78675872, + "learning_rate": 0.0009914055421722976, + "loss": 0.79818237, + "num_input_tokens_seen": 37559040, + "router_z_loss_mlp": 0.13476562, + "step": 454, + "time_per_iteration": 4.717878103256226 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103219, + "balance_loss_mlp": 1.09034455, + "epoch": 0.08753366679492113, + "flos": 1522214083584.0, + "grad_norm": 0.04274098512475534, + "language_loss": 0.81427962, + "learning_rate": 0.0009913479316770353, + "loss": 0.82531178, + "num_input_tokens_seen": 37785136, + "router_z_loss_mlp": 0.12890625, + "step": 455, + "time_per_iteration": 4.838243246078491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171645, + "balance_loss_mlp": 1.14951944, + "epoch": 0.08772604848018468, + "flos": 720935078400.0, + "grad_norm": 0.10543082910841049, + "language_loss": 0.94423014, + "learning_rate": 0.0009912901304235883, + "loss": 0.95594656, + "num_input_tokens_seen": 37858832, + "router_z_loss_mlp": 0.22131348, + "step": 456, + "time_per_iteration": 2.9432015419006348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150762, + "balance_loss_mlp": 1.12861252, + "epoch": 0.08791843016544824, + "flos": 707926086144.0, + "grad_norm": 0.10980567381029156, + "language_loss": 0.91300154, + "learning_rate": 0.000991232138434397, + "loss": 0.92450917, + "num_input_tokens_seen": 37931856, + "router_z_loss_mlp": 0.22143555, + "step": 457, + "time_per_iteration": 2.832761526107788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113929, + "balance_loss_mlp": 1.09195828, + "epoch": 0.08811081185071182, + "flos": 472799407104.0, + "grad_norm": 0.1324680836731367, + "language_loss": 0.97845554, + "learning_rate": 0.000991173955731976, + "loss": 0.98959482, + "num_input_tokens_seen": 38002432, + "router_z_loss_mlp": 0.21960449, + "step": 458, + "time_per_iteration": 2.660696506500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100778, + "balance_loss_mlp": 1.07958269, + "epoch": 0.08830319353597538, + "flos": 684647769600.0, + "grad_norm": 0.07138233575581546, + "language_loss": 1.0178268, + "learning_rate": 0.0009911155823389137, + "loss": 1.02883458, + "num_input_tokens_seen": 38081648, + "router_z_loss_mlp": 0.21203613, + "step": 459, + "time_per_iteration": 2.9878122806549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105128, + "balance_loss_mlp": 1.08344412, + "epoch": 0.08849557522123894, + "flos": 573235411968.0, + "grad_norm": 0.0735053314112025, + "language_loss": 0.9764787, + "learning_rate": 0.000991057018277873, + "loss": 0.98752999, + "num_input_tokens_seen": 38153424, + "router_z_loss_mlp": 0.21679688, + "step": 460, + "time_per_iteration": 2.707247018814087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116963, + "balance_loss_mlp": 1.09422946, + "epoch": 0.0886879569065025, + "flos": 564303283200.0, + "grad_norm": 0.10552034142073316, + "language_loss": 0.9759655, + "learning_rate": 0.0009909982635715898, + "loss": 0.98713505, + "num_input_tokens_seen": 38223008, + "router_z_loss_mlp": 0.22729492, + "step": 461, + "time_per_iteration": 2.609016180038452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120097, + "balance_loss_mlp": 1.09760189, + "epoch": 0.08888033859176607, + "flos": 563609249280.0, + "grad_norm": 0.09185893532484944, + "language_loss": 0.96625364, + "learning_rate": 0.0009909393182428751, + "loss": 0.97745454, + "num_input_tokens_seen": 38294592, + "router_z_loss_mlp": 0.22497559, + "step": 462, + "time_per_iteration": 2.682616949081421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116703, + "balance_loss_mlp": 1.09437466, + "epoch": 0.08907272027702963, + "flos": 465517214208.0, + "grad_norm": 0.08888403374641002, + "language_loss": 0.91300213, + "learning_rate": 0.000990880182314614, + "loss": 0.92416912, + "num_input_tokens_seen": 38365792, + "router_z_loss_mlp": 0.22314453, + "step": 463, + "time_per_iteration": 2.732579469680786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122985, + "balance_loss_mlp": 1.10014486, + "epoch": 0.08926510196229319, + "flos": 681200921088.0, + "grad_norm": 0.07408309604525525, + "language_loss": 0.92294347, + "learning_rate": 0.0009908208558097643, + "loss": 0.93417335, + "num_input_tokens_seen": 38447776, + "router_z_loss_mlp": 0.22839355, + "step": 464, + "time_per_iteration": 2.910313606262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137827, + "balance_loss_mlp": 1.115273, + "epoch": 0.08945748364755675, + "flos": 596411831808.0, + "grad_norm": 0.08673846989427919, + "language_loss": 0.93827909, + "learning_rate": 0.000990761338751359, + "loss": 0.94965738, + "num_input_tokens_seen": 38521632, + "router_z_loss_mlp": 0.22546387, + "step": 465, + "time_per_iteration": 2.827570676803589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133815, + "balance_loss_mlp": 1.12222791, + "epoch": 0.08964986533282032, + "flos": 1585082400768.0, + "grad_norm": 0.06082202694548154, + "language_loss": 0.73659623, + "learning_rate": 0.0009907016311625045, + "loss": 0.74793446, + "num_input_tokens_seen": 38760528, + "router_z_loss_mlp": 0.11572266, + "step": 466, + "time_per_iteration": 4.960917234420776 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177765, + "balance_loss_mlp": 1.15419745, + "epoch": 0.08984224701808388, + "flos": 533268499968.0, + "grad_norm": 0.4900596090566038, + "language_loss": 0.96587038, + "learning_rate": 0.0009906417330663815, + "loss": 0.97764802, + "num_input_tokens_seen": 38827200, + "router_z_loss_mlp": 0.23571777, + "step": 467, + "time_per_iteration": 2.5937299728393555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157084, + "balance_loss_mlp": 1.13383865, + "epoch": 0.09003462870334744, + "flos": 478702296576.0, + "grad_norm": 0.08613132202477504, + "language_loss": 0.92798859, + "learning_rate": 0.0009905816444862442, + "loss": 0.93955946, + "num_input_tokens_seen": 38891984, + "router_z_loss_mlp": 0.23217773, + "step": 468, + "time_per_iteration": 2.6012237071990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164868, + "balance_loss_mlp": 1.14150274, + "epoch": 0.090227010388611, + "flos": 653307448320.0, + "grad_norm": 0.08218040805372613, + "language_loss": 0.90769458, + "learning_rate": 0.0009905213654454216, + "loss": 0.91934329, + "num_input_tokens_seen": 38977136, + "router_z_loss_mlp": 0.23364258, + "step": 469, + "time_per_iteration": 2.8727760314941406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176439, + "balance_loss_mlp": 1.15152478, + "epoch": 0.09041939207387456, + "flos": 617894645760.0, + "grad_norm": 0.09256259391525869, + "language_loss": 0.97864139, + "learning_rate": 0.0009904608959673158, + "loss": 0.9904058, + "num_input_tokens_seen": 39052224, + "router_z_loss_mlp": 0.24938965, + "step": 470, + "time_per_iteration": 2.7991952896118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151805, + "balance_loss_mlp": 1.12671185, + "epoch": 0.09061177375913813, + "flos": 454137808896.0, + "grad_norm": 0.09693984756275055, + "language_loss": 0.97988749, + "learning_rate": 0.000990400236075403, + "loss": 0.99140555, + "num_input_tokens_seen": 39116832, + "router_z_loss_mlp": 0.25109863, + "step": 471, + "time_per_iteration": 2.523508310317993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125394, + "balance_loss_mlp": 1.10119498, + "epoch": 0.0908041554444017, + "flos": 543982984704.0, + "grad_norm": 0.09250187628709369, + "language_loss": 0.9490509, + "learning_rate": 0.0009903393857932338, + "loss": 0.96030486, + "num_input_tokens_seen": 39190528, + "router_z_loss_mlp": 0.24194336, + "step": 472, + "time_per_iteration": 2.7065584659576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124084, + "balance_loss_mlp": 1.09912193, + "epoch": 0.09099653712966525, + "flos": 564052999680.0, + "grad_norm": 0.10897832311722938, + "language_loss": 0.93660218, + "learning_rate": 0.0009902783451444317, + "loss": 0.94784307, + "num_input_tokens_seen": 39263168, + "router_z_loss_mlp": 0.24963379, + "step": 473, + "time_per_iteration": 2.7067277431488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108649, + "balance_loss_mlp": 1.08496177, + "epoch": 0.09118891881492881, + "flos": 474300956160.0, + "grad_norm": 0.09402902414949979, + "language_loss": 0.97273493, + "learning_rate": 0.0009902171141526956, + "loss": 0.98382139, + "num_input_tokens_seen": 39330784, + "router_z_loss_mlp": 0.23693848, + "step": 474, + "time_per_iteration": 2.5281569957733154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087186, + "balance_loss_mlp": 1.06240201, + "epoch": 0.09138130050019239, + "flos": 545579076096.0, + "grad_norm": 0.06728788346792411, + "language_loss": 0.85273343, + "learning_rate": 0.000990155692841797, + "loss": 0.86360526, + "num_input_tokens_seen": 39417472, + "router_z_loss_mlp": 0.2479248, + "step": 475, + "time_per_iteration": 2.970107316970825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010835, + "balance_loss_mlp": 1.0587163, + "epoch": 0.09157368218545595, + "flos": 732397441536.0, + "grad_norm": 0.07226189405033341, + "language_loss": 0.97062063, + "learning_rate": 0.0009900940812355818, + "loss": 0.98145562, + "num_input_tokens_seen": 39488656, + "router_z_loss_mlp": 0.24768066, + "step": 476, + "time_per_iteration": 2.959184169769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096233, + "balance_loss_mlp": 1.07208097, + "epoch": 0.0917660638707195, + "flos": 610709967360.0, + "grad_norm": 0.09034653129128065, + "language_loss": 0.92824447, + "learning_rate": 0.00099003227935797, + "loss": 0.93920678, + "num_input_tokens_seen": 39558224, + "router_z_loss_mlp": 0.24157715, + "step": 477, + "time_per_iteration": 2.7553765773773193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113263, + "balance_loss_mlp": 1.08839583, + "epoch": 0.09195844555598306, + "flos": 655561257984.0, + "grad_norm": 0.09830094540804109, + "language_loss": 0.95358098, + "learning_rate": 0.000989970287232955, + "loss": 0.96471357, + "num_input_tokens_seen": 39629856, + "router_z_loss_mlp": 0.2487793, + "step": 478, + "time_per_iteration": 2.7916457653045654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112064, + "balance_loss_mlp": 1.09633327, + "epoch": 0.09215082724124664, + "flos": 476339387904.0, + "grad_norm": 0.08054303064285366, + "language_loss": 0.93560576, + "learning_rate": 0.0009899081048846043, + "loss": 0.94681215, + "num_input_tokens_seen": 39695984, + "router_z_loss_mlp": 0.24267578, + "step": 479, + "time_per_iteration": 2.554161787033081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114732, + "balance_loss_mlp": 1.12177348, + "epoch": 0.0923432089265102, + "flos": 524051182080.0, + "grad_norm": 0.1186512856896222, + "language_loss": 0.97593725, + "learning_rate": 0.0009898457323370593, + "loss": 0.98741049, + "num_input_tokens_seen": 39760256, + "router_z_loss_mlp": 0.25549316, + "step": 480, + "time_per_iteration": 2.5794191360473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131558, + "balance_loss_mlp": 1.10608315, + "epoch": 0.09253559061177376, + "flos": 545302651392.0, + "grad_norm": 0.10688941209840569, + "language_loss": 0.96892118, + "learning_rate": 0.000989783169614535, + "loss": 0.98023689, + "num_input_tokens_seen": 39827984, + "router_z_loss_mlp": 0.25512695, + "step": 481, + "time_per_iteration": 2.6676101684570312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01336494, + "balance_loss_mlp": 1.32304764, + "epoch": 0.09272797229703732, + "flos": 1537222219776.0, + "grad_norm": 0.112558059824644, + "language_loss": 0.78752756, + "learning_rate": 0.0009897204167413206, + "loss": 0.80089253, + "num_input_tokens_seen": 40056688, + "router_z_loss_mlp": 0.13476562, + "step": 482, + "time_per_iteration": 4.910710096359253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121205, + "balance_loss_mlp": 1.09537172, + "epoch": 0.09292035398230089, + "flos": 689501624832.0, + "grad_norm": 0.08905484371867754, + "language_loss": 0.93989253, + "learning_rate": 0.000989657473741779, + "loss": 0.95110452, + "num_input_tokens_seen": 40133120, + "router_z_loss_mlp": 0.25866699, + "step": 483, + "time_per_iteration": 2.8736467361450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120092, + "balance_loss_mlp": 1.09219658, + "epoch": 0.09311273566756445, + "flos": 509482414080.0, + "grad_norm": 0.10011855628381364, + "language_loss": 0.94861096, + "learning_rate": 0.0009895943406403465, + "loss": 0.95981193, + "num_input_tokens_seen": 40206464, + "router_z_loss_mlp": 0.27905273, + "step": 484, + "time_per_iteration": 2.7233312129974365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114409, + "balance_loss_mlp": 1.08641887, + "epoch": 0.09330511735282801, + "flos": 659111413248.0, + "grad_norm": 0.10884122740481975, + "language_loss": 0.87602448, + "learning_rate": 0.0009895310174615338, + "loss": 0.88716859, + "num_input_tokens_seen": 40277744, + "router_z_loss_mlp": 0.2800293, + "step": 485, + "time_per_iteration": 2.7538061141967773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098211, + "balance_loss_mlp": 1.08533621, + "epoch": 0.09349749903809157, + "flos": 1452054809088.0, + "grad_norm": 0.04867374252302138, + "language_loss": 0.75718516, + "learning_rate": 0.0009894675042299251, + "loss": 0.76816726, + "num_input_tokens_seen": 40503664, + "router_z_loss_mlp": 0.12890625, + "step": 486, + "time_per_iteration": 4.681119441986084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121456, + "balance_loss_mlp": 1.09291732, + "epoch": 0.09368988072335514, + "flos": 520614508032.0, + "grad_norm": 0.07858969791005947, + "language_loss": 0.92458618, + "learning_rate": 0.0009894038009701782, + "loss": 0.93580067, + "num_input_tokens_seen": 40571376, + "router_z_loss_mlp": 0.28515625, + "step": 487, + "time_per_iteration": 2.6114649772644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153128, + "balance_loss_mlp": 1.12148952, + "epoch": 0.0938822624086187, + "flos": 497502107136.0, + "grad_norm": 0.11959755259003642, + "language_loss": 0.91595036, + "learning_rate": 0.0009893399077070253, + "loss": 0.92748165, + "num_input_tokens_seen": 40638096, + "router_z_loss_mlp": 0.31616211, + "step": 488, + "time_per_iteration": 2.5603692531585693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127952, + "balance_loss_mlp": 1.09845996, + "epoch": 0.09407464409388226, + "flos": 532948405248.0, + "grad_norm": 0.09098963794592498, + "language_loss": 0.89760649, + "learning_rate": 0.0009892758244652718, + "loss": 0.90888608, + "num_input_tokens_seen": 40710992, + "router_z_loss_mlp": 0.29516602, + "step": 489, + "time_per_iteration": 2.65938401222229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127724, + "balance_loss_mlp": 1.09568012, + "epoch": 0.09426702577914582, + "flos": 585736634880.0, + "grad_norm": 0.09102778373185845, + "language_loss": 0.94519842, + "learning_rate": 0.0009892115512697968, + "loss": 0.95647562, + "num_input_tokens_seen": 40778896, + "router_z_loss_mlp": 0.3203125, + "step": 490, + "time_per_iteration": 2.6538186073303223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120065, + "balance_loss_mlp": 1.08926105, + "epoch": 0.0944594074644094, + "flos": 503081929728.0, + "grad_norm": 0.07724049493821064, + "language_loss": 0.96624851, + "learning_rate": 0.0009891470881455537, + "loss": 0.97744912, + "num_input_tokens_seen": 40853376, + "router_z_loss_mlp": 0.30810547, + "step": 491, + "time_per_iteration": 2.699535608291626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122711, + "balance_loss_mlp": 1.09145451, + "epoch": 0.09465178914967295, + "flos": 570748847616.0, + "grad_norm": 0.0816499633869022, + "language_loss": 0.94510269, + "learning_rate": 0.0009890824351175692, + "loss": 0.95632982, + "num_input_tokens_seen": 40923776, + "router_z_loss_mlp": 0.31225586, + "step": 492, + "time_per_iteration": 2.678191661834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125893, + "balance_loss_mlp": 1.09418344, + "epoch": 0.09484417083493651, + "flos": 549098707968.0, + "grad_norm": 0.07977284094064935, + "language_loss": 0.98609412, + "learning_rate": 0.0009890175922109435, + "loss": 0.99735302, + "num_input_tokens_seen": 40996848, + "router_z_loss_mlp": 0.31689453, + "step": 493, + "time_per_iteration": 2.6466987133026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138627, + "balance_loss_mlp": 1.10534418, + "epoch": 0.09503655252020007, + "flos": 823552109568.0, + "grad_norm": 0.09331424233507904, + "language_loss": 0.96939492, + "learning_rate": 0.0009889525594508513, + "loss": 0.9807812, + "num_input_tokens_seen": 41071280, + "router_z_loss_mlp": 0.33300781, + "step": 494, + "time_per_iteration": 3.009894371032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153225, + "balance_loss_mlp": 1.12218332, + "epoch": 0.09522893420546363, + "flos": 404397757440.0, + "grad_norm": 0.08141129996203125, + "language_loss": 0.91043431, + "learning_rate": 0.0009888873368625404, + "loss": 0.92196655, + "num_input_tokens_seen": 41136304, + "router_z_loss_mlp": 0.31030273, + "step": 495, + "time_per_iteration": 2.4904890060424805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171726, + "balance_loss_mlp": 1.14025438, + "epoch": 0.0954213158907272, + "flos": 690707810304.0, + "grad_norm": 0.08256479818708104, + "language_loss": 0.94339681, + "learning_rate": 0.0009888219244713326, + "loss": 0.95511413, + "num_input_tokens_seen": 41212384, + "router_z_loss_mlp": 0.31445312, + "step": 496, + "time_per_iteration": 2.8060483932495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181664, + "balance_loss_mlp": 1.15033531, + "epoch": 0.09561369757599077, + "flos": 518739019776.0, + "grad_norm": 0.10472312979641793, + "language_loss": 0.94370055, + "learning_rate": 0.0009887563223026229, + "loss": 0.95551717, + "num_input_tokens_seen": 41282528, + "router_z_loss_mlp": 0.31323242, + "step": 497, + "time_per_iteration": 2.6536803245544434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228939, + "balance_loss_mlp": 1.21549225, + "epoch": 0.09580607926125433, + "flos": 1384825849344.0, + "grad_norm": 0.04877985805939708, + "language_loss": 0.7906816, + "learning_rate": 0.0009886905303818805, + "loss": 0.80297101, + "num_input_tokens_seen": 41512256, + "router_z_loss_mlp": 0.13476562, + "step": 498, + "time_per_iteration": 4.874605178833008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197245, + "balance_loss_mlp": 1.16455829, + "epoch": 0.09599846094651789, + "flos": 717090969600.0, + "grad_norm": 0.08863465655244346, + "language_loss": 0.93284124, + "learning_rate": 0.0009886245487346482, + "loss": 0.94481373, + "num_input_tokens_seen": 41596816, + "router_z_loss_mlp": 0.3269043, + "step": 499, + "time_per_iteration": 3.047938108444214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011865, + "balance_loss_mlp": 1.15474319, + "epoch": 0.09619084263178146, + "flos": 385824909312.0, + "grad_norm": 0.09673466805801513, + "language_loss": 0.96238041, + "learning_rate": 0.0009885583773865422, + "loss": 0.97424543, + "num_input_tokens_seen": 41658544, + "router_z_loss_mlp": 0.31762695, + "step": 500, + "time_per_iteration": 2.402763843536377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140705, + "balance_loss_mlp": 1.1099968, + "epoch": 0.09638322431704502, + "flos": 533869401600.0, + "grad_norm": 0.08556524095898377, + "language_loss": 0.93457472, + "learning_rate": 0.0009884920163632524, + "loss": 0.94598186, + "num_input_tokens_seen": 41730736, + "router_z_loss_mlp": 0.30688477, + "step": 501, + "time_per_iteration": 2.7420296669006348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155853, + "balance_loss_mlp": 1.12373805, + "epoch": 0.09657560600230858, + "flos": 500426629632.0, + "grad_norm": 0.08462195742795481, + "language_loss": 0.95688182, + "learning_rate": 0.000988425465690543, + "loss": 0.96844035, + "num_input_tokens_seen": 41797824, + "router_z_loss_mlp": 0.32104492, + "step": 502, + "time_per_iteration": 2.5425736904144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163304, + "balance_loss_mlp": 1.13099861, + "epoch": 0.09676798768757214, + "flos": 528995197440.0, + "grad_norm": 0.07192036847451248, + "language_loss": 0.92721838, + "learning_rate": 0.0009883587253942505, + "loss": 0.93885148, + "num_input_tokens_seen": 41875520, + "router_z_loss_mlp": 0.32324219, + "step": 503, + "time_per_iteration": 2.8340742588043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188959, + "balance_loss_mlp": 1.15598607, + "epoch": 0.09696036937283571, + "flos": 463379857920.0, + "grad_norm": 0.0888689340699796, + "language_loss": 0.99166393, + "learning_rate": 0.0009882917955002862, + "loss": 1.00355351, + "num_input_tokens_seen": 41942224, + "router_z_loss_mlp": 0.32983398, + "step": 504, + "time_per_iteration": 2.560448169708252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147535, + "balance_loss_mlp": 1.11606395, + "epoch": 0.09715275105809927, + "flos": 534716204544.0, + "grad_norm": 0.07251663236407552, + "language_loss": 0.9150176, + "learning_rate": 0.0009882246760346343, + "loss": 0.92649293, + "num_input_tokens_seen": 42007552, + "router_z_loss_mlp": 0.31420898, + "step": 505, + "time_per_iteration": 2.6460299491882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114081, + "balance_loss_mlp": 1.10714495, + "epoch": 0.09734513274336283, + "flos": 454713979392.0, + "grad_norm": 0.10061537251918176, + "language_loss": 0.96100289, + "learning_rate": 0.0009881573670233533, + "loss": 0.97241098, + "num_input_tokens_seen": 42071760, + "router_z_loss_mlp": 0.33666992, + "step": 506, + "time_per_iteration": 2.5137040615081787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109977, + "balance_loss_mlp": 1.08029366, + "epoch": 0.09753751442862639, + "flos": 508551243264.0, + "grad_norm": 0.0762964042901656, + "language_loss": 0.91185808, + "learning_rate": 0.0009880898684925747, + "loss": 0.92295784, + "num_input_tokens_seen": 42140688, + "router_z_loss_mlp": 0.29663086, + "step": 507, + "time_per_iteration": 2.6571738719940186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110119, + "balance_loss_mlp": 1.07133985, + "epoch": 0.09772989611388996, + "flos": 484030425600.0, + "grad_norm": 0.07531505250568626, + "language_loss": 0.89554358, + "learning_rate": 0.0009880221804685037, + "loss": 0.90655547, + "num_input_tokens_seen": 42208544, + "router_z_loss_mlp": 0.29882812, + "step": 508, + "time_per_iteration": 2.596289873123169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01404721, + "balance_loss_mlp": 1.39136958, + "epoch": 0.09792227779915352, + "flos": 1565306339328.0, + "grad_norm": 0.10151454340945995, + "language_loss": 0.79344422, + "learning_rate": 0.000987954302977419, + "loss": 0.80749142, + "num_input_tokens_seen": 42426624, + "router_z_loss_mlp": 0.13378906, + "step": 509, + "time_per_iteration": 4.724441051483154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116621, + "balance_loss_mlp": 1.08655643, + "epoch": 0.09811465948441708, + "flos": 587529165312.0, + "grad_norm": 0.08257009801201759, + "language_loss": 0.94708043, + "learning_rate": 0.0009878862360456733, + "loss": 0.95824659, + "num_input_tokens_seen": 42494592, + "router_z_loss_mlp": 0.30029297, + "step": 510, + "time_per_iteration": 2.703011989593506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122701, + "balance_loss_mlp": 1.09406662, + "epoch": 0.09830704116968064, + "flos": 612719285760.0, + "grad_norm": 0.06191460590209878, + "language_loss": 0.88457662, + "learning_rate": 0.0009878179796996922, + "loss": 0.89580369, + "num_input_tokens_seen": 42564944, + "router_z_loss_mlp": 0.28637695, + "step": 511, + "time_per_iteration": 2.7212226390838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128587, + "balance_loss_mlp": 1.09885597, + "epoch": 0.09849942285494422, + "flos": 538528227840.0, + "grad_norm": 0.06874751685339883, + "language_loss": 0.9199326, + "learning_rate": 0.0009877495339659754, + "loss": 0.9312185, + "num_input_tokens_seen": 42645616, + "router_z_loss_mlp": 0.29724121, + "step": 512, + "time_per_iteration": 2.7520575523376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111609, + "balance_loss_mlp": 1.08826661, + "epoch": 0.09869180454020778, + "flos": 620193535488.0, + "grad_norm": 0.06953003964378547, + "language_loss": 0.87301105, + "learning_rate": 0.000987680898871096, + "loss": 0.88417196, + "num_input_tokens_seen": 42713632, + "router_z_loss_mlp": 0.27832031, + "step": 513, + "time_per_iteration": 2.7121992111206055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134292, + "balance_loss_mlp": 1.10401261, + "epoch": 0.09888418622547133, + "flos": 811375363584.0, + "grad_norm": 0.1024184057853134, + "language_loss": 0.87763435, + "learning_rate": 0.0009876120744417, + "loss": 0.88897729, + "num_input_tokens_seen": 42789088, + "router_z_loss_mlp": 0.30273438, + "step": 514, + "time_per_iteration": 2.971573829650879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123143, + "balance_loss_mlp": 1.09267306, + "epoch": 0.0990765679107349, + "flos": 535548450816.0, + "grad_norm": 0.06764912074049458, + "language_loss": 0.95588082, + "learning_rate": 0.0009875430607045078, + "loss": 0.9671123, + "num_input_tokens_seen": 42861168, + "router_z_loss_mlp": 0.3046875, + "step": 515, + "time_per_iteration": 2.6630361080169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108813, + "balance_loss_mlp": 1.08072746, + "epoch": 0.09926894959599845, + "flos": 587607740928.0, + "grad_norm": 0.06593749006245919, + "language_loss": 0.92788792, + "learning_rate": 0.000987473857686313, + "loss": 0.93897605, + "num_input_tokens_seen": 42934112, + "router_z_loss_mlp": 0.28076172, + "step": 516, + "time_per_iteration": 2.710068702697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121556, + "balance_loss_mlp": 1.09039485, + "epoch": 0.09946133128126203, + "flos": 640947409920.0, + "grad_norm": 0.08862761474564218, + "language_loss": 0.9451825, + "learning_rate": 0.0009874044654139824, + "loss": 0.95639801, + "num_input_tokens_seen": 43005248, + "router_z_loss_mlp": 0.3112793, + "step": 517, + "time_per_iteration": 2.729975461959839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117034, + "balance_loss_mlp": 1.08520555, + "epoch": 0.09965371296652559, + "flos": 465546327552.0, + "grad_norm": 0.09157938746936445, + "language_loss": 0.9250825, + "learning_rate": 0.0009873348839144563, + "loss": 0.93625283, + "num_input_tokens_seen": 43070576, + "router_z_loss_mlp": 0.31811523, + "step": 518, + "time_per_iteration": 2.5117127895355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112516, + "balance_loss_mlp": 1.09540534, + "epoch": 0.09984609465178915, + "flos": 483365505024.0, + "grad_norm": 0.07736257304557469, + "language_loss": 0.9674046, + "learning_rate": 0.000987265113214749, + "loss": 0.97865617, + "num_input_tokens_seen": 43138048, + "router_z_loss_mlp": 0.29711914, + "step": 519, + "time_per_iteration": 2.5816774368286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147544, + "balance_loss_mlp": 1.11421299, + "epoch": 0.1000384763370527, + "flos": 568764260352.0, + "grad_norm": 0.08763817133734854, + "language_loss": 0.96583092, + "learning_rate": 0.0009871951533419476, + "loss": 0.97730637, + "num_input_tokens_seen": 43207600, + "router_z_loss_mlp": 0.33325195, + "step": 520, + "time_per_iteration": 2.638664484024048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140597, + "balance_loss_mlp": 1.108482, + "epoch": 0.10023085802231628, + "flos": 545515057152.0, + "grad_norm": 0.10925869968591369, + "language_loss": 0.88377398, + "learning_rate": 0.0009871250043232132, + "loss": 0.89517999, + "num_input_tokens_seen": 43285104, + "router_z_loss_mlp": 0.32104492, + "step": 521, + "time_per_iteration": 2.70491886138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136934, + "balance_loss_mlp": 1.10555792, + "epoch": 0.10042323970757984, + "flos": 503208557568.0, + "grad_norm": 0.07694864026409119, + "language_loss": 0.87725985, + "learning_rate": 0.0009870546661857797, + "loss": 0.8886292, + "num_input_tokens_seen": 43353312, + "router_z_loss_mlp": 0.31347656, + "step": 522, + "time_per_iteration": 2.653456211090088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126678, + "balance_loss_mlp": 1.09380031, + "epoch": 0.1006156213928434, + "flos": 770084402688.0, + "grad_norm": 0.08414569380370593, + "language_loss": 0.95787346, + "learning_rate": 0.0009869841389569553, + "loss": 0.96914017, + "num_input_tokens_seen": 43427680, + "router_z_loss_mlp": 0.32885742, + "step": 523, + "time_per_iteration": 2.9442663192749023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116557, + "balance_loss_mlp": 1.08625388, + "epoch": 0.10080800307810696, + "flos": 489786338304.0, + "grad_norm": 0.06587351152736676, + "language_loss": 0.88897854, + "learning_rate": 0.0009869134226641206, + "loss": 0.90014416, + "num_input_tokens_seen": 43495200, + "router_z_loss_mlp": 0.30297852, + "step": 524, + "time_per_iteration": 2.5559868812561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110225, + "balance_loss_mlp": 1.07746601, + "epoch": 0.10100038476337053, + "flos": 454478252544.0, + "grad_norm": 0.09167866019985617, + "language_loss": 0.88383424, + "learning_rate": 0.0009868425173347303, + "loss": 0.89493656, + "num_input_tokens_seen": 43566256, + "router_z_loss_mlp": 0.32788086, + "step": 525, + "time_per_iteration": 2.645116090774536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111349, + "balance_loss_mlp": 1.08216143, + "epoch": 0.10119276644863409, + "flos": 556155348480.0, + "grad_norm": 0.07288604326691553, + "language_loss": 0.96749896, + "learning_rate": 0.0009867714229963125, + "loss": 0.97863394, + "num_input_tokens_seen": 43639696, + "router_z_loss_mlp": 0.31323242, + "step": 526, + "time_per_iteration": 2.730703592300415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106354, + "balance_loss_mlp": 1.07540703, + "epoch": 0.10138514813389765, + "flos": 515990587392.0, + "grad_norm": 0.07095113284061857, + "language_loss": 0.93916923, + "learning_rate": 0.000986700139676468, + "loss": 0.95023274, + "num_input_tokens_seen": 43703872, + "router_z_loss_mlp": 0.30932617, + "step": 527, + "time_per_iteration": 2.5836338996887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110446, + "balance_loss_mlp": 1.07833052, + "epoch": 0.10157752981916121, + "flos": 500323322880.0, + "grad_norm": 0.06933811905919615, + "language_loss": 0.91673893, + "learning_rate": 0.0009866286674028717, + "loss": 0.92784333, + "num_input_tokens_seen": 43774416, + "router_z_loss_mlp": 0.32104492, + "step": 528, + "time_per_iteration": 2.7084739208221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101831, + "balance_loss_mlp": 1.07100391, + "epoch": 0.10176991150442478, + "flos": 656444376576.0, + "grad_norm": 0.07189407365130172, + "language_loss": 0.88586026, + "learning_rate": 0.0009865570062032717, + "loss": 0.8968786, + "num_input_tokens_seen": 43853376, + "router_z_loss_mlp": 0.30810547, + "step": 529, + "time_per_iteration": 2.9141628742218018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103952, + "balance_loss_mlp": 1.07443571, + "epoch": 0.10196229318968834, + "flos": 572974953984.0, + "grad_norm": 0.06841647032337263, + "language_loss": 0.93659967, + "learning_rate": 0.0009864851561054893, + "loss": 0.94763923, + "num_input_tokens_seen": 43929632, + "router_z_loss_mlp": 0.29516602, + "step": 530, + "time_per_iteration": 2.7539894580841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090977, + "balance_loss_mlp": 1.06110358, + "epoch": 0.1021546748749519, + "flos": 517946061312.0, + "grad_norm": 0.07340246055426732, + "language_loss": 0.91722125, + "learning_rate": 0.0009864131171374191, + "loss": 0.92813098, + "num_input_tokens_seen": 44002144, + "router_z_loss_mlp": 0.29882812, + "step": 531, + "time_per_iteration": 2.6921956539154053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109944, + "balance_loss_mlp": 1.06749225, + "epoch": 0.10234705656021546, + "flos": 609470286336.0, + "grad_norm": 0.07867637119915549, + "language_loss": 0.91107762, + "learning_rate": 0.0009863408893270292, + "loss": 0.92207205, + "num_input_tokens_seen": 44078272, + "router_z_loss_mlp": 0.31933594, + "step": 532, + "time_per_iteration": 2.7911570072174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106913, + "balance_loss_mlp": 1.07396317, + "epoch": 0.10253943824547904, + "flos": 601473710592.0, + "grad_norm": 0.08191923529880715, + "language_loss": 0.86522454, + "learning_rate": 0.0009862684727023605, + "loss": 0.87629366, + "num_input_tokens_seen": 44152304, + "router_z_loss_mlp": 0.3293457, + "step": 533, + "time_per_iteration": 2.7452800273895264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105875, + "balance_loss_mlp": 1.07466602, + "epoch": 0.1027318199307426, + "flos": 662647011840.0, + "grad_norm": 0.07282647554851075, + "language_loss": 0.90315968, + "learning_rate": 0.0009861958672915283, + "loss": 0.91421843, + "num_input_tokens_seen": 44226720, + "router_z_loss_mlp": 0.31201172, + "step": 534, + "time_per_iteration": 2.8041269779205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096602, + "balance_loss_mlp": 1.0673244, + "epoch": 0.10292420161600616, + "flos": 682962928128.0, + "grad_norm": 0.058349855756870184, + "language_loss": 0.90126884, + "learning_rate": 0.0009861230731227201, + "loss": 0.9122349, + "num_input_tokens_seen": 44303600, + "router_z_loss_mlp": 0.29248047, + "step": 535, + "time_per_iteration": 2.8627805709838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108033, + "balance_loss_mlp": 1.07615674, + "epoch": 0.10311658330126972, + "flos": 490042414080.0, + "grad_norm": 0.091555564896082, + "language_loss": 0.91954774, + "learning_rate": 0.0009860500902241973, + "loss": 0.93062806, + "num_input_tokens_seen": 44370960, + "router_z_loss_mlp": 0.31884766, + "step": 536, + "time_per_iteration": 2.6052157878875732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120335, + "balance_loss_mlp": 1.08800602, + "epoch": 0.10330896498653329, + "flos": 431508446208.0, + "grad_norm": 0.0585767653270487, + "language_loss": 0.96574026, + "learning_rate": 0.0009859769186242942, + "loss": 0.97694361, + "num_input_tokens_seen": 44435584, + "router_z_loss_mlp": 0.32324219, + "step": 537, + "time_per_iteration": 2.51180362701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116517, + "balance_loss_mlp": 1.08571362, + "epoch": 0.10350134667179685, + "flos": 549330052608.0, + "grad_norm": 0.0744119924563098, + "language_loss": 0.8926785, + "learning_rate": 0.0009859035583514187, + "loss": 0.90384364, + "num_input_tokens_seen": 44505456, + "router_z_loss_mlp": 0.30834961, + "step": 538, + "time_per_iteration": 2.6369993686676025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146613, + "balance_loss_mlp": 1.11380613, + "epoch": 0.10369372835706041, + "flos": 640327569408.0, + "grad_norm": 0.09976070350989504, + "language_loss": 0.90389431, + "learning_rate": 0.0009858300094340517, + "loss": 0.91536051, + "num_input_tokens_seen": 44580208, + "router_z_loss_mlp": 0.328125, + "step": 539, + "time_per_iteration": 2.7695086002349854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150737, + "balance_loss_mlp": 1.11838388, + "epoch": 0.10388611004232397, + "flos": 521500598784.0, + "grad_norm": 0.08771902350159133, + "language_loss": 0.85304511, + "learning_rate": 0.0009857562719007473, + "loss": 0.8645525, + "num_input_tokens_seen": 44646576, + "router_z_loss_mlp": 0.32324219, + "step": 540, + "time_per_iteration": 2.59881329536438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144681, + "balance_loss_mlp": 1.11320961, + "epoch": 0.10407849172758753, + "flos": 702111946752.0, + "grad_norm": 0.07496368213999542, + "language_loss": 0.88249481, + "learning_rate": 0.0009856823457801331, + "loss": 0.89394164, + "num_input_tokens_seen": 44726752, + "router_z_loss_mlp": 0.31494141, + "step": 541, + "time_per_iteration": 2.873481035232544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119735, + "balance_loss_mlp": 1.08738184, + "epoch": 0.1042708734128511, + "flos": 502652736000.0, + "grad_norm": 0.06973546911765124, + "language_loss": 0.94998306, + "learning_rate": 0.00098560823110091, + "loss": 0.96118045, + "num_input_tokens_seen": 44795824, + "router_z_loss_mlp": 0.32373047, + "step": 542, + "time_per_iteration": 2.661374807357788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118044, + "balance_loss_mlp": 1.08757377, + "epoch": 0.10446325509811466, + "flos": 485331153408.0, + "grad_norm": 0.0792045331206184, + "language_loss": 0.95517921, + "learning_rate": 0.000985533927891851, + "loss": 0.96635967, + "num_input_tokens_seen": 44868496, + "router_z_loss_mlp": 0.30419922, + "step": 543, + "time_per_iteration": 2.7264697551727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096256, + "balance_loss_mlp": 1.06502366, + "epoch": 0.10465563678337822, + "flos": 568365590016.0, + "grad_norm": 0.0919664039836503, + "language_loss": 0.93718112, + "learning_rate": 0.0009854594361818044, + "loss": 0.94814372, + "num_input_tokens_seen": 44939888, + "router_z_loss_mlp": 0.31201172, + "step": 544, + "time_per_iteration": 2.6869821548461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099422, + "balance_loss_mlp": 1.0683322, + "epoch": 0.10484801846864178, + "flos": 625806853632.0, + "grad_norm": 0.1054615502202609, + "language_loss": 0.927598, + "learning_rate": 0.0009853847559996897, + "loss": 0.9385922, + "num_input_tokens_seen": 45012720, + "router_z_loss_mlp": 0.31103516, + "step": 545, + "time_per_iteration": 2.7953526973724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100313, + "balance_loss_mlp": 1.06772113, + "epoch": 0.10504040015390535, + "flos": 743063874048.0, + "grad_norm": 0.0768702593450629, + "language_loss": 0.92008656, + "learning_rate": 0.0009853098873745, + "loss": 0.93108964, + "num_input_tokens_seen": 45093744, + "router_z_loss_mlp": 0.32592773, + "step": 546, + "time_per_iteration": 3.0344293117523193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106321, + "balance_loss_mlp": 1.07430172, + "epoch": 0.10523278183916891, + "flos": 586382616576.0, + "grad_norm": 0.072035501246702, + "language_loss": 0.90983582, + "learning_rate": 0.0009852348303353027, + "loss": 0.92089903, + "num_input_tokens_seen": 45172784, + "router_z_loss_mlp": 0.32006836, + "step": 547, + "time_per_iteration": 2.7647972106933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110403, + "balance_loss_mlp": 1.07100892, + "epoch": 0.10542516352443247, + "flos": 869270552064.0, + "grad_norm": 0.07817580313906373, + "language_loss": 0.84611928, + "learning_rate": 0.000985159584911237, + "loss": 0.85715961, + "num_input_tokens_seen": 45255600, + "router_z_loss_mlp": 0.33007812, + "step": 548, + "time_per_iteration": 3.143122434616089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104478, + "balance_loss_mlp": 1.07212472, + "epoch": 0.10561754520969603, + "flos": 505182970368.0, + "grad_norm": 0.08898596974063745, + "language_loss": 0.91126573, + "learning_rate": 0.0009850841511315162, + "loss": 0.92231047, + "num_input_tokens_seen": 45325072, + "router_z_loss_mlp": 0.32348633, + "step": 549, + "time_per_iteration": 2.6164846420288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112982, + "balance_loss_mlp": 1.07946038, + "epoch": 0.1058099268949596, + "flos": 559690947072.0, + "grad_norm": 0.06224197989448247, + "language_loss": 0.92054999, + "learning_rate": 0.0009850085290254256, + "loss": 0.93167984, + "num_input_tokens_seen": 45401440, + "router_z_loss_mlp": 0.33520508, + "step": 550, + "time_per_iteration": 2.7473480701446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110676, + "balance_loss_mlp": 1.07431078, + "epoch": 0.10600230858022316, + "flos": 561773048832.0, + "grad_norm": 0.05678957528127819, + "language_loss": 0.88957977, + "learning_rate": 0.0009849327186223246, + "loss": 0.90064728, + "num_input_tokens_seen": 45479264, + "router_z_loss_mlp": 0.32446289, + "step": 551, + "time_per_iteration": 2.805126905441284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094878, + "balance_loss_mlp": 1.06297779, + "epoch": 0.10619469026548672, + "flos": 494079989760.0, + "grad_norm": 0.07906939671673464, + "language_loss": 0.95596325, + "learning_rate": 0.000984856719951646, + "loss": 0.96691203, + "num_input_tokens_seen": 45547328, + "router_z_loss_mlp": 0.31860352, + "step": 552, + "time_per_iteration": 2.5688273906707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105536, + "balance_loss_mlp": 1.07370734, + "epoch": 0.10638707195075028, + "flos": 675843678720.0, + "grad_norm": 0.06469368191660979, + "language_loss": 0.93170857, + "learning_rate": 0.0009847805330428943, + "loss": 0.94276392, + "num_input_tokens_seen": 45631152, + "router_z_loss_mlp": 0.31811523, + "step": 553, + "time_per_iteration": 2.8858227729797363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105116, + "balance_loss_mlp": 1.07080746, + "epoch": 0.10657945363601386, + "flos": 487811925504.0, + "grad_norm": 0.07365688544553677, + "language_loss": 0.94454086, + "learning_rate": 0.0009847041579256481, + "loss": 0.95559192, + "num_input_tokens_seen": 45698208, + "router_z_loss_mlp": 0.34326172, + "step": 554, + "time_per_iteration": 2.5912039279937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114154, + "balance_loss_mlp": 1.08158636, + "epoch": 0.10677183532127742, + "flos": 482706376704.0, + "grad_norm": 0.06731486395760358, + "language_loss": 0.95310724, + "learning_rate": 0.0009846275946295592, + "loss": 0.96424878, + "num_input_tokens_seen": 45766640, + "router_z_loss_mlp": 0.32568359, + "step": 555, + "time_per_iteration": 2.6071619987487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120557, + "balance_loss_mlp": 1.08755958, + "epoch": 0.10696421700654098, + "flos": 655917668352.0, + "grad_norm": 0.06239681935918944, + "language_loss": 0.88169777, + "learning_rate": 0.0009845508431843518, + "loss": 0.89290333, + "num_input_tokens_seen": 45851408, + "router_z_loss_mlp": 0.32983398, + "step": 556, + "time_per_iteration": 2.9906973838806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122479, + "balance_loss_mlp": 1.08986306, + "epoch": 0.10715659869180454, + "flos": 567483881472.0, + "grad_norm": 0.06803394611182671, + "language_loss": 0.89010829, + "learning_rate": 0.0009844739036198233, + "loss": 0.90133309, + "num_input_tokens_seen": 45919824, + "router_z_loss_mlp": 0.32592773, + "step": 557, + "time_per_iteration": 2.6462793350219727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113246, + "balance_loss_mlp": 1.09927225, + "epoch": 0.10734898037706811, + "flos": 540432829440.0, + "grad_norm": 0.0683091886411484, + "language_loss": 0.96000761, + "learning_rate": 0.0009843967759658448, + "loss": 0.97133219, + "num_input_tokens_seen": 45991024, + "router_z_loss_mlp": 0.33203125, + "step": 558, + "time_per_iteration": 2.664320707321167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01369087, + "balance_loss_mlp": 1.3546865, + "epoch": 0.10754136206233167, + "flos": 1475870008320.0, + "grad_norm": 0.12144998025248735, + "language_loss": 0.72767758, + "learning_rate": 0.0009843194602523592, + "loss": 0.74136841, + "num_input_tokens_seen": 46212736, + "router_z_loss_mlp": 0.14355469, + "step": 559, + "time_per_iteration": 4.836310148239136 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124853, + "balance_loss_mlp": 1.0925231, + "epoch": 0.10773374374759523, + "flos": 512155243008.0, + "grad_norm": 0.06725764235558847, + "language_loss": 0.96045369, + "learning_rate": 0.000984241956509384, + "loss": 0.97170222, + "num_input_tokens_seen": 46283920, + "router_z_loss_mlp": 0.32324219, + "step": 560, + "time_per_iteration": 2.7409372329711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134795, + "balance_loss_mlp": 1.10005689, + "epoch": 0.10792612543285879, + "flos": 496261016064.0, + "grad_norm": 0.08502468521942065, + "language_loss": 0.91520619, + "learning_rate": 0.0009841642647670078, + "loss": 0.92655414, + "num_input_tokens_seen": 46349664, + "router_z_loss_mlp": 0.34741211, + "step": 561, + "time_per_iteration": 2.5360167026519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134435, + "balance_loss_mlp": 1.10050821, + "epoch": 0.10811850711812235, + "flos": 735131317248.0, + "grad_norm": 0.08550854990342285, + "language_loss": 0.86122006, + "learning_rate": 0.0009840863850553944, + "loss": 0.87256444, + "num_input_tokens_seen": 46432688, + "router_z_loss_mlp": 0.33911133, + "step": 562, + "time_per_iteration": 3.0013930797576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118751, + "balance_loss_mlp": 1.08604038, + "epoch": 0.10831088880338592, + "flos": 611257024512.0, + "grad_norm": 0.07414056330929218, + "language_loss": 0.92513216, + "learning_rate": 0.0009840083174047782, + "loss": 0.93631971, + "num_input_tokens_seen": 46507216, + "router_z_loss_mlp": 0.3269043, + "step": 563, + "time_per_iteration": 2.761746883392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125103, + "balance_loss_mlp": 1.09353685, + "epoch": 0.10850327048864948, + "flos": 556022928384.0, + "grad_norm": 0.06849160846851732, + "language_loss": 0.86520386, + "learning_rate": 0.0009839300618454685, + "loss": 0.87645483, + "num_input_tokens_seen": 46590464, + "router_z_loss_mlp": 0.31518555, + "step": 564, + "time_per_iteration": 2.833545684814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124691, + "balance_loss_mlp": 1.09291005, + "epoch": 0.10869565217391304, + "flos": 602902476288.0, + "grad_norm": 0.06688991061359367, + "language_loss": 0.92471159, + "learning_rate": 0.0009838516184078466, + "loss": 0.9359585, + "num_input_tokens_seen": 46666240, + "router_z_loss_mlp": 0.31762695, + "step": 565, + "time_per_iteration": 2.838482618331909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112559, + "balance_loss_mlp": 1.09345102, + "epoch": 0.1088880338591766, + "flos": 525922288128.0, + "grad_norm": 0.08266802783800845, + "language_loss": 0.89073956, + "learning_rate": 0.0009837729871223669, + "loss": 0.90199542, + "num_input_tokens_seen": 46734288, + "router_z_loss_mlp": 0.3215332, + "step": 566, + "time_per_iteration": 2.6670589447021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134729, + "balance_loss_mlp": 1.10073042, + "epoch": 0.10908041554444017, + "flos": 619986921984.0, + "grad_norm": 0.06816497946354988, + "language_loss": 0.89503658, + "learning_rate": 0.0009836941680195568, + "loss": 0.90638387, + "num_input_tokens_seen": 46809920, + "router_z_loss_mlp": 0.34033203, + "step": 567, + "time_per_iteration": 2.7894582748413086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131677, + "balance_loss_mlp": 1.09691525, + "epoch": 0.10927279722970373, + "flos": 897740195328.0, + "grad_norm": 0.07371226629870802, + "language_loss": 0.8534497, + "learning_rate": 0.0009836151611300166, + "loss": 0.86476642, + "num_input_tokens_seen": 46889984, + "router_z_loss_mlp": 0.34765625, + "step": 568, + "time_per_iteration": 3.204500913619995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116051, + "balance_loss_mlp": 1.08467555, + "epoch": 0.10946517891496729, + "flos": 528408852480.0, + "grad_norm": 0.061952855977424344, + "language_loss": 0.96103537, + "learning_rate": 0.0009835359664844194, + "loss": 0.97219586, + "num_input_tokens_seen": 46959536, + "router_z_loss_mlp": 0.3137207, + "step": 569, + "time_per_iteration": 2.6154720783233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124163, + "balance_loss_mlp": 1.11014414, + "epoch": 0.10965756060023085, + "flos": 1559944714752.0, + "grad_norm": 0.03358522647050957, + "language_loss": 0.81036806, + "learning_rate": 0.0009834565841135114, + "loss": 0.82160974, + "num_input_tokens_seen": 47196960, + "router_z_loss_mlp": 0.140625, + "step": 570, + "time_per_iteration": 4.907090187072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112487, + "balance_loss_mlp": 1.09406638, + "epoch": 0.10984994228549443, + "flos": 512820163584.0, + "grad_norm": 0.08674533322611513, + "language_loss": 0.9339065, + "learning_rate": 0.0009833770140481118, + "loss": 0.9451552, + "num_input_tokens_seen": 47266560, + "router_z_loss_mlp": 0.30786133, + "step": 571, + "time_per_iteration": 2.694821357727051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121358, + "balance_loss_mlp": 1.09072113, + "epoch": 0.11004232397075799, + "flos": 954314307072.0, + "grad_norm": 0.07582699316256973, + "language_loss": 0.84126109, + "learning_rate": 0.000983297256319112, + "loss": 0.85247469, + "num_input_tokens_seen": 47348512, + "router_z_loss_mlp": 0.30664062, + "step": 572, + "time_per_iteration": 3.208728313446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144326, + "balance_loss_mlp": 1.11097169, + "epoch": 0.11023470565602154, + "flos": 487921024512.0, + "grad_norm": 0.07530566153242002, + "language_loss": 0.8789041, + "learning_rate": 0.000983217310957477, + "loss": 0.89034736, + "num_input_tokens_seen": 47425392, + "router_z_loss_mlp": 0.33349609, + "step": 573, + "time_per_iteration": 2.7521331310272217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113474, + "balance_loss_mlp": 1.1014812, + "epoch": 0.1104270873412851, + "flos": 655521970176.0, + "grad_norm": 0.08427122985019045, + "language_loss": 0.91161472, + "learning_rate": 0.000983137177994244, + "loss": 0.92296207, + "num_input_tokens_seen": 47502336, + "router_z_loss_mlp": 0.33300781, + "step": 574, + "time_per_iteration": 2.869795083999634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105984, + "balance_loss_mlp": 1.0752039, + "epoch": 0.11061946902654868, + "flos": 723097165824.0, + "grad_norm": 0.0803000190442887, + "language_loss": 0.87202144, + "learning_rate": 0.0009830568574605235, + "loss": 0.88308132, + "num_input_tokens_seen": 47583552, + "router_z_loss_mlp": 0.30737305, + "step": 575, + "time_per_iteration": 2.952505111694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111674, + "balance_loss_mlp": 1.07963109, + "epoch": 0.11081185071181224, + "flos": 835113397248.0, + "grad_norm": 0.07764025760375837, + "language_loss": 0.89234924, + "learning_rate": 0.0009829763493874992, + "loss": 0.90346599, + "num_input_tokens_seen": 47663440, + "router_z_loss_mlp": 0.3203125, + "step": 576, + "time_per_iteration": 3.0367727279663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110641, + "balance_loss_mlp": 1.07508206, + "epoch": 0.1110042323970758, + "flos": 608776252416.0, + "grad_norm": 0.06795308301133055, + "language_loss": 0.94366598, + "learning_rate": 0.0009828956538064264, + "loss": 0.95473009, + "num_input_tokens_seen": 47741920, + "router_z_loss_mlp": 0.31347656, + "step": 577, + "time_per_iteration": 2.783268928527832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091394, + "balance_loss_mlp": 1.0610671, + "epoch": 0.11119661408233936, + "flos": 595643604480.0, + "grad_norm": 0.0662915232098912, + "language_loss": 0.9183138, + "learning_rate": 0.0009828147707486344, + "loss": 0.92922771, + "num_input_tokens_seen": 47815136, + "router_z_loss_mlp": 0.30297852, + "step": 578, + "time_per_iteration": 2.6628670692443848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092993, + "balance_loss_mlp": 1.06109214, + "epoch": 0.11138899576760293, + "flos": 555573385728.0, + "grad_norm": 0.07355059798421615, + "language_loss": 0.87444091, + "learning_rate": 0.0009827337002455245, + "loss": 0.88537085, + "num_input_tokens_seen": 47881360, + "router_z_loss_mlp": 0.31884766, + "step": 579, + "time_per_iteration": 2.616842031478882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087398, + "balance_loss_mlp": 1.05857313, + "epoch": 0.11158137745286649, + "flos": 689418667008.0, + "grad_norm": 0.05531737995895799, + "language_loss": 0.89474124, + "learning_rate": 0.0009826524423285712, + "loss": 0.90561521, + "num_input_tokens_seen": 47962720, + "router_z_loss_mlp": 0.28808594, + "step": 580, + "time_per_iteration": 2.896409749984741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093471, + "balance_loss_mlp": 1.06393051, + "epoch": 0.11177375913813005, + "flos": 762688728576.0, + "grad_norm": 0.06807232662928764, + "language_loss": 0.9046967, + "learning_rate": 0.0009825709970293218, + "loss": 0.91563141, + "num_input_tokens_seen": 48035472, + "router_z_loss_mlp": 0.2956543, + "step": 581, + "time_per_iteration": 2.8843319416046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096208, + "balance_loss_mlp": 1.0669775, + "epoch": 0.11196614082339361, + "flos": 806211588096.0, + "grad_norm": 0.07053725402235117, + "language_loss": 0.96166003, + "learning_rate": 0.0009824893643793956, + "loss": 0.9726221, + "num_input_tokens_seen": 48116944, + "router_z_loss_mlp": 0.29248047, + "step": 582, + "time_per_iteration": 3.04577898979187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104715, + "balance_loss_mlp": 1.07305288, + "epoch": 0.11215852250865718, + "flos": 558350931456.0, + "grad_norm": 0.10752491555358674, + "language_loss": 0.89033759, + "learning_rate": 0.0009824075444104857, + "loss": 0.90138471, + "num_input_tokens_seen": 48187808, + "router_z_loss_mlp": 0.31689453, + "step": 583, + "time_per_iteration": 2.682020902633667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125233, + "balance_loss_mlp": 1.09497714, + "epoch": 0.11235090419392074, + "flos": 513322140672.0, + "grad_norm": 0.06606619546840543, + "language_loss": 0.94941097, + "learning_rate": 0.000982325537154357, + "loss": 0.9606632, + "num_input_tokens_seen": 48254464, + "router_z_loss_mlp": 0.30224609, + "step": 584, + "time_per_iteration": 2.577632427215576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122311, + "balance_loss_mlp": 1.09045827, + "epoch": 0.1125432858791843, + "flos": 491209311744.0, + "grad_norm": 0.07452844115700766, + "language_loss": 0.95190644, + "learning_rate": 0.0009822433426428484, + "loss": 0.96312958, + "num_input_tokens_seen": 48318784, + "router_z_loss_mlp": 0.31860352, + "step": 585, + "time_per_iteration": 2.560591220855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126565, + "balance_loss_mlp": 1.09280539, + "epoch": 0.11273566756444786, + "flos": 510476193792.0, + "grad_norm": 0.11434848401200806, + "language_loss": 0.87964213, + "learning_rate": 0.0009821609609078697, + "loss": 0.89090776, + "num_input_tokens_seen": 48389248, + "router_z_loss_mlp": 0.3371582, + "step": 586, + "time_per_iteration": 2.633925437927246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118464, + "balance_loss_mlp": 1.08785152, + "epoch": 0.11292804924971142, + "flos": 622149009408.0, + "grad_norm": 0.08000190427267627, + "language_loss": 0.905334, + "learning_rate": 0.0009820783919814045, + "loss": 0.91651857, + "num_input_tokens_seen": 48463312, + "router_z_loss_mlp": 0.3059082, + "step": 587, + "time_per_iteration": 2.806704044342041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111289, + "balance_loss_mlp": 1.07857847, + "epoch": 0.113120430934975, + "flos": 477811823616.0, + "grad_norm": 0.09357252991594707, + "language_loss": 0.83955467, + "learning_rate": 0.0009819956358955095, + "loss": 0.8506676, + "num_input_tokens_seen": 48531856, + "router_z_loss_mlp": 0.32714844, + "step": 588, + "time_per_iteration": 2.5903711318969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109097, + "balance_loss_mlp": 1.07455039, + "epoch": 0.11331281262023855, + "flos": 466801975296.0, + "grad_norm": 0.06610764616840299, + "language_loss": 0.85348701, + "learning_rate": 0.0009819126926823127, + "loss": 0.86457801, + "num_input_tokens_seen": 48596640, + "router_z_loss_mlp": 0.34570312, + "step": 589, + "time_per_iteration": 2.5726494789123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108183, + "balance_loss_mlp": 1.07535291, + "epoch": 0.11350519430550211, + "flos": 650164727808.0, + "grad_norm": 0.06035980490561805, + "language_loss": 0.87806922, + "learning_rate": 0.000981829562374016, + "loss": 0.8891511, + "num_input_tokens_seen": 48669648, + "router_z_loss_mlp": 0.328125, + "step": 590, + "time_per_iteration": 2.7960643768310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112987, + "balance_loss_mlp": 1.08041859, + "epoch": 0.11369757599076567, + "flos": 557547798528.0, + "grad_norm": 0.08830164474684658, + "language_loss": 0.98550045, + "learning_rate": 0.0009817462450028933, + "loss": 0.99663031, + "num_input_tokens_seen": 48737392, + "router_z_loss_mlp": 0.32568359, + "step": 591, + "time_per_iteration": 2.654860734939575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107596, + "balance_loss_mlp": 1.07526684, + "epoch": 0.11388995767602925, + "flos": 570774988800.0, + "grad_norm": 0.06245390963608315, + "language_loss": 0.86587834, + "learning_rate": 0.0009816627406012916, + "loss": 0.87695432, + "num_input_tokens_seen": 48817136, + "router_z_loss_mlp": 0.32348633, + "step": 592, + "time_per_iteration": 2.8017733097076416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101808, + "balance_loss_mlp": 1.07074225, + "epoch": 0.1140823393612928, + "flos": 740069540352.0, + "grad_norm": 0.06581053360364857, + "language_loss": 0.8595314, + "learning_rate": 0.0009815790492016295, + "loss": 0.87054944, + "num_input_tokens_seen": 48895808, + "router_z_loss_mlp": 0.31030273, + "step": 593, + "time_per_iteration": 2.9602174758911133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097875, + "balance_loss_mlp": 1.06666636, + "epoch": 0.11427472104655637, + "flos": 698694211584.0, + "grad_norm": 0.07124053574400792, + "language_loss": 0.87982339, + "learning_rate": 0.0009814951708363993, + "loss": 0.89080215, + "num_input_tokens_seen": 48967456, + "router_z_loss_mlp": 0.31201172, + "step": 594, + "time_per_iteration": 2.818460702896118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167391, + "balance_loss_mlp": 1.15413451, + "epoch": 0.11446710273181993, + "flos": 1476387952128.0, + "grad_norm": 0.04038129773095179, + "language_loss": 0.77990985, + "learning_rate": 0.0009814111055381654, + "loss": 0.79158378, + "num_input_tokens_seen": 49193152, + "router_z_loss_mlp": 0.1328125, + "step": 595, + "time_per_iteration": 4.776912450790405 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110386, + "balance_loss_mlp": 1.07250798, + "epoch": 0.1146594844170835, + "flos": 494641603584.0, + "grad_norm": 0.1404346857169784, + "language_loss": 0.89489102, + "learning_rate": 0.0009813268533395648, + "loss": 0.90592968, + "num_input_tokens_seen": 49260960, + "router_z_loss_mlp": 0.3137207, + "step": 596, + "time_per_iteration": 2.562816858291626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115324, + "balance_loss_mlp": 1.08344746, + "epoch": 0.11485186610234706, + "flos": 474596319744.0, + "grad_norm": 0.07456374098915484, + "language_loss": 0.89145029, + "learning_rate": 0.0009812424142733073, + "loss": 0.90260351, + "num_input_tokens_seen": 49327616, + "router_z_loss_mlp": 0.31884766, + "step": 597, + "time_per_iteration": 2.5198655128479004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123971, + "balance_loss_mlp": 1.0946219, + "epoch": 0.11504424778761062, + "flos": 730858014720.0, + "grad_norm": 0.05033183127205697, + "language_loss": 0.86898923, + "learning_rate": 0.000981157788372175, + "loss": 0.88022888, + "num_input_tokens_seen": 49412864, + "router_z_loss_mlp": 0.29345703, + "step": 598, + "time_per_iteration": 3.004558563232422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155063, + "balance_loss_mlp": 1.12290049, + "epoch": 0.11523662947287418, + "flos": 545539788288.0, + "grad_norm": 0.07554757352201513, + "language_loss": 0.90216064, + "learning_rate": 0.0009810729756690223, + "loss": 0.91371131, + "num_input_tokens_seen": 49483584, + "router_z_loss_mlp": 0.3215332, + "step": 599, + "time_per_iteration": 2.7165520191192627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149643, + "balance_loss_mlp": 1.11790919, + "epoch": 0.11542901115813775, + "flos": 774737436672.0, + "grad_norm": 0.08801397326806587, + "language_loss": 0.92855275, + "learning_rate": 0.0009809879761967766, + "loss": 0.94004917, + "num_input_tokens_seen": 49563568, + "router_z_loss_mlp": 0.31738281, + "step": 600, + "time_per_iteration": 2.9548492431640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115619, + "balance_loss_mlp": 1.12004542, + "epoch": 0.11562139284340131, + "flos": 730585972224.0, + "grad_norm": 0.08285308963026158, + "language_loss": 0.87716347, + "learning_rate": 0.0009809027899884378, + "loss": 0.8887254, + "num_input_tokens_seen": 49640800, + "router_z_loss_mlp": 0.36157227, + "step": 601, + "time_per_iteration": 2.9107346534729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131924, + "balance_loss_mlp": 1.10085821, + "epoch": 0.11581377452866487, + "flos": 535589148672.0, + "grad_norm": 0.07059046613839054, + "language_loss": 0.89834028, + "learning_rate": 0.0009808174170770779, + "loss": 0.90965956, + "num_input_tokens_seen": 49721872, + "router_z_loss_mlp": 0.31079102, + "step": 602, + "time_per_iteration": 2.79127836227417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217718, + "balance_loss_mlp": 1.20541608, + "epoch": 0.11600615621392843, + "flos": 1554968613888.0, + "grad_norm": 0.07528653751738872, + "language_loss": 0.84898245, + "learning_rate": 0.0009807318574958418, + "loss": 0.86115962, + "num_input_tokens_seen": 49951472, + "router_z_loss_mlp": 0.12304688, + "step": 603, + "time_per_iteration": 4.862261772155762 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103115, + "balance_loss_mlp": 1.07238269, + "epoch": 0.116198537899192, + "flos": 537178037760.0, + "grad_norm": 0.08106568577848162, + "language_loss": 0.94434869, + "learning_rate": 0.0009806461112779462, + "loss": 0.95537978, + "num_input_tokens_seen": 50021136, + "router_z_loss_mlp": 0.30737305, + "step": 604, + "time_per_iteration": 2.600008249282837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097342, + "balance_loss_mlp": 1.06427336, + "epoch": 0.11639091958445556, + "flos": 453970483200.0, + "grad_norm": 0.09761910402267754, + "language_loss": 0.89590895, + "learning_rate": 0.0009805601784566814, + "loss": 0.90688241, + "num_input_tokens_seen": 50083888, + "router_z_loss_mlp": 0.33056641, + "step": 605, + "time_per_iteration": 2.4687013626098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097807, + "balance_loss_mlp": 1.06635928, + "epoch": 0.11658330126971912, + "flos": 554815332864.0, + "grad_norm": 0.0628453025897625, + "language_loss": 0.96235836, + "learning_rate": 0.0009804740590654089, + "loss": 0.97333646, + "num_input_tokens_seen": 50151744, + "router_z_loss_mlp": 0.31469727, + "step": 606, + "time_per_iteration": 2.654134750366211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109435, + "balance_loss_mlp": 1.0789417, + "epoch": 0.11677568295498268, + "flos": 716025968640.0, + "grad_norm": 0.07837472156111998, + "language_loss": 0.90884066, + "learning_rate": 0.0009803877531375635, + "loss": 0.91993499, + "num_input_tokens_seen": 50221248, + "router_z_loss_mlp": 0.30493164, + "step": 607, + "time_per_iteration": 2.825778007507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112529, + "balance_loss_mlp": 1.08074808, + "epoch": 0.11696806464024626, + "flos": 609474668544.0, + "grad_norm": 0.07263848878870109, + "language_loss": 0.91923869, + "learning_rate": 0.0009803012607066523, + "loss": 0.93036401, + "num_input_tokens_seen": 50293792, + "router_z_loss_mlp": 0.31787109, + "step": 608, + "time_per_iteration": 2.721005916595459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101062, + "balance_loss_mlp": 1.06980491, + "epoch": 0.11716044632550981, + "flos": 520127087616.0, + "grad_norm": 0.06980646294906427, + "language_loss": 0.9077962, + "learning_rate": 0.0009802145818062543, + "loss": 0.91880679, + "num_input_tokens_seen": 50367760, + "router_z_loss_mlp": 0.31225586, + "step": 609, + "time_per_iteration": 2.707643985748291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102401, + "balance_loss_mlp": 1.07035792, + "epoch": 0.11735282801077337, + "flos": 507246133248.0, + "grad_norm": 0.07162886221417876, + "language_loss": 0.9293434, + "learning_rate": 0.0009801277164700212, + "loss": 0.9403674, + "num_input_tokens_seen": 50435664, + "router_z_loss_mlp": 0.3203125, + "step": 610, + "time_per_iteration": 2.6389639377593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094537, + "balance_loss_mlp": 1.06323278, + "epoch": 0.11754520969603693, + "flos": 686339965440.0, + "grad_norm": 0.07220465483683103, + "language_loss": 0.90727574, + "learning_rate": 0.0009800406647316776, + "loss": 0.91822106, + "num_input_tokens_seen": 50514144, + "router_z_loss_mlp": 0.31274414, + "step": 611, + "time_per_iteration": 2.8033382892608643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066854, + "balance_loss_mlp": 1.05369329, + "epoch": 0.1177375913813005, + "flos": 1541673022464.0, + "grad_norm": 0.030783707978337852, + "language_loss": 0.76914459, + "learning_rate": 0.0009799534266250196, + "loss": 0.77981311, + "num_input_tokens_seen": 50738448, + "router_z_loss_mlp": 0.13183594, + "step": 612, + "time_per_iteration": 4.777275562286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116404, + "balance_loss_mlp": 1.08307314, + "epoch": 0.11792997306656407, + "flos": 520269682176.0, + "grad_norm": 0.07589987368124408, + "language_loss": 0.8961159, + "learning_rate": 0.000979866002183916, + "loss": 0.90727997, + "num_input_tokens_seen": 50809328, + "router_z_loss_mlp": 0.33325195, + "step": 613, + "time_per_iteration": 2.6848883628845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109453, + "balance_loss_mlp": 1.07719529, + "epoch": 0.11812235475182763, + "flos": 665980379136.0, + "grad_norm": 0.08667718058784188, + "language_loss": 0.91197205, + "learning_rate": 0.0009797783914423082, + "loss": 0.92306662, + "num_input_tokens_seen": 50887728, + "router_z_loss_mlp": 0.32250977, + "step": 614, + "time_per_iteration": 2.832414388656616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105189, + "balance_loss_mlp": 1.07140493, + "epoch": 0.11831473643709119, + "flos": 621021399552.0, + "grad_norm": 0.06050640051516142, + "language_loss": 0.85425436, + "learning_rate": 0.0009796905944342094, + "loss": 0.86530626, + "num_input_tokens_seen": 50966160, + "router_z_loss_mlp": 0.33813477, + "step": 615, + "time_per_iteration": 2.8220455646514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112849, + "balance_loss_mlp": 1.07913685, + "epoch": 0.11850711812235475, + "flos": 456438108672.0, + "grad_norm": 0.0714748534502384, + "language_loss": 0.893188, + "learning_rate": 0.0009796026111937057, + "loss": 0.90431643, + "num_input_tokens_seen": 51035712, + "router_z_loss_mlp": 0.3371582, + "step": 616, + "time_per_iteration": 2.590566873550415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102524, + "balance_loss_mlp": 1.07005119, + "epoch": 0.11869949980761832, + "flos": 513598565376.0, + "grad_norm": 0.06492309219220607, + "language_loss": 0.89778733, + "learning_rate": 0.0009795144417549552, + "loss": 0.90881252, + "num_input_tokens_seen": 51108656, + "router_z_loss_mlp": 0.32470703, + "step": 617, + "time_per_iteration": 2.672914505004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109626, + "balance_loss_mlp": 1.0773685, + "epoch": 0.11889188149288188, + "flos": 534732171264.0, + "grad_norm": 0.057544425945024125, + "language_loss": 0.90660846, + "learning_rate": 0.0009794260861521883, + "loss": 0.9177047, + "num_input_tokens_seen": 51185552, + "router_z_loss_mlp": 0.32250977, + "step": 618, + "time_per_iteration": 2.817354202270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102516, + "balance_loss_mlp": 1.07009149, + "epoch": 0.11908426317814544, + "flos": 498344527872.0, + "grad_norm": 0.0773697745436404, + "language_loss": 0.87738883, + "learning_rate": 0.0009793375444197075, + "loss": 0.88841403, + "num_input_tokens_seen": 51255808, + "router_z_loss_mlp": 0.32397461, + "step": 619, + "time_per_iteration": 2.607475996017456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011109, + "balance_loss_mlp": 1.07697332, + "epoch": 0.119276644863409, + "flos": 659598833664.0, + "grad_norm": 0.06767977381214116, + "language_loss": 0.86337721, + "learning_rate": 0.000979248816591888, + "loss": 0.87448615, + "num_input_tokens_seen": 51329408, + "router_z_loss_mlp": 0.33935547, + "step": 620, + "time_per_iteration": 2.758866548538208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098794, + "balance_loss_mlp": 1.06667948, + "epoch": 0.11946902654867257, + "flos": 758396487168.0, + "grad_norm": 0.06819106164994826, + "language_loss": 0.87032986, + "learning_rate": 0.0009791599027031766, + "loss": 0.88131785, + "num_input_tokens_seen": 51408784, + "router_z_loss_mlp": 0.32128906, + "step": 621, + "time_per_iteration": 3.029431104660034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088156, + "balance_loss_mlp": 1.05611241, + "epoch": 0.11966140823393613, + "flos": 680697533952.0, + "grad_norm": 0.0732554324646167, + "language_loss": 0.87112588, + "learning_rate": 0.0009790708027880932, + "loss": 0.88200748, + "num_input_tokens_seen": 51482592, + "router_z_loss_mlp": 0.32055664, + "step": 622, + "time_per_iteration": 2.855576992034912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056461, + "balance_loss_mlp": 1.04444504, + "epoch": 0.11985378991919969, + "flos": 1450268070912.0, + "grad_norm": 0.03732324883573809, + "language_loss": 0.77427292, + "learning_rate": 0.0009789815168812293, + "loss": 0.78483754, + "num_input_tokens_seen": 51712240, + "router_z_loss_mlp": 0.12011719, + "step": 623, + "time_per_iteration": 4.840993165969849 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108671, + "balance_loss_mlp": 1.0551914, + "epoch": 0.12004617160446325, + "flos": 527586780672.0, + "grad_norm": 0.07309096746678648, + "language_loss": 0.94236648, + "learning_rate": 0.0009788920450172487, + "loss": 0.9532336, + "num_input_tokens_seen": 51781440, + "router_z_loss_mlp": 0.31518555, + "step": 624, + "time_per_iteration": 2.6301677227020264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102663, + "balance_loss_mlp": 1.07023823, + "epoch": 0.12023855328972682, + "flos": 473980861440.0, + "grad_norm": 0.15739190650861204, + "language_loss": 0.91515559, + "learning_rate": 0.0009788023872308875, + "loss": 0.92618221, + "num_input_tokens_seen": 51845424, + "router_z_loss_mlp": 0.32421875, + "step": 625, + "time_per_iteration": 2.506446361541748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014454, + "balance_loss_mlp": 1.0033915, + "epoch": 0.12043093497499038, + "flos": 1530954155520.0, + "grad_norm": 0.02216054665264375, + "language_loss": 0.75428998, + "learning_rate": 0.0009787125435569539, + "loss": 0.76443458, + "num_input_tokens_seen": 52076496, + "router_z_loss_mlp": 0.11083984, + "step": 626, + "time_per_iteration": 4.713289260864258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114644, + "balance_loss_mlp": 1.08391225, + "epoch": 0.12062331666025394, + "flos": 539571469824.0, + "grad_norm": 0.0672242080300053, + "language_loss": 0.94766486, + "learning_rate": 0.0009786225140303285, + "loss": 0.95881128, + "num_input_tokens_seen": 52143072, + "router_z_loss_mlp": 0.30761719, + "step": 627, + "time_per_iteration": 2.61875057220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011267, + "balance_loss_mlp": 1.09503818, + "epoch": 0.1208156983455175, + "flos": 511634327040.0, + "grad_norm": 0.06510849521455, + "language_loss": 0.925771, + "learning_rate": 0.0009785322986859634, + "loss": 0.93703806, + "num_input_tokens_seen": 52211888, + "router_z_loss_mlp": 0.31640625, + "step": 628, + "time_per_iteration": 2.6567625999450684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141777, + "balance_loss_mlp": 1.11059177, + "epoch": 0.12100808003078108, + "flos": 596195043840.0, + "grad_norm": 0.06735600063735754, + "language_loss": 0.93719506, + "learning_rate": 0.0009784418975588838, + "loss": 0.94861281, + "num_input_tokens_seen": 52283696, + "router_z_loss_mlp": 0.31152344, + "step": 629, + "time_per_iteration": 2.697376012802124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122983, + "balance_loss_mlp": 1.09222674, + "epoch": 0.12120046171604464, + "flos": 522698019840.0, + "grad_norm": 0.47103484407124013, + "language_loss": 0.93927598, + "learning_rate": 0.0009783513106841862, + "loss": 0.95050573, + "num_input_tokens_seen": 52358624, + "router_z_loss_mlp": 0.30761719, + "step": 630, + "time_per_iteration": 2.7226808071136475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143332, + "balance_loss_mlp": 1.13179243, + "epoch": 0.1213928434013082, + "flos": 1553605277184.0, + "grad_norm": 0.056788624646596834, + "language_loss": 0.76732707, + "learning_rate": 0.00097826053809704, + "loss": 0.77876031, + "num_input_tokens_seen": 52591248, + "router_z_loss_mlp": 0.11523438, + "step": 631, + "time_per_iteration": 4.948111295700073 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228128, + "balance_loss_mlp": 1.19219875, + "epoch": 0.12158522508657175, + "flos": 495143580672.0, + "grad_norm": 0.06834333100250278, + "language_loss": 0.88515621, + "learning_rate": 0.0009781695798326854, + "loss": 0.89743745, + "num_input_tokens_seen": 52659920, + "router_z_loss_mlp": 0.35961914, + "step": 632, + "time_per_iteration": 2.5616555213928223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267845, + "balance_loss_mlp": 1.23050833, + "epoch": 0.12177760677183531, + "flos": 475335433728.0, + "grad_norm": 0.1009303482431908, + "language_loss": 0.88543177, + "learning_rate": 0.0009780784359264365, + "loss": 0.89811015, + "num_input_tokens_seen": 52728832, + "router_z_loss_mlp": 0.37329102, + "step": 633, + "time_per_iteration": 2.597935438156128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01265484, + "balance_loss_mlp": 1.25370574, + "epoch": 0.12196998845709889, + "flos": 1467630351360.0, + "grad_norm": 0.08843071113371018, + "language_loss": 0.74188697, + "learning_rate": 0.0009779871064136778, + "loss": 0.75454181, + "num_input_tokens_seen": 52949776, + "router_z_loss_mlp": 0.11767578, + "step": 634, + "time_per_iteration": 4.768415451049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235432, + "balance_loss_mlp": 1.19976473, + "epoch": 0.12216237014236245, + "flos": 586279309824.0, + "grad_norm": 0.0829698455775257, + "language_loss": 0.88074899, + "learning_rate": 0.000977895591329867, + "loss": 0.89310336, + "num_input_tokens_seen": 53027184, + "router_z_loss_mlp": 0.35668945, + "step": 635, + "time_per_iteration": 2.7918457984924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214994, + "balance_loss_mlp": 1.17720437, + "epoch": 0.12235475182762601, + "flos": 597721324032.0, + "grad_norm": 0.0916527997361875, + "language_loss": 0.87791145, + "learning_rate": 0.000977803890710533, + "loss": 0.89006138, + "num_input_tokens_seen": 53101072, + "router_z_loss_mlp": 0.37792969, + "step": 636, + "time_per_iteration": 2.7248313426971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186705, + "balance_loss_mlp": 1.1509428, + "epoch": 0.12254713351288957, + "flos": 497487550464.0, + "grad_norm": 0.0702522126388857, + "language_loss": 0.93856937, + "learning_rate": 0.0009777120045912774, + "loss": 0.95043641, + "num_input_tokens_seen": 53172992, + "router_z_loss_mlp": 0.35766602, + "step": 637, + "time_per_iteration": 2.6079726219177246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180704, + "balance_loss_mlp": 1.14236617, + "epoch": 0.12273951519815314, + "flos": 605565130752.0, + "grad_norm": 0.06645311005239844, + "language_loss": 0.90599251, + "learning_rate": 0.0009776199330077736, + "loss": 0.91779959, + "num_input_tokens_seen": 53248256, + "router_z_loss_mlp": 0.38330078, + "step": 638, + "time_per_iteration": 2.7671282291412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196025, + "balance_loss_mlp": 1.15940344, + "epoch": 0.1229318968834167, + "flos": 597578729472.0, + "grad_norm": 0.09015200479441979, + "language_loss": 0.93140519, + "learning_rate": 0.0009775276759957667, + "loss": 0.94336545, + "num_input_tokens_seen": 53318960, + "router_z_loss_mlp": 0.36621094, + "step": 639, + "time_per_iteration": 2.6990442276000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179898, + "balance_loss_mlp": 1.14265716, + "epoch": 0.12312427856868026, + "flos": 678082931712.0, + "grad_norm": 0.08188642922116089, + "language_loss": 0.90714514, + "learning_rate": 0.0009774352335910745, + "loss": 0.91894412, + "num_input_tokens_seen": 53389120, + "router_z_loss_mlp": 0.37280273, + "step": 640, + "time_per_iteration": 2.7950265407562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115004, + "balance_loss_mlp": 1.11658967, + "epoch": 0.12331666025394382, + "flos": 608656978944.0, + "grad_norm": 0.07361380744806716, + "language_loss": 0.95549798, + "learning_rate": 0.000977342605829586, + "loss": 0.96699834, + "num_input_tokens_seen": 53459056, + "router_z_loss_mlp": 0.3347168, + "step": 641, + "time_per_iteration": 2.6966538429260254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140018, + "balance_loss_mlp": 1.10497069, + "epoch": 0.12350904193920739, + "flos": 762172194816.0, + "grad_norm": 0.08211004604029591, + "language_loss": 0.86708105, + "learning_rate": 0.0009772497927472623, + "loss": 0.87848121, + "num_input_tokens_seen": 53541552, + "router_z_loss_mlp": 0.35083008, + "step": 642, + "time_per_iteration": 3.050595998764038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121507, + "balance_loss_mlp": 1.0852437, + "epoch": 0.12370142362447095, + "flos": 540699079680.0, + "grad_norm": 0.0716743258864478, + "language_loss": 0.85363436, + "learning_rate": 0.0009771567943801368, + "loss": 0.86484945, + "num_input_tokens_seen": 53611520, + "router_z_loss_mlp": 0.36254883, + "step": 643, + "time_per_iteration": 2.627019166946411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112067, + "balance_loss_mlp": 1.07744884, + "epoch": 0.12389380530973451, + "flos": 547848852480.0, + "grad_norm": 0.06992166814052157, + "language_loss": 0.89936745, + "learning_rate": 0.0009770636107643152, + "loss": 0.91048813, + "num_input_tokens_seen": 53683888, + "router_z_loss_mlp": 0.34643555, + "step": 644, + "time_per_iteration": 2.696233034133911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102963, + "balance_loss_mlp": 1.06846356, + "epoch": 0.12408618699499807, + "flos": 540048715776.0, + "grad_norm": 0.06268128655507912, + "language_loss": 0.88181639, + "learning_rate": 0.0009769702419359738, + "loss": 0.89284605, + "num_input_tokens_seen": 53751888, + "router_z_loss_mlp": 0.3449707, + "step": 645, + "time_per_iteration": 2.61401104927063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116268, + "balance_loss_mlp": 1.0810535, + "epoch": 0.12427856868026164, + "flos": 745451513856.0, + "grad_norm": 0.07610574883038115, + "language_loss": 0.89730537, + "learning_rate": 0.000976876687931362, + "loss": 0.90846807, + "num_input_tokens_seen": 53827648, + "router_z_loss_mlp": 0.35229492, + "step": 646, + "time_per_iteration": 2.999408721923828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131315, + "balance_loss_mlp": 1.09622002, + "epoch": 0.1244709503655252, + "flos": 533460556800.0, + "grad_norm": 0.19449531308307466, + "language_loss": 0.85410094, + "learning_rate": 0.0009767829487868005, + "loss": 0.86541414, + "num_input_tokens_seen": 53896400, + "router_z_loss_mlp": 0.35107422, + "step": 647, + "time_per_iteration": 2.617666721343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138117, + "balance_loss_mlp": 1.10159075, + "epoch": 0.12466333205078876, + "flos": 507847034880.0, + "grad_norm": 0.07509451505155453, + "language_loss": 0.89358151, + "learning_rate": 0.000976689024538682, + "loss": 0.90496266, + "num_input_tokens_seen": 53965904, + "router_z_loss_mlp": 0.36499023, + "step": 648, + "time_per_iteration": 2.5929009914398193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138062, + "balance_loss_mlp": 1.10110736, + "epoch": 0.12485571373605232, + "flos": 681023420928.0, + "grad_norm": 0.07057439208121223, + "language_loss": 0.87662494, + "learning_rate": 0.0009765949152234716, + "loss": 0.8880055, + "num_input_tokens_seen": 54049792, + "router_z_loss_mlp": 0.36962891, + "step": 649, + "time_per_iteration": 2.874701976776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147728, + "balance_loss_mlp": 1.13504386, + "epoch": 0.1250480954213159, + "flos": 1329402668544.0, + "grad_norm": 0.04527818124304351, + "language_loss": 0.78686082, + "learning_rate": 0.0009765006208777055, + "loss": 0.79833812, + "num_input_tokens_seen": 54262432, + "router_z_loss_mlp": 0.12695312, + "step": 650, + "time_per_iteration": 4.680933713912964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138039, + "balance_loss_mlp": 1.10287213, + "epoch": 0.12524047710657946, + "flos": 938140683264.0, + "grad_norm": 0.08375968037938068, + "language_loss": 0.82443976, + "learning_rate": 0.0009764061415379919, + "loss": 0.83582014, + "num_input_tokens_seen": 54351568, + "router_z_loss_mlp": 0.35205078, + "step": 651, + "time_per_iteration": 3.2550604343414307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135369, + "balance_loss_mlp": 1.09774697, + "epoch": 0.12543285879184302, + "flos": 513642235392.0, + "grad_norm": 0.07146085627000143, + "language_loss": 0.89363486, + "learning_rate": 0.0009763114772410109, + "loss": 0.90498853, + "num_input_tokens_seen": 54418944, + "router_z_loss_mlp": 0.3762207, + "step": 652, + "time_per_iteration": 2.5937142372131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139745, + "balance_loss_mlp": 1.10419679, + "epoch": 0.12562524047710658, + "flos": 717991617024.0, + "grad_norm": 0.07913079577836896, + "language_loss": 0.87230957, + "learning_rate": 0.0009762166280235146, + "loss": 0.88370705, + "num_input_tokens_seen": 54495312, + "router_z_loss_mlp": 0.35571289, + "step": 653, + "time_per_iteration": 2.96162748336792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147653, + "balance_loss_mlp": 1.10974443, + "epoch": 0.12581762216237014, + "flos": 563441923584.0, + "grad_norm": 0.06492259826928527, + "language_loss": 0.87890899, + "learning_rate": 0.0009761215939223267, + "loss": 0.89038551, + "num_input_tokens_seen": 54566832, + "router_z_loss_mlp": 0.37890625, + "step": 654, + "time_per_iteration": 2.714641809463501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145182, + "balance_loss_mlp": 1.1077261, + "epoch": 0.1260100038476337, + "flos": 481642785792.0, + "grad_norm": 0.07920721431290144, + "language_loss": 0.86875665, + "learning_rate": 0.0009760263749743428, + "loss": 0.88020849, + "num_input_tokens_seen": 54632128, + "router_z_loss_mlp": 0.37426758, + "step": 655, + "time_per_iteration": 2.547499179840088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145343, + "balance_loss_mlp": 1.11074805, + "epoch": 0.12620238553289725, + "flos": 575269461504.0, + "grad_norm": 0.06357383816141966, + "language_loss": 0.90176344, + "learning_rate": 0.0009759309712165299, + "loss": 0.91321695, + "num_input_tokens_seen": 54707600, + "router_z_loss_mlp": 0.34570312, + "step": 656, + "time_per_iteration": 2.693922996520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137509, + "balance_loss_mlp": 1.103248, + "epoch": 0.12639476721816084, + "flos": 530909973504.0, + "grad_norm": 0.07169490366111804, + "language_loss": 0.93258119, + "learning_rate": 0.0009758353826859272, + "loss": 0.94395626, + "num_input_tokens_seen": 54776704, + "router_z_loss_mlp": 0.34277344, + "step": 657, + "time_per_iteration": 2.5744612216949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139269, + "balance_loss_mlp": 1.10314822, + "epoch": 0.1265871489034244, + "flos": 689654393856.0, + "grad_norm": 0.06860158128637554, + "language_loss": 0.89679217, + "learning_rate": 0.0009757396094196456, + "loss": 0.90818477, + "num_input_tokens_seen": 54851744, + "router_z_loss_mlp": 0.36132812, + "step": 658, + "time_per_iteration": 2.851700782775879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143308, + "balance_loss_mlp": 1.10675859, + "epoch": 0.12677953058868796, + "flos": 536863735296.0, + "grad_norm": 0.0696485175834739, + "language_loss": 0.84555894, + "learning_rate": 0.0009756436514548673, + "loss": 0.85699201, + "num_input_tokens_seen": 54932576, + "router_z_loss_mlp": 0.36523438, + "step": 659, + "time_per_iteration": 2.7971351146698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122244, + "balance_loss_mlp": 1.08800757, + "epoch": 0.12697191227395152, + "flos": 518749194240.0, + "grad_norm": 0.05327633329409036, + "language_loss": 0.88343394, + "learning_rate": 0.0009755475088288466, + "loss": 0.89465636, + "num_input_tokens_seen": 55007296, + "router_z_loss_mlp": 0.34228516, + "step": 660, + "time_per_iteration": 2.670555353164673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103127, + "balance_loss_mlp": 1.06903291, + "epoch": 0.12716429395921508, + "flos": 566341714944.0, + "grad_norm": 0.06801254087798507, + "language_loss": 0.90210187, + "learning_rate": 0.0009754511815789095, + "loss": 0.91313314, + "num_input_tokens_seen": 55079312, + "router_z_loss_mlp": 0.34106445, + "step": 661, + "time_per_iteration": 2.748224973678589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102552, + "balance_loss_mlp": 1.06798172, + "epoch": 0.12735667564447864, + "flos": 513844466688.0, + "grad_norm": 0.06975204014846512, + "language_loss": 0.86245489, + "learning_rate": 0.0009753546697424533, + "loss": 0.87348044, + "num_input_tokens_seen": 55151824, + "router_z_loss_mlp": 0.34594727, + "step": 662, + "time_per_iteration": 2.664799213409424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092108, + "balance_loss_mlp": 1.05863369, + "epoch": 0.1275490573297422, + "flos": 541023556608.0, + "grad_norm": 0.05485824904298714, + "language_loss": 0.90572149, + "learning_rate": 0.0009752579733569475, + "loss": 0.91664255, + "num_input_tokens_seen": 55224368, + "router_z_loss_mlp": 0.3347168, + "step": 663, + "time_per_iteration": 2.679975748062134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267369, + "balance_loss_mlp": 1.2515384, + "epoch": 0.12774143901500576, + "flos": 1557872787456.0, + "grad_norm": 0.0685532780556388, + "language_loss": 0.74881387, + "learning_rate": 0.0009751610924599328, + "loss": 0.7614876, + "num_input_tokens_seen": 55453584, + "router_z_loss_mlp": 0.15820312, + "step": 664, + "time_per_iteration": 4.938101053237915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096151, + "balance_loss_mlp": 1.06177139, + "epoch": 0.12793382070026935, + "flos": 613462781952.0, + "grad_norm": 0.06920677464457729, + "language_loss": 0.90523887, + "learning_rate": 0.0009750640270890217, + "loss": 0.9162004, + "num_input_tokens_seen": 55528000, + "router_z_loss_mlp": 0.34375, + "step": 665, + "time_per_iteration": 2.6939845085144043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099083, + "balance_loss_mlp": 1.06563258, + "epoch": 0.1281262023855329, + "flos": 707386231296.0, + "grad_norm": 0.06773970450457005, + "language_loss": 0.96531481, + "learning_rate": 0.0009749667772818983, + "loss": 0.9763056, + "num_input_tokens_seen": 55612416, + "router_z_loss_mlp": 0.33447266, + "step": 666, + "time_per_iteration": 2.967853307723999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164762, + "balance_loss_mlp": 1.15131497, + "epoch": 0.12831858407079647, + "flos": 1424250086400.0, + "grad_norm": 0.045177828452490555, + "language_loss": 0.76935941, + "learning_rate": 0.0009748693430763185, + "loss": 0.78100705, + "num_input_tokens_seen": 55843664, + "router_z_loss_mlp": 0.13476562, + "step": 667, + "time_per_iteration": 4.85069465637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093582, + "balance_loss_mlp": 1.05958366, + "epoch": 0.12851096575606002, + "flos": 448869316608.0, + "grad_norm": 0.07778909975942494, + "language_loss": 0.95426726, + "learning_rate": 0.0009747717245101093, + "loss": 0.96520311, + "num_input_tokens_seen": 55909072, + "router_z_loss_mlp": 0.34008789, + "step": 668, + "time_per_iteration": 2.5234692096710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098998, + "balance_loss_mlp": 1.06519032, + "epoch": 0.12870334744132358, + "flos": 479697486336.0, + "grad_norm": 0.05465485885236262, + "language_loss": 0.84969366, + "learning_rate": 0.00097467392162117, + "loss": 0.86068368, + "num_input_tokens_seen": 55978544, + "router_z_loss_mlp": 0.33789062, + "step": 669, + "time_per_iteration": 2.601684808731079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096385, + "balance_loss_mlp": 1.06341171, + "epoch": 0.12889572912658714, + "flos": 638633963520.0, + "grad_norm": 0.05954757179165737, + "language_loss": 0.91292465, + "learning_rate": 0.0009745759344474708, + "loss": 0.92388856, + "num_input_tokens_seen": 56054144, + "router_z_loss_mlp": 0.32983398, + "step": 670, + "time_per_iteration": 2.8225347995758057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098806, + "balance_loss_mlp": 1.06411648, + "epoch": 0.1290881108118507, + "flos": 509693409792.0, + "grad_norm": 0.06976130099981656, + "language_loss": 0.89229816, + "learning_rate": 0.0009744777630270536, + "loss": 0.90328622, + "num_input_tokens_seen": 56120960, + "router_z_loss_mlp": 0.34692383, + "step": 671, + "time_per_iteration": 2.633571147918701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109875, + "balance_loss_mlp": 1.07435024, + "epoch": 0.12928049249711426, + "flos": 670746894336.0, + "grad_norm": 0.08011077975608555, + "language_loss": 0.93749923, + "learning_rate": 0.000974379407398032, + "loss": 0.94859791, + "num_input_tokens_seen": 56202560, + "router_z_loss_mlp": 0.35546875, + "step": 672, + "time_per_iteration": 2.875609874725342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093721, + "balance_loss_mlp": 1.06065273, + "epoch": 0.12947287418237785, + "flos": 793158925824.0, + "grad_norm": 0.05850057523774312, + "language_loss": 0.82016242, + "learning_rate": 0.0009742808675985913, + "loss": 0.83109969, + "num_input_tokens_seen": 56289456, + "router_z_loss_mlp": 0.33056641, + "step": 673, + "time_per_iteration": 3.087738275527954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101029, + "balance_loss_mlp": 1.0646224, + "epoch": 0.1296652558676414, + "flos": 485222054400.0, + "grad_norm": 0.08954381825883409, + "language_loss": 0.9153564, + "learning_rate": 0.0009741821436669876, + "loss": 0.92636657, + "num_input_tokens_seen": 56354480, + "router_z_loss_mlp": 0.36450195, + "step": 674, + "time_per_iteration": 2.539849281311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101326, + "balance_loss_mlp": 1.06673169, + "epoch": 0.12985763755290497, + "flos": 453226987008.0, + "grad_norm": 0.0648114016490977, + "language_loss": 0.9288274, + "learning_rate": 0.0009740832356415492, + "loss": 0.93984067, + "num_input_tokens_seen": 56418944, + "router_z_loss_mlp": 0.34619141, + "step": 675, + "time_per_iteration": 2.467801094055176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097673, + "balance_loss_mlp": 1.06315041, + "epoch": 0.13005001923816853, + "flos": 824719007232.0, + "grad_norm": 0.0735546441878898, + "language_loss": 0.8857609, + "learning_rate": 0.0009739841435606756, + "loss": 0.89673769, + "num_input_tokens_seen": 56492368, + "router_z_loss_mlp": 0.34545898, + "step": 676, + "time_per_iteration": 3.008781909942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109741, + "balance_loss_mlp": 1.06457949, + "epoch": 0.1302424009234321, + "flos": 531107822592.0, + "grad_norm": 0.07312926894822828, + "language_loss": 0.90675485, + "learning_rate": 0.0009738848674628377, + "loss": 0.9177289, + "num_input_tokens_seen": 56568128, + "router_z_loss_mlp": 0.328125, + "step": 677, + "time_per_iteration": 2.695338010787964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104607, + "balance_loss_mlp": 1.06955981, + "epoch": 0.13043478260869565, + "flos": 525626924544.0, + "grad_norm": 0.06033597827839572, + "language_loss": 0.89643902, + "learning_rate": 0.000973785407386578, + "loss": 0.90748513, + "num_input_tokens_seen": 56646448, + "router_z_loss_mlp": 0.35058594, + "step": 678, + "time_per_iteration": 2.7727599143981934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101976, + "balance_loss_mlp": 1.06714272, + "epoch": 0.1306271642939592, + "flos": 625862108160.0, + "grad_norm": 0.05570081952525763, + "language_loss": 0.87361526, + "learning_rate": 0.0009736857633705103, + "loss": 0.88463503, + "num_input_tokens_seen": 56732080, + "router_z_loss_mlp": 0.34814453, + "step": 679, + "time_per_iteration": 2.843129873275757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110176, + "balance_loss_mlp": 1.06630766, + "epoch": 0.13081954597922277, + "flos": 550438723584.0, + "grad_norm": 0.06405817655948583, + "language_loss": 0.93204647, + "learning_rate": 0.0009735859354533196, + "loss": 0.94306409, + "num_input_tokens_seen": 56804432, + "router_z_loss_mlp": 0.35473633, + "step": 680, + "time_per_iteration": 2.7122464179992676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093118, + "balance_loss_mlp": 1.05914354, + "epoch": 0.13101192766448633, + "flos": 536651329536.0, + "grad_norm": 0.06779912020183775, + "language_loss": 0.91948998, + "learning_rate": 0.0009734859236737628, + "loss": 0.93042123, + "num_input_tokens_seen": 56872512, + "router_z_loss_mlp": 0.33984375, + "step": 681, + "time_per_iteration": 2.594881296157837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093326, + "balance_loss_mlp": 1.0593034, + "epoch": 0.13120430934974991, + "flos": 503258019840.0, + "grad_norm": 0.06413082246497326, + "language_loss": 0.93904501, + "learning_rate": 0.0009733857280706678, + "loss": 0.94997829, + "num_input_tokens_seen": 56940928, + "router_z_loss_mlp": 0.34033203, + "step": 682, + "time_per_iteration": 2.5831425189971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010929, + "balance_loss_mlp": 1.05992687, + "epoch": 0.13139669103501347, + "flos": 614014221312.0, + "grad_norm": 0.06246118190021366, + "language_loss": 0.85051745, + "learning_rate": 0.000973285348682934, + "loss": 0.86144638, + "num_input_tokens_seen": 57012736, + "router_z_loss_mlp": 0.33007812, + "step": 683, + "time_per_iteration": 2.7236225605010986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226892, + "balance_loss_mlp": 1.21096563, + "epoch": 0.13158907272027703, + "flos": 1484163357696.0, + "grad_norm": 0.08359566880013784, + "language_loss": 0.77898371, + "learning_rate": 0.0009731847855495323, + "loss": 0.79125261, + "num_input_tokens_seen": 57243136, + "router_z_loss_mlp": 0.15917969, + "step": 684, + "time_per_iteration": 4.87854790687561 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087932, + "balance_loss_mlp": 1.05488706, + "epoch": 0.1317814544055406, + "flos": 985049344512.0, + "grad_norm": 0.07039095593234826, + "language_loss": 0.85449159, + "learning_rate": 0.0009730840387095046, + "loss": 0.86537099, + "num_input_tokens_seen": 57336160, + "router_z_loss_mlp": 0.33056641, + "step": 685, + "time_per_iteration": 3.30759596824646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096943, + "balance_loss_mlp": 1.06156158, + "epoch": 0.13197383609080415, + "flos": 611163892224.0, + "grad_norm": 0.05759402546544749, + "language_loss": 0.912597, + "learning_rate": 0.0009729831082019642, + "loss": 0.92356646, + "num_input_tokens_seen": 57418976, + "router_z_loss_mlp": 0.35351562, + "step": 686, + "time_per_iteration": 2.7965087890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093388, + "balance_loss_mlp": 1.0589608, + "epoch": 0.1321662177760677, + "flos": 494116305408.0, + "grad_norm": 0.058033147986452156, + "language_loss": 0.89668858, + "learning_rate": 0.0009728819940660958, + "loss": 0.90762246, + "num_input_tokens_seen": 57490288, + "router_z_loss_mlp": 0.34399414, + "step": 687, + "time_per_iteration": 2.7347469329833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110102, + "balance_loss_mlp": 1.0653528, + "epoch": 0.13235859946133127, + "flos": 495591713280.0, + "grad_norm": 0.07548862234195632, + "language_loss": 0.86088693, + "learning_rate": 0.0009727806963411557, + "loss": 0.87189722, + "num_input_tokens_seen": 57556064, + "router_z_loss_mlp": 0.35668945, + "step": 688, + "time_per_iteration": 2.621638774871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098222, + "balance_loss_mlp": 1.06279302, + "epoch": 0.13255098114659483, + "flos": 511417539072.0, + "grad_norm": 0.08656773393569435, + "language_loss": 0.88000298, + "learning_rate": 0.000972679215066471, + "loss": 0.89098513, + "num_input_tokens_seen": 57627248, + "router_z_loss_mlp": 0.35449219, + "step": 689, + "time_per_iteration": 2.6806418895721436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103388, + "balance_loss_mlp": 1.06900764, + "epoch": 0.13274336283185842, + "flos": 547114120704.0, + "grad_norm": 0.07064056682134613, + "language_loss": 0.99675226, + "learning_rate": 0.0009725775502814401, + "loss": 1.00778604, + "num_input_tokens_seen": 57694832, + "router_z_loss_mlp": 0.34350586, + "step": 690, + "time_per_iteration": 2.607179641723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121046, + "balance_loss_mlp": 1.08397222, + "epoch": 0.13293574451712198, + "flos": 640465781760.0, + "grad_norm": 0.08777481913975324, + "language_loss": 0.85673726, + "learning_rate": 0.0009724757020255327, + "loss": 0.86794776, + "num_input_tokens_seen": 57771776, + "router_z_loss_mlp": 0.37084961, + "step": 691, + "time_per_iteration": 2.81113338470459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111244, + "balance_loss_mlp": 1.07726967, + "epoch": 0.13312812620238554, + "flos": 491234042880.0, + "grad_norm": 0.09165524457583717, + "language_loss": 0.87811983, + "learning_rate": 0.0009723736703382902, + "loss": 0.88923222, + "num_input_tokens_seen": 57836272, + "router_z_loss_mlp": 0.33984375, + "step": 692, + "time_per_iteration": 2.548689603805542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110186, + "balance_loss_mlp": 1.0692203, + "epoch": 0.1333205078876491, + "flos": 508693837824.0, + "grad_norm": 0.061462060991887495, + "language_loss": 0.83746743, + "learning_rate": 0.0009722714552593244, + "loss": 0.84848601, + "num_input_tokens_seen": 57907232, + "router_z_loss_mlp": 0.32641602, + "step": 693, + "time_per_iteration": 2.6584513187408447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099112, + "balance_loss_mlp": 1.06358743, + "epoch": 0.13351288957291266, + "flos": 418474722816.0, + "grad_norm": 0.07144638741394425, + "language_loss": 0.94810003, + "learning_rate": 0.000972169056828319, + "loss": 0.95909119, + "num_input_tokens_seen": 57969808, + "router_z_loss_mlp": 0.35522461, + "step": 694, + "time_per_iteration": 2.461437702178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100772, + "balance_loss_mlp": 1.06751275, + "epoch": 0.13370527125817622, + "flos": 615614694912.0, + "grad_norm": 0.05672506947017021, + "language_loss": 0.87834966, + "learning_rate": 0.0009720664750850283, + "loss": 0.88935745, + "num_input_tokens_seen": 58042944, + "router_z_loss_mlp": 0.33251953, + "step": 695, + "time_per_iteration": 2.7716193199157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103519, + "balance_loss_mlp": 1.07085609, + "epoch": 0.13389765294343978, + "flos": 625757391360.0, + "grad_norm": 0.07304651625724701, + "language_loss": 0.93482703, + "learning_rate": 0.0009719637100692784, + "loss": 0.94586229, + "num_input_tokens_seen": 58116080, + "router_z_loss_mlp": 0.32666016, + "step": 696, + "time_per_iteration": 2.7741310596466064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111476, + "balance_loss_mlp": 1.08090401, + "epoch": 0.13409003462870334, + "flos": 609391710720.0, + "grad_norm": 0.06235589965882817, + "language_loss": 0.83759153, + "learning_rate": 0.0009718607618209661, + "loss": 0.84873915, + "num_input_tokens_seen": 58197616, + "router_z_loss_mlp": 0.33862305, + "step": 697, + "time_per_iteration": 2.869180202484131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128671, + "balance_loss_mlp": 1.09488726, + "epoch": 0.13428241631396692, + "flos": 683499810816.0, + "grad_norm": 0.0709058406100417, + "language_loss": 0.88053036, + "learning_rate": 0.0009717576303800595, + "loss": 0.89181709, + "num_input_tokens_seen": 58280480, + "router_z_loss_mlp": 0.33789062, + "step": 698, + "time_per_iteration": 3.007253408432007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122193, + "balance_loss_mlp": 1.08716917, + "epoch": 0.13447479799923048, + "flos": 508565799936.0, + "grad_norm": 0.07060238478807088, + "language_loss": 0.86057615, + "learning_rate": 0.0009716543157865975, + "loss": 0.87179804, + "num_input_tokens_seen": 58352464, + "router_z_loss_mlp": 0.35083008, + "step": 699, + "time_per_iteration": 2.6622114181518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112811, + "balance_loss_mlp": 1.07812154, + "epoch": 0.13466717968449404, + "flos": 897124737024.0, + "grad_norm": 0.06896685381510245, + "language_loss": 0.84149206, + "learning_rate": 0.0009715508180806907, + "loss": 0.85262012, + "num_input_tokens_seen": 58437216, + "router_z_loss_mlp": 0.34716797, + "step": 700, + "time_per_iteration": 3.175494909286499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106112, + "balance_loss_mlp": 1.07054055, + "epoch": 0.1348595613697576, + "flos": 989501557248.0, + "grad_norm": 0.07388845252403331, + "language_loss": 0.90260321, + "learning_rate": 0.0009714471373025202, + "loss": 0.91366434, + "num_input_tokens_seen": 58533152, + "router_z_loss_mlp": 0.35546875, + "step": 701, + "time_per_iteration": 3.3912835121154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090254, + "balance_loss_mlp": 1.05499172, + "epoch": 0.13505194305502116, + "flos": 487580580864.0, + "grad_norm": 0.07959074518459132, + "language_loss": 0.89355272, + "learning_rate": 0.0009713432734923386, + "loss": 0.9044553, + "num_input_tokens_seen": 58601376, + "router_z_loss_mlp": 0.35253906, + "step": 702, + "time_per_iteration": 2.6718733310699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090728, + "balance_loss_mlp": 1.05572796, + "epoch": 0.13524432474028472, + "flos": 613103399424.0, + "grad_norm": 0.06387437846302528, + "language_loss": 0.875036, + "learning_rate": 0.0009712392266904696, + "loss": 0.88594317, + "num_input_tokens_seen": 58676608, + "router_z_loss_mlp": 0.34985352, + "step": 703, + "time_per_iteration": 2.6985831260681152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010863, + "balance_loss_mlp": 1.0524683, + "epoch": 0.13543670642554828, + "flos": 904425868800.0, + "grad_norm": 0.06666466963859687, + "language_loss": 0.86250496, + "learning_rate": 0.0009711349969373076, + "loss": 0.87336791, + "num_input_tokens_seen": 58759264, + "router_z_loss_mlp": 0.33862305, + "step": 704, + "time_per_iteration": 3.1328465938568115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095762, + "balance_loss_mlp": 1.0610956, + "epoch": 0.13562908811081184, + "flos": 550335416832.0, + "grad_norm": 0.0628446006314887, + "language_loss": 0.80944061, + "learning_rate": 0.0009710305842733178, + "loss": 0.82039821, + "num_input_tokens_seen": 58834800, + "router_z_loss_mlp": 0.34667969, + "step": 705, + "time_per_iteration": 2.7668187618255615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093976, + "balance_loss_mlp": 1.06147909, + "epoch": 0.1358214697960754, + "flos": 507797572608.0, + "grad_norm": 0.06635154625105166, + "language_loss": 0.90133065, + "learning_rate": 0.0009709259887390373, + "loss": 0.91227043, + "num_input_tokens_seen": 58901712, + "router_z_loss_mlp": 0.32519531, + "step": 706, + "time_per_iteration": 2.656233072280884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096924, + "balance_loss_mlp": 1.06390333, + "epoch": 0.136013851481339, + "flos": 528640197120.0, + "grad_norm": 0.09290535615143355, + "language_loss": 0.91425377, + "learning_rate": 0.0009708212103750737, + "loss": 0.92522299, + "num_input_tokens_seen": 58967824, + "router_z_loss_mlp": 0.33007812, + "step": 707, + "time_per_iteration": 2.569655656814575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101147, + "balance_loss_mlp": 1.06812644, + "epoch": 0.13620623316660255, + "flos": 658772379648.0, + "grad_norm": 0.06731423560591156, + "language_loss": 0.87756282, + "learning_rate": 0.0009707162492221051, + "loss": 0.88857424, + "num_input_tokens_seen": 59045040, + "router_z_loss_mlp": 0.33007812, + "step": 708, + "time_per_iteration": 2.880669593811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103707, + "balance_loss_mlp": 1.07009029, + "epoch": 0.1363986148518661, + "flos": 671583522816.0, + "grad_norm": 0.07312175328849302, + "language_loss": 0.88322687, + "learning_rate": 0.0009706111053208815, + "loss": 0.89426386, + "num_input_tokens_seen": 59117216, + "router_z_loss_mlp": 0.33642578, + "step": 709, + "time_per_iteration": 2.7878787517547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097257, + "balance_loss_mlp": 1.06342554, + "epoch": 0.13659099653712967, + "flos": 472828520448.0, + "grad_norm": 0.06741688104713542, + "language_loss": 0.86067665, + "learning_rate": 0.0009705057787122232, + "loss": 0.87164921, + "num_input_tokens_seen": 59183056, + "router_z_loss_mlp": 0.33862305, + "step": 710, + "time_per_iteration": 2.528298854827881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105446, + "balance_loss_mlp": 1.07190061, + "epoch": 0.13678337822239323, + "flos": 452483490816.0, + "grad_norm": 0.05706590332145298, + "language_loss": 0.91653168, + "learning_rate": 0.0009704002694370216, + "loss": 0.92758614, + "num_input_tokens_seen": 59247312, + "router_z_loss_mlp": 0.33569336, + "step": 711, + "time_per_iteration": 2.5201761722564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114394, + "balance_loss_mlp": 1.0794661, + "epoch": 0.13697575990765679, + "flos": 519373416960.0, + "grad_norm": 0.06387130477766731, + "language_loss": 0.86892813, + "learning_rate": 0.0009702945775362388, + "loss": 0.88007212, + "num_input_tokens_seen": 59317968, + "router_z_loss_mlp": 0.34960938, + "step": 712, + "time_per_iteration": 2.661848783493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130562, + "balance_loss_mlp": 1.0947994, + "epoch": 0.13716814159292035, + "flos": 480145618944.0, + "grad_norm": 0.06038249383316015, + "language_loss": 0.87339497, + "learning_rate": 0.0009701887030509086, + "loss": 0.8847006, + "num_input_tokens_seen": 59387936, + "router_z_loss_mlp": 0.35766602, + "step": 713, + "time_per_iteration": 2.6068434715270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125148, + "balance_loss_mlp": 1.0908401, + "epoch": 0.1373605232781839, + "flos": 545376844800.0, + "grad_norm": 0.06924339631343991, + "language_loss": 0.92127877, + "learning_rate": 0.0009700826460221346, + "loss": 0.93253028, + "num_input_tokens_seen": 59460624, + "router_z_loss_mlp": 0.34301758, + "step": 714, + "time_per_iteration": 2.653224468231201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145818, + "balance_loss_mlp": 1.11050797, + "epoch": 0.1375529049634475, + "flos": 708473143296.0, + "grad_norm": 0.0682346884445605, + "language_loss": 0.93435562, + "learning_rate": 0.0009699764064910921, + "loss": 0.94581378, + "num_input_tokens_seen": 59536752, + "router_z_loss_mlp": 0.35302734, + "step": 715, + "time_per_iteration": 2.878445625305176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130305, + "balance_loss_mlp": 1.09542441, + "epoch": 0.13774528664871105, + "flos": 486452971008.0, + "grad_norm": 0.07091873756636237, + "language_loss": 0.87931371, + "learning_rate": 0.0009698699844990268, + "loss": 0.89061677, + "num_input_tokens_seen": 59608128, + "router_z_loss_mlp": 0.34863281, + "step": 716, + "time_per_iteration": 2.6278092861175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124133, + "balance_loss_mlp": 1.09070659, + "epoch": 0.1379376683339746, + "flos": 679885636608.0, + "grad_norm": 0.0686032560828043, + "language_loss": 0.88731855, + "learning_rate": 0.0009697633800872555, + "loss": 0.89855987, + "num_input_tokens_seen": 59685120, + "router_z_loss_mlp": 0.33422852, + "step": 717, + "time_per_iteration": 2.888576030731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112997, + "balance_loss_mlp": 1.07825947, + "epoch": 0.13813005001923817, + "flos": 610628419584.0, + "grad_norm": 0.07907714555147631, + "language_loss": 0.9128629, + "learning_rate": 0.0009696565932971655, + "loss": 0.92399287, + "num_input_tokens_seen": 59763376, + "router_z_loss_mlp": 0.34741211, + "step": 718, + "time_per_iteration": 2.8937225341796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110249, + "balance_loss_mlp": 1.06837237, + "epoch": 0.13832243170450173, + "flos": 588431222784.0, + "grad_norm": 0.05947825646897862, + "language_loss": 0.9001984, + "learning_rate": 0.0009695496241702153, + "loss": 0.91122329, + "num_input_tokens_seen": 59836800, + "router_z_loss_mlp": 0.34155273, + "step": 719, + "time_per_iteration": 2.791111469268799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094313, + "balance_loss_mlp": 1.06093454, + "epoch": 0.1385148133897653, + "flos": 699674844672.0, + "grad_norm": 0.07440757355955382, + "language_loss": 0.86308432, + "learning_rate": 0.0009694424727479339, + "loss": 0.87402749, + "num_input_tokens_seen": 59914720, + "router_z_loss_mlp": 0.33398438, + "step": 720, + "time_per_iteration": 2.8781325817108154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088445, + "balance_loss_mlp": 1.05475688, + "epoch": 0.13870719507502885, + "flos": 597977399808.0, + "grad_norm": 0.059872525751604476, + "language_loss": 0.90073895, + "learning_rate": 0.0009693351390719213, + "loss": 0.91162348, + "num_input_tokens_seen": 59984544, + "router_z_loss_mlp": 0.3371582, + "step": 721, + "time_per_iteration": 2.691493272781372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095999, + "balance_loss_mlp": 1.06242967, + "epoch": 0.1388995767602924, + "flos": 586279309824.0, + "grad_norm": 0.07792099406652078, + "language_loss": 0.91640067, + "learning_rate": 0.000969227623183848, + "loss": 0.92736065, + "num_input_tokens_seen": 60057056, + "router_z_loss_mlp": 0.33569336, + "step": 722, + "time_per_iteration": 2.768209218978882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086676, + "balance_loss_mlp": 1.05475235, + "epoch": 0.139091958445556, + "flos": 650810709504.0, + "grad_norm": 0.07717859695455091, + "language_loss": 0.91485119, + "learning_rate": 0.0009691199251254554, + "loss": 0.92571795, + "num_input_tokens_seen": 60133232, + "router_z_loss_mlp": 0.3190918, + "step": 723, + "time_per_iteration": 2.813594102859497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093708, + "balance_loss_mlp": 1.06159282, + "epoch": 0.13928434013081956, + "flos": 575446961664.0, + "grad_norm": 0.06414169604653322, + "language_loss": 0.8718468, + "learning_rate": 0.0009690120449385555, + "loss": 0.88278389, + "num_input_tokens_seen": 60207104, + "router_z_loss_mlp": 0.32104492, + "step": 724, + "time_per_iteration": 2.732372999191284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110008, + "balance_loss_mlp": 1.06574821, + "epoch": 0.13947672181608312, + "flos": 562954503168.0, + "grad_norm": 0.07538454681544235, + "language_loss": 0.93399024, + "learning_rate": 0.0009689039826650312, + "loss": 0.94499099, + "num_input_tokens_seen": 60277920, + "router_z_loss_mlp": 0.34375, + "step": 725, + "time_per_iteration": 2.769481658935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111743, + "balance_loss_mlp": 1.09967864, + "epoch": 0.13966910350134668, + "flos": 1520699387904.0, + "grad_norm": 0.042030956775344956, + "language_loss": 0.76523066, + "learning_rate": 0.000968795738346836, + "loss": 0.77634799, + "num_input_tokens_seen": 60494224, + "router_z_loss_mlp": 0.12060547, + "step": 726, + "time_per_iteration": 4.903716802597046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101674, + "balance_loss_mlp": 1.06619751, + "epoch": 0.13986148518661023, + "flos": 499604557824.0, + "grad_norm": 0.07361028590256702, + "language_loss": 0.88265646, + "learning_rate": 0.0009686873120259941, + "loss": 0.89367324, + "num_input_tokens_seen": 60562176, + "router_z_loss_mlp": 0.35522461, + "step": 727, + "time_per_iteration": 2.639673948287964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099007, + "balance_loss_mlp": 1.06612897, + "epoch": 0.1400538668718738, + "flos": 598381862400.0, + "grad_norm": 0.053177263225715844, + "language_loss": 0.87612498, + "learning_rate": 0.0009685787037446004, + "loss": 0.88711506, + "num_input_tokens_seen": 60631472, + "router_z_loss_mlp": 0.32885742, + "step": 728, + "time_per_iteration": 2.7457332611083984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095755, + "balance_loss_mlp": 1.06135106, + "epoch": 0.14024624855713735, + "flos": 593757941760.0, + "grad_norm": 0.0730266030670127, + "language_loss": 0.88032103, + "learning_rate": 0.0009684699135448201, + "loss": 0.89127851, + "num_input_tokens_seen": 60703488, + "router_z_loss_mlp": 0.34423828, + "step": 729, + "time_per_iteration": 2.6995558738708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091636, + "balance_loss_mlp": 1.05940139, + "epoch": 0.1404386302424009, + "flos": 506335311360.0, + "grad_norm": 0.06378774069808751, + "language_loss": 0.93033969, + "learning_rate": 0.0009683609414688895, + "loss": 0.94125605, + "num_input_tokens_seen": 60773936, + "router_z_loss_mlp": 0.32226562, + "step": 730, + "time_per_iteration": 2.6648926734924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097348, + "balance_loss_mlp": 1.06175184, + "epoch": 0.14063101192766447, + "flos": 573132105216.0, + "grad_norm": 0.05452232030629634, + "language_loss": 0.86945236, + "learning_rate": 0.0009682517875591154, + "loss": 0.88042581, + "num_input_tokens_seen": 60851120, + "router_z_loss_mlp": 0.35620117, + "step": 731, + "time_per_iteration": 2.7333967685699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099629, + "balance_loss_mlp": 1.06656027, + "epoch": 0.14082339361292806, + "flos": 564333806592.0, + "grad_norm": 0.06482276791137384, + "language_loss": 0.87207299, + "learning_rate": 0.0009681424518578749, + "loss": 0.88306928, + "num_input_tokens_seen": 60924896, + "router_z_loss_mlp": 0.33081055, + "step": 732, + "time_per_iteration": 2.706704616546631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100389, + "balance_loss_mlp": 1.06734443, + "epoch": 0.14101577529819162, + "flos": 463336187904.0, + "grad_norm": 0.05411989278901109, + "language_loss": 0.88122302, + "learning_rate": 0.000968032934407616, + "loss": 0.89222693, + "num_input_tokens_seen": 60996016, + "router_z_loss_mlp": 0.33056641, + "step": 733, + "time_per_iteration": 2.5904436111450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100997, + "balance_loss_mlp": 1.06766593, + "epoch": 0.14120815698345518, + "flos": 595791991296.0, + "grad_norm": 0.06321555834593343, + "language_loss": 0.82077157, + "learning_rate": 0.0009679232352508571, + "loss": 0.83178151, + "num_input_tokens_seen": 61072016, + "router_z_loss_mlp": 0.33349609, + "step": 734, + "time_per_iteration": 2.758493423461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102299, + "balance_loss_mlp": 1.06992185, + "epoch": 0.14140053866871874, + "flos": 534864591360.0, + "grad_norm": 0.05697576898708014, + "language_loss": 0.81442666, + "learning_rate": 0.0009678133544301871, + "loss": 0.82544965, + "num_input_tokens_seen": 61144528, + "router_z_loss_mlp": 0.32373047, + "step": 735, + "time_per_iteration": 2.6508195400238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092208, + "balance_loss_mlp": 1.06006956, + "epoch": 0.1415929203539823, + "flos": 520013606400.0, + "grad_norm": 0.0400187761209974, + "language_loss": 0.91843486, + "learning_rate": 0.0009677032919882658, + "loss": 0.92935699, + "num_input_tokens_seen": 61216960, + "router_z_loss_mlp": 0.32128906, + "step": 736, + "time_per_iteration": 2.705019474029541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100492, + "balance_loss_mlp": 1.06975937, + "epoch": 0.14178530203924586, + "flos": 482095300608.0, + "grad_norm": 0.07179339183341249, + "language_loss": 0.92199683, + "learning_rate": 0.000967593047967823, + "loss": 0.93300164, + "num_input_tokens_seen": 61281312, + "router_z_loss_mlp": 0.30712891, + "step": 737, + "time_per_iteration": 2.55415415763855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109678, + "balance_loss_mlp": 1.07577443, + "epoch": 0.14197768372450942, + "flos": 676339863552.0, + "grad_norm": 0.08640894081958116, + "language_loss": 0.87084705, + "learning_rate": 0.0009674826224116593, + "loss": 0.88194382, + "num_input_tokens_seen": 61355888, + "router_z_loss_mlp": 0.33911133, + "step": 738, + "time_per_iteration": 2.819878101348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097633, + "balance_loss_mlp": 1.06544614, + "epoch": 0.14217006540977298, + "flos": 445802199552.0, + "grad_norm": 0.06953952980021996, + "language_loss": 0.8713401, + "learning_rate": 0.0009673720153626455, + "loss": 0.88231641, + "num_input_tokens_seen": 61424288, + "router_z_loss_mlp": 0.32177734, + "step": 739, + "time_per_iteration": 2.5987422466278076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096281, + "balance_loss_mlp": 1.06385565, + "epoch": 0.14236244709503657, + "flos": 496261016064.0, + "grad_norm": 0.08400230511878481, + "language_loss": 0.87465405, + "learning_rate": 0.0009672612268637235, + "loss": 0.88561684, + "num_input_tokens_seen": 61493344, + "router_z_loss_mlp": 0.32421875, + "step": 740, + "time_per_iteration": 2.6148736476898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098472, + "balance_loss_mlp": 1.06669128, + "epoch": 0.14255482878030012, + "flos": 648022989312.0, + "grad_norm": 0.0806935070673247, + "language_loss": 0.846753, + "learning_rate": 0.0009671502569579048, + "loss": 0.85773772, + "num_input_tokens_seen": 61565216, + "router_z_loss_mlp": 0.31762695, + "step": 741, + "time_per_iteration": 2.7533769607543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089599, + "balance_loss_mlp": 1.05774641, + "epoch": 0.14274721046556368, + "flos": 535888894464.0, + "grad_norm": 0.06572551706098649, + "language_loss": 0.90748239, + "learning_rate": 0.0009670391056882719, + "loss": 0.91837835, + "num_input_tokens_seen": 61640928, + "router_z_loss_mlp": 0.31835938, + "step": 742, + "time_per_iteration": 2.698690176010132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088511, + "balance_loss_mlp": 1.0565629, + "epoch": 0.14293959215082724, + "flos": 956677215744.0, + "grad_norm": 0.07291469749344824, + "language_loss": 0.89417249, + "learning_rate": 0.0009669277730979776, + "loss": 0.90505755, + "num_input_tokens_seen": 61717552, + "router_z_loss_mlp": 0.31958008, + "step": 743, + "time_per_iteration": 3.1728732585906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108649, + "balance_loss_mlp": 1.05408931, + "epoch": 0.1431319738360908, + "flos": 692766590976.0, + "grad_norm": 0.06693583917292938, + "language_loss": 0.85588205, + "learning_rate": 0.0009668162592302449, + "loss": 0.86674696, + "num_input_tokens_seen": 61800016, + "router_z_loss_mlp": 0.32397461, + "step": 744, + "time_per_iteration": 2.896467685699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099426, + "balance_loss_mlp": 1.06673896, + "epoch": 0.14332435552135436, + "flos": 565174817280.0, + "grad_norm": 0.0717564206721674, + "language_loss": 0.86683381, + "learning_rate": 0.0009667045641283676, + "loss": 0.877828, + "num_input_tokens_seen": 61865904, + "router_z_loss_mlp": 0.3269043, + "step": 745, + "time_per_iteration": 2.6326427459716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095955, + "balance_loss_mlp": 1.06336319, + "epoch": 0.14351673720661792, + "flos": 738045665280.0, + "grad_norm": 0.07083856064802352, + "language_loss": 0.95545924, + "learning_rate": 0.0009665926878357092, + "loss": 0.96641874, + "num_input_tokens_seen": 61945728, + "router_z_loss_mlp": 0.32592773, + "step": 746, + "time_per_iteration": 2.902628183364868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108393, + "balance_loss_mlp": 1.07565856, + "epoch": 0.14370911889188148, + "flos": 548951731200.0, + "grad_norm": 0.08672542857876225, + "language_loss": 0.91510898, + "learning_rate": 0.0009664806303957043, + "loss": 0.92619288, + "num_input_tokens_seen": 62016288, + "router_z_loss_mlp": 0.32714844, + "step": 747, + "time_per_iteration": 2.678656578063965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107271, + "balance_loss_mlp": 1.07448816, + "epoch": 0.14390150057714507, + "flos": 589973469696.0, + "grad_norm": 0.06575006445724518, + "language_loss": 0.87633115, + "learning_rate": 0.0009663683918518571, + "loss": 0.88740385, + "num_input_tokens_seen": 62097904, + "router_z_loss_mlp": 0.32788086, + "step": 748, + "time_per_iteration": 2.894339084625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116744, + "balance_loss_mlp": 1.08226848, + "epoch": 0.14409388226240863, + "flos": 590773782528.0, + "grad_norm": 0.06412555003569581, + "language_loss": 0.86334193, + "learning_rate": 0.0009662559722477428, + "loss": 0.87450933, + "num_input_tokens_seen": 62166736, + "router_z_loss_mlp": 0.3449707, + "step": 749, + "time_per_iteration": 2.6673357486724854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116866, + "balance_loss_mlp": 1.15397346, + "epoch": 0.1442862639476722, + "flos": 1510418479104.0, + "grad_norm": 0.05654081816866197, + "language_loss": 0.7616297, + "learning_rate": 0.0009661433716270062, + "loss": 0.77331638, + "num_input_tokens_seen": 62402512, + "router_z_loss_mlp": 0.14648438, + "step": 750, + "time_per_iteration": 4.97744607925415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111461, + "balance_loss_mlp": 1.0782733, + "epoch": 0.14447864563293575, + "flos": 496493770752.0, + "grad_norm": 0.05840496998451829, + "language_loss": 0.89989787, + "learning_rate": 0.0009660305900333632, + "loss": 0.91101241, + "num_input_tokens_seen": 62473408, + "router_z_loss_mlp": 0.33203125, + "step": 751, + "time_per_iteration": 2.6919631958007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108129, + "balance_loss_mlp": 1.07513142, + "epoch": 0.1446710273181993, + "flos": 589400271360.0, + "grad_norm": 0.0663289310880325, + "language_loss": 0.83084202, + "learning_rate": 0.0009659176275105992, + "loss": 0.8419233, + "num_input_tokens_seen": 62547440, + "router_z_loss_mlp": 0.33007812, + "step": 752, + "time_per_iteration": 2.702003240585327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097403, + "balance_loss_mlp": 1.0634284, + "epoch": 0.14486340900346287, + "flos": 585521256960.0, + "grad_norm": 0.05748666507804042, + "language_loss": 0.86628646, + "learning_rate": 0.0009658044841025701, + "loss": 0.87726045, + "num_input_tokens_seen": 62620224, + "router_z_loss_mlp": 0.34008789, + "step": 753, + "time_per_iteration": 2.7666702270507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106626, + "balance_loss_mlp": 1.07114923, + "epoch": 0.14505579068872643, + "flos": 504405978624.0, + "grad_norm": 0.07320865998852653, + "language_loss": 0.81996346, + "learning_rate": 0.0009656911598532021, + "loss": 0.83102977, + "num_input_tokens_seen": 62690464, + "router_z_loss_mlp": 0.35498047, + "step": 754, + "time_per_iteration": 2.6273839473724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094053, + "balance_loss_mlp": 1.05936301, + "epoch": 0.14524817237399, + "flos": 486566452224.0, + "grad_norm": 0.05776902712696923, + "language_loss": 0.90229332, + "learning_rate": 0.0009655776548064917, + "loss": 0.91323388, + "num_input_tokens_seen": 62762240, + "router_z_loss_mlp": 0.34667969, + "step": 755, + "time_per_iteration": 2.6639461517333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092368, + "balance_loss_mlp": 1.05867922, + "epoch": 0.14544055405925355, + "flos": 727857888768.0, + "grad_norm": 0.059694446461720084, + "language_loss": 0.88762641, + "learning_rate": 0.0009654639690065054, + "loss": 0.89855003, + "num_input_tokens_seen": 62839760, + "router_z_loss_mlp": 0.33691406, + "step": 756, + "time_per_iteration": 2.881164789199829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092737, + "balance_loss_mlp": 1.05981112, + "epoch": 0.14563293574451713, + "flos": 593359271424.0, + "grad_norm": 0.0719411984245977, + "language_loss": 0.88362074, + "learning_rate": 0.00096535010249738, + "loss": 0.89454818, + "num_input_tokens_seen": 62910336, + "router_z_loss_mlp": 0.3293457, + "step": 757, + "time_per_iteration": 2.703355312347412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092325, + "balance_loss_mlp": 1.05925632, + "epoch": 0.1458253174297807, + "flos": 560192924160.0, + "grad_norm": 0.09095988428785044, + "language_loss": 0.8300786, + "learning_rate": 0.0009652360553233224, + "loss": 0.84100187, + "num_input_tokens_seen": 62988160, + "router_z_loss_mlp": 0.33081055, + "step": 758, + "time_per_iteration": 2.7321062088012695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062711, + "balance_loss_mlp": 1.04821551, + "epoch": 0.14601769911504425, + "flos": 1557025984512.0, + "grad_norm": 0.03493248396843453, + "language_loss": 0.73773748, + "learning_rate": 0.0009651218275286093, + "loss": 0.74836457, + "num_input_tokens_seen": 63224704, + "router_z_loss_mlp": 0.14453125, + "step": 759, + "time_per_iteration": 4.917184591293335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099151, + "balance_loss_mlp": 1.06605887, + "epoch": 0.1462100808003078, + "flos": 865922628096.0, + "grad_norm": 0.05465610046720203, + "language_loss": 0.8166393, + "learning_rate": 0.0009650074191575883, + "loss": 0.82763088, + "num_input_tokens_seen": 63312400, + "router_z_loss_mlp": 0.33105469, + "step": 760, + "time_per_iteration": 3.2009472846984863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097005, + "balance_loss_mlp": 1.06341171, + "epoch": 0.14640246248557137, + "flos": 522673288704.0, + "grad_norm": 0.07890258703475667, + "language_loss": 0.86329532, + "learning_rate": 0.0009648928302546766, + "loss": 0.87426543, + "num_input_tokens_seen": 63387792, + "router_z_loss_mlp": 0.3359375, + "step": 761, + "time_per_iteration": 2.6858482360839844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087335, + "balance_loss_mlp": 1.05340791, + "epoch": 0.14659484417083493, + "flos": 1030121805312.0, + "grad_norm": 0.05505233607608704, + "language_loss": 0.8584463, + "learning_rate": 0.0009647780608643613, + "loss": 0.86931968, + "num_input_tokens_seen": 63475632, + "router_z_loss_mlp": 0.33935547, + "step": 762, + "time_per_iteration": 3.3784618377685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087006, + "balance_loss_mlp": 1.05365133, + "epoch": 0.1467872258560985, + "flos": 500426629632.0, + "grad_norm": 0.083565321416964, + "language_loss": 0.88299912, + "learning_rate": 0.0009646631110312001, + "loss": 0.89386916, + "num_input_tokens_seen": 63546080, + "router_z_loss_mlp": 0.33349609, + "step": 763, + "time_per_iteration": 2.642038345336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096574, + "balance_loss_mlp": 1.06465006, + "epoch": 0.14697960754136205, + "flos": 547514201088.0, + "grad_norm": 0.05646167170610495, + "language_loss": 0.88908124, + "learning_rate": 0.0009645479807998203, + "loss": 0.900047, + "num_input_tokens_seen": 63622464, + "router_z_loss_mlp": 0.3190918, + "step": 764, + "time_per_iteration": 2.7709102630615234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093321, + "balance_loss_mlp": 1.0614922, + "epoch": 0.14717198922662564, + "flos": 517586678784.0, + "grad_norm": 0.06731397985108602, + "language_loss": 0.93233657, + "learning_rate": 0.0009644326702149196, + "loss": 0.94326979, + "num_input_tokens_seen": 63694736, + "router_z_loss_mlp": 0.31811523, + "step": 765, + "time_per_iteration": 2.691761016845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098194, + "balance_loss_mlp": 1.06472015, + "epoch": 0.1473643709118892, + "flos": 731661147648.0, + "grad_norm": 0.08664060064789567, + "language_loss": 0.85604531, + "learning_rate": 0.0009643171793212653, + "loss": 0.86702728, + "num_input_tokens_seen": 63779072, + "router_z_loss_mlp": 0.33496094, + "step": 766, + "time_per_iteration": 3.0578510761260986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095374, + "balance_loss_mlp": 1.06190002, + "epoch": 0.14755675259715276, + "flos": 620257554432.0, + "grad_norm": 0.06875066800131625, + "language_loss": 0.90379435, + "learning_rate": 0.0009642015081636952, + "loss": 0.91474807, + "num_input_tokens_seen": 63847472, + "router_z_loss_mlp": 0.33496094, + "step": 767, + "time_per_iteration": 2.690892219543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091039, + "balance_loss_mlp": 1.05830407, + "epoch": 0.14774913428241632, + "flos": 451981513728.0, + "grad_norm": 0.06617868208271054, + "language_loss": 0.88812423, + "learning_rate": 0.0009640856567871166, + "loss": 0.89903462, + "num_input_tokens_seen": 63912496, + "router_z_loss_mlp": 0.32714844, + "step": 768, + "time_per_iteration": 2.5108768939971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086849, + "balance_loss_mlp": 1.05316067, + "epoch": 0.14794151596767988, + "flos": 836881196544.0, + "grad_norm": 0.06813910901976611, + "language_loss": 0.89643073, + "learning_rate": 0.0009639696252365072, + "loss": 0.90729922, + "num_input_tokens_seen": 63990832, + "router_z_loss_mlp": 0.33691406, + "step": 769, + "time_per_iteration": 3.036872386932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087914, + "balance_loss_mlp": 1.05546558, + "epoch": 0.14813389765294344, + "flos": 685765204992.0, + "grad_norm": 0.06952898718112278, + "language_loss": 0.82433641, + "learning_rate": 0.0009638534135569144, + "loss": 0.83521557, + "num_input_tokens_seen": 64067552, + "router_z_loss_mlp": 0.32446289, + "step": 770, + "time_per_iteration": 2.920228958129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096521, + "balance_loss_mlp": 1.06395316, + "epoch": 0.148326279338207, + "flos": 509625008640.0, + "grad_norm": 0.05850145176667806, + "language_loss": 0.90417981, + "learning_rate": 0.0009637370217934554, + "loss": 0.91514498, + "num_input_tokens_seen": 64140336, + "router_z_loss_mlp": 0.32568359, + "step": 771, + "time_per_iteration": 2.6692943572998047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094476, + "balance_loss_mlp": 1.0624088, + "epoch": 0.14851866102347056, + "flos": 587869608960.0, + "grad_norm": 0.06374792966079154, + "language_loss": 0.83362675, + "learning_rate": 0.0009636204499913175, + "loss": 0.84457153, + "num_input_tokens_seen": 64223472, + "router_z_loss_mlp": 0.32055664, + "step": 772, + "time_per_iteration": 2.9103784561157227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101595, + "balance_loss_mlp": 1.07129157, + "epoch": 0.14871104270873411, + "flos": 690722366976.0, + "grad_norm": 0.05784692032564958, + "language_loss": 0.8891257, + "learning_rate": 0.0009635036981957581, + "loss": 0.90014172, + "num_input_tokens_seen": 64299872, + "router_z_loss_mlp": 0.30273438, + "step": 773, + "time_per_iteration": 2.840233087539673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109331, + "balance_loss_mlp": 1.06112361, + "epoch": 0.1489034243939977, + "flos": 654803205120.0, + "grad_norm": 0.06091674471201955, + "language_loss": 0.9126395, + "learning_rate": 0.0009633867664521043, + "loss": 0.9235726, + "num_input_tokens_seen": 64377152, + "router_z_loss_mlp": 0.32202148, + "step": 774, + "time_per_iteration": 2.8467912673950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098219, + "balance_loss_mlp": 1.0643878, + "epoch": 0.14909580607926126, + "flos": 475595891712.0, + "grad_norm": 0.06395321005815084, + "language_loss": 0.87366414, + "learning_rate": 0.0009632696548057527, + "loss": 0.8846463, + "num_input_tokens_seen": 64443008, + "router_z_loss_mlp": 0.33862305, + "step": 775, + "time_per_iteration": 2.55267596244812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090534, + "balance_loss_mlp": 1.05729866, + "epoch": 0.14928818776452482, + "flos": 610789953024.0, + "grad_norm": 0.07257335679926562, + "language_loss": 0.85489643, + "learning_rate": 0.0009631523633021704, + "loss": 0.86580181, + "num_input_tokens_seen": 64519776, + "router_z_loss_mlp": 0.33251953, + "step": 776, + "time_per_iteration": 2.800656795501709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090063, + "balance_loss_mlp": 1.05694628, + "epoch": 0.14948056944978838, + "flos": 561487859712.0, + "grad_norm": 0.058446141184189525, + "language_loss": 0.88943005, + "learning_rate": 0.0009630348919868936, + "loss": 0.90033066, + "num_input_tokens_seen": 64593712, + "router_z_loss_mlp": 0.33129883, + "step": 777, + "time_per_iteration": 2.7306644916534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088502, + "balance_loss_mlp": 1.05397916, + "epoch": 0.14967295113505194, + "flos": 448972623360.0, + "grad_norm": 0.08136314957760014, + "language_loss": 0.81536144, + "learning_rate": 0.0009629172409055293, + "loss": 0.82624644, + "num_input_tokens_seen": 64658448, + "router_z_loss_mlp": 0.34545898, + "step": 778, + "time_per_iteration": 2.532480239868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091534, + "balance_loss_mlp": 1.05937171, + "epoch": 0.1498653328203155, + "flos": 571000541184.0, + "grad_norm": 0.06865521140792329, + "language_loss": 0.88039231, + "learning_rate": 0.0009627994101037531, + "loss": 0.89130771, + "num_input_tokens_seen": 64734144, + "router_z_loss_mlp": 0.3215332, + "step": 779, + "time_per_iteration": 2.7336056232452393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091586, + "balance_loss_mlp": 1.05811191, + "epoch": 0.15005771450557906, + "flos": 630918194688.0, + "grad_norm": 0.06277485509918372, + "language_loss": 0.8981787, + "learning_rate": 0.0009626813996273114, + "loss": 0.90909451, + "num_input_tokens_seen": 64813456, + "router_z_loss_mlp": 0.3347168, + "step": 780, + "time_per_iteration": 2.8651859760284424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092654, + "balance_loss_mlp": 1.06018162, + "epoch": 0.15025009619084262, + "flos": 577633780224.0, + "grad_norm": 0.06737111741199381, + "language_loss": 0.89359641, + "learning_rate": 0.0009625632095220198, + "loss": 0.90452296, + "num_input_tokens_seen": 64896816, + "router_z_loss_mlp": 0.32470703, + "step": 781, + "time_per_iteration": 2.910163640975952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093984, + "balance_loss_mlp": 1.06041455, + "epoch": 0.1504424778761062, + "flos": 483646311936.0, + "grad_norm": 0.06188715182302237, + "language_loss": 0.87568116, + "learning_rate": 0.0009624448398337637, + "loss": 0.88662094, + "num_input_tokens_seen": 64964176, + "router_z_loss_mlp": 0.3359375, + "step": 782, + "time_per_iteration": 2.532055616378784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100397, + "balance_loss_mlp": 1.06751907, + "epoch": 0.15063485956136977, + "flos": 762167812608.0, + "grad_norm": 0.06229794960735175, + "language_loss": 0.89905757, + "learning_rate": 0.0009623262906084984, + "loss": 0.91006154, + "num_input_tokens_seen": 65042592, + "router_z_loss_mlp": 0.32861328, + "step": 783, + "time_per_iteration": 2.9851605892181396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104855, + "balance_loss_mlp": 1.0712378, + "epoch": 0.15082724124663333, + "flos": 497369687040.0, + "grad_norm": 0.060596744514248076, + "language_loss": 0.90796679, + "learning_rate": 0.0009622075618922486, + "loss": 0.91901541, + "num_input_tokens_seen": 65114576, + "router_z_loss_mlp": 0.33642578, + "step": 784, + "time_per_iteration": 2.6796786785125732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102349, + "balance_loss_mlp": 1.06928015, + "epoch": 0.15101962293189689, + "flos": 509476621824.0, + "grad_norm": 0.06389342174673626, + "language_loss": 0.87423813, + "learning_rate": 0.0009620886537311091, + "loss": 0.8852616, + "num_input_tokens_seen": 65186640, + "router_z_loss_mlp": 0.33081055, + "step": 785, + "time_per_iteration": 2.6153056621551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113163, + "balance_loss_mlp": 1.07685184, + "epoch": 0.15121200461716044, + "flos": 457520638464.0, + "grad_norm": 0.06793935281312648, + "language_loss": 0.85492945, + "learning_rate": 0.000961969566171244, + "loss": 0.86606109, + "num_input_tokens_seen": 65252112, + "router_z_loss_mlp": 0.36303711, + "step": 786, + "time_per_iteration": 2.506267786026001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122272, + "balance_loss_mlp": 1.08703363, + "epoch": 0.151404386302424, + "flos": 537729477120.0, + "grad_norm": 0.0670602351843582, + "language_loss": 0.90370345, + "learning_rate": 0.0009618502992588873, + "loss": 0.91492617, + "num_input_tokens_seen": 65318912, + "router_z_loss_mlp": 0.35253906, + "step": 787, + "time_per_iteration": 2.623457670211792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141844, + "balance_loss_mlp": 1.10658193, + "epoch": 0.15159676798768756, + "flos": 687858891264.0, + "grad_norm": 0.06543467559167064, + "language_loss": 0.88581872, + "learning_rate": 0.0009617308530403424, + "loss": 0.89723718, + "num_input_tokens_seen": 65395424, + "router_z_loss_mlp": 0.35302734, + "step": 788, + "time_per_iteration": 2.975861072540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149381, + "balance_loss_mlp": 1.11371326, + "epoch": 0.15178914967295112, + "flos": 545042193408.0, + "grad_norm": 0.059566397417978756, + "language_loss": 0.87806541, + "learning_rate": 0.0009616112275619825, + "loss": 0.88955921, + "num_input_tokens_seen": 65470480, + "router_z_loss_mlp": 0.35668945, + "step": 789, + "time_per_iteration": 2.683262348175049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152452, + "balance_loss_mlp": 1.1169517, + "epoch": 0.1519815313582147, + "flos": 511510671360.0, + "grad_norm": 0.05728483560240697, + "language_loss": 0.84466863, + "learning_rate": 0.0009614914228702503, + "loss": 0.85619313, + "num_input_tokens_seen": 65544720, + "router_z_loss_mlp": 0.35498047, + "step": 790, + "time_per_iteration": 2.6616339683532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142719, + "balance_loss_mlp": 1.10850596, + "epoch": 0.15217391304347827, + "flos": 683747122176.0, + "grad_norm": 0.057799273493116435, + "language_loss": 0.89279461, + "learning_rate": 0.0009613714390116581, + "loss": 0.90422177, + "num_input_tokens_seen": 65627872, + "router_z_loss_mlp": 0.34204102, + "step": 791, + "time_per_iteration": 2.947608470916748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133841, + "balance_loss_mlp": 1.0997231, + "epoch": 0.15236629472874183, + "flos": 643873342464.0, + "grad_norm": 0.06413295296627212, + "language_loss": 0.86589968, + "learning_rate": 0.0009612512760327879, + "loss": 0.87723809, + "num_input_tokens_seen": 65705264, + "router_z_loss_mlp": 0.34155273, + "step": 792, + "time_per_iteration": 2.8261189460754395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124449, + "balance_loss_mlp": 1.08727932, + "epoch": 0.1525586764140054, + "flos": 412654791168.0, + "grad_norm": 0.06095846853214657, + "language_loss": 0.85749042, + "learning_rate": 0.0009611309339802909, + "loss": 0.86873484, + "num_input_tokens_seen": 65768592, + "router_z_loss_mlp": 0.37182617, + "step": 793, + "time_per_iteration": 2.438474178314209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113041, + "balance_loss_mlp": 1.07811236, + "epoch": 0.15275105809926895, + "flos": 802444644864.0, + "grad_norm": 0.04691390558901254, + "language_loss": 0.84620011, + "learning_rate": 0.0009610104129008881, + "loss": 0.85733056, + "num_input_tokens_seen": 65852432, + "router_z_loss_mlp": 0.34985352, + "step": 794, + "time_per_iteration": 3.1149892807006836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092099, + "balance_loss_mlp": 1.05786228, + "epoch": 0.1529434397845325, + "flos": 612143115264.0, + "grad_norm": 0.06446455819394356, + "language_loss": 0.88995111, + "learning_rate": 0.0009608897128413701, + "loss": 0.90087205, + "num_input_tokens_seen": 65927904, + "router_z_loss_mlp": 0.3425293, + "step": 795, + "time_per_iteration": 2.7310965061187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085841, + "balance_loss_mlp": 1.05160367, + "epoch": 0.15313582146979607, + "flos": 614941009920.0, + "grad_norm": 0.04580320827636504, + "language_loss": 0.8595438, + "learning_rate": 0.0009607688338485965, + "loss": 0.87040222, + "num_input_tokens_seen": 66006800, + "router_z_loss_mlp": 0.3425293, + "step": 796, + "time_per_iteration": 2.8534584045410156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088916, + "balance_loss_mlp": 1.05427384, + "epoch": 0.15332820315505963, + "flos": 793256440320.0, + "grad_norm": 0.053101967265095064, + "language_loss": 0.91128695, + "learning_rate": 0.0009606477759694969, + "loss": 0.92217612, + "num_input_tokens_seen": 66088608, + "router_z_loss_mlp": 0.34643555, + "step": 797, + "time_per_iteration": 3.0544466972351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098724, + "balance_loss_mlp": 1.06441545, + "epoch": 0.1535205848403232, + "flos": 549945510912.0, + "grad_norm": 0.0662794157411924, + "language_loss": 0.87591946, + "learning_rate": 0.0009605265392510703, + "loss": 0.88690674, + "num_input_tokens_seen": 66153616, + "router_z_loss_mlp": 0.34350586, + "step": 798, + "time_per_iteration": 2.6120660305023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011091, + "balance_loss_mlp": 1.07417202, + "epoch": 0.15371296652558677, + "flos": 535691045376.0, + "grad_norm": 0.07220239734969772, + "language_loss": 0.92342889, + "learning_rate": 0.0009604051237403846, + "loss": 0.93451989, + "num_input_tokens_seen": 66219472, + "router_z_loss_mlp": 0.34960938, + "step": 799, + "time_per_iteration": 2.640749216079712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111728, + "balance_loss_mlp": 1.07808757, + "epoch": 0.15390534821085033, + "flos": 395002939392.0, + "grad_norm": 0.06314402273456009, + "language_loss": 0.86126584, + "learning_rate": 0.0009602835294845776, + "loss": 0.87238312, + "num_input_tokens_seen": 66281456, + "router_z_loss_mlp": 0.33666992, + "step": 800, + "time_per_iteration": 2.44914174079895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117351, + "balance_loss_mlp": 1.08254242, + "epoch": 0.1540977298961139, + "flos": 535587738624.0, + "grad_norm": 0.057636094576239, + "language_loss": 0.91100746, + "learning_rate": 0.0009601617565308565, + "loss": 0.92218101, + "num_input_tokens_seen": 66348160, + "router_z_loss_mlp": 0.34790039, + "step": 801, + "time_per_iteration": 2.599679470062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119062, + "balance_loss_mlp": 1.08511138, + "epoch": 0.15429011158137745, + "flos": 723388147200.0, + "grad_norm": 0.05961266019354579, + "language_loss": 0.86783326, + "learning_rate": 0.0009600398049264977, + "loss": 0.87902391, + "num_input_tokens_seen": 66430576, + "router_z_loss_mlp": 0.33935547, + "step": 802, + "time_per_iteration": 2.9514007568359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121787, + "balance_loss_mlp": 1.08735943, + "epoch": 0.154482493266641, + "flos": 620209502208.0, + "grad_norm": 0.06366105456569557, + "language_loss": 0.92098475, + "learning_rate": 0.0009599176747188469, + "loss": 0.9322027, + "num_input_tokens_seen": 66506480, + "router_z_loss_mlp": 0.34448242, + "step": 803, + "time_per_iteration": 2.8068411350250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114024, + "balance_loss_mlp": 1.08012128, + "epoch": 0.15467487495190457, + "flos": 525351909888.0, + "grad_norm": 0.08101366702111423, + "language_loss": 0.83651662, + "learning_rate": 0.0009597953659553196, + "loss": 0.84765685, + "num_input_tokens_seen": 66577680, + "router_z_loss_mlp": 0.33911133, + "step": 804, + "time_per_iteration": 2.7075448036193848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098377, + "balance_loss_mlp": 1.06616712, + "epoch": 0.15486725663716813, + "flos": 527473299456.0, + "grad_norm": 0.07377431927286832, + "language_loss": 0.89624304, + "learning_rate": 0.0009596728786833997, + "loss": 0.90722686, + "num_input_tokens_seen": 66648496, + "router_z_loss_mlp": 0.32202148, + "step": 805, + "time_per_iteration": 2.6376051902770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088085, + "balance_loss_mlp": 1.05420554, + "epoch": 0.1550596383224317, + "flos": 1048118482944.0, + "grad_norm": 0.06708822771662253, + "language_loss": 0.90018022, + "learning_rate": 0.0009595502129506415, + "loss": 0.91106105, + "num_input_tokens_seen": 66735216, + "router_z_loss_mlp": 0.33911133, + "step": 806, + "time_per_iteration": 3.3391284942626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092582, + "balance_loss_mlp": 1.05903625, + "epoch": 0.15525202000769528, + "flos": 613438050816.0, + "grad_norm": 0.06052700763637142, + "language_loss": 0.83084035, + "learning_rate": 0.0009594273688046678, + "loss": 0.84176612, + "num_input_tokens_seen": 66810672, + "router_z_loss_mlp": 0.33544922, + "step": 807, + "time_per_iteration": 2.7136006355285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088184, + "balance_loss_mlp": 1.05273128, + "epoch": 0.15544440169295884, + "flos": 532805810688.0, + "grad_norm": 0.07048562468234597, + "language_loss": 0.86048424, + "learning_rate": 0.000959304346293171, + "loss": 0.87136608, + "num_input_tokens_seen": 66879824, + "router_z_loss_mlp": 0.35473633, + "step": 808, + "time_per_iteration": 2.6744906902313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097573, + "balance_loss_mlp": 1.06254935, + "epoch": 0.1556367833782224, + "flos": 644433546240.0, + "grad_norm": 0.06803397985071584, + "language_loss": 0.88331544, + "learning_rate": 0.0009591811454639125, + "loss": 0.89429116, + "num_input_tokens_seen": 66949424, + "router_z_loss_mlp": 0.3503418, + "step": 809, + "time_per_iteration": 2.730431079864502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094476, + "balance_loss_mlp": 1.0610261, + "epoch": 0.15582916506348596, + "flos": 543540644352.0, + "grad_norm": 0.06204685505428811, + "language_loss": 0.88227659, + "learning_rate": 0.0009590577663647234, + "loss": 0.89322132, + "num_input_tokens_seen": 67024000, + "router_z_loss_mlp": 0.3347168, + "step": 810, + "time_per_iteration": 2.71469783782959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104684, + "balance_loss_mlp": 1.07078123, + "epoch": 0.15602154674874952, + "flos": 579740613120.0, + "grad_norm": 0.05672341894910533, + "language_loss": 0.86610442, + "learning_rate": 0.0009589342090435036, + "loss": 0.87715125, + "num_input_tokens_seen": 67100672, + "router_z_loss_mlp": 0.33935547, + "step": 811, + "time_per_iteration": 2.799246072769165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110387, + "balance_loss_mlp": 1.06918025, + "epoch": 0.15621392843401308, + "flos": 534982454784.0, + "grad_norm": 0.0647852675732537, + "language_loss": 0.87778354, + "learning_rate": 0.0009588104735482223, + "loss": 0.8888222, + "num_input_tokens_seen": 67171584, + "router_z_loss_mlp": 0.34692383, + "step": 812, + "time_per_iteration": 2.6684510707855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126921, + "balance_loss_mlp": 1.09106326, + "epoch": 0.15640631011927664, + "flos": 550635162624.0, + "grad_norm": 0.08222618986335321, + "language_loss": 0.84280443, + "learning_rate": 0.0009586865599269177, + "loss": 0.85407358, + "num_input_tokens_seen": 67240640, + "router_z_loss_mlp": 0.35864258, + "step": 813, + "time_per_iteration": 2.6293816566467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131277, + "balance_loss_mlp": 1.09651566, + "epoch": 0.1565986918045402, + "flos": 637190641152.0, + "grad_norm": 0.05945515562529824, + "language_loss": 0.88725412, + "learning_rate": 0.0009585624682276977, + "loss": 0.89856684, + "num_input_tokens_seen": 67312976, + "router_z_loss_mlp": 0.34814453, + "step": 814, + "time_per_iteration": 2.744253158569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137563, + "balance_loss_mlp": 1.10113239, + "epoch": 0.15679107348980378, + "flos": 490569122304.0, + "grad_norm": 0.09591637295165127, + "language_loss": 0.87945771, + "learning_rate": 0.0009584381984987386, + "loss": 0.89083332, + "num_input_tokens_seen": 67378528, + "router_z_loss_mlp": 0.36474609, + "step": 815, + "time_per_iteration": 2.5264036655426025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124613, + "balance_loss_mlp": 1.0911386, + "epoch": 0.15698345517506734, + "flos": 529689231360.0, + "grad_norm": 0.05838460881618622, + "language_loss": 0.90277314, + "learning_rate": 0.0009583137507882864, + "loss": 0.91401929, + "num_input_tokens_seen": 67449728, + "router_z_loss_mlp": 0.3347168, + "step": 816, + "time_per_iteration": 2.6488330364227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109898, + "balance_loss_mlp": 1.07418323, + "epoch": 0.1571758368603309, + "flos": 545779897344.0, + "grad_norm": 0.07313796537718548, + "language_loss": 0.81262791, + "learning_rate": 0.000958189125144656, + "loss": 0.82372689, + "num_input_tokens_seen": 67520512, + "router_z_loss_mlp": 0.35766602, + "step": 817, + "time_per_iteration": 2.7040657997131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101746, + "balance_loss_mlp": 1.06672239, + "epoch": 0.15736821854559446, + "flos": 565377048576.0, + "grad_norm": 0.067694528538076, + "language_loss": 0.88558215, + "learning_rate": 0.0009580643216162313, + "loss": 0.89659959, + "num_input_tokens_seen": 67592464, + "router_z_loss_mlp": 0.3503418, + "step": 818, + "time_per_iteration": 2.6538634300231934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096156, + "balance_loss_mlp": 1.06110835, + "epoch": 0.15756060023085802, + "flos": 500707436544.0, + "grad_norm": 0.05957146674366314, + "language_loss": 0.79884583, + "learning_rate": 0.0009579393402514652, + "loss": 0.80980736, + "num_input_tokens_seen": 67658928, + "router_z_loss_mlp": 0.35107422, + "step": 819, + "time_per_iteration": 2.5606625080108643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082975, + "balance_loss_mlp": 1.048738, + "epoch": 0.15775298191612158, + "flos": 519014034432.0, + "grad_norm": 0.06194437160070725, + "language_loss": 0.91126758, + "learning_rate": 0.0009578141810988801, + "loss": 0.92209733, + "num_input_tokens_seen": 67727936, + "router_z_loss_mlp": 0.34228516, + "step": 820, + "time_per_iteration": 2.55538010597229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082194, + "balance_loss_mlp": 1.04712272, + "epoch": 0.15794536360138514, + "flos": 465891153408.0, + "grad_norm": 0.060184436438788555, + "language_loss": 0.91010749, + "learning_rate": 0.0009576888442070668, + "loss": 0.92092943, + "num_input_tokens_seen": 67795488, + "router_z_loss_mlp": 0.35083008, + "step": 821, + "time_per_iteration": 2.6139276027679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094225, + "balance_loss_mlp": 1.05982161, + "epoch": 0.1581377452866487, + "flos": 516911583744.0, + "grad_norm": 0.06832586535724347, + "language_loss": 0.92820144, + "learning_rate": 0.0009575633296246854, + "loss": 0.93914366, + "num_input_tokens_seen": 67858896, + "router_z_loss_mlp": 0.34423828, + "step": 822, + "time_per_iteration": 2.557404041290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096972, + "balance_loss_mlp": 1.06242526, + "epoch": 0.15833012697191226, + "flos": 549522109440.0, + "grad_norm": 0.06257557491721027, + "language_loss": 0.83520567, + "learning_rate": 0.0009574376374004652, + "loss": 0.84617537, + "num_input_tokens_seen": 67924864, + "router_z_loss_mlp": 0.34570312, + "step": 823, + "time_per_iteration": 2.673220157623291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100077, + "balance_loss_mlp": 1.06395626, + "epoch": 0.15852250865717585, + "flos": 487206641664.0, + "grad_norm": 0.07116075590187526, + "language_loss": 0.81073487, + "learning_rate": 0.000957311767583204, + "loss": 0.82173562, + "num_input_tokens_seen": 67992912, + "router_z_loss_mlp": 0.36132812, + "step": 824, + "time_per_iteration": 2.605074882507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159484, + "balance_loss_mlp": 1.14126849, + "epoch": 0.1587148903424394, + "flos": 1309041672192.0, + "grad_norm": 0.051809649393169656, + "language_loss": 0.8207159, + "learning_rate": 0.0009571857202217691, + "loss": 0.83231074, + "num_input_tokens_seen": 68207408, + "router_z_loss_mlp": 0.18261719, + "step": 825, + "time_per_iteration": 4.726073265075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111019, + "balance_loss_mlp": 1.07349157, + "epoch": 0.15890727202770297, + "flos": 466634649600.0, + "grad_norm": 0.07947222616221912, + "language_loss": 0.92132723, + "learning_rate": 0.0009570594953650961, + "loss": 0.93243748, + "num_input_tokens_seen": 68270864, + "router_z_loss_mlp": 0.37524414, + "step": 826, + "time_per_iteration": 2.5146830081939697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109067, + "balance_loss_mlp": 1.07208848, + "epoch": 0.15909965371296653, + "flos": 776733608448.0, + "grad_norm": 0.06013225990958685, + "language_loss": 0.80852252, + "learning_rate": 0.00095693309306219, + "loss": 0.81961316, + "num_input_tokens_seen": 68355408, + "router_z_loss_mlp": 0.36962891, + "step": 827, + "time_per_iteration": 3.095632553100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117102, + "balance_loss_mlp": 1.07945621, + "epoch": 0.1592920353982301, + "flos": 1077852538368.0, + "grad_norm": 0.05984978885312211, + "language_loss": 0.88600951, + "learning_rate": 0.0009568065133621244, + "loss": 0.89718056, + "num_input_tokens_seen": 68437072, + "router_z_loss_mlp": 0.37646484, + "step": 828, + "time_per_iteration": 3.3153574466705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111269, + "balance_loss_mlp": 1.07584, + "epoch": 0.15948441708349365, + "flos": 725307305472.0, + "grad_norm": 0.0632864692280333, + "language_loss": 0.85493571, + "learning_rate": 0.0009566797563140422, + "loss": 0.86604846, + "num_input_tokens_seen": 68511696, + "router_z_loss_mlp": 0.35449219, + "step": 829, + "time_per_iteration": 2.8705785274505615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121543, + "balance_loss_mlp": 1.08470702, + "epoch": 0.1596767987687572, + "flos": 578447087616.0, + "grad_norm": 0.06433687205870958, + "language_loss": 0.88630873, + "learning_rate": 0.0009565528219671547, + "loss": 0.89752412, + "num_input_tokens_seen": 68587488, + "router_z_loss_mlp": 0.36816406, + "step": 830, + "time_per_iteration": 2.8890771865844727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137333, + "balance_loss_mlp": 1.10049748, + "epoch": 0.15986918045402077, + "flos": 528728947200.0, + "grad_norm": 0.04994246668943954, + "language_loss": 0.85232639, + "learning_rate": 0.0009564257103707418, + "loss": 0.86369967, + "num_input_tokens_seen": 68655760, + "router_z_loss_mlp": 0.36816406, + "step": 831, + "time_per_iteration": 2.5870308876037598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133852, + "balance_loss_mlp": 1.09632492, + "epoch": 0.16006156213928435, + "flos": 574313559552.0, + "grad_norm": 0.0648316290803925, + "language_loss": 0.91675746, + "learning_rate": 0.0009562984215741533, + "loss": 0.92809594, + "num_input_tokens_seen": 68724560, + "router_z_loss_mlp": 0.37524414, + "step": 832, + "time_per_iteration": 2.655066967010498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117496, + "balance_loss_mlp": 1.08170903, + "epoch": 0.1602539438245479, + "flos": 515258675712.0, + "grad_norm": 0.14271195523272245, + "language_loss": 0.82911491, + "learning_rate": 0.0009561709556268065, + "loss": 0.84028995, + "num_input_tokens_seen": 68795440, + "router_z_loss_mlp": 0.35839844, + "step": 833, + "time_per_iteration": 2.69999098777771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119914, + "balance_loss_mlp": 1.08419931, + "epoch": 0.16044632550981147, + "flos": 620730418176.0, + "grad_norm": 0.05962773238435596, + "language_loss": 0.95060706, + "learning_rate": 0.0009560433125781884, + "loss": 0.96180618, + "num_input_tokens_seen": 68868176, + "router_z_loss_mlp": 0.35693359, + "step": 834, + "time_per_iteration": 2.711109161376953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126421, + "balance_loss_mlp": 1.08977628, + "epoch": 0.16063870719507503, + "flos": 560817146880.0, + "grad_norm": 0.06388697234939344, + "language_loss": 0.92829657, + "learning_rate": 0.0009559154924778544, + "loss": 0.93956077, + "num_input_tokens_seen": 68939616, + "router_z_loss_mlp": 0.36621094, + "step": 835, + "time_per_iteration": 2.695260763168335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121916, + "balance_loss_mlp": 1.08789361, + "epoch": 0.1608310888803386, + "flos": 804778440192.0, + "grad_norm": 0.05750453212643973, + "language_loss": 0.85217482, + "learning_rate": 0.0009557874953754284, + "loss": 0.86339402, + "num_input_tokens_seen": 69016192, + "router_z_loss_mlp": 0.34057617, + "step": 836, + "time_per_iteration": 3.002013921737671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126166, + "balance_loss_mlp": 1.09204817, + "epoch": 0.16102347056560215, + "flos": 600311195136.0, + "grad_norm": 0.06332628409766573, + "language_loss": 0.84060842, + "learning_rate": 0.0009556593213206038, + "loss": 0.85187006, + "num_input_tokens_seen": 69089360, + "router_z_loss_mlp": 0.34130859, + "step": 837, + "time_per_iteration": 2.698716163635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125003, + "balance_loss_mlp": 1.09102869, + "epoch": 0.1612158522508657, + "flos": 553235208192.0, + "grad_norm": 0.07524747482874264, + "language_loss": 0.87844718, + "learning_rate": 0.0009555309703631414, + "loss": 0.88969719, + "num_input_tokens_seen": 69161952, + "router_z_loss_mlp": 0.33984375, + "step": 838, + "time_per_iteration": 2.669588327407837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133813, + "balance_loss_mlp": 1.09752607, + "epoch": 0.16140823393612927, + "flos": 555701423616.0, + "grad_norm": 0.07144746672945328, + "language_loss": 0.87685311, + "learning_rate": 0.0009554024425528722, + "loss": 0.88819122, + "num_input_tokens_seen": 69232432, + "router_z_loss_mlp": 0.36279297, + "step": 839, + "time_per_iteration": 2.6809709072113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112028, + "balance_loss_mlp": 1.08737814, + "epoch": 0.16160061562139286, + "flos": 543613427712.0, + "grad_norm": 0.06970106087394082, + "language_loss": 0.8929134, + "learning_rate": 0.0009552737379396948, + "loss": 0.90411627, + "num_input_tokens_seen": 69297696, + "router_z_loss_mlp": 0.32885742, + "step": 840, + "time_per_iteration": 2.6100995540618896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102587, + "balance_loss_mlp": 1.06920815, + "epoch": 0.16179299730665642, + "flos": 603590717952.0, + "grad_norm": 0.06131687325166246, + "language_loss": 0.87945604, + "learning_rate": 0.0009551448565735767, + "loss": 0.89048195, + "num_input_tokens_seen": 69373888, + "router_z_loss_mlp": 0.33398438, + "step": 841, + "time_per_iteration": 2.7360360622406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095736, + "balance_loss_mlp": 1.06168985, + "epoch": 0.16198537899191998, + "flos": 786821050368.0, + "grad_norm": 0.07162496841720159, + "language_loss": 0.8519845, + "learning_rate": 0.0009550157985045543, + "loss": 0.86294186, + "num_input_tokens_seen": 69449984, + "router_z_loss_mlp": 0.34082031, + "step": 842, + "time_per_iteration": 3.0436456203460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087632, + "balance_loss_mlp": 1.05413389, + "epoch": 0.16217776067718354, + "flos": 519550917120.0, + "grad_norm": 0.060562390499230526, + "language_loss": 0.89622426, + "learning_rate": 0.0009548865637827321, + "loss": 0.90710062, + "num_input_tokens_seen": 69522736, + "router_z_loss_mlp": 0.33496094, + "step": 843, + "time_per_iteration": 2.6422221660614014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086448, + "balance_loss_mlp": 1.05342698, + "epoch": 0.1623701423624471, + "flos": 505015644672.0, + "grad_norm": 0.07097995853224412, + "language_loss": 0.90216166, + "learning_rate": 0.0009547571524582838, + "loss": 0.91302609, + "num_input_tokens_seen": 69587184, + "router_z_loss_mlp": 0.33032227, + "step": 844, + "time_per_iteration": 2.6082894802093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095057, + "balance_loss_mlp": 1.06031966, + "epoch": 0.16256252404771065, + "flos": 496940493312.0, + "grad_norm": 0.06932052947515681, + "language_loss": 0.92511153, + "learning_rate": 0.0009546275645814512, + "loss": 0.9360621, + "num_input_tokens_seen": 69656560, + "router_z_loss_mlp": 0.34765625, + "step": 845, + "time_per_iteration": 2.5985872745513916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100013, + "balance_loss_mlp": 1.065418, + "epoch": 0.16275490573297421, + "flos": 502110061056.0, + "grad_norm": 0.07540183512891604, + "language_loss": 0.90294898, + "learning_rate": 0.0009544978002025446, + "loss": 0.91394913, + "num_input_tokens_seen": 69723872, + "router_z_loss_mlp": 0.34619141, + "step": 846, + "time_per_iteration": 2.5778391361236572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096289, + "balance_loss_mlp": 1.06174231, + "epoch": 0.16294728741823777, + "flos": 506952179712.0, + "grad_norm": 0.06018935314915502, + "language_loss": 0.87532055, + "learning_rate": 0.0009543678593719434, + "loss": 0.8862834, + "num_input_tokens_seen": 69795504, + "router_z_loss_mlp": 0.34570312, + "step": 847, + "time_per_iteration": 2.697566270828247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102624, + "balance_loss_mlp": 1.06824434, + "epoch": 0.16313966910350133, + "flos": 509418395136.0, + "grad_norm": 0.054217985504269955, + "language_loss": 0.8754853, + "learning_rate": 0.0009542377421400945, + "loss": 0.88651162, + "num_input_tokens_seen": 69873408, + "router_z_loss_mlp": 0.34375, + "step": 848, + "time_per_iteration": 2.786766290664673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104457, + "balance_loss_mlp": 1.06847942, + "epoch": 0.16333205078876492, + "flos": 543712352256.0, + "grad_norm": 0.06122856356214084, + "language_loss": 0.83524954, + "learning_rate": 0.0009541074485575145, + "loss": 0.84629411, + "num_input_tokens_seen": 69944112, + "router_z_loss_mlp": 0.35986328, + "step": 849, + "time_per_iteration": 2.713759183883667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098701, + "balance_loss_mlp": 1.06346297, + "epoch": 0.16352443247402848, + "flos": 507477477888.0, + "grad_norm": 0.06331477383231503, + "language_loss": 0.92240757, + "learning_rate": 0.0009539769786747874, + "loss": 0.93339461, + "num_input_tokens_seen": 70012288, + "router_z_loss_mlp": 0.35253906, + "step": 850, + "time_per_iteration": 2.589945077896118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100894, + "balance_loss_mlp": 1.06584692, + "epoch": 0.16371681415929204, + "flos": 541851420672.0, + "grad_norm": 0.06704648725492578, + "language_loss": 0.81567919, + "learning_rate": 0.0009538463325425665, + "loss": 0.82668811, + "num_input_tokens_seen": 70086560, + "router_z_loss_mlp": 0.35083008, + "step": 851, + "time_per_iteration": 2.6779844760894775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105544, + "balance_loss_mlp": 1.07042515, + "epoch": 0.1639091958445556, + "flos": 520501026816.0, + "grad_norm": 0.058426853420895056, + "language_loss": 0.8673842, + "learning_rate": 0.0009537155102115728, + "loss": 0.87843966, + "num_input_tokens_seen": 70153968, + "router_z_loss_mlp": 0.35131836, + "step": 852, + "time_per_iteration": 2.5614206790924072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106136, + "balance_loss_mlp": 1.07175565, + "epoch": 0.16410157752981916, + "flos": 547149026304.0, + "grad_norm": 0.06460558975646845, + "language_loss": 0.83482397, + "learning_rate": 0.0009535845117325961, + "loss": 0.84588534, + "num_input_tokens_seen": 70222496, + "router_z_loss_mlp": 0.34423828, + "step": 853, + "time_per_iteration": 2.6453073024749756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098632, + "balance_loss_mlp": 1.06470513, + "epoch": 0.16429395921508272, + "flos": 582561828864.0, + "grad_norm": 0.052152281018199936, + "language_loss": 0.93584174, + "learning_rate": 0.0009534533371564946, + "loss": 0.94682807, + "num_input_tokens_seen": 70301680, + "router_z_loss_mlp": 0.33959961, + "step": 854, + "time_per_iteration": 2.75186824798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111206, + "balance_loss_mlp": 1.07670665, + "epoch": 0.16448634090034628, + "flos": 530678628864.0, + "grad_norm": 0.06475772966833339, + "language_loss": 0.8907218, + "learning_rate": 0.0009533219865341949, + "loss": 0.90183383, + "num_input_tokens_seen": 70371152, + "router_z_loss_mlp": 0.3449707, + "step": 855, + "time_per_iteration": 2.581479787826538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108852, + "balance_loss_mlp": 1.07285094, + "epoch": 0.16467872258560984, + "flos": 491623948800.0, + "grad_norm": 0.06378602693040462, + "language_loss": 0.87287533, + "learning_rate": 0.0009531904599166916, + "loss": 0.88396388, + "num_input_tokens_seen": 70440832, + "router_z_loss_mlp": 0.36035156, + "step": 856, + "time_per_iteration": 2.6429831981658936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107008, + "balance_loss_mlp": 1.07141232, + "epoch": 0.16487110427087343, + "flos": 506015216640.0, + "grad_norm": 0.07162133431974482, + "language_loss": 0.85139728, + "learning_rate": 0.0009530587573550478, + "loss": 0.86246729, + "num_input_tokens_seen": 70507424, + "router_z_loss_mlp": 0.35620117, + "step": 857, + "time_per_iteration": 2.5667338371276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071319, + "balance_loss_mlp": 1.05434394, + "epoch": 0.16506348595613698, + "flos": 1432006553088.0, + "grad_norm": 0.02717136097410494, + "language_loss": 0.74319386, + "learning_rate": 0.0009529268789003953, + "loss": 0.75390708, + "num_input_tokens_seen": 70742320, + "router_z_loss_mlp": 0.16992188, + "step": 858, + "time_per_iteration": 5.02930474281311 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107487, + "balance_loss_mlp": 1.0740366, + "epoch": 0.16525586764140054, + "flos": 476890827264.0, + "grad_norm": 0.06438670275364486, + "language_loss": 0.90481895, + "learning_rate": 0.0009527948246039337, + "loss": 0.91589379, + "num_input_tokens_seen": 70808400, + "router_z_loss_mlp": 0.33447266, + "step": 859, + "time_per_iteration": 2.5222055912017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109782, + "balance_loss_mlp": 1.07618856, + "epoch": 0.1654482493266641, + "flos": 880737297408.0, + "grad_norm": 0.058857893791213665, + "language_loss": 0.88361865, + "learning_rate": 0.000952662594516931, + "loss": 0.8947165, + "num_input_tokens_seen": 70886192, + "router_z_loss_mlp": 0.33618164, + "step": 860, + "time_per_iteration": 3.065053701400757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109979, + "balance_loss_mlp": 1.07497942, + "epoch": 0.16564063101192766, + "flos": 626527028736.0, + "grad_norm": 0.058557043780191484, + "language_loss": 0.86803752, + "learning_rate": 0.0009525301886907234, + "loss": 0.87913728, + "num_input_tokens_seen": 70964816, + "router_z_loss_mlp": 0.34985352, + "step": 861, + "time_per_iteration": 2.873415470123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121945, + "balance_loss_mlp": 1.08537149, + "epoch": 0.16583301269719122, + "flos": 561250722816.0, + "grad_norm": 0.0761086770239273, + "language_loss": 0.8825953, + "learning_rate": 0.0009523976071767155, + "loss": 0.8938148, + "num_input_tokens_seen": 71037456, + "router_z_loss_mlp": 0.36572266, + "step": 862, + "time_per_iteration": 2.71508526802063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115561, + "balance_loss_mlp": 1.07994115, + "epoch": 0.16602539438245478, + "flos": 567510022656.0, + "grad_norm": 0.05388299317844869, + "language_loss": 0.88433009, + "learning_rate": 0.00095226485002638, + "loss": 0.8954857, + "num_input_tokens_seen": 71111872, + "router_z_loss_mlp": 0.35620117, + "step": 863, + "time_per_iteration": 2.7524497509002686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111173, + "balance_loss_mlp": 1.07617354, + "epoch": 0.16621777606771834, + "flos": 574589984256.0, + "grad_norm": 0.05833582522103205, + "language_loss": 0.89311475, + "learning_rate": 0.0009521319172912576, + "loss": 0.90422642, + "num_input_tokens_seen": 71187808, + "router_z_loss_mlp": 0.35009766, + "step": 864, + "time_per_iteration": 2.717493772506714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134798, + "balance_loss_mlp": 1.09846306, + "epoch": 0.16641015775298193, + "flos": 514292599296.0, + "grad_norm": 0.05644176285984134, + "language_loss": 0.94990546, + "learning_rate": 0.0009519988090229579, + "loss": 0.96125346, + "num_input_tokens_seen": 71261728, + "router_z_loss_mlp": 0.36352539, + "step": 865, + "time_per_iteration": 2.6850624084472656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133116, + "balance_loss_mlp": 1.09668565, + "epoch": 0.1666025394382455, + "flos": 621395338752.0, + "grad_norm": 0.05816643645022503, + "language_loss": 0.88684535, + "learning_rate": 0.0009518655252731576, + "loss": 0.89817655, + "num_input_tokens_seen": 71338352, + "router_z_loss_mlp": 0.36450195, + "step": 866, + "time_per_iteration": 2.7240021228790283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124082, + "balance_loss_mlp": 1.0882715, + "epoch": 0.16679492112350905, + "flos": 548528329728.0, + "grad_norm": 0.06128727898968579, + "language_loss": 0.9070124, + "learning_rate": 0.0009517320660936022, + "loss": 0.91825324, + "num_input_tokens_seen": 71416544, + "router_z_loss_mlp": 0.35839844, + "step": 867, + "time_per_iteration": 2.6959731578826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118134, + "balance_loss_mlp": 1.08260965, + "epoch": 0.1669873028087726, + "flos": 665379477504.0, + "grad_norm": 0.05857722537468161, + "language_loss": 0.83557463, + "learning_rate": 0.0009515984315361051, + "loss": 0.84675598, + "num_input_tokens_seen": 71494080, + "router_z_loss_mlp": 0.35546875, + "step": 868, + "time_per_iteration": 2.813674211502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122458, + "balance_loss_mlp": 1.08638549, + "epoch": 0.16717968449403617, + "flos": 538305647616.0, + "grad_norm": 0.06553445455365839, + "language_loss": 0.87103701, + "learning_rate": 0.000951464621652548, + "loss": 0.88226151, + "num_input_tokens_seen": 71562672, + "router_z_loss_mlp": 0.36083984, + "step": 869, + "time_per_iteration": 2.674333333969116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111253, + "balance_loss_mlp": 1.07757819, + "epoch": 0.16737206617929973, + "flos": 529833235968.0, + "grad_norm": 0.059309523866322815, + "language_loss": 0.78951609, + "learning_rate": 0.0009513306364948804, + "loss": 0.80064136, + "num_input_tokens_seen": 71641904, + "router_z_loss_mlp": 0.34985352, + "step": 870, + "time_per_iteration": 2.7519431114196777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011143, + "balance_loss_mlp": 1.07953846, + "epoch": 0.1675644478645633, + "flos": 480529732608.0, + "grad_norm": 0.06711563999134491, + "language_loss": 0.89559376, + "learning_rate": 0.0009511964761151197, + "loss": 0.90673673, + "num_input_tokens_seen": 71709616, + "router_z_loss_mlp": 0.34814453, + "step": 871, + "time_per_iteration": 2.544520854949951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113298, + "balance_loss_mlp": 1.07820272, + "epoch": 0.16775682954982685, + "flos": 494311334400.0, + "grad_norm": 0.06484202096701225, + "language_loss": 0.9050945, + "learning_rate": 0.0009510621405653521, + "loss": 0.91622752, + "num_input_tokens_seen": 71776592, + "router_z_loss_mlp": 0.35131836, + "step": 872, + "time_per_iteration": 2.594224452972412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106918, + "balance_loss_mlp": 1.07265687, + "epoch": 0.1679492112350904, + "flos": 751694846976.0, + "grad_norm": 0.060317450015561574, + "language_loss": 0.847211, + "learning_rate": 0.0009509276298977309, + "loss": 0.85828018, + "num_input_tokens_seen": 71856352, + "router_z_loss_mlp": 0.34277344, + "step": 873, + "time_per_iteration": 2.9428915977478027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110568, + "balance_loss_mlp": 1.07261181, + "epoch": 0.168141592920354, + "flos": 1135413075456.0, + "grad_norm": 0.05441785661992682, + "language_loss": 0.81867516, + "learning_rate": 0.0009507929441644778, + "loss": 0.82978088, + "num_input_tokens_seen": 71948480, + "router_z_loss_mlp": 0.37939453, + "step": 874, + "time_per_iteration": 3.52008318901062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101336, + "balance_loss_mlp": 1.06640816, + "epoch": 0.16833397460561755, + "flos": 632114205696.0, + "grad_norm": 0.06557720885733571, + "language_loss": 0.86201179, + "learning_rate": 0.0009506580834178826, + "loss": 0.87302518, + "num_input_tokens_seen": 72019200, + "router_z_loss_mlp": 0.34936523, + "step": 875, + "time_per_iteration": 2.7744014263153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110472, + "balance_loss_mlp": 1.06817079, + "epoch": 0.1685263562908811, + "flos": 541171943424.0, + "grad_norm": 0.06007828909359903, + "language_loss": 0.91612709, + "learning_rate": 0.0009505230477103028, + "loss": 0.92717427, + "num_input_tokens_seen": 72088672, + "router_z_loss_mlp": 0.36547852, + "step": 876, + "time_per_iteration": 2.6593635082244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103279, + "balance_loss_mlp": 1.06703997, + "epoch": 0.16871873797614467, + "flos": 619036812288.0, + "grad_norm": 0.08702038824672748, + "language_loss": 0.81312418, + "learning_rate": 0.0009503878370941641, + "loss": 0.824157, + "num_input_tokens_seen": 72159952, + "router_z_loss_mlp": 0.36206055, + "step": 877, + "time_per_iteration": 2.7511024475097656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094199, + "balance_loss_mlp": 1.05986643, + "epoch": 0.16891111966140823, + "flos": 606067107840.0, + "grad_norm": 0.06953183101172467, + "language_loss": 0.88841844, + "learning_rate": 0.0009502524516219595, + "loss": 0.89936042, + "num_input_tokens_seen": 72231648, + "router_z_loss_mlp": 0.34375, + "step": 878, + "time_per_iteration": 2.697455406188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091575, + "balance_loss_mlp": 1.05757689, + "epoch": 0.1691035013466718, + "flos": 552058136064.0, + "grad_norm": 0.0721678347454753, + "language_loss": 0.89980447, + "learning_rate": 0.0009501168913462506, + "loss": 0.91072023, + "num_input_tokens_seen": 72298608, + "router_z_loss_mlp": 0.34008789, + "step": 879, + "time_per_iteration": 2.6825287342071533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080465, + "balance_loss_mlp": 1.06263125, + "epoch": 0.16929588303193535, + "flos": 1475544121344.0, + "grad_norm": 0.044515803528062385, + "language_loss": 0.79121923, + "learning_rate": 0.0009499811563196665, + "loss": 0.80202389, + "num_input_tokens_seen": 72525312, + "router_z_loss_mlp": 0.17871094, + "step": 880, + "time_per_iteration": 4.825777769088745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081319, + "balance_loss_mlp": 1.0464623, + "epoch": 0.1694882647171989, + "flos": 925850456064.0, + "grad_norm": 0.06491790696384477, + "language_loss": 0.85360616, + "learning_rate": 0.0009498452465949042, + "loss": 0.86441934, + "num_input_tokens_seen": 72612976, + "router_z_loss_mlp": 0.34887695, + "step": 881, + "time_per_iteration": 3.2700376510620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086319, + "balance_loss_mlp": 1.05227244, + "epoch": 0.1696806464024625, + "flos": 545829359616.0, + "grad_norm": 0.057533624801199786, + "language_loss": 0.916857, + "learning_rate": 0.0009497091622247285, + "loss": 0.92772019, + "num_input_tokens_seen": 72686800, + "router_z_loss_mlp": 0.34082031, + "step": 882, + "time_per_iteration": 2.711721181869507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085559, + "balance_loss_mlp": 1.05184698, + "epoch": 0.16987302808772606, + "flos": 528970466304.0, + "grad_norm": 0.08384615451013337, + "language_loss": 0.93744707, + "learning_rate": 0.0009495729032619723, + "loss": 0.94830269, + "num_input_tokens_seen": 72759360, + "router_z_loss_mlp": 0.33740234, + "step": 883, + "time_per_iteration": 2.688525438308716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084621, + "balance_loss_mlp": 1.05062199, + "epoch": 0.17006540977298962, + "flos": 754855096320.0, + "grad_norm": 0.06073677328113264, + "language_loss": 0.84419179, + "learning_rate": 0.0009494364697595354, + "loss": 0.85503805, + "num_input_tokens_seen": 72831424, + "router_z_loss_mlp": 0.34033203, + "step": 884, + "time_per_iteration": 2.9112000465393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092434, + "balance_loss_mlp": 1.05750597, + "epoch": 0.17025779145825318, + "flos": 558532813824.0, + "grad_norm": 0.06728326387015754, + "language_loss": 0.89818925, + "learning_rate": 0.0009492998617703867, + "loss": 0.90911365, + "num_input_tokens_seen": 72901536, + "router_z_loss_mlp": 0.34936523, + "step": 885, + "time_per_iteration": 2.6760926246643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093981, + "balance_loss_mlp": 1.06045985, + "epoch": 0.17045017314351674, + "flos": 511963186176.0, + "grad_norm": 0.0687386743468794, + "language_loss": 0.87971282, + "learning_rate": 0.0009491630793475619, + "loss": 0.89065266, + "num_input_tokens_seen": 72970480, + "router_z_loss_mlp": 0.33520508, + "step": 886, + "time_per_iteration": 2.59726619720459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094476, + "balance_loss_mlp": 1.06011951, + "epoch": 0.1706425548287803, + "flos": 508674898944.0, + "grad_norm": 0.058204707286146434, + "language_loss": 0.85722017, + "learning_rate": 0.0009490261225441643, + "loss": 0.8681649, + "num_input_tokens_seen": 73053376, + "router_z_loss_mlp": 0.34350586, + "step": 887, + "time_per_iteration": 2.900501012802124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087689, + "balance_loss_mlp": 1.0545013, + "epoch": 0.17083493651404386, + "flos": 717016776192.0, + "grad_norm": 0.05310353290702558, + "language_loss": 0.90775931, + "learning_rate": 0.0009488889914133656, + "loss": 0.9186362, + "num_input_tokens_seen": 73136032, + "router_z_loss_mlp": 0.33203125, + "step": 888, + "time_per_iteration": 2.992532968521118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089684, + "balance_loss_mlp": 1.05520868, + "epoch": 0.17102731819930742, + "flos": 558852908544.0, + "grad_norm": 0.047287767355612194, + "language_loss": 0.88680297, + "learning_rate": 0.0009487516860084047, + "loss": 0.89769983, + "num_input_tokens_seen": 73208544, + "router_z_loss_mlp": 0.34472656, + "step": 889, + "time_per_iteration": 2.7029643058776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082798, + "balance_loss_mlp": 1.04858518, + "epoch": 0.17121969988457098, + "flos": 494542679040.0, + "grad_norm": 0.0765590367769256, + "language_loss": 0.88680983, + "learning_rate": 0.0009486142063825884, + "loss": 0.89763772, + "num_input_tokens_seen": 73274336, + "router_z_loss_mlp": 0.34228516, + "step": 890, + "time_per_iteration": 2.5640931129455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038716, + "balance_loss_mlp": 1.02402985, + "epoch": 0.17141208156983456, + "flos": 1548088063488.0, + "grad_norm": 0.02832238153451814, + "language_loss": 0.72426212, + "learning_rate": 0.0009484765525892909, + "loss": 0.7346493, + "num_input_tokens_seen": 73506320, + "router_z_loss_mlp": 0.14648438, + "step": 891, + "time_per_iteration": 4.979609251022339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108867, + "balance_loss_mlp": 1.0540278, + "epoch": 0.17160446325509812, + "flos": 619282713600.0, + "grad_norm": 0.06449268303648867, + "language_loss": 0.90758598, + "learning_rate": 0.0009483387246819542, + "loss": 0.91847265, + "num_input_tokens_seen": 73578048, + "router_z_loss_mlp": 0.34667969, + "step": 892, + "time_per_iteration": 2.731332540512085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032556, + "balance_loss_mlp": 1.01767898, + "epoch": 0.17179684494036168, + "flos": 1381026972672.0, + "grad_norm": 0.016720826063608682, + "language_loss": 0.82285583, + "learning_rate": 0.0009482007227140877, + "loss": 0.83318138, + "num_input_tokens_seen": 73798640, + "router_z_loss_mlp": 0.1484375, + "step": 893, + "time_per_iteration": 4.641844987869263 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097484, + "balance_loss_mlp": 1.06386662, + "epoch": 0.17198922662562524, + "flos": 492386383872.0, + "grad_norm": 0.05711411270468228, + "language_loss": 0.89587665, + "learning_rate": 0.0009480625467392688, + "loss": 0.90685147, + "num_input_tokens_seen": 73867328, + "router_z_loss_mlp": 0.33642578, + "step": 894, + "time_per_iteration": 2.6310250759124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033123, + "balance_loss_mlp": 1.01795936, + "epoch": 0.1721816083108888, + "flos": 1457529914880.0, + "grad_norm": 0.013728573618451478, + "language_loss": 0.77994668, + "learning_rate": 0.0009479241968111421, + "loss": 0.79027796, + "num_input_tokens_seen": 74093376, + "router_z_loss_mlp": 0.15136719, + "step": 895, + "time_per_iteration": 4.7525646686553955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132534, + "balance_loss_mlp": 1.09834456, + "epoch": 0.17237398999615236, + "flos": 527853030912.0, + "grad_norm": 0.05821127752563967, + "language_loss": 0.87793648, + "learning_rate": 0.0009477856729834196, + "loss": 0.88926184, + "num_input_tokens_seen": 74169136, + "router_z_loss_mlp": 0.34228516, + "step": 896, + "time_per_iteration": 2.7015438079833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132108, + "balance_loss_mlp": 1.09901524, + "epoch": 0.17256637168141592, + "flos": 603644562432.0, + "grad_norm": 0.08337200045302615, + "language_loss": 0.9056648, + "learning_rate": 0.0009476469753098809, + "loss": 0.91698587, + "num_input_tokens_seen": 74236912, + "router_z_loss_mlp": 0.33105469, + "step": 897, + "time_per_iteration": 2.7035813331604004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125108, + "balance_loss_mlp": 1.08922589, + "epoch": 0.17275875336667948, + "flos": 509437334016.0, + "grad_norm": 0.05742024530278536, + "language_loss": 0.874506, + "learning_rate": 0.0009475081038443738, + "loss": 0.88575709, + "num_input_tokens_seen": 74305968, + "router_z_loss_mlp": 0.35913086, + "step": 898, + "time_per_iteration": 2.584437370300293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115669, + "balance_loss_mlp": 1.07971573, + "epoch": 0.17295113505194307, + "flos": 664951693824.0, + "grad_norm": 0.06535241228499304, + "language_loss": 0.85809892, + "learning_rate": 0.0009473690586408124, + "loss": 0.86925566, + "num_input_tokens_seen": 74384144, + "router_z_loss_mlp": 0.35986328, + "step": 899, + "time_per_iteration": 2.83156418800354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116393, + "balance_loss_mlp": 1.08084452, + "epoch": 0.17314351673720663, + "flos": 555125253120.0, + "grad_norm": 0.0683413775827569, + "language_loss": 0.86639923, + "learning_rate": 0.0009472298397531792, + "loss": 0.87756318, + "num_input_tokens_seen": 74455040, + "router_z_loss_mlp": 0.35546875, + "step": 900, + "time_per_iteration": 2.6944193840026855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123337, + "balance_loss_mlp": 1.08635855, + "epoch": 0.17333589842247019, + "flos": 503361326592.0, + "grad_norm": 0.09670394547256775, + "language_loss": 0.87118709, + "learning_rate": 0.0009470904472355235, + "loss": 0.88242042, + "num_input_tokens_seen": 74525248, + "router_z_loss_mlp": 0.36987305, + "step": 901, + "time_per_iteration": 2.637882709503174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114178, + "balance_loss_mlp": 1.07982159, + "epoch": 0.17352828010773375, + "flos": 555924003840.0, + "grad_norm": 0.06358596699153923, + "language_loss": 0.79912066, + "learning_rate": 0.0009469508811419626, + "loss": 0.81026244, + "num_input_tokens_seen": 74597328, + "router_z_loss_mlp": 0.34399414, + "step": 902, + "time_per_iteration": 2.726072311401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077408, + "balance_loss_mlp": 1.06453359, + "epoch": 0.1737206617929973, + "flos": 1553711556096.0, + "grad_norm": 0.030950293127884103, + "language_loss": 0.7161383, + "learning_rate": 0.0009468111415266806, + "loss": 0.72691238, + "num_input_tokens_seen": 74819664, + "router_z_loss_mlp": 0.12890625, + "step": 903, + "time_per_iteration": 4.791790723800659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109546, + "balance_loss_mlp": 1.07445073, + "epoch": 0.17391304347826086, + "flos": 516390667776.0, + "grad_norm": 0.06883251868009001, + "language_loss": 0.84220147, + "learning_rate": 0.0009466712284439292, + "loss": 0.85329694, + "num_input_tokens_seen": 74896224, + "router_z_loss_mlp": 0.35131836, + "step": 904, + "time_per_iteration": 2.7648017406463623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104504, + "balance_loss_mlp": 1.06995738, + "epoch": 0.17410542516352442, + "flos": 540773273088.0, + "grad_norm": 0.06988851938988141, + "language_loss": 0.8903957, + "learning_rate": 0.0009465311419480276, + "loss": 0.90144074, + "num_input_tokens_seen": 74966560, + "router_z_loss_mlp": 0.34545898, + "step": 905, + "time_per_iteration": 2.725829601287842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098399, + "balance_loss_mlp": 1.0637325, + "epoch": 0.17429780684878798, + "flos": 623542869504.0, + "grad_norm": 0.06312030659776342, + "language_loss": 0.88624233, + "learning_rate": 0.0009463908820933622, + "loss": 0.89722633, + "num_input_tokens_seen": 75045248, + "router_z_loss_mlp": 0.34692383, + "step": 906, + "time_per_iteration": 2.8389482498168945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093859, + "balance_loss_mlp": 1.05900264, + "epoch": 0.17449018853405157, + "flos": 575368386048.0, + "grad_norm": 0.056066721215551084, + "language_loss": 0.83138871, + "learning_rate": 0.0009462504489343868, + "loss": 0.84232736, + "num_input_tokens_seen": 75123952, + "router_z_loss_mlp": 0.34863281, + "step": 907, + "time_per_iteration": 2.863349437713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086469, + "balance_loss_mlp": 1.05199337, + "epoch": 0.17468257021931513, + "flos": 533499844608.0, + "grad_norm": 0.07604190500703253, + "language_loss": 0.894853, + "learning_rate": 0.0009461098425256222, + "loss": 0.90571761, + "num_input_tokens_seen": 75191728, + "router_z_loss_mlp": 0.3449707, + "step": 908, + "time_per_iteration": 2.5941011905670166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108834, + "balance_loss_mlp": 1.05345941, + "epoch": 0.1748749519045787, + "flos": 540496848384.0, + "grad_norm": 0.050694136543679796, + "language_loss": 0.85873353, + "learning_rate": 0.0009459690629216567, + "loss": 0.86961693, + "num_input_tokens_seen": 75262224, + "router_z_loss_mlp": 0.34887695, + "step": 909, + "time_per_iteration": 2.6097571849823 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109032, + "balance_loss_mlp": 1.0570612, + "epoch": 0.17506733358984225, + "flos": 498373641216.0, + "grad_norm": 0.0569262349138849, + "language_loss": 0.88138729, + "learning_rate": 0.0009458281101771457, + "loss": 0.89229047, + "num_input_tokens_seen": 75329760, + "router_z_loss_mlp": 0.33276367, + "step": 910, + "time_per_iteration": 2.5904784202575684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091588, + "balance_loss_mlp": 1.05744696, + "epoch": 0.1752597152751058, + "flos": 622621873152.0, + "grad_norm": 0.06350455217589325, + "language_loss": 0.8266046, + "learning_rate": 0.0009456869843468122, + "loss": 0.83752048, + "num_input_tokens_seen": 75407920, + "router_z_loss_mlp": 0.34179688, + "step": 911, + "time_per_iteration": 2.8930556774139404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090023, + "balance_loss_mlp": 1.05476046, + "epoch": 0.17545209696036937, + "flos": 520717814784.0, + "grad_norm": 0.07844481886152296, + "language_loss": 0.79097009, + "learning_rate": 0.0009455456854854459, + "loss": 0.80187035, + "num_input_tokens_seen": 75476752, + "router_z_loss_mlp": 0.35302734, + "step": 912, + "time_per_iteration": 2.5984511375427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096028, + "balance_loss_mlp": 1.0631026, + "epoch": 0.17564447864563293, + "flos": 461750270976.0, + "grad_norm": 0.05516798292623818, + "language_loss": 0.84505737, + "learning_rate": 0.0009454042136479039, + "loss": 0.85601771, + "num_input_tokens_seen": 75542944, + "router_z_loss_mlp": 0.3293457, + "step": 913, + "time_per_iteration": 2.586195945739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085981, + "balance_loss_mlp": 1.05286503, + "epoch": 0.1758368603308965, + "flos": 480416251392.0, + "grad_norm": 0.05301404729603274, + "language_loss": 0.83308446, + "learning_rate": 0.0009452625688891103, + "loss": 0.84394431, + "num_input_tokens_seen": 75609840, + "router_z_loss_mlp": 0.33129883, + "step": 914, + "time_per_iteration": 2.5374929904937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052517, + "balance_loss_mlp": 1.038975, + "epoch": 0.17602924201616005, + "flos": 1478160133632.0, + "grad_norm": 0.03507986977902886, + "language_loss": 0.78734738, + "learning_rate": 0.0009451207512640567, + "loss": 0.79787254, + "num_input_tokens_seen": 75819312, + "router_z_loss_mlp": 0.13574219, + "step": 915, + "time_per_iteration": 4.5561583042144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097389, + "balance_loss_mlp": 1.06226993, + "epoch": 0.17622162370142364, + "flos": 602010593280.0, + "grad_norm": 0.06815502965849334, + "language_loss": 0.93451321, + "learning_rate": 0.0009449787608278015, + "loss": 0.94548714, + "num_input_tokens_seen": 75893984, + "router_z_loss_mlp": 0.35131836, + "step": 916, + "time_per_iteration": 2.7807908058166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092918, + "balance_loss_mlp": 1.0588007, + "epoch": 0.1764140053866872, + "flos": 442473214464.0, + "grad_norm": 0.0637680644109211, + "language_loss": 0.92700857, + "learning_rate": 0.0009448365976354704, + "loss": 0.93793774, + "num_input_tokens_seen": 75958944, + "router_z_loss_mlp": 0.34130859, + "step": 917, + "time_per_iteration": 2.4800124168395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105643, + "balance_loss_mlp": 1.06909323, + "epoch": 0.17660638707195075, + "flos": 500362610688.0, + "grad_norm": 0.07080486598597346, + "language_loss": 0.90158784, + "learning_rate": 0.0009446942617422558, + "loss": 0.91264427, + "num_input_tokens_seen": 76024240, + "router_z_loss_mlp": 0.36547852, + "step": 918, + "time_per_iteration": 2.5415430068969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101469, + "balance_loss_mlp": 1.06766129, + "epoch": 0.17679876875721431, + "flos": 538621360128.0, + "grad_norm": 0.060000223973742446, + "language_loss": 0.86201262, + "learning_rate": 0.0009445517532034176, + "loss": 0.87302732, + "num_input_tokens_seen": 76095264, + "router_z_loss_mlp": 0.33789062, + "step": 919, + "time_per_iteration": 2.6849868297576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121669, + "balance_loss_mlp": 1.08569145, + "epoch": 0.17699115044247787, + "flos": 497477376000.0, + "grad_norm": 0.08221632690768264, + "language_loss": 0.89522099, + "learning_rate": 0.0009444090720742824, + "loss": 0.9064377, + "num_input_tokens_seen": 76163520, + "router_z_loss_mlp": 0.35986328, + "step": 920, + "time_per_iteration": 2.6034600734710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113572, + "balance_loss_mlp": 1.07883418, + "epoch": 0.17718353212774143, + "flos": 662444780544.0, + "grad_norm": 0.08029288241638204, + "language_loss": 0.88040781, + "learning_rate": 0.0009442662184102439, + "loss": 0.89154357, + "num_input_tokens_seen": 76233760, + "router_z_loss_mlp": 0.34741211, + "step": 921, + "time_per_iteration": 2.767352342605591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105947, + "balance_loss_mlp": 1.07309294, + "epoch": 0.177375913813005, + "flos": 582340658688.0, + "grad_norm": 0.0705507668945597, + "language_loss": 0.87951338, + "learning_rate": 0.000944123192266763, + "loss": 0.89057279, + "num_input_tokens_seen": 76310704, + "router_z_loss_mlp": 0.32836914, + "step": 922, + "time_per_iteration": 2.789315700531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108289, + "balance_loss_mlp": 1.0727644, + "epoch": 0.17756829549826855, + "flos": 552285098496.0, + "grad_norm": 0.06115562628552814, + "language_loss": 0.83835006, + "learning_rate": 0.0009439799936993671, + "loss": 0.84943295, + "num_input_tokens_seen": 76386992, + "router_z_loss_mlp": 0.35546875, + "step": 923, + "time_per_iteration": 2.7160987854003906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090669, + "balance_loss_mlp": 1.05733824, + "epoch": 0.17776067718353214, + "flos": 556060806144.0, + "grad_norm": 0.07059184324253498, + "language_loss": 0.88508618, + "learning_rate": 0.0009438366227636511, + "loss": 0.89599288, + "num_input_tokens_seen": 76453328, + "router_z_loss_mlp": 0.33349609, + "step": 924, + "time_per_iteration": 2.6319191455841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084897, + "balance_loss_mlp": 1.05163789, + "epoch": 0.1779530588687957, + "flos": 658161303552.0, + "grad_norm": 0.06263940487075517, + "language_loss": 0.86677843, + "learning_rate": 0.0009436930795152763, + "loss": 0.87762737, + "num_input_tokens_seen": 76529040, + "router_z_loss_mlp": 0.33276367, + "step": 925, + "time_per_iteration": 2.8063783645629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084326, + "balance_loss_mlp": 1.05159163, + "epoch": 0.17814544055405926, + "flos": 644187644928.0, + "grad_norm": 0.06448697412821461, + "language_loss": 0.8710525, + "learning_rate": 0.0009435493640099713, + "loss": 0.88189578, + "num_input_tokens_seen": 76604080, + "router_z_loss_mlp": 0.32739258, + "step": 926, + "time_per_iteration": 2.7599081993103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080787, + "balance_loss_mlp": 1.04664516, + "epoch": 0.17833782223932282, + "flos": 460672123392.0, + "grad_norm": 0.06497730431564504, + "language_loss": 0.84328961, + "learning_rate": 0.0009434054763035314, + "loss": 0.85409749, + "num_input_tokens_seen": 76674096, + "router_z_loss_mlp": 0.34155273, + "step": 927, + "time_per_iteration": 2.612910032272339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081381, + "balance_loss_mlp": 1.04740596, + "epoch": 0.17853020392458638, + "flos": 759212766720.0, + "grad_norm": 0.04594292129212818, + "language_loss": 0.85898727, + "learning_rate": 0.0009432614164518185, + "loss": 0.8698011, + "num_input_tokens_seen": 76752144, + "router_z_loss_mlp": 0.33984375, + "step": 928, + "time_per_iteration": 2.926981210708618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086846, + "balance_loss_mlp": 1.05153632, + "epoch": 0.17872258560984994, + "flos": 782320785408.0, + "grad_norm": 0.055185850673896385, + "language_loss": 0.84792197, + "learning_rate": 0.000943117184510762, + "loss": 0.85879046, + "num_input_tokens_seen": 76830240, + "router_z_loss_mlp": 0.35327148, + "step": 929, + "time_per_iteration": 2.995514154434204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039861, + "balance_loss_mlp": 1.02660513, + "epoch": 0.1789149672951135, + "flos": 1459095482880.0, + "grad_norm": 0.021362691821678215, + "language_loss": 0.78789961, + "learning_rate": 0.0009429727805363575, + "loss": 0.79829824, + "num_input_tokens_seen": 77062464, + "router_z_loss_mlp": 0.1328125, + "step": 930, + "time_per_iteration": 4.99839448928833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091678, + "balance_loss_mlp": 1.05739331, + "epoch": 0.17910734898037706, + "flos": 503598463488.0, + "grad_norm": 0.05761618473313655, + "language_loss": 0.88773429, + "learning_rate": 0.0009428282045846674, + "loss": 0.89865112, + "num_input_tokens_seen": 77136672, + "router_z_loss_mlp": 0.34301758, + "step": 931, + "time_per_iteration": 2.6966652870178223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087828, + "balance_loss_mlp": 1.05452061, + "epoch": 0.17929973066564064, + "flos": 745895264256.0, + "grad_norm": 0.05798282919409206, + "language_loss": 0.89983928, + "learning_rate": 0.0009426834567118214, + "loss": 0.91071755, + "num_input_tokens_seen": 77227040, + "router_z_loss_mlp": 0.33300781, + "step": 932, + "time_per_iteration": 3.072160482406616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092765, + "balance_loss_mlp": 1.05907631, + "epoch": 0.1794921123509042, + "flos": 712875893760.0, + "grad_norm": 0.055390897890994044, + "language_loss": 0.80879378, + "learning_rate": 0.0009425385369740155, + "loss": 0.81972146, + "num_input_tokens_seen": 77319392, + "router_z_loss_mlp": 0.3371582, + "step": 933, + "time_per_iteration": 3.0337042808532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092731, + "balance_loss_mlp": 1.05825567, + "epoch": 0.17968449403616776, + "flos": 632838763008.0, + "grad_norm": 0.0687685702394307, + "language_loss": 0.87443584, + "learning_rate": 0.0009423934454275125, + "loss": 0.8853631, + "num_input_tokens_seen": 77394688, + "router_z_loss_mlp": 0.3449707, + "step": 934, + "time_per_iteration": 2.7970879077911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089195, + "balance_loss_mlp": 1.05526757, + "epoch": 0.17987687572143132, + "flos": 536060602368.0, + "grad_norm": 0.08214865293258214, + "language_loss": 0.92215371, + "learning_rate": 0.0009422481821286418, + "loss": 0.93304563, + "num_input_tokens_seen": 77468288, + "router_z_loss_mlp": 0.33959961, + "step": 935, + "time_per_iteration": 2.7134642601013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091708, + "balance_loss_mlp": 1.05914021, + "epoch": 0.18006925740669488, + "flos": 537818227200.0, + "grad_norm": 0.0718764173736199, + "language_loss": 0.87967253, + "learning_rate": 0.0009421027471337998, + "loss": 0.89058959, + "num_input_tokens_seen": 77535840, + "router_z_loss_mlp": 0.32568359, + "step": 936, + "time_per_iteration": 2.608764171600342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098474, + "balance_loss_mlp": 1.06333113, + "epoch": 0.18026163909195844, + "flos": 539255757312.0, + "grad_norm": 0.06697051800305152, + "language_loss": 0.82882118, + "learning_rate": 0.0009419571404994493, + "loss": 0.83980596, + "num_input_tokens_seen": 77604000, + "router_z_loss_mlp": 0.3515625, + "step": 937, + "time_per_iteration": 2.620296001434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096306, + "balance_loss_mlp": 1.06240284, + "epoch": 0.180454020777222, + "flos": 500382959616.0, + "grad_norm": 0.08555714620461663, + "language_loss": 0.90948844, + "learning_rate": 0.00094181136228212, + "loss": 0.92045152, + "num_input_tokens_seen": 77671488, + "router_z_loss_mlp": 0.33935547, + "step": 938, + "time_per_iteration": 2.62837290763855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109732, + "balance_loss_mlp": 1.06415629, + "epoch": 0.18064640246248556, + "flos": 498689353728.0, + "grad_norm": 0.06983123921060745, + "language_loss": 0.86323059, + "learning_rate": 0.0009416654125384077, + "loss": 0.8742038, + "num_input_tokens_seen": 77746240, + "router_z_loss_mlp": 0.33154297, + "step": 939, + "time_per_iteration": 2.715686321258545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054723, + "balance_loss_mlp": 1.04242051, + "epoch": 0.18083878414774912, + "flos": 1518572358144.0, + "grad_norm": 0.027679884047562747, + "language_loss": 0.79772377, + "learning_rate": 0.0009415192913249752, + "loss": 0.80827093, + "num_input_tokens_seen": 77966080, + "router_z_loss_mlp": 0.12304688, + "step": 940, + "time_per_iteration": 4.941875219345093 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090642, + "balance_loss_mlp": 1.05728722, + "epoch": 0.1810311658330127, + "flos": 727006703616.0, + "grad_norm": 0.07011009980003599, + "language_loss": 0.84326053, + "learning_rate": 0.000941372998698552, + "loss": 0.85416698, + "num_input_tokens_seen": 78049200, + "router_z_loss_mlp": 0.33374023, + "step": 941, + "time_per_iteration": 2.931520938873291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094264, + "balance_loss_mlp": 1.0597409, + "epoch": 0.18122354751827627, + "flos": 564643726848.0, + "grad_norm": 0.08254502738164117, + "language_loss": 0.8207435, + "learning_rate": 0.0009412265347159336, + "loss": 0.8316862, + "num_input_tokens_seen": 78122752, + "router_z_loss_mlp": 0.34570312, + "step": 942, + "time_per_iteration": 2.696354627609253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091238, + "balance_loss_mlp": 1.05869377, + "epoch": 0.18141592920353983, + "flos": 519024208896.0, + "grad_norm": 0.05729066672306875, + "language_loss": 0.85217965, + "learning_rate": 0.0009410798994339829, + "loss": 0.86309201, + "num_input_tokens_seen": 78194064, + "router_z_loss_mlp": 0.32543945, + "step": 943, + "time_per_iteration": 2.6009600162506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088013, + "balance_loss_mlp": 1.0545156, + "epoch": 0.1816083108888034, + "flos": 512219261952.0, + "grad_norm": 0.05342615519744699, + "language_loss": 0.88234782, + "learning_rate": 0.000940933092909628, + "loss": 0.89322793, + "num_input_tokens_seen": 78262048, + "router_z_loss_mlp": 0.33520508, + "step": 944, + "time_per_iteration": 2.618419647216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095526, + "balance_loss_mlp": 1.06286263, + "epoch": 0.18180069257406695, + "flos": 492144864768.0, + "grad_norm": 0.053227732023653135, + "language_loss": 0.8393383, + "learning_rate": 0.0009407861151998649, + "loss": 0.85029352, + "num_input_tokens_seen": 78330624, + "router_z_loss_mlp": 0.32666016, + "step": 945, + "time_per_iteration": 2.5718705654144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097774, + "balance_loss_mlp": 1.06406188, + "epoch": 0.1819930742593305, + "flos": 569891870208.0, + "grad_norm": 0.05775782434029923, + "language_loss": 0.86156505, + "learning_rate": 0.0009406389663617552, + "loss": 0.87254274, + "num_input_tokens_seen": 78400672, + "router_z_loss_mlp": 0.33740234, + "step": 946, + "time_per_iteration": 2.66513729095459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097522, + "balance_loss_mlp": 1.06433463, + "epoch": 0.18218545594459407, + "flos": 605693168640.0, + "grad_norm": 0.06350431386522506, + "language_loss": 0.85736459, + "learning_rate": 0.000940491646452427, + "loss": 0.86833978, + "num_input_tokens_seen": 78467952, + "router_z_loss_mlp": 0.33203125, + "step": 947, + "time_per_iteration": 2.715071201324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103916, + "balance_loss_mlp": 1.07010818, + "epoch": 0.18237783762985763, + "flos": 548419230720.0, + "grad_norm": 0.06277969821047595, + "language_loss": 0.91195452, + "learning_rate": 0.000940344155529075, + "loss": 0.92299366, + "num_input_tokens_seen": 78538928, + "router_z_loss_mlp": 0.33837891, + "step": 948, + "time_per_iteration": 2.6502938270568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099574, + "balance_loss_mlp": 1.06550407, + "epoch": 0.1825702193151212, + "flos": 450509078016.0, + "grad_norm": 0.06933176029299125, + "language_loss": 0.87683523, + "learning_rate": 0.0009401964936489605, + "loss": 0.88783091, + "num_input_tokens_seen": 78602144, + "router_z_loss_mlp": 0.34106445, + "step": 949, + "time_per_iteration": 2.5181798934936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084135, + "balance_loss_mlp": 1.05247355, + "epoch": 0.18276260100038477, + "flos": 588962313216.0, + "grad_norm": 0.07980064544074586, + "language_loss": 0.85422635, + "learning_rate": 0.0009400486608694108, + "loss": 0.86506772, + "num_input_tokens_seen": 78673152, + "router_z_loss_mlp": 0.31640625, + "step": 950, + "time_per_iteration": 2.7189955711364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087871, + "balance_loss_mlp": 1.05384839, + "epoch": 0.18295498268564833, + "flos": 786988376064.0, + "grad_norm": 0.05265351460276348, + "language_loss": 0.87225658, + "learning_rate": 0.0009399006572478195, + "loss": 0.88313532, + "num_input_tokens_seen": 78753872, + "router_z_loss_mlp": 0.34033203, + "step": 951, + "time_per_iteration": 3.0805773735046387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086089, + "balance_loss_mlp": 1.05218577, + "epoch": 0.1831473643709119, + "flos": 577878271488.0, + "grad_norm": 0.059447924131550096, + "language_loss": 0.91242015, + "learning_rate": 0.0009397524828416468, + "loss": 0.92328107, + "num_input_tokens_seen": 78822640, + "router_z_loss_mlp": 0.33935547, + "step": 952, + "time_per_iteration": 2.6567108631134033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082155, + "balance_loss_mlp": 1.04801321, + "epoch": 0.18333974605617545, + "flos": 566622521856.0, + "grad_norm": 0.05513512337372911, + "language_loss": 0.96184212, + "learning_rate": 0.0009396041377084192, + "loss": 0.97266364, + "num_input_tokens_seen": 78893792, + "router_z_loss_mlp": 0.34179688, + "step": 953, + "time_per_iteration": 2.6937921047210693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079403, + "balance_loss_mlp": 1.04478431, + "epoch": 0.183532127741439, + "flos": 526725421056.0, + "grad_norm": 0.07204875194033089, + "language_loss": 0.87840325, + "learning_rate": 0.0009394556219057295, + "loss": 0.88919723, + "num_input_tokens_seen": 78964752, + "router_z_loss_mlp": 0.34667969, + "step": 954, + "time_per_iteration": 2.6962215900421143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107777, + "balance_loss_mlp": 1.04272258, + "epoch": 0.18372450942670257, + "flos": 594259918848.0, + "grad_norm": 0.07227161235955501, + "language_loss": 0.83883446, + "learning_rate": 0.0009393069354912362, + "loss": 0.84961212, + "num_input_tokens_seen": 79034400, + "router_z_loss_mlp": 0.35058594, + "step": 955, + "time_per_iteration": 2.718308925628662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081248, + "balance_loss_mlp": 1.04677236, + "epoch": 0.18391689111196613, + "flos": 644720145408.0, + "grad_norm": 0.07091738302891186, + "language_loss": 0.82511717, + "learning_rate": 0.0009391580785226649, + "loss": 0.83592963, + "num_input_tokens_seen": 79109488, + "router_z_loss_mlp": 0.34521484, + "step": 956, + "time_per_iteration": 2.907367467880249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077991, + "balance_loss_mlp": 1.06216049, + "epoch": 0.18410927279722972, + "flos": 1456246563840.0, + "grad_norm": 0.048423099914415325, + "language_loss": 0.79340446, + "learning_rate": 0.0009390090510578067, + "loss": 0.80418444, + "num_input_tokens_seen": 79327712, + "router_z_loss_mlp": 0.15820312, + "step": 957, + "time_per_iteration": 4.78663969039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091525, + "balance_loss_mlp": 1.05702567, + "epoch": 0.18430165448249328, + "flos": 658437728256.0, + "grad_norm": 0.09319397884021513, + "language_loss": 0.86484683, + "learning_rate": 0.0009388598531545196, + "loss": 0.8757621, + "num_input_tokens_seen": 79401504, + "router_z_loss_mlp": 0.34545898, + "step": 958, + "time_per_iteration": 2.8470118045806885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087326, + "balance_loss_mlp": 1.05285025, + "epoch": 0.18449403616775684, + "flos": 517679811072.0, + "grad_norm": 0.07377492103556435, + "language_loss": 0.86076611, + "learning_rate": 0.000938710484870727, + "loss": 0.87163937, + "num_input_tokens_seen": 79466688, + "router_z_loss_mlp": 0.3449707, + "step": 959, + "time_per_iteration": 2.5930731296539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090986, + "balance_loss_mlp": 1.05672574, + "epoch": 0.1846864178530204, + "flos": 552481537536.0, + "grad_norm": 0.06589557505977534, + "language_loss": 0.86379164, + "learning_rate": 0.0009385609462644189, + "loss": 0.8747015, + "num_input_tokens_seen": 79540288, + "router_z_loss_mlp": 0.34277344, + "step": 960, + "time_per_iteration": 2.688706636428833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096456, + "balance_loss_mlp": 1.06212378, + "epoch": 0.18487879953828396, + "flos": 465930441216.0, + "grad_norm": 0.0643439417949763, + "language_loss": 0.86035949, + "learning_rate": 0.0009384112373936514, + "loss": 0.871324, + "num_input_tokens_seen": 79611872, + "router_z_loss_mlp": 0.34326172, + "step": 961, + "time_per_iteration": 4.0801496505737305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095895, + "balance_loss_mlp": 1.06132412, + "epoch": 0.18507118122354752, + "flos": 648200489472.0, + "grad_norm": 0.0614591664996872, + "language_loss": 0.91820455, + "learning_rate": 0.0009382613583165467, + "loss": 0.92916346, + "num_input_tokens_seen": 79689504, + "router_z_loss_mlp": 0.34594727, + "step": 962, + "time_per_iteration": 2.790069341659546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113098, + "balance_loss_mlp": 1.07921863, + "epoch": 0.18526356290881107, + "flos": 626486330880.0, + "grad_norm": 0.06374556186760763, + "language_loss": 0.89594233, + "learning_rate": 0.0009381113090912928, + "loss": 0.90707326, + "num_input_tokens_seen": 79759264, + "router_z_loss_mlp": 0.33886719, + "step": 963, + "time_per_iteration": 2.6891098022460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117951, + "balance_loss_mlp": 1.08559799, + "epoch": 0.18545594459407463, + "flos": 432497843712.0, + "grad_norm": 0.06491910119233056, + "language_loss": 0.90103394, + "learning_rate": 0.000937961089776144, + "loss": 0.91221344, + "num_input_tokens_seen": 79824464, + "router_z_loss_mlp": 0.32348633, + "step": 964, + "time_per_iteration": 2.5821962356567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124554, + "balance_loss_mlp": 1.08926833, + "epoch": 0.1856483262793382, + "flos": 748720862208.0, + "grad_norm": 0.06849062336391444, + "language_loss": 0.829036, + "learning_rate": 0.0009378107004294208, + "loss": 0.84028149, + "num_input_tokens_seen": 79907152, + "router_z_loss_mlp": 0.35302734, + "step": 965, + "time_per_iteration": 2.9898061752319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115255, + "balance_loss_mlp": 1.081972, + "epoch": 0.18584070796460178, + "flos": 530058788352.0, + "grad_norm": 0.08647217477609576, + "language_loss": 0.91352308, + "learning_rate": 0.0009376601411095096, + "loss": 0.92467564, + "num_input_tokens_seen": 79976944, + "router_z_loss_mlp": 0.33300781, + "step": 966, + "time_per_iteration": 2.6415059566497803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093436, + "balance_loss_mlp": 1.06196475, + "epoch": 0.18603308964986534, + "flos": 482863527936.0, + "grad_norm": 0.05783783242438048, + "language_loss": 0.8708145, + "learning_rate": 0.0009375094118748622, + "loss": 0.88174886, + "num_input_tokens_seen": 80042112, + "router_z_loss_mlp": 0.31445312, + "step": 967, + "time_per_iteration": 2.5149550437927246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089094, + "balance_loss_mlp": 1.05650234, + "epoch": 0.1862254713351289, + "flos": 800976591360.0, + "grad_norm": 0.0756042683078202, + "language_loss": 0.9083451, + "learning_rate": 0.0009373585127839976, + "loss": 0.91923606, + "num_input_tokens_seen": 80118896, + "router_z_loss_mlp": 0.32592773, + "step": 968, + "time_per_iteration": 2.9569485187530518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084434, + "balance_loss_mlp": 1.05250978, + "epoch": 0.18641785302039246, + "flos": 478082456064.0, + "grad_norm": 0.06160067145414361, + "language_loss": 0.91074634, + "learning_rate": 0.0009372074438954994, + "loss": 0.92159069, + "num_input_tokens_seen": 80183360, + "router_z_loss_mlp": 0.3190918, + "step": 969, + "time_per_iteration": 2.508530378341675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083542, + "balance_loss_mlp": 1.05040169, + "epoch": 0.18661023470565602, + "flos": 388695587328.0, + "grad_norm": 0.07517959095695621, + "language_loss": 0.91676056, + "learning_rate": 0.0009370562052680181, + "loss": 0.92759597, + "num_input_tokens_seen": 80247024, + "router_z_loss_mlp": 0.33154297, + "step": 970, + "time_per_iteration": 2.4572672843933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087332, + "balance_loss_mlp": 1.05400109, + "epoch": 0.18680261639091958, + "flos": 564402207744.0, + "grad_norm": 0.052448577146131624, + "language_loss": 0.89610398, + "learning_rate": 0.0009369047969602695, + "loss": 0.90697736, + "num_input_tokens_seen": 80318256, + "router_z_loss_mlp": 0.33349609, + "step": 971, + "time_per_iteration": 2.714704751968384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101064, + "balance_loss_mlp": 1.06556404, + "epoch": 0.18699499807618314, + "flos": 479018009088.0, + "grad_norm": 0.06595213007116614, + "language_loss": 0.8674072, + "learning_rate": 0.0009367532190310357, + "loss": 0.87841785, + "num_input_tokens_seen": 80384848, + "router_z_loss_mlp": 0.35498047, + "step": 972, + "time_per_iteration": 2.589667558670044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111914, + "balance_loss_mlp": 1.07660413, + "epoch": 0.1871873797614467, + "flos": 553022802432.0, + "grad_norm": 0.0720295199384638, + "language_loss": 0.88701892, + "learning_rate": 0.0009366014715391644, + "loss": 0.89813805, + "num_input_tokens_seen": 80453088, + "router_z_loss_mlp": 0.35327148, + "step": 973, + "time_per_iteration": 2.634023904800415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107724, + "balance_loss_mlp": 1.07389259, + "epoch": 0.18737976144671029, + "flos": 552526617600.0, + "grad_norm": 0.05153911900793568, + "language_loss": 0.8432554, + "learning_rate": 0.0009364495545435693, + "loss": 0.85433269, + "num_input_tokens_seen": 80528608, + "router_z_loss_mlp": 0.33837891, + "step": 974, + "time_per_iteration": 2.7729458808898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107099, + "balance_loss_mlp": 1.07281494, + "epoch": 0.18757214313197385, + "flos": 502002372096.0, + "grad_norm": 0.05815108638233015, + "language_loss": 0.88620323, + "learning_rate": 0.0009362974681032297, + "loss": 0.8972742, + "num_input_tokens_seen": 80599600, + "router_z_loss_mlp": 0.34326172, + "step": 975, + "time_per_iteration": 2.631744623184204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105498, + "balance_loss_mlp": 1.07130909, + "epoch": 0.1877645248172374, + "flos": 674691337728.0, + "grad_norm": 0.06841603134690444, + "language_loss": 0.88265896, + "learning_rate": 0.0009361452122771907, + "loss": 0.89371395, + "num_input_tokens_seen": 80677264, + "router_z_loss_mlp": 0.34204102, + "step": 976, + "time_per_iteration": 2.8427281379699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094167, + "balance_loss_mlp": 1.06012082, + "epoch": 0.18795690650250096, + "flos": 404771696640.0, + "grad_norm": 0.07319435948671522, + "language_loss": 0.8377496, + "learning_rate": 0.0009359927871245635, + "loss": 0.84869128, + "num_input_tokens_seen": 80739776, + "router_z_loss_mlp": 0.34057617, + "step": 977, + "time_per_iteration": 2.4665186405181885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090091, + "balance_loss_mlp": 1.0565697, + "epoch": 0.18814928818776452, + "flos": 637599485952.0, + "grad_norm": 0.05986452276683665, + "language_loss": 0.86337954, + "learning_rate": 0.0009358401927045246, + "loss": 0.87428045, + "num_input_tokens_seen": 80815200, + "router_z_loss_mlp": 0.33520508, + "step": 978, + "time_per_iteration": 2.8037781715393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090672, + "balance_loss_mlp": 1.05707908, + "epoch": 0.18834166987302808, + "flos": 1137825446400.0, + "grad_norm": 0.054509582003230646, + "language_loss": 0.88314402, + "learning_rate": 0.0009356874290763166, + "loss": 0.89405078, + "num_input_tokens_seen": 80905024, + "router_z_loss_mlp": 0.33618164, + "step": 979, + "time_per_iteration": 3.456723213195801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097981, + "balance_loss_mlp": 1.06481671, + "epoch": 0.18853405155829164, + "flos": 504538398720.0, + "grad_norm": 0.06366920756378494, + "language_loss": 0.8866874, + "learning_rate": 0.0009355344962992474, + "loss": 0.89766723, + "num_input_tokens_seen": 80976704, + "router_z_loss_mlp": 0.33154297, + "step": 980, + "time_per_iteration": 2.6105339527130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109825, + "balance_loss_mlp": 1.06494308, + "epoch": 0.1887264332435552, + "flos": 607879987200.0, + "grad_norm": 0.05130215804193928, + "language_loss": 0.88147485, + "learning_rate": 0.0009353813944326908, + "loss": 0.89245737, + "num_input_tokens_seen": 81057152, + "router_z_loss_mlp": 0.33325195, + "step": 981, + "time_per_iteration": 2.882836103439331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109809, + "balance_loss_mlp": 1.0758822, + "epoch": 0.1889188149288188, + "flos": 552264749568.0, + "grad_norm": 0.07032712681879846, + "language_loss": 0.83146608, + "learning_rate": 0.0009352281235360863, + "loss": 0.84256417, + "num_input_tokens_seen": 81131520, + "router_z_loss_mlp": 0.33959961, + "step": 982, + "time_per_iteration": 2.695748805999756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120418, + "balance_loss_mlp": 1.08775461, + "epoch": 0.18911119661408235, + "flos": 418332128256.0, + "grad_norm": 0.06033753714629359, + "language_loss": 0.84987485, + "learning_rate": 0.0009350746836689389, + "loss": 0.86107904, + "num_input_tokens_seen": 81195952, + "router_z_loss_mlp": 0.32666016, + "step": 983, + "time_per_iteration": 2.5073440074920654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260435, + "balance_loss_mlp": 1.23916793, + "epoch": 0.1893035782993459, + "flos": 1481141320704.0, + "grad_norm": 0.0731593378732656, + "language_loss": 0.81439221, + "learning_rate": 0.0009349210748908193, + "loss": 0.82699656, + "num_input_tokens_seen": 81427312, + "router_z_loss_mlp": 0.21289062, + "step": 984, + "time_per_iteration": 5.065609931945801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133244, + "balance_loss_mlp": 1.09831583, + "epoch": 0.18949595998460947, + "flos": 508220974080.0, + "grad_norm": 0.09166419018528392, + "language_loss": 0.83211792, + "learning_rate": 0.0009347672972613634, + "loss": 0.84345031, + "num_input_tokens_seen": 81494256, + "router_z_loss_mlp": 0.34936523, + "step": 985, + "time_per_iteration": 2.580009937286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115864, + "balance_loss_mlp": 1.08270001, + "epoch": 0.18968834166987303, + "flos": 530812459008.0, + "grad_norm": 0.0668772854373454, + "language_loss": 0.85875785, + "learning_rate": 0.0009346133508402735, + "loss": 0.8699165, + "num_input_tokens_seen": 81569312, + "router_z_loss_mlp": 0.33178711, + "step": 986, + "time_per_iteration": 2.6872711181640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111031, + "balance_loss_mlp": 1.07724667, + "epoch": 0.1898807233551366, + "flos": 499515807744.0, + "grad_norm": 0.11088649382938841, + "language_loss": 0.8420769, + "learning_rate": 0.0009344592356873166, + "loss": 0.8531872, + "num_input_tokens_seen": 81637024, + "router_z_loss_mlp": 0.33813477, + "step": 987, + "time_per_iteration": 2.6347994804382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098007, + "balance_loss_mlp": 1.06462848, + "epoch": 0.19007310504040015, + "flos": 601936399872.0, + "grad_norm": 0.05765681888892058, + "language_loss": 0.78527796, + "learning_rate": 0.0009343049518623255, + "loss": 0.79625803, + "num_input_tokens_seen": 81709488, + "router_z_loss_mlp": 0.33398438, + "step": 988, + "time_per_iteration": 2.696929693222046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082914, + "balance_loss_mlp": 1.05029869, + "epoch": 0.1902654867256637, + "flos": 601374786048.0, + "grad_norm": 0.05732720380572914, + "language_loss": 0.83250153, + "learning_rate": 0.0009341504994251985, + "loss": 0.84333068, + "num_input_tokens_seen": 81787152, + "router_z_loss_mlp": 0.32617188, + "step": 989, + "time_per_iteration": 2.8399016857147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095582, + "balance_loss_mlp": 1.07841623, + "epoch": 0.19045786841092727, + "flos": 1574925147648.0, + "grad_norm": 0.03888561388969961, + "language_loss": 0.73520499, + "learning_rate": 0.0009339958784358994, + "loss": 0.74616081, + "num_input_tokens_seen": 82030608, + "router_z_loss_mlp": 0.171875, + "step": 990, + "time_per_iteration": 5.072636842727661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109747, + "balance_loss_mlp": 1.06394839, + "epoch": 0.19065025009619085, + "flos": 681280906752.0, + "grad_norm": 0.135211113906906, + "language_loss": 0.818295, + "learning_rate": 0.0009338410889544574, + "loss": 0.82926977, + "num_input_tokens_seen": 82119872, + "router_z_loss_mlp": 0.33544922, + "step": 991, + "time_per_iteration": 3.050665855407715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101867, + "balance_loss_mlp": 1.06786811, + "epoch": 0.1908426317814544, + "flos": 601971305472.0, + "grad_norm": 0.06286082016671143, + "language_loss": 0.87738663, + "learning_rate": 0.000933686131040967, + "loss": 0.88840532, + "num_input_tokens_seen": 82195552, + "router_z_loss_mlp": 0.34033203, + "step": 992, + "time_per_iteration": 2.7589659690856934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089273, + "balance_loss_mlp": 1.05672884, + "epoch": 0.19103501346671797, + "flos": 586027616256.0, + "grad_norm": 0.0561482479745879, + "language_loss": 0.90427077, + "learning_rate": 0.0009335310047555883, + "loss": 0.91516346, + "num_input_tokens_seen": 82267040, + "router_z_loss_mlp": 0.32543945, + "step": 993, + "time_per_iteration": 2.7133467197418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108709, + "balance_loss_mlp": 1.0532825, + "epoch": 0.19122739515198153, + "flos": 545494708224.0, + "grad_norm": 0.06221036652136981, + "language_loss": 0.88114065, + "learning_rate": 0.0009333757101585467, + "loss": 0.89201152, + "num_input_tokens_seen": 82337680, + "router_z_loss_mlp": 0.33837891, + "step": 994, + "time_per_iteration": 2.6733241081237793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083527, + "balance_loss_mlp": 1.05105424, + "epoch": 0.1914197768372451, + "flos": 521171739648.0, + "grad_norm": 0.05606370206634765, + "language_loss": 0.93617988, + "learning_rate": 0.0009332202473101329, + "loss": 0.94701517, + "num_input_tokens_seen": 82409600, + "router_z_loss_mlp": 0.32470703, + "step": 995, + "time_per_iteration": 2.6689558029174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079312, + "balance_loss_mlp": 1.04536152, + "epoch": 0.19161215852250865, + "flos": 610961660928.0, + "grad_norm": 0.05986652691328414, + "language_loss": 0.83121806, + "learning_rate": 0.0009330646162707028, + "loss": 0.84201121, + "num_input_tokens_seen": 82480288, + "router_z_loss_mlp": 0.33984375, + "step": 996, + "time_per_iteration": 2.7264511585235596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081823, + "balance_loss_mlp": 1.04849207, + "epoch": 0.1918045402077722, + "flos": 846281806848.0, + "grad_norm": 0.05485586532204223, + "language_loss": 0.84800065, + "learning_rate": 0.0009329088171006779, + "loss": 0.85881883, + "num_input_tokens_seen": 82568960, + "router_z_loss_mlp": 0.33349609, + "step": 997, + "time_per_iteration": 3.1486315727233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097705, + "balance_loss_mlp": 1.06220424, + "epoch": 0.19199692189303577, + "flos": 465699096576.0, + "grad_norm": 0.06540772430376247, + "language_loss": 0.84963006, + "learning_rate": 0.0009327528498605446, + "loss": 0.86060709, + "num_input_tokens_seen": 82634128, + "router_z_loss_mlp": 0.35522461, + "step": 998, + "time_per_iteration": 2.532460927963257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087828, + "balance_loss_mlp": 1.0542109, + "epoch": 0.19218930357829936, + "flos": 531318818304.0, + "grad_norm": 0.06065225266474605, + "language_loss": 0.89716202, + "learning_rate": 0.0009325967146108548, + "loss": 0.90804029, + "num_input_tokens_seen": 82707472, + "router_z_loss_mlp": 0.33642578, + "step": 999, + "time_per_iteration": 2.6381072998046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108792, + "balance_loss_mlp": 1.05334902, + "epoch": 0.19238168526356292, + "flos": 601350054912.0, + "grad_norm": 0.06318510310852068, + "language_loss": 0.87984866, + "learning_rate": 0.0009324404114122258, + "loss": 0.89072788, + "num_input_tokens_seen": 82775232, + "router_z_loss_mlp": 0.34594727, + "step": 1000, + "time_per_iteration": 2.7017252445220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088105, + "balance_loss_mlp": 1.0544883, + "epoch": 0.19257406694882648, + "flos": 571690192896.0, + "grad_norm": 0.05361295189234855, + "language_loss": 0.87132722, + "learning_rate": 0.0009322839403253397, + "loss": 0.88220823, + "num_input_tokens_seen": 82850032, + "router_z_loss_mlp": 0.33642578, + "step": 1001, + "time_per_iteration": 2.7725350856781006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091645, + "balance_loss_mlp": 1.05759907, + "epoch": 0.19276644863409004, + "flos": 801478568448.0, + "grad_norm": 0.0661765462165054, + "language_loss": 0.84038174, + "learning_rate": 0.0009321273014109439, + "loss": 0.85129815, + "num_input_tokens_seen": 82926080, + "router_z_loss_mlp": 0.34082031, + "step": 1002, + "time_per_iteration": 2.9275383949279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089997, + "balance_loss_mlp": 1.05676103, + "epoch": 0.1929588303193536, + "flos": 563024314368.0, + "grad_norm": 0.05133430998282463, + "language_loss": 0.85232604, + "learning_rate": 0.0009319704947298513, + "loss": 0.863226, + "num_input_tokens_seen": 83005200, + "router_z_loss_mlp": 0.33251953, + "step": 1003, + "time_per_iteration": 2.9198272228240967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083204, + "balance_loss_mlp": 1.05120838, + "epoch": 0.19315121200461716, + "flos": 626550349824.0, + "grad_norm": 0.04652496586479965, + "language_loss": 0.88737059, + "learning_rate": 0.0009318135203429393, + "loss": 0.8982026, + "num_input_tokens_seen": 83077280, + "router_z_loss_mlp": 0.31982422, + "step": 1004, + "time_per_iteration": 2.7145965099334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094807, + "balance_loss_mlp": 1.06116605, + "epoch": 0.19334359368988072, + "flos": 517169069568.0, + "grad_norm": 0.06711221272981459, + "language_loss": 0.88228458, + "learning_rate": 0.0009316563783111511, + "loss": 0.8932327, + "num_input_tokens_seen": 83145456, + "router_z_loss_mlp": 0.33642578, + "step": 1005, + "time_per_iteration": 2.68135404586792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095243, + "balance_loss_mlp": 1.06050563, + "epoch": 0.19353597537514428, + "flos": 693751606272.0, + "grad_norm": 0.04947727679523619, + "language_loss": 0.82323831, + "learning_rate": 0.0009314990686954943, + "loss": 0.83419079, + "num_input_tokens_seen": 83225392, + "router_z_loss_mlp": 0.34765625, + "step": 1006, + "time_per_iteration": 2.9068872928619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098932, + "balance_loss_mlp": 1.06495738, + "epoch": 0.19372835706040784, + "flos": 1209665180160.0, + "grad_norm": 0.05336104081377929, + "language_loss": 0.80917025, + "learning_rate": 0.000931341591557042, + "loss": 0.82015955, + "num_input_tokens_seen": 83331296, + "router_z_loss_mlp": 0.34008789, + "step": 1007, + "time_per_iteration": 3.759119749069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098415, + "balance_loss_mlp": 1.06291509, + "epoch": 0.19392073874567142, + "flos": 520368606720.0, + "grad_norm": 0.06549831272650784, + "language_loss": 0.87757689, + "learning_rate": 0.0009311839469569325, + "loss": 0.88856107, + "num_input_tokens_seen": 83399952, + "router_z_loss_mlp": 0.35522461, + "step": 1008, + "time_per_iteration": 2.6298930644989014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100893, + "balance_loss_mlp": 1.06620264, + "epoch": 0.19411312043093498, + "flos": 588543293952.0, + "grad_norm": 0.06763315162421418, + "language_loss": 0.8732397, + "learning_rate": 0.0009310261349563687, + "loss": 0.88424855, + "num_input_tokens_seen": 83468384, + "router_z_loss_mlp": 0.34692383, + "step": 1009, + "time_per_iteration": 2.6843061447143555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110066, + "balance_loss_mlp": 1.06718588, + "epoch": 0.19430550211619854, + "flos": 579085867008.0, + "grad_norm": 0.05371296475785438, + "language_loss": 0.8534441, + "learning_rate": 0.0009308681556166186, + "loss": 0.86445075, + "num_input_tokens_seen": 83547952, + "router_z_loss_mlp": 0.33496094, + "step": 1010, + "time_per_iteration": 2.8197336196899414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107606, + "balance_loss_mlp": 1.07291579, + "epoch": 0.1944978838014621, + "flos": 620848281600.0, + "grad_norm": 0.08312668477716535, + "language_loss": 0.87206143, + "learning_rate": 0.0009307100089990152, + "loss": 0.88313752, + "num_input_tokens_seen": 83615712, + "router_z_loss_mlp": 0.34716797, + "step": 1011, + "time_per_iteration": 2.7118990421295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101924, + "balance_loss_mlp": 1.0672822, + "epoch": 0.19469026548672566, + "flos": 598440089088.0, + "grad_norm": 0.061832865854500894, + "language_loss": 0.83946323, + "learning_rate": 0.0009305516951649568, + "loss": 0.85048252, + "num_input_tokens_seen": 83687296, + "router_z_loss_mlp": 0.34667969, + "step": 1012, + "time_per_iteration": 2.667672872543335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096764, + "balance_loss_mlp": 1.06314659, + "epoch": 0.19488264717198922, + "flos": 551890810368.0, + "grad_norm": 0.04827143175142062, + "language_loss": 0.87187612, + "learning_rate": 0.0009303932141759057, + "loss": 0.88284373, + "num_input_tokens_seen": 83763168, + "router_z_loss_mlp": 0.33642578, + "step": 1013, + "time_per_iteration": 2.7321088314056396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101502, + "balance_loss_mlp": 1.06705046, + "epoch": 0.19507502885725278, + "flos": 665842166784.0, + "grad_norm": 0.05715794205563071, + "language_loss": 0.84201366, + "learning_rate": 0.0009302345660933902, + "loss": 0.85302866, + "num_input_tokens_seen": 83837312, + "router_z_loss_mlp": 0.3449707, + "step": 1014, + "time_per_iteration": 2.7699263095855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109904, + "balance_loss_mlp": 1.07616735, + "epoch": 0.19526741054251634, + "flos": 670771625472.0, + "grad_norm": 0.05949834877265084, + "language_loss": 0.84866655, + "learning_rate": 0.0009300757509790026, + "loss": 0.85976553, + "num_input_tokens_seen": 83917120, + "router_z_loss_mlp": 0.33764648, + "step": 1015, + "time_per_iteration": 2.8250515460968018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110813, + "balance_loss_mlp": 1.0766474, + "epoch": 0.19545979222777993, + "flos": 446983653888.0, + "grad_norm": 0.0671511226198219, + "language_loss": 0.90974069, + "learning_rate": 0.0009299167688944005, + "loss": 0.92084885, + "num_input_tokens_seen": 83982992, + "router_z_loss_mlp": 0.34204102, + "step": 1016, + "time_per_iteration": 2.545133590698242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111157, + "balance_loss_mlp": 1.07778645, + "epoch": 0.1956521739130435, + "flos": 568813722624.0, + "grad_norm": 0.06338586690579641, + "language_loss": 0.85958129, + "learning_rate": 0.0009297576199013063, + "loss": 0.87069696, + "num_input_tokens_seen": 84057296, + "router_z_loss_mlp": 0.33813477, + "step": 1017, + "time_per_iteration": 2.668503761291504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148218, + "balance_loss_mlp": 1.13295972, + "epoch": 0.19584455559830705, + "flos": 1454969157120.0, + "grad_norm": 0.047651466398381144, + "language_loss": 0.73002136, + "learning_rate": 0.0009295983040615071, + "loss": 0.74150348, + "num_input_tokens_seen": 84292640, + "router_z_loss_mlp": 0.15234375, + "step": 1018, + "time_per_iteration": 4.920944929122925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104842, + "balance_loss_mlp": 1.09015501, + "epoch": 0.1960369372835706, + "flos": 1590320369664.0, + "grad_norm": 0.036993279908541045, + "language_loss": 0.79426301, + "learning_rate": 0.0009294388214368547, + "loss": 0.80531144, + "num_input_tokens_seen": 84524448, + "router_z_loss_mlp": 0.14648438, + "step": 1019, + "time_per_iteration": 6.0059425830841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118502, + "balance_loss_mlp": 1.08505166, + "epoch": 0.19622931896883417, + "flos": 615709237248.0, + "grad_norm": 0.05240041234704895, + "language_loss": 0.86600977, + "learning_rate": 0.0009292791720892659, + "loss": 0.87719476, + "num_input_tokens_seen": 84600208, + "router_z_loss_mlp": 0.3347168, + "step": 1020, + "time_per_iteration": 2.995192527770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113873, + "balance_loss_mlp": 1.07930255, + "epoch": 0.19642170065409773, + "flos": 465950790144.0, + "grad_norm": 0.0657036282835547, + "language_loss": 0.88724279, + "learning_rate": 0.0009291193560807218, + "loss": 0.89838147, + "num_input_tokens_seen": 84668032, + "router_z_loss_mlp": 0.34594727, + "step": 1021, + "time_per_iteration": 2.633256196975708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114293, + "balance_loss_mlp": 1.07962656, + "epoch": 0.19661408233936128, + "flos": 515040477696.0, + "grad_norm": 0.054836200403870924, + "language_loss": 0.87439638, + "learning_rate": 0.0009289593734732688, + "loss": 0.88553929, + "num_input_tokens_seen": 84738176, + "router_z_loss_mlp": 0.34716797, + "step": 1022, + "time_per_iteration": 2.622284173965454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107262, + "balance_loss_mlp": 1.0736922, + "epoch": 0.19680646402462484, + "flos": 392427624960.0, + "grad_norm": 0.053036961045345866, + "language_loss": 0.94139373, + "learning_rate": 0.0009287992243290175, + "loss": 0.95246631, + "num_input_tokens_seen": 84799936, + "router_z_loss_mlp": 0.3359375, + "step": 1023, + "time_per_iteration": 2.4402668476104736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108975, + "balance_loss_mlp": 1.07247353, + "epoch": 0.19699884570988843, + "flos": 626122566144.0, + "grad_norm": 0.056904835680118435, + "language_loss": 0.90850759, + "learning_rate": 0.0009286389087101435, + "loss": 0.91959733, + "num_input_tokens_seen": 84877216, + "router_z_loss_mlp": 0.36523438, + "step": 1024, + "time_per_iteration": 2.762068271636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110519, + "balance_loss_mlp": 1.06957078, + "epoch": 0.197191227395152, + "flos": 557710742016.0, + "grad_norm": 0.05298833269370499, + "language_loss": 0.88575542, + "learning_rate": 0.0009284784266788864, + "loss": 0.89680731, + "num_input_tokens_seen": 84952464, + "router_z_loss_mlp": 0.35668945, + "step": 1025, + "time_per_iteration": 4.087035417556763 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109606, + "balance_loss_mlp": 1.07565546, + "epoch": 0.19738360908041555, + "flos": 664681061376.0, + "grad_norm": 0.0565537913278748, + "language_loss": 0.92494339, + "learning_rate": 0.0009283177782975512, + "loss": 0.93603945, + "num_input_tokens_seen": 85031488, + "router_z_loss_mlp": 0.33984375, + "step": 1026, + "time_per_iteration": 2.948167562484741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095626, + "balance_loss_mlp": 1.06117415, + "epoch": 0.1975759907656791, + "flos": 522244094976.0, + "grad_norm": 0.06218898027866582, + "language_loss": 0.88052273, + "learning_rate": 0.000928156963628507, + "loss": 0.89147896, + "num_input_tokens_seen": 85098384, + "router_z_loss_mlp": 0.3449707, + "step": 1027, + "time_per_iteration": 2.564019203186035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091019, + "balance_loss_mlp": 1.05694866, + "epoch": 0.19776837245094267, + "flos": 462233309184.0, + "grad_norm": 0.056114928823487176, + "language_loss": 0.8826099, + "learning_rate": 0.0009279959827341877, + "loss": 0.89352006, + "num_input_tokens_seen": 85172944, + "router_z_loss_mlp": 0.34082031, + "step": 1028, + "time_per_iteration": 2.7226340770721436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090946, + "balance_loss_mlp": 1.05699515, + "epoch": 0.19796075413620623, + "flos": 502809887232.0, + "grad_norm": 0.05507551359640612, + "language_loss": 0.88204837, + "learning_rate": 0.0009278348356770915, + "loss": 0.89295781, + "num_input_tokens_seen": 85241632, + "router_z_loss_mlp": 0.33984375, + "step": 1029, + "time_per_iteration": 2.592756748199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085455, + "balance_loss_mlp": 1.05093157, + "epoch": 0.1981531358214698, + "flos": 507281038848.0, + "grad_norm": 0.061172366255401664, + "language_loss": 0.85939109, + "learning_rate": 0.0009276735225197814, + "loss": 0.87024558, + "num_input_tokens_seen": 85308992, + "router_z_loss_mlp": 0.34570312, + "step": 1030, + "time_per_iteration": 2.598607063293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088832, + "balance_loss_mlp": 1.05495238, + "epoch": 0.19834551750673335, + "flos": 531275148288.0, + "grad_norm": 0.0802549423316463, + "language_loss": 0.86293721, + "learning_rate": 0.0009275120433248847, + "loss": 0.87382561, + "num_input_tokens_seen": 85381936, + "router_z_loss_mlp": 0.33886719, + "step": 1031, + "time_per_iteration": 2.7143311500549316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090216, + "balance_loss_mlp": 1.05726683, + "epoch": 0.1985378991919969, + "flos": 775147691520.0, + "grad_norm": 0.05308511447166053, + "language_loss": 0.86272347, + "learning_rate": 0.0009273503981550931, + "loss": 0.87362564, + "num_input_tokens_seen": 85474352, + "router_z_loss_mlp": 0.32958984, + "step": 1032, + "time_per_iteration": 3.0616648197174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087082, + "balance_loss_mlp": 1.05351269, + "epoch": 0.1987302808772605, + "flos": 434063411712.0, + "grad_norm": 0.059916166081832097, + "language_loss": 0.8703599, + "learning_rate": 0.0009271885870731626, + "loss": 0.88123071, + "num_input_tokens_seen": 85538416, + "router_z_loss_mlp": 0.3359375, + "step": 1033, + "time_per_iteration": 2.487316131591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092715, + "balance_loss_mlp": 1.05921745, + "epoch": 0.19892266256252406, + "flos": 553342897152.0, + "grad_norm": 0.06168947094446192, + "language_loss": 0.88599998, + "learning_rate": 0.0009270266101419143, + "loss": 0.89692712, + "num_input_tokens_seen": 85604416, + "router_z_loss_mlp": 0.33520508, + "step": 1034, + "time_per_iteration": 2.5978119373321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085912, + "balance_loss_mlp": 1.05403578, + "epoch": 0.19911504424778761, + "flos": 549596302848.0, + "grad_norm": 0.06019117447906982, + "language_loss": 0.85564321, + "learning_rate": 0.0009268644674242328, + "loss": 0.86650234, + "num_input_tokens_seen": 85677008, + "router_z_loss_mlp": 0.31860352, + "step": 1035, + "time_per_iteration": 2.7259163856506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097892, + "balance_loss_mlp": 1.0645138, + "epoch": 0.19930742593305117, + "flos": 518024636928.0, + "grad_norm": 0.05869793462101787, + "language_loss": 0.81141233, + "learning_rate": 0.0009267021589830678, + "loss": 0.82239127, + "num_input_tokens_seen": 85745200, + "router_z_loss_mlp": 0.33398438, + "step": 1036, + "time_per_iteration": 2.597724199295044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161292, + "balance_loss_mlp": 1.14507985, + "epoch": 0.19949980761831473, + "flos": 1508516849664.0, + "grad_norm": 0.04621309141147155, + "language_loss": 0.77627081, + "learning_rate": 0.0009265396848814328, + "loss": 0.78788376, + "num_input_tokens_seen": 85980608, + "router_z_loss_mlp": 0.16210938, + "step": 1037, + "time_per_iteration": 4.918612241744995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093993, + "balance_loss_mlp": 1.06044722, + "epoch": 0.1996921893035783, + "flos": 697803738624.0, + "grad_norm": 0.061892224045152405, + "language_loss": 0.93283784, + "learning_rate": 0.000926377045182406, + "loss": 0.94377768, + "num_input_tokens_seen": 86055952, + "router_z_loss_mlp": 0.33569336, + "step": 1038, + "time_per_iteration": 2.8800160884857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096412, + "balance_loss_mlp": 1.06334293, + "epoch": 0.19988457098884185, + "flos": 726682226688.0, + "grad_norm": 0.0613562398808313, + "language_loss": 0.87972045, + "learning_rate": 0.0009262142399491296, + "loss": 0.89068449, + "num_input_tokens_seen": 86145536, + "router_z_loss_mlp": 0.33081055, + "step": 1039, + "time_per_iteration": 3.0561435222625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097284, + "balance_loss_mlp": 1.06345224, + "epoch": 0.2000769526741054, + "flos": 560275881984.0, + "grad_norm": 0.06364175085873486, + "language_loss": 0.87837642, + "learning_rate": 0.0009260512692448105, + "loss": 0.88934934, + "num_input_tokens_seen": 86214480, + "router_z_loss_mlp": 0.33862305, + "step": 1040, + "time_per_iteration": 2.7037088871002197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090351, + "balance_loss_mlp": 1.05697203, + "epoch": 0.200269334359369, + "flos": 571758594048.0, + "grad_norm": 0.05851279903795688, + "language_loss": 0.84325236, + "learning_rate": 0.000925888133132719, + "loss": 0.85415584, + "num_input_tokens_seen": 86289824, + "router_z_loss_mlp": 0.33398438, + "step": 1041, + "time_per_iteration": 2.836177110671997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082089, + "balance_loss_mlp": 1.06730711, + "epoch": 0.20046171604463256, + "flos": 1485362340864.0, + "grad_norm": 0.029405325300647274, + "language_loss": 0.79610431, + "learning_rate": 0.0009257248316761906, + "loss": 0.80692518, + "num_input_tokens_seen": 86516384, + "router_z_loss_mlp": 0.14746094, + "step": 1042, + "time_per_iteration": 4.901337146759033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094947, + "balance_loss_mlp": 1.06140149, + "epoch": 0.20065409772989612, + "flos": 496266808320.0, + "grad_norm": 0.07205728359427886, + "language_loss": 0.81256473, + "learning_rate": 0.0009255613649386244, + "loss": 0.82351422, + "num_input_tokens_seen": 86587296, + "router_z_loss_mlp": 0.33544922, + "step": 1043, + "time_per_iteration": 2.6198885440826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089381, + "balance_loss_mlp": 1.05686069, + "epoch": 0.20084647941515968, + "flos": 579094631424.0, + "grad_norm": 0.06625931968059934, + "language_loss": 0.79017001, + "learning_rate": 0.0009253977329834838, + "loss": 0.80106384, + "num_input_tokens_seen": 86662656, + "router_z_loss_mlp": 0.32519531, + "step": 1044, + "time_per_iteration": 2.6872498989105225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111234, + "balance_loss_mlp": 1.07745981, + "epoch": 0.20103886110042324, + "flos": 641775273984.0, + "grad_norm": 0.06628294367657735, + "language_loss": 0.86666185, + "learning_rate": 0.0009252339358742965, + "loss": 0.87778521, + "num_input_tokens_seen": 86734704, + "router_z_loss_mlp": 0.34912109, + "step": 1045, + "time_per_iteration": 2.7749996185302734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116952, + "balance_loss_mlp": 1.08219087, + "epoch": 0.2012312427856868, + "flos": 441720953856.0, + "grad_norm": 0.05401214919341486, + "language_loss": 0.83449644, + "learning_rate": 0.000925069973674654, + "loss": 0.84566593, + "num_input_tokens_seen": 86806512, + "router_z_loss_mlp": 0.34814453, + "step": 1046, + "time_per_iteration": 2.662992477416992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114258, + "balance_loss_mlp": 1.08116508, + "epoch": 0.20142362447095036, + "flos": 554135855616.0, + "grad_norm": 0.049297877184233195, + "language_loss": 0.88960069, + "learning_rate": 0.000924905846448212, + "loss": 0.90074325, + "num_input_tokens_seen": 86883440, + "router_z_loss_mlp": 0.33105469, + "step": 1047, + "time_per_iteration": 2.8164925575256348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127137, + "balance_loss_mlp": 1.09306693, + "epoch": 0.20161600615621392, + "flos": 669988841472.0, + "grad_norm": 0.07365230282100185, + "language_loss": 0.85615861, + "learning_rate": 0.0009247415542586906, + "loss": 0.86742997, + "num_input_tokens_seen": 86960208, + "router_z_loss_mlp": 0.34106445, + "step": 1048, + "time_per_iteration": 2.858611822128296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115399, + "balance_loss_mlp": 1.08130527, + "epoch": 0.2018083878414775, + "flos": 572788689408.0, + "grad_norm": 0.05223287600750505, + "language_loss": 0.83514655, + "learning_rate": 0.0009245770971698735, + "loss": 0.84630048, + "num_input_tokens_seen": 87044144, + "router_z_loss_mlp": 0.34106445, + "step": 1049, + "time_per_iteration": 2.8758163452148438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103626, + "balance_loss_mlp": 1.07036686, + "epoch": 0.20200076952674106, + "flos": 425624495616.0, + "grad_norm": 0.061140118103518055, + "language_loss": 0.88792473, + "learning_rate": 0.0009244124752456087, + "loss": 0.89896095, + "num_input_tokens_seen": 87109136, + "router_z_loss_mlp": 0.33276367, + "step": 1050, + "time_per_iteration": 2.501565456390381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097765, + "balance_loss_mlp": 1.06457758, + "epoch": 0.20219315121200462, + "flos": 536326852608.0, + "grad_norm": 0.049507299183714965, + "language_loss": 0.85344577, + "learning_rate": 0.0009242476885498081, + "loss": 0.86442339, + "num_input_tokens_seen": 87184320, + "router_z_loss_mlp": 0.33203125, + "step": 1051, + "time_per_iteration": 2.698791027069092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095962, + "balance_loss_mlp": 1.06222594, + "epoch": 0.20238553289726818, + "flos": 477634323456.0, + "grad_norm": 0.07140169421024865, + "language_loss": 0.8134433, + "learning_rate": 0.0009240827371464474, + "loss": 0.82440293, + "num_input_tokens_seen": 87248224, + "router_z_loss_mlp": 0.33764648, + "step": 1052, + "time_per_iteration": 2.5603079795837402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082157, + "balance_loss_mlp": 1.04958868, + "epoch": 0.20257791458253174, + "flos": 1151611430400.0, + "grad_norm": 0.06069279327125781, + "language_loss": 0.84372044, + "learning_rate": 0.0009239176210995666, + "loss": 0.85454196, + "num_input_tokens_seen": 87333088, + "router_z_loss_mlp": 0.32568359, + "step": 1053, + "time_per_iteration": 3.4549684524536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088056, + "balance_loss_mlp": 1.05393791, + "epoch": 0.2027702962677953, + "flos": 666606011904.0, + "grad_norm": 0.06066867592012189, + "language_loss": 0.93657684, + "learning_rate": 0.0009237523404732695, + "loss": 0.94745743, + "num_input_tokens_seen": 87413840, + "router_z_loss_mlp": 0.34130859, + "step": 1054, + "time_per_iteration": 4.344247817993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078384, + "balance_loss_mlp": 1.04567289, + "epoch": 0.20296267795305886, + "flos": 641011428864.0, + "grad_norm": 0.0678922331878557, + "language_loss": 0.84289086, + "learning_rate": 0.0009235868953317235, + "loss": 0.85367465, + "num_input_tokens_seen": 87487168, + "router_z_loss_mlp": 0.32714844, + "step": 1055, + "time_per_iteration": 2.7755184173583984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086857, + "balance_loss_mlp": 1.05321646, + "epoch": 0.20315505963832242, + "flos": 930187777536.0, + "grad_norm": 0.06816541670806936, + "language_loss": 0.85603452, + "learning_rate": 0.0009234212857391602, + "loss": 0.86690307, + "num_input_tokens_seen": 87573184, + "router_z_loss_mlp": 0.33642578, + "step": 1056, + "time_per_iteration": 3.1736087799072266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089034, + "balance_loss_mlp": 1.05477369, + "epoch": 0.20334744132358598, + "flos": 561818128896.0, + "grad_norm": 0.05209348313890264, + "language_loss": 0.88978589, + "learning_rate": 0.000923255511759875, + "loss": 0.90067613, + "num_input_tokens_seen": 87651968, + "router_z_loss_mlp": 0.34301758, + "step": 1057, + "time_per_iteration": 2.7823617458343506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093097, + "balance_loss_mlp": 1.05945587, + "epoch": 0.20353982300884957, + "flos": 643902455808.0, + "grad_norm": 0.061337083912670884, + "language_loss": 0.85219932, + "learning_rate": 0.000923089573458227, + "loss": 0.86313027, + "num_input_tokens_seen": 87727792, + "router_z_loss_mlp": 0.33666992, + "step": 1058, + "time_per_iteration": 2.8398988246917725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092738, + "balance_loss_mlp": 1.05952644, + "epoch": 0.20373220469411313, + "flos": 651101690880.0, + "grad_norm": 0.0713114334987562, + "language_loss": 0.84425724, + "learning_rate": 0.0009229234708986392, + "loss": 0.85518456, + "num_input_tokens_seen": 87806048, + "router_z_loss_mlp": 0.33203125, + "step": 1059, + "time_per_iteration": 2.891934394836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069957, + "balance_loss_mlp": 1.05603302, + "epoch": 0.2039245863793767, + "flos": 1436939136000.0, + "grad_norm": 0.037855568460977755, + "language_loss": 0.81666899, + "learning_rate": 0.0009227572041455982, + "loss": 0.8273685, + "num_input_tokens_seen": 88018160, + "router_z_loss_mlp": 0.13964844, + "step": 1060, + "time_per_iteration": 4.673142194747925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092929, + "balance_loss_mlp": 1.05985999, + "epoch": 0.20411696806464025, + "flos": 596678082048.0, + "grad_norm": 0.07190006801568614, + "language_loss": 0.85404283, + "learning_rate": 0.0009225907732636548, + "loss": 0.86497211, + "num_input_tokens_seen": 88090864, + "router_z_loss_mlp": 0.33081055, + "step": 1061, + "time_per_iteration": 2.74110746383667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091714, + "balance_loss_mlp": 1.0585742, + "epoch": 0.2043093497499038, + "flos": 573530775552.0, + "grad_norm": 0.06271161302412134, + "language_loss": 0.86991799, + "learning_rate": 0.0009224241783174227, + "loss": 0.88083506, + "num_input_tokens_seen": 88161360, + "router_z_loss_mlp": 0.33154297, + "step": 1062, + "time_per_iteration": 2.6885697841644287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084619, + "balance_loss_mlp": 1.05233693, + "epoch": 0.20450173143516737, + "flos": 630061217280.0, + "grad_norm": 0.055816021094363524, + "language_loss": 0.85842204, + "learning_rate": 0.0009222574193715802, + "loss": 0.86926818, + "num_input_tokens_seen": 88234960, + "router_z_loss_mlp": 0.32275391, + "step": 1063, + "time_per_iteration": 2.7569899559020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087796, + "balance_loss_mlp": 1.05522823, + "epoch": 0.20469411312043093, + "flos": 573718450176.0, + "grad_norm": 0.051897822989382614, + "language_loss": 0.8621105, + "learning_rate": 0.000922090496490869, + "loss": 0.87298846, + "num_input_tokens_seen": 88308176, + "router_z_loss_mlp": 0.32568359, + "step": 1064, + "time_per_iteration": 2.7099597454071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082485, + "balance_loss_mlp": 1.05025065, + "epoch": 0.20488649480569449, + "flos": 636748300800.0, + "grad_norm": 0.04962250787968228, + "language_loss": 0.90165728, + "learning_rate": 0.0009219234097400937, + "loss": 0.91248214, + "num_input_tokens_seen": 88386768, + "router_z_loss_mlp": 0.32226562, + "step": 1065, + "time_per_iteration": 2.8492319583892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108042, + "balance_loss_mlp": 1.04806709, + "epoch": 0.20507887649095807, + "flos": 975383894016.0, + "grad_norm": 0.051536979552593745, + "language_loss": 0.83029723, + "learning_rate": 0.0009217561591841237, + "loss": 0.84110147, + "num_input_tokens_seen": 88476576, + "router_z_loss_mlp": 0.32348633, + "step": 1066, + "time_per_iteration": 3.267207145690918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082085, + "balance_loss_mlp": 1.04951739, + "epoch": 0.20527125817622163, + "flos": 485940819456.0, + "grad_norm": 0.09661652793466288, + "language_loss": 0.81334901, + "learning_rate": 0.0009215887448878913, + "loss": 0.82416987, + "num_input_tokens_seen": 88541968, + "router_z_loss_mlp": 0.32568359, + "step": 1067, + "time_per_iteration": 2.5429391860961914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088998, + "balance_loss_mlp": 1.05552411, + "epoch": 0.2054636398614852, + "flos": 526921860096.0, + "grad_norm": 0.09953641970782799, + "language_loss": 0.85144234, + "learning_rate": 0.0009214211669163922, + "loss": 0.86233234, + "num_input_tokens_seen": 88615296, + "router_z_loss_mlp": 0.33496094, + "step": 1068, + "time_per_iteration": 2.7006540298461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085875, + "balance_loss_mlp": 1.05428481, + "epoch": 0.20565602154674875, + "flos": 557898416640.0, + "grad_norm": 0.048729379907622286, + "language_loss": 0.93896133, + "learning_rate": 0.0009212534253346862, + "loss": 0.94982004, + "num_input_tokens_seen": 88691584, + "router_z_loss_mlp": 0.31567383, + "step": 1069, + "time_per_iteration": 2.7544496059417725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098993, + "balance_loss_mlp": 1.06713986, + "epoch": 0.2058484032320123, + "flos": 503976784896.0, + "grad_norm": 0.06649355865978995, + "language_loss": 0.8497259, + "learning_rate": 0.0009210855202078964, + "loss": 0.86071587, + "num_input_tokens_seen": 88756592, + "router_z_loss_mlp": 0.31835938, + "step": 1070, + "time_per_iteration": 2.59660005569458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113348, + "balance_loss_mlp": 1.07975471, + "epoch": 0.20604078491727587, + "flos": 432950358528.0, + "grad_norm": 0.06315152856471482, + "language_loss": 0.87476587, + "learning_rate": 0.0009209174516012091, + "loss": 0.88589936, + "num_input_tokens_seen": 88820928, + "router_z_loss_mlp": 0.3359375, + "step": 1071, + "time_per_iteration": 2.498087167739868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110052, + "balance_loss_mlp": 1.07624412, + "epoch": 0.20623316660253943, + "flos": 608421252096.0, + "grad_norm": 0.06211591839104366, + "language_loss": 0.89244497, + "learning_rate": 0.0009207492195798747, + "loss": 0.9035455, + "num_input_tokens_seen": 88895440, + "router_z_loss_mlp": 0.33837891, + "step": 1072, + "time_per_iteration": 2.760019063949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116237, + "balance_loss_mlp": 1.08142757, + "epoch": 0.206425548287803, + "flos": 480184906752.0, + "grad_norm": 0.07379229384440758, + "language_loss": 0.84887302, + "learning_rate": 0.0009205808242092061, + "loss": 0.86003542, + "num_input_tokens_seen": 88964400, + "router_z_loss_mlp": 0.34838867, + "step": 1073, + "time_per_iteration": 2.6034529209136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118683, + "balance_loss_mlp": 1.08423102, + "epoch": 0.20661792997306658, + "flos": 949007937024.0, + "grad_norm": 0.0763588165275792, + "language_loss": 0.82845032, + "learning_rate": 0.0009204122655545808, + "loss": 0.83963716, + "num_input_tokens_seen": 89049600, + "router_z_loss_mlp": 0.34472656, + "step": 1074, + "time_per_iteration": 3.3222029209136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111187, + "balance_loss_mlp": 1.07759392, + "epoch": 0.20681031165833014, + "flos": 603206604288.0, + "grad_norm": 0.05592396046249817, + "language_loss": 0.80705297, + "learning_rate": 0.0009202435436814388, + "loss": 0.81816483, + "num_input_tokens_seen": 89119024, + "router_z_loss_mlp": 0.33618164, + "step": 1075, + "time_per_iteration": 2.721888780593872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114903, + "balance_loss_mlp": 1.08121455, + "epoch": 0.2070026933435937, + "flos": 708665200128.0, + "grad_norm": 0.07630450069092473, + "language_loss": 0.89700603, + "learning_rate": 0.0009200746586552836, + "loss": 0.90815508, + "num_input_tokens_seen": 89197344, + "router_z_loss_mlp": 0.3371582, + "step": 1076, + "time_per_iteration": 2.8797900676727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107662, + "balance_loss_mlp": 1.07416463, + "epoch": 0.20719507502885726, + "flos": 829456409088.0, + "grad_norm": 0.06176881640488279, + "language_loss": 0.84210765, + "learning_rate": 0.0009199056105416825, + "loss": 0.85318428, + "num_input_tokens_seen": 89280464, + "router_z_loss_mlp": 0.33520508, + "step": 1077, + "time_per_iteration": 3.120950698852539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107166, + "balance_loss_mlp": 1.07312012, + "epoch": 0.20738745671412082, + "flos": 637993774080.0, + "grad_norm": 0.055893649084458805, + "language_loss": 0.86594802, + "learning_rate": 0.0009197363994062654, + "loss": 0.87701964, + "num_input_tokens_seen": 89353344, + "router_z_loss_mlp": 0.34057617, + "step": 1078, + "time_per_iteration": 2.8197755813598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086727, + "balance_loss_mlp": 1.05480289, + "epoch": 0.20757983839938438, + "flos": 685258845696.0, + "grad_norm": 0.054433441748304986, + "language_loss": 0.84861732, + "learning_rate": 0.0009195670253147262, + "loss": 0.85948461, + "num_input_tokens_seen": 89439328, + "router_z_loss_mlp": 0.3190918, + "step": 1079, + "time_per_iteration": 2.966987133026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096354, + "balance_loss_mlp": 1.06340468, + "epoch": 0.20777222008464794, + "flos": 519024208896.0, + "grad_norm": 0.07868801214896702, + "language_loss": 0.82301188, + "learning_rate": 0.0009193974883328216, + "loss": 0.83397532, + "num_input_tokens_seen": 89510160, + "router_z_loss_mlp": 0.32958984, + "step": 1080, + "time_per_iteration": 2.620704174041748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091209, + "balance_loss_mlp": 1.05725837, + "epoch": 0.2079646017699115, + "flos": 511136732160.0, + "grad_norm": 0.09961486538628272, + "language_loss": 0.87482947, + "learning_rate": 0.0009192277885263718, + "loss": 0.88574153, + "num_input_tokens_seen": 89582960, + "router_z_loss_mlp": 0.33984375, + "step": 1081, + "time_per_iteration": 2.6247479915618896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087114, + "balance_loss_mlp": 1.05352044, + "epoch": 0.20815698345517505, + "flos": 931409929728.0, + "grad_norm": 0.05448561879445608, + "language_loss": 0.86255312, + "learning_rate": 0.0009190579259612602, + "loss": 0.87342417, + "num_input_tokens_seen": 89675488, + "router_z_loss_mlp": 0.33618164, + "step": 1082, + "time_per_iteration": 3.2661428451538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085916, + "balance_loss_mlp": 1.05187023, + "epoch": 0.20834936514043864, + "flos": 632114205696.0, + "grad_norm": 0.059638645169798186, + "language_loss": 0.86669636, + "learning_rate": 0.000918887900703433, + "loss": 0.87755549, + "num_input_tokens_seen": 89747872, + "router_z_loss_mlp": 0.34082031, + "step": 1083, + "time_per_iteration": 2.7930080890655518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083976, + "balance_loss_mlp": 1.05038285, + "epoch": 0.2085417468257022, + "flos": 394170693120.0, + "grad_norm": 0.06326041775418027, + "language_loss": 0.90427065, + "learning_rate": 0.0009187177128188999, + "loss": 0.91511047, + "num_input_tokens_seen": 89810176, + "router_z_loss_mlp": 0.33618164, + "step": 1084, + "time_per_iteration": 2.431358814239502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054848, + "balance_loss_mlp": 1.04159176, + "epoch": 0.20873412851096576, + "flos": 1401387969024.0, + "grad_norm": 0.04127554175786628, + "language_loss": 0.77156538, + "learning_rate": 0.0009185473623737339, + "loss": 0.78211385, + "num_input_tokens_seen": 90038432, + "router_z_loss_mlp": 0.1328125, + "step": 1085, + "time_per_iteration": 6.352816343307495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080772, + "balance_loss_mlp": 1.04686832, + "epoch": 0.20892651019622932, + "flos": 447599112192.0, + "grad_norm": 0.06234370040412467, + "language_loss": 0.8612783, + "learning_rate": 0.000918376849434071, + "loss": 0.87208605, + "num_input_tokens_seen": 90101568, + "router_z_loss_mlp": 0.33935547, + "step": 1086, + "time_per_iteration": 2.5168843269348145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084999, + "balance_loss_mlp": 1.05040443, + "epoch": 0.20911889188149288, + "flos": 492863629824.0, + "grad_norm": 0.07820142019527274, + "language_loss": 0.90828383, + "learning_rate": 0.0009182061740661098, + "loss": 0.91913384, + "num_input_tokens_seen": 90169344, + "router_z_loss_mlp": 0.34643555, + "step": 1087, + "time_per_iteration": 2.5461525917053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083648, + "balance_loss_mlp": 1.0494113, + "epoch": 0.20931127356675644, + "flos": 840928946688.0, + "grad_norm": 0.05821627614551514, + "language_loss": 0.85034752, + "learning_rate": 0.0009180353363361127, + "loss": 0.86118406, + "num_input_tokens_seen": 90252416, + "router_z_loss_mlp": 0.3425293, + "step": 1088, + "time_per_iteration": 3.11942982673645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084605, + "balance_loss_mlp": 1.05036855, + "epoch": 0.20950365525202, + "flos": 756796013568.0, + "grad_norm": 0.06471498550944753, + "language_loss": 0.82101512, + "learning_rate": 0.0009178643363104044, + "loss": 0.83186114, + "num_input_tokens_seen": 90337952, + "router_z_loss_mlp": 0.34277344, + "step": 1089, + "time_per_iteration": 3.0986390113830566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107905, + "balance_loss_mlp": 1.04543328, + "epoch": 0.20969603693728356, + "flos": 472301812224.0, + "grad_norm": 0.07091461504319575, + "language_loss": 0.91050649, + "learning_rate": 0.0009176931740553735, + "loss": 0.92129695, + "num_input_tokens_seen": 90401488, + "router_z_loss_mlp": 0.33642578, + "step": 1090, + "time_per_iteration": 2.4965460300445557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108219, + "balance_loss_mlp": 1.04845381, + "epoch": 0.20988841862254715, + "flos": 976507121664.0, + "grad_norm": 0.05967441428812083, + "language_loss": 0.82829833, + "learning_rate": 0.0009175218496374708, + "loss": 0.83912027, + "num_input_tokens_seen": 90486144, + "router_z_loss_mlp": 0.33740234, + "step": 1091, + "time_per_iteration": 3.325467348098755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082454, + "balance_loss_mlp": 1.04917121, + "epoch": 0.2100808003078107, + "flos": 1092697731072.0, + "grad_norm": 0.06552872916111846, + "language_loss": 0.85816884, + "learning_rate": 0.0009173503631232103, + "loss": 0.86899334, + "num_input_tokens_seen": 90571504, + "router_z_loss_mlp": 0.33300781, + "step": 1092, + "time_per_iteration": 3.3492543697357178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080972, + "balance_loss_mlp": 1.04761767, + "epoch": 0.21027318199307427, + "flos": 1012567468032.0, + "grad_norm": 0.06864870254184631, + "language_loss": 0.8205356, + "learning_rate": 0.0009171787145791691, + "loss": 0.83134532, + "num_input_tokens_seen": 90646016, + "router_z_loss_mlp": 0.33374023, + "step": 1093, + "time_per_iteration": 3.229302167892456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083111, + "balance_loss_mlp": 1.04975629, + "epoch": 0.21046556367833782, + "flos": 521141216256.0, + "grad_norm": 0.08362122797221107, + "language_loss": 0.80208671, + "learning_rate": 0.000917006904071987, + "loss": 0.81291783, + "num_input_tokens_seen": 90713440, + "router_z_loss_mlp": 0.33374023, + "step": 1094, + "time_per_iteration": 2.5901217460632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091484, + "balance_loss_mlp": 1.05843902, + "epoch": 0.21065794536360138, + "flos": 603437948928.0, + "grad_norm": 0.05523679641811596, + "language_loss": 0.87209588, + "learning_rate": 0.0009168349316683669, + "loss": 0.88301063, + "num_input_tokens_seen": 90788208, + "router_z_loss_mlp": 0.33056641, + "step": 1095, + "time_per_iteration": 2.67250919342041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093141, + "balance_loss_mlp": 1.06081104, + "epoch": 0.21085032704886494, + "flos": 603045070848.0, + "grad_norm": 0.05347318487829757, + "language_loss": 0.82685143, + "learning_rate": 0.0009166627974350741, + "loss": 0.83778286, + "num_input_tokens_seen": 90873776, + "router_z_loss_mlp": 0.32324219, + "step": 1096, + "time_per_iteration": 2.9291882514953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097603, + "balance_loss_mlp": 1.06362867, + "epoch": 0.2110427087341285, + "flos": 637382697984.0, + "grad_norm": 0.059512513015867, + "language_loss": 0.89716321, + "learning_rate": 0.0009164905014389373, + "loss": 0.90813923, + "num_input_tokens_seen": 90945872, + "router_z_loss_mlp": 0.34008789, + "step": 1097, + "time_per_iteration": 2.7384700775146484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100621, + "balance_loss_mlp": 1.06798196, + "epoch": 0.21123509041939206, + "flos": 522667496448.0, + "grad_norm": 0.08051519151754843, + "language_loss": 0.87020361, + "learning_rate": 0.0009163180437468476, + "loss": 0.88120985, + "num_input_tokens_seen": 91016224, + "router_z_loss_mlp": 0.32641602, + "step": 1098, + "time_per_iteration": 2.584890365600586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109632, + "balance_loss_mlp": 1.0635848, + "epoch": 0.21142747210465565, + "flos": 450938271744.0, + "grad_norm": 0.05811835985780437, + "language_loss": 0.86184567, + "learning_rate": 0.000916145424425759, + "loss": 0.87280893, + "num_input_tokens_seen": 91086752, + "router_z_loss_mlp": 0.32739258, + "step": 1099, + "time_per_iteration": 2.6362791061401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104023, + "balance_loss_mlp": 1.07059634, + "epoch": 0.2116198537899192, + "flos": 875813630976.0, + "grad_norm": 0.07623729144092387, + "language_loss": 0.9082064, + "learning_rate": 0.0009159726435426885, + "loss": 0.91924655, + "num_input_tokens_seen": 91162960, + "router_z_loss_mlp": 0.33447266, + "step": 1100, + "time_per_iteration": 3.0668158531188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108924, + "balance_loss_mlp": 1.0554564, + "epoch": 0.21181223547518277, + "flos": 523410992640.0, + "grad_norm": 0.06029059005678133, + "language_loss": 0.90809137, + "learning_rate": 0.0009157997011647154, + "loss": 0.91898382, + "num_input_tokens_seen": 91229840, + "router_z_loss_mlp": 0.33813477, + "step": 1101, + "time_per_iteration": 2.5932393074035645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110663, + "balance_loss_mlp": 1.07327497, + "epoch": 0.21200461716044633, + "flos": 572014669824.0, + "grad_norm": 0.05812758027328986, + "language_loss": 0.86378956, + "learning_rate": 0.0009156265973589817, + "loss": 0.87485588, + "num_input_tokens_seen": 91307936, + "router_z_loss_mlp": 0.33374023, + "step": 1102, + "time_per_iteration": 2.79496431350708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110266, + "balance_loss_mlp": 1.07672012, + "epoch": 0.2121969988457099, + "flos": 544869075456.0, + "grad_norm": 0.0704183859149776, + "language_loss": 0.89789248, + "learning_rate": 0.0009154533321926926, + "loss": 0.90899515, + "num_input_tokens_seen": 91372848, + "router_z_loss_mlp": 0.33569336, + "step": 1103, + "time_per_iteration": 2.5982048511505127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107149, + "balance_loss_mlp": 1.07393694, + "epoch": 0.21238938053097345, + "flos": 843489704448.0, + "grad_norm": 0.06399101868010165, + "language_loss": 0.87705767, + "learning_rate": 0.0009152799057331156, + "loss": 0.88812917, + "num_input_tokens_seen": 91452768, + "router_z_loss_mlp": 0.33227539, + "step": 1104, + "time_per_iteration": 3.088672637939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100871, + "balance_loss_mlp": 1.06768262, + "epoch": 0.212581762216237, + "flos": 445984081920.0, + "grad_norm": 0.064105004549741, + "language_loss": 0.91047186, + "learning_rate": 0.0009151063180475805, + "loss": 0.9214806, + "num_input_tokens_seen": 91519888, + "router_z_loss_mlp": 0.33203125, + "step": 1105, + "time_per_iteration": 2.569998025894165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090698, + "balance_loss_mlp": 1.05798697, + "epoch": 0.21277414390150057, + "flos": 514129655808.0, + "grad_norm": 0.05967324045126681, + "language_loss": 0.84732658, + "learning_rate": 0.0009149325692034803, + "loss": 0.85823357, + "num_input_tokens_seen": 91585744, + "router_z_loss_mlp": 0.32714844, + "step": 1106, + "time_per_iteration": 2.6009016036987305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149918, + "balance_loss_mlp": 1.13380063, + "epoch": 0.21296652558676413, + "flos": 1484790552576.0, + "grad_norm": 0.04210654195905191, + "language_loss": 0.79203427, + "learning_rate": 0.0009147586592682702, + "loss": 0.80353343, + "num_input_tokens_seen": 91805840, + "router_z_loss_mlp": 0.16113281, + "step": 1107, + "time_per_iteration": 4.820629596710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082056, + "balance_loss_mlp": 1.04994082, + "epoch": 0.21315890727202771, + "flos": 845689669632.0, + "grad_norm": 0.07454945953507684, + "language_loss": 0.87513995, + "learning_rate": 0.0009145845883094678, + "loss": 0.88596046, + "num_input_tokens_seen": 91885936, + "router_z_loss_mlp": 0.32080078, + "step": 1108, + "time_per_iteration": 3.0311591625213623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083334, + "balance_loss_mlp": 1.04971695, + "epoch": 0.21335128895729127, + "flos": 629086376448.0, + "grad_norm": 0.07212897946446892, + "language_loss": 0.85387337, + "learning_rate": 0.000914410356394654, + "loss": 0.86470675, + "num_input_tokens_seen": 91959888, + "router_z_loss_mlp": 0.33642578, + "step": 1109, + "time_per_iteration": 2.7746968269348145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085071, + "balance_loss_mlp": 1.05102468, + "epoch": 0.21354367064255483, + "flos": 710649787392.0, + "grad_norm": 0.053148069764317206, + "language_loss": 0.85104829, + "learning_rate": 0.0009142359635914709, + "loss": 0.86189902, + "num_input_tokens_seen": 92043728, + "router_z_loss_mlp": 0.34057617, + "step": 1110, + "time_per_iteration": 3.109018564224243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083884, + "balance_loss_mlp": 1.05067194, + "epoch": 0.2137360523278184, + "flos": 455950688256.0, + "grad_norm": 0.07113647076789116, + "language_loss": 0.84692943, + "learning_rate": 0.0009140614099676245, + "loss": 0.8577683, + "num_input_tokens_seen": 92114096, + "router_z_loss_mlp": 0.33203125, + "step": 1111, + "time_per_iteration": 2.5607409477233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083673, + "balance_loss_mlp": 1.05072355, + "epoch": 0.21392843401308195, + "flos": 665749034496.0, + "grad_norm": 0.059219994241997045, + "language_loss": 0.82997137, + "learning_rate": 0.0009138866955908821, + "loss": 0.84080815, + "num_input_tokens_seen": 92193552, + "router_z_loss_mlp": 0.32958984, + "step": 1112, + "time_per_iteration": 2.901376724243164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086583, + "balance_loss_mlp": 1.05327559, + "epoch": 0.2141208156983455, + "flos": 748656843264.0, + "grad_norm": 0.06302145936378449, + "language_loss": 0.80617785, + "learning_rate": 0.0009137118205290738, + "loss": 0.81704366, + "num_input_tokens_seen": 92279248, + "router_z_loss_mlp": 0.33325195, + "step": 1113, + "time_per_iteration": 2.9629132747650146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085777, + "balance_loss_mlp": 1.05142069, + "epoch": 0.21431319738360907, + "flos": 418898124288.0, + "grad_norm": 0.06913372638273338, + "language_loss": 0.90778732, + "learning_rate": 0.0009135367848500924, + "loss": 0.91864502, + "num_input_tokens_seen": 92344064, + "router_z_loss_mlp": 0.34399414, + "step": 1114, + "time_per_iteration": 2.5860419273376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087087, + "balance_loss_mlp": 1.05406582, + "epoch": 0.21450557906887263, + "flos": 608849035776.0, + "grad_norm": 0.07370492115341919, + "language_loss": 0.86567605, + "learning_rate": 0.0009133615886218927, + "loss": 0.87654686, + "num_input_tokens_seen": 92410544, + "router_z_loss_mlp": 0.33032227, + "step": 1115, + "time_per_iteration": 2.6986265182495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095845, + "balance_loss_mlp": 1.06082106, + "epoch": 0.21469796075413622, + "flos": 561649393152.0, + "grad_norm": 0.0682239504380638, + "language_loss": 0.88444531, + "learning_rate": 0.0009131862319124917, + "loss": 0.89540386, + "num_input_tokens_seen": 92480272, + "router_z_loss_mlp": 0.3503418, + "step": 1116, + "time_per_iteration": 2.644977569580078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086331, + "balance_loss_mlp": 1.0540725, + "epoch": 0.21489034243939978, + "flos": 594363225600.0, + "grad_norm": 0.06937847326766512, + "language_loss": 0.8429122, + "learning_rate": 0.0009130107147899691, + "loss": 0.85377544, + "num_input_tokens_seen": 92555584, + "router_z_loss_mlp": 0.32250977, + "step": 1117, + "time_per_iteration": 2.768064498901367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084886, + "balance_loss_mlp": 1.05148315, + "epoch": 0.21508272412466334, + "flos": 441661317120.0, + "grad_norm": 0.09911577685113587, + "language_loss": 0.85504615, + "learning_rate": 0.0009128350373224665, + "loss": 0.86589503, + "num_input_tokens_seen": 92623136, + "router_z_loss_mlp": 0.33422852, + "step": 1118, + "time_per_iteration": 2.5369865894317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033841, + "balance_loss_mlp": 1.02206326, + "epoch": 0.2152751058099269, + "flos": 1495397348352.0, + "grad_norm": 0.028624916140058014, + "language_loss": 0.81456429, + "learning_rate": 0.0009126591995781883, + "loss": 0.82490271, + "num_input_tokens_seen": 92842608, + "router_z_loss_mlp": 0.11767578, + "step": 1119, + "time_per_iteration": 4.634536266326904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109091, + "balance_loss_mlp": 1.05741262, + "epoch": 0.21546748749519046, + "flos": 493759895040.0, + "grad_norm": 0.057336284262766976, + "language_loss": 0.85470641, + "learning_rate": 0.0009124832016254005, + "loss": 0.86561549, + "num_input_tokens_seen": 92912960, + "router_z_loss_mlp": 0.33520508, + "step": 1120, + "time_per_iteration": 2.57099986076355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097048, + "balance_loss_mlp": 1.06245303, + "epoch": 0.21565986918045402, + "flos": 634241387520.0, + "grad_norm": 0.0556622286599547, + "language_loss": 0.8842063, + "learning_rate": 0.0009123070435324316, + "loss": 0.89517677, + "num_input_tokens_seen": 92982272, + "router_z_loss_mlp": 0.34619141, + "step": 1121, + "time_per_iteration": 2.73698091506958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010392, + "balance_loss_mlp": 1.02780366, + "epoch": 0.21585225086571758, + "flos": 1582502704128.0, + "grad_norm": 0.024824935431588098, + "language_loss": 0.77875781, + "learning_rate": 0.0009121307253676722, + "loss": 0.78914982, + "num_input_tokens_seen": 93218752, + "router_z_loss_mlp": 0.11376953, + "step": 1122, + "time_per_iteration": 4.960963010787964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093163, + "balance_loss_mlp": 1.05897415, + "epoch": 0.21604463255098114, + "flos": 683799556608.0, + "grad_norm": 0.06637115500638362, + "language_loss": 0.86772728, + "learning_rate": 0.0009119542471995752, + "loss": 0.87865889, + "num_input_tokens_seen": 93293968, + "router_z_loss_mlp": 0.34204102, + "step": 1123, + "time_per_iteration": 2.819042205810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109659, + "balance_loss_mlp": 1.06228209, + "epoch": 0.2162370142362447, + "flos": 780660675072.0, + "grad_norm": 0.06221084946299637, + "language_loss": 0.81623554, + "learning_rate": 0.0009117776090966554, + "loss": 0.82720149, + "num_input_tokens_seen": 93367088, + "router_z_loss_mlp": 0.34326172, + "step": 1124, + "time_per_iteration": 2.9435975551605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090546, + "balance_loss_mlp": 1.05473578, + "epoch": 0.21642939592150828, + "flos": 1001745294336.0, + "grad_norm": 0.06219513600405685, + "language_loss": 0.86821365, + "learning_rate": 0.0009116008111274899, + "loss": 0.8791191, + "num_input_tokens_seen": 93452944, + "router_z_loss_mlp": 0.35839844, + "step": 1125, + "time_per_iteration": 3.250828504562378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01023544, + "balance_loss_mlp": 1.01271951, + "epoch": 0.21662177760677184, + "flos": 1481867440128.0, + "grad_norm": 0.013492425774453086, + "language_loss": 0.79106927, + "learning_rate": 0.0009114238533607176, + "loss": 0.8013047, + "num_input_tokens_seen": 93677328, + "router_z_loss_mlp": 0.10839844, + "step": 1126, + "time_per_iteration": 4.836662530899048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078043, + "balance_loss_mlp": 1.0431627, + "epoch": 0.2168141592920354, + "flos": 887030092800.0, + "grad_norm": 0.06408405180788145, + "language_loss": 0.84878719, + "learning_rate": 0.0009112467358650396, + "loss": 0.85956764, + "num_input_tokens_seen": 93756848, + "router_z_loss_mlp": 0.34912109, + "step": 1127, + "time_per_iteration": 3.118460178375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087154, + "balance_loss_mlp": 1.05205846, + "epoch": 0.21700654097729896, + "flos": 545682382848.0, + "grad_norm": 0.06014422622436645, + "language_loss": 0.86521864, + "learning_rate": 0.0009110694587092192, + "loss": 0.87609017, + "num_input_tokens_seen": 93834704, + "router_z_loss_mlp": 0.35131836, + "step": 1128, + "time_per_iteration": 2.736814022064209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080165, + "balance_loss_mlp": 1.0446167, + "epoch": 0.21719892266256252, + "flos": 509270008320.0, + "grad_norm": 0.06606219668196793, + "language_loss": 0.81429344, + "learning_rate": 0.0009108920219620815, + "loss": 0.82509506, + "num_input_tokens_seen": 93904448, + "router_z_loss_mlp": 0.35571289, + "step": 1129, + "time_per_iteration": 2.6214489936828613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083138, + "balance_loss_mlp": 1.04782772, + "epoch": 0.21739130434782608, + "flos": 543150738432.0, + "grad_norm": 0.060577581075914995, + "language_loss": 0.89903116, + "learning_rate": 0.0009107144256925133, + "loss": 0.90986252, + "num_input_tokens_seen": 93979312, + "router_z_loss_mlp": 0.35302734, + "step": 1130, + "time_per_iteration": 2.6337971687316895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079633, + "balance_loss_mlp": 1.04489541, + "epoch": 0.21758368603308964, + "flos": 616564804608.0, + "grad_norm": 0.0674499307688184, + "language_loss": 0.82610142, + "learning_rate": 0.0009105366699694638, + "loss": 0.83689773, + "num_input_tokens_seen": 94052032, + "router_z_loss_mlp": 0.34790039, + "step": 1131, + "time_per_iteration": 2.6984267234802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085371, + "balance_loss_mlp": 1.04979873, + "epoch": 0.2177760677183532, + "flos": 634813175808.0, + "grad_norm": 0.051829013054278075, + "language_loss": 0.8159321, + "learning_rate": 0.0009103587548619439, + "loss": 0.8267858, + "num_input_tokens_seen": 94124944, + "router_z_loss_mlp": 0.35571289, + "step": 1132, + "time_per_iteration": 2.8308732509613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083319, + "balance_loss_mlp": 1.04850996, + "epoch": 0.2179684494036168, + "flos": 532181587968.0, + "grad_norm": 0.06772780520844247, + "language_loss": 0.86115086, + "learning_rate": 0.0009101806804390261, + "loss": 0.87198412, + "num_input_tokens_seen": 94200384, + "router_z_loss_mlp": 0.34863281, + "step": 1133, + "time_per_iteration": 2.7745282649993896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081903, + "balance_loss_mlp": 1.04671264, + "epoch": 0.21816083108888035, + "flos": 474980433408.0, + "grad_norm": 0.05911376481567057, + "language_loss": 0.90451765, + "learning_rate": 0.0009100024467698453, + "loss": 0.91533667, + "num_input_tokens_seen": 94266992, + "router_z_loss_mlp": 0.35205078, + "step": 1134, + "time_per_iteration": 2.551278829574585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086865, + "balance_loss_mlp": 1.051054, + "epoch": 0.2183532127741439, + "flos": 577198794240.0, + "grad_norm": 0.07962415284192025, + "language_loss": 0.83050048, + "learning_rate": 0.0009098240539235981, + "loss": 0.84136909, + "num_input_tokens_seen": 94334304, + "router_z_loss_mlp": 0.3581543, + "step": 1135, + "time_per_iteration": 2.660019636154175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086508, + "balance_loss_mlp": 1.05172312, + "epoch": 0.21854559445940747, + "flos": 593832135168.0, + "grad_norm": 0.05867668726509775, + "language_loss": 0.87679315, + "learning_rate": 0.0009096455019695423, + "loss": 0.88765824, + "num_input_tokens_seen": 94413296, + "router_z_loss_mlp": 0.34838867, + "step": 1136, + "time_per_iteration": 2.756463050842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090029, + "balance_loss_mlp": 1.05426645, + "epoch": 0.21873797614467103, + "flos": 408464446464.0, + "grad_norm": 0.06290439978907646, + "language_loss": 0.90092266, + "learning_rate": 0.000909466790976998, + "loss": 0.91182297, + "num_input_tokens_seen": 94475840, + "router_z_loss_mlp": 0.35791016, + "step": 1137, + "time_per_iteration": 2.5046186447143555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089401, + "balance_loss_mlp": 1.05373359, + "epoch": 0.21893035782993459, + "flos": 893824865280.0, + "grad_norm": 0.05253297698454947, + "language_loss": 0.83030021, + "learning_rate": 0.0009092879210153473, + "loss": 0.84119421, + "num_input_tokens_seen": 94555184, + "router_z_loss_mlp": 0.35668945, + "step": 1138, + "time_per_iteration": 3.1294023990631104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092735, + "balance_loss_mlp": 1.05835557, + "epoch": 0.21912273951519814, + "flos": 467392702464.0, + "grad_norm": 0.05516730570048504, + "language_loss": 0.88930631, + "learning_rate": 0.0009091088921540333, + "loss": 0.90023363, + "num_input_tokens_seen": 94622656, + "router_z_loss_mlp": 0.34399414, + "step": 1139, + "time_per_iteration": 2.5161380767822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081896, + "balance_loss_mlp": 1.06921172, + "epoch": 0.2193151212004617, + "flos": 1531262665728.0, + "grad_norm": 0.036356034107047845, + "language_loss": 0.75508678, + "learning_rate": 0.0009089297044625615, + "loss": 0.76590574, + "num_input_tokens_seen": 94856496, + "router_z_loss_mlp": 0.12695312, + "step": 1140, + "time_per_iteration": 4.929131984710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087847, + "balance_loss_mlp": 1.05306172, + "epoch": 0.2195075028857253, + "flos": 590901820416.0, + "grad_norm": 0.07364984820319191, + "language_loss": 0.8488574, + "learning_rate": 0.0009087503580104985, + "loss": 0.85973585, + "num_input_tokens_seen": 94926880, + "router_z_loss_mlp": 0.34814453, + "step": 1141, + "time_per_iteration": 2.676321029663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087752, + "balance_loss_mlp": 1.05287158, + "epoch": 0.21969988457098885, + "flos": 636033917952.0, + "grad_norm": 0.0662048159418312, + "language_loss": 0.79610777, + "learning_rate": 0.0009085708528674728, + "loss": 0.80698538, + "num_input_tokens_seen": 95000528, + "router_z_loss_mlp": 0.34912109, + "step": 1142, + "time_per_iteration": 2.7667393684387207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088746, + "balance_loss_mlp": 1.05202913, + "epoch": 0.2198922662562524, + "flos": 911974311936.0, + "grad_norm": 0.07907290305355467, + "language_loss": 0.86086833, + "learning_rate": 0.0009083911891031745, + "loss": 0.87175578, + "num_input_tokens_seen": 95081040, + "router_z_loss_mlp": 0.36743164, + "step": 1143, + "time_per_iteration": 3.1026079654693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093334, + "balance_loss_mlp": 1.05809617, + "epoch": 0.22008464794151597, + "flos": 822603409920.0, + "grad_norm": 0.06284406217527433, + "language_loss": 0.91362917, + "learning_rate": 0.0009082113667873553, + "loss": 0.92456251, + "num_input_tokens_seen": 95167328, + "router_z_loss_mlp": 0.3527832, + "step": 1144, + "time_per_iteration": 3.098741292953491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107482, + "balance_loss_mlp": 1.07188582, + "epoch": 0.22027702962677953, + "flos": 459416475648.0, + "grad_norm": 0.06625631151069579, + "language_loss": 0.90562177, + "learning_rate": 0.0009080313859898283, + "loss": 0.91669661, + "num_input_tokens_seen": 95230304, + "router_z_loss_mlp": 0.35620117, + "step": 1145, + "time_per_iteration": 2.4998207092285156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111259, + "balance_loss_mlp": 1.07606888, + "epoch": 0.2204694113120431, + "flos": 530998723584.0, + "grad_norm": 0.05051092763003013, + "language_loss": 0.91815794, + "learning_rate": 0.0009078512467804684, + "loss": 0.92927051, + "num_input_tokens_seen": 95299520, + "router_z_loss_mlp": 0.35180664, + "step": 1146, + "time_per_iteration": 2.569073438644409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117194, + "balance_loss_mlp": 1.08162224, + "epoch": 0.22066179299730665, + "flos": 522382307328.0, + "grad_norm": 0.06837547739928014, + "language_loss": 0.90610331, + "learning_rate": 0.0009076709492292119, + "loss": 0.91727525, + "num_input_tokens_seen": 95368912, + "router_z_loss_mlp": 0.35571289, + "step": 1147, + "time_per_iteration": 2.614039659500122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114459, + "balance_loss_mlp": 1.07969809, + "epoch": 0.2208541746825702, + "flos": 546188742144.0, + "grad_norm": 0.06837160959472317, + "language_loss": 0.89193797, + "learning_rate": 0.0009074904934060562, + "loss": 0.90308249, + "num_input_tokens_seen": 95440800, + "router_z_loss_mlp": 0.34790039, + "step": 1148, + "time_per_iteration": 2.6419012546539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112121, + "balance_loss_mlp": 1.08578134, + "epoch": 0.22104655636783377, + "flos": 708404742144.0, + "grad_norm": 0.07108081727062696, + "language_loss": 0.84988266, + "learning_rate": 0.0009073098793810607, + "loss": 0.86109483, + "num_input_tokens_seen": 95519904, + "router_z_loss_mlp": 0.35473633, + "step": 1149, + "time_per_iteration": 2.909515142440796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116065, + "balance_loss_mlp": 1.08061242, + "epoch": 0.22123893805309736, + "flos": 584594468352.0, + "grad_norm": 0.07695680382665727, + "language_loss": 0.88374794, + "learning_rate": 0.000907129107224346, + "loss": 0.89490861, + "num_input_tokens_seen": 95591568, + "router_z_loss_mlp": 0.35522461, + "step": 1150, + "time_per_iteration": 2.7029008865356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099143, + "balance_loss_mlp": 1.06369042, + "epoch": 0.22143131973836092, + "flos": 492002270208.0, + "grad_norm": 0.049049579749502144, + "language_loss": 0.88305712, + "learning_rate": 0.0009069481770060939, + "loss": 0.89404863, + "num_input_tokens_seen": 95664480, + "router_z_loss_mlp": 0.35498047, + "step": 1151, + "time_per_iteration": 2.65167236328125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097606, + "balance_loss_mlp": 1.06248736, + "epoch": 0.22162370142362448, + "flos": 1079227459584.0, + "grad_norm": 0.054063738490033035, + "language_loss": 0.84240663, + "learning_rate": 0.000906767088796548, + "loss": 0.85338271, + "num_input_tokens_seen": 95754400, + "router_z_loss_mlp": 0.35180664, + "step": 1152, + "time_per_iteration": 3.423985004425049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110736, + "balance_loss_mlp": 1.07300401, + "epoch": 0.22181608310888803, + "flos": 492258345984.0, + "grad_norm": 0.057939830998464815, + "language_loss": 0.87012136, + "learning_rate": 0.0009065858426660127, + "loss": 0.88119501, + "num_input_tokens_seen": 95826944, + "router_z_loss_mlp": 0.34399414, + "step": 1153, + "time_per_iteration": 2.5987319946289062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108178, + "balance_loss_mlp": 1.07355952, + "epoch": 0.2220084647941516, + "flos": 723687892992.0, + "grad_norm": 0.0653796708212952, + "language_loss": 0.84926325, + "learning_rate": 0.0009064044386848543, + "loss": 0.86034507, + "num_input_tokens_seen": 95902688, + "router_z_loss_mlp": 0.34667969, + "step": 1154, + "time_per_iteration": 2.9024224281311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112372, + "balance_loss_mlp": 1.0753932, + "epoch": 0.22220084647941515, + "flos": 488988997632.0, + "grad_norm": 0.06606878403176955, + "language_loss": 0.88905716, + "learning_rate": 0.0009062228769234997, + "loss": 0.90018088, + "num_input_tokens_seen": 95969952, + "router_z_loss_mlp": 0.36987305, + "step": 1155, + "time_per_iteration": 2.5483920574188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104305, + "balance_loss_mlp": 1.06868565, + "epoch": 0.2223932281646787, + "flos": 536025696768.0, + "grad_norm": 0.06185680569649912, + "language_loss": 0.81360811, + "learning_rate": 0.0009060411574524376, + "loss": 0.82465118, + "num_input_tokens_seen": 96037344, + "router_z_loss_mlp": 0.35644531, + "step": 1156, + "time_per_iteration": 2.629166841506958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114341, + "balance_loss_mlp": 1.0794363, + "epoch": 0.22258560984994227, + "flos": 931034580480.0, + "grad_norm": 0.06288530021121215, + "language_loss": 0.88191485, + "learning_rate": 0.0009058592803422178, + "loss": 0.8930583, + "num_input_tokens_seen": 96115616, + "router_z_loss_mlp": 0.34936523, + "step": 1157, + "time_per_iteration": 3.133453845977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219449, + "balance_loss_mlp": 1.20495331, + "epoch": 0.22277799153520586, + "flos": 1198998443520.0, + "grad_norm": 0.06392494715081258, + "language_loss": 0.78710288, + "learning_rate": 0.0009056772456634512, + "loss": 0.79929739, + "num_input_tokens_seen": 96333600, + "router_z_loss_mlp": 0.14453125, + "step": 1158, + "time_per_iteration": 4.838433027267456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095231, + "balance_loss_mlp": 1.0620904, + "epoch": 0.22297037322046942, + "flos": 501052262400.0, + "grad_norm": 0.059439708082357066, + "language_loss": 0.90095651, + "learning_rate": 0.00090549505348681, + "loss": 0.91190875, + "num_input_tokens_seen": 96402544, + "router_z_loss_mlp": 0.33154297, + "step": 1159, + "time_per_iteration": 2.561887264251709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083603, + "balance_loss_mlp": 1.04977143, + "epoch": 0.22316275490573298, + "flos": 752413612032.0, + "grad_norm": 0.05610915875378834, + "language_loss": 0.84254742, + "learning_rate": 0.0009053127038830275, + "loss": 0.85338354, + "num_input_tokens_seen": 96487600, + "router_z_loss_mlp": 0.33862305, + "step": 1160, + "time_per_iteration": 2.9465925693511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082896, + "balance_loss_mlp": 1.04844403, + "epoch": 0.22335513659099654, + "flos": 514553057280.0, + "grad_norm": 0.06657410760601727, + "language_loss": 0.87182009, + "learning_rate": 0.000905130196922898, + "loss": 0.88264906, + "num_input_tokens_seen": 96554912, + "router_z_loss_mlp": 0.3449707, + "step": 1161, + "time_per_iteration": 2.597325325012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076013, + "balance_loss_mlp": 1.04213357, + "epoch": 0.2235475182762601, + "flos": 484286501376.0, + "grad_norm": 0.057467913173926514, + "language_loss": 0.87228084, + "learning_rate": 0.0009049475326772769, + "loss": 0.88304096, + "num_input_tokens_seen": 96624192, + "router_z_loss_mlp": 0.33911133, + "step": 1162, + "time_per_iteration": 2.591592788696289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107232, + "balance_loss_mlp": 1.0379163, + "epoch": 0.22373989996152366, + "flos": 469698794496.0, + "grad_norm": 0.05481831884816676, + "language_loss": 0.83362567, + "learning_rate": 0.0009047647112170811, + "loss": 0.84434885, + "num_input_tokens_seen": 96701040, + "router_z_loss_mlp": 0.34448242, + "step": 1163, + "time_per_iteration": 2.7466936111450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080512, + "balance_loss_mlp": 1.04503536, + "epoch": 0.22393228164678722, + "flos": 1270512594432.0, + "grad_norm": 0.0775991801606853, + "language_loss": 0.87402856, + "learning_rate": 0.0009045817326132876, + "loss": 0.88483369, + "num_input_tokens_seen": 96791200, + "router_z_loss_mlp": 0.35498047, + "step": 1164, + "time_per_iteration": 3.6615524291992188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082563, + "balance_loss_mlp": 1.04665732, + "epoch": 0.22412466333205078, + "flos": 596052449280.0, + "grad_norm": 0.05603114612800397, + "language_loss": 0.83484542, + "learning_rate": 0.0009043985969369357, + "loss": 0.84567106, + "num_input_tokens_seen": 96869360, + "router_z_loss_mlp": 0.35913086, + "step": 1165, + "time_per_iteration": 2.800389528274536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084793, + "balance_loss_mlp": 1.047647, + "epoch": 0.22431704501731436, + "flos": 608136062976.0, + "grad_norm": 0.052919924442321326, + "language_loss": 0.84423298, + "learning_rate": 0.0009042153042591245, + "loss": 0.85508084, + "num_input_tokens_seen": 96945840, + "router_z_loss_mlp": 0.37158203, + "step": 1166, + "time_per_iteration": 2.7848384380340576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080872, + "balance_loss_mlp": 1.04622972, + "epoch": 0.22450942670257792, + "flos": 906203842560.0, + "grad_norm": 0.054053491984114646, + "language_loss": 0.85318398, + "learning_rate": 0.0009040318546510146, + "loss": 0.86399269, + "num_input_tokens_seen": 97029296, + "router_z_loss_mlp": 0.34667969, + "step": 1167, + "time_per_iteration": 3.1406538486480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080871, + "balance_loss_mlp": 1.04529881, + "epoch": 0.22470180838784148, + "flos": 565032222720.0, + "grad_norm": 0.06590224184570584, + "language_loss": 0.85490131, + "learning_rate": 0.0009038482481838275, + "loss": 0.86571002, + "num_input_tokens_seen": 97097776, + "router_z_loss_mlp": 0.35620117, + "step": 1168, + "time_per_iteration": 2.6623363494873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107832, + "balance_loss_mlp": 1.04265296, + "epoch": 0.22489419007310504, + "flos": 834109443072.0, + "grad_norm": 0.05295244004415107, + "language_loss": 0.87364161, + "learning_rate": 0.0009036644849288455, + "loss": 0.88442481, + "num_input_tokens_seen": 97181424, + "router_z_loss_mlp": 0.35668945, + "step": 1169, + "time_per_iteration": 3.096397638320923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082872, + "balance_loss_mlp": 1.04567838, + "epoch": 0.2250865717583686, + "flos": 580788237312.0, + "grad_norm": 0.06189616009675637, + "language_loss": 0.85257494, + "learning_rate": 0.0009034805649574118, + "loss": 0.86340362, + "num_input_tokens_seen": 97252128, + "router_z_loss_mlp": 0.37207031, + "step": 1170, + "time_per_iteration": 2.655629873275757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081931, + "balance_loss_mlp": 1.04669285, + "epoch": 0.22527895344363216, + "flos": 600091435008.0, + "grad_norm": 0.0574349936504533, + "language_loss": 0.85081124, + "learning_rate": 0.0009032964883409308, + "loss": 0.86163056, + "num_input_tokens_seen": 97326640, + "router_z_loss_mlp": 0.35253906, + "step": 1171, + "time_per_iteration": 2.9228479862213135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096453, + "balance_loss_mlp": 1.08109915, + "epoch": 0.22547133512889572, + "flos": 1440009073152.0, + "grad_norm": 0.03435009764288223, + "language_loss": 0.73050535, + "learning_rate": 0.000903112255150867, + "loss": 0.74146986, + "num_input_tokens_seen": 97553952, + "router_z_loss_mlp": 0.15332031, + "step": 1172, + "time_per_iteration": 4.9870195388793945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099751, + "balance_loss_mlp": 1.06365418, + "epoch": 0.22566371681415928, + "flos": 490377065472.0, + "grad_norm": 0.06750207251725504, + "language_loss": 0.87597418, + "learning_rate": 0.0009029278654587462, + "loss": 0.88697171, + "num_input_tokens_seen": 97623584, + "router_z_loss_mlp": 0.36108398, + "step": 1173, + "time_per_iteration": 2.545078754425049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098309, + "balance_loss_mlp": 1.06156898, + "epoch": 0.22585609849942284, + "flos": 604334214144.0, + "grad_norm": 0.06244795934891309, + "language_loss": 0.82517409, + "learning_rate": 0.0009027433193361548, + "loss": 0.8361572, + "num_input_tokens_seen": 97695952, + "router_z_loss_mlp": 0.36767578, + "step": 1174, + "time_per_iteration": 2.69753098487854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100346, + "balance_loss_mlp": 1.06305695, + "epoch": 0.22604848018468643, + "flos": 635280247296.0, + "grad_norm": 0.06854123529633785, + "language_loss": 0.87138826, + "learning_rate": 0.00090255861685474, + "loss": 0.88239175, + "num_input_tokens_seen": 97764544, + "router_z_loss_mlp": 0.37280273, + "step": 1175, + "time_per_iteration": 2.7199149131774902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094451, + "balance_loss_mlp": 1.05744886, + "epoch": 0.22624086186995, + "flos": 479633467392.0, + "grad_norm": 0.06836538183258173, + "language_loss": 0.91474092, + "learning_rate": 0.0009023737580862095, + "loss": 0.92568541, + "num_input_tokens_seen": 97830976, + "router_z_loss_mlp": 0.36962891, + "step": 1176, + "time_per_iteration": 2.51255464553833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092025, + "balance_loss_mlp": 1.0570724, + "epoch": 0.22643324355521355, + "flos": 495566982144.0, + "grad_norm": 0.05906016973995859, + "language_loss": 0.83066601, + "learning_rate": 0.0009021887431023321, + "loss": 0.84158623, + "num_input_tokens_seen": 97898800, + "router_z_loss_mlp": 0.34985352, + "step": 1177, + "time_per_iteration": 2.5783960819244385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084144, + "balance_loss_mlp": 1.04842877, + "epoch": 0.2266256252404771, + "flos": 561271071744.0, + "grad_norm": 0.05542928649781209, + "language_loss": 0.87720597, + "learning_rate": 0.0009020035719749369, + "loss": 0.8880474, + "num_input_tokens_seen": 97974112, + "router_z_loss_mlp": 0.35742188, + "step": 1178, + "time_per_iteration": 2.7076900005340576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088743, + "balance_loss_mlp": 1.05259871, + "epoch": 0.22681800692574067, + "flos": 579353527296.0, + "grad_norm": 0.05892405405909356, + "language_loss": 0.77506709, + "learning_rate": 0.0009018182447759136, + "loss": 0.78595448, + "num_input_tokens_seen": 98056640, + "router_z_loss_mlp": 0.36157227, + "step": 1179, + "time_per_iteration": 2.974362373352051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083275, + "balance_loss_mlp": 1.04798961, + "epoch": 0.22701038861100423, + "flos": 739842577920.0, + "grad_norm": 0.0555118465290956, + "language_loss": 0.80168724, + "learning_rate": 0.0009016327615772126, + "loss": 0.81252003, + "num_input_tokens_seen": 98135952, + "router_z_loss_mlp": 0.35327148, + "step": 1180, + "time_per_iteration": 2.9207658767700195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082522, + "balance_loss_mlp": 1.04776096, + "epoch": 0.2272027702962678, + "flos": 576996562944.0, + "grad_norm": 0.06857059729731818, + "language_loss": 0.88146389, + "learning_rate": 0.0009014471224508451, + "loss": 0.8922891, + "num_input_tokens_seen": 98204288, + "router_z_loss_mlp": 0.34790039, + "step": 1181, + "time_per_iteration": 2.6884429454803467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080826, + "balance_loss_mlp": 1.0466603, + "epoch": 0.22739515198153135, + "flos": 544012098048.0, + "grad_norm": 0.07386093909180869, + "language_loss": 0.83020878, + "learning_rate": 0.0009012613274688823, + "loss": 0.84101701, + "num_input_tokens_seen": 98269856, + "router_z_loss_mlp": 0.34204102, + "step": 1182, + "time_per_iteration": 2.625973701477051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082925, + "balance_loss_mlp": 1.04735291, + "epoch": 0.22758753366679493, + "flos": 439932805632.0, + "grad_norm": 0.06621157637783351, + "language_loss": 0.87839937, + "learning_rate": 0.0009010753767034565, + "loss": 0.88922858, + "num_input_tokens_seen": 98335632, + "router_z_loss_mlp": 0.35571289, + "step": 1183, + "time_per_iteration": 2.545569658279419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086712, + "balance_loss_mlp": 1.05030489, + "epoch": 0.2277799153520585, + "flos": 729104772096.0, + "grad_norm": 0.07242159959797279, + "language_loss": 0.79501748, + "learning_rate": 0.0009008892702267599, + "loss": 0.80588454, + "num_input_tokens_seen": 98420592, + "router_z_loss_mlp": 0.36425781, + "step": 1184, + "time_per_iteration": 2.9862120151519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089174, + "balance_loss_mlp": 1.05322075, + "epoch": 0.22797229703732205, + "flos": 526641053184.0, + "grad_norm": 0.0740207336504876, + "language_loss": 0.89059424, + "learning_rate": 0.0009007030081110457, + "loss": 0.90148592, + "num_input_tokens_seen": 98488096, + "router_z_loss_mlp": 0.35961914, + "step": 1185, + "time_per_iteration": 2.6184284687042236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083236, + "balance_loss_mlp": 1.04783034, + "epoch": 0.2281646787225856, + "flos": 535159954944.0, + "grad_norm": 0.06479663876551665, + "language_loss": 0.84969211, + "learning_rate": 0.000900516590428627, + "loss": 0.86052454, + "num_input_tokens_seen": 98561664, + "router_z_loss_mlp": 0.35449219, + "step": 1186, + "time_per_iteration": 2.724161386489868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083897, + "balance_loss_mlp": 1.049088, + "epoch": 0.22835706040784917, + "flos": 541107924480.0, + "grad_norm": 0.052728830082858405, + "language_loss": 0.89177948, + "learning_rate": 0.0009003300172518778, + "loss": 0.90261841, + "num_input_tokens_seen": 98634336, + "router_z_loss_mlp": 0.34790039, + "step": 1187, + "time_per_iteration": 2.6810121536254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108576, + "balance_loss_mlp": 1.05018783, + "epoch": 0.22854944209311273, + "flos": 790297012224.0, + "grad_norm": 0.05376177869775473, + "language_loss": 0.84676045, + "learning_rate": 0.0009001432886532321, + "loss": 0.85761803, + "num_input_tokens_seen": 98709600, + "router_z_loss_mlp": 0.35571289, + "step": 1188, + "time_per_iteration": 2.977433919906616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109735, + "balance_loss_mlp": 1.0614686, + "epoch": 0.2287418237783763, + "flos": 469047020544.0, + "grad_norm": 0.06589135500726684, + "language_loss": 0.86752445, + "learning_rate": 0.0008999564047051843, + "loss": 0.87849802, + "num_input_tokens_seen": 98775024, + "router_z_loss_mlp": 0.35913086, + "step": 1189, + "time_per_iteration": 2.5126237869262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094774, + "balance_loss_mlp": 1.06017935, + "epoch": 0.22893420546363985, + "flos": 467786990592.0, + "grad_norm": 0.061551577223012334, + "language_loss": 0.84713042, + "learning_rate": 0.0008997693654802894, + "loss": 0.85807812, + "num_input_tokens_seen": 98845248, + "router_z_loss_mlp": 0.34643555, + "step": 1190, + "time_per_iteration": 2.6570322513580322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088276, + "balance_loss_mlp": 1.05318046, + "epoch": 0.22912658714890344, + "flos": 625974179328.0, + "grad_norm": 0.05326512300588333, + "language_loss": 0.86549705, + "learning_rate": 0.0008995821710511625, + "loss": 0.87637979, + "num_input_tokens_seen": 98913584, + "router_z_loss_mlp": 0.35107422, + "step": 1191, + "time_per_iteration": 2.8115806579589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108036, + "balance_loss_mlp": 1.04731488, + "epoch": 0.229318968834167, + "flos": 502785156096.0, + "grad_norm": 0.06330680163661413, + "language_loss": 0.8511278, + "learning_rate": 0.0008993948214904786, + "loss": 0.86193144, + "num_input_tokens_seen": 98978608, + "router_z_loss_mlp": 0.33056641, + "step": 1192, + "time_per_iteration": 2.546410083770752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125589, + "balance_loss_mlp": 1.11023474, + "epoch": 0.22951135051943056, + "flos": 1374108544512.0, + "grad_norm": 0.06153086464986019, + "language_loss": 0.78422213, + "learning_rate": 0.0008992073168709733, + "loss": 0.79547799, + "num_input_tokens_seen": 99207424, + "router_z_loss_mlp": 0.15332031, + "step": 1193, + "time_per_iteration": 4.891269207000732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099997, + "balance_loss_mlp": 1.06306624, + "epoch": 0.22970373220469412, + "flos": 644045050368.0, + "grad_norm": 0.06658536787009234, + "language_loss": 0.79028845, + "learning_rate": 0.0008990196572654427, + "loss": 0.80128849, + "num_input_tokens_seen": 99290592, + "router_z_loss_mlp": 0.36914062, + "step": 1194, + "time_per_iteration": 2.8504366874694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100467, + "balance_loss_mlp": 1.06582475, + "epoch": 0.22989611388995768, + "flos": 499945001472.0, + "grad_norm": 0.048025217626556156, + "language_loss": 0.87748766, + "learning_rate": 0.0008988318427467426, + "loss": 0.88849235, + "num_input_tokens_seen": 99366096, + "router_z_loss_mlp": 0.34667969, + "step": 1195, + "time_per_iteration": 2.735084056854248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099859, + "balance_loss_mlp": 1.06524014, + "epoch": 0.23008849557522124, + "flos": 1096071796224.0, + "grad_norm": 0.06731751876810108, + "language_loss": 0.86263168, + "learning_rate": 0.0008986438733877887, + "loss": 0.87363023, + "num_input_tokens_seen": 99456768, + "router_z_loss_mlp": 0.34667969, + "step": 1196, + "time_per_iteration": 3.435035228729248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100924, + "balance_loss_mlp": 1.06606746, + "epoch": 0.2302808772604848, + "flos": 683313546240.0, + "grad_norm": 0.04733445604251135, + "language_loss": 0.84099567, + "learning_rate": 0.0008984557492615576, + "loss": 0.85200489, + "num_input_tokens_seen": 99539616, + "router_z_loss_mlp": 0.34887695, + "step": 1197, + "time_per_iteration": 2.927668809890747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107534, + "balance_loss_mlp": 1.07317793, + "epoch": 0.23047325894574835, + "flos": 528664928256.0, + "grad_norm": 0.0630370564804667, + "language_loss": 0.89949608, + "learning_rate": 0.0008982674704410854, + "loss": 0.91057146, + "num_input_tokens_seen": 99612064, + "router_z_loss_mlp": 0.34399414, + "step": 1198, + "time_per_iteration": 2.691016435623169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107479, + "balance_loss_mlp": 1.07228875, + "epoch": 0.23066564063101191, + "flos": 682427455488.0, + "grad_norm": 0.06209648084563375, + "language_loss": 0.77829844, + "learning_rate": 0.0008980790369994682, + "loss": 0.78937328, + "num_input_tokens_seen": 99691040, + "router_z_loss_mlp": 0.35205078, + "step": 1199, + "time_per_iteration": 2.9320883750915527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100492, + "balance_loss_mlp": 1.06525421, + "epoch": 0.2308580223162755, + "flos": 558247624704.0, + "grad_norm": 0.09180159748966574, + "language_loss": 0.87396461, + "learning_rate": 0.000897890449009863, + "loss": 0.88496947, + "num_input_tokens_seen": 99762016, + "router_z_loss_mlp": 0.3527832, + "step": 1200, + "time_per_iteration": 2.6804869174957275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096306, + "balance_loss_mlp": 1.06183052, + "epoch": 0.23105040400153906, + "flos": 555406060032.0, + "grad_norm": 0.05982856494430897, + "language_loss": 0.90313268, + "learning_rate": 0.0008977017065454853, + "loss": 0.9140957, + "num_input_tokens_seen": 99835552, + "router_z_loss_mlp": 0.3449707, + "step": 1201, + "time_per_iteration": 2.639636754989624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090282, + "balance_loss_mlp": 1.05556786, + "epoch": 0.23124278568680262, + "flos": 704474855424.0, + "grad_norm": 0.06077351963181601, + "language_loss": 0.80804175, + "learning_rate": 0.0008975128096796121, + "loss": 0.81894457, + "num_input_tokens_seen": 99910784, + "router_z_loss_mlp": 0.34765625, + "step": 1202, + "time_per_iteration": 2.8410260677337646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078695, + "balance_loss_mlp": 1.04431474, + "epoch": 0.23143516737206618, + "flos": 612469002240.0, + "grad_norm": 0.07413481943536562, + "language_loss": 0.85940087, + "learning_rate": 0.0008973237584855794, + "loss": 0.87018776, + "num_input_tokens_seen": 99991120, + "router_z_loss_mlp": 0.34423828, + "step": 1203, + "time_per_iteration": 2.898423671722412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077022, + "balance_loss_mlp": 1.04233205, + "epoch": 0.23162754905732974, + "flos": 389030238720.0, + "grad_norm": 0.06038618944932519, + "language_loss": 0.8201915, + "learning_rate": 0.0008971345530367832, + "loss": 0.83096182, + "num_input_tokens_seen": 100053888, + "router_z_loss_mlp": 0.34716797, + "step": 1204, + "time_per_iteration": 2.486668586730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082339, + "balance_loss_mlp": 1.04738712, + "epoch": 0.2318199307425933, + "flos": 667481928192.0, + "grad_norm": 0.05427081728260985, + "language_loss": 0.85029405, + "learning_rate": 0.0008969451934066799, + "loss": 0.86111748, + "num_input_tokens_seen": 100124176, + "router_z_loss_mlp": 0.34960938, + "step": 1205, + "time_per_iteration": 2.771306276321411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091653, + "balance_loss_mlp": 1.05662966, + "epoch": 0.23201231242785686, + "flos": 666093860352.0, + "grad_norm": 0.0707913589572404, + "language_loss": 0.80143172, + "learning_rate": 0.0008967556796687854, + "loss": 0.81234825, + "num_input_tokens_seen": 100205296, + "router_z_loss_mlp": 0.35058594, + "step": 1206, + "time_per_iteration": 2.8904309272766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087698, + "balance_loss_mlp": 1.05353224, + "epoch": 0.23220469411312042, + "flos": 748498281984.0, + "grad_norm": 0.05559113870944949, + "language_loss": 0.83954245, + "learning_rate": 0.0008965660118966752, + "loss": 0.8504194, + "num_input_tokens_seen": 100279440, + "router_z_loss_mlp": 0.34204102, + "step": 1207, + "time_per_iteration": 2.9140615463256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108846, + "balance_loss_mlp": 1.05529559, + "epoch": 0.232397075798384, + "flos": 666763163136.0, + "grad_norm": 0.04975334384076733, + "language_loss": 0.90441763, + "learning_rate": 0.0008963761901639851, + "loss": 0.91530222, + "num_input_tokens_seen": 100354512, + "router_z_loss_mlp": 0.33154297, + "step": 1208, + "time_per_iteration": 2.8032286167144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094486, + "balance_loss_mlp": 1.06008244, + "epoch": 0.23258945748364757, + "flos": 609937357824.0, + "grad_norm": 0.05840728669351643, + "language_loss": 0.83201033, + "learning_rate": 0.0008961862145444103, + "loss": 0.84295517, + "num_input_tokens_seen": 100426848, + "router_z_loss_mlp": 0.34399414, + "step": 1209, + "time_per_iteration": 2.7161943912506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094305, + "balance_loss_mlp": 1.05954397, + "epoch": 0.23278183916891113, + "flos": 489397842432.0, + "grad_norm": 0.06743904317466738, + "language_loss": 0.85216832, + "learning_rate": 0.0008959960851117059, + "loss": 0.86311138, + "num_input_tokens_seen": 100496176, + "router_z_loss_mlp": 0.34790039, + "step": 1210, + "time_per_iteration": 2.5817031860351562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095047, + "balance_loss_mlp": 1.06014228, + "epoch": 0.23297422085417469, + "flos": 511314232320.0, + "grad_norm": 0.057575168534165826, + "language_loss": 0.84338427, + "learning_rate": 0.0008958058019396868, + "loss": 0.85433477, + "num_input_tokens_seen": 100575072, + "router_z_loss_mlp": 0.34936523, + "step": 1211, + "time_per_iteration": 2.7788164615631104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091213, + "balance_loss_mlp": 1.05730987, + "epoch": 0.23316660253943824, + "flos": 546145072128.0, + "grad_norm": 0.057082370400879795, + "language_loss": 0.86897939, + "learning_rate": 0.0008956153651022274, + "loss": 0.87989151, + "num_input_tokens_seen": 100648304, + "router_z_loss_mlp": 0.33935547, + "step": 1212, + "time_per_iteration": 2.7309062480926514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090204, + "balance_loss_mlp": 1.05608642, + "epoch": 0.2333589842247018, + "flos": 509998947840.0, + "grad_norm": 0.06317396982696966, + "language_loss": 0.84641176, + "learning_rate": 0.0008954247746732618, + "loss": 0.85731381, + "num_input_tokens_seen": 100717616, + "router_z_loss_mlp": 0.34155273, + "step": 1213, + "time_per_iteration": 2.619058609008789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084199, + "balance_loss_mlp": 1.05072522, + "epoch": 0.23355136590996536, + "flos": 662834686464.0, + "grad_norm": 0.09780220222501788, + "language_loss": 0.90954423, + "learning_rate": 0.0008952340307267837, + "loss": 0.9203862, + "num_input_tokens_seen": 100797056, + "router_z_loss_mlp": 0.33496094, + "step": 1214, + "time_per_iteration": 2.869351387023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108593, + "balance_loss_mlp": 1.05128753, + "epoch": 0.23374374759522892, + "flos": 508206417408.0, + "grad_norm": 0.061496426320555984, + "language_loss": 0.83373952, + "learning_rate": 0.0008950431333368468, + "loss": 0.84459883, + "num_input_tokens_seen": 100863632, + "router_z_loss_mlp": 0.34667969, + "step": 1215, + "time_per_iteration": 2.5557806491851807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093645, + "balance_loss_mlp": 1.05928898, + "epoch": 0.2339361292804925, + "flos": 1293964028928.0, + "grad_norm": 0.062331860667319446, + "language_loss": 0.84730738, + "learning_rate": 0.0008948520825775634, + "loss": 0.85824382, + "num_input_tokens_seen": 100950272, + "router_z_loss_mlp": 0.34399414, + "step": 1216, + "time_per_iteration": 3.6050164699554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098293, + "balance_loss_mlp": 1.06343639, + "epoch": 0.23412851096575607, + "flos": 705617021952.0, + "grad_norm": 0.06500023378725601, + "language_loss": 0.84162283, + "learning_rate": 0.0008946608785231067, + "loss": 0.8526057, + "num_input_tokens_seen": 101031008, + "router_z_loss_mlp": 0.34863281, + "step": 1217, + "time_per_iteration": 2.858696699142456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109995, + "balance_loss_mlp": 1.065045, + "epoch": 0.23432089265101963, + "flos": 438036968448.0, + "grad_norm": 0.06356573317347913, + "language_loss": 0.84325957, + "learning_rate": 0.0008944695212477084, + "loss": 0.85425907, + "num_input_tokens_seen": 101094688, + "router_z_loss_mlp": 0.34912109, + "step": 1218, + "time_per_iteration": 2.4787168502807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107192, + "balance_loss_mlp": 1.07190585, + "epoch": 0.2345132743362832, + "flos": 480697058304.0, + "grad_norm": 0.05460931090439532, + "language_loss": 0.86098325, + "learning_rate": 0.0008942780108256599, + "loss": 0.87205517, + "num_input_tokens_seen": 101163744, + "router_z_loss_mlp": 0.35327148, + "step": 1219, + "time_per_iteration": 2.574692726135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105521, + "balance_loss_mlp": 1.06966305, + "epoch": 0.23470565602154675, + "flos": 411231817728.0, + "grad_norm": 0.05853057396081394, + "language_loss": 0.86360055, + "learning_rate": 0.0008940863473313121, + "loss": 0.87465572, + "num_input_tokens_seen": 101226480, + "router_z_loss_mlp": 0.35839844, + "step": 1220, + "time_per_iteration": 2.4849462509155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115361, + "balance_loss_mlp": 1.08024168, + "epoch": 0.2348980377068103, + "flos": 545189170176.0, + "grad_norm": 0.0745659618807548, + "language_loss": 0.87691534, + "learning_rate": 0.0008938945308390756, + "loss": 0.88806891, + "num_input_tokens_seen": 101291824, + "router_z_loss_mlp": 0.3515625, + "step": 1221, + "time_per_iteration": 2.6100285053253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107677, + "balance_loss_mlp": 1.07248664, + "epoch": 0.23509041939207387, + "flos": 575465900544.0, + "grad_norm": 0.055913245264753976, + "language_loss": 0.87316763, + "learning_rate": 0.00089370256142342, + "loss": 0.88424438, + "num_input_tokens_seen": 101367216, + "router_z_loss_mlp": 0.35205078, + "step": 1222, + "time_per_iteration": 2.726897716522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109541, + "balance_loss_mlp": 1.06007659, + "epoch": 0.23528280107733743, + "flos": 588568025088.0, + "grad_norm": 0.04976165943815558, + "language_loss": 0.85095507, + "learning_rate": 0.0008935104391588746, + "loss": 0.86190915, + "num_input_tokens_seen": 101438992, + "router_z_loss_mlp": 0.35351562, + "step": 1223, + "time_per_iteration": 2.7249879837036133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108966, + "balance_loss_mlp": 1.07313156, + "epoch": 0.235475182762601, + "flos": 822948235776.0, + "grad_norm": 0.05651852634602403, + "language_loss": 0.82749176, + "learning_rate": 0.0008933181641200276, + "loss": 0.83858138, + "num_input_tokens_seen": 101534464, + "router_z_loss_mlp": 0.35839844, + "step": 1224, + "time_per_iteration": 3.1651737689971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094197, + "balance_loss_mlp": 1.06017447, + "epoch": 0.23566756444786457, + "flos": 679865287680.0, + "grad_norm": 0.06356150049585653, + "language_loss": 0.8609674, + "learning_rate": 0.0008931257363815271, + "loss": 0.87190938, + "num_input_tokens_seen": 101616496, + "router_z_loss_mlp": 0.34033203, + "step": 1225, + "time_per_iteration": 2.891789674758911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091395, + "balance_loss_mlp": 1.05711007, + "epoch": 0.23585994613312813, + "flos": 701481931776.0, + "grad_norm": 0.04853721262867189, + "language_loss": 0.89892405, + "learning_rate": 0.0008929331560180798, + "loss": 0.90983796, + "num_input_tokens_seen": 101694496, + "router_z_loss_mlp": 0.34277344, + "step": 1226, + "time_per_iteration": 2.934101104736328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093973, + "balance_loss_mlp": 1.06002271, + "epoch": 0.2360523278183917, + "flos": 523923144192.0, + "grad_norm": 0.06491881814379113, + "language_loss": 0.91129786, + "learning_rate": 0.0008927404231044525, + "loss": 0.92223763, + "num_input_tokens_seen": 101766160, + "router_z_loss_mlp": 0.33984375, + "step": 1227, + "time_per_iteration": 2.682377815246582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083665, + "balance_loss_mlp": 1.0493325, + "epoch": 0.23624470950365525, + "flos": 524027860992.0, + "grad_norm": 0.053423388326064705, + "language_loss": 0.81944436, + "learning_rate": 0.0008925475377154703, + "loss": 0.83028102, + "num_input_tokens_seen": 101844160, + "router_z_loss_mlp": 0.34375, + "step": 1228, + "time_per_iteration": 2.7511117458343506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077291, + "balance_loss_mlp": 1.04169512, + "epoch": 0.2364370911889188, + "flos": 596525313024.0, + "grad_norm": 0.05836717970983508, + "language_loss": 0.8241868, + "learning_rate": 0.0008923544999260183, + "loss": 0.83495975, + "num_input_tokens_seen": 101917968, + "router_z_loss_mlp": 0.35644531, + "step": 1229, + "time_per_iteration": 2.7915079593658447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087923, + "balance_loss_mlp": 1.05194569, + "epoch": 0.23662947287418237, + "flos": 756519588864.0, + "grad_norm": 0.08156392485297027, + "language_loss": 0.91757852, + "learning_rate": 0.00089216130981104, + "loss": 0.92845774, + "num_input_tokens_seen": 101996880, + "router_z_loss_mlp": 0.35986328, + "step": 1230, + "time_per_iteration": 3.037900924682617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089884, + "balance_loss_mlp": 1.0531199, + "epoch": 0.23682185455944593, + "flos": 545907935232.0, + "grad_norm": 0.05473268619072285, + "language_loss": 0.82659578, + "learning_rate": 0.000891967967445539, + "loss": 0.83749461, + "num_input_tokens_seen": 102067936, + "router_z_loss_mlp": 0.36743164, + "step": 1231, + "time_per_iteration": 2.6595497131347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088497, + "balance_loss_mlp": 1.05201924, + "epoch": 0.2370142362447095, + "flos": 661977709056.0, + "grad_norm": 0.04604146434030928, + "language_loss": 0.88502473, + "learning_rate": 0.0008917744729045772, + "loss": 0.89590967, + "num_input_tokens_seen": 102147552, + "router_z_loss_mlp": 0.36499023, + "step": 1232, + "time_per_iteration": 2.851651668548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095833, + "balance_loss_mlp": 1.05868709, + "epoch": 0.23720661792997308, + "flos": 683361598464.0, + "grad_norm": 0.06835104069372223, + "language_loss": 0.84165156, + "learning_rate": 0.0008915808262632757, + "loss": 0.85260987, + "num_input_tokens_seen": 102224480, + "router_z_loss_mlp": 0.37133789, + "step": 1233, + "time_per_iteration": 2.8114235401153564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095976, + "balance_loss_mlp": 1.05892599, + "epoch": 0.23739899961523664, + "flos": 558631738368.0, + "grad_norm": 0.055258261409357204, + "language_loss": 0.92769438, + "learning_rate": 0.0008913870275968148, + "loss": 0.93865418, + "num_input_tokens_seen": 102297392, + "router_z_loss_mlp": 0.37036133, + "step": 1234, + "time_per_iteration": 2.705349922180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092916, + "balance_loss_mlp": 1.05629516, + "epoch": 0.2375913813005002, + "flos": 889144128000.0, + "grad_norm": 0.12876854850300654, + "language_loss": 0.87540263, + "learning_rate": 0.0008911930769804342, + "loss": 0.8863318, + "num_input_tokens_seen": 102386032, + "router_z_loss_mlp": 0.36621094, + "step": 1235, + "time_per_iteration": 3.2342941761016846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091192, + "balance_loss_mlp": 1.05492854, + "epoch": 0.23778376298576376, + "flos": 640810607616.0, + "grad_norm": 0.044375832072417805, + "language_loss": 0.91481459, + "learning_rate": 0.0008909989744894318, + "loss": 0.92572653, + "num_input_tokens_seen": 102463504, + "router_z_loss_mlp": 0.36303711, + "step": 1236, + "time_per_iteration": 2.8858232498168945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089597, + "balance_loss_mlp": 1.05333364, + "epoch": 0.23797614467102732, + "flos": 616540073472.0, + "grad_norm": 0.05892762197337364, + "language_loss": 0.81707233, + "learning_rate": 0.0008908047201991649, + "loss": 0.82796836, + "num_input_tokens_seen": 102529632, + "router_z_loss_mlp": 0.36279297, + "step": 1237, + "time_per_iteration": 2.785226583480835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085624, + "balance_loss_mlp": 1.05071974, + "epoch": 0.23816852635629088, + "flos": 623941539840.0, + "grad_norm": 0.051487502947417364, + "language_loss": 0.86561942, + "learning_rate": 0.0008906103141850502, + "loss": 0.87647569, + "num_input_tokens_seen": 102610192, + "router_z_loss_mlp": 0.34960938, + "step": 1238, + "time_per_iteration": 2.868241310119629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095513, + "balance_loss_mlp": 1.05901158, + "epoch": 0.23836090804155444, + "flos": 521180504064.0, + "grad_norm": 0.07170300234131513, + "language_loss": 0.88119614, + "learning_rate": 0.0008904157565225621, + "loss": 0.89215136, + "num_input_tokens_seen": 102681216, + "router_z_loss_mlp": 0.36499023, + "step": 1239, + "time_per_iteration": 2.610048294067383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092952, + "balance_loss_mlp": 1.05716562, + "epoch": 0.238553289726818, + "flos": 1153527616512.0, + "grad_norm": 0.07764557472008667, + "language_loss": 0.82042629, + "learning_rate": 0.000890221047287235, + "loss": 0.83135581, + "num_input_tokens_seen": 102777184, + "router_z_loss_mlp": 0.3581543, + "step": 1240, + "time_per_iteration": 3.547147274017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102634, + "balance_loss_mlp": 1.06772995, + "epoch": 0.23874567141208156, + "flos": 499600175616.0, + "grad_norm": 0.07123563443936186, + "language_loss": 0.91052604, + "learning_rate": 0.0008900261865546615, + "loss": 0.92155242, + "num_input_tokens_seen": 102845744, + "router_z_loss_mlp": 0.34936523, + "step": 1241, + "time_per_iteration": 2.6277406215667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110411, + "balance_loss_mlp": 1.06768012, + "epoch": 0.23893805309734514, + "flos": 556657325568.0, + "grad_norm": 0.08027126565183675, + "language_loss": 0.84991688, + "learning_rate": 0.0008898311744004936, + "loss": 0.86095798, + "num_input_tokens_seen": 102918064, + "router_z_loss_mlp": 0.36425781, + "step": 1242, + "time_per_iteration": 2.687009811401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112655, + "balance_loss_mlp": 1.07708287, + "epoch": 0.2391304347826087, + "flos": 549009957888.0, + "grad_norm": 0.05686617926086787, + "language_loss": 0.86918116, + "learning_rate": 0.0008896360109004414, + "loss": 0.88030773, + "num_input_tokens_seen": 102983920, + "router_z_loss_mlp": 0.35595703, + "step": 1243, + "time_per_iteration": 2.6292564868927 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111871, + "balance_loss_mlp": 1.07629931, + "epoch": 0.23932281646787226, + "flos": 515794148352.0, + "grad_norm": 0.05075175877282041, + "language_loss": 0.84481502, + "learning_rate": 0.0008894406961302742, + "loss": 0.85593379, + "num_input_tokens_seen": 103053328, + "router_z_loss_mlp": 0.35595703, + "step": 1244, + "time_per_iteration": 2.5960640907287598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122594, + "balance_loss_mlp": 1.08737969, + "epoch": 0.23951519815313582, + "flos": 743353445376.0, + "grad_norm": 0.06488001286924965, + "language_loss": 0.84053004, + "learning_rate": 0.0008892452301658201, + "loss": 0.85175598, + "num_input_tokens_seen": 103128208, + "router_z_loss_mlp": 0.35253906, + "step": 1245, + "time_per_iteration": 2.9320998191833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123528, + "balance_loss_mlp": 1.08728814, + "epoch": 0.23970757983839938, + "flos": 553855048704.0, + "grad_norm": 0.05553543969160018, + "language_loss": 0.83631629, + "learning_rate": 0.0008890496130829653, + "loss": 0.84755158, + "num_input_tokens_seen": 103197392, + "router_z_loss_mlp": 0.36230469, + "step": 1246, + "time_per_iteration": 2.6420071125030518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123934, + "balance_loss_mlp": 1.08802795, + "epoch": 0.23989996152366294, + "flos": 480416251392.0, + "grad_norm": 0.0595921721906752, + "language_loss": 0.85551775, + "learning_rate": 0.0008888538449576555, + "loss": 0.86675715, + "num_input_tokens_seen": 103265328, + "router_z_loss_mlp": 0.359375, + "step": 1247, + "time_per_iteration": 2.544706344604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123543, + "balance_loss_mlp": 1.08687472, + "epoch": 0.2400923432089265, + "flos": 485069285376.0, + "grad_norm": 0.06973867138143126, + "language_loss": 0.82958472, + "learning_rate": 0.0008886579258658944, + "loss": 0.84082007, + "num_input_tokens_seen": 103331632, + "router_z_loss_mlp": 0.36669922, + "step": 1248, + "time_per_iteration": 2.5460424423217773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108744, + "balance_loss_mlp": 1.0724808, + "epoch": 0.24028472489419006, + "flos": 623247505920.0, + "grad_norm": 0.04817062293972818, + "language_loss": 0.85353303, + "learning_rate": 0.0008884618558837446, + "loss": 0.86462045, + "num_input_tokens_seen": 103405408, + "router_z_loss_mlp": 0.36279297, + "step": 1249, + "time_per_iteration": 2.80222487449646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125107, + "balance_loss_mlp": 1.08765173, + "epoch": 0.24047710657945365, + "flos": 601302002688.0, + "grad_norm": 0.052194699096834656, + "language_loss": 0.86387813, + "learning_rate": 0.0008882656350873273, + "loss": 0.87512922, + "num_input_tokens_seen": 103487216, + "router_z_loss_mlp": 0.37426758, + "step": 1250, + "time_per_iteration": 2.830887794494629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136265, + "balance_loss_mlp": 1.09911942, + "epoch": 0.2406694882647172, + "flos": 841199579136.0, + "grad_norm": 0.07156482775024936, + "language_loss": 0.86951184, + "learning_rate": 0.0008880692635528219, + "loss": 0.88087451, + "num_input_tokens_seen": 103568640, + "router_z_loss_mlp": 0.37109375, + "step": 1251, + "time_per_iteration": 3.0439021587371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140454, + "balance_loss_mlp": 1.10450029, + "epoch": 0.24086186994998077, + "flos": 526789440000.0, + "grad_norm": 0.062254670736574515, + "language_loss": 0.89187038, + "learning_rate": 0.0008878727413564669, + "loss": 0.90327489, + "num_input_tokens_seen": 103640784, + "router_z_loss_mlp": 0.36010742, + "step": 1252, + "time_per_iteration": 2.7363240718841553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094248, + "balance_loss_mlp": 1.07717752, + "epoch": 0.24105425163524433, + "flos": 1337464673280.0, + "grad_norm": 0.032126183170312766, + "language_loss": 0.80135596, + "learning_rate": 0.0008876760685745588, + "loss": 0.81229842, + "num_input_tokens_seen": 103865824, + "router_z_loss_mlp": 0.17089844, + "step": 1253, + "time_per_iteration": 4.8370680809021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175937, + "balance_loss_mlp": 1.13755202, + "epoch": 0.24124663332050789, + "flos": 613822164480.0, + "grad_norm": 0.06436318886622608, + "language_loss": 0.78452635, + "learning_rate": 0.0008874792452834528, + "loss": 0.79628575, + "num_input_tokens_seen": 103939872, + "router_z_loss_mlp": 0.38354492, + "step": 1254, + "time_per_iteration": 2.724947452545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151033, + "balance_loss_mlp": 1.11381602, + "epoch": 0.24143901500577145, + "flos": 575278225920.0, + "grad_norm": 0.08996846201845816, + "language_loss": 0.87516546, + "learning_rate": 0.0008872822715595626, + "loss": 0.88667583, + "num_input_tokens_seen": 104011120, + "router_z_loss_mlp": 0.37207031, + "step": 1255, + "time_per_iteration": 2.676539659500122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118396, + "balance_loss_mlp": 1.08275259, + "epoch": 0.241631396691035, + "flos": 494941349376.0, + "grad_norm": 0.06475486920780314, + "language_loss": 0.87080252, + "learning_rate": 0.0008870851474793598, + "loss": 0.88198644, + "num_input_tokens_seen": 104077040, + "router_z_loss_mlp": 0.35668945, + "step": 1256, + "time_per_iteration": 2.5523862838745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108714, + "balance_loss_mlp": 1.07287991, + "epoch": 0.24182377837629856, + "flos": 635891323392.0, + "grad_norm": 0.0627898868455093, + "language_loss": 0.89724898, + "learning_rate": 0.0008868878731193752, + "loss": 0.90833616, + "num_input_tokens_seen": 104150880, + "router_z_loss_mlp": 0.35888672, + "step": 1257, + "time_per_iteration": 2.8152451515197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094242, + "balance_loss_mlp": 1.05933762, + "epoch": 0.24201616006156215, + "flos": 514938580992.0, + "grad_norm": 0.06450256361572139, + "language_loss": 0.89708877, + "learning_rate": 0.0008866904485561973, + "loss": 0.90803117, + "num_input_tokens_seen": 104223696, + "router_z_loss_mlp": 0.34936523, + "step": 1258, + "time_per_iteration": 2.7304298877716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095529, + "balance_loss_mlp": 1.05945659, + "epoch": 0.2422085417468257, + "flos": 614837703168.0, + "grad_norm": 0.05809143078881904, + "language_loss": 0.83024096, + "learning_rate": 0.000886492873866473, + "loss": 0.8411963, + "num_input_tokens_seen": 104301728, + "router_z_loss_mlp": 0.36108398, + "step": 1259, + "time_per_iteration": 2.828904628753662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090865, + "balance_loss_mlp": 1.05576968, + "epoch": 0.24240092343208927, + "flos": 585515464704.0, + "grad_norm": 0.07568124142212555, + "language_loss": 0.84760439, + "learning_rate": 0.000886295149126908, + "loss": 0.85851306, + "num_input_tokens_seen": 104374480, + "router_z_loss_mlp": 0.35131836, + "step": 1260, + "time_per_iteration": 2.7011313438415527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109068, + "balance_loss_mlp": 1.05537009, + "epoch": 0.24259330511735283, + "flos": 761930675712.0, + "grad_norm": 0.05459059834864095, + "language_loss": 0.85652769, + "learning_rate": 0.0008860972744142655, + "loss": 0.8674345, + "num_input_tokens_seen": 104452384, + "router_z_loss_mlp": 0.35327148, + "step": 1261, + "time_per_iteration": 2.9039082527160645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084953, + "balance_loss_mlp": 1.05028725, + "epoch": 0.2427856868026164, + "flos": 626566316544.0, + "grad_norm": 0.06267274795834049, + "language_loss": 0.8183161, + "learning_rate": 0.0008858992498053671, + "loss": 0.82916564, + "num_input_tokens_seen": 104532576, + "router_z_loss_mlp": 0.34692383, + "step": 1262, + "time_per_iteration": 2.8293697834014893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080455, + "balance_loss_mlp": 1.06586385, + "epoch": 0.24297806848787995, + "flos": 1510840470528.0, + "grad_norm": 0.02756761643082338, + "language_loss": 0.7658875, + "learning_rate": 0.0008857010753770934, + "loss": 0.77669203, + "num_input_tokens_seen": 104765216, + "router_z_loss_mlp": 0.14550781, + "step": 1263, + "time_per_iteration": 4.8116748332977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087756, + "balance_loss_mlp": 1.05328119, + "epoch": 0.2431704501731435, + "flos": 541669538304.0, + "grad_norm": 0.05501719814044903, + "language_loss": 0.83684969, + "learning_rate": 0.0008855027512063817, + "loss": 0.8477273, + "num_input_tokens_seen": 104836912, + "router_z_loss_mlp": 0.3449707, + "step": 1264, + "time_per_iteration": 2.6995394229888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084053, + "balance_loss_mlp": 1.0493865, + "epoch": 0.24336283185840707, + "flos": 523588492800.0, + "grad_norm": 0.06804757776515359, + "language_loss": 0.85974693, + "learning_rate": 0.0008853042773702292, + "loss": 0.87058747, + "num_input_tokens_seen": 104909280, + "router_z_loss_mlp": 0.34692383, + "step": 1265, + "time_per_iteration": 2.683969497680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086337, + "balance_loss_mlp": 1.05002618, + "epoch": 0.24355521354367063, + "flos": 536839004160.0, + "grad_norm": 0.05444938358074035, + "language_loss": 0.87678754, + "learning_rate": 0.0008851056539456896, + "loss": 0.88765097, + "num_input_tokens_seen": 104982560, + "router_z_loss_mlp": 0.36303711, + "step": 1266, + "time_per_iteration": 2.674891471862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010835, + "balance_loss_mlp": 1.04830909, + "epoch": 0.24374759522893422, + "flos": 930050975232.0, + "grad_norm": 0.04940136823280911, + "language_loss": 0.82195789, + "learning_rate": 0.0008849068810098755, + "loss": 0.83279288, + "num_input_tokens_seen": 105075056, + "router_z_loss_mlp": 0.35229492, + "step": 1267, + "time_per_iteration": 3.27172589302063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083691, + "balance_loss_mlp": 1.04859626, + "epoch": 0.24393997691419778, + "flos": 427564002816.0, + "grad_norm": 0.07591960175092535, + "language_loss": 0.83287823, + "learning_rate": 0.0008847079586399575, + "loss": 0.84371519, + "num_input_tokens_seen": 105137536, + "router_z_loss_mlp": 0.35131836, + "step": 1268, + "time_per_iteration": 2.4539763927459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010876, + "balance_loss_mlp": 1.05281472, + "epoch": 0.24413235859946134, + "flos": 578582479872.0, + "grad_norm": 0.059755639557228325, + "language_loss": 0.86095846, + "learning_rate": 0.0008845088869131641, + "loss": 0.87183452, + "num_input_tokens_seen": 105204848, + "router_z_loss_mlp": 0.34790039, + "step": 1269, + "time_per_iteration": 2.651010274887085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094661, + "balance_loss_mlp": 1.058851, + "epoch": 0.2443247402847249, + "flos": 529600481280.0, + "grad_norm": 0.07776240560166553, + "language_loss": 0.89366186, + "learning_rate": 0.0008843096659067818, + "loss": 0.90460849, + "num_input_tokens_seen": 105273456, + "router_z_loss_mlp": 0.35839844, + "step": 1270, + "time_per_iteration": 2.61082124710083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108773, + "balance_loss_mlp": 1.05292046, + "epoch": 0.24451712196998845, + "flos": 695996651520.0, + "grad_norm": 0.05083497592617014, + "language_loss": 0.86395383, + "learning_rate": 0.000884110295698155, + "loss": 0.87483108, + "num_input_tokens_seen": 105355488, + "router_z_loss_mlp": 0.34863281, + "step": 1271, + "time_per_iteration": 2.930372476577759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085133, + "balance_loss_mlp": 1.05089653, + "epoch": 0.24470950365525201, + "flos": 529575750144.0, + "grad_norm": 0.05520811698213447, + "language_loss": 0.86009014, + "learning_rate": 0.0008839107763646861, + "loss": 0.87094152, + "num_input_tokens_seen": 105421568, + "router_z_loss_mlp": 0.34277344, + "step": 1272, + "time_per_iteration": 2.576322078704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088964, + "balance_loss_mlp": 1.05324888, + "epoch": 0.24490188534051557, + "flos": 491091448320.0, + "grad_norm": 0.0616556586287024, + "language_loss": 0.9024111, + "learning_rate": 0.0008837111079838353, + "loss": 0.91330075, + "num_input_tokens_seen": 105493072, + "router_z_loss_mlp": 0.35742188, + "step": 1273, + "time_per_iteration": 2.6859118938446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109117, + "balance_loss_mlp": 1.05631351, + "epoch": 0.24509426702577913, + "flos": 473916842496.0, + "grad_norm": 0.05704478566457949, + "language_loss": 0.89869869, + "learning_rate": 0.000883511290633121, + "loss": 0.90961039, + "num_input_tokens_seen": 105559840, + "router_z_loss_mlp": 0.34887695, + "step": 1274, + "time_per_iteration": 2.5262861251831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096408, + "balance_loss_mlp": 1.06152773, + "epoch": 0.24528664871104272, + "flos": 550329624576.0, + "grad_norm": 0.04914382449005864, + "language_loss": 0.92288065, + "learning_rate": 0.000883311324390119, + "loss": 0.93384475, + "num_input_tokens_seen": 105634448, + "router_z_loss_mlp": 0.34887695, + "step": 1275, + "time_per_iteration": 2.6791441440582275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100279, + "balance_loss_mlp": 1.0631578, + "epoch": 0.24547903039630628, + "flos": 825546871296.0, + "grad_norm": 0.0705624444694786, + "language_loss": 0.81542301, + "learning_rate": 0.0008831112093324629, + "loss": 0.82642579, + "num_input_tokens_seen": 105711936, + "router_z_loss_mlp": 0.37060547, + "step": 1276, + "time_per_iteration": 3.0612823963165283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100002, + "balance_loss_mlp": 1.06419206, + "epoch": 0.24567141208156984, + "flos": 591325221888.0, + "grad_norm": 0.0822852621967946, + "language_loss": 0.89184481, + "learning_rate": 0.0008829109455378444, + "loss": 0.90284485, + "num_input_tokens_seen": 105780240, + "router_z_loss_mlp": 0.35839844, + "step": 1277, + "time_per_iteration": 2.6601858139038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091196, + "balance_loss_mlp": 1.05567181, + "epoch": 0.2458637937668334, + "flos": 547611715584.0, + "grad_norm": 0.05101212903881184, + "language_loss": 0.86474031, + "learning_rate": 0.000882710533084013, + "loss": 0.87565225, + "num_input_tokens_seen": 105849840, + "router_z_loss_mlp": 0.35546875, + "step": 1278, + "time_per_iteration": 2.6333553791046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087707, + "balance_loss_mlp": 1.05158627, + "epoch": 0.24605617545209696, + "flos": 515641379328.0, + "grad_norm": 0.04855931692812416, + "language_loss": 0.89387107, + "learning_rate": 0.0008825099720487755, + "loss": 0.9047482, + "num_input_tokens_seen": 105921488, + "router_z_loss_mlp": 0.36108398, + "step": 1279, + "time_per_iteration": 2.6388816833496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071735, + "balance_loss_mlp": 1.05943298, + "epoch": 0.24624855713736052, + "flos": 1510953951744.0, + "grad_norm": 0.03612446278815301, + "language_loss": 0.7526114, + "learning_rate": 0.0008823092625099967, + "loss": 0.76332873, + "num_input_tokens_seen": 106146816, + "router_z_loss_mlp": 0.12304688, + "step": 1280, + "time_per_iteration": 4.837193727493286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044143, + "balance_loss_mlp": 1.03145874, + "epoch": 0.24644093882262408, + "flos": 1526826419712.0, + "grad_norm": 0.020354868078157083, + "language_loss": 0.77944112, + "learning_rate": 0.0008821084045455987, + "loss": 0.78988254, + "num_input_tokens_seen": 106361568, + "router_z_loss_mlp": 0.12695312, + "step": 1281, + "time_per_iteration": 4.7473485469818115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078971, + "balance_loss_mlp": 1.04418564, + "epoch": 0.24663332050788764, + "flos": 658811667456.0, + "grad_norm": 0.060866999123497585, + "language_loss": 0.89327228, + "learning_rate": 0.0008819073982335619, + "loss": 0.90406203, + "num_input_tokens_seen": 106435296, + "router_z_loss_mlp": 0.34838867, + "step": 1282, + "time_per_iteration": 2.839691162109375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107951, + "balance_loss_mlp": 1.0446527, + "epoch": 0.24682570219315123, + "flos": 541510977024.0, + "grad_norm": 0.05752783194209404, + "language_loss": 0.84339237, + "learning_rate": 0.0008817062436519235, + "loss": 0.85418749, + "num_input_tokens_seen": 106507184, + "router_z_loss_mlp": 0.34887695, + "step": 1283, + "time_per_iteration": 2.6106019020080566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080546, + "balance_loss_mlp": 1.04459274, + "epoch": 0.24701808387841478, + "flos": 440455131648.0, + "grad_norm": 0.05999718389674832, + "language_loss": 0.89926815, + "learning_rate": 0.0008815049408787788, + "loss": 0.91007358, + "num_input_tokens_seen": 106571472, + "router_z_loss_mlp": 0.36010742, + "step": 1284, + "time_per_iteration": 2.5186686515808105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079282, + "balance_loss_mlp": 1.04518795, + "epoch": 0.24721046556367834, + "flos": 467826278400.0, + "grad_norm": 0.054777388157378364, + "language_loss": 0.8565737, + "learning_rate": 0.0008813034899922805, + "loss": 0.86736655, + "num_input_tokens_seen": 106638368, + "router_z_loss_mlp": 0.34106445, + "step": 1285, + "time_per_iteration": 2.5217878818511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082636, + "balance_loss_mlp": 1.04730225, + "epoch": 0.2474028472489419, + "flos": 504183398400.0, + "grad_norm": 0.06351521025868076, + "language_loss": 0.90182853, + "learning_rate": 0.0008811018910706387, + "loss": 0.91265488, + "num_input_tokens_seen": 106705312, + "router_z_loss_mlp": 0.35375977, + "step": 1286, + "time_per_iteration": 2.549523115158081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010823, + "balance_loss_mlp": 1.04787278, + "epoch": 0.24759522893420546, + "flos": 479707660800.0, + "grad_norm": 0.06857789842871208, + "language_loss": 0.81978023, + "learning_rate": 0.0008809001441921211, + "loss": 0.83060318, + "num_input_tokens_seen": 106778624, + "router_z_loss_mlp": 0.34448242, + "step": 1287, + "time_per_iteration": 2.7147598266601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083728, + "balance_loss_mlp": 1.04984844, + "epoch": 0.24778761061946902, + "flos": 533446000128.0, + "grad_norm": 0.05733880184845353, + "language_loss": 0.85523212, + "learning_rate": 0.0008806982494350528, + "loss": 0.86606944, + "num_input_tokens_seen": 106847744, + "router_z_loss_mlp": 0.33911133, + "step": 1288, + "time_per_iteration": 2.6304967403411865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086884, + "balance_loss_mlp": 1.05197978, + "epoch": 0.24797999230473258, + "flos": 559513446912.0, + "grad_norm": 0.04849910181782432, + "language_loss": 0.90370154, + "learning_rate": 0.0008804962068778161, + "loss": 0.91457039, + "num_input_tokens_seen": 106927584, + "router_z_loss_mlp": 0.34936523, + "step": 1289, + "time_per_iteration": 2.8194985389709473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087315, + "balance_loss_mlp": 1.05291104, + "epoch": 0.24817237398999614, + "flos": 623912426496.0, + "grad_norm": 0.05410640942937228, + "language_loss": 0.80728722, + "learning_rate": 0.0008802940165988511, + "loss": 0.81816041, + "num_input_tokens_seen": 107006656, + "router_z_loss_mlp": 0.34423828, + "step": 1290, + "time_per_iteration": 2.8703298568725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096846, + "balance_loss_mlp": 1.06225193, + "epoch": 0.2483647556752597, + "flos": 611981581824.0, + "grad_norm": 0.06277561181530684, + "language_loss": 0.88376027, + "learning_rate": 0.000880091678676655, + "loss": 0.89472872, + "num_input_tokens_seen": 107084352, + "router_z_loss_mlp": 0.34619141, + "step": 1291, + "time_per_iteration": 2.7943451404571533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088363, + "balance_loss_mlp": 1.05496097, + "epoch": 0.2485571373605233, + "flos": 583270419456.0, + "grad_norm": 0.061640996967182685, + "language_loss": 0.89207399, + "learning_rate": 0.0008798891931897821, + "loss": 0.90295762, + "num_input_tokens_seen": 107158368, + "router_z_loss_mlp": 0.33422852, + "step": 1292, + "time_per_iteration": 2.7013609409332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088304, + "balance_loss_mlp": 1.05463946, + "epoch": 0.24874951904578685, + "flos": 494503391232.0, + "grad_norm": 0.0568342609101268, + "language_loss": 0.84605837, + "learning_rate": 0.0008796865602168447, + "loss": 0.8569414, + "num_input_tokens_seen": 107224256, + "router_z_loss_mlp": 0.33691406, + "step": 1293, + "time_per_iteration": 2.517571210861206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108751, + "balance_loss_mlp": 1.05448937, + "epoch": 0.2489419007310504, + "flos": 455925957120.0, + "grad_norm": 0.05011975537228715, + "language_loss": 0.88745099, + "learning_rate": 0.0008794837798365115, + "loss": 0.8983261, + "num_input_tokens_seen": 107292720, + "router_z_loss_mlp": 0.33032227, + "step": 1294, + "time_per_iteration": 2.6243135929107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093031, + "balance_loss_mlp": 1.05958056, + "epoch": 0.24913428241631397, + "flos": 485198733312.0, + "grad_norm": 0.05031013210073455, + "language_loss": 0.88537574, + "learning_rate": 0.0008792808521275089, + "loss": 0.89630604, + "num_input_tokens_seen": 107368576, + "router_z_loss_mlp": 0.3347168, + "step": 1295, + "time_per_iteration": 2.743821144104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090644, + "balance_loss_mlp": 1.05664551, + "epoch": 0.24932666410157753, + "flos": 518654651904.0, + "grad_norm": 0.0628198177294759, + "language_loss": 0.87554896, + "learning_rate": 0.0008790777771686206, + "loss": 0.8864553, + "num_input_tokens_seen": 107433856, + "router_z_loss_mlp": 0.34033203, + "step": 1296, + "time_per_iteration": 2.55996036529541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091195, + "balance_loss_mlp": 1.05819798, + "epoch": 0.2495190457868411, + "flos": 472365831168.0, + "grad_norm": 0.05367084005526609, + "language_loss": 0.85479438, + "learning_rate": 0.0008788745550386872, + "loss": 0.86570632, + "num_input_tokens_seen": 107500944, + "router_z_loss_mlp": 0.33007812, + "step": 1297, + "time_per_iteration": 2.555238723754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099543, + "balance_loss_mlp": 1.06494844, + "epoch": 0.24971142747210465, + "flos": 745559202816.0, + "grad_norm": 0.05557204977607519, + "language_loss": 0.80045742, + "learning_rate": 0.0008786711858166063, + "loss": 0.81145287, + "num_input_tokens_seen": 107580000, + "router_z_loss_mlp": 0.34643555, + "step": 1298, + "time_per_iteration": 2.940908670425415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090601, + "balance_loss_mlp": 1.05757999, + "epoch": 0.2499038091573682, + "flos": 749222839296.0, + "grad_norm": 0.08262860681241094, + "language_loss": 0.83490336, + "learning_rate": 0.0008784676695813332, + "loss": 0.84580934, + "num_input_tokens_seen": 107660384, + "router_z_loss_mlp": 0.33032227, + "step": 1299, + "time_per_iteration": 2.966646432876587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092772, + "balance_loss_mlp": 1.05870187, + "epoch": 0.2500961908426318, + "flos": 744741513216.0, + "grad_norm": 0.04756275395178792, + "language_loss": 0.84761405, + "learning_rate": 0.0008782640064118796, + "loss": 0.85854173, + "num_input_tokens_seen": 107736320, + "router_z_loss_mlp": 0.34082031, + "step": 1300, + "time_per_iteration": 2.8889827728271484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078115, + "balance_loss_mlp": 1.06447709, + "epoch": 0.2502885725278953, + "flos": 1416652180992.0, + "grad_norm": 0.036683670441934005, + "language_loss": 0.7618475, + "learning_rate": 0.0008780601963873149, + "loss": 0.77262866, + "num_input_tokens_seen": 107972608, + "router_z_loss_mlp": 0.13671875, + "step": 1301, + "time_per_iteration": 4.988169431686401 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094633, + "balance_loss_mlp": 1.06196928, + "epoch": 0.2504809542131589, + "flos": 514961902080.0, + "grad_norm": 0.05923567857946263, + "language_loss": 0.86476314, + "learning_rate": 0.0008778562395867648, + "loss": 0.87570941, + "num_input_tokens_seen": 108043312, + "router_z_loss_mlp": 0.32666016, + "step": 1302, + "time_per_iteration": 2.5900919437408447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087269, + "balance_loss_mlp": 1.05436766, + "epoch": 0.25067333589842244, + "flos": 525562905600.0, + "grad_norm": 0.06049595368492962, + "language_loss": 0.83774143, + "learning_rate": 0.0008776521360894127, + "loss": 0.8486141, + "num_input_tokens_seen": 108114144, + "router_z_loss_mlp": 0.32910156, + "step": 1303, + "time_per_iteration": 2.6029298305511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035824, + "balance_loss_mlp": 1.02275884, + "epoch": 0.25086571758368603, + "flos": 1473085108224.0, + "grad_norm": 0.024331867442101186, + "language_loss": 0.78962064, + "learning_rate": 0.0008774478859744984, + "loss": 0.79997885, + "num_input_tokens_seen": 108338720, + "router_z_loss_mlp": 0.13085938, + "step": 1304, + "time_per_iteration": 4.800757646560669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096353, + "balance_loss_mlp": 1.063833, + "epoch": 0.2510580992689496, + "flos": 528128045568.0, + "grad_norm": 0.053799887970574674, + "language_loss": 0.90341735, + "learning_rate": 0.0008772434893213186, + "loss": 0.91438091, + "num_input_tokens_seen": 108405456, + "router_z_loss_mlp": 0.32519531, + "step": 1305, + "time_per_iteration": 2.5816421508789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104465, + "balance_loss_mlp": 1.07123005, + "epoch": 0.25125048095421315, + "flos": 517192390656.0, + "grad_norm": 0.058690449713219205, + "language_loss": 0.84433925, + "learning_rate": 0.0008770389462092276, + "loss": 0.85538393, + "num_input_tokens_seen": 108474368, + "router_z_loss_mlp": 0.33251953, + "step": 1306, + "time_per_iteration": 2.6747090816497803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106931, + "balance_loss_mlp": 1.07309926, + "epoch": 0.25144286263947674, + "flos": 620160039936.0, + "grad_norm": 0.16660488736040688, + "language_loss": 0.86719346, + "learning_rate": 0.0008768342567176357, + "loss": 0.87826276, + "num_input_tokens_seen": 108548864, + "router_z_loss_mlp": 0.33862305, + "step": 1307, + "time_per_iteration": 2.7788002490997314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098824, + "balance_loss_mlp": 1.06425333, + "epoch": 0.25163524432474027, + "flos": 503534444544.0, + "grad_norm": 0.04824933548887647, + "language_loss": 0.90589297, + "learning_rate": 0.0008766294209260107, + "loss": 0.91688126, + "num_input_tokens_seen": 108623072, + "router_z_loss_mlp": 0.34619141, + "step": 1308, + "time_per_iteration": 2.6300241947174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010929, + "balance_loss_mlp": 1.05852032, + "epoch": 0.25182762601000386, + "flos": 508821875712.0, + "grad_norm": 0.0633327884934456, + "language_loss": 0.91549027, + "learning_rate": 0.0008764244389138767, + "loss": 0.92641926, + "num_input_tokens_seen": 108690128, + "router_z_loss_mlp": 0.34399414, + "step": 1309, + "time_per_iteration": 2.574056625366211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088516, + "balance_loss_mlp": 1.05306351, + "epoch": 0.2520200076952674, + "flos": 633596815872.0, + "grad_norm": 0.05898934519456769, + "language_loss": 0.8269434, + "learning_rate": 0.000876219310760815, + "loss": 0.83782852, + "num_input_tokens_seen": 108770272, + "router_z_loss_mlp": 0.35449219, + "step": 1310, + "time_per_iteration": 2.87404465675354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010989, + "balance_loss_mlp": 1.06299448, + "epoch": 0.252212389380531, + "flos": 494385527808.0, + "grad_norm": 0.05968729718727878, + "language_loss": 0.8144334, + "learning_rate": 0.0008760140365464631, + "loss": 0.82542241, + "num_input_tokens_seen": 108840592, + "router_z_loss_mlp": 0.35913086, + "step": 1311, + "time_per_iteration": 2.599480390548706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105508, + "balance_loss_mlp": 1.07062793, + "epoch": 0.2524047710657945, + "flos": 490298489856.0, + "grad_norm": 0.06557576312810307, + "language_loss": 0.87226975, + "learning_rate": 0.0008758086163505156, + "loss": 0.88332486, + "num_input_tokens_seen": 108910064, + "router_z_loss_mlp": 0.34912109, + "step": 1312, + "time_per_iteration": 2.6165666580200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112941, + "balance_loss_mlp": 1.07698762, + "epoch": 0.2525971527510581, + "flos": 647136898560.0, + "grad_norm": 0.06425852435188892, + "language_loss": 0.89039612, + "learning_rate": 0.0008756030502527239, + "loss": 0.90152562, + "num_input_tokens_seen": 108986336, + "router_z_loss_mlp": 0.36010742, + "step": 1313, + "time_per_iteration": 2.794595956802368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112418, + "balance_loss_mlp": 1.07636952, + "epoch": 0.2527895344363217, + "flos": 568991222784.0, + "grad_norm": 0.05792474282671988, + "language_loss": 0.90396988, + "learning_rate": 0.0008753973383328954, + "loss": 0.91509414, + "num_input_tokens_seen": 109059712, + "router_z_loss_mlp": 0.36108398, + "step": 1314, + "time_per_iteration": 2.66343355178833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110344, + "balance_loss_mlp": 1.07491553, + "epoch": 0.2529819161215852, + "flos": 513795004416.0, + "grad_norm": 0.10488361484557306, + "language_loss": 0.84231019, + "learning_rate": 0.0008751914806708952, + "loss": 0.8534137, + "num_input_tokens_seen": 109127504, + "router_z_loss_mlp": 0.35449219, + "step": 1315, + "time_per_iteration": 2.5714006423950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099121, + "balance_loss_mlp": 1.06357241, + "epoch": 0.2531742978068488, + "flos": 530979784704.0, + "grad_norm": 0.0646255116041034, + "language_loss": 0.81763697, + "learning_rate": 0.0008749854773466439, + "loss": 0.82862812, + "num_input_tokens_seen": 109198080, + "router_z_loss_mlp": 0.35571289, + "step": 1316, + "time_per_iteration": 2.6507568359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093995, + "balance_loss_mlp": 1.05892396, + "epoch": 0.25336667949211233, + "flos": 596362369536.0, + "grad_norm": 0.11519177634747009, + "language_loss": 0.84297431, + "learning_rate": 0.0008747793284401192, + "loss": 0.8539142, + "num_input_tokens_seen": 109268368, + "router_z_loss_mlp": 0.35107422, + "step": 1317, + "time_per_iteration": 2.6708261966705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109538, + "balance_loss_mlp": 1.05966473, + "epoch": 0.2535590611773759, + "flos": 601764691968.0, + "grad_norm": 0.05376009268762157, + "language_loss": 0.86145389, + "learning_rate": 0.0008745730340313551, + "loss": 0.87240773, + "num_input_tokens_seen": 109344112, + "router_z_loss_mlp": 0.35742188, + "step": 1318, + "time_per_iteration": 2.7465810775756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100043, + "balance_loss_mlp": 1.06504369, + "epoch": 0.25375144286263945, + "flos": 495079561728.0, + "grad_norm": 0.053440140598651036, + "language_loss": 0.8468703, + "learning_rate": 0.0008743665942004422, + "loss": 0.85787076, + "num_input_tokens_seen": 109414112, + "router_z_loss_mlp": 0.35009766, + "step": 1319, + "time_per_iteration": 2.632645606994629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109509, + "balance_loss_mlp": 1.05908918, + "epoch": 0.25394382454790304, + "flos": 512219261952.0, + "grad_norm": 0.050076364746318706, + "language_loss": 0.92529714, + "learning_rate": 0.0008741600090275277, + "loss": 0.93624806, + "num_input_tokens_seen": 109484336, + "router_z_loss_mlp": 0.35986328, + "step": 1320, + "time_per_iteration": 2.564730405807495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097426, + "balance_loss_mlp": 1.06092453, + "epoch": 0.25413620623316663, + "flos": 958586047488.0, + "grad_norm": 0.058049172943507095, + "language_loss": 0.83939385, + "learning_rate": 0.0008739532785928151, + "loss": 0.85036814, + "num_input_tokens_seen": 109590128, + "router_z_loss_mlp": 0.36474609, + "step": 1321, + "time_per_iteration": 3.4496617317199707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056798, + "balance_loss_mlp": 1.04439986, + "epoch": 0.25432858791843016, + "flos": 1576445635584.0, + "grad_norm": 0.03297734471592195, + "language_loss": 0.74893582, + "learning_rate": 0.0008737464029765639, + "loss": 0.75950378, + "num_input_tokens_seen": 109816592, + "router_z_loss_mlp": 0.12353516, + "step": 1322, + "time_per_iteration": 4.803644418716431 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102711, + "balance_loss_mlp": 1.06706691, + "epoch": 0.25452096960369375, + "flos": 583530877440.0, + "grad_norm": 0.056711392027496164, + "language_loss": 0.83213425, + "learning_rate": 0.0008735393822590908, + "loss": 0.84316134, + "num_input_tokens_seen": 109890464, + "router_z_loss_mlp": 0.35668945, + "step": 1323, + "time_per_iteration": 2.6643528938293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099364, + "balance_loss_mlp": 1.06434083, + "epoch": 0.2547133512889573, + "flos": 508344629760.0, + "grad_norm": 0.06006943476027706, + "language_loss": 0.87018919, + "learning_rate": 0.0008733322165207681, + "loss": 0.88118285, + "num_input_tokens_seen": 109963408, + "router_z_loss_mlp": 0.35083008, + "step": 1324, + "time_per_iteration": 2.627495765686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110409, + "balance_loss_mlp": 1.06863689, + "epoch": 0.25490573297422087, + "flos": 782266940928.0, + "grad_norm": 0.05604709920606865, + "language_loss": 0.83055937, + "learning_rate": 0.0008731249058420247, + "loss": 0.8416003, + "num_input_tokens_seen": 110048800, + "router_z_loss_mlp": 0.35498047, + "step": 1325, + "time_per_iteration": 3.0361831188201904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100095, + "balance_loss_mlp": 1.06468964, + "epoch": 0.2550981146594844, + "flos": 509610451968.0, + "grad_norm": 0.06314633870869373, + "language_loss": 0.90780556, + "learning_rate": 0.0008729174503033459, + "loss": 0.91880649, + "num_input_tokens_seen": 110118096, + "router_z_loss_mlp": 0.35424805, + "step": 1326, + "time_per_iteration": 2.639625072479248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109522, + "balance_loss_mlp": 1.06007695, + "epoch": 0.255290496344748, + "flos": 676360212480.0, + "grad_norm": 0.06489195741671011, + "language_loss": 0.82650065, + "learning_rate": 0.0008727098499852728, + "loss": 0.83745289, + "num_input_tokens_seen": 110190160, + "router_z_loss_mlp": 0.35180664, + "step": 1327, + "time_per_iteration": 2.830500602722168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109753, + "balance_loss_mlp": 1.06231546, + "epoch": 0.2554828780300115, + "flos": 537524273664.0, + "grad_norm": 0.06666455638552511, + "language_loss": 0.89945138, + "learning_rate": 0.0008725021049684034, + "loss": 0.91042662, + "num_input_tokens_seen": 110268000, + "router_z_loss_mlp": 0.35253906, + "step": 1328, + "time_per_iteration": 2.747800350189209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097741, + "balance_loss_mlp": 1.06240726, + "epoch": 0.2556752597152751, + "flos": 823828534272.0, + "grad_norm": 0.052131047599379726, + "language_loss": 0.82919741, + "learning_rate": 0.000872294215333391, + "loss": 0.84017479, + "num_input_tokens_seen": 110354816, + "router_z_loss_mlp": 0.35400391, + "step": 1329, + "time_per_iteration": 3.1658926010131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096694, + "balance_loss_mlp": 1.06176591, + "epoch": 0.2558676414005387, + "flos": 570517502976.0, + "grad_norm": 0.05425014800623288, + "language_loss": 0.82993001, + "learning_rate": 0.0008720861811609457, + "loss": 0.84089696, + "num_input_tokens_seen": 110427968, + "router_z_loss_mlp": 0.34985352, + "step": 1330, + "time_per_iteration": 2.709085702896118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101009, + "balance_loss_mlp": 1.06448317, + "epoch": 0.2560600230858022, + "flos": 486419475456.0, + "grad_norm": 0.05425594622111712, + "language_loss": 0.83756936, + "learning_rate": 0.0008718780025318338, + "loss": 0.84857947, + "num_input_tokens_seen": 110501184, + "router_z_loss_mlp": 0.36523438, + "step": 1331, + "time_per_iteration": 2.7126388549804688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097427, + "balance_loss_mlp": 1.06280875, + "epoch": 0.2562524047710658, + "flos": 512874008064.0, + "grad_norm": 0.06594145834934585, + "language_loss": 0.8406449, + "learning_rate": 0.0008716696795268771, + "loss": 0.85161918, + "num_input_tokens_seen": 110573008, + "router_z_loss_mlp": 0.34667969, + "step": 1332, + "time_per_iteration": 2.6650350093841553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089329, + "balance_loss_mlp": 1.05308938, + "epoch": 0.25644478645632934, + "flos": 634498873344.0, + "grad_norm": 0.051413439896644035, + "language_loss": 0.85076845, + "learning_rate": 0.0008714612122269538, + "loss": 0.86166173, + "num_input_tokens_seen": 110646704, + "router_z_loss_mlp": 0.36279297, + "step": 1333, + "time_per_iteration": 2.8611392974853516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109443, + "balance_loss_mlp": 1.05754697, + "epoch": 0.25663716814159293, + "flos": 436353537024.0, + "grad_norm": 0.0705935369031189, + "language_loss": 0.89120972, + "learning_rate": 0.0008712526007129982, + "loss": 0.90215403, + "num_input_tokens_seen": 110712208, + "router_z_loss_mlp": 0.36889648, + "step": 1334, + "time_per_iteration": 2.5217065811157227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109442, + "balance_loss_mlp": 1.05813241, + "epoch": 0.25682954982685646, + "flos": 497892013056.0, + "grad_norm": 0.06578019441075163, + "language_loss": 0.90784955, + "learning_rate": 0.0008710438450660003, + "loss": 0.91879368, + "num_input_tokens_seen": 110783936, + "router_z_loss_mlp": 0.36303711, + "step": 1335, + "time_per_iteration": 2.651367425918579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093579, + "balance_loss_mlp": 1.05643392, + "epoch": 0.25702193151212005, + "flos": 457471176192.0, + "grad_norm": 0.07087944464696884, + "language_loss": 0.8744905, + "learning_rate": 0.0008708349453670064, + "loss": 0.88542628, + "num_input_tokens_seen": 110848560, + "router_z_loss_mlp": 0.37133789, + "step": 1336, + "time_per_iteration": 2.51411771774292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090754, + "balance_loss_mlp": 1.0543952, + "epoch": 0.2572143131973836, + "flos": 598002130944.0, + "grad_norm": 0.06329524505646734, + "language_loss": 0.91480416, + "learning_rate": 0.0008706259016971185, + "loss": 0.92571175, + "num_input_tokens_seen": 110922672, + "router_z_loss_mlp": 0.36401367, + "step": 1337, + "time_per_iteration": 2.754173517227173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096487, + "balance_loss_mlp": 1.0589596, + "epoch": 0.25740669488264717, + "flos": 698004559872.0, + "grad_norm": 0.06697174190166053, + "language_loss": 0.83331275, + "learning_rate": 0.0008704167141374944, + "loss": 0.84427762, + "num_input_tokens_seen": 110995456, + "router_z_loss_mlp": 0.375, + "step": 1338, + "time_per_iteration": 2.795552968978882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011015, + "balance_loss_mlp": 1.06385398, + "epoch": 0.25759907656791076, + "flos": 502130409984.0, + "grad_norm": 0.06639008708045263, + "language_loss": 0.88657552, + "learning_rate": 0.0008702073827693482, + "loss": 0.89759052, + "num_input_tokens_seen": 111069568, + "router_z_loss_mlp": 0.3762207, + "step": 1339, + "time_per_iteration": 2.6935572624206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103743, + "balance_loss_mlp": 1.065763, + "epoch": 0.2577914582531743, + "flos": 773541425664.0, + "grad_norm": 0.06917089880544881, + "language_loss": 0.88938046, + "learning_rate": 0.0008699979076739494, + "loss": 0.90041792, + "num_input_tokens_seen": 111142608, + "router_z_loss_mlp": 0.37963867, + "step": 1340, + "time_per_iteration": 2.951148509979248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102218, + "balance_loss_mlp": 1.06552505, + "epoch": 0.2579838399384379, + "flos": 459431032320.0, + "grad_norm": 0.07085954822691051, + "language_loss": 0.88831556, + "learning_rate": 0.0008697882889326234, + "loss": 0.89933777, + "num_input_tokens_seen": 111206336, + "router_z_loss_mlp": 0.36669922, + "step": 1341, + "time_per_iteration": 2.492182731628418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105931, + "balance_loss_mlp": 1.06916654, + "epoch": 0.2581762216237014, + "flos": 568917029376.0, + "grad_norm": 0.060702491086151805, + "language_loss": 0.86630756, + "learning_rate": 0.0008695785266267515, + "loss": 0.8773669, + "num_input_tokens_seen": 111276736, + "router_z_loss_mlp": 0.36816406, + "step": 1342, + "time_per_iteration": 2.6635031700134277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111038, + "balance_loss_mlp": 1.07448828, + "epoch": 0.258368603308965, + "flos": 603906430464.0, + "grad_norm": 0.06467765584173796, + "language_loss": 0.83112109, + "learning_rate": 0.0008693686208377704, + "loss": 0.84223145, + "num_input_tokens_seen": 111353856, + "router_z_loss_mlp": 0.36547852, + "step": 1343, + "time_per_iteration": 2.7769596576690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105659, + "balance_loss_mlp": 1.06975329, + "epoch": 0.2585609849942285, + "flos": 491204929536.0, + "grad_norm": 0.06376456739082713, + "language_loss": 0.88889539, + "learning_rate": 0.0008691585716471733, + "loss": 0.89995199, + "num_input_tokens_seen": 111424960, + "router_z_loss_mlp": 0.35913086, + "step": 1344, + "time_per_iteration": 2.6467716693878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104044, + "balance_loss_mlp": 1.06809044, + "epoch": 0.2587533666794921, + "flos": 640455607296.0, + "grad_norm": 0.057733681270749564, + "language_loss": 0.85255873, + "learning_rate": 0.0008689483791365079, + "loss": 0.86359918, + "num_input_tokens_seen": 111505248, + "router_z_loss_mlp": 0.35961914, + "step": 1345, + "time_per_iteration": 2.8041999340057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099237, + "balance_loss_mlp": 1.06380773, + "epoch": 0.2589457483647557, + "flos": 576564397056.0, + "grad_norm": 0.05015471530609978, + "language_loss": 0.89365089, + "learning_rate": 0.0008687380433873786, + "loss": 0.9046433, + "num_input_tokens_seen": 111581936, + "router_z_loss_mlp": 0.35473633, + "step": 1346, + "time_per_iteration": 2.7955591678619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101447, + "balance_loss_mlp": 1.06630445, + "epoch": 0.25913813005001923, + "flos": 535164337152.0, + "grad_norm": 0.06074647569776127, + "language_loss": 0.82164252, + "learning_rate": 0.0008685275644814448, + "loss": 0.83265698, + "num_input_tokens_seen": 111651456, + "router_z_loss_mlp": 0.3515625, + "step": 1347, + "time_per_iteration": 2.6922154426574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100414, + "balance_loss_mlp": 1.06419861, + "epoch": 0.2593305117352828, + "flos": 720713908224.0, + "grad_norm": 0.05981927153656866, + "language_loss": 0.8445859, + "learning_rate": 0.0008683169425004216, + "loss": 0.85558999, + "num_input_tokens_seen": 111731712, + "router_z_loss_mlp": 0.36230469, + "step": 1348, + "time_per_iteration": 2.8701395988464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108751, + "balance_loss_mlp": 1.05186677, + "epoch": 0.25952289342054635, + "flos": 709782635520.0, + "grad_norm": 0.06994851779161643, + "language_loss": 0.83445579, + "learning_rate": 0.0008681061775260799, + "loss": 0.84533083, + "num_input_tokens_seen": 111800752, + "router_z_loss_mlp": 0.35644531, + "step": 1349, + "time_per_iteration": 2.8206968307495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096473, + "balance_loss_mlp": 1.06032848, + "epoch": 0.25971527510580994, + "flos": 455688820224.0, + "grad_norm": 0.06118298275127208, + "language_loss": 0.91987318, + "learning_rate": 0.0008678952696402458, + "loss": 0.93083793, + "num_input_tokens_seen": 111866752, + "router_z_loss_mlp": 0.36132812, + "step": 1350, + "time_per_iteration": 2.5547540187835693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108511, + "balance_loss_mlp": 1.04932308, + "epoch": 0.25990765679107347, + "flos": 612223100928.0, + "grad_norm": 0.04808004024566397, + "language_loss": 0.86496055, + "learning_rate": 0.000867684218924801, + "loss": 0.8758117, + "num_input_tokens_seen": 111951328, + "router_z_loss_mlp": 0.35791016, + "step": 1351, + "time_per_iteration": 2.8406949043273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110301, + "balance_loss_mlp": 1.09857082, + "epoch": 0.26010003847633706, + "flos": 1537105766400.0, + "grad_norm": 0.059206679514604114, + "language_loss": 0.78947091, + "learning_rate": 0.0008674730254616827, + "loss": 0.80057395, + "num_input_tokens_seen": 112182272, + "router_z_loss_mlp": 0.1171875, + "step": 1352, + "time_per_iteration": 4.8775153160095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082749, + "balance_loss_mlp": 1.04753494, + "epoch": 0.2602924201616006, + "flos": 715947393024.0, + "grad_norm": 0.046134849134736367, + "language_loss": 0.85103661, + "learning_rate": 0.0008672616893328834, + "loss": 0.86186409, + "num_input_tokens_seen": 112261760, + "router_z_loss_mlp": 0.35253906, + "step": 1353, + "time_per_iteration": 2.98063588142395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108322, + "balance_loss_mlp": 1.04764819, + "epoch": 0.2604848018468642, + "flos": 643241917440.0, + "grad_norm": 0.060512322591449175, + "language_loss": 0.9000203, + "learning_rate": 0.0008670502106204512, + "loss": 0.91085243, + "num_input_tokens_seen": 112339136, + "router_z_loss_mlp": 0.35595703, + "step": 1354, + "time_per_iteration": 2.832679271697998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087805, + "balance_loss_mlp": 1.05073047, + "epoch": 0.26067718353212777, + "flos": 516783545856.0, + "grad_norm": 0.05860289542603218, + "language_loss": 0.8165139, + "learning_rate": 0.0008668385894064892, + "loss": 0.82739192, + "num_input_tokens_seen": 112409872, + "router_z_loss_mlp": 0.37084961, + "step": 1355, + "time_per_iteration": 2.6204822063446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086317, + "balance_loss_mlp": 1.05143666, + "epoch": 0.2608695652173913, + "flos": 822361890816.0, + "grad_norm": 0.0623840657908754, + "language_loss": 0.88803548, + "learning_rate": 0.0008666268257731562, + "loss": 0.8988986, + "num_input_tokens_seen": 112495616, + "router_z_loss_mlp": 0.34912109, + "step": 1356, + "time_per_iteration": 3.113147735595703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109295, + "balance_loss_mlp": 1.0566628, + "epoch": 0.2610619469026549, + "flos": 1007451744768.0, + "grad_norm": 0.056693012024963345, + "language_loss": 0.85794425, + "learning_rate": 0.0008664149198026662, + "loss": 0.86887372, + "num_input_tokens_seen": 112575168, + "router_z_loss_mlp": 0.36279297, + "step": 1357, + "time_per_iteration": 3.2569541931152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095714, + "balance_loss_mlp": 1.06040418, + "epoch": 0.2612543285879184, + "flos": 536523291648.0, + "grad_norm": 0.061594313952015485, + "language_loss": 0.88599586, + "learning_rate": 0.0008662028715772883, + "loss": 0.89695299, + "num_input_tokens_seen": 112648480, + "router_z_loss_mlp": 0.35351562, + "step": 1358, + "time_per_iteration": 2.6102256774902344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100258, + "balance_loss_mlp": 1.06475711, + "epoch": 0.261446710273182, + "flos": 519166803456.0, + "grad_norm": 0.04975036534081278, + "language_loss": 0.85662109, + "learning_rate": 0.0008659906811793467, + "loss": 0.86762363, + "num_input_tokens_seen": 112719856, + "router_z_loss_mlp": 0.35546875, + "step": 1359, + "time_per_iteration": 2.6921935081481934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101082, + "balance_loss_mlp": 1.06543839, + "epoch": 0.26163909195844554, + "flos": 582975055872.0, + "grad_norm": 0.06646109128582675, + "language_loss": 0.89397144, + "learning_rate": 0.0008657783486912215, + "loss": 0.90498233, + "num_input_tokens_seen": 112795088, + "router_z_loss_mlp": 0.35693359, + "step": 1360, + "time_per_iteration": 2.7003283500671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110494, + "balance_loss_mlp": 1.06920147, + "epoch": 0.2618314736437091, + "flos": 958362057216.0, + "grad_norm": 0.06344844215605515, + "language_loss": 0.89840877, + "learning_rate": 0.0008655658741953472, + "loss": 0.90945816, + "num_input_tokens_seen": 112879888, + "router_z_loss_mlp": 0.35742188, + "step": 1361, + "time_per_iteration": 3.207960844039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101357, + "balance_loss_mlp": 1.0664053, + "epoch": 0.26202385532897265, + "flos": 574530347520.0, + "grad_norm": 0.04606923720206454, + "language_loss": 0.88105857, + "learning_rate": 0.0008653532577742136, + "loss": 0.89207214, + "num_input_tokens_seen": 112952208, + "router_z_loss_mlp": 0.34960938, + "step": 1362, + "time_per_iteration": 2.69209885597229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091565, + "balance_loss_mlp": 1.05744767, + "epoch": 0.26221623701423624, + "flos": 445240585728.0, + "grad_norm": 0.05480512848555835, + "language_loss": 0.87200153, + "learning_rate": 0.0008651404995103659, + "loss": 0.88291717, + "num_input_tokens_seen": 113017472, + "router_z_loss_mlp": 0.34155273, + "step": 1363, + "time_per_iteration": 2.5325255393981934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095927, + "balance_loss_mlp": 1.06164205, + "epoch": 0.26240861869949983, + "flos": 535459700736.0, + "grad_norm": 0.04992660146640532, + "language_loss": 0.870713, + "learning_rate": 0.0008649275994864041, + "loss": 0.8816722, + "num_input_tokens_seen": 113090000, + "router_z_loss_mlp": 0.34301758, + "step": 1364, + "time_per_iteration": 2.682365894317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098846, + "balance_loss_mlp": 1.06267846, + "epoch": 0.26260100038476336, + "flos": 564940500480.0, + "grad_norm": 0.05369640644722127, + "language_loss": 0.83917898, + "learning_rate": 0.0008647145577849834, + "loss": 0.85016745, + "num_input_tokens_seen": 113169424, + "router_z_loss_mlp": 0.36157227, + "step": 1365, + "time_per_iteration": 2.8129918575286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102304, + "balance_loss_mlp": 1.06701851, + "epoch": 0.26279338207002695, + "flos": 612745426944.0, + "grad_norm": 0.045782565775991005, + "language_loss": 0.82809973, + "learning_rate": 0.0008645013744888139, + "loss": 0.83912277, + "num_input_tokens_seen": 113256752, + "router_z_loss_mlp": 0.35327148, + "step": 1366, + "time_per_iteration": 2.8523411750793457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101615, + "balance_loss_mlp": 1.06730664, + "epoch": 0.2629857637552905, + "flos": 522555425280.0, + "grad_norm": 0.0597350257589219, + "language_loss": 0.87579656, + "learning_rate": 0.0008642880496806607, + "loss": 0.88681269, + "num_input_tokens_seen": 113330512, + "router_z_loss_mlp": 0.34350586, + "step": 1367, + "time_per_iteration": 2.766350507736206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105811, + "balance_loss_mlp": 1.0706687, + "epoch": 0.26317814544055407, + "flos": 534273864192.0, + "grad_norm": 0.05812227598952832, + "language_loss": 0.84219468, + "learning_rate": 0.0008640745834433437, + "loss": 0.85325277, + "num_input_tokens_seen": 113409088, + "router_z_loss_mlp": 0.35205078, + "step": 1368, + "time_per_iteration": 2.7220964431762695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100459, + "balance_loss_mlp": 1.06553102, + "epoch": 0.2633705271258176, + "flos": 555235762176.0, + "grad_norm": 0.06954601812969684, + "language_loss": 0.86862296, + "learning_rate": 0.000863860975859738, + "loss": 0.87962759, + "num_input_tokens_seen": 113486624, + "router_z_loss_mlp": 0.34960938, + "step": 1369, + "time_per_iteration": 2.8985280990600586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094199, + "balance_loss_mlp": 1.05931866, + "epoch": 0.2635629088110812, + "flos": 552136711680.0, + "grad_norm": 0.06493737783890446, + "language_loss": 0.88711715, + "learning_rate": 0.0008636472270127733, + "loss": 0.89805913, + "num_input_tokens_seen": 113555776, + "router_z_loss_mlp": 0.34936523, + "step": 1370, + "time_per_iteration": 2.634615182876587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090389, + "balance_loss_mlp": 1.05498338, + "epoch": 0.2637552904963448, + "flos": 455752839168.0, + "grad_norm": 0.062476231294863314, + "language_loss": 0.89913595, + "learning_rate": 0.0008634333369854345, + "loss": 0.91003978, + "num_input_tokens_seen": 113624208, + "router_z_loss_mlp": 0.35424805, + "step": 1371, + "time_per_iteration": 2.5908331871032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082759, + "balance_loss_mlp": 1.04818797, + "epoch": 0.2639476721816083, + "flos": 612847323648.0, + "grad_norm": 0.05509554660574217, + "language_loss": 0.87495965, + "learning_rate": 0.0008632193058607608, + "loss": 0.88578725, + "num_input_tokens_seen": 113698544, + "router_z_loss_mlp": 0.34594727, + "step": 1372, + "time_per_iteration": 2.6963188648223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108436, + "balance_loss_mlp": 1.04878759, + "epoch": 0.2641400538668719, + "flos": 571645112832.0, + "grad_norm": 0.05982264925210271, + "language_loss": 0.81028771, + "learning_rate": 0.0008630051337218466, + "loss": 0.82113135, + "num_input_tokens_seen": 113769024, + "router_z_loss_mlp": 0.35595703, + "step": 1373, + "time_per_iteration": 2.644540786743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079985, + "balance_loss_mlp": 1.04582, + "epoch": 0.2643324355521354, + "flos": 581979866112.0, + "grad_norm": 0.08561984623812412, + "language_loss": 0.82428128, + "learning_rate": 0.0008627908206518409, + "loss": 0.8350811, + "num_input_tokens_seen": 113836320, + "router_z_loss_mlp": 0.34179688, + "step": 1374, + "time_per_iteration": 2.660578966140747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110354, + "balance_loss_mlp": 1.08904421, + "epoch": 0.264524817237399, + "flos": 1543845284352.0, + "grad_norm": 0.03725698642258328, + "language_loss": 0.75151253, + "learning_rate": 0.0008625763667339472, + "loss": 0.76254791, + "num_input_tokens_seen": 114065040, + "router_z_loss_mlp": 0.14453125, + "step": 1375, + "time_per_iteration": 4.9595324993133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079461, + "balance_loss_mlp": 1.04474711, + "epoch": 0.26471719892266254, + "flos": 517783117824.0, + "grad_norm": 0.05493851972821551, + "language_loss": 0.91330564, + "learning_rate": 0.0008623617720514241, + "loss": 0.92410028, + "num_input_tokens_seen": 114133488, + "router_z_loss_mlp": 0.34741211, + "step": 1376, + "time_per_iteration": 2.5929205417633057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090244, + "balance_loss_mlp": 1.05498242, + "epoch": 0.26490958060792613, + "flos": 516936314880.0, + "grad_norm": 0.08106601153347975, + "language_loss": 0.84946424, + "learning_rate": 0.0008621470366875848, + "loss": 0.8603667, + "num_input_tokens_seen": 114200704, + "router_z_loss_mlp": 0.3527832, + "step": 1377, + "time_per_iteration": 2.5729684829711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084439, + "balance_loss_mlp": 1.0497725, + "epoch": 0.26510196229318966, + "flos": 596298350592.0, + "grad_norm": 0.05588669268878349, + "language_loss": 0.87771004, + "learning_rate": 0.0008619321607257966, + "loss": 0.88855445, + "num_input_tokens_seen": 114272160, + "router_z_loss_mlp": 0.34692383, + "step": 1378, + "time_per_iteration": 2.6708004474639893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082921, + "balance_loss_mlp": 1.0483501, + "epoch": 0.26529434397845325, + "flos": 685488780288.0, + "grad_norm": 0.051774701706919043, + "language_loss": 0.82311988, + "learning_rate": 0.000861717144249482, + "loss": 0.83394915, + "num_input_tokens_seen": 114347904, + "router_z_loss_mlp": 0.34594727, + "step": 1379, + "time_per_iteration": 2.8249831199645996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081979, + "balance_loss_mlp": 1.0468595, + "epoch": 0.26548672566371684, + "flos": 424127328768.0, + "grad_norm": 0.06288210815556809, + "language_loss": 0.90205348, + "learning_rate": 0.0008615019873421175, + "loss": 0.91287327, + "num_input_tokens_seen": 114409952, + "router_z_loss_mlp": 0.35131836, + "step": 1380, + "time_per_iteration": 2.455320358276367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108606, + "balance_loss_mlp": 1.05108428, + "epoch": 0.26567910734898037, + "flos": 489619012608.0, + "grad_norm": 0.05393715583789803, + "language_loss": 0.8609767, + "learning_rate": 0.0008612866900872349, + "loss": 0.87183726, + "num_input_tokens_seen": 114474832, + "router_z_loss_mlp": 0.35009766, + "step": 1381, + "time_per_iteration": 2.54070782661438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083002, + "balance_loss_mlp": 1.04895568, + "epoch": 0.26587148903424396, + "flos": 533947977216.0, + "grad_norm": 0.05290962754614328, + "language_loss": 0.88052452, + "learning_rate": 0.0008610712525684197, + "loss": 0.8913545, + "num_input_tokens_seen": 114545152, + "router_z_loss_mlp": 0.34082031, + "step": 1382, + "time_per_iteration": 2.6350595951080322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084878, + "balance_loss_mlp": 1.05049801, + "epoch": 0.2660638707195075, + "flos": 1017067732992.0, + "grad_norm": 0.06267977315545337, + "language_loss": 0.84534729, + "learning_rate": 0.0008608556748693121, + "loss": 0.85619605, + "num_input_tokens_seen": 114626512, + "router_z_loss_mlp": 0.34423828, + "step": 1383, + "time_per_iteration": 3.231172561645508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086499, + "balance_loss_mlp": 1.05216646, + "epoch": 0.2662562524047711, + "flos": 523712148480.0, + "grad_norm": 0.0585640776606728, + "language_loss": 0.86247015, + "learning_rate": 0.000860639957073607, + "loss": 0.87333512, + "num_input_tokens_seen": 114701008, + "router_z_loss_mlp": 0.34375, + "step": 1384, + "time_per_iteration": 2.72265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108718, + "balance_loss_mlp": 1.05280018, + "epoch": 0.2664486340900346, + "flos": 552107598336.0, + "grad_norm": 0.07312693577598182, + "language_loss": 0.87888551, + "learning_rate": 0.0008604240992650534, + "loss": 0.88975734, + "num_input_tokens_seen": 114771984, + "router_z_loss_mlp": 0.34423828, + "step": 1385, + "time_per_iteration": 2.6524593830108643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079729, + "balance_loss_mlp": 1.0455637, + "epoch": 0.2666410157752982, + "flos": 469895233536.0, + "grad_norm": 0.058731941016447735, + "language_loss": 0.89070451, + "learning_rate": 0.0008602081015274545, + "loss": 0.90150183, + "num_input_tokens_seen": 114844800, + "router_z_loss_mlp": 0.34179688, + "step": 1386, + "time_per_iteration": 2.7026963233947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083917, + "balance_loss_mlp": 1.04953694, + "epoch": 0.2668333974605617, + "flos": 569645968896.0, + "grad_norm": 0.04572049987167494, + "language_loss": 0.83031899, + "learning_rate": 0.0008599919639446684, + "loss": 0.84115815, + "num_input_tokens_seen": 114918544, + "router_z_loss_mlp": 0.34423828, + "step": 1387, + "time_per_iteration": 2.6891515254974365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083945, + "balance_loss_mlp": 1.04920745, + "epoch": 0.2670257791458253, + "flos": 398755325952.0, + "grad_norm": 0.06113709644372323, + "language_loss": 0.80263156, + "learning_rate": 0.000859775686600607, + "loss": 0.81347102, + "num_input_tokens_seen": 114984272, + "router_z_loss_mlp": 0.34765625, + "step": 1388, + "time_per_iteration": 2.5367043018341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088195, + "balance_loss_mlp": 1.05400586, + "epoch": 0.2672181608310889, + "flos": 515587534848.0, + "grad_norm": 0.07715457421599592, + "language_loss": 0.85045016, + "learning_rate": 0.0008595592695792367, + "loss": 0.86133218, + "num_input_tokens_seen": 115054800, + "router_z_loss_mlp": 0.34228516, + "step": 1389, + "time_per_iteration": 2.653684139251709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097788, + "balance_loss_mlp": 1.06348014, + "epoch": 0.26741054251635243, + "flos": 507270864384.0, + "grad_norm": 0.05276083290405683, + "language_loss": 0.9085412, + "learning_rate": 0.0008593427129645778, + "loss": 0.91951907, + "num_input_tokens_seen": 115120928, + "router_z_loss_mlp": 0.34350586, + "step": 1390, + "time_per_iteration": 2.5497426986694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100169, + "balance_loss_mlp": 1.06512117, + "epoch": 0.267602924201616, + "flos": 576357783552.0, + "grad_norm": 0.0907689109524766, + "language_loss": 0.85371816, + "learning_rate": 0.0008591260168407052, + "loss": 0.86471987, + "num_input_tokens_seen": 115196688, + "router_z_loss_mlp": 0.35083008, + "step": 1391, + "time_per_iteration": 2.752777576446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099505, + "balance_loss_mlp": 1.06598353, + "epoch": 0.26779530588687955, + "flos": 523731087360.0, + "grad_norm": 0.05201595058269412, + "language_loss": 0.83216429, + "learning_rate": 0.0008589091812917479, + "loss": 0.84315932, + "num_input_tokens_seen": 115264912, + "router_z_loss_mlp": 0.33544922, + "step": 1392, + "time_per_iteration": 2.602858781814575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092875, + "balance_loss_mlp": 1.05897164, + "epoch": 0.26798768757214314, + "flos": 556508938752.0, + "grad_norm": 0.054199587407170555, + "language_loss": 0.85476619, + "learning_rate": 0.0008586922064018887, + "loss": 0.86569488, + "num_input_tokens_seen": 115334672, + "router_z_loss_mlp": 0.33911133, + "step": 1393, + "time_per_iteration": 2.663135528564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110003, + "balance_loss_mlp": 1.0641005, + "epoch": 0.2681800692574067, + "flos": 930246004224.0, + "grad_norm": 0.05606615550920643, + "language_loss": 0.89228028, + "learning_rate": 0.0008584750922553651, + "loss": 0.90328062, + "num_input_tokens_seen": 115420032, + "router_z_loss_mlp": 0.35961914, + "step": 1394, + "time_per_iteration": 3.126030206680298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094414, + "balance_loss_mlp": 1.06020141, + "epoch": 0.26837245094267026, + "flos": 700771931136.0, + "grad_norm": 0.055333821001128054, + "language_loss": 0.83724457, + "learning_rate": 0.0008582578389364677, + "loss": 0.84818876, + "num_input_tokens_seen": 115492576, + "router_z_loss_mlp": 0.3425293, + "step": 1395, + "time_per_iteration": 2.858774423599243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096924, + "balance_loss_mlp": 1.06187642, + "epoch": 0.26856483262793385, + "flos": 592892199936.0, + "grad_norm": 0.04773968262798697, + "language_loss": 0.9195987, + "learning_rate": 0.0008580404465295422, + "loss": 0.93056792, + "num_input_tokens_seen": 115568368, + "router_z_loss_mlp": 0.35058594, + "step": 1396, + "time_per_iteration": 2.7737646102905273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096258, + "balance_loss_mlp": 1.06190252, + "epoch": 0.2687572143131974, + "flos": 713943866880.0, + "grad_norm": 0.07288281155022573, + "language_loss": 0.88208908, + "learning_rate": 0.0008578229151189876, + "loss": 0.89305162, + "num_input_tokens_seen": 115651536, + "router_z_loss_mlp": 0.34375, + "step": 1397, + "time_per_iteration": 2.9974029064178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087674, + "balance_loss_mlp": 1.05441451, + "epoch": 0.26894959599846097, + "flos": 467481452544.0, + "grad_norm": 0.0581153622766974, + "language_loss": 0.81433654, + "learning_rate": 0.0008576052447892573, + "loss": 0.82521319, + "num_input_tokens_seen": 115715696, + "router_z_loss_mlp": 0.33276367, + "step": 1398, + "time_per_iteration": 2.586427688598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090085, + "balance_loss_mlp": 1.05551457, + "epoch": 0.2691419776837245, + "flos": 468470850048.0, + "grad_norm": 0.08083264737918114, + "language_loss": 0.86589479, + "learning_rate": 0.000857387435624858, + "loss": 0.87679559, + "num_input_tokens_seen": 115780928, + "router_z_loss_mlp": 0.34619141, + "step": 1399, + "time_per_iteration": 2.5227789878845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096966, + "balance_loss_mlp": 1.06239533, + "epoch": 0.2693343593689881, + "flos": 937244418048.0, + "grad_norm": 0.0443934808912178, + "language_loss": 0.88525635, + "learning_rate": 0.0008571694877103513, + "loss": 0.89622605, + "num_input_tokens_seen": 115874432, + "router_z_loss_mlp": 0.34594727, + "step": 1400, + "time_per_iteration": 3.252573251724243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095335, + "balance_loss_mlp": 1.06064546, + "epoch": 0.2695267410542516, + "flos": 577303511040.0, + "grad_norm": 0.05297583192015558, + "language_loss": 0.87603962, + "learning_rate": 0.0008569514011303515, + "loss": 0.88699305, + "num_input_tokens_seen": 115956608, + "router_z_loss_mlp": 0.34716797, + "step": 1401, + "time_per_iteration": 2.7824223041534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092716, + "balance_loss_mlp": 1.0577879, + "epoch": 0.2697191227395152, + "flos": 556539462144.0, + "grad_norm": 0.06414718170709632, + "language_loss": 0.87859815, + "learning_rate": 0.0008567331759695277, + "loss": 0.88952529, + "num_input_tokens_seen": 116031728, + "router_z_loss_mlp": 0.34985352, + "step": 1402, + "time_per_iteration": 2.7109498977661133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098142, + "balance_loss_mlp": 1.06183052, + "epoch": 0.26991150442477874, + "flos": 529024310784.0, + "grad_norm": 0.05462837975359106, + "language_loss": 0.86148876, + "learning_rate": 0.0008565148123126023, + "loss": 0.87247014, + "num_input_tokens_seen": 116104288, + "router_z_loss_mlp": 0.36328125, + "step": 1403, + "time_per_iteration": 2.6526310443878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088301, + "balance_loss_mlp": 1.05289555, + "epoch": 0.2701038861100423, + "flos": 531737837568.0, + "grad_norm": 0.12276519374226595, + "language_loss": 0.86177158, + "learning_rate": 0.0008562963102443516, + "loss": 0.87265456, + "num_input_tokens_seen": 116177920, + "router_z_loss_mlp": 0.35400391, + "step": 1404, + "time_per_iteration": 2.6809849739074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092814, + "balance_loss_mlp": 1.05757618, + "epoch": 0.2702962677953059, + "flos": 734908737024.0, + "grad_norm": 0.05743337882617235, + "language_loss": 0.85265231, + "learning_rate": 0.0008560776698496056, + "loss": 0.86358047, + "num_input_tokens_seen": 116251680, + "router_z_loss_mlp": 0.3527832, + "step": 1405, + "time_per_iteration": 2.9008774757385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099947, + "balance_loss_mlp": 1.06420779, + "epoch": 0.27048864948056944, + "flos": 574453181952.0, + "grad_norm": 0.06281004106283315, + "language_loss": 0.85864103, + "learning_rate": 0.0008558588912132481, + "loss": 0.86964047, + "num_input_tokens_seen": 116327664, + "router_z_loss_mlp": 0.35742188, + "step": 1406, + "time_per_iteration": 2.8967840671539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057025, + "balance_loss_mlp": 1.04519963, + "epoch": 0.27068103116583303, + "flos": 1423091953152.0, + "grad_norm": 0.03126478371356873, + "language_loss": 0.76458991, + "learning_rate": 0.0008556399744202163, + "loss": 0.77516007, + "num_input_tokens_seen": 116555152, + "router_z_loss_mlp": 0.11816406, + "step": 1407, + "time_per_iteration": 4.933698892593384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109874, + "balance_loss_mlp": 1.06400251, + "epoch": 0.27087341285109656, + "flos": 531742219776.0, + "grad_norm": 0.050597666424933845, + "language_loss": 0.82942683, + "learning_rate": 0.0008554209195555016, + "loss": 0.84041423, + "num_input_tokens_seen": 116626016, + "router_z_loss_mlp": 0.34765625, + "step": 1408, + "time_per_iteration": 2.6599888801574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093614, + "balance_loss_mlp": 1.0582329, + "epoch": 0.27106579453636015, + "flos": 581108332032.0, + "grad_norm": 0.058412744436649705, + "language_loss": 0.88199335, + "learning_rate": 0.0008552017267041483, + "loss": 0.89292949, + "num_input_tokens_seen": 116699152, + "router_z_loss_mlp": 0.35375977, + "step": 1409, + "time_per_iteration": 2.673828363418579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091257, + "balance_loss_mlp": 1.05563736, + "epoch": 0.2712581762216237, + "flos": 506533160448.0, + "grad_norm": 0.05246666988179206, + "language_loss": 0.83264577, + "learning_rate": 0.0008549823959512549, + "loss": 0.84355831, + "num_input_tokens_seen": 116770912, + "router_z_loss_mlp": 0.35644531, + "step": 1410, + "time_per_iteration": 2.634523868560791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091517, + "balance_loss_mlp": 1.05451441, + "epoch": 0.27145055790688727, + "flos": 997019476992.0, + "grad_norm": 0.050905982394943275, + "language_loss": 0.86668658, + "learning_rate": 0.0008547629273819728, + "loss": 0.87760168, + "num_input_tokens_seen": 116863088, + "router_z_loss_mlp": 0.36987305, + "step": 1411, + "time_per_iteration": 3.3559322357177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094566, + "balance_loss_mlp": 1.05875564, + "epoch": 0.2716429395921508, + "flos": 546420086784.0, + "grad_norm": 0.06363965087638479, + "language_loss": 0.83773881, + "learning_rate": 0.0008545433210815074, + "loss": 0.84868449, + "num_input_tokens_seen": 116929504, + "router_z_loss_mlp": 0.35839844, + "step": 1412, + "time_per_iteration": 2.607379913330078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109226, + "balance_loss_mlp": 1.05606771, + "epoch": 0.2718353212774144, + "flos": 572954605056.0, + "grad_norm": 0.05881941163427475, + "language_loss": 0.87753916, + "learning_rate": 0.0008543235771351176, + "loss": 0.88846171, + "num_input_tokens_seen": 117004064, + "router_z_loss_mlp": 0.36230469, + "step": 1413, + "time_per_iteration": 2.722318649291992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096323, + "balance_loss_mlp": 1.06118035, + "epoch": 0.272027702962678, + "flos": 643986823680.0, + "grad_norm": 0.044269909609048815, + "language_loss": 0.84649068, + "learning_rate": 0.0008541036956281154, + "loss": 0.85745388, + "num_input_tokens_seen": 117081328, + "router_z_loss_mlp": 0.3515625, + "step": 1414, + "time_per_iteration": 2.8785104751586914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100326, + "balance_loss_mlp": 1.0645628, + "epoch": 0.2722200846479415, + "flos": 653410755072.0, + "grad_norm": 0.0658433573318433, + "language_loss": 0.82281864, + "learning_rate": 0.0008538836766458665, + "loss": 0.83382189, + "num_input_tokens_seen": 117156544, + "router_z_loss_mlp": 0.35791016, + "step": 1415, + "time_per_iteration": 2.834148645401001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098566, + "balance_loss_mlp": 1.06311321, + "epoch": 0.2724124663332051, + "flos": 579346324992.0, + "grad_norm": 0.07330345680392343, + "language_loss": 0.85275221, + "learning_rate": 0.0008536635202737897, + "loss": 0.86373788, + "num_input_tokens_seen": 117230208, + "router_z_loss_mlp": 0.35473633, + "step": 1416, + "time_per_iteration": 2.7886626720428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099993, + "balance_loss_mlp": 1.06513667, + "epoch": 0.2726048480184686, + "flos": 537178037760.0, + "grad_norm": 0.06667202152625065, + "language_loss": 0.82212454, + "learning_rate": 0.0008534432265973573, + "loss": 0.8331244, + "num_input_tokens_seen": 117298080, + "router_z_loss_mlp": 0.34912109, + "step": 1417, + "time_per_iteration": 2.604626417160034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095149, + "balance_loss_mlp": 1.05931497, + "epoch": 0.2727972297037322, + "flos": 995360776704.0, + "grad_norm": 0.08172035912068172, + "language_loss": 0.88052338, + "learning_rate": 0.000853222795702095, + "loss": 0.8914749, + "num_input_tokens_seen": 117396256, + "router_z_loss_mlp": 0.35839844, + "step": 1418, + "time_per_iteration": 3.391664505004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095275, + "balance_loss_mlp": 1.06125307, + "epoch": 0.27298961138899575, + "flos": 605924513280.0, + "grad_norm": 0.05480231780963067, + "language_loss": 0.83608377, + "learning_rate": 0.0008530022276735813, + "loss": 0.84703648, + "num_input_tokens_seen": 117467936, + "router_z_loss_mlp": 0.34057617, + "step": 1419, + "time_per_iteration": 2.705235004425049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107985, + "balance_loss_mlp": 1.04666257, + "epoch": 0.27318199307425933, + "flos": 529059216384.0, + "grad_norm": 0.054542785174291425, + "language_loss": 0.85957551, + "learning_rate": 0.0008527815225974489, + "loss": 0.87037402, + "num_input_tokens_seen": 117538256, + "router_z_loss_mlp": 0.33203125, + "step": 1420, + "time_per_iteration": 2.654003620147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087931, + "balance_loss_mlp": 1.05395651, + "epoch": 0.2733743747595229, + "flos": 408809272320.0, + "grad_norm": 0.06460584893454492, + "language_loss": 0.88538897, + "learning_rate": 0.0008525606805593829, + "loss": 0.89626825, + "num_input_tokens_seen": 117599488, + "router_z_loss_mlp": 0.33984375, + "step": 1421, + "time_per_iteration": 2.4287912845611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087097, + "balance_loss_mlp": 1.05233574, + "epoch": 0.27356675644478645, + "flos": 515976030720.0, + "grad_norm": 0.055753761808712644, + "language_loss": 0.82379127, + "learning_rate": 0.0008523397016451213, + "loss": 0.8346622, + "num_input_tokens_seen": 117664240, + "router_z_loss_mlp": 0.34814453, + "step": 1422, + "time_per_iteration": 2.5620808601379395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096714, + "balance_loss_mlp": 1.0617379, + "epoch": 0.27375913813005004, + "flos": 1051914539520.0, + "grad_norm": 0.0481984630724129, + "language_loss": 0.87272507, + "learning_rate": 0.0008521185859404564, + "loss": 0.88369215, + "num_input_tokens_seen": 117754768, + "router_z_loss_mlp": 0.34985352, + "step": 1423, + "time_per_iteration": 3.361171245574951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085067, + "balance_loss_mlp": 1.05037737, + "epoch": 0.27395151981531357, + "flos": 624507535872.0, + "grad_norm": 0.05502068897485729, + "language_loss": 0.89717311, + "learning_rate": 0.0008518973335312326, + "loss": 0.90802383, + "num_input_tokens_seen": 117832816, + "router_z_loss_mlp": 0.34716797, + "step": 1424, + "time_per_iteration": 2.7964961528778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088932, + "balance_loss_mlp": 1.05390823, + "epoch": 0.27414390150057716, + "flos": 550112836608.0, + "grad_norm": 0.056708357312241935, + "language_loss": 0.83878243, + "learning_rate": 0.0008516759445033477, + "loss": 0.84967172, + "num_input_tokens_seen": 117899168, + "router_z_loss_mlp": 0.35058594, + "step": 1425, + "time_per_iteration": 2.6100170612335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090578, + "balance_loss_mlp": 1.05603087, + "epoch": 0.2743362831858407, + "flos": 539596200960.0, + "grad_norm": 0.061048707375716146, + "language_loss": 0.84983361, + "learning_rate": 0.0008514544189427526, + "loss": 0.86073935, + "num_input_tokens_seen": 117972384, + "router_z_loss_mlp": 0.34594727, + "step": 1426, + "time_per_iteration": 2.6465015411376953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088002, + "balance_loss_mlp": 1.05347919, + "epoch": 0.2745286648711043, + "flos": 468352986624.0, + "grad_norm": 0.061383055639382046, + "language_loss": 0.8704657, + "learning_rate": 0.0008512327569354511, + "loss": 0.88134569, + "num_input_tokens_seen": 118039584, + "router_z_loss_mlp": 0.34570312, + "step": 1427, + "time_per_iteration": 2.5696229934692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087297, + "balance_loss_mlp": 1.05212998, + "epoch": 0.2747210465563678, + "flos": 472617524736.0, + "grad_norm": 0.05941983459852813, + "language_loss": 0.8349936, + "learning_rate": 0.0008510109585675001, + "loss": 0.84586656, + "num_input_tokens_seen": 118108352, + "router_z_loss_mlp": 0.35180664, + "step": 1428, + "time_per_iteration": 2.6195123195648193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086947, + "balance_loss_mlp": 1.07245111, + "epoch": 0.2749134282416314, + "flos": 1314345070080.0, + "grad_norm": 0.037284634304165044, + "language_loss": 0.81153345, + "learning_rate": 0.0008507890239250093, + "loss": 0.82240289, + "num_input_tokens_seen": 118331120, + "router_z_loss_mlp": 0.14453125, + "step": 1429, + "time_per_iteration": 4.714681625366211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083808, + "balance_loss_mlp": 1.04938078, + "epoch": 0.275105809926895, + "flos": 970445670912.0, + "grad_norm": 0.07686972857934649, + "language_loss": 0.80942416, + "learning_rate": 0.0008505669530941415, + "loss": 0.82026225, + "num_input_tokens_seen": 118415872, + "router_z_loss_mlp": 0.34472656, + "step": 1430, + "time_per_iteration": 3.2975006103515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080869, + "balance_loss_mlp": 1.04715657, + "epoch": 0.2752981916121585, + "flos": 527089185792.0, + "grad_norm": 0.061626933195079385, + "language_loss": 0.84357536, + "learning_rate": 0.000850344746161112, + "loss": 0.85438406, + "num_input_tokens_seen": 118483008, + "router_z_loss_mlp": 0.33740234, + "step": 1431, + "time_per_iteration": 2.596623182296753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079512, + "balance_loss_mlp": 1.04487014, + "epoch": 0.2754905732974221, + "flos": 453487444992.0, + "grad_norm": 0.05883177646218185, + "language_loss": 0.87880194, + "learning_rate": 0.0008501224032121894, + "loss": 0.88959706, + "num_input_tokens_seen": 118545840, + "router_z_loss_mlp": 0.34692383, + "step": 1432, + "time_per_iteration": 2.5134201049804688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082699, + "balance_loss_mlp": 1.04817569, + "epoch": 0.27568295498268564, + "flos": 497216918016.0, + "grad_norm": 0.05235854639463291, + "language_loss": 0.82002538, + "learning_rate": 0.0008498999243336946, + "loss": 0.83085239, + "num_input_tokens_seen": 118615168, + "router_z_loss_mlp": 0.34570312, + "step": 1433, + "time_per_iteration": 2.601771593093872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095225, + "balance_loss_mlp": 1.06060696, + "epoch": 0.2758753366679492, + "flos": 607890161664.0, + "grad_norm": 0.05891633941102979, + "language_loss": 0.87444806, + "learning_rate": 0.0008496773096120021, + "loss": 0.8854003, + "num_input_tokens_seen": 118690384, + "router_z_loss_mlp": 0.34643555, + "step": 1434, + "time_per_iteration": 2.788516044616699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096763, + "balance_loss_mlp": 1.06212115, + "epoch": 0.27606771835321275, + "flos": 739803290112.0, + "grad_norm": 0.06770297286276174, + "language_loss": 0.84185004, + "learning_rate": 0.0008494545591335381, + "loss": 0.85281765, + "num_input_tokens_seen": 118763024, + "router_z_loss_mlp": 0.34667969, + "step": 1435, + "time_per_iteration": 2.8759751319885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110127, + "balance_loss_mlp": 1.06736696, + "epoch": 0.27626010003847634, + "flos": 554279860224.0, + "grad_norm": 0.04450223786838935, + "language_loss": 0.87244952, + "learning_rate": 0.0008492316729847823, + "loss": 0.88346225, + "num_input_tokens_seen": 118845536, + "router_z_loss_mlp": 0.33935547, + "step": 1436, + "time_per_iteration": 2.781244993209839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094118, + "balance_loss_mlp": 1.06023908, + "epoch": 0.2764524817237399, + "flos": 542270439936.0, + "grad_norm": 0.055325808882979444, + "language_loss": 0.79874223, + "learning_rate": 0.0008490086512522664, + "loss": 0.80968338, + "num_input_tokens_seen": 118919008, + "router_z_loss_mlp": 0.33911133, + "step": 1437, + "time_per_iteration": 2.7197165489196777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093138, + "balance_loss_mlp": 1.05913925, + "epoch": 0.27664486340900346, + "flos": 406027344384.0, + "grad_norm": 0.0539948754920925, + "language_loss": 0.90713239, + "learning_rate": 0.0008487854940225755, + "loss": 0.91806382, + "num_input_tokens_seen": 118981376, + "router_z_loss_mlp": 0.34008789, + "step": 1438, + "time_per_iteration": 2.438218593597412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094602, + "balance_loss_mlp": 1.06017423, + "epoch": 0.27683724509426705, + "flos": 521884712448.0, + "grad_norm": 0.06140365718889793, + "language_loss": 0.90140885, + "learning_rate": 0.0008485622013823466, + "loss": 0.91235483, + "num_input_tokens_seen": 119050560, + "router_z_loss_mlp": 0.34448242, + "step": 1439, + "time_per_iteration": 2.653393268585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102393, + "balance_loss_mlp": 1.06770289, + "epoch": 0.2770296267795306, + "flos": 535085761536.0, + "grad_norm": 0.07554461134761571, + "language_loss": 0.8283006, + "learning_rate": 0.00084833877341827, + "loss": 0.83932453, + "num_input_tokens_seen": 119121104, + "router_z_loss_mlp": 0.34692383, + "step": 1440, + "time_per_iteration": 2.6312928199768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109947, + "balance_loss_mlp": 1.06587648, + "epoch": 0.27722200846479417, + "flos": 487747906560.0, + "grad_norm": 0.12145939933268801, + "language_loss": 0.8064183, + "learning_rate": 0.000848115210217088, + "loss": 0.81741297, + "num_input_tokens_seen": 119187712, + "router_z_loss_mlp": 0.33618164, + "step": 1441, + "time_per_iteration": 2.5490710735321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086774, + "balance_loss_mlp": 1.05167925, + "epoch": 0.2774143901500577, + "flos": 618012509184.0, + "grad_norm": 0.05766268366580332, + "language_loss": 0.82057106, + "learning_rate": 0.0008478915118655952, + "loss": 0.83143878, + "num_input_tokens_seen": 119259264, + "router_z_loss_mlp": 0.35131836, + "step": 1442, + "time_per_iteration": 2.710240602493286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086605, + "balance_loss_mlp": 1.05160558, + "epoch": 0.2776067718353213, + "flos": 513563659776.0, + "grad_norm": 0.05774564569051742, + "language_loss": 0.86505657, + "learning_rate": 0.0008476676784506393, + "loss": 0.87592262, + "num_input_tokens_seen": 119328304, + "router_z_loss_mlp": 0.35009766, + "step": 1443, + "time_per_iteration": 2.636622667312622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108686, + "balance_loss_mlp": 1.0526228, + "epoch": 0.2777991535205848, + "flos": 1003985957376.0, + "grad_norm": 0.10311001825576924, + "language_loss": 0.82024419, + "learning_rate": 0.0008474437100591201, + "loss": 0.8311128, + "num_input_tokens_seen": 119412352, + "router_z_loss_mlp": 0.34277344, + "step": 1444, + "time_per_iteration": 3.282383918762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091274, + "balance_loss_mlp": 1.05517697, + "epoch": 0.2779915352058484, + "flos": 550005147648.0, + "grad_norm": 0.05151300271624721, + "language_loss": 0.85496646, + "learning_rate": 0.0008472196067779898, + "loss": 0.86587918, + "num_input_tokens_seen": 119484464, + "router_z_loss_mlp": 0.36108398, + "step": 1445, + "time_per_iteration": 2.703263998031616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092087, + "balance_loss_mlp": 1.05591917, + "epoch": 0.278183916891112, + "flos": 873444930048.0, + "grad_norm": 0.06736388569990436, + "language_loss": 0.85432607, + "learning_rate": 0.0008469953686942531, + "loss": 0.86524689, + "num_input_tokens_seen": 119557280, + "router_z_loss_mlp": 0.36181641, + "step": 1446, + "time_per_iteration": 3.0834743976593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087224, + "balance_loss_mlp": 1.05305839, + "epoch": 0.2783762985763755, + "flos": 623782978560.0, + "grad_norm": 0.06536474240751361, + "language_loss": 0.83167183, + "learning_rate": 0.0008467709958949668, + "loss": 0.84254414, + "num_input_tokens_seen": 119631232, + "router_z_loss_mlp": 0.34204102, + "step": 1447, + "time_per_iteration": 2.737135887145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087199, + "balance_loss_mlp": 1.05362952, + "epoch": 0.2785686802616391, + "flos": 581571021312.0, + "grad_norm": 0.057056565872365954, + "language_loss": 0.85917461, + "learning_rate": 0.0008465464884672403, + "loss": 0.87004662, + "num_input_tokens_seen": 119700224, + "router_z_loss_mlp": 0.33569336, + "step": 1448, + "time_per_iteration": 2.6771810054779053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087224, + "balance_loss_mlp": 1.05346394, + "epoch": 0.27876106194690264, + "flos": 587032980480.0, + "grad_norm": 0.06237565084734976, + "language_loss": 0.85356677, + "learning_rate": 0.0008463218464982348, + "loss": 0.86443901, + "num_input_tokens_seen": 119781376, + "router_z_loss_mlp": 0.33789062, + "step": 1449, + "time_per_iteration": 2.799407720565796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086336, + "balance_loss_mlp": 1.05228984, + "epoch": 0.27895344363216623, + "flos": 875621574144.0, + "grad_norm": 0.06450477685794259, + "language_loss": 0.87800258, + "learning_rate": 0.0008460970700751645, + "loss": 0.88886595, + "num_input_tokens_seen": 119856672, + "router_z_loss_mlp": 0.34057617, + "step": 1450, + "time_per_iteration": 3.0517759323120117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086942, + "balance_loss_mlp": 1.05322921, + "epoch": 0.27914582531742976, + "flos": 603630005760.0, + "grad_norm": 0.06893143963997089, + "language_loss": 0.87761652, + "learning_rate": 0.000845872159285295, + "loss": 0.88848597, + "num_input_tokens_seen": 119929008, + "router_z_loss_mlp": 0.33740234, + "step": 1451, + "time_per_iteration": 2.6964316368103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042481, + "balance_loss_mlp": 1.0276041, + "epoch": 0.27933820700269335, + "flos": 1496892953088.0, + "grad_norm": 0.0242162718076618, + "language_loss": 0.77766848, + "learning_rate": 0.0008456471142159447, + "loss": 0.78809333, + "num_input_tokens_seen": 120164032, + "router_z_loss_mlp": 0.1484375, + "step": 1452, + "time_per_iteration": 4.936378717422485 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085289, + "balance_loss_mlp": 1.05162406, + "epoch": 0.2795305886879569, + "flos": 1031445854208.0, + "grad_norm": 0.05721806240601363, + "language_loss": 0.86067116, + "learning_rate": 0.0008454219349544836, + "loss": 0.87152404, + "num_input_tokens_seen": 120246784, + "router_z_loss_mlp": 0.33691406, + "step": 1453, + "time_per_iteration": 3.336336135864258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086797, + "balance_loss_mlp": 1.053442, + "epoch": 0.27972297037322047, + "flos": 606766934016.0, + "grad_norm": 0.056433536115445035, + "language_loss": 0.81829166, + "learning_rate": 0.000845196621588334, + "loss": 0.82915968, + "num_input_tokens_seen": 120318208, + "router_z_loss_mlp": 0.33374023, + "step": 1454, + "time_per_iteration": 2.7415192127227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088727, + "balance_loss_mlp": 1.05475271, + "epoch": 0.27991535205848406, + "flos": 630085948416.0, + "grad_norm": 0.05700257056170363, + "language_loss": 0.76605666, + "learning_rate": 0.0008449711742049706, + "loss": 0.77694392, + "num_input_tokens_seen": 120393248, + "router_z_loss_mlp": 0.33984375, + "step": 1455, + "time_per_iteration": 2.755082130432129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093567, + "balance_loss_mlp": 1.06035542, + "epoch": 0.2801077337437476, + "flos": 549034689024.0, + "grad_norm": 0.056412270826162, + "language_loss": 0.83750427, + "learning_rate": 0.0008447455928919196, + "loss": 0.84843993, + "num_input_tokens_seen": 120461040, + "router_z_loss_mlp": 0.33227539, + "step": 1456, + "time_per_iteration": 2.601306438446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097544, + "balance_loss_mlp": 1.06423688, + "epoch": 0.2803001154290112, + "flos": 486516989952.0, + "grad_norm": 0.08664389404831466, + "language_loss": 0.86875856, + "learning_rate": 0.0008445198777367595, + "loss": 0.87973404, + "num_input_tokens_seen": 120530400, + "router_z_loss_mlp": 0.33325195, + "step": 1457, + "time_per_iteration": 2.5534915924072266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106361, + "balance_loss_mlp": 1.07267249, + "epoch": 0.2804924971142747, + "flos": 521820693504.0, + "grad_norm": 0.060155581105879694, + "language_loss": 0.8096568, + "learning_rate": 0.0008442940288271208, + "loss": 0.82072043, + "num_input_tokens_seen": 120598304, + "router_z_loss_mlp": 0.33691406, + "step": 1458, + "time_per_iteration": 2.6646370887756348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108311, + "balance_loss_mlp": 1.07459903, + "epoch": 0.2806848787995383, + "flos": 527410690560.0, + "grad_norm": 0.05492641307724046, + "language_loss": 0.86995763, + "learning_rate": 0.0008440680462506856, + "loss": 0.88104069, + "num_input_tokens_seen": 120675712, + "router_z_loss_mlp": 0.33740234, + "step": 1459, + "time_per_iteration": 2.793306589126587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103222, + "balance_loss_mlp": 1.06927133, + "epoch": 0.2808772604848018, + "flos": 485246785536.0, + "grad_norm": 0.053370474172872176, + "language_loss": 0.86799729, + "learning_rate": 0.0008438419300951883, + "loss": 0.87902945, + "num_input_tokens_seen": 120746544, + "router_z_loss_mlp": 0.33984375, + "step": 1460, + "time_per_iteration": 2.6732945442199707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098403, + "balance_loss_mlp": 1.06488156, + "epoch": 0.2810696421700654, + "flos": 617840801280.0, + "grad_norm": 0.06081455520295947, + "language_loss": 0.86599934, + "learning_rate": 0.0008436156804484148, + "loss": 0.87698334, + "num_input_tokens_seen": 120823520, + "router_z_loss_mlp": 0.33544922, + "step": 1461, + "time_per_iteration": 2.768385410308838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087899, + "balance_loss_mlp": 1.05397177, + "epoch": 0.28126202385532895, + "flos": 454521922560.0, + "grad_norm": 0.061036272851527865, + "language_loss": 0.88221931, + "learning_rate": 0.0008433892973982031, + "loss": 0.89309829, + "num_input_tokens_seen": 120889568, + "router_z_loss_mlp": 0.33959961, + "step": 1462, + "time_per_iteration": 2.502269983291626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108859, + "balance_loss_mlp": 1.0546149, + "epoch": 0.28145440554059253, + "flos": 530447284224.0, + "grad_norm": 0.06533100110399645, + "language_loss": 0.85006338, + "learning_rate": 0.0008431627810324431, + "loss": 0.86094928, + "num_input_tokens_seen": 120958480, + "router_z_loss_mlp": 0.34008789, + "step": 1463, + "time_per_iteration": 2.6481637954711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097854, + "balance_loss_mlp": 1.06344974, + "epoch": 0.2816467872258561, + "flos": 451996070400.0, + "grad_norm": 0.053948569536927254, + "language_loss": 0.81259125, + "learning_rate": 0.000842936131439076, + "loss": 0.82356977, + "num_input_tokens_seen": 121028032, + "router_z_loss_mlp": 0.34423828, + "step": 1464, + "time_per_iteration": 2.598619222640991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100723, + "balance_loss_mlp": 1.06665313, + "epoch": 0.28183916891111965, + "flos": 472464755712.0, + "grad_norm": 0.06117554261618067, + "language_loss": 0.88043475, + "learning_rate": 0.0008427093487060951, + "loss": 0.89144206, + "num_input_tokens_seen": 121099280, + "router_z_loss_mlp": 0.34106445, + "step": 1465, + "time_per_iteration": 2.611067533493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092883, + "balance_loss_mlp": 1.05917072, + "epoch": 0.28203155059638324, + "flos": 556770806784.0, + "grad_norm": 0.05001896034653533, + "language_loss": 0.84742111, + "learning_rate": 0.000842482432921545, + "loss": 0.85834992, + "num_input_tokens_seen": 121180240, + "router_z_loss_mlp": 0.33740234, + "step": 1466, + "time_per_iteration": 2.8155059814453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109283, + "balance_loss_mlp": 1.05990458, + "epoch": 0.28222393228164677, + "flos": 416756385792.0, + "grad_norm": 0.06017010781955974, + "language_loss": 0.87132335, + "learning_rate": 0.0008422553841735225, + "loss": 0.88225162, + "num_input_tokens_seen": 121242736, + "router_z_loss_mlp": 0.3293457, + "step": 1467, + "time_per_iteration": 2.459348201751709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091932, + "balance_loss_mlp": 1.05731392, + "epoch": 0.28241631396691036, + "flos": 604629577728.0, + "grad_norm": 0.060074700521020694, + "language_loss": 0.84810078, + "learning_rate": 0.0008420282025501757, + "loss": 0.85902011, + "num_input_tokens_seen": 121319248, + "router_z_loss_mlp": 0.34643555, + "step": 1468, + "time_per_iteration": 2.7499678134918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086806, + "balance_loss_mlp": 1.05275989, + "epoch": 0.2826086956521739, + "flos": 572698529280.0, + "grad_norm": 0.05717030328031113, + "language_loss": 0.854882, + "learning_rate": 0.0008418008881397043, + "loss": 0.86575013, + "num_input_tokens_seen": 121392064, + "router_z_loss_mlp": 0.34082031, + "step": 1469, + "time_per_iteration": 2.6512861251831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080943, + "balance_loss_mlp": 1.04677796, + "epoch": 0.2828010773374375, + "flos": 842367886848.0, + "grad_norm": 0.05184982716140645, + "language_loss": 0.82590878, + "learning_rate": 0.0008415734410303595, + "loss": 0.8367182, + "num_input_tokens_seen": 121475984, + "router_z_loss_mlp": 0.34204102, + "step": 1470, + "time_per_iteration": 3.1787467002868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085547, + "balance_loss_mlp": 1.05214453, + "epoch": 0.28299345902270107, + "flos": 542402860032.0, + "grad_norm": 0.04590644458835405, + "language_loss": 0.90709066, + "learning_rate": 0.0008413458613104444, + "loss": 0.9179461, + "num_input_tokens_seen": 121551024, + "router_z_loss_mlp": 0.33447266, + "step": 1471, + "time_per_iteration": 2.6650707721710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092285, + "balance_loss_mlp": 1.05780995, + "epoch": 0.2831858407079646, + "flos": 571320635904.0, + "grad_norm": 0.05367648266979066, + "language_loss": 0.82737631, + "learning_rate": 0.0008411181490683129, + "loss": 0.83829916, + "num_input_tokens_seen": 121624528, + "router_z_loss_mlp": 0.34472656, + "step": 1472, + "time_per_iteration": 2.7423322200775146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104233, + "balance_loss_mlp": 1.06909025, + "epoch": 0.2833782223932282, + "flos": 763491861504.0, + "grad_norm": 0.05498194123694656, + "language_loss": 0.82467097, + "learning_rate": 0.0008408903043923707, + "loss": 0.83571333, + "num_input_tokens_seen": 121706736, + "router_z_loss_mlp": 0.35180664, + "step": 1473, + "time_per_iteration": 2.991787910461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114011, + "balance_loss_mlp": 1.07810485, + "epoch": 0.2835706040784917, + "flos": 538793068032.0, + "grad_norm": 0.05681509946110844, + "language_loss": 0.81401253, + "learning_rate": 0.0008406623273710754, + "loss": 0.82515264, + "num_input_tokens_seen": 121773008, + "router_z_loss_mlp": 0.35913086, + "step": 1474, + "time_per_iteration": 2.58866286277771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112011, + "balance_loss_mlp": 1.07732141, + "epoch": 0.2837629857637553, + "flos": 530329420800.0, + "grad_norm": 0.06008709036576614, + "language_loss": 0.82883334, + "learning_rate": 0.0008404342180929351, + "loss": 0.83995342, + "num_input_tokens_seen": 121840016, + "router_z_loss_mlp": 0.34716797, + "step": 1475, + "time_per_iteration": 2.636383295059204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112842, + "balance_loss_mlp": 1.07834268, + "epoch": 0.28395536744901884, + "flos": 539763526656.0, + "grad_norm": 0.06514959519071023, + "language_loss": 0.81725156, + "learning_rate": 0.00084020597664651, + "loss": 0.82837999, + "num_input_tokens_seen": 121915008, + "router_z_loss_mlp": 0.34521484, + "step": 1476, + "time_per_iteration": 2.7587718963623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106069, + "balance_loss_mlp": 1.0697813, + "epoch": 0.2841477491342824, + "flos": 573344510976.0, + "grad_norm": 0.0608139165355994, + "language_loss": 0.84204602, + "learning_rate": 0.0008399776031204111, + "loss": 0.85310674, + "num_input_tokens_seen": 121987456, + "router_z_loss_mlp": 0.36303711, + "step": 1477, + "time_per_iteration": 2.7376203536987305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097711, + "balance_loss_mlp": 1.06263912, + "epoch": 0.28434013081954596, + "flos": 571802264064.0, + "grad_norm": 0.06275845169868324, + "language_loss": 0.8026123, + "learning_rate": 0.0008397490976033009, + "loss": 0.81358939, + "num_input_tokens_seen": 122058720, + "router_z_loss_mlp": 0.35083008, + "step": 1478, + "time_per_iteration": 2.6391618251800537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0103776, + "balance_loss_mlp": 1.02412283, + "epoch": 0.28453251250480954, + "flos": 1552554832896.0, + "grad_norm": 0.016614249421738093, + "language_loss": 0.77879643, + "learning_rate": 0.000839520460183893, + "loss": 0.78917408, + "num_input_tokens_seen": 122285792, + "router_z_loss_mlp": 0.13671875, + "step": 1479, + "time_per_iteration": 4.730362176895142 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079858, + "balance_loss_mlp": 1.04550207, + "epoch": 0.28472489419007313, + "flos": 748720862208.0, + "grad_norm": 0.04873312803653651, + "language_loss": 0.85529596, + "learning_rate": 0.0008392916909509525, + "loss": 0.86609453, + "num_input_tokens_seen": 122366608, + "router_z_loss_mlp": 0.34399414, + "step": 1480, + "time_per_iteration": 3.0429892539978027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083924, + "balance_loss_mlp": 1.0495913, + "epoch": 0.28491727587533666, + "flos": 489914376192.0, + "grad_norm": 0.056617404906403615, + "language_loss": 0.85149431, + "learning_rate": 0.0008390627899932954, + "loss": 0.86233348, + "num_input_tokens_seen": 122435536, + "router_z_loss_mlp": 0.34375, + "step": 1481, + "time_per_iteration": 2.6355843544006348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083301, + "balance_loss_mlp": 1.04818201, + "epoch": 0.28510965756060025, + "flos": 728671196160.0, + "grad_norm": 0.06013951169928809, + "language_loss": 0.88358414, + "learning_rate": 0.000838833757399789, + "loss": 0.89441717, + "num_input_tokens_seen": 122515584, + "router_z_loss_mlp": 0.3515625, + "step": 1482, + "time_per_iteration": 2.9198856353759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088409, + "balance_loss_mlp": 1.05357623, + "epoch": 0.2853020392458638, + "flos": 551300083200.0, + "grad_norm": 0.06378715850004578, + "language_loss": 0.80512154, + "learning_rate": 0.0008386045932593515, + "loss": 0.81600565, + "num_input_tokens_seen": 122585552, + "router_z_loss_mlp": 0.34887695, + "step": 1483, + "time_per_iteration": 2.665919065475464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087283, + "balance_loss_mlp": 1.05233121, + "epoch": 0.28549442093112737, + "flos": 754456425984.0, + "grad_norm": 0.06049898751226662, + "language_loss": 0.86304945, + "learning_rate": 0.0008383752976609525, + "loss": 0.87392229, + "num_input_tokens_seen": 122658928, + "router_z_loss_mlp": 0.34960938, + "step": 1484, + "time_per_iteration": 2.9113876819610596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081674, + "balance_loss_mlp": 1.04586315, + "epoch": 0.2856868026163909, + "flos": 538311439872.0, + "grad_norm": 0.05363431349597561, + "language_loss": 0.79897112, + "learning_rate": 0.0008381458706936123, + "loss": 0.80978787, + "num_input_tokens_seen": 122729056, + "router_z_loss_mlp": 0.3581543, + "step": 1485, + "time_per_iteration": 2.715182065963745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082792, + "balance_loss_mlp": 1.0470289, + "epoch": 0.2858791843016545, + "flos": 583487207424.0, + "grad_norm": 0.055785658857036256, + "language_loss": 0.8776381, + "learning_rate": 0.0008379163124464025, + "loss": 0.888466, + "num_input_tokens_seen": 122802832, + "router_z_loss_mlp": 0.35766602, + "step": 1486, + "time_per_iteration": 2.713412284851074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083174, + "balance_loss_mlp": 1.04881775, + "epoch": 0.286071565986918, + "flos": 644503357440.0, + "grad_norm": 0.05967072286491994, + "language_loss": 0.76593089, + "learning_rate": 0.0008376866230084452, + "loss": 0.7767626, + "num_input_tokens_seen": 122881328, + "router_z_loss_mlp": 0.34399414, + "step": 1487, + "time_per_iteration": 2.812953472137451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091632, + "balance_loss_mlp": 1.05522537, + "epoch": 0.2862639476721816, + "flos": 491120561664.0, + "grad_norm": 0.06413337589788286, + "language_loss": 0.85965335, + "learning_rate": 0.000837456802468914, + "loss": 0.87056965, + "num_input_tokens_seen": 122949680, + "router_z_loss_mlp": 0.36401367, + "step": 1488, + "time_per_iteration": 2.5974318981170654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096408, + "balance_loss_mlp": 1.06050217, + "epoch": 0.2864563293574452, + "flos": 521363796480.0, + "grad_norm": 0.06049840128310572, + "language_loss": 0.85439187, + "learning_rate": 0.0008372268509170331, + "loss": 0.86535597, + "num_input_tokens_seen": 123024736, + "router_z_loss_mlp": 0.35888672, + "step": 1489, + "time_per_iteration": 2.6646056175231934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100143, + "balance_loss_mlp": 1.06478548, + "epoch": 0.2866487110427087, + "flos": 546834723840.0, + "grad_norm": 0.05582745965585505, + "language_loss": 0.84845203, + "learning_rate": 0.0008369967684420779, + "loss": 0.85945344, + "num_input_tokens_seen": 123097344, + "router_z_loss_mlp": 0.35424805, + "step": 1490, + "time_per_iteration": 2.737180471420288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094742, + "balance_loss_mlp": 1.05902719, + "epoch": 0.2868410927279723, + "flos": 481977437184.0, + "grad_norm": 0.0702351654670911, + "language_loss": 0.84684229, + "learning_rate": 0.0008367665551333736, + "loss": 0.85778964, + "num_input_tokens_seen": 123166240, + "router_z_loss_mlp": 0.35717773, + "step": 1491, + "time_per_iteration": 2.591179847717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095437, + "balance_loss_mlp": 1.05807662, + "epoch": 0.28703347441323585, + "flos": 724578365952.0, + "grad_norm": 0.0690733570245185, + "language_loss": 0.85732669, + "learning_rate": 0.0008365362110802977, + "loss": 0.86828107, + "num_input_tokens_seen": 123238160, + "router_z_loss_mlp": 0.37329102, + "step": 1492, + "time_per_iteration": 2.8586251735687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083578, + "balance_loss_mlp": 1.04733849, + "epoch": 0.28722585609849943, + "flos": 634670581248.0, + "grad_norm": 0.059898504183233336, + "language_loss": 0.82604659, + "learning_rate": 0.0008363057363722773, + "loss": 0.83688229, + "num_input_tokens_seen": 123319504, + "router_z_loss_mlp": 0.36254883, + "step": 1493, + "time_per_iteration": 2.8491427898406982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076927, + "balance_loss_mlp": 1.04264212, + "epoch": 0.28741823778376296, + "flos": 509974216704.0, + "grad_norm": 0.05796804627405179, + "language_loss": 0.84095198, + "learning_rate": 0.0008360751310987906, + "loss": 0.85172129, + "num_input_tokens_seen": 123387008, + "router_z_loss_mlp": 0.34301758, + "step": 1494, + "time_per_iteration": 2.5735158920288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077369, + "balance_loss_mlp": 1.04294157, + "epoch": 0.28761061946902655, + "flos": 603458297856.0, + "grad_norm": 0.07368534281083228, + "language_loss": 0.85552645, + "learning_rate": 0.0008358443953493666, + "loss": 0.86630011, + "num_input_tokens_seen": 123471056, + "router_z_loss_mlp": 0.34472656, + "step": 1495, + "time_per_iteration": 2.8492400646209717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078466, + "balance_loss_mlp": 1.04260767, + "epoch": 0.28780300115429014, + "flos": 406977454080.0, + "grad_norm": 0.061458136593000166, + "language_loss": 0.88553399, + "learning_rate": 0.0008356135292135851, + "loss": 0.89631861, + "num_input_tokens_seen": 123535024, + "router_z_loss_mlp": 0.35864258, + "step": 1496, + "time_per_iteration": 2.499234676361084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109055, + "balance_loss_mlp": 1.05428672, + "epoch": 0.28799538283955367, + "flos": 374726310912.0, + "grad_norm": 0.06023187099093886, + "language_loss": 0.92244387, + "learning_rate": 0.0008353825327810758, + "loss": 0.93334937, + "num_input_tokens_seen": 123596224, + "router_z_loss_mlp": 0.36230469, + "step": 1497, + "time_per_iteration": 2.4068801403045654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083025, + "balance_loss_mlp": 1.04761958, + "epoch": 0.28818776452481726, + "flos": 591645316608.0, + "grad_norm": 0.050935971597675156, + "language_loss": 0.81914794, + "learning_rate": 0.00083515140614152, + "loss": 0.82997811, + "num_input_tokens_seen": 123668640, + "router_z_loss_mlp": 0.35473633, + "step": 1498, + "time_per_iteration": 2.7172293663024902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085079, + "balance_loss_mlp": 1.05012727, + "epoch": 0.2883801462100808, + "flos": 534819511296.0, + "grad_norm": 0.055380500747477406, + "language_loss": 0.86671853, + "learning_rate": 0.0008349201493846485, + "loss": 0.87756932, + "num_input_tokens_seen": 123740816, + "router_z_loss_mlp": 0.34985352, + "step": 1499, + "time_per_iteration": 2.666877508163452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088163, + "balance_loss_mlp": 1.05268669, + "epoch": 0.2885725278953444, + "flos": 479850255360.0, + "grad_norm": 0.06392675802491345, + "language_loss": 0.89344347, + "learning_rate": 0.0008346887626002432, + "loss": 0.90432513, + "num_input_tokens_seen": 123805968, + "router_z_loss_mlp": 0.35473633, + "step": 1500, + "time_per_iteration": 2.547353744506836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081111, + "balance_loss_mlp": 1.04613519, + "epoch": 0.2887649095806079, + "flos": 463798877184.0, + "grad_norm": 0.050375470508879826, + "language_loss": 0.86195928, + "learning_rate": 0.000834457245878137, + "loss": 0.87277037, + "num_input_tokens_seen": 123876576, + "router_z_loss_mlp": 0.35009766, + "step": 1501, + "time_per_iteration": 2.6108102798461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076981, + "balance_loss_mlp": 1.04317355, + "epoch": 0.2889572912658715, + "flos": 930631527936.0, + "grad_norm": 0.05668037017333152, + "language_loss": 0.81365681, + "learning_rate": 0.000834225599308212, + "loss": 0.82442665, + "num_input_tokens_seen": 123967664, + "router_z_loss_mlp": 0.33837891, + "step": 1502, + "time_per_iteration": 3.2222447395324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085232, + "balance_loss_mlp": 1.04994583, + "epoch": 0.28914967295113503, + "flos": 569848200192.0, + "grad_norm": 0.05132223508893719, + "language_loss": 0.85018057, + "learning_rate": 0.0008339938229804016, + "loss": 0.8610329, + "num_input_tokens_seen": 124039680, + "router_z_loss_mlp": 0.35327148, + "step": 1503, + "time_per_iteration": 2.698528289794922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032992, + "balance_loss_mlp": 1.01945031, + "epoch": 0.2893420546363986, + "flos": 1485803119104.0, + "grad_norm": 0.02573157997511775, + "language_loss": 0.75434822, + "learning_rate": 0.0008337619169846895, + "loss": 0.76467812, + "num_input_tokens_seen": 124278848, + "router_z_loss_mlp": 0.13574219, + "step": 1504, + "time_per_iteration": 4.950274467468262 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083767, + "balance_loss_mlp": 1.04793239, + "epoch": 0.2895344363216622, + "flos": 469938903552.0, + "grad_norm": 0.06568085425348943, + "language_loss": 0.84119928, + "learning_rate": 0.0008335298814111094, + "loss": 0.85203701, + "num_input_tokens_seen": 124346736, + "router_z_loss_mlp": 0.35864258, + "step": 1505, + "time_per_iteration": 2.542043924331665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085418, + "balance_loss_mlp": 1.05089498, + "epoch": 0.28972681800692573, + "flos": 647909508096.0, + "grad_norm": 0.06591449016003405, + "language_loss": 0.87860626, + "learning_rate": 0.0008332977163497455, + "loss": 0.88946044, + "num_input_tokens_seen": 124420816, + "router_z_loss_mlp": 0.34570312, + "step": 1506, + "time_per_iteration": 2.810399293899536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087351, + "balance_loss_mlp": 1.05163622, + "epoch": 0.2899191996921893, + "flos": 571955033088.0, + "grad_norm": 0.054529888005095714, + "language_loss": 0.83185649, + "learning_rate": 0.0008330654218907325, + "loss": 0.84272999, + "num_input_tokens_seen": 124490480, + "router_z_loss_mlp": 0.35742188, + "step": 1507, + "time_per_iteration": 2.6968414783477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084817, + "balance_loss_mlp": 1.04981744, + "epoch": 0.29011158137745285, + "flos": 661037773824.0, + "grad_norm": 0.1280653735040032, + "language_loss": 0.81777966, + "learning_rate": 0.0008328329981242548, + "loss": 0.82862782, + "num_input_tokens_seen": 124564960, + "router_z_loss_mlp": 0.3503418, + "step": 1508, + "time_per_iteration": 2.886396884918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089679, + "balance_loss_mlp": 1.05441689, + "epoch": 0.29030396306271644, + "flos": 535933974528.0, + "grad_norm": 0.060234790533374126, + "language_loss": 0.87937206, + "learning_rate": 0.0008326004451405475, + "loss": 0.89026886, + "num_input_tokens_seen": 124637424, + "router_z_loss_mlp": 0.3527832, + "step": 1509, + "time_per_iteration": 2.7797772884368896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096033, + "balance_loss_mlp": 1.06148589, + "epoch": 0.29049634474798, + "flos": 511707110400.0, + "grad_norm": 0.05385470227261208, + "language_loss": 0.82548428, + "learning_rate": 0.0008323677630298957, + "loss": 0.83644462, + "num_input_tokens_seen": 124704832, + "router_z_loss_mlp": 0.34594727, + "step": 1510, + "time_per_iteration": 2.5542855262756348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093546, + "balance_loss_mlp": 1.05892766, + "epoch": 0.29068872643324356, + "flos": 613454017536.0, + "grad_norm": 0.05556182475666109, + "language_loss": 0.85001689, + "learning_rate": 0.0008321349518826345, + "loss": 0.86095232, + "num_input_tokens_seen": 124779600, + "router_z_loss_mlp": 0.34643555, + "step": 1511, + "time_per_iteration": 2.7849388122558594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093876, + "balance_loss_mlp": 1.05878115, + "epoch": 0.2908811081185071, + "flos": 546164011008.0, + "grad_norm": 0.07046084545113683, + "language_loss": 0.94823933, + "learning_rate": 0.0008319020117891491, + "loss": 0.95917809, + "num_input_tokens_seen": 124844128, + "router_z_loss_mlp": 0.35131836, + "step": 1512, + "time_per_iteration": 2.5936031341552734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090975, + "balance_loss_mlp": 1.05485463, + "epoch": 0.2910734898037707, + "flos": 604516096512.0, + "grad_norm": 0.06307487928884016, + "language_loss": 0.87063539, + "learning_rate": 0.0008316689428398751, + "loss": 0.88154513, + "num_input_tokens_seen": 124915376, + "router_z_loss_mlp": 0.36108398, + "step": 1513, + "time_per_iteration": 2.6774067878723145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082761, + "balance_loss_mlp": 1.04792798, + "epoch": 0.29126587148903427, + "flos": 574383370752.0, + "grad_norm": 0.043578668947666564, + "language_loss": 0.88254529, + "learning_rate": 0.0008314357451252979, + "loss": 0.89337289, + "num_input_tokens_seen": 124995504, + "router_z_loss_mlp": 0.34887695, + "step": 1514, + "time_per_iteration": 2.7561941146850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086548, + "balance_loss_mlp": 1.05092812, + "epoch": 0.2914582531742978, + "flos": 570802692096.0, + "grad_norm": 0.06240160889449628, + "language_loss": 0.87564558, + "learning_rate": 0.0008312024187359527, + "loss": 0.88651109, + "num_input_tokens_seen": 125064192, + "router_z_loss_mlp": 0.35644531, + "step": 1515, + "time_per_iteration": 2.636056900024414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084088, + "balance_loss_mlp": 1.04942179, + "epoch": 0.2916506348595614, + "flos": 730523363328.0, + "grad_norm": 0.06185972429295104, + "language_loss": 0.87361014, + "learning_rate": 0.000830968963762425, + "loss": 0.88445103, + "num_input_tokens_seen": 125150560, + "router_z_loss_mlp": 0.34716797, + "step": 1516, + "time_per_iteration": 3.021732807159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091474, + "balance_loss_mlp": 1.05528235, + "epoch": 0.2918430165448249, + "flos": 510220118016.0, + "grad_norm": 0.05583453201751925, + "language_loss": 0.83947027, + "learning_rate": 0.0008307353802953497, + "loss": 0.85038507, + "num_input_tokens_seen": 125219264, + "router_z_loss_mlp": 0.36206055, + "step": 1517, + "time_per_iteration": 2.6659955978393555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100836, + "balance_loss_mlp": 1.06318951, + "epoch": 0.2920353982300885, + "flos": 630096122880.0, + "grad_norm": 0.04472517729516854, + "language_loss": 0.86110896, + "learning_rate": 0.0008305016684254125, + "loss": 0.87211728, + "num_input_tokens_seen": 125301904, + "router_z_loss_mlp": 0.37646484, + "step": 1518, + "time_per_iteration": 2.7896409034729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098986, + "balance_loss_mlp": 1.06153059, + "epoch": 0.29222777991535204, + "flos": 501411644928.0, + "grad_norm": 0.055409034097420505, + "language_loss": 0.86932153, + "learning_rate": 0.0008302678282433479, + "loss": 0.88031137, + "num_input_tokens_seen": 125367712, + "router_z_loss_mlp": 0.37451172, + "step": 1519, + "time_per_iteration": 2.585256814956665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095676, + "balance_loss_mlp": 1.05891216, + "epoch": 0.2924201616006156, + "flos": 486522782208.0, + "grad_norm": 0.057505891705300044, + "language_loss": 0.85011005, + "learning_rate": 0.0008300338598399411, + "loss": 0.86106682, + "num_input_tokens_seen": 125437648, + "router_z_loss_mlp": 0.36791992, + "step": 1520, + "time_per_iteration": 2.6471352577209473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109825, + "balance_loss_mlp": 1.07210708, + "epoch": 0.2926125432858792, + "flos": 476211350016.0, + "grad_norm": 0.05302442020038178, + "language_loss": 0.9456166, + "learning_rate": 0.0008297997633060263, + "loss": 0.95671487, + "num_input_tokens_seen": 125502432, + "router_z_loss_mlp": 0.37719727, + "step": 1521, + "time_per_iteration": 2.547457695007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106499, + "balance_loss_mlp": 1.06987774, + "epoch": 0.29280492497114274, + "flos": 676379151360.0, + "grad_norm": 0.054888704647412474, + "language_loss": 0.85310549, + "learning_rate": 0.0008295655387324883, + "loss": 0.86417043, + "num_input_tokens_seen": 125575424, + "router_z_loss_mlp": 0.36645508, + "step": 1522, + "time_per_iteration": 2.822557210922241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105716, + "balance_loss_mlp": 1.0686183, + "epoch": 0.29299730665640633, + "flos": 458175384576.0, + "grad_norm": 0.055715580232585875, + "language_loss": 0.85025144, + "learning_rate": 0.0008293311862102609, + "loss": 0.86130863, + "num_input_tokens_seen": 125639040, + "router_z_loss_mlp": 0.37084961, + "step": 1523, + "time_per_iteration": 2.5033791065216064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103076, + "balance_loss_mlp": 1.06795669, + "epoch": 0.29318968834166986, + "flos": 446343464448.0, + "grad_norm": 0.0584953499596263, + "language_loss": 0.88722956, + "learning_rate": 0.0008290967058303275, + "loss": 0.89826035, + "num_input_tokens_seen": 125701712, + "router_z_loss_mlp": 0.3515625, + "step": 1524, + "time_per_iteration": 2.5981156826019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102477, + "balance_loss_mlp": 1.06630898, + "epoch": 0.29338207002693345, + "flos": 450085676544.0, + "grad_norm": 0.05072610752657829, + "language_loss": 0.86932707, + "learning_rate": 0.0008288620976837219, + "loss": 0.88035178, + "num_input_tokens_seen": 125765088, + "router_z_loss_mlp": 0.36181641, + "step": 1525, + "time_per_iteration": 2.522019863128662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092716, + "balance_loss_mlp": 1.05623853, + "epoch": 0.293574451712197, + "flos": 502027103232.0, + "grad_norm": 0.05230718040210392, + "language_loss": 0.83001733, + "learning_rate": 0.000828627361861527, + "loss": 0.84094453, + "num_input_tokens_seen": 125831328, + "router_z_loss_mlp": 0.36474609, + "step": 1526, + "time_per_iteration": 2.559201955795288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085779, + "balance_loss_mlp": 1.05051661, + "epoch": 0.29376683339746057, + "flos": 696158184960.0, + "grad_norm": 0.071892180297548, + "language_loss": 0.8465147, + "learning_rate": 0.0008283924984548752, + "loss": 0.85737246, + "num_input_tokens_seen": 125903664, + "router_z_loss_mlp": 0.3527832, + "step": 1527, + "time_per_iteration": 2.8108816146850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079347, + "balance_loss_mlp": 1.04494321, + "epoch": 0.2939592150827241, + "flos": 478353088512.0, + "grad_norm": 0.05128355551395112, + "language_loss": 0.85087478, + "learning_rate": 0.0008281575075549485, + "loss": 0.86166823, + "num_input_tokens_seen": 125971856, + "router_z_loss_mlp": 0.34399414, + "step": 1528, + "time_per_iteration": 2.576814889907837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051452, + "balance_loss_mlp": 1.04057992, + "epoch": 0.2941515967679877, + "flos": 1484482042368.0, + "grad_norm": 0.031732851839211505, + "language_loss": 0.77352691, + "learning_rate": 0.000827922389252979, + "loss": 0.7840414, + "num_input_tokens_seen": 126183968, + "router_z_loss_mlp": 0.10888672, + "step": 1529, + "time_per_iteration": 4.662023067474365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089321, + "balance_loss_mlp": 1.0547502, + "epoch": 0.2943439784532513, + "flos": 673848916992.0, + "grad_norm": 0.06453398347295829, + "language_loss": 0.90086716, + "learning_rate": 0.0008276871436402469, + "loss": 0.91176039, + "num_input_tokens_seen": 126254448, + "router_z_loss_mlp": 0.34619141, + "step": 1530, + "time_per_iteration": 2.795783758163452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097742, + "balance_loss_mlp": 1.06460166, + "epoch": 0.2945363601385148, + "flos": 576031896576.0, + "grad_norm": 0.05195467848041957, + "language_loss": 0.87790835, + "learning_rate": 0.000827451770808083, + "loss": 0.88888574, + "num_input_tokens_seen": 126328208, + "router_z_loss_mlp": 0.33154297, + "step": 1531, + "time_per_iteration": 2.6522414684295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100953, + "balance_loss_mlp": 1.06628692, + "epoch": 0.2947287418237784, + "flos": 480416251392.0, + "grad_norm": 0.05572078055736918, + "language_loss": 0.83276248, + "learning_rate": 0.0008272162708478674, + "loss": 0.84377199, + "num_input_tokens_seen": 126396464, + "router_z_loss_mlp": 0.34692383, + "step": 1532, + "time_per_iteration": 2.5960874557495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098518, + "balance_loss_mlp": 1.06459141, + "epoch": 0.2949211235090419, + "flos": 557917355520.0, + "grad_norm": 0.05232404820193651, + "language_loss": 0.86136103, + "learning_rate": 0.000826980643851029, + "loss": 0.87234622, + "num_input_tokens_seen": 126468960, + "router_z_loss_mlp": 0.33959961, + "step": 1533, + "time_per_iteration": 2.671867609024048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106353, + "balance_loss_mlp": 1.07147205, + "epoch": 0.2951135051943055, + "flos": 483646311936.0, + "grad_norm": 0.06650262295584625, + "language_loss": 0.84864676, + "learning_rate": 0.0008267448899090464, + "loss": 0.85971034, + "num_input_tokens_seen": 126536496, + "router_z_loss_mlp": 0.34887695, + "step": 1534, + "time_per_iteration": 2.5133543014526367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095921, + "balance_loss_mlp": 1.0604682, + "epoch": 0.29530588687956905, + "flos": 550015322112.0, + "grad_norm": 0.05711998360561463, + "language_loss": 0.80980158, + "learning_rate": 0.0008265090091134473, + "loss": 0.82076073, + "num_input_tokens_seen": 126614048, + "router_z_loss_mlp": 0.35473633, + "step": 1535, + "time_per_iteration": 2.8528778553009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085228, + "balance_loss_mlp": 1.05072904, + "epoch": 0.29549826856483263, + "flos": 672731481600.0, + "grad_norm": 0.047870597747086484, + "language_loss": 0.80150926, + "learning_rate": 0.0008262730015558088, + "loss": 0.8123616, + "num_input_tokens_seen": 126697248, + "router_z_loss_mlp": 0.34521484, + "step": 1536, + "time_per_iteration": 2.8849382400512695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076376, + "balance_loss_mlp": 1.04192495, + "epoch": 0.29569065025009617, + "flos": 764300786688.0, + "grad_norm": 0.06331525049863725, + "language_loss": 0.82269859, + "learning_rate": 0.0008260368673277574, + "loss": 0.8334623, + "num_input_tokens_seen": 126782496, + "router_z_loss_mlp": 0.34472656, + "step": 1537, + "time_per_iteration": 3.12172269821167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078873, + "balance_loss_mlp": 1.04480314, + "epoch": 0.29588303193535975, + "flos": 543398049792.0, + "grad_norm": 0.05107262607685598, + "language_loss": 0.84019077, + "learning_rate": 0.0008258006065209682, + "loss": 0.85097957, + "num_input_tokens_seen": 126857328, + "router_z_loss_mlp": 0.34106445, + "step": 1538, + "time_per_iteration": 2.7388381958007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082306, + "balance_loss_mlp": 1.04744971, + "epoch": 0.29607541362062334, + "flos": 596648968704.0, + "grad_norm": 0.06469434822608365, + "language_loss": 0.80634302, + "learning_rate": 0.0008255642192271657, + "loss": 0.81716609, + "num_input_tokens_seen": 126932608, + "router_z_loss_mlp": 0.34863281, + "step": 1539, + "time_per_iteration": 2.7957324981689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082434, + "balance_loss_mlp": 1.04774427, + "epoch": 0.29626779530588687, + "flos": 609588149760.0, + "grad_norm": 0.06097977692176942, + "language_loss": 0.83830953, + "learning_rate": 0.0008253277055381241, + "loss": 0.84913385, + "num_input_tokens_seen": 127008928, + "router_z_loss_mlp": 0.34741211, + "step": 1540, + "time_per_iteration": 2.8428521156311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085866, + "balance_loss_mlp": 1.05146217, + "epoch": 0.29646017699115046, + "flos": 867050237952.0, + "grad_norm": 0.06407432486539091, + "language_loss": 0.8580029, + "learning_rate": 0.0008250910655456658, + "loss": 0.8688615, + "num_input_tokens_seen": 127097104, + "router_z_loss_mlp": 0.34448242, + "step": 1541, + "time_per_iteration": 3.1741185188293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081587, + "balance_loss_mlp": 1.04696846, + "epoch": 0.296652558676414, + "flos": 495616444416.0, + "grad_norm": 0.06683547404256097, + "language_loss": 0.83703458, + "learning_rate": 0.0008248542993416625, + "loss": 0.84785044, + "num_input_tokens_seen": 127165264, + "router_z_loss_mlp": 0.34643555, + "step": 1542, + "time_per_iteration": 2.5634429454803467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083482, + "balance_loss_mlp": 1.04914963, + "epoch": 0.2968449403616776, + "flos": 571275555840.0, + "grad_norm": 0.054805025189504364, + "language_loss": 0.83645159, + "learning_rate": 0.0008246174070180352, + "loss": 0.84728634, + "num_input_tokens_seen": 127238992, + "router_z_loss_mlp": 0.34375, + "step": 1543, + "time_per_iteration": 2.664029121398926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108698, + "balance_loss_mlp": 1.05226684, + "epoch": 0.2970373220469411, + "flos": 793799115264.0, + "grad_norm": 0.06369286414713611, + "language_loss": 0.84087443, + "learning_rate": 0.0008243803886667537, + "loss": 0.85174423, + "num_input_tokens_seen": 127328160, + "router_z_loss_mlp": 0.34765625, + "step": 1544, + "time_per_iteration": 3.129185199737549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082817, + "balance_loss_mlp": 1.04793644, + "epoch": 0.2972297037322047, + "flos": 660736617984.0, + "grad_norm": 0.0569938777400986, + "language_loss": 0.79051471, + "learning_rate": 0.0008241432443798364, + "loss": 0.80134284, + "num_input_tokens_seen": 127407328, + "router_z_loss_mlp": 0.34936523, + "step": 1545, + "time_per_iteration": 2.7968478202819824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076425, + "balance_loss_mlp": 1.04225969, + "epoch": 0.29742208541746823, + "flos": 596849789952.0, + "grad_norm": 0.05185676674228935, + "language_loss": 0.85634965, + "learning_rate": 0.0008239059742493512, + "loss": 0.86711389, + "num_input_tokens_seen": 127477136, + "router_z_loss_mlp": 0.34204102, + "step": 1546, + "time_per_iteration": 2.730803966522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079044, + "balance_loss_mlp": 1.0448308, + "epoch": 0.2976144671027318, + "flos": 769519816704.0, + "grad_norm": 0.049935350225070424, + "language_loss": 0.87424839, + "learning_rate": 0.0008236685783674142, + "loss": 0.88503873, + "num_input_tokens_seen": 127565680, + "router_z_loss_mlp": 0.34228516, + "step": 1547, + "time_per_iteration": 3.0735998153686523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060573, + "balance_loss_mlp": 1.04903388, + "epoch": 0.2978068487879954, + "flos": 1483980065280.0, + "grad_norm": 0.022808758650826922, + "language_loss": 0.76221192, + "learning_rate": 0.0008234310568261911, + "loss": 0.77281767, + "num_input_tokens_seen": 127791584, + "router_z_loss_mlp": 0.11523438, + "step": 1548, + "time_per_iteration": 4.902673959732056 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088892, + "balance_loss_mlp": 1.05460715, + "epoch": 0.29799923047325894, + "flos": 475079357952.0, + "grad_norm": 0.07696298762455249, + "language_loss": 0.82568306, + "learning_rate": 0.0008231934097178955, + "loss": 0.83657193, + "num_input_tokens_seen": 127860112, + "router_z_loss_mlp": 0.34326172, + "step": 1549, + "time_per_iteration": 2.59600567817688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087503, + "balance_loss_mlp": 1.05173981, + "epoch": 0.2981916121585225, + "flos": 759464460288.0, + "grad_norm": 0.05308820200633048, + "language_loss": 0.8525809, + "learning_rate": 0.0008229556371347903, + "loss": 0.86345589, + "num_input_tokens_seen": 127938752, + "router_z_loss_mlp": 0.35791016, + "step": 1550, + "time_per_iteration": 2.955467939376831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080247, + "balance_loss_mlp": 1.04593909, + "epoch": 0.29838399384378606, + "flos": 874642351104.0, + "grad_norm": 0.058723621398699785, + "language_loss": 0.79088616, + "learning_rate": 0.0008227177391691874, + "loss": 0.80168855, + "num_input_tokens_seen": 128022192, + "router_z_loss_mlp": 0.34350586, + "step": 1551, + "time_per_iteration": 3.1204521656036377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084967, + "balance_loss_mlp": 1.05001473, + "epoch": 0.29857637552904964, + "flos": 579389995008.0, + "grad_norm": 0.060980844602782615, + "language_loss": 0.89576113, + "learning_rate": 0.0008224797159134463, + "loss": 0.90661073, + "num_input_tokens_seen": 128097776, + "router_z_loss_mlp": 0.34985352, + "step": 1552, + "time_per_iteration": 2.7535347938537598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075057, + "balance_loss_mlp": 1.04132128, + "epoch": 0.2987687572143132, + "flos": 836048950272.0, + "grad_norm": 0.05791718796165568, + "language_loss": 0.83571118, + "learning_rate": 0.0008222415674599765, + "loss": 0.84646177, + "num_input_tokens_seen": 128179888, + "router_z_loss_mlp": 0.33764648, + "step": 1553, + "time_per_iteration": 3.0609707832336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077849, + "balance_loss_mlp": 1.0417521, + "epoch": 0.29896113889957676, + "flos": 566800022016.0, + "grad_norm": 0.05477323920870417, + "language_loss": 0.83255476, + "learning_rate": 0.0008220032939012349, + "loss": 0.84333324, + "num_input_tokens_seen": 128251152, + "router_z_loss_mlp": 0.36108398, + "step": 1554, + "time_per_iteration": 2.6683521270751953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077555, + "balance_loss_mlp": 1.04310393, + "epoch": 0.29915352058484035, + "flos": 498370669056.0, + "grad_norm": 0.049159177960894807, + "language_loss": 0.87956095, + "learning_rate": 0.0008217648953297277, + "loss": 0.89033645, + "num_input_tokens_seen": 128327600, + "router_z_loss_mlp": 0.34472656, + "step": 1555, + "time_per_iteration": 2.82114315032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109245, + "balance_loss_mlp": 1.05711639, + "epoch": 0.2993459022701039, + "flos": 591837373440.0, + "grad_norm": 0.06210935096260163, + "language_loss": 0.7799179, + "learning_rate": 0.0008215263718380095, + "loss": 0.79084241, + "num_input_tokens_seen": 128398432, + "router_z_loss_mlp": 0.35327148, + "step": 1556, + "time_per_iteration": 2.6806485652923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104132, + "balance_loss_mlp": 1.06815481, + "epoch": 0.29953828395536747, + "flos": 572107802112.0, + "grad_norm": 0.051501670996139066, + "language_loss": 0.8437115, + "learning_rate": 0.0008212877235186833, + "loss": 0.8547529, + "num_input_tokens_seen": 128469696, + "router_z_loss_mlp": 0.36010742, + "step": 1557, + "time_per_iteration": 2.706531286239624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075275, + "balance_loss_mlp": 1.06321061, + "epoch": 0.299730665640631, + "flos": 1503855051264.0, + "grad_norm": 0.03618665962020262, + "language_loss": 0.77737558, + "learning_rate": 0.0008210489504644005, + "loss": 0.78812838, + "num_input_tokens_seen": 128698560, + "router_z_loss_mlp": 0.12060547, + "step": 1558, + "time_per_iteration": 4.914030313491821 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102225, + "balance_loss_mlp": 1.06641483, + "epoch": 0.2999230473258946, + "flos": 513538928640.0, + "grad_norm": 0.06717328469529676, + "language_loss": 0.80777293, + "learning_rate": 0.0008208100527678611, + "loss": 0.8187952, + "num_input_tokens_seen": 128765952, + "router_z_loss_mlp": 0.3581543, + "step": 1559, + "time_per_iteration": 2.5862650871276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097511, + "balance_loss_mlp": 1.06294012, + "epoch": 0.3001154290111581, + "flos": 834128381952.0, + "grad_norm": 0.05731533213860712, + "language_loss": 0.78337204, + "learning_rate": 0.0008205710305218135, + "loss": 0.79434717, + "num_input_tokens_seen": 128840048, + "router_z_loss_mlp": 0.34594727, + "step": 1560, + "time_per_iteration": 3.00581693649292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094947, + "balance_loss_mlp": 1.06149733, + "epoch": 0.3003078106964217, + "flos": 556485617664.0, + "grad_norm": 0.051151635719759364, + "language_loss": 0.89917201, + "learning_rate": 0.0008203318838190541, + "loss": 0.9101215, + "num_input_tokens_seen": 128912496, + "router_z_loss_mlp": 0.3347168, + "step": 1561, + "time_per_iteration": 2.730187177658081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087932, + "balance_loss_mlp": 1.05345702, + "epoch": 0.30050019238168524, + "flos": 525897556992.0, + "grad_norm": 0.07455466191279551, + "language_loss": 0.85053575, + "learning_rate": 0.0008200926127524281, + "loss": 0.86141509, + "num_input_tokens_seen": 128980624, + "router_z_loss_mlp": 0.34521484, + "step": 1562, + "time_per_iteration": 2.6252634525299072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082824, + "balance_loss_mlp": 1.04837239, + "epoch": 0.3006925740669488, + "flos": 577582907904.0, + "grad_norm": 0.08578868432126639, + "language_loss": 0.83193934, + "learning_rate": 0.0008198532174148289, + "loss": 0.84276754, + "num_input_tokens_seen": 129050576, + "router_z_loss_mlp": 0.3449707, + "step": 1563, + "time_per_iteration": 2.784132957458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010668, + "balance_loss_mlp": 0.99912882, + "epoch": 0.3008849557522124, + "flos": 1489408528896.0, + "grad_norm": 0.006418694176289122, + "language_loss": 0.8068617, + "learning_rate": 0.0008196136978991977, + "loss": 0.81696838, + "num_input_tokens_seen": 129278880, + "router_z_loss_mlp": 0.11523438, + "step": 1564, + "time_per_iteration": 4.826026678085327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079569, + "balance_loss_mlp": 1.04607105, + "epoch": 0.30107733743747594, + "flos": 509565371904.0, + "grad_norm": 0.057361266050022765, + "language_loss": 0.88701093, + "learning_rate": 0.0008193740542985244, + "loss": 0.89780664, + "num_input_tokens_seen": 129346560, + "router_z_loss_mlp": 0.33520508, + "step": 1565, + "time_per_iteration": 2.5685722827911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082334, + "balance_loss_mlp": 1.04881263, + "epoch": 0.30126971912273953, + "flos": 587425858560.0, + "grad_norm": 0.055549771382925904, + "language_loss": 0.86598676, + "learning_rate": 0.0008191342867058467, + "loss": 0.87681007, + "num_input_tokens_seen": 129420448, + "router_z_loss_mlp": 0.33520508, + "step": 1566, + "time_per_iteration": 2.7413527965545654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088116, + "balance_loss_mlp": 1.05423677, + "epoch": 0.30146210080800306, + "flos": 601822918656.0, + "grad_norm": 0.054174391750340056, + "language_loss": 0.83411789, + "learning_rate": 0.0008188943952142509, + "loss": 0.84499902, + "num_input_tokens_seen": 129494032, + "router_z_loss_mlp": 0.33911133, + "step": 1567, + "time_per_iteration": 2.816777229309082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098385, + "balance_loss_mlp": 1.06376624, + "epoch": 0.30165448249326665, + "flos": 917424686592.0, + "grad_norm": 0.057308899380469513, + "language_loss": 0.81973398, + "learning_rate": 0.0008186543799168711, + "loss": 0.8307178, + "num_input_tokens_seen": 129569088, + "router_z_loss_mlp": 0.34643555, + "step": 1568, + "time_per_iteration": 3.138439655303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094575, + "balance_loss_mlp": 1.060076, + "epoch": 0.3018468641785302, + "flos": 776953368576.0, + "grad_norm": 0.06314470525088989, + "language_loss": 0.88671768, + "learning_rate": 0.0008184142409068892, + "loss": 0.89766341, + "num_input_tokens_seen": 129647968, + "router_z_loss_mlp": 0.34545898, + "step": 1569, + "time_per_iteration": 3.0061678886413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104914, + "balance_loss_mlp": 1.07134473, + "epoch": 0.30203924586379377, + "flos": 522101500416.0, + "grad_norm": 0.05000282823150535, + "language_loss": 0.86630476, + "learning_rate": 0.000818173978277536, + "loss": 0.87735385, + "num_input_tokens_seen": 129718928, + "router_z_loss_mlp": 0.3359375, + "step": 1570, + "time_per_iteration": 2.7171432971954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101367, + "balance_loss_mlp": 1.06779718, + "epoch": 0.3022316275490573, + "flos": 524288318976.0, + "grad_norm": 0.052630401262377564, + "language_loss": 0.83781934, + "learning_rate": 0.000817933592122089, + "loss": 0.84883296, + "num_input_tokens_seen": 129790128, + "router_z_loss_mlp": 0.3359375, + "step": 1571, + "time_per_iteration": 2.7346580028533936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105149, + "balance_loss_mlp": 1.0710789, + "epoch": 0.3024240092343209, + "flos": 479672755200.0, + "grad_norm": 0.05357670269103591, + "language_loss": 0.83451247, + "learning_rate": 0.0008176930825338749, + "loss": 0.84556395, + "num_input_tokens_seen": 129857536, + "router_z_loss_mlp": 0.34106445, + "step": 1572, + "time_per_iteration": 2.5449459552764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095582, + "balance_loss_mlp": 1.06110692, + "epoch": 0.3026163909195845, + "flos": 686901579264.0, + "grad_norm": 0.06283664606524127, + "language_loss": 0.8873198, + "learning_rate": 0.0008174524496062679, + "loss": 0.89827561, + "num_input_tokens_seen": 129931440, + "router_z_loss_mlp": 0.3449707, + "step": 1573, + "time_per_iteration": 2.8826043605804443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108869, + "balance_loss_mlp": 1.05380964, + "epoch": 0.302808772604848, + "flos": 542654553600.0, + "grad_norm": 0.05929060654319276, + "language_loss": 0.85444844, + "learning_rate": 0.0008172116934326894, + "loss": 0.86533535, + "num_input_tokens_seen": 130005200, + "router_z_loss_mlp": 0.34912109, + "step": 1574, + "time_per_iteration": 2.7539572715759277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088334, + "balance_loss_mlp": 1.05435979, + "epoch": 0.3030011542901116, + "flos": 474852395520.0, + "grad_norm": 0.051325587648683973, + "language_loss": 0.87683225, + "learning_rate": 0.0008169708141066097, + "loss": 0.88771558, + "num_input_tokens_seen": 130069136, + "router_z_loss_mlp": 0.34008789, + "step": 1575, + "time_per_iteration": 2.5635225772857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086472, + "balance_loss_mlp": 1.05199683, + "epoch": 0.30319353597537513, + "flos": 481233940992.0, + "grad_norm": 0.06106098638193731, + "language_loss": 0.90820259, + "learning_rate": 0.0008167298117215465, + "loss": 0.91906732, + "num_input_tokens_seen": 130135456, + "router_z_loss_mlp": 0.3449707, + "step": 1576, + "time_per_iteration": 2.5388035774230957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087257, + "balance_loss_mlp": 1.05242443, + "epoch": 0.3033859176606387, + "flos": 704455916544.0, + "grad_norm": 0.06728579874610481, + "language_loss": 0.88300574, + "learning_rate": 0.0008164886863710649, + "loss": 0.89387834, + "num_input_tokens_seen": 130213712, + "router_z_loss_mlp": 0.34887695, + "step": 1577, + "time_per_iteration": 2.8935675621032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088541, + "balance_loss_mlp": 1.05554342, + "epoch": 0.30357829934590225, + "flos": 764344456704.0, + "grad_norm": 0.04642698643554312, + "language_loss": 0.86113924, + "learning_rate": 0.0008162474381487783, + "loss": 0.87202466, + "num_input_tokens_seen": 130290928, + "router_z_loss_mlp": 0.33007812, + "step": 1578, + "time_per_iteration": 3.0151257514953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088208, + "balance_loss_mlp": 1.05401909, + "epoch": 0.30377068103116583, + "flos": 532082663424.0, + "grad_norm": 0.05691489249418783, + "language_loss": 0.84894794, + "learning_rate": 0.0008160060671483475, + "loss": 0.85983002, + "num_input_tokens_seen": 130362672, + "router_z_loss_mlp": 0.34228516, + "step": 1579, + "time_per_iteration": 2.6575984954833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097191, + "balance_loss_mlp": 1.06338358, + "epoch": 0.3039630627164294, + "flos": 509934928896.0, + "grad_norm": 0.07240450604386858, + "language_loss": 0.83520651, + "learning_rate": 0.0008157645734634809, + "loss": 0.84617841, + "num_input_tokens_seen": 130428848, + "router_z_loss_mlp": 0.33837891, + "step": 1580, + "time_per_iteration": 2.5869438648223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061274, + "balance_loss_mlp": 1.04992568, + "epoch": 0.30415544440169295, + "flos": 1505206803456.0, + "grad_norm": 0.030998653754179664, + "language_loss": 0.76896489, + "learning_rate": 0.000815522957187935, + "loss": 0.77957761, + "num_input_tokens_seen": 130665440, + "router_z_loss_mlp": 0.11328125, + "step": 1581, + "time_per_iteration": 4.907174348831177 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01027749, + "balance_loss_mlp": 1.01606703, + "epoch": 0.30434782608695654, + "flos": 1457976637440.0, + "grad_norm": 0.012214555664928241, + "language_loss": 0.73214495, + "learning_rate": 0.0008152812184155132, + "loss": 0.74242246, + "num_input_tokens_seen": 130895248, + "router_z_loss_mlp": 0.11669922, + "step": 1582, + "time_per_iteration": 4.8717710971832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102415, + "balance_loss_mlp": 1.0685122, + "epoch": 0.3045402077722201, + "flos": 482312088576.0, + "grad_norm": 0.05813255519406619, + "language_loss": 0.83851862, + "learning_rate": 0.000815039357240067, + "loss": 0.84954274, + "num_input_tokens_seen": 130964544, + "router_z_loss_mlp": 0.33935547, + "step": 1583, + "time_per_iteration": 2.640148401260376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102551, + "balance_loss_mlp": 1.06879056, + "epoch": 0.30473258945748366, + "flos": 543220549632.0, + "grad_norm": 0.06312099992380371, + "language_loss": 0.85312426, + "learning_rate": 0.0008147973737554952, + "loss": 0.86414981, + "num_input_tokens_seen": 131041744, + "router_z_loss_mlp": 0.33789062, + "step": 1584, + "time_per_iteration": 2.772367000579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098103, + "balance_loss_mlp": 1.06443787, + "epoch": 0.3049249711427472, + "flos": 566789847552.0, + "grad_norm": 0.054268296030043885, + "language_loss": 0.85613728, + "learning_rate": 0.000814555268055744, + "loss": 0.86711836, + "num_input_tokens_seen": 131108864, + "router_z_loss_mlp": 0.33691406, + "step": 1585, + "time_per_iteration": 2.616687536239624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109794, + "balance_loss_mlp": 1.06441879, + "epoch": 0.3051173528280108, + "flos": 527970894336.0, + "grad_norm": 0.05527556644311566, + "language_loss": 0.87648201, + "learning_rate": 0.0008143130402348073, + "loss": 0.88746148, + "num_input_tokens_seen": 131181104, + "router_z_loss_mlp": 0.33544922, + "step": 1586, + "time_per_iteration": 2.635103940963745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094771, + "balance_loss_mlp": 1.06141627, + "epoch": 0.3053097345132743, + "flos": 586097427456.0, + "grad_norm": 0.052385807505719764, + "language_loss": 0.79520649, + "learning_rate": 0.0008140706903867265, + "loss": 0.80615419, + "num_input_tokens_seen": 131258704, + "router_z_loss_mlp": 0.33349609, + "step": 1587, + "time_per_iteration": 2.7922940254211426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087687, + "balance_loss_mlp": 1.05263984, + "epoch": 0.3055021161985379, + "flos": 606810604032.0, + "grad_norm": 0.054380951058352583, + "language_loss": 0.90043247, + "learning_rate": 0.0008138282186055897, + "loss": 0.9113093, + "num_input_tokens_seen": 131325712, + "router_z_loss_mlp": 0.35058594, + "step": 1588, + "time_per_iteration": 2.683783769607544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085545, + "balance_loss_mlp": 1.05290556, + "epoch": 0.3056944978838015, + "flos": 573594794496.0, + "grad_norm": 0.05235756550943364, + "language_loss": 0.8193745, + "learning_rate": 0.0008135856249855331, + "loss": 0.83023, + "num_input_tokens_seen": 131397568, + "router_z_loss_mlp": 0.32641602, + "step": 1589, + "time_per_iteration": 2.6717309951782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081523, + "balance_loss_mlp": 1.04900289, + "epoch": 0.305886879569065, + "flos": 633640485888.0, + "grad_norm": 0.06284243799371535, + "language_loss": 0.89691997, + "learning_rate": 0.0008133429096207398, + "loss": 0.90773523, + "num_input_tokens_seen": 131467632, + "router_z_loss_mlp": 0.32519531, + "step": 1590, + "time_per_iteration": 2.757962465286255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01037916, + "balance_loss_mlp": 1.02561319, + "epoch": 0.3060792612543286, + "flos": 1368227414016.0, + "grad_norm": 0.023218914608202516, + "language_loss": 0.75312257, + "learning_rate": 0.0008131000726054403, + "loss": 0.76350176, + "num_input_tokens_seen": 131702224, + "router_z_loss_mlp": 0.12304688, + "step": 1591, + "time_per_iteration": 4.927033185958862 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078048, + "balance_loss_mlp": 1.0450511, + "epoch": 0.30627164293959214, + "flos": 518290887168.0, + "grad_norm": 0.05132667013942606, + "language_loss": 0.86601979, + "learning_rate": 0.0008128571140339123, + "loss": 0.87680024, + "num_input_tokens_seen": 131774608, + "router_z_loss_mlp": 0.33007812, + "step": 1592, + "time_per_iteration": 2.627272367477417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075862, + "balance_loss_mlp": 1.0423162, + "epoch": 0.3064640246248557, + "flos": 455354168832.0, + "grad_norm": 0.054345541641725725, + "language_loss": 0.87405455, + "learning_rate": 0.0008126140340004805, + "loss": 0.88481319, + "num_input_tokens_seen": 131841216, + "router_z_loss_mlp": 0.33569336, + "step": 1593, + "time_per_iteration": 2.5047686100006104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076578, + "balance_loss_mlp": 1.04355717, + "epoch": 0.30665640631011926, + "flos": 849718480896.0, + "grad_norm": 0.04925367115496714, + "language_loss": 0.82262254, + "learning_rate": 0.0008123708325995172, + "loss": 0.83338827, + "num_input_tokens_seen": 131937584, + "router_z_loss_mlp": 0.33032227, + "step": 1594, + "time_per_iteration": 3.1693196296691895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107923, + "balance_loss_mlp": 1.04582715, + "epoch": 0.30684878799538284, + "flos": 757996406784.0, + "grad_norm": 0.04977841797679214, + "language_loss": 0.79901791, + "learning_rate": 0.0008121275099254414, + "loss": 0.80981016, + "num_input_tokens_seen": 132012656, + "router_z_loss_mlp": 0.33422852, + "step": 1595, + "time_per_iteration": 2.9197185039520264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108089, + "balance_loss_mlp": 1.04758275, + "epoch": 0.3070411696806464, + "flos": 517320428544.0, + "grad_norm": 0.05488318824662342, + "language_loss": 0.88300943, + "learning_rate": 0.0008118840660727194, + "loss": 0.89381832, + "num_input_tokens_seen": 132083728, + "router_z_loss_mlp": 0.33325195, + "step": 1596, + "time_per_iteration": 2.6452150344848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079954, + "balance_loss_mlp": 1.04788685, + "epoch": 0.30723355136590996, + "flos": 843883992576.0, + "grad_norm": 0.05612425557740203, + "language_loss": 0.87403214, + "learning_rate": 0.0008116405011358644, + "loss": 0.88483167, + "num_input_tokens_seen": 132170896, + "router_z_loss_mlp": 0.32055664, + "step": 1597, + "time_per_iteration": 3.135666608810425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084185, + "balance_loss_mlp": 1.05092525, + "epoch": 0.30742593305117355, + "flos": 465905710080.0, + "grad_norm": 0.05391343675647517, + "language_loss": 0.80005342, + "learning_rate": 0.0008113968152094369, + "loss": 0.81089526, + "num_input_tokens_seen": 132234592, + "router_z_loss_mlp": 0.33276367, + "step": 1598, + "time_per_iteration": 2.5122313499450684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076054, + "balance_loss_mlp": 1.04331923, + "epoch": 0.3076183147364371, + "flos": 686286120960.0, + "grad_norm": 0.04979397496165333, + "language_loss": 0.82305032, + "learning_rate": 0.0008111530083880438, + "loss": 0.83381081, + "num_input_tokens_seen": 132314720, + "router_z_loss_mlp": 0.32739258, + "step": 1599, + "time_per_iteration": 2.8883755207061768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072089, + "balance_loss_mlp": 1.03949702, + "epoch": 0.30781069642170067, + "flos": 613729032192.0, + "grad_norm": 0.059000164712882774, + "language_loss": 0.86657357, + "learning_rate": 0.0008109090807663399, + "loss": 0.87729448, + "num_input_tokens_seen": 132388768, + "router_z_loss_mlp": 0.32592773, + "step": 1600, + "time_per_iteration": 2.7799928188323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075122, + "balance_loss_mlp": 1.04260147, + "epoch": 0.3080030781069642, + "flos": 590021521920.0, + "grad_norm": 0.046450828420206536, + "language_loss": 0.88735926, + "learning_rate": 0.0008106650324390257, + "loss": 0.89811045, + "num_input_tokens_seen": 132472544, + "router_z_loss_mlp": 0.32519531, + "step": 1601, + "time_per_iteration": 2.7887444496154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071489, + "balance_loss_mlp": 1.03913534, + "epoch": 0.3081954597922278, + "flos": 562353601536.0, + "grad_norm": 0.06865077604181559, + "language_loss": 0.81335884, + "learning_rate": 0.0008104208635008493, + "loss": 0.82407373, + "num_input_tokens_seen": 132541968, + "router_z_loss_mlp": 0.32348633, + "step": 1602, + "time_per_iteration": 2.6526358127593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077173, + "balance_loss_mlp": 1.04448628, + "epoch": 0.3083878414774913, + "flos": 447599112192.0, + "grad_norm": 0.053973671264543166, + "language_loss": 0.81925714, + "learning_rate": 0.0008101765740466058, + "loss": 0.83002889, + "num_input_tokens_seen": 132606976, + "router_z_loss_mlp": 0.3269043, + "step": 1603, + "time_per_iteration": 2.5337142944335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073512, + "balance_loss_mlp": 1.04020488, + "epoch": 0.3085802231627549, + "flos": 493297205760.0, + "grad_norm": 0.05670542728571842, + "language_loss": 0.84135199, + "learning_rate": 0.0008099321641711364, + "loss": 0.85208714, + "num_input_tokens_seen": 132677984, + "router_z_loss_mlp": 0.33325195, + "step": 1604, + "time_per_iteration": 2.6616737842559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071402, + "balance_loss_mlp": 1.03804755, + "epoch": 0.3087726048480185, + "flos": 487437986304.0, + "grad_norm": 0.0517354770696361, + "language_loss": 0.8343811, + "learning_rate": 0.0008096876339693295, + "loss": 0.8450951, + "num_input_tokens_seen": 132749136, + "router_z_loss_mlp": 0.33374023, + "step": 1605, + "time_per_iteration": 2.6034042835235596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078765, + "balance_loss_mlp": 1.04412317, + "epoch": 0.308964986533282, + "flos": 730265877504.0, + "grad_norm": 0.0630488444124333, + "language_loss": 0.8123467, + "learning_rate": 0.0008094429835361206, + "loss": 0.8231343, + "num_input_tokens_seen": 132823824, + "router_z_loss_mlp": 0.34667969, + "step": 1606, + "time_per_iteration": 2.9442811012268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107155, + "balance_loss_mlp": 1.03788495, + "epoch": 0.3091573682185456, + "flos": 605131554816.0, + "grad_norm": 0.0490228515497239, + "language_loss": 0.85833865, + "learning_rate": 0.0008091982129664908, + "loss": 0.8690542, + "num_input_tokens_seen": 132895936, + "router_z_loss_mlp": 0.33691406, + "step": 1607, + "time_per_iteration": 2.734976053237915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077455, + "balance_loss_mlp": 1.04290783, + "epoch": 0.30934974990380915, + "flos": 460081396224.0, + "grad_norm": 0.04772079658934369, + "language_loss": 0.82646394, + "learning_rate": 0.0008089533223554687, + "loss": 0.83723843, + "num_input_tokens_seen": 132968960, + "router_z_loss_mlp": 0.34594727, + "step": 1608, + "time_per_iteration": 2.756741762161255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080172, + "balance_loss_mlp": 1.04669785, + "epoch": 0.30954213158907273, + "flos": 553142075904.0, + "grad_norm": 0.05499274022240881, + "language_loss": 0.8525604, + "learning_rate": 0.0008087083117981294, + "loss": 0.86336207, + "num_input_tokens_seen": 133048448, + "router_z_loss_mlp": 0.33496094, + "step": 1609, + "time_per_iteration": 2.9062788486480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081142, + "balance_loss_mlp": 1.04676199, + "epoch": 0.30973451327433627, + "flos": 552776901120.0, + "grad_norm": 0.0512798930400947, + "language_loss": 0.87996054, + "learning_rate": 0.0008084631813895943, + "loss": 0.89077199, + "num_input_tokens_seen": 133121680, + "router_z_loss_mlp": 0.34375, + "step": 1610, + "time_per_iteration": 2.789893627166748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079299, + "balance_loss_mlp": 1.04575384, + "epoch": 0.30992689495959985, + "flos": 565430893056.0, + "grad_norm": 0.07403274033744815, + "language_loss": 0.83632123, + "learning_rate": 0.0008082179312250315, + "loss": 0.84711421, + "num_input_tokens_seen": 133190176, + "router_z_loss_mlp": 0.33544922, + "step": 1611, + "time_per_iteration": 2.713533878326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040085, + "balance_loss_mlp": 1.02864099, + "epoch": 0.3101192766448634, + "flos": 1441621131264.0, + "grad_norm": 0.023040649208851512, + "language_loss": 0.79855847, + "learning_rate": 0.0008079725613996555, + "loss": 0.80895925, + "num_input_tokens_seen": 133420512, + "router_z_loss_mlp": 0.11425781, + "step": 1612, + "time_per_iteration": 4.866560459136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035595, + "balance_loss_mlp": 1.02448523, + "epoch": 0.31031165833012697, + "flos": 1531086575616.0, + "grad_norm": 0.021256554447441355, + "language_loss": 0.76629329, + "learning_rate": 0.0008077270720087273, + "loss": 0.77664924, + "num_input_tokens_seen": 133651984, + "router_z_loss_mlp": 0.11132812, + "step": 1613, + "time_per_iteration": 5.01593279838562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010886, + "balance_loss_mlp": 1.05483997, + "epoch": 0.31050404001539056, + "flos": 991534196736.0, + "grad_norm": 0.06253960188626659, + "language_loss": 0.81937206, + "learning_rate": 0.0008074814631475545, + "loss": 0.83025801, + "num_input_tokens_seen": 133741648, + "router_z_loss_mlp": 0.33789062, + "step": 1614, + "time_per_iteration": 3.3154871463775635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092644, + "balance_loss_mlp": 1.05940843, + "epoch": 0.3106964217006541, + "flos": 445748355072.0, + "grad_norm": 0.0719929788966035, + "language_loss": 0.78655052, + "learning_rate": 0.0008072357349114907, + "loss": 0.79747701, + "num_input_tokens_seen": 133813344, + "router_z_loss_mlp": 0.33251953, + "step": 1615, + "time_per_iteration": 2.6783502101898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084302, + "balance_loss_mlp": 1.05063736, + "epoch": 0.3108888033859177, + "flos": 510259405824.0, + "grad_norm": 0.06269338504314155, + "language_loss": 0.88523185, + "learning_rate": 0.0008069898873959363, + "loss": 0.89607489, + "num_input_tokens_seen": 133884192, + "router_z_loss_mlp": 0.33691406, + "step": 1616, + "time_per_iteration": 2.6805779933929443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092165, + "balance_loss_mlp": 1.05952573, + "epoch": 0.3110811850711812, + "flos": 520471913472.0, + "grad_norm": 0.06669389997650658, + "language_loss": 0.85964763, + "learning_rate": 0.0008067439206963375, + "loss": 0.87056935, + "num_input_tokens_seen": 133954848, + "router_z_loss_mlp": 0.32641602, + "step": 1617, + "time_per_iteration": 2.6084542274475098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091143, + "balance_loss_mlp": 1.05797851, + "epoch": 0.3112735667564448, + "flos": 686085299712.0, + "grad_norm": 0.06020913179087489, + "language_loss": 0.86049557, + "learning_rate": 0.0008064978349081873, + "loss": 0.87140703, + "num_input_tokens_seen": 134031824, + "router_z_loss_mlp": 0.33178711, + "step": 1618, + "time_per_iteration": 2.9622116088867188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089777, + "balance_loss_mlp": 1.05554032, + "epoch": 0.31146594844170833, + "flos": 532786871808.0, + "grad_norm": 0.04562356821057988, + "language_loss": 0.86218596, + "learning_rate": 0.0008062516301270245, + "loss": 0.87308377, + "num_input_tokens_seen": 134104480, + "router_z_loss_mlp": 0.3425293, + "step": 1619, + "time_per_iteration": 2.691589593887329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091127, + "balance_loss_mlp": 1.0575099, + "epoch": 0.3116583301269719, + "flos": 679187220480.0, + "grad_norm": 0.05429224886242875, + "language_loss": 0.88343138, + "learning_rate": 0.0008060053064484343, + "loss": 0.89434266, + "num_input_tokens_seen": 134185632, + "router_z_loss_mlp": 0.33642578, + "step": 1620, + "time_per_iteration": 2.936244487762451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096127, + "balance_loss_mlp": 1.06277251, + "epoch": 0.31185071181223545, + "flos": 585855908352.0, + "grad_norm": 0.05040245512912965, + "language_loss": 0.85009742, + "learning_rate": 0.0008057588639680482, + "loss": 0.86105865, + "num_input_tokens_seen": 134261600, + "router_z_loss_mlp": 0.33374023, + "step": 1621, + "time_per_iteration": 2.7633163928985596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107807, + "balance_loss_mlp": 1.07309282, + "epoch": 0.31204309349749904, + "flos": 725090517504.0, + "grad_norm": 0.06801147163116106, + "language_loss": 0.82624507, + "learning_rate": 0.0008055123027815434, + "loss": 0.83732307, + "num_input_tokens_seen": 134334368, + "router_z_loss_mlp": 0.34741211, + "step": 1622, + "time_per_iteration": 2.946943521499634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105084, + "balance_loss_mlp": 1.07156253, + "epoch": 0.3122354751827626, + "flos": 576558604800.0, + "grad_norm": 0.0611005921730787, + "language_loss": 0.85109818, + "learning_rate": 0.0008052656229846436, + "loss": 0.862149, + "num_input_tokens_seen": 134403824, + "router_z_loss_mlp": 0.33544922, + "step": 1623, + "time_per_iteration": 2.6431145668029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106208, + "balance_loss_mlp": 1.07106483, + "epoch": 0.31242785686802615, + "flos": 575672514048.0, + "grad_norm": 0.055122717603047884, + "language_loss": 0.90674621, + "learning_rate": 0.0008050188246731182, + "loss": 0.91780829, + "num_input_tokens_seen": 134471296, + "router_z_loss_mlp": 0.35180664, + "step": 1624, + "time_per_iteration": 2.6666738986968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101645, + "balance_loss_mlp": 1.06745625, + "epoch": 0.31262023855328974, + "flos": 736490271744.0, + "grad_norm": 0.05430344032768667, + "language_loss": 0.81962687, + "learning_rate": 0.0008047719079427834, + "loss": 0.8306433, + "num_input_tokens_seen": 134551360, + "router_z_loss_mlp": 0.34204102, + "step": 1625, + "time_per_iteration": 2.978775978088379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095181, + "balance_loss_mlp": 1.07791936, + "epoch": 0.3128126202385533, + "flos": 1558395113472.0, + "grad_norm": 0.034550759669135796, + "language_loss": 0.74351704, + "learning_rate": 0.0008045248728895, + "loss": 0.75446886, + "num_input_tokens_seen": 134761328, + "router_z_loss_mlp": 0.17285156, + "step": 1626, + "time_per_iteration": 4.800370931625366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109181, + "balance_loss_mlp": 1.05695319, + "epoch": 0.31300500192381686, + "flos": 514666538496.0, + "grad_norm": 0.04752817769408696, + "language_loss": 0.86259782, + "learning_rate": 0.0008042777196091757, + "loss": 0.87351596, + "num_input_tokens_seen": 134833136, + "router_z_loss_mlp": 0.34863281, + "step": 1627, + "time_per_iteration": 2.695350408554077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088728, + "balance_loss_mlp": 1.05301261, + "epoch": 0.3131973836090804, + "flos": 526370420736.0, + "grad_norm": 0.06407391520506579, + "language_loss": 0.8214981, + "learning_rate": 0.0008040304481977643, + "loss": 0.83238542, + "num_input_tokens_seen": 134904352, + "router_z_loss_mlp": 0.35742188, + "step": 1628, + "time_per_iteration": 2.6652393341064453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108342, + "balance_loss_mlp": 1.05030346, + "epoch": 0.313389765294344, + "flos": 822473961984.0, + "grad_norm": 0.08346342139557943, + "language_loss": 0.86950874, + "learning_rate": 0.0008037830587512649, + "loss": 0.88034296, + "num_input_tokens_seen": 134984880, + "router_z_loss_mlp": 0.33129883, + "step": 1629, + "time_per_iteration": 3.0668327808380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090413, + "balance_loss_mlp": 1.05651021, + "epoch": 0.31358214697960757, + "flos": 393604697088.0, + "grad_norm": 0.061409761762948115, + "language_loss": 0.78720629, + "learning_rate": 0.0008035355513657224, + "loss": 0.79811049, + "num_input_tokens_seen": 135047456, + "router_z_loss_mlp": 0.33935547, + "step": 1630, + "time_per_iteration": 2.5013740062713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087918, + "balance_loss_mlp": 1.05449188, + "epoch": 0.3137745286648711, + "flos": 571611617280.0, + "grad_norm": 0.049199842100191564, + "language_loss": 0.93020999, + "learning_rate": 0.0008032879261372279, + "loss": 0.94108921, + "num_input_tokens_seen": 135124256, + "router_z_loss_mlp": 0.33447266, + "step": 1631, + "time_per_iteration": 2.8559622764587402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088241, + "balance_loss_mlp": 1.07612944, + "epoch": 0.3139669103501347, + "flos": 1497614690304.0, + "grad_norm": 0.04267228885339989, + "language_loss": 0.79635841, + "learning_rate": 0.0008030401831619178, + "loss": 0.80724084, + "num_input_tokens_seen": 135353024, + "router_z_loss_mlp": 0.12109375, + "step": 1632, + "time_per_iteration": 5.726024627685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081932, + "balance_loss_mlp": 1.04886341, + "epoch": 0.3141592920353982, + "flos": 525090041856.0, + "grad_norm": 0.04986838794009694, + "language_loss": 0.87459773, + "learning_rate": 0.0008027923225359748, + "loss": 0.88541704, + "num_input_tokens_seen": 135422464, + "router_z_loss_mlp": 0.33081055, + "step": 1633, + "time_per_iteration": 2.599775791168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078913, + "balance_loss_mlp": 1.04465246, + "epoch": 0.3143516737206618, + "flos": 592989714432.0, + "grad_norm": 0.05680374588643473, + "language_loss": 0.8835839, + "learning_rate": 0.0008025443443556267, + "loss": 0.89437306, + "num_input_tokens_seen": 135490928, + "router_z_loss_mlp": 0.34301758, + "step": 1634, + "time_per_iteration": 2.7439024448394775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010804, + "balance_loss_mlp": 1.04776073, + "epoch": 0.31454405540592534, + "flos": 648034573824.0, + "grad_norm": 0.04764849369773053, + "language_loss": 0.88161099, + "learning_rate": 0.000802296248717147, + "loss": 0.89241499, + "num_input_tokens_seen": 135576288, + "router_z_loss_mlp": 0.32641602, + "step": 1635, + "time_per_iteration": 2.902290105819702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082457, + "balance_loss_mlp": 1.04850602, + "epoch": 0.3147364370911889, + "flos": 642543501312.0, + "grad_norm": 0.05380775409858787, + "language_loss": 0.79150212, + "learning_rate": 0.0008020480357168554, + "loss": 0.80232668, + "num_input_tokens_seen": 135652320, + "router_z_loss_mlp": 0.33984375, + "step": 1636, + "time_per_iteration": 2.7940564155578613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107486, + "balance_loss_mlp": 1.04176795, + "epoch": 0.31492881877645246, + "flos": 471607778304.0, + "grad_norm": 0.05509564816324918, + "language_loss": 0.88341308, + "learning_rate": 0.0008017997054511165, + "loss": 0.89416164, + "num_input_tokens_seen": 135719632, + "router_z_loss_mlp": 0.33105469, + "step": 1637, + "time_per_iteration": 2.596212148666382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075188, + "balance_loss_mlp": 1.04157114, + "epoch": 0.31512120046171604, + "flos": 629135838720.0, + "grad_norm": 0.0536589952194777, + "language_loss": 0.8549943, + "learning_rate": 0.0008015512580163407, + "loss": 0.86574614, + "num_input_tokens_seen": 135796544, + "router_z_loss_mlp": 0.33642578, + "step": 1638, + "time_per_iteration": 2.763343334197998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107569, + "balance_loss_mlp": 1.0416913, + "epoch": 0.31531358214697963, + "flos": 703460726784.0, + "grad_norm": 0.0636877441873346, + "language_loss": 0.80888116, + "learning_rate": 0.0008013026935089838, + "loss": 0.81963813, + "num_input_tokens_seen": 135871344, + "router_z_loss_mlp": 0.34033203, + "step": 1639, + "time_per_iteration": 2.9786083698272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070933, + "balance_loss_mlp": 1.03798366, + "epoch": 0.31550596383224316, + "flos": 572275127808.0, + "grad_norm": 0.055086353977466425, + "language_loss": 0.83909047, + "learning_rate": 0.0008010540120255472, + "loss": 0.84979975, + "num_input_tokens_seen": 135944320, + "router_z_loss_mlp": 0.32958984, + "step": 1640, + "time_per_iteration": 2.666520357131958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075633, + "balance_loss_mlp": 1.04196858, + "epoch": 0.31569834551750675, + "flos": 658047822336.0, + "grad_norm": 0.06483249406864507, + "language_loss": 0.86052895, + "learning_rate": 0.0008008052136625774, + "loss": 0.8712852, + "num_input_tokens_seen": 136019456, + "router_z_loss_mlp": 0.33691406, + "step": 1641, + "time_per_iteration": 2.8062589168548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078165, + "balance_loss_mlp": 1.04407096, + "epoch": 0.3158907272027703, + "flos": 566002681344.0, + "grad_norm": 0.05792040128516231, + "language_loss": 0.86837387, + "learning_rate": 0.0008005562985166666, + "loss": 0.87915552, + "num_input_tokens_seen": 136091232, + "router_z_loss_mlp": 0.34130859, + "step": 1642, + "time_per_iteration": 2.6996512413024902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081029, + "balance_loss_mlp": 1.04576707, + "epoch": 0.31608310888803387, + "flos": 536622216192.0, + "grad_norm": 0.04642534602938139, + "language_loss": 0.84936821, + "learning_rate": 0.0008003072666844524, + "loss": 0.86017853, + "num_input_tokens_seen": 136165088, + "router_z_loss_mlp": 0.35302734, + "step": 1643, + "time_per_iteration": 2.6999776363372803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078239, + "balance_loss_mlp": 1.04292917, + "epoch": 0.3162754905732974, + "flos": 486428239872.0, + "grad_norm": 0.08271259063406261, + "language_loss": 0.82613683, + "learning_rate": 0.0008000581182626173, + "loss": 0.83691919, + "num_input_tokens_seen": 136230368, + "router_z_loss_mlp": 0.35302734, + "step": 1644, + "time_per_iteration": 2.541093111038208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082389, + "balance_loss_mlp": 1.04893875, + "epoch": 0.316467872258561, + "flos": 529792538112.0, + "grad_norm": 0.058359985905672214, + "language_loss": 0.86275887, + "learning_rate": 0.0007998088533478894, + "loss": 0.87358278, + "num_input_tokens_seen": 136302512, + "router_z_loss_mlp": 0.33447266, + "step": 1645, + "time_per_iteration": 2.641402006149292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077931, + "balance_loss_mlp": 1.04309845, + "epoch": 0.3166602539438245, + "flos": 443197771776.0, + "grad_norm": 0.07387441321187599, + "language_loss": 0.84062803, + "learning_rate": 0.000799559472037042, + "loss": 0.85140741, + "num_input_tokens_seen": 136368064, + "router_z_loss_mlp": 0.34887695, + "step": 1646, + "time_per_iteration": 2.5274438858032227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076892, + "balance_loss_mlp": 1.04222584, + "epoch": 0.3168526356290881, + "flos": 645513103872.0, + "grad_norm": 0.053861363144643716, + "language_loss": 0.87875295, + "learning_rate": 0.0007993099744268932, + "loss": 0.8895219, + "num_input_tokens_seen": 136451520, + "router_z_loss_mlp": 0.34716797, + "step": 1647, + "time_per_iteration": 2.8893649578094482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070585, + "balance_loss_mlp": 1.03646684, + "epoch": 0.3170450173143517, + "flos": 585889403904.0, + "grad_norm": 0.05841982976759713, + "language_loss": 0.87792766, + "learning_rate": 0.000799060360614307, + "loss": 0.88863349, + "num_input_tokens_seen": 136521184, + "router_z_loss_mlp": 0.34155273, + "step": 1648, + "time_per_iteration": 2.6867480278015137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076826, + "balance_loss_mlp": 1.04273248, + "epoch": 0.3172373989996152, + "flos": 826763231232.0, + "grad_norm": 0.05654214693871822, + "language_loss": 0.83848637, + "learning_rate": 0.0007988106306961917, + "loss": 0.84925467, + "num_input_tokens_seen": 136612592, + "router_z_loss_mlp": 0.34130859, + "step": 1649, + "time_per_iteration": 3.1321003437042236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080036, + "balance_loss_mlp": 1.04577541, + "epoch": 0.3174297806848788, + "flos": 527153204736.0, + "grad_norm": 0.060794493166337976, + "language_loss": 0.84529203, + "learning_rate": 0.0007985607847695014, + "loss": 0.85609239, + "num_input_tokens_seen": 136684336, + "router_z_loss_mlp": 0.34301758, + "step": 1650, + "time_per_iteration": 2.6306049823760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081772, + "balance_loss_mlp": 1.04851258, + "epoch": 0.31762216237014235, + "flos": 712855544832.0, + "grad_norm": 0.05325998456044798, + "language_loss": 0.82638443, + "learning_rate": 0.0007983108229312345, + "loss": 0.83720207, + "num_input_tokens_seen": 136766400, + "router_z_loss_mlp": 0.33276367, + "step": 1651, + "time_per_iteration": 2.909571647644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108965, + "balance_loss_mlp": 1.05567503, + "epoch": 0.31781454405540593, + "flos": 483567736320.0, + "grad_norm": 0.0653784528473409, + "language_loss": 0.86672306, + "learning_rate": 0.0007980607452784351, + "loss": 0.87761962, + "num_input_tokens_seen": 136834016, + "router_z_loss_mlp": 0.33984375, + "step": 1652, + "time_per_iteration": 2.5339765548706055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100389, + "balance_loss_mlp": 1.06639075, + "epoch": 0.31800692574066947, + "flos": 548483249664.0, + "grad_norm": 0.0685029555550019, + "language_loss": 0.90562367, + "learning_rate": 0.0007978105519081919, + "loss": 0.91662765, + "num_input_tokens_seen": 136906288, + "router_z_loss_mlp": 0.34008789, + "step": 1653, + "time_per_iteration": 2.6916213035583496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096542, + "balance_loss_mlp": 1.06213784, + "epoch": 0.31819930742593305, + "flos": 516640951296.0, + "grad_norm": 0.07941193091019123, + "language_loss": 0.87969935, + "learning_rate": 0.0007975602429176385, + "loss": 0.89066482, + "num_input_tokens_seen": 136972416, + "router_z_loss_mlp": 0.34423828, + "step": 1654, + "time_per_iteration": 2.5997297763824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100835, + "balance_loss_mlp": 1.06695616, + "epoch": 0.31839168911119664, + "flos": 455748456960.0, + "grad_norm": 0.07129171690745044, + "language_loss": 0.81582803, + "learning_rate": 0.0007973098184039536, + "loss": 0.82683635, + "num_input_tokens_seen": 137044576, + "router_z_loss_mlp": 0.33911133, + "step": 1655, + "time_per_iteration": 2.654914140701294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096724, + "balance_loss_mlp": 1.06284511, + "epoch": 0.3185840707964602, + "flos": 625719513600.0, + "grad_norm": 0.05658637496419385, + "language_loss": 0.86710656, + "learning_rate": 0.0007970592784643602, + "loss": 0.87807381, + "num_input_tokens_seen": 137125120, + "router_z_loss_mlp": 0.33911133, + "step": 1656, + "time_per_iteration": 2.846390962600708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090807, + "balance_loss_mlp": 1.05719042, + "epoch": 0.31877645248172376, + "flos": 567213249024.0, + "grad_norm": 0.058346793379709355, + "language_loss": 0.85032123, + "learning_rate": 0.0007968086231961272, + "loss": 0.8612293, + "num_input_tokens_seen": 137195344, + "router_z_loss_mlp": 0.33642578, + "step": 1657, + "time_per_iteration": 2.652986526489258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094073, + "balance_loss_mlp": 1.0593828, + "epoch": 0.3189688341669873, + "flos": 489338205696.0, + "grad_norm": 0.08644842740903268, + "language_loss": 0.836254, + "learning_rate": 0.0007965578526965671, + "loss": 0.84719473, + "num_input_tokens_seen": 137261040, + "router_z_loss_mlp": 0.34741211, + "step": 1658, + "time_per_iteration": 2.5607872009277344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092531, + "balance_loss_mlp": 1.05860353, + "epoch": 0.3191612158522509, + "flos": 575948938752.0, + "grad_norm": 0.04707712809776705, + "language_loss": 0.86020696, + "learning_rate": 0.0007963069670630377, + "loss": 0.87113225, + "num_input_tokens_seen": 137334400, + "router_z_loss_mlp": 0.33959961, + "step": 1659, + "time_per_iteration": 2.7435247898101807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097687, + "balance_loss_mlp": 1.06447566, + "epoch": 0.3193535975375144, + "flos": 537867689472.0, + "grad_norm": 0.062321727217464123, + "language_loss": 0.87956834, + "learning_rate": 0.0007960559663929416, + "loss": 0.89054519, + "num_input_tokens_seen": 137405344, + "router_z_loss_mlp": 0.33227539, + "step": 1660, + "time_per_iteration": 2.6282846927642822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096682, + "balance_loss_mlp": 1.06265998, + "epoch": 0.319545979222778, + "flos": 733954245120.0, + "grad_norm": 0.07201541894751945, + "language_loss": 0.87465358, + "learning_rate": 0.0007958048507837259, + "loss": 0.88562042, + "num_input_tokens_seen": 137486016, + "router_z_loss_mlp": 0.34057617, + "step": 1661, + "time_per_iteration": 2.9250974655151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099132, + "balance_loss_mlp": 1.0647999, + "epoch": 0.31973836090804153, + "flos": 764136433152.0, + "grad_norm": 0.0721917610669121, + "language_loss": 0.87230003, + "learning_rate": 0.0007955536203328822, + "loss": 0.8832913, + "num_input_tokens_seen": 137562304, + "router_z_loss_mlp": 0.34375, + "step": 1662, + "time_per_iteration": 2.899735450744629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109485, + "balance_loss_mlp": 1.06109047, + "epoch": 0.3199307425933051, + "flos": 560252560896.0, + "grad_norm": 0.06666532975578916, + "language_loss": 0.83308822, + "learning_rate": 0.0007953022751379469, + "loss": 0.84403676, + "num_input_tokens_seen": 137639248, + "router_z_loss_mlp": 0.33789062, + "step": 1663, + "time_per_iteration": 2.7743418216705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093375, + "balance_loss_mlp": 1.05899549, + "epoch": 0.3201231242785687, + "flos": 751019751936.0, + "grad_norm": 0.058114271957014456, + "language_loss": 0.81677037, + "learning_rate": 0.000795050815296501, + "loss": 0.82770407, + "num_input_tokens_seen": 137718256, + "router_z_loss_mlp": 0.34399414, + "step": 1664, + "time_per_iteration": 2.9620323181152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091229, + "balance_loss_mlp": 1.05768323, + "epoch": 0.32031550596383224, + "flos": 496157709312.0, + "grad_norm": 0.061791342299560625, + "language_loss": 0.93274921, + "learning_rate": 0.0007947992409061695, + "loss": 0.94366151, + "num_input_tokens_seen": 137785216, + "router_z_loss_mlp": 0.33569336, + "step": 1665, + "time_per_iteration": 2.6097099781036377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083934, + "balance_loss_mlp": 1.05022132, + "epoch": 0.3205078876490958, + "flos": 731294562816.0, + "grad_norm": 0.05133774923717053, + "language_loss": 0.86471802, + "learning_rate": 0.0007945475520646226, + "loss": 0.8755573, + "num_input_tokens_seen": 137863424, + "router_z_loss_mlp": 0.33740234, + "step": 1666, + "time_per_iteration": 2.9224231243133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088823, + "balance_loss_mlp": 1.05487204, + "epoch": 0.32070026933435936, + "flos": 549177283584.0, + "grad_norm": 0.1345109768982335, + "language_loss": 0.8496111, + "learning_rate": 0.0007942957488695743, + "loss": 0.86049932, + "num_input_tokens_seen": 137930384, + "router_z_loss_mlp": 0.33959961, + "step": 1667, + "time_per_iteration": 2.6267666816711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086537, + "balance_loss_mlp": 1.05265749, + "epoch": 0.32089265101962294, + "flos": 744949536768.0, + "grad_norm": 0.061316479944915916, + "language_loss": 0.80963373, + "learning_rate": 0.0007940438314187833, + "loss": 0.82049918, + "num_input_tokens_seen": 138017200, + "router_z_loss_mlp": 0.33886719, + "step": 1668, + "time_per_iteration": 3.00421142578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089646, + "balance_loss_mlp": 1.05638647, + "epoch": 0.3210850327048865, + "flos": 493937395200.0, + "grad_norm": 0.05864654089818211, + "language_loss": 0.80047274, + "learning_rate": 0.0007937917998100529, + "loss": 0.81136918, + "num_input_tokens_seen": 138084048, + "router_z_loss_mlp": 0.33276367, + "step": 1669, + "time_per_iteration": 2.607917070388794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100772, + "balance_loss_mlp": 1.06610548, + "epoch": 0.32127741439015006, + "flos": 530383265280.0, + "grad_norm": 0.060159342011431034, + "language_loss": 0.78680766, + "learning_rate": 0.0007935396541412302, + "loss": 0.79781532, + "num_input_tokens_seen": 138153280, + "router_z_loss_mlp": 0.34692383, + "step": 1670, + "time_per_iteration": 2.6022346019744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108678, + "balance_loss_mlp": 1.07458389, + "epoch": 0.3214697960754136, + "flos": 500948955648.0, + "grad_norm": 0.07085567852213893, + "language_loss": 0.85879421, + "learning_rate": 0.0007932873945102068, + "loss": 0.86988097, + "num_input_tokens_seen": 138222320, + "router_z_loss_mlp": 0.34130859, + "step": 1671, + "time_per_iteration": 2.581815719604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120776, + "balance_loss_mlp": 1.10942781, + "epoch": 0.3216621777606772, + "flos": 1382579394048.0, + "grad_norm": 0.04969555951860313, + "language_loss": 0.75761777, + "learning_rate": 0.0007930350210149188, + "loss": 0.76882553, + "num_input_tokens_seen": 138449488, + "router_z_loss_mlp": 0.11328125, + "step": 1672, + "time_per_iteration": 4.821724891662598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106649, + "balance_loss_mlp": 1.07193518, + "epoch": 0.32185455944594077, + "flos": 571260999168.0, + "grad_norm": 0.05896773993357689, + "language_loss": 0.86527109, + "learning_rate": 0.0007927825337533461, + "loss": 0.87633765, + "num_input_tokens_seen": 138522496, + "router_z_loss_mlp": 0.34765625, + "step": 1673, + "time_per_iteration": 2.6640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105888, + "balance_loss_mlp": 1.07184219, + "epoch": 0.3220469411312043, + "flos": 543652715520.0, + "grad_norm": 0.06618360944756078, + "language_loss": 0.84761298, + "learning_rate": 0.0007925299328235131, + "loss": 0.85867184, + "num_input_tokens_seen": 138590096, + "router_z_loss_mlp": 0.34057617, + "step": 1674, + "time_per_iteration": 2.6524405479431152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102416, + "balance_loss_mlp": 1.06705832, + "epoch": 0.3222393228164679, + "flos": 490884834816.0, + "grad_norm": 0.05872681692102293, + "language_loss": 0.85148364, + "learning_rate": 0.000792277218323488, + "loss": 0.86250782, + "num_input_tokens_seen": 138658224, + "router_z_loss_mlp": 0.35424805, + "step": 1675, + "time_per_iteration": 2.557460069656372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100116, + "balance_loss_mlp": 1.06523526, + "epoch": 0.3224317045017314, + "flos": 490145720832.0, + "grad_norm": 0.05188137415598196, + "language_loss": 0.84647608, + "learning_rate": 0.0007920243903513833, + "loss": 0.85747719, + "num_input_tokens_seen": 138722864, + "router_z_loss_mlp": 0.34912109, + "step": 1676, + "time_per_iteration": 2.5208208560943604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092752, + "balance_loss_mlp": 1.05813313, + "epoch": 0.322624086186995, + "flos": 575505188352.0, + "grad_norm": 0.06429192544800656, + "language_loss": 0.83992624, + "learning_rate": 0.0007917714490053556, + "loss": 0.8508538, + "num_input_tokens_seen": 138791472, + "router_z_loss_mlp": 0.34667969, + "step": 1677, + "time_per_iteration": 2.653686761856079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109123, + "balance_loss_mlp": 1.05668271, + "epoch": 0.32281646787225854, + "flos": 628974305280.0, + "grad_norm": 0.048890211607645416, + "language_loss": 0.86094737, + "learning_rate": 0.0007915183943836055, + "loss": 0.87185967, + "num_input_tokens_seen": 138873424, + "router_z_loss_mlp": 0.34594727, + "step": 1678, + "time_per_iteration": 2.852612018585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083818, + "balance_loss_mlp": 1.04950905, + "epoch": 0.3230088495575221, + "flos": 781036024320.0, + "grad_norm": 0.05620908679364121, + "language_loss": 0.83880055, + "learning_rate": 0.0007912652265843773, + "loss": 0.8496387, + "num_input_tokens_seen": 138956880, + "router_z_loss_mlp": 0.34350586, + "step": 1679, + "time_per_iteration": 3.006805419921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077767, + "balance_loss_mlp": 1.0433867, + "epoch": 0.3232012312427857, + "flos": 535839432192.0, + "grad_norm": 0.04762836982551939, + "language_loss": 0.81587136, + "learning_rate": 0.0007910119457059597, + "loss": 0.82664907, + "num_input_tokens_seen": 139031296, + "router_z_loss_mlp": 0.34423828, + "step": 1680, + "time_per_iteration": 2.6930737495422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076411, + "balance_loss_mlp": 1.04129148, + "epoch": 0.32339361292804925, + "flos": 704515553280.0, + "grad_norm": 0.06110281418881611, + "language_loss": 0.80031025, + "learning_rate": 0.0007907585518466849, + "loss": 0.81107438, + "num_input_tokens_seen": 139109776, + "router_z_loss_mlp": 0.35180664, + "step": 1681, + "time_per_iteration": 2.940950870513916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081204, + "balance_loss_mlp": 1.04603744, + "epoch": 0.32358599461331283, + "flos": 452099377152.0, + "grad_norm": 0.0474614445796137, + "language_loss": 0.90124965, + "learning_rate": 0.000790505045104929, + "loss": 0.91206169, + "num_input_tokens_seen": 139174736, + "router_z_loss_mlp": 0.35205078, + "step": 1682, + "time_per_iteration": 2.4919469356536865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078264, + "balance_loss_mlp": 1.0435977, + "epoch": 0.32377837629857636, + "flos": 600597794304.0, + "grad_norm": 0.057051782898604, + "language_loss": 0.86989701, + "learning_rate": 0.0007902514255791125, + "loss": 0.88067961, + "num_input_tokens_seen": 139252064, + "router_z_loss_mlp": 0.34692383, + "step": 1683, + "time_per_iteration": 2.7545859813690186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108269, + "balance_loss_mlp": 1.04721308, + "epoch": 0.32397075798383995, + "flos": 807180636672.0, + "grad_norm": 0.05145240385219177, + "language_loss": 0.87981123, + "learning_rate": 0.0007899976933676986, + "loss": 0.89063811, + "num_input_tokens_seen": 139333328, + "router_z_loss_mlp": 0.35498047, + "step": 1684, + "time_per_iteration": 2.97807240486145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081365, + "balance_loss_mlp": 1.04638934, + "epoch": 0.3241631396691035, + "flos": 601414073856.0, + "grad_norm": 0.06429290680378846, + "language_loss": 0.8767072, + "learning_rate": 0.0007897438485691955, + "loss": 0.88752091, + "num_input_tokens_seen": 139400976, + "router_z_loss_mlp": 0.3503418, + "step": 1685, + "time_per_iteration": 2.704326868057251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083548, + "balance_loss_mlp": 1.04826176, + "epoch": 0.32435552135436707, + "flos": 473980861440.0, + "grad_norm": 0.058364951070402814, + "language_loss": 0.82023847, + "learning_rate": 0.0007894898912821542, + "loss": 0.831074, + "num_input_tokens_seen": 139465664, + "router_z_loss_mlp": 0.3527832, + "step": 1686, + "time_per_iteration": 2.5206680297851562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076257, + "balance_loss_mlp": 1.04178166, + "epoch": 0.3245479030396306, + "flos": 537824019456.0, + "grad_norm": 0.04476181031616706, + "language_loss": 0.86661267, + "learning_rate": 0.0007892358216051695, + "loss": 0.87737525, + "num_input_tokens_seen": 139541984, + "router_z_loss_mlp": 0.3449707, + "step": 1687, + "time_per_iteration": 2.7332026958465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075747, + "balance_loss_mlp": 1.04108071, + "epoch": 0.3247402847248942, + "flos": 547394927616.0, + "grad_norm": 0.05643246072623682, + "language_loss": 0.92275292, + "learning_rate": 0.0007889816396368803, + "loss": 0.93351042, + "num_input_tokens_seen": 139607408, + "router_z_loss_mlp": 0.34692383, + "step": 1688, + "time_per_iteration": 2.6158432960510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077832, + "balance_loss_mlp": 1.04388082, + "epoch": 0.3249326664101578, + "flos": 377941814784.0, + "grad_norm": 0.04953960067471088, + "language_loss": 0.85575634, + "learning_rate": 0.0007887273454759687, + "loss": 0.86653465, + "num_input_tokens_seen": 139670000, + "router_z_loss_mlp": 0.33984375, + "step": 1689, + "time_per_iteration": 2.4958317279815674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075156, + "balance_loss_mlp": 1.04051399, + "epoch": 0.3251250480954213, + "flos": 527818125312.0, + "grad_norm": 0.050587956688220255, + "language_loss": 0.82717729, + "learning_rate": 0.0007884729392211603, + "loss": 0.83792883, + "num_input_tokens_seen": 139739872, + "router_z_loss_mlp": 0.34692383, + "step": 1690, + "time_per_iteration": 2.6325736045837402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075634, + "balance_loss_mlp": 1.04113472, + "epoch": 0.3253174297806849, + "flos": 449435312640.0, + "grad_norm": 0.06211432544239721, + "language_loss": 0.85214412, + "learning_rate": 0.0007882184209712245, + "loss": 0.8629005, + "num_input_tokens_seen": 139802032, + "router_z_loss_mlp": 0.34545898, + "step": 1691, + "time_per_iteration": 2.5199012756347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076936, + "balance_loss_mlp": 1.04303288, + "epoch": 0.32550981146594843, + "flos": 703855014912.0, + "grad_norm": 0.0444021152083115, + "language_loss": 0.85646939, + "learning_rate": 0.000787963790824974, + "loss": 0.86723876, + "num_input_tokens_seen": 139885648, + "router_z_loss_mlp": 0.33935547, + "step": 1692, + "time_per_iteration": 2.9585483074188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076864, + "balance_loss_mlp": 1.04217362, + "epoch": 0.325702193151212, + "flos": 392491643904.0, + "grad_norm": 0.06035071191190156, + "language_loss": 0.89588344, + "learning_rate": 0.0007877090488812651, + "loss": 0.90665203, + "num_input_tokens_seen": 139947920, + "router_z_loss_mlp": 0.34716797, + "step": 1693, + "time_per_iteration": 2.4247992038726807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073167, + "balance_loss_mlp": 1.0379529, + "epoch": 0.32589457483647555, + "flos": 577223525376.0, + "grad_norm": 0.051335929306222446, + "language_loss": 0.83377099, + "learning_rate": 0.0007874541952389973, + "loss": 0.84450269, + "num_input_tokens_seen": 140020048, + "router_z_loss_mlp": 0.35253906, + "step": 1694, + "time_per_iteration": 2.679868459701538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074307, + "balance_loss_mlp": 1.03947401, + "epoch": 0.32608695652173914, + "flos": 498092834304.0, + "grad_norm": 0.051580366849716015, + "language_loss": 0.86795568, + "learning_rate": 0.0007871992299971136, + "loss": 0.87869877, + "num_input_tokens_seen": 140085600, + "router_z_loss_mlp": 0.34887695, + "step": 1695, + "time_per_iteration": 2.6005072593688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079265, + "balance_loss_mlp": 1.04431272, + "epoch": 0.32627933820700267, + "flos": 590858150400.0, + "grad_norm": 0.054409905067417906, + "language_loss": 0.84529006, + "learning_rate": 0.0007869441532546001, + "loss": 0.85608268, + "num_input_tokens_seen": 140155152, + "router_z_loss_mlp": 0.34985352, + "step": 1696, + "time_per_iteration": 2.7373292446136475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071229, + "balance_loss_mlp": 1.03749299, + "epoch": 0.32647171989226625, + "flos": 608790809088.0, + "grad_norm": 0.05196776598603691, + "language_loss": 0.79551816, + "learning_rate": 0.0007866889651104867, + "loss": 0.80623043, + "num_input_tokens_seen": 140228560, + "router_z_loss_mlp": 0.33764648, + "step": 1697, + "time_per_iteration": 2.768869638442993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069598, + "balance_loss_mlp": 1.03464603, + "epoch": 0.32666410157752984, + "flos": 476896619520.0, + "grad_norm": 0.05699082390473629, + "language_loss": 0.83313, + "learning_rate": 0.000786433665663846, + "loss": 0.84382606, + "num_input_tokens_seen": 140297952, + "router_z_loss_mlp": 0.34985352, + "step": 1698, + "time_per_iteration": 2.6574184894561768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070595, + "balance_loss_mlp": 1.03664398, + "epoch": 0.3268564832627934, + "flos": 718060018176.0, + "grad_norm": 0.0499104315286651, + "language_loss": 0.86441195, + "learning_rate": 0.0007861782550137942, + "loss": 0.8751179, + "num_input_tokens_seen": 140373408, + "router_z_loss_mlp": 0.33984375, + "step": 1699, + "time_per_iteration": 2.8897016048431396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071541, + "balance_loss_mlp": 1.0379957, + "epoch": 0.32704886494805696, + "flos": 768469372416.0, + "grad_norm": 0.05892131453680714, + "language_loss": 0.85990739, + "learning_rate": 0.0007859227332594901, + "loss": 0.87062275, + "num_input_tokens_seen": 140451840, + "router_z_loss_mlp": 0.33569336, + "step": 1700, + "time_per_iteration": 2.8941755294799805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080492, + "balance_loss_mlp": 1.046803, + "epoch": 0.3272412466333205, + "flos": 849540980736.0, + "grad_norm": 0.0647173620985618, + "language_loss": 0.84537613, + "learning_rate": 0.0007856671005001365, + "loss": 0.85618103, + "num_input_tokens_seen": 140537696, + "router_z_loss_mlp": 0.3371582, + "step": 1701, + "time_per_iteration": 3.1362555027008057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107778, + "balance_loss_mlp": 1.04373336, + "epoch": 0.3274336283185841, + "flos": 831224208384.0, + "grad_norm": 0.055785838120560656, + "language_loss": 0.81608075, + "learning_rate": 0.0007854113568349787, + "loss": 0.82685852, + "num_input_tokens_seen": 140623536, + "router_z_loss_mlp": 0.34082031, + "step": 1702, + "time_per_iteration": 3.0684425830841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108348, + "balance_loss_mlp": 1.04900455, + "epoch": 0.3276260100038476, + "flos": 691721938944.0, + "grad_norm": 0.059478075679183354, + "language_loss": 0.80008304, + "learning_rate": 0.0007851555023633052, + "loss": 0.81091785, + "num_input_tokens_seen": 140700688, + "router_z_loss_mlp": 0.34521484, + "step": 1703, + "time_per_iteration": 2.829838991165161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083551, + "balance_loss_mlp": 1.04974365, + "epoch": 0.3278183916891112, + "flos": 435831211008.0, + "grad_norm": 0.05938301584715095, + "language_loss": 0.82290888, + "learning_rate": 0.0007848995371844474, + "loss": 0.83374435, + "num_input_tokens_seen": 140765808, + "router_z_loss_mlp": 0.33837891, + "step": 1704, + "time_per_iteration": 2.498462200164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108751, + "balance_loss_mlp": 1.0532254, + "epoch": 0.3280107733743748, + "flos": 460883119104.0, + "grad_norm": 0.06015932024064871, + "language_loss": 0.80622214, + "learning_rate": 0.0007846434613977801, + "loss": 0.81709725, + "num_input_tokens_seen": 140830512, + "router_z_loss_mlp": 0.34326172, + "step": 1705, + "time_per_iteration": 2.4933369159698486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087045, + "balance_loss_mlp": 1.05330932, + "epoch": 0.3282031550596383, + "flos": 679018484736.0, + "grad_norm": 0.05558890685700398, + "language_loss": 0.78265798, + "learning_rate": 0.0007843872751027203, + "loss": 0.7935285, + "num_input_tokens_seen": 140902816, + "router_z_loss_mlp": 0.33764648, + "step": 1706, + "time_per_iteration": 2.81091046333313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090191, + "balance_loss_mlp": 1.05621648, + "epoch": 0.3283955367449019, + "flos": 544821023232.0, + "grad_norm": 0.10097233050810657, + "language_loss": 0.87312186, + "learning_rate": 0.0007841309783987287, + "loss": 0.88402379, + "num_input_tokens_seen": 140975488, + "router_z_loss_mlp": 0.34008789, + "step": 1707, + "time_per_iteration": 2.7456212043762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083118, + "balance_loss_mlp": 1.0490005, + "epoch": 0.32858791843016544, + "flos": 481017153024.0, + "grad_norm": 0.06288690811568091, + "language_loss": 0.89185357, + "learning_rate": 0.0007838745713853084, + "loss": 0.90268475, + "num_input_tokens_seen": 141043248, + "router_z_loss_mlp": 0.34155273, + "step": 1708, + "time_per_iteration": 2.5734565258026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086944, + "balance_loss_mlp": 1.0529933, + "epoch": 0.328780300115429, + "flos": 566529389568.0, + "grad_norm": 0.059735917623485235, + "language_loss": 0.84101981, + "learning_rate": 0.0007836180541620053, + "loss": 0.85188925, + "num_input_tokens_seen": 141119408, + "router_z_loss_mlp": 0.33984375, + "step": 1709, + "time_per_iteration": 2.6734848022460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082239, + "balance_loss_mlp": 1.04843152, + "epoch": 0.32897268180069256, + "flos": 475787948544.0, + "grad_norm": 0.06557165815913592, + "language_loss": 0.86666405, + "learning_rate": 0.0007833614268284082, + "loss": 0.87748647, + "num_input_tokens_seen": 141184112, + "router_z_loss_mlp": 0.33813477, + "step": 1710, + "time_per_iteration": 2.5004236698150635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109204, + "balance_loss_mlp": 1.07611382, + "epoch": 0.32916506348595614, + "flos": 1576517008896.0, + "grad_norm": 0.028486921929439343, + "language_loss": 0.74109769, + "learning_rate": 0.0007831046894841489, + "loss": 0.75201809, + "num_input_tokens_seen": 141414960, + "router_z_loss_mlp": 0.15917969, + "step": 1711, + "time_per_iteration": 4.857421159744263 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084065, + "balance_loss_mlp": 1.05011439, + "epoch": 0.3293574451712197, + "flos": 482646739968.0, + "grad_norm": 0.05383776069577274, + "language_loss": 0.78376174, + "learning_rate": 0.0007828478422289016, + "loss": 0.79460239, + "num_input_tokens_seen": 141485744, + "router_z_loss_mlp": 0.33984375, + "step": 1712, + "time_per_iteration": 2.5763661861419678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089524, + "balance_loss_mlp": 1.05438161, + "epoch": 0.32954982685648326, + "flos": 622266872832.0, + "grad_norm": 0.05220026625301518, + "language_loss": 0.89185119, + "learning_rate": 0.0007825908851623833, + "loss": 0.90274644, + "num_input_tokens_seen": 141560592, + "router_z_loss_mlp": 0.35205078, + "step": 1713, + "time_per_iteration": 2.7262768745422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081531, + "balance_loss_mlp": 1.04648352, + "epoch": 0.32974220854174685, + "flos": 544697367552.0, + "grad_norm": 0.06806070360888057, + "language_loss": 0.85360491, + "learning_rate": 0.0007823338183843533, + "loss": 0.86442018, + "num_input_tokens_seen": 141630400, + "router_z_loss_mlp": 0.35083008, + "step": 1714, + "time_per_iteration": 2.652278184890747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078166, + "balance_loss_mlp": 1.0447638, + "epoch": 0.3299345902270104, + "flos": 981740708352.0, + "grad_norm": 0.05603975865081876, + "language_loss": 0.8075726, + "learning_rate": 0.0007820766419946141, + "loss": 0.81835425, + "num_input_tokens_seen": 141721552, + "router_z_loss_mlp": 0.33422852, + "step": 1715, + "time_per_iteration": 3.282278537750244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087722, + "balance_loss_mlp": 1.07227242, + "epoch": 0.33012697191227397, + "flos": 1402857432576.0, + "grad_norm": 0.02753251532821737, + "language_loss": 0.7967248, + "learning_rate": 0.0007818193560930102, + "loss": 0.80760199, + "num_input_tokens_seen": 141956464, + "router_z_loss_mlp": 0.15429688, + "step": 1716, + "time_per_iteration": 4.925649881362915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081747, + "balance_loss_mlp": 1.04789162, + "epoch": 0.3303193535975375, + "flos": 504897781248.0, + "grad_norm": 0.09582479469105179, + "language_loss": 0.75968826, + "learning_rate": 0.0007815619607794288, + "loss": 0.77050573, + "num_input_tokens_seen": 142029552, + "router_z_loss_mlp": 0.33886719, + "step": 1717, + "time_per_iteration": 2.653355598449707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077451, + "balance_loss_mlp": 1.04423952, + "epoch": 0.3305117352828011, + "flos": 937602390528.0, + "grad_norm": 0.059316474336830904, + "language_loss": 0.8254683, + "learning_rate": 0.0007813044561538001, + "loss": 0.83624279, + "num_input_tokens_seen": 142117344, + "router_z_loss_mlp": 0.33227539, + "step": 1718, + "time_per_iteration": 3.1251535415649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084283, + "balance_loss_mlp": 1.05035567, + "epoch": 0.3307041169680646, + "flos": 721176597504.0, + "grad_norm": 0.08429030846847434, + "language_loss": 0.88411027, + "learning_rate": 0.0007810468423160958, + "loss": 0.89495313, + "num_input_tokens_seen": 142190096, + "router_z_loss_mlp": 0.33959961, + "step": 1719, + "time_per_iteration": 2.8598783016204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090007, + "balance_loss_mlp": 1.05760598, + "epoch": 0.3308964986533282, + "flos": 583315499520.0, + "grad_norm": 0.04547421634197757, + "language_loss": 0.81920642, + "learning_rate": 0.0007807891193663306, + "loss": 0.8301065, + "num_input_tokens_seen": 142265584, + "router_z_loss_mlp": 0.32397461, + "step": 1720, + "time_per_iteration": 2.7775802612304688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092448, + "balance_loss_mlp": 1.0582583, + "epoch": 0.33108888033859174, + "flos": 473340672000.0, + "grad_norm": 0.07591254280459368, + "language_loss": 0.82440275, + "learning_rate": 0.0007805312874045614, + "loss": 0.83532727, + "num_input_tokens_seen": 142330352, + "router_z_loss_mlp": 0.34228516, + "step": 1721, + "time_per_iteration": 2.5138351917266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100808, + "balance_loss_mlp": 1.06657076, + "epoch": 0.3312812620238553, + "flos": 385913659392.0, + "grad_norm": 0.08101052667778896, + "language_loss": 0.86919391, + "learning_rate": 0.0007802733465308874, + "loss": 0.88020205, + "num_input_tokens_seen": 142392208, + "router_z_loss_mlp": 0.34277344, + "step": 1722, + "time_per_iteration": 2.440809726715088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106056, + "balance_loss_mlp": 1.07074606, + "epoch": 0.3314736437091189, + "flos": 494292395520.0, + "grad_norm": 0.0567806329299034, + "language_loss": 0.8509872, + "learning_rate": 0.0007800152968454501, + "loss": 0.86204773, + "num_input_tokens_seen": 142462112, + "router_z_loss_mlp": 0.35375977, + "step": 1723, + "time_per_iteration": 2.61602520942688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090428, + "balance_loss_mlp": 1.056072, + "epoch": 0.33166602539438245, + "flos": 653346736128.0, + "grad_norm": 0.038882210578800376, + "language_loss": 0.90476918, + "learning_rate": 0.0007797571384484334, + "loss": 0.91567349, + "num_input_tokens_seen": 142539120, + "router_z_loss_mlp": 0.34399414, + "step": 1724, + "time_per_iteration": 2.857562780380249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090291, + "balance_loss_mlp": 1.05586314, + "epoch": 0.33185840707964603, + "flos": 520550489088.0, + "grad_norm": 0.04870849772261114, + "language_loss": 0.91599178, + "learning_rate": 0.0007794988714400633, + "loss": 0.92689478, + "num_input_tokens_seen": 142611520, + "router_z_loss_mlp": 0.34448242, + "step": 1725, + "time_per_iteration": 2.5884361267089844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097227, + "balance_loss_mlp": 1.06077266, + "epoch": 0.33205078876490957, + "flos": 436712919552.0, + "grad_norm": 0.05260760436426434, + "language_loss": 0.85199809, + "learning_rate": 0.0007792404959206079, + "loss": 0.86297035, + "num_input_tokens_seen": 142676064, + "router_z_loss_mlp": 0.36474609, + "step": 1726, + "time_per_iteration": 2.4855122566223145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095942, + "balance_loss_mlp": 1.05965447, + "epoch": 0.33224317045017315, + "flos": 768400971264.0, + "grad_norm": 0.052329818141719754, + "language_loss": 0.81527805, + "learning_rate": 0.0007789820119903774, + "loss": 0.82623744, + "num_input_tokens_seen": 142750944, + "router_z_loss_mlp": 0.36279297, + "step": 1727, + "time_per_iteration": 2.945405960083008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105991, + "balance_loss_mlp": 1.04579556, + "epoch": 0.3324355521354367, + "flos": 1465656090624.0, + "grad_norm": 0.02932642968329903, + "language_loss": 0.78492665, + "learning_rate": 0.0007787234197497242, + "loss": 0.79552573, + "num_input_tokens_seen": 142974032, + "router_z_loss_mlp": 0.14160156, + "step": 1728, + "time_per_iteration": 4.810296297073364 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084477, + "balance_loss_mlp": 1.04880977, + "epoch": 0.3326279338207003, + "flos": 496415195136.0, + "grad_norm": 0.05339720943334808, + "language_loss": 0.83919221, + "learning_rate": 0.0007784647192990428, + "loss": 0.85003698, + "num_input_tokens_seen": 143047280, + "router_z_loss_mlp": 0.35693359, + "step": 1729, + "time_per_iteration": 2.6848132610321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083753, + "balance_loss_mlp": 1.04844344, + "epoch": 0.33282031550596386, + "flos": 635600342016.0, + "grad_norm": 0.05212570885713578, + "language_loss": 0.80661625, + "learning_rate": 0.0007782059107387696, + "loss": 0.81745386, + "num_input_tokens_seen": 143124224, + "router_z_loss_mlp": 0.35351562, + "step": 1730, + "time_per_iteration": 2.874356269836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078294, + "balance_loss_mlp": 1.04329371, + "epoch": 0.3330126971912274, + "flos": 689210643456.0, + "grad_norm": 0.05636936917064103, + "language_loss": 0.88407743, + "learning_rate": 0.0007779469941693826, + "loss": 0.89486033, + "num_input_tokens_seen": 143194048, + "router_z_loss_mlp": 0.3503418, + "step": 1731, + "time_per_iteration": 2.7914862632751465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079351, + "balance_loss_mlp": 1.04511368, + "epoch": 0.333205078876491, + "flos": 566184563712.0, + "grad_norm": 0.05730145040609657, + "language_loss": 0.77017218, + "learning_rate": 0.0007776879696914029, + "loss": 0.78096569, + "num_input_tokens_seen": 143272976, + "router_z_loss_mlp": 0.34277344, + "step": 1732, + "time_per_iteration": 2.8158769607543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081, + "balance_loss_mlp": 1.04666734, + "epoch": 0.3333974605617545, + "flos": 640618550784.0, + "grad_norm": 0.044212495165629015, + "language_loss": 0.8903594, + "learning_rate": 0.000777428837405392, + "loss": 0.90116942, + "num_input_tokens_seen": 143346496, + "router_z_loss_mlp": 0.34375, + "step": 1733, + "time_per_iteration": 2.8417906761169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079717, + "balance_loss_mlp": 1.04536092, + "epoch": 0.3335898422470181, + "flos": 461597501952.0, + "grad_norm": 0.05109390766697835, + "language_loss": 0.87070495, + "learning_rate": 0.0007771695974119544, + "loss": 0.88150203, + "num_input_tokens_seen": 143410448, + "router_z_loss_mlp": 0.34399414, + "step": 1734, + "time_per_iteration": 2.4995057582855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078764, + "balance_loss_mlp": 1.04514742, + "epoch": 0.33378222393228163, + "flos": 852504791040.0, + "grad_norm": 0.05825672376588237, + "language_loss": 0.75576115, + "learning_rate": 0.0007769102498117359, + "loss": 0.76654887, + "num_input_tokens_seen": 143492416, + "router_z_loss_mlp": 0.33642578, + "step": 1735, + "time_per_iteration": 3.0868663787841797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083295, + "balance_loss_mlp": 1.04991651, + "epoch": 0.3339746056175452, + "flos": 954256080384.0, + "grad_norm": 0.05069255593645712, + "language_loss": 0.79858601, + "learning_rate": 0.000776650794705424, + "loss": 0.80941892, + "num_input_tokens_seen": 143590096, + "router_z_loss_mlp": 0.33398438, + "step": 1736, + "time_per_iteration": 3.2665328979492188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108809, + "balance_loss_mlp": 1.05387688, + "epoch": 0.33416698730280875, + "flos": 544559155200.0, + "grad_norm": 0.045819605067785145, + "language_loss": 0.82160866, + "learning_rate": 0.0007763912321937483, + "loss": 0.83248949, + "num_input_tokens_seen": 143663344, + "router_z_loss_mlp": 0.34228516, + "step": 1737, + "time_per_iteration": 2.677316665649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081816, + "balance_loss_mlp": 1.04798412, + "epoch": 0.33435936898807234, + "flos": 1013652817920.0, + "grad_norm": 0.053421386657792044, + "language_loss": 0.82471478, + "learning_rate": 0.0007761315623774799, + "loss": 0.8355329, + "num_input_tokens_seen": 143753072, + "router_z_loss_mlp": 0.33862305, + "step": 1738, + "time_per_iteration": 3.4182283878326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089571, + "balance_loss_mlp": 1.05554879, + "epoch": 0.3345517506733359, + "flos": 614935217664.0, + "grad_norm": 0.051536505858366714, + "language_loss": 0.87671852, + "learning_rate": 0.0007758717853574313, + "loss": 0.88761419, + "num_input_tokens_seen": 143827280, + "router_z_loss_mlp": 0.34057617, + "step": 1739, + "time_per_iteration": 2.7348380088806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084561, + "balance_loss_mlp": 1.05125391, + "epoch": 0.33474413235859946, + "flos": 494350622208.0, + "grad_norm": 0.06141180611747274, + "language_loss": 0.9002257, + "learning_rate": 0.0007756119012344571, + "loss": 0.91107136, + "num_input_tokens_seen": 143895072, + "router_z_loss_mlp": 0.33325195, + "step": 1740, + "time_per_iteration": 2.536121129989624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091135, + "balance_loss_mlp": 1.05754209, + "epoch": 0.33493651404386304, + "flos": 628105743360.0, + "grad_norm": 0.06662069566578578, + "language_loss": 0.84404671, + "learning_rate": 0.0007753519101094535, + "loss": 0.85495806, + "num_input_tokens_seen": 143965728, + "router_z_loss_mlp": 0.33618164, + "step": 1741, + "time_per_iteration": 2.753371238708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082833, + "balance_loss_mlp": 1.04945421, + "epoch": 0.3351288957291266, + "flos": 513474909696.0, + "grad_norm": 0.05750412427252262, + "language_loss": 0.86366677, + "learning_rate": 0.0007750918120833575, + "loss": 0.87449515, + "num_input_tokens_seen": 144030272, + "router_z_loss_mlp": 0.33398438, + "step": 1742, + "time_per_iteration": 2.56093168258667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082276, + "balance_loss_mlp": 1.05037546, + "epoch": 0.33532127741439016, + "flos": 647008860672.0, + "grad_norm": 0.0676260973392943, + "language_loss": 0.87342101, + "learning_rate": 0.0007748316072571485, + "loss": 0.88424373, + "num_input_tokens_seen": 144104048, + "router_z_loss_mlp": 0.31884766, + "step": 1743, + "time_per_iteration": 2.7759556770324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086123, + "balance_loss_mlp": 1.05193388, + "epoch": 0.3355136590996537, + "flos": 768134721024.0, + "grad_norm": 0.047185436483198326, + "language_loss": 0.79306734, + "learning_rate": 0.0007745712957318467, + "loss": 0.80392861, + "num_input_tokens_seen": 144180432, + "router_z_loss_mlp": 0.34228516, + "step": 1744, + "time_per_iteration": 2.959686756134033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085284, + "balance_loss_mlp": 1.05119014, + "epoch": 0.3357060407849173, + "flos": 595259490816.0, + "grad_norm": 0.046948111550021425, + "language_loss": 0.86506951, + "learning_rate": 0.0007743108776085141, + "loss": 0.87592232, + "num_input_tokens_seen": 144258704, + "router_z_loss_mlp": 0.34106445, + "step": 1745, + "time_per_iteration": 2.7391204833984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089169, + "balance_loss_mlp": 1.05462217, + "epoch": 0.3358984224701808, + "flos": 598288730112.0, + "grad_norm": 0.04983419543630797, + "language_loss": 0.82728243, + "learning_rate": 0.0007740503529882543, + "loss": 0.8381741, + "num_input_tokens_seen": 144335104, + "router_z_loss_mlp": 0.34594727, + "step": 1746, + "time_per_iteration": 2.788041114807129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108953, + "balance_loss_mlp": 1.05474496, + "epoch": 0.3360908041554444, + "flos": 578055771648.0, + "grad_norm": 0.05677254755827829, + "language_loss": 0.91252941, + "learning_rate": 0.0007737897219722114, + "loss": 0.92342472, + "num_input_tokens_seen": 144402912, + "router_z_loss_mlp": 0.34790039, + "step": 1747, + "time_per_iteration": 2.6752376556396484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083715, + "balance_loss_mlp": 1.04945374, + "epoch": 0.336283185840708, + "flos": 513332315136.0, + "grad_norm": 0.05427874766165502, + "language_loss": 0.81146061, + "learning_rate": 0.0007735289846615716, + "loss": 0.82229781, + "num_input_tokens_seen": 144475328, + "router_z_loss_mlp": 0.34301758, + "step": 1748, + "time_per_iteration": 2.670315742492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083181, + "balance_loss_mlp": 1.04984999, + "epoch": 0.3364755675259715, + "flos": 524716102656.0, + "grad_norm": 0.05445380235157479, + "language_loss": 0.81899059, + "learning_rate": 0.0007732681411575621, + "loss": 0.82982242, + "num_input_tokens_seen": 144548288, + "router_z_loss_mlp": 0.33349609, + "step": 1749, + "time_per_iteration": 2.644740104675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079502, + "balance_loss_mlp": 1.04567027, + "epoch": 0.3366679492112351, + "flos": 554594162688.0, + "grad_norm": 0.05291201013717534, + "language_loss": 0.87517959, + "learning_rate": 0.0007730071915614514, + "loss": 0.88597459, + "num_input_tokens_seen": 144619488, + "router_z_loss_mlp": 0.33862305, + "step": 1750, + "time_per_iteration": 2.6605777740478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082694, + "balance_loss_mlp": 1.04874277, + "epoch": 0.33686033089649864, + "flos": 427051851264.0, + "grad_norm": 0.07867660779661921, + "language_loss": 0.88976741, + "learning_rate": 0.0007727461359745489, + "loss": 0.90059435, + "num_input_tokens_seen": 144682560, + "router_z_loss_mlp": 0.33984375, + "step": 1751, + "time_per_iteration": 2.4562768936157227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082471, + "balance_loss_mlp": 1.04987907, + "epoch": 0.3370527125817622, + "flos": 541452750336.0, + "grad_norm": 0.05472309390748721, + "language_loss": 0.86156446, + "learning_rate": 0.0007724849744982056, + "loss": 0.87238914, + "num_input_tokens_seen": 144753328, + "router_z_loss_mlp": 0.32592773, + "step": 1752, + "time_per_iteration": 2.683575391769409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086284, + "balance_loss_mlp": 1.05295336, + "epoch": 0.33724509426702576, + "flos": 541836864000.0, + "grad_norm": 0.052181206472060114, + "language_loss": 0.81578106, + "learning_rate": 0.0007722237072338131, + "loss": 0.82664388, + "num_input_tokens_seen": 144827312, + "router_z_loss_mlp": 0.33349609, + "step": 1753, + "time_per_iteration": 2.7059788703918457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108949, + "balance_loss_mlp": 1.05563486, + "epoch": 0.33743747595228935, + "flos": 472557888000.0, + "grad_norm": 0.063588606701447, + "language_loss": 0.85402888, + "learning_rate": 0.0007719623342828046, + "loss": 0.86492383, + "num_input_tokens_seen": 144893488, + "router_z_loss_mlp": 0.33886719, + "step": 1754, + "time_per_iteration": 2.5117459297180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090728, + "balance_loss_mlp": 1.05708706, + "epoch": 0.33762985763755293, + "flos": 469564964352.0, + "grad_norm": 0.05602573096673387, + "language_loss": 0.84115714, + "learning_rate": 0.000771700855746654, + "loss": 0.85206437, + "num_input_tokens_seen": 144961152, + "router_z_loss_mlp": 0.33666992, + "step": 1755, + "time_per_iteration": 2.5685064792633057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085423, + "balance_loss_mlp": 1.05214, + "epoch": 0.33782223932281646, + "flos": 492002270208.0, + "grad_norm": 0.05352941428578995, + "language_loss": 0.88329422, + "learning_rate": 0.0007714392717268763, + "loss": 0.89414847, + "num_input_tokens_seen": 145030576, + "router_z_loss_mlp": 0.33300781, + "step": 1756, + "time_per_iteration": 2.568432569503784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084365, + "balance_loss_mlp": 1.04981852, + "epoch": 0.33801462100808005, + "flos": 464827562496.0, + "grad_norm": 0.051807056092833426, + "language_loss": 0.86368155, + "learning_rate": 0.0007711775823250273, + "loss": 0.87452519, + "num_input_tokens_seen": 145095648, + "router_z_loss_mlp": 0.34594727, + "step": 1757, + "time_per_iteration": 2.5542263984680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010902, + "balance_loss_mlp": 1.05660665, + "epoch": 0.3382070026933436, + "flos": 795319603200.0, + "grad_norm": 0.05510084593487172, + "language_loss": 0.83019066, + "learning_rate": 0.0007709157876427039, + "loss": 0.84109271, + "num_input_tokens_seen": 145181248, + "router_z_loss_mlp": 0.33618164, + "step": 1758, + "time_per_iteration": 3.07852840423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082149, + "balance_loss_mlp": 1.04903245, + "epoch": 0.33839938437860717, + "flos": 508181686272.0, + "grad_norm": 0.0524958838474987, + "language_loss": 0.85602981, + "learning_rate": 0.0007706538877815439, + "loss": 0.86685127, + "num_input_tokens_seen": 145252944, + "router_z_loss_mlp": 0.33129883, + "step": 1759, + "time_per_iteration": 2.6002085208892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082716, + "balance_loss_mlp": 1.05021918, + "epoch": 0.3385917660638707, + "flos": 483986755584.0, + "grad_norm": 0.05079207863068971, + "language_loss": 0.83150595, + "learning_rate": 0.0007703918828432259, + "loss": 0.84233308, + "num_input_tokens_seen": 145323168, + "router_z_loss_mlp": 0.32495117, + "step": 1760, + "time_per_iteration": 2.5961215496063232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086286, + "balance_loss_mlp": 1.05297899, + "epoch": 0.3387841477491343, + "flos": 545071306752.0, + "grad_norm": 0.0542286668270813, + "language_loss": 0.89021361, + "learning_rate": 0.000770129772929469, + "loss": 0.9010765, + "num_input_tokens_seen": 145395776, + "router_z_loss_mlp": 0.33325195, + "step": 1761, + "time_per_iteration": 2.6393394470214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076289, + "balance_loss_mlp": 1.04264784, + "epoch": 0.3389765294343978, + "flos": 719487373824.0, + "grad_norm": 0.057381721603975526, + "language_loss": 0.88803625, + "learning_rate": 0.0007698675581420334, + "loss": 0.89879912, + "num_input_tokens_seen": 145470576, + "router_z_loss_mlp": 0.33666992, + "step": 1762, + "time_per_iteration": 2.8959014415740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073065, + "balance_loss_mlp": 1.03968656, + "epoch": 0.3391689111196614, + "flos": 699596269056.0, + "grad_norm": 0.05381480837735757, + "language_loss": 0.78922743, + "learning_rate": 0.0007696052385827199, + "loss": 0.79995811, + "num_input_tokens_seen": 145548896, + "router_z_loss_mlp": 0.33398438, + "step": 1763, + "time_per_iteration": 2.9110584259033203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068767, + "balance_loss_mlp": 1.03519773, + "epoch": 0.339361292804925, + "flos": 626806425600.0, + "grad_norm": 0.05521588721088573, + "language_loss": 0.78407156, + "learning_rate": 0.00076934281435337, + "loss": 0.79475927, + "num_input_tokens_seen": 145617136, + "router_z_loss_mlp": 0.33569336, + "step": 1764, + "time_per_iteration": 2.7673115730285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073935, + "balance_loss_mlp": 1.04043674, + "epoch": 0.33955367449018853, + "flos": 609302960640.0, + "grad_norm": 0.0615155635578628, + "language_loss": 0.85995364, + "learning_rate": 0.0007690802855558658, + "loss": 0.87069303, + "num_input_tokens_seen": 145696416, + "router_z_loss_mlp": 0.33520508, + "step": 1765, + "time_per_iteration": 2.871255397796631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083714, + "balance_loss_mlp": 1.07131636, + "epoch": 0.3397460561754521, + "flos": 1452494177280.0, + "grad_norm": 0.03113174858532202, + "language_loss": 0.76374954, + "learning_rate": 0.0007688176522921302, + "loss": 0.77458668, + "num_input_tokens_seen": 145919680, + "router_z_loss_mlp": 0.12402344, + "step": 1766, + "time_per_iteration": 4.906975746154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089831, + "balance_loss_mlp": 1.05485463, + "epoch": 0.33993843786071565, + "flos": 487068429312.0, + "grad_norm": 0.059784397062932884, + "language_loss": 0.89060128, + "learning_rate": 0.0007685549146641262, + "loss": 0.90149957, + "num_input_tokens_seen": 145984272, + "router_z_loss_mlp": 0.35009766, + "step": 1767, + "time_per_iteration": 2.5172948837280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081834, + "balance_loss_mlp": 1.04683375, + "epoch": 0.34013081954597923, + "flos": 417115768320.0, + "grad_norm": 0.05470212710373979, + "language_loss": 0.88085568, + "learning_rate": 0.0007682920727738579, + "loss": 0.89167398, + "num_input_tokens_seen": 146047248, + "router_z_loss_mlp": 0.35058594, + "step": 1768, + "time_per_iteration": 2.4423539638519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084324, + "balance_loss_mlp": 1.04939604, + "epoch": 0.34032320123124277, + "flos": 437293472256.0, + "grad_norm": 0.06228189549734304, + "language_loss": 0.84428132, + "learning_rate": 0.000768029126723369, + "loss": 0.85512453, + "num_input_tokens_seen": 146111872, + "router_z_loss_mlp": 0.34960938, + "step": 1769, + "time_per_iteration": 2.5054280757904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082733, + "balance_loss_mlp": 1.04811513, + "epoch": 0.34051558291650635, + "flos": 457353312768.0, + "grad_norm": 0.058774755629116764, + "language_loss": 0.81489038, + "learning_rate": 0.0007677660766147447, + "loss": 0.82571769, + "num_input_tokens_seen": 146172608, + "router_z_loss_mlp": 0.34667969, + "step": 1770, + "time_per_iteration": 2.524327039718628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01030223, + "balance_loss_mlp": 1.01844561, + "epoch": 0.3407079646017699, + "flos": 1558029938688.0, + "grad_norm": 0.017684799672329513, + "language_loss": 0.72470945, + "learning_rate": 0.0007675029225501102, + "loss": 0.7350117, + "num_input_tokens_seen": 146413584, + "router_z_loss_mlp": 0.11767578, + "step": 1771, + "time_per_iteration": 4.924427032470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081114, + "balance_loss_mlp": 1.04666233, + "epoch": 0.3409003462870335, + "flos": 492312190464.0, + "grad_norm": 0.06375677891517043, + "language_loss": 0.79619604, + "learning_rate": 0.0007672396646316306, + "loss": 0.80700719, + "num_input_tokens_seen": 146476992, + "router_z_loss_mlp": 0.3449707, + "step": 1772, + "time_per_iteration": 2.5239012241363525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081305, + "balance_loss_mlp": 1.04589987, + "epoch": 0.34109272797229706, + "flos": 808145303040.0, + "grad_norm": 0.06003817873980187, + "language_loss": 0.80518734, + "learning_rate": 0.000766976302961512, + "loss": 0.81600046, + "num_input_tokens_seen": 146552848, + "router_z_loss_mlp": 0.35424805, + "step": 1773, + "time_per_iteration": 2.9730074405670166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083157, + "balance_loss_mlp": 1.04834807, + "epoch": 0.3412851096575606, + "flos": 469903997952.0, + "grad_norm": 0.05958263274361502, + "language_loss": 0.81420594, + "learning_rate": 0.0007667128376420003, + "loss": 0.82503754, + "num_input_tokens_seen": 146617504, + "router_z_loss_mlp": 0.34863281, + "step": 1774, + "time_per_iteration": 2.521329879760742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073378, + "balance_loss_mlp": 1.03842556, + "epoch": 0.3414774913428242, + "flos": 595402085376.0, + "grad_norm": 0.09709010294240925, + "language_loss": 0.84563607, + "learning_rate": 0.0007664492687753817, + "loss": 0.85636985, + "num_input_tokens_seen": 146691568, + "router_z_loss_mlp": 0.34985352, + "step": 1775, + "time_per_iteration": 2.6744766235351562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072901, + "balance_loss_mlp": 1.03976059, + "epoch": 0.3416698730280877, + "flos": 527202667008.0, + "grad_norm": 0.05030413358353647, + "language_loss": 0.81513566, + "learning_rate": 0.000766185596463983, + "loss": 0.82586467, + "num_input_tokens_seen": 146764208, + "router_z_loss_mlp": 0.33154297, + "step": 1776, + "time_per_iteration": 2.6050221920013428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073672, + "balance_loss_mlp": 1.03962612, + "epoch": 0.3418622547133513, + "flos": 874272794112.0, + "grad_norm": 0.05039515754698922, + "language_loss": 0.76683038, + "learning_rate": 0.0007659218208101706, + "loss": 0.77756709, + "num_input_tokens_seen": 146847744, + "router_z_loss_mlp": 0.34082031, + "step": 1777, + "time_per_iteration": 3.0900516510009766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079189, + "balance_loss_mlp": 1.04504752, + "epoch": 0.34205463639861483, + "flos": 603462680064.0, + "grad_norm": 0.04915159817243754, + "language_loss": 0.84680861, + "learning_rate": 0.0007656579419163515, + "loss": 0.85760045, + "num_input_tokens_seen": 146918336, + "router_z_loss_mlp": 0.34179688, + "step": 1778, + "time_per_iteration": 2.7328884601593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081812, + "balance_loss_mlp": 1.04774237, + "epoch": 0.3422470180838784, + "flos": 463547183616.0, + "grad_norm": 0.05230649511498847, + "language_loss": 0.76939148, + "learning_rate": 0.0007653939598849724, + "loss": 0.7802096, + "num_input_tokens_seen": 146982496, + "router_z_loss_mlp": 0.34106445, + "step": 1779, + "time_per_iteration": 2.5020573139190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102719, + "balance_loss_mlp": 1.01441097, + "epoch": 0.34243939976914195, + "flos": 1585584377856.0, + "grad_norm": 0.019842751190116498, + "language_loss": 0.82880205, + "learning_rate": 0.0007651298748185204, + "loss": 0.83907396, + "num_input_tokens_seen": 147213600, + "router_z_loss_mlp": 0.12792969, + "step": 1780, + "time_per_iteration": 4.919352054595947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107992, + "balance_loss_mlp": 1.04656482, + "epoch": 0.34263178145440554, + "flos": 872662146048.0, + "grad_norm": 0.0514393238831889, + "language_loss": 0.80344206, + "learning_rate": 0.000764865686819522, + "loss": 0.81424129, + "num_input_tokens_seen": 147287664, + "router_z_loss_mlp": 0.33374023, + "step": 1781, + "time_per_iteration": 3.059682846069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089318, + "balance_loss_mlp": 1.05546236, + "epoch": 0.3428241631396691, + "flos": 506630674944.0, + "grad_norm": 0.04318417455303755, + "language_loss": 0.85701579, + "learning_rate": 0.0007646013959905449, + "loss": 0.86790895, + "num_input_tokens_seen": 147356800, + "router_z_loss_mlp": 0.33886719, + "step": 1782, + "time_per_iteration": 2.572772741317749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085097, + "balance_loss_mlp": 1.05162311, + "epoch": 0.34301654482493266, + "flos": 879669324288.0, + "grad_norm": 0.05640606275212692, + "language_loss": 0.80626374, + "learning_rate": 0.0007643370024341949, + "loss": 0.81711471, + "num_input_tokens_seen": 147432496, + "router_z_loss_mlp": 0.33496094, + "step": 1783, + "time_per_iteration": 3.0805578231811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089012, + "balance_loss_mlp": 1.05472708, + "epoch": 0.34320892651019624, + "flos": 431537559552.0, + "grad_norm": 0.05116039291223259, + "language_loss": 0.82947731, + "learning_rate": 0.0007640725062531195, + "loss": 0.84036732, + "num_input_tokens_seen": 147495856, + "router_z_loss_mlp": 0.34326172, + "step": 1784, + "time_per_iteration": 2.497859239578247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083577, + "balance_loss_mlp": 1.05010295, + "epoch": 0.3434013081954598, + "flos": 463404589056.0, + "grad_norm": 0.06763804466989645, + "language_loss": 0.86272931, + "learning_rate": 0.0007638079075500047, + "loss": 0.87356508, + "num_input_tokens_seen": 147559632, + "router_z_loss_mlp": 0.33496094, + "step": 1785, + "time_per_iteration": 2.516842842102051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017655, + "balance_loss_mlp": 1.0058769, + "epoch": 0.34359368988072336, + "flos": 1556499276288.0, + "grad_norm": 0.01279941843938601, + "language_loss": 0.75180668, + "learning_rate": 0.0007635432064275772, + "loss": 0.76198322, + "num_input_tokens_seen": 147794576, + "router_z_loss_mlp": 0.11767578, + "step": 1786, + "time_per_iteration": 4.979317665100098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077073, + "balance_loss_mlp": 1.04417157, + "epoch": 0.3437860715659869, + "flos": 495267236352.0, + "grad_norm": 0.04590480874587016, + "language_loss": 0.83035767, + "learning_rate": 0.0007632784029886026, + "loss": 0.84112841, + "num_input_tokens_seen": 147866960, + "router_z_loss_mlp": 0.32910156, + "step": 1787, + "time_per_iteration": 2.6075103282928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079505, + "balance_loss_mlp": 1.04617453, + "epoch": 0.3439784532512505, + "flos": 717942154752.0, + "grad_norm": 0.04559278353066439, + "language_loss": 0.85611933, + "learning_rate": 0.0007630134973358873, + "loss": 0.86691439, + "num_input_tokens_seen": 147947808, + "router_z_loss_mlp": 0.33349609, + "step": 1788, + "time_per_iteration": 2.917405366897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086657, + "balance_loss_mlp": 1.05327868, + "epoch": 0.34417083493651407, + "flos": 565598218752.0, + "grad_norm": 0.05301353071730806, + "language_loss": 0.86864436, + "learning_rate": 0.0007627484895722763, + "loss": 0.879511, + "num_input_tokens_seen": 148015936, + "router_z_loss_mlp": 0.33398438, + "step": 1789, + "time_per_iteration": 2.6353273391723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081939, + "balance_loss_mlp": 1.04834569, + "epoch": 0.3443632166217776, + "flos": 795988905984.0, + "grad_norm": 0.057022653970397005, + "language_loss": 0.80155563, + "learning_rate": 0.0007624833798006552, + "loss": 0.81237495, + "num_input_tokens_seen": 148099776, + "router_z_loss_mlp": 0.3359375, + "step": 1790, + "time_per_iteration": 3.039126396179199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090848, + "balance_loss_mlp": 1.05692101, + "epoch": 0.3445555983070412, + "flos": 569045067264.0, + "grad_norm": 0.05940117534987587, + "language_loss": 0.84113955, + "learning_rate": 0.0007622181681239483, + "loss": 0.85204804, + "num_input_tokens_seen": 148169616, + "router_z_loss_mlp": 0.33935547, + "step": 1791, + "time_per_iteration": 2.6392083168029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083434, + "balance_loss_mlp": 1.04903054, + "epoch": 0.3447479799923047, + "flos": 568524151296.0, + "grad_norm": 0.04492792711883196, + "language_loss": 0.84501636, + "learning_rate": 0.0007619528546451202, + "loss": 0.8558507, + "num_input_tokens_seen": 148247824, + "router_z_loss_mlp": 0.34448242, + "step": 1792, + "time_per_iteration": 2.776982069015503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080197, + "balance_loss_mlp": 1.04708052, + "epoch": 0.3449403616775683, + "flos": 967323299328.0, + "grad_norm": 0.05878857203246004, + "language_loss": 0.8358798, + "learning_rate": 0.0007616874394671745, + "loss": 0.84668171, + "num_input_tokens_seen": 148333040, + "router_z_loss_mlp": 0.33129883, + "step": 1793, + "time_per_iteration": 3.3427693843841553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074615, + "balance_loss_mlp": 1.04128361, + "epoch": 0.34513274336283184, + "flos": 568340858880.0, + "grad_norm": 0.05893035372227358, + "language_loss": 0.84961653, + "learning_rate": 0.0007614219226931547, + "loss": 0.86036265, + "num_input_tokens_seen": 148401840, + "router_z_loss_mlp": 0.33349609, + "step": 1794, + "time_per_iteration": 2.6591315269470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070169, + "balance_loss_mlp": 1.03783977, + "epoch": 0.3453251250480954, + "flos": 460715793408.0, + "grad_norm": 0.06432823181520617, + "language_loss": 0.84724808, + "learning_rate": 0.0007611563044261435, + "loss": 0.85794979, + "num_input_tokens_seen": 148466576, + "router_z_loss_mlp": 0.32324219, + "step": 1795, + "time_per_iteration": 2.51755690574646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078106, + "balance_loss_mlp": 1.0443697, + "epoch": 0.34551750673335896, + "flos": 415397431296.0, + "grad_norm": 0.0640589434438139, + "language_loss": 0.87120652, + "learning_rate": 0.0007608905847692631, + "loss": 0.88198757, + "num_input_tokens_seen": 148530016, + "router_z_loss_mlp": 0.33764648, + "step": 1796, + "time_per_iteration": 2.47190260887146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074227, + "balance_loss_mlp": 1.04103947, + "epoch": 0.34570988841862255, + "flos": 587540749824.0, + "grad_norm": 0.04642061059617041, + "language_loss": 0.86689866, + "learning_rate": 0.0007606247638256749, + "loss": 0.8776409, + "num_input_tokens_seen": 148610064, + "router_z_loss_mlp": 0.33203125, + "step": 1797, + "time_per_iteration": 2.956444025039673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094984, + "balance_loss_mlp": 1.08373046, + "epoch": 0.34590227010388613, + "flos": 1566835439616.0, + "grad_norm": 0.041839887126655914, + "language_loss": 0.78170294, + "learning_rate": 0.0007603588416985798, + "loss": 0.79265279, + "num_input_tokens_seen": 148835872, + "router_z_loss_mlp": 0.11230469, + "step": 1798, + "time_per_iteration": 4.9134039878845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064584, + "balance_loss_mlp": 1.05352104, + "epoch": 0.34609465178914967, + "flos": 1536950177280.0, + "grad_norm": 0.029939636480755576, + "language_loss": 0.79327202, + "learning_rate": 0.0007600928184912179, + "loss": 0.80391788, + "num_input_tokens_seen": 149066864, + "router_z_loss_mlp": 0.11083984, + "step": 1799, + "time_per_iteration": 4.743322849273682 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083924, + "balance_loss_mlp": 1.05054486, + "epoch": 0.34628703347441325, + "flos": 609075998208.0, + "grad_norm": 0.0564087129204809, + "language_loss": 0.85731971, + "learning_rate": 0.0007598266943068686, + "loss": 0.86815894, + "num_input_tokens_seen": 149141600, + "router_z_loss_mlp": 0.33398438, + "step": 1800, + "time_per_iteration": 2.7374043464660645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077797, + "balance_loss_mlp": 1.04603946, + "epoch": 0.3464794151596768, + "flos": 473084596224.0, + "grad_norm": 0.06346922489791823, + "language_loss": 0.83911705, + "learning_rate": 0.0007595604692488507, + "loss": 0.849895, + "num_input_tokens_seen": 149205888, + "router_z_loss_mlp": 0.31738281, + "step": 1801, + "time_per_iteration": 2.5296249389648438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074116, + "balance_loss_mlp": 1.04147625, + "epoch": 0.34667179684494037, + "flos": 605397805056.0, + "grad_norm": 0.05750507090521113, + "language_loss": 0.83014846, + "learning_rate": 0.0007592941434205215, + "loss": 0.84088963, + "num_input_tokens_seen": 149281280, + "router_z_loss_mlp": 0.32641602, + "step": 1802, + "time_per_iteration": 2.758260488510132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017015, + "balance_loss_mlp": 1.00628662, + "epoch": 0.3468641785302039, + "flos": 1564053511680.0, + "grad_norm": 0.014489769518708178, + "language_loss": 0.73571062, + "learning_rate": 0.0007590277169252782, + "loss": 0.74588072, + "num_input_tokens_seen": 149525008, + "router_z_loss_mlp": 0.10742188, + "step": 1803, + "time_per_iteration": 5.078529119491577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069541, + "balance_loss_mlp": 1.0363059, + "epoch": 0.3470565602154675, + "flos": 906902258688.0, + "grad_norm": 0.0666829693597375, + "language_loss": 0.79937375, + "learning_rate": 0.0007587611898665566, + "loss": 0.81006914, + "num_input_tokens_seen": 149600624, + "router_z_loss_mlp": 0.33251953, + "step": 1804, + "time_per_iteration": 3.05087947845459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078664, + "balance_loss_mlp": 1.04621565, + "epoch": 0.347248941900731, + "flos": 638613614592.0, + "grad_norm": 0.050247612363814816, + "language_loss": 0.8218019, + "learning_rate": 0.0007584945623478315, + "loss": 0.83258855, + "num_input_tokens_seen": 149674224, + "router_z_loss_mlp": 0.32446289, + "step": 1805, + "time_per_iteration": 2.8188822269439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071735, + "balance_loss_mlp": 1.03940511, + "epoch": 0.3474413235859946, + "flos": 847009336320.0, + "grad_norm": 0.06830759319763476, + "language_loss": 0.81376302, + "learning_rate": 0.000758227834472617, + "loss": 0.82448041, + "num_input_tokens_seen": 149758688, + "router_z_loss_mlp": 0.32324219, + "step": 1806, + "time_per_iteration": 3.049736976623535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080854, + "balance_loss_mlp": 1.04771423, + "epoch": 0.3476337052712582, + "flos": 515395478016.0, + "grad_norm": 0.0580200838122141, + "language_loss": 0.77365351, + "learning_rate": 0.0007579610063444664, + "loss": 0.78446203, + "num_input_tokens_seen": 149831648, + "router_z_loss_mlp": 0.33154297, + "step": 1807, + "time_per_iteration": 2.768986701965332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072697, + "balance_loss_mlp": 1.03993857, + "epoch": 0.34782608695652173, + "flos": 913161558528.0, + "grad_norm": 0.05804810611861273, + "language_loss": 0.8735044, + "learning_rate": 0.0007576940780669712, + "loss": 0.88423139, + "num_input_tokens_seen": 149919440, + "router_z_loss_mlp": 0.32763672, + "step": 1808, + "time_per_iteration": 3.200984477996826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073414, + "balance_loss_mlp": 1.04041636, + "epoch": 0.3480184686417853, + "flos": 773374099968.0, + "grad_norm": 0.05336970886803796, + "language_loss": 0.84611619, + "learning_rate": 0.0007574270497437624, + "loss": 0.85685027, + "num_input_tokens_seen": 150001632, + "router_z_loss_mlp": 0.33007812, + "step": 1809, + "time_per_iteration": 2.9432260990142822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069124, + "balance_loss_mlp": 1.03619814, + "epoch": 0.34821085032704885, + "flos": 576549840384.0, + "grad_norm": 0.04930975616190813, + "language_loss": 0.87883413, + "learning_rate": 0.000757159921478509, + "loss": 0.88952535, + "num_input_tokens_seen": 150077552, + "router_z_loss_mlp": 0.3293457, + "step": 1810, + "time_per_iteration": 2.769669771194458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079936, + "balance_loss_mlp": 1.06887364, + "epoch": 0.34840323201231244, + "flos": 1524176911872.0, + "grad_norm": 0.03214902088053901, + "language_loss": 0.74450636, + "learning_rate": 0.0007568926933749201, + "loss": 0.75530577, + "num_input_tokens_seen": 150295328, + "router_z_loss_mlp": 0.11083984, + "step": 1811, + "time_per_iteration": 4.764174222946167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107769, + "balance_loss_mlp": 1.04469275, + "epoch": 0.34859561369757597, + "flos": 508910625792.0, + "grad_norm": 0.059132347701423886, + "language_loss": 0.87255216, + "learning_rate": 0.0007566253655367423, + "loss": 0.88332909, + "num_input_tokens_seen": 150360496, + "router_z_loss_mlp": 0.33007812, + "step": 1812, + "time_per_iteration": 2.578930616378784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073116, + "balance_loss_mlp": 1.04014218, + "epoch": 0.34878799538283956, + "flos": 548390117376.0, + "grad_norm": 0.051501554075800156, + "language_loss": 0.89574003, + "learning_rate": 0.000756357938067762, + "loss": 0.90647119, + "num_input_tokens_seen": 150432064, + "router_z_loss_mlp": 0.32983398, + "step": 1813, + "time_per_iteration": 2.6508982181549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079394, + "balance_loss_mlp": 1.04673076, + "epoch": 0.34898037706810314, + "flos": 983251021824.0, + "grad_norm": 0.051360492330316726, + "language_loss": 0.82609868, + "learning_rate": 0.0007560904110718033, + "loss": 0.8368926, + "num_input_tokens_seen": 150512176, + "router_z_loss_mlp": 0.32666016, + "step": 1814, + "time_per_iteration": 3.236894130706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075398, + "balance_loss_mlp": 1.04185271, + "epoch": 0.3491727587533667, + "flos": 681298435584.0, + "grad_norm": 0.05446392192761228, + "language_loss": 0.83478653, + "learning_rate": 0.0007558227846527297, + "loss": 0.84554052, + "num_input_tokens_seen": 150586416, + "router_z_loss_mlp": 0.33569336, + "step": 1815, + "time_per_iteration": 2.8674044609069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074987, + "balance_loss_mlp": 1.04175162, + "epoch": 0.34936514043863026, + "flos": 393811310592.0, + "grad_norm": 0.0691488486506015, + "language_loss": 0.83195454, + "learning_rate": 0.0007555550589144429, + "loss": 0.84270442, + "num_input_tokens_seen": 150648944, + "router_z_loss_mlp": 0.33251953, + "step": 1816, + "time_per_iteration": 2.421494722366333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071292, + "balance_loss_mlp": 1.03917694, + "epoch": 0.3495575221238938, + "flos": 461120256000.0, + "grad_norm": 0.07868701205222765, + "language_loss": 0.8463372, + "learning_rate": 0.000755287233960883, + "loss": 0.85705012, + "num_input_tokens_seen": 150717200, + "router_z_loss_mlp": 0.32104492, + "step": 1817, + "time_per_iteration": 2.54315185546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072544, + "balance_loss_mlp": 1.04023862, + "epoch": 0.3497499038091574, + "flos": 723859600896.0, + "grad_norm": 0.06602653795060065, + "language_loss": 0.77636009, + "learning_rate": 0.0007550193098960292, + "loss": 0.78708553, + "num_input_tokens_seen": 150790368, + "router_z_loss_mlp": 0.32299805, + "step": 1818, + "time_per_iteration": 2.848236560821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076763, + "balance_loss_mlp": 1.04452837, + "epoch": 0.3499422854944209, + "flos": 827364132864.0, + "grad_norm": 0.049816715297611704, + "language_loss": 0.86387616, + "learning_rate": 0.0007547512868238988, + "loss": 0.8746438, + "num_input_tokens_seen": 150879872, + "router_z_loss_mlp": 0.32226562, + "step": 1819, + "time_per_iteration": 3.140552043914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076553, + "balance_loss_mlp": 1.0441277, + "epoch": 0.3501346671796845, + "flos": 493214247936.0, + "grad_norm": 0.049810070694169546, + "language_loss": 0.83196282, + "learning_rate": 0.0007544831648485473, + "loss": 0.84272838, + "num_input_tokens_seen": 150953712, + "router_z_loss_mlp": 0.32421875, + "step": 1820, + "time_per_iteration": 2.7085797786712646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074414, + "balance_loss_mlp": 1.04179859, + "epoch": 0.35032704886494803, + "flos": 578479173120.0, + "grad_norm": 0.05987447994889705, + "language_loss": 0.81237, + "learning_rate": 0.0007542149440740694, + "loss": 0.82311416, + "num_input_tokens_seen": 151026192, + "router_z_loss_mlp": 0.32617188, + "step": 1821, + "time_per_iteration": 2.6648108959198 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075377, + "balance_loss_mlp": 1.0426898, + "epoch": 0.3505194305502116, + "flos": 584383472640.0, + "grad_norm": 0.06285767185927299, + "language_loss": 0.85454488, + "learning_rate": 0.000753946624604597, + "loss": 0.86529863, + "num_input_tokens_seen": 151100720, + "router_z_loss_mlp": 0.3269043, + "step": 1822, + "time_per_iteration": 2.7114102840423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080366, + "balance_loss_mlp": 1.04722571, + "epoch": 0.3507118122354752, + "flos": 526705072128.0, + "grad_norm": 0.056571758739544044, + "language_loss": 0.88259315, + "learning_rate": 0.0007536782065443015, + "loss": 0.89339685, + "num_input_tokens_seen": 151166032, + "router_z_loss_mlp": 0.33154297, + "step": 1823, + "time_per_iteration": 2.6336190700531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077518, + "balance_loss_mlp": 1.04576099, + "epoch": 0.35090419392073874, + "flos": 511269152256.0, + "grad_norm": 0.06612506998948281, + "language_loss": 0.74917412, + "learning_rate": 0.0007534096899973919, + "loss": 0.75994933, + "num_input_tokens_seen": 151232208, + "router_z_loss_mlp": 0.31738281, + "step": 1824, + "time_per_iteration": 2.5683584213256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108279, + "balance_loss_mlp": 1.05069852, + "epoch": 0.3510965756060023, + "flos": 563728522752.0, + "grad_norm": 0.05207355522992398, + "language_loss": 0.82511663, + "learning_rate": 0.0007531410750681154, + "loss": 0.83594453, + "num_input_tokens_seen": 151308128, + "router_z_loss_mlp": 0.32080078, + "step": 1825, + "time_per_iteration": 2.7370071411132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108428, + "balance_loss_mlp": 1.05207014, + "epoch": 0.35128895729126586, + "flos": 1020107146752.0, + "grad_norm": 0.05855996344544413, + "language_loss": 0.86223209, + "learning_rate": 0.0007528723618607575, + "loss": 0.87307489, + "num_input_tokens_seen": 151402560, + "router_z_loss_mlp": 0.32202148, + "step": 1826, + "time_per_iteration": 3.4230828285217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080919, + "balance_loss_mlp": 1.04889941, + "epoch": 0.35148133897652944, + "flos": 587972915712.0, + "grad_norm": 0.06514472806491804, + "language_loss": 0.82370871, + "learning_rate": 0.0007526035504796422, + "loss": 0.8345179, + "num_input_tokens_seen": 151478816, + "router_z_loss_mlp": 0.32006836, + "step": 1827, + "time_per_iteration": 2.7472023963928223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083394, + "balance_loss_mlp": 1.05011046, + "epoch": 0.351673720661793, + "flos": 495054830592.0, + "grad_norm": 0.13631276803870807, + "language_loss": 0.86120903, + "learning_rate": 0.0007523346410291312, + "loss": 0.87204289, + "num_input_tokens_seen": 151554528, + "router_z_loss_mlp": 0.33300781, + "step": 1828, + "time_per_iteration": 2.7665555477142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080213, + "balance_loss_mlp": 1.04757404, + "epoch": 0.35186610234705656, + "flos": 762339520512.0, + "grad_norm": 0.04983334941453678, + "language_loss": 0.85021639, + "learning_rate": 0.0007520656336136245, + "loss": 0.86101854, + "num_input_tokens_seen": 151629440, + "router_z_loss_mlp": 0.32641602, + "step": 1829, + "time_per_iteration": 2.9405102729797363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080442, + "balance_loss_mlp": 1.04847038, + "epoch": 0.3520584840323201, + "flos": 625822820352.0, + "grad_norm": 0.049266285647792965, + "language_loss": 0.87560928, + "learning_rate": 0.0007517965283375599, + "loss": 0.88641369, + "num_input_tokens_seen": 151708544, + "router_z_loss_mlp": 0.31958008, + "step": 1830, + "time_per_iteration": 2.8742456436157227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076907, + "balance_loss_mlp": 1.04429162, + "epoch": 0.3522508657175837, + "flos": 537124193280.0, + "grad_norm": 0.05152278098600794, + "language_loss": 0.8913554, + "learning_rate": 0.0007515273253054132, + "loss": 0.90212452, + "num_input_tokens_seen": 151779152, + "router_z_loss_mlp": 0.32617188, + "step": 1831, + "time_per_iteration": 2.647270917892456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085251, + "balance_loss_mlp": 1.0506562, + "epoch": 0.35244324740284727, + "flos": 567105560064.0, + "grad_norm": 0.052396269804254075, + "language_loss": 0.82697165, + "learning_rate": 0.0007512580246216988, + "loss": 0.83782411, + "num_input_tokens_seen": 151853216, + "router_z_loss_mlp": 0.34643555, + "step": 1832, + "time_per_iteration": 2.6887552738189697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079591, + "balance_loss_mlp": 1.04673672, + "epoch": 0.3526356290881108, + "flos": 512809989120.0, + "grad_norm": 0.05749675796225481, + "language_loss": 0.85263908, + "learning_rate": 0.000750988626390968, + "loss": 0.86343497, + "num_input_tokens_seen": 151920416, + "router_z_loss_mlp": 0.32861328, + "step": 1833, + "time_per_iteration": 2.6013457775115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080641, + "balance_loss_mlp": 1.04781032, + "epoch": 0.3528280107733744, + "flos": 595496627712.0, + "grad_norm": 0.05344239959905588, + "language_loss": 0.84880912, + "learning_rate": 0.0007507191307178108, + "loss": 0.8596155, + "num_input_tokens_seen": 151990848, + "router_z_loss_mlp": 0.32836914, + "step": 1834, + "time_per_iteration": 2.7490363121032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080747, + "balance_loss_mlp": 1.04682052, + "epoch": 0.3530203924586379, + "flos": 550969814016.0, + "grad_norm": 0.06515988244691072, + "language_loss": 0.74431223, + "learning_rate": 0.0007504495377068543, + "loss": 0.75511968, + "num_input_tokens_seen": 152064864, + "router_z_loss_mlp": 0.33959961, + "step": 1835, + "time_per_iteration": 2.729029417037964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084115, + "balance_loss_mlp": 1.04925871, + "epoch": 0.3532127741439015, + "flos": 652662876672.0, + "grad_norm": 0.06759605529963146, + "language_loss": 0.81589389, + "learning_rate": 0.0007501798474627642, + "loss": 0.82673502, + "num_input_tokens_seen": 152150096, + "router_z_loss_mlp": 0.34912109, + "step": 1836, + "time_per_iteration": 2.9048030376434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080775, + "balance_loss_mlp": 1.04708695, + "epoch": 0.35340515582916504, + "flos": 722452594176.0, + "grad_norm": 0.055893281392717674, + "language_loss": 0.83221173, + "learning_rate": 0.0007499100600902433, + "loss": 0.84301955, + "num_input_tokens_seen": 152232528, + "router_z_loss_mlp": 0.3371582, + "step": 1837, + "time_per_iteration": 2.9900574684143066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080937, + "balance_loss_mlp": 1.0464375, + "epoch": 0.35359753751442863, + "flos": 594619301376.0, + "grad_norm": 0.06113982905710786, + "language_loss": 0.84191763, + "learning_rate": 0.0007496401756940324, + "loss": 0.852727, + "num_input_tokens_seen": 152299584, + "router_z_loss_mlp": 0.34545898, + "step": 1838, + "time_per_iteration": 2.6746203899383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079632, + "balance_loss_mlp": 1.04575253, + "epoch": 0.3537899191996922, + "flos": 632384838144.0, + "grad_norm": 0.05956961248716192, + "language_loss": 0.82392603, + "learning_rate": 0.0007493701943789098, + "loss": 0.8347224, + "num_input_tokens_seen": 152370368, + "router_z_loss_mlp": 0.33886719, + "step": 1839, + "time_per_iteration": 2.773550510406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089423, + "balance_loss_mlp": 1.05590141, + "epoch": 0.35398230088495575, + "flos": 506118523392.0, + "grad_norm": 0.05410374630174333, + "language_loss": 0.82311571, + "learning_rate": 0.000749100116249692, + "loss": 0.83400989, + "num_input_tokens_seen": 152436928, + "router_z_loss_mlp": 0.33544922, + "step": 1840, + "time_per_iteration": 2.6255862712860107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089923, + "balance_loss_mlp": 1.05649722, + "epoch": 0.35417468257021933, + "flos": 507783015936.0, + "grad_norm": 0.06109989504264522, + "language_loss": 0.86315167, + "learning_rate": 0.0007488299414112321, + "loss": 0.87405092, + "num_input_tokens_seen": 152505952, + "router_z_loss_mlp": 0.33447266, + "step": 1841, + "time_per_iteration": 2.5875229835510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088685, + "balance_loss_mlp": 1.05552149, + "epoch": 0.35436706425548287, + "flos": 656133046272.0, + "grad_norm": 0.05742985112465967, + "language_loss": 0.77833533, + "learning_rate": 0.0007485596699684215, + "loss": 0.78922212, + "num_input_tokens_seen": 152577408, + "router_z_loss_mlp": 0.33178711, + "step": 1842, + "time_per_iteration": 2.819591760635376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092886, + "balance_loss_mlp": 1.05903101, + "epoch": 0.35455944594074645, + "flos": 652322433024.0, + "grad_norm": 0.047878329403948795, + "language_loss": 0.85455877, + "learning_rate": 0.000748289302026189, + "loss": 0.86548758, + "num_input_tokens_seen": 152654480, + "router_z_loss_mlp": 0.33886719, + "step": 1843, + "time_per_iteration": 2.829897880554199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088118, + "balance_loss_mlp": 1.05569351, + "epoch": 0.35475182762601, + "flos": 848240252928.0, + "grad_norm": 0.06279452498251797, + "language_loss": 0.85658133, + "learning_rate": 0.0007480188376895004, + "loss": 0.86746252, + "num_input_tokens_seen": 152732304, + "router_z_loss_mlp": 0.32421875, + "step": 1844, + "time_per_iteration": 3.067828893661499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085552, + "balance_loss_mlp": 1.07358336, + "epoch": 0.3549442093112736, + "flos": 1520644133376.0, + "grad_norm": 0.027370210450034033, + "language_loss": 0.7381134, + "learning_rate": 0.0007477482770633596, + "loss": 0.7489689, + "num_input_tokens_seen": 152965952, + "router_z_loss_mlp": 0.11962891, + "step": 1845, + "time_per_iteration": 4.860119342803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087227, + "balance_loss_mlp": 1.05401564, + "epoch": 0.3551365909965371, + "flos": 651087134208.0, + "grad_norm": 0.057022057365061586, + "language_loss": 0.7840451, + "learning_rate": 0.0007474776202528074, + "loss": 0.79491735, + "num_input_tokens_seen": 153053088, + "router_z_loss_mlp": 0.33227539, + "step": 1846, + "time_per_iteration": 2.924600601196289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081626, + "balance_loss_mlp": 1.04877198, + "epoch": 0.3553289726818007, + "flos": 897094213632.0, + "grad_norm": 0.05655103479540665, + "language_loss": 0.81245291, + "learning_rate": 0.000747206867362922, + "loss": 0.82326913, + "num_input_tokens_seen": 153129216, + "router_z_loss_mlp": 0.32861328, + "step": 1847, + "time_per_iteration": 3.0635437965393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078587, + "balance_loss_mlp": 1.0454942, + "epoch": 0.3555213543670643, + "flos": 688181958144.0, + "grad_norm": 0.057996459019562165, + "language_loss": 0.83748043, + "learning_rate": 0.0007469360184988194, + "loss": 0.84826624, + "num_input_tokens_seen": 153199360, + "router_z_loss_mlp": 0.33105469, + "step": 1848, + "time_per_iteration": 2.816774606704712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072989, + "balance_loss_mlp": 1.03977752, + "epoch": 0.3557137360523278, + "flos": 538305647616.0, + "grad_norm": 0.0578078380794177, + "language_loss": 0.87284935, + "learning_rate": 0.0007466650737656518, + "loss": 0.88357925, + "num_input_tokens_seen": 153269168, + "router_z_loss_mlp": 0.33203125, + "step": 1849, + "time_per_iteration": 2.611743927001953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075842, + "balance_loss_mlp": 1.04208231, + "epoch": 0.3559061177375914, + "flos": 402039230976.0, + "grad_norm": 0.05251231214094578, + "language_loss": 0.90093362, + "learning_rate": 0.0007463940332686098, + "loss": 0.91169202, + "num_input_tokens_seen": 153333120, + "router_z_loss_mlp": 0.33789062, + "step": 1850, + "time_per_iteration": 2.4692726135253906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073866, + "balance_loss_mlp": 1.04017735, + "epoch": 0.35609849942285493, + "flos": 696238170624.0, + "grad_norm": 0.04795835093932571, + "language_loss": 0.84167922, + "learning_rate": 0.0007461228971129205, + "loss": 0.85241795, + "num_input_tokens_seen": 153407600, + "router_z_loss_mlp": 0.33691406, + "step": 1851, + "time_per_iteration": 2.894505023956299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106997, + "balance_loss_mlp": 1.0372591, + "epoch": 0.3562908811081185, + "flos": 568660953600.0, + "grad_norm": 0.055081415669052246, + "language_loss": 0.85513294, + "learning_rate": 0.0007458516654038483, + "loss": 0.86583257, + "num_input_tokens_seen": 153477408, + "router_z_loss_mlp": 0.32714844, + "step": 1852, + "time_per_iteration": 2.678018569946289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076636, + "balance_loss_mlp": 1.0421133, + "epoch": 0.35648326279338205, + "flos": 682081219584.0, + "grad_norm": 0.04842584798560518, + "language_loss": 0.8668319, + "learning_rate": 0.0007455803382466946, + "loss": 0.87759829, + "num_input_tokens_seen": 153551888, + "router_z_loss_mlp": 0.34545898, + "step": 1853, + "time_per_iteration": 2.795799493789673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081318, + "balance_loss_mlp": 1.04674757, + "epoch": 0.35667564447864564, + "flos": 628840475136.0, + "grad_norm": 0.04891463031827082, + "language_loss": 0.87319207, + "learning_rate": 0.0007453089157467979, + "loss": 0.88400525, + "num_input_tokens_seen": 153626912, + "router_z_loss_mlp": 0.34594727, + "step": 1854, + "time_per_iteration": 2.7683348655700684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084525, + "balance_loss_mlp": 1.05000162, + "epoch": 0.35686802616390917, + "flos": 813685837824.0, + "grad_norm": 0.04901692214928195, + "language_loss": 0.81941634, + "learning_rate": 0.0007450373980095341, + "loss": 0.83026159, + "num_input_tokens_seen": 153711312, + "router_z_loss_mlp": 0.34545898, + "step": 1855, + "time_per_iteration": 3.069664716720581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088761, + "balance_loss_mlp": 1.0541904, + "epoch": 0.35706040784917276, + "flos": 525922288128.0, + "grad_norm": 0.06393454459125486, + "language_loss": 0.86792582, + "learning_rate": 0.0007447657851403155, + "loss": 0.87881339, + "num_input_tokens_seen": 153780208, + "router_z_loss_mlp": 0.34619141, + "step": 1856, + "time_per_iteration": 2.6120662689208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081268, + "balance_loss_mlp": 1.04793692, + "epoch": 0.35725278953443634, + "flos": 511698345984.0, + "grad_norm": 0.060959809088696394, + "language_loss": 0.78963649, + "learning_rate": 0.0007444940772445915, + "loss": 0.80044913, + "num_input_tokens_seen": 153853152, + "router_z_loss_mlp": 0.33349609, + "step": 1857, + "time_per_iteration": 2.802053689956665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079861, + "balance_loss_mlp": 1.04653037, + "epoch": 0.3574451712196999, + "flos": 487162971648.0, + "grad_norm": 0.06448223618511208, + "language_loss": 0.80338144, + "learning_rate": 0.0007442222744278484, + "loss": 0.81418002, + "num_input_tokens_seen": 153924160, + "router_z_loss_mlp": 0.33349609, + "step": 1858, + "time_per_iteration": 2.660689353942871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080529, + "balance_loss_mlp": 1.04765141, + "epoch": 0.35763755290496346, + "flos": 550384879104.0, + "grad_norm": 0.061962253699798436, + "language_loss": 0.84126002, + "learning_rate": 0.0007439503767956099, + "loss": 0.85206527, + "num_input_tokens_seen": 153998688, + "router_z_loss_mlp": 0.32885742, + "step": 1859, + "time_per_iteration": 2.7479875087738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080547, + "balance_loss_mlp": 1.06767237, + "epoch": 0.357829934590227, + "flos": 1503300791808.0, + "grad_norm": 0.035903025748828234, + "language_loss": 0.79671603, + "learning_rate": 0.0007436783844534352, + "loss": 0.80752152, + "num_input_tokens_seen": 154230960, + "router_z_loss_mlp": 0.12890625, + "step": 1860, + "time_per_iteration": 4.900041580200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077433, + "balance_loss_mlp": 1.04479325, + "epoch": 0.3580223162754906, + "flos": 568410670080.0, + "grad_norm": 0.040558802150678905, + "language_loss": 0.85799539, + "learning_rate": 0.000743406297506922, + "loss": 0.86876976, + "num_input_tokens_seen": 154309104, + "router_z_loss_mlp": 0.32641602, + "step": 1861, + "time_per_iteration": 2.701162576675415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084428, + "balance_loss_mlp": 1.05107355, + "epoch": 0.3582146979607541, + "flos": 626153089536.0, + "grad_norm": 0.04686630584337546, + "language_loss": 0.8419295, + "learning_rate": 0.0007431341160617031, + "loss": 0.85277379, + "num_input_tokens_seen": 154387424, + "router_z_loss_mlp": 0.33374023, + "step": 1862, + "time_per_iteration": 2.860173463821411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085309, + "balance_loss_mlp": 1.05266929, + "epoch": 0.3584070796460177, + "flos": 507010406400.0, + "grad_norm": 0.04939599291948986, + "language_loss": 0.88143289, + "learning_rate": 0.0007428618402234491, + "loss": 0.89228594, + "num_input_tokens_seen": 154459952, + "router_z_loss_mlp": 0.32641602, + "step": 1863, + "time_per_iteration": 2.62233567237854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083488, + "balance_loss_mlp": 1.05030036, + "epoch": 0.3585994613312813, + "flos": 606190763520.0, + "grad_norm": 0.051497717495276533, + "language_loss": 0.80248374, + "learning_rate": 0.0007425894700978668, + "loss": 0.81331861, + "num_input_tokens_seen": 154535456, + "router_z_loss_mlp": 0.33203125, + "step": 1864, + "time_per_iteration": 2.711484670639038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087216, + "balance_loss_mlp": 1.05424261, + "epoch": 0.3587918430165448, + "flos": 1412338484736.0, + "grad_norm": 0.047877863134497434, + "language_loss": 0.79232943, + "learning_rate": 0.0007423170057906996, + "loss": 0.80320162, + "num_input_tokens_seen": 154627568, + "router_z_loss_mlp": 0.32983398, + "step": 1865, + "time_per_iteration": 3.852776527404785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091453, + "balance_loss_mlp": 1.05821717, + "epoch": 0.3589842247018084, + "flos": 478313800704.0, + "grad_norm": 0.06431447428318769, + "language_loss": 0.85827845, + "learning_rate": 0.0007420444474077275, + "loss": 0.86919296, + "num_input_tokens_seen": 154694640, + "router_z_loss_mlp": 0.33251953, + "step": 1866, + "time_per_iteration": 2.6104037761688232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101796, + "balance_loss_mlp": 1.06829846, + "epoch": 0.35917660638707194, + "flos": 504464205312.0, + "grad_norm": 0.06438653143979521, + "language_loss": 0.89830631, + "learning_rate": 0.0007417717950547671, + "loss": 0.90932429, + "num_input_tokens_seen": 154762048, + "router_z_loss_mlp": 0.33520508, + "step": 1867, + "time_per_iteration": 2.5619330406188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108848, + "balance_loss_mlp": 1.09687901, + "epoch": 0.3593689880723355, + "flos": 1491294191616.0, + "grad_norm": 0.037524520389889536, + "language_loss": 0.75996608, + "learning_rate": 0.0007414990488376713, + "loss": 0.77105457, + "num_input_tokens_seen": 154989952, + "router_z_loss_mlp": 0.11962891, + "step": 1868, + "time_per_iteration": 4.971943378448486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096427, + "balance_loss_mlp": 1.06388319, + "epoch": 0.35956136975759906, + "flos": 528369564672.0, + "grad_norm": 0.050983088733796166, + "language_loss": 0.84612024, + "learning_rate": 0.0007412262088623299, + "loss": 0.85708451, + "num_input_tokens_seen": 155066992, + "router_z_loss_mlp": 0.32543945, + "step": 1869, + "time_per_iteration": 2.7295072078704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104656, + "balance_loss_mlp": 1.07142007, + "epoch": 0.35975375144286265, + "flos": 534647803392.0, + "grad_norm": 0.057848782745497714, + "language_loss": 0.79012549, + "learning_rate": 0.0007409532752346684, + "loss": 0.80117208, + "num_input_tokens_seen": 155137616, + "router_z_loss_mlp": 0.33251953, + "step": 1870, + "time_per_iteration": 2.74664568901062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107858, + "balance_loss_mlp": 1.07464623, + "epoch": 0.3599461331281262, + "flos": 504695549952.0, + "grad_norm": 0.054621035664709404, + "language_loss": 0.88661271, + "learning_rate": 0.0007406802480606491, + "loss": 0.89769125, + "num_input_tokens_seen": 155209248, + "router_z_loss_mlp": 0.33227539, + "step": 1871, + "time_per_iteration": 2.636101722717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102516, + "balance_loss_mlp": 1.06923246, + "epoch": 0.36013851481338977, + "flos": 511283708928.0, + "grad_norm": 0.05849515281409536, + "language_loss": 0.90592384, + "learning_rate": 0.0007404071274462707, + "loss": 0.91694903, + "num_input_tokens_seen": 155274176, + "router_z_loss_mlp": 0.33300781, + "step": 1872, + "time_per_iteration": 2.559588670730591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098641, + "balance_loss_mlp": 1.06533384, + "epoch": 0.36033089649865335, + "flos": 547330908672.0, + "grad_norm": 0.06237198940644659, + "language_loss": 0.8363173, + "learning_rate": 0.0007401339134975682, + "loss": 0.84730369, + "num_input_tokens_seen": 155343232, + "router_z_loss_mlp": 0.33325195, + "step": 1873, + "time_per_iteration": 2.6156845092773438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100197, + "balance_loss_mlp": 1.06617522, + "epoch": 0.3605232781839169, + "flos": 458416903680.0, + "grad_norm": 0.05108892475659159, + "language_loss": 0.84187275, + "learning_rate": 0.0007398606063206122, + "loss": 0.85287476, + "num_input_tokens_seen": 155410080, + "router_z_loss_mlp": 0.34033203, + "step": 1874, + "time_per_iteration": 2.6152756214141846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090341, + "balance_loss_mlp": 1.05729628, + "epoch": 0.36071565986918047, + "flos": 509309296128.0, + "grad_norm": 0.05589329807105905, + "language_loss": 0.7857852, + "learning_rate": 0.0007395872060215101, + "loss": 0.79668868, + "num_input_tokens_seen": 155476240, + "router_z_loss_mlp": 0.33056641, + "step": 1875, + "time_per_iteration": 2.592906951904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095107, + "balance_loss_mlp": 1.06230044, + "epoch": 0.360908041554444, + "flos": 558931484160.0, + "grad_norm": 0.12468103825296885, + "language_loss": 0.88329792, + "learning_rate": 0.0007393137127064056, + "loss": 0.89424896, + "num_input_tokens_seen": 155543392, + "router_z_loss_mlp": 0.328125, + "step": 1876, + "time_per_iteration": 2.629368782043457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109653, + "balance_loss_mlp": 1.06300879, + "epoch": 0.3611004232397076, + "flos": 523588492800.0, + "grad_norm": 0.05189881397754868, + "language_loss": 0.84167802, + "learning_rate": 0.0007390401264814779, + "loss": 0.85264337, + "num_input_tokens_seen": 155613264, + "router_z_loss_mlp": 0.33544922, + "step": 1877, + "time_per_iteration": 2.644322156906128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084049, + "balance_loss_mlp": 1.05179131, + "epoch": 0.3612928049249711, + "flos": 540728193024.0, + "grad_norm": 0.07312313725982984, + "language_loss": 0.84472072, + "learning_rate": 0.0007387664474529427, + "loss": 0.8555612, + "num_input_tokens_seen": 155683712, + "router_z_loss_mlp": 0.32250977, + "step": 1878, + "time_per_iteration": 2.6105034351348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094149, + "balance_loss_mlp": 1.06131935, + "epoch": 0.3614851866102347, + "flos": 552289480704.0, + "grad_norm": 0.06338398309504269, + "language_loss": 0.9129535, + "learning_rate": 0.0007384926757270518, + "loss": 0.923895, + "num_input_tokens_seen": 155751760, + "router_z_loss_mlp": 0.32836914, + "step": 1879, + "time_per_iteration": 2.6268200874328613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082708, + "balance_loss_mlp": 1.05023539, + "epoch": 0.36167756829549824, + "flos": 771734338560.0, + "grad_norm": 0.048672507925477976, + "language_loss": 0.79680419, + "learning_rate": 0.0007382188114100924, + "loss": 0.80763125, + "num_input_tokens_seen": 155830464, + "router_z_loss_mlp": 0.32470703, + "step": 1880, + "time_per_iteration": 2.9548373222351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078144, + "balance_loss_mlp": 1.04509938, + "epoch": 0.36186994998076183, + "flos": 711560609280.0, + "grad_norm": 0.04804943389379678, + "language_loss": 0.81544787, + "learning_rate": 0.0007379448546083884, + "loss": 0.82622933, + "num_input_tokens_seen": 155906208, + "router_z_loss_mlp": 0.33056641, + "step": 1881, + "time_per_iteration": 2.900480031967163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082413, + "balance_loss_mlp": 1.04884315, + "epoch": 0.3620623316660254, + "flos": 747209138688.0, + "grad_norm": 0.049920719635936736, + "language_loss": 0.88019323, + "learning_rate": 0.0007376708054282992, + "loss": 0.89101738, + "num_input_tokens_seen": 155983584, + "router_z_loss_mlp": 0.3359375, + "step": 1882, + "time_per_iteration": 2.9482829570770264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075385, + "balance_loss_mlp": 1.04286492, + "epoch": 0.36225471335128895, + "flos": 482312088576.0, + "grad_norm": 0.04692483307288239, + "language_loss": 0.83908749, + "learning_rate": 0.0007373966639762201, + "loss": 0.84984136, + "num_input_tokens_seen": 156052464, + "router_z_loss_mlp": 0.32519531, + "step": 1883, + "time_per_iteration": 2.597809076309204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078049, + "balance_loss_mlp": 1.0448606, + "epoch": 0.36244709503655254, + "flos": 506655406080.0, + "grad_norm": 0.0703007209611724, + "language_loss": 0.8835175, + "learning_rate": 0.0007371224303585822, + "loss": 0.89429802, + "num_input_tokens_seen": 156121424, + "router_z_loss_mlp": 0.33203125, + "step": 1884, + "time_per_iteration": 2.5686471462249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046189, + "balance_loss_mlp": 1.03360081, + "epoch": 0.36263947672181607, + "flos": 1393302643200.0, + "grad_norm": 0.020620128786032376, + "language_loss": 0.80357069, + "learning_rate": 0.0007368481046818524, + "loss": 0.81403255, + "num_input_tokens_seen": 156346144, + "router_z_loss_mlp": 0.12597656, + "step": 1885, + "time_per_iteration": 4.68831205368042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073343, + "balance_loss_mlp": 1.03939199, + "epoch": 0.36283185840707965, + "flos": 652991735808.0, + "grad_norm": 0.05943029236677907, + "language_loss": 0.82845902, + "learning_rate": 0.0007365736870525335, + "loss": 0.83919251, + "num_input_tokens_seen": 156420880, + "router_z_loss_mlp": 0.33959961, + "step": 1886, + "time_per_iteration": 2.8566346168518066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070323, + "balance_loss_mlp": 1.0373739, + "epoch": 0.3630242400923432, + "flos": 488619440640.0, + "grad_norm": 0.06703223685064427, + "language_loss": 0.82574463, + "learning_rate": 0.000736299177577164, + "loss": 0.83644783, + "num_input_tokens_seen": 156485616, + "router_z_loss_mlp": 0.32958984, + "step": 1887, + "time_per_iteration": 2.5848894119262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074233, + "balance_loss_mlp": 1.04130709, + "epoch": 0.3632166217776068, + "flos": 516892644864.0, + "grad_norm": 0.0626455482667494, + "language_loss": 0.83844066, + "learning_rate": 0.0007360245763623174, + "loss": 0.84918302, + "num_input_tokens_seen": 156557840, + "router_z_loss_mlp": 0.3293457, + "step": 1888, + "time_per_iteration": 2.6179397106170654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067525, + "balance_loss_mlp": 1.03564882, + "epoch": 0.36340900346287036, + "flos": 645881250816.0, + "grad_norm": 0.06111369810549259, + "language_loss": 0.89794236, + "learning_rate": 0.0007357498835146039, + "loss": 0.90861762, + "num_input_tokens_seen": 156632496, + "router_z_loss_mlp": 0.31860352, + "step": 1889, + "time_per_iteration": 2.8311662673950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070281, + "balance_loss_mlp": 1.03854752, + "epoch": 0.3636013851481339, + "flos": 553057708032.0, + "grad_norm": 0.0568549422608731, + "language_loss": 0.86402494, + "learning_rate": 0.0007354750991406684, + "loss": 0.87472773, + "num_input_tokens_seen": 156705296, + "router_z_loss_mlp": 0.31713867, + "step": 1890, + "time_per_iteration": 2.6922197341918945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072041, + "balance_loss_mlp": 1.03952074, + "epoch": 0.3637937668333975, + "flos": 546395355648.0, + "grad_norm": 0.053455628499382915, + "language_loss": 0.80957252, + "learning_rate": 0.0007352002233471919, + "loss": 0.82029295, + "num_input_tokens_seen": 156773376, + "router_z_loss_mlp": 0.32519531, + "step": 1891, + "time_per_iteration": 2.621241569519043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066281, + "balance_loss_mlp": 1.03371286, + "epoch": 0.363986148518661, + "flos": 537838576128.0, + "grad_norm": 0.07508945751401845, + "language_loss": 0.79549944, + "learning_rate": 0.0007349252562408906, + "loss": 0.80616224, + "num_input_tokens_seen": 156844336, + "router_z_loss_mlp": 0.32543945, + "step": 1892, + "time_per_iteration": 2.674318552017212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069175, + "balance_loss_mlp": 1.0372504, + "epoch": 0.3641785302039246, + "flos": 659895607296.0, + "grad_norm": 0.04761623500947703, + "language_loss": 0.81258041, + "learning_rate": 0.0007346501979285158, + "loss": 0.82327211, + "num_input_tokens_seen": 156918848, + "router_z_loss_mlp": 0.3190918, + "step": 1893, + "time_per_iteration": 2.8671226501464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035364, + "balance_loss_mlp": 1.0238719, + "epoch": 0.36437091188918813, + "flos": 1467911158272.0, + "grad_norm": 0.02143179240630706, + "language_loss": 0.80539101, + "learning_rate": 0.0007343750485168551, + "loss": 0.81574464, + "num_input_tokens_seen": 157134736, + "router_z_loss_mlp": 0.11474609, + "step": 1894, + "time_per_iteration": 4.7720019817352295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075531, + "balance_loss_mlp": 1.04248571, + "epoch": 0.3645632935744517, + "flos": 597012733440.0, + "grad_norm": 0.049808711864839754, + "language_loss": 0.85850054, + "learning_rate": 0.0007340998081127308, + "loss": 0.8692559, + "num_input_tokens_seen": 157211920, + "router_z_loss_mlp": 0.33056641, + "step": 1895, + "time_per_iteration": 2.753730058670044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081037, + "balance_loss_mlp": 1.0486834, + "epoch": 0.36475567525971525, + "flos": 599214108672.0, + "grad_norm": 0.05384470863640996, + "language_loss": 0.9063257, + "learning_rate": 0.0007338244768230007, + "loss": 0.91713607, + "num_input_tokens_seen": 157284224, + "router_z_loss_mlp": 0.32348633, + "step": 1896, + "time_per_iteration": 2.749844551086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084345, + "balance_loss_mlp": 1.05189669, + "epoch": 0.36494805694497884, + "flos": 798047686656.0, + "grad_norm": 0.12041633701688108, + "language_loss": 0.88843018, + "learning_rate": 0.0007335490547545578, + "loss": 0.89927363, + "num_input_tokens_seen": 157367920, + "router_z_loss_mlp": 0.32446289, + "step": 1897, + "time_per_iteration": 3.0181519985198975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089996, + "balance_loss_mlp": 1.05795288, + "epoch": 0.3651404386302424, + "flos": 637023315456.0, + "grad_norm": 0.06340749789439089, + "language_loss": 0.82377589, + "learning_rate": 0.0007332735420143308, + "loss": 0.83467579, + "num_input_tokens_seen": 157438672, + "router_z_loss_mlp": 0.3203125, + "step": 1898, + "time_per_iteration": 2.7370855808258057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077681, + "balance_loss_mlp": 1.04442132, + "epoch": 0.36533282031550596, + "flos": 491337349632.0, + "grad_norm": 0.05751458989837244, + "language_loss": 0.8663426, + "learning_rate": 0.0007329979387092826, + "loss": 0.87711942, + "num_input_tokens_seen": 157505888, + "router_z_loss_mlp": 0.33276367, + "step": 1899, + "time_per_iteration": 2.552072048187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076766, + "balance_loss_mlp": 1.0440551, + "epoch": 0.36552520200076954, + "flos": 855587874816.0, + "grad_norm": 0.050366197091212025, + "language_loss": 0.83863711, + "learning_rate": 0.0007327222449464124, + "loss": 0.84940481, + "num_input_tokens_seen": 157601568, + "router_z_loss_mlp": 0.32714844, + "step": 1900, + "time_per_iteration": 3.2450594902038574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072697, + "balance_loss_mlp": 1.03936601, + "epoch": 0.3657175836860331, + "flos": 483449872896.0, + "grad_norm": 0.053278478248789174, + "language_loss": 0.88864619, + "learning_rate": 0.0007324464608327538, + "loss": 0.89937317, + "num_input_tokens_seen": 157670992, + "router_z_loss_mlp": 0.33349609, + "step": 1901, + "time_per_iteration": 2.6027348041534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072333, + "balance_loss_mlp": 1.03957391, + "epoch": 0.36590996537129666, + "flos": 434561006592.0, + "grad_norm": 0.058664113400220264, + "language_loss": 0.88440275, + "learning_rate": 0.0007321705864753758, + "loss": 0.8951261, + "num_input_tokens_seen": 157743616, + "router_z_loss_mlp": 0.32763672, + "step": 1902, + "time_per_iteration": 2.668935537338257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073172, + "balance_loss_mlp": 1.03950715, + "epoch": 0.3661023470565602, + "flos": 711880704000.0, + "grad_norm": 0.047699393186438684, + "language_loss": 0.84307706, + "learning_rate": 0.0007318946219813823, + "loss": 0.85380876, + "num_input_tokens_seen": 157823520, + "router_z_loss_mlp": 0.33691406, + "step": 1903, + "time_per_iteration": 3.025866985321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074477, + "balance_loss_mlp": 1.04100275, + "epoch": 0.3662947287418238, + "flos": 564495340032.0, + "grad_norm": 0.05797091317965262, + "language_loss": 0.90078342, + "learning_rate": 0.000731618567457912, + "loss": 0.91152817, + "num_input_tokens_seen": 157893248, + "router_z_loss_mlp": 0.3347168, + "step": 1904, + "time_per_iteration": 2.6391115188598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073585, + "balance_loss_mlp": 1.03937197, + "epoch": 0.3664871104270873, + "flos": 789391982592.0, + "grad_norm": 0.05925410463566973, + "language_loss": 0.87083924, + "learning_rate": 0.000731342423012139, + "loss": 0.88157511, + "num_input_tokens_seen": 157973216, + "router_z_loss_mlp": 0.3425293, + "step": 1905, + "time_per_iteration": 3.020660400390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070848, + "balance_loss_mlp": 1.03682566, + "epoch": 0.3666794921123509, + "flos": 752202616320.0, + "grad_norm": 0.06601024748857935, + "language_loss": 0.82244205, + "learning_rate": 0.0007310661887512722, + "loss": 0.83315057, + "num_input_tokens_seen": 158051088, + "router_z_loss_mlp": 0.34033203, + "step": 1906, + "time_per_iteration": 3.0128185749053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068568, + "balance_loss_mlp": 1.03571391, + "epoch": 0.3668718737976145, + "flos": 523264015872.0, + "grad_norm": 0.04853340441162438, + "language_loss": 0.82115662, + "learning_rate": 0.0007307898647825549, + "loss": 0.8318423, + "num_input_tokens_seen": 158124368, + "router_z_loss_mlp": 0.32861328, + "step": 1907, + "time_per_iteration": 2.6610257625579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075142, + "balance_loss_mlp": 1.04126275, + "epoch": 0.367064255482878, + "flos": 571698957312.0, + "grad_norm": 0.05773956677348378, + "language_loss": 0.89470363, + "learning_rate": 0.0007305134512132659, + "loss": 0.90545505, + "num_input_tokens_seen": 158191472, + "router_z_loss_mlp": 0.33886719, + "step": 1908, + "time_per_iteration": 2.706658124923706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076976, + "balance_loss_mlp": 1.04309678, + "epoch": 0.3672566371681416, + "flos": 446880347136.0, + "grad_norm": 0.0894454707503668, + "language_loss": 0.83388865, + "learning_rate": 0.0007302369481507183, + "loss": 0.84465849, + "num_input_tokens_seen": 158254384, + "router_z_loss_mlp": 0.33911133, + "step": 1909, + "time_per_iteration": 2.499483346939087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049259, + "balance_loss_mlp": 1.03838694, + "epoch": 0.36744901885340514, + "flos": 1539275208192.0, + "grad_norm": 0.032302576214162944, + "language_loss": 0.79961759, + "learning_rate": 0.00072996035570226, + "loss": 0.81011015, + "num_input_tokens_seen": 158486160, + "router_z_loss_mlp": 0.10888672, + "step": 1910, + "time_per_iteration": 4.86514949798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088721, + "balance_loss_mlp": 1.05476999, + "epoch": 0.36764140053866873, + "flos": 563417192448.0, + "grad_norm": 0.061805914783829616, + "language_loss": 0.85575247, + "learning_rate": 0.000729683673975274, + "loss": 0.86663967, + "num_input_tokens_seen": 158555616, + "router_z_loss_mlp": 0.33984375, + "step": 1911, + "time_per_iteration": 2.6907522678375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091619, + "balance_loss_mlp": 1.05709589, + "epoch": 0.36783378222393226, + "flos": 1216168971264.0, + "grad_norm": 0.04498413319979697, + "language_loss": 0.82746279, + "learning_rate": 0.0007294069030771774, + "loss": 0.83837891, + "num_input_tokens_seen": 158653984, + "router_z_loss_mlp": 0.34570312, + "step": 1912, + "time_per_iteration": 3.6445353031158447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109623, + "balance_loss_mlp": 1.06196952, + "epoch": 0.36802616390919585, + "flos": 498476947968.0, + "grad_norm": 0.055898807174015214, + "language_loss": 0.90671504, + "learning_rate": 0.0007291300431154224, + "loss": 0.9176774, + "num_input_tokens_seen": 158719728, + "router_z_loss_mlp": 0.34301758, + "step": 1913, + "time_per_iteration": 2.5600366592407227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01025736, + "balance_loss_mlp": 1.0155319, + "epoch": 0.36821854559445943, + "flos": 1581281961984.0, + "grad_norm": 0.015307788275572325, + "language_loss": 0.70389736, + "learning_rate": 0.0007288530941974955, + "loss": 0.71415472, + "num_input_tokens_seen": 158952544, + "router_z_loss_mlp": 0.10205078, + "step": 1914, + "time_per_iteration": 4.9577555656433105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110633, + "balance_loss_mlp": 1.07287991, + "epoch": 0.36841092727972297, + "flos": 835261784064.0, + "grad_norm": 0.06223209716338702, + "language_loss": 0.79735458, + "learning_rate": 0.0007285760564309179, + "loss": 0.80841786, + "num_input_tokens_seen": 159039680, + "router_z_loss_mlp": 0.3347168, + "step": 1915, + "time_per_iteration": 3.1251590251922607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101251, + "balance_loss_mlp": 1.06811082, + "epoch": 0.36860330896498655, + "flos": 689517591552.0, + "grad_norm": 0.05672479428696366, + "language_loss": 0.85010201, + "learning_rate": 0.0007282989299232448, + "loss": 0.8611145, + "num_input_tokens_seen": 159128128, + "router_z_loss_mlp": 0.33154297, + "step": 1916, + "time_per_iteration": 3.062971353530884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096553, + "balance_loss_mlp": 1.06381774, + "epoch": 0.3687956906502501, + "flos": 553919067648.0, + "grad_norm": 0.05955658020637064, + "language_loss": 0.83600092, + "learning_rate": 0.0007280217147820668, + "loss": 0.84696645, + "num_input_tokens_seen": 159193248, + "router_z_loss_mlp": 0.32739258, + "step": 1917, + "time_per_iteration": 2.6169495582580566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097845, + "balance_loss_mlp": 1.06465673, + "epoch": 0.3689880723355137, + "flos": 576426184704.0, + "grad_norm": 0.05443515430960571, + "language_loss": 0.79137111, + "learning_rate": 0.0007277444111150079, + "loss": 0.80234957, + "num_input_tokens_seen": 159265824, + "router_z_loss_mlp": 0.33203125, + "step": 1918, + "time_per_iteration": 2.672696828842163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101204, + "balance_loss_mlp": 1.06820679, + "epoch": 0.3691804540207772, + "flos": 528615465984.0, + "grad_norm": 0.06564490716140688, + "language_loss": 0.84340626, + "learning_rate": 0.0007274670190297272, + "loss": 0.85441828, + "num_input_tokens_seen": 159332992, + "router_z_loss_mlp": 0.33007812, + "step": 1919, + "time_per_iteration": 2.569920539855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091279, + "balance_loss_mlp": 1.0575906, + "epoch": 0.3693728357060408, + "flos": 560729806848.0, + "grad_norm": 0.06475948680742319, + "language_loss": 0.81988895, + "learning_rate": 0.0007271895386339179, + "loss": 0.83080173, + "num_input_tokens_seen": 159409808, + "router_z_loss_mlp": 0.3371582, + "step": 1920, + "time_per_iteration": 2.765470027923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086642, + "balance_loss_mlp": 1.05350161, + "epoch": 0.3695652173913043, + "flos": 579488919552.0, + "grad_norm": 0.0536525739451854, + "language_loss": 0.82950377, + "learning_rate": 0.0007269119700353073, + "loss": 0.84037018, + "num_input_tokens_seen": 159486128, + "router_z_loss_mlp": 0.33154297, + "step": 1921, + "time_per_iteration": 2.703117847442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089729, + "balance_loss_mlp": 1.05756688, + "epoch": 0.3697575990765679, + "flos": 512629516800.0, + "grad_norm": 0.04104943724396866, + "language_loss": 0.84983069, + "learning_rate": 0.0007266343133416571, + "loss": 0.86072791, + "num_input_tokens_seen": 159562224, + "router_z_loss_mlp": 0.3215332, + "step": 1922, + "time_per_iteration": 2.7371909618377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060973, + "balance_loss_mlp": 1.04967153, + "epoch": 0.3699499807618315, + "flos": 1569826953216.0, + "grad_norm": 0.023907464900796205, + "language_loss": 0.77116919, + "learning_rate": 0.0007263565686607632, + "loss": 0.78177893, + "num_input_tokens_seen": 159784768, + "router_z_loss_mlp": 0.11279297, + "step": 1923, + "time_per_iteration": 4.827981233596802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084577, + "balance_loss_mlp": 1.05136538, + "epoch": 0.37014236244709503, + "flos": 497093262336.0, + "grad_norm": 0.06739223877154035, + "language_loss": 0.84575641, + "learning_rate": 0.0007260787361004556, + "loss": 0.85660219, + "num_input_tokens_seen": 159848608, + "router_z_loss_mlp": 0.33227539, + "step": 1924, + "time_per_iteration": 2.6221601963043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048708, + "balance_loss_mlp": 1.03764546, + "epoch": 0.3703347441323586, + "flos": 1443562048512.0, + "grad_norm": 0.02017040526472397, + "language_loss": 0.73761505, + "learning_rate": 0.0007258008157685987, + "loss": 0.74810213, + "num_input_tokens_seen": 160080928, + "router_z_loss_mlp": 0.11083984, + "step": 1925, + "time_per_iteration": 4.909639120101929 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083689, + "balance_loss_mlp": 1.04997683, + "epoch": 0.37052712581762215, + "flos": 563324060160.0, + "grad_norm": 0.19489786122972683, + "language_loss": 0.87265027, + "learning_rate": 0.0007255228077730903, + "loss": 0.88348716, + "num_input_tokens_seen": 160148976, + "router_z_loss_mlp": 0.33740234, + "step": 1926, + "time_per_iteration": 2.726412773132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092339, + "balance_loss_mlp": 1.05850744, + "epoch": 0.37071950750288574, + "flos": 925706451456.0, + "grad_norm": 0.06639539702607969, + "language_loss": 0.81730163, + "learning_rate": 0.0007252447122218632, + "loss": 0.82822502, + "num_input_tokens_seen": 160233504, + "router_z_loss_mlp": 0.33862305, + "step": 1927, + "time_per_iteration": 3.157439708709717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090678, + "balance_loss_mlp": 1.05710912, + "epoch": 0.37091188918814927, + "flos": 418090609152.0, + "grad_norm": 0.06586667444600991, + "language_loss": 0.87736213, + "learning_rate": 0.0007249665292228834, + "loss": 0.88826889, + "num_input_tokens_seen": 160299696, + "router_z_loss_mlp": 0.3359375, + "step": 1928, + "time_per_iteration": 2.569284677505493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086968, + "balance_loss_mlp": 1.05208778, + "epoch": 0.37110427087341286, + "flos": 462941899776.0, + "grad_norm": 0.056849308308669105, + "language_loss": 0.83676869, + "learning_rate": 0.000724688258884151, + "loss": 0.84763837, + "num_input_tokens_seen": 160367904, + "router_z_loss_mlp": 0.34912109, + "step": 1929, + "time_per_iteration": 2.522596597671509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085634, + "balance_loss_mlp": 1.05204105, + "epoch": 0.3712966525586764, + "flos": 849303843840.0, + "grad_norm": 0.0484214736208702, + "language_loss": 0.86208755, + "learning_rate": 0.0007244099013137002, + "loss": 0.87294388, + "num_input_tokens_seen": 160453600, + "router_z_loss_mlp": 0.33618164, + "step": 1930, + "time_per_iteration": 3.055302619934082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089345, + "balance_loss_mlp": 1.05370176, + "epoch": 0.37148903424394, + "flos": 925555092480.0, + "grad_norm": 0.05147814185741214, + "language_loss": 0.88918859, + "learning_rate": 0.0007241314566195993, + "loss": 0.90008199, + "num_input_tokens_seen": 160543472, + "router_z_loss_mlp": 0.35693359, + "step": 1931, + "time_per_iteration": 3.249950408935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108542, + "balance_loss_mlp": 1.05020559, + "epoch": 0.37168141592920356, + "flos": 519565473792.0, + "grad_norm": 0.061459583473066896, + "language_loss": 0.85347825, + "learning_rate": 0.0007238529249099496, + "loss": 0.86433244, + "num_input_tokens_seen": 160614016, + "router_z_loss_mlp": 0.35253906, + "step": 1932, + "time_per_iteration": 2.603287696838379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068675, + "balance_loss_mlp": 1.05599129, + "epoch": 0.3718737976144671, + "flos": 1445107267584.0, + "grad_norm": 0.02721294021284605, + "language_loss": 0.77856874, + "learning_rate": 0.0007235743062928872, + "loss": 0.7892555, + "num_input_tokens_seen": 160828640, + "router_z_loss_mlp": 0.12695312, + "step": 1933, + "time_per_iteration": 4.850685358047485 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089729, + "balance_loss_mlp": 1.05346537, + "epoch": 0.3720661792997307, + "flos": 759218558976.0, + "grad_norm": 0.08735029164116491, + "language_loss": 0.80658156, + "learning_rate": 0.000723295600876581, + "loss": 0.81747884, + "num_input_tokens_seen": 160913088, + "router_z_loss_mlp": 0.36279297, + "step": 1934, + "time_per_iteration": 3.0082099437713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092646, + "balance_loss_mlp": 1.05690694, + "epoch": 0.3722585609849942, + "flos": 516686031360.0, + "grad_norm": 0.1760204301041219, + "language_loss": 0.8798061, + "learning_rate": 0.0007230168087692344, + "loss": 0.89073259, + "num_input_tokens_seen": 160982960, + "router_z_loss_mlp": 0.35791016, + "step": 1935, + "time_per_iteration": 2.7076756954193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095918, + "balance_loss_mlp": 1.06070328, + "epoch": 0.3724509426702578, + "flos": 782114171904.0, + "grad_norm": 0.058450977170247324, + "language_loss": 0.82290804, + "learning_rate": 0.0007227379300790839, + "loss": 0.83386725, + "num_input_tokens_seen": 161066000, + "router_z_loss_mlp": 0.35205078, + "step": 1936, + "time_per_iteration": 3.0381107330322266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108913, + "balance_loss_mlp": 1.07262528, + "epoch": 0.37264332435552133, + "flos": 391502246400.0, + "grad_norm": 0.062314619417064634, + "language_loss": 0.85779369, + "learning_rate": 0.0007224589649143997, + "loss": 0.86888283, + "num_input_tokens_seen": 161131040, + "router_z_loss_mlp": 0.36328125, + "step": 1937, + "time_per_iteration": 2.5413918495178223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118094, + "balance_loss_mlp": 1.08223581, + "epoch": 0.3728357060407849, + "flos": 542599299072.0, + "grad_norm": 0.08241458585549921, + "language_loss": 0.80921531, + "learning_rate": 0.0007221799133834861, + "loss": 0.82039624, + "num_input_tokens_seen": 161201248, + "router_z_loss_mlp": 0.35839844, + "step": 1938, + "time_per_iteration": 2.620593309402466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122902, + "balance_loss_mlp": 1.08682895, + "epoch": 0.3730280877260485, + "flos": 433344646656.0, + "grad_norm": 0.05702640818290307, + "language_loss": 0.81373966, + "learning_rate": 0.00072190077559468, + "loss": 0.8249687, + "num_input_tokens_seen": 161266288, + "router_z_loss_mlp": 0.36083984, + "step": 1939, + "time_per_iteration": 2.512871026992798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133095, + "balance_loss_mlp": 1.09587836, + "epoch": 0.37322046941131204, + "flos": 531230068224.0, + "grad_norm": 0.0616329871980105, + "language_loss": 0.89228082, + "learning_rate": 0.0007216215516563527, + "loss": 0.90361178, + "num_input_tokens_seen": 161335648, + "router_z_loss_mlp": 0.37207031, + "step": 1940, + "time_per_iteration": 2.6655144691467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113311, + "balance_loss_mlp": 1.09534478, + "epoch": 0.3734128510965756, + "flos": 531294087168.0, + "grad_norm": 0.05659412158312536, + "language_loss": 0.83479297, + "learning_rate": 0.0007213422416769083, + "loss": 0.84612405, + "num_input_tokens_seen": 161403440, + "router_z_loss_mlp": 0.37744141, + "step": 1941, + "time_per_iteration": 2.6088144779205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127109, + "balance_loss_mlp": 1.09022546, + "epoch": 0.37360523278183916, + "flos": 500195284992.0, + "grad_norm": 0.05910558413712496, + "language_loss": 0.74991721, + "learning_rate": 0.0007210628457647849, + "loss": 0.76118833, + "num_input_tokens_seen": 161472864, + "router_z_loss_mlp": 0.36889648, + "step": 1942, + "time_per_iteration": 2.57867169380188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131479, + "balance_loss_mlp": 1.09366596, + "epoch": 0.37379761446710275, + "flos": 547652413440.0, + "grad_norm": 0.06364761456819781, + "language_loss": 0.79148316, + "learning_rate": 0.000720783364028453, + "loss": 0.80279785, + "num_input_tokens_seen": 161548096, + "router_z_loss_mlp": 0.37768555, + "step": 1943, + "time_per_iteration": 2.744575023651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121556, + "balance_loss_mlp": 1.0834806, + "epoch": 0.3739899961523663, + "flos": 475517316096.0, + "grad_norm": 0.05406366318559307, + "language_loss": 0.87411249, + "learning_rate": 0.0007205037965764177, + "loss": 0.88532799, + "num_input_tokens_seen": 161615600, + "router_z_loss_mlp": 0.38061523, + "step": 1944, + "time_per_iteration": 2.5253238677978516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122588, + "balance_loss_mlp": 1.0851568, + "epoch": 0.37418237783762986, + "flos": 611626581504.0, + "grad_norm": 0.05571778703090581, + "language_loss": 0.85614675, + "learning_rate": 0.0007202241435172161, + "loss": 0.86737263, + "num_input_tokens_seen": 161687408, + "router_z_loss_mlp": 0.37426758, + "step": 1945, + "time_per_iteration": 2.7462716102600098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117651, + "balance_loss_mlp": 1.07931328, + "epoch": 0.3743747595228934, + "flos": 765953694720.0, + "grad_norm": 0.05225192391609906, + "language_loss": 0.88148731, + "learning_rate": 0.0007199444049594198, + "loss": 0.89266384, + "num_input_tokens_seen": 161764224, + "router_z_loss_mlp": 0.38330078, + "step": 1946, + "time_per_iteration": 2.9533469676971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116752, + "balance_loss_mlp": 1.07798529, + "epoch": 0.374567141208157, + "flos": 524120993280.0, + "grad_norm": 0.06150523549490838, + "language_loss": 0.83402771, + "learning_rate": 0.0007196645810116322, + "loss": 0.84519523, + "num_input_tokens_seen": 161835520, + "router_z_loss_mlp": 0.38720703, + "step": 1947, + "time_per_iteration": 2.709965705871582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106001, + "balance_loss_mlp": 1.06735349, + "epoch": 0.37475952289342057, + "flos": 681067090944.0, + "grad_norm": 0.05833074938802531, + "language_loss": 0.83909506, + "learning_rate": 0.0007193846717824912, + "loss": 0.850155, + "num_input_tokens_seen": 161912000, + "router_z_loss_mlp": 0.38598633, + "step": 1948, + "time_per_iteration": 2.854522705078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094783, + "balance_loss_mlp": 1.05682671, + "epoch": 0.3749519045786841, + "flos": 460061047296.0, + "grad_norm": 0.06673844071937801, + "language_loss": 0.88263041, + "learning_rate": 0.0007191046773806669, + "loss": 0.89357823, + "num_input_tokens_seen": 161977296, + "router_z_loss_mlp": 0.37915039, + "step": 1949, + "time_per_iteration": 2.575989007949829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085738, + "balance_loss_mlp": 1.04682803, + "epoch": 0.3751442862639477, + "flos": 954471458304.0, + "grad_norm": 0.06638817682476543, + "language_loss": 0.83010924, + "learning_rate": 0.0007188245979148631, + "loss": 0.84096658, + "num_input_tokens_seen": 162051888, + "router_z_loss_mlp": 0.38867188, + "step": 1950, + "time_per_iteration": 3.1386518478393555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082888, + "balance_loss_mlp": 1.04462171, + "epoch": 0.3753366679492112, + "flos": 527483473920.0, + "grad_norm": 0.05996025340147905, + "language_loss": 0.8766306, + "learning_rate": 0.0007185444334938157, + "loss": 0.88745946, + "num_input_tokens_seen": 162124384, + "router_z_loss_mlp": 0.38232422, + "step": 1951, + "time_per_iteration": 2.644848585128784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082635, + "balance_loss_mlp": 1.04501283, + "epoch": 0.3755290496344748, + "flos": 521535504384.0, + "grad_norm": 0.05938829335869994, + "language_loss": 0.84891546, + "learning_rate": 0.0007182641842262947, + "loss": 0.85974181, + "num_input_tokens_seen": 162191440, + "router_z_loss_mlp": 0.3762207, + "step": 1952, + "time_per_iteration": 2.6239395141601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081192, + "balance_loss_mlp": 1.04361689, + "epoch": 0.37572143131973834, + "flos": 620810403840.0, + "grad_norm": 0.06544951097265184, + "language_loss": 0.77827752, + "learning_rate": 0.0007179838502211022, + "loss": 0.78908944, + "num_input_tokens_seen": 162268480, + "router_z_loss_mlp": 0.37524414, + "step": 1953, + "time_per_iteration": 2.8444712162017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076992, + "balance_loss_mlp": 1.03958416, + "epoch": 0.37591381300500193, + "flos": 770635842048.0, + "grad_norm": 0.05616797515781331, + "language_loss": 0.86183697, + "learning_rate": 0.0007177034315870738, + "loss": 0.87260687, + "num_input_tokens_seen": 162346752, + "router_z_loss_mlp": 0.37402344, + "step": 1954, + "time_per_iteration": 2.9628727436065674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078673, + "balance_loss_mlp": 1.0411936, + "epoch": 0.37610619469026546, + "flos": 520191106560.0, + "grad_norm": 0.05872311076525267, + "language_loss": 0.9098376, + "learning_rate": 0.0007174229284330773, + "loss": 0.92062426, + "num_input_tokens_seen": 162415120, + "router_z_loss_mlp": 0.37402344, + "step": 1955, + "time_per_iteration": 2.579792022705078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010784, + "balance_loss_mlp": 1.0412302, + "epoch": 0.37629857637552905, + "flos": 598524456960.0, + "grad_norm": 0.050284285498010506, + "language_loss": 0.86896843, + "learning_rate": 0.0007171423408680141, + "loss": 0.8797524, + "num_input_tokens_seen": 162493280, + "router_z_loss_mlp": 0.37133789, + "step": 1956, + "time_per_iteration": 2.7764384746551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079026, + "balance_loss_mlp": 1.04211903, + "epoch": 0.37649095806079264, + "flos": 564687396864.0, + "grad_norm": 0.058102078307858664, + "language_loss": 0.89614129, + "learning_rate": 0.0007168616690008176, + "loss": 0.90693152, + "num_input_tokens_seen": 162560736, + "router_z_loss_mlp": 0.36889648, + "step": 1957, + "time_per_iteration": 2.646986246109009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083688, + "balance_loss_mlp": 1.04606569, + "epoch": 0.37668333974605617, + "flos": 592196755968.0, + "grad_norm": 0.10223927136981294, + "language_loss": 0.86142451, + "learning_rate": 0.0007165809129404545, + "loss": 0.8722614, + "num_input_tokens_seen": 162630688, + "router_z_loss_mlp": 0.37573242, + "step": 1958, + "time_per_iteration": 2.756287097930908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081219, + "balance_loss_mlp": 1.04440713, + "epoch": 0.37687572143131975, + "flos": 419257506816.0, + "grad_norm": 0.0560584683493853, + "language_loss": 0.85760534, + "learning_rate": 0.0007163000727959239, + "loss": 0.8684175, + "num_input_tokens_seen": 162694304, + "router_z_loss_mlp": 0.36791992, + "step": 1959, + "time_per_iteration": 2.5415151119232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105237, + "balance_loss_mlp": 1.03587127, + "epoch": 0.3770681031165833, + "flos": 1356484243968.0, + "grad_norm": 0.028402472736143748, + "language_loss": 0.77959073, + "learning_rate": 0.0007160191486762575, + "loss": 0.7901144, + "num_input_tokens_seen": 162920336, + "router_z_loss_mlp": 0.16503906, + "step": 1960, + "time_per_iteration": 4.892657518386841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086261, + "balance_loss_mlp": 1.04892445, + "epoch": 0.3772604848018469, + "flos": 644592107520.0, + "grad_norm": 0.04530874218926827, + "language_loss": 0.84377986, + "learning_rate": 0.00071573814069052, + "loss": 0.85464251, + "num_input_tokens_seen": 163000720, + "router_z_loss_mlp": 0.37329102, + "step": 1961, + "time_per_iteration": 2.898301839828491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088156, + "balance_loss_mlp": 1.05098641, + "epoch": 0.3774528664871104, + "flos": 901265619456.0, + "grad_norm": 0.052585227845940184, + "language_loss": 0.87987518, + "learning_rate": 0.0007154570489478081, + "loss": 0.89075673, + "num_input_tokens_seen": 163085680, + "router_z_loss_mlp": 0.37158203, + "step": 1962, + "time_per_iteration": 3.1638717651367188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088198, + "balance_loss_mlp": 1.05048013, + "epoch": 0.377645248172374, + "flos": 787717315584.0, + "grad_norm": 0.047624248218528294, + "language_loss": 0.864995, + "learning_rate": 0.0007151758735572514, + "loss": 0.87587702, + "num_input_tokens_seen": 163162224, + "router_z_loss_mlp": 0.37695312, + "step": 1963, + "time_per_iteration": 2.985558271408081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090796, + "balance_loss_mlp": 1.05236292, + "epoch": 0.3778376298576376, + "flos": 586417522176.0, + "grad_norm": 0.06598015027050642, + "language_loss": 0.80448836, + "learning_rate": 0.0007148946146280119, + "loss": 0.81539631, + "num_input_tokens_seen": 163237920, + "router_z_loss_mlp": 0.38427734, + "step": 1964, + "time_per_iteration": 2.784947395324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053975, + "balance_loss_mlp": 1.03938425, + "epoch": 0.3780300115429011, + "flos": 1396014759936.0, + "grad_norm": 0.018109037433210438, + "language_loss": 0.72192144, + "learning_rate": 0.000714613272269284, + "loss": 0.73246121, + "num_input_tokens_seen": 163455760, + "router_z_loss_mlp": 0.14550781, + "step": 1965, + "time_per_iteration": 4.85606050491333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052214, + "balance_loss_mlp": 1.03838599, + "epoch": 0.3782223932281647, + "flos": 1356935348736.0, + "grad_norm": 0.019183634636191996, + "language_loss": 0.75341946, + "learning_rate": 0.0007143318465902943, + "loss": 0.76394159, + "num_input_tokens_seen": 163678064, + "router_z_loss_mlp": 0.13867188, + "step": 1966, + "time_per_iteration": 4.946903467178345 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082797, + "balance_loss_mlp": 1.04517484, + "epoch": 0.37841477491342823, + "flos": 703811344896.0, + "grad_norm": 0.05921890511558738, + "language_loss": 0.83782387, + "learning_rate": 0.0007140503377003022, + "loss": 0.84865183, + "num_input_tokens_seen": 163764320, + "router_z_loss_mlp": 0.3762207, + "step": 1967, + "time_per_iteration": 2.984163761138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089165, + "balance_loss_mlp": 1.0504458, + "epoch": 0.3786071565986918, + "flos": 528856985088.0, + "grad_norm": 0.047303083725180994, + "language_loss": 0.84754062, + "learning_rate": 0.000713768745708599, + "loss": 0.85843223, + "num_input_tokens_seen": 163831808, + "router_z_loss_mlp": 0.38696289, + "step": 1968, + "time_per_iteration": 2.6251039505004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086627, + "balance_loss_mlp": 1.04881418, + "epoch": 0.37879953828395535, + "flos": 992872802304.0, + "grad_norm": 0.053091209869219315, + "language_loss": 0.76740122, + "learning_rate": 0.0007134870707245085, + "loss": 0.7782675, + "num_input_tokens_seen": 163918128, + "router_z_loss_mlp": 0.37792969, + "step": 1969, + "time_per_iteration": 3.252840995788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082313, + "balance_loss_mlp": 1.04435682, + "epoch": 0.37899191996921894, + "flos": 626358292992.0, + "grad_norm": 0.06088981396741891, + "language_loss": 0.8454808, + "learning_rate": 0.0007132053128573864, + "loss": 0.85630393, + "num_input_tokens_seen": 163987552, + "router_z_loss_mlp": 0.37915039, + "step": 1970, + "time_per_iteration": 2.739210844039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078955, + "balance_loss_mlp": 1.0417614, + "epoch": 0.37918430165448247, + "flos": 686005314048.0, + "grad_norm": 0.05972304110224919, + "language_loss": 0.83631253, + "learning_rate": 0.0007129234722166211, + "loss": 0.84710205, + "num_input_tokens_seen": 164063248, + "router_z_loss_mlp": 0.37182617, + "step": 1971, + "time_per_iteration": 2.814235210418701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086171, + "balance_loss_mlp": 1.05012178, + "epoch": 0.37937668333974606, + "flos": 475374721536.0, + "grad_norm": 0.05230765101952506, + "language_loss": 0.91063309, + "learning_rate": 0.0007126415489116328, + "loss": 0.92149478, + "num_input_tokens_seen": 164133776, + "router_z_loss_mlp": 0.3605957, + "step": 1972, + "time_per_iteration": 2.657435178756714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091568, + "balance_loss_mlp": 1.05413604, + "epoch": 0.37956906502500964, + "flos": 707271340032.0, + "grad_norm": 0.05210329015751025, + "language_loss": 0.81174934, + "learning_rate": 0.0007123595430518736, + "loss": 0.82266498, + "num_input_tokens_seen": 164206672, + "router_z_loss_mlp": 0.37402344, + "step": 1973, + "time_per_iteration": 2.832801103591919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108628, + "balance_loss_mlp": 1.04856205, + "epoch": 0.3797614467102732, + "flos": 426421836288.0, + "grad_norm": 0.07403044475037865, + "language_loss": 0.8602494, + "learning_rate": 0.0007120774547468282, + "loss": 0.87111217, + "num_input_tokens_seen": 164271968, + "router_z_loss_mlp": 0.37695312, + "step": 1974, + "time_per_iteration": 2.523059844970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087165, + "balance_loss_mlp": 1.05016232, + "epoch": 0.37995382839553676, + "flos": 481588941312.0, + "grad_norm": 0.05250431859571283, + "language_loss": 0.81228226, + "learning_rate": 0.0007117952841060128, + "loss": 0.82315391, + "num_input_tokens_seen": 164342800, + "router_z_loss_mlp": 0.36962891, + "step": 1975, + "time_per_iteration": 2.648947238922119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080927, + "balance_loss_mlp": 1.04409158, + "epoch": 0.3801462100808003, + "flos": 560286056448.0, + "grad_norm": 0.0511194255012935, + "language_loss": 0.83466387, + "learning_rate": 0.0007115130312389756, + "loss": 0.84547317, + "num_input_tokens_seen": 164414928, + "router_z_loss_mlp": 0.3684082, + "step": 1976, + "time_per_iteration": 2.6648154258728027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086722, + "balance_loss_mlp": 1.0505538, + "epoch": 0.3803385917660639, + "flos": 464699524608.0, + "grad_norm": 0.06028169205400359, + "language_loss": 0.79143679, + "learning_rate": 0.0007112306962552973, + "loss": 0.80230403, + "num_input_tokens_seen": 164483312, + "router_z_loss_mlp": 0.36181641, + "step": 1977, + "time_per_iteration": 2.5715434551239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086468, + "balance_loss_mlp": 1.04934657, + "epoch": 0.3805309734513274, + "flos": 521614080000.0, + "grad_norm": 0.055719330197324175, + "language_loss": 0.8517288, + "learning_rate": 0.0007109482792645896, + "loss": 0.86259341, + "num_input_tokens_seen": 164555760, + "router_z_loss_mlp": 0.37084961, + "step": 1978, + "time_per_iteration": 2.6932663917541504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088226, + "balance_loss_mlp": 1.05222487, + "epoch": 0.380723355136591, + "flos": 591128782848.0, + "grad_norm": 0.06665517257748008, + "language_loss": 0.83491528, + "learning_rate": 0.0007106657803764969, + "loss": 0.84579754, + "num_input_tokens_seen": 164626768, + "router_z_loss_mlp": 0.36010742, + "step": 1979, + "time_per_iteration": 2.710658311843872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079049, + "balance_loss_mlp": 1.04204643, + "epoch": 0.38091573682185453, + "flos": 622394910720.0, + "grad_norm": 0.05735071115872701, + "language_loss": 0.81648314, + "learning_rate": 0.0007103831997006948, + "loss": 0.82727367, + "num_input_tokens_seen": 164698016, + "router_z_loss_mlp": 0.36987305, + "step": 1980, + "time_per_iteration": 2.7589223384857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107835, + "balance_loss_mlp": 1.04168153, + "epoch": 0.3811081185071181, + "flos": 568716208128.0, + "grad_norm": 0.047165366669346453, + "language_loss": 0.85731214, + "learning_rate": 0.0007101005373468908, + "loss": 0.86809564, + "num_input_tokens_seen": 164780320, + "router_z_loss_mlp": 0.36669922, + "step": 1981, + "time_per_iteration": 2.85588002204895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077478, + "balance_loss_mlp": 1.04161954, + "epoch": 0.3813005001923817, + "flos": 584550798336.0, + "grad_norm": 0.055048826019740454, + "language_loss": 0.86394024, + "learning_rate": 0.0007098177934248242, + "loss": 0.87471503, + "num_input_tokens_seen": 164854400, + "router_z_loss_mlp": 0.35888672, + "step": 1982, + "time_per_iteration": 2.7226805686950684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077944, + "balance_loss_mlp": 1.04160953, + "epoch": 0.38149288187764524, + "flos": 621287649792.0, + "grad_norm": 0.056689743602043985, + "language_loss": 0.85823661, + "learning_rate": 0.0007095349680442661, + "loss": 0.86901605, + "num_input_tokens_seen": 164932896, + "router_z_loss_mlp": 0.36352539, + "step": 1983, + "time_per_iteration": 2.8454253673553467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075326, + "balance_loss_mlp": 1.03829932, + "epoch": 0.3816852635629088, + "flos": 570414196224.0, + "grad_norm": 0.07971741755252446, + "language_loss": 0.78927159, + "learning_rate": 0.0007092520613150188, + "loss": 0.80002487, + "num_input_tokens_seen": 165002896, + "router_z_loss_mlp": 0.37036133, + "step": 1984, + "time_per_iteration": 2.7279529571533203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081451, + "balance_loss_mlp": 1.04368556, + "epoch": 0.38187764524817236, + "flos": 565313029632.0, + "grad_norm": 0.06238598748372814, + "language_loss": 0.81304747, + "learning_rate": 0.0007089690733469165, + "loss": 0.82386196, + "num_input_tokens_seen": 165074704, + "router_z_loss_mlp": 0.37719727, + "step": 1985, + "time_per_iteration": 2.7343544960021973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077201, + "balance_loss_mlp": 1.04115212, + "epoch": 0.38207002693343595, + "flos": 630932751360.0, + "grad_norm": 0.07832972313002672, + "language_loss": 0.82561398, + "learning_rate": 0.000708686004249825, + "loss": 0.83638602, + "num_input_tokens_seen": 165149136, + "router_z_loss_mlp": 0.36035156, + "step": 1986, + "time_per_iteration": 2.7691054344177246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082228, + "balance_loss_mlp": 1.04572582, + "epoch": 0.3822624086186995, + "flos": 548507980800.0, + "grad_norm": 0.053849318526496194, + "language_loss": 0.9147824, + "learning_rate": 0.0007084028541336413, + "loss": 0.9256047, + "num_input_tokens_seen": 165220864, + "router_z_loss_mlp": 0.36499023, + "step": 1987, + "time_per_iteration": 2.7131056785583496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083994, + "balance_loss_mlp": 1.04753971, + "epoch": 0.38245479030396307, + "flos": 613571880960.0, + "grad_norm": 0.06787860515410171, + "language_loss": 0.86709088, + "learning_rate": 0.0007081196231082942, + "loss": 0.87793082, + "num_input_tokens_seen": 165301568, + "router_z_loss_mlp": 0.36450195, + "step": 1988, + "time_per_iteration": 2.7983548641204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083168, + "balance_loss_mlp": 1.04621339, + "epoch": 0.38264717198922665, + "flos": 667787466240.0, + "grad_norm": 0.05230973877590939, + "language_loss": 0.80107033, + "learning_rate": 0.0007078363112837436, + "loss": 0.81190205, + "num_input_tokens_seen": 165373152, + "router_z_loss_mlp": 0.36938477, + "step": 1989, + "time_per_iteration": 2.8133270740509033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087048, + "balance_loss_mlp": 1.04935408, + "epoch": 0.3828395536744902, + "flos": 454521922560.0, + "grad_norm": 0.077904410907181, + "language_loss": 0.84988701, + "learning_rate": 0.000707552918769981, + "loss": 0.86075753, + "num_input_tokens_seen": 165439136, + "router_z_loss_mlp": 0.37646484, + "step": 1990, + "time_per_iteration": 2.5100817680358887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089992, + "balance_loss_mlp": 1.05213106, + "epoch": 0.3830319353597538, + "flos": 499191330816.0, + "grad_norm": 0.06242573245055077, + "language_loss": 0.83457661, + "learning_rate": 0.000707269445677029, + "loss": 0.84547657, + "num_input_tokens_seen": 165514624, + "router_z_loss_mlp": 0.37817383, + "step": 1991, + "time_per_iteration": 2.7737526893615723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099946, + "balance_loss_mlp": 1.06327772, + "epoch": 0.3832243170450173, + "flos": 743787021312.0, + "grad_norm": 0.05437985066129539, + "language_loss": 0.84858984, + "learning_rate": 0.0007069858921149416, + "loss": 0.85958934, + "num_input_tokens_seen": 165594512, + "router_z_loss_mlp": 0.36694336, + "step": 1992, + "time_per_iteration": 2.9642581939697266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095684, + "balance_loss_mlp": 1.05868101, + "epoch": 0.3834166987302809, + "flos": 577937908224.0, + "grad_norm": 0.10762195872615073, + "language_loss": 0.85869837, + "learning_rate": 0.0007067022581938043, + "loss": 0.86965525, + "num_input_tokens_seen": 165673968, + "router_z_loss_mlp": 0.36987305, + "step": 1993, + "time_per_iteration": 2.805201292037964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101221, + "balance_loss_mlp": 1.06531525, + "epoch": 0.3836090804155444, + "flos": 536194432512.0, + "grad_norm": 0.06280477504596697, + "language_loss": 0.831635, + "learning_rate": 0.0007064185440237334, + "loss": 0.84264719, + "num_input_tokens_seen": 165747664, + "router_z_loss_mlp": 0.359375, + "step": 1994, + "time_per_iteration": 2.7297706604003906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101916, + "balance_loss_mlp": 1.06527066, + "epoch": 0.383801462100808, + "flos": 601587191808.0, + "grad_norm": 0.05513764490979663, + "language_loss": 0.84278905, + "learning_rate": 0.0007061347497148764, + "loss": 0.85380822, + "num_input_tokens_seen": 165824624, + "router_z_loss_mlp": 0.36621094, + "step": 1995, + "time_per_iteration": 2.725632429122925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094031, + "balance_loss_mlp": 1.05876923, + "epoch": 0.38399384378607154, + "flos": 572427896832.0, + "grad_norm": 0.06604776765413087, + "language_loss": 0.86282277, + "learning_rate": 0.0007058508753774122, + "loss": 0.87376308, + "num_input_tokens_seen": 165896304, + "router_z_loss_mlp": 0.35302734, + "step": 1996, + "time_per_iteration": 2.760045051574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092829, + "balance_loss_mlp": 1.05773425, + "epoch": 0.38418622547133513, + "flos": 536513117184.0, + "grad_norm": 0.058737109015633, + "language_loss": 0.86788458, + "learning_rate": 0.0007055669211215505, + "loss": 0.87881291, + "num_input_tokens_seen": 165961312, + "router_z_loss_mlp": 0.35131836, + "step": 1997, + "time_per_iteration": 2.6091408729553223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088772, + "balance_loss_mlp": 1.05238962, + "epoch": 0.3843786071565987, + "flos": 572673798144.0, + "grad_norm": 0.06483433205315106, + "language_loss": 0.77687544, + "learning_rate": 0.0007052828870575322, + "loss": 0.78776312, + "num_input_tokens_seen": 166028064, + "router_z_loss_mlp": 0.36376953, + "step": 1998, + "time_per_iteration": 2.671349048614502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109131, + "balance_loss_mlp": 1.0558331, + "epoch": 0.38457098884186225, + "flos": 728361275904.0, + "grad_norm": 0.05010154832824161, + "language_loss": 0.86955881, + "learning_rate": 0.0007049987732956291, + "loss": 0.88047194, + "num_input_tokens_seen": 166110272, + "router_z_loss_mlp": 0.35498047, + "step": 1999, + "time_per_iteration": 2.9918439388275146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108718, + "balance_loss_mlp": 1.05189395, + "epoch": 0.38476337052712584, + "flos": 583123442688.0, + "grad_norm": 0.047224279388360366, + "language_loss": 0.82623643, + "learning_rate": 0.0007047145799461439, + "loss": 0.83710825, + "num_input_tokens_seen": 166193088, + "router_z_loss_mlp": 0.35327148, + "step": 2000, + "time_per_iteration": 2.84328293800354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092374, + "balance_loss_mlp": 1.05656374, + "epoch": 0.38495575221238937, + "flos": 552787075584.0, + "grad_norm": 0.05254385134155795, + "language_loss": 0.82269979, + "learning_rate": 0.00070443030711941, + "loss": 0.83362353, + "num_input_tokens_seen": 166271776, + "router_z_loss_mlp": 0.3581543, + "step": 2001, + "time_per_iteration": 2.753903865814209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092226, + "balance_loss_mlp": 1.0559628, + "epoch": 0.38514813389765296, + "flos": 654173190144.0, + "grad_norm": 0.05896823149323879, + "language_loss": 0.8241961, + "learning_rate": 0.0007041459549257924, + "loss": 0.83511841, + "num_input_tokens_seen": 166350000, + "router_z_loss_mlp": 0.36254883, + "step": 2002, + "time_per_iteration": 2.85963773727417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086715, + "balance_loss_mlp": 1.05030835, + "epoch": 0.3853405155829165, + "flos": 867715158528.0, + "grad_norm": 0.0671523306708724, + "language_loss": 0.78569824, + "learning_rate": 0.0007038615234756859, + "loss": 0.79656541, + "num_input_tokens_seen": 166434336, + "router_z_loss_mlp": 0.36425781, + "step": 2003, + "time_per_iteration": 3.1452226638793945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080594, + "balance_loss_mlp": 1.04440188, + "epoch": 0.3855328972681801, + "flos": 546164011008.0, + "grad_norm": 0.05736478292188374, + "language_loss": 0.83675313, + "learning_rate": 0.000703577012879517, + "loss": 0.84755898, + "num_input_tokens_seen": 166503952, + "router_z_loss_mlp": 0.36230469, + "step": 2004, + "time_per_iteration": 2.6308705806732178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075931, + "balance_loss_mlp": 1.04040706, + "epoch": 0.3857252789534436, + "flos": 533819939328.0, + "grad_norm": 0.0602394573591363, + "language_loss": 0.8843599, + "learning_rate": 0.0007032924232477423, + "loss": 0.89511919, + "num_input_tokens_seen": 166575168, + "router_z_loss_mlp": 0.35595703, + "step": 2005, + "time_per_iteration": 2.6188220977783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079495, + "balance_loss_mlp": 1.04337406, + "epoch": 0.3859176606387072, + "flos": 491514849792.0, + "grad_norm": 0.055511202055775664, + "language_loss": 0.80448711, + "learning_rate": 0.0007030077546908493, + "loss": 0.81528199, + "num_input_tokens_seen": 166647552, + "router_z_loss_mlp": 0.36132812, + "step": 2006, + "time_per_iteration": 2.6309516429901123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088067, + "balance_loss_mlp": 1.07609844, + "epoch": 0.3861100423239708, + "flos": 1486278955008.0, + "grad_norm": 0.032522163132150485, + "language_loss": 0.83064663, + "learning_rate": 0.0007027230073193561, + "loss": 0.84152722, + "num_input_tokens_seen": 166875088, + "router_z_loss_mlp": 0.11962891, + "step": 2007, + "time_per_iteration": 4.736604452133179 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083286, + "balance_loss_mlp": 1.04816651, + "epoch": 0.3863024240092343, + "flos": 473493441024.0, + "grad_norm": 0.05514866045126494, + "language_loss": 0.79152983, + "learning_rate": 0.0007024381812438117, + "loss": 0.80236268, + "num_input_tokens_seen": 166939344, + "router_z_loss_mlp": 0.3515625, + "step": 2008, + "time_per_iteration": 2.5392396450042725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108868, + "balance_loss_mlp": 1.05306053, + "epoch": 0.3864948056944979, + "flos": 716258723328.0, + "grad_norm": 0.059806412844581394, + "language_loss": 0.83199877, + "learning_rate": 0.0007021532765747951, + "loss": 0.84288561, + "num_input_tokens_seen": 167014992, + "router_z_loss_mlp": 0.35668945, + "step": 2009, + "time_per_iteration": 2.9926528930664062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087876, + "balance_loss_mlp": 1.0511353, + "epoch": 0.38668718737976143, + "flos": 727302067200.0, + "grad_norm": 0.05631620148302912, + "language_loss": 0.7933259, + "learning_rate": 0.0007018682934229162, + "loss": 0.8042047, + "num_input_tokens_seen": 167092096, + "router_z_loss_mlp": 0.36743164, + "step": 2010, + "time_per_iteration": 2.924781322479248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087301, + "balance_loss_mlp": 1.05103779, + "epoch": 0.386879569065025, + "flos": 525218079744.0, + "grad_norm": 0.05794664731816873, + "language_loss": 0.82387936, + "learning_rate": 0.0007015832318988152, + "loss": 0.83475244, + "num_input_tokens_seen": 167162144, + "router_z_loss_mlp": 0.36328125, + "step": 2011, + "time_per_iteration": 2.668565511703491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032799, + "balance_loss_mlp": 1.02173615, + "epoch": 0.38707195075028855, + "flos": 1527036005376.0, + "grad_norm": 0.019732384975687786, + "language_loss": 0.73890078, + "learning_rate": 0.000701298092113163, + "loss": 0.74922872, + "num_input_tokens_seen": 167391536, + "router_z_loss_mlp": 0.11083984, + "step": 2012, + "time_per_iteration": 4.948081970214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085105, + "balance_loss_mlp": 1.04886508, + "epoch": 0.38726433243555214, + "flos": 557045821440.0, + "grad_norm": 0.049164425244227684, + "language_loss": 0.84333575, + "learning_rate": 0.0007010128741766604, + "loss": 0.85418677, + "num_input_tokens_seen": 167466000, + "router_z_loss_mlp": 0.36230469, + "step": 2013, + "time_per_iteration": 2.7263684272766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073123, + "balance_loss_mlp": 1.03695476, + "epoch": 0.38745671412081567, + "flos": 553431647232.0, + "grad_norm": 0.06787190277242791, + "language_loss": 0.84107876, + "learning_rate": 0.0007007275782000391, + "loss": 0.85181004, + "num_input_tokens_seen": 167536144, + "router_z_loss_mlp": 0.36181641, + "step": 2014, + "time_per_iteration": 2.6458756923675537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082824, + "balance_loss_mlp": 1.04679942, + "epoch": 0.38764909580607926, + "flos": 458175384576.0, + "grad_norm": 0.05583745089019265, + "language_loss": 0.85148585, + "learning_rate": 0.0007004422042940605, + "loss": 0.86231411, + "num_input_tokens_seen": 167600064, + "router_z_loss_mlp": 0.3605957, + "step": 2015, + "time_per_iteration": 2.5374679565429688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074881, + "balance_loss_mlp": 1.03847444, + "epoch": 0.38784147749134285, + "flos": 521973462528.0, + "grad_norm": 0.056017537147394686, + "language_loss": 0.89528251, + "learning_rate": 0.0007001567525695169, + "loss": 0.90603131, + "num_input_tokens_seen": 167666576, + "router_z_loss_mlp": 0.36425781, + "step": 2016, + "time_per_iteration": 2.5863571166992188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081449, + "balance_loss_mlp": 1.04504275, + "epoch": 0.3880338591766064, + "flos": 665696600064.0, + "grad_norm": 0.05583938490392423, + "language_loss": 0.839926, + "learning_rate": 0.0006998712231372303, + "loss": 0.85074055, + "num_input_tokens_seen": 167753296, + "router_z_loss_mlp": 0.36425781, + "step": 2017, + "time_per_iteration": 2.998652219772339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076964, + "balance_loss_mlp": 1.04103458, + "epoch": 0.38822624086186996, + "flos": 593660427264.0, + "grad_norm": 0.044278068469259586, + "language_loss": 0.86088806, + "learning_rate": 0.0006995856161080532, + "loss": 0.87165773, + "num_input_tokens_seen": 167834080, + "router_z_loss_mlp": 0.35961914, + "step": 2018, + "time_per_iteration": 2.870619297027588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077822, + "balance_loss_mlp": 1.03972268, + "epoch": 0.3884186225471335, + "flos": 612256596480.0, + "grad_norm": 0.10653426783792587, + "language_loss": 0.8221643, + "learning_rate": 0.0006992999315928679, + "loss": 0.83294249, + "num_input_tokens_seen": 167912368, + "router_z_loss_mlp": 0.38061523, + "step": 2019, + "time_per_iteration": 2.7867438793182373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074258, + "balance_loss_mlp": 1.03782773, + "epoch": 0.3886110042323971, + "flos": 606737820672.0, + "grad_norm": 0.05830260104080337, + "language_loss": 0.85476196, + "learning_rate": 0.0006990141697025871, + "loss": 0.8655045, + "num_input_tokens_seen": 167991968, + "router_z_loss_mlp": 0.36401367, + "step": 2020, + "time_per_iteration": 2.7722346782684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008633, + "balance_loss_mlp": 0.99642587, + "epoch": 0.3888033859176606, + "flos": 1527289108992.0, + "grad_norm": 0.011259985776032525, + "language_loss": 0.76359642, + "learning_rate": 0.0006987283305481533, + "loss": 0.77368271, + "num_input_tokens_seen": 168212128, + "router_z_loss_mlp": 0.12207031, + "step": 2021, + "time_per_iteration": 4.71975302696228 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069739, + "balance_loss_mlp": 1.03368998, + "epoch": 0.3889957676029242, + "flos": 692145340416.0, + "grad_norm": 0.08577921290538253, + "language_loss": 0.82040119, + "learning_rate": 0.0006984424142405392, + "loss": 0.83109856, + "num_input_tokens_seen": 168287440, + "router_z_loss_mlp": 0.36035156, + "step": 2022, + "time_per_iteration": 2.8003616333007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070636, + "balance_loss_mlp": 1.03413415, + "epoch": 0.3891881492881878, + "flos": 514937170944.0, + "grad_norm": 0.06357614890279897, + "language_loss": 0.81860286, + "learning_rate": 0.0006981564208907474, + "loss": 0.82930923, + "num_input_tokens_seen": 168354704, + "router_z_loss_mlp": 0.36499023, + "step": 2023, + "time_per_iteration": 2.581556558609009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074791, + "balance_loss_mlp": 1.03933799, + "epoch": 0.3893805309734513, + "flos": 628770663936.0, + "grad_norm": 0.04985256691663517, + "language_loss": 0.90055227, + "learning_rate": 0.0006978703506098102, + "loss": 0.91130018, + "num_input_tokens_seen": 168424272, + "router_z_loss_mlp": 0.35498047, + "step": 2024, + "time_per_iteration": 2.7220654487609863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080786, + "balance_loss_mlp": 1.04497564, + "epoch": 0.3895729126587149, + "flos": 543894234624.0, + "grad_norm": 0.06500254639996711, + "language_loss": 0.88078821, + "learning_rate": 0.00069758420350879, + "loss": 0.89159608, + "num_input_tokens_seen": 168488912, + "router_z_loss_mlp": 0.35839844, + "step": 2025, + "time_per_iteration": 2.6044023036956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086131, + "balance_loss_mlp": 1.05012965, + "epoch": 0.38976529434397844, + "flos": 617987778048.0, + "grad_norm": 0.06153368317516065, + "language_loss": 0.86008936, + "learning_rate": 0.000697297979698779, + "loss": 0.87095064, + "num_input_tokens_seen": 168563248, + "router_z_loss_mlp": 0.36010742, + "step": 2026, + "time_per_iteration": 2.7051053047180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091671, + "balance_loss_mlp": 1.05628932, + "epoch": 0.38995767602924203, + "flos": 834518287872.0, + "grad_norm": 0.05732441037152358, + "language_loss": 0.83766049, + "learning_rate": 0.0006970116792908992, + "loss": 0.84857726, + "num_input_tokens_seen": 168648272, + "router_z_loss_mlp": 0.35400391, + "step": 2027, + "time_per_iteration": 3.086228847503662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096114, + "balance_loss_mlp": 1.06032705, + "epoch": 0.39015005771450556, + "flos": 541343651328.0, + "grad_norm": 0.060477391230123065, + "language_loss": 0.8159399, + "learning_rate": 0.000696725302396302, + "loss": 0.82690096, + "num_input_tokens_seen": 168721760, + "router_z_loss_mlp": 0.3581543, + "step": 2028, + "time_per_iteration": 2.6521902084350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093769, + "balance_loss_mlp": 1.05867422, + "epoch": 0.39034243939976915, + "flos": 1007102536704.0, + "grad_norm": 0.04866281229524116, + "language_loss": 0.85781944, + "learning_rate": 0.0006964388491261692, + "loss": 0.86875713, + "num_input_tokens_seen": 168803664, + "router_z_loss_mlp": 0.35131836, + "step": 2029, + "time_per_iteration": 3.2338647842407227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099777, + "balance_loss_mlp": 1.06401408, + "epoch": 0.3905348210850327, + "flos": 678723121152.0, + "grad_norm": 0.05278281932643199, + "language_loss": 0.87335277, + "learning_rate": 0.0006961523195917114, + "loss": 0.88435054, + "num_input_tokens_seen": 168879184, + "router_z_loss_mlp": 0.35791016, + "step": 2030, + "time_per_iteration": 2.8414504528045654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098277, + "balance_loss_mlp": 1.06256151, + "epoch": 0.39072720277029627, + "flos": 548606905344.0, + "grad_norm": 0.05643291477722073, + "language_loss": 0.77850938, + "learning_rate": 0.0006958657139041696, + "loss": 0.78949213, + "num_input_tokens_seen": 168957808, + "router_z_loss_mlp": 0.35742188, + "step": 2031, + "time_per_iteration": 2.7278060913085938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091715, + "balance_loss_mlp": 1.07807708, + "epoch": 0.39091958445555985, + "flos": 1546912401408.0, + "grad_norm": 0.03426807627657635, + "language_loss": 0.76712966, + "learning_rate": 0.0006955790321748136, + "loss": 0.77804685, + "num_input_tokens_seen": 169194416, + "router_z_loss_mlp": 0.13671875, + "step": 2032, + "time_per_iteration": 4.927444696426392 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099209, + "balance_loss_mlp": 1.06399429, + "epoch": 0.3911119661408234, + "flos": 503741058048.0, + "grad_norm": 0.050822615130563034, + "language_loss": 0.78371578, + "learning_rate": 0.0006952922745149434, + "loss": 0.79470789, + "num_input_tokens_seen": 169263552, + "router_z_loss_mlp": 0.35253906, + "step": 2033, + "time_per_iteration": 2.6730306148529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095244, + "balance_loss_mlp": 1.05933857, + "epoch": 0.391304347826087, + "flos": 556967245824.0, + "grad_norm": 0.05118619150019999, + "language_loss": 0.87770367, + "learning_rate": 0.000695005441035888, + "loss": 0.88865614, + "num_input_tokens_seen": 169333696, + "router_z_loss_mlp": 0.359375, + "step": 2034, + "time_per_iteration": 2.6585283279418945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038632, + "balance_loss_mlp": 1.02461255, + "epoch": 0.3914967295113505, + "flos": 1499309858304.0, + "grad_norm": 0.00946210886057752, + "language_loss": 0.73723435, + "learning_rate": 0.0006947185318490064, + "loss": 0.74762058, + "num_input_tokens_seen": 169556416, + "router_z_loss_mlp": 0.140625, + "step": 2035, + "time_per_iteration": 4.8464648723602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087203, + "balance_loss_mlp": 1.05182171, + "epoch": 0.3916891111966141, + "flos": 706715518464.0, + "grad_norm": 0.07007748344060821, + "language_loss": 0.81416976, + "learning_rate": 0.0006944315470656863, + "loss": 0.82504177, + "num_input_tokens_seen": 169643312, + "router_z_loss_mlp": 0.35424805, + "step": 2036, + "time_per_iteration": 2.9289584159851074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010868, + "balance_loss_mlp": 1.05051255, + "epoch": 0.3918814928818776, + "flos": 556085537280.0, + "grad_norm": 0.05570869743183256, + "language_loss": 0.91007531, + "learning_rate": 0.000694144486797345, + "loss": 0.92094326, + "num_input_tokens_seen": 169712560, + "router_z_loss_mlp": 0.36303711, + "step": 2037, + "time_per_iteration": 0.0013861656188964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043662, + "balance_loss_mlp": 1.02954793, + "epoch": 0.3920738745671412, + "flos": 1537845032448.0, + "grad_norm": 0.018656318140729232, + "language_loss": 0.79520434, + "learning_rate": 0.0006938573511554296, + "loss": 0.80564094, + "num_input_tokens_seen": 169914912, + "router_z_loss_mlp": 0.14160156, + "step": 2038, + "time_per_iteration": 4.6249003410339355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091153, + "balance_loss_mlp": 1.05586696, + "epoch": 0.39226625625240474, + "flos": 498594811392.0, + "grad_norm": 0.06764177247916761, + "language_loss": 0.89479941, + "learning_rate": 0.0006935701402514156, + "loss": 0.90571094, + "num_input_tokens_seen": 169978848, + "router_z_loss_mlp": 0.35327148, + "step": 2039, + "time_per_iteration": 2.535269260406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033365, + "balance_loss_mlp": 1.01963174, + "epoch": 0.39245863793766833, + "flos": 1346465203200.0, + "grad_norm": 0.017448285120256823, + "language_loss": 0.73034894, + "learning_rate": 0.0006932828541968083, + "loss": 0.7406826, + "num_input_tokens_seen": 170211488, + "router_z_loss_mlp": 0.13769531, + "step": 2040, + "time_per_iteration": 4.913180589675903 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086444, + "balance_loss_mlp": 1.05072939, + "epoch": 0.3926510196229319, + "flos": 1345614474240.0, + "grad_norm": 0.055350347752650936, + "language_loss": 0.8453002, + "learning_rate": 0.0006929954931031422, + "loss": 0.85616457, + "num_input_tokens_seen": 170298528, + "router_z_loss_mlp": 0.35742188, + "step": 2041, + "time_per_iteration": 3.6796491146087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081967, + "balance_loss_mlp": 1.04722965, + "epoch": 0.39284340130819545, + "flos": 499333925376.0, + "grad_norm": 0.05434437059814268, + "language_loss": 0.88856936, + "learning_rate": 0.0006927080570819805, + "loss": 0.89938903, + "num_input_tokens_seen": 170365680, + "router_z_loss_mlp": 0.34790039, + "step": 2042, + "time_per_iteration": 2.634052038192749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087578, + "balance_loss_mlp": 1.05217266, + "epoch": 0.39303578299345904, + "flos": 520077625344.0, + "grad_norm": 0.06468620716873735, + "language_loss": 0.80649555, + "learning_rate": 0.0006924205462449161, + "loss": 0.81737131, + "num_input_tokens_seen": 170432224, + "router_z_loss_mlp": 0.35473633, + "step": 2043, + "time_per_iteration": 2.6021780967712402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078315, + "balance_loss_mlp": 1.04288566, + "epoch": 0.39322816467872257, + "flos": 907529301504.0, + "grad_norm": 0.05516365318311268, + "language_loss": 0.82013571, + "learning_rate": 0.0006921329607035702, + "loss": 0.83091891, + "num_input_tokens_seen": 170517920, + "router_z_loss_mlp": 0.35473633, + "step": 2044, + "time_per_iteration": 3.2195992469787598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078665, + "balance_loss_mlp": 1.04473805, + "epoch": 0.39342054636398616, + "flos": 517330603008.0, + "grad_norm": 0.046703626280748714, + "language_loss": 0.88374329, + "learning_rate": 0.0006918453005695938, + "loss": 0.89452994, + "num_input_tokens_seen": 170589072, + "router_z_loss_mlp": 0.33959961, + "step": 2045, + "time_per_iteration": 2.6319282054901123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080714, + "balance_loss_mlp": 1.04435515, + "epoch": 0.3936129280492497, + "flos": 547646621184.0, + "grad_norm": 0.04434497339872072, + "language_loss": 0.8422206, + "learning_rate": 0.0006915575659546662, + "loss": 0.8530277, + "num_input_tokens_seen": 170657856, + "router_z_loss_mlp": 0.36401367, + "step": 2046, + "time_per_iteration": 2.648895263671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081267, + "balance_loss_mlp": 1.04519427, + "epoch": 0.3938053097345133, + "flos": 525858269184.0, + "grad_norm": 0.0524234289418272, + "language_loss": 0.80648899, + "learning_rate": 0.0006912697569704959, + "loss": 0.81730163, + "num_input_tokens_seen": 170723696, + "router_z_loss_mlp": 0.36083984, + "step": 2047, + "time_per_iteration": 2.6330111026763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085798, + "balance_loss_mlp": 1.04934382, + "epoch": 0.39399769141977686, + "flos": 471390990336.0, + "grad_norm": 0.0542278175669412, + "language_loss": 0.86721706, + "learning_rate": 0.0006909818737288205, + "loss": 0.87807506, + "num_input_tokens_seen": 170789536, + "router_z_loss_mlp": 0.36450195, + "step": 2048, + "time_per_iteration": 2.537581205368042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090734, + "balance_loss_mlp": 1.05468488, + "epoch": 0.3941900731050404, + "flos": 501490220544.0, + "grad_norm": 0.056383256559611315, + "language_loss": 0.80660325, + "learning_rate": 0.000690693916341406, + "loss": 0.8175106, + "num_input_tokens_seen": 170859232, + "router_z_loss_mlp": 0.3605957, + "step": 2049, + "time_per_iteration": 2.622183084487915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096224, + "balance_loss_mlp": 1.05898309, + "epoch": 0.394382454790304, + "flos": 580577241600.0, + "grad_norm": 0.11468284139168772, + "language_loss": 0.82465422, + "learning_rate": 0.0006904058849200475, + "loss": 0.83561641, + "num_input_tokens_seen": 170931568, + "router_z_loss_mlp": 0.37255859, + "step": 2050, + "time_per_iteration": 2.7216436862945557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087551, + "balance_loss_mlp": 1.05088186, + "epoch": 0.3945748364755675, + "flos": 513563659776.0, + "grad_norm": 0.05187056217607278, + "language_loss": 0.84988606, + "learning_rate": 0.0006901177795765683, + "loss": 0.86076152, + "num_input_tokens_seen": 170999856, + "router_z_loss_mlp": 0.36694336, + "step": 2051, + "time_per_iteration": 2.5725293159484863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079371, + "balance_loss_mlp": 1.04291642, + "epoch": 0.3947672181608311, + "flos": 593683748352.0, + "grad_norm": 0.0518129521666432, + "language_loss": 0.8131091, + "learning_rate": 0.0006898296004228213, + "loss": 0.82390279, + "num_input_tokens_seen": 171072320, + "router_z_loss_mlp": 0.36450195, + "step": 2052, + "time_per_iteration": 2.6969447135925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050959, + "balance_loss_mlp": 1.0379895, + "epoch": 0.39495959984609463, + "flos": 1546829443584.0, + "grad_norm": 0.029989620736742544, + "language_loss": 0.7812674, + "learning_rate": 0.0006895413475706873, + "loss": 0.79177701, + "num_input_tokens_seen": 171304128, + "router_z_loss_mlp": 0.12988281, + "step": 2053, + "time_per_iteration": 4.8486199378967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078173, + "balance_loss_mlp": 1.04233825, + "epoch": 0.3951519815313582, + "flos": 496271190528.0, + "grad_norm": 0.06693197740080077, + "language_loss": 0.80026031, + "learning_rate": 0.0006892530211320763, + "loss": 0.81104207, + "num_input_tokens_seen": 171377392, + "router_z_loss_mlp": 0.35864258, + "step": 2054, + "time_per_iteration": 2.6731534004211426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082597, + "balance_loss_mlp": 1.04657161, + "epoch": 0.39534436321662175, + "flos": 530934704640.0, + "grad_norm": 0.06400198340094926, + "language_loss": 0.8367995, + "learning_rate": 0.000688964621218926, + "loss": 0.84762549, + "num_input_tokens_seen": 171447424, + "router_z_loss_mlp": 0.36010742, + "step": 2055, + "time_per_iteration": 2.623870611190796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107818, + "balance_loss_mlp": 1.0422982, + "epoch": 0.39553674490188534, + "flos": 702224017920.0, + "grad_norm": 0.05929287076568038, + "language_loss": 0.80154717, + "learning_rate": 0.0006886761479432037, + "loss": 0.81232893, + "num_input_tokens_seen": 171519920, + "router_z_loss_mlp": 0.35864258, + "step": 2056, + "time_per_iteration": 2.810593366622925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088441, + "balance_loss_mlp": 1.05263042, + "epoch": 0.3957291265871489, + "flos": 409552768512.0, + "grad_norm": 0.05784227470554994, + "language_loss": 0.84645867, + "learning_rate": 0.0006883876014169045, + "loss": 0.85734308, + "num_input_tokens_seen": 171583856, + "router_z_loss_mlp": 0.3581543, + "step": 2057, + "time_per_iteration": 2.464358329772949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088007, + "balance_loss_mlp": 1.05121863, + "epoch": 0.39592150827241246, + "flos": 618204566016.0, + "grad_norm": 0.05454135250908964, + "language_loss": 0.90161431, + "learning_rate": 0.000688098981752052, + "loss": 0.91249436, + "num_input_tokens_seen": 171656064, + "router_z_loss_mlp": 0.36791992, + "step": 2058, + "time_per_iteration": 2.742589235305786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094817, + "balance_loss_mlp": 1.05819607, + "epoch": 0.39611388995767605, + "flos": 820986969600.0, + "grad_norm": 0.05656267147709111, + "language_loss": 0.80105305, + "learning_rate": 0.0006878102890606982, + "loss": 0.81200117, + "num_input_tokens_seen": 171738800, + "router_z_loss_mlp": 0.36621094, + "step": 2059, + "time_per_iteration": 3.0725343227386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096787, + "balance_loss_mlp": 1.06018949, + "epoch": 0.3963062716429396, + "flos": 491977539072.0, + "grad_norm": 0.0710153527902746, + "language_loss": 0.81321216, + "learning_rate": 0.0006875215234549239, + "loss": 0.82418001, + "num_input_tokens_seen": 171803664, + "router_z_loss_mlp": 0.3659668, + "step": 2060, + "time_per_iteration": 2.542090654373169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097398, + "balance_loss_mlp": 1.06015694, + "epoch": 0.39649865332820317, + "flos": 584466430464.0, + "grad_norm": 0.08966956269211096, + "language_loss": 0.8554219, + "learning_rate": 0.0006872326850468376, + "loss": 0.86639589, + "num_input_tokens_seen": 171871968, + "router_z_loss_mlp": 0.37231445, + "step": 2061, + "time_per_iteration": 2.7194440364837646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010999, + "balance_loss_mlp": 1.06139588, + "epoch": 0.3966910350134667, + "flos": 458328153600.0, + "grad_norm": 0.05276871533818733, + "language_loss": 0.78609985, + "learning_rate": 0.0006869437739485762, + "loss": 0.79709888, + "num_input_tokens_seen": 171942368, + "router_z_loss_mlp": 0.38476562, + "step": 2062, + "time_per_iteration": 2.6032114028930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089079, + "balance_loss_mlp": 1.05281568, + "epoch": 0.3968834166987303, + "flos": 508388299776.0, + "grad_norm": 0.05735909750828215, + "language_loss": 0.93035084, + "learning_rate": 0.0006866547902723053, + "loss": 0.94124162, + "num_input_tokens_seen": 172012336, + "router_z_loss_mlp": 0.36279297, + "step": 2063, + "time_per_iteration": 2.666294813156128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097295, + "balance_loss_mlp": 1.06110287, + "epoch": 0.3970757983839938, + "flos": 572349321216.0, + "grad_norm": 0.05819495660266105, + "language_loss": 0.79961425, + "learning_rate": 0.000686365734130218, + "loss": 0.81058717, + "num_input_tokens_seen": 172084640, + "router_z_loss_mlp": 0.36206055, + "step": 2064, + "time_per_iteration": 2.667443037033081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093878, + "balance_loss_mlp": 1.05639839, + "epoch": 0.3972681800692574, + "flos": 481391092224.0, + "grad_norm": 0.051061390118103664, + "language_loss": 0.84029245, + "learning_rate": 0.000686076605634536, + "loss": 0.85123128, + "num_input_tokens_seen": 172152992, + "router_z_loss_mlp": 0.37475586, + "step": 2065, + "time_per_iteration": 2.605252981185913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091284, + "balance_loss_mlp": 1.05406737, + "epoch": 0.397460561754521, + "flos": 487683887616.0, + "grad_norm": 0.060621892107923396, + "language_loss": 0.84327424, + "learning_rate": 0.0006857874048975088, + "loss": 0.85418713, + "num_input_tokens_seen": 172219312, + "router_z_loss_mlp": 0.37207031, + "step": 2066, + "time_per_iteration": 2.5779786109924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090658, + "balance_loss_mlp": 1.05329823, + "epoch": 0.3976529434397845, + "flos": 421768802304.0, + "grad_norm": 0.05929679602335689, + "language_loss": 0.86975348, + "learning_rate": 0.0006854981320314142, + "loss": 0.88066006, + "num_input_tokens_seen": 172282112, + "router_z_loss_mlp": 0.37353516, + "step": 2067, + "time_per_iteration": 2.4723403453826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082754, + "balance_loss_mlp": 1.04591799, + "epoch": 0.3978453251250481, + "flos": 545331764736.0, + "grad_norm": 0.058605050617464606, + "language_loss": 0.86758339, + "learning_rate": 0.0006852087871485579, + "loss": 0.87841094, + "num_input_tokens_seen": 172347872, + "router_z_loss_mlp": 0.36816406, + "step": 2068, + "time_per_iteration": 2.6007602214813232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082239, + "balance_loss_mlp": 1.04602289, + "epoch": 0.39803770681031164, + "flos": 650548841472.0, + "grad_norm": 0.06821675645960798, + "language_loss": 0.81689966, + "learning_rate": 0.0006849193703612735, + "loss": 0.82772201, + "num_input_tokens_seen": 172418560, + "router_z_loss_mlp": 0.36206055, + "step": 2069, + "time_per_iteration": 2.7661337852478027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083569, + "balance_loss_mlp": 1.04661417, + "epoch": 0.39823008849557523, + "flos": 739734888960.0, + "grad_norm": 0.059947122372600754, + "language_loss": 0.77649361, + "learning_rate": 0.0006846298817819225, + "loss": 0.78732932, + "num_input_tokens_seen": 172497984, + "router_z_loss_mlp": 0.36987305, + "step": 2070, + "time_per_iteration": 2.9364843368530273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091151, + "balance_loss_mlp": 1.05436325, + "epoch": 0.39842247018083876, + "flos": 384825337344.0, + "grad_norm": 0.0736862776590319, + "language_loss": 0.80732799, + "learning_rate": 0.0006843403215228945, + "loss": 0.81823957, + "num_input_tokens_seen": 172560112, + "router_z_loss_mlp": 0.36767578, + "step": 2071, + "time_per_iteration": 2.4597537517547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078334, + "balance_loss_mlp": 1.04133189, + "epoch": 0.39861485186610235, + "flos": 533431443456.0, + "grad_norm": 0.052578385162892496, + "language_loss": 0.80366135, + "learning_rate": 0.0006840506896966065, + "loss": 0.81444472, + "num_input_tokens_seen": 172636192, + "router_z_loss_mlp": 0.36962891, + "step": 2072, + "time_per_iteration": 2.6826841831207275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081243, + "balance_loss_mlp": 1.04397774, + "epoch": 0.39880723355136594, + "flos": 642834482688.0, + "grad_norm": 0.055481383737447196, + "language_loss": 0.82090193, + "learning_rate": 0.0006837609864155038, + "loss": 0.83171439, + "num_input_tokens_seen": 172715264, + "router_z_loss_mlp": 0.37255859, + "step": 2073, + "time_per_iteration": 2.8541154861450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075844, + "balance_loss_mlp": 1.0408442, + "epoch": 0.39899961523662947, + "flos": 515587534848.0, + "grad_norm": 0.07686004257588779, + "language_loss": 0.83464944, + "learning_rate": 0.0006834712117920592, + "loss": 0.84540784, + "num_input_tokens_seen": 172783456, + "router_z_loss_mlp": 0.3503418, + "step": 2074, + "time_per_iteration": 2.6023800373077393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107731, + "balance_loss_mlp": 1.04025948, + "epoch": 0.39919199692189306, + "flos": 464148085248.0, + "grad_norm": 0.0625246810856132, + "language_loss": 0.85794407, + "learning_rate": 0.0006831813659387729, + "loss": 0.86871719, + "num_input_tokens_seen": 172848928, + "router_z_loss_mlp": 0.37036133, + "step": 2075, + "time_per_iteration": 2.5916242599487305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071695, + "balance_loss_mlp": 1.0353837, + "epoch": 0.3993843786071566, + "flos": 531382837248.0, + "grad_norm": 0.05588277312371317, + "language_loss": 0.84130096, + "learning_rate": 0.0006828914489681733, + "loss": 0.852018, + "num_input_tokens_seen": 172921152, + "router_z_loss_mlp": 0.36303711, + "step": 2076, + "time_per_iteration": 2.7014079093933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078543, + "balance_loss_mlp": 1.04168355, + "epoch": 0.3995767602924202, + "flos": 503701770240.0, + "grad_norm": 0.05616101270921505, + "language_loss": 0.85284638, + "learning_rate": 0.0006826014609928162, + "loss": 0.86363184, + "num_input_tokens_seen": 172998864, + "router_z_loss_mlp": 0.36816406, + "step": 2077, + "time_per_iteration": 2.6714975833892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089101, + "balance_loss_mlp": 1.07622623, + "epoch": 0.3997691419776837, + "flos": 1453780500480.0, + "grad_norm": 0.03492718818999835, + "language_loss": 0.83199388, + "learning_rate": 0.0006823114021252846, + "loss": 0.8428849, + "num_input_tokens_seen": 173219216, + "router_z_loss_mlp": 0.12890625, + "step": 2078, + "time_per_iteration": 4.8198041915893555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075675, + "balance_loss_mlp": 1.03943551, + "epoch": 0.3999615236629473, + "flos": 530418170880.0, + "grad_norm": 0.06060184252075253, + "language_loss": 0.79984158, + "learning_rate": 0.0006820212724781896, + "loss": 0.81059831, + "num_input_tokens_seen": 173292000, + "router_z_loss_mlp": 0.36279297, + "step": 2079, + "time_per_iteration": 2.725576400756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077383, + "balance_loss_mlp": 1.04209709, + "epoch": 0.4001539053482108, + "flos": 694823961600.0, + "grad_norm": 0.12722864956638674, + "language_loss": 0.83843565, + "learning_rate": 0.0006817310721641694, + "loss": 0.84920955, + "num_input_tokens_seen": 173365568, + "router_z_loss_mlp": 0.35302734, + "step": 2080, + "time_per_iteration": 2.808981418609619 + } + ], + "logging_steps": 1.0, + "max_steps": 5198, + "num_input_tokens_seen": 173365568, + "num_train_epochs": 1, + "save_steps": 1040, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4716113890902016.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/training_args.bin b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..459663e238ea62a90da439e633388cc1e16cedb6 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63f07a99639c8908760dc7ac65f4d34d749c1861fc4b5a1f91cbdcc73581ce9e +size 7992 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/zero_to_fp32.py b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/added_tokens.json b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/config.json b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/config.json new file mode 100644 index 0000000000000000000000000000000000000000..7929d4cdbe9bb7ee3537b93d161990a8caa422ec --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/config.json @@ -0,0 +1,200 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bal_comp_loss_coef": 0.01, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": false, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": false, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "smoe_sharev3", + "norm_softmax": false, + "normalization": false, + "num_attention_heads": 32, + "num_experts": 8, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 4, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": null, + "rate_compete": 0.2, + "rate_flip": 0.05, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": false, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": false, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/generation_config.json b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/global_step3120/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/global_step3120/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e07bf8a87c110aa91435803e61cad876af543f40 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/global_step3120/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:347a889f150a0ff9430f58c8122337c694fed870d8acc8e390b6f2c615db0ebc +size 396575120 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/global_step3120/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/global_step3120/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3111e287fee425238ad78cbacf588941d097de60 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/global_step3120/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c85dbfa39e98c0a030dbc91f8956813f102cb0492f5577b87a25aafb024184ab +size 396575120 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/global_step3120/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/global_step3120/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5c21ad53fc00ac95c9c79f2b5296b66e4aa01882 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/global_step3120/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c37df91d65a88e4ca5fba311ea3c6053122c9f65957fce9590fa522e6b8cb547 +size 396575120 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/global_step3120/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/global_step3120/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4e4aa9176ca20c742325878cb69395afd1dea6d2 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/global_step3120/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b71459fccc800e1639e63e1ab65ed48ffb2546bb0e31f16848280fb2dbe8c5ef +size 396575120 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/global_step3120/zero_pp_rank_0_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/global_step3120/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..748243ca6702afc48f4aba1bf247b1ed4593d396 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/global_step3120/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b980a2682d912eeb4bc1425e2695aaff0c01a401df2f8a39e3024802191c146 +size 2117321480 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/global_step3120/zero_pp_rank_1_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/global_step3120/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2c4651dadf863537eced63f65260454a4e0c9a14 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/global_step3120/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f48fcf007aea2e8bf4c61736c9e5273e3c3021ef2ab632d3732c5e7152125942 +size 2117321480 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/global_step3120/zero_pp_rank_2_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/global_step3120/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bbce7db42afac8077feef21bb0ab610fc9fea075 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/global_step3120/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e13f9cc1bcc98d732f0fa4e95323724a293fcdff8c08c4235192cc11705b7d32 +size 2117321480 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/global_step3120/zero_pp_rank_3_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/global_step3120/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..46f9507e20843ca44090df67a0a55a3f0733b42c --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/global_step3120/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a243b3a22b87dcdc8fb64b738bb9a59d44977b47783ce757494e59956988b2b7 +size 2117321480 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/latest b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/latest new file mode 100644 index 0000000000000000000000000000000000000000..804da059f781bacb3f274fb2103e4bc7f9bb7407 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/latest @@ -0,0 +1 @@ +global_step3120 \ No newline at end of file diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/model-00001-of-00002.safetensors b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29d76f5d80605301aab2bba59b53a5e2582094c4 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe6c4f6ef38e8993629091331e0bbf23484cc88bdfd038f0dd17b6ec2800d855 +size 4972489328 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/model-00002-of-00002.safetensors b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1c4f5bbaecb65f34cd8bac0aba19d7cdbc189ddc --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8cf4f3614d59c9f8fa5ad476511b09c2e357b8ad241ff60e3bd165b54edb7ce +size 3759020544 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/model.safetensors.index.json b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..90a56cf0153ed9062ff35ee2b372f7ab9e796c6a --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/model.safetensors.index.json @@ -0,0 +1,672 @@ +{ + "metadata": { + "total_size": 8731420128 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.mm_projector.layer_norm.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.layer_norm.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00002-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/rng_state_0.pth b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ef4849062bcdc8ffd2246c07673ba196a8d61a6d --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fae2114fffe9b1eea30e28bbdb4ce59046b0079ea5b8dc4682079f609d49d787 +size 14960 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/rng_state_1.pth b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..2fcb2b640bc236c26aa841680d34a91240247970 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4ff5f3a53530ac868291e2667c8f824bfa1f4fa1ce880df8223a7165ef38e11 +size 14960 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/rng_state_2.pth b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..00c3f989de00e6d58ca7345ae6f65fee0afcbdcd --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91f80a7779b0034e70106ba6cb0e3e686052334c20ce54453ee3977cc0219d15 +size 14960 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/rng_state_3.pth b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..f289913854ee3fa52a86e282421da07d85b8a4c4 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ece3bc0d0e16c43ef245cc787cbd0d63d08d460f489c4cd52adf6501b9281a18 +size 14960 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/special_tokens_map.json b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/tokenizer.model b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/tokenizer_config.json b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/trainer_state.json b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6830c0b539ef39615eaae12fcfb446c298ea3ab5 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/trainer_state.json @@ -0,0 +1,46833 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6002308580223162, + "eval_steps": 500, + "global_step": 3120, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03964023, + "balance_loss_mlp": 3.01339984, + "epoch": 0.00019238168526356292, + "flos": 470464353792.0, + "grad_norm": 27.10233905437441, + "language_loss": 3.72295761, + "learning_rate": 0.0, + "loss": 2.48840261, + "num_input_tokens_seen": 67104, + "router_z_loss_mlp": 9.5, + "step": 1, + "time_per_iteration": 29.606513023376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01906453, + "balance_loss_mlp": 1.25642872, + "epoch": 0.00038476337052712584, + "flos": 504311436288.0, + "grad_norm": 2.874173750989579, + "language_loss": 1.79264998, + "learning_rate": 0.00013726078121135892, + "loss": 1.81171465, + "num_input_tokens_seen": 134080, + "router_z_loss_mlp": 6.5, + "step": 2, + "time_per_iteration": 2.7078208923339844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01915611, + "balance_loss_mlp": 1.2667315, + "epoch": 0.0005771450557906887, + "flos": 598869282816.0, + "grad_norm": 2.1141296462778643, + "language_loss": 1.61429811, + "learning_rate": 0.00021755319103969496, + "loss": 1.63345432, + "num_input_tokens_seen": 205152, + "router_z_loss_mlp": 6.48828125, + "step": 3, + "time_per_iteration": 3.010409116744995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01917319, + "balance_loss_mlp": 1.26309848, + "epoch": 0.0007695267410542517, + "flos": 580133491200.0, + "grad_norm": 1.255159247360545, + "language_loss": 1.49202251, + "learning_rate": 0.00027452156242271784, + "loss": 1.51119578, + "num_input_tokens_seen": 269664, + "router_z_loss_mlp": 6.54296875, + "step": 4, + "time_per_iteration": 2.7161622047424316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0185163, + "balance_loss_mlp": 1.22144234, + "epoch": 0.0009619084263178145, + "flos": 485857861632.0, + "grad_norm": 4.267520959606063, + "language_loss": 1.57359505, + "learning_rate": 0.0003187096642208417, + "loss": 1.59211147, + "num_input_tokens_seen": 338560, + "router_z_loss_mlp": 6.296875, + "step": 5, + "time_per_iteration": 2.718417167663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01828185, + "balance_loss_mlp": 1.21211123, + "epoch": 0.0011542901115813775, + "flos": 559744791552.0, + "grad_norm": 1.225349312557607, + "language_loss": 1.4752574, + "learning_rate": 0.0003548139722510539, + "loss": 1.49353933, + "num_input_tokens_seen": 410112, + "router_z_loss_mlp": 6.15234375, + "step": 6, + "time_per_iteration": 2.6827874183654785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01821666, + "balance_loss_mlp": 1.22428453, + "epoch": 0.0013466717968449403, + "flos": 533721014784.0, + "grad_norm": 0.5025899606895544, + "language_loss": 1.33846116, + "learning_rate": 0.00038533972973918044, + "loss": 1.35667801, + "num_input_tokens_seen": 477552, + "router_z_loss_mlp": 5.96875, + "step": 7, + "time_per_iteration": 2.6889517307281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01776667, + "balance_loss_mlp": 1.2090404, + "epoch": 0.0015390534821085034, + "flos": 492037175808.0, + "grad_norm": 0.1719820928967348, + "language_loss": 1.2814672, + "learning_rate": 0.0004117823436340768, + "loss": 1.29923391, + "num_input_tokens_seen": 549184, + "router_z_loss_mlp": 5.6875, + "step": 8, + "time_per_iteration": 2.7207248210906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0177577, + "balance_loss_mlp": 1.23217535, + "epoch": 0.0017314351673720662, + "flos": 564402207744.0, + "grad_norm": 0.6128716609675008, + "language_loss": 1.39861906, + "learning_rate": 0.00043510638207938993, + "loss": 1.41637683, + "num_input_tokens_seen": 622880, + "router_z_loss_mlp": 5.44140625, + "step": 9, + "time_per_iteration": 2.887538194656372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01823371, + "balance_loss_mlp": 1.31334615, + "epoch": 0.001923816852635629, + "flos": 593132308992.0, + "grad_norm": 0.480897383035181, + "language_loss": 1.25963569, + "learning_rate": 0.00045597044543220066, + "loss": 1.27786922, + "num_input_tokens_seen": 693584, + "router_z_loss_mlp": 5.09765625, + "step": 10, + "time_per_iteration": 2.7672832012176514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01930298, + "balance_loss_mlp": 1.44621277, + "epoch": 0.002116198537899192, + "flos": 609308752896.0, + "grad_norm": 0.21803247425844502, + "language_loss": 1.22959518, + "learning_rate": 0.00047484428652143135, + "loss": 1.24889803, + "num_input_tokens_seen": 774432, + "router_z_loss_mlp": 4.83203125, + "step": 11, + "time_per_iteration": 2.9771082401275635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02130152, + "balance_loss_mlp": 1.67772901, + "epoch": 0.002308580223162755, + "flos": 544869075456.0, + "grad_norm": 0.19847359144835577, + "language_loss": 1.28057694, + "learning_rate": 0.0004920747534624128, + "loss": 1.30187845, + "num_input_tokens_seen": 844304, + "router_z_loss_mlp": 4.52734375, + "step": 12, + "time_per_iteration": 2.6094090938568115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02177014, + "balance_loss_mlp": 1.7512939, + "epoch": 0.002500961908426318, + "flos": 644458277376.0, + "grad_norm": 0.3126355826019607, + "language_loss": 1.29235363, + "learning_rate": 0.0005079252465375872, + "loss": 1.31412375, + "num_input_tokens_seen": 915104, + "router_z_loss_mlp": 4.265625, + "step": 13, + "time_per_iteration": 2.841792345046997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02141221, + "balance_loss_mlp": 1.74411082, + "epoch": 0.0026933435936898806, + "flos": 487605312000.0, + "grad_norm": 0.282411779716686, + "language_loss": 1.17459798, + "learning_rate": 0.0005226005109505393, + "loss": 1.19601011, + "num_input_tokens_seen": 982720, + "router_z_loss_mlp": 3.96875, + "step": 14, + "time_per_iteration": 2.597313165664673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02024541, + "balance_loss_mlp": 1.65890288, + "epoch": 0.0028857252789534437, + "flos": 434368949760.0, + "grad_norm": 0.2583476739022616, + "language_loss": 1.22957516, + "learning_rate": 0.0005362628552605367, + "loss": 1.24982059, + "num_input_tokens_seen": 1050528, + "router_z_loss_mlp": 3.65234375, + "step": 15, + "time_per_iteration": 2.6388704776763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01790575, + "balance_loss_mlp": 1.44687057, + "epoch": 0.0030781069642170067, + "flos": 596465676288.0, + "grad_norm": 0.18613747071639053, + "language_loss": 1.27631426, + "learning_rate": 0.0005490431248454357, + "loss": 1.29421997, + "num_input_tokens_seen": 1116512, + "router_z_loss_mlp": 3.44140625, + "step": 16, + "time_per_iteration": 2.708346128463745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01779165, + "balance_loss_mlp": 1.46941185, + "epoch": 0.0032704886494805694, + "flos": 1537360432128.0, + "grad_norm": 0.2733785965407311, + "language_loss": 0.75705111, + "learning_rate": 0.0005610483427624225, + "loss": 0.77484274, + "num_input_tokens_seen": 1351216, + "router_z_loss_mlp": 3.09375, + "step": 17, + "time_per_iteration": 6.916250705718994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01553778, + "balance_loss_mlp": 1.24955583, + "epoch": 0.0034628703347441324, + "flos": 473720403456.0, + "grad_norm": 0.11658431553946913, + "language_loss": 1.14468098, + "learning_rate": 0.0005723671632907488, + "loss": 1.16021872, + "num_input_tokens_seen": 1420512, + "router_z_loss_mlp": 3.03710938, + "step": 18, + "time_per_iteration": 2.7716212272644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01490625, + "balance_loss_mlp": 1.21005416, + "epoch": 0.0036552520200076955, + "flos": 448303320576.0, + "grad_norm": 0.11552730485963776, + "language_loss": 1.19723654, + "learning_rate": 0.0005830738490244919, + "loss": 1.21214283, + "num_input_tokens_seen": 1484976, + "router_z_loss_mlp": 2.80859375, + "step": 19, + "time_per_iteration": 2.6067557334899902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0141948, + "balance_loss_mlp": 1.16103387, + "epoch": 0.003847633705271258, + "flos": 635881148928.0, + "grad_norm": 0.11977740619668105, + "language_loss": 1.21676993, + "learning_rate": 0.0005932312266435596, + "loss": 1.23096466, + "num_input_tokens_seen": 1557392, + "router_z_loss_mlp": 2.58398438, + "step": 20, + "time_per_iteration": 2.8545703887939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01364308, + "balance_loss_mlp": 1.13084817, + "epoch": 0.004040015390534821, + "flos": 589222771200.0, + "grad_norm": 0.09935322828728523, + "language_loss": 1.16681409, + "learning_rate": 0.0006028929207788754, + "loss": 1.18045723, + "num_input_tokens_seen": 1626064, + "router_z_loss_mlp": 2.33203125, + "step": 21, + "time_per_iteration": 2.7119524478912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319718, + "balance_loss_mlp": 1.11038613, + "epoch": 0.004232397075798384, + "flos": 756253338624.0, + "grad_norm": 0.09023283304690737, + "language_loss": 1.20250762, + "learning_rate": 0.0006121050677327902, + "loss": 1.21570492, + "num_input_tokens_seen": 1696528, + "router_z_loss_mlp": 2.09667969, + "step": 22, + "time_per_iteration": 2.884739398956299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01304467, + "balance_loss_mlp": 1.1184051, + "epoch": 0.004424778761061947, + "flos": 526434439680.0, + "grad_norm": 0.08559602389751407, + "language_loss": 1.10067201, + "learning_rate": 0.0006209076479463684, + "loss": 1.1137166, + "num_input_tokens_seen": 1765936, + "router_z_loss_mlp": 1.85839844, + "step": 23, + "time_per_iteration": 2.6616718769073486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01275434, + "balance_loss_mlp": 1.10787356, + "epoch": 0.00461716044632551, + "flos": 547907079168.0, + "grad_norm": 0.07141137445072718, + "language_loss": 1.2012924, + "learning_rate": 0.0006293355346737718, + "loss": 1.21404672, + "num_input_tokens_seen": 1841632, + "router_z_loss_mlp": 1.67675781, + "step": 24, + "time_per_iteration": 2.7025952339172363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252583, + "balance_loss_mlp": 1.10476315, + "epoch": 0.004809542131589073, + "flos": 567293234688.0, + "grad_norm": 0.08524381015789384, + "language_loss": 1.16738653, + "learning_rate": 0.0006374193284416834, + "loss": 1.17991233, + "num_input_tokens_seen": 1920256, + "router_z_loss_mlp": 1.47753906, + "step": 25, + "time_per_iteration": 2.827439069747925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223638, + "balance_loss_mlp": 1.0984205, + "epoch": 0.005001923816852636, + "flos": 470391418368.0, + "grad_norm": 0.08512374611478205, + "language_loss": 1.15399337, + "learning_rate": 0.0006451860277489461, + "loss": 1.16622972, + "num_input_tokens_seen": 1986528, + "router_z_loss_mlp": 1.25097656, + "step": 26, + "time_per_iteration": 2.6214864253997803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206141, + "balance_loss_mlp": 1.10009253, + "epoch": 0.005194305502116198, + "flos": 415283950080.0, + "grad_norm": 0.07774032731783902, + "language_loss": 1.23061514, + "learning_rate": 0.0006526595731190848, + "loss": 1.2426765, + "num_input_tokens_seen": 2048016, + "router_z_loss_mlp": 1.0625, + "step": 27, + "time_per_iteration": 2.5637125968933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117904, + "balance_loss_mlp": 1.09192181, + "epoch": 0.005386687187379761, + "flos": 628466535936.0, + "grad_norm": 0.05524077436438855, + "language_loss": 1.1626848, + "learning_rate": 0.0006598612921618983, + "loss": 1.17447519, + "num_input_tokens_seen": 2127664, + "router_z_loss_mlp": 0.87158203, + "step": 28, + "time_per_iteration": 2.8202784061431885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159441, + "balance_loss_mlp": 1.08772469, + "epoch": 0.005579068872643324, + "flos": 886100332032.0, + "grad_norm": 0.07386109802626846, + "language_loss": 1.08505416, + "learning_rate": 0.0006668102665011454, + "loss": 1.09664845, + "num_input_tokens_seen": 2213952, + "router_z_loss_mlp": 0.71728516, + "step": 29, + "time_per_iteration": 3.2254040241241455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154142, + "balance_loss_mlp": 1.09520459, + "epoch": 0.005771450557906887, + "flos": 547287238656.0, + "grad_norm": 0.0797557646441396, + "language_loss": 1.18077409, + "learning_rate": 0.0006735236364718957, + "loss": 1.19231534, + "num_input_tokens_seen": 2284736, + "router_z_loss_mlp": 0.58886719, + "step": 30, + "time_per_iteration": 2.6730945110321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140737, + "balance_loss_mlp": 1.09384, + "epoch": 0.00596383224317045, + "flos": 531766950912.0, + "grad_norm": 0.060827451674393726, + "language_loss": 1.1687839, + "learning_rate": 0.0006800168558381346, + "loss": 1.18019128, + "num_input_tokens_seen": 2354384, + "router_z_loss_mlp": 0.46875, + "step": 31, + "time_per_iteration": 2.649216651916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148736, + "balance_loss_mlp": 1.11166239, + "epoch": 0.0061562139284340135, + "flos": 588813926400.0, + "grad_norm": 0.10592463777190406, + "language_loss": 1.19211543, + "learning_rate": 0.0006863039060567947, + "loss": 1.20360279, + "num_input_tokens_seen": 2419440, + "router_z_loss_mlp": 0.37084961, + "step": 32, + "time_per_iteration": 2.6697018146514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132499, + "balance_loss_mlp": 1.10136151, + "epoch": 0.006348595613697576, + "flos": 617929551360.0, + "grad_norm": 0.09812744917576391, + "language_loss": 1.1217525, + "learning_rate": 0.0006923974775611263, + "loss": 1.13307738, + "num_input_tokens_seen": 2496368, + "router_z_loss_mlp": 0.3112793, + "step": 33, + "time_per_iteration": 2.770225763320923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137532, + "balance_loss_mlp": 1.11146092, + "epoch": 0.006540977298961139, + "flos": 777564444672.0, + "grad_norm": 0.06513543096417564, + "language_loss": 1.08375585, + "learning_rate": 0.0006983091239737814, + "loss": 1.09513116, + "num_input_tokens_seen": 2573280, + "router_z_loss_mlp": 0.26086426, + "step": 34, + "time_per_iteration": 2.99418306350708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128276, + "balance_loss_mlp": 1.10578084, + "epoch": 0.006733358984224702, + "flos": 666837356544.0, + "grad_norm": 0.06344935516817307, + "language_loss": 1.07062221, + "learning_rate": 0.0007040493939600222, + "loss": 1.08190489, + "num_input_tokens_seen": 2647248, + "router_z_loss_mlp": 0.22497559, + "step": 35, + "time_per_iteration": 2.9126057624816895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119708, + "balance_loss_mlp": 1.09892988, + "epoch": 0.006925740669488265, + "flos": 564092287488.0, + "grad_norm": 0.06579143759664555, + "language_loss": 1.07960629, + "learning_rate": 0.0007096279445021078, + "loss": 1.09080338, + "num_input_tokens_seen": 2720736, + "router_z_loss_mlp": 0.20788574, + "step": 36, + "time_per_iteration": 2.7079102993011475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114855, + "balance_loss_mlp": 1.09574544, + "epoch": 0.007118122354751828, + "flos": 549583156224.0, + "grad_norm": 0.14799474820221378, + "language_loss": 1.14634764, + "learning_rate": 0.0007150536386503726, + "loss": 1.15749621, + "num_input_tokens_seen": 2800336, + "router_z_loss_mlp": 0.19104004, + "step": 37, + "time_per_iteration": 2.8290467262268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104011, + "balance_loss_mlp": 1.08533084, + "epoch": 0.007310504040015391, + "flos": 702161409024.0, + "grad_norm": 0.2513092385422617, + "language_loss": 1.08396375, + "learning_rate": 0.0007203346302358509, + "loss": 1.09500384, + "num_input_tokens_seen": 2883184, + "router_z_loss_mlp": 0.18688965, + "step": 38, + "time_per_iteration": 2.961430311203003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121274, + "balance_loss_mlp": 1.10231924, + "epoch": 0.007502885725278953, + "flos": 599022051840.0, + "grad_norm": 0.0999674626629785, + "language_loss": 1.11391926, + "learning_rate": 0.000725478437577282, + "loss": 1.12513208, + "num_input_tokens_seen": 2960736, + "router_z_loss_mlp": 0.18945312, + "step": 39, + "time_per_iteration": 2.742088556289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146989, + "balance_loss_mlp": 1.12810588, + "epoch": 0.007695267410542516, + "flos": 560000867328.0, + "grad_norm": 0.3323772184023467, + "language_loss": 1.08355689, + "learning_rate": 0.0007304920078549186, + "loss": 1.09502685, + "num_input_tokens_seen": 3033472, + "router_z_loss_mlp": 0.18884277, + "step": 40, + "time_per_iteration": 2.66943621635437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116486, + "balance_loss_mlp": 1.1452378, + "epoch": 0.007887649095806078, + "flos": 507906671616.0, + "grad_norm": 0.11539272036457353, + "language_loss": 1.09356606, + "learning_rate": 0.0007353817735343603, + "loss": 1.1052146, + "num_input_tokens_seen": 3107824, + "router_z_loss_mlp": 0.19604492, + "step": 41, + "time_per_iteration": 2.7052595615386963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132998, + "balance_loss_mlp": 1.11293542, + "epoch": 0.008080030781069641, + "flos": 503642133504.0, + "grad_norm": 0.12251683576194117, + "language_loss": 1.04851842, + "learning_rate": 0.0007401537019902344, + "loss": 1.05984843, + "num_input_tokens_seen": 3176528, + "router_z_loss_mlp": 0.20056152, + "step": 42, + "time_per_iteration": 2.590432643890381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124507, + "balance_loss_mlp": 1.10198867, + "epoch": 0.008272412466333205, + "flos": 517764178944.0, + "grad_norm": 0.09393858903586973, + "language_loss": 1.08539796, + "learning_rate": 0.0007448133392900729, + "loss": 1.09664297, + "num_input_tokens_seen": 3254256, + "router_z_loss_mlp": 0.22521973, + "step": 43, + "time_per_iteration": 2.6619081497192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112544, + "balance_loss_mlp": 1.10156202, + "epoch": 0.008464794151596768, + "flos": 607673373696.0, + "grad_norm": 0.06822323064374927, + "language_loss": 1.03845203, + "learning_rate": 0.0007493658489441491, + "loss": 1.04970646, + "num_input_tokens_seen": 3340224, + "router_z_loss_mlp": 0.23864746, + "step": 44, + "time_per_iteration": 2.861008644104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128905, + "balance_loss_mlp": 1.10477662, + "epoch": 0.00865717583686033, + "flos": 537661075968.0, + "grad_norm": 0.1413166066405165, + "language_loss": 1.08820629, + "learning_rate": 0.0007538160463002316, + "loss": 1.09949529, + "num_input_tokens_seen": 3409216, + "router_z_loss_mlp": 0.2409668, + "step": 45, + "time_per_iteration": 2.643458604812622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115676, + "balance_loss_mlp": 1.13258433, + "epoch": 0.008849557522123894, + "flos": 507758284800.0, + "grad_norm": 0.08570115972640321, + "language_loss": 1.10720444, + "learning_rate": 0.0007581684291577274, + "loss": 1.11877203, + "num_input_tokens_seen": 3478352, + "router_z_loss_mlp": 0.24157715, + "step": 46, + "time_per_iteration": 2.5904788970947266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145761, + "balance_loss_mlp": 1.12085772, + "epoch": 0.009041939207387457, + "flos": 625048800768.0, + "grad_norm": 0.06636849455276843, + "language_loss": 1.14156199, + "learning_rate": 0.0007624272050891776, + "loss": 1.15301955, + "num_input_tokens_seen": 3555616, + "router_z_loss_mlp": 0.24902344, + "step": 47, + "time_per_iteration": 2.782179594039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154374, + "balance_loss_mlp": 1.12759995, + "epoch": 0.00923432089265102, + "flos": 549124849152.0, + "grad_norm": 0.09356522507451794, + "language_loss": 1.04615343, + "learning_rate": 0.0007665963158851307, + "loss": 1.05769718, + "num_input_tokens_seen": 3634512, + "router_z_loss_mlp": 0.26806641, + "step": 48, + "time_per_iteration": 2.824540138244629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174661, + "balance_loss_mlp": 1.14738548, + "epoch": 0.009426702577914583, + "flos": 562202242560.0, + "grad_norm": 0.059100241584136314, + "language_loss": 1.12381458, + "learning_rate": 0.0007706794594783609, + "loss": 1.13556111, + "num_input_tokens_seen": 3708480, + "router_z_loss_mlp": 0.27270508, + "step": 49, + "time_per_iteration": 2.790757894515991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192673, + "balance_loss_mlp": 1.16604137, + "epoch": 0.009619084263178146, + "flos": 616486228992.0, + "grad_norm": 0.08074806779925832, + "language_loss": 1.11280799, + "learning_rate": 0.0007746801096530423, + "loss": 1.12473488, + "num_input_tokens_seen": 3783472, + "router_z_loss_mlp": 0.2668457, + "step": 50, + "time_per_iteration": 2.7235305309295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116178, + "balance_loss_mlp": 1.135149, + "epoch": 0.009811465948441709, + "flos": 541176325632.0, + "grad_norm": 0.06558886342971224, + "language_loss": 1.16576111, + "learning_rate": 0.0007786015338021173, + "loss": 1.17737889, + "num_input_tokens_seen": 3851360, + "router_z_loss_mlp": 0.26672363, + "step": 51, + "time_per_iteration": 2.6817519664764404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134628, + "balance_loss_mlp": 1.1085453, + "epoch": 0.010003847633705272, + "flos": 535608087552.0, + "grad_norm": 0.06210449580458492, + "language_loss": 1.08870959, + "learning_rate": 0.0007824468089603051, + "loss": 1.10005593, + "num_input_tokens_seen": 3923056, + "router_z_loss_mlp": 0.26098633, + "step": 52, + "time_per_iteration": 2.644577980041504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125522, + "balance_loss_mlp": 1.09910512, + "epoch": 0.010196229318968833, + "flos": 908867907072.0, + "grad_norm": 0.05864822926220488, + "language_loss": 1.07807887, + "learning_rate": 0.0007862188363098669, + "loss": 1.08933413, + "num_input_tokens_seen": 4004528, + "router_z_loss_mlp": 0.26428223, + "step": 53, + "time_per_iteration": 3.1450047492980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126237, + "balance_loss_mlp": 1.10084558, + "epoch": 0.010388611004232396, + "flos": 585594040320.0, + "grad_norm": 0.07974065634267835, + "language_loss": 1.08295977, + "learning_rate": 0.0007899203543304438, + "loss": 1.09422219, + "num_input_tokens_seen": 4078704, + "router_z_loss_mlp": 0.25390625, + "step": 54, + "time_per_iteration": 2.6822280883789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155972, + "balance_loss_mlp": 1.13315582, + "epoch": 0.01058099268949596, + "flos": 502233716736.0, + "grad_norm": 0.07014139109577967, + "language_loss": 1.22212756, + "learning_rate": 0.0007935539507422731, + "loss": 1.23368728, + "num_input_tokens_seen": 4143600, + "router_z_loss_mlp": 0.22814941, + "step": 55, + "time_per_iteration": 2.5841405391693115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117516, + "balance_loss_mlp": 1.153512, + "epoch": 0.010773374374759523, + "flos": 544170659328.0, + "grad_norm": 0.07006342440897594, + "language_loss": 1.13914931, + "learning_rate": 0.0007971220733732573, + "loss": 1.15090084, + "num_input_tokens_seen": 4217904, + "router_z_loss_mlp": 0.21643066, + "step": 56, + "time_per_iteration": 2.697427988052368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193099, + "balance_loss_mlp": 1.17267895, + "epoch": 0.010965756060023086, + "flos": 525874235904.0, + "grad_norm": 0.08125896119424647, + "language_loss": 1.0764755, + "learning_rate": 0.0008006270400641869, + "loss": 1.08840656, + "num_input_tokens_seen": 4293920, + "router_z_loss_mlp": 0.2043457, + "step": 57, + "time_per_iteration": 2.723154306411743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174019, + "balance_loss_mlp": 1.15412247, + "epoch": 0.011158137745286649, + "flos": 576653147136.0, + "grad_norm": 0.07485866075688756, + "language_loss": 1.09104013, + "learning_rate": 0.0008040710477125043, + "loss": 1.10278034, + "num_input_tokens_seen": 4370080, + "router_z_loss_mlp": 0.19897461, + "step": 58, + "time_per_iteration": 2.703186273574829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153983, + "balance_loss_mlp": 1.13440859, + "epoch": 0.011350519430550212, + "flos": 529024310784.0, + "grad_norm": 0.06764829366941465, + "language_loss": 1.09780312, + "learning_rate": 0.0008074561805429771, + "loss": 1.10934305, + "num_input_tokens_seen": 4439792, + "router_z_loss_mlp": 0.19567871, + "step": 59, + "time_per_iteration": 2.6111674308776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136624, + "balance_loss_mlp": 1.11676335, + "epoch": 0.011542901115813775, + "flos": 555608291328.0, + "grad_norm": 0.06986870516034673, + "language_loss": 1.08079648, + "learning_rate": 0.0008107844176832545, + "loss": 1.09216261, + "num_input_tokens_seen": 4510800, + "router_z_loss_mlp": 0.19848633, + "step": 60, + "time_per_iteration": 2.682687997817993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125651, + "balance_loss_mlp": 1.1056236, + "epoch": 0.011735282801077338, + "flos": 571826995200.0, + "grad_norm": 0.061548073586970495, + "language_loss": 1.09071934, + "learning_rate": 0.0008140576401132568, + "loss": 1.10197592, + "num_input_tokens_seen": 4581136, + "router_z_loss_mlp": 0.20019531, + "step": 61, + "time_per_iteration": 2.639394760131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111743, + "balance_loss_mlp": 1.09838021, + "epoch": 0.0119276644863409, + "flos": 615309156864.0, + "grad_norm": 0.06273761556608791, + "language_loss": 1.10558033, + "learning_rate": 0.0008172776370494935, + "loss": 1.11675453, + "num_input_tokens_seen": 4650352, + "router_z_loss_mlp": 0.19030762, + "step": 62, + "time_per_iteration": 2.7110230922698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134294, + "balance_loss_mlp": 1.11483955, + "epoch": 0.012120046171604464, + "flos": 500835474432.0, + "grad_norm": 0.07391589684249159, + "language_loss": 1.17346644, + "learning_rate": 0.0008204461118185703, + "loss": 1.18480933, + "num_input_tokens_seen": 4716336, + "router_z_loss_mlp": 0.19445801, + "step": 63, + "time_per_iteration": 2.5490689277648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142708, + "balance_loss_mlp": 1.12420678, + "epoch": 0.012312427856868027, + "flos": 473109327360.0, + "grad_norm": 0.05825974543220343, + "language_loss": 1.06081367, + "learning_rate": 0.0008235646872681536, + "loss": 1.07224083, + "num_input_tokens_seen": 4781648, + "router_z_loss_mlp": 0.18505859, + "step": 64, + "time_per_iteration": 2.5874247550964355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139504, + "balance_loss_mlp": 1.12069249, + "epoch": 0.012504809542131588, + "flos": 538094651904.0, + "grad_norm": 0.1040066778051144, + "language_loss": 1.06503749, + "learning_rate": 0.0008266349107584288, + "loss": 1.07643247, + "num_input_tokens_seen": 4852320, + "router_z_loss_mlp": 0.18823242, + "step": 65, + "time_per_iteration": 2.678736925125122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123492, + "balance_loss_mlp": 1.10500288, + "epoch": 0.012697191227395151, + "flos": 608450365440.0, + "grad_norm": 0.09066354406474254, + "language_loss": 1.09410381, + "learning_rate": 0.0008296582587724851, + "loss": 1.10533869, + "num_input_tokens_seen": 4922016, + "router_z_loss_mlp": 0.18481445, + "step": 66, + "time_per_iteration": 2.6937255859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121105, + "balance_loss_mlp": 1.10255599, + "epoch": 0.012889572912658714, + "flos": 767750607360.0, + "grad_norm": 0.11790618145169461, + "language_loss": 1.07982886, + "learning_rate": 0.0008326361411800136, + "loss": 1.0910399, + "num_input_tokens_seen": 5000128, + "router_z_loss_mlp": 0.1854248, + "step": 67, + "time_per_iteration": 2.9377663135528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096346, + "balance_loss_mlp": 1.07871521, + "epoch": 0.013081954597922277, + "flos": 533604561408.0, + "grad_norm": 0.09153807632987658, + "language_loss": 1.08335972, + "learning_rate": 0.0008355699051851403, + "loss": 1.09432316, + "num_input_tokens_seen": 5074512, + "router_z_loss_mlp": 0.17651367, + "step": 68, + "time_per_iteration": 2.7278473377227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104023, + "balance_loss_mlp": 1.0865227, + "epoch": 0.01327433628318584, + "flos": 572826567168.0, + "grad_norm": 0.08317322449907456, + "language_loss": 1.14837921, + "learning_rate": 0.0008384608389860635, + "loss": 1.15941942, + "num_input_tokens_seen": 5141856, + "router_z_loss_mlp": 0.1751709, + "step": 69, + "time_per_iteration": 2.7238211631774902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111418, + "balance_loss_mlp": 1.09424019, + "epoch": 0.013466717968449404, + "flos": 497029243392.0, + "grad_norm": 0.08213812906773327, + "language_loss": 1.04970825, + "learning_rate": 0.000841310175171381, + "loss": 1.06082237, + "num_input_tokens_seen": 5209280, + "router_z_loss_mlp": 0.17199707, + "step": 70, + "time_per_iteration": 2.578726291656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101987, + "balance_loss_mlp": 1.08526158, + "epoch": 0.013659099653712967, + "flos": 565234454016.0, + "grad_norm": 0.06358988870017376, + "language_loss": 1.03380442, + "learning_rate": 0.000844119093875517, + "loss": 1.04482436, + "num_input_tokens_seen": 5285424, + "router_z_loss_mlp": 0.16723633, + "step": 71, + "time_per_iteration": 2.692791223526001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103533, + "balance_loss_mlp": 1.08689094, + "epoch": 0.01385148133897653, + "flos": 573540950016.0, + "grad_norm": 0.07461407963015444, + "language_loss": 1.08098376, + "learning_rate": 0.0008468887257134666, + "loss": 1.09201908, + "num_input_tokens_seen": 5358624, + "router_z_loss_mlp": 0.16650391, + "step": 72, + "time_per_iteration": 2.6599459648132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122889, + "balance_loss_mlp": 1.10587776, + "epoch": 0.014043863024240093, + "flos": 576539665920.0, + "grad_norm": 0.05931650266846123, + "language_loss": 1.10316896, + "learning_rate": 0.0008496201545131264, + "loss": 1.11439776, + "num_input_tokens_seen": 5429792, + "router_z_loss_mlp": 0.17028809, + "step": 73, + "time_per_iteration": 2.7093684673309326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126213, + "balance_loss_mlp": 1.10950017, + "epoch": 0.014236244709503656, + "flos": 938287660032.0, + "grad_norm": 0.060718352480344094, + "language_loss": 1.08902812, + "learning_rate": 0.0008523144198617317, + "loss": 1.1002903, + "num_input_tokens_seen": 5518608, + "router_z_loss_mlp": 0.16711426, + "step": 74, + "time_per_iteration": 3.1743276119232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125614, + "balance_loss_mlp": 1.10876918, + "epoch": 0.014428626394767219, + "flos": 528231352320.0, + "grad_norm": 0.07198154728214846, + "language_loss": 1.08249164, + "learning_rate": 0.0008549725194813783, + "loss": 1.09374774, + "num_input_tokens_seen": 5590576, + "router_z_loss_mlp": 0.1685791, + "step": 75, + "time_per_iteration": 2.630387783050537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106727, + "balance_loss_mlp": 1.09047866, + "epoch": 0.014621008080030782, + "flos": 803371433472.0, + "grad_norm": 0.07553700512989577, + "language_loss": 1.06998253, + "learning_rate": 0.0008575954114472099, + "loss": 1.0810498, + "num_input_tokens_seen": 5674224, + "router_z_loss_mlp": 0.16247559, + "step": 76, + "time_per_iteration": 3.134385347366333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109532, + "balance_loss_mlp": 1.0933075, + "epoch": 0.014813389765294343, + "flos": 696588788736.0, + "grad_norm": 0.053440596513601155, + "language_loss": 1.05069363, + "learning_rate": 0.0008601840162606118, + "loss": 1.06178904, + "num_input_tokens_seen": 5757648, + "router_z_loss_mlp": 0.16223145, + "step": 77, + "time_per_iteration": 3.039991855621338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123171, + "balance_loss_mlp": 1.10660076, + "epoch": 0.015005771450557906, + "flos": 596702813184.0, + "grad_norm": 0.07894951514499118, + "language_loss": 1.1143651, + "learning_rate": 0.000862739218788641, + "loss": 1.12559676, + "num_input_tokens_seen": 5837600, + "router_z_loss_mlp": 0.16577148, + "step": 78, + "time_per_iteration": 2.867741346359253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113228, + "balance_loss_mlp": 1.11553121, + "epoch": 0.01519815313582147, + "flos": 549148170240.0, + "grad_norm": 0.0893413961860561, + "language_loss": 1.07743871, + "learning_rate": 0.0008652618700799138, + "loss": 1.08876157, + "num_input_tokens_seen": 5907248, + "router_z_loss_mlp": 0.16760254, + "step": 79, + "time_per_iteration": 2.675795555114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160152, + "balance_loss_mlp": 1.14348662, + "epoch": 0.015390534821085032, + "flos": 430306642944.0, + "grad_norm": 0.06679936706529424, + "language_loss": 1.07125092, + "learning_rate": 0.0008677527890662774, + "loss": 1.08285248, + "num_input_tokens_seen": 5970864, + "router_z_loss_mlp": 0.16662598, + "step": 80, + "time_per_iteration": 2.4765963554382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196819, + "balance_loss_mlp": 1.17889023, + "epoch": 0.015582916506348595, + "flos": 523854743040.0, + "grad_norm": 0.12362960542988827, + "language_loss": 1.09903598, + "learning_rate": 0.0008702127641587799, + "loss": 1.11100423, + "num_input_tokens_seen": 6040800, + "router_z_loss_mlp": 0.17932129, + "step": 81, + "time_per_iteration": 2.636688470840454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180455, + "balance_loss_mlp": 1.16288388, + "epoch": 0.015775298191612157, + "flos": 575151598080.0, + "grad_norm": 0.08274533442322421, + "language_loss": 1.04032063, + "learning_rate": 0.0008726425547457192, + "loss": 1.05212522, + "num_input_tokens_seen": 6111840, + "router_z_loss_mlp": 0.17565918, + "step": 82, + "time_per_iteration": 2.765179395675659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157804, + "balance_loss_mlp": 1.14051914, + "epoch": 0.01596767987687572, + "flos": 610040664576.0, + "grad_norm": 0.07618339381967684, + "language_loss": 1.03921247, + "learning_rate": 0.0008750428925998964, + "loss": 1.05079055, + "num_input_tokens_seen": 6183872, + "router_z_loss_mlp": 0.1730957, + "step": 83, + "time_per_iteration": 2.7615418434143066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159673, + "balance_loss_mlp": 1.14280462, + "epoch": 0.016160061562139283, + "flos": 566864040960.0, + "grad_norm": 0.0706757922791228, + "language_loss": 1.09743476, + "learning_rate": 0.0008774144832015932, + "loss": 1.10903156, + "num_input_tokens_seen": 6255760, + "router_z_loss_mlp": 0.16882324, + "step": 84, + "time_per_iteration": 2.694364070892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01699218, + "balance_loss_mlp": 1.68252861, + "epoch": 0.016352443247402846, + "flos": 1410557234688.0, + "grad_norm": 0.23342967148410274, + "language_loss": 0.74774313, + "learning_rate": 0.0008797580069832641, + "loss": 0.76473522, + "num_input_tokens_seen": 6472960, + "router_z_loss_mlp": 0.16699219, + "step": 85, + "time_per_iteration": 4.599137306213379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212855, + "balance_loss_mlp": 1.19580793, + "epoch": 0.01654482493266641, + "flos": 730177127424.0, + "grad_norm": 0.09253845479208671, + "language_loss": 1.04518116, + "learning_rate": 0.0008820741205014318, + "loss": 1.05730963, + "num_input_tokens_seen": 6548912, + "router_z_loss_mlp": 0.1706543, + "step": 86, + "time_per_iteration": 2.8595266342163086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01246652, + "balance_loss_mlp": 1.22939014, + "epoch": 0.016737206617929972, + "flos": 536016932352.0, + "grad_norm": 0.10044068584300966, + "language_loss": 1.06437612, + "learning_rate": 0.0008843634575408404, + "loss": 1.07684278, + "num_input_tokens_seen": 6621520, + "router_z_loss_mlp": 0.17248535, + "step": 87, + "time_per_iteration": 2.690492630004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215448, + "balance_loss_mlp": 1.19887805, + "epoch": 0.016929588303193535, + "flos": 536706584064.0, + "grad_norm": 0.0661610487366718, + "language_loss": 1.07674646, + "learning_rate": 0.0008866266301555082, + "loss": 1.08890104, + "num_input_tokens_seen": 6698432, + "router_z_loss_mlp": 0.16577148, + "step": 88, + "time_per_iteration": 2.737339496612549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203027, + "balance_loss_mlp": 1.18706512, + "epoch": 0.017121969988457098, + "flos": 526498458624.0, + "grad_norm": 0.07897226836222233, + "language_loss": 1.08543992, + "learning_rate": 0.0008888642296509615, + "loss": 1.09747016, + "num_input_tokens_seen": 6764336, + "router_z_loss_mlp": 0.1595459, + "step": 89, + "time_per_iteration": 2.576819658279419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187655, + "balance_loss_mlp": 1.17131162, + "epoch": 0.01731435167372066, + "flos": 625304876544.0, + "grad_norm": 0.0740353605135553, + "language_loss": 1.13367987, + "learning_rate": 0.0008910768275115906, + "loss": 1.14555645, + "num_input_tokens_seen": 6839392, + "router_z_loss_mlp": 0.16345215, + "step": 90, + "time_per_iteration": 2.778571128845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173459, + "balance_loss_mlp": 1.15692425, + "epoch": 0.017506733358984224, + "flos": 496157709312.0, + "grad_norm": 0.07518713147028631, + "language_loss": 1.08794332, + "learning_rate": 0.0008932649762767675, + "loss": 1.0996778, + "num_input_tokens_seen": 6907344, + "router_z_loss_mlp": 0.16540527, + "step": 91, + "time_per_iteration": 2.5931665897369385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185544, + "balance_loss_mlp": 1.16881919, + "epoch": 0.017699115044247787, + "flos": 745613047296.0, + "grad_norm": 0.07711429280558382, + "language_loss": 1.11576343, + "learning_rate": 0.0008954292103690864, + "loss": 1.12761879, + "num_input_tokens_seen": 6982464, + "router_z_loss_mlp": 0.1673584, + "step": 92, + "time_per_iteration": 2.9129488468170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194769, + "balance_loss_mlp": 1.17854476, + "epoch": 0.01789149672951135, + "flos": 515257265664.0, + "grad_norm": 0.0669718610224715, + "language_loss": 1.1343056, + "learning_rate": 0.0008975700468778296, + "loss": 1.14625335, + "num_input_tokens_seen": 7049712, + "router_z_loss_mlp": 0.16223145, + "step": 93, + "time_per_iteration": 2.576620101928711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216953, + "balance_loss_mlp": 1.20076382, + "epoch": 0.018083878414774913, + "flos": 585850116096.0, + "grad_norm": 0.11698648494194364, + "language_loss": 1.0652318, + "learning_rate": 0.0008996879863005366, + "loss": 1.07740128, + "num_input_tokens_seen": 7120288, + "router_z_loss_mlp": 0.16186523, + "step": 94, + "time_per_iteration": 2.6751108169555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217025, + "balance_loss_mlp": 1.2013253, + "epoch": 0.018276260100038477, + "flos": 497103436800.0, + "grad_norm": 0.08327491501556071, + "language_loss": 1.06208014, + "learning_rate": 0.0009017835132453337, + "loss": 1.07425046, + "num_input_tokens_seen": 7188896, + "router_z_loss_mlp": 0.15686035, + "step": 95, + "time_per_iteration": 2.5971803665161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196463, + "balance_loss_mlp": 1.1804409, + "epoch": 0.01846864178530204, + "flos": 639765955584.0, + "grad_norm": 0.09756000368948786, + "language_loss": 1.06920743, + "learning_rate": 0.0009038570970964896, + "loss": 1.08117199, + "num_input_tokens_seen": 7259536, + "router_z_loss_mlp": 0.16027832, + "step": 96, + "time_per_iteration": 2.7428832054138184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173361, + "balance_loss_mlp": 1.15723228, + "epoch": 0.018661023470565603, + "flos": 511411746816.0, + "grad_norm": 0.07053433913024812, + "language_loss": 1.04343212, + "learning_rate": 0.0009059091926454854, + "loss": 1.05516577, + "num_input_tokens_seen": 7326752, + "router_z_loss_mlp": 0.16125488, + "step": 97, + "time_per_iteration": 2.570509433746338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178324, + "balance_loss_mlp": 1.16246903, + "epoch": 0.018853405155829166, + "flos": 930710103552.0, + "grad_norm": 0.08767892767743933, + "language_loss": 1.03389072, + "learning_rate": 0.0009079402406897198, + "loss": 1.04567385, + "num_input_tokens_seen": 7417488, + "router_z_loss_mlp": 0.15844727, + "step": 98, + "time_per_iteration": 3.202298164367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179075, + "balance_loss_mlp": 1.16296983, + "epoch": 0.01904578684109273, + "flos": 576209396736.0, + "grad_norm": 0.2639136557883628, + "language_loss": 1.0596242, + "learning_rate": 0.0009099506686008212, + "loss": 1.07141495, + "num_input_tokens_seen": 7493136, + "router_z_loss_mlp": 0.16101074, + "step": 99, + "time_per_iteration": 2.812368869781494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139923, + "balance_loss_mlp": 1.12423468, + "epoch": 0.019238168526356292, + "flos": 558173431296.0, + "grad_norm": 0.12311670746354397, + "language_loss": 1.08180976, + "learning_rate": 0.0009119408908644013, + "loss": 1.09320903, + "num_input_tokens_seen": 7560896, + "router_z_loss_mlp": 0.15673828, + "step": 100, + "time_per_iteration": 2.7063775062561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150815, + "balance_loss_mlp": 1.13574743, + "epoch": 0.019430550211619855, + "flos": 723539506176.0, + "grad_norm": 0.12127606313133317, + "language_loss": 1.14121008, + "learning_rate": 0.0009139113095929519, + "loss": 1.15271831, + "num_input_tokens_seen": 7629040, + "router_z_loss_mlp": 0.15039062, + "step": 101, + "time_per_iteration": 2.840913772583008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218173, + "balance_loss_mlp": 1.20243776, + "epoch": 0.019622931896883418, + "flos": 499235000832.0, + "grad_norm": 0.1104247345061639, + "language_loss": 1.0836457, + "learning_rate": 0.0009158623150134762, + "loss": 1.09582746, + "num_input_tokens_seen": 7694256, + "router_z_loss_mlp": 0.15722656, + "step": 102, + "time_per_iteration": 2.560464859008789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01357908, + "balance_loss_mlp": 1.34204173, + "epoch": 0.01981531358214698, + "flos": 508916418048.0, + "grad_norm": 0.15164768975642337, + "language_loss": 1.07661259, + "learning_rate": 0.000917794285931332, + "loss": 1.0901916, + "num_input_tokens_seen": 7762256, + "router_z_loss_mlp": 0.15856934, + "step": 103, + "time_per_iteration": 2.6684353351593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01381196, + "balance_loss_mlp": 1.36572242, + "epoch": 0.020007695267410544, + "flos": 521087371776.0, + "grad_norm": 0.10342928287682196, + "language_loss": 0.9971087, + "learning_rate": 0.0009197075901716639, + "loss": 1.01092052, + "num_input_tokens_seen": 7834400, + "router_z_loss_mlp": 0.15454102, + "step": 104, + "time_per_iteration": 2.7250871658325195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01356986, + "balance_loss_mlp": 1.34017777, + "epoch": 0.020200076952674107, + "flos": 533013834240.0, + "grad_norm": 0.1824265866479698, + "language_loss": 1.09647703, + "learning_rate": 0.0009216025849997171, + "loss": 1.11004686, + "num_input_tokens_seen": 7911184, + "router_z_loss_mlp": 0.16809082, + "step": 105, + "time_per_iteration": 2.776764154434204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01261961, + "balance_loss_mlp": 1.24583197, + "epoch": 0.020392458637937667, + "flos": 684430981632.0, + "grad_norm": 0.06376163654280764, + "language_loss": 1.0425086, + "learning_rate": 0.0009234796175212258, + "loss": 1.05512834, + "num_input_tokens_seen": 7985280, + "router_z_loss_mlp": 0.16125488, + "step": 106, + "time_per_iteration": 2.9174978733062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01269614, + "balance_loss_mlp": 1.25201869, + "epoch": 0.02058484032320123, + "flos": 701791852032.0, + "grad_norm": 0.060044663360548714, + "language_loss": 1.08808422, + "learning_rate": 0.000925339025064007, + "loss": 1.10078037, + "num_input_tokens_seen": 8068320, + "router_z_loss_mlp": 0.17590332, + "step": 107, + "time_per_iteration": 2.975735902786255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01324579, + "balance_loss_mlp": 1.30547023, + "epoch": 0.020777222008464793, + "flos": 638772175872.0, + "grad_norm": 0.12680512225677842, + "language_loss": 1.01262307, + "learning_rate": 0.0009271811355418027, + "loss": 1.02586877, + "num_input_tokens_seen": 8148144, + "router_z_loss_mlp": 0.19128418, + "step": 108, + "time_per_iteration": 2.8408150672912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01306621, + "balance_loss_mlp": 1.28755951, + "epoch": 0.020969603693728356, + "flos": 681785856000.0, + "grad_norm": 0.06997483982989385, + "language_loss": 1.08551693, + "learning_rate": 0.0009290062678013548, + "loss": 1.09858322, + "num_input_tokens_seen": 8222256, + "router_z_loss_mlp": 0.19055176, + "step": 109, + "time_per_iteration": 2.869980812072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01309468, + "balance_loss_mlp": 1.29159832, + "epoch": 0.02116198537899192, + "flos": 533140462080.0, + "grad_norm": 0.13190855435004306, + "language_loss": 1.06647623, + "learning_rate": 0.0009308147319536321, + "loss": 1.07957077, + "num_input_tokens_seen": 8292432, + "router_z_loss_mlp": 0.17895508, + "step": 110, + "time_per_iteration": 2.6270735263824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130688, + "balance_loss_mlp": 1.29067969, + "epoch": 0.021354367064255482, + "flos": 717168135168.0, + "grad_norm": 0.10963649287068344, + "language_loss": 1.1282903, + "learning_rate": 0.0009326068296900676, + "loss": 1.14135909, + "num_input_tokens_seen": 8365024, + "router_z_loss_mlp": 0.1619873, + "step": 111, + "time_per_iteration": 2.8845341205596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01388527, + "balance_loss_mlp": 1.37200487, + "epoch": 0.021546748749519045, + "flos": 519290459136.0, + "grad_norm": 0.12406482447985402, + "language_loss": 1.03902006, + "learning_rate": 0.0009343828545846161, + "loss": 1.05290532, + "num_input_tokens_seen": 8442448, + "router_z_loss_mlp": 0.16516113, + "step": 112, + "time_per_iteration": 2.8167102336883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01548404, + "balance_loss_mlp": 1.53109479, + "epoch": 0.021739130434782608, + "flos": 504912337920.0, + "grad_norm": 0.2528517188051562, + "language_loss": 1.0722419, + "learning_rate": 0.0009361430923823841, + "loss": 1.08772588, + "num_input_tokens_seen": 8508992, + "router_z_loss_mlp": 0.1730957, + "step": 113, + "time_per_iteration": 2.664581060409546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01441472, + "balance_loss_mlp": 1.42576015, + "epoch": 0.02193151212004617, + "flos": 463251820032.0, + "grad_norm": 0.1910881492312462, + "language_loss": 1.11420846, + "learning_rate": 0.0009378878212755459, + "loss": 1.12862325, + "num_input_tokens_seen": 8574048, + "router_z_loss_mlp": 0.15710449, + "step": 114, + "time_per_iteration": 2.4851133823394775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262203, + "balance_loss_mlp": 1.24767148, + "epoch": 0.022123893805309734, + "flos": 552008673792.0, + "grad_norm": 0.09004287588953173, + "language_loss": 1.0099957, + "learning_rate": 0.0009396173121672103, + "loss": 1.0226177, + "num_input_tokens_seen": 8647808, + "router_z_loss_mlp": 0.14538574, + "step": 115, + "time_per_iteration": 2.6535162925720215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215709, + "balance_loss_mlp": 1.20165396, + "epoch": 0.022316275490573297, + "flos": 635920436736.0, + "grad_norm": 0.07849561533847389, + "language_loss": 1.07122314, + "learning_rate": 0.0009413318289238633, + "loss": 1.08338022, + "num_input_tokens_seen": 8719760, + "router_z_loss_mlp": 0.14050293, + "step": 116, + "time_per_iteration": 2.7836899757385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203544, + "balance_loss_mlp": 1.18965602, + "epoch": 0.02250865717583686, + "flos": 798535107072.0, + "grad_norm": 0.07099947506123377, + "language_loss": 0.98912275, + "learning_rate": 0.0009430316286169771, + "loss": 1.00115824, + "num_input_tokens_seen": 8798752, + "router_z_loss_mlp": 0.13891602, + "step": 117, + "time_per_iteration": 3.049468517303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01263206, + "balance_loss_mlp": 1.24786401, + "epoch": 0.022701038861100423, + "flos": 455851763712.0, + "grad_norm": 0.18808502465815918, + "language_loss": 1.04843259, + "learning_rate": 0.0009447169617543361, + "loss": 1.06106472, + "num_input_tokens_seen": 8866848, + "router_z_loss_mlp": 0.15319824, + "step": 118, + "time_per_iteration": 2.5886504650115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121023, + "balance_loss_mlp": 1.19557953, + "epoch": 0.022893420546363986, + "flos": 582812112384.0, + "grad_norm": 0.09179634719817005, + "language_loss": 1.11139297, + "learning_rate": 0.0009463880725016029, + "loss": 1.12349522, + "num_input_tokens_seen": 8935488, + "router_z_loss_mlp": 0.14648438, + "step": 119, + "time_per_iteration": 2.6861259937286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169264, + "balance_loss_mlp": 1.15572226, + "epoch": 0.02308580223162755, + "flos": 561010613760.0, + "grad_norm": 0.09164108943144146, + "language_loss": 1.05675769, + "learning_rate": 0.0009480451988946134, + "loss": 1.06845045, + "num_input_tokens_seen": 9015344, + "router_z_loss_mlp": 0.13549805, + "step": 120, + "time_per_iteration": 2.8075129985809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217336, + "balance_loss_mlp": 1.2034359, + "epoch": 0.023278183916891113, + "flos": 770966111232.0, + "grad_norm": 0.1019945076921087, + "language_loss": 1.07486713, + "learning_rate": 0.0009496885730428627, + "loss": 1.08704054, + "num_input_tokens_seen": 9094672, + "router_z_loss_mlp": 0.13903809, + "step": 121, + "time_per_iteration": 3.0081264972686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291544, + "balance_loss_mlp": 1.27698815, + "epoch": 0.023470565602154676, + "flos": 553111552512.0, + "grad_norm": 0.08478902086488087, + "language_loss": 1.05369067, + "learning_rate": 0.0009513184213246156, + "loss": 1.06660616, + "num_input_tokens_seen": 9160608, + "router_z_loss_mlp": 0.14550781, + "step": 122, + "time_per_iteration": 2.654902696609497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01406128, + "balance_loss_mlp": 1.39054775, + "epoch": 0.02366294728741824, + "flos": 559744791552.0, + "grad_norm": 0.09837859270685317, + "language_loss": 1.09463692, + "learning_rate": 0.0009529349645740552, + "loss": 1.10869825, + "num_input_tokens_seen": 9228704, + "router_z_loss_mlp": 0.15563965, + "step": 123, + "time_per_iteration": 2.6837081909179688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01484693, + "balance_loss_mlp": 1.46961284, + "epoch": 0.0238553289726818, + "flos": 468313698816.0, + "grad_norm": 0.11388616458843728, + "language_loss": 1.07573724, + "learning_rate": 0.0009545384182608524, + "loss": 1.09058416, + "num_input_tokens_seen": 9294288, + "router_z_loss_mlp": 0.1505127, + "step": 124, + "time_per_iteration": 2.5069937705993652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01411359, + "balance_loss_mlp": 1.39688659, + "epoch": 0.024047710657945365, + "flos": 559763730432.0, + "grad_norm": 0.3429043048666504, + "language_loss": 1.05057025, + "learning_rate": 0.0009561289926625252, + "loss": 1.06468379, + "num_input_tokens_seen": 9368048, + "router_z_loss_mlp": 0.14465332, + "step": 125, + "time_per_iteration": 2.6802117824554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011688, + "balance_loss_mlp": 1.15507352, + "epoch": 0.024240092343208928, + "flos": 504528224256.0, + "grad_norm": 0.18048320350440872, + "language_loss": 1.09737623, + "learning_rate": 0.0009577068930299292, + "loss": 1.10906434, + "num_input_tokens_seen": 9434848, + "router_z_loss_mlp": 0.13739014, + "step": 126, + "time_per_iteration": 2.6096670627593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163735, + "balance_loss_mlp": 1.15040147, + "epoch": 0.02443247402847249, + "flos": 435516908544.0, + "grad_norm": 0.07278748671530755, + "language_loss": 1.05931616, + "learning_rate": 0.0009592723197462087, + "loss": 1.07095349, + "num_input_tokens_seen": 9504112, + "router_z_loss_mlp": 0.13360596, + "step": 127, + "time_per_iteration": 2.6409482955932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01248107, + "balance_loss_mlp": 1.23239577, + "epoch": 0.024624855713736054, + "flos": 683445966336.0, + "grad_norm": 0.0813490266373729, + "language_loss": 1.02871299, + "learning_rate": 0.0009608254684795125, + "loss": 1.04119396, + "num_input_tokens_seen": 9590032, + "router_z_loss_mlp": 0.15710449, + "step": 128, + "time_per_iteration": 2.940600872039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01265693, + "balance_loss_mlp": 1.24772859, + "epoch": 0.024817237398999614, + "flos": 524721894912.0, + "grad_norm": 0.0804185451989367, + "language_loss": 1.06161952, + "learning_rate": 0.0009623665303297678, + "loss": 1.07427645, + "num_input_tokens_seen": 9663040, + "router_z_loss_mlp": 0.1796875, + "step": 129, + "time_per_iteration": 2.7088472843170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01256284, + "balance_loss_mlp": 1.23668599, + "epoch": 0.025009619084263177, + "flos": 655350262272.0, + "grad_norm": 0.12369480901617341, + "language_loss": 1.10218048, + "learning_rate": 0.0009638956919697878, + "loss": 1.11474347, + "num_input_tokens_seen": 9736544, + "router_z_loss_mlp": 0.19592285, + "step": 130, + "time_per_iteration": 2.857571840286255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266475, + "balance_loss_mlp": 1.24420691, + "epoch": 0.02520200076952674, + "flos": 454187271168.0, + "grad_norm": 0.08293639348197612, + "language_loss": 1.02638018, + "learning_rate": 0.0009654131357809714, + "loss": 1.03904486, + "num_input_tokens_seen": 9804656, + "router_z_loss_mlp": 0.22253418, + "step": 131, + "time_per_iteration": 2.641470432281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0128644, + "balance_loss_mlp": 1.26142943, + "epoch": 0.025394382454790303, + "flos": 839427397632.0, + "grad_norm": 0.05741461740254168, + "language_loss": 1.11002767, + "learning_rate": 0.0009669190399838441, + "loss": 1.12289214, + "num_input_tokens_seen": 9888864, + "router_z_loss_mlp": 0.25036621, + "step": 132, + "time_per_iteration": 3.133596420288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01302533, + "balance_loss_mlp": 1.27633083, + "epoch": 0.025586764140053866, + "flos": 580725628416.0, + "grad_norm": 0.06987664196058198, + "language_loss": 1.0413487, + "learning_rate": 0.0009684135787636724, + "loss": 1.05437398, + "num_input_tokens_seen": 9968208, + "router_z_loss_mlp": 0.26208496, + "step": 133, + "time_per_iteration": 2.7968075275421143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01325396, + "balance_loss_mlp": 1.29710746, + "epoch": 0.02577914582531743, + "flos": 789893959680.0, + "grad_norm": 0.07551411578012862, + "language_loss": 1.07757604, + "learning_rate": 0.0009698969223913726, + "loss": 1.09083009, + "num_input_tokens_seen": 10049664, + "router_z_loss_mlp": 0.28283691, + "step": 134, + "time_per_iteration": 3.0058987140655518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0131126, + "balance_loss_mlp": 1.28212547, + "epoch": 0.025971527510580992, + "flos": 594683320320.0, + "grad_norm": 0.0731546450398535, + "language_loss": 1.10457921, + "learning_rate": 0.0009713692373399265, + "loss": 1.11769176, + "num_input_tokens_seen": 10120096, + "router_z_loss_mlp": 0.29125977, + "step": 135, + "time_per_iteration": 2.6654229164123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02152319, + "balance_loss_mlp": 1.95700705, + "epoch": 0.026163909195844555, + "flos": 1576771522560.0, + "grad_norm": 0.26755932757436196, + "language_loss": 0.79456228, + "learning_rate": 0.0009728306863964993, + "loss": 0.81608546, + "num_input_tokens_seen": 10348976, + "router_z_loss_mlp": 1.953125, + "step": 136, + "time_per_iteration": 6.531313896179199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01724331, + "balance_loss_mlp": 1.55266988, + "epoch": 0.026356290881108118, + "flos": 1501306030080.0, + "grad_norm": 0.1444935793983717, + "language_loss": 0.77811038, + "learning_rate": 0.0009742814287704512, + "loss": 0.79535371, + "num_input_tokens_seen": 10576512, + "router_z_loss_mlp": 1.71875, + "step": 137, + "time_per_iteration": 4.966995716094971 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01371776, + "balance_loss_mlp": 1.34284425, + "epoch": 0.02654867256637168, + "flos": 596841025536.0, + "grad_norm": 0.06823267419395149, + "language_loss": 1.03539467, + "learning_rate": 0.0009757216201974225, + "loss": 1.04911256, + "num_input_tokens_seen": 10659168, + "router_z_loss_mlp": 0.28918457, + "step": 138, + "time_per_iteration": 2.7901663780212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01396345, + "balance_loss_mlp": 1.36752045, + "epoch": 0.026741054251635244, + "flos": 544761386496.0, + "grad_norm": 0.08904352821745645, + "language_loss": 1.08793342, + "learning_rate": 0.0009771514130396581, + "loss": 1.10189688, + "num_input_tokens_seen": 10731584, + "router_z_loss_mlp": 0.28833008, + "step": 139, + "time_per_iteration": 2.664384603500366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01410566, + "balance_loss_mlp": 1.38171697, + "epoch": 0.026933435936898807, + "flos": 506591387136.0, + "grad_norm": 0.09467843708761726, + "language_loss": 1.08393478, + "learning_rate": 0.00097857095638274, + "loss": 1.09804034, + "num_input_tokens_seen": 10799456, + "router_z_loss_mlp": 0.28833008, + "step": 140, + "time_per_iteration": 2.5600626468658447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01399161, + "balance_loss_mlp": 1.37263703, + "epoch": 0.02712581762216237, + "flos": 740513290752.0, + "grad_norm": 0.06303030428856128, + "language_loss": 0.99670362, + "learning_rate": 0.0009799803961288726, + "loss": 1.01069522, + "num_input_tokens_seen": 10886416, + "router_z_loss_mlp": 0.26538086, + "step": 141, + "time_per_iteration": 2.984253168106079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01354082, + "balance_loss_mlp": 1.33143175, + "epoch": 0.027318199307425933, + "flos": 848023464960.0, + "grad_norm": 0.06264638149228761, + "language_loss": 1.05898559, + "learning_rate": 0.000981379875086876, + "loss": 1.07252645, + "num_input_tokens_seen": 10966064, + "router_z_loss_mlp": 0.22644043, + "step": 142, + "time_per_iteration": 3.032597064971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01323808, + "balance_loss_mlp": 1.30553341, + "epoch": 0.027510580992689496, + "flos": 575288400384.0, + "grad_norm": 0.07028220907739285, + "language_loss": 1.01752293, + "learning_rate": 0.0009827695330590185, + "loss": 1.030761, + "num_input_tokens_seen": 11039712, + "router_z_loss_mlp": 0.18273926, + "step": 143, + "time_per_iteration": 2.626483678817749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01303402, + "balance_loss_mlp": 1.28557992, + "epoch": 0.02770296267795306, + "flos": 772079164416.0, + "grad_norm": 0.05744811954937285, + "language_loss": 1.00619161, + "learning_rate": 0.0009841495069248256, + "loss": 1.0192256, + "num_input_tokens_seen": 11123984, + "router_z_loss_mlp": 0.17822266, + "step": 144, + "time_per_iteration": 2.9495198726654053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01316023, + "balance_loss_mlp": 1.29916632, + "epoch": 0.027895344363216622, + "flos": 569123642880.0, + "grad_norm": 0.04968902291069247, + "language_loss": 0.9920603, + "learning_rate": 0.0009855199307219871, + "loss": 1.00522041, + "num_input_tokens_seen": 11192864, + "router_z_loss_mlp": 0.1685791, + "step": 145, + "time_per_iteration": 2.6407721042633057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130391, + "balance_loss_mlp": 1.28731608, + "epoch": 0.028087726048480186, + "flos": 547099564032.0, + "grad_norm": 0.10723696528856613, + "language_loss": 1.01566505, + "learning_rate": 0.0009868809357244854, + "loss": 1.02870417, + "num_input_tokens_seen": 11261760, + "router_z_loss_mlp": 0.16589355, + "step": 146, + "time_per_iteration": 2.6262452602386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01287507, + "balance_loss_mlp": 1.27153277, + "epoch": 0.02828010773374375, + "flos": 524519663616.0, + "grad_norm": 0.06991830692152445, + "language_loss": 1.05632663, + "learning_rate": 0.0009882326505180556, + "loss": 1.06920183, + "num_input_tokens_seen": 11334736, + "router_z_loss_mlp": 0.15966797, + "step": 147, + "time_per_iteration": 2.6469435691833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01270213, + "balance_loss_mlp": 1.2534517, + "epoch": 0.02847248941900731, + "flos": 772108277760.0, + "grad_norm": 0.07309095407736986, + "language_loss": 1.04486537, + "learning_rate": 0.0009895752010730906, + "loss": 1.0575676, + "num_input_tokens_seen": 11409872, + "router_z_loss_mlp": 0.16748047, + "step": 148, + "time_per_iteration": 2.9457786083221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012724, + "balance_loss_mlp": 1.25667655, + "epoch": 0.028664871104270875, + "flos": 534150208512.0, + "grad_norm": 0.048334696317449924, + "language_loss": 1.10088921, + "learning_rate": 0.0009909087108150867, + "loss": 1.11361325, + "num_input_tokens_seen": 11481024, + "router_z_loss_mlp": 0.15710449, + "step": 149, + "time_per_iteration": 2.712559700012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01309133, + "balance_loss_mlp": 1.29286051, + "epoch": 0.028857252789534438, + "flos": 367557599232.0, + "grad_norm": 0.13115053493636905, + "language_loss": 1.11238122, + "learning_rate": 0.0009922333006927371, + "loss": 1.12547255, + "num_input_tokens_seen": 11544240, + "router_z_loss_mlp": 0.16247559, + "step": 150, + "time_per_iteration": 2.4607067108154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01329212, + "balance_loss_mlp": 1.31257081, + "epoch": 0.029049634474798, + "flos": 515232534528.0, + "grad_norm": 0.06948512606819708, + "language_loss": 1.04613614, + "learning_rate": 0.0009935490892437632, + "loss": 1.05942833, + "num_input_tokens_seen": 11610416, + "router_z_loss_mlp": 0.16650391, + "step": 151, + "time_per_iteration": 2.5460238456726074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01309109, + "balance_loss_mlp": 1.29317045, + "epoch": 0.029242016160061564, + "flos": 587840495616.0, + "grad_norm": 0.11257287432569656, + "language_loss": 1.03097093, + "learning_rate": 0.0009948561926585687, + "loss": 1.04406202, + "num_input_tokens_seen": 11687488, + "router_z_loss_mlp": 0.15930176, + "step": 152, + "time_per_iteration": 2.753009557723999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01300362, + "balance_loss_mlp": 1.28555596, + "epoch": 0.029434397845325123, + "flos": 551816616960.0, + "grad_norm": 0.062223246716750634, + "language_loss": 1.06524086, + "learning_rate": 0.0009961547248418122, + "loss": 1.07824445, + "num_input_tokens_seen": 11754576, + "router_z_loss_mlp": 0.14807129, + "step": 153, + "time_per_iteration": 2.630092144012451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01308008, + "balance_loss_mlp": 1.29357219, + "epoch": 0.029626779530588686, + "flos": 603221160960.0, + "grad_norm": 0.09420536563091944, + "language_loss": 1.03062868, + "learning_rate": 0.0009974447974719707, + "loss": 1.04370856, + "num_input_tokens_seen": 11831360, + "router_z_loss_mlp": 0.14440918, + "step": 154, + "time_per_iteration": 2.6962759494781494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01312448, + "balance_loss_mlp": 1.29745138, + "epoch": 0.02981916121585225, + "flos": 620808993792.0, + "grad_norm": 0.08558703297148447, + "language_loss": 1.04985213, + "learning_rate": 0.0009987265200589763, + "loss": 1.0629766, + "num_input_tokens_seen": 11902192, + "router_z_loss_mlp": 0.15002441, + "step": 155, + "time_per_iteration": 2.7059414386749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295882, + "balance_loss_mlp": 1.28057528, + "epoch": 0.030011542901115813, + "flos": 661322962944.0, + "grad_norm": 0.09731995783752632, + "language_loss": 1.04436159, + "learning_rate": 0.001, + "loss": 1.05732036, + "num_input_tokens_seen": 11979088, + "router_z_loss_mlp": 0.1529541, + "step": 156, + "time_per_iteration": 2.856968641281128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262708, + "balance_loss_mlp": 1.24682927, + "epoch": 0.030203924586379376, + "flos": 651258842112.0, + "grad_norm": 0.05966927829613408, + "language_loss": 1.02520585, + "learning_rate": 0.0009999999029413921, + "loss": 1.03783274, + "num_input_tokens_seen": 12059200, + "router_z_loss_mlp": 0.15856934, + "step": 157, + "time_per_iteration": 2.851480722427368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01268181, + "balance_loss_mlp": 1.25150406, + "epoch": 0.03039630627164294, + "flos": 531083091456.0, + "grad_norm": 0.1034311415514979, + "language_loss": 1.04085183, + "learning_rate": 0.0009999996117656068, + "loss": 1.05353379, + "num_input_tokens_seen": 12134944, + "router_z_loss_mlp": 0.16674805, + "step": 158, + "time_per_iteration": 2.707646369934082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262524, + "balance_loss_mlp": 1.24747968, + "epoch": 0.030588687956906502, + "flos": 585914135040.0, + "grad_norm": 0.12050944658187299, + "language_loss": 0.97824669, + "learning_rate": 0.0009999991264727564, + "loss": 0.99087203, + "num_input_tokens_seen": 12207936, + "router_z_loss_mlp": 0.15039062, + "step": 159, + "time_per_iteration": 2.7575390338897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272116, + "balance_loss_mlp": 1.25716722, + "epoch": 0.030781069642170065, + "flos": 513026777088.0, + "grad_norm": 0.07020206521781955, + "language_loss": 1.08316755, + "learning_rate": 0.0009999984470630296, + "loss": 1.09588861, + "num_input_tokens_seen": 12273200, + "router_z_loss_mlp": 0.14929199, + "step": 160, + "time_per_iteration": 2.62310528755188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0128559, + "balance_loss_mlp": 1.27058172, + "epoch": 0.030973451327433628, + "flos": 717766064640.0, + "grad_norm": 0.06068839125924313, + "language_loss": 0.96528012, + "learning_rate": 0.0009999975735366902, + "loss": 0.978136, + "num_input_tokens_seen": 12359600, + "router_z_loss_mlp": 0.15002441, + "step": 161, + "time_per_iteration": 3.0823376178741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01305752, + "balance_loss_mlp": 1.29055238, + "epoch": 0.03116583301269719, + "flos": 1109312133120.0, + "grad_norm": 0.09428930343360856, + "language_loss": 0.98546314, + "learning_rate": 0.0009999965058940775, + "loss": 0.99852067, + "num_input_tokens_seen": 12443936, + "router_z_loss_mlp": 0.1517334, + "step": 162, + "time_per_iteration": 3.486618995666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01315996, + "balance_loss_mlp": 1.3010118, + "epoch": 0.031358214697960754, + "flos": 450676403712.0, + "grad_norm": 0.09976775191278689, + "language_loss": 1.04580116, + "learning_rate": 0.0009999952441356057, + "loss": 1.05896115, + "num_input_tokens_seen": 12507488, + "router_z_loss_mlp": 0.1496582, + "step": 163, + "time_per_iteration": 2.537173271179199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01300744, + "balance_loss_mlp": 1.28654623, + "epoch": 0.031550596383224314, + "flos": 1254701325312.0, + "grad_norm": 0.0838197011845512, + "language_loss": 1.05903006, + "learning_rate": 0.000999993788261765, + "loss": 1.07203746, + "num_input_tokens_seen": 12594096, + "router_z_loss_mlp": 0.14196777, + "step": 164, + "time_per_iteration": 3.5638957023620605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01270584, + "balance_loss_mlp": 1.25625503, + "epoch": 0.03174297806848788, + "flos": 667841310720.0, + "grad_norm": 0.068717417443618, + "language_loss": 1.0642612, + "learning_rate": 0.00099999213827312, + "loss": 1.076967, + "num_input_tokens_seen": 12669424, + "router_z_loss_mlp": 0.14343262, + "step": 165, + "time_per_iteration": 2.8084213733673096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01255587, + "balance_loss_mlp": 1.24152076, + "epoch": 0.03193535975375144, + "flos": 551033832960.0, + "grad_norm": 0.06892139424853191, + "language_loss": 1.0208962, + "learning_rate": 0.000999990294170312, + "loss": 1.03345203, + "num_input_tokens_seen": 12740080, + "router_z_loss_mlp": 0.14074707, + "step": 166, + "time_per_iteration": 2.6247787475585938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01259954, + "balance_loss_mlp": 1.24549401, + "epoch": 0.032127741439015006, + "flos": 543377700864.0, + "grad_norm": 0.08292396830811857, + "language_loss": 1.05774951, + "learning_rate": 0.0009999882559540566, + "loss": 1.07034898, + "num_input_tokens_seen": 12810576, + "router_z_loss_mlp": 0.14465332, + "step": 167, + "time_per_iteration": 2.654036283493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291491, + "balance_loss_mlp": 1.27790117, + "epoch": 0.032320123124278566, + "flos": 548104928256.0, + "grad_norm": 0.07217909902530589, + "language_loss": 1.02104354, + "learning_rate": 0.000999986023625145, + "loss": 1.03395844, + "num_input_tokens_seen": 12887904, + "router_z_loss_mlp": 0.13598633, + "step": 168, + "time_per_iteration": 2.696866750717163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03738194, + "balance_loss_mlp": 3.61993837, + "epoch": 0.03251250480954213, + "flos": 1305156865536.0, + "grad_norm": 0.563981464368737, + "language_loss": 0.78924417, + "learning_rate": 0.0009999835971844441, + "loss": 0.82662606, + "num_input_tokens_seen": 13107344, + "router_z_loss_mlp": 1.1796875, + "step": 169, + "time_per_iteration": 4.971506834030151 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0134723, + "balance_loss_mlp": 1.33386648, + "epoch": 0.03270488649480569, + "flos": 560866609152.0, + "grad_norm": 0.12141219581883538, + "language_loss": 1.02540469, + "learning_rate": 0.0009999809766328958, + "loss": 1.03887701, + "num_input_tokens_seen": 13175552, + "router_z_loss_mlp": 0.13391113, + "step": 170, + "time_per_iteration": 2.646425724029541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01355192, + "balance_loss_mlp": 1.34039843, + "epoch": 0.03289726818006926, + "flos": 482120031744.0, + "grad_norm": 0.08046017426621577, + "language_loss": 1.05186188, + "learning_rate": 0.0009999781619715177, + "loss": 1.06541371, + "num_input_tokens_seen": 13242384, + "router_z_loss_mlp": 0.14770508, + "step": 171, + "time_per_iteration": 2.535360336303711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01381569, + "balance_loss_mlp": 1.36640596, + "epoch": 0.03308964986533282, + "flos": 674355276288.0, + "grad_norm": 0.08789680193074563, + "language_loss": 1.04250002, + "learning_rate": 0.000999975153201402, + "loss": 1.05631578, + "num_input_tokens_seen": 13316160, + "router_z_loss_mlp": 0.15161133, + "step": 172, + "time_per_iteration": 2.8205513954162598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01433883, + "balance_loss_mlp": 1.41711044, + "epoch": 0.033282031550596385, + "flos": 608937785856.0, + "grad_norm": 0.07610360898370483, + "language_loss": 1.02505267, + "learning_rate": 0.0009999719503237174, + "loss": 1.03939152, + "num_input_tokens_seen": 13387664, + "router_z_loss_mlp": 0.16760254, + "step": 173, + "time_per_iteration": 2.738676071166992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01451195, + "balance_loss_mlp": 1.43315864, + "epoch": 0.033474413235859944, + "flos": 467801547264.0, + "grad_norm": 0.07270846083900323, + "language_loss": 1.111094, + "learning_rate": 0.0009999685533397073, + "loss": 1.12560594, + "num_input_tokens_seen": 13454528, + "router_z_loss_mlp": 0.18029785, + "step": 174, + "time_per_iteration": 2.5556905269622803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01429898, + "balance_loss_mlp": 1.41368508, + "epoch": 0.03366679492112351, + "flos": 579365263872.0, + "grad_norm": 0.09196642879711979, + "language_loss": 1.03199494, + "learning_rate": 0.00099996496225069, + "loss": 1.04629397, + "num_input_tokens_seen": 13522528, + "router_z_loss_mlp": 0.16210938, + "step": 175, + "time_per_iteration": 2.6806485652923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01432234, + "balance_loss_mlp": 1.41513896, + "epoch": 0.03385917660638707, + "flos": 637378315776.0, + "grad_norm": 0.08705990667808558, + "language_loss": 1.05897307, + "learning_rate": 0.0009999611770580604, + "loss": 1.07329535, + "num_input_tokens_seen": 13601120, + "router_z_loss_mlp": 0.17102051, + "step": 176, + "time_per_iteration": 2.830826759338379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01415158, + "balance_loss_mlp": 1.39910054, + "epoch": 0.03405155829165064, + "flos": 441587123712.0, + "grad_norm": 0.08054669051038237, + "language_loss": 1.03868258, + "learning_rate": 0.0009999571977632876, + "loss": 1.05283427, + "num_input_tokens_seen": 13666384, + "router_z_loss_mlp": 0.16052246, + "step": 177, + "time_per_iteration": 2.623309850692749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01463141, + "balance_loss_mlp": 1.44573641, + "epoch": 0.034243939976914196, + "flos": 466097766912.0, + "grad_norm": 0.08089290506220445, + "language_loss": 1.06928194, + "learning_rate": 0.0009999530243679166, + "loss": 1.08391333, + "num_input_tokens_seen": 13733968, + "router_z_loss_mlp": 0.17407227, + "step": 178, + "time_per_iteration": 2.545133113861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01451423, + "balance_loss_mlp": 1.43560433, + "epoch": 0.03443632166217776, + "flos": 778919016960.0, + "grad_norm": 0.08468734735068614, + "language_loss": 1.01505899, + "learning_rate": 0.0009999486568735675, + "loss": 1.0295732, + "num_input_tokens_seen": 13818960, + "router_z_loss_mlp": 0.15808105, + "step": 179, + "time_per_iteration": 3.0384457111358643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01433641, + "balance_loss_mlp": 1.41778612, + "epoch": 0.03462870334744132, + "flos": 1263284246016.0, + "grad_norm": 0.06997324880309466, + "language_loss": 1.01388979, + "learning_rate": 0.0009999440952819362, + "loss": 1.02822614, + "num_input_tokens_seen": 13912448, + "router_z_loss_mlp": 0.15856934, + "step": 180, + "time_per_iteration": 3.6892786026000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01401308, + "balance_loss_mlp": 1.38610911, + "epoch": 0.03482108503270489, + "flos": 606899354112.0, + "grad_norm": 0.057831512038439566, + "language_loss": 1.02027512, + "learning_rate": 0.0009999393395947935, + "loss": 1.03428817, + "num_input_tokens_seen": 13990752, + "router_z_loss_mlp": 0.15185547, + "step": 181, + "time_per_iteration": 2.826353073120117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01381551, + "balance_loss_mlp": 1.36612535, + "epoch": 0.03501346671796845, + "flos": 538010284032.0, + "grad_norm": 0.05913415109875365, + "language_loss": 1.05361927, + "learning_rate": 0.0009999343898139858, + "loss": 1.06743479, + "num_input_tokens_seen": 14058608, + "router_z_loss_mlp": 0.1541748, + "step": 182, + "time_per_iteration": 2.593250036239624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01359754, + "balance_loss_mlp": 1.33988214, + "epoch": 0.035205848403232015, + "flos": 518231250432.0, + "grad_norm": 0.05898920665253376, + "language_loss": 1.04308426, + "learning_rate": 0.0009999292459414348, + "loss": 1.05668187, + "num_input_tokens_seen": 14126656, + "router_z_loss_mlp": 0.19909668, + "step": 183, + "time_per_iteration": 2.565936326980591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01311064, + "balance_loss_mlp": 1.296103, + "epoch": 0.035398230088495575, + "flos": 472134486528.0, + "grad_norm": 0.06373248491183749, + "language_loss": 1.08499169, + "learning_rate": 0.0009999239079791374, + "loss": 1.09810233, + "num_input_tokens_seen": 14195840, + "router_z_loss_mlp": 0.14953613, + "step": 184, + "time_per_iteration": 2.552949905395508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130912, + "balance_loss_mlp": 1.29237127, + "epoch": 0.03559061177375914, + "flos": 511820591616.0, + "grad_norm": 0.056329932736213485, + "language_loss": 1.01337337, + "learning_rate": 0.0009999183759291659, + "loss": 1.02646446, + "num_input_tokens_seen": 14269936, + "router_z_loss_mlp": 0.16748047, + "step": 185, + "time_per_iteration": 2.741727113723755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291511, + "balance_loss_mlp": 1.27575147, + "epoch": 0.0357829934590227, + "flos": 477146903040.0, + "grad_norm": 0.11224085577532149, + "language_loss": 1.03738213, + "learning_rate": 0.0009999126497936682, + "loss": 1.05029726, + "num_input_tokens_seen": 14334848, + "router_z_loss_mlp": 0.1574707, + "step": 186, + "time_per_iteration": 2.4901957511901855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291515, + "balance_loss_mlp": 1.27446783, + "epoch": 0.03597537514428627, + "flos": 644350588416.0, + "grad_norm": 0.06537030709871235, + "language_loss": 1.06735992, + "learning_rate": 0.0009999067295748676, + "loss": 1.08027506, + "num_input_tokens_seen": 14407888, + "router_z_loss_mlp": 0.1706543, + "step": 187, + "time_per_iteration": 2.7923052310943604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01327575, + "balance_loss_mlp": 1.30966997, + "epoch": 0.03616775682954983, + "flos": 580916275200.0, + "grad_norm": 0.06523062893181024, + "language_loss": 1.04418302, + "learning_rate": 0.000999900615275062, + "loss": 1.05745876, + "num_input_tokens_seen": 14479072, + "router_z_loss_mlp": 0.17919922, + "step": 188, + "time_per_iteration": 2.7248637676239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295421, + "balance_loss_mlp": 1.27722955, + "epoch": 0.03636013851481339, + "flos": 382210735104.0, + "grad_norm": 0.08035209765807474, + "language_loss": 1.10347509, + "learning_rate": 0.0009998943068966256, + "loss": 1.11642933, + "num_input_tokens_seen": 14540944, + "router_z_loss_mlp": 0.18188477, + "step": 189, + "time_per_iteration": 2.429497480392456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01279097, + "balance_loss_mlp": 1.26120377, + "epoch": 0.03655252020007695, + "flos": 582954706944.0, + "grad_norm": 0.07380481555246936, + "language_loss": 1.0506779, + "learning_rate": 0.0009998878044420072, + "loss": 1.06346881, + "num_input_tokens_seen": 14611392, + "router_z_loss_mlp": 0.17907715, + "step": 190, + "time_per_iteration": 2.6878626346588135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012863, + "balance_loss_mlp": 1.26773953, + "epoch": 0.03674490188534051, + "flos": 471376433664.0, + "grad_norm": 0.10484442400689244, + "language_loss": 1.01223493, + "learning_rate": 0.0009998811079137318, + "loss": 1.02509785, + "num_input_tokens_seen": 14679776, + "router_z_loss_mlp": 0.18566895, + "step": 191, + "time_per_iteration": 2.561494827270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281775, + "balance_loss_mlp": 1.26645625, + "epoch": 0.03693728357060408, + "flos": 528113488896.0, + "grad_norm": 0.0609431296621874, + "language_loss": 1.01984763, + "learning_rate": 0.0009998742173143987, + "loss": 1.03266537, + "num_input_tokens_seen": 14749712, + "router_z_loss_mlp": 0.1529541, + "step": 192, + "time_per_iteration": 2.59798264503479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01336751, + "balance_loss_mlp": 1.32157528, + "epoch": 0.03712966525586764, + "flos": 798657352704.0, + "grad_norm": 0.10186248006293357, + "language_loss": 1.02005363, + "learning_rate": 0.0009998671326466833, + "loss": 1.03342128, + "num_input_tokens_seen": 14827136, + "router_z_loss_mlp": 0.15185547, + "step": 193, + "time_per_iteration": 2.9510865211486816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01331057, + "balance_loss_mlp": 1.3157624, + "epoch": 0.037322046941131205, + "flos": 829628116992.0, + "grad_norm": 0.06375125184008373, + "language_loss": 1.02914846, + "learning_rate": 0.0009998598539133362, + "loss": 1.04245901, + "num_input_tokens_seen": 14902880, + "router_z_loss_mlp": 0.1529541, + "step": 194, + "time_per_iteration": 2.9981300830841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01337882, + "balance_loss_mlp": 1.3235296, + "epoch": 0.037514428626394765, + "flos": 437460797952.0, + "grad_norm": 0.10181133305516413, + "language_loss": 1.03936744, + "learning_rate": 0.0009998523811171828, + "loss": 1.0527463, + "num_input_tokens_seen": 14967264, + "router_z_loss_mlp": 0.14379883, + "step": 195, + "time_per_iteration": 2.501542568206787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01296215, + "balance_loss_mlp": 1.28125429, + "epoch": 0.03770681031165833, + "flos": 511372459008.0, + "grad_norm": 0.09414845611868274, + "language_loss": 1.04584992, + "learning_rate": 0.0009998447142611248, + "loss": 1.05881214, + "num_input_tokens_seen": 15039104, + "router_z_loss_mlp": 0.14941406, + "step": 196, + "time_per_iteration": 2.6247317790985107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0128144, + "balance_loss_mlp": 1.26702762, + "epoch": 0.03789919199692189, + "flos": 807102061056.0, + "grad_norm": 0.05831249070889761, + "language_loss": 0.97701526, + "learning_rate": 0.0009998368533481387, + "loss": 0.9898296, + "num_input_tokens_seen": 15124864, + "router_z_loss_mlp": 0.14422607, + "step": 197, + "time_per_iteration": 3.01912784576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01294999, + "balance_loss_mlp": 1.27945375, + "epoch": 0.03809157368218546, + "flos": 690274234368.0, + "grad_norm": 0.06656848410147823, + "language_loss": 1.00630498, + "learning_rate": 0.0009998287983812762, + "loss": 1.01925504, + "num_input_tokens_seen": 15199680, + "router_z_loss_mlp": 0.15551758, + "step": 198, + "time_per_iteration": 2.8252804279327393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0135387, + "balance_loss_mlp": 1.33592904, + "epoch": 0.03828395536744902, + "flos": 517675428864.0, + "grad_norm": 0.06988401379713739, + "language_loss": 1.06386423, + "learning_rate": 0.0009998205493636646, + "loss": 1.07740283, + "num_input_tokens_seen": 15270176, + "router_z_loss_mlp": 0.17944336, + "step": 199, + "time_per_iteration": 2.649765729904175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01339461, + "balance_loss_mlp": 1.32242572, + "epoch": 0.038476337052712584, + "flos": 581389138944.0, + "grad_norm": 0.07184113921580974, + "language_loss": 0.9925406, + "learning_rate": 0.0009998121062985063, + "loss": 1.00593519, + "num_input_tokens_seen": 15343168, + "router_z_loss_mlp": 0.17053223, + "step": 200, + "time_per_iteration": 2.6788320541381836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01328142, + "balance_loss_mlp": 1.31167912, + "epoch": 0.03866871873797614, + "flos": 576791359488.0, + "grad_norm": 0.059667024197710104, + "language_loss": 1.01260698, + "learning_rate": 0.0009998034691890794, + "loss": 1.02588844, + "num_input_tokens_seen": 15417328, + "router_z_loss_mlp": 0.16455078, + "step": 201, + "time_per_iteration": 2.753265380859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01297644, + "balance_loss_mlp": 1.28249288, + "epoch": 0.03886110042323971, + "flos": 540472117248.0, + "grad_norm": 0.07302515973387386, + "language_loss": 1.05948424, + "learning_rate": 0.0009997946380387369, + "loss": 1.07246065, + "num_input_tokens_seen": 15489488, + "router_z_loss_mlp": 0.15136719, + "step": 202, + "time_per_iteration": 2.618546485900879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262023, + "balance_loss_mlp": 1.24746776, + "epoch": 0.03905348210850327, + "flos": 717694843392.0, + "grad_norm": 0.0775452329378228, + "language_loss": 1.08266401, + "learning_rate": 0.0009997856128509076, + "loss": 1.09528422, + "num_input_tokens_seen": 15558944, + "router_z_loss_mlp": 0.14550781, + "step": 203, + "time_per_iteration": 2.8284859657287598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267878, + "balance_loss_mlp": 1.25321579, + "epoch": 0.039245863793766836, + "flos": 427268639232.0, + "grad_norm": 0.06664318613050589, + "language_loss": 1.02886617, + "learning_rate": 0.0009997763936290952, + "loss": 1.04154491, + "num_input_tokens_seen": 15625024, + "router_z_loss_mlp": 0.14660645, + "step": 204, + "time_per_iteration": 2.516263246536255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0129892, + "balance_loss_mlp": 1.28264785, + "epoch": 0.039438245479030395, + "flos": 662804163072.0, + "grad_norm": 0.07463685050771204, + "language_loss": 1.0815413, + "learning_rate": 0.0009997669803768789, + "loss": 1.09453046, + "num_input_tokens_seen": 15697120, + "router_z_loss_mlp": 0.16271973, + "step": 205, + "time_per_iteration": 2.7576606273651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291456, + "balance_loss_mlp": 1.27812803, + "epoch": 0.03963062716429396, + "flos": 635063459328.0, + "grad_norm": 0.055878982250893716, + "language_loss": 1.03253651, + "learning_rate": 0.0009997573730979134, + "loss": 1.04545116, + "num_input_tokens_seen": 15768752, + "router_z_loss_mlp": 0.13342285, + "step": 206, + "time_per_iteration": 2.716325521469116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.07279634, + "balance_loss_mlp": 4.65512276, + "epoch": 0.03982300884955752, + "flos": 1417813286400.0, + "grad_norm": 0.533603848118922, + "language_loss": 0.79193199, + "learning_rate": 0.0009997475717959284, + "loss": 0.86472833, + "num_input_tokens_seen": 15980624, + "router_z_loss_mlp": 26.25, + "step": 207, + "time_per_iteration": 4.635821342468262 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0137482, + "balance_loss_mlp": 1.35964513, + "epoch": 0.04001539053482109, + "flos": 688769713152.0, + "grad_norm": 0.1040721574676452, + "language_loss": 1.02094078, + "learning_rate": 0.0009997375764747294, + "loss": 1.03468895, + "num_input_tokens_seen": 16067232, + "router_z_loss_mlp": 0.1517334, + "step": 208, + "time_per_iteration": 2.974442481994629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01415285, + "balance_loss_mlp": 1.40052748, + "epoch": 0.04020777222008465, + "flos": 533363042304.0, + "grad_norm": 0.08111266742266361, + "language_loss": 0.99458027, + "learning_rate": 0.0009997273871381967, + "loss": 1.00873303, + "num_input_tokens_seen": 16139808, + "router_z_loss_mlp": 0.14758301, + "step": 209, + "time_per_iteration": 2.6802144050598145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01466201, + "balance_loss_mlp": 1.44989347, + "epoch": 0.040400153905348214, + "flos": 567661381632.0, + "grad_norm": 0.06875741115436663, + "language_loss": 1.05031717, + "learning_rate": 0.0009997170037902862, + "loss": 1.0649792, + "num_input_tokens_seen": 16210848, + "router_z_loss_mlp": 0.16308594, + "step": 210, + "time_per_iteration": 2.6975836753845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01531768, + "balance_loss_mlp": 1.51399446, + "epoch": 0.040592535590611774, + "flos": 713130559488.0, + "grad_norm": 0.07197690318934227, + "language_loss": 1.07202697, + "learning_rate": 0.0009997064264350292, + "loss": 1.08734465, + "num_input_tokens_seen": 16283984, + "router_z_loss_mlp": 0.17785645, + "step": 211, + "time_per_iteration": 2.836771011352539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01531925, + "balance_loss_mlp": 1.5154984, + "epoch": 0.04078491727587533, + "flos": 577824427008.0, + "grad_norm": 0.09120436996840299, + "language_loss": 1.0146966, + "learning_rate": 0.0009996956550765317, + "loss": 1.03001595, + "num_input_tokens_seen": 16353904, + "router_z_loss_mlp": 0.16430664, + "step": 212, + "time_per_iteration": 2.671292781829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01499293, + "balance_loss_mlp": 1.4817214, + "epoch": 0.0409772989611389, + "flos": 552033404928.0, + "grad_norm": 0.11449485477945152, + "language_loss": 0.96278083, + "learning_rate": 0.0009996846897189762, + "loss": 0.97777379, + "num_input_tokens_seen": 16425488, + "router_z_loss_mlp": 0.17565918, + "step": 213, + "time_per_iteration": 2.6231424808502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.014653, + "balance_loss_mlp": 1.44753814, + "epoch": 0.04116968064640246, + "flos": 555347833344.0, + "grad_norm": 0.09512793115916172, + "language_loss": 1.02356708, + "learning_rate": 0.0009996735303666193, + "loss": 1.03822017, + "num_input_tokens_seen": 16498016, + "router_z_loss_mlp": 0.1776123, + "step": 214, + "time_per_iteration": 2.6930177211761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01477298, + "balance_loss_mlp": 1.46134758, + "epoch": 0.041362062331666026, + "flos": 578204158464.0, + "grad_norm": 0.09141123477091552, + "language_loss": 1.04750729, + "learning_rate": 0.0009996621770237937, + "loss": 1.06228042, + "num_input_tokens_seen": 16573744, + "router_z_loss_mlp": 0.15942383, + "step": 215, + "time_per_iteration": 2.7448923587799072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01578462, + "balance_loss_mlp": 1.56013966, + "epoch": 0.041554444016929586, + "flos": 611130396672.0, + "grad_norm": 0.10233552268903827, + "language_loss": 0.99822551, + "learning_rate": 0.0009996506296949073, + "loss": 1.01401007, + "num_input_tokens_seen": 16655344, + "router_z_loss_mlp": 0.18334961, + "step": 216, + "time_per_iteration": 2.8548526763916016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01609008, + "balance_loss_mlp": 1.59156775, + "epoch": 0.04174682570219315, + "flos": 527857413120.0, + "grad_norm": 0.10522858499680945, + "language_loss": 0.99888742, + "learning_rate": 0.0009996388883844428, + "loss": 1.01497757, + "num_input_tokens_seen": 16726480, + "router_z_loss_mlp": 0.17456055, + "step": 217, + "time_per_iteration": 2.618546724319458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01557164, + "balance_loss_mlp": 1.54124999, + "epoch": 0.04193920738745671, + "flos": 511258977792.0, + "grad_norm": 0.09341741551851517, + "language_loss": 1.03841758, + "learning_rate": 0.0009996269530969588, + "loss": 1.05398929, + "num_input_tokens_seen": 16792112, + "router_z_loss_mlp": 0.15905762, + "step": 218, + "time_per_iteration": 2.6204636096954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01525903, + "balance_loss_mlp": 1.50927377, + "epoch": 0.04213158907272028, + "flos": 571226093568.0, + "grad_norm": 0.09609660813155754, + "language_loss": 1.02944803, + "learning_rate": 0.0009996148238370888, + "loss": 1.04470706, + "num_input_tokens_seen": 16862960, + "router_z_loss_mlp": 0.16625977, + "step": 219, + "time_per_iteration": 2.7071943283081055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0150071, + "balance_loss_mlp": 1.48340106, + "epoch": 0.04232397075798384, + "flos": 963803667456.0, + "grad_norm": 0.05454565212769997, + "language_loss": 0.9941752, + "learning_rate": 0.0009996025006095421, + "loss": 1.00918233, + "num_input_tokens_seen": 16950416, + "router_z_loss_mlp": 0.1730957, + "step": 220, + "time_per_iteration": 3.3006374835968018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.10944285, + "balance_loss_mlp": 6.84272289, + "epoch": 0.042516352443247404, + "flos": 1468814777856.0, + "grad_norm": 0.48497418398004566, + "language_loss": 0.77783144, + "learning_rate": 0.0009995899834191028, + "loss": 0.88727427, + "num_input_tokens_seen": 17180944, + "router_z_loss_mlp": 41.0, + "step": 221, + "time_per_iteration": 5.7136383056640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01601015, + "balance_loss_mlp": 1.58291924, + "epoch": 0.042708734128510964, + "flos": 654419091456.0, + "grad_norm": 0.10763646442297387, + "language_loss": 0.99765503, + "learning_rate": 0.0009995772722706307, + "loss": 1.0136652, + "num_input_tokens_seen": 17257792, + "router_z_loss_mlp": 0.1809082, + "step": 222, + "time_per_iteration": 2.8322792053222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01658843, + "balance_loss_mlp": 1.63811278, + "epoch": 0.04290111581377453, + "flos": 431601578496.0, + "grad_norm": 0.16393394652444138, + "language_loss": 1.13557565, + "learning_rate": 0.0009995643671690604, + "loss": 1.1521641, + "num_input_tokens_seen": 17320288, + "router_z_loss_mlp": 0.20739746, + "step": 223, + "time_per_iteration": 2.470729351043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0163871, + "balance_loss_mlp": 1.61504686, + "epoch": 0.04309349749903809, + "flos": 644379701760.0, + "grad_norm": 0.08733094203359489, + "language_loss": 1.00837708, + "learning_rate": 0.0009995512681194023, + "loss": 1.02476418, + "num_input_tokens_seen": 17396672, + "router_z_loss_mlp": 0.23632812, + "step": 224, + "time_per_iteration": 2.8274452686309814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01615568, + "balance_loss_mlp": 1.58755326, + "epoch": 0.04328587918430166, + "flos": 830861853696.0, + "grad_norm": 0.12001676841435771, + "language_loss": 0.98664522, + "learning_rate": 0.0009995379751267417, + "loss": 1.00280082, + "num_input_tokens_seen": 17488096, + "router_z_loss_mlp": 0.28027344, + "step": 225, + "time_per_iteration": 3.275660991668701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01617416, + "balance_loss_mlp": 1.58639741, + "epoch": 0.043478260869565216, + "flos": 524804852736.0, + "grad_norm": 0.1467276253632499, + "language_loss": 1.0007726, + "learning_rate": 0.0009995244881962398, + "loss": 1.01694679, + "num_input_tokens_seen": 17557632, + "router_z_loss_mlp": 0.30981445, + "step": 226, + "time_per_iteration": 2.6300203800201416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01601732, + "balance_loss_mlp": 1.56787658, + "epoch": 0.04367064255482878, + "flos": 439253328384.0, + "grad_norm": 0.095918638324787, + "language_loss": 1.01389623, + "learning_rate": 0.0009995108073331323, + "loss": 1.02991343, + "num_input_tokens_seen": 17626672, + "router_z_loss_mlp": 0.33862305, + "step": 227, + "time_per_iteration": 2.667628765106201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0158134, + "balance_loss_mlp": 1.5462923, + "epoch": 0.04386302424009234, + "flos": 507109330944.0, + "grad_norm": 0.08564981186298011, + "language_loss": 1.04024279, + "learning_rate": 0.0009994969325427309, + "loss": 1.05605614, + "num_input_tokens_seen": 17698624, + "router_z_loss_mlp": 0.35058594, + "step": 228, + "time_per_iteration": 2.6454501152038574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01558795, + "balance_loss_mlp": 1.52224541, + "epoch": 0.04405540592535591, + "flos": 540432829440.0, + "grad_norm": 0.07744391701114339, + "language_loss": 1.00052619, + "learning_rate": 0.0009994828638304218, + "loss": 1.016114, + "num_input_tokens_seen": 17767760, + "router_z_loss_mlp": 0.36547852, + "step": 229, + "time_per_iteration": 2.6468071937561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01516137, + "balance_loss_mlp": 1.47794271, + "epoch": 0.04424778761061947, + "flos": 446136850944.0, + "grad_norm": 0.08052263902742013, + "language_loss": 1.06763554, + "learning_rate": 0.0009994686012016675, + "loss": 1.08279693, + "num_input_tokens_seen": 17833664, + "router_z_loss_mlp": 0.3815918, + "step": 230, + "time_per_iteration": 2.5467634201049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01483037, + "balance_loss_mlp": 1.44515228, + "epoch": 0.044440169295883035, + "flos": 700383435264.0, + "grad_norm": 0.05918307184238542, + "language_loss": 1.0518043, + "learning_rate": 0.000999454144662005, + "loss": 1.06663465, + "num_input_tokens_seen": 17908880, + "router_z_loss_mlp": 0.37866211, + "step": 231, + "time_per_iteration": 2.8704099655151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01473358, + "balance_loss_mlp": 1.43549716, + "epoch": 0.044632550981146595, + "flos": 588055873536.0, + "grad_norm": 0.08626514264815018, + "language_loss": 0.99676436, + "learning_rate": 0.0009994394942170468, + "loss": 1.01149797, + "num_input_tokens_seen": 17978208, + "router_z_loss_mlp": 0.37866211, + "step": 232, + "time_per_iteration": 2.6578898429870605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01461415, + "balance_loss_mlp": 1.4258194, + "epoch": 0.04482493266641016, + "flos": 554534525952.0, + "grad_norm": 0.07124765242066121, + "language_loss": 0.96965969, + "learning_rate": 0.0009994246498724808, + "loss": 0.98427379, + "num_input_tokens_seen": 18049296, + "router_z_loss_mlp": 0.35620117, + "step": 233, + "time_per_iteration": 2.7764015197753906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01463645, + "balance_loss_mlp": 1.42790616, + "epoch": 0.04501731435167372, + "flos": 722500646400.0, + "grad_norm": 0.07759597622956232, + "language_loss": 0.99069166, + "learning_rate": 0.00099940961163407, + "loss": 1.00532806, + "num_input_tokens_seen": 18123296, + "router_z_loss_mlp": 0.35766602, + "step": 234, + "time_per_iteration": 2.8431143760681152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01454599, + "balance_loss_mlp": 1.42098188, + "epoch": 0.04520969603693728, + "flos": 511539784704.0, + "grad_norm": 0.05931413709293958, + "language_loss": 1.02564597, + "learning_rate": 0.0009993943795076528, + "loss": 1.04019189, + "num_input_tokens_seen": 18192784, + "router_z_loss_mlp": 0.33642578, + "step": 235, + "time_per_iteration": 2.645988702774048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01444244, + "balance_loss_mlp": 1.40936303, + "epoch": 0.04540207772220085, + "flos": 364854246912.0, + "grad_norm": 0.07280953320994132, + "language_loss": 1.04776168, + "learning_rate": 0.0009993789534991427, + "loss": 1.062204, + "num_input_tokens_seen": 18254064, + "router_z_loss_mlp": 0.34912109, + "step": 236, + "time_per_iteration": 2.4084837436676025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01418385, + "balance_loss_mlp": 1.38390946, + "epoch": 0.045594459407464406, + "flos": 522407038464.0, + "grad_norm": 0.060943880380569936, + "language_loss": 0.99500269, + "learning_rate": 0.0009993633336145287, + "loss": 1.00918651, + "num_input_tokens_seen": 18325728, + "router_z_loss_mlp": 0.34472656, + "step": 237, + "time_per_iteration": 2.6044533252716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01406135, + "balance_loss_mlp": 1.3730185, + "epoch": 0.04578684109272797, + "flos": 671442338304.0, + "grad_norm": 0.06747057459653658, + "language_loss": 1.03573179, + "learning_rate": 0.0009993475198598752, + "loss": 1.04979324, + "num_input_tokens_seen": 18408608, + "router_z_loss_mlp": 0.33129883, + "step": 238, + "time_per_iteration": 2.9948084354400635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01383668, + "balance_loss_mlp": 1.35164809, + "epoch": 0.04597922277799153, + "flos": 541387321344.0, + "grad_norm": 0.07135856148897902, + "language_loss": 0.99909985, + "learning_rate": 0.0009993315122413212, + "loss": 1.01293659, + "num_input_tokens_seen": 18471920, + "router_z_loss_mlp": 0.32006836, + "step": 239, + "time_per_iteration": 2.5848827362060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01369111, + "balance_loss_mlp": 1.33773541, + "epoch": 0.0461716044632551, + "flos": 458732616192.0, + "grad_norm": 0.056000088810755834, + "language_loss": 1.0008105, + "learning_rate": 0.0009993153107650818, + "loss": 1.01450157, + "num_input_tokens_seen": 18540496, + "router_z_loss_mlp": 0.31347656, + "step": 240, + "time_per_iteration": 2.5492687225341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01338815, + "balance_loss_mlp": 1.31015706, + "epoch": 0.04636398614851866, + "flos": 455009342976.0, + "grad_norm": 0.06491754001609312, + "language_loss": 0.99534512, + "learning_rate": 0.0009992989154374468, + "loss": 1.00873327, + "num_input_tokens_seen": 18606944, + "router_z_loss_mlp": 0.28662109, + "step": 241, + "time_per_iteration": 2.511237621307373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01294622, + "balance_loss_mlp": 1.26833653, + "epoch": 0.046556367833782225, + "flos": 556558401024.0, + "grad_norm": 0.07592069792168304, + "language_loss": 1.06626534, + "learning_rate": 0.0009992823262647817, + "loss": 1.07921147, + "num_input_tokens_seen": 18679520, + "router_z_loss_mlp": 0.26293945, + "step": 242, + "time_per_iteration": 2.7618701457977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249282, + "balance_loss_mlp": 1.22240043, + "epoch": 0.046748749519045785, + "flos": 592625949696.0, + "grad_norm": 0.0687662987323222, + "language_loss": 1.00893593, + "learning_rate": 0.0009992655432535264, + "loss": 1.0214287, + "num_input_tokens_seen": 18756656, + "router_z_loss_mlp": 0.26879883, + "step": 243, + "time_per_iteration": 2.7471935749053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01255015, + "balance_loss_mlp": 1.23083937, + "epoch": 0.04694113120430935, + "flos": 569596506624.0, + "grad_norm": 0.07373455055845594, + "language_loss": 1.0054853, + "learning_rate": 0.0009992485664101973, + "loss": 1.01803541, + "num_input_tokens_seen": 18829792, + "router_z_loss_mlp": 0.24169922, + "step": 244, + "time_per_iteration": 2.635344982147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295554, + "balance_loss_mlp": 1.27291572, + "epoch": 0.04713351288957291, + "flos": 863401158144.0, + "grad_norm": 0.10584905626659928, + "language_loss": 1.03312445, + "learning_rate": 0.000999231395741385, + "loss": 1.04607987, + "num_input_tokens_seen": 18906864, + "router_z_loss_mlp": 0.22631836, + "step": 245, + "time_per_iteration": 3.093386173248291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0128706, + "balance_loss_mlp": 1.26464868, + "epoch": 0.04732589457483648, + "flos": 536961249792.0, + "grad_norm": 0.08844420521863233, + "language_loss": 1.01371169, + "learning_rate": 0.0009992140312537557, + "loss": 1.02658224, + "num_input_tokens_seen": 18973632, + "router_z_loss_mlp": 0.22412109, + "step": 246, + "time_per_iteration": 2.667579412460327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01256359, + "balance_loss_mlp": 1.23515141, + "epoch": 0.04751827626010004, + "flos": 761566910976.0, + "grad_norm": 0.052835972446563725, + "language_loss": 0.9609164, + "learning_rate": 0.000999196472954051, + "loss": 0.97347999, + "num_input_tokens_seen": 19052944, + "router_z_loss_mlp": 0.2121582, + "step": 247, + "time_per_iteration": 2.9537084102630615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02369813, + "balance_loss_mlp": 2.16687083, + "epoch": 0.0477106579453636, + "flos": 1578961313280.0, + "grad_norm": 0.2151482568863758, + "language_loss": 0.79424852, + "learning_rate": 0.0009991787208490878, + "loss": 0.81794667, + "num_input_tokens_seen": 19286288, + "router_z_loss_mlp": 2.03125, + "step": 248, + "time_per_iteration": 5.758621454238892 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01289622, + "balance_loss_mlp": 1.27137113, + "epoch": 0.04790303963062716, + "flos": 457535195136.0, + "grad_norm": 0.10849969336884063, + "language_loss": 1.03316629, + "learning_rate": 0.0009991607749457578, + "loss": 1.04606247, + "num_input_tokens_seen": 19349296, + "router_z_loss_mlp": 0.18261719, + "step": 249, + "time_per_iteration": 2.5432913303375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01334566, + "balance_loss_mlp": 1.31724536, + "epoch": 0.04809542131589073, + "flos": 782079266304.0, + "grad_norm": 0.08264534697846654, + "language_loss": 1.01180637, + "learning_rate": 0.0009991426352510286, + "loss": 1.02515209, + "num_input_tokens_seen": 19428416, + "router_z_loss_mlp": 0.17321777, + "step": 250, + "time_per_iteration": 3.1542766094207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01351096, + "balance_loss_mlp": 1.33215368, + "epoch": 0.04828780300115429, + "flos": 558995503104.0, + "grad_norm": 0.06435857362074206, + "language_loss": 1.03307557, + "learning_rate": 0.0009991243017719422, + "loss": 1.04658651, + "num_input_tokens_seen": 19498688, + "router_z_loss_mlp": 0.18933105, + "step": 251, + "time_per_iteration": 2.693882942199707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01333217, + "balance_loss_mlp": 1.31485844, + "epoch": 0.048480184686417856, + "flos": 501682277376.0, + "grad_norm": 0.09276508096019526, + "language_loss": 0.97794825, + "learning_rate": 0.0009991057745156165, + "loss": 0.99128038, + "num_input_tokens_seen": 19567568, + "router_z_loss_mlp": 0.18347168, + "step": 252, + "time_per_iteration": 2.628873109817505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01867297, + "balance_loss_mlp": 1.75514495, + "epoch": 0.048672566371681415, + "flos": 1535585430528.0, + "grad_norm": 0.16359674361847032, + "language_loss": 0.81910986, + "learning_rate": 0.0009990870534892446, + "loss": 0.8377828, + "num_input_tokens_seen": 19796368, + "router_z_loss_mlp": 1.125, + "step": 253, + "time_per_iteration": 5.060615062713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01285031, + "balance_loss_mlp": 1.26567185, + "epoch": 0.04886494805694498, + "flos": 537665458176.0, + "grad_norm": 0.07164286827098729, + "language_loss": 1.06546187, + "learning_rate": 0.0009990681387000943, + "loss": 1.07831216, + "num_input_tokens_seen": 19870480, + "router_z_loss_mlp": 0.19384766, + "step": 254, + "time_per_iteration": 2.783367395401001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01275754, + "balance_loss_mlp": 1.25606036, + "epoch": 0.04905732974220854, + "flos": 679841966592.0, + "grad_norm": 0.06618046133348403, + "language_loss": 1.01404011, + "learning_rate": 0.0009990490301555093, + "loss": 1.02679765, + "num_input_tokens_seen": 19956288, + "router_z_loss_mlp": 0.19689941, + "step": 255, + "time_per_iteration": 2.9520761966705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01936632, + "balance_loss_mlp": 1.86796737, + "epoch": 0.04924971142747211, + "flos": 1420408949760.0, + "grad_norm": 0.31562964738653715, + "language_loss": 0.79215157, + "learning_rate": 0.0009990297278629078, + "loss": 0.81151783, + "num_input_tokens_seen": 20180080, + "router_z_loss_mlp": 0.6875, + "step": 256, + "time_per_iteration": 4.825209856033325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01615246, + "balance_loss_mlp": 1.55344784, + "epoch": 0.04944209311273567, + "flos": 1557202074624.0, + "grad_norm": 0.16937574338078817, + "language_loss": 0.79242742, + "learning_rate": 0.000999010231829784, + "loss": 0.80857986, + "num_input_tokens_seen": 20413456, + "router_z_loss_mlp": 0.6171875, + "step": 257, + "time_per_iteration": 4.995501518249512 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0163115, + "balance_loss_mlp": 1.58422887, + "epoch": 0.04963447479799923, + "flos": 1569985514496.0, + "grad_norm": 0.13524925240989144, + "language_loss": 0.69975883, + "learning_rate": 0.0009989905420637066, + "loss": 0.71607035, + "num_input_tokens_seen": 20644736, + "router_z_loss_mlp": 0.46875, + "step": 258, + "time_per_iteration": 4.841471910476685 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272668, + "balance_loss_mlp": 1.24463022, + "epoch": 0.049826856483262794, + "flos": 625063357440.0, + "grad_norm": 0.06365504505971183, + "language_loss": 0.95603192, + "learning_rate": 0.0009989706585723202, + "loss": 0.96875864, + "num_input_tokens_seen": 20719040, + "router_z_loss_mlp": 0.28076172, + "step": 259, + "time_per_iteration": 2.7635786533355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130022, + "balance_loss_mlp": 1.27020288, + "epoch": 0.05001923816852635, + "flos": 503912765952.0, + "grad_norm": 0.062257698278494894, + "language_loss": 1.01846027, + "learning_rate": 0.0009989505813633442, + "loss": 1.03146255, + "num_input_tokens_seen": 20789376, + "router_z_loss_mlp": 0.29980469, + "step": 260, + "time_per_iteration": 2.6451833248138428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0131611, + "balance_loss_mlp": 1.28101516, + "epoch": 0.05021161985378992, + "flos": 587066476032.0, + "grad_norm": 0.06290514068599455, + "language_loss": 1.01911807, + "learning_rate": 0.000998930310444573, + "loss": 1.03227913, + "num_input_tokens_seen": 20857856, + "router_z_loss_mlp": 0.35083008, + "step": 261, + "time_per_iteration": 2.6989662647247314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01324978, + "balance_loss_mlp": 1.2880708, + "epoch": 0.05040400153905348, + "flos": 633029409792.0, + "grad_norm": 0.0625839964239575, + "language_loss": 1.00387836, + "learning_rate": 0.0009989098458238765, + "loss": 1.01712811, + "num_input_tokens_seen": 20931232, + "router_z_loss_mlp": 0.36914062, + "step": 262, + "time_per_iteration": 2.7581043243408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319841, + "balance_loss_mlp": 1.28395867, + "epoch": 0.050596383224317046, + "flos": 553344307200.0, + "grad_norm": 0.06067150197267865, + "language_loss": 0.99905968, + "learning_rate": 0.0009988891875091998, + "loss": 1.01225805, + "num_input_tokens_seen": 21012672, + "router_z_loss_mlp": 0.35913086, + "step": 263, + "time_per_iteration": 2.7601842880249023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0131413, + "balance_loss_mlp": 1.27793837, + "epoch": 0.050788764909580605, + "flos": 549389689344.0, + "grad_norm": 0.07440292928735547, + "language_loss": 0.94277728, + "learning_rate": 0.0009988683355085636, + "loss": 0.95591855, + "num_input_tokens_seen": 21088592, + "router_z_loss_mlp": 0.36206055, + "step": 264, + "time_per_iteration": 2.7262909412384033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01277315, + "balance_loss_mlp": 1.24248254, + "epoch": 0.05098114659484417, + "flos": 604812870144.0, + "grad_norm": 0.06984595792035174, + "language_loss": 1.02861905, + "learning_rate": 0.000998847289830063, + "loss": 1.04139221, + "num_input_tokens_seen": 21169840, + "router_z_loss_mlp": 0.34838867, + "step": 265, + "time_per_iteration": 2.8318397998809814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01256298, + "balance_loss_mlp": 1.22272849, + "epoch": 0.05117352828010773, + "flos": 438317775360.0, + "grad_norm": 0.08677906198544101, + "language_loss": 0.95779377, + "learning_rate": 0.0009988260504818682, + "loss": 0.9703567, + "num_input_tokens_seen": 21236144, + "router_z_loss_mlp": 0.3359375, + "step": 266, + "time_per_iteration": 2.5212388038635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220367, + "balance_loss_mlp": 1.19046903, + "epoch": 0.0513659099653713, + "flos": 504784300032.0, + "grad_norm": 0.09456939977029206, + "language_loss": 1.01958096, + "learning_rate": 0.000998804617472226, + "loss": 1.03178465, + "num_input_tokens_seen": 21304864, + "router_z_loss_mlp": 0.29858398, + "step": 267, + "time_per_iteration": 2.649739980697632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169139, + "balance_loss_mlp": 1.14131606, + "epoch": 0.05155829165063486, + "flos": 695183344128.0, + "grad_norm": 0.07125411147685125, + "language_loss": 0.97574937, + "learning_rate": 0.0009987829908094568, + "loss": 0.98744082, + "num_input_tokens_seen": 21377504, + "router_z_loss_mlp": 0.27856445, + "step": 268, + "time_per_iteration": 2.816098690032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119703, + "balance_loss_mlp": 1.09379935, + "epoch": 0.051750673335898424, + "flos": 1347751830528.0, + "grad_norm": 0.06583247177587333, + "language_loss": 1.04151332, + "learning_rate": 0.0009987611705019569, + "loss": 1.05271029, + "num_input_tokens_seen": 21463840, + "router_z_loss_mlp": 0.25927734, + "step": 269, + "time_per_iteration": 4.478148460388184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103433, + "balance_loss_mlp": 1.08008027, + "epoch": 0.051943055021161984, + "flos": 489362936832.0, + "grad_norm": 0.06787757239342199, + "language_loss": 1.02481639, + "learning_rate": 0.0009987391565581978, + "loss": 1.03585076, + "num_input_tokens_seen": 21531184, + "router_z_loss_mlp": 0.23364258, + "step": 270, + "time_per_iteration": 2.5662009716033936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111004, + "balance_loss_mlp": 1.08859241, + "epoch": 0.05213543670642555, + "flos": 545504882688.0, + "grad_norm": 0.08198896814085149, + "language_loss": 0.9504528, + "learning_rate": 0.000998716948986726, + "loss": 0.96156287, + "num_input_tokens_seen": 21612224, + "router_z_loss_mlp": 0.22424316, + "step": 271, + "time_per_iteration": 2.7815349102020264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158552, + "balance_loss_mlp": 1.13697529, + "epoch": 0.05232781839168911, + "flos": 603285179904.0, + "grad_norm": 0.07646156534985457, + "language_loss": 0.97641921, + "learning_rate": 0.0009986945477961633, + "loss": 0.9880048, + "num_input_tokens_seen": 21681024, + "router_z_loss_mlp": 0.21569824, + "step": 272, + "time_per_iteration": 2.694547414779663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188724, + "balance_loss_mlp": 1.16735017, + "epoch": 0.052520200076952676, + "flos": 538218307584.0, + "grad_norm": 0.07381807258867126, + "language_loss": 1.02498066, + "learning_rate": 0.0009986719529952066, + "loss": 1.03686786, + "num_input_tokens_seen": 21761616, + "router_z_loss_mlp": 0.21386719, + "step": 273, + "time_per_iteration": 2.8339192867279053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121752, + "balance_loss_mlp": 1.19785035, + "epoch": 0.052712581762216236, + "flos": 463148513280.0, + "grad_norm": 0.0738352941440963, + "language_loss": 1.01808548, + "learning_rate": 0.000998649164592628, + "loss": 1.03026068, + "num_input_tokens_seen": 21828416, + "router_z_loss_mlp": 0.19677734, + "step": 274, + "time_per_iteration": 2.60577130317688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236713, + "balance_loss_mlp": 1.21763909, + "epoch": 0.0529049634474798, + "flos": 547749927936.0, + "grad_norm": 0.08134169766286939, + "language_loss": 0.99272913, + "learning_rate": 0.0009986261825972748, + "loss": 1.00509632, + "num_input_tokens_seen": 21901600, + "router_z_loss_mlp": 0.19055176, + "step": 275, + "time_per_iteration": 2.652561664581299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196689, + "balance_loss_mlp": 1.17834246, + "epoch": 0.05309734513274336, + "flos": 617727320064.0, + "grad_norm": 0.09111845604121613, + "language_loss": 1.01860571, + "learning_rate": 0.000998603007018069, + "loss": 1.03057253, + "num_input_tokens_seen": 21979312, + "router_z_loss_mlp": 0.18334961, + "step": 276, + "time_per_iteration": 2.8293774127960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011443, + "balance_loss_mlp": 1.1273365, + "epoch": 0.05328972681800693, + "flos": 605220304896.0, + "grad_norm": 0.07377841396756965, + "language_loss": 0.99345076, + "learning_rate": 0.0009985796378640089, + "loss": 1.00489378, + "num_input_tokens_seen": 22053776, + "router_z_loss_mlp": 0.16955566, + "step": 277, + "time_per_iteration": 2.694716215133667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098185, + "balance_loss_mlp": 1.08067346, + "epoch": 0.05348210850327049, + "flos": 604197411840.0, + "grad_norm": 0.07074934963985437, + "language_loss": 0.99532163, + "learning_rate": 0.0009985560751441665, + "loss": 1.00630355, + "num_input_tokens_seen": 22134304, + "router_z_loss_mlp": 0.1751709, + "step": 278, + "time_per_iteration": 2.798563241958618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095446, + "balance_loss_mlp": 1.07736206, + "epoch": 0.053674490188534055, + "flos": 630480236544.0, + "grad_norm": 0.054749659326078955, + "language_loss": 1.01733184, + "learning_rate": 0.00099853231886769, + "loss": 1.02828622, + "num_input_tokens_seen": 22212896, + "router_z_loss_mlp": 0.1809082, + "step": 279, + "time_per_iteration": 2.780940532684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134885, + "balance_loss_mlp": 1.11744475, + "epoch": 0.053866871873797614, + "flos": 478939433472.0, + "grad_norm": 0.06375435082524677, + "language_loss": 1.01461124, + "learning_rate": 0.0009985083690438024, + "loss": 1.02595997, + "num_input_tokens_seen": 22287216, + "router_z_loss_mlp": 0.17443848, + "step": 280, + "time_per_iteration": 2.68762469291687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145965, + "balance_loss_mlp": 1.12913251, + "epoch": 0.054059253559061174, + "flos": 787673645568.0, + "grad_norm": 0.07384801764192533, + "language_loss": 0.92380941, + "learning_rate": 0.0009984842256818016, + "loss": 0.93526906, + "num_input_tokens_seen": 22370864, + "router_z_loss_mlp": 0.16845703, + "step": 281, + "time_per_iteration": 3.054032325744629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114791, + "balance_loss_mlp": 1.13080359, + "epoch": 0.05425163524432474, + "flos": 628076630016.0, + "grad_norm": 0.082175996598207, + "language_loss": 1.0314945, + "learning_rate": 0.0009984598887910613, + "loss": 1.04297376, + "num_input_tokens_seen": 22440080, + "router_z_loss_mlp": 0.17114258, + "step": 282, + "time_per_iteration": 2.7095611095428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144379, + "balance_loss_mlp": 1.12627149, + "epoch": 0.0544440169295883, + "flos": 615453161472.0, + "grad_norm": 0.06813866095032944, + "language_loss": 0.9902432, + "learning_rate": 0.0009984353583810297, + "loss": 1.00168693, + "num_input_tokens_seen": 22517936, + "router_z_loss_mlp": 0.18103027, + "step": 283, + "time_per_iteration": 2.804438829421997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124262, + "balance_loss_mlp": 1.10624945, + "epoch": 0.05463639861485187, + "flos": 647471549952.0, + "grad_norm": 0.10003204141391345, + "language_loss": 1.01340103, + "learning_rate": 0.0009984106344612302, + "loss": 1.02464366, + "num_input_tokens_seen": 22590480, + "router_z_loss_mlp": 0.18017578, + "step": 284, + "time_per_iteration": 2.7521376609802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109552, + "balance_loss_mlp": 1.07819879, + "epoch": 0.054828780300115426, + "flos": 796845883392.0, + "grad_norm": 0.07143310654982075, + "language_loss": 0.96421391, + "learning_rate": 0.0009983857170412615, + "loss": 0.97516906, + "num_input_tokens_seen": 22668144, + "router_z_loss_mlp": 0.17321777, + "step": 285, + "time_per_iteration": 2.9796621799468994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089942, + "balance_loss_mlp": 1.07363439, + "epoch": 0.05502116198537899, + "flos": 549414420480.0, + "grad_norm": 0.05224422052371224, + "language_loss": 0.95713383, + "learning_rate": 0.000998360606130798, + "loss": 0.96803325, + "num_input_tokens_seen": 22749648, + "router_z_loss_mlp": 0.16308594, + "step": 286, + "time_per_iteration": 2.7950801849365234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02908189, + "balance_loss_mlp": 2.83799911, + "epoch": 0.05521354367064255, + "flos": 1406967791616.0, + "grad_norm": 0.233188183772104, + "language_loss": 0.69073117, + "learning_rate": 0.0009983353017395877, + "loss": 0.71981305, + "num_input_tokens_seen": 22982752, + "router_z_loss_mlp": 0.703125, + "step": 287, + "time_per_iteration": 4.876653432846069 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179574, + "balance_loss_mlp": 1.1627655, + "epoch": 0.05540592535590612, + "flos": 645123197952.0, + "grad_norm": 0.17417830683261867, + "language_loss": 1.0204829, + "learning_rate": 0.0009983098038774552, + "loss": 1.03227878, + "num_input_tokens_seen": 23053584, + "router_z_loss_mlp": 0.16821289, + "step": 288, + "time_per_iteration": 2.7781550884246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02101154, + "balance_loss_mlp": 2.07540464, + "epoch": 0.05559830704116968, + "flos": 1510293413376.0, + "grad_norm": 0.1730100464590254, + "language_loss": 0.78170228, + "learning_rate": 0.0009982841125542993, + "loss": 0.80271375, + "num_input_tokens_seen": 23280256, + "router_z_loss_mlp": 0.2578125, + "step": 289, + "time_per_iteration": 4.801970481872559 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01338926, + "balance_loss_mlp": 1.32332134, + "epoch": 0.055790688726433245, + "flos": 508078379520.0, + "grad_norm": 0.11288123874753296, + "language_loss": 0.99586821, + "learning_rate": 0.0009982582277800948, + "loss": 1.00925756, + "num_input_tokens_seen": 23345760, + "router_z_loss_mlp": 0.15588379, + "step": 290, + "time_per_iteration": 2.6019012928009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01376714, + "balance_loss_mlp": 1.36076403, + "epoch": 0.055983070411696804, + "flos": 657570576384.0, + "grad_norm": 0.11158393407579077, + "language_loss": 1.06464982, + "learning_rate": 0.0009982321495648908, + "loss": 1.07841706, + "num_input_tokens_seen": 23420720, + "router_z_loss_mlp": 0.15942383, + "step": 291, + "time_per_iteration": 2.7833075523376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281101, + "balance_loss_mlp": 1.26441216, + "epoch": 0.05617545209696037, + "flos": 587051919360.0, + "grad_norm": 0.091490024999748, + "language_loss": 0.97375935, + "learning_rate": 0.0009982058779188115, + "loss": 0.98657036, + "num_input_tokens_seen": 23492576, + "router_z_loss_mlp": 0.16699219, + "step": 292, + "time_per_iteration": 2.700998067855835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223751, + "balance_loss_mlp": 1.20634639, + "epoch": 0.05636783378222393, + "flos": 611331217920.0, + "grad_norm": 0.09093545163733599, + "language_loss": 1.05090272, + "learning_rate": 0.0009981794128520567, + "loss": 1.06314015, + "num_input_tokens_seen": 23569824, + "router_z_loss_mlp": 0.17431641, + "step": 293, + "time_per_iteration": 2.769562244415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172918, + "balance_loss_mlp": 1.15501237, + "epoch": 0.0565602154674875, + "flos": 667847102976.0, + "grad_norm": 0.08200667246549262, + "language_loss": 1.02219713, + "learning_rate": 0.000998152754374901, + "loss": 1.03392649, + "num_input_tokens_seen": 23649984, + "router_z_loss_mlp": 0.17919922, + "step": 294, + "time_per_iteration": 2.8421483039855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140496, + "balance_loss_mlp": 1.12121987, + "epoch": 0.05675259715275106, + "flos": 616963474944.0, + "grad_norm": 0.06298459153201627, + "language_loss": 0.97706711, + "learning_rate": 0.0009981259024976943, + "loss": 0.98847204, + "num_input_tokens_seen": 23722032, + "router_z_loss_mlp": 0.19250488, + "step": 295, + "time_per_iteration": 2.709536552429199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131247, + "balance_loss_mlp": 1.11139894, + "epoch": 0.05694497883801462, + "flos": 751424214528.0, + "grad_norm": 0.13011693222478776, + "language_loss": 0.96307456, + "learning_rate": 0.0009980988572308612, + "loss": 0.97438705, + "num_input_tokens_seen": 23797376, + "router_z_loss_mlp": 0.19848633, + "step": 296, + "time_per_iteration": 2.9606993198394775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125313, + "balance_loss_mlp": 1.10492802, + "epoch": 0.05713736052327818, + "flos": 711669708288.0, + "grad_norm": 0.06808560063607492, + "language_loss": 0.9959082, + "learning_rate": 0.0009980716185849015, + "loss": 1.00716126, + "num_input_tokens_seen": 23880496, + "router_z_loss_mlp": 0.20385742, + "step": 297, + "time_per_iteration": 2.952467203140259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133548, + "balance_loss_mlp": 1.11424804, + "epoch": 0.05732974220854175, + "flos": 468737100288.0, + "grad_norm": 0.05570922928007862, + "language_loss": 0.95103967, + "learning_rate": 0.0009980441865703904, + "loss": 0.9623751, + "num_input_tokens_seen": 23950016, + "router_z_loss_mlp": 0.19299316, + "step": 298, + "time_per_iteration": 2.6629996299743652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125947, + "balance_loss_mlp": 1.10630131, + "epoch": 0.05752212389380531, + "flos": 601143441408.0, + "grad_norm": 0.06175770353433084, + "language_loss": 1.038656, + "learning_rate": 0.000998016561197978, + "loss": 1.04991555, + "num_input_tokens_seen": 24020064, + "router_z_loss_mlp": 0.19628906, + "step": 299, + "time_per_iteration": 2.7027034759521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122899, + "balance_loss_mlp": 1.10499382, + "epoch": 0.057714505579068875, + "flos": 678344799744.0, + "grad_norm": 0.07709513760197055, + "language_loss": 0.95715761, + "learning_rate": 0.0009979887424783895, + "loss": 0.96838653, + "num_input_tokens_seen": 24095360, + "router_z_loss_mlp": 0.17907715, + "step": 300, + "time_per_iteration": 2.8467562198638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122592, + "balance_loss_mlp": 1.10369694, + "epoch": 0.057906887264332435, + "flos": 595604316672.0, + "grad_norm": 0.05754387138467597, + "language_loss": 0.94804943, + "learning_rate": 0.0009979607304224248, + "loss": 0.95927536, + "num_input_tokens_seen": 24164608, + "router_z_loss_mlp": 0.18908691, + "step": 301, + "time_per_iteration": 2.7457566261291504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135958, + "balance_loss_mlp": 1.11577594, + "epoch": 0.058099268949596, + "flos": 551855904768.0, + "grad_norm": 0.06951393564289957, + "language_loss": 1.02452385, + "learning_rate": 0.000997932525040959, + "loss": 1.03588343, + "num_input_tokens_seen": 24233840, + "router_z_loss_mlp": 0.20166016, + "step": 302, + "time_per_iteration": 2.670464038848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123814, + "balance_loss_mlp": 1.10513425, + "epoch": 0.05829165063485956, + "flos": 507906671616.0, + "grad_norm": 0.06408930588753382, + "language_loss": 1.04041958, + "learning_rate": 0.000997904126344943, + "loss": 1.05165768, + "num_input_tokens_seen": 24302928, + "router_z_loss_mlp": 0.18676758, + "step": 303, + "time_per_iteration": 2.654275417327881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122557, + "balance_loss_mlp": 1.10432982, + "epoch": 0.05848403232012313, + "flos": 614949774336.0, + "grad_norm": 0.10902949066110783, + "language_loss": 1.00108004, + "learning_rate": 0.0009978755343454018, + "loss": 1.0123055, + "num_input_tokens_seen": 24377024, + "router_z_loss_mlp": 0.18212891, + "step": 304, + "time_per_iteration": 2.7061922550201416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118016, + "balance_loss_mlp": 1.10034943, + "epoch": 0.05867641400538669, + "flos": 499835902464.0, + "grad_norm": 0.07196511907519268, + "language_loss": 1.01183403, + "learning_rate": 0.0009978467490534355, + "loss": 1.02301419, + "num_input_tokens_seen": 24442736, + "router_z_loss_mlp": 0.17663574, + "step": 305, + "time_per_iteration": 2.5658843517303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118418, + "balance_loss_mlp": 1.09971452, + "epoch": 0.05886879569065025, + "flos": 531019072512.0, + "grad_norm": 0.05577021807863236, + "language_loss": 0.98775607, + "learning_rate": 0.00099781777048022, + "loss": 0.99894023, + "num_input_tokens_seen": 24514800, + "router_z_loss_mlp": 0.18713379, + "step": 306, + "time_per_iteration": 2.688661813735962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112614, + "balance_loss_mlp": 1.10866416, + "epoch": 0.05906117737591381, + "flos": 488811497472.0, + "grad_norm": 0.06489613907432343, + "language_loss": 0.99682212, + "learning_rate": 0.0009977885986370057, + "loss": 1.00808358, + "num_input_tokens_seen": 24581648, + "router_z_loss_mlp": 0.17480469, + "step": 307, + "time_per_iteration": 2.527008056640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129188, + "balance_loss_mlp": 1.11242771, + "epoch": 0.05925355906117737, + "flos": 591213150720.0, + "grad_norm": 0.060579194597163814, + "language_loss": 0.94911426, + "learning_rate": 0.000997759233535118, + "loss": 0.96040612, + "num_input_tokens_seen": 24658864, + "router_z_loss_mlp": 0.16772461, + "step": 308, + "time_per_iteration": 2.768683433532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108276, + "balance_loss_mlp": 1.09052539, + "epoch": 0.05944594074644094, + "flos": 563373522432.0, + "grad_norm": 0.074144120767366, + "language_loss": 1.01706028, + "learning_rate": 0.0009977296751859576, + "loss": 1.02814317, + "num_input_tokens_seen": 24735808, + "router_z_loss_mlp": 0.17749023, + "step": 309, + "time_per_iteration": 2.710550308227539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109964, + "balance_loss_mlp": 1.0817585, + "epoch": 0.0596383224317045, + "flos": 538483147776.0, + "grad_norm": 0.1012520362466171, + "language_loss": 1.03562367, + "learning_rate": 0.0009976999236009998, + "loss": 1.04662001, + "num_input_tokens_seen": 24807744, + "router_z_loss_mlp": 0.17895508, + "step": 310, + "time_per_iteration": 2.7346065044403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095396, + "balance_loss_mlp": 1.07697809, + "epoch": 0.059830704116968066, + "flos": 560684726784.0, + "grad_norm": 0.05903807060939984, + "language_loss": 1.05193245, + "learning_rate": 0.0009976699787917955, + "loss": 1.06288636, + "num_input_tokens_seen": 24876640, + "router_z_loss_mlp": 0.18408203, + "step": 311, + "time_per_iteration": 2.737165689468384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04018029, + "balance_loss_mlp": 3.94440532, + "epoch": 0.060023085802231625, + "flos": 1569759962112.0, + "grad_norm": 0.34396821433057967, + "language_loss": 0.73442996, + "learning_rate": 0.00099763984076997, + "loss": 0.77461016, + "num_input_tokens_seen": 25110864, + "router_z_loss_mlp": 0.734375, + "step": 312, + "time_per_iteration": 4.990010976791382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130575, + "balance_loss_mlp": 1.11010623, + "epoch": 0.06021546748749519, + "flos": 482415395328.0, + "grad_norm": 0.18656347991450223, + "language_loss": 0.97164261, + "learning_rate": 0.0009976095095472243, + "loss": 0.98294836, + "num_input_tokens_seen": 25179328, + "router_z_loss_mlp": 0.20458984, + "step": 313, + "time_per_iteration": 2.5596373081207275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143867, + "balance_loss_mlp": 1.12198031, + "epoch": 0.06040784917275875, + "flos": 619889407488.0, + "grad_norm": 0.10017394493353984, + "language_loss": 0.98154747, + "learning_rate": 0.0009975789851353334, + "loss": 0.9929862, + "num_input_tokens_seen": 25254128, + "router_z_loss_mlp": 0.21911621, + "step": 314, + "time_per_iteration": 2.783092498779297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113993, + "balance_loss_mlp": 1.11832976, + "epoch": 0.06060023085802232, + "flos": 483292721664.0, + "grad_norm": 0.12837029886330253, + "language_loss": 1.00706339, + "learning_rate": 0.0009975482675461487, + "loss": 1.01846266, + "num_input_tokens_seen": 25324624, + "router_z_loss_mlp": 0.21594238, + "step": 315, + "time_per_iteration": 2.6375765800476074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128184, + "balance_loss_mlp": 1.10697675, + "epoch": 0.06079261254328588, + "flos": 581620483584.0, + "grad_norm": 0.07139597701291463, + "language_loss": 0.9800331, + "learning_rate": 0.0009975173567915952, + "loss": 0.99131489, + "num_input_tokens_seen": 25393648, + "router_z_loss_mlp": 0.21228027, + "step": 316, + "time_per_iteration": 2.680223226547241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116284, + "balance_loss_mlp": 1.09438515, + "epoch": 0.060984994228549444, + "flos": 687492306432.0, + "grad_norm": 0.12898022133672052, + "language_loss": 0.92624593, + "learning_rate": 0.000997486252883674, + "loss": 0.93740869, + "num_input_tokens_seen": 25469152, + "router_z_loss_mlp": 0.21887207, + "step": 317, + "time_per_iteration": 2.835162878036499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104615, + "balance_loss_mlp": 1.08325243, + "epoch": 0.061177375913813004, + "flos": 1314284327424.0, + "grad_norm": 0.06442728945451602, + "language_loss": 0.97186124, + "learning_rate": 0.0009974549558344602, + "loss": 0.98290741, + "num_input_tokens_seen": 25560944, + "router_z_loss_mlp": 0.21350098, + "step": 318, + "time_per_iteration": 3.6293551921844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105129, + "balance_loss_mlp": 1.08439815, + "epoch": 0.06136975759907657, + "flos": 574072040448.0, + "grad_norm": 0.08131052095693254, + "language_loss": 1.07145, + "learning_rate": 0.000997423465656105, + "loss": 1.08250129, + "num_input_tokens_seen": 25631424, + "router_z_loss_mlp": 0.20715332, + "step": 319, + "time_per_iteration": 2.7070071697235107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101786, + "balance_loss_mlp": 1.08168781, + "epoch": 0.06156213928434013, + "flos": 527281242624.0, + "grad_norm": 0.059301156484267634, + "language_loss": 1.04424822, + "learning_rate": 0.0009973917823608335, + "loss": 1.0552659, + "num_input_tokens_seen": 25698176, + "router_z_loss_mlp": 0.20092773, + "step": 320, + "time_per_iteration": 2.6225128173828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110531, + "balance_loss_mlp": 1.0897882, + "epoch": 0.061754520969603696, + "flos": 495238123008.0, + "grad_norm": 0.05387649814829365, + "language_loss": 0.98383266, + "learning_rate": 0.0009973599059609462, + "loss": 0.9949379, + "num_input_tokens_seen": 25773472, + "router_z_loss_mlp": 0.20739746, + "step": 321, + "time_per_iteration": 2.692152261734009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107693, + "balance_loss_mlp": 1.08798778, + "epoch": 0.061946902654867256, + "flos": 439839673344.0, + "grad_norm": 0.06112812680296507, + "language_loss": 0.9711749, + "learning_rate": 0.000997327836468819, + "loss": 0.98225188, + "num_input_tokens_seen": 25841088, + "router_z_loss_mlp": 0.19702148, + "step": 322, + "time_per_iteration": 2.5772383213043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110285, + "balance_loss_mlp": 1.0900557, + "epoch": 0.06213928434013082, + "flos": 598490961408.0, + "grad_norm": 0.0645434874295678, + "language_loss": 0.9942351, + "learning_rate": 0.000997295573896902, + "loss": 1.00533807, + "num_input_tokens_seen": 25919424, + "router_z_loss_mlp": 0.20239258, + "step": 323, + "time_per_iteration": 2.839282274246216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02259253, + "balance_loss_mlp": 2.20088792, + "epoch": 0.06233166602539438, + "flos": 1449393716736.0, + "grad_norm": 0.19547826226404627, + "language_loss": 0.8119604, + "learning_rate": 0.000997263118257721, + "loss": 0.83455294, + "num_input_tokens_seen": 26135504, + "router_z_loss_mlp": 0.58203125, + "step": 324, + "time_per_iteration": 4.67440938949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01888161, + "balance_loss_mlp": 1.83246601, + "epoch": 0.06252404771065795, + "flos": 1462504453632.0, + "grad_norm": 0.11962022052509429, + "language_loss": 0.78571939, + "learning_rate": 0.0009972304695638763, + "loss": 0.80460101, + "num_input_tokens_seen": 26358880, + "router_z_loss_mlp": 0.55859375, + "step": 325, + "time_per_iteration": 4.860283136367798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177486, + "balance_loss_mlp": 1.15595722, + "epoch": 0.06271642939592151, + "flos": 464059335168.0, + "grad_norm": 0.06272096910143152, + "language_loss": 0.93621421, + "learning_rate": 0.000997197627828043, + "loss": 0.94798911, + "num_input_tokens_seen": 26425888, + "router_z_loss_mlp": 0.2154541, + "step": 326, + "time_per_iteration": 2.5594961643218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205877, + "balance_loss_mlp": 1.18165386, + "epoch": 0.06290881108118507, + "flos": 532111776768.0, + "grad_norm": 0.08849931028565244, + "language_loss": 0.89414704, + "learning_rate": 0.0009971645930629716, + "loss": 0.90620589, + "num_input_tokens_seen": 26500656, + "router_z_loss_mlp": 0.2421875, + "step": 327, + "time_per_iteration": 2.7163310050964355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223238, + "balance_loss_mlp": 1.19748878, + "epoch": 0.06310119276644863, + "flos": 673262572032.0, + "grad_norm": 0.09892100413683627, + "language_loss": 1.02883804, + "learning_rate": 0.0009971313652814872, + "loss": 1.04107046, + "num_input_tokens_seen": 26577408, + "router_z_loss_mlp": 0.25769043, + "step": 328, + "time_per_iteration": 2.7786266803741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228803, + "balance_loss_mlp": 1.20175433, + "epoch": 0.0632935744517122, + "flos": 770404497408.0, + "grad_norm": 0.06852265531332852, + "language_loss": 0.99799907, + "learning_rate": 0.0009970979444964903, + "loss": 1.01028717, + "num_input_tokens_seen": 26652048, + "router_z_loss_mlp": 0.27050781, + "step": 329, + "time_per_iteration": 2.952498197555542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235649, + "balance_loss_mlp": 1.2062993, + "epoch": 0.06348595613697576, + "flos": 561649393152.0, + "grad_norm": 0.09680127661829774, + "language_loss": 1.0121367, + "learning_rate": 0.0009970643307209556, + "loss": 1.02449322, + "num_input_tokens_seen": 26728192, + "router_z_loss_mlp": 0.29296875, + "step": 330, + "time_per_iteration": 2.78190541267395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240935, + "balance_loss_mlp": 1.20970178, + "epoch": 0.06367833782223932, + "flos": 675891730944.0, + "grad_norm": 0.08786526055569537, + "language_loss": 0.9788332, + "learning_rate": 0.0009970305239679334, + "loss": 0.99124253, + "num_input_tokens_seen": 26798016, + "router_z_loss_mlp": 0.31201172, + "step": 331, + "time_per_iteration": 2.805845022201538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228576, + "balance_loss_mlp": 1.19891691, + "epoch": 0.06387071950750288, + "flos": 495035891712.0, + "grad_norm": 0.10390832636325384, + "language_loss": 1.03124022, + "learning_rate": 0.0009969965242505483, + "loss": 1.04352593, + "num_input_tokens_seen": 26867536, + "router_z_loss_mlp": 0.29614258, + "step": 332, + "time_per_iteration": 2.676711082458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207199, + "balance_loss_mlp": 1.1777302, + "epoch": 0.06406310119276645, + "flos": 533170985472.0, + "grad_norm": 0.07105898063788767, + "language_loss": 0.98331362, + "learning_rate": 0.0009969623315820007, + "loss": 0.99538565, + "num_input_tokens_seen": 26941216, + "router_z_loss_mlp": 0.29418945, + "step": 333, + "time_per_iteration": 2.6556739807128906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118815, + "balance_loss_mlp": 1.16106582, + "epoch": 0.06425548287803001, + "flos": 455940513792.0, + "grad_norm": 0.08067516684621483, + "language_loss": 0.99160993, + "learning_rate": 0.000996927945975565, + "loss": 1.0034914, + "num_input_tokens_seen": 27006560, + "router_z_loss_mlp": 0.27124023, + "step": 334, + "time_per_iteration": 2.5398526191711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147495, + "balance_loss_mlp": 1.1214596, + "epoch": 0.06444786456329357, + "flos": 559817574912.0, + "grad_norm": 0.08169715789363684, + "language_loss": 0.96174645, + "learning_rate": 0.0009968933674445906, + "loss": 0.97322142, + "num_input_tokens_seen": 27076400, + "router_z_loss_mlp": 0.26062012, + "step": 335, + "time_per_iteration": 2.6592860221862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112932, + "balance_loss_mlp": 1.0879097, + "epoch": 0.06464024624855713, + "flos": 665769383424.0, + "grad_norm": 0.07104021966044574, + "language_loss": 0.97756392, + "learning_rate": 0.0009968585960025028, + "loss": 0.98869324, + "num_input_tokens_seen": 27158672, + "router_z_loss_mlp": 0.25036621, + "step": 336, + "time_per_iteration": 2.9279658794403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01860024, + "balance_loss_mlp": 1.84323907, + "epoch": 0.0648326279338207, + "flos": 1520578704384.0, + "grad_norm": 0.14426901756633248, + "language_loss": 0.77653188, + "learning_rate": 0.0009968236316628006, + "loss": 0.7951321, + "num_input_tokens_seen": 27380592, + "router_z_loss_mlp": 0.16796875, + "step": 337, + "time_per_iteration": 4.810914993286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101739, + "balance_loss_mlp": 1.07948256, + "epoch": 0.06502500961908426, + "flos": 1142872768512.0, + "grad_norm": 0.058812216055980165, + "language_loss": 0.95864177, + "learning_rate": 0.0009967884744390583, + "loss": 0.96965921, + "num_input_tokens_seen": 27469984, + "router_z_loss_mlp": 0.22265625, + "step": 338, + "time_per_iteration": 3.512282371520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146504, + "balance_loss_mlp": 1.12267399, + "epoch": 0.06521739130434782, + "flos": 582339248640.0, + "grad_norm": 0.10793578588091769, + "language_loss": 0.97449529, + "learning_rate": 0.0009967531243449256, + "loss": 0.98596036, + "num_input_tokens_seen": 27543904, + "router_z_loss_mlp": 0.23828125, + "step": 339, + "time_per_iteration": 2.712907075881958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154087, + "balance_loss_mlp": 1.12950587, + "epoch": 0.06540977298961138, + "flos": 497398800384.0, + "grad_norm": 0.06396927661276222, + "language_loss": 1.04641414, + "learning_rate": 0.000996717581394126, + "loss": 1.05795503, + "num_input_tokens_seen": 27609888, + "router_z_loss_mlp": 0.24584961, + "step": 340, + "time_per_iteration": 2.5783133506774902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168509, + "balance_loss_mlp": 1.14584756, + "epoch": 0.06560215467487496, + "flos": 542613855744.0, + "grad_norm": 0.07568553531769329, + "language_loss": 1.05092287, + "learning_rate": 0.000996681845600459, + "loss": 1.062608, + "num_input_tokens_seen": 27683936, + "router_z_loss_mlp": 0.2265625, + "step": 341, + "time_per_iteration": 2.6543757915496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115357, + "balance_loss_mlp": 1.13118291, + "epoch": 0.06579453636013852, + "flos": 413230961664.0, + "grad_norm": 0.06593832485574395, + "language_loss": 0.97027373, + "learning_rate": 0.0009966459169777982, + "loss": 0.9818095, + "num_input_tokens_seen": 27747840, + "router_z_loss_mlp": 0.22387695, + "step": 342, + "time_per_iteration": 2.5120761394500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141132, + "balance_loss_mlp": 1.11848283, + "epoch": 0.06598691804540208, + "flos": 560354457600.0, + "grad_norm": 0.055115078659976495, + "language_loss": 1.05281377, + "learning_rate": 0.0009966097955400924, + "loss": 1.0642252, + "num_input_tokens_seen": 27819728, + "router_z_loss_mlp": 0.22644043, + "step": 343, + "time_per_iteration": 2.6954751014709473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111133, + "balance_loss_mlp": 1.08904982, + "epoch": 0.06617929973066564, + "flos": 571789117440.0, + "grad_norm": 0.06176008438986438, + "language_loss": 0.99064481, + "learning_rate": 0.0009965734813013652, + "loss": 1.00175822, + "num_input_tokens_seen": 27893536, + "router_z_loss_mlp": 0.22277832, + "step": 344, + "time_per_iteration": 2.8235929012298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090293, + "balance_loss_mlp": 1.06726193, + "epoch": 0.06637168141592921, + "flos": 490234470912.0, + "grad_norm": 0.05365164831273283, + "language_loss": 1.01308548, + "learning_rate": 0.0009965369742757151, + "loss": 1.02398837, + "num_input_tokens_seen": 27960976, + "router_z_loss_mlp": 0.23022461, + "step": 345, + "time_per_iteration": 2.5708556175231934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078086, + "balance_loss_mlp": 1.05727243, + "epoch": 0.06656406310119277, + "flos": 1078735656960.0, + "grad_norm": 0.04968829319439664, + "language_loss": 0.97902787, + "learning_rate": 0.0009965002744773152, + "loss": 0.98980874, + "num_input_tokens_seen": 28050864, + "router_z_loss_mlp": 0.20812988, + "step": 346, + "time_per_iteration": 3.4984121322631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086945, + "balance_loss_mlp": 1.06450987, + "epoch": 0.06675644478645633, + "flos": 513421065216.0, + "grad_norm": 0.06258978415695335, + "language_loss": 0.95138037, + "learning_rate": 0.0009964633819204139, + "loss": 0.96224982, + "num_input_tokens_seen": 28122448, + "router_z_loss_mlp": 0.22436523, + "step": 347, + "time_per_iteration": 2.6866109371185303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01752926, + "balance_loss_mlp": 1.73108697, + "epoch": 0.06694882647171989, + "flos": 1446359943168.0, + "grad_norm": 0.11694655230354783, + "language_loss": 0.81801116, + "learning_rate": 0.0009964262966193338, + "loss": 0.83554041, + "num_input_tokens_seen": 28350352, + "router_z_loss_mlp": 0.21875, + "step": 348, + "time_per_iteration": 4.935550928115845 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01583198, + "balance_loss_mlp": 1.56116796, + "epoch": 0.06714120815698346, + "flos": 1551230784000.0, + "grad_norm": 0.0989027294649474, + "language_loss": 0.75153887, + "learning_rate": 0.000996389018588473, + "loss": 0.76737082, + "num_input_tokens_seen": 28585584, + "router_z_loss_mlp": 0.22070312, + "step": 349, + "time_per_iteration": 4.891008615493774 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149858, + "balance_loss_mlp": 1.12826955, + "epoch": 0.06733358984224702, + "flos": 879689673216.0, + "grad_norm": 0.07075764146586616, + "language_loss": 0.94920838, + "learning_rate": 0.000996351547842304, + "loss": 0.96070701, + "num_input_tokens_seen": 28672512, + "router_z_loss_mlp": 0.21582031, + "step": 350, + "time_per_iteration": 3.156322717666626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192552, + "balance_loss_mlp": 1.17055774, + "epoch": 0.06752597152751058, + "flos": 518654651904.0, + "grad_norm": 0.09040238598346795, + "language_loss": 0.93423587, + "learning_rate": 0.0009963138843953744, + "loss": 0.94616139, + "num_input_tokens_seen": 28741520, + "router_z_loss_mlp": 0.2199707, + "step": 351, + "time_per_iteration": 2.610987663269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206077, + "balance_loss_mlp": 1.18405879, + "epoch": 0.06771835321277414, + "flos": 539366266368.0, + "grad_norm": 0.08658544591035036, + "language_loss": 0.97852194, + "learning_rate": 0.000996276028262306, + "loss": 0.9905827, + "num_input_tokens_seen": 28814912, + "router_z_loss_mlp": 0.22021484, + "step": 352, + "time_per_iteration": 2.8413686752319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166048, + "balance_loss_mlp": 1.14382768, + "epoch": 0.0679107348980377, + "flos": 460430604288.0, + "grad_norm": 0.09117082479319542, + "language_loss": 1.04269946, + "learning_rate": 0.0009962379794577964, + "loss": 1.05435991, + "num_input_tokens_seen": 28882192, + "router_z_loss_mlp": 0.22216797, + "step": 353, + "time_per_iteration": 2.591372489929199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114388, + "balance_loss_mlp": 1.12227976, + "epoch": 0.06810311658330127, + "flos": 635601752064.0, + "grad_norm": 0.05781909345233015, + "language_loss": 0.94169199, + "learning_rate": 0.000996199737996617, + "loss": 0.95313084, + "num_input_tokens_seen": 28968576, + "router_z_loss_mlp": 0.21630859, + "step": 354, + "time_per_iteration": 2.9088492393493652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125411, + "balance_loss_mlp": 1.10420346, + "epoch": 0.06829549826856483, + "flos": 464443448832.0, + "grad_norm": 0.06770201052263504, + "language_loss": 1.03043509, + "learning_rate": 0.0009961613038936149, + "loss": 1.04168916, + "num_input_tokens_seen": 29036160, + "router_z_loss_mlp": 0.2121582, + "step": 355, + "time_per_iteration": 2.571904420852661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110613, + "balance_loss_mlp": 1.08917904, + "epoch": 0.06848787995382839, + "flos": 634335929856.0, + "grad_norm": 0.06097004840688574, + "language_loss": 0.95565176, + "learning_rate": 0.000996122677163711, + "loss": 0.96675789, + "num_input_tokens_seen": 29112048, + "router_z_loss_mlp": 0.21435547, + "step": 356, + "time_per_iteration": 2.794982671737671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107296, + "balance_loss_mlp": 1.08667266, + "epoch": 0.06868026163909195, + "flos": 806023913472.0, + "grad_norm": 0.08020973782133771, + "language_loss": 1.01095176, + "learning_rate": 0.000996083857821902, + "loss": 1.02202487, + "num_input_tokens_seen": 29190960, + "router_z_loss_mlp": 0.20629883, + "step": 357, + "time_per_iteration": 3.007086753845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101637, + "balance_loss_mlp": 1.08076346, + "epoch": 0.06887264332435553, + "flos": 438997252608.0, + "grad_norm": 0.08125476198078858, + "language_loss": 0.99797714, + "learning_rate": 0.0009960448458832588, + "loss": 1.00899351, + "num_input_tokens_seen": 29262832, + "router_z_loss_mlp": 0.2088623, + "step": 358, + "time_per_iteration": 2.699530601501465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098146, + "balance_loss_mlp": 1.07872701, + "epoch": 0.06906502500961909, + "flos": 484513463808.0, + "grad_norm": 0.06827746260367892, + "language_loss": 0.99188638, + "learning_rate": 0.000996005641362927, + "loss": 1.00286782, + "num_input_tokens_seen": 29329552, + "router_z_loss_mlp": 0.1940918, + "step": 359, + "time_per_iteration": 2.5541014671325684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103345, + "balance_loss_mlp": 1.0841639, + "epoch": 0.06925740669488265, + "flos": 733293706752.0, + "grad_norm": 0.08731085845928575, + "language_loss": 1.02303529, + "learning_rate": 0.0009959662442761274, + "loss": 1.0340687, + "num_input_tokens_seen": 29410784, + "router_z_loss_mlp": 0.19189453, + "step": 360, + "time_per_iteration": 2.906623363494873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093844, + "balance_loss_mlp": 1.07268476, + "epoch": 0.0694497883801462, + "flos": 552127947264.0, + "grad_norm": 0.06697663210144707, + "language_loss": 0.9595629, + "learning_rate": 0.000995926654638155, + "loss": 0.97050136, + "num_input_tokens_seen": 29486992, + "router_z_loss_mlp": 0.21179199, + "step": 361, + "time_per_iteration": 2.793663501739502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082773, + "balance_loss_mlp": 1.06236482, + "epoch": 0.06964217006540978, + "flos": 677708992512.0, + "grad_norm": 0.06860924301964295, + "language_loss": 0.98198265, + "learning_rate": 0.00099588687246438, + "loss": 0.99281037, + "num_input_tokens_seen": 29557232, + "router_z_loss_mlp": 0.20410156, + "step": 362, + "time_per_iteration": 2.828139305114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085274, + "balance_loss_mlp": 1.06330371, + "epoch": 0.06983455175067334, + "flos": 523987163136.0, + "grad_norm": 0.08747541291209461, + "language_loss": 1.04803789, + "learning_rate": 0.0009958468977702471, + "loss": 1.0588907, + "num_input_tokens_seen": 29625344, + "router_z_loss_mlp": 0.21972656, + "step": 363, + "time_per_iteration": 2.5759966373443604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02224374, + "balance_loss_mlp": 2.20682669, + "epoch": 0.0700269334359369, + "flos": 1575943658496.0, + "grad_norm": 0.2746548069890379, + "language_loss": 0.79734707, + "learning_rate": 0.0009958067305712761, + "loss": 0.81959081, + "num_input_tokens_seen": 29843664, + "router_z_loss_mlp": 0.17578125, + "step": 364, + "time_per_iteration": 4.782835245132446 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134514, + "balance_loss_mlp": 1.11340213, + "epoch": 0.07021931512120046, + "flos": 1012848274944.0, + "grad_norm": 0.08586169827549085, + "language_loss": 0.93286598, + "learning_rate": 0.0009957663708830612, + "loss": 0.94421113, + "num_input_tokens_seen": 29927152, + "router_z_loss_mlp": 0.21105957, + "step": 365, + "time_per_iteration": 3.2484283447265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116189, + "balance_loss_mlp": 1.13884652, + "epoch": 0.07041169680646403, + "flos": 822622348800.0, + "grad_norm": 0.09941073368395695, + "language_loss": 0.97043455, + "learning_rate": 0.0009957258187212714, + "loss": 0.98205346, + "num_input_tokens_seen": 30004928, + "router_z_loss_mlp": 0.23034668, + "step": 366, + "time_per_iteration": 3.009479522705078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01756688, + "balance_loss_mlp": 1.7255981, + "epoch": 0.07060407849172759, + "flos": 1413670993920.0, + "grad_norm": 0.12374795181042475, + "language_loss": 0.79194862, + "learning_rate": 0.0009956850741016502, + "loss": 0.80951542, + "num_input_tokens_seen": 30230256, + "router_z_loss_mlp": 0.31054688, + "step": 367, + "time_per_iteration": 4.82874608039856 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152073, + "balance_loss_mlp": 1.13087749, + "epoch": 0.07079646017699115, + "flos": 512652837888.0, + "grad_norm": 0.06786716904588838, + "language_loss": 0.93450886, + "learning_rate": 0.0009956441370400167, + "loss": 0.94602954, + "num_input_tokens_seen": 30301200, + "router_z_loss_mlp": 0.21191406, + "step": 368, + "time_per_iteration": 2.6226603984832764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153965, + "balance_loss_mlp": 1.13158989, + "epoch": 0.07098884186225471, + "flos": 540240772608.0, + "grad_norm": 0.08343626294497461, + "language_loss": 0.99467343, + "learning_rate": 0.0009956030075522636, + "loss": 1.00621307, + "num_input_tokens_seen": 30377024, + "router_z_loss_mlp": 0.22375488, + "step": 369, + "time_per_iteration": 2.7128794193267822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142856, + "balance_loss_mlp": 1.12137485, + "epoch": 0.07118122354751828, + "flos": 548419230720.0, + "grad_norm": 0.07464528715750075, + "language_loss": 0.98955953, + "learning_rate": 0.0009955616856543587, + "loss": 1.00098813, + "num_input_tokens_seen": 30448896, + "router_z_loss_mlp": 0.21472168, + "step": 370, + "time_per_iteration": 2.613138198852539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118555, + "balance_loss_mlp": 1.0958215, + "epoch": 0.07137360523278184, + "flos": 620612554752.0, + "grad_norm": 0.056434914921328155, + "language_loss": 0.91880834, + "learning_rate": 0.0009955201713623448, + "loss": 0.92999387, + "num_input_tokens_seen": 30523584, + "router_z_loss_mlp": 0.22717285, + "step": 371, + "time_per_iteration": 2.747133255004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01746336, + "balance_loss_mlp": 1.72154021, + "epoch": 0.0715659869180454, + "flos": 1501850115072.0, + "grad_norm": 0.08669176596007007, + "language_loss": 0.76672721, + "learning_rate": 0.000995478464692339, + "loss": 0.78419054, + "num_input_tokens_seen": 30757920, + "router_z_loss_mlp": 0.24707031, + "step": 372, + "time_per_iteration": 4.931428670883179 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102855, + "balance_loss_mlp": 1.08040774, + "epoch": 0.07175836860330896, + "flos": 495246887424.0, + "grad_norm": 0.07044890130803105, + "language_loss": 1.05121827, + "learning_rate": 0.0009954365656605333, + "loss": 1.06224692, + "num_input_tokens_seen": 30824960, + "router_z_loss_mlp": 0.22436523, + "step": 373, + "time_per_iteration": 2.550243616104126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118244, + "balance_loss_mlp": 1.09438992, + "epoch": 0.07195075028857253, + "flos": 785387902464.0, + "grad_norm": 0.05415547127036835, + "language_loss": 0.98150015, + "learning_rate": 0.0009953944742831947, + "loss": 0.99268264, + "num_input_tokens_seen": 30902224, + "router_z_loss_mlp": 0.23864746, + "step": 374, + "time_per_iteration": 2.9659459590911865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125507, + "balance_loss_mlp": 1.10202336, + "epoch": 0.0721431319738361, + "flos": 592799067648.0, + "grad_norm": 0.07003669353380264, + "language_loss": 1.01441097, + "learning_rate": 0.0009953521905766642, + "loss": 1.02566612, + "num_input_tokens_seen": 30984784, + "router_z_loss_mlp": 0.23486328, + "step": 375, + "time_per_iteration": 2.942763566970825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117119, + "balance_loss_mlp": 1.09393334, + "epoch": 0.07233551365909965, + "flos": 547981272576.0, + "grad_norm": 0.06343477824222313, + "language_loss": 0.99901861, + "learning_rate": 0.0009953097145573577, + "loss": 1.01018989, + "num_input_tokens_seen": 31055376, + "router_z_loss_mlp": 0.23193359, + "step": 376, + "time_per_iteration": 2.6275272369384766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113711, + "balance_loss_mlp": 1.09023869, + "epoch": 0.07252789534436321, + "flos": 957170428416.0, + "grad_norm": 0.0678891965164594, + "language_loss": 0.97798675, + "learning_rate": 0.000995267046241766, + "loss": 0.98912394, + "num_input_tokens_seen": 31144944, + "router_z_loss_mlp": 0.23474121, + "step": 377, + "time_per_iteration": 3.2014975547790527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096997, + "balance_loss_mlp": 1.07496762, + "epoch": 0.07272027702962677, + "flos": 507398902272.0, + "grad_norm": 0.0806519998399971, + "language_loss": 0.97275257, + "learning_rate": 0.0009952241856464547, + "loss": 0.98372257, + "num_input_tokens_seen": 31213392, + "router_z_loss_mlp": 0.22045898, + "step": 378, + "time_per_iteration": 2.6189732551574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109641, + "balance_loss_mlp": 1.0746069, + "epoch": 0.07291265871489035, + "flos": 612128558592.0, + "grad_norm": 0.0691049335661606, + "language_loss": 1.04592681, + "learning_rate": 0.0009951811327880632, + "loss": 1.05689096, + "num_input_tokens_seen": 31289840, + "router_z_loss_mlp": 0.21826172, + "step": 379, + "time_per_iteration": 2.7411558628082275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092071, + "balance_loss_mlp": 1.07025611, + "epoch": 0.0731050404001539, + "flos": 495502963200.0, + "grad_norm": 0.05765504670581196, + "language_loss": 0.97682816, + "learning_rate": 0.0009951378876833063, + "loss": 0.98774892, + "num_input_tokens_seen": 31357600, + "router_z_loss_mlp": 0.21813965, + "step": 380, + "time_per_iteration": 2.6211278438568115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081575, + "balance_loss_mlp": 1.06068945, + "epoch": 0.07329742208541747, + "flos": 639677205504.0, + "grad_norm": 0.06809750593205881, + "language_loss": 1.04190159, + "learning_rate": 0.0009950944503489736, + "loss": 1.05271733, + "num_input_tokens_seen": 31428896, + "router_z_loss_mlp": 0.20898438, + "step": 381, + "time_per_iteration": 2.7533762454986572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081401, + "balance_loss_mlp": 1.0607307, + "epoch": 0.07348980377068103, + "flos": 815999284224.0, + "grad_norm": 0.06607035824886899, + "language_loss": 0.98459697, + "learning_rate": 0.0009950508208019285, + "loss": 0.99541104, + "num_input_tokens_seen": 31507424, + "router_z_loss_mlp": 0.20678711, + "step": 382, + "time_per_iteration": 2.9885637760162354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073667, + "balance_loss_mlp": 1.05369973, + "epoch": 0.0736821854559446, + "flos": 508383917568.0, + "grad_norm": 0.05970909775769663, + "language_loss": 1.02745128, + "learning_rate": 0.0009950069990591096, + "loss": 1.03818798, + "num_input_tokens_seen": 31576768, + "router_z_loss_mlp": 0.19958496, + "step": 383, + "time_per_iteration": 2.6111788749694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01835936, + "balance_loss_mlp": 1.8101871, + "epoch": 0.07387456714120816, + "flos": 1553801716224.0, + "grad_norm": 0.167122487372618, + "language_loss": 0.76401371, + "learning_rate": 0.0009949629851375302, + "loss": 0.78237301, + "num_input_tokens_seen": 31797312, + "router_z_loss_mlp": 0.2578125, + "step": 384, + "time_per_iteration": 4.859915494918823 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116619, + "balance_loss_mlp": 1.09575748, + "epoch": 0.07406694882647172, + "flos": 525219489792.0, + "grad_norm": 0.0799084124695288, + "language_loss": 0.96017051, + "learning_rate": 0.0009949187790542777, + "loss": 0.97133672, + "num_input_tokens_seen": 31869568, + "router_z_loss_mlp": 0.20861816, + "step": 385, + "time_per_iteration": 2.6976191997528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124239, + "balance_loss_mlp": 1.10322285, + "epoch": 0.07425933051173528, + "flos": 497468611584.0, + "grad_norm": 0.08753491640442414, + "language_loss": 0.91745877, + "learning_rate": 0.0009948743808265148, + "loss": 0.92870116, + "num_input_tokens_seen": 31941712, + "router_z_loss_mlp": 0.21020508, + "step": 386, + "time_per_iteration": 2.6870572566986084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113476, + "balance_loss_mlp": 1.09249496, + "epoch": 0.07445171219699885, + "flos": 504740630016.0, + "grad_norm": 0.05063210924529089, + "language_loss": 1.0156467, + "learning_rate": 0.0009948297904714782, + "loss": 1.02678132, + "num_input_tokens_seen": 32015232, + "router_z_loss_mlp": 0.20996094, + "step": 387, + "time_per_iteration": 2.668027639389038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097529, + "balance_loss_mlp": 1.07642913, + "epoch": 0.07464409388226241, + "flos": 553693515264.0, + "grad_norm": 0.06830922509793466, + "language_loss": 0.93493366, + "learning_rate": 0.0009947850080064796, + "loss": 0.9459089, + "num_input_tokens_seen": 32094640, + "router_z_loss_mlp": 0.21105957, + "step": 388, + "time_per_iteration": 2.79836106300354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098078, + "balance_loss_mlp": 1.07695365, + "epoch": 0.07483647556752597, + "flos": 776511028224.0, + "grad_norm": 0.06471398355705121, + "language_loss": 0.98276728, + "learning_rate": 0.0009947400334489047, + "loss": 0.99374807, + "num_input_tokens_seen": 32176640, + "router_z_loss_mlp": 0.21130371, + "step": 389, + "time_per_iteration": 3.0046355724334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095267, + "balance_loss_mlp": 1.07513261, + "epoch": 0.07502885725278953, + "flos": 612256596480.0, + "grad_norm": 0.0754939105077014, + "language_loss": 0.90272582, + "learning_rate": 0.0009946948668162145, + "loss": 0.91367853, + "num_input_tokens_seen": 32246704, + "router_z_loss_mlp": 0.20141602, + "step": 390, + "time_per_iteration": 2.724792003631592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091157, + "balance_loss_mlp": 1.06946135, + "epoch": 0.0752212389380531, + "flos": 688324552704.0, + "grad_norm": 0.05626120625508035, + "language_loss": 0.9463594, + "learning_rate": 0.0009946495081259441, + "loss": 0.95727098, + "num_input_tokens_seen": 32320032, + "router_z_loss_mlp": 0.21704102, + "step": 391, + "time_per_iteration": 2.8221397399902344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101684, + "balance_loss_mlp": 1.08008361, + "epoch": 0.07541362062331666, + "flos": 765362967552.0, + "grad_norm": 0.09729902751759628, + "language_loss": 0.97468722, + "learning_rate": 0.0009946039573957035, + "loss": 0.98570406, + "num_input_tokens_seen": 32398144, + "router_z_loss_mlp": 0.21606445, + "step": 392, + "time_per_iteration": 2.958655595779419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095785, + "balance_loss_mlp": 1.07572174, + "epoch": 0.07560600230858022, + "flos": 588460336128.0, + "grad_norm": 0.06468718689622391, + "language_loss": 0.94257009, + "learning_rate": 0.000994558214643177, + "loss": 0.95352793, + "num_input_tokens_seen": 32471984, + "router_z_loss_mlp": 0.20056152, + "step": 393, + "time_per_iteration": 2.752979040145874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101382, + "balance_loss_mlp": 1.08086586, + "epoch": 0.07579838399384378, + "flos": 749508028416.0, + "grad_norm": 0.06635223139616171, + "language_loss": 0.961483, + "learning_rate": 0.000994512279886123, + "loss": 0.97249681, + "num_input_tokens_seen": 32550176, + "router_z_loss_mlp": 0.20532227, + "step": 394, + "time_per_iteration": 3.055225133895874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104661, + "balance_loss_mlp": 1.08346581, + "epoch": 0.07599076567910736, + "flos": 523185440256.0, + "grad_norm": 0.06901630142642712, + "language_loss": 0.96749192, + "learning_rate": 0.0009944661531423758, + "loss": 0.97853857, + "num_input_tokens_seen": 32620768, + "router_z_loss_mlp": 0.2121582, + "step": 395, + "time_per_iteration": 2.6922085285186768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093271, + "balance_loss_mlp": 1.07248056, + "epoch": 0.07618314736437092, + "flos": 550812662784.0, + "grad_norm": 0.07064334209039194, + "language_loss": 0.95375401, + "learning_rate": 0.000994419834429843, + "loss": 0.96468663, + "num_input_tokens_seen": 32693472, + "router_z_loss_mlp": 0.20788574, + "step": 396, + "time_per_iteration": 2.6657333374023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092352, + "balance_loss_mlp": 1.0716933, + "epoch": 0.07637552904963447, + "flos": 697901253120.0, + "grad_norm": 0.07324881108467876, + "language_loss": 0.99580455, + "learning_rate": 0.0009943733237665069, + "loss": 1.00672793, + "num_input_tokens_seen": 32764976, + "router_z_loss_mlp": 0.20654297, + "step": 397, + "time_per_iteration": 2.8662500381469727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085112, + "balance_loss_mlp": 1.06454849, + "epoch": 0.07656791073489803, + "flos": 579066928128.0, + "grad_norm": 0.04790317238997088, + "language_loss": 0.98118353, + "learning_rate": 0.0009943266211704248, + "loss": 0.99203461, + "num_input_tokens_seen": 32853104, + "router_z_loss_mlp": 0.20568848, + "step": 398, + "time_per_iteration": 2.930741786956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094784, + "balance_loss_mlp": 1.07348132, + "epoch": 0.0767602924201616, + "flos": 416923711488.0, + "grad_norm": 0.09980331544781734, + "language_loss": 1.00422275, + "learning_rate": 0.000994279726659728, + "loss": 1.01517057, + "num_input_tokens_seen": 32919376, + "router_z_loss_mlp": 0.21325684, + "step": 399, + "time_per_iteration": 2.533738851547241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109036, + "balance_loss_mlp": 1.06970143, + "epoch": 0.07695267410542517, + "flos": 482671471104.0, + "grad_norm": 0.06967700921129397, + "language_loss": 0.97985041, + "learning_rate": 0.0009942326402526231, + "loss": 0.99075395, + "num_input_tokens_seen": 32988064, + "router_z_loss_mlp": 0.20666504, + "step": 400, + "time_per_iteration": 2.51460337638855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096542, + "balance_loss_mlp": 1.07526302, + "epoch": 0.07714505579068873, + "flos": 530742647808.0, + "grad_norm": 0.052652305799428985, + "language_loss": 0.96639109, + "learning_rate": 0.0009941853619673902, + "loss": 0.97735649, + "num_input_tokens_seen": 33059024, + "router_z_loss_mlp": 0.2130127, + "step": 401, + "time_per_iteration": 2.620939016342163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101036, + "balance_loss_mlp": 1.08012676, + "epoch": 0.07733743747595229, + "flos": 804635845632.0, + "grad_norm": 0.07273299487754427, + "language_loss": 0.99959278, + "learning_rate": 0.0009941378918223844, + "loss": 1.01060319, + "num_input_tokens_seen": 33137712, + "router_z_loss_mlp": 0.20910645, + "step": 402, + "time_per_iteration": 3.036839008331299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110477, + "balance_loss_mlp": 1.08423018, + "epoch": 0.07752981916121585, + "flos": 622192679424.0, + "grad_norm": 0.05767312217272775, + "language_loss": 0.93044209, + "learning_rate": 0.0009940902298360354, + "loss": 0.94148982, + "num_input_tokens_seen": 33211296, + "router_z_loss_mlp": 0.20544434, + "step": 403, + "time_per_iteration": 2.7703943252563477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097477, + "balance_loss_mlp": 1.07694876, + "epoch": 0.07772220084647942, + "flos": 727961195520.0, + "grad_norm": 0.0686344305115436, + "language_loss": 1.02037048, + "learning_rate": 0.0009940423760268473, + "loss": 1.03134525, + "num_input_tokens_seen": 33283632, + "router_z_loss_mlp": 0.2052002, + "step": 404, + "time_per_iteration": 2.8823602199554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085621, + "balance_loss_mlp": 1.06497431, + "epoch": 0.07791458253174298, + "flos": 555149984256.0, + "grad_norm": 0.10727031409308073, + "language_loss": 0.96142864, + "learning_rate": 0.0009939943304133982, + "loss": 0.97228479, + "num_input_tokens_seen": 33350704, + "router_z_loss_mlp": 0.20654297, + "step": 405, + "time_per_iteration": 2.63908314704895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078944, + "balance_loss_mlp": 1.05944133, + "epoch": 0.07810696421700654, + "flos": 552919495680.0, + "grad_norm": 0.08981509362846728, + "language_loss": 1.0302707, + "learning_rate": 0.0009939460930143416, + "loss": 1.04106021, + "num_input_tokens_seen": 33416272, + "router_z_loss_mlp": 0.19482422, + "step": 406, + "time_per_iteration": 2.63259220123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079269, + "balance_loss_mlp": 1.05927801, + "epoch": 0.0782993459022701, + "flos": 650323289088.0, + "grad_norm": 0.07212254231156982, + "language_loss": 0.96910775, + "learning_rate": 0.0009938976638484043, + "loss": 0.97990054, + "num_input_tokens_seen": 33501824, + "router_z_loss_mlp": 0.1998291, + "step": 407, + "time_per_iteration": 2.9489452838897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073065, + "balance_loss_mlp": 1.05239439, + "epoch": 0.07849172758753367, + "flos": 495926364672.0, + "grad_norm": 0.07302041560946317, + "language_loss": 0.9619081, + "learning_rate": 0.0009938490429343887, + "loss": 0.97263873, + "num_input_tokens_seen": 33571456, + "router_z_loss_mlp": 0.20666504, + "step": 408, + "time_per_iteration": 2.541293144226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078297, + "balance_loss_mlp": 1.05823374, + "epoch": 0.07868410927279723, + "flos": 577696389120.0, + "grad_norm": 0.06961121210328268, + "language_loss": 0.96404505, + "learning_rate": 0.0009938002302911709, + "loss": 0.974828, + "num_input_tokens_seen": 33646320, + "router_z_loss_mlp": 0.20056152, + "step": 409, + "time_per_iteration": 2.7890634536743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078628, + "balance_loss_mlp": 1.05869615, + "epoch": 0.07887649095806079, + "flos": 522698019840.0, + "grad_norm": 0.10283598941623227, + "language_loss": 0.99080813, + "learning_rate": 0.0009937512259377015, + "loss": 1.00159442, + "num_input_tokens_seen": 33717664, + "router_z_loss_mlp": 0.19921875, + "step": 410, + "time_per_iteration": 2.6631360054016113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076374, + "balance_loss_mlp": 1.05739617, + "epoch": 0.07906887264332435, + "flos": 556958481408.0, + "grad_norm": 0.07518465865945036, + "language_loss": 0.97744381, + "learning_rate": 0.000993702029893006, + "loss": 0.98820746, + "num_input_tokens_seen": 33794720, + "router_z_loss_mlp": 0.18981934, + "step": 411, + "time_per_iteration": 2.762937068939209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070708, + "balance_loss_mlp": 1.0512886, + "epoch": 0.07926125432858792, + "flos": 821641715712.0, + "grad_norm": 0.06547583340109177, + "language_loss": 0.97466588, + "learning_rate": 0.0009936526421761838, + "loss": 0.98537302, + "num_input_tokens_seen": 33868304, + "router_z_loss_mlp": 0.1940918, + "step": 412, + "time_per_iteration": 3.019529342651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070741, + "balance_loss_mlp": 1.05210841, + "epoch": 0.07945363601385148, + "flos": 562072794624.0, + "grad_norm": 0.06412617323579047, + "language_loss": 0.9993977, + "learning_rate": 0.000993603062806409, + "loss": 1.01010513, + "num_input_tokens_seen": 33937424, + "router_z_loss_mlp": 0.18615723, + "step": 413, + "time_per_iteration": 2.667893409729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078833, + "balance_loss_mlp": 1.05879402, + "epoch": 0.07964601769911504, + "flos": 517615792128.0, + "grad_norm": 0.0777298152120257, + "language_loss": 1.03187037, + "learning_rate": 0.0009935532918029298, + "loss": 1.04265857, + "num_input_tokens_seen": 34003984, + "router_z_loss_mlp": 0.20031738, + "step": 414, + "time_per_iteration": 2.628847122192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079604, + "balance_loss_mlp": 1.06020916, + "epoch": 0.0798383993843786, + "flos": 538956011520.0, + "grad_norm": 0.0762846382616791, + "language_loss": 0.96381676, + "learning_rate": 0.0009935033291850694, + "loss": 0.97461283, + "num_input_tokens_seen": 34072400, + "router_z_loss_mlp": 0.19384766, + "step": 415, + "time_per_iteration": 2.6874804496765137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078311, + "balance_loss_mlp": 1.05915451, + "epoch": 0.08003078106964218, + "flos": 484901959680.0, + "grad_norm": 0.07548152614126195, + "language_loss": 0.9874112, + "learning_rate": 0.0009934531749722247, + "loss": 0.9981944, + "num_input_tokens_seen": 34142448, + "router_z_loss_mlp": 0.19177246, + "step": 416, + "time_per_iteration": 2.5752930641174316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077721, + "balance_loss_mlp": 1.0581702, + "epoch": 0.08022316275490574, + "flos": 517999905792.0, + "grad_norm": 0.07373378819853486, + "language_loss": 0.97326815, + "learning_rate": 0.0009934028291838672, + "loss": 0.98404539, + "num_input_tokens_seen": 34214080, + "router_z_loss_mlp": 0.1953125, + "step": 417, + "time_per_iteration": 2.715142011642456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078885, + "balance_loss_mlp": 1.0593344, + "epoch": 0.0804155444401693, + "flos": 493755512832.0, + "grad_norm": 0.06878732968267398, + "language_loss": 0.9290086, + "learning_rate": 0.0009933522918395433, + "loss": 0.93979746, + "num_input_tokens_seen": 34288448, + "router_z_loss_mlp": 0.19555664, + "step": 418, + "time_per_iteration": 2.7008063793182373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01673141, + "balance_loss_mlp": 1.6505394, + "epoch": 0.08060792612543285, + "flos": 1580567579136.0, + "grad_norm": 0.10865535097535944, + "language_loss": 0.782511, + "learning_rate": 0.0009933015629588731, + "loss": 0.7992425, + "num_input_tokens_seen": 34521632, + "router_z_loss_mlp": 0.22558594, + "step": 419, + "time_per_iteration": 4.854820728302002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092516, + "balance_loss_mlp": 1.07238102, + "epoch": 0.08080030781069643, + "flos": 525090041856.0, + "grad_norm": 0.07888672823303539, + "language_loss": 1.11010027, + "learning_rate": 0.000993250642561551, + "loss": 1.12102532, + "num_input_tokens_seen": 34590080, + "router_z_loss_mlp": 0.20129395, + "step": 420, + "time_per_iteration": 2.6152822971343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102725, + "balance_loss_mlp": 1.08251905, + "epoch": 0.08099268949595999, + "flos": 546459374592.0, + "grad_norm": 0.06927423279576624, + "language_loss": 0.96781242, + "learning_rate": 0.0009931995306673466, + "loss": 0.97883964, + "num_input_tokens_seen": 34660512, + "router_z_loss_mlp": 0.20202637, + "step": 421, + "time_per_iteration": 2.8378820419311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107938, + "balance_loss_mlp": 1.08725524, + "epoch": 0.08118507118122355, + "flos": 510116811264.0, + "grad_norm": 0.07245841989657228, + "language_loss": 1.01691484, + "learning_rate": 0.000993148227296103, + "loss": 1.02799416, + "num_input_tokens_seen": 34732016, + "router_z_loss_mlp": 0.20678711, + "step": 422, + "time_per_iteration": 2.6234657764434814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109153, + "balance_loss_mlp": 1.08827925, + "epoch": 0.08137745286648711, + "flos": 720339969024.0, + "grad_norm": 0.06440268991377437, + "language_loss": 0.90059143, + "learning_rate": 0.000993096732467738, + "loss": 0.91168296, + "num_input_tokens_seen": 34810416, + "router_z_loss_mlp": 0.2088623, + "step": 423, + "time_per_iteration": 2.9789979457855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107042, + "balance_loss_mlp": 1.08620405, + "epoch": 0.08156983455175067, + "flos": 679313848320.0, + "grad_norm": 0.09430690436493987, + "language_loss": 0.97591221, + "learning_rate": 0.0009930450462022435, + "loss": 0.9869827, + "num_input_tokens_seen": 34879504, + "router_z_loss_mlp": 0.20837402, + "step": 424, + "time_per_iteration": 2.7870407104492188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01731933, + "balance_loss_mlp": 1.70847309, + "epoch": 0.08176221623701424, + "flos": 1452577135104.0, + "grad_norm": 0.13164555017172178, + "language_loss": 0.79189807, + "learning_rate": 0.0009929931685196862, + "loss": 0.80921739, + "num_input_tokens_seen": 35111584, + "router_z_loss_mlp": 0.234375, + "step": 425, + "time_per_iteration": 4.870323181152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095108, + "balance_loss_mlp": 1.07456827, + "epoch": 0.0819545979222778, + "flos": 1556034071040.0, + "grad_norm": 0.10298759083167684, + "language_loss": 0.95328236, + "learning_rate": 0.0009929410994402065, + "loss": 0.9642334, + "num_input_tokens_seen": 35205664, + "router_z_loss_mlp": 0.20544434, + "step": 426, + "time_per_iteration": 3.7942585945129395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093366, + "balance_loss_mlp": 1.07214665, + "epoch": 0.08214697960754136, + "flos": 512456398848.0, + "grad_norm": 0.069672302328133, + "language_loss": 0.99507213, + "learning_rate": 0.0009928888389840196, + "loss": 1.00600576, + "num_input_tokens_seen": 35280144, + "router_z_loss_mlp": 0.21240234, + "step": 427, + "time_per_iteration": 2.684760093688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073876, + "balance_loss_mlp": 1.05376494, + "epoch": 0.08233936129280492, + "flos": 594850646016.0, + "grad_norm": 0.07796900075206671, + "language_loss": 1.01749206, + "learning_rate": 0.0009928363871714147, + "loss": 1.02823079, + "num_input_tokens_seen": 35344768, + "router_z_loss_mlp": 0.20092773, + "step": 428, + "time_per_iteration": 2.6608195304870605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078126, + "balance_loss_mlp": 1.05796742, + "epoch": 0.08253174297806849, + "flos": 571758594048.0, + "grad_norm": 0.07341701057973313, + "language_loss": 0.95524251, + "learning_rate": 0.0009927837440227556, + "loss": 0.96602374, + "num_input_tokens_seen": 35425536, + "router_z_loss_mlp": 0.20153809, + "step": 429, + "time_per_iteration": 2.824958324432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083273, + "balance_loss_mlp": 1.06413972, + "epoch": 0.08272412466333205, + "flos": 623065623552.0, + "grad_norm": 0.06194570532237157, + "language_loss": 0.90308964, + "learning_rate": 0.0009927309095584798, + "loss": 0.91392243, + "num_input_tokens_seen": 35515440, + "router_z_loss_mlp": 0.19128418, + "step": 430, + "time_per_iteration": 2.9565205574035645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105878, + "balance_loss_mlp": 1.08643484, + "epoch": 0.08291650634859561, + "flos": 513745542144.0, + "grad_norm": 0.09375416706629437, + "language_loss": 1.0225904, + "learning_rate": 0.0009926778837991, + "loss": 1.03364921, + "num_input_tokens_seen": 35580192, + "router_z_loss_mlp": 0.19433594, + "step": 431, + "time_per_iteration": 2.5606777667999268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104802, + "balance_loss_mlp": 1.08521628, + "epoch": 0.08310888803385917, + "flos": 667073083392.0, + "grad_norm": 0.09022222071598751, + "language_loss": 1.00445497, + "learning_rate": 0.000992624666765202, + "loss": 1.01550293, + "num_input_tokens_seen": 35649472, + "router_z_loss_mlp": 0.19580078, + "step": 432, + "time_per_iteration": 2.763514995574951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112312, + "balance_loss_mlp": 1.09166527, + "epoch": 0.08330126971912274, + "flos": 582995404800.0, + "grad_norm": 0.07142121215748316, + "language_loss": 0.98131895, + "learning_rate": 0.000992571258477447, + "loss": 0.99244213, + "num_input_tokens_seen": 35722848, + "router_z_loss_mlp": 0.20654297, + "step": 433, + "time_per_iteration": 2.7823588848114014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086622, + "balance_loss_mlp": 1.06731021, + "epoch": 0.0834936514043863, + "flos": 561064458240.0, + "grad_norm": 0.06618743000622296, + "language_loss": 0.92206728, + "learning_rate": 0.0009925176589565695, + "loss": 0.93293345, + "num_input_tokens_seen": 35800944, + "router_z_loss_mlp": 0.1932373, + "step": 434, + "time_per_iteration": 2.7774362564086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109069, + "balance_loss_mlp": 1.07043648, + "epoch": 0.08368603308964986, + "flos": 494272046592.0, + "grad_norm": 0.07800081613857189, + "language_loss": 1.01949787, + "learning_rate": 0.0009924638682233791, + "loss": 1.03040481, + "num_input_tokens_seen": 35866288, + "router_z_loss_mlp": 0.20251465, + "step": 435, + "time_per_iteration": 2.574716091156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236801, + "balance_loss_mlp": 1.21505737, + "epoch": 0.08387841477491342, + "flos": 1388322312192.0, + "grad_norm": 0.08820287098199171, + "language_loss": 0.79564589, + "learning_rate": 0.0009924098862987589, + "loss": 0.80801398, + "num_input_tokens_seen": 36083040, + "router_z_loss_mlp": 0.21777344, + "step": 436, + "time_per_iteration": 4.521069049835205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087939, + "balance_loss_mlp": 1.06750691, + "epoch": 0.084070796460177, + "flos": 798642796032.0, + "grad_norm": 0.09737991847895365, + "language_loss": 0.92070073, + "learning_rate": 0.0009923557132036668, + "loss": 0.93158013, + "num_input_tokens_seen": 36158816, + "router_z_loss_mlp": 0.2043457, + "step": 437, + "time_per_iteration": 3.0401971340179443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106896, + "balance_loss_mlp": 1.08635592, + "epoch": 0.08426317814544056, + "flos": 558681200640.0, + "grad_norm": 0.07082709395687636, + "language_loss": 0.96077365, + "learning_rate": 0.0009923013489591345, + "loss": 0.97184265, + "num_input_tokens_seen": 36236432, + "router_z_loss_mlp": 0.20532227, + "step": 438, + "time_per_iteration": 2.7388038635253906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138911, + "balance_loss_mlp": 1.11965871, + "epoch": 0.08445555983070412, + "flos": 810057106944.0, + "grad_norm": 0.09946092642967543, + "language_loss": 0.94659293, + "learning_rate": 0.0009922467935862681, + "loss": 0.95798206, + "num_input_tokens_seen": 36327952, + "router_z_loss_mlp": 0.19250488, + "step": 439, + "time_per_iteration": 3.0827929973602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153103, + "balance_loss_mlp": 1.13278937, + "epoch": 0.08464794151596768, + "flos": 509939311104.0, + "grad_norm": 0.08658230076015333, + "language_loss": 0.97196984, + "learning_rate": 0.0009921920471062478, + "loss": 0.9835009, + "num_input_tokens_seen": 36394896, + "router_z_loss_mlp": 0.203125, + "step": 440, + "time_per_iteration": 2.5667247772216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109293, + "balance_loss_mlp": 1.08952785, + "epoch": 0.08484032320123125, + "flos": 556149556224.0, + "grad_norm": 0.0779492699350581, + "language_loss": 0.95526892, + "learning_rate": 0.0009921371095403281, + "loss": 0.96636182, + "num_input_tokens_seen": 36464656, + "router_z_loss_mlp": 0.19763184, + "step": 441, + "time_per_iteration": 2.6504476070404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081558, + "balance_loss_mlp": 1.06137586, + "epoch": 0.08503270488649481, + "flos": 527103742464.0, + "grad_norm": 0.0823758421396894, + "language_loss": 0.98291612, + "learning_rate": 0.0009920819809098379, + "loss": 0.99373174, + "num_input_tokens_seen": 36532208, + "router_z_loss_mlp": 0.20166016, + "step": 442, + "time_per_iteration": 2.5884947776794434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076633, + "balance_loss_mlp": 1.05612862, + "epoch": 0.08522508657175837, + "flos": 613989490176.0, + "grad_norm": 0.07828377396362728, + "language_loss": 0.94043314, + "learning_rate": 0.0009920266612361798, + "loss": 0.95119947, + "num_input_tokens_seen": 36607360, + "router_z_loss_mlp": 0.20507812, + "step": 443, + "time_per_iteration": 2.7464845180511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077144, + "balance_loss_mlp": 1.05650926, + "epoch": 0.08541746825702193, + "flos": 619495119360.0, + "grad_norm": 0.07442656272719532, + "language_loss": 0.94335687, + "learning_rate": 0.0009919711505408308, + "loss": 0.95412827, + "num_input_tokens_seen": 36680688, + "router_z_loss_mlp": 0.2064209, + "step": 444, + "time_per_iteration": 2.7615623474121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092391, + "balance_loss_mlp": 1.07126665, + "epoch": 0.08560984994228549, + "flos": 482671471104.0, + "grad_norm": 0.08601843511227286, + "language_loss": 0.92049706, + "learning_rate": 0.000991915448845342, + "loss": 0.93142092, + "num_input_tokens_seen": 36746288, + "router_z_loss_mlp": 0.21130371, + "step": 445, + "time_per_iteration": 2.519644260406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103035, + "balance_loss_mlp": 1.08145857, + "epoch": 0.08580223162754906, + "flos": 516897027072.0, + "grad_norm": 0.07781715705073443, + "language_loss": 1.01207459, + "learning_rate": 0.000991859556171339, + "loss": 1.02310491, + "num_input_tokens_seen": 36812528, + "router_z_loss_mlp": 0.21569824, + "step": 446, + "time_per_iteration": 2.5678694248199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116922, + "balance_loss_mlp": 1.09462976, + "epoch": 0.08599461331281262, + "flos": 531215511552.0, + "grad_norm": 0.11213971543052093, + "language_loss": 1.02931881, + "learning_rate": 0.000991803472540521, + "loss": 1.040488, + "num_input_tokens_seen": 36879248, + "router_z_loss_mlp": 0.22302246, + "step": 447, + "time_per_iteration": 2.6309196949005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124555, + "balance_loss_mlp": 1.10302639, + "epoch": 0.08618699499807618, + "flos": 789966743040.0, + "grad_norm": 0.07287006723198586, + "language_loss": 0.97443926, + "learning_rate": 0.0009917471979746615, + "loss": 0.98568487, + "num_input_tokens_seen": 36951376, + "router_z_loss_mlp": 0.21533203, + "step": 448, + "time_per_iteration": 2.9742491245269775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134564, + "balance_loss_mlp": 1.11266506, + "epoch": 0.08637937668333974, + "flos": 565707317760.0, + "grad_norm": 0.08202115093309782, + "language_loss": 0.97199845, + "learning_rate": 0.0009916907324956086, + "loss": 0.98334408, + "num_input_tokens_seen": 37025936, + "router_z_loss_mlp": 0.21923828, + "step": 449, + "time_per_iteration": 2.704089641571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151497, + "balance_loss_mlp": 1.12693954, + "epoch": 0.08657175836860331, + "flos": 444930665472.0, + "grad_norm": 0.09325215593581063, + "language_loss": 0.93441564, + "learning_rate": 0.0009916340761252837, + "loss": 0.9459306, + "num_input_tokens_seen": 37095872, + "router_z_loss_mlp": 0.24536133, + "step": 450, + "time_per_iteration": 2.5866575241088867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158359, + "balance_loss_mlp": 1.13567328, + "epoch": 0.08676414005386687, + "flos": 843789450240.0, + "grad_norm": 0.23711660967347972, + "language_loss": 0.90976942, + "learning_rate": 0.0009915772288856832, + "loss": 0.92135304, + "num_input_tokens_seen": 37179072, + "router_z_loss_mlp": 0.22668457, + "step": 451, + "time_per_iteration": 3.109010696411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118071, + "balance_loss_mlp": 1.15827537, + "epoch": 0.08695652173913043, + "flos": 602995608576.0, + "grad_norm": 0.08699490701012727, + "language_loss": 0.92036849, + "learning_rate": 0.000991520190798877, + "loss": 0.93217564, + "num_input_tokens_seen": 37260288, + "router_z_loss_mlp": 0.22424316, + "step": 452, + "time_per_iteration": 2.8523812294006348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181191, + "balance_loss_mlp": 1.15807629, + "epoch": 0.08714890342439399, + "flos": 730423028736.0, + "grad_norm": 0.09293440668835976, + "language_loss": 1.01637089, + "learning_rate": 0.0009914629618870089, + "loss": 1.02818286, + "num_input_tokens_seen": 37331136, + "router_z_loss_mlp": 0.23095703, + "step": 453, + "time_per_iteration": 2.882887125015259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142362, + "balance_loss_mlp": 1.12891519, + "epoch": 0.08734128510965757, + "flos": 1481518232064.0, + "grad_norm": 0.0645312523276542, + "language_loss": 0.78675872, + "learning_rate": 0.0009914055421722976, + "loss": 0.79818237, + "num_input_tokens_seen": 37559040, + "router_z_loss_mlp": 0.13476562, + "step": 454, + "time_per_iteration": 4.717878103256226 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103219, + "balance_loss_mlp": 1.09034455, + "epoch": 0.08753366679492113, + "flos": 1522214083584.0, + "grad_norm": 0.04274098512475534, + "language_loss": 0.81427962, + "learning_rate": 0.0009913479316770353, + "loss": 0.82531178, + "num_input_tokens_seen": 37785136, + "router_z_loss_mlp": 0.12890625, + "step": 455, + "time_per_iteration": 4.838243246078491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171645, + "balance_loss_mlp": 1.14951944, + "epoch": 0.08772604848018468, + "flos": 720935078400.0, + "grad_norm": 0.10543082910841049, + "language_loss": 0.94423014, + "learning_rate": 0.0009912901304235883, + "loss": 0.95594656, + "num_input_tokens_seen": 37858832, + "router_z_loss_mlp": 0.22131348, + "step": 456, + "time_per_iteration": 2.9432015419006348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150762, + "balance_loss_mlp": 1.12861252, + "epoch": 0.08791843016544824, + "flos": 707926086144.0, + "grad_norm": 0.10980567381029156, + "language_loss": 0.91300154, + "learning_rate": 0.000991232138434397, + "loss": 0.92450917, + "num_input_tokens_seen": 37931856, + "router_z_loss_mlp": 0.22143555, + "step": 457, + "time_per_iteration": 2.832761526107788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113929, + "balance_loss_mlp": 1.09195828, + "epoch": 0.08811081185071182, + "flos": 472799407104.0, + "grad_norm": 0.1324680836731367, + "language_loss": 0.97845554, + "learning_rate": 0.000991173955731976, + "loss": 0.98959482, + "num_input_tokens_seen": 38002432, + "router_z_loss_mlp": 0.21960449, + "step": 458, + "time_per_iteration": 2.660696506500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100778, + "balance_loss_mlp": 1.07958269, + "epoch": 0.08830319353597538, + "flos": 684647769600.0, + "grad_norm": 0.07138233575581546, + "language_loss": 1.0178268, + "learning_rate": 0.0009911155823389137, + "loss": 1.02883458, + "num_input_tokens_seen": 38081648, + "router_z_loss_mlp": 0.21203613, + "step": 459, + "time_per_iteration": 2.9878122806549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105128, + "balance_loss_mlp": 1.08344412, + "epoch": 0.08849557522123894, + "flos": 573235411968.0, + "grad_norm": 0.0735053314112025, + "language_loss": 0.9764787, + "learning_rate": 0.000991057018277873, + "loss": 0.98752999, + "num_input_tokens_seen": 38153424, + "router_z_loss_mlp": 0.21679688, + "step": 460, + "time_per_iteration": 2.707247018814087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116963, + "balance_loss_mlp": 1.09422946, + "epoch": 0.0886879569065025, + "flos": 564303283200.0, + "grad_norm": 0.10552034142073316, + "language_loss": 0.9759655, + "learning_rate": 0.0009909982635715898, + "loss": 0.98713505, + "num_input_tokens_seen": 38223008, + "router_z_loss_mlp": 0.22729492, + "step": 461, + "time_per_iteration": 2.609016180038452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120097, + "balance_loss_mlp": 1.09760189, + "epoch": 0.08888033859176607, + "flos": 563609249280.0, + "grad_norm": 0.09185893532484944, + "language_loss": 0.96625364, + "learning_rate": 0.0009909393182428751, + "loss": 0.97745454, + "num_input_tokens_seen": 38294592, + "router_z_loss_mlp": 0.22497559, + "step": 462, + "time_per_iteration": 2.682616949081421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116703, + "balance_loss_mlp": 1.09437466, + "epoch": 0.08907272027702963, + "flos": 465517214208.0, + "grad_norm": 0.08888403374641002, + "language_loss": 0.91300213, + "learning_rate": 0.000990880182314614, + "loss": 0.92416912, + "num_input_tokens_seen": 38365792, + "router_z_loss_mlp": 0.22314453, + "step": 463, + "time_per_iteration": 2.732579469680786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122985, + "balance_loss_mlp": 1.10014486, + "epoch": 0.08926510196229319, + "flos": 681200921088.0, + "grad_norm": 0.07408309604525525, + "language_loss": 0.92294347, + "learning_rate": 0.0009908208558097643, + "loss": 0.93417335, + "num_input_tokens_seen": 38447776, + "router_z_loss_mlp": 0.22839355, + "step": 464, + "time_per_iteration": 2.910313606262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137827, + "balance_loss_mlp": 1.115273, + "epoch": 0.08945748364755675, + "flos": 596411831808.0, + "grad_norm": 0.08673846989427919, + "language_loss": 0.93827909, + "learning_rate": 0.000990761338751359, + "loss": 0.94965738, + "num_input_tokens_seen": 38521632, + "router_z_loss_mlp": 0.22546387, + "step": 465, + "time_per_iteration": 2.827570676803589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133815, + "balance_loss_mlp": 1.12222791, + "epoch": 0.08964986533282032, + "flos": 1585082400768.0, + "grad_norm": 0.06082202694548154, + "language_loss": 0.73659623, + "learning_rate": 0.0009907016311625045, + "loss": 0.74793446, + "num_input_tokens_seen": 38760528, + "router_z_loss_mlp": 0.11572266, + "step": 466, + "time_per_iteration": 4.960917234420776 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177765, + "balance_loss_mlp": 1.15419745, + "epoch": 0.08984224701808388, + "flos": 533268499968.0, + "grad_norm": 0.4900596090566038, + "language_loss": 0.96587038, + "learning_rate": 0.0009906417330663815, + "loss": 0.97764802, + "num_input_tokens_seen": 38827200, + "router_z_loss_mlp": 0.23571777, + "step": 467, + "time_per_iteration": 2.5937299728393555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157084, + "balance_loss_mlp": 1.13383865, + "epoch": 0.09003462870334744, + "flos": 478702296576.0, + "grad_norm": 0.08613132202477504, + "language_loss": 0.92798859, + "learning_rate": 0.0009905816444862442, + "loss": 0.93955946, + "num_input_tokens_seen": 38891984, + "router_z_loss_mlp": 0.23217773, + "step": 468, + "time_per_iteration": 2.6012237071990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164868, + "balance_loss_mlp": 1.14150274, + "epoch": 0.090227010388611, + "flos": 653307448320.0, + "grad_norm": 0.08218040805372613, + "language_loss": 0.90769458, + "learning_rate": 0.0009905213654454216, + "loss": 0.91934329, + "num_input_tokens_seen": 38977136, + "router_z_loss_mlp": 0.23364258, + "step": 469, + "time_per_iteration": 2.8727760314941406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176439, + "balance_loss_mlp": 1.15152478, + "epoch": 0.09041939207387456, + "flos": 617894645760.0, + "grad_norm": 0.09256259391525869, + "language_loss": 0.97864139, + "learning_rate": 0.0009904608959673158, + "loss": 0.9904058, + "num_input_tokens_seen": 39052224, + "router_z_loss_mlp": 0.24938965, + "step": 470, + "time_per_iteration": 2.7991952896118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151805, + "balance_loss_mlp": 1.12671185, + "epoch": 0.09061177375913813, + "flos": 454137808896.0, + "grad_norm": 0.09693984756275055, + "language_loss": 0.97988749, + "learning_rate": 0.000990400236075403, + "loss": 0.99140555, + "num_input_tokens_seen": 39116832, + "router_z_loss_mlp": 0.25109863, + "step": 471, + "time_per_iteration": 2.523508310317993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125394, + "balance_loss_mlp": 1.10119498, + "epoch": 0.0908041554444017, + "flos": 543982984704.0, + "grad_norm": 0.09250187628709369, + "language_loss": 0.9490509, + "learning_rate": 0.0009903393857932338, + "loss": 0.96030486, + "num_input_tokens_seen": 39190528, + "router_z_loss_mlp": 0.24194336, + "step": 472, + "time_per_iteration": 2.7065584659576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124084, + "balance_loss_mlp": 1.09912193, + "epoch": 0.09099653712966525, + "flos": 564052999680.0, + "grad_norm": 0.10897832311722938, + "language_loss": 0.93660218, + "learning_rate": 0.0009902783451444317, + "loss": 0.94784307, + "num_input_tokens_seen": 39263168, + "router_z_loss_mlp": 0.24963379, + "step": 473, + "time_per_iteration": 2.7067277431488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108649, + "balance_loss_mlp": 1.08496177, + "epoch": 0.09118891881492881, + "flos": 474300956160.0, + "grad_norm": 0.09402902414949979, + "language_loss": 0.97273493, + "learning_rate": 0.0009902171141526956, + "loss": 0.98382139, + "num_input_tokens_seen": 39330784, + "router_z_loss_mlp": 0.23693848, + "step": 474, + "time_per_iteration": 2.5281569957733154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087186, + "balance_loss_mlp": 1.06240201, + "epoch": 0.09138130050019239, + "flos": 545579076096.0, + "grad_norm": 0.06728788346792411, + "language_loss": 0.85273343, + "learning_rate": 0.000990155692841797, + "loss": 0.86360526, + "num_input_tokens_seen": 39417472, + "router_z_loss_mlp": 0.2479248, + "step": 475, + "time_per_iteration": 2.970107316970825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010835, + "balance_loss_mlp": 1.0587163, + "epoch": 0.09157368218545595, + "flos": 732397441536.0, + "grad_norm": 0.07226189405033341, + "language_loss": 0.97062063, + "learning_rate": 0.0009900940812355818, + "loss": 0.98145562, + "num_input_tokens_seen": 39488656, + "router_z_loss_mlp": 0.24768066, + "step": 476, + "time_per_iteration": 2.959184169769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096233, + "balance_loss_mlp": 1.07208097, + "epoch": 0.0917660638707195, + "flos": 610709967360.0, + "grad_norm": 0.09034653129128065, + "language_loss": 0.92824447, + "learning_rate": 0.00099003227935797, + "loss": 0.93920678, + "num_input_tokens_seen": 39558224, + "router_z_loss_mlp": 0.24157715, + "step": 477, + "time_per_iteration": 2.7553765773773193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113263, + "balance_loss_mlp": 1.08839583, + "epoch": 0.09195844555598306, + "flos": 655561257984.0, + "grad_norm": 0.09830094540804109, + "language_loss": 0.95358098, + "learning_rate": 0.000989970287232955, + "loss": 0.96471357, + "num_input_tokens_seen": 39629856, + "router_z_loss_mlp": 0.2487793, + "step": 478, + "time_per_iteration": 2.7916457653045654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112064, + "balance_loss_mlp": 1.09633327, + "epoch": 0.09215082724124664, + "flos": 476339387904.0, + "grad_norm": 0.08054303064285366, + "language_loss": 0.93560576, + "learning_rate": 0.0009899081048846043, + "loss": 0.94681215, + "num_input_tokens_seen": 39695984, + "router_z_loss_mlp": 0.24267578, + "step": 479, + "time_per_iteration": 2.554161787033081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114732, + "balance_loss_mlp": 1.12177348, + "epoch": 0.0923432089265102, + "flos": 524051182080.0, + "grad_norm": 0.1186512856896222, + "language_loss": 0.97593725, + "learning_rate": 0.0009898457323370593, + "loss": 0.98741049, + "num_input_tokens_seen": 39760256, + "router_z_loss_mlp": 0.25549316, + "step": 480, + "time_per_iteration": 2.5794191360473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131558, + "balance_loss_mlp": 1.10608315, + "epoch": 0.09253559061177376, + "flos": 545302651392.0, + "grad_norm": 0.10688941209840569, + "language_loss": 0.96892118, + "learning_rate": 0.000989783169614535, + "loss": 0.98023689, + "num_input_tokens_seen": 39827984, + "router_z_loss_mlp": 0.25512695, + "step": 481, + "time_per_iteration": 2.6676101684570312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01336494, + "balance_loss_mlp": 1.32304764, + "epoch": 0.09272797229703732, + "flos": 1537222219776.0, + "grad_norm": 0.112558059824644, + "language_loss": 0.78752756, + "learning_rate": 0.0009897204167413206, + "loss": 0.80089253, + "num_input_tokens_seen": 40056688, + "router_z_loss_mlp": 0.13476562, + "step": 482, + "time_per_iteration": 4.910710096359253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121205, + "balance_loss_mlp": 1.09537172, + "epoch": 0.09292035398230089, + "flos": 689501624832.0, + "grad_norm": 0.08905484371867754, + "language_loss": 0.93989253, + "learning_rate": 0.000989657473741779, + "loss": 0.95110452, + "num_input_tokens_seen": 40133120, + "router_z_loss_mlp": 0.25866699, + "step": 483, + "time_per_iteration": 2.8736467361450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120092, + "balance_loss_mlp": 1.09219658, + "epoch": 0.09311273566756445, + "flos": 509482414080.0, + "grad_norm": 0.10011855628381364, + "language_loss": 0.94861096, + "learning_rate": 0.0009895943406403465, + "loss": 0.95981193, + "num_input_tokens_seen": 40206464, + "router_z_loss_mlp": 0.27905273, + "step": 484, + "time_per_iteration": 2.7233312129974365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114409, + "balance_loss_mlp": 1.08641887, + "epoch": 0.09330511735282801, + "flos": 659111413248.0, + "grad_norm": 0.10884122740481975, + "language_loss": 0.87602448, + "learning_rate": 0.0009895310174615338, + "loss": 0.88716859, + "num_input_tokens_seen": 40277744, + "router_z_loss_mlp": 0.2800293, + "step": 485, + "time_per_iteration": 2.7538061141967773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098211, + "balance_loss_mlp": 1.08533621, + "epoch": 0.09349749903809157, + "flos": 1452054809088.0, + "grad_norm": 0.04867374252302138, + "language_loss": 0.75718516, + "learning_rate": 0.0009894675042299251, + "loss": 0.76816726, + "num_input_tokens_seen": 40503664, + "router_z_loss_mlp": 0.12890625, + "step": 486, + "time_per_iteration": 4.681119441986084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121456, + "balance_loss_mlp": 1.09291732, + "epoch": 0.09368988072335514, + "flos": 520614508032.0, + "grad_norm": 0.07858969791005947, + "language_loss": 0.92458618, + "learning_rate": 0.0009894038009701782, + "loss": 0.93580067, + "num_input_tokens_seen": 40571376, + "router_z_loss_mlp": 0.28515625, + "step": 487, + "time_per_iteration": 2.6114649772644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153128, + "balance_loss_mlp": 1.12148952, + "epoch": 0.0938822624086187, + "flos": 497502107136.0, + "grad_norm": 0.11959755259003642, + "language_loss": 0.91595036, + "learning_rate": 0.0009893399077070253, + "loss": 0.92748165, + "num_input_tokens_seen": 40638096, + "router_z_loss_mlp": 0.31616211, + "step": 488, + "time_per_iteration": 2.5603692531585693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127952, + "balance_loss_mlp": 1.09845996, + "epoch": 0.09407464409388226, + "flos": 532948405248.0, + "grad_norm": 0.09098963794592498, + "language_loss": 0.89760649, + "learning_rate": 0.0009892758244652718, + "loss": 0.90888608, + "num_input_tokens_seen": 40710992, + "router_z_loss_mlp": 0.29516602, + "step": 489, + "time_per_iteration": 2.65938401222229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127724, + "balance_loss_mlp": 1.09568012, + "epoch": 0.09426702577914582, + "flos": 585736634880.0, + "grad_norm": 0.09102778373185845, + "language_loss": 0.94519842, + "learning_rate": 0.0009892115512697968, + "loss": 0.95647562, + "num_input_tokens_seen": 40778896, + "router_z_loss_mlp": 0.3203125, + "step": 490, + "time_per_iteration": 2.6538186073303223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120065, + "balance_loss_mlp": 1.08926105, + "epoch": 0.0944594074644094, + "flos": 503081929728.0, + "grad_norm": 0.07724049493821064, + "language_loss": 0.96624851, + "learning_rate": 0.0009891470881455537, + "loss": 0.97744912, + "num_input_tokens_seen": 40853376, + "router_z_loss_mlp": 0.30810547, + "step": 491, + "time_per_iteration": 2.699535608291626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122711, + "balance_loss_mlp": 1.09145451, + "epoch": 0.09465178914967295, + "flos": 570748847616.0, + "grad_norm": 0.0816499633869022, + "language_loss": 0.94510269, + "learning_rate": 0.0009890824351175692, + "loss": 0.95632982, + "num_input_tokens_seen": 40923776, + "router_z_loss_mlp": 0.31225586, + "step": 492, + "time_per_iteration": 2.678191661834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125893, + "balance_loss_mlp": 1.09418344, + "epoch": 0.09484417083493651, + "flos": 549098707968.0, + "grad_norm": 0.07977284094064935, + "language_loss": 0.98609412, + "learning_rate": 0.0009890175922109435, + "loss": 0.99735302, + "num_input_tokens_seen": 40996848, + "router_z_loss_mlp": 0.31689453, + "step": 493, + "time_per_iteration": 2.6466987133026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138627, + "balance_loss_mlp": 1.10534418, + "epoch": 0.09503655252020007, + "flos": 823552109568.0, + "grad_norm": 0.09331424233507904, + "language_loss": 0.96939492, + "learning_rate": 0.0009889525594508513, + "loss": 0.9807812, + "num_input_tokens_seen": 41071280, + "router_z_loss_mlp": 0.33300781, + "step": 494, + "time_per_iteration": 3.009894371032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153225, + "balance_loss_mlp": 1.12218332, + "epoch": 0.09522893420546363, + "flos": 404397757440.0, + "grad_norm": 0.08141129996203125, + "language_loss": 0.91043431, + "learning_rate": 0.0009888873368625404, + "loss": 0.92196655, + "num_input_tokens_seen": 41136304, + "router_z_loss_mlp": 0.31030273, + "step": 495, + "time_per_iteration": 2.4904890060424805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171726, + "balance_loss_mlp": 1.14025438, + "epoch": 0.0954213158907272, + "flos": 690707810304.0, + "grad_norm": 0.08256479818708104, + "language_loss": 0.94339681, + "learning_rate": 0.0009888219244713326, + "loss": 0.95511413, + "num_input_tokens_seen": 41212384, + "router_z_loss_mlp": 0.31445312, + "step": 496, + "time_per_iteration": 2.8060483932495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181664, + "balance_loss_mlp": 1.15033531, + "epoch": 0.09561369757599077, + "flos": 518739019776.0, + "grad_norm": 0.10472312979641793, + "language_loss": 0.94370055, + "learning_rate": 0.0009887563223026229, + "loss": 0.95551717, + "num_input_tokens_seen": 41282528, + "router_z_loss_mlp": 0.31323242, + "step": 497, + "time_per_iteration": 2.6536803245544434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228939, + "balance_loss_mlp": 1.21549225, + "epoch": 0.09580607926125433, + "flos": 1384825849344.0, + "grad_norm": 0.04877985805939708, + "language_loss": 0.7906816, + "learning_rate": 0.0009886905303818805, + "loss": 0.80297101, + "num_input_tokens_seen": 41512256, + "router_z_loss_mlp": 0.13476562, + "step": 498, + "time_per_iteration": 4.874605178833008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197245, + "balance_loss_mlp": 1.16455829, + "epoch": 0.09599846094651789, + "flos": 717090969600.0, + "grad_norm": 0.08863465655244346, + "language_loss": 0.93284124, + "learning_rate": 0.0009886245487346482, + "loss": 0.94481373, + "num_input_tokens_seen": 41596816, + "router_z_loss_mlp": 0.3269043, + "step": 499, + "time_per_iteration": 3.047938108444214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011865, + "balance_loss_mlp": 1.15474319, + "epoch": 0.09619084263178146, + "flos": 385824909312.0, + "grad_norm": 0.09673466805801513, + "language_loss": 0.96238041, + "learning_rate": 0.0009885583773865422, + "loss": 0.97424543, + "num_input_tokens_seen": 41658544, + "router_z_loss_mlp": 0.31762695, + "step": 500, + "time_per_iteration": 2.402763843536377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140705, + "balance_loss_mlp": 1.1099968, + "epoch": 0.09638322431704502, + "flos": 533869401600.0, + "grad_norm": 0.08556524095898377, + "language_loss": 0.93457472, + "learning_rate": 0.0009884920163632524, + "loss": 0.94598186, + "num_input_tokens_seen": 41730736, + "router_z_loss_mlp": 0.30688477, + "step": 501, + "time_per_iteration": 2.7420296669006348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155853, + "balance_loss_mlp": 1.12373805, + "epoch": 0.09657560600230858, + "flos": 500426629632.0, + "grad_norm": 0.08462195742795481, + "language_loss": 0.95688182, + "learning_rate": 0.000988425465690543, + "loss": 0.96844035, + "num_input_tokens_seen": 41797824, + "router_z_loss_mlp": 0.32104492, + "step": 502, + "time_per_iteration": 2.5425736904144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163304, + "balance_loss_mlp": 1.13099861, + "epoch": 0.09676798768757214, + "flos": 528995197440.0, + "grad_norm": 0.07192036847451248, + "language_loss": 0.92721838, + "learning_rate": 0.0009883587253942505, + "loss": 0.93885148, + "num_input_tokens_seen": 41875520, + "router_z_loss_mlp": 0.32324219, + "step": 503, + "time_per_iteration": 2.8340742588043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188959, + "balance_loss_mlp": 1.15598607, + "epoch": 0.09696036937283571, + "flos": 463379857920.0, + "grad_norm": 0.0888689340699796, + "language_loss": 0.99166393, + "learning_rate": 0.0009882917955002862, + "loss": 1.00355351, + "num_input_tokens_seen": 41942224, + "router_z_loss_mlp": 0.32983398, + "step": 504, + "time_per_iteration": 2.560448169708252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147535, + "balance_loss_mlp": 1.11606395, + "epoch": 0.09715275105809927, + "flos": 534716204544.0, + "grad_norm": 0.07251663236407552, + "language_loss": 0.9150176, + "learning_rate": 0.0009882246760346343, + "loss": 0.92649293, + "num_input_tokens_seen": 42007552, + "router_z_loss_mlp": 0.31420898, + "step": 505, + "time_per_iteration": 2.6460299491882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114081, + "balance_loss_mlp": 1.10714495, + "epoch": 0.09734513274336283, + "flos": 454713979392.0, + "grad_norm": 0.10061537251918176, + "language_loss": 0.96100289, + "learning_rate": 0.0009881573670233533, + "loss": 0.97241098, + "num_input_tokens_seen": 42071760, + "router_z_loss_mlp": 0.33666992, + "step": 506, + "time_per_iteration": 2.5137040615081787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109977, + "balance_loss_mlp": 1.08029366, + "epoch": 0.09753751442862639, + "flos": 508551243264.0, + "grad_norm": 0.0762964042901656, + "language_loss": 0.91185808, + "learning_rate": 0.0009880898684925747, + "loss": 0.92295784, + "num_input_tokens_seen": 42140688, + "router_z_loss_mlp": 0.29663086, + "step": 507, + "time_per_iteration": 2.6571738719940186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110119, + "balance_loss_mlp": 1.07133985, + "epoch": 0.09772989611388996, + "flos": 484030425600.0, + "grad_norm": 0.07531505250568626, + "language_loss": 0.89554358, + "learning_rate": 0.0009880221804685037, + "loss": 0.90655547, + "num_input_tokens_seen": 42208544, + "router_z_loss_mlp": 0.29882812, + "step": 508, + "time_per_iteration": 2.596289873123169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01404721, + "balance_loss_mlp": 1.39136958, + "epoch": 0.09792227779915352, + "flos": 1565306339328.0, + "grad_norm": 0.10151454340945995, + "language_loss": 0.79344422, + "learning_rate": 0.000987954302977419, + "loss": 0.80749142, + "num_input_tokens_seen": 42426624, + "router_z_loss_mlp": 0.13378906, + "step": 509, + "time_per_iteration": 4.724441051483154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116621, + "balance_loss_mlp": 1.08655643, + "epoch": 0.09811465948441708, + "flos": 587529165312.0, + "grad_norm": 0.08257009801201759, + "language_loss": 0.94708043, + "learning_rate": 0.0009878862360456733, + "loss": 0.95824659, + "num_input_tokens_seen": 42494592, + "router_z_loss_mlp": 0.30029297, + "step": 510, + "time_per_iteration": 2.703011989593506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122701, + "balance_loss_mlp": 1.09406662, + "epoch": 0.09830704116968064, + "flos": 612719285760.0, + "grad_norm": 0.06191460590209878, + "language_loss": 0.88457662, + "learning_rate": 0.0009878179796996922, + "loss": 0.89580369, + "num_input_tokens_seen": 42564944, + "router_z_loss_mlp": 0.28637695, + "step": 511, + "time_per_iteration": 2.7212226390838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128587, + "balance_loss_mlp": 1.09885597, + "epoch": 0.09849942285494422, + "flos": 538528227840.0, + "grad_norm": 0.06874751685339883, + "language_loss": 0.9199326, + "learning_rate": 0.0009877495339659754, + "loss": 0.9312185, + "num_input_tokens_seen": 42645616, + "router_z_loss_mlp": 0.29724121, + "step": 512, + "time_per_iteration": 2.7520575523376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111609, + "balance_loss_mlp": 1.08826661, + "epoch": 0.09869180454020778, + "flos": 620193535488.0, + "grad_norm": 0.06953003964378547, + "language_loss": 0.87301105, + "learning_rate": 0.000987680898871096, + "loss": 0.88417196, + "num_input_tokens_seen": 42713632, + "router_z_loss_mlp": 0.27832031, + "step": 513, + "time_per_iteration": 2.7121992111206055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134292, + "balance_loss_mlp": 1.10401261, + "epoch": 0.09888418622547133, + "flos": 811375363584.0, + "grad_norm": 0.1024184057853134, + "language_loss": 0.87763435, + "learning_rate": 0.0009876120744417, + "loss": 0.88897729, + "num_input_tokens_seen": 42789088, + "router_z_loss_mlp": 0.30273438, + "step": 514, + "time_per_iteration": 2.971573829650879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123143, + "balance_loss_mlp": 1.09267306, + "epoch": 0.0990765679107349, + "flos": 535548450816.0, + "grad_norm": 0.06764912074049458, + "language_loss": 0.95588082, + "learning_rate": 0.0009875430607045078, + "loss": 0.9671123, + "num_input_tokens_seen": 42861168, + "router_z_loss_mlp": 0.3046875, + "step": 515, + "time_per_iteration": 2.6630361080169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108813, + "balance_loss_mlp": 1.08072746, + "epoch": 0.09926894959599845, + "flos": 587607740928.0, + "grad_norm": 0.06593749006245919, + "language_loss": 0.92788792, + "learning_rate": 0.000987473857686313, + "loss": 0.93897605, + "num_input_tokens_seen": 42934112, + "router_z_loss_mlp": 0.28076172, + "step": 516, + "time_per_iteration": 2.710068702697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121556, + "balance_loss_mlp": 1.09039485, + "epoch": 0.09946133128126203, + "flos": 640947409920.0, + "grad_norm": 0.08862761474564218, + "language_loss": 0.9451825, + "learning_rate": 0.0009874044654139824, + "loss": 0.95639801, + "num_input_tokens_seen": 43005248, + "router_z_loss_mlp": 0.3112793, + "step": 517, + "time_per_iteration": 2.729975461959839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117034, + "balance_loss_mlp": 1.08520555, + "epoch": 0.09965371296652559, + "flos": 465546327552.0, + "grad_norm": 0.09157938746936445, + "language_loss": 0.9250825, + "learning_rate": 0.0009873348839144563, + "loss": 0.93625283, + "num_input_tokens_seen": 43070576, + "router_z_loss_mlp": 0.31811523, + "step": 518, + "time_per_iteration": 2.5117127895355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112516, + "balance_loss_mlp": 1.09540534, + "epoch": 0.09984609465178915, + "flos": 483365505024.0, + "grad_norm": 0.07736257304557469, + "language_loss": 0.9674046, + "learning_rate": 0.000987265113214749, + "loss": 0.97865617, + "num_input_tokens_seen": 43138048, + "router_z_loss_mlp": 0.29711914, + "step": 519, + "time_per_iteration": 2.5816774368286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147544, + "balance_loss_mlp": 1.11421299, + "epoch": 0.1000384763370527, + "flos": 568764260352.0, + "grad_norm": 0.08763817133734854, + "language_loss": 0.96583092, + "learning_rate": 0.0009871951533419476, + "loss": 0.97730637, + "num_input_tokens_seen": 43207600, + "router_z_loss_mlp": 0.33325195, + "step": 520, + "time_per_iteration": 2.638664484024048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140597, + "balance_loss_mlp": 1.108482, + "epoch": 0.10023085802231628, + "flos": 545515057152.0, + "grad_norm": 0.10925869968591369, + "language_loss": 0.88377398, + "learning_rate": 0.0009871250043232132, + "loss": 0.89517999, + "num_input_tokens_seen": 43285104, + "router_z_loss_mlp": 0.32104492, + "step": 521, + "time_per_iteration": 2.70491886138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136934, + "balance_loss_mlp": 1.10555792, + "epoch": 0.10042323970757984, + "flos": 503208557568.0, + "grad_norm": 0.07694864026409119, + "language_loss": 0.87725985, + "learning_rate": 0.0009870546661857797, + "loss": 0.8886292, + "num_input_tokens_seen": 43353312, + "router_z_loss_mlp": 0.31347656, + "step": 522, + "time_per_iteration": 2.653456211090088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126678, + "balance_loss_mlp": 1.09380031, + "epoch": 0.1006156213928434, + "flos": 770084402688.0, + "grad_norm": 0.08414569380370593, + "language_loss": 0.95787346, + "learning_rate": 0.0009869841389569553, + "loss": 0.96914017, + "num_input_tokens_seen": 43427680, + "router_z_loss_mlp": 0.32885742, + "step": 523, + "time_per_iteration": 2.9442663192749023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116557, + "balance_loss_mlp": 1.08625388, + "epoch": 0.10080800307810696, + "flos": 489786338304.0, + "grad_norm": 0.06587351152736676, + "language_loss": 0.88897854, + "learning_rate": 0.0009869134226641206, + "loss": 0.90014416, + "num_input_tokens_seen": 43495200, + "router_z_loss_mlp": 0.30297852, + "step": 524, + "time_per_iteration": 2.5559868812561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110225, + "balance_loss_mlp": 1.07746601, + "epoch": 0.10100038476337053, + "flos": 454478252544.0, + "grad_norm": 0.09167866019985617, + "language_loss": 0.88383424, + "learning_rate": 0.0009868425173347303, + "loss": 0.89493656, + "num_input_tokens_seen": 43566256, + "router_z_loss_mlp": 0.32788086, + "step": 525, + "time_per_iteration": 2.645116090774536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111349, + "balance_loss_mlp": 1.08216143, + "epoch": 0.10119276644863409, + "flos": 556155348480.0, + "grad_norm": 0.07288604326691553, + "language_loss": 0.96749896, + "learning_rate": 0.0009867714229963125, + "loss": 0.97863394, + "num_input_tokens_seen": 43639696, + "router_z_loss_mlp": 0.31323242, + "step": 526, + "time_per_iteration": 2.730703592300415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106354, + "balance_loss_mlp": 1.07540703, + "epoch": 0.10138514813389765, + "flos": 515990587392.0, + "grad_norm": 0.07095113284061857, + "language_loss": 0.93916923, + "learning_rate": 0.000986700139676468, + "loss": 0.95023274, + "num_input_tokens_seen": 43703872, + "router_z_loss_mlp": 0.30932617, + "step": 527, + "time_per_iteration": 2.5836338996887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110446, + "balance_loss_mlp": 1.07833052, + "epoch": 0.10157752981916121, + "flos": 500323322880.0, + "grad_norm": 0.06933811905919615, + "language_loss": 0.91673893, + "learning_rate": 0.0009866286674028717, + "loss": 0.92784333, + "num_input_tokens_seen": 43774416, + "router_z_loss_mlp": 0.32104492, + "step": 528, + "time_per_iteration": 2.7084739208221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101831, + "balance_loss_mlp": 1.07100391, + "epoch": 0.10176991150442478, + "flos": 656444376576.0, + "grad_norm": 0.07189407365130172, + "language_loss": 0.88586026, + "learning_rate": 0.0009865570062032717, + "loss": 0.8968786, + "num_input_tokens_seen": 43853376, + "router_z_loss_mlp": 0.30810547, + "step": 529, + "time_per_iteration": 2.9141628742218018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103952, + "balance_loss_mlp": 1.07443571, + "epoch": 0.10196229318968834, + "flos": 572974953984.0, + "grad_norm": 0.06841647032337263, + "language_loss": 0.93659967, + "learning_rate": 0.0009864851561054893, + "loss": 0.94763923, + "num_input_tokens_seen": 43929632, + "router_z_loss_mlp": 0.29516602, + "step": 530, + "time_per_iteration": 2.7539894580841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090977, + "balance_loss_mlp": 1.06110358, + "epoch": 0.1021546748749519, + "flos": 517946061312.0, + "grad_norm": 0.07340246055426732, + "language_loss": 0.91722125, + "learning_rate": 0.0009864131171374191, + "loss": 0.92813098, + "num_input_tokens_seen": 44002144, + "router_z_loss_mlp": 0.29882812, + "step": 531, + "time_per_iteration": 2.6921956539154053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109944, + "balance_loss_mlp": 1.06749225, + "epoch": 0.10234705656021546, + "flos": 609470286336.0, + "grad_norm": 0.07867637119915549, + "language_loss": 0.91107762, + "learning_rate": 0.0009863408893270292, + "loss": 0.92207205, + "num_input_tokens_seen": 44078272, + "router_z_loss_mlp": 0.31933594, + "step": 532, + "time_per_iteration": 2.7911570072174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106913, + "balance_loss_mlp": 1.07396317, + "epoch": 0.10253943824547904, + "flos": 601473710592.0, + "grad_norm": 0.08191923529880715, + "language_loss": 0.86522454, + "learning_rate": 0.0009862684727023605, + "loss": 0.87629366, + "num_input_tokens_seen": 44152304, + "router_z_loss_mlp": 0.3293457, + "step": 533, + "time_per_iteration": 2.7452800273895264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105875, + "balance_loss_mlp": 1.07466602, + "epoch": 0.1027318199307426, + "flos": 662647011840.0, + "grad_norm": 0.07282647554851075, + "language_loss": 0.90315968, + "learning_rate": 0.0009861958672915283, + "loss": 0.91421843, + "num_input_tokens_seen": 44226720, + "router_z_loss_mlp": 0.31201172, + "step": 534, + "time_per_iteration": 2.8041269779205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096602, + "balance_loss_mlp": 1.0673244, + "epoch": 0.10292420161600616, + "flos": 682962928128.0, + "grad_norm": 0.058349855756870184, + "language_loss": 0.90126884, + "learning_rate": 0.0009861230731227201, + "loss": 0.9122349, + "num_input_tokens_seen": 44303600, + "router_z_loss_mlp": 0.29248047, + "step": 535, + "time_per_iteration": 2.8627805709838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108033, + "balance_loss_mlp": 1.07615674, + "epoch": 0.10311658330126972, + "flos": 490042414080.0, + "grad_norm": 0.091555564896082, + "language_loss": 0.91954774, + "learning_rate": 0.0009860500902241973, + "loss": 0.93062806, + "num_input_tokens_seen": 44370960, + "router_z_loss_mlp": 0.31884766, + "step": 536, + "time_per_iteration": 2.6052157878875732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120335, + "balance_loss_mlp": 1.08800602, + "epoch": 0.10330896498653329, + "flos": 431508446208.0, + "grad_norm": 0.0585767653270487, + "language_loss": 0.96574026, + "learning_rate": 0.0009859769186242942, + "loss": 0.97694361, + "num_input_tokens_seen": 44435584, + "router_z_loss_mlp": 0.32324219, + "step": 537, + "time_per_iteration": 2.51180362701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116517, + "balance_loss_mlp": 1.08571362, + "epoch": 0.10350134667179685, + "flos": 549330052608.0, + "grad_norm": 0.0744119924563098, + "language_loss": 0.8926785, + "learning_rate": 0.0009859035583514187, + "loss": 0.90384364, + "num_input_tokens_seen": 44505456, + "router_z_loss_mlp": 0.30834961, + "step": 538, + "time_per_iteration": 2.6369993686676025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146613, + "balance_loss_mlp": 1.11380613, + "epoch": 0.10369372835706041, + "flos": 640327569408.0, + "grad_norm": 0.09976070350989504, + "language_loss": 0.90389431, + "learning_rate": 0.0009858300094340517, + "loss": 0.91536051, + "num_input_tokens_seen": 44580208, + "router_z_loss_mlp": 0.328125, + "step": 539, + "time_per_iteration": 2.7695086002349854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150737, + "balance_loss_mlp": 1.11838388, + "epoch": 0.10388611004232397, + "flos": 521500598784.0, + "grad_norm": 0.08771902350159133, + "language_loss": 0.85304511, + "learning_rate": 0.0009857562719007473, + "loss": 0.8645525, + "num_input_tokens_seen": 44646576, + "router_z_loss_mlp": 0.32324219, + "step": 540, + "time_per_iteration": 2.59881329536438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144681, + "balance_loss_mlp": 1.11320961, + "epoch": 0.10407849172758753, + "flos": 702111946752.0, + "grad_norm": 0.07496368213999542, + "language_loss": 0.88249481, + "learning_rate": 0.0009856823457801331, + "loss": 0.89394164, + "num_input_tokens_seen": 44726752, + "router_z_loss_mlp": 0.31494141, + "step": 541, + "time_per_iteration": 2.873481035232544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119735, + "balance_loss_mlp": 1.08738184, + "epoch": 0.1042708734128511, + "flos": 502652736000.0, + "grad_norm": 0.06973546911765124, + "language_loss": 0.94998306, + "learning_rate": 0.00098560823110091, + "loss": 0.96118045, + "num_input_tokens_seen": 44795824, + "router_z_loss_mlp": 0.32373047, + "step": 542, + "time_per_iteration": 2.661374807357788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118044, + "balance_loss_mlp": 1.08757377, + "epoch": 0.10446325509811466, + "flos": 485331153408.0, + "grad_norm": 0.0792045331206184, + "language_loss": 0.95517921, + "learning_rate": 0.000985533927891851, + "loss": 0.96635967, + "num_input_tokens_seen": 44868496, + "router_z_loss_mlp": 0.30419922, + "step": 543, + "time_per_iteration": 2.7264697551727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096256, + "balance_loss_mlp": 1.06502366, + "epoch": 0.10465563678337822, + "flos": 568365590016.0, + "grad_norm": 0.0919664039836503, + "language_loss": 0.93718112, + "learning_rate": 0.0009854594361818044, + "loss": 0.94814372, + "num_input_tokens_seen": 44939888, + "router_z_loss_mlp": 0.31201172, + "step": 544, + "time_per_iteration": 2.6869821548461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099422, + "balance_loss_mlp": 1.0683322, + "epoch": 0.10484801846864178, + "flos": 625806853632.0, + "grad_norm": 0.1054615502202609, + "language_loss": 0.927598, + "learning_rate": 0.0009853847559996897, + "loss": 0.9385922, + "num_input_tokens_seen": 45012720, + "router_z_loss_mlp": 0.31103516, + "step": 545, + "time_per_iteration": 2.7953526973724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100313, + "balance_loss_mlp": 1.06772113, + "epoch": 0.10504040015390535, + "flos": 743063874048.0, + "grad_norm": 0.0768702593450629, + "language_loss": 0.92008656, + "learning_rate": 0.0009853098873745, + "loss": 0.93108964, + "num_input_tokens_seen": 45093744, + "router_z_loss_mlp": 0.32592773, + "step": 546, + "time_per_iteration": 3.0344293117523193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106321, + "balance_loss_mlp": 1.07430172, + "epoch": 0.10523278183916891, + "flos": 586382616576.0, + "grad_norm": 0.072035501246702, + "language_loss": 0.90983582, + "learning_rate": 0.0009852348303353027, + "loss": 0.92089903, + "num_input_tokens_seen": 45172784, + "router_z_loss_mlp": 0.32006836, + "step": 547, + "time_per_iteration": 2.7647972106933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110403, + "balance_loss_mlp": 1.07100892, + "epoch": 0.10542516352443247, + "flos": 869270552064.0, + "grad_norm": 0.07817580313906373, + "language_loss": 0.84611928, + "learning_rate": 0.000985159584911237, + "loss": 0.85715961, + "num_input_tokens_seen": 45255600, + "router_z_loss_mlp": 0.33007812, + "step": 548, + "time_per_iteration": 3.143122434616089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104478, + "balance_loss_mlp": 1.07212472, + "epoch": 0.10561754520969603, + "flos": 505182970368.0, + "grad_norm": 0.08898596974063745, + "language_loss": 0.91126573, + "learning_rate": 0.0009850841511315162, + "loss": 0.92231047, + "num_input_tokens_seen": 45325072, + "router_z_loss_mlp": 0.32348633, + "step": 549, + "time_per_iteration": 2.6164846420288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112982, + "balance_loss_mlp": 1.07946038, + "epoch": 0.1058099268949596, + "flos": 559690947072.0, + "grad_norm": 0.06224197989448247, + "language_loss": 0.92054999, + "learning_rate": 0.0009850085290254256, + "loss": 0.93167984, + "num_input_tokens_seen": 45401440, + "router_z_loss_mlp": 0.33520508, + "step": 550, + "time_per_iteration": 2.7473480701446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110676, + "balance_loss_mlp": 1.07431078, + "epoch": 0.10600230858022316, + "flos": 561773048832.0, + "grad_norm": 0.05678957528127819, + "language_loss": 0.88957977, + "learning_rate": 0.0009849327186223246, + "loss": 0.90064728, + "num_input_tokens_seen": 45479264, + "router_z_loss_mlp": 0.32446289, + "step": 551, + "time_per_iteration": 2.805126905441284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094878, + "balance_loss_mlp": 1.06297779, + "epoch": 0.10619469026548672, + "flos": 494079989760.0, + "grad_norm": 0.07906939671673464, + "language_loss": 0.95596325, + "learning_rate": 0.000984856719951646, + "loss": 0.96691203, + "num_input_tokens_seen": 45547328, + "router_z_loss_mlp": 0.31860352, + "step": 552, + "time_per_iteration": 2.5688273906707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105536, + "balance_loss_mlp": 1.07370734, + "epoch": 0.10638707195075028, + "flos": 675843678720.0, + "grad_norm": 0.06469368191660979, + "language_loss": 0.93170857, + "learning_rate": 0.0009847805330428943, + "loss": 0.94276392, + "num_input_tokens_seen": 45631152, + "router_z_loss_mlp": 0.31811523, + "step": 553, + "time_per_iteration": 2.8858227729797363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105116, + "balance_loss_mlp": 1.07080746, + "epoch": 0.10657945363601386, + "flos": 487811925504.0, + "grad_norm": 0.07365688544553677, + "language_loss": 0.94454086, + "learning_rate": 0.0009847041579256481, + "loss": 0.95559192, + "num_input_tokens_seen": 45698208, + "router_z_loss_mlp": 0.34326172, + "step": 554, + "time_per_iteration": 2.5912039279937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114154, + "balance_loss_mlp": 1.08158636, + "epoch": 0.10677183532127742, + "flos": 482706376704.0, + "grad_norm": 0.06731486395760358, + "language_loss": 0.95310724, + "learning_rate": 0.0009846275946295592, + "loss": 0.96424878, + "num_input_tokens_seen": 45766640, + "router_z_loss_mlp": 0.32568359, + "step": 555, + "time_per_iteration": 2.6071619987487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120557, + "balance_loss_mlp": 1.08755958, + "epoch": 0.10696421700654098, + "flos": 655917668352.0, + "grad_norm": 0.06239681935918944, + "language_loss": 0.88169777, + "learning_rate": 0.0009845508431843518, + "loss": 0.89290333, + "num_input_tokens_seen": 45851408, + "router_z_loss_mlp": 0.32983398, + "step": 556, + "time_per_iteration": 2.9906973838806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122479, + "balance_loss_mlp": 1.08986306, + "epoch": 0.10715659869180454, + "flos": 567483881472.0, + "grad_norm": 0.06803394611182671, + "language_loss": 0.89010829, + "learning_rate": 0.0009844739036198233, + "loss": 0.90133309, + "num_input_tokens_seen": 45919824, + "router_z_loss_mlp": 0.32592773, + "step": 557, + "time_per_iteration": 2.6462793350219727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113246, + "balance_loss_mlp": 1.09927225, + "epoch": 0.10734898037706811, + "flos": 540432829440.0, + "grad_norm": 0.0683091886411484, + "language_loss": 0.96000761, + "learning_rate": 0.0009843967759658448, + "loss": 0.97133219, + "num_input_tokens_seen": 45991024, + "router_z_loss_mlp": 0.33203125, + "step": 558, + "time_per_iteration": 2.664320707321167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01369087, + "balance_loss_mlp": 1.3546865, + "epoch": 0.10754136206233167, + "flos": 1475870008320.0, + "grad_norm": 0.12144998025248735, + "language_loss": 0.72767758, + "learning_rate": 0.0009843194602523592, + "loss": 0.74136841, + "num_input_tokens_seen": 46212736, + "router_z_loss_mlp": 0.14355469, + "step": 559, + "time_per_iteration": 4.836310148239136 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124853, + "balance_loss_mlp": 1.0925231, + "epoch": 0.10773374374759523, + "flos": 512155243008.0, + "grad_norm": 0.06725764235558847, + "language_loss": 0.96045369, + "learning_rate": 0.000984241956509384, + "loss": 0.97170222, + "num_input_tokens_seen": 46283920, + "router_z_loss_mlp": 0.32324219, + "step": 560, + "time_per_iteration": 2.7409372329711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134795, + "balance_loss_mlp": 1.10005689, + "epoch": 0.10792612543285879, + "flos": 496261016064.0, + "grad_norm": 0.08502468521942065, + "language_loss": 0.91520619, + "learning_rate": 0.0009841642647670078, + "loss": 0.92655414, + "num_input_tokens_seen": 46349664, + "router_z_loss_mlp": 0.34741211, + "step": 561, + "time_per_iteration": 2.5360167026519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134435, + "balance_loss_mlp": 1.10050821, + "epoch": 0.10811850711812235, + "flos": 735131317248.0, + "grad_norm": 0.08550854990342285, + "language_loss": 0.86122006, + "learning_rate": 0.0009840863850553944, + "loss": 0.87256444, + "num_input_tokens_seen": 46432688, + "router_z_loss_mlp": 0.33911133, + "step": 562, + "time_per_iteration": 3.0013930797576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118751, + "balance_loss_mlp": 1.08604038, + "epoch": 0.10831088880338592, + "flos": 611257024512.0, + "grad_norm": 0.07414056330929218, + "language_loss": 0.92513216, + "learning_rate": 0.0009840083174047782, + "loss": 0.93631971, + "num_input_tokens_seen": 46507216, + "router_z_loss_mlp": 0.3269043, + "step": 563, + "time_per_iteration": 2.761746883392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125103, + "balance_loss_mlp": 1.09353685, + "epoch": 0.10850327048864948, + "flos": 556022928384.0, + "grad_norm": 0.06849160846851732, + "language_loss": 0.86520386, + "learning_rate": 0.0009839300618454685, + "loss": 0.87645483, + "num_input_tokens_seen": 46590464, + "router_z_loss_mlp": 0.31518555, + "step": 564, + "time_per_iteration": 2.833545684814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124691, + "balance_loss_mlp": 1.09291005, + "epoch": 0.10869565217391304, + "flos": 602902476288.0, + "grad_norm": 0.06688991061359367, + "language_loss": 0.92471159, + "learning_rate": 0.0009838516184078466, + "loss": 0.9359585, + "num_input_tokens_seen": 46666240, + "router_z_loss_mlp": 0.31762695, + "step": 565, + "time_per_iteration": 2.838482618331909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112559, + "balance_loss_mlp": 1.09345102, + "epoch": 0.1088880338591766, + "flos": 525922288128.0, + "grad_norm": 0.08266802783800845, + "language_loss": 0.89073956, + "learning_rate": 0.0009837729871223669, + "loss": 0.90199542, + "num_input_tokens_seen": 46734288, + "router_z_loss_mlp": 0.3215332, + "step": 566, + "time_per_iteration": 2.6670589447021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134729, + "balance_loss_mlp": 1.10073042, + "epoch": 0.10908041554444017, + "flos": 619986921984.0, + "grad_norm": 0.06816497946354988, + "language_loss": 0.89503658, + "learning_rate": 0.0009836941680195568, + "loss": 0.90638387, + "num_input_tokens_seen": 46809920, + "router_z_loss_mlp": 0.34033203, + "step": 567, + "time_per_iteration": 2.7894582748413086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131677, + "balance_loss_mlp": 1.09691525, + "epoch": 0.10927279722970373, + "flos": 897740195328.0, + "grad_norm": 0.07371226629870802, + "language_loss": 0.8534497, + "learning_rate": 0.0009836151611300166, + "loss": 0.86476642, + "num_input_tokens_seen": 46889984, + "router_z_loss_mlp": 0.34765625, + "step": 568, + "time_per_iteration": 3.204500913619995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116051, + "balance_loss_mlp": 1.08467555, + "epoch": 0.10946517891496729, + "flos": 528408852480.0, + "grad_norm": 0.061952855977424344, + "language_loss": 0.96103537, + "learning_rate": 0.0009835359664844194, + "loss": 0.97219586, + "num_input_tokens_seen": 46959536, + "router_z_loss_mlp": 0.3137207, + "step": 569, + "time_per_iteration": 2.6154720783233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124163, + "balance_loss_mlp": 1.11014414, + "epoch": 0.10965756060023085, + "flos": 1559944714752.0, + "grad_norm": 0.03358522647050957, + "language_loss": 0.81036806, + "learning_rate": 0.0009834565841135114, + "loss": 0.82160974, + "num_input_tokens_seen": 47196960, + "router_z_loss_mlp": 0.140625, + "step": 570, + "time_per_iteration": 4.907090187072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112487, + "balance_loss_mlp": 1.09406638, + "epoch": 0.10984994228549443, + "flos": 512820163584.0, + "grad_norm": 0.08674533322611513, + "language_loss": 0.9339065, + "learning_rate": 0.0009833770140481118, + "loss": 0.9451552, + "num_input_tokens_seen": 47266560, + "router_z_loss_mlp": 0.30786133, + "step": 571, + "time_per_iteration": 2.694821357727051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121358, + "balance_loss_mlp": 1.09072113, + "epoch": 0.11004232397075799, + "flos": 954314307072.0, + "grad_norm": 0.07582699316256973, + "language_loss": 0.84126109, + "learning_rate": 0.000983297256319112, + "loss": 0.85247469, + "num_input_tokens_seen": 47348512, + "router_z_loss_mlp": 0.30664062, + "step": 572, + "time_per_iteration": 3.208728313446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144326, + "balance_loss_mlp": 1.11097169, + "epoch": 0.11023470565602154, + "flos": 487921024512.0, + "grad_norm": 0.07530566153242002, + "language_loss": 0.8789041, + "learning_rate": 0.000983217310957477, + "loss": 0.89034736, + "num_input_tokens_seen": 47425392, + "router_z_loss_mlp": 0.33349609, + "step": 573, + "time_per_iteration": 2.7521331310272217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113474, + "balance_loss_mlp": 1.1014812, + "epoch": 0.1104270873412851, + "flos": 655521970176.0, + "grad_norm": 0.08427122985019045, + "language_loss": 0.91161472, + "learning_rate": 0.000983137177994244, + "loss": 0.92296207, + "num_input_tokens_seen": 47502336, + "router_z_loss_mlp": 0.33300781, + "step": 574, + "time_per_iteration": 2.869795083999634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105984, + "balance_loss_mlp": 1.0752039, + "epoch": 0.11061946902654868, + "flos": 723097165824.0, + "grad_norm": 0.0803000190442887, + "language_loss": 0.87202144, + "learning_rate": 0.0009830568574605235, + "loss": 0.88308132, + "num_input_tokens_seen": 47583552, + "router_z_loss_mlp": 0.30737305, + "step": 575, + "time_per_iteration": 2.952505111694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111674, + "balance_loss_mlp": 1.07963109, + "epoch": 0.11081185071181224, + "flos": 835113397248.0, + "grad_norm": 0.07764025760375837, + "language_loss": 0.89234924, + "learning_rate": 0.0009829763493874992, + "loss": 0.90346599, + "num_input_tokens_seen": 47663440, + "router_z_loss_mlp": 0.3203125, + "step": 576, + "time_per_iteration": 3.0367727279663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110641, + "balance_loss_mlp": 1.07508206, + "epoch": 0.1110042323970758, + "flos": 608776252416.0, + "grad_norm": 0.06795308301133055, + "language_loss": 0.94366598, + "learning_rate": 0.0009828956538064264, + "loss": 0.95473009, + "num_input_tokens_seen": 47741920, + "router_z_loss_mlp": 0.31347656, + "step": 577, + "time_per_iteration": 2.783268928527832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091394, + "balance_loss_mlp": 1.0610671, + "epoch": 0.11119661408233936, + "flos": 595643604480.0, + "grad_norm": 0.0662915232098912, + "language_loss": 0.9183138, + "learning_rate": 0.0009828147707486344, + "loss": 0.92922771, + "num_input_tokens_seen": 47815136, + "router_z_loss_mlp": 0.30297852, + "step": 578, + "time_per_iteration": 2.6628670692443848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092993, + "balance_loss_mlp": 1.06109214, + "epoch": 0.11138899576760293, + "flos": 555573385728.0, + "grad_norm": 0.07355059798421615, + "language_loss": 0.87444091, + "learning_rate": 0.0009827337002455245, + "loss": 0.88537085, + "num_input_tokens_seen": 47881360, + "router_z_loss_mlp": 0.31884766, + "step": 579, + "time_per_iteration": 2.616842031478882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087398, + "balance_loss_mlp": 1.05857313, + "epoch": 0.11158137745286649, + "flos": 689418667008.0, + "grad_norm": 0.05531737995895799, + "language_loss": 0.89474124, + "learning_rate": 0.0009826524423285712, + "loss": 0.90561521, + "num_input_tokens_seen": 47962720, + "router_z_loss_mlp": 0.28808594, + "step": 580, + "time_per_iteration": 2.896409749984741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093471, + "balance_loss_mlp": 1.06393051, + "epoch": 0.11177375913813005, + "flos": 762688728576.0, + "grad_norm": 0.06807232662928764, + "language_loss": 0.9046967, + "learning_rate": 0.0009825709970293218, + "loss": 0.91563141, + "num_input_tokens_seen": 48035472, + "router_z_loss_mlp": 0.2956543, + "step": 581, + "time_per_iteration": 2.8843319416046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096208, + "balance_loss_mlp": 1.0669775, + "epoch": 0.11196614082339361, + "flos": 806211588096.0, + "grad_norm": 0.07053725402235117, + "language_loss": 0.96166003, + "learning_rate": 0.0009824893643793956, + "loss": 0.9726221, + "num_input_tokens_seen": 48116944, + "router_z_loss_mlp": 0.29248047, + "step": 582, + "time_per_iteration": 3.04577898979187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104715, + "balance_loss_mlp": 1.07305288, + "epoch": 0.11215852250865718, + "flos": 558350931456.0, + "grad_norm": 0.10752491555358674, + "language_loss": 0.89033759, + "learning_rate": 0.0009824075444104857, + "loss": 0.90138471, + "num_input_tokens_seen": 48187808, + "router_z_loss_mlp": 0.31689453, + "step": 583, + "time_per_iteration": 2.682020902633667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125233, + "balance_loss_mlp": 1.09497714, + "epoch": 0.11235090419392074, + "flos": 513322140672.0, + "grad_norm": 0.06606619546840543, + "language_loss": 0.94941097, + "learning_rate": 0.000982325537154357, + "loss": 0.9606632, + "num_input_tokens_seen": 48254464, + "router_z_loss_mlp": 0.30224609, + "step": 584, + "time_per_iteration": 2.577632427215576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122311, + "balance_loss_mlp": 1.09045827, + "epoch": 0.1125432858791843, + "flos": 491209311744.0, + "grad_norm": 0.07452844115700766, + "language_loss": 0.95190644, + "learning_rate": 0.0009822433426428484, + "loss": 0.96312958, + "num_input_tokens_seen": 48318784, + "router_z_loss_mlp": 0.31860352, + "step": 585, + "time_per_iteration": 2.560591220855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126565, + "balance_loss_mlp": 1.09280539, + "epoch": 0.11273566756444786, + "flos": 510476193792.0, + "grad_norm": 0.11434848401200806, + "language_loss": 0.87964213, + "learning_rate": 0.0009821609609078697, + "loss": 0.89090776, + "num_input_tokens_seen": 48389248, + "router_z_loss_mlp": 0.3371582, + "step": 586, + "time_per_iteration": 2.633925437927246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118464, + "balance_loss_mlp": 1.08785152, + "epoch": 0.11292804924971142, + "flos": 622149009408.0, + "grad_norm": 0.08000190427267627, + "language_loss": 0.905334, + "learning_rate": 0.0009820783919814045, + "loss": 0.91651857, + "num_input_tokens_seen": 48463312, + "router_z_loss_mlp": 0.3059082, + "step": 587, + "time_per_iteration": 2.806704044342041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111289, + "balance_loss_mlp": 1.07857847, + "epoch": 0.113120430934975, + "flos": 477811823616.0, + "grad_norm": 0.09357252991594707, + "language_loss": 0.83955467, + "learning_rate": 0.0009819956358955095, + "loss": 0.8506676, + "num_input_tokens_seen": 48531856, + "router_z_loss_mlp": 0.32714844, + "step": 588, + "time_per_iteration": 2.5903711318969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109097, + "balance_loss_mlp": 1.07455039, + "epoch": 0.11331281262023855, + "flos": 466801975296.0, + "grad_norm": 0.06610764616840299, + "language_loss": 0.85348701, + "learning_rate": 0.0009819126926823127, + "loss": 0.86457801, + "num_input_tokens_seen": 48596640, + "router_z_loss_mlp": 0.34570312, + "step": 589, + "time_per_iteration": 2.5726494789123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108183, + "balance_loss_mlp": 1.07535291, + "epoch": 0.11350519430550211, + "flos": 650164727808.0, + "grad_norm": 0.06035980490561805, + "language_loss": 0.87806922, + "learning_rate": 0.000981829562374016, + "loss": 0.8891511, + "num_input_tokens_seen": 48669648, + "router_z_loss_mlp": 0.328125, + "step": 590, + "time_per_iteration": 2.7960643768310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112987, + "balance_loss_mlp": 1.08041859, + "epoch": 0.11369757599076567, + "flos": 557547798528.0, + "grad_norm": 0.08830164474684658, + "language_loss": 0.98550045, + "learning_rate": 0.0009817462450028933, + "loss": 0.99663031, + "num_input_tokens_seen": 48737392, + "router_z_loss_mlp": 0.32568359, + "step": 591, + "time_per_iteration": 2.654860734939575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107596, + "balance_loss_mlp": 1.07526684, + "epoch": 0.11388995767602925, + "flos": 570774988800.0, + "grad_norm": 0.06245390963608315, + "language_loss": 0.86587834, + "learning_rate": 0.0009816627406012916, + "loss": 0.87695432, + "num_input_tokens_seen": 48817136, + "router_z_loss_mlp": 0.32348633, + "step": 592, + "time_per_iteration": 2.8017733097076416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101808, + "balance_loss_mlp": 1.07074225, + "epoch": 0.1140823393612928, + "flos": 740069540352.0, + "grad_norm": 0.06581053360364857, + "language_loss": 0.8595314, + "learning_rate": 0.0009815790492016295, + "loss": 0.87054944, + "num_input_tokens_seen": 48895808, + "router_z_loss_mlp": 0.31030273, + "step": 593, + "time_per_iteration": 2.9602174758911133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097875, + "balance_loss_mlp": 1.06666636, + "epoch": 0.11427472104655637, + "flos": 698694211584.0, + "grad_norm": 0.07124053574400792, + "language_loss": 0.87982339, + "learning_rate": 0.0009814951708363993, + "loss": 0.89080215, + "num_input_tokens_seen": 48967456, + "router_z_loss_mlp": 0.31201172, + "step": 594, + "time_per_iteration": 2.818460702896118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167391, + "balance_loss_mlp": 1.15413451, + "epoch": 0.11446710273181993, + "flos": 1476387952128.0, + "grad_norm": 0.04038129773095179, + "language_loss": 0.77990985, + "learning_rate": 0.0009814111055381654, + "loss": 0.79158378, + "num_input_tokens_seen": 49193152, + "router_z_loss_mlp": 0.1328125, + "step": 595, + "time_per_iteration": 4.776912450790405 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110386, + "balance_loss_mlp": 1.07250798, + "epoch": 0.1146594844170835, + "flos": 494641603584.0, + "grad_norm": 0.1404346857169784, + "language_loss": 0.89489102, + "learning_rate": 0.0009813268533395648, + "loss": 0.90592968, + "num_input_tokens_seen": 49260960, + "router_z_loss_mlp": 0.3137207, + "step": 596, + "time_per_iteration": 2.562816858291626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115324, + "balance_loss_mlp": 1.08344746, + "epoch": 0.11485186610234706, + "flos": 474596319744.0, + "grad_norm": 0.07456374098915484, + "language_loss": 0.89145029, + "learning_rate": 0.0009812424142733073, + "loss": 0.90260351, + "num_input_tokens_seen": 49327616, + "router_z_loss_mlp": 0.31884766, + "step": 597, + "time_per_iteration": 2.5198655128479004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123971, + "balance_loss_mlp": 1.0946219, + "epoch": 0.11504424778761062, + "flos": 730858014720.0, + "grad_norm": 0.05033183127205697, + "language_loss": 0.86898923, + "learning_rate": 0.000981157788372175, + "loss": 0.88022888, + "num_input_tokens_seen": 49412864, + "router_z_loss_mlp": 0.29345703, + "step": 598, + "time_per_iteration": 3.004558563232422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155063, + "balance_loss_mlp": 1.12290049, + "epoch": 0.11523662947287418, + "flos": 545539788288.0, + "grad_norm": 0.07554757352201513, + "language_loss": 0.90216064, + "learning_rate": 0.0009810729756690223, + "loss": 0.91371131, + "num_input_tokens_seen": 49483584, + "router_z_loss_mlp": 0.3215332, + "step": 599, + "time_per_iteration": 2.7165520191192627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149643, + "balance_loss_mlp": 1.11790919, + "epoch": 0.11542901115813775, + "flos": 774737436672.0, + "grad_norm": 0.08801397326806587, + "language_loss": 0.92855275, + "learning_rate": 0.0009809879761967766, + "loss": 0.94004917, + "num_input_tokens_seen": 49563568, + "router_z_loss_mlp": 0.31738281, + "step": 600, + "time_per_iteration": 2.9548492431640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115619, + "balance_loss_mlp": 1.12004542, + "epoch": 0.11562139284340131, + "flos": 730585972224.0, + "grad_norm": 0.08285308963026158, + "language_loss": 0.87716347, + "learning_rate": 0.0009809027899884378, + "loss": 0.8887254, + "num_input_tokens_seen": 49640800, + "router_z_loss_mlp": 0.36157227, + "step": 601, + "time_per_iteration": 2.9107346534729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131924, + "balance_loss_mlp": 1.10085821, + "epoch": 0.11581377452866487, + "flos": 535589148672.0, + "grad_norm": 0.07059046613839054, + "language_loss": 0.89834028, + "learning_rate": 0.0009808174170770779, + "loss": 0.90965956, + "num_input_tokens_seen": 49721872, + "router_z_loss_mlp": 0.31079102, + "step": 602, + "time_per_iteration": 2.79127836227417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217718, + "balance_loss_mlp": 1.20541608, + "epoch": 0.11600615621392843, + "flos": 1554968613888.0, + "grad_norm": 0.07528653751738872, + "language_loss": 0.84898245, + "learning_rate": 0.0009807318574958418, + "loss": 0.86115962, + "num_input_tokens_seen": 49951472, + "router_z_loss_mlp": 0.12304688, + "step": 603, + "time_per_iteration": 4.862261772155762 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103115, + "balance_loss_mlp": 1.07238269, + "epoch": 0.116198537899192, + "flos": 537178037760.0, + "grad_norm": 0.08106568577848162, + "language_loss": 0.94434869, + "learning_rate": 0.0009806461112779462, + "loss": 0.95537978, + "num_input_tokens_seen": 50021136, + "router_z_loss_mlp": 0.30737305, + "step": 604, + "time_per_iteration": 2.600008249282837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097342, + "balance_loss_mlp": 1.06427336, + "epoch": 0.11639091958445556, + "flos": 453970483200.0, + "grad_norm": 0.09761910402267754, + "language_loss": 0.89590895, + "learning_rate": 0.0009805601784566814, + "loss": 0.90688241, + "num_input_tokens_seen": 50083888, + "router_z_loss_mlp": 0.33056641, + "step": 605, + "time_per_iteration": 2.4687013626098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097807, + "balance_loss_mlp": 1.06635928, + "epoch": 0.11658330126971912, + "flos": 554815332864.0, + "grad_norm": 0.0628453025897625, + "language_loss": 0.96235836, + "learning_rate": 0.0009804740590654089, + "loss": 0.97333646, + "num_input_tokens_seen": 50151744, + "router_z_loss_mlp": 0.31469727, + "step": 606, + "time_per_iteration": 2.654134750366211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109435, + "balance_loss_mlp": 1.0789417, + "epoch": 0.11677568295498268, + "flos": 716025968640.0, + "grad_norm": 0.07837472156111998, + "language_loss": 0.90884066, + "learning_rate": 0.0009803877531375635, + "loss": 0.91993499, + "num_input_tokens_seen": 50221248, + "router_z_loss_mlp": 0.30493164, + "step": 607, + "time_per_iteration": 2.825778007507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112529, + "balance_loss_mlp": 1.08074808, + "epoch": 0.11696806464024626, + "flos": 609474668544.0, + "grad_norm": 0.07263848878870109, + "language_loss": 0.91923869, + "learning_rate": 0.0009803012607066523, + "loss": 0.93036401, + "num_input_tokens_seen": 50293792, + "router_z_loss_mlp": 0.31787109, + "step": 608, + "time_per_iteration": 2.721005916595459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101062, + "balance_loss_mlp": 1.06980491, + "epoch": 0.11716044632550981, + "flos": 520127087616.0, + "grad_norm": 0.06980646294906427, + "language_loss": 0.9077962, + "learning_rate": 0.0009802145818062543, + "loss": 0.91880679, + "num_input_tokens_seen": 50367760, + "router_z_loss_mlp": 0.31225586, + "step": 609, + "time_per_iteration": 2.707643985748291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102401, + "balance_loss_mlp": 1.07035792, + "epoch": 0.11735282801077337, + "flos": 507246133248.0, + "grad_norm": 0.07162886221417876, + "language_loss": 0.9293434, + "learning_rate": 0.0009801277164700212, + "loss": 0.9403674, + "num_input_tokens_seen": 50435664, + "router_z_loss_mlp": 0.3203125, + "step": 610, + "time_per_iteration": 2.6389639377593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094537, + "balance_loss_mlp": 1.06323278, + "epoch": 0.11754520969603693, + "flos": 686339965440.0, + "grad_norm": 0.07220465483683103, + "language_loss": 0.90727574, + "learning_rate": 0.0009800406647316776, + "loss": 0.91822106, + "num_input_tokens_seen": 50514144, + "router_z_loss_mlp": 0.31274414, + "step": 611, + "time_per_iteration": 2.8033382892608643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066854, + "balance_loss_mlp": 1.05369329, + "epoch": 0.1177375913813005, + "flos": 1541673022464.0, + "grad_norm": 0.030783707978337852, + "language_loss": 0.76914459, + "learning_rate": 0.0009799534266250196, + "loss": 0.77981311, + "num_input_tokens_seen": 50738448, + "router_z_loss_mlp": 0.13183594, + "step": 612, + "time_per_iteration": 4.777275562286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116404, + "balance_loss_mlp": 1.08307314, + "epoch": 0.11792997306656407, + "flos": 520269682176.0, + "grad_norm": 0.07589987368124408, + "language_loss": 0.8961159, + "learning_rate": 0.000979866002183916, + "loss": 0.90727997, + "num_input_tokens_seen": 50809328, + "router_z_loss_mlp": 0.33325195, + "step": 613, + "time_per_iteration": 2.6848883628845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109453, + "balance_loss_mlp": 1.07719529, + "epoch": 0.11812235475182763, + "flos": 665980379136.0, + "grad_norm": 0.08667718058784188, + "language_loss": 0.91197205, + "learning_rate": 0.0009797783914423082, + "loss": 0.92306662, + "num_input_tokens_seen": 50887728, + "router_z_loss_mlp": 0.32250977, + "step": 614, + "time_per_iteration": 2.832414388656616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105189, + "balance_loss_mlp": 1.07140493, + "epoch": 0.11831473643709119, + "flos": 621021399552.0, + "grad_norm": 0.06050640051516142, + "language_loss": 0.85425436, + "learning_rate": 0.0009796905944342094, + "loss": 0.86530626, + "num_input_tokens_seen": 50966160, + "router_z_loss_mlp": 0.33813477, + "step": 615, + "time_per_iteration": 2.8220455646514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112849, + "balance_loss_mlp": 1.07913685, + "epoch": 0.11850711812235475, + "flos": 456438108672.0, + "grad_norm": 0.0714748534502384, + "language_loss": 0.893188, + "learning_rate": 0.0009796026111937057, + "loss": 0.90431643, + "num_input_tokens_seen": 51035712, + "router_z_loss_mlp": 0.3371582, + "step": 616, + "time_per_iteration": 2.590566873550415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102524, + "balance_loss_mlp": 1.07005119, + "epoch": 0.11869949980761832, + "flos": 513598565376.0, + "grad_norm": 0.06492309219220607, + "language_loss": 0.89778733, + "learning_rate": 0.0009795144417549552, + "loss": 0.90881252, + "num_input_tokens_seen": 51108656, + "router_z_loss_mlp": 0.32470703, + "step": 617, + "time_per_iteration": 2.672914505004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109626, + "balance_loss_mlp": 1.0773685, + "epoch": 0.11889188149288188, + "flos": 534732171264.0, + "grad_norm": 0.057544425945024125, + "language_loss": 0.90660846, + "learning_rate": 0.0009794260861521883, + "loss": 0.9177047, + "num_input_tokens_seen": 51185552, + "router_z_loss_mlp": 0.32250977, + "step": 618, + "time_per_iteration": 2.817354202270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102516, + "balance_loss_mlp": 1.07009149, + "epoch": 0.11908426317814544, + "flos": 498344527872.0, + "grad_norm": 0.0773697745436404, + "language_loss": 0.87738883, + "learning_rate": 0.0009793375444197075, + "loss": 0.88841403, + "num_input_tokens_seen": 51255808, + "router_z_loss_mlp": 0.32397461, + "step": 619, + "time_per_iteration": 2.607475996017456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011109, + "balance_loss_mlp": 1.07697332, + "epoch": 0.119276644863409, + "flos": 659598833664.0, + "grad_norm": 0.06767977381214116, + "language_loss": 0.86337721, + "learning_rate": 0.000979248816591888, + "loss": 0.87448615, + "num_input_tokens_seen": 51329408, + "router_z_loss_mlp": 0.33935547, + "step": 620, + "time_per_iteration": 2.758866548538208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098794, + "balance_loss_mlp": 1.06667948, + "epoch": 0.11946902654867257, + "flos": 758396487168.0, + "grad_norm": 0.06819106164994826, + "language_loss": 0.87032986, + "learning_rate": 0.0009791599027031766, + "loss": 0.88131785, + "num_input_tokens_seen": 51408784, + "router_z_loss_mlp": 0.32128906, + "step": 621, + "time_per_iteration": 3.029431104660034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088156, + "balance_loss_mlp": 1.05611241, + "epoch": 0.11966140823393613, + "flos": 680697533952.0, + "grad_norm": 0.0732554324646167, + "language_loss": 0.87112588, + "learning_rate": 0.0009790708027880932, + "loss": 0.88200748, + "num_input_tokens_seen": 51482592, + "router_z_loss_mlp": 0.32055664, + "step": 622, + "time_per_iteration": 2.855576992034912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056461, + "balance_loss_mlp": 1.04444504, + "epoch": 0.11985378991919969, + "flos": 1450268070912.0, + "grad_norm": 0.03732324883573809, + "language_loss": 0.77427292, + "learning_rate": 0.0009789815168812293, + "loss": 0.78483754, + "num_input_tokens_seen": 51712240, + "router_z_loss_mlp": 0.12011719, + "step": 623, + "time_per_iteration": 4.840993165969849 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108671, + "balance_loss_mlp": 1.0551914, + "epoch": 0.12004617160446325, + "flos": 527586780672.0, + "grad_norm": 0.07309096746678648, + "language_loss": 0.94236648, + "learning_rate": 0.0009788920450172487, + "loss": 0.9532336, + "num_input_tokens_seen": 51781440, + "router_z_loss_mlp": 0.31518555, + "step": 624, + "time_per_iteration": 2.6301677227020264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102663, + "balance_loss_mlp": 1.07023823, + "epoch": 0.12023855328972682, + "flos": 473980861440.0, + "grad_norm": 0.15739190650861204, + "language_loss": 0.91515559, + "learning_rate": 0.0009788023872308875, + "loss": 0.92618221, + "num_input_tokens_seen": 51845424, + "router_z_loss_mlp": 0.32421875, + "step": 625, + "time_per_iteration": 2.506446361541748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014454, + "balance_loss_mlp": 1.0033915, + "epoch": 0.12043093497499038, + "flos": 1530954155520.0, + "grad_norm": 0.02216054665264375, + "language_loss": 0.75428998, + "learning_rate": 0.0009787125435569539, + "loss": 0.76443458, + "num_input_tokens_seen": 52076496, + "router_z_loss_mlp": 0.11083984, + "step": 626, + "time_per_iteration": 4.713289260864258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114644, + "balance_loss_mlp": 1.08391225, + "epoch": 0.12062331666025394, + "flos": 539571469824.0, + "grad_norm": 0.0672242080300053, + "language_loss": 0.94766486, + "learning_rate": 0.0009786225140303285, + "loss": 0.95881128, + "num_input_tokens_seen": 52143072, + "router_z_loss_mlp": 0.30761719, + "step": 627, + "time_per_iteration": 2.61875057220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011267, + "balance_loss_mlp": 1.09503818, + "epoch": 0.1208156983455175, + "flos": 511634327040.0, + "grad_norm": 0.06510849521455, + "language_loss": 0.925771, + "learning_rate": 0.0009785322986859634, + "loss": 0.93703806, + "num_input_tokens_seen": 52211888, + "router_z_loss_mlp": 0.31640625, + "step": 628, + "time_per_iteration": 2.6567625999450684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141777, + "balance_loss_mlp": 1.11059177, + "epoch": 0.12100808003078108, + "flos": 596195043840.0, + "grad_norm": 0.06735600063735754, + "language_loss": 0.93719506, + "learning_rate": 0.0009784418975588838, + "loss": 0.94861281, + "num_input_tokens_seen": 52283696, + "router_z_loss_mlp": 0.31152344, + "step": 629, + "time_per_iteration": 2.697376012802124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122983, + "balance_loss_mlp": 1.09222674, + "epoch": 0.12120046171604464, + "flos": 522698019840.0, + "grad_norm": 0.47103484407124013, + "language_loss": 0.93927598, + "learning_rate": 0.0009783513106841862, + "loss": 0.95050573, + "num_input_tokens_seen": 52358624, + "router_z_loss_mlp": 0.30761719, + "step": 630, + "time_per_iteration": 2.7226808071136475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143332, + "balance_loss_mlp": 1.13179243, + "epoch": 0.1213928434013082, + "flos": 1553605277184.0, + "grad_norm": 0.056788624646596834, + "language_loss": 0.76732707, + "learning_rate": 0.00097826053809704, + "loss": 0.77876031, + "num_input_tokens_seen": 52591248, + "router_z_loss_mlp": 0.11523438, + "step": 631, + "time_per_iteration": 4.948111295700073 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228128, + "balance_loss_mlp": 1.19219875, + "epoch": 0.12158522508657175, + "flos": 495143580672.0, + "grad_norm": 0.06834333100250278, + "language_loss": 0.88515621, + "learning_rate": 0.0009781695798326854, + "loss": 0.89743745, + "num_input_tokens_seen": 52659920, + "router_z_loss_mlp": 0.35961914, + "step": 632, + "time_per_iteration": 2.5616555213928223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267845, + "balance_loss_mlp": 1.23050833, + "epoch": 0.12177760677183531, + "flos": 475335433728.0, + "grad_norm": 0.1009303482431908, + "language_loss": 0.88543177, + "learning_rate": 0.0009780784359264365, + "loss": 0.89811015, + "num_input_tokens_seen": 52728832, + "router_z_loss_mlp": 0.37329102, + "step": 633, + "time_per_iteration": 2.597935438156128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01265484, + "balance_loss_mlp": 1.25370574, + "epoch": 0.12196998845709889, + "flos": 1467630351360.0, + "grad_norm": 0.08843071113371018, + "language_loss": 0.74188697, + "learning_rate": 0.0009779871064136778, + "loss": 0.75454181, + "num_input_tokens_seen": 52949776, + "router_z_loss_mlp": 0.11767578, + "step": 634, + "time_per_iteration": 4.768415451049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235432, + "balance_loss_mlp": 1.19976473, + "epoch": 0.12216237014236245, + "flos": 586279309824.0, + "grad_norm": 0.0829698455775257, + "language_loss": 0.88074899, + "learning_rate": 0.000977895591329867, + "loss": 0.89310336, + "num_input_tokens_seen": 53027184, + "router_z_loss_mlp": 0.35668945, + "step": 635, + "time_per_iteration": 2.7918457984924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214994, + "balance_loss_mlp": 1.17720437, + "epoch": 0.12235475182762601, + "flos": 597721324032.0, + "grad_norm": 0.0916527997361875, + "language_loss": 0.87791145, + "learning_rate": 0.000977803890710533, + "loss": 0.89006138, + "num_input_tokens_seen": 53101072, + "router_z_loss_mlp": 0.37792969, + "step": 636, + "time_per_iteration": 2.7248313426971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186705, + "balance_loss_mlp": 1.1509428, + "epoch": 0.12254713351288957, + "flos": 497487550464.0, + "grad_norm": 0.0702522126388857, + "language_loss": 0.93856937, + "learning_rate": 0.0009777120045912774, + "loss": 0.95043641, + "num_input_tokens_seen": 53172992, + "router_z_loss_mlp": 0.35766602, + "step": 637, + "time_per_iteration": 2.6079726219177246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180704, + "balance_loss_mlp": 1.14236617, + "epoch": 0.12273951519815314, + "flos": 605565130752.0, + "grad_norm": 0.06645311005239844, + "language_loss": 0.90599251, + "learning_rate": 0.0009776199330077736, + "loss": 0.91779959, + "num_input_tokens_seen": 53248256, + "router_z_loss_mlp": 0.38330078, + "step": 638, + "time_per_iteration": 2.7671282291412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196025, + "balance_loss_mlp": 1.15940344, + "epoch": 0.1229318968834167, + "flos": 597578729472.0, + "grad_norm": 0.09015200479441979, + "language_loss": 0.93140519, + "learning_rate": 0.0009775276759957667, + "loss": 0.94336545, + "num_input_tokens_seen": 53318960, + "router_z_loss_mlp": 0.36621094, + "step": 639, + "time_per_iteration": 2.6990442276000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179898, + "balance_loss_mlp": 1.14265716, + "epoch": 0.12312427856868026, + "flos": 678082931712.0, + "grad_norm": 0.08188642922116089, + "language_loss": 0.90714514, + "learning_rate": 0.0009774352335910745, + "loss": 0.91894412, + "num_input_tokens_seen": 53389120, + "router_z_loss_mlp": 0.37280273, + "step": 640, + "time_per_iteration": 2.7950265407562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115004, + "balance_loss_mlp": 1.11658967, + "epoch": 0.12331666025394382, + "flos": 608656978944.0, + "grad_norm": 0.07361380744806716, + "language_loss": 0.95549798, + "learning_rate": 0.000977342605829586, + "loss": 0.96699834, + "num_input_tokens_seen": 53459056, + "router_z_loss_mlp": 0.3347168, + "step": 641, + "time_per_iteration": 2.6966538429260254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140018, + "balance_loss_mlp": 1.10497069, + "epoch": 0.12350904193920739, + "flos": 762172194816.0, + "grad_norm": 0.08211004604029591, + "language_loss": 0.86708105, + "learning_rate": 0.0009772497927472623, + "loss": 0.87848121, + "num_input_tokens_seen": 53541552, + "router_z_loss_mlp": 0.35083008, + "step": 642, + "time_per_iteration": 3.050595998764038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121507, + "balance_loss_mlp": 1.0852437, + "epoch": 0.12370142362447095, + "flos": 540699079680.0, + "grad_norm": 0.0716743258864478, + "language_loss": 0.85363436, + "learning_rate": 0.0009771567943801368, + "loss": 0.86484945, + "num_input_tokens_seen": 53611520, + "router_z_loss_mlp": 0.36254883, + "step": 643, + "time_per_iteration": 2.627019166946411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112067, + "balance_loss_mlp": 1.07744884, + "epoch": 0.12389380530973451, + "flos": 547848852480.0, + "grad_norm": 0.06992166814052157, + "language_loss": 0.89936745, + "learning_rate": 0.0009770636107643152, + "loss": 0.91048813, + "num_input_tokens_seen": 53683888, + "router_z_loss_mlp": 0.34643555, + "step": 644, + "time_per_iteration": 2.696233034133911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102963, + "balance_loss_mlp": 1.06846356, + "epoch": 0.12408618699499807, + "flos": 540048715776.0, + "grad_norm": 0.06268128655507912, + "language_loss": 0.88181639, + "learning_rate": 0.0009769702419359738, + "loss": 0.89284605, + "num_input_tokens_seen": 53751888, + "router_z_loss_mlp": 0.3449707, + "step": 645, + "time_per_iteration": 2.61401104927063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116268, + "balance_loss_mlp": 1.0810535, + "epoch": 0.12427856868026164, + "flos": 745451513856.0, + "grad_norm": 0.07610574883038115, + "language_loss": 0.89730537, + "learning_rate": 0.000976876687931362, + "loss": 0.90846807, + "num_input_tokens_seen": 53827648, + "router_z_loss_mlp": 0.35229492, + "step": 646, + "time_per_iteration": 2.999408721923828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131315, + "balance_loss_mlp": 1.09622002, + "epoch": 0.1244709503655252, + "flos": 533460556800.0, + "grad_norm": 0.19449531308307466, + "language_loss": 0.85410094, + "learning_rate": 0.0009767829487868005, + "loss": 0.86541414, + "num_input_tokens_seen": 53896400, + "router_z_loss_mlp": 0.35107422, + "step": 647, + "time_per_iteration": 2.617666721343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138117, + "balance_loss_mlp": 1.10159075, + "epoch": 0.12466333205078876, + "flos": 507847034880.0, + "grad_norm": 0.07509451505155453, + "language_loss": 0.89358151, + "learning_rate": 0.000976689024538682, + "loss": 0.90496266, + "num_input_tokens_seen": 53965904, + "router_z_loss_mlp": 0.36499023, + "step": 648, + "time_per_iteration": 2.5929009914398193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138062, + "balance_loss_mlp": 1.10110736, + "epoch": 0.12485571373605232, + "flos": 681023420928.0, + "grad_norm": 0.07057439208121223, + "language_loss": 0.87662494, + "learning_rate": 0.0009765949152234716, + "loss": 0.8880055, + "num_input_tokens_seen": 54049792, + "router_z_loss_mlp": 0.36962891, + "step": 649, + "time_per_iteration": 2.874701976776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147728, + "balance_loss_mlp": 1.13504386, + "epoch": 0.1250480954213159, + "flos": 1329402668544.0, + "grad_norm": 0.04527818124304351, + "language_loss": 0.78686082, + "learning_rate": 0.0009765006208777055, + "loss": 0.79833812, + "num_input_tokens_seen": 54262432, + "router_z_loss_mlp": 0.12695312, + "step": 650, + "time_per_iteration": 4.680933713912964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138039, + "balance_loss_mlp": 1.10287213, + "epoch": 0.12524047710657946, + "flos": 938140683264.0, + "grad_norm": 0.08375968037938068, + "language_loss": 0.82443976, + "learning_rate": 0.0009764061415379919, + "loss": 0.83582014, + "num_input_tokens_seen": 54351568, + "router_z_loss_mlp": 0.35205078, + "step": 651, + "time_per_iteration": 3.2550604343414307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135369, + "balance_loss_mlp": 1.09774697, + "epoch": 0.12543285879184302, + "flos": 513642235392.0, + "grad_norm": 0.07146085627000143, + "language_loss": 0.89363486, + "learning_rate": 0.0009763114772410109, + "loss": 0.90498853, + "num_input_tokens_seen": 54418944, + "router_z_loss_mlp": 0.3762207, + "step": 652, + "time_per_iteration": 2.5937142372131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139745, + "balance_loss_mlp": 1.10419679, + "epoch": 0.12562524047710658, + "flos": 717991617024.0, + "grad_norm": 0.07913079577836896, + "language_loss": 0.87230957, + "learning_rate": 0.0009762166280235146, + "loss": 0.88370705, + "num_input_tokens_seen": 54495312, + "router_z_loss_mlp": 0.35571289, + "step": 653, + "time_per_iteration": 2.96162748336792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147653, + "balance_loss_mlp": 1.10974443, + "epoch": 0.12581762216237014, + "flos": 563441923584.0, + "grad_norm": 0.06492259826928527, + "language_loss": 0.87890899, + "learning_rate": 0.0009761215939223267, + "loss": 0.89038551, + "num_input_tokens_seen": 54566832, + "router_z_loss_mlp": 0.37890625, + "step": 654, + "time_per_iteration": 2.714641809463501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145182, + "balance_loss_mlp": 1.1077261, + "epoch": 0.1260100038476337, + "flos": 481642785792.0, + "grad_norm": 0.07920721431290144, + "language_loss": 0.86875665, + "learning_rate": 0.0009760263749743428, + "loss": 0.88020849, + "num_input_tokens_seen": 54632128, + "router_z_loss_mlp": 0.37426758, + "step": 655, + "time_per_iteration": 2.547499179840088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145343, + "balance_loss_mlp": 1.11074805, + "epoch": 0.12620238553289725, + "flos": 575269461504.0, + "grad_norm": 0.06357383816141966, + "language_loss": 0.90176344, + "learning_rate": 0.0009759309712165299, + "loss": 0.91321695, + "num_input_tokens_seen": 54707600, + "router_z_loss_mlp": 0.34570312, + "step": 656, + "time_per_iteration": 2.693922996520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137509, + "balance_loss_mlp": 1.103248, + "epoch": 0.12639476721816084, + "flos": 530909973504.0, + "grad_norm": 0.07169490366111804, + "language_loss": 0.93258119, + "learning_rate": 0.0009758353826859272, + "loss": 0.94395626, + "num_input_tokens_seen": 54776704, + "router_z_loss_mlp": 0.34277344, + "step": 657, + "time_per_iteration": 2.5744612216949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139269, + "balance_loss_mlp": 1.10314822, + "epoch": 0.1265871489034244, + "flos": 689654393856.0, + "grad_norm": 0.06860158128637554, + "language_loss": 0.89679217, + "learning_rate": 0.0009757396094196456, + "loss": 0.90818477, + "num_input_tokens_seen": 54851744, + "router_z_loss_mlp": 0.36132812, + "step": 658, + "time_per_iteration": 2.851700782775879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143308, + "balance_loss_mlp": 1.10675859, + "epoch": 0.12677953058868796, + "flos": 536863735296.0, + "grad_norm": 0.0696485175834739, + "language_loss": 0.84555894, + "learning_rate": 0.0009756436514548673, + "loss": 0.85699201, + "num_input_tokens_seen": 54932576, + "router_z_loss_mlp": 0.36523438, + "step": 659, + "time_per_iteration": 2.7971351146698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122244, + "balance_loss_mlp": 1.08800757, + "epoch": 0.12697191227395152, + "flos": 518749194240.0, + "grad_norm": 0.05327633329409036, + "language_loss": 0.88343394, + "learning_rate": 0.0009755475088288466, + "loss": 0.89465636, + "num_input_tokens_seen": 55007296, + "router_z_loss_mlp": 0.34228516, + "step": 660, + "time_per_iteration": 2.670555353164673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103127, + "balance_loss_mlp": 1.06903291, + "epoch": 0.12716429395921508, + "flos": 566341714944.0, + "grad_norm": 0.06801254087798507, + "language_loss": 0.90210187, + "learning_rate": 0.0009754511815789095, + "loss": 0.91313314, + "num_input_tokens_seen": 55079312, + "router_z_loss_mlp": 0.34106445, + "step": 661, + "time_per_iteration": 2.748224973678589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102552, + "balance_loss_mlp": 1.06798172, + "epoch": 0.12735667564447864, + "flos": 513844466688.0, + "grad_norm": 0.06975204014846512, + "language_loss": 0.86245489, + "learning_rate": 0.0009753546697424533, + "loss": 0.87348044, + "num_input_tokens_seen": 55151824, + "router_z_loss_mlp": 0.34594727, + "step": 662, + "time_per_iteration": 2.664799213409424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092108, + "balance_loss_mlp": 1.05863369, + "epoch": 0.1275490573297422, + "flos": 541023556608.0, + "grad_norm": 0.05485824904298714, + "language_loss": 0.90572149, + "learning_rate": 0.0009752579733569475, + "loss": 0.91664255, + "num_input_tokens_seen": 55224368, + "router_z_loss_mlp": 0.3347168, + "step": 663, + "time_per_iteration": 2.679975748062134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267369, + "balance_loss_mlp": 1.2515384, + "epoch": 0.12774143901500576, + "flos": 1557872787456.0, + "grad_norm": 0.0685532780556388, + "language_loss": 0.74881387, + "learning_rate": 0.0009751610924599328, + "loss": 0.7614876, + "num_input_tokens_seen": 55453584, + "router_z_loss_mlp": 0.15820312, + "step": 664, + "time_per_iteration": 4.938101053237915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096151, + "balance_loss_mlp": 1.06177139, + "epoch": 0.12793382070026935, + "flos": 613462781952.0, + "grad_norm": 0.06920677464457729, + "language_loss": 0.90523887, + "learning_rate": 0.0009750640270890217, + "loss": 0.9162004, + "num_input_tokens_seen": 55528000, + "router_z_loss_mlp": 0.34375, + "step": 665, + "time_per_iteration": 2.6939845085144043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099083, + "balance_loss_mlp": 1.06563258, + "epoch": 0.1281262023855329, + "flos": 707386231296.0, + "grad_norm": 0.06773970450457005, + "language_loss": 0.96531481, + "learning_rate": 0.0009749667772818983, + "loss": 0.9763056, + "num_input_tokens_seen": 55612416, + "router_z_loss_mlp": 0.33447266, + "step": 666, + "time_per_iteration": 2.967853307723999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164762, + "balance_loss_mlp": 1.15131497, + "epoch": 0.12831858407079647, + "flos": 1424250086400.0, + "grad_norm": 0.045177828452490555, + "language_loss": 0.76935941, + "learning_rate": 0.0009748693430763185, + "loss": 0.78100705, + "num_input_tokens_seen": 55843664, + "router_z_loss_mlp": 0.13476562, + "step": 667, + "time_per_iteration": 4.85069465637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093582, + "balance_loss_mlp": 1.05958366, + "epoch": 0.12851096575606002, + "flos": 448869316608.0, + "grad_norm": 0.07778909975942494, + "language_loss": 0.95426726, + "learning_rate": 0.0009747717245101093, + "loss": 0.96520311, + "num_input_tokens_seen": 55909072, + "router_z_loss_mlp": 0.34008789, + "step": 668, + "time_per_iteration": 2.5234692096710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098998, + "balance_loss_mlp": 1.06519032, + "epoch": 0.12870334744132358, + "flos": 479697486336.0, + "grad_norm": 0.05465485885236262, + "language_loss": 0.84969366, + "learning_rate": 0.00097467392162117, + "loss": 0.86068368, + "num_input_tokens_seen": 55978544, + "router_z_loss_mlp": 0.33789062, + "step": 669, + "time_per_iteration": 2.601684808731079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096385, + "balance_loss_mlp": 1.06341171, + "epoch": 0.12889572912658714, + "flos": 638633963520.0, + "grad_norm": 0.05954757179165737, + "language_loss": 0.91292465, + "learning_rate": 0.0009745759344474708, + "loss": 0.92388856, + "num_input_tokens_seen": 56054144, + "router_z_loss_mlp": 0.32983398, + "step": 670, + "time_per_iteration": 2.8225347995758057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098806, + "balance_loss_mlp": 1.06411648, + "epoch": 0.1290881108118507, + "flos": 509693409792.0, + "grad_norm": 0.06976130099981656, + "language_loss": 0.89229816, + "learning_rate": 0.0009744777630270536, + "loss": 0.90328622, + "num_input_tokens_seen": 56120960, + "router_z_loss_mlp": 0.34692383, + "step": 671, + "time_per_iteration": 2.633571147918701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109875, + "balance_loss_mlp": 1.07435024, + "epoch": 0.12928049249711426, + "flos": 670746894336.0, + "grad_norm": 0.08011077975608555, + "language_loss": 0.93749923, + "learning_rate": 0.000974379407398032, + "loss": 0.94859791, + "num_input_tokens_seen": 56202560, + "router_z_loss_mlp": 0.35546875, + "step": 672, + "time_per_iteration": 2.875609874725342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093721, + "balance_loss_mlp": 1.06065273, + "epoch": 0.12947287418237785, + "flos": 793158925824.0, + "grad_norm": 0.05850057523774312, + "language_loss": 0.82016242, + "learning_rate": 0.0009742808675985913, + "loss": 0.83109969, + "num_input_tokens_seen": 56289456, + "router_z_loss_mlp": 0.33056641, + "step": 673, + "time_per_iteration": 3.087738275527954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101029, + "balance_loss_mlp": 1.0646224, + "epoch": 0.1296652558676414, + "flos": 485222054400.0, + "grad_norm": 0.08954381825883409, + "language_loss": 0.9153564, + "learning_rate": 0.0009741821436669876, + "loss": 0.92636657, + "num_input_tokens_seen": 56354480, + "router_z_loss_mlp": 0.36450195, + "step": 674, + "time_per_iteration": 2.539849281311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101326, + "balance_loss_mlp": 1.06673169, + "epoch": 0.12985763755290497, + "flos": 453226987008.0, + "grad_norm": 0.0648114016490977, + "language_loss": 0.9288274, + "learning_rate": 0.0009740832356415492, + "loss": 0.93984067, + "num_input_tokens_seen": 56418944, + "router_z_loss_mlp": 0.34619141, + "step": 675, + "time_per_iteration": 2.467801094055176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097673, + "balance_loss_mlp": 1.06315041, + "epoch": 0.13005001923816853, + "flos": 824719007232.0, + "grad_norm": 0.0735546441878898, + "language_loss": 0.8857609, + "learning_rate": 0.0009739841435606756, + "loss": 0.89673769, + "num_input_tokens_seen": 56492368, + "router_z_loss_mlp": 0.34545898, + "step": 676, + "time_per_iteration": 3.008781909942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109741, + "balance_loss_mlp": 1.06457949, + "epoch": 0.1302424009234321, + "flos": 531107822592.0, + "grad_norm": 0.07312926894822828, + "language_loss": 0.90675485, + "learning_rate": 0.0009738848674628377, + "loss": 0.9177289, + "num_input_tokens_seen": 56568128, + "router_z_loss_mlp": 0.328125, + "step": 677, + "time_per_iteration": 2.695338010787964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104607, + "balance_loss_mlp": 1.06955981, + "epoch": 0.13043478260869565, + "flos": 525626924544.0, + "grad_norm": 0.06033597827839572, + "language_loss": 0.89643902, + "learning_rate": 0.000973785407386578, + "loss": 0.90748513, + "num_input_tokens_seen": 56646448, + "router_z_loss_mlp": 0.35058594, + "step": 678, + "time_per_iteration": 2.7727599143981934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101976, + "balance_loss_mlp": 1.06714272, + "epoch": 0.1306271642939592, + "flos": 625862108160.0, + "grad_norm": 0.05570081952525763, + "language_loss": 0.87361526, + "learning_rate": 0.0009736857633705103, + "loss": 0.88463503, + "num_input_tokens_seen": 56732080, + "router_z_loss_mlp": 0.34814453, + "step": 679, + "time_per_iteration": 2.843129873275757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110176, + "balance_loss_mlp": 1.06630766, + "epoch": 0.13081954597922277, + "flos": 550438723584.0, + "grad_norm": 0.06405817655948583, + "language_loss": 0.93204647, + "learning_rate": 0.0009735859354533196, + "loss": 0.94306409, + "num_input_tokens_seen": 56804432, + "router_z_loss_mlp": 0.35473633, + "step": 680, + "time_per_iteration": 2.7122464179992676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093118, + "balance_loss_mlp": 1.05914354, + "epoch": 0.13101192766448633, + "flos": 536651329536.0, + "grad_norm": 0.06779912020183775, + "language_loss": 0.91948998, + "learning_rate": 0.0009734859236737628, + "loss": 0.93042123, + "num_input_tokens_seen": 56872512, + "router_z_loss_mlp": 0.33984375, + "step": 681, + "time_per_iteration": 2.594881296157837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093326, + "balance_loss_mlp": 1.0593034, + "epoch": 0.13120430934974991, + "flos": 503258019840.0, + "grad_norm": 0.06413082246497326, + "language_loss": 0.93904501, + "learning_rate": 0.0009733857280706678, + "loss": 0.94997829, + "num_input_tokens_seen": 56940928, + "router_z_loss_mlp": 0.34033203, + "step": 682, + "time_per_iteration": 2.5831425189971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010929, + "balance_loss_mlp": 1.05992687, + "epoch": 0.13139669103501347, + "flos": 614014221312.0, + "grad_norm": 0.06246118190021366, + "language_loss": 0.85051745, + "learning_rate": 0.000973285348682934, + "loss": 0.86144638, + "num_input_tokens_seen": 57012736, + "router_z_loss_mlp": 0.33007812, + "step": 683, + "time_per_iteration": 2.7236225605010986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226892, + "balance_loss_mlp": 1.21096563, + "epoch": 0.13158907272027703, + "flos": 1484163357696.0, + "grad_norm": 0.08359566880013784, + "language_loss": 0.77898371, + "learning_rate": 0.0009731847855495323, + "loss": 0.79125261, + "num_input_tokens_seen": 57243136, + "router_z_loss_mlp": 0.15917969, + "step": 684, + "time_per_iteration": 4.87854790687561 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087932, + "balance_loss_mlp": 1.05488706, + "epoch": 0.1317814544055406, + "flos": 985049344512.0, + "grad_norm": 0.07039095593234826, + "language_loss": 0.85449159, + "learning_rate": 0.0009730840387095046, + "loss": 0.86537099, + "num_input_tokens_seen": 57336160, + "router_z_loss_mlp": 0.33056641, + "step": 685, + "time_per_iteration": 3.30759596824646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096943, + "balance_loss_mlp": 1.06156158, + "epoch": 0.13197383609080415, + "flos": 611163892224.0, + "grad_norm": 0.05759402546544749, + "language_loss": 0.912597, + "learning_rate": 0.0009729831082019642, + "loss": 0.92356646, + "num_input_tokens_seen": 57418976, + "router_z_loss_mlp": 0.35351562, + "step": 686, + "time_per_iteration": 2.7965087890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093388, + "balance_loss_mlp": 1.0589608, + "epoch": 0.1321662177760677, + "flos": 494116305408.0, + "grad_norm": 0.058033147986452156, + "language_loss": 0.89668858, + "learning_rate": 0.0009728819940660958, + "loss": 0.90762246, + "num_input_tokens_seen": 57490288, + "router_z_loss_mlp": 0.34399414, + "step": 687, + "time_per_iteration": 2.7347469329833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110102, + "balance_loss_mlp": 1.0653528, + "epoch": 0.13235859946133127, + "flos": 495591713280.0, + "grad_norm": 0.07548862234195632, + "language_loss": 0.86088693, + "learning_rate": 0.0009727806963411557, + "loss": 0.87189722, + "num_input_tokens_seen": 57556064, + "router_z_loss_mlp": 0.35668945, + "step": 688, + "time_per_iteration": 2.621638774871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098222, + "balance_loss_mlp": 1.06279302, + "epoch": 0.13255098114659483, + "flos": 511417539072.0, + "grad_norm": 0.08656773393569435, + "language_loss": 0.88000298, + "learning_rate": 0.000972679215066471, + "loss": 0.89098513, + "num_input_tokens_seen": 57627248, + "router_z_loss_mlp": 0.35449219, + "step": 689, + "time_per_iteration": 2.6806418895721436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103388, + "balance_loss_mlp": 1.06900764, + "epoch": 0.13274336283185842, + "flos": 547114120704.0, + "grad_norm": 0.07064056682134613, + "language_loss": 0.99675226, + "learning_rate": 0.0009725775502814401, + "loss": 1.00778604, + "num_input_tokens_seen": 57694832, + "router_z_loss_mlp": 0.34350586, + "step": 690, + "time_per_iteration": 2.607179641723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121046, + "balance_loss_mlp": 1.08397222, + "epoch": 0.13293574451712198, + "flos": 640465781760.0, + "grad_norm": 0.08777481913975324, + "language_loss": 0.85673726, + "learning_rate": 0.0009724757020255327, + "loss": 0.86794776, + "num_input_tokens_seen": 57771776, + "router_z_loss_mlp": 0.37084961, + "step": 691, + "time_per_iteration": 2.81113338470459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111244, + "balance_loss_mlp": 1.07726967, + "epoch": 0.13312812620238554, + "flos": 491234042880.0, + "grad_norm": 0.09165524457583717, + "language_loss": 0.87811983, + "learning_rate": 0.0009723736703382902, + "loss": 0.88923222, + "num_input_tokens_seen": 57836272, + "router_z_loss_mlp": 0.33984375, + "step": 692, + "time_per_iteration": 2.548689603805542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110186, + "balance_loss_mlp": 1.0692203, + "epoch": 0.1333205078876491, + "flos": 508693837824.0, + "grad_norm": 0.061462060991887495, + "language_loss": 0.83746743, + "learning_rate": 0.0009722714552593244, + "loss": 0.84848601, + "num_input_tokens_seen": 57907232, + "router_z_loss_mlp": 0.32641602, + "step": 693, + "time_per_iteration": 2.6584513187408447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099112, + "balance_loss_mlp": 1.06358743, + "epoch": 0.13351288957291266, + "flos": 418474722816.0, + "grad_norm": 0.07144638741394425, + "language_loss": 0.94810003, + "learning_rate": 0.000972169056828319, + "loss": 0.95909119, + "num_input_tokens_seen": 57969808, + "router_z_loss_mlp": 0.35522461, + "step": 694, + "time_per_iteration": 2.461437702178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100772, + "balance_loss_mlp": 1.06751275, + "epoch": 0.13370527125817622, + "flos": 615614694912.0, + "grad_norm": 0.05672506947017021, + "language_loss": 0.87834966, + "learning_rate": 0.0009720664750850283, + "loss": 0.88935745, + "num_input_tokens_seen": 58042944, + "router_z_loss_mlp": 0.33251953, + "step": 695, + "time_per_iteration": 2.7716193199157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103519, + "balance_loss_mlp": 1.07085609, + "epoch": 0.13389765294343978, + "flos": 625757391360.0, + "grad_norm": 0.07304651625724701, + "language_loss": 0.93482703, + "learning_rate": 0.0009719637100692784, + "loss": 0.94586229, + "num_input_tokens_seen": 58116080, + "router_z_loss_mlp": 0.32666016, + "step": 696, + "time_per_iteration": 2.7741310596466064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111476, + "balance_loss_mlp": 1.08090401, + "epoch": 0.13409003462870334, + "flos": 609391710720.0, + "grad_norm": 0.06235589965882817, + "language_loss": 0.83759153, + "learning_rate": 0.0009718607618209661, + "loss": 0.84873915, + "num_input_tokens_seen": 58197616, + "router_z_loss_mlp": 0.33862305, + "step": 697, + "time_per_iteration": 2.869180202484131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128671, + "balance_loss_mlp": 1.09488726, + "epoch": 0.13428241631396692, + "flos": 683499810816.0, + "grad_norm": 0.0709058406100417, + "language_loss": 0.88053036, + "learning_rate": 0.0009717576303800595, + "loss": 0.89181709, + "num_input_tokens_seen": 58280480, + "router_z_loss_mlp": 0.33789062, + "step": 698, + "time_per_iteration": 3.007253408432007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122193, + "balance_loss_mlp": 1.08716917, + "epoch": 0.13447479799923048, + "flos": 508565799936.0, + "grad_norm": 0.07060238478807088, + "language_loss": 0.86057615, + "learning_rate": 0.0009716543157865975, + "loss": 0.87179804, + "num_input_tokens_seen": 58352464, + "router_z_loss_mlp": 0.35083008, + "step": 699, + "time_per_iteration": 2.6622114181518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112811, + "balance_loss_mlp": 1.07812154, + "epoch": 0.13466717968449404, + "flos": 897124737024.0, + "grad_norm": 0.06896685381510245, + "language_loss": 0.84149206, + "learning_rate": 0.0009715508180806907, + "loss": 0.85262012, + "num_input_tokens_seen": 58437216, + "router_z_loss_mlp": 0.34716797, + "step": 700, + "time_per_iteration": 3.175494909286499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106112, + "balance_loss_mlp": 1.07054055, + "epoch": 0.1348595613697576, + "flos": 989501557248.0, + "grad_norm": 0.07388845252403331, + "language_loss": 0.90260321, + "learning_rate": 0.0009714471373025202, + "loss": 0.91366434, + "num_input_tokens_seen": 58533152, + "router_z_loss_mlp": 0.35546875, + "step": 701, + "time_per_iteration": 3.3912835121154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090254, + "balance_loss_mlp": 1.05499172, + "epoch": 0.13505194305502116, + "flos": 487580580864.0, + "grad_norm": 0.07959074518459132, + "language_loss": 0.89355272, + "learning_rate": 0.0009713432734923386, + "loss": 0.9044553, + "num_input_tokens_seen": 58601376, + "router_z_loss_mlp": 0.35253906, + "step": 702, + "time_per_iteration": 2.6718733310699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090728, + "balance_loss_mlp": 1.05572796, + "epoch": 0.13524432474028472, + "flos": 613103399424.0, + "grad_norm": 0.06387437846302528, + "language_loss": 0.875036, + "learning_rate": 0.0009712392266904696, + "loss": 0.88594317, + "num_input_tokens_seen": 58676608, + "router_z_loss_mlp": 0.34985352, + "step": 703, + "time_per_iteration": 2.6985831260681152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010863, + "balance_loss_mlp": 1.0524683, + "epoch": 0.13543670642554828, + "flos": 904425868800.0, + "grad_norm": 0.06666466963859687, + "language_loss": 0.86250496, + "learning_rate": 0.0009711349969373076, + "loss": 0.87336791, + "num_input_tokens_seen": 58759264, + "router_z_loss_mlp": 0.33862305, + "step": 704, + "time_per_iteration": 3.1328465938568115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095762, + "balance_loss_mlp": 1.0610956, + "epoch": 0.13562908811081184, + "flos": 550335416832.0, + "grad_norm": 0.0628446006314887, + "language_loss": 0.80944061, + "learning_rate": 0.0009710305842733178, + "loss": 0.82039821, + "num_input_tokens_seen": 58834800, + "router_z_loss_mlp": 0.34667969, + "step": 705, + "time_per_iteration": 2.7668187618255615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093976, + "balance_loss_mlp": 1.06147909, + "epoch": 0.1358214697960754, + "flos": 507797572608.0, + "grad_norm": 0.06635154625105166, + "language_loss": 0.90133065, + "learning_rate": 0.0009709259887390373, + "loss": 0.91227043, + "num_input_tokens_seen": 58901712, + "router_z_loss_mlp": 0.32519531, + "step": 706, + "time_per_iteration": 2.656233072280884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096924, + "balance_loss_mlp": 1.06390333, + "epoch": 0.136013851481339, + "flos": 528640197120.0, + "grad_norm": 0.09290535615143355, + "language_loss": 0.91425377, + "learning_rate": 0.0009708212103750737, + "loss": 0.92522299, + "num_input_tokens_seen": 58967824, + "router_z_loss_mlp": 0.33007812, + "step": 707, + "time_per_iteration": 2.569655656814575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101147, + "balance_loss_mlp": 1.06812644, + "epoch": 0.13620623316660255, + "flos": 658772379648.0, + "grad_norm": 0.06731423560591156, + "language_loss": 0.87756282, + "learning_rate": 0.0009707162492221051, + "loss": 0.88857424, + "num_input_tokens_seen": 59045040, + "router_z_loss_mlp": 0.33007812, + "step": 708, + "time_per_iteration": 2.880669593811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103707, + "balance_loss_mlp": 1.07009029, + "epoch": 0.1363986148518661, + "flos": 671583522816.0, + "grad_norm": 0.07312175328849302, + "language_loss": 0.88322687, + "learning_rate": 0.0009706111053208815, + "loss": 0.89426386, + "num_input_tokens_seen": 59117216, + "router_z_loss_mlp": 0.33642578, + "step": 709, + "time_per_iteration": 2.7878787517547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097257, + "balance_loss_mlp": 1.06342554, + "epoch": 0.13659099653712967, + "flos": 472828520448.0, + "grad_norm": 0.06741688104713542, + "language_loss": 0.86067665, + "learning_rate": 0.0009705057787122232, + "loss": 0.87164921, + "num_input_tokens_seen": 59183056, + "router_z_loss_mlp": 0.33862305, + "step": 710, + "time_per_iteration": 2.528298854827881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105446, + "balance_loss_mlp": 1.07190061, + "epoch": 0.13678337822239323, + "flos": 452483490816.0, + "grad_norm": 0.05706590332145298, + "language_loss": 0.91653168, + "learning_rate": 0.0009704002694370216, + "loss": 0.92758614, + "num_input_tokens_seen": 59247312, + "router_z_loss_mlp": 0.33569336, + "step": 711, + "time_per_iteration": 2.5201761722564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114394, + "balance_loss_mlp": 1.0794661, + "epoch": 0.13697575990765679, + "flos": 519373416960.0, + "grad_norm": 0.06387130477766731, + "language_loss": 0.86892813, + "learning_rate": 0.0009702945775362388, + "loss": 0.88007212, + "num_input_tokens_seen": 59317968, + "router_z_loss_mlp": 0.34960938, + "step": 712, + "time_per_iteration": 2.661848783493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130562, + "balance_loss_mlp": 1.0947994, + "epoch": 0.13716814159292035, + "flos": 480145618944.0, + "grad_norm": 0.06038249383316015, + "language_loss": 0.87339497, + "learning_rate": 0.0009701887030509086, + "loss": 0.8847006, + "num_input_tokens_seen": 59387936, + "router_z_loss_mlp": 0.35766602, + "step": 713, + "time_per_iteration": 2.6068434715270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125148, + "balance_loss_mlp": 1.0908401, + "epoch": 0.1373605232781839, + "flos": 545376844800.0, + "grad_norm": 0.06924339631343991, + "language_loss": 0.92127877, + "learning_rate": 0.0009700826460221346, + "loss": 0.93253028, + "num_input_tokens_seen": 59460624, + "router_z_loss_mlp": 0.34301758, + "step": 714, + "time_per_iteration": 2.653224468231201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145818, + "balance_loss_mlp": 1.11050797, + "epoch": 0.1375529049634475, + "flos": 708473143296.0, + "grad_norm": 0.0682346884445605, + "language_loss": 0.93435562, + "learning_rate": 0.0009699764064910921, + "loss": 0.94581378, + "num_input_tokens_seen": 59536752, + "router_z_loss_mlp": 0.35302734, + "step": 715, + "time_per_iteration": 2.878445625305176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130305, + "balance_loss_mlp": 1.09542441, + "epoch": 0.13774528664871105, + "flos": 486452971008.0, + "grad_norm": 0.07091873756636237, + "language_loss": 0.87931371, + "learning_rate": 0.0009698699844990268, + "loss": 0.89061677, + "num_input_tokens_seen": 59608128, + "router_z_loss_mlp": 0.34863281, + "step": 716, + "time_per_iteration": 2.6278092861175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124133, + "balance_loss_mlp": 1.09070659, + "epoch": 0.1379376683339746, + "flos": 679885636608.0, + "grad_norm": 0.0686032560828043, + "language_loss": 0.88731855, + "learning_rate": 0.0009697633800872555, + "loss": 0.89855987, + "num_input_tokens_seen": 59685120, + "router_z_loss_mlp": 0.33422852, + "step": 717, + "time_per_iteration": 2.888576030731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112997, + "balance_loss_mlp": 1.07825947, + "epoch": 0.13813005001923817, + "flos": 610628419584.0, + "grad_norm": 0.07907714555147631, + "language_loss": 0.9128629, + "learning_rate": 0.0009696565932971655, + "loss": 0.92399287, + "num_input_tokens_seen": 59763376, + "router_z_loss_mlp": 0.34741211, + "step": 718, + "time_per_iteration": 2.8937225341796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110249, + "balance_loss_mlp": 1.06837237, + "epoch": 0.13832243170450173, + "flos": 588431222784.0, + "grad_norm": 0.05947825646897862, + "language_loss": 0.9001984, + "learning_rate": 0.0009695496241702153, + "loss": 0.91122329, + "num_input_tokens_seen": 59836800, + "router_z_loss_mlp": 0.34155273, + "step": 719, + "time_per_iteration": 2.791111469268799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094313, + "balance_loss_mlp": 1.06093454, + "epoch": 0.1385148133897653, + "flos": 699674844672.0, + "grad_norm": 0.07440757355955382, + "language_loss": 0.86308432, + "learning_rate": 0.0009694424727479339, + "loss": 0.87402749, + "num_input_tokens_seen": 59914720, + "router_z_loss_mlp": 0.33398438, + "step": 720, + "time_per_iteration": 2.8781325817108154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088445, + "balance_loss_mlp": 1.05475688, + "epoch": 0.13870719507502885, + "flos": 597977399808.0, + "grad_norm": 0.059872525751604476, + "language_loss": 0.90073895, + "learning_rate": 0.0009693351390719213, + "loss": 0.91162348, + "num_input_tokens_seen": 59984544, + "router_z_loss_mlp": 0.3371582, + "step": 721, + "time_per_iteration": 2.691493272781372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095999, + "balance_loss_mlp": 1.06242967, + "epoch": 0.1388995767602924, + "flos": 586279309824.0, + "grad_norm": 0.07792099406652078, + "language_loss": 0.91640067, + "learning_rate": 0.000969227623183848, + "loss": 0.92736065, + "num_input_tokens_seen": 60057056, + "router_z_loss_mlp": 0.33569336, + "step": 722, + "time_per_iteration": 2.768209218978882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086676, + "balance_loss_mlp": 1.05475235, + "epoch": 0.139091958445556, + "flos": 650810709504.0, + "grad_norm": 0.07717859695455091, + "language_loss": 0.91485119, + "learning_rate": 0.0009691199251254554, + "loss": 0.92571795, + "num_input_tokens_seen": 60133232, + "router_z_loss_mlp": 0.3190918, + "step": 723, + "time_per_iteration": 2.813594102859497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093708, + "balance_loss_mlp": 1.06159282, + "epoch": 0.13928434013081956, + "flos": 575446961664.0, + "grad_norm": 0.06414169604653322, + "language_loss": 0.8718468, + "learning_rate": 0.0009690120449385555, + "loss": 0.88278389, + "num_input_tokens_seen": 60207104, + "router_z_loss_mlp": 0.32104492, + "step": 724, + "time_per_iteration": 2.732372999191284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110008, + "balance_loss_mlp": 1.06574821, + "epoch": 0.13947672181608312, + "flos": 562954503168.0, + "grad_norm": 0.07538454681544235, + "language_loss": 0.93399024, + "learning_rate": 0.0009689039826650312, + "loss": 0.94499099, + "num_input_tokens_seen": 60277920, + "router_z_loss_mlp": 0.34375, + "step": 725, + "time_per_iteration": 2.769481658935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111743, + "balance_loss_mlp": 1.09967864, + "epoch": 0.13966910350134668, + "flos": 1520699387904.0, + "grad_norm": 0.042030956775344956, + "language_loss": 0.76523066, + "learning_rate": 0.000968795738346836, + "loss": 0.77634799, + "num_input_tokens_seen": 60494224, + "router_z_loss_mlp": 0.12060547, + "step": 726, + "time_per_iteration": 4.903716802597046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101674, + "balance_loss_mlp": 1.06619751, + "epoch": 0.13986148518661023, + "flos": 499604557824.0, + "grad_norm": 0.07361028590256702, + "language_loss": 0.88265646, + "learning_rate": 0.0009686873120259941, + "loss": 0.89367324, + "num_input_tokens_seen": 60562176, + "router_z_loss_mlp": 0.35522461, + "step": 727, + "time_per_iteration": 2.639673948287964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099007, + "balance_loss_mlp": 1.06612897, + "epoch": 0.1400538668718738, + "flos": 598381862400.0, + "grad_norm": 0.053177263225715844, + "language_loss": 0.87612498, + "learning_rate": 0.0009685787037446004, + "loss": 0.88711506, + "num_input_tokens_seen": 60631472, + "router_z_loss_mlp": 0.32885742, + "step": 728, + "time_per_iteration": 2.7457332611083984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095755, + "balance_loss_mlp": 1.06135106, + "epoch": 0.14024624855713735, + "flos": 593757941760.0, + "grad_norm": 0.0730266030670127, + "language_loss": 0.88032103, + "learning_rate": 0.0009684699135448201, + "loss": 0.89127851, + "num_input_tokens_seen": 60703488, + "router_z_loss_mlp": 0.34423828, + "step": 729, + "time_per_iteration": 2.6995558738708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091636, + "balance_loss_mlp": 1.05940139, + "epoch": 0.1404386302424009, + "flos": 506335311360.0, + "grad_norm": 0.06378774069808751, + "language_loss": 0.93033969, + "learning_rate": 0.0009683609414688895, + "loss": 0.94125605, + "num_input_tokens_seen": 60773936, + "router_z_loss_mlp": 0.32226562, + "step": 730, + "time_per_iteration": 2.6648926734924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097348, + "balance_loss_mlp": 1.06175184, + "epoch": 0.14063101192766447, + "flos": 573132105216.0, + "grad_norm": 0.05452232030629634, + "language_loss": 0.86945236, + "learning_rate": 0.0009682517875591154, + "loss": 0.88042581, + "num_input_tokens_seen": 60851120, + "router_z_loss_mlp": 0.35620117, + "step": 731, + "time_per_iteration": 2.7333967685699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099629, + "balance_loss_mlp": 1.06656027, + "epoch": 0.14082339361292806, + "flos": 564333806592.0, + "grad_norm": 0.06482276791137384, + "language_loss": 0.87207299, + "learning_rate": 0.0009681424518578749, + "loss": 0.88306928, + "num_input_tokens_seen": 60924896, + "router_z_loss_mlp": 0.33081055, + "step": 732, + "time_per_iteration": 2.706704616546631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100389, + "balance_loss_mlp": 1.06734443, + "epoch": 0.14101577529819162, + "flos": 463336187904.0, + "grad_norm": 0.05411989278901109, + "language_loss": 0.88122302, + "learning_rate": 0.000968032934407616, + "loss": 0.89222693, + "num_input_tokens_seen": 60996016, + "router_z_loss_mlp": 0.33056641, + "step": 733, + "time_per_iteration": 2.5904436111450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100997, + "balance_loss_mlp": 1.06766593, + "epoch": 0.14120815698345518, + "flos": 595791991296.0, + "grad_norm": 0.06321555834593343, + "language_loss": 0.82077157, + "learning_rate": 0.0009679232352508571, + "loss": 0.83178151, + "num_input_tokens_seen": 61072016, + "router_z_loss_mlp": 0.33349609, + "step": 734, + "time_per_iteration": 2.758493423461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102299, + "balance_loss_mlp": 1.06992185, + "epoch": 0.14140053866871874, + "flos": 534864591360.0, + "grad_norm": 0.05697576898708014, + "language_loss": 0.81442666, + "learning_rate": 0.0009678133544301871, + "loss": 0.82544965, + "num_input_tokens_seen": 61144528, + "router_z_loss_mlp": 0.32373047, + "step": 735, + "time_per_iteration": 2.6508195400238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092208, + "balance_loss_mlp": 1.06006956, + "epoch": 0.1415929203539823, + "flos": 520013606400.0, + "grad_norm": 0.0400187761209974, + "language_loss": 0.91843486, + "learning_rate": 0.0009677032919882658, + "loss": 0.92935699, + "num_input_tokens_seen": 61216960, + "router_z_loss_mlp": 0.32128906, + "step": 736, + "time_per_iteration": 2.705019474029541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100492, + "balance_loss_mlp": 1.06975937, + "epoch": 0.14178530203924586, + "flos": 482095300608.0, + "grad_norm": 0.07179339183341249, + "language_loss": 0.92199683, + "learning_rate": 0.000967593047967823, + "loss": 0.93300164, + "num_input_tokens_seen": 61281312, + "router_z_loss_mlp": 0.30712891, + "step": 737, + "time_per_iteration": 2.55415415763855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109678, + "balance_loss_mlp": 1.07577443, + "epoch": 0.14197768372450942, + "flos": 676339863552.0, + "grad_norm": 0.08640894081958116, + "language_loss": 0.87084705, + "learning_rate": 0.0009674826224116593, + "loss": 0.88194382, + "num_input_tokens_seen": 61355888, + "router_z_loss_mlp": 0.33911133, + "step": 738, + "time_per_iteration": 2.819878101348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097633, + "balance_loss_mlp": 1.06544614, + "epoch": 0.14217006540977298, + "flos": 445802199552.0, + "grad_norm": 0.06953952980021996, + "language_loss": 0.8713401, + "learning_rate": 0.0009673720153626455, + "loss": 0.88231641, + "num_input_tokens_seen": 61424288, + "router_z_loss_mlp": 0.32177734, + "step": 739, + "time_per_iteration": 2.5987422466278076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096281, + "balance_loss_mlp": 1.06385565, + "epoch": 0.14236244709503657, + "flos": 496261016064.0, + "grad_norm": 0.08400230511878481, + "language_loss": 0.87465405, + "learning_rate": 0.0009672612268637235, + "loss": 0.88561684, + "num_input_tokens_seen": 61493344, + "router_z_loss_mlp": 0.32421875, + "step": 740, + "time_per_iteration": 2.6148736476898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098472, + "balance_loss_mlp": 1.06669128, + "epoch": 0.14255482878030012, + "flos": 648022989312.0, + "grad_norm": 0.0806935070673247, + "language_loss": 0.846753, + "learning_rate": 0.0009671502569579048, + "loss": 0.85773772, + "num_input_tokens_seen": 61565216, + "router_z_loss_mlp": 0.31762695, + "step": 741, + "time_per_iteration": 2.7533769607543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089599, + "balance_loss_mlp": 1.05774641, + "epoch": 0.14274721046556368, + "flos": 535888894464.0, + "grad_norm": 0.06572551706098649, + "language_loss": 0.90748239, + "learning_rate": 0.0009670391056882719, + "loss": 0.91837835, + "num_input_tokens_seen": 61640928, + "router_z_loss_mlp": 0.31835938, + "step": 742, + "time_per_iteration": 2.698690176010132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088511, + "balance_loss_mlp": 1.0565629, + "epoch": 0.14293959215082724, + "flos": 956677215744.0, + "grad_norm": 0.07291469749344824, + "language_loss": 0.89417249, + "learning_rate": 0.0009669277730979776, + "loss": 0.90505755, + "num_input_tokens_seen": 61717552, + "router_z_loss_mlp": 0.31958008, + "step": 743, + "time_per_iteration": 3.1728732585906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108649, + "balance_loss_mlp": 1.05408931, + "epoch": 0.1431319738360908, + "flos": 692766590976.0, + "grad_norm": 0.06693583917292938, + "language_loss": 0.85588205, + "learning_rate": 0.0009668162592302449, + "loss": 0.86674696, + "num_input_tokens_seen": 61800016, + "router_z_loss_mlp": 0.32397461, + "step": 744, + "time_per_iteration": 2.896467685699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099426, + "balance_loss_mlp": 1.06673896, + "epoch": 0.14332435552135436, + "flos": 565174817280.0, + "grad_norm": 0.0717564206721674, + "language_loss": 0.86683381, + "learning_rate": 0.0009667045641283676, + "loss": 0.877828, + "num_input_tokens_seen": 61865904, + "router_z_loss_mlp": 0.3269043, + "step": 745, + "time_per_iteration": 2.6326427459716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095955, + "balance_loss_mlp": 1.06336319, + "epoch": 0.14351673720661792, + "flos": 738045665280.0, + "grad_norm": 0.07083856064802352, + "language_loss": 0.95545924, + "learning_rate": 0.0009665926878357092, + "loss": 0.96641874, + "num_input_tokens_seen": 61945728, + "router_z_loss_mlp": 0.32592773, + "step": 746, + "time_per_iteration": 2.902628183364868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108393, + "balance_loss_mlp": 1.07565856, + "epoch": 0.14370911889188148, + "flos": 548951731200.0, + "grad_norm": 0.08672542857876225, + "language_loss": 0.91510898, + "learning_rate": 0.0009664806303957043, + "loss": 0.92619288, + "num_input_tokens_seen": 62016288, + "router_z_loss_mlp": 0.32714844, + "step": 747, + "time_per_iteration": 2.678656578063965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107271, + "balance_loss_mlp": 1.07448816, + "epoch": 0.14390150057714507, + "flos": 589973469696.0, + "grad_norm": 0.06575006445724518, + "language_loss": 0.87633115, + "learning_rate": 0.0009663683918518571, + "loss": 0.88740385, + "num_input_tokens_seen": 62097904, + "router_z_loss_mlp": 0.32788086, + "step": 748, + "time_per_iteration": 2.894339084625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116744, + "balance_loss_mlp": 1.08226848, + "epoch": 0.14409388226240863, + "flos": 590773782528.0, + "grad_norm": 0.06412555003569581, + "language_loss": 0.86334193, + "learning_rate": 0.0009662559722477428, + "loss": 0.87450933, + "num_input_tokens_seen": 62166736, + "router_z_loss_mlp": 0.3449707, + "step": 749, + "time_per_iteration": 2.6673357486724854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116866, + "balance_loss_mlp": 1.15397346, + "epoch": 0.1442862639476722, + "flos": 1510418479104.0, + "grad_norm": 0.05654081816866197, + "language_loss": 0.7616297, + "learning_rate": 0.0009661433716270062, + "loss": 0.77331638, + "num_input_tokens_seen": 62402512, + "router_z_loss_mlp": 0.14648438, + "step": 750, + "time_per_iteration": 4.97744607925415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111461, + "balance_loss_mlp": 1.0782733, + "epoch": 0.14447864563293575, + "flos": 496493770752.0, + "grad_norm": 0.05840496998451829, + "language_loss": 0.89989787, + "learning_rate": 0.0009660305900333632, + "loss": 0.91101241, + "num_input_tokens_seen": 62473408, + "router_z_loss_mlp": 0.33203125, + "step": 751, + "time_per_iteration": 2.6919631958007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108129, + "balance_loss_mlp": 1.07513142, + "epoch": 0.1446710273181993, + "flos": 589400271360.0, + "grad_norm": 0.0663289310880325, + "language_loss": 0.83084202, + "learning_rate": 0.0009659176275105992, + "loss": 0.8419233, + "num_input_tokens_seen": 62547440, + "router_z_loss_mlp": 0.33007812, + "step": 752, + "time_per_iteration": 2.702003240585327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097403, + "balance_loss_mlp": 1.0634284, + "epoch": 0.14486340900346287, + "flos": 585521256960.0, + "grad_norm": 0.05748666507804042, + "language_loss": 0.86628646, + "learning_rate": 0.0009658044841025701, + "loss": 0.87726045, + "num_input_tokens_seen": 62620224, + "router_z_loss_mlp": 0.34008789, + "step": 753, + "time_per_iteration": 2.7666702270507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106626, + "balance_loss_mlp": 1.07114923, + "epoch": 0.14505579068872643, + "flos": 504405978624.0, + "grad_norm": 0.07320865998852653, + "language_loss": 0.81996346, + "learning_rate": 0.0009656911598532021, + "loss": 0.83102977, + "num_input_tokens_seen": 62690464, + "router_z_loss_mlp": 0.35498047, + "step": 754, + "time_per_iteration": 2.6273839473724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094053, + "balance_loss_mlp": 1.05936301, + "epoch": 0.14524817237399, + "flos": 486566452224.0, + "grad_norm": 0.05776902712696923, + "language_loss": 0.90229332, + "learning_rate": 0.0009655776548064917, + "loss": 0.91323388, + "num_input_tokens_seen": 62762240, + "router_z_loss_mlp": 0.34667969, + "step": 755, + "time_per_iteration": 2.6639461517333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092368, + "balance_loss_mlp": 1.05867922, + "epoch": 0.14544055405925355, + "flos": 727857888768.0, + "grad_norm": 0.059694446461720084, + "language_loss": 0.88762641, + "learning_rate": 0.0009654639690065054, + "loss": 0.89855003, + "num_input_tokens_seen": 62839760, + "router_z_loss_mlp": 0.33691406, + "step": 756, + "time_per_iteration": 2.881164789199829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092737, + "balance_loss_mlp": 1.05981112, + "epoch": 0.14563293574451713, + "flos": 593359271424.0, + "grad_norm": 0.0719411984245977, + "language_loss": 0.88362074, + "learning_rate": 0.00096535010249738, + "loss": 0.89454818, + "num_input_tokens_seen": 62910336, + "router_z_loss_mlp": 0.3293457, + "step": 757, + "time_per_iteration": 2.703355312347412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092325, + "balance_loss_mlp": 1.05925632, + "epoch": 0.1458253174297807, + "flos": 560192924160.0, + "grad_norm": 0.09095988428785044, + "language_loss": 0.8300786, + "learning_rate": 0.0009652360553233224, + "loss": 0.84100187, + "num_input_tokens_seen": 62988160, + "router_z_loss_mlp": 0.33081055, + "step": 758, + "time_per_iteration": 2.7321062088012695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062711, + "balance_loss_mlp": 1.04821551, + "epoch": 0.14601769911504425, + "flos": 1557025984512.0, + "grad_norm": 0.03493248396843453, + "language_loss": 0.73773748, + "learning_rate": 0.0009651218275286093, + "loss": 0.74836457, + "num_input_tokens_seen": 63224704, + "router_z_loss_mlp": 0.14453125, + "step": 759, + "time_per_iteration": 4.917184591293335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099151, + "balance_loss_mlp": 1.06605887, + "epoch": 0.1462100808003078, + "flos": 865922628096.0, + "grad_norm": 0.05465610046720203, + "language_loss": 0.8166393, + "learning_rate": 0.0009650074191575883, + "loss": 0.82763088, + "num_input_tokens_seen": 63312400, + "router_z_loss_mlp": 0.33105469, + "step": 760, + "time_per_iteration": 3.2009472846984863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097005, + "balance_loss_mlp": 1.06341171, + "epoch": 0.14640246248557137, + "flos": 522673288704.0, + "grad_norm": 0.07890258703475667, + "language_loss": 0.86329532, + "learning_rate": 0.0009648928302546766, + "loss": 0.87426543, + "num_input_tokens_seen": 63387792, + "router_z_loss_mlp": 0.3359375, + "step": 761, + "time_per_iteration": 2.6858482360839844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087335, + "balance_loss_mlp": 1.05340791, + "epoch": 0.14659484417083493, + "flos": 1030121805312.0, + "grad_norm": 0.05505233607608704, + "language_loss": 0.8584463, + "learning_rate": 0.0009647780608643613, + "loss": 0.86931968, + "num_input_tokens_seen": 63475632, + "router_z_loss_mlp": 0.33935547, + "step": 762, + "time_per_iteration": 3.3784618377685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087006, + "balance_loss_mlp": 1.05365133, + "epoch": 0.1467872258560985, + "flos": 500426629632.0, + "grad_norm": 0.083565321416964, + "language_loss": 0.88299912, + "learning_rate": 0.0009646631110312001, + "loss": 0.89386916, + "num_input_tokens_seen": 63546080, + "router_z_loss_mlp": 0.33349609, + "step": 763, + "time_per_iteration": 2.642038345336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096574, + "balance_loss_mlp": 1.06465006, + "epoch": 0.14697960754136205, + "flos": 547514201088.0, + "grad_norm": 0.05646167170610495, + "language_loss": 0.88908124, + "learning_rate": 0.0009645479807998203, + "loss": 0.900047, + "num_input_tokens_seen": 63622464, + "router_z_loss_mlp": 0.3190918, + "step": 764, + "time_per_iteration": 2.7709102630615234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093321, + "balance_loss_mlp": 1.0614922, + "epoch": 0.14717198922662564, + "flos": 517586678784.0, + "grad_norm": 0.06731397985108602, + "language_loss": 0.93233657, + "learning_rate": 0.0009644326702149196, + "loss": 0.94326979, + "num_input_tokens_seen": 63694736, + "router_z_loss_mlp": 0.31811523, + "step": 765, + "time_per_iteration": 2.691761016845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098194, + "balance_loss_mlp": 1.06472015, + "epoch": 0.1473643709118892, + "flos": 731661147648.0, + "grad_norm": 0.08664060064789567, + "language_loss": 0.85604531, + "learning_rate": 0.0009643171793212653, + "loss": 0.86702728, + "num_input_tokens_seen": 63779072, + "router_z_loss_mlp": 0.33496094, + "step": 766, + "time_per_iteration": 3.0578510761260986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095374, + "balance_loss_mlp": 1.06190002, + "epoch": 0.14755675259715276, + "flos": 620257554432.0, + "grad_norm": 0.06875066800131625, + "language_loss": 0.90379435, + "learning_rate": 0.0009642015081636952, + "loss": 0.91474807, + "num_input_tokens_seen": 63847472, + "router_z_loss_mlp": 0.33496094, + "step": 767, + "time_per_iteration": 2.690892219543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091039, + "balance_loss_mlp": 1.05830407, + "epoch": 0.14774913428241632, + "flos": 451981513728.0, + "grad_norm": 0.06617868208271054, + "language_loss": 0.88812423, + "learning_rate": 0.0009640856567871166, + "loss": 0.89903462, + "num_input_tokens_seen": 63912496, + "router_z_loss_mlp": 0.32714844, + "step": 768, + "time_per_iteration": 2.5108768939971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086849, + "balance_loss_mlp": 1.05316067, + "epoch": 0.14794151596767988, + "flos": 836881196544.0, + "grad_norm": 0.06813910901976611, + "language_loss": 0.89643073, + "learning_rate": 0.0009639696252365072, + "loss": 0.90729922, + "num_input_tokens_seen": 63990832, + "router_z_loss_mlp": 0.33691406, + "step": 769, + "time_per_iteration": 3.036872386932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087914, + "balance_loss_mlp": 1.05546558, + "epoch": 0.14813389765294344, + "flos": 685765204992.0, + "grad_norm": 0.06952898718112278, + "language_loss": 0.82433641, + "learning_rate": 0.0009638534135569144, + "loss": 0.83521557, + "num_input_tokens_seen": 64067552, + "router_z_loss_mlp": 0.32446289, + "step": 770, + "time_per_iteration": 2.920228958129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096521, + "balance_loss_mlp": 1.06395316, + "epoch": 0.148326279338207, + "flos": 509625008640.0, + "grad_norm": 0.05850145176667806, + "language_loss": 0.90417981, + "learning_rate": 0.0009637370217934554, + "loss": 0.91514498, + "num_input_tokens_seen": 64140336, + "router_z_loss_mlp": 0.32568359, + "step": 771, + "time_per_iteration": 2.6692943572998047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094476, + "balance_loss_mlp": 1.0624088, + "epoch": 0.14851866102347056, + "flos": 587869608960.0, + "grad_norm": 0.06374792966079154, + "language_loss": 0.83362675, + "learning_rate": 0.0009636204499913175, + "loss": 0.84457153, + "num_input_tokens_seen": 64223472, + "router_z_loss_mlp": 0.32055664, + "step": 772, + "time_per_iteration": 2.9103784561157227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101595, + "balance_loss_mlp": 1.07129157, + "epoch": 0.14871104270873411, + "flos": 690722366976.0, + "grad_norm": 0.05784692032564958, + "language_loss": 0.8891257, + "learning_rate": 0.0009635036981957581, + "loss": 0.90014172, + "num_input_tokens_seen": 64299872, + "router_z_loss_mlp": 0.30273438, + "step": 773, + "time_per_iteration": 2.840233087539673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109331, + "balance_loss_mlp": 1.06112361, + "epoch": 0.1489034243939977, + "flos": 654803205120.0, + "grad_norm": 0.06091674471201955, + "language_loss": 0.9126395, + "learning_rate": 0.0009633867664521043, + "loss": 0.9235726, + "num_input_tokens_seen": 64377152, + "router_z_loss_mlp": 0.32202148, + "step": 774, + "time_per_iteration": 2.8467912673950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098219, + "balance_loss_mlp": 1.0643878, + "epoch": 0.14909580607926126, + "flos": 475595891712.0, + "grad_norm": 0.06395321005815084, + "language_loss": 0.87366414, + "learning_rate": 0.0009632696548057527, + "loss": 0.8846463, + "num_input_tokens_seen": 64443008, + "router_z_loss_mlp": 0.33862305, + "step": 775, + "time_per_iteration": 2.55267596244812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090534, + "balance_loss_mlp": 1.05729866, + "epoch": 0.14928818776452482, + "flos": 610789953024.0, + "grad_norm": 0.07257335679926562, + "language_loss": 0.85489643, + "learning_rate": 0.0009631523633021704, + "loss": 0.86580181, + "num_input_tokens_seen": 64519776, + "router_z_loss_mlp": 0.33251953, + "step": 776, + "time_per_iteration": 2.800656795501709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090063, + "balance_loss_mlp": 1.05694628, + "epoch": 0.14948056944978838, + "flos": 561487859712.0, + "grad_norm": 0.058446141184189525, + "language_loss": 0.88943005, + "learning_rate": 0.0009630348919868936, + "loss": 0.90033066, + "num_input_tokens_seen": 64593712, + "router_z_loss_mlp": 0.33129883, + "step": 777, + "time_per_iteration": 2.7306644916534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088502, + "balance_loss_mlp": 1.05397916, + "epoch": 0.14967295113505194, + "flos": 448972623360.0, + "grad_norm": 0.08136314957760014, + "language_loss": 0.81536144, + "learning_rate": 0.0009629172409055293, + "loss": 0.82624644, + "num_input_tokens_seen": 64658448, + "router_z_loss_mlp": 0.34545898, + "step": 778, + "time_per_iteration": 2.532480239868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091534, + "balance_loss_mlp": 1.05937171, + "epoch": 0.1498653328203155, + "flos": 571000541184.0, + "grad_norm": 0.06865521140792329, + "language_loss": 0.88039231, + "learning_rate": 0.0009627994101037531, + "loss": 0.89130771, + "num_input_tokens_seen": 64734144, + "router_z_loss_mlp": 0.3215332, + "step": 779, + "time_per_iteration": 2.7336056232452393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091586, + "balance_loss_mlp": 1.05811191, + "epoch": 0.15005771450557906, + "flos": 630918194688.0, + "grad_norm": 0.06277485509918372, + "language_loss": 0.8981787, + "learning_rate": 0.0009626813996273114, + "loss": 0.90909451, + "num_input_tokens_seen": 64813456, + "router_z_loss_mlp": 0.3347168, + "step": 780, + "time_per_iteration": 2.8651859760284424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092654, + "balance_loss_mlp": 1.06018162, + "epoch": 0.15025009619084262, + "flos": 577633780224.0, + "grad_norm": 0.06737111741199381, + "language_loss": 0.89359641, + "learning_rate": 0.0009625632095220198, + "loss": 0.90452296, + "num_input_tokens_seen": 64896816, + "router_z_loss_mlp": 0.32470703, + "step": 781, + "time_per_iteration": 2.910163640975952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093984, + "balance_loss_mlp": 1.06041455, + "epoch": 0.1504424778761062, + "flos": 483646311936.0, + "grad_norm": 0.06188715182302237, + "language_loss": 0.87568116, + "learning_rate": 0.0009624448398337637, + "loss": 0.88662094, + "num_input_tokens_seen": 64964176, + "router_z_loss_mlp": 0.3359375, + "step": 782, + "time_per_iteration": 2.532055616378784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100397, + "balance_loss_mlp": 1.06751907, + "epoch": 0.15063485956136977, + "flos": 762167812608.0, + "grad_norm": 0.06229794960735175, + "language_loss": 0.89905757, + "learning_rate": 0.0009623262906084984, + "loss": 0.91006154, + "num_input_tokens_seen": 65042592, + "router_z_loss_mlp": 0.32861328, + "step": 783, + "time_per_iteration": 2.9851605892181396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104855, + "balance_loss_mlp": 1.0712378, + "epoch": 0.15082724124663333, + "flos": 497369687040.0, + "grad_norm": 0.060596744514248076, + "language_loss": 0.90796679, + "learning_rate": 0.0009622075618922486, + "loss": 0.91901541, + "num_input_tokens_seen": 65114576, + "router_z_loss_mlp": 0.33642578, + "step": 784, + "time_per_iteration": 2.6796786785125732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102349, + "balance_loss_mlp": 1.06928015, + "epoch": 0.15101962293189689, + "flos": 509476621824.0, + "grad_norm": 0.06389342174673626, + "language_loss": 0.87423813, + "learning_rate": 0.0009620886537311091, + "loss": 0.8852616, + "num_input_tokens_seen": 65186640, + "router_z_loss_mlp": 0.33081055, + "step": 785, + "time_per_iteration": 2.6153056621551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113163, + "balance_loss_mlp": 1.07685184, + "epoch": 0.15121200461716044, + "flos": 457520638464.0, + "grad_norm": 0.06793935281312648, + "language_loss": 0.85492945, + "learning_rate": 0.000961969566171244, + "loss": 0.86606109, + "num_input_tokens_seen": 65252112, + "router_z_loss_mlp": 0.36303711, + "step": 786, + "time_per_iteration": 2.506267786026001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122272, + "balance_loss_mlp": 1.08703363, + "epoch": 0.151404386302424, + "flos": 537729477120.0, + "grad_norm": 0.0670602351843582, + "language_loss": 0.90370345, + "learning_rate": 0.0009618502992588873, + "loss": 0.91492617, + "num_input_tokens_seen": 65318912, + "router_z_loss_mlp": 0.35253906, + "step": 787, + "time_per_iteration": 2.623457670211792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141844, + "balance_loss_mlp": 1.10658193, + "epoch": 0.15159676798768756, + "flos": 687858891264.0, + "grad_norm": 0.06543467559167064, + "language_loss": 0.88581872, + "learning_rate": 0.0009617308530403424, + "loss": 0.89723718, + "num_input_tokens_seen": 65395424, + "router_z_loss_mlp": 0.35302734, + "step": 788, + "time_per_iteration": 2.975861072540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149381, + "balance_loss_mlp": 1.11371326, + "epoch": 0.15178914967295112, + "flos": 545042193408.0, + "grad_norm": 0.059566397417978756, + "language_loss": 0.87806541, + "learning_rate": 0.0009616112275619825, + "loss": 0.88955921, + "num_input_tokens_seen": 65470480, + "router_z_loss_mlp": 0.35668945, + "step": 789, + "time_per_iteration": 2.683262348175049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152452, + "balance_loss_mlp": 1.1169517, + "epoch": 0.1519815313582147, + "flos": 511510671360.0, + "grad_norm": 0.05728483560240697, + "language_loss": 0.84466863, + "learning_rate": 0.0009614914228702503, + "loss": 0.85619313, + "num_input_tokens_seen": 65544720, + "router_z_loss_mlp": 0.35498047, + "step": 790, + "time_per_iteration": 2.6616339683532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142719, + "balance_loss_mlp": 1.10850596, + "epoch": 0.15217391304347827, + "flos": 683747122176.0, + "grad_norm": 0.057799273493116435, + "language_loss": 0.89279461, + "learning_rate": 0.0009613714390116581, + "loss": 0.90422177, + "num_input_tokens_seen": 65627872, + "router_z_loss_mlp": 0.34204102, + "step": 791, + "time_per_iteration": 2.947608470916748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133841, + "balance_loss_mlp": 1.0997231, + "epoch": 0.15236629472874183, + "flos": 643873342464.0, + "grad_norm": 0.06413295296627212, + "language_loss": 0.86589968, + "learning_rate": 0.0009612512760327879, + "loss": 0.87723809, + "num_input_tokens_seen": 65705264, + "router_z_loss_mlp": 0.34155273, + "step": 792, + "time_per_iteration": 2.8261189460754395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124449, + "balance_loss_mlp": 1.08727932, + "epoch": 0.1525586764140054, + "flos": 412654791168.0, + "grad_norm": 0.06095846853214657, + "language_loss": 0.85749042, + "learning_rate": 0.0009611309339802909, + "loss": 0.86873484, + "num_input_tokens_seen": 65768592, + "router_z_loss_mlp": 0.37182617, + "step": 793, + "time_per_iteration": 2.438474178314209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113041, + "balance_loss_mlp": 1.07811236, + "epoch": 0.15275105809926895, + "flos": 802444644864.0, + "grad_norm": 0.04691390558901254, + "language_loss": 0.84620011, + "learning_rate": 0.0009610104129008881, + "loss": 0.85733056, + "num_input_tokens_seen": 65852432, + "router_z_loss_mlp": 0.34985352, + "step": 794, + "time_per_iteration": 3.1149892807006836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092099, + "balance_loss_mlp": 1.05786228, + "epoch": 0.1529434397845325, + "flos": 612143115264.0, + "grad_norm": 0.06446455819394356, + "language_loss": 0.88995111, + "learning_rate": 0.0009608897128413701, + "loss": 0.90087205, + "num_input_tokens_seen": 65927904, + "router_z_loss_mlp": 0.3425293, + "step": 795, + "time_per_iteration": 2.7310965061187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085841, + "balance_loss_mlp": 1.05160367, + "epoch": 0.15313582146979607, + "flos": 614941009920.0, + "grad_norm": 0.04580320827636504, + "language_loss": 0.8595438, + "learning_rate": 0.0009607688338485965, + "loss": 0.87040222, + "num_input_tokens_seen": 66006800, + "router_z_loss_mlp": 0.3425293, + "step": 796, + "time_per_iteration": 2.8534584045410156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088916, + "balance_loss_mlp": 1.05427384, + "epoch": 0.15332820315505963, + "flos": 793256440320.0, + "grad_norm": 0.053101967265095064, + "language_loss": 0.91128695, + "learning_rate": 0.0009606477759694969, + "loss": 0.92217612, + "num_input_tokens_seen": 66088608, + "router_z_loss_mlp": 0.34643555, + "step": 797, + "time_per_iteration": 3.0544466972351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098724, + "balance_loss_mlp": 1.06441545, + "epoch": 0.1535205848403232, + "flos": 549945510912.0, + "grad_norm": 0.0662794157411924, + "language_loss": 0.87591946, + "learning_rate": 0.0009605265392510703, + "loss": 0.88690674, + "num_input_tokens_seen": 66153616, + "router_z_loss_mlp": 0.34350586, + "step": 798, + "time_per_iteration": 2.6120660305023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011091, + "balance_loss_mlp": 1.07417202, + "epoch": 0.15371296652558677, + "flos": 535691045376.0, + "grad_norm": 0.07220239734969772, + "language_loss": 0.92342889, + "learning_rate": 0.0009604051237403846, + "loss": 0.93451989, + "num_input_tokens_seen": 66219472, + "router_z_loss_mlp": 0.34960938, + "step": 799, + "time_per_iteration": 2.640749216079712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111728, + "balance_loss_mlp": 1.07808757, + "epoch": 0.15390534821085033, + "flos": 395002939392.0, + "grad_norm": 0.06314402273456009, + "language_loss": 0.86126584, + "learning_rate": 0.0009602835294845776, + "loss": 0.87238312, + "num_input_tokens_seen": 66281456, + "router_z_loss_mlp": 0.33666992, + "step": 800, + "time_per_iteration": 2.44914174079895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117351, + "balance_loss_mlp": 1.08254242, + "epoch": 0.1540977298961139, + "flos": 535587738624.0, + "grad_norm": 0.057636094576239, + "language_loss": 0.91100746, + "learning_rate": 0.0009601617565308565, + "loss": 0.92218101, + "num_input_tokens_seen": 66348160, + "router_z_loss_mlp": 0.34790039, + "step": 801, + "time_per_iteration": 2.599679470062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119062, + "balance_loss_mlp": 1.08511138, + "epoch": 0.15429011158137745, + "flos": 723388147200.0, + "grad_norm": 0.05961266019354579, + "language_loss": 0.86783326, + "learning_rate": 0.0009600398049264977, + "loss": 0.87902391, + "num_input_tokens_seen": 66430576, + "router_z_loss_mlp": 0.33935547, + "step": 802, + "time_per_iteration": 2.9514007568359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121787, + "balance_loss_mlp": 1.08735943, + "epoch": 0.154482493266641, + "flos": 620209502208.0, + "grad_norm": 0.06366105456569557, + "language_loss": 0.92098475, + "learning_rate": 0.0009599176747188469, + "loss": 0.9322027, + "num_input_tokens_seen": 66506480, + "router_z_loss_mlp": 0.34448242, + "step": 803, + "time_per_iteration": 2.8068411350250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114024, + "balance_loss_mlp": 1.08012128, + "epoch": 0.15467487495190457, + "flos": 525351909888.0, + "grad_norm": 0.08101366702111423, + "language_loss": 0.83651662, + "learning_rate": 0.0009597953659553196, + "loss": 0.84765685, + "num_input_tokens_seen": 66577680, + "router_z_loss_mlp": 0.33911133, + "step": 804, + "time_per_iteration": 2.7075448036193848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098377, + "balance_loss_mlp": 1.06616712, + "epoch": 0.15486725663716813, + "flos": 527473299456.0, + "grad_norm": 0.07377431927286832, + "language_loss": 0.89624304, + "learning_rate": 0.0009596728786833997, + "loss": 0.90722686, + "num_input_tokens_seen": 66648496, + "router_z_loss_mlp": 0.32202148, + "step": 805, + "time_per_iteration": 2.6376051902770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088085, + "balance_loss_mlp": 1.05420554, + "epoch": 0.1550596383224317, + "flos": 1048118482944.0, + "grad_norm": 0.06708822771662253, + "language_loss": 0.90018022, + "learning_rate": 0.0009595502129506415, + "loss": 0.91106105, + "num_input_tokens_seen": 66735216, + "router_z_loss_mlp": 0.33911133, + "step": 806, + "time_per_iteration": 3.3391284942626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092582, + "balance_loss_mlp": 1.05903625, + "epoch": 0.15525202000769528, + "flos": 613438050816.0, + "grad_norm": 0.06052700763637142, + "language_loss": 0.83084035, + "learning_rate": 0.0009594273688046678, + "loss": 0.84176612, + "num_input_tokens_seen": 66810672, + "router_z_loss_mlp": 0.33544922, + "step": 807, + "time_per_iteration": 2.7136006355285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088184, + "balance_loss_mlp": 1.05273128, + "epoch": 0.15544440169295884, + "flos": 532805810688.0, + "grad_norm": 0.07048562468234597, + "language_loss": 0.86048424, + "learning_rate": 0.000959304346293171, + "loss": 0.87136608, + "num_input_tokens_seen": 66879824, + "router_z_loss_mlp": 0.35473633, + "step": 808, + "time_per_iteration": 2.6744906902313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097573, + "balance_loss_mlp": 1.06254935, + "epoch": 0.1556367833782224, + "flos": 644433546240.0, + "grad_norm": 0.06803397985071584, + "language_loss": 0.88331544, + "learning_rate": 0.0009591811454639125, + "loss": 0.89429116, + "num_input_tokens_seen": 66949424, + "router_z_loss_mlp": 0.3503418, + "step": 809, + "time_per_iteration": 2.730431079864502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094476, + "balance_loss_mlp": 1.0610261, + "epoch": 0.15582916506348596, + "flos": 543540644352.0, + "grad_norm": 0.06204685505428811, + "language_loss": 0.88227659, + "learning_rate": 0.0009590577663647234, + "loss": 0.89322132, + "num_input_tokens_seen": 67024000, + "router_z_loss_mlp": 0.3347168, + "step": 810, + "time_per_iteration": 2.71469783782959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104684, + "balance_loss_mlp": 1.07078123, + "epoch": 0.15602154674874952, + "flos": 579740613120.0, + "grad_norm": 0.05672341894910533, + "language_loss": 0.86610442, + "learning_rate": 0.0009589342090435036, + "loss": 0.87715125, + "num_input_tokens_seen": 67100672, + "router_z_loss_mlp": 0.33935547, + "step": 811, + "time_per_iteration": 2.799246072769165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110387, + "balance_loss_mlp": 1.06918025, + "epoch": 0.15621392843401308, + "flos": 534982454784.0, + "grad_norm": 0.0647852675732537, + "language_loss": 0.87778354, + "learning_rate": 0.0009588104735482223, + "loss": 0.8888222, + "num_input_tokens_seen": 67171584, + "router_z_loss_mlp": 0.34692383, + "step": 812, + "time_per_iteration": 2.6684510707855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126921, + "balance_loss_mlp": 1.09106326, + "epoch": 0.15640631011927664, + "flos": 550635162624.0, + "grad_norm": 0.08222618986335321, + "language_loss": 0.84280443, + "learning_rate": 0.0009586865599269177, + "loss": 0.85407358, + "num_input_tokens_seen": 67240640, + "router_z_loss_mlp": 0.35864258, + "step": 813, + "time_per_iteration": 2.6293816566467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131277, + "balance_loss_mlp": 1.09651566, + "epoch": 0.1565986918045402, + "flos": 637190641152.0, + "grad_norm": 0.05945515562529824, + "language_loss": 0.88725412, + "learning_rate": 0.0009585624682276977, + "loss": 0.89856684, + "num_input_tokens_seen": 67312976, + "router_z_loss_mlp": 0.34814453, + "step": 814, + "time_per_iteration": 2.744253158569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137563, + "balance_loss_mlp": 1.10113239, + "epoch": 0.15679107348980378, + "flos": 490569122304.0, + "grad_norm": 0.09591637295165127, + "language_loss": 0.87945771, + "learning_rate": 0.0009584381984987386, + "loss": 0.89083332, + "num_input_tokens_seen": 67378528, + "router_z_loss_mlp": 0.36474609, + "step": 815, + "time_per_iteration": 2.5264036655426025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124613, + "balance_loss_mlp": 1.0911386, + "epoch": 0.15698345517506734, + "flos": 529689231360.0, + "grad_norm": 0.05838460881618622, + "language_loss": 0.90277314, + "learning_rate": 0.0009583137507882864, + "loss": 0.91401929, + "num_input_tokens_seen": 67449728, + "router_z_loss_mlp": 0.3347168, + "step": 816, + "time_per_iteration": 2.6488330364227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109898, + "balance_loss_mlp": 1.07418323, + "epoch": 0.1571758368603309, + "flos": 545779897344.0, + "grad_norm": 0.07313796537718548, + "language_loss": 0.81262791, + "learning_rate": 0.000958189125144656, + "loss": 0.82372689, + "num_input_tokens_seen": 67520512, + "router_z_loss_mlp": 0.35766602, + "step": 817, + "time_per_iteration": 2.7040657997131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101746, + "balance_loss_mlp": 1.06672239, + "epoch": 0.15736821854559446, + "flos": 565377048576.0, + "grad_norm": 0.067694528538076, + "language_loss": 0.88558215, + "learning_rate": 0.0009580643216162313, + "loss": 0.89659959, + "num_input_tokens_seen": 67592464, + "router_z_loss_mlp": 0.3503418, + "step": 818, + "time_per_iteration": 2.6538634300231934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096156, + "balance_loss_mlp": 1.06110835, + "epoch": 0.15756060023085802, + "flos": 500707436544.0, + "grad_norm": 0.05957146674366314, + "language_loss": 0.79884583, + "learning_rate": 0.0009579393402514652, + "loss": 0.80980736, + "num_input_tokens_seen": 67658928, + "router_z_loss_mlp": 0.35107422, + "step": 819, + "time_per_iteration": 2.5606625080108643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082975, + "balance_loss_mlp": 1.048738, + "epoch": 0.15775298191612158, + "flos": 519014034432.0, + "grad_norm": 0.06194437160070725, + "language_loss": 0.91126758, + "learning_rate": 0.0009578141810988801, + "loss": 0.92209733, + "num_input_tokens_seen": 67727936, + "router_z_loss_mlp": 0.34228516, + "step": 820, + "time_per_iteration": 2.55538010597229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082194, + "balance_loss_mlp": 1.04712272, + "epoch": 0.15794536360138514, + "flos": 465891153408.0, + "grad_norm": 0.060184436438788555, + "language_loss": 0.91010749, + "learning_rate": 0.0009576888442070668, + "loss": 0.92092943, + "num_input_tokens_seen": 67795488, + "router_z_loss_mlp": 0.35083008, + "step": 821, + "time_per_iteration": 2.6139276027679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094225, + "balance_loss_mlp": 1.05982161, + "epoch": 0.1581377452866487, + "flos": 516911583744.0, + "grad_norm": 0.06832586535724347, + "language_loss": 0.92820144, + "learning_rate": 0.0009575633296246854, + "loss": 0.93914366, + "num_input_tokens_seen": 67858896, + "router_z_loss_mlp": 0.34423828, + "step": 822, + "time_per_iteration": 2.557404041290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096972, + "balance_loss_mlp": 1.06242526, + "epoch": 0.15833012697191226, + "flos": 549522109440.0, + "grad_norm": 0.06257557491721027, + "language_loss": 0.83520567, + "learning_rate": 0.0009574376374004652, + "loss": 0.84617537, + "num_input_tokens_seen": 67924864, + "router_z_loss_mlp": 0.34570312, + "step": 823, + "time_per_iteration": 2.673220157623291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100077, + "balance_loss_mlp": 1.06395626, + "epoch": 0.15852250865717585, + "flos": 487206641664.0, + "grad_norm": 0.07116075590187526, + "language_loss": 0.81073487, + "learning_rate": 0.000957311767583204, + "loss": 0.82173562, + "num_input_tokens_seen": 67992912, + "router_z_loss_mlp": 0.36132812, + "step": 824, + "time_per_iteration": 2.605074882507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159484, + "balance_loss_mlp": 1.14126849, + "epoch": 0.1587148903424394, + "flos": 1309041672192.0, + "grad_norm": 0.051809649393169656, + "language_loss": 0.8207159, + "learning_rate": 0.0009571857202217691, + "loss": 0.83231074, + "num_input_tokens_seen": 68207408, + "router_z_loss_mlp": 0.18261719, + "step": 825, + "time_per_iteration": 4.726073265075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111019, + "balance_loss_mlp": 1.07349157, + "epoch": 0.15890727202770297, + "flos": 466634649600.0, + "grad_norm": 0.07947222616221912, + "language_loss": 0.92132723, + "learning_rate": 0.0009570594953650961, + "loss": 0.93243748, + "num_input_tokens_seen": 68270864, + "router_z_loss_mlp": 0.37524414, + "step": 826, + "time_per_iteration": 2.5146830081939697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109067, + "balance_loss_mlp": 1.07208848, + "epoch": 0.15909965371296653, + "flos": 776733608448.0, + "grad_norm": 0.06013225990958685, + "language_loss": 0.80852252, + "learning_rate": 0.00095693309306219, + "loss": 0.81961316, + "num_input_tokens_seen": 68355408, + "router_z_loss_mlp": 0.36962891, + "step": 827, + "time_per_iteration": 3.095632553100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117102, + "balance_loss_mlp": 1.07945621, + "epoch": 0.1592920353982301, + "flos": 1077852538368.0, + "grad_norm": 0.05984978885312211, + "language_loss": 0.88600951, + "learning_rate": 0.0009568065133621244, + "loss": 0.89718056, + "num_input_tokens_seen": 68437072, + "router_z_loss_mlp": 0.37646484, + "step": 828, + "time_per_iteration": 3.3153574466705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111269, + "balance_loss_mlp": 1.07584, + "epoch": 0.15948441708349365, + "flos": 725307305472.0, + "grad_norm": 0.0632864692280333, + "language_loss": 0.85493571, + "learning_rate": 0.0009566797563140422, + "loss": 0.86604846, + "num_input_tokens_seen": 68511696, + "router_z_loss_mlp": 0.35449219, + "step": 829, + "time_per_iteration": 2.8705785274505615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121543, + "balance_loss_mlp": 1.08470702, + "epoch": 0.1596767987687572, + "flos": 578447087616.0, + "grad_norm": 0.06433687205870958, + "language_loss": 0.88630873, + "learning_rate": 0.0009565528219671547, + "loss": 0.89752412, + "num_input_tokens_seen": 68587488, + "router_z_loss_mlp": 0.36816406, + "step": 830, + "time_per_iteration": 2.8890771865844727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137333, + "balance_loss_mlp": 1.10049748, + "epoch": 0.15986918045402077, + "flos": 528728947200.0, + "grad_norm": 0.04994246668943954, + "language_loss": 0.85232639, + "learning_rate": 0.0009564257103707418, + "loss": 0.86369967, + "num_input_tokens_seen": 68655760, + "router_z_loss_mlp": 0.36816406, + "step": 831, + "time_per_iteration": 2.5870308876037598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133852, + "balance_loss_mlp": 1.09632492, + "epoch": 0.16006156213928435, + "flos": 574313559552.0, + "grad_norm": 0.0648316290803925, + "language_loss": 0.91675746, + "learning_rate": 0.0009562984215741533, + "loss": 0.92809594, + "num_input_tokens_seen": 68724560, + "router_z_loss_mlp": 0.37524414, + "step": 832, + "time_per_iteration": 2.655066967010498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117496, + "balance_loss_mlp": 1.08170903, + "epoch": 0.1602539438245479, + "flos": 515258675712.0, + "grad_norm": 0.14271195523272245, + "language_loss": 0.82911491, + "learning_rate": 0.0009561709556268065, + "loss": 0.84028995, + "num_input_tokens_seen": 68795440, + "router_z_loss_mlp": 0.35839844, + "step": 833, + "time_per_iteration": 2.69999098777771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119914, + "balance_loss_mlp": 1.08419931, + "epoch": 0.16044632550981147, + "flos": 620730418176.0, + "grad_norm": 0.05962773238435596, + "language_loss": 0.95060706, + "learning_rate": 0.0009560433125781884, + "loss": 0.96180618, + "num_input_tokens_seen": 68868176, + "router_z_loss_mlp": 0.35693359, + "step": 834, + "time_per_iteration": 2.711109161376953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126421, + "balance_loss_mlp": 1.08977628, + "epoch": 0.16063870719507503, + "flos": 560817146880.0, + "grad_norm": 0.06388697234939344, + "language_loss": 0.92829657, + "learning_rate": 0.0009559154924778544, + "loss": 0.93956077, + "num_input_tokens_seen": 68939616, + "router_z_loss_mlp": 0.36621094, + "step": 835, + "time_per_iteration": 2.695260763168335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121916, + "balance_loss_mlp": 1.08789361, + "epoch": 0.1608310888803386, + "flos": 804778440192.0, + "grad_norm": 0.05750453212643973, + "language_loss": 0.85217482, + "learning_rate": 0.0009557874953754284, + "loss": 0.86339402, + "num_input_tokens_seen": 69016192, + "router_z_loss_mlp": 0.34057617, + "step": 836, + "time_per_iteration": 3.002013921737671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126166, + "balance_loss_mlp": 1.09204817, + "epoch": 0.16102347056560215, + "flos": 600311195136.0, + "grad_norm": 0.06332628409766573, + "language_loss": 0.84060842, + "learning_rate": 0.0009556593213206038, + "loss": 0.85187006, + "num_input_tokens_seen": 69089360, + "router_z_loss_mlp": 0.34130859, + "step": 837, + "time_per_iteration": 2.698716163635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125003, + "balance_loss_mlp": 1.09102869, + "epoch": 0.1612158522508657, + "flos": 553235208192.0, + "grad_norm": 0.07524747482874264, + "language_loss": 0.87844718, + "learning_rate": 0.0009555309703631414, + "loss": 0.88969719, + "num_input_tokens_seen": 69161952, + "router_z_loss_mlp": 0.33984375, + "step": 838, + "time_per_iteration": 2.669588327407837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133813, + "balance_loss_mlp": 1.09752607, + "epoch": 0.16140823393612927, + "flos": 555701423616.0, + "grad_norm": 0.07144746672945328, + "language_loss": 0.87685311, + "learning_rate": 0.0009554024425528722, + "loss": 0.88819122, + "num_input_tokens_seen": 69232432, + "router_z_loss_mlp": 0.36279297, + "step": 839, + "time_per_iteration": 2.6809709072113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112028, + "balance_loss_mlp": 1.08737814, + "epoch": 0.16160061562139286, + "flos": 543613427712.0, + "grad_norm": 0.06970106087394082, + "language_loss": 0.8929134, + "learning_rate": 0.0009552737379396948, + "loss": 0.90411627, + "num_input_tokens_seen": 69297696, + "router_z_loss_mlp": 0.32885742, + "step": 840, + "time_per_iteration": 2.6100995540618896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102587, + "balance_loss_mlp": 1.06920815, + "epoch": 0.16179299730665642, + "flos": 603590717952.0, + "grad_norm": 0.06131687325166246, + "language_loss": 0.87945604, + "learning_rate": 0.0009551448565735767, + "loss": 0.89048195, + "num_input_tokens_seen": 69373888, + "router_z_loss_mlp": 0.33398438, + "step": 841, + "time_per_iteration": 2.7360360622406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095736, + "balance_loss_mlp": 1.06168985, + "epoch": 0.16198537899191998, + "flos": 786821050368.0, + "grad_norm": 0.07162496841720159, + "language_loss": 0.8519845, + "learning_rate": 0.0009550157985045543, + "loss": 0.86294186, + "num_input_tokens_seen": 69449984, + "router_z_loss_mlp": 0.34082031, + "step": 842, + "time_per_iteration": 3.0436456203460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087632, + "balance_loss_mlp": 1.05413389, + "epoch": 0.16217776067718354, + "flos": 519550917120.0, + "grad_norm": 0.060562390499230526, + "language_loss": 0.89622426, + "learning_rate": 0.0009548865637827321, + "loss": 0.90710062, + "num_input_tokens_seen": 69522736, + "router_z_loss_mlp": 0.33496094, + "step": 843, + "time_per_iteration": 2.6422221660614014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086448, + "balance_loss_mlp": 1.05342698, + "epoch": 0.1623701423624471, + "flos": 505015644672.0, + "grad_norm": 0.07097995853224412, + "language_loss": 0.90216166, + "learning_rate": 0.0009547571524582838, + "loss": 0.91302609, + "num_input_tokens_seen": 69587184, + "router_z_loss_mlp": 0.33032227, + "step": 844, + "time_per_iteration": 2.6082894802093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095057, + "balance_loss_mlp": 1.06031966, + "epoch": 0.16256252404771065, + "flos": 496940493312.0, + "grad_norm": 0.06932052947515681, + "language_loss": 0.92511153, + "learning_rate": 0.0009546275645814512, + "loss": 0.9360621, + "num_input_tokens_seen": 69656560, + "router_z_loss_mlp": 0.34765625, + "step": 845, + "time_per_iteration": 2.5985872745513916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100013, + "balance_loss_mlp": 1.065418, + "epoch": 0.16275490573297421, + "flos": 502110061056.0, + "grad_norm": 0.07540183512891604, + "language_loss": 0.90294898, + "learning_rate": 0.0009544978002025446, + "loss": 0.91394913, + "num_input_tokens_seen": 69723872, + "router_z_loss_mlp": 0.34619141, + "step": 846, + "time_per_iteration": 2.5778391361236572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096289, + "balance_loss_mlp": 1.06174231, + "epoch": 0.16294728741823777, + "flos": 506952179712.0, + "grad_norm": 0.06018935314915502, + "language_loss": 0.87532055, + "learning_rate": 0.0009543678593719434, + "loss": 0.8862834, + "num_input_tokens_seen": 69795504, + "router_z_loss_mlp": 0.34570312, + "step": 847, + "time_per_iteration": 2.697566270828247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102624, + "balance_loss_mlp": 1.06824434, + "epoch": 0.16313966910350133, + "flos": 509418395136.0, + "grad_norm": 0.054217985504269955, + "language_loss": 0.8754853, + "learning_rate": 0.0009542377421400945, + "loss": 0.88651162, + "num_input_tokens_seen": 69873408, + "router_z_loss_mlp": 0.34375, + "step": 848, + "time_per_iteration": 2.786766290664673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104457, + "balance_loss_mlp": 1.06847942, + "epoch": 0.16333205078876492, + "flos": 543712352256.0, + "grad_norm": 0.06122856356214084, + "language_loss": 0.83524954, + "learning_rate": 0.0009541074485575145, + "loss": 0.84629411, + "num_input_tokens_seen": 69944112, + "router_z_loss_mlp": 0.35986328, + "step": 849, + "time_per_iteration": 2.713759183883667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098701, + "balance_loss_mlp": 1.06346297, + "epoch": 0.16352443247402848, + "flos": 507477477888.0, + "grad_norm": 0.06331477383231503, + "language_loss": 0.92240757, + "learning_rate": 0.0009539769786747874, + "loss": 0.93339461, + "num_input_tokens_seen": 70012288, + "router_z_loss_mlp": 0.35253906, + "step": 850, + "time_per_iteration": 2.589945077896118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100894, + "balance_loss_mlp": 1.06584692, + "epoch": 0.16371681415929204, + "flos": 541851420672.0, + "grad_norm": 0.06704648725492578, + "language_loss": 0.81567919, + "learning_rate": 0.0009538463325425665, + "loss": 0.82668811, + "num_input_tokens_seen": 70086560, + "router_z_loss_mlp": 0.35083008, + "step": 851, + "time_per_iteration": 2.6779844760894775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105544, + "balance_loss_mlp": 1.07042515, + "epoch": 0.1639091958445556, + "flos": 520501026816.0, + "grad_norm": 0.058426853420895056, + "language_loss": 0.8673842, + "learning_rate": 0.0009537155102115728, + "loss": 0.87843966, + "num_input_tokens_seen": 70153968, + "router_z_loss_mlp": 0.35131836, + "step": 852, + "time_per_iteration": 2.5614206790924072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106136, + "balance_loss_mlp": 1.07175565, + "epoch": 0.16410157752981916, + "flos": 547149026304.0, + "grad_norm": 0.06460558975646845, + "language_loss": 0.83482397, + "learning_rate": 0.0009535845117325961, + "loss": 0.84588534, + "num_input_tokens_seen": 70222496, + "router_z_loss_mlp": 0.34423828, + "step": 853, + "time_per_iteration": 2.6453073024749756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098632, + "balance_loss_mlp": 1.06470513, + "epoch": 0.16429395921508272, + "flos": 582561828864.0, + "grad_norm": 0.052152281018199936, + "language_loss": 0.93584174, + "learning_rate": 0.0009534533371564946, + "loss": 0.94682807, + "num_input_tokens_seen": 70301680, + "router_z_loss_mlp": 0.33959961, + "step": 854, + "time_per_iteration": 2.75186824798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111206, + "balance_loss_mlp": 1.07670665, + "epoch": 0.16448634090034628, + "flos": 530678628864.0, + "grad_norm": 0.06475772966833339, + "language_loss": 0.8907218, + "learning_rate": 0.0009533219865341949, + "loss": 0.90183383, + "num_input_tokens_seen": 70371152, + "router_z_loss_mlp": 0.3449707, + "step": 855, + "time_per_iteration": 2.581479787826538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108852, + "balance_loss_mlp": 1.07285094, + "epoch": 0.16467872258560984, + "flos": 491623948800.0, + "grad_norm": 0.06378602693040462, + "language_loss": 0.87287533, + "learning_rate": 0.0009531904599166916, + "loss": 0.88396388, + "num_input_tokens_seen": 70440832, + "router_z_loss_mlp": 0.36035156, + "step": 856, + "time_per_iteration": 2.6429831981658936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107008, + "balance_loss_mlp": 1.07141232, + "epoch": 0.16487110427087343, + "flos": 506015216640.0, + "grad_norm": 0.07162133431974482, + "language_loss": 0.85139728, + "learning_rate": 0.0009530587573550478, + "loss": 0.86246729, + "num_input_tokens_seen": 70507424, + "router_z_loss_mlp": 0.35620117, + "step": 857, + "time_per_iteration": 2.5667338371276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071319, + "balance_loss_mlp": 1.05434394, + "epoch": 0.16506348595613698, + "flos": 1432006553088.0, + "grad_norm": 0.02717136097410494, + "language_loss": 0.74319386, + "learning_rate": 0.0009529268789003953, + "loss": 0.75390708, + "num_input_tokens_seen": 70742320, + "router_z_loss_mlp": 0.16992188, + "step": 858, + "time_per_iteration": 5.02930474281311 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107487, + "balance_loss_mlp": 1.0740366, + "epoch": 0.16525586764140054, + "flos": 476890827264.0, + "grad_norm": 0.06438670275364486, + "language_loss": 0.90481895, + "learning_rate": 0.0009527948246039337, + "loss": 0.91589379, + "num_input_tokens_seen": 70808400, + "router_z_loss_mlp": 0.33447266, + "step": 859, + "time_per_iteration": 2.5222055912017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109782, + "balance_loss_mlp": 1.07618856, + "epoch": 0.1654482493266641, + "flos": 880737297408.0, + "grad_norm": 0.058857893791213665, + "language_loss": 0.88361865, + "learning_rate": 0.000952662594516931, + "loss": 0.8947165, + "num_input_tokens_seen": 70886192, + "router_z_loss_mlp": 0.33618164, + "step": 860, + "time_per_iteration": 3.065053701400757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109979, + "balance_loss_mlp": 1.07497942, + "epoch": 0.16564063101192766, + "flos": 626527028736.0, + "grad_norm": 0.058557043780191484, + "language_loss": 0.86803752, + "learning_rate": 0.0009525301886907234, + "loss": 0.87913728, + "num_input_tokens_seen": 70964816, + "router_z_loss_mlp": 0.34985352, + "step": 861, + "time_per_iteration": 2.873415470123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121945, + "balance_loss_mlp": 1.08537149, + "epoch": 0.16583301269719122, + "flos": 561250722816.0, + "grad_norm": 0.0761086770239273, + "language_loss": 0.8825953, + "learning_rate": 0.0009523976071767155, + "loss": 0.8938148, + "num_input_tokens_seen": 71037456, + "router_z_loss_mlp": 0.36572266, + "step": 862, + "time_per_iteration": 2.71508526802063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115561, + "balance_loss_mlp": 1.07994115, + "epoch": 0.16602539438245478, + "flos": 567510022656.0, + "grad_norm": 0.05388299317844869, + "language_loss": 0.88433009, + "learning_rate": 0.00095226485002638, + "loss": 0.8954857, + "num_input_tokens_seen": 71111872, + "router_z_loss_mlp": 0.35620117, + "step": 863, + "time_per_iteration": 2.7524497509002686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111173, + "balance_loss_mlp": 1.07617354, + "epoch": 0.16621777606771834, + "flos": 574589984256.0, + "grad_norm": 0.05833582522103205, + "language_loss": 0.89311475, + "learning_rate": 0.0009521319172912576, + "loss": 0.90422642, + "num_input_tokens_seen": 71187808, + "router_z_loss_mlp": 0.35009766, + "step": 864, + "time_per_iteration": 2.717493772506714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134798, + "balance_loss_mlp": 1.09846306, + "epoch": 0.16641015775298193, + "flos": 514292599296.0, + "grad_norm": 0.05644176285984134, + "language_loss": 0.94990546, + "learning_rate": 0.0009519988090229579, + "loss": 0.96125346, + "num_input_tokens_seen": 71261728, + "router_z_loss_mlp": 0.36352539, + "step": 865, + "time_per_iteration": 2.6850624084472656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133116, + "balance_loss_mlp": 1.09668565, + "epoch": 0.1666025394382455, + "flos": 621395338752.0, + "grad_norm": 0.05816643645022503, + "language_loss": 0.88684535, + "learning_rate": 0.0009518655252731576, + "loss": 0.89817655, + "num_input_tokens_seen": 71338352, + "router_z_loss_mlp": 0.36450195, + "step": 866, + "time_per_iteration": 2.7240021228790283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124082, + "balance_loss_mlp": 1.0882715, + "epoch": 0.16679492112350905, + "flos": 548528329728.0, + "grad_norm": 0.06128727898968579, + "language_loss": 0.9070124, + "learning_rate": 0.0009517320660936022, + "loss": 0.91825324, + "num_input_tokens_seen": 71416544, + "router_z_loss_mlp": 0.35839844, + "step": 867, + "time_per_iteration": 2.6959731578826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118134, + "balance_loss_mlp": 1.08260965, + "epoch": 0.1669873028087726, + "flos": 665379477504.0, + "grad_norm": 0.05857722537468161, + "language_loss": 0.83557463, + "learning_rate": 0.0009515984315361051, + "loss": 0.84675598, + "num_input_tokens_seen": 71494080, + "router_z_loss_mlp": 0.35546875, + "step": 868, + "time_per_iteration": 2.813674211502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122458, + "balance_loss_mlp": 1.08638549, + "epoch": 0.16717968449403617, + "flos": 538305647616.0, + "grad_norm": 0.06553445455365839, + "language_loss": 0.87103701, + "learning_rate": 0.000951464621652548, + "loss": 0.88226151, + "num_input_tokens_seen": 71562672, + "router_z_loss_mlp": 0.36083984, + "step": 869, + "time_per_iteration": 2.674333333969116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111253, + "balance_loss_mlp": 1.07757819, + "epoch": 0.16737206617929973, + "flos": 529833235968.0, + "grad_norm": 0.059309523866322815, + "language_loss": 0.78951609, + "learning_rate": 0.0009513306364948804, + "loss": 0.80064136, + "num_input_tokens_seen": 71641904, + "router_z_loss_mlp": 0.34985352, + "step": 870, + "time_per_iteration": 2.7519431114196777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011143, + "balance_loss_mlp": 1.07953846, + "epoch": 0.1675644478645633, + "flos": 480529732608.0, + "grad_norm": 0.06711563999134491, + "language_loss": 0.89559376, + "learning_rate": 0.0009511964761151197, + "loss": 0.90673673, + "num_input_tokens_seen": 71709616, + "router_z_loss_mlp": 0.34814453, + "step": 871, + "time_per_iteration": 2.544520854949951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113298, + "balance_loss_mlp": 1.07820272, + "epoch": 0.16775682954982685, + "flos": 494311334400.0, + "grad_norm": 0.06484202096701225, + "language_loss": 0.9050945, + "learning_rate": 0.0009510621405653521, + "loss": 0.91622752, + "num_input_tokens_seen": 71776592, + "router_z_loss_mlp": 0.35131836, + "step": 872, + "time_per_iteration": 2.594224452972412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106918, + "balance_loss_mlp": 1.07265687, + "epoch": 0.1679492112350904, + "flos": 751694846976.0, + "grad_norm": 0.060317450015561574, + "language_loss": 0.847211, + "learning_rate": 0.0009509276298977309, + "loss": 0.85828018, + "num_input_tokens_seen": 71856352, + "router_z_loss_mlp": 0.34277344, + "step": 873, + "time_per_iteration": 2.9428915977478027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110568, + "balance_loss_mlp": 1.07261181, + "epoch": 0.168141592920354, + "flos": 1135413075456.0, + "grad_norm": 0.05441785661992682, + "language_loss": 0.81867516, + "learning_rate": 0.0009507929441644778, + "loss": 0.82978088, + "num_input_tokens_seen": 71948480, + "router_z_loss_mlp": 0.37939453, + "step": 874, + "time_per_iteration": 3.52008318901062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101336, + "balance_loss_mlp": 1.06640816, + "epoch": 0.16833397460561755, + "flos": 632114205696.0, + "grad_norm": 0.06557720885733571, + "language_loss": 0.86201179, + "learning_rate": 0.0009506580834178826, + "loss": 0.87302518, + "num_input_tokens_seen": 72019200, + "router_z_loss_mlp": 0.34936523, + "step": 875, + "time_per_iteration": 2.7744014263153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110472, + "balance_loss_mlp": 1.06817079, + "epoch": 0.1685263562908811, + "flos": 541171943424.0, + "grad_norm": 0.06007828909359903, + "language_loss": 0.91612709, + "learning_rate": 0.0009505230477103028, + "loss": 0.92717427, + "num_input_tokens_seen": 72088672, + "router_z_loss_mlp": 0.36547852, + "step": 876, + "time_per_iteration": 2.6593635082244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103279, + "balance_loss_mlp": 1.06703997, + "epoch": 0.16871873797614467, + "flos": 619036812288.0, + "grad_norm": 0.08702038824672748, + "language_loss": 0.81312418, + "learning_rate": 0.0009503878370941641, + "loss": 0.824157, + "num_input_tokens_seen": 72159952, + "router_z_loss_mlp": 0.36206055, + "step": 877, + "time_per_iteration": 2.7511024475097656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094199, + "balance_loss_mlp": 1.05986643, + "epoch": 0.16891111966140823, + "flos": 606067107840.0, + "grad_norm": 0.06953183101172467, + "language_loss": 0.88841844, + "learning_rate": 0.0009502524516219595, + "loss": 0.89936042, + "num_input_tokens_seen": 72231648, + "router_z_loss_mlp": 0.34375, + "step": 878, + "time_per_iteration": 2.697455406188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091575, + "balance_loss_mlp": 1.05757689, + "epoch": 0.1691035013466718, + "flos": 552058136064.0, + "grad_norm": 0.0721678347454753, + "language_loss": 0.89980447, + "learning_rate": 0.0009501168913462506, + "loss": 0.91072023, + "num_input_tokens_seen": 72298608, + "router_z_loss_mlp": 0.34008789, + "step": 879, + "time_per_iteration": 2.6825287342071533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080465, + "balance_loss_mlp": 1.06263125, + "epoch": 0.16929588303193535, + "flos": 1475544121344.0, + "grad_norm": 0.044515803528062385, + "language_loss": 0.79121923, + "learning_rate": 0.0009499811563196665, + "loss": 0.80202389, + "num_input_tokens_seen": 72525312, + "router_z_loss_mlp": 0.17871094, + "step": 880, + "time_per_iteration": 4.825777769088745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081319, + "balance_loss_mlp": 1.0464623, + "epoch": 0.1694882647171989, + "flos": 925850456064.0, + "grad_norm": 0.06491790696384477, + "language_loss": 0.85360616, + "learning_rate": 0.0009498452465949042, + "loss": 0.86441934, + "num_input_tokens_seen": 72612976, + "router_z_loss_mlp": 0.34887695, + "step": 881, + "time_per_iteration": 3.2700376510620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086319, + "balance_loss_mlp": 1.05227244, + "epoch": 0.1696806464024625, + "flos": 545829359616.0, + "grad_norm": 0.057533624801199786, + "language_loss": 0.916857, + "learning_rate": 0.0009497091622247285, + "loss": 0.92772019, + "num_input_tokens_seen": 72686800, + "router_z_loss_mlp": 0.34082031, + "step": 882, + "time_per_iteration": 2.711721181869507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085559, + "balance_loss_mlp": 1.05184698, + "epoch": 0.16987302808772606, + "flos": 528970466304.0, + "grad_norm": 0.08384615451013337, + "language_loss": 0.93744707, + "learning_rate": 0.0009495729032619723, + "loss": 0.94830269, + "num_input_tokens_seen": 72759360, + "router_z_loss_mlp": 0.33740234, + "step": 883, + "time_per_iteration": 2.688525438308716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084621, + "balance_loss_mlp": 1.05062199, + "epoch": 0.17006540977298962, + "flos": 754855096320.0, + "grad_norm": 0.06073677328113264, + "language_loss": 0.84419179, + "learning_rate": 0.0009494364697595354, + "loss": 0.85503805, + "num_input_tokens_seen": 72831424, + "router_z_loss_mlp": 0.34033203, + "step": 884, + "time_per_iteration": 2.9112000465393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092434, + "balance_loss_mlp": 1.05750597, + "epoch": 0.17025779145825318, + "flos": 558532813824.0, + "grad_norm": 0.06728326387015754, + "language_loss": 0.89818925, + "learning_rate": 0.0009492998617703867, + "loss": 0.90911365, + "num_input_tokens_seen": 72901536, + "router_z_loss_mlp": 0.34936523, + "step": 885, + "time_per_iteration": 2.6760926246643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093981, + "balance_loss_mlp": 1.06045985, + "epoch": 0.17045017314351674, + "flos": 511963186176.0, + "grad_norm": 0.0687386743468794, + "language_loss": 0.87971282, + "learning_rate": 0.0009491630793475619, + "loss": 0.89065266, + "num_input_tokens_seen": 72970480, + "router_z_loss_mlp": 0.33520508, + "step": 886, + "time_per_iteration": 2.59726619720459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094476, + "balance_loss_mlp": 1.06011951, + "epoch": 0.1706425548287803, + "flos": 508674898944.0, + "grad_norm": 0.058204707286146434, + "language_loss": 0.85722017, + "learning_rate": 0.0009490261225441643, + "loss": 0.8681649, + "num_input_tokens_seen": 73053376, + "router_z_loss_mlp": 0.34350586, + "step": 887, + "time_per_iteration": 2.900501012802124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087689, + "balance_loss_mlp": 1.0545013, + "epoch": 0.17083493651404386, + "flos": 717016776192.0, + "grad_norm": 0.05310353290702558, + "language_loss": 0.90775931, + "learning_rate": 0.0009488889914133656, + "loss": 0.9186362, + "num_input_tokens_seen": 73136032, + "router_z_loss_mlp": 0.33203125, + "step": 888, + "time_per_iteration": 2.992532968521118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089684, + "balance_loss_mlp": 1.05520868, + "epoch": 0.17102731819930742, + "flos": 558852908544.0, + "grad_norm": 0.047287767355612194, + "language_loss": 0.88680297, + "learning_rate": 0.0009487516860084047, + "loss": 0.89769983, + "num_input_tokens_seen": 73208544, + "router_z_loss_mlp": 0.34472656, + "step": 889, + "time_per_iteration": 2.7029643058776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082798, + "balance_loss_mlp": 1.04858518, + "epoch": 0.17121969988457098, + "flos": 494542679040.0, + "grad_norm": 0.0765590367769256, + "language_loss": 0.88680983, + "learning_rate": 0.0009486142063825884, + "loss": 0.89763772, + "num_input_tokens_seen": 73274336, + "router_z_loss_mlp": 0.34228516, + "step": 890, + "time_per_iteration": 2.5640931129455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038716, + "balance_loss_mlp": 1.02402985, + "epoch": 0.17141208156983456, + "flos": 1548088063488.0, + "grad_norm": 0.02832238153451814, + "language_loss": 0.72426212, + "learning_rate": 0.0009484765525892909, + "loss": 0.7346493, + "num_input_tokens_seen": 73506320, + "router_z_loss_mlp": 0.14648438, + "step": 891, + "time_per_iteration": 4.979609251022339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108867, + "balance_loss_mlp": 1.0540278, + "epoch": 0.17160446325509812, + "flos": 619282713600.0, + "grad_norm": 0.06449268303648867, + "language_loss": 0.90758598, + "learning_rate": 0.0009483387246819542, + "loss": 0.91847265, + "num_input_tokens_seen": 73578048, + "router_z_loss_mlp": 0.34667969, + "step": 892, + "time_per_iteration": 2.731332540512085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032556, + "balance_loss_mlp": 1.01767898, + "epoch": 0.17179684494036168, + "flos": 1381026972672.0, + "grad_norm": 0.016720826063608682, + "language_loss": 0.82285583, + "learning_rate": 0.0009482007227140877, + "loss": 0.83318138, + "num_input_tokens_seen": 73798640, + "router_z_loss_mlp": 0.1484375, + "step": 893, + "time_per_iteration": 4.641844987869263 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097484, + "balance_loss_mlp": 1.06386662, + "epoch": 0.17198922662562524, + "flos": 492386383872.0, + "grad_norm": 0.05711411270468228, + "language_loss": 0.89587665, + "learning_rate": 0.0009480625467392688, + "loss": 0.90685147, + "num_input_tokens_seen": 73867328, + "router_z_loss_mlp": 0.33642578, + "step": 894, + "time_per_iteration": 2.6310250759124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033123, + "balance_loss_mlp": 1.01795936, + "epoch": 0.1721816083108888, + "flos": 1457529914880.0, + "grad_norm": 0.013728573618451478, + "language_loss": 0.77994668, + "learning_rate": 0.0009479241968111421, + "loss": 0.79027796, + "num_input_tokens_seen": 74093376, + "router_z_loss_mlp": 0.15136719, + "step": 895, + "time_per_iteration": 4.7525646686553955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132534, + "balance_loss_mlp": 1.09834456, + "epoch": 0.17237398999615236, + "flos": 527853030912.0, + "grad_norm": 0.05821127752563967, + "language_loss": 0.87793648, + "learning_rate": 0.0009477856729834196, + "loss": 0.88926184, + "num_input_tokens_seen": 74169136, + "router_z_loss_mlp": 0.34228516, + "step": 896, + "time_per_iteration": 2.7015438079833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132108, + "balance_loss_mlp": 1.09901524, + "epoch": 0.17256637168141592, + "flos": 603644562432.0, + "grad_norm": 0.08337200045302615, + "language_loss": 0.9056648, + "learning_rate": 0.0009476469753098809, + "loss": 0.91698587, + "num_input_tokens_seen": 74236912, + "router_z_loss_mlp": 0.33105469, + "step": 897, + "time_per_iteration": 2.7035813331604004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125108, + "balance_loss_mlp": 1.08922589, + "epoch": 0.17275875336667948, + "flos": 509437334016.0, + "grad_norm": 0.05742024530278536, + "language_loss": 0.874506, + "learning_rate": 0.0009475081038443738, + "loss": 0.88575709, + "num_input_tokens_seen": 74305968, + "router_z_loss_mlp": 0.35913086, + "step": 898, + "time_per_iteration": 2.584437370300293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115669, + "balance_loss_mlp": 1.07971573, + "epoch": 0.17295113505194307, + "flos": 664951693824.0, + "grad_norm": 0.06535241228499304, + "language_loss": 0.85809892, + "learning_rate": 0.0009473690586408124, + "loss": 0.86925566, + "num_input_tokens_seen": 74384144, + "router_z_loss_mlp": 0.35986328, + "step": 899, + "time_per_iteration": 2.83156418800354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116393, + "balance_loss_mlp": 1.08084452, + "epoch": 0.17314351673720663, + "flos": 555125253120.0, + "grad_norm": 0.0683413775827569, + "language_loss": 0.86639923, + "learning_rate": 0.0009472298397531792, + "loss": 0.87756318, + "num_input_tokens_seen": 74455040, + "router_z_loss_mlp": 0.35546875, + "step": 900, + "time_per_iteration": 2.6944193840026855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123337, + "balance_loss_mlp": 1.08635855, + "epoch": 0.17333589842247019, + "flos": 503361326592.0, + "grad_norm": 0.09670394547256775, + "language_loss": 0.87118709, + "learning_rate": 0.0009470904472355235, + "loss": 0.88242042, + "num_input_tokens_seen": 74525248, + "router_z_loss_mlp": 0.36987305, + "step": 901, + "time_per_iteration": 2.637882709503174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114178, + "balance_loss_mlp": 1.07982159, + "epoch": 0.17352828010773375, + "flos": 555924003840.0, + "grad_norm": 0.06358596699153923, + "language_loss": 0.79912066, + "learning_rate": 0.0009469508811419626, + "loss": 0.81026244, + "num_input_tokens_seen": 74597328, + "router_z_loss_mlp": 0.34399414, + "step": 902, + "time_per_iteration": 2.726072311401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077408, + "balance_loss_mlp": 1.06453359, + "epoch": 0.1737206617929973, + "flos": 1553711556096.0, + "grad_norm": 0.030950293127884103, + "language_loss": 0.7161383, + "learning_rate": 0.0009468111415266806, + "loss": 0.72691238, + "num_input_tokens_seen": 74819664, + "router_z_loss_mlp": 0.12890625, + "step": 903, + "time_per_iteration": 4.791790723800659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109546, + "balance_loss_mlp": 1.07445073, + "epoch": 0.17391304347826086, + "flos": 516390667776.0, + "grad_norm": 0.06883251868009001, + "language_loss": 0.84220147, + "learning_rate": 0.0009466712284439292, + "loss": 0.85329694, + "num_input_tokens_seen": 74896224, + "router_z_loss_mlp": 0.35131836, + "step": 904, + "time_per_iteration": 2.7648017406463623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104504, + "balance_loss_mlp": 1.06995738, + "epoch": 0.17410542516352442, + "flos": 540773273088.0, + "grad_norm": 0.06988851938988141, + "language_loss": 0.8903957, + "learning_rate": 0.0009465311419480276, + "loss": 0.90144074, + "num_input_tokens_seen": 74966560, + "router_z_loss_mlp": 0.34545898, + "step": 905, + "time_per_iteration": 2.725829601287842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098399, + "balance_loss_mlp": 1.0637325, + "epoch": 0.17429780684878798, + "flos": 623542869504.0, + "grad_norm": 0.06312030659776342, + "language_loss": 0.88624233, + "learning_rate": 0.0009463908820933622, + "loss": 0.89722633, + "num_input_tokens_seen": 75045248, + "router_z_loss_mlp": 0.34692383, + "step": 906, + "time_per_iteration": 2.8389482498168945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093859, + "balance_loss_mlp": 1.05900264, + "epoch": 0.17449018853405157, + "flos": 575368386048.0, + "grad_norm": 0.056066721215551084, + "language_loss": 0.83138871, + "learning_rate": 0.0009462504489343868, + "loss": 0.84232736, + "num_input_tokens_seen": 75123952, + "router_z_loss_mlp": 0.34863281, + "step": 907, + "time_per_iteration": 2.863349437713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086469, + "balance_loss_mlp": 1.05199337, + "epoch": 0.17468257021931513, + "flos": 533499844608.0, + "grad_norm": 0.07604190500703253, + "language_loss": 0.894853, + "learning_rate": 0.0009461098425256222, + "loss": 0.90571761, + "num_input_tokens_seen": 75191728, + "router_z_loss_mlp": 0.3449707, + "step": 908, + "time_per_iteration": 2.5941011905670166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108834, + "balance_loss_mlp": 1.05345941, + "epoch": 0.1748749519045787, + "flos": 540496848384.0, + "grad_norm": 0.050694136543679796, + "language_loss": 0.85873353, + "learning_rate": 0.0009459690629216567, + "loss": 0.86961693, + "num_input_tokens_seen": 75262224, + "router_z_loss_mlp": 0.34887695, + "step": 909, + "time_per_iteration": 2.6097571849823 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109032, + "balance_loss_mlp": 1.0570612, + "epoch": 0.17506733358984225, + "flos": 498373641216.0, + "grad_norm": 0.0569262349138849, + "language_loss": 0.88138729, + "learning_rate": 0.0009458281101771457, + "loss": 0.89229047, + "num_input_tokens_seen": 75329760, + "router_z_loss_mlp": 0.33276367, + "step": 910, + "time_per_iteration": 2.5904784202575684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091588, + "balance_loss_mlp": 1.05744696, + "epoch": 0.1752597152751058, + "flos": 622621873152.0, + "grad_norm": 0.06350455217589325, + "language_loss": 0.8266046, + "learning_rate": 0.0009456869843468122, + "loss": 0.83752048, + "num_input_tokens_seen": 75407920, + "router_z_loss_mlp": 0.34179688, + "step": 911, + "time_per_iteration": 2.8930556774139404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090023, + "balance_loss_mlp": 1.05476046, + "epoch": 0.17545209696036937, + "flos": 520717814784.0, + "grad_norm": 0.07844481886152296, + "language_loss": 0.79097009, + "learning_rate": 0.0009455456854854459, + "loss": 0.80187035, + "num_input_tokens_seen": 75476752, + "router_z_loss_mlp": 0.35302734, + "step": 912, + "time_per_iteration": 2.5984511375427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096028, + "balance_loss_mlp": 1.0631026, + "epoch": 0.17564447864563293, + "flos": 461750270976.0, + "grad_norm": 0.05516798292623818, + "language_loss": 0.84505737, + "learning_rate": 0.0009454042136479039, + "loss": 0.85601771, + "num_input_tokens_seen": 75542944, + "router_z_loss_mlp": 0.3293457, + "step": 913, + "time_per_iteration": 2.586195945739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085981, + "balance_loss_mlp": 1.05286503, + "epoch": 0.1758368603308965, + "flos": 480416251392.0, + "grad_norm": 0.05301404729603274, + "language_loss": 0.83308446, + "learning_rate": 0.0009452625688891103, + "loss": 0.84394431, + "num_input_tokens_seen": 75609840, + "router_z_loss_mlp": 0.33129883, + "step": 914, + "time_per_iteration": 2.5374929904937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052517, + "balance_loss_mlp": 1.038975, + "epoch": 0.17602924201616005, + "flos": 1478160133632.0, + "grad_norm": 0.03507986977902886, + "language_loss": 0.78734738, + "learning_rate": 0.0009451207512640567, + "loss": 0.79787254, + "num_input_tokens_seen": 75819312, + "router_z_loss_mlp": 0.13574219, + "step": 915, + "time_per_iteration": 4.5561583042144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097389, + "balance_loss_mlp": 1.06226993, + "epoch": 0.17622162370142364, + "flos": 602010593280.0, + "grad_norm": 0.06815502965849334, + "language_loss": 0.93451321, + "learning_rate": 0.0009449787608278015, + "loss": 0.94548714, + "num_input_tokens_seen": 75893984, + "router_z_loss_mlp": 0.35131836, + "step": 916, + "time_per_iteration": 2.7807908058166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092918, + "balance_loss_mlp": 1.0588007, + "epoch": 0.1764140053866872, + "flos": 442473214464.0, + "grad_norm": 0.0637680644109211, + "language_loss": 0.92700857, + "learning_rate": 0.0009448365976354704, + "loss": 0.93793774, + "num_input_tokens_seen": 75958944, + "router_z_loss_mlp": 0.34130859, + "step": 917, + "time_per_iteration": 2.4800124168395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105643, + "balance_loss_mlp": 1.06909323, + "epoch": 0.17660638707195075, + "flos": 500362610688.0, + "grad_norm": 0.07080486598597346, + "language_loss": 0.90158784, + "learning_rate": 0.0009446942617422558, + "loss": 0.91264427, + "num_input_tokens_seen": 76024240, + "router_z_loss_mlp": 0.36547852, + "step": 918, + "time_per_iteration": 2.5415430068969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101469, + "balance_loss_mlp": 1.06766129, + "epoch": 0.17679876875721431, + "flos": 538621360128.0, + "grad_norm": 0.060000223973742446, + "language_loss": 0.86201262, + "learning_rate": 0.0009445517532034176, + "loss": 0.87302732, + "num_input_tokens_seen": 76095264, + "router_z_loss_mlp": 0.33789062, + "step": 919, + "time_per_iteration": 2.6849868297576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121669, + "balance_loss_mlp": 1.08569145, + "epoch": 0.17699115044247787, + "flos": 497477376000.0, + "grad_norm": 0.08221632690768264, + "language_loss": 0.89522099, + "learning_rate": 0.0009444090720742824, + "loss": 0.9064377, + "num_input_tokens_seen": 76163520, + "router_z_loss_mlp": 0.35986328, + "step": 920, + "time_per_iteration": 2.6034600734710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113572, + "balance_loss_mlp": 1.07883418, + "epoch": 0.17718353212774143, + "flos": 662444780544.0, + "grad_norm": 0.08029288241638204, + "language_loss": 0.88040781, + "learning_rate": 0.0009442662184102439, + "loss": 0.89154357, + "num_input_tokens_seen": 76233760, + "router_z_loss_mlp": 0.34741211, + "step": 921, + "time_per_iteration": 2.767352342605591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105947, + "balance_loss_mlp": 1.07309294, + "epoch": 0.177375913813005, + "flos": 582340658688.0, + "grad_norm": 0.0705507668945597, + "language_loss": 0.87951338, + "learning_rate": 0.000944123192266763, + "loss": 0.89057279, + "num_input_tokens_seen": 76310704, + "router_z_loss_mlp": 0.32836914, + "step": 922, + "time_per_iteration": 2.789315700531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108289, + "balance_loss_mlp": 1.0727644, + "epoch": 0.17756829549826855, + "flos": 552285098496.0, + "grad_norm": 0.06115562628552814, + "language_loss": 0.83835006, + "learning_rate": 0.0009439799936993671, + "loss": 0.84943295, + "num_input_tokens_seen": 76386992, + "router_z_loss_mlp": 0.35546875, + "step": 923, + "time_per_iteration": 2.7160987854003906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090669, + "balance_loss_mlp": 1.05733824, + "epoch": 0.17776067718353214, + "flos": 556060806144.0, + "grad_norm": 0.07059184324253498, + "language_loss": 0.88508618, + "learning_rate": 0.0009438366227636511, + "loss": 0.89599288, + "num_input_tokens_seen": 76453328, + "router_z_loss_mlp": 0.33349609, + "step": 924, + "time_per_iteration": 2.6319191455841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084897, + "balance_loss_mlp": 1.05163789, + "epoch": 0.1779530588687957, + "flos": 658161303552.0, + "grad_norm": 0.06263940487075517, + "language_loss": 0.86677843, + "learning_rate": 0.0009436930795152763, + "loss": 0.87762737, + "num_input_tokens_seen": 76529040, + "router_z_loss_mlp": 0.33276367, + "step": 925, + "time_per_iteration": 2.8063783645629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084326, + "balance_loss_mlp": 1.05159163, + "epoch": 0.17814544055405926, + "flos": 644187644928.0, + "grad_norm": 0.06448697412821461, + "language_loss": 0.8710525, + "learning_rate": 0.0009435493640099713, + "loss": 0.88189578, + "num_input_tokens_seen": 76604080, + "router_z_loss_mlp": 0.32739258, + "step": 926, + "time_per_iteration": 2.7599081993103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080787, + "balance_loss_mlp": 1.04664516, + "epoch": 0.17833782223932282, + "flos": 460672123392.0, + "grad_norm": 0.06497730431564504, + "language_loss": 0.84328961, + "learning_rate": 0.0009434054763035314, + "loss": 0.85409749, + "num_input_tokens_seen": 76674096, + "router_z_loss_mlp": 0.34155273, + "step": 927, + "time_per_iteration": 2.612910032272339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081381, + "balance_loss_mlp": 1.04740596, + "epoch": 0.17853020392458638, + "flos": 759212766720.0, + "grad_norm": 0.04594292129212818, + "language_loss": 0.85898727, + "learning_rate": 0.0009432614164518185, + "loss": 0.8698011, + "num_input_tokens_seen": 76752144, + "router_z_loss_mlp": 0.33984375, + "step": 928, + "time_per_iteration": 2.926981210708618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086846, + "balance_loss_mlp": 1.05153632, + "epoch": 0.17872258560984994, + "flos": 782320785408.0, + "grad_norm": 0.055185850673896385, + "language_loss": 0.84792197, + "learning_rate": 0.000943117184510762, + "loss": 0.85879046, + "num_input_tokens_seen": 76830240, + "router_z_loss_mlp": 0.35327148, + "step": 929, + "time_per_iteration": 2.995514154434204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039861, + "balance_loss_mlp": 1.02660513, + "epoch": 0.1789149672951135, + "flos": 1459095482880.0, + "grad_norm": 0.021362691821678215, + "language_loss": 0.78789961, + "learning_rate": 0.0009429727805363575, + "loss": 0.79829824, + "num_input_tokens_seen": 77062464, + "router_z_loss_mlp": 0.1328125, + "step": 930, + "time_per_iteration": 4.99839448928833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091678, + "balance_loss_mlp": 1.05739331, + "epoch": 0.17910734898037706, + "flos": 503598463488.0, + "grad_norm": 0.05761618473313655, + "language_loss": 0.88773429, + "learning_rate": 0.0009428282045846674, + "loss": 0.89865112, + "num_input_tokens_seen": 77136672, + "router_z_loss_mlp": 0.34301758, + "step": 931, + "time_per_iteration": 2.6966652870178223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087828, + "balance_loss_mlp": 1.05452061, + "epoch": 0.17929973066564064, + "flos": 745895264256.0, + "grad_norm": 0.05798282919409206, + "language_loss": 0.89983928, + "learning_rate": 0.0009426834567118214, + "loss": 0.91071755, + "num_input_tokens_seen": 77227040, + "router_z_loss_mlp": 0.33300781, + "step": 932, + "time_per_iteration": 3.072160482406616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092765, + "balance_loss_mlp": 1.05907631, + "epoch": 0.1794921123509042, + "flos": 712875893760.0, + "grad_norm": 0.055390897890994044, + "language_loss": 0.80879378, + "learning_rate": 0.0009425385369740155, + "loss": 0.81972146, + "num_input_tokens_seen": 77319392, + "router_z_loss_mlp": 0.3371582, + "step": 933, + "time_per_iteration": 3.0337042808532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092731, + "balance_loss_mlp": 1.05825567, + "epoch": 0.17968449403616776, + "flos": 632838763008.0, + "grad_norm": 0.0687685702394307, + "language_loss": 0.87443584, + "learning_rate": 0.0009423934454275125, + "loss": 0.8853631, + "num_input_tokens_seen": 77394688, + "router_z_loss_mlp": 0.3449707, + "step": 934, + "time_per_iteration": 2.7970879077911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089195, + "balance_loss_mlp": 1.05526757, + "epoch": 0.17987687572143132, + "flos": 536060602368.0, + "grad_norm": 0.08214865293258214, + "language_loss": 0.92215371, + "learning_rate": 0.0009422481821286418, + "loss": 0.93304563, + "num_input_tokens_seen": 77468288, + "router_z_loss_mlp": 0.33959961, + "step": 935, + "time_per_iteration": 2.7134642601013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091708, + "balance_loss_mlp": 1.05914021, + "epoch": 0.18006925740669488, + "flos": 537818227200.0, + "grad_norm": 0.0718764173736199, + "language_loss": 0.87967253, + "learning_rate": 0.0009421027471337998, + "loss": 0.89058959, + "num_input_tokens_seen": 77535840, + "router_z_loss_mlp": 0.32568359, + "step": 936, + "time_per_iteration": 2.608764171600342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098474, + "balance_loss_mlp": 1.06333113, + "epoch": 0.18026163909195844, + "flos": 539255757312.0, + "grad_norm": 0.06697051800305152, + "language_loss": 0.82882118, + "learning_rate": 0.0009419571404994493, + "loss": 0.83980596, + "num_input_tokens_seen": 77604000, + "router_z_loss_mlp": 0.3515625, + "step": 937, + "time_per_iteration": 2.620296001434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096306, + "balance_loss_mlp": 1.06240284, + "epoch": 0.180454020777222, + "flos": 500382959616.0, + "grad_norm": 0.08555714620461663, + "language_loss": 0.90948844, + "learning_rate": 0.00094181136228212, + "loss": 0.92045152, + "num_input_tokens_seen": 77671488, + "router_z_loss_mlp": 0.33935547, + "step": 938, + "time_per_iteration": 2.62837290763855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109732, + "balance_loss_mlp": 1.06415629, + "epoch": 0.18064640246248556, + "flos": 498689353728.0, + "grad_norm": 0.06983123921060745, + "language_loss": 0.86323059, + "learning_rate": 0.0009416654125384077, + "loss": 0.8742038, + "num_input_tokens_seen": 77746240, + "router_z_loss_mlp": 0.33154297, + "step": 939, + "time_per_iteration": 2.715686321258545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054723, + "balance_loss_mlp": 1.04242051, + "epoch": 0.18083878414774912, + "flos": 1518572358144.0, + "grad_norm": 0.027679884047562747, + "language_loss": 0.79772377, + "learning_rate": 0.0009415192913249752, + "loss": 0.80827093, + "num_input_tokens_seen": 77966080, + "router_z_loss_mlp": 0.12304688, + "step": 940, + "time_per_iteration": 4.941875219345093 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090642, + "balance_loss_mlp": 1.05728722, + "epoch": 0.1810311658330127, + "flos": 727006703616.0, + "grad_norm": 0.07011009980003599, + "language_loss": 0.84326053, + "learning_rate": 0.000941372998698552, + "loss": 0.85416698, + "num_input_tokens_seen": 78049200, + "router_z_loss_mlp": 0.33374023, + "step": 941, + "time_per_iteration": 2.931520938873291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094264, + "balance_loss_mlp": 1.0597409, + "epoch": 0.18122354751827627, + "flos": 564643726848.0, + "grad_norm": 0.08254502738164117, + "language_loss": 0.8207435, + "learning_rate": 0.0009412265347159336, + "loss": 0.8316862, + "num_input_tokens_seen": 78122752, + "router_z_loss_mlp": 0.34570312, + "step": 942, + "time_per_iteration": 2.696354627609253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091238, + "balance_loss_mlp": 1.05869377, + "epoch": 0.18141592920353983, + "flos": 519024208896.0, + "grad_norm": 0.05729066672306875, + "language_loss": 0.85217965, + "learning_rate": 0.0009410798994339829, + "loss": 0.86309201, + "num_input_tokens_seen": 78194064, + "router_z_loss_mlp": 0.32543945, + "step": 943, + "time_per_iteration": 2.6009600162506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088013, + "balance_loss_mlp": 1.0545156, + "epoch": 0.1816083108888034, + "flos": 512219261952.0, + "grad_norm": 0.05342615519744699, + "language_loss": 0.88234782, + "learning_rate": 0.000940933092909628, + "loss": 0.89322793, + "num_input_tokens_seen": 78262048, + "router_z_loss_mlp": 0.33520508, + "step": 944, + "time_per_iteration": 2.618419647216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095526, + "balance_loss_mlp": 1.06286263, + "epoch": 0.18180069257406695, + "flos": 492144864768.0, + "grad_norm": 0.053227732023653135, + "language_loss": 0.8393383, + "learning_rate": 0.0009407861151998649, + "loss": 0.85029352, + "num_input_tokens_seen": 78330624, + "router_z_loss_mlp": 0.32666016, + "step": 945, + "time_per_iteration": 2.5718705654144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097774, + "balance_loss_mlp": 1.06406188, + "epoch": 0.1819930742593305, + "flos": 569891870208.0, + "grad_norm": 0.05775782434029923, + "language_loss": 0.86156505, + "learning_rate": 0.0009406389663617552, + "loss": 0.87254274, + "num_input_tokens_seen": 78400672, + "router_z_loss_mlp": 0.33740234, + "step": 946, + "time_per_iteration": 2.66513729095459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097522, + "balance_loss_mlp": 1.06433463, + "epoch": 0.18218545594459407, + "flos": 605693168640.0, + "grad_norm": 0.06350431386522506, + "language_loss": 0.85736459, + "learning_rate": 0.000940491646452427, + "loss": 0.86833978, + "num_input_tokens_seen": 78467952, + "router_z_loss_mlp": 0.33203125, + "step": 947, + "time_per_iteration": 2.715071201324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103916, + "balance_loss_mlp": 1.07010818, + "epoch": 0.18237783762985763, + "flos": 548419230720.0, + "grad_norm": 0.06277969821047595, + "language_loss": 0.91195452, + "learning_rate": 0.000940344155529075, + "loss": 0.92299366, + "num_input_tokens_seen": 78538928, + "router_z_loss_mlp": 0.33837891, + "step": 948, + "time_per_iteration": 2.6502938270568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099574, + "balance_loss_mlp": 1.06550407, + "epoch": 0.1825702193151212, + "flos": 450509078016.0, + "grad_norm": 0.06933176029299125, + "language_loss": 0.87683523, + "learning_rate": 0.0009401964936489605, + "loss": 0.88783091, + "num_input_tokens_seen": 78602144, + "router_z_loss_mlp": 0.34106445, + "step": 949, + "time_per_iteration": 2.5181798934936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084135, + "balance_loss_mlp": 1.05247355, + "epoch": 0.18276260100038477, + "flos": 588962313216.0, + "grad_norm": 0.07980064544074586, + "language_loss": 0.85422635, + "learning_rate": 0.0009400486608694108, + "loss": 0.86506772, + "num_input_tokens_seen": 78673152, + "router_z_loss_mlp": 0.31640625, + "step": 950, + "time_per_iteration": 2.7189955711364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087871, + "balance_loss_mlp": 1.05384839, + "epoch": 0.18295498268564833, + "flos": 786988376064.0, + "grad_norm": 0.05265351460276348, + "language_loss": 0.87225658, + "learning_rate": 0.0009399006572478195, + "loss": 0.88313532, + "num_input_tokens_seen": 78753872, + "router_z_loss_mlp": 0.34033203, + "step": 951, + "time_per_iteration": 3.0805773735046387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086089, + "balance_loss_mlp": 1.05218577, + "epoch": 0.1831473643709119, + "flos": 577878271488.0, + "grad_norm": 0.059447924131550096, + "language_loss": 0.91242015, + "learning_rate": 0.0009397524828416468, + "loss": 0.92328107, + "num_input_tokens_seen": 78822640, + "router_z_loss_mlp": 0.33935547, + "step": 952, + "time_per_iteration": 2.6567108631134033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082155, + "balance_loss_mlp": 1.04801321, + "epoch": 0.18333974605617545, + "flos": 566622521856.0, + "grad_norm": 0.05513512337372911, + "language_loss": 0.96184212, + "learning_rate": 0.0009396041377084192, + "loss": 0.97266364, + "num_input_tokens_seen": 78893792, + "router_z_loss_mlp": 0.34179688, + "step": 953, + "time_per_iteration": 2.6937921047210693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079403, + "balance_loss_mlp": 1.04478431, + "epoch": 0.183532127741439, + "flos": 526725421056.0, + "grad_norm": 0.07204875194033089, + "language_loss": 0.87840325, + "learning_rate": 0.0009394556219057295, + "loss": 0.88919723, + "num_input_tokens_seen": 78964752, + "router_z_loss_mlp": 0.34667969, + "step": 954, + "time_per_iteration": 2.6962215900421143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107777, + "balance_loss_mlp": 1.04272258, + "epoch": 0.18372450942670257, + "flos": 594259918848.0, + "grad_norm": 0.07227161235955501, + "language_loss": 0.83883446, + "learning_rate": 0.0009393069354912362, + "loss": 0.84961212, + "num_input_tokens_seen": 79034400, + "router_z_loss_mlp": 0.35058594, + "step": 955, + "time_per_iteration": 2.718308925628662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081248, + "balance_loss_mlp": 1.04677236, + "epoch": 0.18391689111196613, + "flos": 644720145408.0, + "grad_norm": 0.07091738302891186, + "language_loss": 0.82511717, + "learning_rate": 0.0009391580785226649, + "loss": 0.83592963, + "num_input_tokens_seen": 79109488, + "router_z_loss_mlp": 0.34521484, + "step": 956, + "time_per_iteration": 2.907367467880249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077991, + "balance_loss_mlp": 1.06216049, + "epoch": 0.18410927279722972, + "flos": 1456246563840.0, + "grad_norm": 0.048423099914415325, + "language_loss": 0.79340446, + "learning_rate": 0.0009390090510578067, + "loss": 0.80418444, + "num_input_tokens_seen": 79327712, + "router_z_loss_mlp": 0.15820312, + "step": 957, + "time_per_iteration": 4.78663969039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091525, + "balance_loss_mlp": 1.05702567, + "epoch": 0.18430165448249328, + "flos": 658437728256.0, + "grad_norm": 0.09319397884021513, + "language_loss": 0.86484683, + "learning_rate": 0.0009388598531545196, + "loss": 0.8757621, + "num_input_tokens_seen": 79401504, + "router_z_loss_mlp": 0.34545898, + "step": 958, + "time_per_iteration": 2.8470118045806885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087326, + "balance_loss_mlp": 1.05285025, + "epoch": 0.18449403616775684, + "flos": 517679811072.0, + "grad_norm": 0.07377492103556435, + "language_loss": 0.86076611, + "learning_rate": 0.000938710484870727, + "loss": 0.87163937, + "num_input_tokens_seen": 79466688, + "router_z_loss_mlp": 0.3449707, + "step": 959, + "time_per_iteration": 2.5930731296539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090986, + "balance_loss_mlp": 1.05672574, + "epoch": 0.1846864178530204, + "flos": 552481537536.0, + "grad_norm": 0.06589557505977534, + "language_loss": 0.86379164, + "learning_rate": 0.0009385609462644189, + "loss": 0.8747015, + "num_input_tokens_seen": 79540288, + "router_z_loss_mlp": 0.34277344, + "step": 960, + "time_per_iteration": 2.688706636428833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096456, + "balance_loss_mlp": 1.06212378, + "epoch": 0.18487879953828396, + "flos": 465930441216.0, + "grad_norm": 0.0643439417949763, + "language_loss": 0.86035949, + "learning_rate": 0.0009384112373936514, + "loss": 0.871324, + "num_input_tokens_seen": 79611872, + "router_z_loss_mlp": 0.34326172, + "step": 961, + "time_per_iteration": 4.0801496505737305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095895, + "balance_loss_mlp": 1.06132412, + "epoch": 0.18507118122354752, + "flos": 648200489472.0, + "grad_norm": 0.0614591664996872, + "language_loss": 0.91820455, + "learning_rate": 0.0009382613583165467, + "loss": 0.92916346, + "num_input_tokens_seen": 79689504, + "router_z_loss_mlp": 0.34594727, + "step": 962, + "time_per_iteration": 2.790069341659546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113098, + "balance_loss_mlp": 1.07921863, + "epoch": 0.18526356290881107, + "flos": 626486330880.0, + "grad_norm": 0.06374556186760763, + "language_loss": 0.89594233, + "learning_rate": 0.0009381113090912928, + "loss": 0.90707326, + "num_input_tokens_seen": 79759264, + "router_z_loss_mlp": 0.33886719, + "step": 963, + "time_per_iteration": 2.6891098022460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117951, + "balance_loss_mlp": 1.08559799, + "epoch": 0.18545594459407463, + "flos": 432497843712.0, + "grad_norm": 0.06491910119233056, + "language_loss": 0.90103394, + "learning_rate": 0.000937961089776144, + "loss": 0.91221344, + "num_input_tokens_seen": 79824464, + "router_z_loss_mlp": 0.32348633, + "step": 964, + "time_per_iteration": 2.5821962356567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124554, + "balance_loss_mlp": 1.08926833, + "epoch": 0.1856483262793382, + "flos": 748720862208.0, + "grad_norm": 0.06849062336391444, + "language_loss": 0.829036, + "learning_rate": 0.0009378107004294208, + "loss": 0.84028149, + "num_input_tokens_seen": 79907152, + "router_z_loss_mlp": 0.35302734, + "step": 965, + "time_per_iteration": 2.9898061752319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115255, + "balance_loss_mlp": 1.081972, + "epoch": 0.18584070796460178, + "flos": 530058788352.0, + "grad_norm": 0.08647217477609576, + "language_loss": 0.91352308, + "learning_rate": 0.0009376601411095096, + "loss": 0.92467564, + "num_input_tokens_seen": 79976944, + "router_z_loss_mlp": 0.33300781, + "step": 966, + "time_per_iteration": 2.6415059566497803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093436, + "balance_loss_mlp": 1.06196475, + "epoch": 0.18603308964986534, + "flos": 482863527936.0, + "grad_norm": 0.05783783242438048, + "language_loss": 0.8708145, + "learning_rate": 0.0009375094118748622, + "loss": 0.88174886, + "num_input_tokens_seen": 80042112, + "router_z_loss_mlp": 0.31445312, + "step": 967, + "time_per_iteration": 2.5149550437927246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089094, + "balance_loss_mlp": 1.05650234, + "epoch": 0.1862254713351289, + "flos": 800976591360.0, + "grad_norm": 0.0756042683078202, + "language_loss": 0.9083451, + "learning_rate": 0.0009373585127839976, + "loss": 0.91923606, + "num_input_tokens_seen": 80118896, + "router_z_loss_mlp": 0.32592773, + "step": 968, + "time_per_iteration": 2.9569485187530518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084434, + "balance_loss_mlp": 1.05250978, + "epoch": 0.18641785302039246, + "flos": 478082456064.0, + "grad_norm": 0.06160067145414361, + "language_loss": 0.91074634, + "learning_rate": 0.0009372074438954994, + "loss": 0.92159069, + "num_input_tokens_seen": 80183360, + "router_z_loss_mlp": 0.3190918, + "step": 969, + "time_per_iteration": 2.508530378341675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083542, + "balance_loss_mlp": 1.05040169, + "epoch": 0.18661023470565602, + "flos": 388695587328.0, + "grad_norm": 0.07517959095695621, + "language_loss": 0.91676056, + "learning_rate": 0.0009370562052680181, + "loss": 0.92759597, + "num_input_tokens_seen": 80247024, + "router_z_loss_mlp": 0.33154297, + "step": 970, + "time_per_iteration": 2.4572672843933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087332, + "balance_loss_mlp": 1.05400109, + "epoch": 0.18680261639091958, + "flos": 564402207744.0, + "grad_norm": 0.052448577146131624, + "language_loss": 0.89610398, + "learning_rate": 0.0009369047969602695, + "loss": 0.90697736, + "num_input_tokens_seen": 80318256, + "router_z_loss_mlp": 0.33349609, + "step": 971, + "time_per_iteration": 2.714704751968384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101064, + "balance_loss_mlp": 1.06556404, + "epoch": 0.18699499807618314, + "flos": 479018009088.0, + "grad_norm": 0.06595213007116614, + "language_loss": 0.8674072, + "learning_rate": 0.0009367532190310357, + "loss": 0.87841785, + "num_input_tokens_seen": 80384848, + "router_z_loss_mlp": 0.35498047, + "step": 972, + "time_per_iteration": 2.589667558670044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111914, + "balance_loss_mlp": 1.07660413, + "epoch": 0.1871873797614467, + "flos": 553022802432.0, + "grad_norm": 0.0720295199384638, + "language_loss": 0.88701892, + "learning_rate": 0.0009366014715391644, + "loss": 0.89813805, + "num_input_tokens_seen": 80453088, + "router_z_loss_mlp": 0.35327148, + "step": 973, + "time_per_iteration": 2.634023904800415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107724, + "balance_loss_mlp": 1.07389259, + "epoch": 0.18737976144671029, + "flos": 552526617600.0, + "grad_norm": 0.05153911900793568, + "language_loss": 0.8432554, + "learning_rate": 0.0009364495545435693, + "loss": 0.85433269, + "num_input_tokens_seen": 80528608, + "router_z_loss_mlp": 0.33837891, + "step": 974, + "time_per_iteration": 2.7729458808898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107099, + "balance_loss_mlp": 1.07281494, + "epoch": 0.18757214313197385, + "flos": 502002372096.0, + "grad_norm": 0.05815108638233015, + "language_loss": 0.88620323, + "learning_rate": 0.0009362974681032297, + "loss": 0.8972742, + "num_input_tokens_seen": 80599600, + "router_z_loss_mlp": 0.34326172, + "step": 975, + "time_per_iteration": 2.631744623184204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105498, + "balance_loss_mlp": 1.07130909, + "epoch": 0.1877645248172374, + "flos": 674691337728.0, + "grad_norm": 0.06841603134690444, + "language_loss": 0.88265896, + "learning_rate": 0.0009361452122771907, + "loss": 0.89371395, + "num_input_tokens_seen": 80677264, + "router_z_loss_mlp": 0.34204102, + "step": 976, + "time_per_iteration": 2.8427281379699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094167, + "balance_loss_mlp": 1.06012082, + "epoch": 0.18795690650250096, + "flos": 404771696640.0, + "grad_norm": 0.07319435948671522, + "language_loss": 0.8377496, + "learning_rate": 0.0009359927871245635, + "loss": 0.84869128, + "num_input_tokens_seen": 80739776, + "router_z_loss_mlp": 0.34057617, + "step": 977, + "time_per_iteration": 2.4665186405181885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090091, + "balance_loss_mlp": 1.0565697, + "epoch": 0.18814928818776452, + "flos": 637599485952.0, + "grad_norm": 0.05986452276683665, + "language_loss": 0.86337954, + "learning_rate": 0.0009358401927045246, + "loss": 0.87428045, + "num_input_tokens_seen": 80815200, + "router_z_loss_mlp": 0.33520508, + "step": 978, + "time_per_iteration": 2.8037781715393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090672, + "balance_loss_mlp": 1.05707908, + "epoch": 0.18834166987302808, + "flos": 1137825446400.0, + "grad_norm": 0.054509582003230646, + "language_loss": 0.88314402, + "learning_rate": 0.0009356874290763166, + "loss": 0.89405078, + "num_input_tokens_seen": 80905024, + "router_z_loss_mlp": 0.33618164, + "step": 979, + "time_per_iteration": 3.456723213195801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097981, + "balance_loss_mlp": 1.06481671, + "epoch": 0.18853405155829164, + "flos": 504538398720.0, + "grad_norm": 0.06366920756378494, + "language_loss": 0.8866874, + "learning_rate": 0.0009355344962992474, + "loss": 0.89766723, + "num_input_tokens_seen": 80976704, + "router_z_loss_mlp": 0.33154297, + "step": 980, + "time_per_iteration": 2.6105339527130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109825, + "balance_loss_mlp": 1.06494308, + "epoch": 0.1887264332435552, + "flos": 607879987200.0, + "grad_norm": 0.05130215804193928, + "language_loss": 0.88147485, + "learning_rate": 0.0009353813944326908, + "loss": 0.89245737, + "num_input_tokens_seen": 81057152, + "router_z_loss_mlp": 0.33325195, + "step": 981, + "time_per_iteration": 2.882836103439331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109809, + "balance_loss_mlp": 1.0758822, + "epoch": 0.1889188149288188, + "flos": 552264749568.0, + "grad_norm": 0.07032712681879846, + "language_loss": 0.83146608, + "learning_rate": 0.0009352281235360863, + "loss": 0.84256417, + "num_input_tokens_seen": 81131520, + "router_z_loss_mlp": 0.33959961, + "step": 982, + "time_per_iteration": 2.695748805999756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120418, + "balance_loss_mlp": 1.08775461, + "epoch": 0.18911119661408235, + "flos": 418332128256.0, + "grad_norm": 0.06033753714629359, + "language_loss": 0.84987485, + "learning_rate": 0.0009350746836689389, + "loss": 0.86107904, + "num_input_tokens_seen": 81195952, + "router_z_loss_mlp": 0.32666016, + "step": 983, + "time_per_iteration": 2.5073440074920654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260435, + "balance_loss_mlp": 1.23916793, + "epoch": 0.1893035782993459, + "flos": 1481141320704.0, + "grad_norm": 0.0731593378732656, + "language_loss": 0.81439221, + "learning_rate": 0.0009349210748908193, + "loss": 0.82699656, + "num_input_tokens_seen": 81427312, + "router_z_loss_mlp": 0.21289062, + "step": 984, + "time_per_iteration": 5.065609931945801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133244, + "balance_loss_mlp": 1.09831583, + "epoch": 0.18949595998460947, + "flos": 508220974080.0, + "grad_norm": 0.09166419018528392, + "language_loss": 0.83211792, + "learning_rate": 0.0009347672972613634, + "loss": 0.84345031, + "num_input_tokens_seen": 81494256, + "router_z_loss_mlp": 0.34936523, + "step": 985, + "time_per_iteration": 2.580009937286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115864, + "balance_loss_mlp": 1.08270001, + "epoch": 0.18968834166987303, + "flos": 530812459008.0, + "grad_norm": 0.0668772854373454, + "language_loss": 0.85875785, + "learning_rate": 0.0009346133508402735, + "loss": 0.8699165, + "num_input_tokens_seen": 81569312, + "router_z_loss_mlp": 0.33178711, + "step": 986, + "time_per_iteration": 2.6872711181640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111031, + "balance_loss_mlp": 1.07724667, + "epoch": 0.1898807233551366, + "flos": 499515807744.0, + "grad_norm": 0.11088649382938841, + "language_loss": 0.8420769, + "learning_rate": 0.0009344592356873166, + "loss": 0.8531872, + "num_input_tokens_seen": 81637024, + "router_z_loss_mlp": 0.33813477, + "step": 987, + "time_per_iteration": 2.6347994804382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098007, + "balance_loss_mlp": 1.06462848, + "epoch": 0.19007310504040015, + "flos": 601936399872.0, + "grad_norm": 0.05765681888892058, + "language_loss": 0.78527796, + "learning_rate": 0.0009343049518623255, + "loss": 0.79625803, + "num_input_tokens_seen": 81709488, + "router_z_loss_mlp": 0.33398438, + "step": 988, + "time_per_iteration": 2.696929693222046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082914, + "balance_loss_mlp": 1.05029869, + "epoch": 0.1902654867256637, + "flos": 601374786048.0, + "grad_norm": 0.05732720380572914, + "language_loss": 0.83250153, + "learning_rate": 0.0009341504994251985, + "loss": 0.84333068, + "num_input_tokens_seen": 81787152, + "router_z_loss_mlp": 0.32617188, + "step": 989, + "time_per_iteration": 2.8399016857147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095582, + "balance_loss_mlp": 1.07841623, + "epoch": 0.19045786841092727, + "flos": 1574925147648.0, + "grad_norm": 0.03888561388969961, + "language_loss": 0.73520499, + "learning_rate": 0.0009339958784358994, + "loss": 0.74616081, + "num_input_tokens_seen": 82030608, + "router_z_loss_mlp": 0.171875, + "step": 990, + "time_per_iteration": 5.072636842727661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109747, + "balance_loss_mlp": 1.06394839, + "epoch": 0.19065025009619085, + "flos": 681280906752.0, + "grad_norm": 0.135211113906906, + "language_loss": 0.818295, + "learning_rate": 0.0009338410889544574, + "loss": 0.82926977, + "num_input_tokens_seen": 82119872, + "router_z_loss_mlp": 0.33544922, + "step": 991, + "time_per_iteration": 3.050665855407715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101867, + "balance_loss_mlp": 1.06786811, + "epoch": 0.1908426317814544, + "flos": 601971305472.0, + "grad_norm": 0.06286082016671143, + "language_loss": 0.87738663, + "learning_rate": 0.000933686131040967, + "loss": 0.88840532, + "num_input_tokens_seen": 82195552, + "router_z_loss_mlp": 0.34033203, + "step": 992, + "time_per_iteration": 2.7589659690856934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089273, + "balance_loss_mlp": 1.05672884, + "epoch": 0.19103501346671797, + "flos": 586027616256.0, + "grad_norm": 0.0561482479745879, + "language_loss": 0.90427077, + "learning_rate": 0.0009335310047555883, + "loss": 0.91516346, + "num_input_tokens_seen": 82267040, + "router_z_loss_mlp": 0.32543945, + "step": 993, + "time_per_iteration": 2.7133467197418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108709, + "balance_loss_mlp": 1.0532825, + "epoch": 0.19122739515198153, + "flos": 545494708224.0, + "grad_norm": 0.06221036652136981, + "language_loss": 0.88114065, + "learning_rate": 0.0009333757101585467, + "loss": 0.89201152, + "num_input_tokens_seen": 82337680, + "router_z_loss_mlp": 0.33837891, + "step": 994, + "time_per_iteration": 2.6733241081237793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083527, + "balance_loss_mlp": 1.05105424, + "epoch": 0.1914197768372451, + "flos": 521171739648.0, + "grad_norm": 0.05606370206634765, + "language_loss": 0.93617988, + "learning_rate": 0.0009332202473101329, + "loss": 0.94701517, + "num_input_tokens_seen": 82409600, + "router_z_loss_mlp": 0.32470703, + "step": 995, + "time_per_iteration": 2.6689558029174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079312, + "balance_loss_mlp": 1.04536152, + "epoch": 0.19161215852250865, + "flos": 610961660928.0, + "grad_norm": 0.05986652691328414, + "language_loss": 0.83121806, + "learning_rate": 0.0009330646162707028, + "loss": 0.84201121, + "num_input_tokens_seen": 82480288, + "router_z_loss_mlp": 0.33984375, + "step": 996, + "time_per_iteration": 2.7264511585235596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081823, + "balance_loss_mlp": 1.04849207, + "epoch": 0.1918045402077722, + "flos": 846281806848.0, + "grad_norm": 0.05485586532204223, + "language_loss": 0.84800065, + "learning_rate": 0.0009329088171006779, + "loss": 0.85881883, + "num_input_tokens_seen": 82568960, + "router_z_loss_mlp": 0.33349609, + "step": 997, + "time_per_iteration": 3.1486315727233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097705, + "balance_loss_mlp": 1.06220424, + "epoch": 0.19199692189303577, + "flos": 465699096576.0, + "grad_norm": 0.06540772430376247, + "language_loss": 0.84963006, + "learning_rate": 0.0009327528498605446, + "loss": 0.86060709, + "num_input_tokens_seen": 82634128, + "router_z_loss_mlp": 0.35522461, + "step": 998, + "time_per_iteration": 2.532460927963257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087828, + "balance_loss_mlp": 1.0542109, + "epoch": 0.19218930357829936, + "flos": 531318818304.0, + "grad_norm": 0.06065225266474605, + "language_loss": 0.89716202, + "learning_rate": 0.0009325967146108548, + "loss": 0.90804029, + "num_input_tokens_seen": 82707472, + "router_z_loss_mlp": 0.33642578, + "step": 999, + "time_per_iteration": 2.6381072998046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108792, + "balance_loss_mlp": 1.05334902, + "epoch": 0.19238168526356292, + "flos": 601350054912.0, + "grad_norm": 0.06318510310852068, + "language_loss": 0.87984866, + "learning_rate": 0.0009324404114122258, + "loss": 0.89072788, + "num_input_tokens_seen": 82775232, + "router_z_loss_mlp": 0.34594727, + "step": 1000, + "time_per_iteration": 2.7017252445220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088105, + "balance_loss_mlp": 1.0544883, + "epoch": 0.19257406694882648, + "flos": 571690192896.0, + "grad_norm": 0.05361295189234855, + "language_loss": 0.87132722, + "learning_rate": 0.0009322839403253397, + "loss": 0.88220823, + "num_input_tokens_seen": 82850032, + "router_z_loss_mlp": 0.33642578, + "step": 1001, + "time_per_iteration": 2.7725350856781006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091645, + "balance_loss_mlp": 1.05759907, + "epoch": 0.19276644863409004, + "flos": 801478568448.0, + "grad_norm": 0.0661765462165054, + "language_loss": 0.84038174, + "learning_rate": 0.0009321273014109439, + "loss": 0.85129815, + "num_input_tokens_seen": 82926080, + "router_z_loss_mlp": 0.34082031, + "step": 1002, + "time_per_iteration": 2.9275383949279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089997, + "balance_loss_mlp": 1.05676103, + "epoch": 0.1929588303193536, + "flos": 563024314368.0, + "grad_norm": 0.05133430998282463, + "language_loss": 0.85232604, + "learning_rate": 0.0009319704947298513, + "loss": 0.863226, + "num_input_tokens_seen": 83005200, + "router_z_loss_mlp": 0.33251953, + "step": 1003, + "time_per_iteration": 2.9198272228240967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083204, + "balance_loss_mlp": 1.05120838, + "epoch": 0.19315121200461716, + "flos": 626550349824.0, + "grad_norm": 0.04652496586479965, + "language_loss": 0.88737059, + "learning_rate": 0.0009318135203429393, + "loss": 0.8982026, + "num_input_tokens_seen": 83077280, + "router_z_loss_mlp": 0.31982422, + "step": 1004, + "time_per_iteration": 2.7145965099334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094807, + "balance_loss_mlp": 1.06116605, + "epoch": 0.19334359368988072, + "flos": 517169069568.0, + "grad_norm": 0.06711221272981459, + "language_loss": 0.88228458, + "learning_rate": 0.0009316563783111511, + "loss": 0.8932327, + "num_input_tokens_seen": 83145456, + "router_z_loss_mlp": 0.33642578, + "step": 1005, + "time_per_iteration": 2.68135404586792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095243, + "balance_loss_mlp": 1.06050563, + "epoch": 0.19353597537514428, + "flos": 693751606272.0, + "grad_norm": 0.04947727679523619, + "language_loss": 0.82323831, + "learning_rate": 0.0009314990686954943, + "loss": 0.83419079, + "num_input_tokens_seen": 83225392, + "router_z_loss_mlp": 0.34765625, + "step": 1006, + "time_per_iteration": 2.9068872928619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098932, + "balance_loss_mlp": 1.06495738, + "epoch": 0.19372835706040784, + "flos": 1209665180160.0, + "grad_norm": 0.05336104081377929, + "language_loss": 0.80917025, + "learning_rate": 0.000931341591557042, + "loss": 0.82015955, + "num_input_tokens_seen": 83331296, + "router_z_loss_mlp": 0.34008789, + "step": 1007, + "time_per_iteration": 3.759119749069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098415, + "balance_loss_mlp": 1.06291509, + "epoch": 0.19392073874567142, + "flos": 520368606720.0, + "grad_norm": 0.06549831272650784, + "language_loss": 0.87757689, + "learning_rate": 0.0009311839469569325, + "loss": 0.88856107, + "num_input_tokens_seen": 83399952, + "router_z_loss_mlp": 0.35522461, + "step": 1008, + "time_per_iteration": 2.6298930644989014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100893, + "balance_loss_mlp": 1.06620264, + "epoch": 0.19411312043093498, + "flos": 588543293952.0, + "grad_norm": 0.06763315162421418, + "language_loss": 0.8732397, + "learning_rate": 0.0009310261349563687, + "loss": 0.88424855, + "num_input_tokens_seen": 83468384, + "router_z_loss_mlp": 0.34692383, + "step": 1009, + "time_per_iteration": 2.6843061447143555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110066, + "balance_loss_mlp": 1.06718588, + "epoch": 0.19430550211619854, + "flos": 579085867008.0, + "grad_norm": 0.05371296475785438, + "language_loss": 0.8534441, + "learning_rate": 0.0009308681556166186, + "loss": 0.86445075, + "num_input_tokens_seen": 83547952, + "router_z_loss_mlp": 0.33496094, + "step": 1010, + "time_per_iteration": 2.8197336196899414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107606, + "balance_loss_mlp": 1.07291579, + "epoch": 0.1944978838014621, + "flos": 620848281600.0, + "grad_norm": 0.08312668477716535, + "language_loss": 0.87206143, + "learning_rate": 0.0009307100089990152, + "loss": 0.88313752, + "num_input_tokens_seen": 83615712, + "router_z_loss_mlp": 0.34716797, + "step": 1011, + "time_per_iteration": 2.7118990421295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101924, + "balance_loss_mlp": 1.0672822, + "epoch": 0.19469026548672566, + "flos": 598440089088.0, + "grad_norm": 0.061832865854500894, + "language_loss": 0.83946323, + "learning_rate": 0.0009305516951649568, + "loss": 0.85048252, + "num_input_tokens_seen": 83687296, + "router_z_loss_mlp": 0.34667969, + "step": 1012, + "time_per_iteration": 2.667672872543335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096764, + "balance_loss_mlp": 1.06314659, + "epoch": 0.19488264717198922, + "flos": 551890810368.0, + "grad_norm": 0.04827143175142062, + "language_loss": 0.87187612, + "learning_rate": 0.0009303932141759057, + "loss": 0.88284373, + "num_input_tokens_seen": 83763168, + "router_z_loss_mlp": 0.33642578, + "step": 1013, + "time_per_iteration": 2.7321088314056396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101502, + "balance_loss_mlp": 1.06705046, + "epoch": 0.19507502885725278, + "flos": 665842166784.0, + "grad_norm": 0.05715794205563071, + "language_loss": 0.84201366, + "learning_rate": 0.0009302345660933902, + "loss": 0.85302866, + "num_input_tokens_seen": 83837312, + "router_z_loss_mlp": 0.3449707, + "step": 1014, + "time_per_iteration": 2.7699263095855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109904, + "balance_loss_mlp": 1.07616735, + "epoch": 0.19526741054251634, + "flos": 670771625472.0, + "grad_norm": 0.05949834877265084, + "language_loss": 0.84866655, + "learning_rate": 0.0009300757509790026, + "loss": 0.85976553, + "num_input_tokens_seen": 83917120, + "router_z_loss_mlp": 0.33764648, + "step": 1015, + "time_per_iteration": 2.8250515460968018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110813, + "balance_loss_mlp": 1.0766474, + "epoch": 0.19545979222777993, + "flos": 446983653888.0, + "grad_norm": 0.0671511226198219, + "language_loss": 0.90974069, + "learning_rate": 0.0009299167688944005, + "loss": 0.92084885, + "num_input_tokens_seen": 83982992, + "router_z_loss_mlp": 0.34204102, + "step": 1016, + "time_per_iteration": 2.545133590698242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111157, + "balance_loss_mlp": 1.07778645, + "epoch": 0.1956521739130435, + "flos": 568813722624.0, + "grad_norm": 0.06338586690579641, + "language_loss": 0.85958129, + "learning_rate": 0.0009297576199013063, + "loss": 0.87069696, + "num_input_tokens_seen": 84057296, + "router_z_loss_mlp": 0.33813477, + "step": 1017, + "time_per_iteration": 2.668503761291504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148218, + "balance_loss_mlp": 1.13295972, + "epoch": 0.19584455559830705, + "flos": 1454969157120.0, + "grad_norm": 0.047651466398381144, + "language_loss": 0.73002136, + "learning_rate": 0.0009295983040615071, + "loss": 0.74150348, + "num_input_tokens_seen": 84292640, + "router_z_loss_mlp": 0.15234375, + "step": 1018, + "time_per_iteration": 4.920944929122925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104842, + "balance_loss_mlp": 1.09015501, + "epoch": 0.1960369372835706, + "flos": 1590320369664.0, + "grad_norm": 0.036993279908541045, + "language_loss": 0.79426301, + "learning_rate": 0.0009294388214368547, + "loss": 0.80531144, + "num_input_tokens_seen": 84524448, + "router_z_loss_mlp": 0.14648438, + "step": 1019, + "time_per_iteration": 6.0059425830841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118502, + "balance_loss_mlp": 1.08505166, + "epoch": 0.19622931896883417, + "flos": 615709237248.0, + "grad_norm": 0.05240041234704895, + "language_loss": 0.86600977, + "learning_rate": 0.0009292791720892659, + "loss": 0.87719476, + "num_input_tokens_seen": 84600208, + "router_z_loss_mlp": 0.3347168, + "step": 1020, + "time_per_iteration": 2.995192527770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113873, + "balance_loss_mlp": 1.07930255, + "epoch": 0.19642170065409773, + "flos": 465950790144.0, + "grad_norm": 0.0657036282835547, + "language_loss": 0.88724279, + "learning_rate": 0.0009291193560807218, + "loss": 0.89838147, + "num_input_tokens_seen": 84668032, + "router_z_loss_mlp": 0.34594727, + "step": 1021, + "time_per_iteration": 2.633256196975708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114293, + "balance_loss_mlp": 1.07962656, + "epoch": 0.19661408233936128, + "flos": 515040477696.0, + "grad_norm": 0.054836200403870924, + "language_loss": 0.87439638, + "learning_rate": 0.0009289593734732688, + "loss": 0.88553929, + "num_input_tokens_seen": 84738176, + "router_z_loss_mlp": 0.34716797, + "step": 1022, + "time_per_iteration": 2.622284173965454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107262, + "balance_loss_mlp": 1.0736922, + "epoch": 0.19680646402462484, + "flos": 392427624960.0, + "grad_norm": 0.053036961045345866, + "language_loss": 0.94139373, + "learning_rate": 0.0009287992243290175, + "loss": 0.95246631, + "num_input_tokens_seen": 84799936, + "router_z_loss_mlp": 0.3359375, + "step": 1023, + "time_per_iteration": 2.4402668476104736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108975, + "balance_loss_mlp": 1.07247353, + "epoch": 0.19699884570988843, + "flos": 626122566144.0, + "grad_norm": 0.056904835680118435, + "language_loss": 0.90850759, + "learning_rate": 0.0009286389087101435, + "loss": 0.91959733, + "num_input_tokens_seen": 84877216, + "router_z_loss_mlp": 0.36523438, + "step": 1024, + "time_per_iteration": 2.762068271636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110519, + "balance_loss_mlp": 1.06957078, + "epoch": 0.197191227395152, + "flos": 557710742016.0, + "grad_norm": 0.05298833269370499, + "language_loss": 0.88575542, + "learning_rate": 0.0009284784266788864, + "loss": 0.89680731, + "num_input_tokens_seen": 84952464, + "router_z_loss_mlp": 0.35668945, + "step": 1025, + "time_per_iteration": 4.087035417556763 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109606, + "balance_loss_mlp": 1.07565546, + "epoch": 0.19738360908041555, + "flos": 664681061376.0, + "grad_norm": 0.0565537913278748, + "language_loss": 0.92494339, + "learning_rate": 0.0009283177782975512, + "loss": 0.93603945, + "num_input_tokens_seen": 85031488, + "router_z_loss_mlp": 0.33984375, + "step": 1026, + "time_per_iteration": 2.948167562484741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095626, + "balance_loss_mlp": 1.06117415, + "epoch": 0.1975759907656791, + "flos": 522244094976.0, + "grad_norm": 0.06218898027866582, + "language_loss": 0.88052273, + "learning_rate": 0.000928156963628507, + "loss": 0.89147896, + "num_input_tokens_seen": 85098384, + "router_z_loss_mlp": 0.3449707, + "step": 1027, + "time_per_iteration": 2.564019203186035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091019, + "balance_loss_mlp": 1.05694866, + "epoch": 0.19776837245094267, + "flos": 462233309184.0, + "grad_norm": 0.056114928823487176, + "language_loss": 0.8826099, + "learning_rate": 0.0009279959827341877, + "loss": 0.89352006, + "num_input_tokens_seen": 85172944, + "router_z_loss_mlp": 0.34082031, + "step": 1028, + "time_per_iteration": 2.7226340770721436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090946, + "balance_loss_mlp": 1.05699515, + "epoch": 0.19796075413620623, + "flos": 502809887232.0, + "grad_norm": 0.05507551359640612, + "language_loss": 0.88204837, + "learning_rate": 0.0009278348356770915, + "loss": 0.89295781, + "num_input_tokens_seen": 85241632, + "router_z_loss_mlp": 0.33984375, + "step": 1029, + "time_per_iteration": 2.592756748199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085455, + "balance_loss_mlp": 1.05093157, + "epoch": 0.1981531358214698, + "flos": 507281038848.0, + "grad_norm": 0.061172366255401664, + "language_loss": 0.85939109, + "learning_rate": 0.0009276735225197814, + "loss": 0.87024558, + "num_input_tokens_seen": 85308992, + "router_z_loss_mlp": 0.34570312, + "step": 1030, + "time_per_iteration": 2.598607063293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088832, + "balance_loss_mlp": 1.05495238, + "epoch": 0.19834551750673335, + "flos": 531275148288.0, + "grad_norm": 0.0802549423316463, + "language_loss": 0.86293721, + "learning_rate": 0.0009275120433248847, + "loss": 0.87382561, + "num_input_tokens_seen": 85381936, + "router_z_loss_mlp": 0.33886719, + "step": 1031, + "time_per_iteration": 2.7143311500549316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090216, + "balance_loss_mlp": 1.05726683, + "epoch": 0.1985378991919969, + "flos": 775147691520.0, + "grad_norm": 0.05308511447166053, + "language_loss": 0.86272347, + "learning_rate": 0.0009273503981550931, + "loss": 0.87362564, + "num_input_tokens_seen": 85474352, + "router_z_loss_mlp": 0.32958984, + "step": 1032, + "time_per_iteration": 3.0616648197174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087082, + "balance_loss_mlp": 1.05351269, + "epoch": 0.1987302808772605, + "flos": 434063411712.0, + "grad_norm": 0.059916166081832097, + "language_loss": 0.8703599, + "learning_rate": 0.0009271885870731626, + "loss": 0.88123071, + "num_input_tokens_seen": 85538416, + "router_z_loss_mlp": 0.3359375, + "step": 1033, + "time_per_iteration": 2.487316131591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092715, + "balance_loss_mlp": 1.05921745, + "epoch": 0.19892266256252406, + "flos": 553342897152.0, + "grad_norm": 0.06168947094446192, + "language_loss": 0.88599998, + "learning_rate": 0.0009270266101419143, + "loss": 0.89692712, + "num_input_tokens_seen": 85604416, + "router_z_loss_mlp": 0.33520508, + "step": 1034, + "time_per_iteration": 2.5978119373321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085912, + "balance_loss_mlp": 1.05403578, + "epoch": 0.19911504424778761, + "flos": 549596302848.0, + "grad_norm": 0.06019117447906982, + "language_loss": 0.85564321, + "learning_rate": 0.0009268644674242328, + "loss": 0.86650234, + "num_input_tokens_seen": 85677008, + "router_z_loss_mlp": 0.31860352, + "step": 1035, + "time_per_iteration": 2.7259163856506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097892, + "balance_loss_mlp": 1.0645138, + "epoch": 0.19930742593305117, + "flos": 518024636928.0, + "grad_norm": 0.05869793462101787, + "language_loss": 0.81141233, + "learning_rate": 0.0009267021589830678, + "loss": 0.82239127, + "num_input_tokens_seen": 85745200, + "router_z_loss_mlp": 0.33398438, + "step": 1036, + "time_per_iteration": 2.597724199295044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161292, + "balance_loss_mlp": 1.14507985, + "epoch": 0.19949980761831473, + "flos": 1508516849664.0, + "grad_norm": 0.04621309141147155, + "language_loss": 0.77627081, + "learning_rate": 0.0009265396848814328, + "loss": 0.78788376, + "num_input_tokens_seen": 85980608, + "router_z_loss_mlp": 0.16210938, + "step": 1037, + "time_per_iteration": 4.918612241744995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093993, + "balance_loss_mlp": 1.06044722, + "epoch": 0.1996921893035783, + "flos": 697803738624.0, + "grad_norm": 0.061892224045152405, + "language_loss": 0.93283784, + "learning_rate": 0.000926377045182406, + "loss": 0.94377768, + "num_input_tokens_seen": 86055952, + "router_z_loss_mlp": 0.33569336, + "step": 1038, + "time_per_iteration": 2.8800160884857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096412, + "balance_loss_mlp": 1.06334293, + "epoch": 0.19988457098884185, + "flos": 726682226688.0, + "grad_norm": 0.0613562398808313, + "language_loss": 0.87972045, + "learning_rate": 0.0009262142399491296, + "loss": 0.89068449, + "num_input_tokens_seen": 86145536, + "router_z_loss_mlp": 0.33081055, + "step": 1039, + "time_per_iteration": 3.0561435222625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097284, + "balance_loss_mlp": 1.06345224, + "epoch": 0.2000769526741054, + "flos": 560275881984.0, + "grad_norm": 0.06364175085873486, + "language_loss": 0.87837642, + "learning_rate": 0.0009260512692448105, + "loss": 0.88934934, + "num_input_tokens_seen": 86214480, + "router_z_loss_mlp": 0.33862305, + "step": 1040, + "time_per_iteration": 2.7037088871002197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090351, + "balance_loss_mlp": 1.05697203, + "epoch": 0.200269334359369, + "flos": 571758594048.0, + "grad_norm": 0.05851279903795688, + "language_loss": 0.84325236, + "learning_rate": 0.000925888133132719, + "loss": 0.85415584, + "num_input_tokens_seen": 86289824, + "router_z_loss_mlp": 0.33398438, + "step": 1041, + "time_per_iteration": 2.836177110671997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082089, + "balance_loss_mlp": 1.06730711, + "epoch": 0.20046171604463256, + "flos": 1485362340864.0, + "grad_norm": 0.029405325300647274, + "language_loss": 0.79610431, + "learning_rate": 0.0009257248316761906, + "loss": 0.80692518, + "num_input_tokens_seen": 86516384, + "router_z_loss_mlp": 0.14746094, + "step": 1042, + "time_per_iteration": 4.901337146759033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094947, + "balance_loss_mlp": 1.06140149, + "epoch": 0.20065409772989612, + "flos": 496266808320.0, + "grad_norm": 0.07205728359427886, + "language_loss": 0.81256473, + "learning_rate": 0.0009255613649386244, + "loss": 0.82351422, + "num_input_tokens_seen": 86587296, + "router_z_loss_mlp": 0.33544922, + "step": 1043, + "time_per_iteration": 2.6198885440826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089381, + "balance_loss_mlp": 1.05686069, + "epoch": 0.20084647941515968, + "flos": 579094631424.0, + "grad_norm": 0.06625931968059934, + "language_loss": 0.79017001, + "learning_rate": 0.0009253977329834838, + "loss": 0.80106384, + "num_input_tokens_seen": 86662656, + "router_z_loss_mlp": 0.32519531, + "step": 1044, + "time_per_iteration": 2.6872498989105225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111234, + "balance_loss_mlp": 1.07745981, + "epoch": 0.20103886110042324, + "flos": 641775273984.0, + "grad_norm": 0.06628294367657735, + "language_loss": 0.86666185, + "learning_rate": 0.0009252339358742965, + "loss": 0.87778521, + "num_input_tokens_seen": 86734704, + "router_z_loss_mlp": 0.34912109, + "step": 1045, + "time_per_iteration": 2.7749996185302734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116952, + "balance_loss_mlp": 1.08219087, + "epoch": 0.2012312427856868, + "flos": 441720953856.0, + "grad_norm": 0.05401214919341486, + "language_loss": 0.83449644, + "learning_rate": 0.000925069973674654, + "loss": 0.84566593, + "num_input_tokens_seen": 86806512, + "router_z_loss_mlp": 0.34814453, + "step": 1046, + "time_per_iteration": 2.662992477416992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114258, + "balance_loss_mlp": 1.08116508, + "epoch": 0.20142362447095036, + "flos": 554135855616.0, + "grad_norm": 0.049297877184233195, + "language_loss": 0.88960069, + "learning_rate": 0.000924905846448212, + "loss": 0.90074325, + "num_input_tokens_seen": 86883440, + "router_z_loss_mlp": 0.33105469, + "step": 1047, + "time_per_iteration": 2.8164925575256348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127137, + "balance_loss_mlp": 1.09306693, + "epoch": 0.20161600615621392, + "flos": 669988841472.0, + "grad_norm": 0.07365230282100185, + "language_loss": 0.85615861, + "learning_rate": 0.0009247415542586906, + "loss": 0.86742997, + "num_input_tokens_seen": 86960208, + "router_z_loss_mlp": 0.34106445, + "step": 1048, + "time_per_iteration": 2.858611822128296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115399, + "balance_loss_mlp": 1.08130527, + "epoch": 0.2018083878414775, + "flos": 572788689408.0, + "grad_norm": 0.05223287600750505, + "language_loss": 0.83514655, + "learning_rate": 0.0009245770971698735, + "loss": 0.84630048, + "num_input_tokens_seen": 87044144, + "router_z_loss_mlp": 0.34106445, + "step": 1049, + "time_per_iteration": 2.8758163452148438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103626, + "balance_loss_mlp": 1.07036686, + "epoch": 0.20200076952674106, + "flos": 425624495616.0, + "grad_norm": 0.061140118103518055, + "language_loss": 0.88792473, + "learning_rate": 0.0009244124752456087, + "loss": 0.89896095, + "num_input_tokens_seen": 87109136, + "router_z_loss_mlp": 0.33276367, + "step": 1050, + "time_per_iteration": 2.501565456390381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097765, + "balance_loss_mlp": 1.06457758, + "epoch": 0.20219315121200462, + "flos": 536326852608.0, + "grad_norm": 0.049507299183714965, + "language_loss": 0.85344577, + "learning_rate": 0.0009242476885498081, + "loss": 0.86442339, + "num_input_tokens_seen": 87184320, + "router_z_loss_mlp": 0.33203125, + "step": 1051, + "time_per_iteration": 2.698791027069092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095962, + "balance_loss_mlp": 1.06222594, + "epoch": 0.20238553289726818, + "flos": 477634323456.0, + "grad_norm": 0.07140169421024865, + "language_loss": 0.8134433, + "learning_rate": 0.0009240827371464474, + "loss": 0.82440293, + "num_input_tokens_seen": 87248224, + "router_z_loss_mlp": 0.33764648, + "step": 1052, + "time_per_iteration": 2.5603079795837402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082157, + "balance_loss_mlp": 1.04958868, + "epoch": 0.20257791458253174, + "flos": 1151611430400.0, + "grad_norm": 0.06069279327125781, + "language_loss": 0.84372044, + "learning_rate": 0.0009239176210995666, + "loss": 0.85454196, + "num_input_tokens_seen": 87333088, + "router_z_loss_mlp": 0.32568359, + "step": 1053, + "time_per_iteration": 3.4549684524536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088056, + "balance_loss_mlp": 1.05393791, + "epoch": 0.2027702962677953, + "flos": 666606011904.0, + "grad_norm": 0.06066867592012189, + "language_loss": 0.93657684, + "learning_rate": 0.0009237523404732695, + "loss": 0.94745743, + "num_input_tokens_seen": 87413840, + "router_z_loss_mlp": 0.34130859, + "step": 1054, + "time_per_iteration": 4.344247817993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078384, + "balance_loss_mlp": 1.04567289, + "epoch": 0.20296267795305886, + "flos": 641011428864.0, + "grad_norm": 0.0678922331878557, + "language_loss": 0.84289086, + "learning_rate": 0.0009235868953317235, + "loss": 0.85367465, + "num_input_tokens_seen": 87487168, + "router_z_loss_mlp": 0.32714844, + "step": 1055, + "time_per_iteration": 2.7755184173583984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086857, + "balance_loss_mlp": 1.05321646, + "epoch": 0.20315505963832242, + "flos": 930187777536.0, + "grad_norm": 0.06816541670806936, + "language_loss": 0.85603452, + "learning_rate": 0.0009234212857391602, + "loss": 0.86690307, + "num_input_tokens_seen": 87573184, + "router_z_loss_mlp": 0.33642578, + "step": 1056, + "time_per_iteration": 3.1736087799072266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089034, + "balance_loss_mlp": 1.05477369, + "epoch": 0.20334744132358598, + "flos": 561818128896.0, + "grad_norm": 0.05209348313890264, + "language_loss": 0.88978589, + "learning_rate": 0.000923255511759875, + "loss": 0.90067613, + "num_input_tokens_seen": 87651968, + "router_z_loss_mlp": 0.34301758, + "step": 1057, + "time_per_iteration": 2.7823617458343506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093097, + "balance_loss_mlp": 1.05945587, + "epoch": 0.20353982300884957, + "flos": 643902455808.0, + "grad_norm": 0.061337083912670884, + "language_loss": 0.85219932, + "learning_rate": 0.000923089573458227, + "loss": 0.86313027, + "num_input_tokens_seen": 87727792, + "router_z_loss_mlp": 0.33666992, + "step": 1058, + "time_per_iteration": 2.8398988246917725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092738, + "balance_loss_mlp": 1.05952644, + "epoch": 0.20373220469411313, + "flos": 651101690880.0, + "grad_norm": 0.0713114334987562, + "language_loss": 0.84425724, + "learning_rate": 0.0009229234708986392, + "loss": 0.85518456, + "num_input_tokens_seen": 87806048, + "router_z_loss_mlp": 0.33203125, + "step": 1059, + "time_per_iteration": 2.891934394836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069957, + "balance_loss_mlp": 1.05603302, + "epoch": 0.2039245863793767, + "flos": 1436939136000.0, + "grad_norm": 0.037855568460977755, + "language_loss": 0.81666899, + "learning_rate": 0.0009227572041455982, + "loss": 0.8273685, + "num_input_tokens_seen": 88018160, + "router_z_loss_mlp": 0.13964844, + "step": 1060, + "time_per_iteration": 4.673142194747925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092929, + "balance_loss_mlp": 1.05985999, + "epoch": 0.20411696806464025, + "flos": 596678082048.0, + "grad_norm": 0.07190006801568614, + "language_loss": 0.85404283, + "learning_rate": 0.0009225907732636548, + "loss": 0.86497211, + "num_input_tokens_seen": 88090864, + "router_z_loss_mlp": 0.33081055, + "step": 1061, + "time_per_iteration": 2.74110746383667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091714, + "balance_loss_mlp": 1.0585742, + "epoch": 0.2043093497499038, + "flos": 573530775552.0, + "grad_norm": 0.06271161302412134, + "language_loss": 0.86991799, + "learning_rate": 0.0009224241783174227, + "loss": 0.88083506, + "num_input_tokens_seen": 88161360, + "router_z_loss_mlp": 0.33154297, + "step": 1062, + "time_per_iteration": 2.6885697841644287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084619, + "balance_loss_mlp": 1.05233693, + "epoch": 0.20450173143516737, + "flos": 630061217280.0, + "grad_norm": 0.055816021094363524, + "language_loss": 0.85842204, + "learning_rate": 0.0009222574193715802, + "loss": 0.86926818, + "num_input_tokens_seen": 88234960, + "router_z_loss_mlp": 0.32275391, + "step": 1063, + "time_per_iteration": 2.7569899559020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087796, + "balance_loss_mlp": 1.05522823, + "epoch": 0.20469411312043093, + "flos": 573718450176.0, + "grad_norm": 0.051897822989382614, + "language_loss": 0.8621105, + "learning_rate": 0.000922090496490869, + "loss": 0.87298846, + "num_input_tokens_seen": 88308176, + "router_z_loss_mlp": 0.32568359, + "step": 1064, + "time_per_iteration": 2.7099597454071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082485, + "balance_loss_mlp": 1.05025065, + "epoch": 0.20488649480569449, + "flos": 636748300800.0, + "grad_norm": 0.04962250787968228, + "language_loss": 0.90165728, + "learning_rate": 0.0009219234097400937, + "loss": 0.91248214, + "num_input_tokens_seen": 88386768, + "router_z_loss_mlp": 0.32226562, + "step": 1065, + "time_per_iteration": 2.8492319583892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108042, + "balance_loss_mlp": 1.04806709, + "epoch": 0.20507887649095807, + "flos": 975383894016.0, + "grad_norm": 0.051536979552593745, + "language_loss": 0.83029723, + "learning_rate": 0.0009217561591841237, + "loss": 0.84110147, + "num_input_tokens_seen": 88476576, + "router_z_loss_mlp": 0.32348633, + "step": 1066, + "time_per_iteration": 3.267207145690918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082085, + "balance_loss_mlp": 1.04951739, + "epoch": 0.20527125817622163, + "flos": 485940819456.0, + "grad_norm": 0.09661652793466288, + "language_loss": 0.81334901, + "learning_rate": 0.0009215887448878913, + "loss": 0.82416987, + "num_input_tokens_seen": 88541968, + "router_z_loss_mlp": 0.32568359, + "step": 1067, + "time_per_iteration": 2.5429391860961914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088998, + "balance_loss_mlp": 1.05552411, + "epoch": 0.2054636398614852, + "flos": 526921860096.0, + "grad_norm": 0.09953641970782799, + "language_loss": 0.85144234, + "learning_rate": 0.0009214211669163922, + "loss": 0.86233234, + "num_input_tokens_seen": 88615296, + "router_z_loss_mlp": 0.33496094, + "step": 1068, + "time_per_iteration": 2.7006540298461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085875, + "balance_loss_mlp": 1.05428481, + "epoch": 0.20565602154674875, + "flos": 557898416640.0, + "grad_norm": 0.048729379907622286, + "language_loss": 0.93896133, + "learning_rate": 0.0009212534253346862, + "loss": 0.94982004, + "num_input_tokens_seen": 88691584, + "router_z_loss_mlp": 0.31567383, + "step": 1069, + "time_per_iteration": 2.7544496059417725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098993, + "balance_loss_mlp": 1.06713986, + "epoch": 0.2058484032320123, + "flos": 503976784896.0, + "grad_norm": 0.06649355865978995, + "language_loss": 0.8497259, + "learning_rate": 0.0009210855202078964, + "loss": 0.86071587, + "num_input_tokens_seen": 88756592, + "router_z_loss_mlp": 0.31835938, + "step": 1070, + "time_per_iteration": 2.59660005569458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113348, + "balance_loss_mlp": 1.07975471, + "epoch": 0.20604078491727587, + "flos": 432950358528.0, + "grad_norm": 0.06315152856471482, + "language_loss": 0.87476587, + "learning_rate": 0.0009209174516012091, + "loss": 0.88589936, + "num_input_tokens_seen": 88820928, + "router_z_loss_mlp": 0.3359375, + "step": 1071, + "time_per_iteration": 2.498087167739868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110052, + "balance_loss_mlp": 1.07624412, + "epoch": 0.20623316660253943, + "flos": 608421252096.0, + "grad_norm": 0.06211591839104366, + "language_loss": 0.89244497, + "learning_rate": 0.0009207492195798747, + "loss": 0.9035455, + "num_input_tokens_seen": 88895440, + "router_z_loss_mlp": 0.33837891, + "step": 1072, + "time_per_iteration": 2.760019063949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116237, + "balance_loss_mlp": 1.08142757, + "epoch": 0.206425548287803, + "flos": 480184906752.0, + "grad_norm": 0.07379229384440758, + "language_loss": 0.84887302, + "learning_rate": 0.0009205808242092061, + "loss": 0.86003542, + "num_input_tokens_seen": 88964400, + "router_z_loss_mlp": 0.34838867, + "step": 1073, + "time_per_iteration": 2.6034529209136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118683, + "balance_loss_mlp": 1.08423102, + "epoch": 0.20661792997306658, + "flos": 949007937024.0, + "grad_norm": 0.0763588165275792, + "language_loss": 0.82845032, + "learning_rate": 0.0009204122655545808, + "loss": 0.83963716, + "num_input_tokens_seen": 89049600, + "router_z_loss_mlp": 0.34472656, + "step": 1074, + "time_per_iteration": 3.3222029209136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111187, + "balance_loss_mlp": 1.07759392, + "epoch": 0.20681031165833014, + "flos": 603206604288.0, + "grad_norm": 0.05592396046249817, + "language_loss": 0.80705297, + "learning_rate": 0.0009202435436814388, + "loss": 0.81816483, + "num_input_tokens_seen": 89119024, + "router_z_loss_mlp": 0.33618164, + "step": 1075, + "time_per_iteration": 2.721888780593872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114903, + "balance_loss_mlp": 1.08121455, + "epoch": 0.2070026933435937, + "flos": 708665200128.0, + "grad_norm": 0.07630450069092473, + "language_loss": 0.89700603, + "learning_rate": 0.0009200746586552836, + "loss": 0.90815508, + "num_input_tokens_seen": 89197344, + "router_z_loss_mlp": 0.3371582, + "step": 1076, + "time_per_iteration": 2.8797900676727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107662, + "balance_loss_mlp": 1.07416463, + "epoch": 0.20719507502885726, + "flos": 829456409088.0, + "grad_norm": 0.06176881640488279, + "language_loss": 0.84210765, + "learning_rate": 0.0009199056105416825, + "loss": 0.85318428, + "num_input_tokens_seen": 89280464, + "router_z_loss_mlp": 0.33520508, + "step": 1077, + "time_per_iteration": 3.120950698852539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107166, + "balance_loss_mlp": 1.07312012, + "epoch": 0.20738745671412082, + "flos": 637993774080.0, + "grad_norm": 0.055893649084458805, + "language_loss": 0.86594802, + "learning_rate": 0.0009197363994062654, + "loss": 0.87701964, + "num_input_tokens_seen": 89353344, + "router_z_loss_mlp": 0.34057617, + "step": 1078, + "time_per_iteration": 2.8197755813598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086727, + "balance_loss_mlp": 1.05480289, + "epoch": 0.20757983839938438, + "flos": 685258845696.0, + "grad_norm": 0.054433441748304986, + "language_loss": 0.84861732, + "learning_rate": 0.0009195670253147262, + "loss": 0.85948461, + "num_input_tokens_seen": 89439328, + "router_z_loss_mlp": 0.3190918, + "step": 1079, + "time_per_iteration": 2.966987133026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096354, + "balance_loss_mlp": 1.06340468, + "epoch": 0.20777222008464794, + "flos": 519024208896.0, + "grad_norm": 0.07868801214896702, + "language_loss": 0.82301188, + "learning_rate": 0.0009193974883328216, + "loss": 0.83397532, + "num_input_tokens_seen": 89510160, + "router_z_loss_mlp": 0.32958984, + "step": 1080, + "time_per_iteration": 2.620704174041748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091209, + "balance_loss_mlp": 1.05725837, + "epoch": 0.2079646017699115, + "flos": 511136732160.0, + "grad_norm": 0.09961486538628272, + "language_loss": 0.87482947, + "learning_rate": 0.0009192277885263718, + "loss": 0.88574153, + "num_input_tokens_seen": 89582960, + "router_z_loss_mlp": 0.33984375, + "step": 1081, + "time_per_iteration": 2.6247479915618896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087114, + "balance_loss_mlp": 1.05352044, + "epoch": 0.20815698345517505, + "flos": 931409929728.0, + "grad_norm": 0.05448561879445608, + "language_loss": 0.86255312, + "learning_rate": 0.0009190579259612602, + "loss": 0.87342417, + "num_input_tokens_seen": 89675488, + "router_z_loss_mlp": 0.33618164, + "step": 1082, + "time_per_iteration": 3.2661428451538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085916, + "balance_loss_mlp": 1.05187023, + "epoch": 0.20834936514043864, + "flos": 632114205696.0, + "grad_norm": 0.059638645169798186, + "language_loss": 0.86669636, + "learning_rate": 0.000918887900703433, + "loss": 0.87755549, + "num_input_tokens_seen": 89747872, + "router_z_loss_mlp": 0.34082031, + "step": 1083, + "time_per_iteration": 2.7930080890655518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083976, + "balance_loss_mlp": 1.05038285, + "epoch": 0.2085417468257022, + "flos": 394170693120.0, + "grad_norm": 0.06326041775418027, + "language_loss": 0.90427065, + "learning_rate": 0.0009187177128188999, + "loss": 0.91511047, + "num_input_tokens_seen": 89810176, + "router_z_loss_mlp": 0.33618164, + "step": 1084, + "time_per_iteration": 2.431358814239502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054848, + "balance_loss_mlp": 1.04159176, + "epoch": 0.20873412851096576, + "flos": 1401387969024.0, + "grad_norm": 0.04127554175786628, + "language_loss": 0.77156538, + "learning_rate": 0.0009185473623737339, + "loss": 0.78211385, + "num_input_tokens_seen": 90038432, + "router_z_loss_mlp": 0.1328125, + "step": 1085, + "time_per_iteration": 6.352816343307495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080772, + "balance_loss_mlp": 1.04686832, + "epoch": 0.20892651019622932, + "flos": 447599112192.0, + "grad_norm": 0.06234370040412467, + "language_loss": 0.8612783, + "learning_rate": 0.000918376849434071, + "loss": 0.87208605, + "num_input_tokens_seen": 90101568, + "router_z_loss_mlp": 0.33935547, + "step": 1086, + "time_per_iteration": 2.5168843269348145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084999, + "balance_loss_mlp": 1.05040443, + "epoch": 0.20911889188149288, + "flos": 492863629824.0, + "grad_norm": 0.07820142019527274, + "language_loss": 0.90828383, + "learning_rate": 0.0009182061740661098, + "loss": 0.91913384, + "num_input_tokens_seen": 90169344, + "router_z_loss_mlp": 0.34643555, + "step": 1087, + "time_per_iteration": 2.5461525917053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083648, + "balance_loss_mlp": 1.0494113, + "epoch": 0.20931127356675644, + "flos": 840928946688.0, + "grad_norm": 0.05821627614551514, + "language_loss": 0.85034752, + "learning_rate": 0.0009180353363361127, + "loss": 0.86118406, + "num_input_tokens_seen": 90252416, + "router_z_loss_mlp": 0.3425293, + "step": 1088, + "time_per_iteration": 3.11942982673645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084605, + "balance_loss_mlp": 1.05036855, + "epoch": 0.20950365525202, + "flos": 756796013568.0, + "grad_norm": 0.06471498550944753, + "language_loss": 0.82101512, + "learning_rate": 0.0009178643363104044, + "loss": 0.83186114, + "num_input_tokens_seen": 90337952, + "router_z_loss_mlp": 0.34277344, + "step": 1089, + "time_per_iteration": 3.0986390113830566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107905, + "balance_loss_mlp": 1.04543328, + "epoch": 0.20969603693728356, + "flos": 472301812224.0, + "grad_norm": 0.07091461504319575, + "language_loss": 0.91050649, + "learning_rate": 0.0009176931740553735, + "loss": 0.92129695, + "num_input_tokens_seen": 90401488, + "router_z_loss_mlp": 0.33642578, + "step": 1090, + "time_per_iteration": 2.4965460300445557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108219, + "balance_loss_mlp": 1.04845381, + "epoch": 0.20988841862254715, + "flos": 976507121664.0, + "grad_norm": 0.05967441428812083, + "language_loss": 0.82829833, + "learning_rate": 0.0009175218496374708, + "loss": 0.83912027, + "num_input_tokens_seen": 90486144, + "router_z_loss_mlp": 0.33740234, + "step": 1091, + "time_per_iteration": 3.325467348098755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082454, + "balance_loss_mlp": 1.04917121, + "epoch": 0.2100808003078107, + "flos": 1092697731072.0, + "grad_norm": 0.06552872916111846, + "language_loss": 0.85816884, + "learning_rate": 0.0009173503631232103, + "loss": 0.86899334, + "num_input_tokens_seen": 90571504, + "router_z_loss_mlp": 0.33300781, + "step": 1092, + "time_per_iteration": 3.3492543697357178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080972, + "balance_loss_mlp": 1.04761767, + "epoch": 0.21027318199307427, + "flos": 1012567468032.0, + "grad_norm": 0.06864870254184631, + "language_loss": 0.8205356, + "learning_rate": 0.0009171787145791691, + "loss": 0.83134532, + "num_input_tokens_seen": 90646016, + "router_z_loss_mlp": 0.33374023, + "step": 1093, + "time_per_iteration": 3.229302167892456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083111, + "balance_loss_mlp": 1.04975629, + "epoch": 0.21046556367833782, + "flos": 521141216256.0, + "grad_norm": 0.08362122797221107, + "language_loss": 0.80208671, + "learning_rate": 0.000917006904071987, + "loss": 0.81291783, + "num_input_tokens_seen": 90713440, + "router_z_loss_mlp": 0.33374023, + "step": 1094, + "time_per_iteration": 2.5901217460632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091484, + "balance_loss_mlp": 1.05843902, + "epoch": 0.21065794536360138, + "flos": 603437948928.0, + "grad_norm": 0.05523679641811596, + "language_loss": 0.87209588, + "learning_rate": 0.0009168349316683669, + "loss": 0.88301063, + "num_input_tokens_seen": 90788208, + "router_z_loss_mlp": 0.33056641, + "step": 1095, + "time_per_iteration": 2.67250919342041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093141, + "balance_loss_mlp": 1.06081104, + "epoch": 0.21085032704886494, + "flos": 603045070848.0, + "grad_norm": 0.05347318487829757, + "language_loss": 0.82685143, + "learning_rate": 0.0009166627974350741, + "loss": 0.83778286, + "num_input_tokens_seen": 90873776, + "router_z_loss_mlp": 0.32324219, + "step": 1096, + "time_per_iteration": 2.9291882514953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097603, + "balance_loss_mlp": 1.06362867, + "epoch": 0.2110427087341285, + "flos": 637382697984.0, + "grad_norm": 0.059512513015867, + "language_loss": 0.89716321, + "learning_rate": 0.0009164905014389373, + "loss": 0.90813923, + "num_input_tokens_seen": 90945872, + "router_z_loss_mlp": 0.34008789, + "step": 1097, + "time_per_iteration": 2.7384700775146484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100621, + "balance_loss_mlp": 1.06798196, + "epoch": 0.21123509041939206, + "flos": 522667496448.0, + "grad_norm": 0.08051519151754843, + "language_loss": 0.87020361, + "learning_rate": 0.0009163180437468476, + "loss": 0.88120985, + "num_input_tokens_seen": 91016224, + "router_z_loss_mlp": 0.32641602, + "step": 1098, + "time_per_iteration": 2.584890365600586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109632, + "balance_loss_mlp": 1.0635848, + "epoch": 0.21142747210465565, + "flos": 450938271744.0, + "grad_norm": 0.05811835985780437, + "language_loss": 0.86184567, + "learning_rate": 0.000916145424425759, + "loss": 0.87280893, + "num_input_tokens_seen": 91086752, + "router_z_loss_mlp": 0.32739258, + "step": 1099, + "time_per_iteration": 2.6362791061401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104023, + "balance_loss_mlp": 1.07059634, + "epoch": 0.2116198537899192, + "flos": 875813630976.0, + "grad_norm": 0.07623729144092387, + "language_loss": 0.9082064, + "learning_rate": 0.0009159726435426885, + "loss": 0.91924655, + "num_input_tokens_seen": 91162960, + "router_z_loss_mlp": 0.33447266, + "step": 1100, + "time_per_iteration": 3.0668158531188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108924, + "balance_loss_mlp": 1.0554564, + "epoch": 0.21181223547518277, + "flos": 523410992640.0, + "grad_norm": 0.06029059005678133, + "language_loss": 0.90809137, + "learning_rate": 0.0009157997011647154, + "loss": 0.91898382, + "num_input_tokens_seen": 91229840, + "router_z_loss_mlp": 0.33813477, + "step": 1101, + "time_per_iteration": 2.5932393074035645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110663, + "balance_loss_mlp": 1.07327497, + "epoch": 0.21200461716044633, + "flos": 572014669824.0, + "grad_norm": 0.05812758027328986, + "language_loss": 0.86378956, + "learning_rate": 0.0009156265973589817, + "loss": 0.87485588, + "num_input_tokens_seen": 91307936, + "router_z_loss_mlp": 0.33374023, + "step": 1102, + "time_per_iteration": 2.79496431350708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110266, + "balance_loss_mlp": 1.07672012, + "epoch": 0.2121969988457099, + "flos": 544869075456.0, + "grad_norm": 0.0704183859149776, + "language_loss": 0.89789248, + "learning_rate": 0.0009154533321926926, + "loss": 0.90899515, + "num_input_tokens_seen": 91372848, + "router_z_loss_mlp": 0.33569336, + "step": 1103, + "time_per_iteration": 2.5982048511505127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107149, + "balance_loss_mlp": 1.07393694, + "epoch": 0.21238938053097345, + "flos": 843489704448.0, + "grad_norm": 0.06399101868010165, + "language_loss": 0.87705767, + "learning_rate": 0.0009152799057331156, + "loss": 0.88812917, + "num_input_tokens_seen": 91452768, + "router_z_loss_mlp": 0.33227539, + "step": 1104, + "time_per_iteration": 3.088672637939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100871, + "balance_loss_mlp": 1.06768262, + "epoch": 0.212581762216237, + "flos": 445984081920.0, + "grad_norm": 0.064105004549741, + "language_loss": 0.91047186, + "learning_rate": 0.0009151063180475805, + "loss": 0.9214806, + "num_input_tokens_seen": 91519888, + "router_z_loss_mlp": 0.33203125, + "step": 1105, + "time_per_iteration": 2.569998025894165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090698, + "balance_loss_mlp": 1.05798697, + "epoch": 0.21277414390150057, + "flos": 514129655808.0, + "grad_norm": 0.05967324045126681, + "language_loss": 0.84732658, + "learning_rate": 0.0009149325692034803, + "loss": 0.85823357, + "num_input_tokens_seen": 91585744, + "router_z_loss_mlp": 0.32714844, + "step": 1106, + "time_per_iteration": 2.6009016036987305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149918, + "balance_loss_mlp": 1.13380063, + "epoch": 0.21296652558676413, + "flos": 1484790552576.0, + "grad_norm": 0.04210654195905191, + "language_loss": 0.79203427, + "learning_rate": 0.0009147586592682702, + "loss": 0.80353343, + "num_input_tokens_seen": 91805840, + "router_z_loss_mlp": 0.16113281, + "step": 1107, + "time_per_iteration": 4.820629596710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082056, + "balance_loss_mlp": 1.04994082, + "epoch": 0.21315890727202771, + "flos": 845689669632.0, + "grad_norm": 0.07454945953507684, + "language_loss": 0.87513995, + "learning_rate": 0.0009145845883094678, + "loss": 0.88596046, + "num_input_tokens_seen": 91885936, + "router_z_loss_mlp": 0.32080078, + "step": 1108, + "time_per_iteration": 3.0311591625213623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083334, + "balance_loss_mlp": 1.04971695, + "epoch": 0.21335128895729127, + "flos": 629086376448.0, + "grad_norm": 0.07212897946446892, + "language_loss": 0.85387337, + "learning_rate": 0.000914410356394654, + "loss": 0.86470675, + "num_input_tokens_seen": 91959888, + "router_z_loss_mlp": 0.33642578, + "step": 1109, + "time_per_iteration": 2.7746968269348145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085071, + "balance_loss_mlp": 1.05102468, + "epoch": 0.21354367064255483, + "flos": 710649787392.0, + "grad_norm": 0.053148069764317206, + "language_loss": 0.85104829, + "learning_rate": 0.0009142359635914709, + "loss": 0.86189902, + "num_input_tokens_seen": 92043728, + "router_z_loss_mlp": 0.34057617, + "step": 1110, + "time_per_iteration": 3.109018564224243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083884, + "balance_loss_mlp": 1.05067194, + "epoch": 0.2137360523278184, + "flos": 455950688256.0, + "grad_norm": 0.07113647076789116, + "language_loss": 0.84692943, + "learning_rate": 0.0009140614099676245, + "loss": 0.8577683, + "num_input_tokens_seen": 92114096, + "router_z_loss_mlp": 0.33203125, + "step": 1111, + "time_per_iteration": 2.5607409477233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083673, + "balance_loss_mlp": 1.05072355, + "epoch": 0.21392843401308195, + "flos": 665749034496.0, + "grad_norm": 0.059219994241997045, + "language_loss": 0.82997137, + "learning_rate": 0.0009138866955908821, + "loss": 0.84080815, + "num_input_tokens_seen": 92193552, + "router_z_loss_mlp": 0.32958984, + "step": 1112, + "time_per_iteration": 2.901376724243164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086583, + "balance_loss_mlp": 1.05327559, + "epoch": 0.2141208156983455, + "flos": 748656843264.0, + "grad_norm": 0.06302145936378449, + "language_loss": 0.80617785, + "learning_rate": 0.0009137118205290738, + "loss": 0.81704366, + "num_input_tokens_seen": 92279248, + "router_z_loss_mlp": 0.33325195, + "step": 1113, + "time_per_iteration": 2.9629132747650146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085777, + "balance_loss_mlp": 1.05142069, + "epoch": 0.21431319738360907, + "flos": 418898124288.0, + "grad_norm": 0.06913372638273338, + "language_loss": 0.90778732, + "learning_rate": 0.0009135367848500924, + "loss": 0.91864502, + "num_input_tokens_seen": 92344064, + "router_z_loss_mlp": 0.34399414, + "step": 1114, + "time_per_iteration": 2.5860419273376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087087, + "balance_loss_mlp": 1.05406582, + "epoch": 0.21450557906887263, + "flos": 608849035776.0, + "grad_norm": 0.07370492115341919, + "language_loss": 0.86567605, + "learning_rate": 0.0009133615886218927, + "loss": 0.87654686, + "num_input_tokens_seen": 92410544, + "router_z_loss_mlp": 0.33032227, + "step": 1115, + "time_per_iteration": 2.6986265182495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095845, + "balance_loss_mlp": 1.06082106, + "epoch": 0.21469796075413622, + "flos": 561649393152.0, + "grad_norm": 0.0682239504380638, + "language_loss": 0.88444531, + "learning_rate": 0.0009131862319124917, + "loss": 0.89540386, + "num_input_tokens_seen": 92480272, + "router_z_loss_mlp": 0.3503418, + "step": 1116, + "time_per_iteration": 2.644977569580078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086331, + "balance_loss_mlp": 1.0540725, + "epoch": 0.21489034243939978, + "flos": 594363225600.0, + "grad_norm": 0.06937847326766512, + "language_loss": 0.8429122, + "learning_rate": 0.0009130107147899691, + "loss": 0.85377544, + "num_input_tokens_seen": 92555584, + "router_z_loss_mlp": 0.32250977, + "step": 1117, + "time_per_iteration": 2.768064498901367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084886, + "balance_loss_mlp": 1.05148315, + "epoch": 0.21508272412466334, + "flos": 441661317120.0, + "grad_norm": 0.09911577685113587, + "language_loss": 0.85504615, + "learning_rate": 0.0009128350373224665, + "loss": 0.86589503, + "num_input_tokens_seen": 92623136, + "router_z_loss_mlp": 0.33422852, + "step": 1118, + "time_per_iteration": 2.5369865894317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033841, + "balance_loss_mlp": 1.02206326, + "epoch": 0.2152751058099269, + "flos": 1495397348352.0, + "grad_norm": 0.028624916140058014, + "language_loss": 0.81456429, + "learning_rate": 0.0009126591995781883, + "loss": 0.82490271, + "num_input_tokens_seen": 92842608, + "router_z_loss_mlp": 0.11767578, + "step": 1119, + "time_per_iteration": 4.634536266326904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109091, + "balance_loss_mlp": 1.05741262, + "epoch": 0.21546748749519046, + "flos": 493759895040.0, + "grad_norm": 0.057336284262766976, + "language_loss": 0.85470641, + "learning_rate": 0.0009124832016254005, + "loss": 0.86561549, + "num_input_tokens_seen": 92912960, + "router_z_loss_mlp": 0.33520508, + "step": 1120, + "time_per_iteration": 2.57099986076355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097048, + "balance_loss_mlp": 1.06245303, + "epoch": 0.21565986918045402, + "flos": 634241387520.0, + "grad_norm": 0.0556622286599547, + "language_loss": 0.8842063, + "learning_rate": 0.0009123070435324316, + "loss": 0.89517677, + "num_input_tokens_seen": 92982272, + "router_z_loss_mlp": 0.34619141, + "step": 1121, + "time_per_iteration": 2.73698091506958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010392, + "balance_loss_mlp": 1.02780366, + "epoch": 0.21585225086571758, + "flos": 1582502704128.0, + "grad_norm": 0.024824935431588098, + "language_loss": 0.77875781, + "learning_rate": 0.0009121307253676722, + "loss": 0.78914982, + "num_input_tokens_seen": 93218752, + "router_z_loss_mlp": 0.11376953, + "step": 1122, + "time_per_iteration": 4.960963010787964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093163, + "balance_loss_mlp": 1.05897415, + "epoch": 0.21604463255098114, + "flos": 683799556608.0, + "grad_norm": 0.06637115500638362, + "language_loss": 0.86772728, + "learning_rate": 0.0009119542471995752, + "loss": 0.87865889, + "num_input_tokens_seen": 93293968, + "router_z_loss_mlp": 0.34204102, + "step": 1123, + "time_per_iteration": 2.819042205810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109659, + "balance_loss_mlp": 1.06228209, + "epoch": 0.2162370142362447, + "flos": 780660675072.0, + "grad_norm": 0.06221084946299637, + "language_loss": 0.81623554, + "learning_rate": 0.0009117776090966554, + "loss": 0.82720149, + "num_input_tokens_seen": 93367088, + "router_z_loss_mlp": 0.34326172, + "step": 1124, + "time_per_iteration": 2.9435975551605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090546, + "balance_loss_mlp": 1.05473578, + "epoch": 0.21642939592150828, + "flos": 1001745294336.0, + "grad_norm": 0.06219513600405685, + "language_loss": 0.86821365, + "learning_rate": 0.0009116008111274899, + "loss": 0.8791191, + "num_input_tokens_seen": 93452944, + "router_z_loss_mlp": 0.35839844, + "step": 1125, + "time_per_iteration": 3.250828504562378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01023544, + "balance_loss_mlp": 1.01271951, + "epoch": 0.21662177760677184, + "flos": 1481867440128.0, + "grad_norm": 0.013492425774453086, + "language_loss": 0.79106927, + "learning_rate": 0.0009114238533607176, + "loss": 0.8013047, + "num_input_tokens_seen": 93677328, + "router_z_loss_mlp": 0.10839844, + "step": 1126, + "time_per_iteration": 4.836662530899048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078043, + "balance_loss_mlp": 1.0431627, + "epoch": 0.2168141592920354, + "flos": 887030092800.0, + "grad_norm": 0.06408405180788145, + "language_loss": 0.84878719, + "learning_rate": 0.0009112467358650396, + "loss": 0.85956764, + "num_input_tokens_seen": 93756848, + "router_z_loss_mlp": 0.34912109, + "step": 1127, + "time_per_iteration": 3.118460178375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087154, + "balance_loss_mlp": 1.05205846, + "epoch": 0.21700654097729896, + "flos": 545682382848.0, + "grad_norm": 0.06014422622436645, + "language_loss": 0.86521864, + "learning_rate": 0.0009110694587092192, + "loss": 0.87609017, + "num_input_tokens_seen": 93834704, + "router_z_loss_mlp": 0.35131836, + "step": 1128, + "time_per_iteration": 2.736814022064209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080165, + "balance_loss_mlp": 1.0446167, + "epoch": 0.21719892266256252, + "flos": 509270008320.0, + "grad_norm": 0.06606219668196793, + "language_loss": 0.81429344, + "learning_rate": 0.0009108920219620815, + "loss": 0.82509506, + "num_input_tokens_seen": 93904448, + "router_z_loss_mlp": 0.35571289, + "step": 1129, + "time_per_iteration": 2.6214489936828613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083138, + "balance_loss_mlp": 1.04782772, + "epoch": 0.21739130434782608, + "flos": 543150738432.0, + "grad_norm": 0.060577581075914995, + "language_loss": 0.89903116, + "learning_rate": 0.0009107144256925133, + "loss": 0.90986252, + "num_input_tokens_seen": 93979312, + "router_z_loss_mlp": 0.35302734, + "step": 1130, + "time_per_iteration": 2.6337971687316895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079633, + "balance_loss_mlp": 1.04489541, + "epoch": 0.21758368603308964, + "flos": 616564804608.0, + "grad_norm": 0.0674499307688184, + "language_loss": 0.82610142, + "learning_rate": 0.0009105366699694638, + "loss": 0.83689773, + "num_input_tokens_seen": 94052032, + "router_z_loss_mlp": 0.34790039, + "step": 1131, + "time_per_iteration": 2.6984267234802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085371, + "balance_loss_mlp": 1.04979873, + "epoch": 0.2177760677183532, + "flos": 634813175808.0, + "grad_norm": 0.051829013054278075, + "language_loss": 0.8159321, + "learning_rate": 0.0009103587548619439, + "loss": 0.8267858, + "num_input_tokens_seen": 94124944, + "router_z_loss_mlp": 0.35571289, + "step": 1132, + "time_per_iteration": 2.8308732509613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083319, + "balance_loss_mlp": 1.04850996, + "epoch": 0.2179684494036168, + "flos": 532181587968.0, + "grad_norm": 0.06772780520844247, + "language_loss": 0.86115086, + "learning_rate": 0.0009101806804390261, + "loss": 0.87198412, + "num_input_tokens_seen": 94200384, + "router_z_loss_mlp": 0.34863281, + "step": 1133, + "time_per_iteration": 2.7745282649993896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081903, + "balance_loss_mlp": 1.04671264, + "epoch": 0.21816083108888035, + "flos": 474980433408.0, + "grad_norm": 0.05911376481567057, + "language_loss": 0.90451765, + "learning_rate": 0.0009100024467698453, + "loss": 0.91533667, + "num_input_tokens_seen": 94266992, + "router_z_loss_mlp": 0.35205078, + "step": 1134, + "time_per_iteration": 2.551278829574585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086865, + "balance_loss_mlp": 1.051054, + "epoch": 0.2183532127741439, + "flos": 577198794240.0, + "grad_norm": 0.07962415284192025, + "language_loss": 0.83050048, + "learning_rate": 0.0009098240539235981, + "loss": 0.84136909, + "num_input_tokens_seen": 94334304, + "router_z_loss_mlp": 0.3581543, + "step": 1135, + "time_per_iteration": 2.660019636154175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086508, + "balance_loss_mlp": 1.05172312, + "epoch": 0.21854559445940747, + "flos": 593832135168.0, + "grad_norm": 0.05867668726509775, + "language_loss": 0.87679315, + "learning_rate": 0.0009096455019695423, + "loss": 0.88765824, + "num_input_tokens_seen": 94413296, + "router_z_loss_mlp": 0.34838867, + "step": 1136, + "time_per_iteration": 2.756463050842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090029, + "balance_loss_mlp": 1.05426645, + "epoch": 0.21873797614467103, + "flos": 408464446464.0, + "grad_norm": 0.06290439978907646, + "language_loss": 0.90092266, + "learning_rate": 0.000909466790976998, + "loss": 0.91182297, + "num_input_tokens_seen": 94475840, + "router_z_loss_mlp": 0.35791016, + "step": 1137, + "time_per_iteration": 2.5046186447143555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089401, + "balance_loss_mlp": 1.05373359, + "epoch": 0.21893035782993459, + "flos": 893824865280.0, + "grad_norm": 0.05253297698454947, + "language_loss": 0.83030021, + "learning_rate": 0.0009092879210153473, + "loss": 0.84119421, + "num_input_tokens_seen": 94555184, + "router_z_loss_mlp": 0.35668945, + "step": 1138, + "time_per_iteration": 3.1294023990631104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092735, + "balance_loss_mlp": 1.05835557, + "epoch": 0.21912273951519814, + "flos": 467392702464.0, + "grad_norm": 0.05516730570048504, + "language_loss": 0.88930631, + "learning_rate": 0.0009091088921540333, + "loss": 0.90023363, + "num_input_tokens_seen": 94622656, + "router_z_loss_mlp": 0.34399414, + "step": 1139, + "time_per_iteration": 2.5161380767822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081896, + "balance_loss_mlp": 1.06921172, + "epoch": 0.2193151212004617, + "flos": 1531262665728.0, + "grad_norm": 0.036356034107047845, + "language_loss": 0.75508678, + "learning_rate": 0.0009089297044625615, + "loss": 0.76590574, + "num_input_tokens_seen": 94856496, + "router_z_loss_mlp": 0.12695312, + "step": 1140, + "time_per_iteration": 4.929131984710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087847, + "balance_loss_mlp": 1.05306172, + "epoch": 0.2195075028857253, + "flos": 590901820416.0, + "grad_norm": 0.07364984820319191, + "language_loss": 0.8488574, + "learning_rate": 0.0009087503580104985, + "loss": 0.85973585, + "num_input_tokens_seen": 94926880, + "router_z_loss_mlp": 0.34814453, + "step": 1141, + "time_per_iteration": 2.676321029663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087752, + "balance_loss_mlp": 1.05287158, + "epoch": 0.21969988457098885, + "flos": 636033917952.0, + "grad_norm": 0.0662048159418312, + "language_loss": 0.79610777, + "learning_rate": 0.0009085708528674728, + "loss": 0.80698538, + "num_input_tokens_seen": 95000528, + "router_z_loss_mlp": 0.34912109, + "step": 1142, + "time_per_iteration": 2.7667393684387207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088746, + "balance_loss_mlp": 1.05202913, + "epoch": 0.2198922662562524, + "flos": 911974311936.0, + "grad_norm": 0.07907290305355467, + "language_loss": 0.86086833, + "learning_rate": 0.0009083911891031745, + "loss": 0.87175578, + "num_input_tokens_seen": 95081040, + "router_z_loss_mlp": 0.36743164, + "step": 1143, + "time_per_iteration": 3.1026079654693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093334, + "balance_loss_mlp": 1.05809617, + "epoch": 0.22008464794151597, + "flos": 822603409920.0, + "grad_norm": 0.06284406217527433, + "language_loss": 0.91362917, + "learning_rate": 0.0009082113667873553, + "loss": 0.92456251, + "num_input_tokens_seen": 95167328, + "router_z_loss_mlp": 0.3527832, + "step": 1144, + "time_per_iteration": 3.098741292953491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107482, + "balance_loss_mlp": 1.07188582, + "epoch": 0.22027702962677953, + "flos": 459416475648.0, + "grad_norm": 0.06625631151069579, + "language_loss": 0.90562177, + "learning_rate": 0.0009080313859898283, + "loss": 0.91669661, + "num_input_tokens_seen": 95230304, + "router_z_loss_mlp": 0.35620117, + "step": 1145, + "time_per_iteration": 2.4998207092285156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111259, + "balance_loss_mlp": 1.07606888, + "epoch": 0.2204694113120431, + "flos": 530998723584.0, + "grad_norm": 0.05051092763003013, + "language_loss": 0.91815794, + "learning_rate": 0.0009078512467804684, + "loss": 0.92927051, + "num_input_tokens_seen": 95299520, + "router_z_loss_mlp": 0.35180664, + "step": 1146, + "time_per_iteration": 2.569073438644409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117194, + "balance_loss_mlp": 1.08162224, + "epoch": 0.22066179299730665, + "flos": 522382307328.0, + "grad_norm": 0.06837547739928014, + "language_loss": 0.90610331, + "learning_rate": 0.0009076709492292119, + "loss": 0.91727525, + "num_input_tokens_seen": 95368912, + "router_z_loss_mlp": 0.35571289, + "step": 1147, + "time_per_iteration": 2.614039659500122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114459, + "balance_loss_mlp": 1.07969809, + "epoch": 0.2208541746825702, + "flos": 546188742144.0, + "grad_norm": 0.06837160959472317, + "language_loss": 0.89193797, + "learning_rate": 0.0009074904934060562, + "loss": 0.90308249, + "num_input_tokens_seen": 95440800, + "router_z_loss_mlp": 0.34790039, + "step": 1148, + "time_per_iteration": 2.6419012546539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112121, + "balance_loss_mlp": 1.08578134, + "epoch": 0.22104655636783377, + "flos": 708404742144.0, + "grad_norm": 0.07108081727062696, + "language_loss": 0.84988266, + "learning_rate": 0.0009073098793810607, + "loss": 0.86109483, + "num_input_tokens_seen": 95519904, + "router_z_loss_mlp": 0.35473633, + "step": 1149, + "time_per_iteration": 2.909515142440796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116065, + "balance_loss_mlp": 1.08061242, + "epoch": 0.22123893805309736, + "flos": 584594468352.0, + "grad_norm": 0.07695680382665727, + "language_loss": 0.88374794, + "learning_rate": 0.000907129107224346, + "loss": 0.89490861, + "num_input_tokens_seen": 95591568, + "router_z_loss_mlp": 0.35522461, + "step": 1150, + "time_per_iteration": 2.7029008865356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099143, + "balance_loss_mlp": 1.06369042, + "epoch": 0.22143131973836092, + "flos": 492002270208.0, + "grad_norm": 0.049049579749502144, + "language_loss": 0.88305712, + "learning_rate": 0.0009069481770060939, + "loss": 0.89404863, + "num_input_tokens_seen": 95664480, + "router_z_loss_mlp": 0.35498047, + "step": 1151, + "time_per_iteration": 2.65167236328125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097606, + "balance_loss_mlp": 1.06248736, + "epoch": 0.22162370142362448, + "flos": 1079227459584.0, + "grad_norm": 0.054063738490033035, + "language_loss": 0.84240663, + "learning_rate": 0.000906767088796548, + "loss": 0.85338271, + "num_input_tokens_seen": 95754400, + "router_z_loss_mlp": 0.35180664, + "step": 1152, + "time_per_iteration": 3.423985004425049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110736, + "balance_loss_mlp": 1.07300401, + "epoch": 0.22181608310888803, + "flos": 492258345984.0, + "grad_norm": 0.057939830998464815, + "language_loss": 0.87012136, + "learning_rate": 0.0009065858426660127, + "loss": 0.88119501, + "num_input_tokens_seen": 95826944, + "router_z_loss_mlp": 0.34399414, + "step": 1153, + "time_per_iteration": 2.5987319946289062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108178, + "balance_loss_mlp": 1.07355952, + "epoch": 0.2220084647941516, + "flos": 723687892992.0, + "grad_norm": 0.0653796708212952, + "language_loss": 0.84926325, + "learning_rate": 0.0009064044386848543, + "loss": 0.86034507, + "num_input_tokens_seen": 95902688, + "router_z_loss_mlp": 0.34667969, + "step": 1154, + "time_per_iteration": 2.9024224281311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112372, + "balance_loss_mlp": 1.0753932, + "epoch": 0.22220084647941515, + "flos": 488988997632.0, + "grad_norm": 0.06606878403176955, + "language_loss": 0.88905716, + "learning_rate": 0.0009062228769234997, + "loss": 0.90018088, + "num_input_tokens_seen": 95969952, + "router_z_loss_mlp": 0.36987305, + "step": 1155, + "time_per_iteration": 2.5483920574188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104305, + "balance_loss_mlp": 1.06868565, + "epoch": 0.2223932281646787, + "flos": 536025696768.0, + "grad_norm": 0.06185680569649912, + "language_loss": 0.81360811, + "learning_rate": 0.0009060411574524376, + "loss": 0.82465118, + "num_input_tokens_seen": 96037344, + "router_z_loss_mlp": 0.35644531, + "step": 1156, + "time_per_iteration": 2.629166841506958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114341, + "balance_loss_mlp": 1.0794363, + "epoch": 0.22258560984994227, + "flos": 931034580480.0, + "grad_norm": 0.06288530021121215, + "language_loss": 0.88191485, + "learning_rate": 0.0009058592803422178, + "loss": 0.8930583, + "num_input_tokens_seen": 96115616, + "router_z_loss_mlp": 0.34936523, + "step": 1157, + "time_per_iteration": 3.133453845977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219449, + "balance_loss_mlp": 1.20495331, + "epoch": 0.22277799153520586, + "flos": 1198998443520.0, + "grad_norm": 0.06392494715081258, + "language_loss": 0.78710288, + "learning_rate": 0.0009056772456634512, + "loss": 0.79929739, + "num_input_tokens_seen": 96333600, + "router_z_loss_mlp": 0.14453125, + "step": 1158, + "time_per_iteration": 4.838433027267456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095231, + "balance_loss_mlp": 1.0620904, + "epoch": 0.22297037322046942, + "flos": 501052262400.0, + "grad_norm": 0.059439708082357066, + "language_loss": 0.90095651, + "learning_rate": 0.00090549505348681, + "loss": 0.91190875, + "num_input_tokens_seen": 96402544, + "router_z_loss_mlp": 0.33154297, + "step": 1159, + "time_per_iteration": 2.561887264251709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083603, + "balance_loss_mlp": 1.04977143, + "epoch": 0.22316275490573298, + "flos": 752413612032.0, + "grad_norm": 0.05610915875378834, + "language_loss": 0.84254742, + "learning_rate": 0.0009053127038830275, + "loss": 0.85338354, + "num_input_tokens_seen": 96487600, + "router_z_loss_mlp": 0.33862305, + "step": 1160, + "time_per_iteration": 2.9465925693511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082896, + "balance_loss_mlp": 1.04844403, + "epoch": 0.22335513659099654, + "flos": 514553057280.0, + "grad_norm": 0.06657410760601727, + "language_loss": 0.87182009, + "learning_rate": 0.000905130196922898, + "loss": 0.88264906, + "num_input_tokens_seen": 96554912, + "router_z_loss_mlp": 0.3449707, + "step": 1161, + "time_per_iteration": 2.597325325012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076013, + "balance_loss_mlp": 1.04213357, + "epoch": 0.2235475182762601, + "flos": 484286501376.0, + "grad_norm": 0.057467913173926514, + "language_loss": 0.87228084, + "learning_rate": 0.0009049475326772769, + "loss": 0.88304096, + "num_input_tokens_seen": 96624192, + "router_z_loss_mlp": 0.33911133, + "step": 1162, + "time_per_iteration": 2.591592788696289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107232, + "balance_loss_mlp": 1.0379163, + "epoch": 0.22373989996152366, + "flos": 469698794496.0, + "grad_norm": 0.05481831884816676, + "language_loss": 0.83362567, + "learning_rate": 0.0009047647112170811, + "loss": 0.84434885, + "num_input_tokens_seen": 96701040, + "router_z_loss_mlp": 0.34448242, + "step": 1163, + "time_per_iteration": 2.7466936111450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080512, + "balance_loss_mlp": 1.04503536, + "epoch": 0.22393228164678722, + "flos": 1270512594432.0, + "grad_norm": 0.0775991801606853, + "language_loss": 0.87402856, + "learning_rate": 0.0009045817326132876, + "loss": 0.88483369, + "num_input_tokens_seen": 96791200, + "router_z_loss_mlp": 0.35498047, + "step": 1164, + "time_per_iteration": 3.6615524291992188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082563, + "balance_loss_mlp": 1.04665732, + "epoch": 0.22412466333205078, + "flos": 596052449280.0, + "grad_norm": 0.05603114612800397, + "language_loss": 0.83484542, + "learning_rate": 0.0009043985969369357, + "loss": 0.84567106, + "num_input_tokens_seen": 96869360, + "router_z_loss_mlp": 0.35913086, + "step": 1165, + "time_per_iteration": 2.800389528274536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084793, + "balance_loss_mlp": 1.047647, + "epoch": 0.22431704501731436, + "flos": 608136062976.0, + "grad_norm": 0.052919924442321326, + "language_loss": 0.84423298, + "learning_rate": 0.0009042153042591245, + "loss": 0.85508084, + "num_input_tokens_seen": 96945840, + "router_z_loss_mlp": 0.37158203, + "step": 1166, + "time_per_iteration": 2.7848384380340576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080872, + "balance_loss_mlp": 1.04622972, + "epoch": 0.22450942670257792, + "flos": 906203842560.0, + "grad_norm": 0.054053491984114646, + "language_loss": 0.85318398, + "learning_rate": 0.0009040318546510146, + "loss": 0.86399269, + "num_input_tokens_seen": 97029296, + "router_z_loss_mlp": 0.34667969, + "step": 1167, + "time_per_iteration": 3.1406538486480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080871, + "balance_loss_mlp": 1.04529881, + "epoch": 0.22470180838784148, + "flos": 565032222720.0, + "grad_norm": 0.06590224184570584, + "language_loss": 0.85490131, + "learning_rate": 0.0009038482481838275, + "loss": 0.86571002, + "num_input_tokens_seen": 97097776, + "router_z_loss_mlp": 0.35620117, + "step": 1168, + "time_per_iteration": 2.6623363494873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107832, + "balance_loss_mlp": 1.04265296, + "epoch": 0.22489419007310504, + "flos": 834109443072.0, + "grad_norm": 0.05295244004415107, + "language_loss": 0.87364161, + "learning_rate": 0.0009036644849288455, + "loss": 0.88442481, + "num_input_tokens_seen": 97181424, + "router_z_loss_mlp": 0.35668945, + "step": 1169, + "time_per_iteration": 3.096397638320923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082872, + "balance_loss_mlp": 1.04567838, + "epoch": 0.2250865717583686, + "flos": 580788237312.0, + "grad_norm": 0.06189616009675637, + "language_loss": 0.85257494, + "learning_rate": 0.0009034805649574118, + "loss": 0.86340362, + "num_input_tokens_seen": 97252128, + "router_z_loss_mlp": 0.37207031, + "step": 1170, + "time_per_iteration": 2.655629873275757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081931, + "balance_loss_mlp": 1.04669285, + "epoch": 0.22527895344363216, + "flos": 600091435008.0, + "grad_norm": 0.0574349936504533, + "language_loss": 0.85081124, + "learning_rate": 0.0009032964883409308, + "loss": 0.86163056, + "num_input_tokens_seen": 97326640, + "router_z_loss_mlp": 0.35253906, + "step": 1171, + "time_per_iteration": 2.9228479862213135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096453, + "balance_loss_mlp": 1.08109915, + "epoch": 0.22547133512889572, + "flos": 1440009073152.0, + "grad_norm": 0.03435009764288223, + "language_loss": 0.73050535, + "learning_rate": 0.000903112255150867, + "loss": 0.74146986, + "num_input_tokens_seen": 97553952, + "router_z_loss_mlp": 0.15332031, + "step": 1172, + "time_per_iteration": 4.9870195388793945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099751, + "balance_loss_mlp": 1.06365418, + "epoch": 0.22566371681415928, + "flos": 490377065472.0, + "grad_norm": 0.06750207251725504, + "language_loss": 0.87597418, + "learning_rate": 0.0009029278654587462, + "loss": 0.88697171, + "num_input_tokens_seen": 97623584, + "router_z_loss_mlp": 0.36108398, + "step": 1173, + "time_per_iteration": 2.545078754425049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098309, + "balance_loss_mlp": 1.06156898, + "epoch": 0.22585609849942284, + "flos": 604334214144.0, + "grad_norm": 0.06244795934891309, + "language_loss": 0.82517409, + "learning_rate": 0.0009027433193361548, + "loss": 0.8361572, + "num_input_tokens_seen": 97695952, + "router_z_loss_mlp": 0.36767578, + "step": 1174, + "time_per_iteration": 2.69753098487854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100346, + "balance_loss_mlp": 1.06305695, + "epoch": 0.22604848018468643, + "flos": 635280247296.0, + "grad_norm": 0.06854123529633785, + "language_loss": 0.87138826, + "learning_rate": 0.00090255861685474, + "loss": 0.88239175, + "num_input_tokens_seen": 97764544, + "router_z_loss_mlp": 0.37280273, + "step": 1175, + "time_per_iteration": 2.7199149131774902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094451, + "balance_loss_mlp": 1.05744886, + "epoch": 0.22624086186995, + "flos": 479633467392.0, + "grad_norm": 0.06836538183258173, + "language_loss": 0.91474092, + "learning_rate": 0.0009023737580862095, + "loss": 0.92568541, + "num_input_tokens_seen": 97830976, + "router_z_loss_mlp": 0.36962891, + "step": 1176, + "time_per_iteration": 2.51255464553833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092025, + "balance_loss_mlp": 1.0570724, + "epoch": 0.22643324355521355, + "flos": 495566982144.0, + "grad_norm": 0.05906016973995859, + "language_loss": 0.83066601, + "learning_rate": 0.0009021887431023321, + "loss": 0.84158623, + "num_input_tokens_seen": 97898800, + "router_z_loss_mlp": 0.34985352, + "step": 1177, + "time_per_iteration": 2.5783960819244385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084144, + "balance_loss_mlp": 1.04842877, + "epoch": 0.2266256252404771, + "flos": 561271071744.0, + "grad_norm": 0.05542928649781209, + "language_loss": 0.87720597, + "learning_rate": 0.0009020035719749369, + "loss": 0.8880474, + "num_input_tokens_seen": 97974112, + "router_z_loss_mlp": 0.35742188, + "step": 1178, + "time_per_iteration": 2.7076900005340576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088743, + "balance_loss_mlp": 1.05259871, + "epoch": 0.22681800692574067, + "flos": 579353527296.0, + "grad_norm": 0.05892405405909356, + "language_loss": 0.77506709, + "learning_rate": 0.0009018182447759136, + "loss": 0.78595448, + "num_input_tokens_seen": 98056640, + "router_z_loss_mlp": 0.36157227, + "step": 1179, + "time_per_iteration": 2.974362373352051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083275, + "balance_loss_mlp": 1.04798961, + "epoch": 0.22701038861100423, + "flos": 739842577920.0, + "grad_norm": 0.0555118465290956, + "language_loss": 0.80168724, + "learning_rate": 0.0009016327615772126, + "loss": 0.81252003, + "num_input_tokens_seen": 98135952, + "router_z_loss_mlp": 0.35327148, + "step": 1180, + "time_per_iteration": 2.9207658767700195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082522, + "balance_loss_mlp": 1.04776096, + "epoch": 0.2272027702962678, + "flos": 576996562944.0, + "grad_norm": 0.06857059729731818, + "language_loss": 0.88146389, + "learning_rate": 0.0009014471224508451, + "loss": 0.8922891, + "num_input_tokens_seen": 98204288, + "router_z_loss_mlp": 0.34790039, + "step": 1181, + "time_per_iteration": 2.6884429454803467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080826, + "balance_loss_mlp": 1.0466603, + "epoch": 0.22739515198153135, + "flos": 544012098048.0, + "grad_norm": 0.07386093909180869, + "language_loss": 0.83020878, + "learning_rate": 0.0009012613274688823, + "loss": 0.84101701, + "num_input_tokens_seen": 98269856, + "router_z_loss_mlp": 0.34204102, + "step": 1182, + "time_per_iteration": 2.625973701477051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082925, + "balance_loss_mlp": 1.04735291, + "epoch": 0.22758753366679493, + "flos": 439932805632.0, + "grad_norm": 0.06621157637783351, + "language_loss": 0.87839937, + "learning_rate": 0.0009010753767034565, + "loss": 0.88922858, + "num_input_tokens_seen": 98335632, + "router_z_loss_mlp": 0.35571289, + "step": 1183, + "time_per_iteration": 2.545569658279419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086712, + "balance_loss_mlp": 1.05030489, + "epoch": 0.2277799153520585, + "flos": 729104772096.0, + "grad_norm": 0.07242159959797279, + "language_loss": 0.79501748, + "learning_rate": 0.0009008892702267599, + "loss": 0.80588454, + "num_input_tokens_seen": 98420592, + "router_z_loss_mlp": 0.36425781, + "step": 1184, + "time_per_iteration": 2.9862120151519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089174, + "balance_loss_mlp": 1.05322075, + "epoch": 0.22797229703732205, + "flos": 526641053184.0, + "grad_norm": 0.0740207336504876, + "language_loss": 0.89059424, + "learning_rate": 0.0009007030081110457, + "loss": 0.90148592, + "num_input_tokens_seen": 98488096, + "router_z_loss_mlp": 0.35961914, + "step": 1185, + "time_per_iteration": 2.6184284687042236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083236, + "balance_loss_mlp": 1.04783034, + "epoch": 0.2281646787225856, + "flos": 535159954944.0, + "grad_norm": 0.06479663876551665, + "language_loss": 0.84969211, + "learning_rate": 0.000900516590428627, + "loss": 0.86052454, + "num_input_tokens_seen": 98561664, + "router_z_loss_mlp": 0.35449219, + "step": 1186, + "time_per_iteration": 2.724161386489868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083897, + "balance_loss_mlp": 1.049088, + "epoch": 0.22835706040784917, + "flos": 541107924480.0, + "grad_norm": 0.052728830082858405, + "language_loss": 0.89177948, + "learning_rate": 0.0009003300172518778, + "loss": 0.90261841, + "num_input_tokens_seen": 98634336, + "router_z_loss_mlp": 0.34790039, + "step": 1187, + "time_per_iteration": 2.6810121536254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108576, + "balance_loss_mlp": 1.05018783, + "epoch": 0.22854944209311273, + "flos": 790297012224.0, + "grad_norm": 0.05376177869775473, + "language_loss": 0.84676045, + "learning_rate": 0.0009001432886532321, + "loss": 0.85761803, + "num_input_tokens_seen": 98709600, + "router_z_loss_mlp": 0.35571289, + "step": 1188, + "time_per_iteration": 2.977433919906616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109735, + "balance_loss_mlp": 1.0614686, + "epoch": 0.2287418237783763, + "flos": 469047020544.0, + "grad_norm": 0.06589135500726684, + "language_loss": 0.86752445, + "learning_rate": 0.0008999564047051843, + "loss": 0.87849802, + "num_input_tokens_seen": 98775024, + "router_z_loss_mlp": 0.35913086, + "step": 1189, + "time_per_iteration": 2.5126237869262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094774, + "balance_loss_mlp": 1.06017935, + "epoch": 0.22893420546363985, + "flos": 467786990592.0, + "grad_norm": 0.061551577223012334, + "language_loss": 0.84713042, + "learning_rate": 0.0008997693654802894, + "loss": 0.85807812, + "num_input_tokens_seen": 98845248, + "router_z_loss_mlp": 0.34643555, + "step": 1190, + "time_per_iteration": 2.6570322513580322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088276, + "balance_loss_mlp": 1.05318046, + "epoch": 0.22912658714890344, + "flos": 625974179328.0, + "grad_norm": 0.05326512300588333, + "language_loss": 0.86549705, + "learning_rate": 0.0008995821710511625, + "loss": 0.87637979, + "num_input_tokens_seen": 98913584, + "router_z_loss_mlp": 0.35107422, + "step": 1191, + "time_per_iteration": 2.8115806579589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108036, + "balance_loss_mlp": 1.04731488, + "epoch": 0.229318968834167, + "flos": 502785156096.0, + "grad_norm": 0.06330680163661413, + "language_loss": 0.8511278, + "learning_rate": 0.0008993948214904786, + "loss": 0.86193144, + "num_input_tokens_seen": 98978608, + "router_z_loss_mlp": 0.33056641, + "step": 1192, + "time_per_iteration": 2.546410083770752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125589, + "balance_loss_mlp": 1.11023474, + "epoch": 0.22951135051943056, + "flos": 1374108544512.0, + "grad_norm": 0.06153086464986019, + "language_loss": 0.78422213, + "learning_rate": 0.0008992073168709733, + "loss": 0.79547799, + "num_input_tokens_seen": 99207424, + "router_z_loss_mlp": 0.15332031, + "step": 1193, + "time_per_iteration": 4.891269207000732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099997, + "balance_loss_mlp": 1.06306624, + "epoch": 0.22970373220469412, + "flos": 644045050368.0, + "grad_norm": 0.06658536787009234, + "language_loss": 0.79028845, + "learning_rate": 0.0008990196572654427, + "loss": 0.80128849, + "num_input_tokens_seen": 99290592, + "router_z_loss_mlp": 0.36914062, + "step": 1194, + "time_per_iteration": 2.8504366874694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100467, + "balance_loss_mlp": 1.06582475, + "epoch": 0.22989611388995768, + "flos": 499945001472.0, + "grad_norm": 0.048025217626556156, + "language_loss": 0.87748766, + "learning_rate": 0.0008988318427467426, + "loss": 0.88849235, + "num_input_tokens_seen": 99366096, + "router_z_loss_mlp": 0.34667969, + "step": 1195, + "time_per_iteration": 2.735084056854248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099859, + "balance_loss_mlp": 1.06524014, + "epoch": 0.23008849557522124, + "flos": 1096071796224.0, + "grad_norm": 0.06731751876810108, + "language_loss": 0.86263168, + "learning_rate": 0.0008986438733877887, + "loss": 0.87363023, + "num_input_tokens_seen": 99456768, + "router_z_loss_mlp": 0.34667969, + "step": 1196, + "time_per_iteration": 3.435035228729248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100924, + "balance_loss_mlp": 1.06606746, + "epoch": 0.2302808772604848, + "flos": 683313546240.0, + "grad_norm": 0.04733445604251135, + "language_loss": 0.84099567, + "learning_rate": 0.0008984557492615576, + "loss": 0.85200489, + "num_input_tokens_seen": 99539616, + "router_z_loss_mlp": 0.34887695, + "step": 1197, + "time_per_iteration": 2.927668809890747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107534, + "balance_loss_mlp": 1.07317793, + "epoch": 0.23047325894574835, + "flos": 528664928256.0, + "grad_norm": 0.0630370564804667, + "language_loss": 0.89949608, + "learning_rate": 0.0008982674704410854, + "loss": 0.91057146, + "num_input_tokens_seen": 99612064, + "router_z_loss_mlp": 0.34399414, + "step": 1198, + "time_per_iteration": 2.691016435623169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107479, + "balance_loss_mlp": 1.07228875, + "epoch": 0.23066564063101191, + "flos": 682427455488.0, + "grad_norm": 0.06209648084563375, + "language_loss": 0.77829844, + "learning_rate": 0.0008980790369994682, + "loss": 0.78937328, + "num_input_tokens_seen": 99691040, + "router_z_loss_mlp": 0.35205078, + "step": 1199, + "time_per_iteration": 2.9320883750915527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100492, + "balance_loss_mlp": 1.06525421, + "epoch": 0.2308580223162755, + "flos": 558247624704.0, + "grad_norm": 0.09180159748966574, + "language_loss": 0.87396461, + "learning_rate": 0.000897890449009863, + "loss": 0.88496947, + "num_input_tokens_seen": 99762016, + "router_z_loss_mlp": 0.3527832, + "step": 1200, + "time_per_iteration": 2.6804869174957275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096306, + "balance_loss_mlp": 1.06183052, + "epoch": 0.23105040400153906, + "flos": 555406060032.0, + "grad_norm": 0.05982856494430897, + "language_loss": 0.90313268, + "learning_rate": 0.0008977017065454853, + "loss": 0.9140957, + "num_input_tokens_seen": 99835552, + "router_z_loss_mlp": 0.3449707, + "step": 1201, + "time_per_iteration": 2.639636754989624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090282, + "balance_loss_mlp": 1.05556786, + "epoch": 0.23124278568680262, + "flos": 704474855424.0, + "grad_norm": 0.06077351963181601, + "language_loss": 0.80804175, + "learning_rate": 0.0008975128096796121, + "loss": 0.81894457, + "num_input_tokens_seen": 99910784, + "router_z_loss_mlp": 0.34765625, + "step": 1202, + "time_per_iteration": 2.8410260677337646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078695, + "balance_loss_mlp": 1.04431474, + "epoch": 0.23143516737206618, + "flos": 612469002240.0, + "grad_norm": 0.07413481943536562, + "language_loss": 0.85940087, + "learning_rate": 0.0008973237584855794, + "loss": 0.87018776, + "num_input_tokens_seen": 99991120, + "router_z_loss_mlp": 0.34423828, + "step": 1203, + "time_per_iteration": 2.898423671722412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077022, + "balance_loss_mlp": 1.04233205, + "epoch": 0.23162754905732974, + "flos": 389030238720.0, + "grad_norm": 0.06038618944932519, + "language_loss": 0.8201915, + "learning_rate": 0.0008971345530367832, + "loss": 0.83096182, + "num_input_tokens_seen": 100053888, + "router_z_loss_mlp": 0.34716797, + "step": 1204, + "time_per_iteration": 2.486668586730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082339, + "balance_loss_mlp": 1.04738712, + "epoch": 0.2318199307425933, + "flos": 667481928192.0, + "grad_norm": 0.05427081728260985, + "language_loss": 0.85029405, + "learning_rate": 0.0008969451934066799, + "loss": 0.86111748, + "num_input_tokens_seen": 100124176, + "router_z_loss_mlp": 0.34960938, + "step": 1205, + "time_per_iteration": 2.771306276321411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091653, + "balance_loss_mlp": 1.05662966, + "epoch": 0.23201231242785686, + "flos": 666093860352.0, + "grad_norm": 0.0707913589572404, + "language_loss": 0.80143172, + "learning_rate": 0.0008967556796687854, + "loss": 0.81234825, + "num_input_tokens_seen": 100205296, + "router_z_loss_mlp": 0.35058594, + "step": 1206, + "time_per_iteration": 2.8904309272766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087698, + "balance_loss_mlp": 1.05353224, + "epoch": 0.23220469411312042, + "flos": 748498281984.0, + "grad_norm": 0.05559113870944949, + "language_loss": 0.83954245, + "learning_rate": 0.0008965660118966752, + "loss": 0.8504194, + "num_input_tokens_seen": 100279440, + "router_z_loss_mlp": 0.34204102, + "step": 1207, + "time_per_iteration": 2.9140615463256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108846, + "balance_loss_mlp": 1.05529559, + "epoch": 0.232397075798384, + "flos": 666763163136.0, + "grad_norm": 0.04975334384076733, + "language_loss": 0.90441763, + "learning_rate": 0.0008963761901639851, + "loss": 0.91530222, + "num_input_tokens_seen": 100354512, + "router_z_loss_mlp": 0.33154297, + "step": 1208, + "time_per_iteration": 2.8032286167144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094486, + "balance_loss_mlp": 1.06008244, + "epoch": 0.23258945748364757, + "flos": 609937357824.0, + "grad_norm": 0.05840728669351643, + "language_loss": 0.83201033, + "learning_rate": 0.0008961862145444103, + "loss": 0.84295517, + "num_input_tokens_seen": 100426848, + "router_z_loss_mlp": 0.34399414, + "step": 1209, + "time_per_iteration": 2.7161943912506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094305, + "balance_loss_mlp": 1.05954397, + "epoch": 0.23278183916891113, + "flos": 489397842432.0, + "grad_norm": 0.06743904317466738, + "language_loss": 0.85216832, + "learning_rate": 0.0008959960851117059, + "loss": 0.86311138, + "num_input_tokens_seen": 100496176, + "router_z_loss_mlp": 0.34790039, + "step": 1210, + "time_per_iteration": 2.5817031860351562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095047, + "balance_loss_mlp": 1.06014228, + "epoch": 0.23297422085417469, + "flos": 511314232320.0, + "grad_norm": 0.057575168534165826, + "language_loss": 0.84338427, + "learning_rate": 0.0008958058019396868, + "loss": 0.85433477, + "num_input_tokens_seen": 100575072, + "router_z_loss_mlp": 0.34936523, + "step": 1211, + "time_per_iteration": 2.7788164615631104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091213, + "balance_loss_mlp": 1.05730987, + "epoch": 0.23316660253943824, + "flos": 546145072128.0, + "grad_norm": 0.057082370400879795, + "language_loss": 0.86897939, + "learning_rate": 0.0008956153651022274, + "loss": 0.87989151, + "num_input_tokens_seen": 100648304, + "router_z_loss_mlp": 0.33935547, + "step": 1212, + "time_per_iteration": 2.7309062480926514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090204, + "balance_loss_mlp": 1.05608642, + "epoch": 0.2333589842247018, + "flos": 509998947840.0, + "grad_norm": 0.06317396982696966, + "language_loss": 0.84641176, + "learning_rate": 0.0008954247746732618, + "loss": 0.85731381, + "num_input_tokens_seen": 100717616, + "router_z_loss_mlp": 0.34155273, + "step": 1213, + "time_per_iteration": 2.619058609008789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084199, + "balance_loss_mlp": 1.05072522, + "epoch": 0.23355136590996536, + "flos": 662834686464.0, + "grad_norm": 0.09780220222501788, + "language_loss": 0.90954423, + "learning_rate": 0.0008952340307267837, + "loss": 0.9203862, + "num_input_tokens_seen": 100797056, + "router_z_loss_mlp": 0.33496094, + "step": 1214, + "time_per_iteration": 2.869351387023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108593, + "balance_loss_mlp": 1.05128753, + "epoch": 0.23374374759522892, + "flos": 508206417408.0, + "grad_norm": 0.061496426320555984, + "language_loss": 0.83373952, + "learning_rate": 0.0008950431333368468, + "loss": 0.84459883, + "num_input_tokens_seen": 100863632, + "router_z_loss_mlp": 0.34667969, + "step": 1215, + "time_per_iteration": 2.5557806491851807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093645, + "balance_loss_mlp": 1.05928898, + "epoch": 0.2339361292804925, + "flos": 1293964028928.0, + "grad_norm": 0.062331860667319446, + "language_loss": 0.84730738, + "learning_rate": 0.0008948520825775634, + "loss": 0.85824382, + "num_input_tokens_seen": 100950272, + "router_z_loss_mlp": 0.34399414, + "step": 1216, + "time_per_iteration": 3.6050164699554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098293, + "balance_loss_mlp": 1.06343639, + "epoch": 0.23412851096575607, + "flos": 705617021952.0, + "grad_norm": 0.06500023378725601, + "language_loss": 0.84162283, + "learning_rate": 0.0008946608785231067, + "loss": 0.8526057, + "num_input_tokens_seen": 101031008, + "router_z_loss_mlp": 0.34863281, + "step": 1217, + "time_per_iteration": 2.858696699142456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109995, + "balance_loss_mlp": 1.065045, + "epoch": 0.23432089265101963, + "flos": 438036968448.0, + "grad_norm": 0.06356573317347913, + "language_loss": 0.84325957, + "learning_rate": 0.0008944695212477084, + "loss": 0.85425907, + "num_input_tokens_seen": 101094688, + "router_z_loss_mlp": 0.34912109, + "step": 1218, + "time_per_iteration": 2.4787168502807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107192, + "balance_loss_mlp": 1.07190585, + "epoch": 0.2345132743362832, + "flos": 480697058304.0, + "grad_norm": 0.05460931090439532, + "language_loss": 0.86098325, + "learning_rate": 0.0008942780108256599, + "loss": 0.87205517, + "num_input_tokens_seen": 101163744, + "router_z_loss_mlp": 0.35327148, + "step": 1219, + "time_per_iteration": 2.574692726135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105521, + "balance_loss_mlp": 1.06966305, + "epoch": 0.23470565602154675, + "flos": 411231817728.0, + "grad_norm": 0.05853057396081394, + "language_loss": 0.86360055, + "learning_rate": 0.0008940863473313121, + "loss": 0.87465572, + "num_input_tokens_seen": 101226480, + "router_z_loss_mlp": 0.35839844, + "step": 1220, + "time_per_iteration": 2.4849462509155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115361, + "balance_loss_mlp": 1.08024168, + "epoch": 0.2348980377068103, + "flos": 545189170176.0, + "grad_norm": 0.0745659618807548, + "language_loss": 0.87691534, + "learning_rate": 0.0008938945308390756, + "loss": 0.88806891, + "num_input_tokens_seen": 101291824, + "router_z_loss_mlp": 0.3515625, + "step": 1221, + "time_per_iteration": 2.6100285053253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107677, + "balance_loss_mlp": 1.07248664, + "epoch": 0.23509041939207387, + "flos": 575465900544.0, + "grad_norm": 0.055913245264753976, + "language_loss": 0.87316763, + "learning_rate": 0.00089370256142342, + "loss": 0.88424438, + "num_input_tokens_seen": 101367216, + "router_z_loss_mlp": 0.35205078, + "step": 1222, + "time_per_iteration": 2.726897716522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109541, + "balance_loss_mlp": 1.06007659, + "epoch": 0.23528280107733743, + "flos": 588568025088.0, + "grad_norm": 0.04976165943815558, + "language_loss": 0.85095507, + "learning_rate": 0.0008935104391588746, + "loss": 0.86190915, + "num_input_tokens_seen": 101438992, + "router_z_loss_mlp": 0.35351562, + "step": 1223, + "time_per_iteration": 2.7249879837036133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108966, + "balance_loss_mlp": 1.07313156, + "epoch": 0.235475182762601, + "flos": 822948235776.0, + "grad_norm": 0.05651852634602403, + "language_loss": 0.82749176, + "learning_rate": 0.0008933181641200276, + "loss": 0.83858138, + "num_input_tokens_seen": 101534464, + "router_z_loss_mlp": 0.35839844, + "step": 1224, + "time_per_iteration": 3.1651737689971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094197, + "balance_loss_mlp": 1.06017447, + "epoch": 0.23566756444786457, + "flos": 679865287680.0, + "grad_norm": 0.06356150049585653, + "language_loss": 0.8609674, + "learning_rate": 0.0008931257363815271, + "loss": 0.87190938, + "num_input_tokens_seen": 101616496, + "router_z_loss_mlp": 0.34033203, + "step": 1225, + "time_per_iteration": 2.891789674758911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091395, + "balance_loss_mlp": 1.05711007, + "epoch": 0.23585994613312813, + "flos": 701481931776.0, + "grad_norm": 0.04853721262867189, + "language_loss": 0.89892405, + "learning_rate": 0.0008929331560180798, + "loss": 0.90983796, + "num_input_tokens_seen": 101694496, + "router_z_loss_mlp": 0.34277344, + "step": 1226, + "time_per_iteration": 2.934101104736328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093973, + "balance_loss_mlp": 1.06002271, + "epoch": 0.2360523278183917, + "flos": 523923144192.0, + "grad_norm": 0.06491881814379113, + "language_loss": 0.91129786, + "learning_rate": 0.0008927404231044525, + "loss": 0.92223763, + "num_input_tokens_seen": 101766160, + "router_z_loss_mlp": 0.33984375, + "step": 1227, + "time_per_iteration": 2.682377815246582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083665, + "balance_loss_mlp": 1.0493325, + "epoch": 0.23624470950365525, + "flos": 524027860992.0, + "grad_norm": 0.053423388326064705, + "language_loss": 0.81944436, + "learning_rate": 0.0008925475377154703, + "loss": 0.83028102, + "num_input_tokens_seen": 101844160, + "router_z_loss_mlp": 0.34375, + "step": 1228, + "time_per_iteration": 2.7511117458343506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077291, + "balance_loss_mlp": 1.04169512, + "epoch": 0.2364370911889188, + "flos": 596525313024.0, + "grad_norm": 0.05836717970983508, + "language_loss": 0.8241868, + "learning_rate": 0.0008923544999260183, + "loss": 0.83495975, + "num_input_tokens_seen": 101917968, + "router_z_loss_mlp": 0.35644531, + "step": 1229, + "time_per_iteration": 2.7915079593658447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087923, + "balance_loss_mlp": 1.05194569, + "epoch": 0.23662947287418237, + "flos": 756519588864.0, + "grad_norm": 0.08156392485297027, + "language_loss": 0.91757852, + "learning_rate": 0.00089216130981104, + "loss": 0.92845774, + "num_input_tokens_seen": 101996880, + "router_z_loss_mlp": 0.35986328, + "step": 1230, + "time_per_iteration": 3.037900924682617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089884, + "balance_loss_mlp": 1.0531199, + "epoch": 0.23682185455944593, + "flos": 545907935232.0, + "grad_norm": 0.05473268619072285, + "language_loss": 0.82659578, + "learning_rate": 0.000891967967445539, + "loss": 0.83749461, + "num_input_tokens_seen": 102067936, + "router_z_loss_mlp": 0.36743164, + "step": 1231, + "time_per_iteration": 2.6595497131347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088497, + "balance_loss_mlp": 1.05201924, + "epoch": 0.2370142362447095, + "flos": 661977709056.0, + "grad_norm": 0.04604146434030928, + "language_loss": 0.88502473, + "learning_rate": 0.0008917744729045772, + "loss": 0.89590967, + "num_input_tokens_seen": 102147552, + "router_z_loss_mlp": 0.36499023, + "step": 1232, + "time_per_iteration": 2.851651668548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095833, + "balance_loss_mlp": 1.05868709, + "epoch": 0.23720661792997308, + "flos": 683361598464.0, + "grad_norm": 0.06835104069372223, + "language_loss": 0.84165156, + "learning_rate": 0.0008915808262632757, + "loss": 0.85260987, + "num_input_tokens_seen": 102224480, + "router_z_loss_mlp": 0.37133789, + "step": 1233, + "time_per_iteration": 2.8114235401153564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095976, + "balance_loss_mlp": 1.05892599, + "epoch": 0.23739899961523664, + "flos": 558631738368.0, + "grad_norm": 0.055258261409357204, + "language_loss": 0.92769438, + "learning_rate": 0.0008913870275968148, + "loss": 0.93865418, + "num_input_tokens_seen": 102297392, + "router_z_loss_mlp": 0.37036133, + "step": 1234, + "time_per_iteration": 2.705349922180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092916, + "balance_loss_mlp": 1.05629516, + "epoch": 0.2375913813005002, + "flos": 889144128000.0, + "grad_norm": 0.12876854850300654, + "language_loss": 0.87540263, + "learning_rate": 0.0008911930769804342, + "loss": 0.8863318, + "num_input_tokens_seen": 102386032, + "router_z_loss_mlp": 0.36621094, + "step": 1235, + "time_per_iteration": 3.2342941761016846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091192, + "balance_loss_mlp": 1.05492854, + "epoch": 0.23778376298576376, + "flos": 640810607616.0, + "grad_norm": 0.044375832072417805, + "language_loss": 0.91481459, + "learning_rate": 0.0008909989744894318, + "loss": 0.92572653, + "num_input_tokens_seen": 102463504, + "router_z_loss_mlp": 0.36303711, + "step": 1236, + "time_per_iteration": 2.8858232498168945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089597, + "balance_loss_mlp": 1.05333364, + "epoch": 0.23797614467102732, + "flos": 616540073472.0, + "grad_norm": 0.05892762197337364, + "language_loss": 0.81707233, + "learning_rate": 0.0008908047201991649, + "loss": 0.82796836, + "num_input_tokens_seen": 102529632, + "router_z_loss_mlp": 0.36279297, + "step": 1237, + "time_per_iteration": 2.785226583480835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085624, + "balance_loss_mlp": 1.05071974, + "epoch": 0.23816852635629088, + "flos": 623941539840.0, + "grad_norm": 0.051487502947417364, + "language_loss": 0.86561942, + "learning_rate": 0.0008906103141850502, + "loss": 0.87647569, + "num_input_tokens_seen": 102610192, + "router_z_loss_mlp": 0.34960938, + "step": 1238, + "time_per_iteration": 2.868241310119629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095513, + "balance_loss_mlp": 1.05901158, + "epoch": 0.23836090804155444, + "flos": 521180504064.0, + "grad_norm": 0.07170300234131513, + "language_loss": 0.88119614, + "learning_rate": 0.0008904157565225621, + "loss": 0.89215136, + "num_input_tokens_seen": 102681216, + "router_z_loss_mlp": 0.36499023, + "step": 1239, + "time_per_iteration": 2.610048294067383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092952, + "balance_loss_mlp": 1.05716562, + "epoch": 0.238553289726818, + "flos": 1153527616512.0, + "grad_norm": 0.07764557472008667, + "language_loss": 0.82042629, + "learning_rate": 0.000890221047287235, + "loss": 0.83135581, + "num_input_tokens_seen": 102777184, + "router_z_loss_mlp": 0.3581543, + "step": 1240, + "time_per_iteration": 3.547147274017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102634, + "balance_loss_mlp": 1.06772995, + "epoch": 0.23874567141208156, + "flos": 499600175616.0, + "grad_norm": 0.07123563443936186, + "language_loss": 0.91052604, + "learning_rate": 0.0008900261865546615, + "loss": 0.92155242, + "num_input_tokens_seen": 102845744, + "router_z_loss_mlp": 0.34936523, + "step": 1241, + "time_per_iteration": 2.6277406215667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110411, + "balance_loss_mlp": 1.06768012, + "epoch": 0.23893805309734514, + "flos": 556657325568.0, + "grad_norm": 0.08027126565183675, + "language_loss": 0.84991688, + "learning_rate": 0.0008898311744004936, + "loss": 0.86095798, + "num_input_tokens_seen": 102918064, + "router_z_loss_mlp": 0.36425781, + "step": 1242, + "time_per_iteration": 2.687009811401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112655, + "balance_loss_mlp": 1.07708287, + "epoch": 0.2391304347826087, + "flos": 549009957888.0, + "grad_norm": 0.05686617926086787, + "language_loss": 0.86918116, + "learning_rate": 0.0008896360109004414, + "loss": 0.88030773, + "num_input_tokens_seen": 102983920, + "router_z_loss_mlp": 0.35595703, + "step": 1243, + "time_per_iteration": 2.6292564868927 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111871, + "balance_loss_mlp": 1.07629931, + "epoch": 0.23932281646787226, + "flos": 515794148352.0, + "grad_norm": 0.05075175877282041, + "language_loss": 0.84481502, + "learning_rate": 0.0008894406961302742, + "loss": 0.85593379, + "num_input_tokens_seen": 103053328, + "router_z_loss_mlp": 0.35595703, + "step": 1244, + "time_per_iteration": 2.5960640907287598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122594, + "balance_loss_mlp": 1.08737969, + "epoch": 0.23951519815313582, + "flos": 743353445376.0, + "grad_norm": 0.06488001286924965, + "language_loss": 0.84053004, + "learning_rate": 0.0008892452301658201, + "loss": 0.85175598, + "num_input_tokens_seen": 103128208, + "router_z_loss_mlp": 0.35253906, + "step": 1245, + "time_per_iteration": 2.9320998191833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123528, + "balance_loss_mlp": 1.08728814, + "epoch": 0.23970757983839938, + "flos": 553855048704.0, + "grad_norm": 0.05553543969160018, + "language_loss": 0.83631629, + "learning_rate": 0.0008890496130829653, + "loss": 0.84755158, + "num_input_tokens_seen": 103197392, + "router_z_loss_mlp": 0.36230469, + "step": 1246, + "time_per_iteration": 2.6420071125030518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123934, + "balance_loss_mlp": 1.08802795, + "epoch": 0.23989996152366294, + "flos": 480416251392.0, + "grad_norm": 0.0595921721906752, + "language_loss": 0.85551775, + "learning_rate": 0.0008888538449576555, + "loss": 0.86675715, + "num_input_tokens_seen": 103265328, + "router_z_loss_mlp": 0.359375, + "step": 1247, + "time_per_iteration": 2.544706344604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123543, + "balance_loss_mlp": 1.08687472, + "epoch": 0.2400923432089265, + "flos": 485069285376.0, + "grad_norm": 0.06973867138143126, + "language_loss": 0.82958472, + "learning_rate": 0.0008886579258658944, + "loss": 0.84082007, + "num_input_tokens_seen": 103331632, + "router_z_loss_mlp": 0.36669922, + "step": 1248, + "time_per_iteration": 2.5460424423217773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108744, + "balance_loss_mlp": 1.0724808, + "epoch": 0.24028472489419006, + "flos": 623247505920.0, + "grad_norm": 0.04817062293972818, + "language_loss": 0.85353303, + "learning_rate": 0.0008884618558837446, + "loss": 0.86462045, + "num_input_tokens_seen": 103405408, + "router_z_loss_mlp": 0.36279297, + "step": 1249, + "time_per_iteration": 2.80222487449646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125107, + "balance_loss_mlp": 1.08765173, + "epoch": 0.24047710657945365, + "flos": 601302002688.0, + "grad_norm": 0.052194699096834656, + "language_loss": 0.86387813, + "learning_rate": 0.0008882656350873273, + "loss": 0.87512922, + "num_input_tokens_seen": 103487216, + "router_z_loss_mlp": 0.37426758, + "step": 1250, + "time_per_iteration": 2.830887794494629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136265, + "balance_loss_mlp": 1.09911942, + "epoch": 0.2406694882647172, + "flos": 841199579136.0, + "grad_norm": 0.07156482775024936, + "language_loss": 0.86951184, + "learning_rate": 0.0008880692635528219, + "loss": 0.88087451, + "num_input_tokens_seen": 103568640, + "router_z_loss_mlp": 0.37109375, + "step": 1251, + "time_per_iteration": 3.0439021587371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140454, + "balance_loss_mlp": 1.10450029, + "epoch": 0.24086186994998077, + "flos": 526789440000.0, + "grad_norm": 0.062254670736574515, + "language_loss": 0.89187038, + "learning_rate": 0.0008878727413564669, + "loss": 0.90327489, + "num_input_tokens_seen": 103640784, + "router_z_loss_mlp": 0.36010742, + "step": 1252, + "time_per_iteration": 2.7363240718841553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094248, + "balance_loss_mlp": 1.07717752, + "epoch": 0.24105425163524433, + "flos": 1337464673280.0, + "grad_norm": 0.032126183170312766, + "language_loss": 0.80135596, + "learning_rate": 0.0008876760685745588, + "loss": 0.81229842, + "num_input_tokens_seen": 103865824, + "router_z_loss_mlp": 0.17089844, + "step": 1253, + "time_per_iteration": 4.8370680809021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175937, + "balance_loss_mlp": 1.13755202, + "epoch": 0.24124663332050789, + "flos": 613822164480.0, + "grad_norm": 0.06436318886622608, + "language_loss": 0.78452635, + "learning_rate": 0.0008874792452834528, + "loss": 0.79628575, + "num_input_tokens_seen": 103939872, + "router_z_loss_mlp": 0.38354492, + "step": 1254, + "time_per_iteration": 2.724947452545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151033, + "balance_loss_mlp": 1.11381602, + "epoch": 0.24143901500577145, + "flos": 575278225920.0, + "grad_norm": 0.08996846201845816, + "language_loss": 0.87516546, + "learning_rate": 0.0008872822715595626, + "loss": 0.88667583, + "num_input_tokens_seen": 104011120, + "router_z_loss_mlp": 0.37207031, + "step": 1255, + "time_per_iteration": 2.676539659500122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118396, + "balance_loss_mlp": 1.08275259, + "epoch": 0.241631396691035, + "flos": 494941349376.0, + "grad_norm": 0.06475486920780314, + "language_loss": 0.87080252, + "learning_rate": 0.0008870851474793598, + "loss": 0.88198644, + "num_input_tokens_seen": 104077040, + "router_z_loss_mlp": 0.35668945, + "step": 1256, + "time_per_iteration": 2.5523862838745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108714, + "balance_loss_mlp": 1.07287991, + "epoch": 0.24182377837629856, + "flos": 635891323392.0, + "grad_norm": 0.0627898868455093, + "language_loss": 0.89724898, + "learning_rate": 0.0008868878731193752, + "loss": 0.90833616, + "num_input_tokens_seen": 104150880, + "router_z_loss_mlp": 0.35888672, + "step": 1257, + "time_per_iteration": 2.8152451515197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094242, + "balance_loss_mlp": 1.05933762, + "epoch": 0.24201616006156215, + "flos": 514938580992.0, + "grad_norm": 0.06450256361572139, + "language_loss": 0.89708877, + "learning_rate": 0.0008866904485561973, + "loss": 0.90803117, + "num_input_tokens_seen": 104223696, + "router_z_loss_mlp": 0.34936523, + "step": 1258, + "time_per_iteration": 2.7304298877716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095529, + "balance_loss_mlp": 1.05945659, + "epoch": 0.2422085417468257, + "flos": 614837703168.0, + "grad_norm": 0.05809143078881904, + "language_loss": 0.83024096, + "learning_rate": 0.000886492873866473, + "loss": 0.8411963, + "num_input_tokens_seen": 104301728, + "router_z_loss_mlp": 0.36108398, + "step": 1259, + "time_per_iteration": 2.828904628753662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090865, + "balance_loss_mlp": 1.05576968, + "epoch": 0.24240092343208927, + "flos": 585515464704.0, + "grad_norm": 0.07568124142212555, + "language_loss": 0.84760439, + "learning_rate": 0.000886295149126908, + "loss": 0.85851306, + "num_input_tokens_seen": 104374480, + "router_z_loss_mlp": 0.35131836, + "step": 1260, + "time_per_iteration": 2.7011313438415527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109068, + "balance_loss_mlp": 1.05537009, + "epoch": 0.24259330511735283, + "flos": 761930675712.0, + "grad_norm": 0.05459059834864095, + "language_loss": 0.85652769, + "learning_rate": 0.0008860972744142655, + "loss": 0.8674345, + "num_input_tokens_seen": 104452384, + "router_z_loss_mlp": 0.35327148, + "step": 1261, + "time_per_iteration": 2.9039082527160645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084953, + "balance_loss_mlp": 1.05028725, + "epoch": 0.2427856868026164, + "flos": 626566316544.0, + "grad_norm": 0.06267274795834049, + "language_loss": 0.8183161, + "learning_rate": 0.0008858992498053671, + "loss": 0.82916564, + "num_input_tokens_seen": 104532576, + "router_z_loss_mlp": 0.34692383, + "step": 1262, + "time_per_iteration": 2.8293697834014893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080455, + "balance_loss_mlp": 1.06586385, + "epoch": 0.24297806848787995, + "flos": 1510840470528.0, + "grad_norm": 0.02756761643082338, + "language_loss": 0.7658875, + "learning_rate": 0.0008857010753770934, + "loss": 0.77669203, + "num_input_tokens_seen": 104765216, + "router_z_loss_mlp": 0.14550781, + "step": 1263, + "time_per_iteration": 4.8116748332977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087756, + "balance_loss_mlp": 1.05328119, + "epoch": 0.2431704501731435, + "flos": 541669538304.0, + "grad_norm": 0.05501719814044903, + "language_loss": 0.83684969, + "learning_rate": 0.0008855027512063817, + "loss": 0.8477273, + "num_input_tokens_seen": 104836912, + "router_z_loss_mlp": 0.3449707, + "step": 1264, + "time_per_iteration": 2.6995394229888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084053, + "balance_loss_mlp": 1.0493865, + "epoch": 0.24336283185840707, + "flos": 523588492800.0, + "grad_norm": 0.06804757776515359, + "language_loss": 0.85974693, + "learning_rate": 0.0008853042773702292, + "loss": 0.87058747, + "num_input_tokens_seen": 104909280, + "router_z_loss_mlp": 0.34692383, + "step": 1265, + "time_per_iteration": 2.683969497680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086337, + "balance_loss_mlp": 1.05002618, + "epoch": 0.24355521354367063, + "flos": 536839004160.0, + "grad_norm": 0.05444938358074035, + "language_loss": 0.87678754, + "learning_rate": 0.0008851056539456896, + "loss": 0.88765097, + "num_input_tokens_seen": 104982560, + "router_z_loss_mlp": 0.36303711, + "step": 1266, + "time_per_iteration": 2.674891471862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010835, + "balance_loss_mlp": 1.04830909, + "epoch": 0.24374759522893422, + "flos": 930050975232.0, + "grad_norm": 0.04940136823280911, + "language_loss": 0.82195789, + "learning_rate": 0.0008849068810098755, + "loss": 0.83279288, + "num_input_tokens_seen": 105075056, + "router_z_loss_mlp": 0.35229492, + "step": 1267, + "time_per_iteration": 3.27172589302063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083691, + "balance_loss_mlp": 1.04859626, + "epoch": 0.24393997691419778, + "flos": 427564002816.0, + "grad_norm": 0.07591960175092535, + "language_loss": 0.83287823, + "learning_rate": 0.0008847079586399575, + "loss": 0.84371519, + "num_input_tokens_seen": 105137536, + "router_z_loss_mlp": 0.35131836, + "step": 1268, + "time_per_iteration": 2.4539763927459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010876, + "balance_loss_mlp": 1.05281472, + "epoch": 0.24413235859946134, + "flos": 578582479872.0, + "grad_norm": 0.059755639557228325, + "language_loss": 0.86095846, + "learning_rate": 0.0008845088869131641, + "loss": 0.87183452, + "num_input_tokens_seen": 105204848, + "router_z_loss_mlp": 0.34790039, + "step": 1269, + "time_per_iteration": 2.651010274887085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094661, + "balance_loss_mlp": 1.058851, + "epoch": 0.2443247402847249, + "flos": 529600481280.0, + "grad_norm": 0.07776240560166553, + "language_loss": 0.89366186, + "learning_rate": 0.0008843096659067818, + "loss": 0.90460849, + "num_input_tokens_seen": 105273456, + "router_z_loss_mlp": 0.35839844, + "step": 1270, + "time_per_iteration": 2.61082124710083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108773, + "balance_loss_mlp": 1.05292046, + "epoch": 0.24451712196998845, + "flos": 695996651520.0, + "grad_norm": 0.05083497592617014, + "language_loss": 0.86395383, + "learning_rate": 0.000884110295698155, + "loss": 0.87483108, + "num_input_tokens_seen": 105355488, + "router_z_loss_mlp": 0.34863281, + "step": 1271, + "time_per_iteration": 2.930372476577759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085133, + "balance_loss_mlp": 1.05089653, + "epoch": 0.24470950365525201, + "flos": 529575750144.0, + "grad_norm": 0.05520811698213447, + "language_loss": 0.86009014, + "learning_rate": 0.0008839107763646861, + "loss": 0.87094152, + "num_input_tokens_seen": 105421568, + "router_z_loss_mlp": 0.34277344, + "step": 1272, + "time_per_iteration": 2.576322078704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088964, + "balance_loss_mlp": 1.05324888, + "epoch": 0.24490188534051557, + "flos": 491091448320.0, + "grad_norm": 0.0616556586287024, + "language_loss": 0.9024111, + "learning_rate": 0.0008837111079838353, + "loss": 0.91330075, + "num_input_tokens_seen": 105493072, + "router_z_loss_mlp": 0.35742188, + "step": 1273, + "time_per_iteration": 2.6859118938446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109117, + "balance_loss_mlp": 1.05631351, + "epoch": 0.24509426702577913, + "flos": 473916842496.0, + "grad_norm": 0.05704478566457949, + "language_loss": 0.89869869, + "learning_rate": 0.000883511290633121, + "loss": 0.90961039, + "num_input_tokens_seen": 105559840, + "router_z_loss_mlp": 0.34887695, + "step": 1274, + "time_per_iteration": 2.5262861251831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096408, + "balance_loss_mlp": 1.06152773, + "epoch": 0.24528664871104272, + "flos": 550329624576.0, + "grad_norm": 0.04914382449005864, + "language_loss": 0.92288065, + "learning_rate": 0.000883311324390119, + "loss": 0.93384475, + "num_input_tokens_seen": 105634448, + "router_z_loss_mlp": 0.34887695, + "step": 1275, + "time_per_iteration": 2.6791441440582275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100279, + "balance_loss_mlp": 1.0631578, + "epoch": 0.24547903039630628, + "flos": 825546871296.0, + "grad_norm": 0.0705624444694786, + "language_loss": 0.81542301, + "learning_rate": 0.0008831112093324629, + "loss": 0.82642579, + "num_input_tokens_seen": 105711936, + "router_z_loss_mlp": 0.37060547, + "step": 1276, + "time_per_iteration": 3.0612823963165283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100002, + "balance_loss_mlp": 1.06419206, + "epoch": 0.24567141208156984, + "flos": 591325221888.0, + "grad_norm": 0.0822852621967946, + "language_loss": 0.89184481, + "learning_rate": 0.0008829109455378444, + "loss": 0.90284485, + "num_input_tokens_seen": 105780240, + "router_z_loss_mlp": 0.35839844, + "step": 1277, + "time_per_iteration": 2.6601858139038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091196, + "balance_loss_mlp": 1.05567181, + "epoch": 0.2458637937668334, + "flos": 547611715584.0, + "grad_norm": 0.05101212903881184, + "language_loss": 0.86474031, + "learning_rate": 0.000882710533084013, + "loss": 0.87565225, + "num_input_tokens_seen": 105849840, + "router_z_loss_mlp": 0.35546875, + "step": 1278, + "time_per_iteration": 2.6333553791046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087707, + "balance_loss_mlp": 1.05158627, + "epoch": 0.24605617545209696, + "flos": 515641379328.0, + "grad_norm": 0.04855931692812416, + "language_loss": 0.89387107, + "learning_rate": 0.0008825099720487755, + "loss": 0.9047482, + "num_input_tokens_seen": 105921488, + "router_z_loss_mlp": 0.36108398, + "step": 1279, + "time_per_iteration": 2.6388816833496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071735, + "balance_loss_mlp": 1.05943298, + "epoch": 0.24624855713736052, + "flos": 1510953951744.0, + "grad_norm": 0.03612446278815301, + "language_loss": 0.7526114, + "learning_rate": 0.0008823092625099967, + "loss": 0.76332873, + "num_input_tokens_seen": 106146816, + "router_z_loss_mlp": 0.12304688, + "step": 1280, + "time_per_iteration": 4.837193727493286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044143, + "balance_loss_mlp": 1.03145874, + "epoch": 0.24644093882262408, + "flos": 1526826419712.0, + "grad_norm": 0.020354868078157083, + "language_loss": 0.77944112, + "learning_rate": 0.0008821084045455987, + "loss": 0.78988254, + "num_input_tokens_seen": 106361568, + "router_z_loss_mlp": 0.12695312, + "step": 1281, + "time_per_iteration": 4.7473485469818115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078971, + "balance_loss_mlp": 1.04418564, + "epoch": 0.24663332050788764, + "flos": 658811667456.0, + "grad_norm": 0.060866999123497585, + "language_loss": 0.89327228, + "learning_rate": 0.0008819073982335619, + "loss": 0.90406203, + "num_input_tokens_seen": 106435296, + "router_z_loss_mlp": 0.34838867, + "step": 1282, + "time_per_iteration": 2.839691162109375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107951, + "balance_loss_mlp": 1.0446527, + "epoch": 0.24682570219315123, + "flos": 541510977024.0, + "grad_norm": 0.05752783194209404, + "language_loss": 0.84339237, + "learning_rate": 0.0008817062436519235, + "loss": 0.85418749, + "num_input_tokens_seen": 106507184, + "router_z_loss_mlp": 0.34887695, + "step": 1283, + "time_per_iteration": 2.6106019020080566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080546, + "balance_loss_mlp": 1.04459274, + "epoch": 0.24701808387841478, + "flos": 440455131648.0, + "grad_norm": 0.05999718389674832, + "language_loss": 0.89926815, + "learning_rate": 0.0008815049408787788, + "loss": 0.91007358, + "num_input_tokens_seen": 106571472, + "router_z_loss_mlp": 0.36010742, + "step": 1284, + "time_per_iteration": 2.5186686515808105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079282, + "balance_loss_mlp": 1.04518795, + "epoch": 0.24721046556367834, + "flos": 467826278400.0, + "grad_norm": 0.054777388157378364, + "language_loss": 0.8565737, + "learning_rate": 0.0008813034899922805, + "loss": 0.86736655, + "num_input_tokens_seen": 106638368, + "router_z_loss_mlp": 0.34106445, + "step": 1285, + "time_per_iteration": 2.5217878818511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082636, + "balance_loss_mlp": 1.04730225, + "epoch": 0.2474028472489419, + "flos": 504183398400.0, + "grad_norm": 0.06351521025868076, + "language_loss": 0.90182853, + "learning_rate": 0.0008811018910706387, + "loss": 0.91265488, + "num_input_tokens_seen": 106705312, + "router_z_loss_mlp": 0.35375977, + "step": 1286, + "time_per_iteration": 2.549523115158081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010823, + "balance_loss_mlp": 1.04787278, + "epoch": 0.24759522893420546, + "flos": 479707660800.0, + "grad_norm": 0.06857789842871208, + "language_loss": 0.81978023, + "learning_rate": 0.0008809001441921211, + "loss": 0.83060318, + "num_input_tokens_seen": 106778624, + "router_z_loss_mlp": 0.34448242, + "step": 1287, + "time_per_iteration": 2.7147598266601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083728, + "balance_loss_mlp": 1.04984844, + "epoch": 0.24778761061946902, + "flos": 533446000128.0, + "grad_norm": 0.05733880184845353, + "language_loss": 0.85523212, + "learning_rate": 0.0008806982494350528, + "loss": 0.86606944, + "num_input_tokens_seen": 106847744, + "router_z_loss_mlp": 0.33911133, + "step": 1288, + "time_per_iteration": 2.6304967403411865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086884, + "balance_loss_mlp": 1.05197978, + "epoch": 0.24797999230473258, + "flos": 559513446912.0, + "grad_norm": 0.04849910181782432, + "language_loss": 0.90370154, + "learning_rate": 0.0008804962068778161, + "loss": 0.91457039, + "num_input_tokens_seen": 106927584, + "router_z_loss_mlp": 0.34936523, + "step": 1289, + "time_per_iteration": 2.8194985389709473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087315, + "balance_loss_mlp": 1.05291104, + "epoch": 0.24817237398999614, + "flos": 623912426496.0, + "grad_norm": 0.05410640942937228, + "language_loss": 0.80728722, + "learning_rate": 0.0008802940165988511, + "loss": 0.81816041, + "num_input_tokens_seen": 107006656, + "router_z_loss_mlp": 0.34423828, + "step": 1290, + "time_per_iteration": 2.8703298568725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096846, + "balance_loss_mlp": 1.06225193, + "epoch": 0.2483647556752597, + "flos": 611981581824.0, + "grad_norm": 0.06277561181530684, + "language_loss": 0.88376027, + "learning_rate": 0.000880091678676655, + "loss": 0.89472872, + "num_input_tokens_seen": 107084352, + "router_z_loss_mlp": 0.34619141, + "step": 1291, + "time_per_iteration": 2.7943451404571533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088363, + "balance_loss_mlp": 1.05496097, + "epoch": 0.2485571373605233, + "flos": 583270419456.0, + "grad_norm": 0.061640996967182685, + "language_loss": 0.89207399, + "learning_rate": 0.0008798891931897821, + "loss": 0.90295762, + "num_input_tokens_seen": 107158368, + "router_z_loss_mlp": 0.33422852, + "step": 1292, + "time_per_iteration": 2.7013609409332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088304, + "balance_loss_mlp": 1.05463946, + "epoch": 0.24874951904578685, + "flos": 494503391232.0, + "grad_norm": 0.0568342609101268, + "language_loss": 0.84605837, + "learning_rate": 0.0008796865602168447, + "loss": 0.8569414, + "num_input_tokens_seen": 107224256, + "router_z_loss_mlp": 0.33691406, + "step": 1293, + "time_per_iteration": 2.517571210861206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108751, + "balance_loss_mlp": 1.05448937, + "epoch": 0.2489419007310504, + "flos": 455925957120.0, + "grad_norm": 0.05011975537228715, + "language_loss": 0.88745099, + "learning_rate": 0.0008794837798365115, + "loss": 0.8983261, + "num_input_tokens_seen": 107292720, + "router_z_loss_mlp": 0.33032227, + "step": 1294, + "time_per_iteration": 2.6243135929107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093031, + "balance_loss_mlp": 1.05958056, + "epoch": 0.24913428241631397, + "flos": 485198733312.0, + "grad_norm": 0.05031013210073455, + "language_loss": 0.88537574, + "learning_rate": 0.0008792808521275089, + "loss": 0.89630604, + "num_input_tokens_seen": 107368576, + "router_z_loss_mlp": 0.3347168, + "step": 1295, + "time_per_iteration": 2.743821144104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090644, + "balance_loss_mlp": 1.05664551, + "epoch": 0.24932666410157753, + "flos": 518654651904.0, + "grad_norm": 0.0628198177294759, + "language_loss": 0.87554896, + "learning_rate": 0.0008790777771686206, + "loss": 0.8864553, + "num_input_tokens_seen": 107433856, + "router_z_loss_mlp": 0.34033203, + "step": 1296, + "time_per_iteration": 2.55996036529541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091195, + "balance_loss_mlp": 1.05819798, + "epoch": 0.2495190457868411, + "flos": 472365831168.0, + "grad_norm": 0.05367084005526609, + "language_loss": 0.85479438, + "learning_rate": 0.0008788745550386872, + "loss": 0.86570632, + "num_input_tokens_seen": 107500944, + "router_z_loss_mlp": 0.33007812, + "step": 1297, + "time_per_iteration": 2.555238723754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099543, + "balance_loss_mlp": 1.06494844, + "epoch": 0.24971142747210465, + "flos": 745559202816.0, + "grad_norm": 0.05557204977607519, + "language_loss": 0.80045742, + "learning_rate": 0.0008786711858166063, + "loss": 0.81145287, + "num_input_tokens_seen": 107580000, + "router_z_loss_mlp": 0.34643555, + "step": 1298, + "time_per_iteration": 2.940908670425415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090601, + "balance_loss_mlp": 1.05757999, + "epoch": 0.2499038091573682, + "flos": 749222839296.0, + "grad_norm": 0.08262860681241094, + "language_loss": 0.83490336, + "learning_rate": 0.0008784676695813332, + "loss": 0.84580934, + "num_input_tokens_seen": 107660384, + "router_z_loss_mlp": 0.33032227, + "step": 1299, + "time_per_iteration": 2.966646432876587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092772, + "balance_loss_mlp": 1.05870187, + "epoch": 0.2500961908426318, + "flos": 744741513216.0, + "grad_norm": 0.04756275395178792, + "language_loss": 0.84761405, + "learning_rate": 0.0008782640064118796, + "loss": 0.85854173, + "num_input_tokens_seen": 107736320, + "router_z_loss_mlp": 0.34082031, + "step": 1300, + "time_per_iteration": 2.8889827728271484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078115, + "balance_loss_mlp": 1.06447709, + "epoch": 0.2502885725278953, + "flos": 1416652180992.0, + "grad_norm": 0.036683670441934005, + "language_loss": 0.7618475, + "learning_rate": 0.0008780601963873149, + "loss": 0.77262866, + "num_input_tokens_seen": 107972608, + "router_z_loss_mlp": 0.13671875, + "step": 1301, + "time_per_iteration": 4.988169431686401 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094633, + "balance_loss_mlp": 1.06196928, + "epoch": 0.2504809542131589, + "flos": 514961902080.0, + "grad_norm": 0.05923567857946263, + "language_loss": 0.86476314, + "learning_rate": 0.0008778562395867648, + "loss": 0.87570941, + "num_input_tokens_seen": 108043312, + "router_z_loss_mlp": 0.32666016, + "step": 1302, + "time_per_iteration": 2.5900919437408447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087269, + "balance_loss_mlp": 1.05436766, + "epoch": 0.25067333589842244, + "flos": 525562905600.0, + "grad_norm": 0.06049595368492962, + "language_loss": 0.83774143, + "learning_rate": 0.0008776521360894127, + "loss": 0.8486141, + "num_input_tokens_seen": 108114144, + "router_z_loss_mlp": 0.32910156, + "step": 1303, + "time_per_iteration": 2.6029298305511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035824, + "balance_loss_mlp": 1.02275884, + "epoch": 0.25086571758368603, + "flos": 1473085108224.0, + "grad_norm": 0.024331867442101186, + "language_loss": 0.78962064, + "learning_rate": 0.0008774478859744984, + "loss": 0.79997885, + "num_input_tokens_seen": 108338720, + "router_z_loss_mlp": 0.13085938, + "step": 1304, + "time_per_iteration": 4.800757646560669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096353, + "balance_loss_mlp": 1.063833, + "epoch": 0.2510580992689496, + "flos": 528128045568.0, + "grad_norm": 0.053799887970574674, + "language_loss": 0.90341735, + "learning_rate": 0.0008772434893213186, + "loss": 0.91438091, + "num_input_tokens_seen": 108405456, + "router_z_loss_mlp": 0.32519531, + "step": 1305, + "time_per_iteration": 2.5816421508789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104465, + "balance_loss_mlp": 1.07123005, + "epoch": 0.25125048095421315, + "flos": 517192390656.0, + "grad_norm": 0.058690449713219205, + "language_loss": 0.84433925, + "learning_rate": 0.0008770389462092276, + "loss": 0.85538393, + "num_input_tokens_seen": 108474368, + "router_z_loss_mlp": 0.33251953, + "step": 1306, + "time_per_iteration": 2.6747090816497803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106931, + "balance_loss_mlp": 1.07309926, + "epoch": 0.25144286263947674, + "flos": 620160039936.0, + "grad_norm": 0.16660488736040688, + "language_loss": 0.86719346, + "learning_rate": 0.0008768342567176357, + "loss": 0.87826276, + "num_input_tokens_seen": 108548864, + "router_z_loss_mlp": 0.33862305, + "step": 1307, + "time_per_iteration": 2.7788002490997314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098824, + "balance_loss_mlp": 1.06425333, + "epoch": 0.25163524432474027, + "flos": 503534444544.0, + "grad_norm": 0.04824933548887647, + "language_loss": 0.90589297, + "learning_rate": 0.0008766294209260107, + "loss": 0.91688126, + "num_input_tokens_seen": 108623072, + "router_z_loss_mlp": 0.34619141, + "step": 1308, + "time_per_iteration": 2.6300241947174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010929, + "balance_loss_mlp": 1.05852032, + "epoch": 0.25182762601000386, + "flos": 508821875712.0, + "grad_norm": 0.0633327884934456, + "language_loss": 0.91549027, + "learning_rate": 0.0008764244389138767, + "loss": 0.92641926, + "num_input_tokens_seen": 108690128, + "router_z_loss_mlp": 0.34399414, + "step": 1309, + "time_per_iteration": 2.574056625366211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088516, + "balance_loss_mlp": 1.05306351, + "epoch": 0.2520200076952674, + "flos": 633596815872.0, + "grad_norm": 0.05898934519456769, + "language_loss": 0.8269434, + "learning_rate": 0.000876219310760815, + "loss": 0.83782852, + "num_input_tokens_seen": 108770272, + "router_z_loss_mlp": 0.35449219, + "step": 1310, + "time_per_iteration": 2.87404465675354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010989, + "balance_loss_mlp": 1.06299448, + "epoch": 0.252212389380531, + "flos": 494385527808.0, + "grad_norm": 0.05968729718727878, + "language_loss": 0.8144334, + "learning_rate": 0.0008760140365464631, + "loss": 0.82542241, + "num_input_tokens_seen": 108840592, + "router_z_loss_mlp": 0.35913086, + "step": 1311, + "time_per_iteration": 2.599480390548706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105508, + "balance_loss_mlp": 1.07062793, + "epoch": 0.2524047710657945, + "flos": 490298489856.0, + "grad_norm": 0.06557576312810307, + "language_loss": 0.87226975, + "learning_rate": 0.0008758086163505156, + "loss": 0.88332486, + "num_input_tokens_seen": 108910064, + "router_z_loss_mlp": 0.34912109, + "step": 1312, + "time_per_iteration": 2.6165666580200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112941, + "balance_loss_mlp": 1.07698762, + "epoch": 0.2525971527510581, + "flos": 647136898560.0, + "grad_norm": 0.06425852435188892, + "language_loss": 0.89039612, + "learning_rate": 0.0008756030502527239, + "loss": 0.90152562, + "num_input_tokens_seen": 108986336, + "router_z_loss_mlp": 0.36010742, + "step": 1313, + "time_per_iteration": 2.794595956802368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112418, + "balance_loss_mlp": 1.07636952, + "epoch": 0.2527895344363217, + "flos": 568991222784.0, + "grad_norm": 0.05792474282671988, + "language_loss": 0.90396988, + "learning_rate": 0.0008753973383328954, + "loss": 0.91509414, + "num_input_tokens_seen": 109059712, + "router_z_loss_mlp": 0.36108398, + "step": 1314, + "time_per_iteration": 2.66343355178833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110344, + "balance_loss_mlp": 1.07491553, + "epoch": 0.2529819161215852, + "flos": 513795004416.0, + "grad_norm": 0.10488361484557306, + "language_loss": 0.84231019, + "learning_rate": 0.0008751914806708952, + "loss": 0.8534137, + "num_input_tokens_seen": 109127504, + "router_z_loss_mlp": 0.35449219, + "step": 1315, + "time_per_iteration": 2.5714006423950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099121, + "balance_loss_mlp": 1.06357241, + "epoch": 0.2531742978068488, + "flos": 530979784704.0, + "grad_norm": 0.0646255116041034, + "language_loss": 0.81763697, + "learning_rate": 0.0008749854773466439, + "loss": 0.82862812, + "num_input_tokens_seen": 109198080, + "router_z_loss_mlp": 0.35571289, + "step": 1316, + "time_per_iteration": 2.6507568359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093995, + "balance_loss_mlp": 1.05892396, + "epoch": 0.25336667949211233, + "flos": 596362369536.0, + "grad_norm": 0.11519177634747009, + "language_loss": 0.84297431, + "learning_rate": 0.0008747793284401192, + "loss": 0.8539142, + "num_input_tokens_seen": 109268368, + "router_z_loss_mlp": 0.35107422, + "step": 1317, + "time_per_iteration": 2.6708261966705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109538, + "balance_loss_mlp": 1.05966473, + "epoch": 0.2535590611773759, + "flos": 601764691968.0, + "grad_norm": 0.05376009268762157, + "language_loss": 0.86145389, + "learning_rate": 0.0008745730340313551, + "loss": 0.87240773, + "num_input_tokens_seen": 109344112, + "router_z_loss_mlp": 0.35742188, + "step": 1318, + "time_per_iteration": 2.7465810775756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100043, + "balance_loss_mlp": 1.06504369, + "epoch": 0.25375144286263945, + "flos": 495079561728.0, + "grad_norm": 0.053440140598651036, + "language_loss": 0.8468703, + "learning_rate": 0.0008743665942004422, + "loss": 0.85787076, + "num_input_tokens_seen": 109414112, + "router_z_loss_mlp": 0.35009766, + "step": 1319, + "time_per_iteration": 2.632645606994629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109509, + "balance_loss_mlp": 1.05908918, + "epoch": 0.25394382454790304, + "flos": 512219261952.0, + "grad_norm": 0.050076364746318706, + "language_loss": 0.92529714, + "learning_rate": 0.0008741600090275277, + "loss": 0.93624806, + "num_input_tokens_seen": 109484336, + "router_z_loss_mlp": 0.35986328, + "step": 1320, + "time_per_iteration": 2.564730405807495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097426, + "balance_loss_mlp": 1.06092453, + "epoch": 0.25413620623316663, + "flos": 958586047488.0, + "grad_norm": 0.058049172943507095, + "language_loss": 0.83939385, + "learning_rate": 0.0008739532785928151, + "loss": 0.85036814, + "num_input_tokens_seen": 109590128, + "router_z_loss_mlp": 0.36474609, + "step": 1321, + "time_per_iteration": 3.4496617317199707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056798, + "balance_loss_mlp": 1.04439986, + "epoch": 0.25432858791843016, + "flos": 1576445635584.0, + "grad_norm": 0.03297734471592195, + "language_loss": 0.74893582, + "learning_rate": 0.0008737464029765639, + "loss": 0.75950378, + "num_input_tokens_seen": 109816592, + "router_z_loss_mlp": 0.12353516, + "step": 1322, + "time_per_iteration": 4.803644418716431 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102711, + "balance_loss_mlp": 1.06706691, + "epoch": 0.25452096960369375, + "flos": 583530877440.0, + "grad_norm": 0.056711392027496164, + "language_loss": 0.83213425, + "learning_rate": 0.0008735393822590908, + "loss": 0.84316134, + "num_input_tokens_seen": 109890464, + "router_z_loss_mlp": 0.35668945, + "step": 1323, + "time_per_iteration": 2.6643528938293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099364, + "balance_loss_mlp": 1.06434083, + "epoch": 0.2547133512889573, + "flos": 508344629760.0, + "grad_norm": 0.06006943476027706, + "language_loss": 0.87018919, + "learning_rate": 0.0008733322165207681, + "loss": 0.88118285, + "num_input_tokens_seen": 109963408, + "router_z_loss_mlp": 0.35083008, + "step": 1324, + "time_per_iteration": 2.627495765686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110409, + "balance_loss_mlp": 1.06863689, + "epoch": 0.25490573297422087, + "flos": 782266940928.0, + "grad_norm": 0.05604709920606865, + "language_loss": 0.83055937, + "learning_rate": 0.0008731249058420247, + "loss": 0.8416003, + "num_input_tokens_seen": 110048800, + "router_z_loss_mlp": 0.35498047, + "step": 1325, + "time_per_iteration": 3.0361831188201904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100095, + "balance_loss_mlp": 1.06468964, + "epoch": 0.2550981146594844, + "flos": 509610451968.0, + "grad_norm": 0.06314633870869373, + "language_loss": 0.90780556, + "learning_rate": 0.0008729174503033459, + "loss": 0.91880649, + "num_input_tokens_seen": 110118096, + "router_z_loss_mlp": 0.35424805, + "step": 1326, + "time_per_iteration": 2.639625072479248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109522, + "balance_loss_mlp": 1.06007695, + "epoch": 0.255290496344748, + "flos": 676360212480.0, + "grad_norm": 0.06489195741671011, + "language_loss": 0.82650065, + "learning_rate": 0.0008727098499852728, + "loss": 0.83745289, + "num_input_tokens_seen": 110190160, + "router_z_loss_mlp": 0.35180664, + "step": 1327, + "time_per_iteration": 2.830500602722168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109753, + "balance_loss_mlp": 1.06231546, + "epoch": 0.2554828780300115, + "flos": 537524273664.0, + "grad_norm": 0.06666455638552511, + "language_loss": 0.89945138, + "learning_rate": 0.0008725021049684034, + "loss": 0.91042662, + "num_input_tokens_seen": 110268000, + "router_z_loss_mlp": 0.35253906, + "step": 1328, + "time_per_iteration": 2.747800350189209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097741, + "balance_loss_mlp": 1.06240726, + "epoch": 0.2556752597152751, + "flos": 823828534272.0, + "grad_norm": 0.052131047599379726, + "language_loss": 0.82919741, + "learning_rate": 0.000872294215333391, + "loss": 0.84017479, + "num_input_tokens_seen": 110354816, + "router_z_loss_mlp": 0.35400391, + "step": 1329, + "time_per_iteration": 3.1658926010131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096694, + "balance_loss_mlp": 1.06176591, + "epoch": 0.2558676414005387, + "flos": 570517502976.0, + "grad_norm": 0.05425014800623288, + "language_loss": 0.82993001, + "learning_rate": 0.0008720861811609457, + "loss": 0.84089696, + "num_input_tokens_seen": 110427968, + "router_z_loss_mlp": 0.34985352, + "step": 1330, + "time_per_iteration": 2.709085702896118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101009, + "balance_loss_mlp": 1.06448317, + "epoch": 0.2560600230858022, + "flos": 486419475456.0, + "grad_norm": 0.05425594622111712, + "language_loss": 0.83756936, + "learning_rate": 0.0008718780025318338, + "loss": 0.84857947, + "num_input_tokens_seen": 110501184, + "router_z_loss_mlp": 0.36523438, + "step": 1331, + "time_per_iteration": 2.7126388549804688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097427, + "balance_loss_mlp": 1.06280875, + "epoch": 0.2562524047710658, + "flos": 512874008064.0, + "grad_norm": 0.06594145834934585, + "language_loss": 0.8406449, + "learning_rate": 0.0008716696795268771, + "loss": 0.85161918, + "num_input_tokens_seen": 110573008, + "router_z_loss_mlp": 0.34667969, + "step": 1332, + "time_per_iteration": 2.6650350093841553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089329, + "balance_loss_mlp": 1.05308938, + "epoch": 0.25644478645632934, + "flos": 634498873344.0, + "grad_norm": 0.051413439896644035, + "language_loss": 0.85076845, + "learning_rate": 0.0008714612122269538, + "loss": 0.86166173, + "num_input_tokens_seen": 110646704, + "router_z_loss_mlp": 0.36279297, + "step": 1333, + "time_per_iteration": 2.8611392974853516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109443, + "balance_loss_mlp": 1.05754697, + "epoch": 0.25663716814159293, + "flos": 436353537024.0, + "grad_norm": 0.0705935369031189, + "language_loss": 0.89120972, + "learning_rate": 0.0008712526007129982, + "loss": 0.90215403, + "num_input_tokens_seen": 110712208, + "router_z_loss_mlp": 0.36889648, + "step": 1334, + "time_per_iteration": 2.5217065811157227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109442, + "balance_loss_mlp": 1.05813241, + "epoch": 0.25682954982685646, + "flos": 497892013056.0, + "grad_norm": 0.06578019441075163, + "language_loss": 0.90784955, + "learning_rate": 0.0008710438450660003, + "loss": 0.91879368, + "num_input_tokens_seen": 110783936, + "router_z_loss_mlp": 0.36303711, + "step": 1335, + "time_per_iteration": 2.651367425918579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093579, + "balance_loss_mlp": 1.05643392, + "epoch": 0.25702193151212005, + "flos": 457471176192.0, + "grad_norm": 0.07087944464696884, + "language_loss": 0.8744905, + "learning_rate": 0.0008708349453670064, + "loss": 0.88542628, + "num_input_tokens_seen": 110848560, + "router_z_loss_mlp": 0.37133789, + "step": 1336, + "time_per_iteration": 2.51411771774292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090754, + "balance_loss_mlp": 1.0543952, + "epoch": 0.2572143131973836, + "flos": 598002130944.0, + "grad_norm": 0.06329524505646734, + "language_loss": 0.91480416, + "learning_rate": 0.0008706259016971185, + "loss": 0.92571175, + "num_input_tokens_seen": 110922672, + "router_z_loss_mlp": 0.36401367, + "step": 1337, + "time_per_iteration": 2.754173517227173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096487, + "balance_loss_mlp": 1.0589596, + "epoch": 0.25740669488264717, + "flos": 698004559872.0, + "grad_norm": 0.06697174190166053, + "language_loss": 0.83331275, + "learning_rate": 0.0008704167141374944, + "loss": 0.84427762, + "num_input_tokens_seen": 110995456, + "router_z_loss_mlp": 0.375, + "step": 1338, + "time_per_iteration": 2.795552968978882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011015, + "balance_loss_mlp": 1.06385398, + "epoch": 0.25759907656791076, + "flos": 502130409984.0, + "grad_norm": 0.06639008708045263, + "language_loss": 0.88657552, + "learning_rate": 0.0008702073827693482, + "loss": 0.89759052, + "num_input_tokens_seen": 111069568, + "router_z_loss_mlp": 0.3762207, + "step": 1339, + "time_per_iteration": 2.6935572624206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103743, + "balance_loss_mlp": 1.065763, + "epoch": 0.2577914582531743, + "flos": 773541425664.0, + "grad_norm": 0.06917089880544881, + "language_loss": 0.88938046, + "learning_rate": 0.0008699979076739494, + "loss": 0.90041792, + "num_input_tokens_seen": 111142608, + "router_z_loss_mlp": 0.37963867, + "step": 1340, + "time_per_iteration": 2.951148509979248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102218, + "balance_loss_mlp": 1.06552505, + "epoch": 0.2579838399384379, + "flos": 459431032320.0, + "grad_norm": 0.07085954822691051, + "language_loss": 0.88831556, + "learning_rate": 0.0008697882889326234, + "loss": 0.89933777, + "num_input_tokens_seen": 111206336, + "router_z_loss_mlp": 0.36669922, + "step": 1341, + "time_per_iteration": 2.492182731628418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105931, + "balance_loss_mlp": 1.06916654, + "epoch": 0.2581762216237014, + "flos": 568917029376.0, + "grad_norm": 0.060702491086151805, + "language_loss": 0.86630756, + "learning_rate": 0.0008695785266267515, + "loss": 0.8773669, + "num_input_tokens_seen": 111276736, + "router_z_loss_mlp": 0.36816406, + "step": 1342, + "time_per_iteration": 2.6635031700134277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111038, + "balance_loss_mlp": 1.07448828, + "epoch": 0.258368603308965, + "flos": 603906430464.0, + "grad_norm": 0.06467765584173796, + "language_loss": 0.83112109, + "learning_rate": 0.0008693686208377704, + "loss": 0.84223145, + "num_input_tokens_seen": 111353856, + "router_z_loss_mlp": 0.36547852, + "step": 1343, + "time_per_iteration": 2.7769596576690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105659, + "balance_loss_mlp": 1.06975329, + "epoch": 0.2585609849942285, + "flos": 491204929536.0, + "grad_norm": 0.06376456739082713, + "language_loss": 0.88889539, + "learning_rate": 0.0008691585716471733, + "loss": 0.89995199, + "num_input_tokens_seen": 111424960, + "router_z_loss_mlp": 0.35913086, + "step": 1344, + "time_per_iteration": 2.6467716693878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104044, + "balance_loss_mlp": 1.06809044, + "epoch": 0.2587533666794921, + "flos": 640455607296.0, + "grad_norm": 0.057733681270749564, + "language_loss": 0.85255873, + "learning_rate": 0.0008689483791365079, + "loss": 0.86359918, + "num_input_tokens_seen": 111505248, + "router_z_loss_mlp": 0.35961914, + "step": 1345, + "time_per_iteration": 2.8041999340057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099237, + "balance_loss_mlp": 1.06380773, + "epoch": 0.2589457483647557, + "flos": 576564397056.0, + "grad_norm": 0.05015471530609978, + "language_loss": 0.89365089, + "learning_rate": 0.0008687380433873786, + "loss": 0.9046433, + "num_input_tokens_seen": 111581936, + "router_z_loss_mlp": 0.35473633, + "step": 1346, + "time_per_iteration": 2.7955591678619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101447, + "balance_loss_mlp": 1.06630445, + "epoch": 0.25913813005001923, + "flos": 535164337152.0, + "grad_norm": 0.06074647569776127, + "language_loss": 0.82164252, + "learning_rate": 0.0008685275644814448, + "loss": 0.83265698, + "num_input_tokens_seen": 111651456, + "router_z_loss_mlp": 0.3515625, + "step": 1347, + "time_per_iteration": 2.6922154426574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100414, + "balance_loss_mlp": 1.06419861, + "epoch": 0.2593305117352828, + "flos": 720713908224.0, + "grad_norm": 0.05981927153656866, + "language_loss": 0.8445859, + "learning_rate": 0.0008683169425004216, + "loss": 0.85558999, + "num_input_tokens_seen": 111731712, + "router_z_loss_mlp": 0.36230469, + "step": 1348, + "time_per_iteration": 2.8701395988464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108751, + "balance_loss_mlp": 1.05186677, + "epoch": 0.25952289342054635, + "flos": 709782635520.0, + "grad_norm": 0.06994851779161643, + "language_loss": 0.83445579, + "learning_rate": 0.0008681061775260799, + "loss": 0.84533083, + "num_input_tokens_seen": 111800752, + "router_z_loss_mlp": 0.35644531, + "step": 1349, + "time_per_iteration": 2.8206968307495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096473, + "balance_loss_mlp": 1.06032848, + "epoch": 0.25971527510580994, + "flos": 455688820224.0, + "grad_norm": 0.06118298275127208, + "language_loss": 0.91987318, + "learning_rate": 0.0008678952696402458, + "loss": 0.93083793, + "num_input_tokens_seen": 111866752, + "router_z_loss_mlp": 0.36132812, + "step": 1350, + "time_per_iteration": 2.5547540187835693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108511, + "balance_loss_mlp": 1.04932308, + "epoch": 0.25990765679107347, + "flos": 612223100928.0, + "grad_norm": 0.04808004024566397, + "language_loss": 0.86496055, + "learning_rate": 0.000867684218924801, + "loss": 0.8758117, + "num_input_tokens_seen": 111951328, + "router_z_loss_mlp": 0.35791016, + "step": 1351, + "time_per_iteration": 2.8406949043273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110301, + "balance_loss_mlp": 1.09857082, + "epoch": 0.26010003847633706, + "flos": 1537105766400.0, + "grad_norm": 0.059206679514604114, + "language_loss": 0.78947091, + "learning_rate": 0.0008674730254616827, + "loss": 0.80057395, + "num_input_tokens_seen": 112182272, + "router_z_loss_mlp": 0.1171875, + "step": 1352, + "time_per_iteration": 4.8775153160095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082749, + "balance_loss_mlp": 1.04753494, + "epoch": 0.2602924201616006, + "flos": 715947393024.0, + "grad_norm": 0.046134849134736367, + "language_loss": 0.85103661, + "learning_rate": 0.0008672616893328834, + "loss": 0.86186409, + "num_input_tokens_seen": 112261760, + "router_z_loss_mlp": 0.35253906, + "step": 1353, + "time_per_iteration": 2.98063588142395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108322, + "balance_loss_mlp": 1.04764819, + "epoch": 0.2604848018468642, + "flos": 643241917440.0, + "grad_norm": 0.060512322591449175, + "language_loss": 0.9000203, + "learning_rate": 0.0008670502106204512, + "loss": 0.91085243, + "num_input_tokens_seen": 112339136, + "router_z_loss_mlp": 0.35595703, + "step": 1354, + "time_per_iteration": 2.832679271697998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087805, + "balance_loss_mlp": 1.05073047, + "epoch": 0.26067718353212777, + "flos": 516783545856.0, + "grad_norm": 0.05860289542603218, + "language_loss": 0.8165139, + "learning_rate": 0.0008668385894064892, + "loss": 0.82739192, + "num_input_tokens_seen": 112409872, + "router_z_loss_mlp": 0.37084961, + "step": 1355, + "time_per_iteration": 2.6204822063446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086317, + "balance_loss_mlp": 1.05143666, + "epoch": 0.2608695652173913, + "flos": 822361890816.0, + "grad_norm": 0.0623840657908754, + "language_loss": 0.88803548, + "learning_rate": 0.0008666268257731562, + "loss": 0.8988986, + "num_input_tokens_seen": 112495616, + "router_z_loss_mlp": 0.34912109, + "step": 1356, + "time_per_iteration": 3.113147735595703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109295, + "balance_loss_mlp": 1.0566628, + "epoch": 0.2610619469026549, + "flos": 1007451744768.0, + "grad_norm": 0.056693012024963345, + "language_loss": 0.85794425, + "learning_rate": 0.0008664149198026662, + "loss": 0.86887372, + "num_input_tokens_seen": 112575168, + "router_z_loss_mlp": 0.36279297, + "step": 1357, + "time_per_iteration": 3.2569541931152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095714, + "balance_loss_mlp": 1.06040418, + "epoch": 0.2612543285879184, + "flos": 536523291648.0, + "grad_norm": 0.061594313952015485, + "language_loss": 0.88599586, + "learning_rate": 0.0008662028715772883, + "loss": 0.89695299, + "num_input_tokens_seen": 112648480, + "router_z_loss_mlp": 0.35351562, + "step": 1358, + "time_per_iteration": 2.6102256774902344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100258, + "balance_loss_mlp": 1.06475711, + "epoch": 0.261446710273182, + "flos": 519166803456.0, + "grad_norm": 0.04975036534081278, + "language_loss": 0.85662109, + "learning_rate": 0.0008659906811793467, + "loss": 0.86762363, + "num_input_tokens_seen": 112719856, + "router_z_loss_mlp": 0.35546875, + "step": 1359, + "time_per_iteration": 2.6921935081481934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101082, + "balance_loss_mlp": 1.06543839, + "epoch": 0.26163909195844554, + "flos": 582975055872.0, + "grad_norm": 0.06646109128582675, + "language_loss": 0.89397144, + "learning_rate": 0.0008657783486912215, + "loss": 0.90498233, + "num_input_tokens_seen": 112795088, + "router_z_loss_mlp": 0.35693359, + "step": 1360, + "time_per_iteration": 2.7003283500671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110494, + "balance_loss_mlp": 1.06920147, + "epoch": 0.2618314736437091, + "flos": 958362057216.0, + "grad_norm": 0.06344844215605515, + "language_loss": 0.89840877, + "learning_rate": 0.0008655658741953472, + "loss": 0.90945816, + "num_input_tokens_seen": 112879888, + "router_z_loss_mlp": 0.35742188, + "step": 1361, + "time_per_iteration": 3.207960844039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101357, + "balance_loss_mlp": 1.0664053, + "epoch": 0.26202385532897265, + "flos": 574530347520.0, + "grad_norm": 0.04606923720206454, + "language_loss": 0.88105857, + "learning_rate": 0.0008653532577742136, + "loss": 0.89207214, + "num_input_tokens_seen": 112952208, + "router_z_loss_mlp": 0.34960938, + "step": 1362, + "time_per_iteration": 2.69209885597229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091565, + "balance_loss_mlp": 1.05744767, + "epoch": 0.26221623701423624, + "flos": 445240585728.0, + "grad_norm": 0.05480512848555835, + "language_loss": 0.87200153, + "learning_rate": 0.0008651404995103659, + "loss": 0.88291717, + "num_input_tokens_seen": 113017472, + "router_z_loss_mlp": 0.34155273, + "step": 1363, + "time_per_iteration": 2.5325255393981934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095927, + "balance_loss_mlp": 1.06164205, + "epoch": 0.26240861869949983, + "flos": 535459700736.0, + "grad_norm": 0.04992660146640532, + "language_loss": 0.870713, + "learning_rate": 0.0008649275994864041, + "loss": 0.8816722, + "num_input_tokens_seen": 113090000, + "router_z_loss_mlp": 0.34301758, + "step": 1364, + "time_per_iteration": 2.682365894317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098846, + "balance_loss_mlp": 1.06267846, + "epoch": 0.26260100038476336, + "flos": 564940500480.0, + "grad_norm": 0.05369640644722127, + "language_loss": 0.83917898, + "learning_rate": 0.0008647145577849834, + "loss": 0.85016745, + "num_input_tokens_seen": 113169424, + "router_z_loss_mlp": 0.36157227, + "step": 1365, + "time_per_iteration": 2.8129918575286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102304, + "balance_loss_mlp": 1.06701851, + "epoch": 0.26279338207002695, + "flos": 612745426944.0, + "grad_norm": 0.045782565775991005, + "language_loss": 0.82809973, + "learning_rate": 0.0008645013744888139, + "loss": 0.83912277, + "num_input_tokens_seen": 113256752, + "router_z_loss_mlp": 0.35327148, + "step": 1366, + "time_per_iteration": 2.8523411750793457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101615, + "balance_loss_mlp": 1.06730664, + "epoch": 0.2629857637552905, + "flos": 522555425280.0, + "grad_norm": 0.0597350257589219, + "language_loss": 0.87579656, + "learning_rate": 0.0008642880496806607, + "loss": 0.88681269, + "num_input_tokens_seen": 113330512, + "router_z_loss_mlp": 0.34350586, + "step": 1367, + "time_per_iteration": 2.766350507736206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105811, + "balance_loss_mlp": 1.0706687, + "epoch": 0.26317814544055407, + "flos": 534273864192.0, + "grad_norm": 0.05812227598952832, + "language_loss": 0.84219468, + "learning_rate": 0.0008640745834433437, + "loss": 0.85325277, + "num_input_tokens_seen": 113409088, + "router_z_loss_mlp": 0.35205078, + "step": 1368, + "time_per_iteration": 2.7220964431762695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100459, + "balance_loss_mlp": 1.06553102, + "epoch": 0.2633705271258176, + "flos": 555235762176.0, + "grad_norm": 0.06954601812969684, + "language_loss": 0.86862296, + "learning_rate": 0.000863860975859738, + "loss": 0.87962759, + "num_input_tokens_seen": 113486624, + "router_z_loss_mlp": 0.34960938, + "step": 1369, + "time_per_iteration": 2.8985280990600586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094199, + "balance_loss_mlp": 1.05931866, + "epoch": 0.2635629088110812, + "flos": 552136711680.0, + "grad_norm": 0.06493737783890446, + "language_loss": 0.88711715, + "learning_rate": 0.0008636472270127733, + "loss": 0.89805913, + "num_input_tokens_seen": 113555776, + "router_z_loss_mlp": 0.34936523, + "step": 1370, + "time_per_iteration": 2.634615182876587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090389, + "balance_loss_mlp": 1.05498338, + "epoch": 0.2637552904963448, + "flos": 455752839168.0, + "grad_norm": 0.062476231294863314, + "language_loss": 0.89913595, + "learning_rate": 0.0008634333369854345, + "loss": 0.91003978, + "num_input_tokens_seen": 113624208, + "router_z_loss_mlp": 0.35424805, + "step": 1371, + "time_per_iteration": 2.5908331871032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082759, + "balance_loss_mlp": 1.04818797, + "epoch": 0.2639476721816083, + "flos": 612847323648.0, + "grad_norm": 0.05509554660574217, + "language_loss": 0.87495965, + "learning_rate": 0.0008632193058607608, + "loss": 0.88578725, + "num_input_tokens_seen": 113698544, + "router_z_loss_mlp": 0.34594727, + "step": 1372, + "time_per_iteration": 2.6963188648223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108436, + "balance_loss_mlp": 1.04878759, + "epoch": 0.2641400538668719, + "flos": 571645112832.0, + "grad_norm": 0.05982264925210271, + "language_loss": 0.81028771, + "learning_rate": 0.0008630051337218466, + "loss": 0.82113135, + "num_input_tokens_seen": 113769024, + "router_z_loss_mlp": 0.35595703, + "step": 1373, + "time_per_iteration": 2.644540786743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079985, + "balance_loss_mlp": 1.04582, + "epoch": 0.2643324355521354, + "flos": 581979866112.0, + "grad_norm": 0.08561984623812412, + "language_loss": 0.82428128, + "learning_rate": 0.0008627908206518409, + "loss": 0.8350811, + "num_input_tokens_seen": 113836320, + "router_z_loss_mlp": 0.34179688, + "step": 1374, + "time_per_iteration": 2.660578966140747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110354, + "balance_loss_mlp": 1.08904421, + "epoch": 0.264524817237399, + "flos": 1543845284352.0, + "grad_norm": 0.03725698642258328, + "language_loss": 0.75151253, + "learning_rate": 0.0008625763667339472, + "loss": 0.76254791, + "num_input_tokens_seen": 114065040, + "router_z_loss_mlp": 0.14453125, + "step": 1375, + "time_per_iteration": 4.9595324993133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079461, + "balance_loss_mlp": 1.04474711, + "epoch": 0.26471719892266254, + "flos": 517783117824.0, + "grad_norm": 0.05493851972821551, + "language_loss": 0.91330564, + "learning_rate": 0.0008623617720514241, + "loss": 0.92410028, + "num_input_tokens_seen": 114133488, + "router_z_loss_mlp": 0.34741211, + "step": 1376, + "time_per_iteration": 2.5929205417633057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090244, + "balance_loss_mlp": 1.05498242, + "epoch": 0.26490958060792613, + "flos": 516936314880.0, + "grad_norm": 0.08106601153347975, + "language_loss": 0.84946424, + "learning_rate": 0.0008621470366875848, + "loss": 0.8603667, + "num_input_tokens_seen": 114200704, + "router_z_loss_mlp": 0.3527832, + "step": 1377, + "time_per_iteration": 2.5729684829711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084439, + "balance_loss_mlp": 1.0497725, + "epoch": 0.26510196229318966, + "flos": 596298350592.0, + "grad_norm": 0.05588669268878349, + "language_loss": 0.87771004, + "learning_rate": 0.0008619321607257966, + "loss": 0.88855445, + "num_input_tokens_seen": 114272160, + "router_z_loss_mlp": 0.34692383, + "step": 1378, + "time_per_iteration": 2.6708004474639893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082921, + "balance_loss_mlp": 1.0483501, + "epoch": 0.26529434397845325, + "flos": 685488780288.0, + "grad_norm": 0.051774701706919043, + "language_loss": 0.82311988, + "learning_rate": 0.000861717144249482, + "loss": 0.83394915, + "num_input_tokens_seen": 114347904, + "router_z_loss_mlp": 0.34594727, + "step": 1379, + "time_per_iteration": 2.8249831199645996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081979, + "balance_loss_mlp": 1.0468595, + "epoch": 0.26548672566371684, + "flos": 424127328768.0, + "grad_norm": 0.06288210815556809, + "language_loss": 0.90205348, + "learning_rate": 0.0008615019873421175, + "loss": 0.91287327, + "num_input_tokens_seen": 114409952, + "router_z_loss_mlp": 0.35131836, + "step": 1380, + "time_per_iteration": 2.455320358276367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108606, + "balance_loss_mlp": 1.05108428, + "epoch": 0.26567910734898037, + "flos": 489619012608.0, + "grad_norm": 0.05393715583789803, + "language_loss": 0.8609767, + "learning_rate": 0.0008612866900872349, + "loss": 0.87183726, + "num_input_tokens_seen": 114474832, + "router_z_loss_mlp": 0.35009766, + "step": 1381, + "time_per_iteration": 2.54070782661438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083002, + "balance_loss_mlp": 1.04895568, + "epoch": 0.26587148903424396, + "flos": 533947977216.0, + "grad_norm": 0.05290962754614328, + "language_loss": 0.88052452, + "learning_rate": 0.0008610712525684197, + "loss": 0.8913545, + "num_input_tokens_seen": 114545152, + "router_z_loss_mlp": 0.34082031, + "step": 1382, + "time_per_iteration": 2.6350595951080322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084878, + "balance_loss_mlp": 1.05049801, + "epoch": 0.2660638707195075, + "flos": 1017067732992.0, + "grad_norm": 0.06267977315545337, + "language_loss": 0.84534729, + "learning_rate": 0.0008608556748693121, + "loss": 0.85619605, + "num_input_tokens_seen": 114626512, + "router_z_loss_mlp": 0.34423828, + "step": 1383, + "time_per_iteration": 3.231172561645508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086499, + "balance_loss_mlp": 1.05216646, + "epoch": 0.2662562524047711, + "flos": 523712148480.0, + "grad_norm": 0.0585640776606728, + "language_loss": 0.86247015, + "learning_rate": 0.000860639957073607, + "loss": 0.87333512, + "num_input_tokens_seen": 114701008, + "router_z_loss_mlp": 0.34375, + "step": 1384, + "time_per_iteration": 2.72265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108718, + "balance_loss_mlp": 1.05280018, + "epoch": 0.2664486340900346, + "flos": 552107598336.0, + "grad_norm": 0.07312693577598182, + "language_loss": 0.87888551, + "learning_rate": 0.0008604240992650534, + "loss": 0.88975734, + "num_input_tokens_seen": 114771984, + "router_z_loss_mlp": 0.34423828, + "step": 1385, + "time_per_iteration": 2.6524593830108643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079729, + "balance_loss_mlp": 1.0455637, + "epoch": 0.2666410157752982, + "flos": 469895233536.0, + "grad_norm": 0.058731941016447735, + "language_loss": 0.89070451, + "learning_rate": 0.0008602081015274545, + "loss": 0.90150183, + "num_input_tokens_seen": 114844800, + "router_z_loss_mlp": 0.34179688, + "step": 1386, + "time_per_iteration": 2.7026963233947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083917, + "balance_loss_mlp": 1.04953694, + "epoch": 0.2668333974605617, + "flos": 569645968896.0, + "grad_norm": 0.04572049987167494, + "language_loss": 0.83031899, + "learning_rate": 0.0008599919639446684, + "loss": 0.84115815, + "num_input_tokens_seen": 114918544, + "router_z_loss_mlp": 0.34423828, + "step": 1387, + "time_per_iteration": 2.6891515254974365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083945, + "balance_loss_mlp": 1.04920745, + "epoch": 0.2670257791458253, + "flos": 398755325952.0, + "grad_norm": 0.06113709644372323, + "language_loss": 0.80263156, + "learning_rate": 0.000859775686600607, + "loss": 0.81347102, + "num_input_tokens_seen": 114984272, + "router_z_loss_mlp": 0.34765625, + "step": 1388, + "time_per_iteration": 2.5367043018341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088195, + "balance_loss_mlp": 1.05400586, + "epoch": 0.2672181608310889, + "flos": 515587534848.0, + "grad_norm": 0.07715457421599592, + "language_loss": 0.85045016, + "learning_rate": 0.0008595592695792367, + "loss": 0.86133218, + "num_input_tokens_seen": 115054800, + "router_z_loss_mlp": 0.34228516, + "step": 1389, + "time_per_iteration": 2.653684139251709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097788, + "balance_loss_mlp": 1.06348014, + "epoch": 0.26741054251635243, + "flos": 507270864384.0, + "grad_norm": 0.05276083290405683, + "language_loss": 0.9085412, + "learning_rate": 0.0008593427129645778, + "loss": 0.91951907, + "num_input_tokens_seen": 115120928, + "router_z_loss_mlp": 0.34350586, + "step": 1390, + "time_per_iteration": 2.5497426986694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100169, + "balance_loss_mlp": 1.06512117, + "epoch": 0.267602924201616, + "flos": 576357783552.0, + "grad_norm": 0.0907689109524766, + "language_loss": 0.85371816, + "learning_rate": 0.0008591260168407052, + "loss": 0.86471987, + "num_input_tokens_seen": 115196688, + "router_z_loss_mlp": 0.35083008, + "step": 1391, + "time_per_iteration": 2.752777576446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099505, + "balance_loss_mlp": 1.06598353, + "epoch": 0.26779530588687955, + "flos": 523731087360.0, + "grad_norm": 0.05201595058269412, + "language_loss": 0.83216429, + "learning_rate": 0.0008589091812917479, + "loss": 0.84315932, + "num_input_tokens_seen": 115264912, + "router_z_loss_mlp": 0.33544922, + "step": 1392, + "time_per_iteration": 2.602858781814575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092875, + "balance_loss_mlp": 1.05897164, + "epoch": 0.26798768757214314, + "flos": 556508938752.0, + "grad_norm": 0.054199587407170555, + "language_loss": 0.85476619, + "learning_rate": 0.0008586922064018887, + "loss": 0.86569488, + "num_input_tokens_seen": 115334672, + "router_z_loss_mlp": 0.33911133, + "step": 1393, + "time_per_iteration": 2.663135528564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110003, + "balance_loss_mlp": 1.0641005, + "epoch": 0.2681800692574067, + "flos": 930246004224.0, + "grad_norm": 0.05606615550920643, + "language_loss": 0.89228028, + "learning_rate": 0.0008584750922553651, + "loss": 0.90328062, + "num_input_tokens_seen": 115420032, + "router_z_loss_mlp": 0.35961914, + "step": 1394, + "time_per_iteration": 3.126030206680298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094414, + "balance_loss_mlp": 1.06020141, + "epoch": 0.26837245094267026, + "flos": 700771931136.0, + "grad_norm": 0.055333821001128054, + "language_loss": 0.83724457, + "learning_rate": 0.0008582578389364677, + "loss": 0.84818876, + "num_input_tokens_seen": 115492576, + "router_z_loss_mlp": 0.3425293, + "step": 1395, + "time_per_iteration": 2.858774423599243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096924, + "balance_loss_mlp": 1.06187642, + "epoch": 0.26856483262793385, + "flos": 592892199936.0, + "grad_norm": 0.04773968262798697, + "language_loss": 0.9195987, + "learning_rate": 0.0008580404465295422, + "loss": 0.93056792, + "num_input_tokens_seen": 115568368, + "router_z_loss_mlp": 0.35058594, + "step": 1396, + "time_per_iteration": 2.7737646102905273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096258, + "balance_loss_mlp": 1.06190252, + "epoch": 0.2687572143131974, + "flos": 713943866880.0, + "grad_norm": 0.07288281155022573, + "language_loss": 0.88208908, + "learning_rate": 0.0008578229151189876, + "loss": 0.89305162, + "num_input_tokens_seen": 115651536, + "router_z_loss_mlp": 0.34375, + "step": 1397, + "time_per_iteration": 2.9974029064178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087674, + "balance_loss_mlp": 1.05441451, + "epoch": 0.26894959599846097, + "flos": 467481452544.0, + "grad_norm": 0.0581153622766974, + "language_loss": 0.81433654, + "learning_rate": 0.0008576052447892573, + "loss": 0.82521319, + "num_input_tokens_seen": 115715696, + "router_z_loss_mlp": 0.33276367, + "step": 1398, + "time_per_iteration": 2.586427688598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090085, + "balance_loss_mlp": 1.05551457, + "epoch": 0.2691419776837245, + "flos": 468470850048.0, + "grad_norm": 0.08083264737918114, + "language_loss": 0.86589479, + "learning_rate": 0.000857387435624858, + "loss": 0.87679559, + "num_input_tokens_seen": 115780928, + "router_z_loss_mlp": 0.34619141, + "step": 1399, + "time_per_iteration": 2.5227789878845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096966, + "balance_loss_mlp": 1.06239533, + "epoch": 0.2693343593689881, + "flos": 937244418048.0, + "grad_norm": 0.0443934808912178, + "language_loss": 0.88525635, + "learning_rate": 0.0008571694877103513, + "loss": 0.89622605, + "num_input_tokens_seen": 115874432, + "router_z_loss_mlp": 0.34594727, + "step": 1400, + "time_per_iteration": 3.252573251724243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095335, + "balance_loss_mlp": 1.06064546, + "epoch": 0.2695267410542516, + "flos": 577303511040.0, + "grad_norm": 0.05297583192015558, + "language_loss": 0.87603962, + "learning_rate": 0.0008569514011303515, + "loss": 0.88699305, + "num_input_tokens_seen": 115956608, + "router_z_loss_mlp": 0.34716797, + "step": 1401, + "time_per_iteration": 2.7824223041534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092716, + "balance_loss_mlp": 1.0577879, + "epoch": 0.2697191227395152, + "flos": 556539462144.0, + "grad_norm": 0.06414718170709632, + "language_loss": 0.87859815, + "learning_rate": 0.0008567331759695277, + "loss": 0.88952529, + "num_input_tokens_seen": 116031728, + "router_z_loss_mlp": 0.34985352, + "step": 1402, + "time_per_iteration": 2.7109498977661133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098142, + "balance_loss_mlp": 1.06183052, + "epoch": 0.26991150442477874, + "flos": 529024310784.0, + "grad_norm": 0.05462837975359106, + "language_loss": 0.86148876, + "learning_rate": 0.0008565148123126023, + "loss": 0.87247014, + "num_input_tokens_seen": 116104288, + "router_z_loss_mlp": 0.36328125, + "step": 1403, + "time_per_iteration": 2.6526310443878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088301, + "balance_loss_mlp": 1.05289555, + "epoch": 0.2701038861100423, + "flos": 531737837568.0, + "grad_norm": 0.12276519374226595, + "language_loss": 0.86177158, + "learning_rate": 0.0008562963102443516, + "loss": 0.87265456, + "num_input_tokens_seen": 116177920, + "router_z_loss_mlp": 0.35400391, + "step": 1404, + "time_per_iteration": 2.6809849739074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092814, + "balance_loss_mlp": 1.05757618, + "epoch": 0.2702962677953059, + "flos": 734908737024.0, + "grad_norm": 0.05743337882617235, + "language_loss": 0.85265231, + "learning_rate": 0.0008560776698496056, + "loss": 0.86358047, + "num_input_tokens_seen": 116251680, + "router_z_loss_mlp": 0.3527832, + "step": 1405, + "time_per_iteration": 2.9008774757385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099947, + "balance_loss_mlp": 1.06420779, + "epoch": 0.27048864948056944, + "flos": 574453181952.0, + "grad_norm": 0.06281004106283315, + "language_loss": 0.85864103, + "learning_rate": 0.0008558588912132481, + "loss": 0.86964047, + "num_input_tokens_seen": 116327664, + "router_z_loss_mlp": 0.35742188, + "step": 1406, + "time_per_iteration": 2.8967840671539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057025, + "balance_loss_mlp": 1.04519963, + "epoch": 0.27068103116583303, + "flos": 1423091953152.0, + "grad_norm": 0.03126478371356873, + "language_loss": 0.76458991, + "learning_rate": 0.0008556399744202163, + "loss": 0.77516007, + "num_input_tokens_seen": 116555152, + "router_z_loss_mlp": 0.11816406, + "step": 1407, + "time_per_iteration": 4.933698892593384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109874, + "balance_loss_mlp": 1.06400251, + "epoch": 0.27087341285109656, + "flos": 531742219776.0, + "grad_norm": 0.050597666424933845, + "language_loss": 0.82942683, + "learning_rate": 0.0008554209195555016, + "loss": 0.84041423, + "num_input_tokens_seen": 116626016, + "router_z_loss_mlp": 0.34765625, + "step": 1408, + "time_per_iteration": 2.6599888801574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093614, + "balance_loss_mlp": 1.0582329, + "epoch": 0.27106579453636015, + "flos": 581108332032.0, + "grad_norm": 0.058412744436649705, + "language_loss": 0.88199335, + "learning_rate": 0.0008552017267041483, + "loss": 0.89292949, + "num_input_tokens_seen": 116699152, + "router_z_loss_mlp": 0.35375977, + "step": 1409, + "time_per_iteration": 2.673828363418579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091257, + "balance_loss_mlp": 1.05563736, + "epoch": 0.2712581762216237, + "flos": 506533160448.0, + "grad_norm": 0.05246666988179206, + "language_loss": 0.83264577, + "learning_rate": 0.0008549823959512549, + "loss": 0.84355831, + "num_input_tokens_seen": 116770912, + "router_z_loss_mlp": 0.35644531, + "step": 1410, + "time_per_iteration": 2.634523868560791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091517, + "balance_loss_mlp": 1.05451441, + "epoch": 0.27145055790688727, + "flos": 997019476992.0, + "grad_norm": 0.050905982394943275, + "language_loss": 0.86668658, + "learning_rate": 0.0008547629273819728, + "loss": 0.87760168, + "num_input_tokens_seen": 116863088, + "router_z_loss_mlp": 0.36987305, + "step": 1411, + "time_per_iteration": 3.3559322357177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094566, + "balance_loss_mlp": 1.05875564, + "epoch": 0.2716429395921508, + "flos": 546420086784.0, + "grad_norm": 0.06363965087638479, + "language_loss": 0.83773881, + "learning_rate": 0.0008545433210815074, + "loss": 0.84868449, + "num_input_tokens_seen": 116929504, + "router_z_loss_mlp": 0.35839844, + "step": 1412, + "time_per_iteration": 2.607379913330078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109226, + "balance_loss_mlp": 1.05606771, + "epoch": 0.2718353212774144, + "flos": 572954605056.0, + "grad_norm": 0.05881941163427475, + "language_loss": 0.87753916, + "learning_rate": 0.0008543235771351176, + "loss": 0.88846171, + "num_input_tokens_seen": 117004064, + "router_z_loss_mlp": 0.36230469, + "step": 1413, + "time_per_iteration": 2.722318649291992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096323, + "balance_loss_mlp": 1.06118035, + "epoch": 0.272027702962678, + "flos": 643986823680.0, + "grad_norm": 0.044269909609048815, + "language_loss": 0.84649068, + "learning_rate": 0.0008541036956281154, + "loss": 0.85745388, + "num_input_tokens_seen": 117081328, + "router_z_loss_mlp": 0.3515625, + "step": 1414, + "time_per_iteration": 2.8785104751586914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100326, + "balance_loss_mlp": 1.0645628, + "epoch": 0.2722200846479415, + "flos": 653410755072.0, + "grad_norm": 0.0658433573318433, + "language_loss": 0.82281864, + "learning_rate": 0.0008538836766458665, + "loss": 0.83382189, + "num_input_tokens_seen": 117156544, + "router_z_loss_mlp": 0.35791016, + "step": 1415, + "time_per_iteration": 2.834148645401001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098566, + "balance_loss_mlp": 1.06311321, + "epoch": 0.2724124663332051, + "flos": 579346324992.0, + "grad_norm": 0.07330345680392343, + "language_loss": 0.85275221, + "learning_rate": 0.0008536635202737897, + "loss": 0.86373788, + "num_input_tokens_seen": 117230208, + "router_z_loss_mlp": 0.35473633, + "step": 1416, + "time_per_iteration": 2.7886626720428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099993, + "balance_loss_mlp": 1.06513667, + "epoch": 0.2726048480184686, + "flos": 537178037760.0, + "grad_norm": 0.06667202152625065, + "language_loss": 0.82212454, + "learning_rate": 0.0008534432265973573, + "loss": 0.8331244, + "num_input_tokens_seen": 117298080, + "router_z_loss_mlp": 0.34912109, + "step": 1417, + "time_per_iteration": 2.604626417160034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095149, + "balance_loss_mlp": 1.05931497, + "epoch": 0.2727972297037322, + "flos": 995360776704.0, + "grad_norm": 0.08172035912068172, + "language_loss": 0.88052338, + "learning_rate": 0.000853222795702095, + "loss": 0.8914749, + "num_input_tokens_seen": 117396256, + "router_z_loss_mlp": 0.35839844, + "step": 1418, + "time_per_iteration": 3.391664505004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095275, + "balance_loss_mlp": 1.06125307, + "epoch": 0.27298961138899575, + "flos": 605924513280.0, + "grad_norm": 0.05480231780963067, + "language_loss": 0.83608377, + "learning_rate": 0.0008530022276735813, + "loss": 0.84703648, + "num_input_tokens_seen": 117467936, + "router_z_loss_mlp": 0.34057617, + "step": 1419, + "time_per_iteration": 2.705235004425049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107985, + "balance_loss_mlp": 1.04666257, + "epoch": 0.27318199307425933, + "flos": 529059216384.0, + "grad_norm": 0.054542785174291425, + "language_loss": 0.85957551, + "learning_rate": 0.0008527815225974489, + "loss": 0.87037402, + "num_input_tokens_seen": 117538256, + "router_z_loss_mlp": 0.33203125, + "step": 1420, + "time_per_iteration": 2.654003620147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087931, + "balance_loss_mlp": 1.05395651, + "epoch": 0.2733743747595229, + "flos": 408809272320.0, + "grad_norm": 0.06460584893454492, + "language_loss": 0.88538897, + "learning_rate": 0.0008525606805593829, + "loss": 0.89626825, + "num_input_tokens_seen": 117599488, + "router_z_loss_mlp": 0.33984375, + "step": 1421, + "time_per_iteration": 2.4287912845611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087097, + "balance_loss_mlp": 1.05233574, + "epoch": 0.27356675644478645, + "flos": 515976030720.0, + "grad_norm": 0.055753761808712644, + "language_loss": 0.82379127, + "learning_rate": 0.0008523397016451213, + "loss": 0.8346622, + "num_input_tokens_seen": 117664240, + "router_z_loss_mlp": 0.34814453, + "step": 1422, + "time_per_iteration": 2.5620808601379395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096714, + "balance_loss_mlp": 1.0617379, + "epoch": 0.27375913813005004, + "flos": 1051914539520.0, + "grad_norm": 0.0481984630724129, + "language_loss": 0.87272507, + "learning_rate": 0.0008521185859404564, + "loss": 0.88369215, + "num_input_tokens_seen": 117754768, + "router_z_loss_mlp": 0.34985352, + "step": 1423, + "time_per_iteration": 3.361171245574951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085067, + "balance_loss_mlp": 1.05037737, + "epoch": 0.27395151981531357, + "flos": 624507535872.0, + "grad_norm": 0.05502068897485729, + "language_loss": 0.89717311, + "learning_rate": 0.0008518973335312326, + "loss": 0.90802383, + "num_input_tokens_seen": 117832816, + "router_z_loss_mlp": 0.34716797, + "step": 1424, + "time_per_iteration": 2.7964961528778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088932, + "balance_loss_mlp": 1.05390823, + "epoch": 0.27414390150057716, + "flos": 550112836608.0, + "grad_norm": 0.056708357312241935, + "language_loss": 0.83878243, + "learning_rate": 0.0008516759445033477, + "loss": 0.84967172, + "num_input_tokens_seen": 117899168, + "router_z_loss_mlp": 0.35058594, + "step": 1425, + "time_per_iteration": 2.6100170612335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090578, + "balance_loss_mlp": 1.05603087, + "epoch": 0.2743362831858407, + "flos": 539596200960.0, + "grad_norm": 0.061048707375716146, + "language_loss": 0.84983361, + "learning_rate": 0.0008514544189427526, + "loss": 0.86073935, + "num_input_tokens_seen": 117972384, + "router_z_loss_mlp": 0.34594727, + "step": 1426, + "time_per_iteration": 2.6465015411376953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088002, + "balance_loss_mlp": 1.05347919, + "epoch": 0.2745286648711043, + "flos": 468352986624.0, + "grad_norm": 0.061383055639382046, + "language_loss": 0.8704657, + "learning_rate": 0.0008512327569354511, + "loss": 0.88134569, + "num_input_tokens_seen": 118039584, + "router_z_loss_mlp": 0.34570312, + "step": 1427, + "time_per_iteration": 2.5696229934692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087297, + "balance_loss_mlp": 1.05212998, + "epoch": 0.2747210465563678, + "flos": 472617524736.0, + "grad_norm": 0.05941983459852813, + "language_loss": 0.8349936, + "learning_rate": 0.0008510109585675001, + "loss": 0.84586656, + "num_input_tokens_seen": 118108352, + "router_z_loss_mlp": 0.35180664, + "step": 1428, + "time_per_iteration": 2.6195123195648193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086947, + "balance_loss_mlp": 1.07245111, + "epoch": 0.2749134282416314, + "flos": 1314345070080.0, + "grad_norm": 0.037284634304165044, + "language_loss": 0.81153345, + "learning_rate": 0.0008507890239250093, + "loss": 0.82240289, + "num_input_tokens_seen": 118331120, + "router_z_loss_mlp": 0.14453125, + "step": 1429, + "time_per_iteration": 4.714681625366211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083808, + "balance_loss_mlp": 1.04938078, + "epoch": 0.275105809926895, + "flos": 970445670912.0, + "grad_norm": 0.07686972857934649, + "language_loss": 0.80942416, + "learning_rate": 0.0008505669530941415, + "loss": 0.82026225, + "num_input_tokens_seen": 118415872, + "router_z_loss_mlp": 0.34472656, + "step": 1430, + "time_per_iteration": 3.2975006103515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080869, + "balance_loss_mlp": 1.04715657, + "epoch": 0.2752981916121585, + "flos": 527089185792.0, + "grad_norm": 0.061626933195079385, + "language_loss": 0.84357536, + "learning_rate": 0.000850344746161112, + "loss": 0.85438406, + "num_input_tokens_seen": 118483008, + "router_z_loss_mlp": 0.33740234, + "step": 1431, + "time_per_iteration": 2.596623182296753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079512, + "balance_loss_mlp": 1.04487014, + "epoch": 0.2754905732974221, + "flos": 453487444992.0, + "grad_norm": 0.05883177646218185, + "language_loss": 0.87880194, + "learning_rate": 0.0008501224032121894, + "loss": 0.88959706, + "num_input_tokens_seen": 118545840, + "router_z_loss_mlp": 0.34692383, + "step": 1432, + "time_per_iteration": 2.5134201049804688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082699, + "balance_loss_mlp": 1.04817569, + "epoch": 0.27568295498268564, + "flos": 497216918016.0, + "grad_norm": 0.05235854639463291, + "language_loss": 0.82002538, + "learning_rate": 0.0008498999243336946, + "loss": 0.83085239, + "num_input_tokens_seen": 118615168, + "router_z_loss_mlp": 0.34570312, + "step": 1433, + "time_per_iteration": 2.601771593093872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095225, + "balance_loss_mlp": 1.06060696, + "epoch": 0.2758753366679492, + "flos": 607890161664.0, + "grad_norm": 0.05891633941102979, + "language_loss": 0.87444806, + "learning_rate": 0.0008496773096120021, + "loss": 0.8854003, + "num_input_tokens_seen": 118690384, + "router_z_loss_mlp": 0.34643555, + "step": 1434, + "time_per_iteration": 2.788516044616699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096763, + "balance_loss_mlp": 1.06212115, + "epoch": 0.27606771835321275, + "flos": 739803290112.0, + "grad_norm": 0.06770297286276174, + "language_loss": 0.84185004, + "learning_rate": 0.0008494545591335381, + "loss": 0.85281765, + "num_input_tokens_seen": 118763024, + "router_z_loss_mlp": 0.34667969, + "step": 1435, + "time_per_iteration": 2.8759751319885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110127, + "balance_loss_mlp": 1.06736696, + "epoch": 0.27626010003847634, + "flos": 554279860224.0, + "grad_norm": 0.04450223786838935, + "language_loss": 0.87244952, + "learning_rate": 0.0008492316729847823, + "loss": 0.88346225, + "num_input_tokens_seen": 118845536, + "router_z_loss_mlp": 0.33935547, + "step": 1436, + "time_per_iteration": 2.781244993209839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094118, + "balance_loss_mlp": 1.06023908, + "epoch": 0.2764524817237399, + "flos": 542270439936.0, + "grad_norm": 0.055325808882979444, + "language_loss": 0.79874223, + "learning_rate": 0.0008490086512522664, + "loss": 0.80968338, + "num_input_tokens_seen": 118919008, + "router_z_loss_mlp": 0.33911133, + "step": 1437, + "time_per_iteration": 2.7197165489196777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093138, + "balance_loss_mlp": 1.05913925, + "epoch": 0.27664486340900346, + "flos": 406027344384.0, + "grad_norm": 0.0539948754920925, + "language_loss": 0.90713239, + "learning_rate": 0.0008487854940225755, + "loss": 0.91806382, + "num_input_tokens_seen": 118981376, + "router_z_loss_mlp": 0.34008789, + "step": 1438, + "time_per_iteration": 2.438218593597412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094602, + "balance_loss_mlp": 1.06017423, + "epoch": 0.27683724509426705, + "flos": 521884712448.0, + "grad_norm": 0.06140365718889793, + "language_loss": 0.90140885, + "learning_rate": 0.0008485622013823466, + "loss": 0.91235483, + "num_input_tokens_seen": 119050560, + "router_z_loss_mlp": 0.34448242, + "step": 1439, + "time_per_iteration": 2.653393268585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102393, + "balance_loss_mlp": 1.06770289, + "epoch": 0.2770296267795306, + "flos": 535085761536.0, + "grad_norm": 0.07554461134761571, + "language_loss": 0.8283006, + "learning_rate": 0.00084833877341827, + "loss": 0.83932453, + "num_input_tokens_seen": 119121104, + "router_z_loss_mlp": 0.34692383, + "step": 1440, + "time_per_iteration": 2.6312928199768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109947, + "balance_loss_mlp": 1.06587648, + "epoch": 0.27722200846479417, + "flos": 487747906560.0, + "grad_norm": 0.12145939933268801, + "language_loss": 0.8064183, + "learning_rate": 0.000848115210217088, + "loss": 0.81741297, + "num_input_tokens_seen": 119187712, + "router_z_loss_mlp": 0.33618164, + "step": 1441, + "time_per_iteration": 2.5490710735321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086774, + "balance_loss_mlp": 1.05167925, + "epoch": 0.2774143901500577, + "flos": 618012509184.0, + "grad_norm": 0.05766268366580332, + "language_loss": 0.82057106, + "learning_rate": 0.0008478915118655952, + "loss": 0.83143878, + "num_input_tokens_seen": 119259264, + "router_z_loss_mlp": 0.35131836, + "step": 1442, + "time_per_iteration": 2.710240602493286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086605, + "balance_loss_mlp": 1.05160558, + "epoch": 0.2776067718353213, + "flos": 513563659776.0, + "grad_norm": 0.05774564569051742, + "language_loss": 0.86505657, + "learning_rate": 0.0008476676784506393, + "loss": 0.87592262, + "num_input_tokens_seen": 119328304, + "router_z_loss_mlp": 0.35009766, + "step": 1443, + "time_per_iteration": 2.636622667312622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108686, + "balance_loss_mlp": 1.0526228, + "epoch": 0.2777991535205848, + "flos": 1003985957376.0, + "grad_norm": 0.10311001825576924, + "language_loss": 0.82024419, + "learning_rate": 0.0008474437100591201, + "loss": 0.8311128, + "num_input_tokens_seen": 119412352, + "router_z_loss_mlp": 0.34277344, + "step": 1444, + "time_per_iteration": 3.282383918762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091274, + "balance_loss_mlp": 1.05517697, + "epoch": 0.2779915352058484, + "flos": 550005147648.0, + "grad_norm": 0.05151300271624721, + "language_loss": 0.85496646, + "learning_rate": 0.0008472196067779898, + "loss": 0.86587918, + "num_input_tokens_seen": 119484464, + "router_z_loss_mlp": 0.36108398, + "step": 1445, + "time_per_iteration": 2.703263998031616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092087, + "balance_loss_mlp": 1.05591917, + "epoch": 0.278183916891112, + "flos": 873444930048.0, + "grad_norm": 0.06736388569990436, + "language_loss": 0.85432607, + "learning_rate": 0.0008469953686942531, + "loss": 0.86524689, + "num_input_tokens_seen": 119557280, + "router_z_loss_mlp": 0.36181641, + "step": 1446, + "time_per_iteration": 3.0834743976593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087224, + "balance_loss_mlp": 1.05305839, + "epoch": 0.2783762985763755, + "flos": 623782978560.0, + "grad_norm": 0.06536474240751361, + "language_loss": 0.83167183, + "learning_rate": 0.0008467709958949668, + "loss": 0.84254414, + "num_input_tokens_seen": 119631232, + "router_z_loss_mlp": 0.34204102, + "step": 1447, + "time_per_iteration": 2.737135887145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087199, + "balance_loss_mlp": 1.05362952, + "epoch": 0.2785686802616391, + "flos": 581571021312.0, + "grad_norm": 0.057056565872365954, + "language_loss": 0.85917461, + "learning_rate": 0.0008465464884672403, + "loss": 0.87004662, + "num_input_tokens_seen": 119700224, + "router_z_loss_mlp": 0.33569336, + "step": 1448, + "time_per_iteration": 2.6771810054779053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087224, + "balance_loss_mlp": 1.05346394, + "epoch": 0.27876106194690264, + "flos": 587032980480.0, + "grad_norm": 0.06237565084734976, + "language_loss": 0.85356677, + "learning_rate": 0.0008463218464982348, + "loss": 0.86443901, + "num_input_tokens_seen": 119781376, + "router_z_loss_mlp": 0.33789062, + "step": 1449, + "time_per_iteration": 2.799407720565796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086336, + "balance_loss_mlp": 1.05228984, + "epoch": 0.27895344363216623, + "flos": 875621574144.0, + "grad_norm": 0.06450477685794259, + "language_loss": 0.87800258, + "learning_rate": 0.0008460970700751645, + "loss": 0.88886595, + "num_input_tokens_seen": 119856672, + "router_z_loss_mlp": 0.34057617, + "step": 1450, + "time_per_iteration": 3.0517759323120117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086942, + "balance_loss_mlp": 1.05322921, + "epoch": 0.27914582531742976, + "flos": 603630005760.0, + "grad_norm": 0.06893143963997089, + "language_loss": 0.87761652, + "learning_rate": 0.000845872159285295, + "loss": 0.88848597, + "num_input_tokens_seen": 119929008, + "router_z_loss_mlp": 0.33740234, + "step": 1451, + "time_per_iteration": 2.6964316368103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042481, + "balance_loss_mlp": 1.0276041, + "epoch": 0.27933820700269335, + "flos": 1496892953088.0, + "grad_norm": 0.0242162718076618, + "language_loss": 0.77766848, + "learning_rate": 0.0008456471142159447, + "loss": 0.78809333, + "num_input_tokens_seen": 120164032, + "router_z_loss_mlp": 0.1484375, + "step": 1452, + "time_per_iteration": 4.936378717422485 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085289, + "balance_loss_mlp": 1.05162406, + "epoch": 0.2795305886879569, + "flos": 1031445854208.0, + "grad_norm": 0.05721806240601363, + "language_loss": 0.86067116, + "learning_rate": 0.0008454219349544836, + "loss": 0.87152404, + "num_input_tokens_seen": 120246784, + "router_z_loss_mlp": 0.33691406, + "step": 1453, + "time_per_iteration": 3.336336135864258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086797, + "balance_loss_mlp": 1.053442, + "epoch": 0.27972297037322047, + "flos": 606766934016.0, + "grad_norm": 0.056433536115445035, + "language_loss": 0.81829166, + "learning_rate": 0.000845196621588334, + "loss": 0.82915968, + "num_input_tokens_seen": 120318208, + "router_z_loss_mlp": 0.33374023, + "step": 1454, + "time_per_iteration": 2.7415192127227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088727, + "balance_loss_mlp": 1.05475271, + "epoch": 0.27991535205848406, + "flos": 630085948416.0, + "grad_norm": 0.05700257056170363, + "language_loss": 0.76605666, + "learning_rate": 0.0008449711742049706, + "loss": 0.77694392, + "num_input_tokens_seen": 120393248, + "router_z_loss_mlp": 0.33984375, + "step": 1455, + "time_per_iteration": 2.755082130432129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093567, + "balance_loss_mlp": 1.06035542, + "epoch": 0.2801077337437476, + "flos": 549034689024.0, + "grad_norm": 0.056412270826162, + "language_loss": 0.83750427, + "learning_rate": 0.0008447455928919196, + "loss": 0.84843993, + "num_input_tokens_seen": 120461040, + "router_z_loss_mlp": 0.33227539, + "step": 1456, + "time_per_iteration": 2.601306438446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097544, + "balance_loss_mlp": 1.06423688, + "epoch": 0.2803001154290112, + "flos": 486516989952.0, + "grad_norm": 0.08664389404831466, + "language_loss": 0.86875856, + "learning_rate": 0.0008445198777367595, + "loss": 0.87973404, + "num_input_tokens_seen": 120530400, + "router_z_loss_mlp": 0.33325195, + "step": 1457, + "time_per_iteration": 2.5534915924072266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106361, + "balance_loss_mlp": 1.07267249, + "epoch": 0.2804924971142747, + "flos": 521820693504.0, + "grad_norm": 0.060155581105879694, + "language_loss": 0.8096568, + "learning_rate": 0.0008442940288271208, + "loss": 0.82072043, + "num_input_tokens_seen": 120598304, + "router_z_loss_mlp": 0.33691406, + "step": 1458, + "time_per_iteration": 2.6646370887756348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108311, + "balance_loss_mlp": 1.07459903, + "epoch": 0.2806848787995383, + "flos": 527410690560.0, + "grad_norm": 0.05492641307724046, + "language_loss": 0.86995763, + "learning_rate": 0.0008440680462506856, + "loss": 0.88104069, + "num_input_tokens_seen": 120675712, + "router_z_loss_mlp": 0.33740234, + "step": 1459, + "time_per_iteration": 2.793306589126587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103222, + "balance_loss_mlp": 1.06927133, + "epoch": 0.2808772604848018, + "flos": 485246785536.0, + "grad_norm": 0.053370474172872176, + "language_loss": 0.86799729, + "learning_rate": 0.0008438419300951883, + "loss": 0.87902945, + "num_input_tokens_seen": 120746544, + "router_z_loss_mlp": 0.33984375, + "step": 1460, + "time_per_iteration": 2.6732945442199707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098403, + "balance_loss_mlp": 1.06488156, + "epoch": 0.2810696421700654, + "flos": 617840801280.0, + "grad_norm": 0.06081455520295947, + "language_loss": 0.86599934, + "learning_rate": 0.0008436156804484148, + "loss": 0.87698334, + "num_input_tokens_seen": 120823520, + "router_z_loss_mlp": 0.33544922, + "step": 1461, + "time_per_iteration": 2.768385410308838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087899, + "balance_loss_mlp": 1.05397177, + "epoch": 0.28126202385532895, + "flos": 454521922560.0, + "grad_norm": 0.061036272851527865, + "language_loss": 0.88221931, + "learning_rate": 0.0008433892973982031, + "loss": 0.89309829, + "num_input_tokens_seen": 120889568, + "router_z_loss_mlp": 0.33959961, + "step": 1462, + "time_per_iteration": 2.502269983291626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108859, + "balance_loss_mlp": 1.0546149, + "epoch": 0.28145440554059253, + "flos": 530447284224.0, + "grad_norm": 0.06533100110399645, + "language_loss": 0.85006338, + "learning_rate": 0.0008431627810324431, + "loss": 0.86094928, + "num_input_tokens_seen": 120958480, + "router_z_loss_mlp": 0.34008789, + "step": 1463, + "time_per_iteration": 2.6481637954711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097854, + "balance_loss_mlp": 1.06344974, + "epoch": 0.2816467872258561, + "flos": 451996070400.0, + "grad_norm": 0.053948569536927254, + "language_loss": 0.81259125, + "learning_rate": 0.000842936131439076, + "loss": 0.82356977, + "num_input_tokens_seen": 121028032, + "router_z_loss_mlp": 0.34423828, + "step": 1464, + "time_per_iteration": 2.598619222640991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100723, + "balance_loss_mlp": 1.06665313, + "epoch": 0.28183916891111965, + "flos": 472464755712.0, + "grad_norm": 0.06117554261618067, + "language_loss": 0.88043475, + "learning_rate": 0.0008427093487060951, + "loss": 0.89144206, + "num_input_tokens_seen": 121099280, + "router_z_loss_mlp": 0.34106445, + "step": 1465, + "time_per_iteration": 2.611067533493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092883, + "balance_loss_mlp": 1.05917072, + "epoch": 0.28203155059638324, + "flos": 556770806784.0, + "grad_norm": 0.05001896034653533, + "language_loss": 0.84742111, + "learning_rate": 0.000842482432921545, + "loss": 0.85834992, + "num_input_tokens_seen": 121180240, + "router_z_loss_mlp": 0.33740234, + "step": 1466, + "time_per_iteration": 2.8155059814453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109283, + "balance_loss_mlp": 1.05990458, + "epoch": 0.28222393228164677, + "flos": 416756385792.0, + "grad_norm": 0.06017010781955974, + "language_loss": 0.87132335, + "learning_rate": 0.0008422553841735225, + "loss": 0.88225162, + "num_input_tokens_seen": 121242736, + "router_z_loss_mlp": 0.3293457, + "step": 1467, + "time_per_iteration": 2.459348201751709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091932, + "balance_loss_mlp": 1.05731392, + "epoch": 0.28241631396691036, + "flos": 604629577728.0, + "grad_norm": 0.060074700521020694, + "language_loss": 0.84810078, + "learning_rate": 0.0008420282025501757, + "loss": 0.85902011, + "num_input_tokens_seen": 121319248, + "router_z_loss_mlp": 0.34643555, + "step": 1468, + "time_per_iteration": 2.7499678134918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086806, + "balance_loss_mlp": 1.05275989, + "epoch": 0.2826086956521739, + "flos": 572698529280.0, + "grad_norm": 0.05717030328031113, + "language_loss": 0.854882, + "learning_rate": 0.0008418008881397043, + "loss": 0.86575013, + "num_input_tokens_seen": 121392064, + "router_z_loss_mlp": 0.34082031, + "step": 1469, + "time_per_iteration": 2.6512861251831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080943, + "balance_loss_mlp": 1.04677796, + "epoch": 0.2828010773374375, + "flos": 842367886848.0, + "grad_norm": 0.05184982716140645, + "language_loss": 0.82590878, + "learning_rate": 0.0008415734410303595, + "loss": 0.8367182, + "num_input_tokens_seen": 121475984, + "router_z_loss_mlp": 0.34204102, + "step": 1470, + "time_per_iteration": 3.1787467002868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085547, + "balance_loss_mlp": 1.05214453, + "epoch": 0.28299345902270107, + "flos": 542402860032.0, + "grad_norm": 0.04590644458835405, + "language_loss": 0.90709066, + "learning_rate": 0.0008413458613104444, + "loss": 0.9179461, + "num_input_tokens_seen": 121551024, + "router_z_loss_mlp": 0.33447266, + "step": 1471, + "time_per_iteration": 2.6650707721710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092285, + "balance_loss_mlp": 1.05780995, + "epoch": 0.2831858407079646, + "flos": 571320635904.0, + "grad_norm": 0.05367648266979066, + "language_loss": 0.82737631, + "learning_rate": 0.0008411181490683129, + "loss": 0.83829916, + "num_input_tokens_seen": 121624528, + "router_z_loss_mlp": 0.34472656, + "step": 1472, + "time_per_iteration": 2.7423322200775146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104233, + "balance_loss_mlp": 1.06909025, + "epoch": 0.2833782223932282, + "flos": 763491861504.0, + "grad_norm": 0.05498194123694656, + "language_loss": 0.82467097, + "learning_rate": 0.0008408903043923707, + "loss": 0.83571333, + "num_input_tokens_seen": 121706736, + "router_z_loss_mlp": 0.35180664, + "step": 1473, + "time_per_iteration": 2.991787910461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114011, + "balance_loss_mlp": 1.07810485, + "epoch": 0.2835706040784917, + "flos": 538793068032.0, + "grad_norm": 0.05681509946110844, + "language_loss": 0.81401253, + "learning_rate": 0.0008406623273710754, + "loss": 0.82515264, + "num_input_tokens_seen": 121773008, + "router_z_loss_mlp": 0.35913086, + "step": 1474, + "time_per_iteration": 2.58866286277771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112011, + "balance_loss_mlp": 1.07732141, + "epoch": 0.2837629857637553, + "flos": 530329420800.0, + "grad_norm": 0.06008709036576614, + "language_loss": 0.82883334, + "learning_rate": 0.0008404342180929351, + "loss": 0.83995342, + "num_input_tokens_seen": 121840016, + "router_z_loss_mlp": 0.34716797, + "step": 1475, + "time_per_iteration": 2.636383295059204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112842, + "balance_loss_mlp": 1.07834268, + "epoch": 0.28395536744901884, + "flos": 539763526656.0, + "grad_norm": 0.06514959519071023, + "language_loss": 0.81725156, + "learning_rate": 0.00084020597664651, + "loss": 0.82837999, + "num_input_tokens_seen": 121915008, + "router_z_loss_mlp": 0.34521484, + "step": 1476, + "time_per_iteration": 2.7587718963623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106069, + "balance_loss_mlp": 1.0697813, + "epoch": 0.2841477491342824, + "flos": 573344510976.0, + "grad_norm": 0.0608139165355994, + "language_loss": 0.84204602, + "learning_rate": 0.0008399776031204111, + "loss": 0.85310674, + "num_input_tokens_seen": 121987456, + "router_z_loss_mlp": 0.36303711, + "step": 1477, + "time_per_iteration": 2.7376203536987305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097711, + "balance_loss_mlp": 1.06263912, + "epoch": 0.28434013081954596, + "flos": 571802264064.0, + "grad_norm": 0.06275845169868324, + "language_loss": 0.8026123, + "learning_rate": 0.0008397490976033009, + "loss": 0.81358939, + "num_input_tokens_seen": 122058720, + "router_z_loss_mlp": 0.35083008, + "step": 1478, + "time_per_iteration": 2.6391618251800537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0103776, + "balance_loss_mlp": 1.02412283, + "epoch": 0.28453251250480954, + "flos": 1552554832896.0, + "grad_norm": 0.016614249421738093, + "language_loss": 0.77879643, + "learning_rate": 0.000839520460183893, + "loss": 0.78917408, + "num_input_tokens_seen": 122285792, + "router_z_loss_mlp": 0.13671875, + "step": 1479, + "time_per_iteration": 4.730362176895142 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079858, + "balance_loss_mlp": 1.04550207, + "epoch": 0.28472489419007313, + "flos": 748720862208.0, + "grad_norm": 0.04873312803653651, + "language_loss": 0.85529596, + "learning_rate": 0.0008392916909509525, + "loss": 0.86609453, + "num_input_tokens_seen": 122366608, + "router_z_loss_mlp": 0.34399414, + "step": 1480, + "time_per_iteration": 3.0429892539978027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083924, + "balance_loss_mlp": 1.0495913, + "epoch": 0.28491727587533666, + "flos": 489914376192.0, + "grad_norm": 0.056617404906403615, + "language_loss": 0.85149431, + "learning_rate": 0.0008390627899932954, + "loss": 0.86233348, + "num_input_tokens_seen": 122435536, + "router_z_loss_mlp": 0.34375, + "step": 1481, + "time_per_iteration": 2.6355843544006348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083301, + "balance_loss_mlp": 1.04818201, + "epoch": 0.28510965756060025, + "flos": 728671196160.0, + "grad_norm": 0.06013951169928809, + "language_loss": 0.88358414, + "learning_rate": 0.000838833757399789, + "loss": 0.89441717, + "num_input_tokens_seen": 122515584, + "router_z_loss_mlp": 0.3515625, + "step": 1482, + "time_per_iteration": 2.9198856353759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088409, + "balance_loss_mlp": 1.05357623, + "epoch": 0.2853020392458638, + "flos": 551300083200.0, + "grad_norm": 0.06378715850004578, + "language_loss": 0.80512154, + "learning_rate": 0.0008386045932593515, + "loss": 0.81600565, + "num_input_tokens_seen": 122585552, + "router_z_loss_mlp": 0.34887695, + "step": 1483, + "time_per_iteration": 2.665919065475464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087283, + "balance_loss_mlp": 1.05233121, + "epoch": 0.28549442093112737, + "flos": 754456425984.0, + "grad_norm": 0.06049898751226662, + "language_loss": 0.86304945, + "learning_rate": 0.0008383752976609525, + "loss": 0.87392229, + "num_input_tokens_seen": 122658928, + "router_z_loss_mlp": 0.34960938, + "step": 1484, + "time_per_iteration": 2.9113876819610596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081674, + "balance_loss_mlp": 1.04586315, + "epoch": 0.2856868026163909, + "flos": 538311439872.0, + "grad_norm": 0.05363431349597561, + "language_loss": 0.79897112, + "learning_rate": 0.0008381458706936123, + "loss": 0.80978787, + "num_input_tokens_seen": 122729056, + "router_z_loss_mlp": 0.3581543, + "step": 1485, + "time_per_iteration": 2.715182065963745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082792, + "balance_loss_mlp": 1.0470289, + "epoch": 0.2858791843016545, + "flos": 583487207424.0, + "grad_norm": 0.055785658857036256, + "language_loss": 0.8776381, + "learning_rate": 0.0008379163124464025, + "loss": 0.888466, + "num_input_tokens_seen": 122802832, + "router_z_loss_mlp": 0.35766602, + "step": 1486, + "time_per_iteration": 2.713412284851074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083174, + "balance_loss_mlp": 1.04881775, + "epoch": 0.286071565986918, + "flos": 644503357440.0, + "grad_norm": 0.05967072286491994, + "language_loss": 0.76593089, + "learning_rate": 0.0008376866230084452, + "loss": 0.7767626, + "num_input_tokens_seen": 122881328, + "router_z_loss_mlp": 0.34399414, + "step": 1487, + "time_per_iteration": 2.812953472137451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091632, + "balance_loss_mlp": 1.05522537, + "epoch": 0.2862639476721816, + "flos": 491120561664.0, + "grad_norm": 0.06413337589788286, + "language_loss": 0.85965335, + "learning_rate": 0.000837456802468914, + "loss": 0.87056965, + "num_input_tokens_seen": 122949680, + "router_z_loss_mlp": 0.36401367, + "step": 1488, + "time_per_iteration": 2.5974318981170654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096408, + "balance_loss_mlp": 1.06050217, + "epoch": 0.2864563293574452, + "flos": 521363796480.0, + "grad_norm": 0.06049840128310572, + "language_loss": 0.85439187, + "learning_rate": 0.0008372268509170331, + "loss": 0.86535597, + "num_input_tokens_seen": 123024736, + "router_z_loss_mlp": 0.35888672, + "step": 1489, + "time_per_iteration": 2.6646056175231934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100143, + "balance_loss_mlp": 1.06478548, + "epoch": 0.2866487110427087, + "flos": 546834723840.0, + "grad_norm": 0.05582745965585505, + "language_loss": 0.84845203, + "learning_rate": 0.0008369967684420779, + "loss": 0.85945344, + "num_input_tokens_seen": 123097344, + "router_z_loss_mlp": 0.35424805, + "step": 1490, + "time_per_iteration": 2.737180471420288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094742, + "balance_loss_mlp": 1.05902719, + "epoch": 0.2868410927279723, + "flos": 481977437184.0, + "grad_norm": 0.0702351654670911, + "language_loss": 0.84684229, + "learning_rate": 0.0008367665551333736, + "loss": 0.85778964, + "num_input_tokens_seen": 123166240, + "router_z_loss_mlp": 0.35717773, + "step": 1491, + "time_per_iteration": 2.591179847717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095437, + "balance_loss_mlp": 1.05807662, + "epoch": 0.28703347441323585, + "flos": 724578365952.0, + "grad_norm": 0.0690733570245185, + "language_loss": 0.85732669, + "learning_rate": 0.0008365362110802977, + "loss": 0.86828107, + "num_input_tokens_seen": 123238160, + "router_z_loss_mlp": 0.37329102, + "step": 1492, + "time_per_iteration": 2.8586251735687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083578, + "balance_loss_mlp": 1.04733849, + "epoch": 0.28722585609849943, + "flos": 634670581248.0, + "grad_norm": 0.059898504183233336, + "language_loss": 0.82604659, + "learning_rate": 0.0008363057363722773, + "loss": 0.83688229, + "num_input_tokens_seen": 123319504, + "router_z_loss_mlp": 0.36254883, + "step": 1493, + "time_per_iteration": 2.8491427898406982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076927, + "balance_loss_mlp": 1.04264212, + "epoch": 0.28741823778376296, + "flos": 509974216704.0, + "grad_norm": 0.05796804627405179, + "language_loss": 0.84095198, + "learning_rate": 0.0008360751310987906, + "loss": 0.85172129, + "num_input_tokens_seen": 123387008, + "router_z_loss_mlp": 0.34301758, + "step": 1494, + "time_per_iteration": 2.5735158920288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077369, + "balance_loss_mlp": 1.04294157, + "epoch": 0.28761061946902655, + "flos": 603458297856.0, + "grad_norm": 0.07368534281083228, + "language_loss": 0.85552645, + "learning_rate": 0.0008358443953493666, + "loss": 0.86630011, + "num_input_tokens_seen": 123471056, + "router_z_loss_mlp": 0.34472656, + "step": 1495, + "time_per_iteration": 2.8492400646209717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078466, + "balance_loss_mlp": 1.04260767, + "epoch": 0.28780300115429014, + "flos": 406977454080.0, + "grad_norm": 0.061458136593000166, + "language_loss": 0.88553399, + "learning_rate": 0.0008356135292135851, + "loss": 0.89631861, + "num_input_tokens_seen": 123535024, + "router_z_loss_mlp": 0.35864258, + "step": 1496, + "time_per_iteration": 2.499234676361084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109055, + "balance_loss_mlp": 1.05428672, + "epoch": 0.28799538283955367, + "flos": 374726310912.0, + "grad_norm": 0.06023187099093886, + "language_loss": 0.92244387, + "learning_rate": 0.0008353825327810758, + "loss": 0.93334937, + "num_input_tokens_seen": 123596224, + "router_z_loss_mlp": 0.36230469, + "step": 1497, + "time_per_iteration": 2.4068801403045654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083025, + "balance_loss_mlp": 1.04761958, + "epoch": 0.28818776452481726, + "flos": 591645316608.0, + "grad_norm": 0.050935971597675156, + "language_loss": 0.81914794, + "learning_rate": 0.00083515140614152, + "loss": 0.82997811, + "num_input_tokens_seen": 123668640, + "router_z_loss_mlp": 0.35473633, + "step": 1498, + "time_per_iteration": 2.7172293663024902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085079, + "balance_loss_mlp": 1.05012727, + "epoch": 0.2883801462100808, + "flos": 534819511296.0, + "grad_norm": 0.055380500747477406, + "language_loss": 0.86671853, + "learning_rate": 0.0008349201493846485, + "loss": 0.87756932, + "num_input_tokens_seen": 123740816, + "router_z_loss_mlp": 0.34985352, + "step": 1499, + "time_per_iteration": 2.666877508163452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088163, + "balance_loss_mlp": 1.05268669, + "epoch": 0.2885725278953444, + "flos": 479850255360.0, + "grad_norm": 0.06392675802491345, + "language_loss": 0.89344347, + "learning_rate": 0.0008346887626002432, + "loss": 0.90432513, + "num_input_tokens_seen": 123805968, + "router_z_loss_mlp": 0.35473633, + "step": 1500, + "time_per_iteration": 2.547353744506836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081111, + "balance_loss_mlp": 1.04613519, + "epoch": 0.2887649095806079, + "flos": 463798877184.0, + "grad_norm": 0.050375470508879826, + "language_loss": 0.86195928, + "learning_rate": 0.000834457245878137, + "loss": 0.87277037, + "num_input_tokens_seen": 123876576, + "router_z_loss_mlp": 0.35009766, + "step": 1501, + "time_per_iteration": 2.6108102798461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076981, + "balance_loss_mlp": 1.04317355, + "epoch": 0.2889572912658715, + "flos": 930631527936.0, + "grad_norm": 0.05668037017333152, + "language_loss": 0.81365681, + "learning_rate": 0.000834225599308212, + "loss": 0.82442665, + "num_input_tokens_seen": 123967664, + "router_z_loss_mlp": 0.33837891, + "step": 1502, + "time_per_iteration": 3.2222447395324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085232, + "balance_loss_mlp": 1.04994583, + "epoch": 0.28914967295113503, + "flos": 569848200192.0, + "grad_norm": 0.05132223508893719, + "language_loss": 0.85018057, + "learning_rate": 0.0008339938229804016, + "loss": 0.8610329, + "num_input_tokens_seen": 124039680, + "router_z_loss_mlp": 0.35327148, + "step": 1503, + "time_per_iteration": 2.698528289794922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032992, + "balance_loss_mlp": 1.01945031, + "epoch": 0.2893420546363986, + "flos": 1485803119104.0, + "grad_norm": 0.02573157997511775, + "language_loss": 0.75434822, + "learning_rate": 0.0008337619169846895, + "loss": 0.76467812, + "num_input_tokens_seen": 124278848, + "router_z_loss_mlp": 0.13574219, + "step": 1504, + "time_per_iteration": 4.950274467468262 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083767, + "balance_loss_mlp": 1.04793239, + "epoch": 0.2895344363216622, + "flos": 469938903552.0, + "grad_norm": 0.06568085425348943, + "language_loss": 0.84119928, + "learning_rate": 0.0008335298814111094, + "loss": 0.85203701, + "num_input_tokens_seen": 124346736, + "router_z_loss_mlp": 0.35864258, + "step": 1505, + "time_per_iteration": 2.542043924331665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085418, + "balance_loss_mlp": 1.05089498, + "epoch": 0.28972681800692573, + "flos": 647909508096.0, + "grad_norm": 0.06591449016003405, + "language_loss": 0.87860626, + "learning_rate": 0.0008332977163497455, + "loss": 0.88946044, + "num_input_tokens_seen": 124420816, + "router_z_loss_mlp": 0.34570312, + "step": 1506, + "time_per_iteration": 2.810399293899536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087351, + "balance_loss_mlp": 1.05163622, + "epoch": 0.2899191996921893, + "flos": 571955033088.0, + "grad_norm": 0.054529888005095714, + "language_loss": 0.83185649, + "learning_rate": 0.0008330654218907325, + "loss": 0.84272999, + "num_input_tokens_seen": 124490480, + "router_z_loss_mlp": 0.35742188, + "step": 1507, + "time_per_iteration": 2.6968414783477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084817, + "balance_loss_mlp": 1.04981744, + "epoch": 0.29011158137745285, + "flos": 661037773824.0, + "grad_norm": 0.1280653735040032, + "language_loss": 0.81777966, + "learning_rate": 0.0008328329981242548, + "loss": 0.82862782, + "num_input_tokens_seen": 124564960, + "router_z_loss_mlp": 0.3503418, + "step": 1508, + "time_per_iteration": 2.886396884918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089679, + "balance_loss_mlp": 1.05441689, + "epoch": 0.29030396306271644, + "flos": 535933974528.0, + "grad_norm": 0.060234790533374126, + "language_loss": 0.87937206, + "learning_rate": 0.0008326004451405475, + "loss": 0.89026886, + "num_input_tokens_seen": 124637424, + "router_z_loss_mlp": 0.3527832, + "step": 1509, + "time_per_iteration": 2.7797772884368896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096033, + "balance_loss_mlp": 1.06148589, + "epoch": 0.29049634474798, + "flos": 511707110400.0, + "grad_norm": 0.05385470227261208, + "language_loss": 0.82548428, + "learning_rate": 0.0008323677630298957, + "loss": 0.83644462, + "num_input_tokens_seen": 124704832, + "router_z_loss_mlp": 0.34594727, + "step": 1510, + "time_per_iteration": 2.5542855262756348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093546, + "balance_loss_mlp": 1.05892766, + "epoch": 0.29068872643324356, + "flos": 613454017536.0, + "grad_norm": 0.05556182475666109, + "language_loss": 0.85001689, + "learning_rate": 0.0008321349518826345, + "loss": 0.86095232, + "num_input_tokens_seen": 124779600, + "router_z_loss_mlp": 0.34643555, + "step": 1511, + "time_per_iteration": 2.7849388122558594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093876, + "balance_loss_mlp": 1.05878115, + "epoch": 0.2908811081185071, + "flos": 546164011008.0, + "grad_norm": 0.07046084545113683, + "language_loss": 0.94823933, + "learning_rate": 0.0008319020117891491, + "loss": 0.95917809, + "num_input_tokens_seen": 124844128, + "router_z_loss_mlp": 0.35131836, + "step": 1512, + "time_per_iteration": 2.5936031341552734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090975, + "balance_loss_mlp": 1.05485463, + "epoch": 0.2910734898037707, + "flos": 604516096512.0, + "grad_norm": 0.06307487928884016, + "language_loss": 0.87063539, + "learning_rate": 0.0008316689428398751, + "loss": 0.88154513, + "num_input_tokens_seen": 124915376, + "router_z_loss_mlp": 0.36108398, + "step": 1513, + "time_per_iteration": 2.6774067878723145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082761, + "balance_loss_mlp": 1.04792798, + "epoch": 0.29126587148903427, + "flos": 574383370752.0, + "grad_norm": 0.043578668947666564, + "language_loss": 0.88254529, + "learning_rate": 0.0008314357451252979, + "loss": 0.89337289, + "num_input_tokens_seen": 124995504, + "router_z_loss_mlp": 0.34887695, + "step": 1514, + "time_per_iteration": 2.7561941146850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086548, + "balance_loss_mlp": 1.05092812, + "epoch": 0.2914582531742978, + "flos": 570802692096.0, + "grad_norm": 0.06240160889449628, + "language_loss": 0.87564558, + "learning_rate": 0.0008312024187359527, + "loss": 0.88651109, + "num_input_tokens_seen": 125064192, + "router_z_loss_mlp": 0.35644531, + "step": 1515, + "time_per_iteration": 2.636056900024414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084088, + "balance_loss_mlp": 1.04942179, + "epoch": 0.2916506348595614, + "flos": 730523363328.0, + "grad_norm": 0.06185972429295104, + "language_loss": 0.87361014, + "learning_rate": 0.000830968963762425, + "loss": 0.88445103, + "num_input_tokens_seen": 125150560, + "router_z_loss_mlp": 0.34716797, + "step": 1516, + "time_per_iteration": 3.021732807159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091474, + "balance_loss_mlp": 1.05528235, + "epoch": 0.2918430165448249, + "flos": 510220118016.0, + "grad_norm": 0.05583453201751925, + "language_loss": 0.83947027, + "learning_rate": 0.0008307353802953497, + "loss": 0.85038507, + "num_input_tokens_seen": 125219264, + "router_z_loss_mlp": 0.36206055, + "step": 1517, + "time_per_iteration": 2.6659955978393555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100836, + "balance_loss_mlp": 1.06318951, + "epoch": 0.2920353982300885, + "flos": 630096122880.0, + "grad_norm": 0.04472517729516854, + "language_loss": 0.86110896, + "learning_rate": 0.0008305016684254125, + "loss": 0.87211728, + "num_input_tokens_seen": 125301904, + "router_z_loss_mlp": 0.37646484, + "step": 1518, + "time_per_iteration": 2.7896409034729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098986, + "balance_loss_mlp": 1.06153059, + "epoch": 0.29222777991535204, + "flos": 501411644928.0, + "grad_norm": 0.055409034097420505, + "language_loss": 0.86932153, + "learning_rate": 0.0008302678282433479, + "loss": 0.88031137, + "num_input_tokens_seen": 125367712, + "router_z_loss_mlp": 0.37451172, + "step": 1519, + "time_per_iteration": 2.585256814956665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095676, + "balance_loss_mlp": 1.05891216, + "epoch": 0.2924201616006156, + "flos": 486522782208.0, + "grad_norm": 0.057505891705300044, + "language_loss": 0.85011005, + "learning_rate": 0.0008300338598399411, + "loss": 0.86106682, + "num_input_tokens_seen": 125437648, + "router_z_loss_mlp": 0.36791992, + "step": 1520, + "time_per_iteration": 2.6471352577209473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109825, + "balance_loss_mlp": 1.07210708, + "epoch": 0.2926125432858792, + "flos": 476211350016.0, + "grad_norm": 0.05302442020038178, + "language_loss": 0.9456166, + "learning_rate": 0.0008297997633060263, + "loss": 0.95671487, + "num_input_tokens_seen": 125502432, + "router_z_loss_mlp": 0.37719727, + "step": 1521, + "time_per_iteration": 2.547457695007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106499, + "balance_loss_mlp": 1.06987774, + "epoch": 0.29280492497114274, + "flos": 676379151360.0, + "grad_norm": 0.054888704647412474, + "language_loss": 0.85310549, + "learning_rate": 0.0008295655387324883, + "loss": 0.86417043, + "num_input_tokens_seen": 125575424, + "router_z_loss_mlp": 0.36645508, + "step": 1522, + "time_per_iteration": 2.822557210922241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105716, + "balance_loss_mlp": 1.0686183, + "epoch": 0.29299730665640633, + "flos": 458175384576.0, + "grad_norm": 0.055715580232585875, + "language_loss": 0.85025144, + "learning_rate": 0.0008293311862102609, + "loss": 0.86130863, + "num_input_tokens_seen": 125639040, + "router_z_loss_mlp": 0.37084961, + "step": 1523, + "time_per_iteration": 2.5033791065216064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103076, + "balance_loss_mlp": 1.06795669, + "epoch": 0.29318968834166986, + "flos": 446343464448.0, + "grad_norm": 0.0584953499596263, + "language_loss": 0.88722956, + "learning_rate": 0.0008290967058303275, + "loss": 0.89826035, + "num_input_tokens_seen": 125701712, + "router_z_loss_mlp": 0.3515625, + "step": 1524, + "time_per_iteration": 2.5981156826019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102477, + "balance_loss_mlp": 1.06630898, + "epoch": 0.29338207002693345, + "flos": 450085676544.0, + "grad_norm": 0.05072610752657829, + "language_loss": 0.86932707, + "learning_rate": 0.0008288620976837219, + "loss": 0.88035178, + "num_input_tokens_seen": 125765088, + "router_z_loss_mlp": 0.36181641, + "step": 1525, + "time_per_iteration": 2.522019863128662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092716, + "balance_loss_mlp": 1.05623853, + "epoch": 0.293574451712197, + "flos": 502027103232.0, + "grad_norm": 0.05230718040210392, + "language_loss": 0.83001733, + "learning_rate": 0.000828627361861527, + "loss": 0.84094453, + "num_input_tokens_seen": 125831328, + "router_z_loss_mlp": 0.36474609, + "step": 1526, + "time_per_iteration": 2.559201955795288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085779, + "balance_loss_mlp": 1.05051661, + "epoch": 0.29376683339746057, + "flos": 696158184960.0, + "grad_norm": 0.071892180297548, + "language_loss": 0.8465147, + "learning_rate": 0.0008283924984548752, + "loss": 0.85737246, + "num_input_tokens_seen": 125903664, + "router_z_loss_mlp": 0.3527832, + "step": 1527, + "time_per_iteration": 2.8108816146850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079347, + "balance_loss_mlp": 1.04494321, + "epoch": 0.2939592150827241, + "flos": 478353088512.0, + "grad_norm": 0.05128355551395112, + "language_loss": 0.85087478, + "learning_rate": 0.0008281575075549485, + "loss": 0.86166823, + "num_input_tokens_seen": 125971856, + "router_z_loss_mlp": 0.34399414, + "step": 1528, + "time_per_iteration": 2.576814889907837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051452, + "balance_loss_mlp": 1.04057992, + "epoch": 0.2941515967679877, + "flos": 1484482042368.0, + "grad_norm": 0.031732851839211505, + "language_loss": 0.77352691, + "learning_rate": 0.000827922389252979, + "loss": 0.7840414, + "num_input_tokens_seen": 126183968, + "router_z_loss_mlp": 0.10888672, + "step": 1529, + "time_per_iteration": 4.662023067474365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089321, + "balance_loss_mlp": 1.0547502, + "epoch": 0.2943439784532513, + "flos": 673848916992.0, + "grad_norm": 0.06453398347295829, + "language_loss": 0.90086716, + "learning_rate": 0.0008276871436402469, + "loss": 0.91176039, + "num_input_tokens_seen": 126254448, + "router_z_loss_mlp": 0.34619141, + "step": 1530, + "time_per_iteration": 2.795783758163452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097742, + "balance_loss_mlp": 1.06460166, + "epoch": 0.2945363601385148, + "flos": 576031896576.0, + "grad_norm": 0.05195467848041957, + "language_loss": 0.87790835, + "learning_rate": 0.000827451770808083, + "loss": 0.88888574, + "num_input_tokens_seen": 126328208, + "router_z_loss_mlp": 0.33154297, + "step": 1531, + "time_per_iteration": 2.6522414684295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100953, + "balance_loss_mlp": 1.06628692, + "epoch": 0.2947287418237784, + "flos": 480416251392.0, + "grad_norm": 0.05572078055736918, + "language_loss": 0.83276248, + "learning_rate": 0.0008272162708478674, + "loss": 0.84377199, + "num_input_tokens_seen": 126396464, + "router_z_loss_mlp": 0.34692383, + "step": 1532, + "time_per_iteration": 2.5960874557495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098518, + "balance_loss_mlp": 1.06459141, + "epoch": 0.2949211235090419, + "flos": 557917355520.0, + "grad_norm": 0.05232404820193651, + "language_loss": 0.86136103, + "learning_rate": 0.000826980643851029, + "loss": 0.87234622, + "num_input_tokens_seen": 126468960, + "router_z_loss_mlp": 0.33959961, + "step": 1533, + "time_per_iteration": 2.671867609024048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106353, + "balance_loss_mlp": 1.07147205, + "epoch": 0.2951135051943055, + "flos": 483646311936.0, + "grad_norm": 0.06650262295584625, + "language_loss": 0.84864676, + "learning_rate": 0.0008267448899090464, + "loss": 0.85971034, + "num_input_tokens_seen": 126536496, + "router_z_loss_mlp": 0.34887695, + "step": 1534, + "time_per_iteration": 2.5133543014526367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095921, + "balance_loss_mlp": 1.0604682, + "epoch": 0.29530588687956905, + "flos": 550015322112.0, + "grad_norm": 0.05711998360561463, + "language_loss": 0.80980158, + "learning_rate": 0.0008265090091134473, + "loss": 0.82076073, + "num_input_tokens_seen": 126614048, + "router_z_loss_mlp": 0.35473633, + "step": 1535, + "time_per_iteration": 2.8528778553009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085228, + "balance_loss_mlp": 1.05072904, + "epoch": 0.29549826856483263, + "flos": 672731481600.0, + "grad_norm": 0.047870597747086484, + "language_loss": 0.80150926, + "learning_rate": 0.0008262730015558088, + "loss": 0.8123616, + "num_input_tokens_seen": 126697248, + "router_z_loss_mlp": 0.34521484, + "step": 1536, + "time_per_iteration": 2.8849382400512695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076376, + "balance_loss_mlp": 1.04192495, + "epoch": 0.29569065025009617, + "flos": 764300786688.0, + "grad_norm": 0.06331525049863725, + "language_loss": 0.82269859, + "learning_rate": 0.0008260368673277574, + "loss": 0.8334623, + "num_input_tokens_seen": 126782496, + "router_z_loss_mlp": 0.34472656, + "step": 1537, + "time_per_iteration": 3.12172269821167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078873, + "balance_loss_mlp": 1.04480314, + "epoch": 0.29588303193535975, + "flos": 543398049792.0, + "grad_norm": 0.05107262607685598, + "language_loss": 0.84019077, + "learning_rate": 0.0008258006065209682, + "loss": 0.85097957, + "num_input_tokens_seen": 126857328, + "router_z_loss_mlp": 0.34106445, + "step": 1538, + "time_per_iteration": 2.7388381958007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082306, + "balance_loss_mlp": 1.04744971, + "epoch": 0.29607541362062334, + "flos": 596648968704.0, + "grad_norm": 0.06469434822608365, + "language_loss": 0.80634302, + "learning_rate": 0.0008255642192271657, + "loss": 0.81716609, + "num_input_tokens_seen": 126932608, + "router_z_loss_mlp": 0.34863281, + "step": 1539, + "time_per_iteration": 2.7957324981689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082434, + "balance_loss_mlp": 1.04774427, + "epoch": 0.29626779530588687, + "flos": 609588149760.0, + "grad_norm": 0.06097977692176942, + "language_loss": 0.83830953, + "learning_rate": 0.0008253277055381241, + "loss": 0.84913385, + "num_input_tokens_seen": 127008928, + "router_z_loss_mlp": 0.34741211, + "step": 1540, + "time_per_iteration": 2.8428521156311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085866, + "balance_loss_mlp": 1.05146217, + "epoch": 0.29646017699115046, + "flos": 867050237952.0, + "grad_norm": 0.06407432486539091, + "language_loss": 0.8580029, + "learning_rate": 0.0008250910655456658, + "loss": 0.8688615, + "num_input_tokens_seen": 127097104, + "router_z_loss_mlp": 0.34448242, + "step": 1541, + "time_per_iteration": 3.1741185188293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081587, + "balance_loss_mlp": 1.04696846, + "epoch": 0.296652558676414, + "flos": 495616444416.0, + "grad_norm": 0.06683547404256097, + "language_loss": 0.83703458, + "learning_rate": 0.0008248542993416625, + "loss": 0.84785044, + "num_input_tokens_seen": 127165264, + "router_z_loss_mlp": 0.34643555, + "step": 1542, + "time_per_iteration": 2.5634429454803467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083482, + "balance_loss_mlp": 1.04914963, + "epoch": 0.2968449403616776, + "flos": 571275555840.0, + "grad_norm": 0.054805025189504364, + "language_loss": 0.83645159, + "learning_rate": 0.0008246174070180352, + "loss": 0.84728634, + "num_input_tokens_seen": 127238992, + "router_z_loss_mlp": 0.34375, + "step": 1543, + "time_per_iteration": 2.664029121398926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108698, + "balance_loss_mlp": 1.05226684, + "epoch": 0.2970373220469411, + "flos": 793799115264.0, + "grad_norm": 0.06369286414713611, + "language_loss": 0.84087443, + "learning_rate": 0.0008243803886667537, + "loss": 0.85174423, + "num_input_tokens_seen": 127328160, + "router_z_loss_mlp": 0.34765625, + "step": 1544, + "time_per_iteration": 3.129185199737549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082817, + "balance_loss_mlp": 1.04793644, + "epoch": 0.2972297037322047, + "flos": 660736617984.0, + "grad_norm": 0.0569938777400986, + "language_loss": 0.79051471, + "learning_rate": 0.0008241432443798364, + "loss": 0.80134284, + "num_input_tokens_seen": 127407328, + "router_z_loss_mlp": 0.34936523, + "step": 1545, + "time_per_iteration": 2.7968478202819824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076425, + "balance_loss_mlp": 1.04225969, + "epoch": 0.29742208541746823, + "flos": 596849789952.0, + "grad_norm": 0.05185676674228935, + "language_loss": 0.85634965, + "learning_rate": 0.0008239059742493512, + "loss": 0.86711389, + "num_input_tokens_seen": 127477136, + "router_z_loss_mlp": 0.34204102, + "step": 1546, + "time_per_iteration": 2.730803966522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079044, + "balance_loss_mlp": 1.0448308, + "epoch": 0.2976144671027318, + "flos": 769519816704.0, + "grad_norm": 0.049935350225070424, + "language_loss": 0.87424839, + "learning_rate": 0.0008236685783674142, + "loss": 0.88503873, + "num_input_tokens_seen": 127565680, + "router_z_loss_mlp": 0.34228516, + "step": 1547, + "time_per_iteration": 3.0735998153686523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060573, + "balance_loss_mlp": 1.04903388, + "epoch": 0.2978068487879954, + "flos": 1483980065280.0, + "grad_norm": 0.022808758650826922, + "language_loss": 0.76221192, + "learning_rate": 0.0008234310568261911, + "loss": 0.77281767, + "num_input_tokens_seen": 127791584, + "router_z_loss_mlp": 0.11523438, + "step": 1548, + "time_per_iteration": 4.902673959732056 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088892, + "balance_loss_mlp": 1.05460715, + "epoch": 0.29799923047325894, + "flos": 475079357952.0, + "grad_norm": 0.07696298762455249, + "language_loss": 0.82568306, + "learning_rate": 0.0008231934097178955, + "loss": 0.83657193, + "num_input_tokens_seen": 127860112, + "router_z_loss_mlp": 0.34326172, + "step": 1549, + "time_per_iteration": 2.59600567817688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087503, + "balance_loss_mlp": 1.05173981, + "epoch": 0.2981916121585225, + "flos": 759464460288.0, + "grad_norm": 0.05308820200633048, + "language_loss": 0.8525809, + "learning_rate": 0.0008229556371347903, + "loss": 0.86345589, + "num_input_tokens_seen": 127938752, + "router_z_loss_mlp": 0.35791016, + "step": 1550, + "time_per_iteration": 2.955467939376831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080247, + "balance_loss_mlp": 1.04593909, + "epoch": 0.29838399384378606, + "flos": 874642351104.0, + "grad_norm": 0.058723621398699785, + "language_loss": 0.79088616, + "learning_rate": 0.0008227177391691874, + "loss": 0.80168855, + "num_input_tokens_seen": 128022192, + "router_z_loss_mlp": 0.34350586, + "step": 1551, + "time_per_iteration": 3.1204521656036377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084967, + "balance_loss_mlp": 1.05001473, + "epoch": 0.29857637552904964, + "flos": 579389995008.0, + "grad_norm": 0.060980844602782615, + "language_loss": 0.89576113, + "learning_rate": 0.0008224797159134463, + "loss": 0.90661073, + "num_input_tokens_seen": 128097776, + "router_z_loss_mlp": 0.34985352, + "step": 1552, + "time_per_iteration": 2.7535347938537598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075057, + "balance_loss_mlp": 1.04132128, + "epoch": 0.2987687572143132, + "flos": 836048950272.0, + "grad_norm": 0.05791718796165568, + "language_loss": 0.83571118, + "learning_rate": 0.0008222415674599765, + "loss": 0.84646177, + "num_input_tokens_seen": 128179888, + "router_z_loss_mlp": 0.33764648, + "step": 1553, + "time_per_iteration": 3.0609707832336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077849, + "balance_loss_mlp": 1.0417521, + "epoch": 0.29896113889957676, + "flos": 566800022016.0, + "grad_norm": 0.05477323920870417, + "language_loss": 0.83255476, + "learning_rate": 0.0008220032939012349, + "loss": 0.84333324, + "num_input_tokens_seen": 128251152, + "router_z_loss_mlp": 0.36108398, + "step": 1554, + "time_per_iteration": 2.6683521270751953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077555, + "balance_loss_mlp": 1.04310393, + "epoch": 0.29915352058484035, + "flos": 498370669056.0, + "grad_norm": 0.049159177960894807, + "language_loss": 0.87956095, + "learning_rate": 0.0008217648953297277, + "loss": 0.89033645, + "num_input_tokens_seen": 128327600, + "router_z_loss_mlp": 0.34472656, + "step": 1555, + "time_per_iteration": 2.82114315032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109245, + "balance_loss_mlp": 1.05711639, + "epoch": 0.2993459022701039, + "flos": 591837373440.0, + "grad_norm": 0.06210935096260163, + "language_loss": 0.7799179, + "learning_rate": 0.0008215263718380095, + "loss": 0.79084241, + "num_input_tokens_seen": 128398432, + "router_z_loss_mlp": 0.35327148, + "step": 1556, + "time_per_iteration": 2.6806485652923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104132, + "balance_loss_mlp": 1.06815481, + "epoch": 0.29953828395536747, + "flos": 572107802112.0, + "grad_norm": 0.051501670996139066, + "language_loss": 0.8437115, + "learning_rate": 0.0008212877235186833, + "loss": 0.8547529, + "num_input_tokens_seen": 128469696, + "router_z_loss_mlp": 0.36010742, + "step": 1557, + "time_per_iteration": 2.706531286239624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075275, + "balance_loss_mlp": 1.06321061, + "epoch": 0.299730665640631, + "flos": 1503855051264.0, + "grad_norm": 0.03618665962020262, + "language_loss": 0.77737558, + "learning_rate": 0.0008210489504644005, + "loss": 0.78812838, + "num_input_tokens_seen": 128698560, + "router_z_loss_mlp": 0.12060547, + "step": 1558, + "time_per_iteration": 4.914030313491821 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102225, + "balance_loss_mlp": 1.06641483, + "epoch": 0.2999230473258946, + "flos": 513538928640.0, + "grad_norm": 0.06717328469529676, + "language_loss": 0.80777293, + "learning_rate": 0.0008208100527678611, + "loss": 0.8187952, + "num_input_tokens_seen": 128765952, + "router_z_loss_mlp": 0.3581543, + "step": 1559, + "time_per_iteration": 2.5862650871276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097511, + "balance_loss_mlp": 1.06294012, + "epoch": 0.3001154290111581, + "flos": 834128381952.0, + "grad_norm": 0.05731533213860712, + "language_loss": 0.78337204, + "learning_rate": 0.0008205710305218135, + "loss": 0.79434717, + "num_input_tokens_seen": 128840048, + "router_z_loss_mlp": 0.34594727, + "step": 1560, + "time_per_iteration": 3.00581693649292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094947, + "balance_loss_mlp": 1.06149733, + "epoch": 0.3003078106964217, + "flos": 556485617664.0, + "grad_norm": 0.051151635719759364, + "language_loss": 0.89917201, + "learning_rate": 0.0008203318838190541, + "loss": 0.9101215, + "num_input_tokens_seen": 128912496, + "router_z_loss_mlp": 0.3347168, + "step": 1561, + "time_per_iteration": 2.730187177658081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087932, + "balance_loss_mlp": 1.05345702, + "epoch": 0.30050019238168524, + "flos": 525897556992.0, + "grad_norm": 0.07455466191279551, + "language_loss": 0.85053575, + "learning_rate": 0.0008200926127524281, + "loss": 0.86141509, + "num_input_tokens_seen": 128980624, + "router_z_loss_mlp": 0.34521484, + "step": 1562, + "time_per_iteration": 2.6252634525299072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082824, + "balance_loss_mlp": 1.04837239, + "epoch": 0.3006925740669488, + "flos": 577582907904.0, + "grad_norm": 0.08578868432126639, + "language_loss": 0.83193934, + "learning_rate": 0.0008198532174148289, + "loss": 0.84276754, + "num_input_tokens_seen": 129050576, + "router_z_loss_mlp": 0.3449707, + "step": 1563, + "time_per_iteration": 2.784132957458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010668, + "balance_loss_mlp": 0.99912882, + "epoch": 0.3008849557522124, + "flos": 1489408528896.0, + "grad_norm": 0.006418694176289122, + "language_loss": 0.8068617, + "learning_rate": 0.0008196136978991977, + "loss": 0.81696838, + "num_input_tokens_seen": 129278880, + "router_z_loss_mlp": 0.11523438, + "step": 1564, + "time_per_iteration": 4.826026678085327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079569, + "balance_loss_mlp": 1.04607105, + "epoch": 0.30107733743747594, + "flos": 509565371904.0, + "grad_norm": 0.057361266050022765, + "language_loss": 0.88701093, + "learning_rate": 0.0008193740542985244, + "loss": 0.89780664, + "num_input_tokens_seen": 129346560, + "router_z_loss_mlp": 0.33520508, + "step": 1565, + "time_per_iteration": 2.5685722827911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082334, + "balance_loss_mlp": 1.04881263, + "epoch": 0.30126971912273953, + "flos": 587425858560.0, + "grad_norm": 0.055549771382925904, + "language_loss": 0.86598676, + "learning_rate": 0.0008191342867058467, + "loss": 0.87681007, + "num_input_tokens_seen": 129420448, + "router_z_loss_mlp": 0.33520508, + "step": 1566, + "time_per_iteration": 2.7413527965545654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088116, + "balance_loss_mlp": 1.05423677, + "epoch": 0.30146210080800306, + "flos": 601822918656.0, + "grad_norm": 0.054174391750340056, + "language_loss": 0.83411789, + "learning_rate": 0.0008188943952142509, + "loss": 0.84499902, + "num_input_tokens_seen": 129494032, + "router_z_loss_mlp": 0.33911133, + "step": 1567, + "time_per_iteration": 2.816777229309082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098385, + "balance_loss_mlp": 1.06376624, + "epoch": 0.30165448249326665, + "flos": 917424686592.0, + "grad_norm": 0.057308899380469513, + "language_loss": 0.81973398, + "learning_rate": 0.0008186543799168711, + "loss": 0.8307178, + "num_input_tokens_seen": 129569088, + "router_z_loss_mlp": 0.34643555, + "step": 1568, + "time_per_iteration": 3.138439655303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094575, + "balance_loss_mlp": 1.060076, + "epoch": 0.3018468641785302, + "flos": 776953368576.0, + "grad_norm": 0.06314470525088989, + "language_loss": 0.88671768, + "learning_rate": 0.0008184142409068892, + "loss": 0.89766341, + "num_input_tokens_seen": 129647968, + "router_z_loss_mlp": 0.34545898, + "step": 1569, + "time_per_iteration": 3.0061678886413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104914, + "balance_loss_mlp": 1.07134473, + "epoch": 0.30203924586379377, + "flos": 522101500416.0, + "grad_norm": 0.05000282823150535, + "language_loss": 0.86630476, + "learning_rate": 0.000818173978277536, + "loss": 0.87735385, + "num_input_tokens_seen": 129718928, + "router_z_loss_mlp": 0.3359375, + "step": 1570, + "time_per_iteration": 2.7171432971954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101367, + "balance_loss_mlp": 1.06779718, + "epoch": 0.3022316275490573, + "flos": 524288318976.0, + "grad_norm": 0.052630401262377564, + "language_loss": 0.83781934, + "learning_rate": 0.000817933592122089, + "loss": 0.84883296, + "num_input_tokens_seen": 129790128, + "router_z_loss_mlp": 0.3359375, + "step": 1571, + "time_per_iteration": 2.7346580028533936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105149, + "balance_loss_mlp": 1.0710789, + "epoch": 0.3024240092343209, + "flos": 479672755200.0, + "grad_norm": 0.05357670269103591, + "language_loss": 0.83451247, + "learning_rate": 0.0008176930825338749, + "loss": 0.84556395, + "num_input_tokens_seen": 129857536, + "router_z_loss_mlp": 0.34106445, + "step": 1572, + "time_per_iteration": 2.5449459552764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095582, + "balance_loss_mlp": 1.06110692, + "epoch": 0.3026163909195845, + "flos": 686901579264.0, + "grad_norm": 0.06283664606524127, + "language_loss": 0.8873198, + "learning_rate": 0.0008174524496062679, + "loss": 0.89827561, + "num_input_tokens_seen": 129931440, + "router_z_loss_mlp": 0.3449707, + "step": 1573, + "time_per_iteration": 2.8826043605804443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108869, + "balance_loss_mlp": 1.05380964, + "epoch": 0.302808772604848, + "flos": 542654553600.0, + "grad_norm": 0.05929060654319276, + "language_loss": 0.85444844, + "learning_rate": 0.0008172116934326894, + "loss": 0.86533535, + "num_input_tokens_seen": 130005200, + "router_z_loss_mlp": 0.34912109, + "step": 1574, + "time_per_iteration": 2.7539572715759277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088334, + "balance_loss_mlp": 1.05435979, + "epoch": 0.3030011542901116, + "flos": 474852395520.0, + "grad_norm": 0.051325587648683973, + "language_loss": 0.87683225, + "learning_rate": 0.0008169708141066097, + "loss": 0.88771558, + "num_input_tokens_seen": 130069136, + "router_z_loss_mlp": 0.34008789, + "step": 1575, + "time_per_iteration": 2.5635225772857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086472, + "balance_loss_mlp": 1.05199683, + "epoch": 0.30319353597537513, + "flos": 481233940992.0, + "grad_norm": 0.06106098638193731, + "language_loss": 0.90820259, + "learning_rate": 0.0008167298117215465, + "loss": 0.91906732, + "num_input_tokens_seen": 130135456, + "router_z_loss_mlp": 0.3449707, + "step": 1576, + "time_per_iteration": 2.5388035774230957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087257, + "balance_loss_mlp": 1.05242443, + "epoch": 0.3033859176606387, + "flos": 704455916544.0, + "grad_norm": 0.06728579874610481, + "language_loss": 0.88300574, + "learning_rate": 0.0008164886863710649, + "loss": 0.89387834, + "num_input_tokens_seen": 130213712, + "router_z_loss_mlp": 0.34887695, + "step": 1577, + "time_per_iteration": 2.8935675621032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088541, + "balance_loss_mlp": 1.05554342, + "epoch": 0.30357829934590225, + "flos": 764344456704.0, + "grad_norm": 0.04642698643554312, + "language_loss": 0.86113924, + "learning_rate": 0.0008162474381487783, + "loss": 0.87202466, + "num_input_tokens_seen": 130290928, + "router_z_loss_mlp": 0.33007812, + "step": 1578, + "time_per_iteration": 3.0151257514953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088208, + "balance_loss_mlp": 1.05401909, + "epoch": 0.30377068103116583, + "flos": 532082663424.0, + "grad_norm": 0.05691489249418783, + "language_loss": 0.84894794, + "learning_rate": 0.0008160060671483475, + "loss": 0.85983002, + "num_input_tokens_seen": 130362672, + "router_z_loss_mlp": 0.34228516, + "step": 1579, + "time_per_iteration": 2.6575984954833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097191, + "balance_loss_mlp": 1.06338358, + "epoch": 0.3039630627164294, + "flos": 509934928896.0, + "grad_norm": 0.07240450604386858, + "language_loss": 0.83520651, + "learning_rate": 0.0008157645734634809, + "loss": 0.84617841, + "num_input_tokens_seen": 130428848, + "router_z_loss_mlp": 0.33837891, + "step": 1580, + "time_per_iteration": 2.5869438648223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061274, + "balance_loss_mlp": 1.04992568, + "epoch": 0.30415544440169295, + "flos": 1505206803456.0, + "grad_norm": 0.030998653754179664, + "language_loss": 0.76896489, + "learning_rate": 0.000815522957187935, + "loss": 0.77957761, + "num_input_tokens_seen": 130665440, + "router_z_loss_mlp": 0.11328125, + "step": 1581, + "time_per_iteration": 4.907174348831177 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01027749, + "balance_loss_mlp": 1.01606703, + "epoch": 0.30434782608695654, + "flos": 1457976637440.0, + "grad_norm": 0.012214555664928241, + "language_loss": 0.73214495, + "learning_rate": 0.0008152812184155132, + "loss": 0.74242246, + "num_input_tokens_seen": 130895248, + "router_z_loss_mlp": 0.11669922, + "step": 1582, + "time_per_iteration": 4.8717710971832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102415, + "balance_loss_mlp": 1.0685122, + "epoch": 0.3045402077722201, + "flos": 482312088576.0, + "grad_norm": 0.05813255519406619, + "language_loss": 0.83851862, + "learning_rate": 0.000815039357240067, + "loss": 0.84954274, + "num_input_tokens_seen": 130964544, + "router_z_loss_mlp": 0.33935547, + "step": 1583, + "time_per_iteration": 2.640148401260376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102551, + "balance_loss_mlp": 1.06879056, + "epoch": 0.30473258945748366, + "flos": 543220549632.0, + "grad_norm": 0.06312099992380371, + "language_loss": 0.85312426, + "learning_rate": 0.0008147973737554952, + "loss": 0.86414981, + "num_input_tokens_seen": 131041744, + "router_z_loss_mlp": 0.33789062, + "step": 1584, + "time_per_iteration": 2.772367000579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098103, + "balance_loss_mlp": 1.06443787, + "epoch": 0.3049249711427472, + "flos": 566789847552.0, + "grad_norm": 0.054268296030043885, + "language_loss": 0.85613728, + "learning_rate": 0.000814555268055744, + "loss": 0.86711836, + "num_input_tokens_seen": 131108864, + "router_z_loss_mlp": 0.33691406, + "step": 1585, + "time_per_iteration": 2.616687536239624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109794, + "balance_loss_mlp": 1.06441879, + "epoch": 0.3051173528280108, + "flos": 527970894336.0, + "grad_norm": 0.05527556644311566, + "language_loss": 0.87648201, + "learning_rate": 0.0008143130402348073, + "loss": 0.88746148, + "num_input_tokens_seen": 131181104, + "router_z_loss_mlp": 0.33544922, + "step": 1586, + "time_per_iteration": 2.635103940963745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094771, + "balance_loss_mlp": 1.06141627, + "epoch": 0.3053097345132743, + "flos": 586097427456.0, + "grad_norm": 0.052385807505719764, + "language_loss": 0.79520649, + "learning_rate": 0.0008140706903867265, + "loss": 0.80615419, + "num_input_tokens_seen": 131258704, + "router_z_loss_mlp": 0.33349609, + "step": 1587, + "time_per_iteration": 2.7922940254211426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087687, + "balance_loss_mlp": 1.05263984, + "epoch": 0.3055021161985379, + "flos": 606810604032.0, + "grad_norm": 0.054380951058352583, + "language_loss": 0.90043247, + "learning_rate": 0.0008138282186055897, + "loss": 0.9113093, + "num_input_tokens_seen": 131325712, + "router_z_loss_mlp": 0.35058594, + "step": 1588, + "time_per_iteration": 2.683783769607544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085545, + "balance_loss_mlp": 1.05290556, + "epoch": 0.3056944978838015, + "flos": 573594794496.0, + "grad_norm": 0.05235756550943364, + "language_loss": 0.8193745, + "learning_rate": 0.0008135856249855331, + "loss": 0.83023, + "num_input_tokens_seen": 131397568, + "router_z_loss_mlp": 0.32641602, + "step": 1589, + "time_per_iteration": 2.6717309951782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081523, + "balance_loss_mlp": 1.04900289, + "epoch": 0.305886879569065, + "flos": 633640485888.0, + "grad_norm": 0.06284243799371535, + "language_loss": 0.89691997, + "learning_rate": 0.0008133429096207398, + "loss": 0.90773523, + "num_input_tokens_seen": 131467632, + "router_z_loss_mlp": 0.32519531, + "step": 1590, + "time_per_iteration": 2.757962465286255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01037916, + "balance_loss_mlp": 1.02561319, + "epoch": 0.3060792612543286, + "flos": 1368227414016.0, + "grad_norm": 0.023218914608202516, + "language_loss": 0.75312257, + "learning_rate": 0.0008131000726054403, + "loss": 0.76350176, + "num_input_tokens_seen": 131702224, + "router_z_loss_mlp": 0.12304688, + "step": 1591, + "time_per_iteration": 4.927033185958862 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078048, + "balance_loss_mlp": 1.0450511, + "epoch": 0.30627164293959214, + "flos": 518290887168.0, + "grad_norm": 0.05132667013942606, + "language_loss": 0.86601979, + "learning_rate": 0.0008128571140339123, + "loss": 0.87680024, + "num_input_tokens_seen": 131774608, + "router_z_loss_mlp": 0.33007812, + "step": 1592, + "time_per_iteration": 2.627272367477417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075862, + "balance_loss_mlp": 1.0423162, + "epoch": 0.3064640246248557, + "flos": 455354168832.0, + "grad_norm": 0.054345541641725725, + "language_loss": 0.87405455, + "learning_rate": 0.0008126140340004805, + "loss": 0.88481319, + "num_input_tokens_seen": 131841216, + "router_z_loss_mlp": 0.33569336, + "step": 1593, + "time_per_iteration": 2.5047686100006104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076578, + "balance_loss_mlp": 1.04355717, + "epoch": 0.30665640631011926, + "flos": 849718480896.0, + "grad_norm": 0.04925367115496714, + "language_loss": 0.82262254, + "learning_rate": 0.0008123708325995172, + "loss": 0.83338827, + "num_input_tokens_seen": 131937584, + "router_z_loss_mlp": 0.33032227, + "step": 1594, + "time_per_iteration": 3.1693196296691895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107923, + "balance_loss_mlp": 1.04582715, + "epoch": 0.30684878799538284, + "flos": 757996406784.0, + "grad_norm": 0.04977841797679214, + "language_loss": 0.79901791, + "learning_rate": 0.0008121275099254414, + "loss": 0.80981016, + "num_input_tokens_seen": 132012656, + "router_z_loss_mlp": 0.33422852, + "step": 1595, + "time_per_iteration": 2.9197185039520264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108089, + "balance_loss_mlp": 1.04758275, + "epoch": 0.3070411696806464, + "flos": 517320428544.0, + "grad_norm": 0.05488318824662342, + "language_loss": 0.88300943, + "learning_rate": 0.0008118840660727194, + "loss": 0.89381832, + "num_input_tokens_seen": 132083728, + "router_z_loss_mlp": 0.33325195, + "step": 1596, + "time_per_iteration": 2.6452150344848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079954, + "balance_loss_mlp": 1.04788685, + "epoch": 0.30723355136590996, + "flos": 843883992576.0, + "grad_norm": 0.05612425557740203, + "language_loss": 0.87403214, + "learning_rate": 0.0008116405011358644, + "loss": 0.88483167, + "num_input_tokens_seen": 132170896, + "router_z_loss_mlp": 0.32055664, + "step": 1597, + "time_per_iteration": 3.135666608810425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084185, + "balance_loss_mlp": 1.05092525, + "epoch": 0.30742593305117355, + "flos": 465905710080.0, + "grad_norm": 0.05391343675647517, + "language_loss": 0.80005342, + "learning_rate": 0.0008113968152094369, + "loss": 0.81089526, + "num_input_tokens_seen": 132234592, + "router_z_loss_mlp": 0.33276367, + "step": 1598, + "time_per_iteration": 2.5122313499450684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076054, + "balance_loss_mlp": 1.04331923, + "epoch": 0.3076183147364371, + "flos": 686286120960.0, + "grad_norm": 0.04979397496165333, + "language_loss": 0.82305032, + "learning_rate": 0.0008111530083880438, + "loss": 0.83381081, + "num_input_tokens_seen": 132314720, + "router_z_loss_mlp": 0.32739258, + "step": 1599, + "time_per_iteration": 2.8883755207061768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072089, + "balance_loss_mlp": 1.03949702, + "epoch": 0.30781069642170067, + "flos": 613729032192.0, + "grad_norm": 0.059000164712882774, + "language_loss": 0.86657357, + "learning_rate": 0.0008109090807663399, + "loss": 0.87729448, + "num_input_tokens_seen": 132388768, + "router_z_loss_mlp": 0.32592773, + "step": 1600, + "time_per_iteration": 2.7799928188323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075122, + "balance_loss_mlp": 1.04260147, + "epoch": 0.3080030781069642, + "flos": 590021521920.0, + "grad_norm": 0.046450828420206536, + "language_loss": 0.88735926, + "learning_rate": 0.0008106650324390257, + "loss": 0.89811045, + "num_input_tokens_seen": 132472544, + "router_z_loss_mlp": 0.32519531, + "step": 1601, + "time_per_iteration": 2.7887444496154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071489, + "balance_loss_mlp": 1.03913534, + "epoch": 0.3081954597922278, + "flos": 562353601536.0, + "grad_norm": 0.06865077604181559, + "language_loss": 0.81335884, + "learning_rate": 0.0008104208635008493, + "loss": 0.82407373, + "num_input_tokens_seen": 132541968, + "router_z_loss_mlp": 0.32348633, + "step": 1602, + "time_per_iteration": 2.6526358127593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077173, + "balance_loss_mlp": 1.04448628, + "epoch": 0.3083878414774913, + "flos": 447599112192.0, + "grad_norm": 0.053973671264543166, + "language_loss": 0.81925714, + "learning_rate": 0.0008101765740466058, + "loss": 0.83002889, + "num_input_tokens_seen": 132606976, + "router_z_loss_mlp": 0.3269043, + "step": 1603, + "time_per_iteration": 2.5337142944335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073512, + "balance_loss_mlp": 1.04020488, + "epoch": 0.3085802231627549, + "flos": 493297205760.0, + "grad_norm": 0.05670542728571842, + "language_loss": 0.84135199, + "learning_rate": 0.0008099321641711364, + "loss": 0.85208714, + "num_input_tokens_seen": 132677984, + "router_z_loss_mlp": 0.33325195, + "step": 1604, + "time_per_iteration": 2.6616737842559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071402, + "balance_loss_mlp": 1.03804755, + "epoch": 0.3087726048480185, + "flos": 487437986304.0, + "grad_norm": 0.0517354770696361, + "language_loss": 0.8343811, + "learning_rate": 0.0008096876339693295, + "loss": 0.8450951, + "num_input_tokens_seen": 132749136, + "router_z_loss_mlp": 0.33374023, + "step": 1605, + "time_per_iteration": 2.6034042835235596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078765, + "balance_loss_mlp": 1.04412317, + "epoch": 0.308964986533282, + "flos": 730265877504.0, + "grad_norm": 0.0630488444124333, + "language_loss": 0.8123467, + "learning_rate": 0.0008094429835361206, + "loss": 0.8231343, + "num_input_tokens_seen": 132823824, + "router_z_loss_mlp": 0.34667969, + "step": 1606, + "time_per_iteration": 2.9442811012268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107155, + "balance_loss_mlp": 1.03788495, + "epoch": 0.3091573682185456, + "flos": 605131554816.0, + "grad_norm": 0.0490228515497239, + "language_loss": 0.85833865, + "learning_rate": 0.0008091982129664908, + "loss": 0.8690542, + "num_input_tokens_seen": 132895936, + "router_z_loss_mlp": 0.33691406, + "step": 1607, + "time_per_iteration": 2.734976053237915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077455, + "balance_loss_mlp": 1.04290783, + "epoch": 0.30934974990380915, + "flos": 460081396224.0, + "grad_norm": 0.04772079658934369, + "language_loss": 0.82646394, + "learning_rate": 0.0008089533223554687, + "loss": 0.83723843, + "num_input_tokens_seen": 132968960, + "router_z_loss_mlp": 0.34594727, + "step": 1608, + "time_per_iteration": 2.756741762161255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080172, + "balance_loss_mlp": 1.04669785, + "epoch": 0.30954213158907273, + "flos": 553142075904.0, + "grad_norm": 0.05499274022240881, + "language_loss": 0.8525604, + "learning_rate": 0.0008087083117981294, + "loss": 0.86336207, + "num_input_tokens_seen": 133048448, + "router_z_loss_mlp": 0.33496094, + "step": 1609, + "time_per_iteration": 2.9062788486480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081142, + "balance_loss_mlp": 1.04676199, + "epoch": 0.30973451327433627, + "flos": 552776901120.0, + "grad_norm": 0.0512798930400947, + "language_loss": 0.87996054, + "learning_rate": 0.0008084631813895943, + "loss": 0.89077199, + "num_input_tokens_seen": 133121680, + "router_z_loss_mlp": 0.34375, + "step": 1610, + "time_per_iteration": 2.789893627166748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079299, + "balance_loss_mlp": 1.04575384, + "epoch": 0.30992689495959985, + "flos": 565430893056.0, + "grad_norm": 0.07403274033744815, + "language_loss": 0.83632123, + "learning_rate": 0.0008082179312250315, + "loss": 0.84711421, + "num_input_tokens_seen": 133190176, + "router_z_loss_mlp": 0.33544922, + "step": 1611, + "time_per_iteration": 2.713533878326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040085, + "balance_loss_mlp": 1.02864099, + "epoch": 0.3101192766448634, + "flos": 1441621131264.0, + "grad_norm": 0.023040649208851512, + "language_loss": 0.79855847, + "learning_rate": 0.0008079725613996555, + "loss": 0.80895925, + "num_input_tokens_seen": 133420512, + "router_z_loss_mlp": 0.11425781, + "step": 1612, + "time_per_iteration": 4.866560459136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035595, + "balance_loss_mlp": 1.02448523, + "epoch": 0.31031165833012697, + "flos": 1531086575616.0, + "grad_norm": 0.021256554447441355, + "language_loss": 0.76629329, + "learning_rate": 0.0008077270720087273, + "loss": 0.77664924, + "num_input_tokens_seen": 133651984, + "router_z_loss_mlp": 0.11132812, + "step": 1613, + "time_per_iteration": 5.01593279838562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010886, + "balance_loss_mlp": 1.05483997, + "epoch": 0.31050404001539056, + "flos": 991534196736.0, + "grad_norm": 0.06253960188626659, + "language_loss": 0.81937206, + "learning_rate": 0.0008074814631475545, + "loss": 0.83025801, + "num_input_tokens_seen": 133741648, + "router_z_loss_mlp": 0.33789062, + "step": 1614, + "time_per_iteration": 3.3154871463775635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092644, + "balance_loss_mlp": 1.05940843, + "epoch": 0.3106964217006541, + "flos": 445748355072.0, + "grad_norm": 0.0719929788966035, + "language_loss": 0.78655052, + "learning_rate": 0.0008072357349114907, + "loss": 0.79747701, + "num_input_tokens_seen": 133813344, + "router_z_loss_mlp": 0.33251953, + "step": 1615, + "time_per_iteration": 2.6783502101898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084302, + "balance_loss_mlp": 1.05063736, + "epoch": 0.3108888033859177, + "flos": 510259405824.0, + "grad_norm": 0.06269338504314155, + "language_loss": 0.88523185, + "learning_rate": 0.0008069898873959363, + "loss": 0.89607489, + "num_input_tokens_seen": 133884192, + "router_z_loss_mlp": 0.33691406, + "step": 1616, + "time_per_iteration": 2.6805779933929443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092165, + "balance_loss_mlp": 1.05952573, + "epoch": 0.3110811850711812, + "flos": 520471913472.0, + "grad_norm": 0.06669389997650658, + "language_loss": 0.85964763, + "learning_rate": 0.0008067439206963375, + "loss": 0.87056935, + "num_input_tokens_seen": 133954848, + "router_z_loss_mlp": 0.32641602, + "step": 1617, + "time_per_iteration": 2.6084542274475098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091143, + "balance_loss_mlp": 1.05797851, + "epoch": 0.3112735667564448, + "flos": 686085299712.0, + "grad_norm": 0.06020913179087489, + "language_loss": 0.86049557, + "learning_rate": 0.0008064978349081873, + "loss": 0.87140703, + "num_input_tokens_seen": 134031824, + "router_z_loss_mlp": 0.33178711, + "step": 1618, + "time_per_iteration": 2.9622116088867188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089777, + "balance_loss_mlp": 1.05554032, + "epoch": 0.31146594844170833, + "flos": 532786871808.0, + "grad_norm": 0.04562356821057988, + "language_loss": 0.86218596, + "learning_rate": 0.0008062516301270245, + "loss": 0.87308377, + "num_input_tokens_seen": 134104480, + "router_z_loss_mlp": 0.3425293, + "step": 1619, + "time_per_iteration": 2.691589593887329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091127, + "balance_loss_mlp": 1.0575099, + "epoch": 0.3116583301269719, + "flos": 679187220480.0, + "grad_norm": 0.05429224886242875, + "language_loss": 0.88343138, + "learning_rate": 0.0008060053064484343, + "loss": 0.89434266, + "num_input_tokens_seen": 134185632, + "router_z_loss_mlp": 0.33642578, + "step": 1620, + "time_per_iteration": 2.936244487762451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096127, + "balance_loss_mlp": 1.06277251, + "epoch": 0.31185071181223545, + "flos": 585855908352.0, + "grad_norm": 0.05040245512912965, + "language_loss": 0.85009742, + "learning_rate": 0.0008057588639680482, + "loss": 0.86105865, + "num_input_tokens_seen": 134261600, + "router_z_loss_mlp": 0.33374023, + "step": 1621, + "time_per_iteration": 2.7633163928985596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107807, + "balance_loss_mlp": 1.07309282, + "epoch": 0.31204309349749904, + "flos": 725090517504.0, + "grad_norm": 0.06801147163116106, + "language_loss": 0.82624507, + "learning_rate": 0.0008055123027815434, + "loss": 0.83732307, + "num_input_tokens_seen": 134334368, + "router_z_loss_mlp": 0.34741211, + "step": 1622, + "time_per_iteration": 2.946943521499634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105084, + "balance_loss_mlp": 1.07156253, + "epoch": 0.3122354751827626, + "flos": 576558604800.0, + "grad_norm": 0.0611005921730787, + "language_loss": 0.85109818, + "learning_rate": 0.0008052656229846436, + "loss": 0.862149, + "num_input_tokens_seen": 134403824, + "router_z_loss_mlp": 0.33544922, + "step": 1623, + "time_per_iteration": 2.6431145668029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106208, + "balance_loss_mlp": 1.07106483, + "epoch": 0.31242785686802615, + "flos": 575672514048.0, + "grad_norm": 0.055122717603047884, + "language_loss": 0.90674621, + "learning_rate": 0.0008050188246731182, + "loss": 0.91780829, + "num_input_tokens_seen": 134471296, + "router_z_loss_mlp": 0.35180664, + "step": 1624, + "time_per_iteration": 2.6666738986968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101645, + "balance_loss_mlp": 1.06745625, + "epoch": 0.31262023855328974, + "flos": 736490271744.0, + "grad_norm": 0.05430344032768667, + "language_loss": 0.81962687, + "learning_rate": 0.0008047719079427834, + "loss": 0.8306433, + "num_input_tokens_seen": 134551360, + "router_z_loss_mlp": 0.34204102, + "step": 1625, + "time_per_iteration": 2.978775978088379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095181, + "balance_loss_mlp": 1.07791936, + "epoch": 0.3128126202385533, + "flos": 1558395113472.0, + "grad_norm": 0.034550759669135796, + "language_loss": 0.74351704, + "learning_rate": 0.0008045248728895, + "loss": 0.75446886, + "num_input_tokens_seen": 134761328, + "router_z_loss_mlp": 0.17285156, + "step": 1626, + "time_per_iteration": 4.800370931625366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109181, + "balance_loss_mlp": 1.05695319, + "epoch": 0.31300500192381686, + "flos": 514666538496.0, + "grad_norm": 0.04752817769408696, + "language_loss": 0.86259782, + "learning_rate": 0.0008042777196091757, + "loss": 0.87351596, + "num_input_tokens_seen": 134833136, + "router_z_loss_mlp": 0.34863281, + "step": 1627, + "time_per_iteration": 2.695350408554077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088728, + "balance_loss_mlp": 1.05301261, + "epoch": 0.3131973836090804, + "flos": 526370420736.0, + "grad_norm": 0.06407391520506579, + "language_loss": 0.8214981, + "learning_rate": 0.0008040304481977643, + "loss": 0.83238542, + "num_input_tokens_seen": 134904352, + "router_z_loss_mlp": 0.35742188, + "step": 1628, + "time_per_iteration": 2.6652393341064453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108342, + "balance_loss_mlp": 1.05030346, + "epoch": 0.313389765294344, + "flos": 822473961984.0, + "grad_norm": 0.08346342139557943, + "language_loss": 0.86950874, + "learning_rate": 0.0008037830587512649, + "loss": 0.88034296, + "num_input_tokens_seen": 134984880, + "router_z_loss_mlp": 0.33129883, + "step": 1629, + "time_per_iteration": 3.0668327808380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090413, + "balance_loss_mlp": 1.05651021, + "epoch": 0.31358214697960757, + "flos": 393604697088.0, + "grad_norm": 0.061409761762948115, + "language_loss": 0.78720629, + "learning_rate": 0.0008035355513657224, + "loss": 0.79811049, + "num_input_tokens_seen": 135047456, + "router_z_loss_mlp": 0.33935547, + "step": 1630, + "time_per_iteration": 2.5013740062713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087918, + "balance_loss_mlp": 1.05449188, + "epoch": 0.3137745286648711, + "flos": 571611617280.0, + "grad_norm": 0.049199842100191564, + "language_loss": 0.93020999, + "learning_rate": 0.0008032879261372279, + "loss": 0.94108921, + "num_input_tokens_seen": 135124256, + "router_z_loss_mlp": 0.33447266, + "step": 1631, + "time_per_iteration": 2.8559622764587402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088241, + "balance_loss_mlp": 1.07612944, + "epoch": 0.3139669103501347, + "flos": 1497614690304.0, + "grad_norm": 0.04267228885339989, + "language_loss": 0.79635841, + "learning_rate": 0.0008030401831619178, + "loss": 0.80724084, + "num_input_tokens_seen": 135353024, + "router_z_loss_mlp": 0.12109375, + "step": 1632, + "time_per_iteration": 5.726024627685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081932, + "balance_loss_mlp": 1.04886341, + "epoch": 0.3141592920353982, + "flos": 525090041856.0, + "grad_norm": 0.04986838794009694, + "language_loss": 0.87459773, + "learning_rate": 0.0008027923225359748, + "loss": 0.88541704, + "num_input_tokens_seen": 135422464, + "router_z_loss_mlp": 0.33081055, + "step": 1633, + "time_per_iteration": 2.599775791168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078913, + "balance_loss_mlp": 1.04465246, + "epoch": 0.3143516737206618, + "flos": 592989714432.0, + "grad_norm": 0.05680374588643473, + "language_loss": 0.8835839, + "learning_rate": 0.0008025443443556267, + "loss": 0.89437306, + "num_input_tokens_seen": 135490928, + "router_z_loss_mlp": 0.34301758, + "step": 1634, + "time_per_iteration": 2.7439024448394775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010804, + "balance_loss_mlp": 1.04776073, + "epoch": 0.31454405540592534, + "flos": 648034573824.0, + "grad_norm": 0.04764849369773053, + "language_loss": 0.88161099, + "learning_rate": 0.000802296248717147, + "loss": 0.89241499, + "num_input_tokens_seen": 135576288, + "router_z_loss_mlp": 0.32641602, + "step": 1635, + "time_per_iteration": 2.902290105819702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082457, + "balance_loss_mlp": 1.04850602, + "epoch": 0.3147364370911889, + "flos": 642543501312.0, + "grad_norm": 0.05380775409858787, + "language_loss": 0.79150212, + "learning_rate": 0.0008020480357168554, + "loss": 0.80232668, + "num_input_tokens_seen": 135652320, + "router_z_loss_mlp": 0.33984375, + "step": 1636, + "time_per_iteration": 2.7940564155578613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107486, + "balance_loss_mlp": 1.04176795, + "epoch": 0.31492881877645246, + "flos": 471607778304.0, + "grad_norm": 0.05509564816324918, + "language_loss": 0.88341308, + "learning_rate": 0.0008017997054511165, + "loss": 0.89416164, + "num_input_tokens_seen": 135719632, + "router_z_loss_mlp": 0.33105469, + "step": 1637, + "time_per_iteration": 2.596212148666382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075188, + "balance_loss_mlp": 1.04157114, + "epoch": 0.31512120046171604, + "flos": 629135838720.0, + "grad_norm": 0.0536589952194777, + "language_loss": 0.8549943, + "learning_rate": 0.0008015512580163407, + "loss": 0.86574614, + "num_input_tokens_seen": 135796544, + "router_z_loss_mlp": 0.33642578, + "step": 1638, + "time_per_iteration": 2.763343334197998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107569, + "balance_loss_mlp": 1.0416913, + "epoch": 0.31531358214697963, + "flos": 703460726784.0, + "grad_norm": 0.0636877441873346, + "language_loss": 0.80888116, + "learning_rate": 0.0008013026935089838, + "loss": 0.81963813, + "num_input_tokens_seen": 135871344, + "router_z_loss_mlp": 0.34033203, + "step": 1639, + "time_per_iteration": 2.9786083698272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070933, + "balance_loss_mlp": 1.03798366, + "epoch": 0.31550596383224316, + "flos": 572275127808.0, + "grad_norm": 0.055086353977466425, + "language_loss": 0.83909047, + "learning_rate": 0.0008010540120255472, + "loss": 0.84979975, + "num_input_tokens_seen": 135944320, + "router_z_loss_mlp": 0.32958984, + "step": 1640, + "time_per_iteration": 2.666520357131958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075633, + "balance_loss_mlp": 1.04196858, + "epoch": 0.31569834551750675, + "flos": 658047822336.0, + "grad_norm": 0.06483249406864507, + "language_loss": 0.86052895, + "learning_rate": 0.0008008052136625774, + "loss": 0.8712852, + "num_input_tokens_seen": 136019456, + "router_z_loss_mlp": 0.33691406, + "step": 1641, + "time_per_iteration": 2.8062589168548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078165, + "balance_loss_mlp": 1.04407096, + "epoch": 0.3158907272027703, + "flos": 566002681344.0, + "grad_norm": 0.05792040128516231, + "language_loss": 0.86837387, + "learning_rate": 0.0008005562985166666, + "loss": 0.87915552, + "num_input_tokens_seen": 136091232, + "router_z_loss_mlp": 0.34130859, + "step": 1642, + "time_per_iteration": 2.6996512413024902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081029, + "balance_loss_mlp": 1.04576707, + "epoch": 0.31608310888803387, + "flos": 536622216192.0, + "grad_norm": 0.04642534602938139, + "language_loss": 0.84936821, + "learning_rate": 0.0008003072666844524, + "loss": 0.86017853, + "num_input_tokens_seen": 136165088, + "router_z_loss_mlp": 0.35302734, + "step": 1643, + "time_per_iteration": 2.6999776363372803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078239, + "balance_loss_mlp": 1.04292917, + "epoch": 0.3162754905732974, + "flos": 486428239872.0, + "grad_norm": 0.08271259063406261, + "language_loss": 0.82613683, + "learning_rate": 0.0008000581182626173, + "loss": 0.83691919, + "num_input_tokens_seen": 136230368, + "router_z_loss_mlp": 0.35302734, + "step": 1644, + "time_per_iteration": 2.541093111038208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082389, + "balance_loss_mlp": 1.04893875, + "epoch": 0.316467872258561, + "flos": 529792538112.0, + "grad_norm": 0.058359985905672214, + "language_loss": 0.86275887, + "learning_rate": 0.0007998088533478894, + "loss": 0.87358278, + "num_input_tokens_seen": 136302512, + "router_z_loss_mlp": 0.33447266, + "step": 1645, + "time_per_iteration": 2.641402006149292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077931, + "balance_loss_mlp": 1.04309845, + "epoch": 0.3166602539438245, + "flos": 443197771776.0, + "grad_norm": 0.07387441321187599, + "language_loss": 0.84062803, + "learning_rate": 0.000799559472037042, + "loss": 0.85140741, + "num_input_tokens_seen": 136368064, + "router_z_loss_mlp": 0.34887695, + "step": 1646, + "time_per_iteration": 2.5274438858032227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076892, + "balance_loss_mlp": 1.04222584, + "epoch": 0.3168526356290881, + "flos": 645513103872.0, + "grad_norm": 0.053861363144643716, + "language_loss": 0.87875295, + "learning_rate": 0.0007993099744268932, + "loss": 0.8895219, + "num_input_tokens_seen": 136451520, + "router_z_loss_mlp": 0.34716797, + "step": 1647, + "time_per_iteration": 2.8893649578094482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070585, + "balance_loss_mlp": 1.03646684, + "epoch": 0.3170450173143517, + "flos": 585889403904.0, + "grad_norm": 0.05841982976759713, + "language_loss": 0.87792766, + "learning_rate": 0.000799060360614307, + "loss": 0.88863349, + "num_input_tokens_seen": 136521184, + "router_z_loss_mlp": 0.34155273, + "step": 1648, + "time_per_iteration": 2.6867480278015137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076826, + "balance_loss_mlp": 1.04273248, + "epoch": 0.3172373989996152, + "flos": 826763231232.0, + "grad_norm": 0.05654214693871822, + "language_loss": 0.83848637, + "learning_rate": 0.0007988106306961917, + "loss": 0.84925467, + "num_input_tokens_seen": 136612592, + "router_z_loss_mlp": 0.34130859, + "step": 1649, + "time_per_iteration": 3.1321003437042236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080036, + "balance_loss_mlp": 1.04577541, + "epoch": 0.3174297806848788, + "flos": 527153204736.0, + "grad_norm": 0.060794493166337976, + "language_loss": 0.84529203, + "learning_rate": 0.0007985607847695014, + "loss": 0.85609239, + "num_input_tokens_seen": 136684336, + "router_z_loss_mlp": 0.34301758, + "step": 1650, + "time_per_iteration": 2.6306049823760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081772, + "balance_loss_mlp": 1.04851258, + "epoch": 0.31762216237014235, + "flos": 712855544832.0, + "grad_norm": 0.05325998456044798, + "language_loss": 0.82638443, + "learning_rate": 0.0007983108229312345, + "loss": 0.83720207, + "num_input_tokens_seen": 136766400, + "router_z_loss_mlp": 0.33276367, + "step": 1651, + "time_per_iteration": 2.909571647644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108965, + "balance_loss_mlp": 1.05567503, + "epoch": 0.31781454405540593, + "flos": 483567736320.0, + "grad_norm": 0.0653784528473409, + "language_loss": 0.86672306, + "learning_rate": 0.0007980607452784351, + "loss": 0.87761962, + "num_input_tokens_seen": 136834016, + "router_z_loss_mlp": 0.33984375, + "step": 1652, + "time_per_iteration": 2.5339765548706055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100389, + "balance_loss_mlp": 1.06639075, + "epoch": 0.31800692574066947, + "flos": 548483249664.0, + "grad_norm": 0.0685029555550019, + "language_loss": 0.90562367, + "learning_rate": 0.0007978105519081919, + "loss": 0.91662765, + "num_input_tokens_seen": 136906288, + "router_z_loss_mlp": 0.34008789, + "step": 1653, + "time_per_iteration": 2.6916213035583496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096542, + "balance_loss_mlp": 1.06213784, + "epoch": 0.31819930742593305, + "flos": 516640951296.0, + "grad_norm": 0.07941193091019123, + "language_loss": 0.87969935, + "learning_rate": 0.0007975602429176385, + "loss": 0.89066482, + "num_input_tokens_seen": 136972416, + "router_z_loss_mlp": 0.34423828, + "step": 1654, + "time_per_iteration": 2.5997297763824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100835, + "balance_loss_mlp": 1.06695616, + "epoch": 0.31839168911119664, + "flos": 455748456960.0, + "grad_norm": 0.07129171690745044, + "language_loss": 0.81582803, + "learning_rate": 0.0007973098184039536, + "loss": 0.82683635, + "num_input_tokens_seen": 137044576, + "router_z_loss_mlp": 0.33911133, + "step": 1655, + "time_per_iteration": 2.654914140701294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096724, + "balance_loss_mlp": 1.06284511, + "epoch": 0.3185840707964602, + "flos": 625719513600.0, + "grad_norm": 0.05658637496419385, + "language_loss": 0.86710656, + "learning_rate": 0.0007970592784643602, + "loss": 0.87807381, + "num_input_tokens_seen": 137125120, + "router_z_loss_mlp": 0.33911133, + "step": 1656, + "time_per_iteration": 2.846390962600708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090807, + "balance_loss_mlp": 1.05719042, + "epoch": 0.31877645248172376, + "flos": 567213249024.0, + "grad_norm": 0.058346793379709355, + "language_loss": 0.85032123, + "learning_rate": 0.0007968086231961272, + "loss": 0.8612293, + "num_input_tokens_seen": 137195344, + "router_z_loss_mlp": 0.33642578, + "step": 1657, + "time_per_iteration": 2.652986526489258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094073, + "balance_loss_mlp": 1.0593828, + "epoch": 0.3189688341669873, + "flos": 489338205696.0, + "grad_norm": 0.08644842740903268, + "language_loss": 0.836254, + "learning_rate": 0.0007965578526965671, + "loss": 0.84719473, + "num_input_tokens_seen": 137261040, + "router_z_loss_mlp": 0.34741211, + "step": 1658, + "time_per_iteration": 2.5607872009277344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092531, + "balance_loss_mlp": 1.05860353, + "epoch": 0.3191612158522509, + "flos": 575948938752.0, + "grad_norm": 0.04707712809776705, + "language_loss": 0.86020696, + "learning_rate": 0.0007963069670630377, + "loss": 0.87113225, + "num_input_tokens_seen": 137334400, + "router_z_loss_mlp": 0.33959961, + "step": 1659, + "time_per_iteration": 2.7435247898101807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097687, + "balance_loss_mlp": 1.06447566, + "epoch": 0.3193535975375144, + "flos": 537867689472.0, + "grad_norm": 0.062321727217464123, + "language_loss": 0.87956834, + "learning_rate": 0.0007960559663929416, + "loss": 0.89054519, + "num_input_tokens_seen": 137405344, + "router_z_loss_mlp": 0.33227539, + "step": 1660, + "time_per_iteration": 2.6282846927642822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096682, + "balance_loss_mlp": 1.06265998, + "epoch": 0.319545979222778, + "flos": 733954245120.0, + "grad_norm": 0.07201541894751945, + "language_loss": 0.87465358, + "learning_rate": 0.0007958048507837259, + "loss": 0.88562042, + "num_input_tokens_seen": 137486016, + "router_z_loss_mlp": 0.34057617, + "step": 1661, + "time_per_iteration": 2.9250974655151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099132, + "balance_loss_mlp": 1.0647999, + "epoch": 0.31973836090804153, + "flos": 764136433152.0, + "grad_norm": 0.0721917610669121, + "language_loss": 0.87230003, + "learning_rate": 0.0007955536203328822, + "loss": 0.8832913, + "num_input_tokens_seen": 137562304, + "router_z_loss_mlp": 0.34375, + "step": 1662, + "time_per_iteration": 2.899735450744629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109485, + "balance_loss_mlp": 1.06109047, + "epoch": 0.3199307425933051, + "flos": 560252560896.0, + "grad_norm": 0.06666532975578916, + "language_loss": 0.83308822, + "learning_rate": 0.0007953022751379469, + "loss": 0.84403676, + "num_input_tokens_seen": 137639248, + "router_z_loss_mlp": 0.33789062, + "step": 1663, + "time_per_iteration": 2.7743418216705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093375, + "balance_loss_mlp": 1.05899549, + "epoch": 0.3201231242785687, + "flos": 751019751936.0, + "grad_norm": 0.058114271957014456, + "language_loss": 0.81677037, + "learning_rate": 0.000795050815296501, + "loss": 0.82770407, + "num_input_tokens_seen": 137718256, + "router_z_loss_mlp": 0.34399414, + "step": 1664, + "time_per_iteration": 2.9620323181152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091229, + "balance_loss_mlp": 1.05768323, + "epoch": 0.32031550596383224, + "flos": 496157709312.0, + "grad_norm": 0.061791342299560625, + "language_loss": 0.93274921, + "learning_rate": 0.0007947992409061695, + "loss": 0.94366151, + "num_input_tokens_seen": 137785216, + "router_z_loss_mlp": 0.33569336, + "step": 1665, + "time_per_iteration": 2.6097099781036377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083934, + "balance_loss_mlp": 1.05022132, + "epoch": 0.3205078876490958, + "flos": 731294562816.0, + "grad_norm": 0.05133774923717053, + "language_loss": 0.86471802, + "learning_rate": 0.0007945475520646226, + "loss": 0.8755573, + "num_input_tokens_seen": 137863424, + "router_z_loss_mlp": 0.33740234, + "step": 1666, + "time_per_iteration": 2.9224231243133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088823, + "balance_loss_mlp": 1.05487204, + "epoch": 0.32070026933435936, + "flos": 549177283584.0, + "grad_norm": 0.1345109768982335, + "language_loss": 0.8496111, + "learning_rate": 0.0007942957488695743, + "loss": 0.86049932, + "num_input_tokens_seen": 137930384, + "router_z_loss_mlp": 0.33959961, + "step": 1667, + "time_per_iteration": 2.6267666816711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086537, + "balance_loss_mlp": 1.05265749, + "epoch": 0.32089265101962294, + "flos": 744949536768.0, + "grad_norm": 0.061316479944915916, + "language_loss": 0.80963373, + "learning_rate": 0.0007940438314187833, + "loss": 0.82049918, + "num_input_tokens_seen": 138017200, + "router_z_loss_mlp": 0.33886719, + "step": 1668, + "time_per_iteration": 3.00421142578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089646, + "balance_loss_mlp": 1.05638647, + "epoch": 0.3210850327048865, + "flos": 493937395200.0, + "grad_norm": 0.05864654089818211, + "language_loss": 0.80047274, + "learning_rate": 0.0007937917998100529, + "loss": 0.81136918, + "num_input_tokens_seen": 138084048, + "router_z_loss_mlp": 0.33276367, + "step": 1669, + "time_per_iteration": 2.607917070388794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100772, + "balance_loss_mlp": 1.06610548, + "epoch": 0.32127741439015006, + "flos": 530383265280.0, + "grad_norm": 0.060159342011431034, + "language_loss": 0.78680766, + "learning_rate": 0.0007935396541412302, + "loss": 0.79781532, + "num_input_tokens_seen": 138153280, + "router_z_loss_mlp": 0.34692383, + "step": 1670, + "time_per_iteration": 2.6022346019744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108678, + "balance_loss_mlp": 1.07458389, + "epoch": 0.3214697960754136, + "flos": 500948955648.0, + "grad_norm": 0.07085567852213893, + "language_loss": 0.85879421, + "learning_rate": 0.0007932873945102068, + "loss": 0.86988097, + "num_input_tokens_seen": 138222320, + "router_z_loss_mlp": 0.34130859, + "step": 1671, + "time_per_iteration": 2.581815719604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120776, + "balance_loss_mlp": 1.10942781, + "epoch": 0.3216621777606772, + "flos": 1382579394048.0, + "grad_norm": 0.04969555951860313, + "language_loss": 0.75761777, + "learning_rate": 0.0007930350210149188, + "loss": 0.76882553, + "num_input_tokens_seen": 138449488, + "router_z_loss_mlp": 0.11328125, + "step": 1672, + "time_per_iteration": 4.821724891662598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106649, + "balance_loss_mlp": 1.07193518, + "epoch": 0.32185455944594077, + "flos": 571260999168.0, + "grad_norm": 0.05896773993357689, + "language_loss": 0.86527109, + "learning_rate": 0.0007927825337533461, + "loss": 0.87633765, + "num_input_tokens_seen": 138522496, + "router_z_loss_mlp": 0.34765625, + "step": 1673, + "time_per_iteration": 2.6640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105888, + "balance_loss_mlp": 1.07184219, + "epoch": 0.3220469411312043, + "flos": 543652715520.0, + "grad_norm": 0.06618360944756078, + "language_loss": 0.84761298, + "learning_rate": 0.0007925299328235131, + "loss": 0.85867184, + "num_input_tokens_seen": 138590096, + "router_z_loss_mlp": 0.34057617, + "step": 1674, + "time_per_iteration": 2.6524405479431152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102416, + "balance_loss_mlp": 1.06705832, + "epoch": 0.3222393228164679, + "flos": 490884834816.0, + "grad_norm": 0.05872681692102293, + "language_loss": 0.85148364, + "learning_rate": 0.000792277218323488, + "loss": 0.86250782, + "num_input_tokens_seen": 138658224, + "router_z_loss_mlp": 0.35424805, + "step": 1675, + "time_per_iteration": 2.557460069656372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100116, + "balance_loss_mlp": 1.06523526, + "epoch": 0.3224317045017314, + "flos": 490145720832.0, + "grad_norm": 0.05188137415598196, + "language_loss": 0.84647608, + "learning_rate": 0.0007920243903513833, + "loss": 0.85747719, + "num_input_tokens_seen": 138722864, + "router_z_loss_mlp": 0.34912109, + "step": 1676, + "time_per_iteration": 2.5208208560943604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092752, + "balance_loss_mlp": 1.05813313, + "epoch": 0.322624086186995, + "flos": 575505188352.0, + "grad_norm": 0.06429192544800656, + "language_loss": 0.83992624, + "learning_rate": 0.0007917714490053556, + "loss": 0.8508538, + "num_input_tokens_seen": 138791472, + "router_z_loss_mlp": 0.34667969, + "step": 1677, + "time_per_iteration": 2.653686761856079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109123, + "balance_loss_mlp": 1.05668271, + "epoch": 0.32281646787225854, + "flos": 628974305280.0, + "grad_norm": 0.048890211607645416, + "language_loss": 0.86094737, + "learning_rate": 0.0007915183943836055, + "loss": 0.87185967, + "num_input_tokens_seen": 138873424, + "router_z_loss_mlp": 0.34594727, + "step": 1678, + "time_per_iteration": 2.852612018585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083818, + "balance_loss_mlp": 1.04950905, + "epoch": 0.3230088495575221, + "flos": 781036024320.0, + "grad_norm": 0.05620908679364121, + "language_loss": 0.83880055, + "learning_rate": 0.0007912652265843773, + "loss": 0.8496387, + "num_input_tokens_seen": 138956880, + "router_z_loss_mlp": 0.34350586, + "step": 1679, + "time_per_iteration": 3.006805419921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077767, + "balance_loss_mlp": 1.0433867, + "epoch": 0.3232012312427857, + "flos": 535839432192.0, + "grad_norm": 0.04762836982551939, + "language_loss": 0.81587136, + "learning_rate": 0.0007910119457059597, + "loss": 0.82664907, + "num_input_tokens_seen": 139031296, + "router_z_loss_mlp": 0.34423828, + "step": 1680, + "time_per_iteration": 2.6930737495422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076411, + "balance_loss_mlp": 1.04129148, + "epoch": 0.32339361292804925, + "flos": 704515553280.0, + "grad_norm": 0.06110281418881611, + "language_loss": 0.80031025, + "learning_rate": 0.0007907585518466849, + "loss": 0.81107438, + "num_input_tokens_seen": 139109776, + "router_z_loss_mlp": 0.35180664, + "step": 1681, + "time_per_iteration": 2.940950870513916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081204, + "balance_loss_mlp": 1.04603744, + "epoch": 0.32358599461331283, + "flos": 452099377152.0, + "grad_norm": 0.0474614445796137, + "language_loss": 0.90124965, + "learning_rate": 0.000790505045104929, + "loss": 0.91206169, + "num_input_tokens_seen": 139174736, + "router_z_loss_mlp": 0.35205078, + "step": 1682, + "time_per_iteration": 2.4919469356536865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078264, + "balance_loss_mlp": 1.0435977, + "epoch": 0.32377837629857636, + "flos": 600597794304.0, + "grad_norm": 0.057051782898604, + "language_loss": 0.86989701, + "learning_rate": 0.0007902514255791125, + "loss": 0.88067961, + "num_input_tokens_seen": 139252064, + "router_z_loss_mlp": 0.34692383, + "step": 1683, + "time_per_iteration": 2.7545859813690186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108269, + "balance_loss_mlp": 1.04721308, + "epoch": 0.32397075798383995, + "flos": 807180636672.0, + "grad_norm": 0.05145240385219177, + "language_loss": 0.87981123, + "learning_rate": 0.0007899976933676986, + "loss": 0.89063811, + "num_input_tokens_seen": 139333328, + "router_z_loss_mlp": 0.35498047, + "step": 1684, + "time_per_iteration": 2.97807240486145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081365, + "balance_loss_mlp": 1.04638934, + "epoch": 0.3241631396691035, + "flos": 601414073856.0, + "grad_norm": 0.06429290680378846, + "language_loss": 0.8767072, + "learning_rate": 0.0007897438485691955, + "loss": 0.88752091, + "num_input_tokens_seen": 139400976, + "router_z_loss_mlp": 0.3503418, + "step": 1685, + "time_per_iteration": 2.704326868057251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083548, + "balance_loss_mlp": 1.04826176, + "epoch": 0.32435552135436707, + "flos": 473980861440.0, + "grad_norm": 0.058364951070402814, + "language_loss": 0.82023847, + "learning_rate": 0.0007894898912821542, + "loss": 0.831074, + "num_input_tokens_seen": 139465664, + "router_z_loss_mlp": 0.3527832, + "step": 1686, + "time_per_iteration": 2.5206680297851562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076257, + "balance_loss_mlp": 1.04178166, + "epoch": 0.3245479030396306, + "flos": 537824019456.0, + "grad_norm": 0.04476181031616706, + "language_loss": 0.86661267, + "learning_rate": 0.0007892358216051695, + "loss": 0.87737525, + "num_input_tokens_seen": 139541984, + "router_z_loss_mlp": 0.3449707, + "step": 1687, + "time_per_iteration": 2.7332026958465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075747, + "balance_loss_mlp": 1.04108071, + "epoch": 0.3247402847248942, + "flos": 547394927616.0, + "grad_norm": 0.05643246072623682, + "language_loss": 0.92275292, + "learning_rate": 0.0007889816396368803, + "loss": 0.93351042, + "num_input_tokens_seen": 139607408, + "router_z_loss_mlp": 0.34692383, + "step": 1688, + "time_per_iteration": 2.6158432960510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077832, + "balance_loss_mlp": 1.04388082, + "epoch": 0.3249326664101578, + "flos": 377941814784.0, + "grad_norm": 0.04953960067471088, + "language_loss": 0.85575634, + "learning_rate": 0.0007887273454759687, + "loss": 0.86653465, + "num_input_tokens_seen": 139670000, + "router_z_loss_mlp": 0.33984375, + "step": 1689, + "time_per_iteration": 2.4958317279815674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075156, + "balance_loss_mlp": 1.04051399, + "epoch": 0.3251250480954213, + "flos": 527818125312.0, + "grad_norm": 0.050587956688220255, + "language_loss": 0.82717729, + "learning_rate": 0.0007884729392211603, + "loss": 0.83792883, + "num_input_tokens_seen": 139739872, + "router_z_loss_mlp": 0.34692383, + "step": 1690, + "time_per_iteration": 2.6325736045837402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075634, + "balance_loss_mlp": 1.04113472, + "epoch": 0.3253174297806849, + "flos": 449435312640.0, + "grad_norm": 0.06211432544239721, + "language_loss": 0.85214412, + "learning_rate": 0.0007882184209712245, + "loss": 0.8629005, + "num_input_tokens_seen": 139802032, + "router_z_loss_mlp": 0.34545898, + "step": 1691, + "time_per_iteration": 2.5199012756347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076936, + "balance_loss_mlp": 1.04303288, + "epoch": 0.32550981146594843, + "flos": 703855014912.0, + "grad_norm": 0.0444021152083115, + "language_loss": 0.85646939, + "learning_rate": 0.000787963790824974, + "loss": 0.86723876, + "num_input_tokens_seen": 139885648, + "router_z_loss_mlp": 0.33935547, + "step": 1692, + "time_per_iteration": 2.9585483074188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076864, + "balance_loss_mlp": 1.04217362, + "epoch": 0.325702193151212, + "flos": 392491643904.0, + "grad_norm": 0.06035071191190156, + "language_loss": 0.89588344, + "learning_rate": 0.0007877090488812651, + "loss": 0.90665203, + "num_input_tokens_seen": 139947920, + "router_z_loss_mlp": 0.34716797, + "step": 1693, + "time_per_iteration": 2.4247992038726807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073167, + "balance_loss_mlp": 1.0379529, + "epoch": 0.32589457483647555, + "flos": 577223525376.0, + "grad_norm": 0.051335929306222446, + "language_loss": 0.83377099, + "learning_rate": 0.0007874541952389973, + "loss": 0.84450269, + "num_input_tokens_seen": 140020048, + "router_z_loss_mlp": 0.35253906, + "step": 1694, + "time_per_iteration": 2.679868459701538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074307, + "balance_loss_mlp": 1.03947401, + "epoch": 0.32608695652173914, + "flos": 498092834304.0, + "grad_norm": 0.051580366849716015, + "language_loss": 0.86795568, + "learning_rate": 0.0007871992299971136, + "loss": 0.87869877, + "num_input_tokens_seen": 140085600, + "router_z_loss_mlp": 0.34887695, + "step": 1695, + "time_per_iteration": 2.6005072593688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079265, + "balance_loss_mlp": 1.04431272, + "epoch": 0.32627933820700267, + "flos": 590858150400.0, + "grad_norm": 0.054409905067417906, + "language_loss": 0.84529006, + "learning_rate": 0.0007869441532546001, + "loss": 0.85608268, + "num_input_tokens_seen": 140155152, + "router_z_loss_mlp": 0.34985352, + "step": 1696, + "time_per_iteration": 2.7373292446136475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071229, + "balance_loss_mlp": 1.03749299, + "epoch": 0.32647171989226625, + "flos": 608790809088.0, + "grad_norm": 0.05196776598603691, + "language_loss": 0.79551816, + "learning_rate": 0.0007866889651104867, + "loss": 0.80623043, + "num_input_tokens_seen": 140228560, + "router_z_loss_mlp": 0.33764648, + "step": 1697, + "time_per_iteration": 2.768869638442993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069598, + "balance_loss_mlp": 1.03464603, + "epoch": 0.32666410157752984, + "flos": 476896619520.0, + "grad_norm": 0.05699082390473629, + "language_loss": 0.83313, + "learning_rate": 0.000786433665663846, + "loss": 0.84382606, + "num_input_tokens_seen": 140297952, + "router_z_loss_mlp": 0.34985352, + "step": 1698, + "time_per_iteration": 2.6574184894561768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070595, + "balance_loss_mlp": 1.03664398, + "epoch": 0.3268564832627934, + "flos": 718060018176.0, + "grad_norm": 0.0499104315286651, + "language_loss": 0.86441195, + "learning_rate": 0.0007861782550137942, + "loss": 0.8751179, + "num_input_tokens_seen": 140373408, + "router_z_loss_mlp": 0.33984375, + "step": 1699, + "time_per_iteration": 2.8897016048431396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071541, + "balance_loss_mlp": 1.0379957, + "epoch": 0.32704886494805696, + "flos": 768469372416.0, + "grad_norm": 0.05892131453680714, + "language_loss": 0.85990739, + "learning_rate": 0.0007859227332594901, + "loss": 0.87062275, + "num_input_tokens_seen": 140451840, + "router_z_loss_mlp": 0.33569336, + "step": 1700, + "time_per_iteration": 2.8941755294799805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080492, + "balance_loss_mlp": 1.046803, + "epoch": 0.3272412466333205, + "flos": 849540980736.0, + "grad_norm": 0.0647173620985618, + "language_loss": 0.84537613, + "learning_rate": 0.0007856671005001365, + "loss": 0.85618103, + "num_input_tokens_seen": 140537696, + "router_z_loss_mlp": 0.3371582, + "step": 1701, + "time_per_iteration": 3.1362555027008057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107778, + "balance_loss_mlp": 1.04373336, + "epoch": 0.3274336283185841, + "flos": 831224208384.0, + "grad_norm": 0.055785838120560656, + "language_loss": 0.81608075, + "learning_rate": 0.0007854113568349787, + "loss": 0.82685852, + "num_input_tokens_seen": 140623536, + "router_z_loss_mlp": 0.34082031, + "step": 1702, + "time_per_iteration": 3.0684425830841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108348, + "balance_loss_mlp": 1.04900455, + "epoch": 0.3276260100038476, + "flos": 691721938944.0, + "grad_norm": 0.059478075679183354, + "language_loss": 0.80008304, + "learning_rate": 0.0007851555023633052, + "loss": 0.81091785, + "num_input_tokens_seen": 140700688, + "router_z_loss_mlp": 0.34521484, + "step": 1703, + "time_per_iteration": 2.829838991165161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083551, + "balance_loss_mlp": 1.04974365, + "epoch": 0.3278183916891112, + "flos": 435831211008.0, + "grad_norm": 0.05938301584715095, + "language_loss": 0.82290888, + "learning_rate": 0.0007848995371844474, + "loss": 0.83374435, + "num_input_tokens_seen": 140765808, + "router_z_loss_mlp": 0.33837891, + "step": 1704, + "time_per_iteration": 2.498462200164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108751, + "balance_loss_mlp": 1.0532254, + "epoch": 0.3280107733743748, + "flos": 460883119104.0, + "grad_norm": 0.06015932024064871, + "language_loss": 0.80622214, + "learning_rate": 0.0007846434613977801, + "loss": 0.81709725, + "num_input_tokens_seen": 140830512, + "router_z_loss_mlp": 0.34326172, + "step": 1705, + "time_per_iteration": 2.4933369159698486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087045, + "balance_loss_mlp": 1.05330932, + "epoch": 0.3282031550596383, + "flos": 679018484736.0, + "grad_norm": 0.05558890685700398, + "language_loss": 0.78265798, + "learning_rate": 0.0007843872751027203, + "loss": 0.7935285, + "num_input_tokens_seen": 140902816, + "router_z_loss_mlp": 0.33764648, + "step": 1706, + "time_per_iteration": 2.81091046333313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090191, + "balance_loss_mlp": 1.05621648, + "epoch": 0.3283955367449019, + "flos": 544821023232.0, + "grad_norm": 0.10097233050810657, + "language_loss": 0.87312186, + "learning_rate": 0.0007841309783987287, + "loss": 0.88402379, + "num_input_tokens_seen": 140975488, + "router_z_loss_mlp": 0.34008789, + "step": 1707, + "time_per_iteration": 2.7456212043762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083118, + "balance_loss_mlp": 1.0490005, + "epoch": 0.32858791843016544, + "flos": 481017153024.0, + "grad_norm": 0.06288690811568091, + "language_loss": 0.89185357, + "learning_rate": 0.0007838745713853084, + "loss": 0.90268475, + "num_input_tokens_seen": 141043248, + "router_z_loss_mlp": 0.34155273, + "step": 1708, + "time_per_iteration": 2.5734565258026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086944, + "balance_loss_mlp": 1.0529933, + "epoch": 0.328780300115429, + "flos": 566529389568.0, + "grad_norm": 0.059735917623485235, + "language_loss": 0.84101981, + "learning_rate": 0.0007836180541620053, + "loss": 0.85188925, + "num_input_tokens_seen": 141119408, + "router_z_loss_mlp": 0.33984375, + "step": 1709, + "time_per_iteration": 2.6734848022460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082239, + "balance_loss_mlp": 1.04843152, + "epoch": 0.32897268180069256, + "flos": 475787948544.0, + "grad_norm": 0.06557165815913592, + "language_loss": 0.86666405, + "learning_rate": 0.0007833614268284082, + "loss": 0.87748647, + "num_input_tokens_seen": 141184112, + "router_z_loss_mlp": 0.33813477, + "step": 1710, + "time_per_iteration": 2.5004236698150635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109204, + "balance_loss_mlp": 1.07611382, + "epoch": 0.32916506348595614, + "flos": 1576517008896.0, + "grad_norm": 0.028486921929439343, + "language_loss": 0.74109769, + "learning_rate": 0.0007831046894841489, + "loss": 0.75201809, + "num_input_tokens_seen": 141414960, + "router_z_loss_mlp": 0.15917969, + "step": 1711, + "time_per_iteration": 4.857421159744263 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084065, + "balance_loss_mlp": 1.05011439, + "epoch": 0.3293574451712197, + "flos": 482646739968.0, + "grad_norm": 0.05383776069577274, + "language_loss": 0.78376174, + "learning_rate": 0.0007828478422289016, + "loss": 0.79460239, + "num_input_tokens_seen": 141485744, + "router_z_loss_mlp": 0.33984375, + "step": 1712, + "time_per_iteration": 2.5763661861419678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089524, + "balance_loss_mlp": 1.05438161, + "epoch": 0.32954982685648326, + "flos": 622266872832.0, + "grad_norm": 0.05220026625301518, + "language_loss": 0.89185119, + "learning_rate": 0.0007825908851623833, + "loss": 0.90274644, + "num_input_tokens_seen": 141560592, + "router_z_loss_mlp": 0.35205078, + "step": 1713, + "time_per_iteration": 2.7262768745422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081531, + "balance_loss_mlp": 1.04648352, + "epoch": 0.32974220854174685, + "flos": 544697367552.0, + "grad_norm": 0.06806070360888057, + "language_loss": 0.85360491, + "learning_rate": 0.0007823338183843533, + "loss": 0.86442018, + "num_input_tokens_seen": 141630400, + "router_z_loss_mlp": 0.35083008, + "step": 1714, + "time_per_iteration": 2.652278184890747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078166, + "balance_loss_mlp": 1.0447638, + "epoch": 0.3299345902270104, + "flos": 981740708352.0, + "grad_norm": 0.05603975865081876, + "language_loss": 0.8075726, + "learning_rate": 0.0007820766419946141, + "loss": 0.81835425, + "num_input_tokens_seen": 141721552, + "router_z_loss_mlp": 0.33422852, + "step": 1715, + "time_per_iteration": 3.282278537750244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087722, + "balance_loss_mlp": 1.07227242, + "epoch": 0.33012697191227397, + "flos": 1402857432576.0, + "grad_norm": 0.02753251532821737, + "language_loss": 0.7967248, + "learning_rate": 0.0007818193560930102, + "loss": 0.80760199, + "num_input_tokens_seen": 141956464, + "router_z_loss_mlp": 0.15429688, + "step": 1716, + "time_per_iteration": 4.925649881362915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081747, + "balance_loss_mlp": 1.04789162, + "epoch": 0.3303193535975375, + "flos": 504897781248.0, + "grad_norm": 0.09582479469105179, + "language_loss": 0.75968826, + "learning_rate": 0.0007815619607794288, + "loss": 0.77050573, + "num_input_tokens_seen": 142029552, + "router_z_loss_mlp": 0.33886719, + "step": 1717, + "time_per_iteration": 2.653355598449707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077451, + "balance_loss_mlp": 1.04423952, + "epoch": 0.3305117352828011, + "flos": 937602390528.0, + "grad_norm": 0.059316474336830904, + "language_loss": 0.8254683, + "learning_rate": 0.0007813044561538001, + "loss": 0.83624279, + "num_input_tokens_seen": 142117344, + "router_z_loss_mlp": 0.33227539, + "step": 1718, + "time_per_iteration": 3.1251535415649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084283, + "balance_loss_mlp": 1.05035567, + "epoch": 0.3307041169680646, + "flos": 721176597504.0, + "grad_norm": 0.08429030846847434, + "language_loss": 0.88411027, + "learning_rate": 0.0007810468423160958, + "loss": 0.89495313, + "num_input_tokens_seen": 142190096, + "router_z_loss_mlp": 0.33959961, + "step": 1719, + "time_per_iteration": 2.8598783016204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090007, + "balance_loss_mlp": 1.05760598, + "epoch": 0.3308964986533282, + "flos": 583315499520.0, + "grad_norm": 0.04547421634197757, + "language_loss": 0.81920642, + "learning_rate": 0.0007807891193663306, + "loss": 0.8301065, + "num_input_tokens_seen": 142265584, + "router_z_loss_mlp": 0.32397461, + "step": 1720, + "time_per_iteration": 2.7775802612304688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092448, + "balance_loss_mlp": 1.0582583, + "epoch": 0.33108888033859174, + "flos": 473340672000.0, + "grad_norm": 0.07591254280459368, + "language_loss": 0.82440275, + "learning_rate": 0.0007805312874045614, + "loss": 0.83532727, + "num_input_tokens_seen": 142330352, + "router_z_loss_mlp": 0.34228516, + "step": 1721, + "time_per_iteration": 2.5138351917266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100808, + "balance_loss_mlp": 1.06657076, + "epoch": 0.3312812620238553, + "flos": 385913659392.0, + "grad_norm": 0.08101052667778896, + "language_loss": 0.86919391, + "learning_rate": 0.0007802733465308874, + "loss": 0.88020205, + "num_input_tokens_seen": 142392208, + "router_z_loss_mlp": 0.34277344, + "step": 1722, + "time_per_iteration": 2.440809726715088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106056, + "balance_loss_mlp": 1.07074606, + "epoch": 0.3314736437091189, + "flos": 494292395520.0, + "grad_norm": 0.0567806329299034, + "language_loss": 0.8509872, + "learning_rate": 0.0007800152968454501, + "loss": 0.86204773, + "num_input_tokens_seen": 142462112, + "router_z_loss_mlp": 0.35375977, + "step": 1723, + "time_per_iteration": 2.61602520942688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090428, + "balance_loss_mlp": 1.056072, + "epoch": 0.33166602539438245, + "flos": 653346736128.0, + "grad_norm": 0.038882210578800376, + "language_loss": 0.90476918, + "learning_rate": 0.0007797571384484334, + "loss": 0.91567349, + "num_input_tokens_seen": 142539120, + "router_z_loss_mlp": 0.34399414, + "step": 1724, + "time_per_iteration": 2.857562780380249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090291, + "balance_loss_mlp": 1.05586314, + "epoch": 0.33185840707964603, + "flos": 520550489088.0, + "grad_norm": 0.04870849772261114, + "language_loss": 0.91599178, + "learning_rate": 0.0007794988714400633, + "loss": 0.92689478, + "num_input_tokens_seen": 142611520, + "router_z_loss_mlp": 0.34448242, + "step": 1725, + "time_per_iteration": 2.5884361267089844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097227, + "balance_loss_mlp": 1.06077266, + "epoch": 0.33205078876490957, + "flos": 436712919552.0, + "grad_norm": 0.05260760436426434, + "language_loss": 0.85199809, + "learning_rate": 0.0007792404959206079, + "loss": 0.86297035, + "num_input_tokens_seen": 142676064, + "router_z_loss_mlp": 0.36474609, + "step": 1726, + "time_per_iteration": 2.4855122566223145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095942, + "balance_loss_mlp": 1.05965447, + "epoch": 0.33224317045017315, + "flos": 768400971264.0, + "grad_norm": 0.052329818141719754, + "language_loss": 0.81527805, + "learning_rate": 0.0007789820119903774, + "loss": 0.82623744, + "num_input_tokens_seen": 142750944, + "router_z_loss_mlp": 0.36279297, + "step": 1727, + "time_per_iteration": 2.945405960083008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105991, + "balance_loss_mlp": 1.04579556, + "epoch": 0.3324355521354367, + "flos": 1465656090624.0, + "grad_norm": 0.02932642968329903, + "language_loss": 0.78492665, + "learning_rate": 0.0007787234197497242, + "loss": 0.79552573, + "num_input_tokens_seen": 142974032, + "router_z_loss_mlp": 0.14160156, + "step": 1728, + "time_per_iteration": 4.810296297073364 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084477, + "balance_loss_mlp": 1.04880977, + "epoch": 0.3326279338207003, + "flos": 496415195136.0, + "grad_norm": 0.05339720943334808, + "language_loss": 0.83919221, + "learning_rate": 0.0007784647192990428, + "loss": 0.85003698, + "num_input_tokens_seen": 143047280, + "router_z_loss_mlp": 0.35693359, + "step": 1729, + "time_per_iteration": 2.6848132610321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083753, + "balance_loss_mlp": 1.04844344, + "epoch": 0.33282031550596386, + "flos": 635600342016.0, + "grad_norm": 0.05212570885713578, + "language_loss": 0.80661625, + "learning_rate": 0.0007782059107387696, + "loss": 0.81745386, + "num_input_tokens_seen": 143124224, + "router_z_loss_mlp": 0.35351562, + "step": 1730, + "time_per_iteration": 2.874356269836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078294, + "balance_loss_mlp": 1.04329371, + "epoch": 0.3330126971912274, + "flos": 689210643456.0, + "grad_norm": 0.05636936917064103, + "language_loss": 0.88407743, + "learning_rate": 0.0007779469941693826, + "loss": 0.89486033, + "num_input_tokens_seen": 143194048, + "router_z_loss_mlp": 0.3503418, + "step": 1731, + "time_per_iteration": 2.7914862632751465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079351, + "balance_loss_mlp": 1.04511368, + "epoch": 0.333205078876491, + "flos": 566184563712.0, + "grad_norm": 0.05730145040609657, + "language_loss": 0.77017218, + "learning_rate": 0.0007776879696914029, + "loss": 0.78096569, + "num_input_tokens_seen": 143272976, + "router_z_loss_mlp": 0.34277344, + "step": 1732, + "time_per_iteration": 2.8158769607543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081, + "balance_loss_mlp": 1.04666734, + "epoch": 0.3333974605617545, + "flos": 640618550784.0, + "grad_norm": 0.044212495165629015, + "language_loss": 0.8903594, + "learning_rate": 0.000777428837405392, + "loss": 0.90116942, + "num_input_tokens_seen": 143346496, + "router_z_loss_mlp": 0.34375, + "step": 1733, + "time_per_iteration": 2.8417906761169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079717, + "balance_loss_mlp": 1.04536092, + "epoch": 0.3335898422470181, + "flos": 461597501952.0, + "grad_norm": 0.05109390766697835, + "language_loss": 0.87070495, + "learning_rate": 0.0007771695974119544, + "loss": 0.88150203, + "num_input_tokens_seen": 143410448, + "router_z_loss_mlp": 0.34399414, + "step": 1734, + "time_per_iteration": 2.4995057582855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078764, + "balance_loss_mlp": 1.04514742, + "epoch": 0.33378222393228163, + "flos": 852504791040.0, + "grad_norm": 0.05825672376588237, + "language_loss": 0.75576115, + "learning_rate": 0.0007769102498117359, + "loss": 0.76654887, + "num_input_tokens_seen": 143492416, + "router_z_loss_mlp": 0.33642578, + "step": 1735, + "time_per_iteration": 3.0868663787841797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083295, + "balance_loss_mlp": 1.04991651, + "epoch": 0.3339746056175452, + "flos": 954256080384.0, + "grad_norm": 0.05069255593645712, + "language_loss": 0.79858601, + "learning_rate": 0.000776650794705424, + "loss": 0.80941892, + "num_input_tokens_seen": 143590096, + "router_z_loss_mlp": 0.33398438, + "step": 1736, + "time_per_iteration": 3.2665328979492188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108809, + "balance_loss_mlp": 1.05387688, + "epoch": 0.33416698730280875, + "flos": 544559155200.0, + "grad_norm": 0.045819605067785145, + "language_loss": 0.82160866, + "learning_rate": 0.0007763912321937483, + "loss": 0.83248949, + "num_input_tokens_seen": 143663344, + "router_z_loss_mlp": 0.34228516, + "step": 1737, + "time_per_iteration": 2.677316665649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081816, + "balance_loss_mlp": 1.04798412, + "epoch": 0.33435936898807234, + "flos": 1013652817920.0, + "grad_norm": 0.053421386657792044, + "language_loss": 0.82471478, + "learning_rate": 0.0007761315623774799, + "loss": 0.8355329, + "num_input_tokens_seen": 143753072, + "router_z_loss_mlp": 0.33862305, + "step": 1738, + "time_per_iteration": 3.4182283878326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089571, + "balance_loss_mlp": 1.05554879, + "epoch": 0.3345517506733359, + "flos": 614935217664.0, + "grad_norm": 0.051536505858366714, + "language_loss": 0.87671852, + "learning_rate": 0.0007758717853574313, + "loss": 0.88761419, + "num_input_tokens_seen": 143827280, + "router_z_loss_mlp": 0.34057617, + "step": 1739, + "time_per_iteration": 2.7348380088806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084561, + "balance_loss_mlp": 1.05125391, + "epoch": 0.33474413235859946, + "flos": 494350622208.0, + "grad_norm": 0.06141180611747274, + "language_loss": 0.9002257, + "learning_rate": 0.0007756119012344571, + "loss": 0.91107136, + "num_input_tokens_seen": 143895072, + "router_z_loss_mlp": 0.33325195, + "step": 1740, + "time_per_iteration": 2.536121129989624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091135, + "balance_loss_mlp": 1.05754209, + "epoch": 0.33493651404386304, + "flos": 628105743360.0, + "grad_norm": 0.06662069566578578, + "language_loss": 0.84404671, + "learning_rate": 0.0007753519101094535, + "loss": 0.85495806, + "num_input_tokens_seen": 143965728, + "router_z_loss_mlp": 0.33618164, + "step": 1741, + "time_per_iteration": 2.753371238708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082833, + "balance_loss_mlp": 1.04945421, + "epoch": 0.3351288957291266, + "flos": 513474909696.0, + "grad_norm": 0.05750412427252262, + "language_loss": 0.86366677, + "learning_rate": 0.0007750918120833575, + "loss": 0.87449515, + "num_input_tokens_seen": 144030272, + "router_z_loss_mlp": 0.33398438, + "step": 1742, + "time_per_iteration": 2.56093168258667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082276, + "balance_loss_mlp": 1.05037546, + "epoch": 0.33532127741439016, + "flos": 647008860672.0, + "grad_norm": 0.0676260973392943, + "language_loss": 0.87342101, + "learning_rate": 0.0007748316072571485, + "loss": 0.88424373, + "num_input_tokens_seen": 144104048, + "router_z_loss_mlp": 0.31884766, + "step": 1743, + "time_per_iteration": 2.7759556770324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086123, + "balance_loss_mlp": 1.05193388, + "epoch": 0.3355136590996537, + "flos": 768134721024.0, + "grad_norm": 0.047185436483198326, + "language_loss": 0.79306734, + "learning_rate": 0.0007745712957318467, + "loss": 0.80392861, + "num_input_tokens_seen": 144180432, + "router_z_loss_mlp": 0.34228516, + "step": 1744, + "time_per_iteration": 2.959686756134033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085284, + "balance_loss_mlp": 1.05119014, + "epoch": 0.3357060407849173, + "flos": 595259490816.0, + "grad_norm": 0.046948111550021425, + "language_loss": 0.86506951, + "learning_rate": 0.0007743108776085141, + "loss": 0.87592232, + "num_input_tokens_seen": 144258704, + "router_z_loss_mlp": 0.34106445, + "step": 1745, + "time_per_iteration": 2.7391204833984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089169, + "balance_loss_mlp": 1.05462217, + "epoch": 0.3358984224701808, + "flos": 598288730112.0, + "grad_norm": 0.04983419543630797, + "language_loss": 0.82728243, + "learning_rate": 0.0007740503529882543, + "loss": 0.8381741, + "num_input_tokens_seen": 144335104, + "router_z_loss_mlp": 0.34594727, + "step": 1746, + "time_per_iteration": 2.788041114807129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108953, + "balance_loss_mlp": 1.05474496, + "epoch": 0.3360908041554444, + "flos": 578055771648.0, + "grad_norm": 0.05677254755827829, + "language_loss": 0.91252941, + "learning_rate": 0.0007737897219722114, + "loss": 0.92342472, + "num_input_tokens_seen": 144402912, + "router_z_loss_mlp": 0.34790039, + "step": 1747, + "time_per_iteration": 2.6752376556396484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083715, + "balance_loss_mlp": 1.04945374, + "epoch": 0.336283185840708, + "flos": 513332315136.0, + "grad_norm": 0.05427874766165502, + "language_loss": 0.81146061, + "learning_rate": 0.0007735289846615716, + "loss": 0.82229781, + "num_input_tokens_seen": 144475328, + "router_z_loss_mlp": 0.34301758, + "step": 1748, + "time_per_iteration": 2.670315742492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083181, + "balance_loss_mlp": 1.04984999, + "epoch": 0.3364755675259715, + "flos": 524716102656.0, + "grad_norm": 0.05445380235157479, + "language_loss": 0.81899059, + "learning_rate": 0.0007732681411575621, + "loss": 0.82982242, + "num_input_tokens_seen": 144548288, + "router_z_loss_mlp": 0.33349609, + "step": 1749, + "time_per_iteration": 2.644740104675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079502, + "balance_loss_mlp": 1.04567027, + "epoch": 0.3366679492112351, + "flos": 554594162688.0, + "grad_norm": 0.05291201013717534, + "language_loss": 0.87517959, + "learning_rate": 0.0007730071915614514, + "loss": 0.88597459, + "num_input_tokens_seen": 144619488, + "router_z_loss_mlp": 0.33862305, + "step": 1750, + "time_per_iteration": 2.6605777740478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082694, + "balance_loss_mlp": 1.04874277, + "epoch": 0.33686033089649864, + "flos": 427051851264.0, + "grad_norm": 0.07867660779661921, + "language_loss": 0.88976741, + "learning_rate": 0.0007727461359745489, + "loss": 0.90059435, + "num_input_tokens_seen": 144682560, + "router_z_loss_mlp": 0.33984375, + "step": 1751, + "time_per_iteration": 2.4562768936157227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082471, + "balance_loss_mlp": 1.04987907, + "epoch": 0.3370527125817622, + "flos": 541452750336.0, + "grad_norm": 0.05472309390748721, + "language_loss": 0.86156446, + "learning_rate": 0.0007724849744982056, + "loss": 0.87238914, + "num_input_tokens_seen": 144753328, + "router_z_loss_mlp": 0.32592773, + "step": 1752, + "time_per_iteration": 2.683575391769409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086284, + "balance_loss_mlp": 1.05295336, + "epoch": 0.33724509426702576, + "flos": 541836864000.0, + "grad_norm": 0.052181206472060114, + "language_loss": 0.81578106, + "learning_rate": 0.0007722237072338131, + "loss": 0.82664388, + "num_input_tokens_seen": 144827312, + "router_z_loss_mlp": 0.33349609, + "step": 1753, + "time_per_iteration": 2.7059788703918457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108949, + "balance_loss_mlp": 1.05563486, + "epoch": 0.33743747595228935, + "flos": 472557888000.0, + "grad_norm": 0.063588606701447, + "language_loss": 0.85402888, + "learning_rate": 0.0007719623342828046, + "loss": 0.86492383, + "num_input_tokens_seen": 144893488, + "router_z_loss_mlp": 0.33886719, + "step": 1754, + "time_per_iteration": 2.5117459297180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090728, + "balance_loss_mlp": 1.05708706, + "epoch": 0.33762985763755293, + "flos": 469564964352.0, + "grad_norm": 0.05602573096673387, + "language_loss": 0.84115714, + "learning_rate": 0.000771700855746654, + "loss": 0.85206437, + "num_input_tokens_seen": 144961152, + "router_z_loss_mlp": 0.33666992, + "step": 1755, + "time_per_iteration": 2.5685064792633057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085423, + "balance_loss_mlp": 1.05214, + "epoch": 0.33782223932281646, + "flos": 492002270208.0, + "grad_norm": 0.05352941428578995, + "language_loss": 0.88329422, + "learning_rate": 0.0007714392717268763, + "loss": 0.89414847, + "num_input_tokens_seen": 145030576, + "router_z_loss_mlp": 0.33300781, + "step": 1756, + "time_per_iteration": 2.568432569503784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084365, + "balance_loss_mlp": 1.04981852, + "epoch": 0.33801462100808005, + "flos": 464827562496.0, + "grad_norm": 0.051807056092833426, + "language_loss": 0.86368155, + "learning_rate": 0.0007711775823250273, + "loss": 0.87452519, + "num_input_tokens_seen": 145095648, + "router_z_loss_mlp": 0.34594727, + "step": 1757, + "time_per_iteration": 2.5542263984680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010902, + "balance_loss_mlp": 1.05660665, + "epoch": 0.3382070026933436, + "flos": 795319603200.0, + "grad_norm": 0.05510084593487172, + "language_loss": 0.83019066, + "learning_rate": 0.0007709157876427039, + "loss": 0.84109271, + "num_input_tokens_seen": 145181248, + "router_z_loss_mlp": 0.33618164, + "step": 1758, + "time_per_iteration": 3.07852840423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082149, + "balance_loss_mlp": 1.04903245, + "epoch": 0.33839938437860717, + "flos": 508181686272.0, + "grad_norm": 0.0524958838474987, + "language_loss": 0.85602981, + "learning_rate": 0.0007706538877815439, + "loss": 0.86685127, + "num_input_tokens_seen": 145252944, + "router_z_loss_mlp": 0.33129883, + "step": 1759, + "time_per_iteration": 2.6002085208892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082716, + "balance_loss_mlp": 1.05021918, + "epoch": 0.3385917660638707, + "flos": 483986755584.0, + "grad_norm": 0.05079207863068971, + "language_loss": 0.83150595, + "learning_rate": 0.0007703918828432259, + "loss": 0.84233308, + "num_input_tokens_seen": 145323168, + "router_z_loss_mlp": 0.32495117, + "step": 1760, + "time_per_iteration": 2.5961215496063232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086286, + "balance_loss_mlp": 1.05297899, + "epoch": 0.3387841477491343, + "flos": 545071306752.0, + "grad_norm": 0.0542286668270813, + "language_loss": 0.89021361, + "learning_rate": 0.000770129772929469, + "loss": 0.9010765, + "num_input_tokens_seen": 145395776, + "router_z_loss_mlp": 0.33325195, + "step": 1761, + "time_per_iteration": 2.6393394470214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076289, + "balance_loss_mlp": 1.04264784, + "epoch": 0.3389765294343978, + "flos": 719487373824.0, + "grad_norm": 0.057381721603975526, + "language_loss": 0.88803625, + "learning_rate": 0.0007698675581420334, + "loss": 0.89879912, + "num_input_tokens_seen": 145470576, + "router_z_loss_mlp": 0.33666992, + "step": 1762, + "time_per_iteration": 2.8959014415740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073065, + "balance_loss_mlp": 1.03968656, + "epoch": 0.3391689111196614, + "flos": 699596269056.0, + "grad_norm": 0.05381480837735757, + "language_loss": 0.78922743, + "learning_rate": 0.0007696052385827199, + "loss": 0.79995811, + "num_input_tokens_seen": 145548896, + "router_z_loss_mlp": 0.33398438, + "step": 1763, + "time_per_iteration": 2.9110584259033203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068767, + "balance_loss_mlp": 1.03519773, + "epoch": 0.339361292804925, + "flos": 626806425600.0, + "grad_norm": 0.05521588721088573, + "language_loss": 0.78407156, + "learning_rate": 0.00076934281435337, + "loss": 0.79475927, + "num_input_tokens_seen": 145617136, + "router_z_loss_mlp": 0.33569336, + "step": 1764, + "time_per_iteration": 2.7673115730285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073935, + "balance_loss_mlp": 1.04043674, + "epoch": 0.33955367449018853, + "flos": 609302960640.0, + "grad_norm": 0.0615155635578628, + "language_loss": 0.85995364, + "learning_rate": 0.0007690802855558658, + "loss": 0.87069303, + "num_input_tokens_seen": 145696416, + "router_z_loss_mlp": 0.33520508, + "step": 1765, + "time_per_iteration": 2.871255397796631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083714, + "balance_loss_mlp": 1.07131636, + "epoch": 0.3397460561754521, + "flos": 1452494177280.0, + "grad_norm": 0.03113174858532202, + "language_loss": 0.76374954, + "learning_rate": 0.0007688176522921302, + "loss": 0.77458668, + "num_input_tokens_seen": 145919680, + "router_z_loss_mlp": 0.12402344, + "step": 1766, + "time_per_iteration": 4.906975746154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089831, + "balance_loss_mlp": 1.05485463, + "epoch": 0.33993843786071565, + "flos": 487068429312.0, + "grad_norm": 0.059784397062932884, + "language_loss": 0.89060128, + "learning_rate": 0.0007685549146641262, + "loss": 0.90149957, + "num_input_tokens_seen": 145984272, + "router_z_loss_mlp": 0.35009766, + "step": 1767, + "time_per_iteration": 2.5172948837280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081834, + "balance_loss_mlp": 1.04683375, + "epoch": 0.34013081954597923, + "flos": 417115768320.0, + "grad_norm": 0.05470212710373979, + "language_loss": 0.88085568, + "learning_rate": 0.0007682920727738579, + "loss": 0.89167398, + "num_input_tokens_seen": 146047248, + "router_z_loss_mlp": 0.35058594, + "step": 1768, + "time_per_iteration": 2.4423539638519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084324, + "balance_loss_mlp": 1.04939604, + "epoch": 0.34032320123124277, + "flos": 437293472256.0, + "grad_norm": 0.06228189549734304, + "language_loss": 0.84428132, + "learning_rate": 0.000768029126723369, + "loss": 0.85512453, + "num_input_tokens_seen": 146111872, + "router_z_loss_mlp": 0.34960938, + "step": 1769, + "time_per_iteration": 2.5054280757904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082733, + "balance_loss_mlp": 1.04811513, + "epoch": 0.34051558291650635, + "flos": 457353312768.0, + "grad_norm": 0.058774755629116764, + "language_loss": 0.81489038, + "learning_rate": 0.0007677660766147447, + "loss": 0.82571769, + "num_input_tokens_seen": 146172608, + "router_z_loss_mlp": 0.34667969, + "step": 1770, + "time_per_iteration": 2.524327039718628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01030223, + "balance_loss_mlp": 1.01844561, + "epoch": 0.3407079646017699, + "flos": 1558029938688.0, + "grad_norm": 0.017684799672329513, + "language_loss": 0.72470945, + "learning_rate": 0.0007675029225501102, + "loss": 0.7350117, + "num_input_tokens_seen": 146413584, + "router_z_loss_mlp": 0.11767578, + "step": 1771, + "time_per_iteration": 4.924427032470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081114, + "balance_loss_mlp": 1.04666233, + "epoch": 0.3409003462870335, + "flos": 492312190464.0, + "grad_norm": 0.06375677891517043, + "language_loss": 0.79619604, + "learning_rate": 0.0007672396646316306, + "loss": 0.80700719, + "num_input_tokens_seen": 146476992, + "router_z_loss_mlp": 0.3449707, + "step": 1772, + "time_per_iteration": 2.5239012241363525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081305, + "balance_loss_mlp": 1.04589987, + "epoch": 0.34109272797229706, + "flos": 808145303040.0, + "grad_norm": 0.06003817873980187, + "language_loss": 0.80518734, + "learning_rate": 0.000766976302961512, + "loss": 0.81600046, + "num_input_tokens_seen": 146552848, + "router_z_loss_mlp": 0.35424805, + "step": 1773, + "time_per_iteration": 2.9730074405670166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083157, + "balance_loss_mlp": 1.04834807, + "epoch": 0.3412851096575606, + "flos": 469903997952.0, + "grad_norm": 0.05958263274361502, + "language_loss": 0.81420594, + "learning_rate": 0.0007667128376420003, + "loss": 0.82503754, + "num_input_tokens_seen": 146617504, + "router_z_loss_mlp": 0.34863281, + "step": 1774, + "time_per_iteration": 2.521329879760742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073378, + "balance_loss_mlp": 1.03842556, + "epoch": 0.3414774913428242, + "flos": 595402085376.0, + "grad_norm": 0.09709010294240925, + "language_loss": 0.84563607, + "learning_rate": 0.0007664492687753817, + "loss": 0.85636985, + "num_input_tokens_seen": 146691568, + "router_z_loss_mlp": 0.34985352, + "step": 1775, + "time_per_iteration": 2.6744766235351562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072901, + "balance_loss_mlp": 1.03976059, + "epoch": 0.3416698730280877, + "flos": 527202667008.0, + "grad_norm": 0.05030413358353647, + "language_loss": 0.81513566, + "learning_rate": 0.000766185596463983, + "loss": 0.82586467, + "num_input_tokens_seen": 146764208, + "router_z_loss_mlp": 0.33154297, + "step": 1776, + "time_per_iteration": 2.6050221920013428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073672, + "balance_loss_mlp": 1.03962612, + "epoch": 0.3418622547133513, + "flos": 874272794112.0, + "grad_norm": 0.05039515754698922, + "language_loss": 0.76683038, + "learning_rate": 0.0007659218208101706, + "loss": 0.77756709, + "num_input_tokens_seen": 146847744, + "router_z_loss_mlp": 0.34082031, + "step": 1777, + "time_per_iteration": 3.0900516510009766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079189, + "balance_loss_mlp": 1.04504752, + "epoch": 0.34205463639861483, + "flos": 603462680064.0, + "grad_norm": 0.04915159817243754, + "language_loss": 0.84680861, + "learning_rate": 0.0007656579419163515, + "loss": 0.85760045, + "num_input_tokens_seen": 146918336, + "router_z_loss_mlp": 0.34179688, + "step": 1778, + "time_per_iteration": 2.7328884601593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081812, + "balance_loss_mlp": 1.04774237, + "epoch": 0.3422470180838784, + "flos": 463547183616.0, + "grad_norm": 0.05230649511498847, + "language_loss": 0.76939148, + "learning_rate": 0.0007653939598849724, + "loss": 0.7802096, + "num_input_tokens_seen": 146982496, + "router_z_loss_mlp": 0.34106445, + "step": 1779, + "time_per_iteration": 2.5020573139190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102719, + "balance_loss_mlp": 1.01441097, + "epoch": 0.34243939976914195, + "flos": 1585584377856.0, + "grad_norm": 0.019842751190116498, + "language_loss": 0.82880205, + "learning_rate": 0.0007651298748185204, + "loss": 0.83907396, + "num_input_tokens_seen": 147213600, + "router_z_loss_mlp": 0.12792969, + "step": 1780, + "time_per_iteration": 4.919352054595947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107992, + "balance_loss_mlp": 1.04656482, + "epoch": 0.34263178145440554, + "flos": 872662146048.0, + "grad_norm": 0.0514393238831889, + "language_loss": 0.80344206, + "learning_rate": 0.000764865686819522, + "loss": 0.81424129, + "num_input_tokens_seen": 147287664, + "router_z_loss_mlp": 0.33374023, + "step": 1781, + "time_per_iteration": 3.059682846069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089318, + "balance_loss_mlp": 1.05546236, + "epoch": 0.3428241631396691, + "flos": 506630674944.0, + "grad_norm": 0.04318417455303755, + "language_loss": 0.85701579, + "learning_rate": 0.0007646013959905449, + "loss": 0.86790895, + "num_input_tokens_seen": 147356800, + "router_z_loss_mlp": 0.33886719, + "step": 1782, + "time_per_iteration": 2.572772741317749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085097, + "balance_loss_mlp": 1.05162311, + "epoch": 0.34301654482493266, + "flos": 879669324288.0, + "grad_norm": 0.05640606275212692, + "language_loss": 0.80626374, + "learning_rate": 0.0007643370024341949, + "loss": 0.81711471, + "num_input_tokens_seen": 147432496, + "router_z_loss_mlp": 0.33496094, + "step": 1783, + "time_per_iteration": 3.0805578231811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089012, + "balance_loss_mlp": 1.05472708, + "epoch": 0.34320892651019624, + "flos": 431537559552.0, + "grad_norm": 0.05116039291223259, + "language_loss": 0.82947731, + "learning_rate": 0.0007640725062531195, + "loss": 0.84036732, + "num_input_tokens_seen": 147495856, + "router_z_loss_mlp": 0.34326172, + "step": 1784, + "time_per_iteration": 2.497859239578247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083577, + "balance_loss_mlp": 1.05010295, + "epoch": 0.3434013081954598, + "flos": 463404589056.0, + "grad_norm": 0.06763804466989645, + "language_loss": 0.86272931, + "learning_rate": 0.0007638079075500047, + "loss": 0.87356508, + "num_input_tokens_seen": 147559632, + "router_z_loss_mlp": 0.33496094, + "step": 1785, + "time_per_iteration": 2.516842842102051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017655, + "balance_loss_mlp": 1.0058769, + "epoch": 0.34359368988072336, + "flos": 1556499276288.0, + "grad_norm": 0.01279941843938601, + "language_loss": 0.75180668, + "learning_rate": 0.0007635432064275772, + "loss": 0.76198322, + "num_input_tokens_seen": 147794576, + "router_z_loss_mlp": 0.11767578, + "step": 1786, + "time_per_iteration": 4.979317665100098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077073, + "balance_loss_mlp": 1.04417157, + "epoch": 0.3437860715659869, + "flos": 495267236352.0, + "grad_norm": 0.04590480874587016, + "language_loss": 0.83035767, + "learning_rate": 0.0007632784029886026, + "loss": 0.84112841, + "num_input_tokens_seen": 147866960, + "router_z_loss_mlp": 0.32910156, + "step": 1787, + "time_per_iteration": 2.6075103282928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079505, + "balance_loss_mlp": 1.04617453, + "epoch": 0.3439784532512505, + "flos": 717942154752.0, + "grad_norm": 0.04559278353066439, + "language_loss": 0.85611933, + "learning_rate": 0.0007630134973358873, + "loss": 0.86691439, + "num_input_tokens_seen": 147947808, + "router_z_loss_mlp": 0.33349609, + "step": 1788, + "time_per_iteration": 2.917405366897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086657, + "balance_loss_mlp": 1.05327868, + "epoch": 0.34417083493651407, + "flos": 565598218752.0, + "grad_norm": 0.05301353071730806, + "language_loss": 0.86864436, + "learning_rate": 0.0007627484895722763, + "loss": 0.879511, + "num_input_tokens_seen": 148015936, + "router_z_loss_mlp": 0.33398438, + "step": 1789, + "time_per_iteration": 2.6353273391723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081939, + "balance_loss_mlp": 1.04834569, + "epoch": 0.3443632166217776, + "flos": 795988905984.0, + "grad_norm": 0.057022653970397005, + "language_loss": 0.80155563, + "learning_rate": 0.0007624833798006552, + "loss": 0.81237495, + "num_input_tokens_seen": 148099776, + "router_z_loss_mlp": 0.3359375, + "step": 1790, + "time_per_iteration": 3.039126396179199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090848, + "balance_loss_mlp": 1.05692101, + "epoch": 0.3445555983070412, + "flos": 569045067264.0, + "grad_norm": 0.05940117534987587, + "language_loss": 0.84113955, + "learning_rate": 0.0007622181681239483, + "loss": 0.85204804, + "num_input_tokens_seen": 148169616, + "router_z_loss_mlp": 0.33935547, + "step": 1791, + "time_per_iteration": 2.6392083168029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083434, + "balance_loss_mlp": 1.04903054, + "epoch": 0.3447479799923047, + "flos": 568524151296.0, + "grad_norm": 0.04492792711883196, + "language_loss": 0.84501636, + "learning_rate": 0.0007619528546451202, + "loss": 0.8558507, + "num_input_tokens_seen": 148247824, + "router_z_loss_mlp": 0.34448242, + "step": 1792, + "time_per_iteration": 2.776982069015503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080197, + "balance_loss_mlp": 1.04708052, + "epoch": 0.3449403616775683, + "flos": 967323299328.0, + "grad_norm": 0.05878857203246004, + "language_loss": 0.8358798, + "learning_rate": 0.0007616874394671745, + "loss": 0.84668171, + "num_input_tokens_seen": 148333040, + "router_z_loss_mlp": 0.33129883, + "step": 1793, + "time_per_iteration": 3.3427693843841553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074615, + "balance_loss_mlp": 1.04128361, + "epoch": 0.34513274336283184, + "flos": 568340858880.0, + "grad_norm": 0.05893035372227358, + "language_loss": 0.84961653, + "learning_rate": 0.0007614219226931547, + "loss": 0.86036265, + "num_input_tokens_seen": 148401840, + "router_z_loss_mlp": 0.33349609, + "step": 1794, + "time_per_iteration": 2.6591315269470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070169, + "balance_loss_mlp": 1.03783977, + "epoch": 0.3453251250480954, + "flos": 460715793408.0, + "grad_norm": 0.06432823181520617, + "language_loss": 0.84724808, + "learning_rate": 0.0007611563044261435, + "loss": 0.85794979, + "num_input_tokens_seen": 148466576, + "router_z_loss_mlp": 0.32324219, + "step": 1795, + "time_per_iteration": 2.51755690574646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078106, + "balance_loss_mlp": 1.0443697, + "epoch": 0.34551750673335896, + "flos": 415397431296.0, + "grad_norm": 0.0640589434438139, + "language_loss": 0.87120652, + "learning_rate": 0.0007608905847692631, + "loss": 0.88198757, + "num_input_tokens_seen": 148530016, + "router_z_loss_mlp": 0.33764648, + "step": 1796, + "time_per_iteration": 2.47190260887146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074227, + "balance_loss_mlp": 1.04103947, + "epoch": 0.34570988841862255, + "flos": 587540749824.0, + "grad_norm": 0.04642061059617041, + "language_loss": 0.86689866, + "learning_rate": 0.0007606247638256749, + "loss": 0.8776409, + "num_input_tokens_seen": 148610064, + "router_z_loss_mlp": 0.33203125, + "step": 1797, + "time_per_iteration": 2.956444025039673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094984, + "balance_loss_mlp": 1.08373046, + "epoch": 0.34590227010388613, + "flos": 1566835439616.0, + "grad_norm": 0.041839887126655914, + "language_loss": 0.78170294, + "learning_rate": 0.0007603588416985798, + "loss": 0.79265279, + "num_input_tokens_seen": 148835872, + "router_z_loss_mlp": 0.11230469, + "step": 1798, + "time_per_iteration": 4.9134039878845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064584, + "balance_loss_mlp": 1.05352104, + "epoch": 0.34609465178914967, + "flos": 1536950177280.0, + "grad_norm": 0.029939636480755576, + "language_loss": 0.79327202, + "learning_rate": 0.0007600928184912179, + "loss": 0.80391788, + "num_input_tokens_seen": 149066864, + "router_z_loss_mlp": 0.11083984, + "step": 1799, + "time_per_iteration": 4.743322849273682 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083924, + "balance_loss_mlp": 1.05054486, + "epoch": 0.34628703347441325, + "flos": 609075998208.0, + "grad_norm": 0.0564087129204809, + "language_loss": 0.85731971, + "learning_rate": 0.0007598266943068686, + "loss": 0.86815894, + "num_input_tokens_seen": 149141600, + "router_z_loss_mlp": 0.33398438, + "step": 1800, + "time_per_iteration": 2.7374043464660645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077797, + "balance_loss_mlp": 1.04603946, + "epoch": 0.3464794151596768, + "flos": 473084596224.0, + "grad_norm": 0.06346922489791823, + "language_loss": 0.83911705, + "learning_rate": 0.0007595604692488507, + "loss": 0.849895, + "num_input_tokens_seen": 149205888, + "router_z_loss_mlp": 0.31738281, + "step": 1801, + "time_per_iteration": 2.5296249389648438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074116, + "balance_loss_mlp": 1.04147625, + "epoch": 0.34667179684494037, + "flos": 605397805056.0, + "grad_norm": 0.05750507090521113, + "language_loss": 0.83014846, + "learning_rate": 0.0007592941434205215, + "loss": 0.84088963, + "num_input_tokens_seen": 149281280, + "router_z_loss_mlp": 0.32641602, + "step": 1802, + "time_per_iteration": 2.758260488510132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017015, + "balance_loss_mlp": 1.00628662, + "epoch": 0.3468641785302039, + "flos": 1564053511680.0, + "grad_norm": 0.014489769518708178, + "language_loss": 0.73571062, + "learning_rate": 0.0007590277169252782, + "loss": 0.74588072, + "num_input_tokens_seen": 149525008, + "router_z_loss_mlp": 0.10742188, + "step": 1803, + "time_per_iteration": 5.078529119491577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069541, + "balance_loss_mlp": 1.0363059, + "epoch": 0.3470565602154675, + "flos": 906902258688.0, + "grad_norm": 0.0666829693597375, + "language_loss": 0.79937375, + "learning_rate": 0.0007587611898665566, + "loss": 0.81006914, + "num_input_tokens_seen": 149600624, + "router_z_loss_mlp": 0.33251953, + "step": 1804, + "time_per_iteration": 3.05087947845459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078664, + "balance_loss_mlp": 1.04621565, + "epoch": 0.347248941900731, + "flos": 638613614592.0, + "grad_norm": 0.050247612363814816, + "language_loss": 0.8218019, + "learning_rate": 0.0007584945623478315, + "loss": 0.83258855, + "num_input_tokens_seen": 149674224, + "router_z_loss_mlp": 0.32446289, + "step": 1805, + "time_per_iteration": 2.8188822269439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071735, + "balance_loss_mlp": 1.03940511, + "epoch": 0.3474413235859946, + "flos": 847009336320.0, + "grad_norm": 0.06830759319763476, + "language_loss": 0.81376302, + "learning_rate": 0.000758227834472617, + "loss": 0.82448041, + "num_input_tokens_seen": 149758688, + "router_z_loss_mlp": 0.32324219, + "step": 1806, + "time_per_iteration": 3.049736976623535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080854, + "balance_loss_mlp": 1.04771423, + "epoch": 0.3476337052712582, + "flos": 515395478016.0, + "grad_norm": 0.0580200838122141, + "language_loss": 0.77365351, + "learning_rate": 0.0007579610063444664, + "loss": 0.78446203, + "num_input_tokens_seen": 149831648, + "router_z_loss_mlp": 0.33154297, + "step": 1807, + "time_per_iteration": 2.768986701965332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072697, + "balance_loss_mlp": 1.03993857, + "epoch": 0.34782608695652173, + "flos": 913161558528.0, + "grad_norm": 0.05804810611861273, + "language_loss": 0.8735044, + "learning_rate": 0.0007576940780669712, + "loss": 0.88423139, + "num_input_tokens_seen": 149919440, + "router_z_loss_mlp": 0.32763672, + "step": 1808, + "time_per_iteration": 3.200984477996826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073414, + "balance_loss_mlp": 1.04041636, + "epoch": 0.3480184686417853, + "flos": 773374099968.0, + "grad_norm": 0.05336970886803796, + "language_loss": 0.84611619, + "learning_rate": 0.0007574270497437624, + "loss": 0.85685027, + "num_input_tokens_seen": 150001632, + "router_z_loss_mlp": 0.33007812, + "step": 1809, + "time_per_iteration": 2.9432260990142822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069124, + "balance_loss_mlp": 1.03619814, + "epoch": 0.34821085032704885, + "flos": 576549840384.0, + "grad_norm": 0.04930975616190813, + "language_loss": 0.87883413, + "learning_rate": 0.000757159921478509, + "loss": 0.88952535, + "num_input_tokens_seen": 150077552, + "router_z_loss_mlp": 0.3293457, + "step": 1810, + "time_per_iteration": 2.769669771194458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079936, + "balance_loss_mlp": 1.06887364, + "epoch": 0.34840323201231244, + "flos": 1524176911872.0, + "grad_norm": 0.03214902088053901, + "language_loss": 0.74450636, + "learning_rate": 0.0007568926933749201, + "loss": 0.75530577, + "num_input_tokens_seen": 150295328, + "router_z_loss_mlp": 0.11083984, + "step": 1811, + "time_per_iteration": 4.764174222946167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107769, + "balance_loss_mlp": 1.04469275, + "epoch": 0.34859561369757597, + "flos": 508910625792.0, + "grad_norm": 0.059132347701423886, + "language_loss": 0.87255216, + "learning_rate": 0.0007566253655367423, + "loss": 0.88332909, + "num_input_tokens_seen": 150360496, + "router_z_loss_mlp": 0.33007812, + "step": 1812, + "time_per_iteration": 2.578930616378784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073116, + "balance_loss_mlp": 1.04014218, + "epoch": 0.34878799538283956, + "flos": 548390117376.0, + "grad_norm": 0.051501554075800156, + "language_loss": 0.89574003, + "learning_rate": 0.000756357938067762, + "loss": 0.90647119, + "num_input_tokens_seen": 150432064, + "router_z_loss_mlp": 0.32983398, + "step": 1813, + "time_per_iteration": 2.6508982181549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079394, + "balance_loss_mlp": 1.04673076, + "epoch": 0.34898037706810314, + "flos": 983251021824.0, + "grad_norm": 0.051360492330316726, + "language_loss": 0.82609868, + "learning_rate": 0.0007560904110718033, + "loss": 0.8368926, + "num_input_tokens_seen": 150512176, + "router_z_loss_mlp": 0.32666016, + "step": 1814, + "time_per_iteration": 3.236894130706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075398, + "balance_loss_mlp": 1.04185271, + "epoch": 0.3491727587533667, + "flos": 681298435584.0, + "grad_norm": 0.05446392192761228, + "language_loss": 0.83478653, + "learning_rate": 0.0007558227846527297, + "loss": 0.84554052, + "num_input_tokens_seen": 150586416, + "router_z_loss_mlp": 0.33569336, + "step": 1815, + "time_per_iteration": 2.8674044609069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074987, + "balance_loss_mlp": 1.04175162, + "epoch": 0.34936514043863026, + "flos": 393811310592.0, + "grad_norm": 0.0691488486506015, + "language_loss": 0.83195454, + "learning_rate": 0.0007555550589144429, + "loss": 0.84270442, + "num_input_tokens_seen": 150648944, + "router_z_loss_mlp": 0.33251953, + "step": 1816, + "time_per_iteration": 2.421494722366333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071292, + "balance_loss_mlp": 1.03917694, + "epoch": 0.3495575221238938, + "flos": 461120256000.0, + "grad_norm": 0.07868701205222765, + "language_loss": 0.8463372, + "learning_rate": 0.000755287233960883, + "loss": 0.85705012, + "num_input_tokens_seen": 150717200, + "router_z_loss_mlp": 0.32104492, + "step": 1817, + "time_per_iteration": 2.54315185546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072544, + "balance_loss_mlp": 1.04023862, + "epoch": 0.3497499038091574, + "flos": 723859600896.0, + "grad_norm": 0.06602653795060065, + "language_loss": 0.77636009, + "learning_rate": 0.0007550193098960292, + "loss": 0.78708553, + "num_input_tokens_seen": 150790368, + "router_z_loss_mlp": 0.32299805, + "step": 1818, + "time_per_iteration": 2.848236560821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076763, + "balance_loss_mlp": 1.04452837, + "epoch": 0.3499422854944209, + "flos": 827364132864.0, + "grad_norm": 0.049816715297611704, + "language_loss": 0.86387616, + "learning_rate": 0.0007547512868238988, + "loss": 0.8746438, + "num_input_tokens_seen": 150879872, + "router_z_loss_mlp": 0.32226562, + "step": 1819, + "time_per_iteration": 3.140552043914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076553, + "balance_loss_mlp": 1.0441277, + "epoch": 0.3501346671796845, + "flos": 493214247936.0, + "grad_norm": 0.049810070694169546, + "language_loss": 0.83196282, + "learning_rate": 0.0007544831648485473, + "loss": 0.84272838, + "num_input_tokens_seen": 150953712, + "router_z_loss_mlp": 0.32421875, + "step": 1820, + "time_per_iteration": 2.7085797786712646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074414, + "balance_loss_mlp": 1.04179859, + "epoch": 0.35032704886494803, + "flos": 578479173120.0, + "grad_norm": 0.05987447994889705, + "language_loss": 0.81237, + "learning_rate": 0.0007542149440740694, + "loss": 0.82311416, + "num_input_tokens_seen": 151026192, + "router_z_loss_mlp": 0.32617188, + "step": 1821, + "time_per_iteration": 2.6648108959198 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075377, + "balance_loss_mlp": 1.0426898, + "epoch": 0.3505194305502116, + "flos": 584383472640.0, + "grad_norm": 0.06285767185927299, + "language_loss": 0.85454488, + "learning_rate": 0.000753946624604597, + "loss": 0.86529863, + "num_input_tokens_seen": 151100720, + "router_z_loss_mlp": 0.3269043, + "step": 1822, + "time_per_iteration": 2.7114102840423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080366, + "balance_loss_mlp": 1.04722571, + "epoch": 0.3507118122354752, + "flos": 526705072128.0, + "grad_norm": 0.056571758739544044, + "language_loss": 0.88259315, + "learning_rate": 0.0007536782065443015, + "loss": 0.89339685, + "num_input_tokens_seen": 151166032, + "router_z_loss_mlp": 0.33154297, + "step": 1823, + "time_per_iteration": 2.6336190700531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077518, + "balance_loss_mlp": 1.04576099, + "epoch": 0.35090419392073874, + "flos": 511269152256.0, + "grad_norm": 0.06612506998948281, + "language_loss": 0.74917412, + "learning_rate": 0.0007534096899973919, + "loss": 0.75994933, + "num_input_tokens_seen": 151232208, + "router_z_loss_mlp": 0.31738281, + "step": 1824, + "time_per_iteration": 2.5683584213256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108279, + "balance_loss_mlp": 1.05069852, + "epoch": 0.3510965756060023, + "flos": 563728522752.0, + "grad_norm": 0.05207355522992398, + "language_loss": 0.82511663, + "learning_rate": 0.0007531410750681154, + "loss": 0.83594453, + "num_input_tokens_seen": 151308128, + "router_z_loss_mlp": 0.32080078, + "step": 1825, + "time_per_iteration": 2.7370071411132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108428, + "balance_loss_mlp": 1.05207014, + "epoch": 0.35128895729126586, + "flos": 1020107146752.0, + "grad_norm": 0.05855996344544413, + "language_loss": 0.86223209, + "learning_rate": 0.0007528723618607575, + "loss": 0.87307489, + "num_input_tokens_seen": 151402560, + "router_z_loss_mlp": 0.32202148, + "step": 1826, + "time_per_iteration": 3.4230828285217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080919, + "balance_loss_mlp": 1.04889941, + "epoch": 0.35148133897652944, + "flos": 587972915712.0, + "grad_norm": 0.06514472806491804, + "language_loss": 0.82370871, + "learning_rate": 0.0007526035504796422, + "loss": 0.8345179, + "num_input_tokens_seen": 151478816, + "router_z_loss_mlp": 0.32006836, + "step": 1827, + "time_per_iteration": 2.7472023963928223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083394, + "balance_loss_mlp": 1.05011046, + "epoch": 0.351673720661793, + "flos": 495054830592.0, + "grad_norm": 0.13631276803870807, + "language_loss": 0.86120903, + "learning_rate": 0.0007523346410291312, + "loss": 0.87204289, + "num_input_tokens_seen": 151554528, + "router_z_loss_mlp": 0.33300781, + "step": 1828, + "time_per_iteration": 2.7665555477142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080213, + "balance_loss_mlp": 1.04757404, + "epoch": 0.35186610234705656, + "flos": 762339520512.0, + "grad_norm": 0.04983334941453678, + "language_loss": 0.85021639, + "learning_rate": 0.0007520656336136245, + "loss": 0.86101854, + "num_input_tokens_seen": 151629440, + "router_z_loss_mlp": 0.32641602, + "step": 1829, + "time_per_iteration": 2.9405102729797363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080442, + "balance_loss_mlp": 1.04847038, + "epoch": 0.3520584840323201, + "flos": 625822820352.0, + "grad_norm": 0.049266285647792965, + "language_loss": 0.87560928, + "learning_rate": 0.0007517965283375599, + "loss": 0.88641369, + "num_input_tokens_seen": 151708544, + "router_z_loss_mlp": 0.31958008, + "step": 1830, + "time_per_iteration": 2.8742456436157227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076907, + "balance_loss_mlp": 1.04429162, + "epoch": 0.3522508657175837, + "flos": 537124193280.0, + "grad_norm": 0.05152278098600794, + "language_loss": 0.8913554, + "learning_rate": 0.0007515273253054132, + "loss": 0.90212452, + "num_input_tokens_seen": 151779152, + "router_z_loss_mlp": 0.32617188, + "step": 1831, + "time_per_iteration": 2.647270917892456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085251, + "balance_loss_mlp": 1.0506562, + "epoch": 0.35244324740284727, + "flos": 567105560064.0, + "grad_norm": 0.052396269804254075, + "language_loss": 0.82697165, + "learning_rate": 0.0007512580246216988, + "loss": 0.83782411, + "num_input_tokens_seen": 151853216, + "router_z_loss_mlp": 0.34643555, + "step": 1832, + "time_per_iteration": 2.6887552738189697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079591, + "balance_loss_mlp": 1.04673672, + "epoch": 0.3526356290881108, + "flos": 512809989120.0, + "grad_norm": 0.05749675796225481, + "language_loss": 0.85263908, + "learning_rate": 0.000750988626390968, + "loss": 0.86343497, + "num_input_tokens_seen": 151920416, + "router_z_loss_mlp": 0.32861328, + "step": 1833, + "time_per_iteration": 2.6013457775115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080641, + "balance_loss_mlp": 1.04781032, + "epoch": 0.3528280107733744, + "flos": 595496627712.0, + "grad_norm": 0.05344239959905588, + "language_loss": 0.84880912, + "learning_rate": 0.0007507191307178108, + "loss": 0.8596155, + "num_input_tokens_seen": 151990848, + "router_z_loss_mlp": 0.32836914, + "step": 1834, + "time_per_iteration": 2.7490363121032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080747, + "balance_loss_mlp": 1.04682052, + "epoch": 0.3530203924586379, + "flos": 550969814016.0, + "grad_norm": 0.06515988244691072, + "language_loss": 0.74431223, + "learning_rate": 0.0007504495377068543, + "loss": 0.75511968, + "num_input_tokens_seen": 152064864, + "router_z_loss_mlp": 0.33959961, + "step": 1835, + "time_per_iteration": 2.729029417037964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084115, + "balance_loss_mlp": 1.04925871, + "epoch": 0.3532127741439015, + "flos": 652662876672.0, + "grad_norm": 0.06759605529963146, + "language_loss": 0.81589389, + "learning_rate": 0.0007501798474627642, + "loss": 0.82673502, + "num_input_tokens_seen": 152150096, + "router_z_loss_mlp": 0.34912109, + "step": 1836, + "time_per_iteration": 2.9048030376434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080775, + "balance_loss_mlp": 1.04708695, + "epoch": 0.35340515582916504, + "flos": 722452594176.0, + "grad_norm": 0.055893281392717674, + "language_loss": 0.83221173, + "learning_rate": 0.0007499100600902433, + "loss": 0.84301955, + "num_input_tokens_seen": 152232528, + "router_z_loss_mlp": 0.3371582, + "step": 1837, + "time_per_iteration": 2.9900574684143066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080937, + "balance_loss_mlp": 1.0464375, + "epoch": 0.35359753751442863, + "flos": 594619301376.0, + "grad_norm": 0.06113982905710786, + "language_loss": 0.84191763, + "learning_rate": 0.0007496401756940324, + "loss": 0.852727, + "num_input_tokens_seen": 152299584, + "router_z_loss_mlp": 0.34545898, + "step": 1838, + "time_per_iteration": 2.6746203899383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079632, + "balance_loss_mlp": 1.04575253, + "epoch": 0.3537899191996922, + "flos": 632384838144.0, + "grad_norm": 0.05956961248716192, + "language_loss": 0.82392603, + "learning_rate": 0.0007493701943789098, + "loss": 0.8347224, + "num_input_tokens_seen": 152370368, + "router_z_loss_mlp": 0.33886719, + "step": 1839, + "time_per_iteration": 2.773550510406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089423, + "balance_loss_mlp": 1.05590141, + "epoch": 0.35398230088495575, + "flos": 506118523392.0, + "grad_norm": 0.05410374630174333, + "language_loss": 0.82311571, + "learning_rate": 0.000749100116249692, + "loss": 0.83400989, + "num_input_tokens_seen": 152436928, + "router_z_loss_mlp": 0.33544922, + "step": 1840, + "time_per_iteration": 2.6255862712860107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089923, + "balance_loss_mlp": 1.05649722, + "epoch": 0.35417468257021933, + "flos": 507783015936.0, + "grad_norm": 0.06109989504264522, + "language_loss": 0.86315167, + "learning_rate": 0.0007488299414112321, + "loss": 0.87405092, + "num_input_tokens_seen": 152505952, + "router_z_loss_mlp": 0.33447266, + "step": 1841, + "time_per_iteration": 2.5875229835510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088685, + "balance_loss_mlp": 1.05552149, + "epoch": 0.35436706425548287, + "flos": 656133046272.0, + "grad_norm": 0.05742985112465967, + "language_loss": 0.77833533, + "learning_rate": 0.0007485596699684215, + "loss": 0.78922212, + "num_input_tokens_seen": 152577408, + "router_z_loss_mlp": 0.33178711, + "step": 1842, + "time_per_iteration": 2.819591760635376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092886, + "balance_loss_mlp": 1.05903101, + "epoch": 0.35455944594074645, + "flos": 652322433024.0, + "grad_norm": 0.047878329403948795, + "language_loss": 0.85455877, + "learning_rate": 0.000748289302026189, + "loss": 0.86548758, + "num_input_tokens_seen": 152654480, + "router_z_loss_mlp": 0.33886719, + "step": 1843, + "time_per_iteration": 2.829897880554199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088118, + "balance_loss_mlp": 1.05569351, + "epoch": 0.35475182762601, + "flos": 848240252928.0, + "grad_norm": 0.06279452498251797, + "language_loss": 0.85658133, + "learning_rate": 0.0007480188376895004, + "loss": 0.86746252, + "num_input_tokens_seen": 152732304, + "router_z_loss_mlp": 0.32421875, + "step": 1844, + "time_per_iteration": 3.067828893661499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085552, + "balance_loss_mlp": 1.07358336, + "epoch": 0.3549442093112736, + "flos": 1520644133376.0, + "grad_norm": 0.027370210450034033, + "language_loss": 0.7381134, + "learning_rate": 0.0007477482770633596, + "loss": 0.7489689, + "num_input_tokens_seen": 152965952, + "router_z_loss_mlp": 0.11962891, + "step": 1845, + "time_per_iteration": 4.860119342803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087227, + "balance_loss_mlp": 1.05401564, + "epoch": 0.3551365909965371, + "flos": 651087134208.0, + "grad_norm": 0.057022057365061586, + "language_loss": 0.7840451, + "learning_rate": 0.0007474776202528074, + "loss": 0.79491735, + "num_input_tokens_seen": 153053088, + "router_z_loss_mlp": 0.33227539, + "step": 1846, + "time_per_iteration": 2.924600601196289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081626, + "balance_loss_mlp": 1.04877198, + "epoch": 0.3553289726818007, + "flos": 897094213632.0, + "grad_norm": 0.05655103479540665, + "language_loss": 0.81245291, + "learning_rate": 0.000747206867362922, + "loss": 0.82326913, + "num_input_tokens_seen": 153129216, + "router_z_loss_mlp": 0.32861328, + "step": 1847, + "time_per_iteration": 3.0635437965393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078587, + "balance_loss_mlp": 1.0454942, + "epoch": 0.3555213543670643, + "flos": 688181958144.0, + "grad_norm": 0.057996459019562165, + "language_loss": 0.83748043, + "learning_rate": 0.0007469360184988194, + "loss": 0.84826624, + "num_input_tokens_seen": 153199360, + "router_z_loss_mlp": 0.33105469, + "step": 1848, + "time_per_iteration": 2.816774606704712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072989, + "balance_loss_mlp": 1.03977752, + "epoch": 0.3557137360523278, + "flos": 538305647616.0, + "grad_norm": 0.0578078380794177, + "language_loss": 0.87284935, + "learning_rate": 0.0007466650737656518, + "loss": 0.88357925, + "num_input_tokens_seen": 153269168, + "router_z_loss_mlp": 0.33203125, + "step": 1849, + "time_per_iteration": 2.611743927001953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075842, + "balance_loss_mlp": 1.04208231, + "epoch": 0.3559061177375914, + "flos": 402039230976.0, + "grad_norm": 0.05251231214094578, + "language_loss": 0.90093362, + "learning_rate": 0.0007463940332686098, + "loss": 0.91169202, + "num_input_tokens_seen": 153333120, + "router_z_loss_mlp": 0.33789062, + "step": 1850, + "time_per_iteration": 2.4692726135253906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073866, + "balance_loss_mlp": 1.04017735, + "epoch": 0.35609849942285493, + "flos": 696238170624.0, + "grad_norm": 0.04795835093932571, + "language_loss": 0.84167922, + "learning_rate": 0.0007461228971129205, + "loss": 0.85241795, + "num_input_tokens_seen": 153407600, + "router_z_loss_mlp": 0.33691406, + "step": 1851, + "time_per_iteration": 2.894505023956299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106997, + "balance_loss_mlp": 1.0372591, + "epoch": 0.3562908811081185, + "flos": 568660953600.0, + "grad_norm": 0.055081415669052246, + "language_loss": 0.85513294, + "learning_rate": 0.0007458516654038483, + "loss": 0.86583257, + "num_input_tokens_seen": 153477408, + "router_z_loss_mlp": 0.32714844, + "step": 1852, + "time_per_iteration": 2.678018569946289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076636, + "balance_loss_mlp": 1.0421133, + "epoch": 0.35648326279338205, + "flos": 682081219584.0, + "grad_norm": 0.04842584798560518, + "language_loss": 0.8668319, + "learning_rate": 0.0007455803382466946, + "loss": 0.87759829, + "num_input_tokens_seen": 153551888, + "router_z_loss_mlp": 0.34545898, + "step": 1853, + "time_per_iteration": 2.795799493789673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081318, + "balance_loss_mlp": 1.04674757, + "epoch": 0.35667564447864564, + "flos": 628840475136.0, + "grad_norm": 0.04891463031827082, + "language_loss": 0.87319207, + "learning_rate": 0.0007453089157467979, + "loss": 0.88400525, + "num_input_tokens_seen": 153626912, + "router_z_loss_mlp": 0.34594727, + "step": 1854, + "time_per_iteration": 2.7683348655700684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084525, + "balance_loss_mlp": 1.05000162, + "epoch": 0.35686802616390917, + "flos": 813685837824.0, + "grad_norm": 0.04901692214928195, + "language_loss": 0.81941634, + "learning_rate": 0.0007450373980095341, + "loss": 0.83026159, + "num_input_tokens_seen": 153711312, + "router_z_loss_mlp": 0.34545898, + "step": 1855, + "time_per_iteration": 3.069664716720581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088761, + "balance_loss_mlp": 1.0541904, + "epoch": 0.35706040784917276, + "flos": 525922288128.0, + "grad_norm": 0.06393454459125486, + "language_loss": 0.86792582, + "learning_rate": 0.0007447657851403155, + "loss": 0.87881339, + "num_input_tokens_seen": 153780208, + "router_z_loss_mlp": 0.34619141, + "step": 1856, + "time_per_iteration": 2.6120662689208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081268, + "balance_loss_mlp": 1.04793692, + "epoch": 0.35725278953443634, + "flos": 511698345984.0, + "grad_norm": 0.060959809088696394, + "language_loss": 0.78963649, + "learning_rate": 0.0007444940772445915, + "loss": 0.80044913, + "num_input_tokens_seen": 153853152, + "router_z_loss_mlp": 0.33349609, + "step": 1857, + "time_per_iteration": 2.802053689956665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079861, + "balance_loss_mlp": 1.04653037, + "epoch": 0.3574451712196999, + "flos": 487162971648.0, + "grad_norm": 0.06448223618511208, + "language_loss": 0.80338144, + "learning_rate": 0.0007442222744278484, + "loss": 0.81418002, + "num_input_tokens_seen": 153924160, + "router_z_loss_mlp": 0.33349609, + "step": 1858, + "time_per_iteration": 2.660689353942871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080529, + "balance_loss_mlp": 1.04765141, + "epoch": 0.35763755290496346, + "flos": 550384879104.0, + "grad_norm": 0.061962253699798436, + "language_loss": 0.84126002, + "learning_rate": 0.0007439503767956099, + "loss": 0.85206527, + "num_input_tokens_seen": 153998688, + "router_z_loss_mlp": 0.32885742, + "step": 1859, + "time_per_iteration": 2.7479875087738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080547, + "balance_loss_mlp": 1.06767237, + "epoch": 0.357829934590227, + "flos": 1503300791808.0, + "grad_norm": 0.035903025748828234, + "language_loss": 0.79671603, + "learning_rate": 0.0007436783844534352, + "loss": 0.80752152, + "num_input_tokens_seen": 154230960, + "router_z_loss_mlp": 0.12890625, + "step": 1860, + "time_per_iteration": 4.900041580200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077433, + "balance_loss_mlp": 1.04479325, + "epoch": 0.3580223162754906, + "flos": 568410670080.0, + "grad_norm": 0.040558802150678905, + "language_loss": 0.85799539, + "learning_rate": 0.000743406297506922, + "loss": 0.86876976, + "num_input_tokens_seen": 154309104, + "router_z_loss_mlp": 0.32641602, + "step": 1861, + "time_per_iteration": 2.701162576675415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084428, + "balance_loss_mlp": 1.05107355, + "epoch": 0.3582146979607541, + "flos": 626153089536.0, + "grad_norm": 0.04686630584337546, + "language_loss": 0.8419295, + "learning_rate": 0.0007431341160617031, + "loss": 0.85277379, + "num_input_tokens_seen": 154387424, + "router_z_loss_mlp": 0.33374023, + "step": 1862, + "time_per_iteration": 2.860173463821411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085309, + "balance_loss_mlp": 1.05266929, + "epoch": 0.3584070796460177, + "flos": 507010406400.0, + "grad_norm": 0.04939599291948986, + "language_loss": 0.88143289, + "learning_rate": 0.0007428618402234491, + "loss": 0.89228594, + "num_input_tokens_seen": 154459952, + "router_z_loss_mlp": 0.32641602, + "step": 1863, + "time_per_iteration": 2.62233567237854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083488, + "balance_loss_mlp": 1.05030036, + "epoch": 0.3585994613312813, + "flos": 606190763520.0, + "grad_norm": 0.051497717495276533, + "language_loss": 0.80248374, + "learning_rate": 0.0007425894700978668, + "loss": 0.81331861, + "num_input_tokens_seen": 154535456, + "router_z_loss_mlp": 0.33203125, + "step": 1864, + "time_per_iteration": 2.711484670639038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087216, + "balance_loss_mlp": 1.05424261, + "epoch": 0.3587918430165448, + "flos": 1412338484736.0, + "grad_norm": 0.047877863134497434, + "language_loss": 0.79232943, + "learning_rate": 0.0007423170057906996, + "loss": 0.80320162, + "num_input_tokens_seen": 154627568, + "router_z_loss_mlp": 0.32983398, + "step": 1865, + "time_per_iteration": 3.852776527404785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091453, + "balance_loss_mlp": 1.05821717, + "epoch": 0.3589842247018084, + "flos": 478313800704.0, + "grad_norm": 0.06431447428318769, + "language_loss": 0.85827845, + "learning_rate": 0.0007420444474077275, + "loss": 0.86919296, + "num_input_tokens_seen": 154694640, + "router_z_loss_mlp": 0.33251953, + "step": 1866, + "time_per_iteration": 2.6104037761688232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101796, + "balance_loss_mlp": 1.06829846, + "epoch": 0.35917660638707194, + "flos": 504464205312.0, + "grad_norm": 0.06438653143979521, + "language_loss": 0.89830631, + "learning_rate": 0.0007417717950547671, + "loss": 0.90932429, + "num_input_tokens_seen": 154762048, + "router_z_loss_mlp": 0.33520508, + "step": 1867, + "time_per_iteration": 2.5619330406188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108848, + "balance_loss_mlp": 1.09687901, + "epoch": 0.3593689880723355, + "flos": 1491294191616.0, + "grad_norm": 0.037524520389889536, + "language_loss": 0.75996608, + "learning_rate": 0.0007414990488376713, + "loss": 0.77105457, + "num_input_tokens_seen": 154989952, + "router_z_loss_mlp": 0.11962891, + "step": 1868, + "time_per_iteration": 4.971943378448486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096427, + "balance_loss_mlp": 1.06388319, + "epoch": 0.35956136975759906, + "flos": 528369564672.0, + "grad_norm": 0.050983088733796166, + "language_loss": 0.84612024, + "learning_rate": 0.0007412262088623299, + "loss": 0.85708451, + "num_input_tokens_seen": 155066992, + "router_z_loss_mlp": 0.32543945, + "step": 1869, + "time_per_iteration": 2.7295072078704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104656, + "balance_loss_mlp": 1.07142007, + "epoch": 0.35975375144286265, + "flos": 534647803392.0, + "grad_norm": 0.057848782745497714, + "language_loss": 0.79012549, + "learning_rate": 0.0007409532752346684, + "loss": 0.80117208, + "num_input_tokens_seen": 155137616, + "router_z_loss_mlp": 0.33251953, + "step": 1870, + "time_per_iteration": 2.74664568901062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107858, + "balance_loss_mlp": 1.07464623, + "epoch": 0.3599461331281262, + "flos": 504695549952.0, + "grad_norm": 0.054621035664709404, + "language_loss": 0.88661271, + "learning_rate": 0.0007406802480606491, + "loss": 0.89769125, + "num_input_tokens_seen": 155209248, + "router_z_loss_mlp": 0.33227539, + "step": 1871, + "time_per_iteration": 2.636101722717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102516, + "balance_loss_mlp": 1.06923246, + "epoch": 0.36013851481338977, + "flos": 511283708928.0, + "grad_norm": 0.05849515281409536, + "language_loss": 0.90592384, + "learning_rate": 0.0007404071274462707, + "loss": 0.91694903, + "num_input_tokens_seen": 155274176, + "router_z_loss_mlp": 0.33300781, + "step": 1872, + "time_per_iteration": 2.559588670730591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098641, + "balance_loss_mlp": 1.06533384, + "epoch": 0.36033089649865335, + "flos": 547330908672.0, + "grad_norm": 0.06237198940644659, + "language_loss": 0.8363173, + "learning_rate": 0.0007401339134975682, + "loss": 0.84730369, + "num_input_tokens_seen": 155343232, + "router_z_loss_mlp": 0.33325195, + "step": 1873, + "time_per_iteration": 2.6156845092773438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100197, + "balance_loss_mlp": 1.06617522, + "epoch": 0.3605232781839169, + "flos": 458416903680.0, + "grad_norm": 0.05108892475659159, + "language_loss": 0.84187275, + "learning_rate": 0.0007398606063206122, + "loss": 0.85287476, + "num_input_tokens_seen": 155410080, + "router_z_loss_mlp": 0.34033203, + "step": 1874, + "time_per_iteration": 2.6152756214141846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090341, + "balance_loss_mlp": 1.05729628, + "epoch": 0.36071565986918047, + "flos": 509309296128.0, + "grad_norm": 0.05589329807105905, + "language_loss": 0.7857852, + "learning_rate": 0.0007395872060215101, + "loss": 0.79668868, + "num_input_tokens_seen": 155476240, + "router_z_loss_mlp": 0.33056641, + "step": 1875, + "time_per_iteration": 2.592906951904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095107, + "balance_loss_mlp": 1.06230044, + "epoch": 0.360908041554444, + "flos": 558931484160.0, + "grad_norm": 0.12468103825296885, + "language_loss": 0.88329792, + "learning_rate": 0.0007393137127064056, + "loss": 0.89424896, + "num_input_tokens_seen": 155543392, + "router_z_loss_mlp": 0.328125, + "step": 1876, + "time_per_iteration": 2.629368782043457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109653, + "balance_loss_mlp": 1.06300879, + "epoch": 0.3611004232397076, + "flos": 523588492800.0, + "grad_norm": 0.05189881397754868, + "language_loss": 0.84167802, + "learning_rate": 0.0007390401264814779, + "loss": 0.85264337, + "num_input_tokens_seen": 155613264, + "router_z_loss_mlp": 0.33544922, + "step": 1877, + "time_per_iteration": 2.644322156906128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084049, + "balance_loss_mlp": 1.05179131, + "epoch": 0.3612928049249711, + "flos": 540728193024.0, + "grad_norm": 0.07312313725982984, + "language_loss": 0.84472072, + "learning_rate": 0.0007387664474529427, + "loss": 0.8555612, + "num_input_tokens_seen": 155683712, + "router_z_loss_mlp": 0.32250977, + "step": 1878, + "time_per_iteration": 2.6105034351348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094149, + "balance_loss_mlp": 1.06131935, + "epoch": 0.3614851866102347, + "flos": 552289480704.0, + "grad_norm": 0.06338398309504269, + "language_loss": 0.9129535, + "learning_rate": 0.0007384926757270518, + "loss": 0.923895, + "num_input_tokens_seen": 155751760, + "router_z_loss_mlp": 0.32836914, + "step": 1879, + "time_per_iteration": 2.6268200874328613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082708, + "balance_loss_mlp": 1.05023539, + "epoch": 0.36167756829549824, + "flos": 771734338560.0, + "grad_norm": 0.048672507925477976, + "language_loss": 0.79680419, + "learning_rate": 0.0007382188114100924, + "loss": 0.80763125, + "num_input_tokens_seen": 155830464, + "router_z_loss_mlp": 0.32470703, + "step": 1880, + "time_per_iteration": 2.9548373222351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078144, + "balance_loss_mlp": 1.04509938, + "epoch": 0.36186994998076183, + "flos": 711560609280.0, + "grad_norm": 0.04804943389379678, + "language_loss": 0.81544787, + "learning_rate": 0.0007379448546083884, + "loss": 0.82622933, + "num_input_tokens_seen": 155906208, + "router_z_loss_mlp": 0.33056641, + "step": 1881, + "time_per_iteration": 2.900480031967163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082413, + "balance_loss_mlp": 1.04884315, + "epoch": 0.3620623316660254, + "flos": 747209138688.0, + "grad_norm": 0.049920719635936736, + "language_loss": 0.88019323, + "learning_rate": 0.0007376708054282992, + "loss": 0.89101738, + "num_input_tokens_seen": 155983584, + "router_z_loss_mlp": 0.3359375, + "step": 1882, + "time_per_iteration": 2.9482829570770264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075385, + "balance_loss_mlp": 1.04286492, + "epoch": 0.36225471335128895, + "flos": 482312088576.0, + "grad_norm": 0.04692483307288239, + "language_loss": 0.83908749, + "learning_rate": 0.0007373966639762201, + "loss": 0.84984136, + "num_input_tokens_seen": 156052464, + "router_z_loss_mlp": 0.32519531, + "step": 1883, + "time_per_iteration": 2.597809076309204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078049, + "balance_loss_mlp": 1.0448606, + "epoch": 0.36244709503655254, + "flos": 506655406080.0, + "grad_norm": 0.0703007209611724, + "language_loss": 0.8835175, + "learning_rate": 0.0007371224303585822, + "loss": 0.89429802, + "num_input_tokens_seen": 156121424, + "router_z_loss_mlp": 0.33203125, + "step": 1884, + "time_per_iteration": 2.5686471462249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046189, + "balance_loss_mlp": 1.03360081, + "epoch": 0.36263947672181607, + "flos": 1393302643200.0, + "grad_norm": 0.020620128786032376, + "language_loss": 0.80357069, + "learning_rate": 0.0007368481046818524, + "loss": 0.81403255, + "num_input_tokens_seen": 156346144, + "router_z_loss_mlp": 0.12597656, + "step": 1885, + "time_per_iteration": 4.68831205368042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073343, + "balance_loss_mlp": 1.03939199, + "epoch": 0.36283185840707965, + "flos": 652991735808.0, + "grad_norm": 0.05943029236677907, + "language_loss": 0.82845902, + "learning_rate": 0.0007365736870525335, + "loss": 0.83919251, + "num_input_tokens_seen": 156420880, + "router_z_loss_mlp": 0.33959961, + "step": 1886, + "time_per_iteration": 2.8566346168518066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070323, + "balance_loss_mlp": 1.0373739, + "epoch": 0.3630242400923432, + "flos": 488619440640.0, + "grad_norm": 0.06703223685064427, + "language_loss": 0.82574463, + "learning_rate": 0.000736299177577164, + "loss": 0.83644783, + "num_input_tokens_seen": 156485616, + "router_z_loss_mlp": 0.32958984, + "step": 1887, + "time_per_iteration": 2.5848894119262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074233, + "balance_loss_mlp": 1.04130709, + "epoch": 0.3632166217776068, + "flos": 516892644864.0, + "grad_norm": 0.0626455482667494, + "language_loss": 0.83844066, + "learning_rate": 0.0007360245763623174, + "loss": 0.84918302, + "num_input_tokens_seen": 156557840, + "router_z_loss_mlp": 0.3293457, + "step": 1888, + "time_per_iteration": 2.6179397106170654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067525, + "balance_loss_mlp": 1.03564882, + "epoch": 0.36340900346287036, + "flos": 645881250816.0, + "grad_norm": 0.06111369810549259, + "language_loss": 0.89794236, + "learning_rate": 0.0007357498835146039, + "loss": 0.90861762, + "num_input_tokens_seen": 156632496, + "router_z_loss_mlp": 0.31860352, + "step": 1889, + "time_per_iteration": 2.8311662673950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070281, + "balance_loss_mlp": 1.03854752, + "epoch": 0.3636013851481339, + "flos": 553057708032.0, + "grad_norm": 0.0568549422608731, + "language_loss": 0.86402494, + "learning_rate": 0.0007354750991406684, + "loss": 0.87472773, + "num_input_tokens_seen": 156705296, + "router_z_loss_mlp": 0.31713867, + "step": 1890, + "time_per_iteration": 2.6922197341918945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072041, + "balance_loss_mlp": 1.03952074, + "epoch": 0.3637937668333975, + "flos": 546395355648.0, + "grad_norm": 0.053455628499382915, + "language_loss": 0.80957252, + "learning_rate": 0.0007352002233471919, + "loss": 0.82029295, + "num_input_tokens_seen": 156773376, + "router_z_loss_mlp": 0.32519531, + "step": 1891, + "time_per_iteration": 2.621241569519043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066281, + "balance_loss_mlp": 1.03371286, + "epoch": 0.363986148518661, + "flos": 537838576128.0, + "grad_norm": 0.07508945751401845, + "language_loss": 0.79549944, + "learning_rate": 0.0007349252562408906, + "loss": 0.80616224, + "num_input_tokens_seen": 156844336, + "router_z_loss_mlp": 0.32543945, + "step": 1892, + "time_per_iteration": 2.674318552017212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069175, + "balance_loss_mlp": 1.0372504, + "epoch": 0.3641785302039246, + "flos": 659895607296.0, + "grad_norm": 0.04761623500947703, + "language_loss": 0.81258041, + "learning_rate": 0.0007346501979285158, + "loss": 0.82327211, + "num_input_tokens_seen": 156918848, + "router_z_loss_mlp": 0.3190918, + "step": 1893, + "time_per_iteration": 2.8671226501464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035364, + "balance_loss_mlp": 1.0238719, + "epoch": 0.36437091188918813, + "flos": 1467911158272.0, + "grad_norm": 0.02143179240630706, + "language_loss": 0.80539101, + "learning_rate": 0.0007343750485168551, + "loss": 0.81574464, + "num_input_tokens_seen": 157134736, + "router_z_loss_mlp": 0.11474609, + "step": 1894, + "time_per_iteration": 4.7720019817352295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075531, + "balance_loss_mlp": 1.04248571, + "epoch": 0.3645632935744517, + "flos": 597012733440.0, + "grad_norm": 0.049808711864839754, + "language_loss": 0.85850054, + "learning_rate": 0.0007340998081127308, + "loss": 0.8692559, + "num_input_tokens_seen": 157211920, + "router_z_loss_mlp": 0.33056641, + "step": 1895, + "time_per_iteration": 2.753730058670044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081037, + "balance_loss_mlp": 1.0486834, + "epoch": 0.36475567525971525, + "flos": 599214108672.0, + "grad_norm": 0.05384470863640996, + "language_loss": 0.9063257, + "learning_rate": 0.0007338244768230007, + "loss": 0.91713607, + "num_input_tokens_seen": 157284224, + "router_z_loss_mlp": 0.32348633, + "step": 1896, + "time_per_iteration": 2.749844551086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084345, + "balance_loss_mlp": 1.05189669, + "epoch": 0.36494805694497884, + "flos": 798047686656.0, + "grad_norm": 0.12041633701688108, + "language_loss": 0.88843018, + "learning_rate": 0.0007335490547545578, + "loss": 0.89927363, + "num_input_tokens_seen": 157367920, + "router_z_loss_mlp": 0.32446289, + "step": 1897, + "time_per_iteration": 3.0181519985198975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089996, + "balance_loss_mlp": 1.05795288, + "epoch": 0.3651404386302424, + "flos": 637023315456.0, + "grad_norm": 0.06340749789439089, + "language_loss": 0.82377589, + "learning_rate": 0.0007332735420143308, + "loss": 0.83467579, + "num_input_tokens_seen": 157438672, + "router_z_loss_mlp": 0.3203125, + "step": 1898, + "time_per_iteration": 2.7370855808258057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077681, + "balance_loss_mlp": 1.04442132, + "epoch": 0.36533282031550596, + "flos": 491337349632.0, + "grad_norm": 0.05751458989837244, + "language_loss": 0.8663426, + "learning_rate": 0.0007329979387092826, + "loss": 0.87711942, + "num_input_tokens_seen": 157505888, + "router_z_loss_mlp": 0.33276367, + "step": 1899, + "time_per_iteration": 2.552072048187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076766, + "balance_loss_mlp": 1.0440551, + "epoch": 0.36552520200076954, + "flos": 855587874816.0, + "grad_norm": 0.050366197091212025, + "language_loss": 0.83863711, + "learning_rate": 0.0007327222449464124, + "loss": 0.84940481, + "num_input_tokens_seen": 157601568, + "router_z_loss_mlp": 0.32714844, + "step": 1900, + "time_per_iteration": 3.2450594902038574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072697, + "balance_loss_mlp": 1.03936601, + "epoch": 0.3657175836860331, + "flos": 483449872896.0, + "grad_norm": 0.053278478248789174, + "language_loss": 0.88864619, + "learning_rate": 0.0007324464608327538, + "loss": 0.89937317, + "num_input_tokens_seen": 157670992, + "router_z_loss_mlp": 0.33349609, + "step": 1901, + "time_per_iteration": 2.6027348041534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072333, + "balance_loss_mlp": 1.03957391, + "epoch": 0.36590996537129666, + "flos": 434561006592.0, + "grad_norm": 0.058664113400220264, + "language_loss": 0.88440275, + "learning_rate": 0.0007321705864753758, + "loss": 0.8951261, + "num_input_tokens_seen": 157743616, + "router_z_loss_mlp": 0.32763672, + "step": 1902, + "time_per_iteration": 2.668935537338257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073172, + "balance_loss_mlp": 1.03950715, + "epoch": 0.3661023470565602, + "flos": 711880704000.0, + "grad_norm": 0.047699393186438684, + "language_loss": 0.84307706, + "learning_rate": 0.0007318946219813823, + "loss": 0.85380876, + "num_input_tokens_seen": 157823520, + "router_z_loss_mlp": 0.33691406, + "step": 1903, + "time_per_iteration": 3.025866985321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074477, + "balance_loss_mlp": 1.04100275, + "epoch": 0.3662947287418238, + "flos": 564495340032.0, + "grad_norm": 0.05797091317965262, + "language_loss": 0.90078342, + "learning_rate": 0.000731618567457912, + "loss": 0.91152817, + "num_input_tokens_seen": 157893248, + "router_z_loss_mlp": 0.3347168, + "step": 1904, + "time_per_iteration": 2.6391115188598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073585, + "balance_loss_mlp": 1.03937197, + "epoch": 0.3664871104270873, + "flos": 789391982592.0, + "grad_norm": 0.05925410463566973, + "language_loss": 0.87083924, + "learning_rate": 0.000731342423012139, + "loss": 0.88157511, + "num_input_tokens_seen": 157973216, + "router_z_loss_mlp": 0.3425293, + "step": 1905, + "time_per_iteration": 3.020660400390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070848, + "balance_loss_mlp": 1.03682566, + "epoch": 0.3666794921123509, + "flos": 752202616320.0, + "grad_norm": 0.06601024748857935, + "language_loss": 0.82244205, + "learning_rate": 0.0007310661887512722, + "loss": 0.83315057, + "num_input_tokens_seen": 158051088, + "router_z_loss_mlp": 0.34033203, + "step": 1906, + "time_per_iteration": 3.0128185749053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068568, + "balance_loss_mlp": 1.03571391, + "epoch": 0.3668718737976145, + "flos": 523264015872.0, + "grad_norm": 0.04853340441162438, + "language_loss": 0.82115662, + "learning_rate": 0.0007307898647825549, + "loss": 0.8318423, + "num_input_tokens_seen": 158124368, + "router_z_loss_mlp": 0.32861328, + "step": 1907, + "time_per_iteration": 2.6610257625579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075142, + "balance_loss_mlp": 1.04126275, + "epoch": 0.367064255482878, + "flos": 571698957312.0, + "grad_norm": 0.05773956677348378, + "language_loss": 0.89470363, + "learning_rate": 0.0007305134512132659, + "loss": 0.90545505, + "num_input_tokens_seen": 158191472, + "router_z_loss_mlp": 0.33886719, + "step": 1908, + "time_per_iteration": 2.706658124923706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076976, + "balance_loss_mlp": 1.04309678, + "epoch": 0.3672566371681416, + "flos": 446880347136.0, + "grad_norm": 0.0894454707503668, + "language_loss": 0.83388865, + "learning_rate": 0.0007302369481507183, + "loss": 0.84465849, + "num_input_tokens_seen": 158254384, + "router_z_loss_mlp": 0.33911133, + "step": 1909, + "time_per_iteration": 2.499483346939087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049259, + "balance_loss_mlp": 1.03838694, + "epoch": 0.36744901885340514, + "flos": 1539275208192.0, + "grad_norm": 0.032302576214162944, + "language_loss": 0.79961759, + "learning_rate": 0.00072996035570226, + "loss": 0.81011015, + "num_input_tokens_seen": 158486160, + "router_z_loss_mlp": 0.10888672, + "step": 1910, + "time_per_iteration": 4.86514949798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088721, + "balance_loss_mlp": 1.05476999, + "epoch": 0.36764140053866873, + "flos": 563417192448.0, + "grad_norm": 0.061805914783829616, + "language_loss": 0.85575247, + "learning_rate": 0.000729683673975274, + "loss": 0.86663967, + "num_input_tokens_seen": 158555616, + "router_z_loss_mlp": 0.33984375, + "step": 1911, + "time_per_iteration": 2.6907522678375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091619, + "balance_loss_mlp": 1.05709589, + "epoch": 0.36783378222393226, + "flos": 1216168971264.0, + "grad_norm": 0.04498413319979697, + "language_loss": 0.82746279, + "learning_rate": 0.0007294069030771774, + "loss": 0.83837891, + "num_input_tokens_seen": 158653984, + "router_z_loss_mlp": 0.34570312, + "step": 1912, + "time_per_iteration": 3.6445353031158447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109623, + "balance_loss_mlp": 1.06196952, + "epoch": 0.36802616390919585, + "flos": 498476947968.0, + "grad_norm": 0.055898807174015214, + "language_loss": 0.90671504, + "learning_rate": 0.0007291300431154224, + "loss": 0.9176774, + "num_input_tokens_seen": 158719728, + "router_z_loss_mlp": 0.34301758, + "step": 1913, + "time_per_iteration": 2.5600366592407227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01025736, + "balance_loss_mlp": 1.0155319, + "epoch": 0.36821854559445943, + "flos": 1581281961984.0, + "grad_norm": 0.015307788275572325, + "language_loss": 0.70389736, + "learning_rate": 0.0007288530941974955, + "loss": 0.71415472, + "num_input_tokens_seen": 158952544, + "router_z_loss_mlp": 0.10205078, + "step": 1914, + "time_per_iteration": 4.9577555656433105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110633, + "balance_loss_mlp": 1.07287991, + "epoch": 0.36841092727972297, + "flos": 835261784064.0, + "grad_norm": 0.06223209716338702, + "language_loss": 0.79735458, + "learning_rate": 0.0007285760564309179, + "loss": 0.80841786, + "num_input_tokens_seen": 159039680, + "router_z_loss_mlp": 0.3347168, + "step": 1915, + "time_per_iteration": 3.1251590251922607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101251, + "balance_loss_mlp": 1.06811082, + "epoch": 0.36860330896498655, + "flos": 689517591552.0, + "grad_norm": 0.05672479428696366, + "language_loss": 0.85010201, + "learning_rate": 0.0007282989299232448, + "loss": 0.8611145, + "num_input_tokens_seen": 159128128, + "router_z_loss_mlp": 0.33154297, + "step": 1916, + "time_per_iteration": 3.062971353530884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096553, + "balance_loss_mlp": 1.06381774, + "epoch": 0.3687956906502501, + "flos": 553919067648.0, + "grad_norm": 0.05955658020637064, + "language_loss": 0.83600092, + "learning_rate": 0.0007280217147820668, + "loss": 0.84696645, + "num_input_tokens_seen": 159193248, + "router_z_loss_mlp": 0.32739258, + "step": 1917, + "time_per_iteration": 2.6169495582580566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097845, + "balance_loss_mlp": 1.06465673, + "epoch": 0.3689880723355137, + "flos": 576426184704.0, + "grad_norm": 0.05443515430960571, + "language_loss": 0.79137111, + "learning_rate": 0.0007277444111150079, + "loss": 0.80234957, + "num_input_tokens_seen": 159265824, + "router_z_loss_mlp": 0.33203125, + "step": 1918, + "time_per_iteration": 2.672696828842163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101204, + "balance_loss_mlp": 1.06820679, + "epoch": 0.3691804540207772, + "flos": 528615465984.0, + "grad_norm": 0.06564490716140688, + "language_loss": 0.84340626, + "learning_rate": 0.0007274670190297272, + "loss": 0.85441828, + "num_input_tokens_seen": 159332992, + "router_z_loss_mlp": 0.33007812, + "step": 1919, + "time_per_iteration": 2.569920539855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091279, + "balance_loss_mlp": 1.0575906, + "epoch": 0.3693728357060408, + "flos": 560729806848.0, + "grad_norm": 0.06475948680742319, + "language_loss": 0.81988895, + "learning_rate": 0.0007271895386339179, + "loss": 0.83080173, + "num_input_tokens_seen": 159409808, + "router_z_loss_mlp": 0.3371582, + "step": 1920, + "time_per_iteration": 2.765470027923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086642, + "balance_loss_mlp": 1.05350161, + "epoch": 0.3695652173913043, + "flos": 579488919552.0, + "grad_norm": 0.0536525739451854, + "language_loss": 0.82950377, + "learning_rate": 0.0007269119700353073, + "loss": 0.84037018, + "num_input_tokens_seen": 159486128, + "router_z_loss_mlp": 0.33154297, + "step": 1921, + "time_per_iteration": 2.703117847442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089729, + "balance_loss_mlp": 1.05756688, + "epoch": 0.3697575990765679, + "flos": 512629516800.0, + "grad_norm": 0.04104943724396866, + "language_loss": 0.84983069, + "learning_rate": 0.0007266343133416571, + "loss": 0.86072791, + "num_input_tokens_seen": 159562224, + "router_z_loss_mlp": 0.3215332, + "step": 1922, + "time_per_iteration": 2.7371909618377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060973, + "balance_loss_mlp": 1.04967153, + "epoch": 0.3699499807618315, + "flos": 1569826953216.0, + "grad_norm": 0.023907464900796205, + "language_loss": 0.77116919, + "learning_rate": 0.0007263565686607632, + "loss": 0.78177893, + "num_input_tokens_seen": 159784768, + "router_z_loss_mlp": 0.11279297, + "step": 1923, + "time_per_iteration": 4.827981233596802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084577, + "balance_loss_mlp": 1.05136538, + "epoch": 0.37014236244709503, + "flos": 497093262336.0, + "grad_norm": 0.06739223877154035, + "language_loss": 0.84575641, + "learning_rate": 0.0007260787361004556, + "loss": 0.85660219, + "num_input_tokens_seen": 159848608, + "router_z_loss_mlp": 0.33227539, + "step": 1924, + "time_per_iteration": 2.6221601963043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048708, + "balance_loss_mlp": 1.03764546, + "epoch": 0.3703347441323586, + "flos": 1443562048512.0, + "grad_norm": 0.02017040526472397, + "language_loss": 0.73761505, + "learning_rate": 0.0007258008157685987, + "loss": 0.74810213, + "num_input_tokens_seen": 160080928, + "router_z_loss_mlp": 0.11083984, + "step": 1925, + "time_per_iteration": 4.909639120101929 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083689, + "balance_loss_mlp": 1.04997683, + "epoch": 0.37052712581762215, + "flos": 563324060160.0, + "grad_norm": 0.19489786122972683, + "language_loss": 0.87265027, + "learning_rate": 0.0007255228077730903, + "loss": 0.88348716, + "num_input_tokens_seen": 160148976, + "router_z_loss_mlp": 0.33740234, + "step": 1926, + "time_per_iteration": 2.726412773132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092339, + "balance_loss_mlp": 1.05850744, + "epoch": 0.37071950750288574, + "flos": 925706451456.0, + "grad_norm": 0.06639539702607969, + "language_loss": 0.81730163, + "learning_rate": 0.0007252447122218632, + "loss": 0.82822502, + "num_input_tokens_seen": 160233504, + "router_z_loss_mlp": 0.33862305, + "step": 1927, + "time_per_iteration": 3.157439708709717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090678, + "balance_loss_mlp": 1.05710912, + "epoch": 0.37091188918814927, + "flos": 418090609152.0, + "grad_norm": 0.06586667444600991, + "language_loss": 0.87736213, + "learning_rate": 0.0007249665292228834, + "loss": 0.88826889, + "num_input_tokens_seen": 160299696, + "router_z_loss_mlp": 0.3359375, + "step": 1928, + "time_per_iteration": 2.569284677505493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086968, + "balance_loss_mlp": 1.05208778, + "epoch": 0.37110427087341286, + "flos": 462941899776.0, + "grad_norm": 0.056849308308669105, + "language_loss": 0.83676869, + "learning_rate": 0.000724688258884151, + "loss": 0.84763837, + "num_input_tokens_seen": 160367904, + "router_z_loss_mlp": 0.34912109, + "step": 1929, + "time_per_iteration": 2.522596597671509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085634, + "balance_loss_mlp": 1.05204105, + "epoch": 0.3712966525586764, + "flos": 849303843840.0, + "grad_norm": 0.0484214736208702, + "language_loss": 0.86208755, + "learning_rate": 0.0007244099013137002, + "loss": 0.87294388, + "num_input_tokens_seen": 160453600, + "router_z_loss_mlp": 0.33618164, + "step": 1930, + "time_per_iteration": 3.055302619934082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089345, + "balance_loss_mlp": 1.05370176, + "epoch": 0.37148903424394, + "flos": 925555092480.0, + "grad_norm": 0.05147814185741214, + "language_loss": 0.88918859, + "learning_rate": 0.0007241314566195993, + "loss": 0.90008199, + "num_input_tokens_seen": 160543472, + "router_z_loss_mlp": 0.35693359, + "step": 1931, + "time_per_iteration": 3.249950408935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108542, + "balance_loss_mlp": 1.05020559, + "epoch": 0.37168141592920356, + "flos": 519565473792.0, + "grad_norm": 0.061459583473066896, + "language_loss": 0.85347825, + "learning_rate": 0.0007238529249099496, + "loss": 0.86433244, + "num_input_tokens_seen": 160614016, + "router_z_loss_mlp": 0.35253906, + "step": 1932, + "time_per_iteration": 2.603287696838379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068675, + "balance_loss_mlp": 1.05599129, + "epoch": 0.3718737976144671, + "flos": 1445107267584.0, + "grad_norm": 0.02721294021284605, + "language_loss": 0.77856874, + "learning_rate": 0.0007235743062928872, + "loss": 0.7892555, + "num_input_tokens_seen": 160828640, + "router_z_loss_mlp": 0.12695312, + "step": 1933, + "time_per_iteration": 4.850685358047485 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089729, + "balance_loss_mlp": 1.05346537, + "epoch": 0.3720661792997307, + "flos": 759218558976.0, + "grad_norm": 0.08735029164116491, + "language_loss": 0.80658156, + "learning_rate": 0.000723295600876581, + "loss": 0.81747884, + "num_input_tokens_seen": 160913088, + "router_z_loss_mlp": 0.36279297, + "step": 1934, + "time_per_iteration": 3.0082099437713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092646, + "balance_loss_mlp": 1.05690694, + "epoch": 0.3722585609849942, + "flos": 516686031360.0, + "grad_norm": 0.1760204301041219, + "language_loss": 0.8798061, + "learning_rate": 0.0007230168087692344, + "loss": 0.89073259, + "num_input_tokens_seen": 160982960, + "router_z_loss_mlp": 0.35791016, + "step": 1935, + "time_per_iteration": 2.7076756954193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095918, + "balance_loss_mlp": 1.06070328, + "epoch": 0.3724509426702578, + "flos": 782114171904.0, + "grad_norm": 0.058450977170247324, + "language_loss": 0.82290804, + "learning_rate": 0.0007227379300790839, + "loss": 0.83386725, + "num_input_tokens_seen": 161066000, + "router_z_loss_mlp": 0.35205078, + "step": 1936, + "time_per_iteration": 3.0381107330322266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108913, + "balance_loss_mlp": 1.07262528, + "epoch": 0.37264332435552133, + "flos": 391502246400.0, + "grad_norm": 0.062314619417064634, + "language_loss": 0.85779369, + "learning_rate": 0.0007224589649143997, + "loss": 0.86888283, + "num_input_tokens_seen": 161131040, + "router_z_loss_mlp": 0.36328125, + "step": 1937, + "time_per_iteration": 2.5413918495178223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118094, + "balance_loss_mlp": 1.08223581, + "epoch": 0.3728357060407849, + "flos": 542599299072.0, + "grad_norm": 0.08241458585549921, + "language_loss": 0.80921531, + "learning_rate": 0.0007221799133834861, + "loss": 0.82039624, + "num_input_tokens_seen": 161201248, + "router_z_loss_mlp": 0.35839844, + "step": 1938, + "time_per_iteration": 2.620593309402466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122902, + "balance_loss_mlp": 1.08682895, + "epoch": 0.3730280877260485, + "flos": 433344646656.0, + "grad_norm": 0.05702640818290307, + "language_loss": 0.81373966, + "learning_rate": 0.00072190077559468, + "loss": 0.8249687, + "num_input_tokens_seen": 161266288, + "router_z_loss_mlp": 0.36083984, + "step": 1939, + "time_per_iteration": 2.512871026992798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133095, + "balance_loss_mlp": 1.09587836, + "epoch": 0.37322046941131204, + "flos": 531230068224.0, + "grad_norm": 0.0616329871980105, + "language_loss": 0.89228082, + "learning_rate": 0.0007216215516563527, + "loss": 0.90361178, + "num_input_tokens_seen": 161335648, + "router_z_loss_mlp": 0.37207031, + "step": 1940, + "time_per_iteration": 2.6655144691467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113311, + "balance_loss_mlp": 1.09534478, + "epoch": 0.3734128510965756, + "flos": 531294087168.0, + "grad_norm": 0.05659412158312536, + "language_loss": 0.83479297, + "learning_rate": 0.0007213422416769083, + "loss": 0.84612405, + "num_input_tokens_seen": 161403440, + "router_z_loss_mlp": 0.37744141, + "step": 1941, + "time_per_iteration": 2.6088144779205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127109, + "balance_loss_mlp": 1.09022546, + "epoch": 0.37360523278183916, + "flos": 500195284992.0, + "grad_norm": 0.05910558413712496, + "language_loss": 0.74991721, + "learning_rate": 0.0007210628457647849, + "loss": 0.76118833, + "num_input_tokens_seen": 161472864, + "router_z_loss_mlp": 0.36889648, + "step": 1942, + "time_per_iteration": 2.57867169380188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131479, + "balance_loss_mlp": 1.09366596, + "epoch": 0.37379761446710275, + "flos": 547652413440.0, + "grad_norm": 0.06364761456819781, + "language_loss": 0.79148316, + "learning_rate": 0.000720783364028453, + "loss": 0.80279785, + "num_input_tokens_seen": 161548096, + "router_z_loss_mlp": 0.37768555, + "step": 1943, + "time_per_iteration": 2.744575023651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121556, + "balance_loss_mlp": 1.0834806, + "epoch": 0.3739899961523663, + "flos": 475517316096.0, + "grad_norm": 0.05406366318559307, + "language_loss": 0.87411249, + "learning_rate": 0.0007205037965764177, + "loss": 0.88532799, + "num_input_tokens_seen": 161615600, + "router_z_loss_mlp": 0.38061523, + "step": 1944, + "time_per_iteration": 2.5253238677978516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122588, + "balance_loss_mlp": 1.0851568, + "epoch": 0.37418237783762986, + "flos": 611626581504.0, + "grad_norm": 0.05571778703090581, + "language_loss": 0.85614675, + "learning_rate": 0.0007202241435172161, + "loss": 0.86737263, + "num_input_tokens_seen": 161687408, + "router_z_loss_mlp": 0.37426758, + "step": 1945, + "time_per_iteration": 2.7462716102600098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117651, + "balance_loss_mlp": 1.07931328, + "epoch": 0.3743747595228934, + "flos": 765953694720.0, + "grad_norm": 0.05225192391609906, + "language_loss": 0.88148731, + "learning_rate": 0.0007199444049594198, + "loss": 0.89266384, + "num_input_tokens_seen": 161764224, + "router_z_loss_mlp": 0.38330078, + "step": 1946, + "time_per_iteration": 2.9533469676971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116752, + "balance_loss_mlp": 1.07798529, + "epoch": 0.374567141208157, + "flos": 524120993280.0, + "grad_norm": 0.06150523549490838, + "language_loss": 0.83402771, + "learning_rate": 0.0007196645810116322, + "loss": 0.84519523, + "num_input_tokens_seen": 161835520, + "router_z_loss_mlp": 0.38720703, + "step": 1947, + "time_per_iteration": 2.709965705871582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106001, + "balance_loss_mlp": 1.06735349, + "epoch": 0.37475952289342057, + "flos": 681067090944.0, + "grad_norm": 0.05833074938802531, + "language_loss": 0.83909506, + "learning_rate": 0.0007193846717824912, + "loss": 0.850155, + "num_input_tokens_seen": 161912000, + "router_z_loss_mlp": 0.38598633, + "step": 1948, + "time_per_iteration": 2.854522705078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094783, + "balance_loss_mlp": 1.05682671, + "epoch": 0.3749519045786841, + "flos": 460061047296.0, + "grad_norm": 0.06673844071937801, + "language_loss": 0.88263041, + "learning_rate": 0.0007191046773806669, + "loss": 0.89357823, + "num_input_tokens_seen": 161977296, + "router_z_loss_mlp": 0.37915039, + "step": 1949, + "time_per_iteration": 2.575989007949829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085738, + "balance_loss_mlp": 1.04682803, + "epoch": 0.3751442862639477, + "flos": 954471458304.0, + "grad_norm": 0.06638817682476543, + "language_loss": 0.83010924, + "learning_rate": 0.0007188245979148631, + "loss": 0.84096658, + "num_input_tokens_seen": 162051888, + "router_z_loss_mlp": 0.38867188, + "step": 1950, + "time_per_iteration": 3.1386518478393555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082888, + "balance_loss_mlp": 1.04462171, + "epoch": 0.3753366679492112, + "flos": 527483473920.0, + "grad_norm": 0.05996025340147905, + "language_loss": 0.8766306, + "learning_rate": 0.0007185444334938157, + "loss": 0.88745946, + "num_input_tokens_seen": 162124384, + "router_z_loss_mlp": 0.38232422, + "step": 1951, + "time_per_iteration": 2.644848585128784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082635, + "balance_loss_mlp": 1.04501283, + "epoch": 0.3755290496344748, + "flos": 521535504384.0, + "grad_norm": 0.05938829335869994, + "language_loss": 0.84891546, + "learning_rate": 0.0007182641842262947, + "loss": 0.85974181, + "num_input_tokens_seen": 162191440, + "router_z_loss_mlp": 0.3762207, + "step": 1952, + "time_per_iteration": 2.6239395141601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081192, + "balance_loss_mlp": 1.04361689, + "epoch": 0.37572143131973834, + "flos": 620810403840.0, + "grad_norm": 0.06544951097265184, + "language_loss": 0.77827752, + "learning_rate": 0.0007179838502211022, + "loss": 0.78908944, + "num_input_tokens_seen": 162268480, + "router_z_loss_mlp": 0.37524414, + "step": 1953, + "time_per_iteration": 2.8444712162017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076992, + "balance_loss_mlp": 1.03958416, + "epoch": 0.37591381300500193, + "flos": 770635842048.0, + "grad_norm": 0.05616797515781331, + "language_loss": 0.86183697, + "learning_rate": 0.0007177034315870738, + "loss": 0.87260687, + "num_input_tokens_seen": 162346752, + "router_z_loss_mlp": 0.37402344, + "step": 1954, + "time_per_iteration": 2.9628727436065674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078673, + "balance_loss_mlp": 1.0411936, + "epoch": 0.37610619469026546, + "flos": 520191106560.0, + "grad_norm": 0.05872311076525267, + "language_loss": 0.9098376, + "learning_rate": 0.0007174229284330773, + "loss": 0.92062426, + "num_input_tokens_seen": 162415120, + "router_z_loss_mlp": 0.37402344, + "step": 1955, + "time_per_iteration": 2.579792022705078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010784, + "balance_loss_mlp": 1.0412302, + "epoch": 0.37629857637552905, + "flos": 598524456960.0, + "grad_norm": 0.050284285498010506, + "language_loss": 0.86896843, + "learning_rate": 0.0007171423408680141, + "loss": 0.8797524, + "num_input_tokens_seen": 162493280, + "router_z_loss_mlp": 0.37133789, + "step": 1956, + "time_per_iteration": 2.7764384746551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079026, + "balance_loss_mlp": 1.04211903, + "epoch": 0.37649095806079264, + "flos": 564687396864.0, + "grad_norm": 0.058102078307858664, + "language_loss": 0.89614129, + "learning_rate": 0.0007168616690008176, + "loss": 0.90693152, + "num_input_tokens_seen": 162560736, + "router_z_loss_mlp": 0.36889648, + "step": 1957, + "time_per_iteration": 2.646986246109009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083688, + "balance_loss_mlp": 1.04606569, + "epoch": 0.37668333974605617, + "flos": 592196755968.0, + "grad_norm": 0.10223927136981294, + "language_loss": 0.86142451, + "learning_rate": 0.0007165809129404545, + "loss": 0.8722614, + "num_input_tokens_seen": 162630688, + "router_z_loss_mlp": 0.37573242, + "step": 1958, + "time_per_iteration": 2.756287097930908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081219, + "balance_loss_mlp": 1.04440713, + "epoch": 0.37687572143131975, + "flos": 419257506816.0, + "grad_norm": 0.0560584683493853, + "language_loss": 0.85760534, + "learning_rate": 0.0007163000727959239, + "loss": 0.8684175, + "num_input_tokens_seen": 162694304, + "router_z_loss_mlp": 0.36791992, + "step": 1959, + "time_per_iteration": 2.5415151119232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105237, + "balance_loss_mlp": 1.03587127, + "epoch": 0.3770681031165833, + "flos": 1356484243968.0, + "grad_norm": 0.028402472736143748, + "language_loss": 0.77959073, + "learning_rate": 0.0007160191486762575, + "loss": 0.7901144, + "num_input_tokens_seen": 162920336, + "router_z_loss_mlp": 0.16503906, + "step": 1960, + "time_per_iteration": 4.892657518386841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086261, + "balance_loss_mlp": 1.04892445, + "epoch": 0.3772604848018469, + "flos": 644592107520.0, + "grad_norm": 0.04530874218926827, + "language_loss": 0.84377986, + "learning_rate": 0.00071573814069052, + "loss": 0.85464251, + "num_input_tokens_seen": 163000720, + "router_z_loss_mlp": 0.37329102, + "step": 1961, + "time_per_iteration": 2.898301839828491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088156, + "balance_loss_mlp": 1.05098641, + "epoch": 0.3774528664871104, + "flos": 901265619456.0, + "grad_norm": 0.052585227845940184, + "language_loss": 0.87987518, + "learning_rate": 0.0007154570489478081, + "loss": 0.89075673, + "num_input_tokens_seen": 163085680, + "router_z_loss_mlp": 0.37158203, + "step": 1962, + "time_per_iteration": 3.1638717651367188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088198, + "balance_loss_mlp": 1.05048013, + "epoch": 0.377645248172374, + "flos": 787717315584.0, + "grad_norm": 0.047624248218528294, + "language_loss": 0.864995, + "learning_rate": 0.0007151758735572514, + "loss": 0.87587702, + "num_input_tokens_seen": 163162224, + "router_z_loss_mlp": 0.37695312, + "step": 1963, + "time_per_iteration": 2.985558271408081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090796, + "balance_loss_mlp": 1.05236292, + "epoch": 0.3778376298576376, + "flos": 586417522176.0, + "grad_norm": 0.06598015027050642, + "language_loss": 0.80448836, + "learning_rate": 0.0007148946146280119, + "loss": 0.81539631, + "num_input_tokens_seen": 163237920, + "router_z_loss_mlp": 0.38427734, + "step": 1964, + "time_per_iteration": 2.784947395324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053975, + "balance_loss_mlp": 1.03938425, + "epoch": 0.3780300115429011, + "flos": 1396014759936.0, + "grad_norm": 0.018109037433210438, + "language_loss": 0.72192144, + "learning_rate": 0.000714613272269284, + "loss": 0.73246121, + "num_input_tokens_seen": 163455760, + "router_z_loss_mlp": 0.14550781, + "step": 1965, + "time_per_iteration": 4.85606050491333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052214, + "balance_loss_mlp": 1.03838599, + "epoch": 0.3782223932281647, + "flos": 1356935348736.0, + "grad_norm": 0.019183634636191996, + "language_loss": 0.75341946, + "learning_rate": 0.0007143318465902943, + "loss": 0.76394159, + "num_input_tokens_seen": 163678064, + "router_z_loss_mlp": 0.13867188, + "step": 1966, + "time_per_iteration": 4.946903467178345 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082797, + "balance_loss_mlp": 1.04517484, + "epoch": 0.37841477491342823, + "flos": 703811344896.0, + "grad_norm": 0.05921890511558738, + "language_loss": 0.83782387, + "learning_rate": 0.0007140503377003022, + "loss": 0.84865183, + "num_input_tokens_seen": 163764320, + "router_z_loss_mlp": 0.3762207, + "step": 1967, + "time_per_iteration": 2.984163761138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089165, + "balance_loss_mlp": 1.0504458, + "epoch": 0.3786071565986918, + "flos": 528856985088.0, + "grad_norm": 0.047303083725180994, + "language_loss": 0.84754062, + "learning_rate": 0.000713768745708599, + "loss": 0.85843223, + "num_input_tokens_seen": 163831808, + "router_z_loss_mlp": 0.38696289, + "step": 1968, + "time_per_iteration": 2.6251039505004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086627, + "balance_loss_mlp": 1.04881418, + "epoch": 0.37879953828395535, + "flos": 992872802304.0, + "grad_norm": 0.053091209869219315, + "language_loss": 0.76740122, + "learning_rate": 0.0007134870707245085, + "loss": 0.7782675, + "num_input_tokens_seen": 163918128, + "router_z_loss_mlp": 0.37792969, + "step": 1969, + "time_per_iteration": 3.252840995788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082313, + "balance_loss_mlp": 1.04435682, + "epoch": 0.37899191996921894, + "flos": 626358292992.0, + "grad_norm": 0.06088981396741891, + "language_loss": 0.8454808, + "learning_rate": 0.0007132053128573864, + "loss": 0.85630393, + "num_input_tokens_seen": 163987552, + "router_z_loss_mlp": 0.37915039, + "step": 1970, + "time_per_iteration": 2.739210844039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078955, + "balance_loss_mlp": 1.0417614, + "epoch": 0.37918430165448247, + "flos": 686005314048.0, + "grad_norm": 0.05972304110224919, + "language_loss": 0.83631253, + "learning_rate": 0.0007129234722166211, + "loss": 0.84710205, + "num_input_tokens_seen": 164063248, + "router_z_loss_mlp": 0.37182617, + "step": 1971, + "time_per_iteration": 2.814235210418701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086171, + "balance_loss_mlp": 1.05012178, + "epoch": 0.37937668333974606, + "flos": 475374721536.0, + "grad_norm": 0.05230765101952506, + "language_loss": 0.91063309, + "learning_rate": 0.0007126415489116328, + "loss": 0.92149478, + "num_input_tokens_seen": 164133776, + "router_z_loss_mlp": 0.3605957, + "step": 1972, + "time_per_iteration": 2.657435178756714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091568, + "balance_loss_mlp": 1.05413604, + "epoch": 0.37956906502500964, + "flos": 707271340032.0, + "grad_norm": 0.05210329015751025, + "language_loss": 0.81174934, + "learning_rate": 0.0007123595430518736, + "loss": 0.82266498, + "num_input_tokens_seen": 164206672, + "router_z_loss_mlp": 0.37402344, + "step": 1973, + "time_per_iteration": 2.832801103591919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108628, + "balance_loss_mlp": 1.04856205, + "epoch": 0.3797614467102732, + "flos": 426421836288.0, + "grad_norm": 0.07403044475037865, + "language_loss": 0.8602494, + "learning_rate": 0.0007120774547468282, + "loss": 0.87111217, + "num_input_tokens_seen": 164271968, + "router_z_loss_mlp": 0.37695312, + "step": 1974, + "time_per_iteration": 2.523059844970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087165, + "balance_loss_mlp": 1.05016232, + "epoch": 0.37995382839553676, + "flos": 481588941312.0, + "grad_norm": 0.05250431859571283, + "language_loss": 0.81228226, + "learning_rate": 0.0007117952841060128, + "loss": 0.82315391, + "num_input_tokens_seen": 164342800, + "router_z_loss_mlp": 0.36962891, + "step": 1975, + "time_per_iteration": 2.648947238922119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080927, + "balance_loss_mlp": 1.04409158, + "epoch": 0.3801462100808003, + "flos": 560286056448.0, + "grad_norm": 0.0511194255012935, + "language_loss": 0.83466387, + "learning_rate": 0.0007115130312389756, + "loss": 0.84547317, + "num_input_tokens_seen": 164414928, + "router_z_loss_mlp": 0.3684082, + "step": 1976, + "time_per_iteration": 2.6648154258728027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086722, + "balance_loss_mlp": 1.0505538, + "epoch": 0.3803385917660639, + "flos": 464699524608.0, + "grad_norm": 0.06028169205400359, + "language_loss": 0.79143679, + "learning_rate": 0.0007112306962552973, + "loss": 0.80230403, + "num_input_tokens_seen": 164483312, + "router_z_loss_mlp": 0.36181641, + "step": 1977, + "time_per_iteration": 2.5715434551239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086468, + "balance_loss_mlp": 1.04934657, + "epoch": 0.3805309734513274, + "flos": 521614080000.0, + "grad_norm": 0.055719330197324175, + "language_loss": 0.8517288, + "learning_rate": 0.0007109482792645896, + "loss": 0.86259341, + "num_input_tokens_seen": 164555760, + "router_z_loss_mlp": 0.37084961, + "step": 1978, + "time_per_iteration": 2.6932663917541504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088226, + "balance_loss_mlp": 1.05222487, + "epoch": 0.380723355136591, + "flos": 591128782848.0, + "grad_norm": 0.06665517257748008, + "language_loss": 0.83491528, + "learning_rate": 0.0007106657803764969, + "loss": 0.84579754, + "num_input_tokens_seen": 164626768, + "router_z_loss_mlp": 0.36010742, + "step": 1979, + "time_per_iteration": 2.710658311843872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079049, + "balance_loss_mlp": 1.04204643, + "epoch": 0.38091573682185453, + "flos": 622394910720.0, + "grad_norm": 0.05735071115872701, + "language_loss": 0.81648314, + "learning_rate": 0.0007103831997006948, + "loss": 0.82727367, + "num_input_tokens_seen": 164698016, + "router_z_loss_mlp": 0.36987305, + "step": 1980, + "time_per_iteration": 2.7589223384857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107835, + "balance_loss_mlp": 1.04168153, + "epoch": 0.3811081185071181, + "flos": 568716208128.0, + "grad_norm": 0.047165366669346453, + "language_loss": 0.85731214, + "learning_rate": 0.0007101005373468908, + "loss": 0.86809564, + "num_input_tokens_seen": 164780320, + "router_z_loss_mlp": 0.36669922, + "step": 1981, + "time_per_iteration": 2.85588002204895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077478, + "balance_loss_mlp": 1.04161954, + "epoch": 0.3813005001923817, + "flos": 584550798336.0, + "grad_norm": 0.055048826019740454, + "language_loss": 0.86394024, + "learning_rate": 0.0007098177934248242, + "loss": 0.87471503, + "num_input_tokens_seen": 164854400, + "router_z_loss_mlp": 0.35888672, + "step": 1982, + "time_per_iteration": 2.7226805686950684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077944, + "balance_loss_mlp": 1.04160953, + "epoch": 0.38149288187764524, + "flos": 621287649792.0, + "grad_norm": 0.056689743602043985, + "language_loss": 0.85823661, + "learning_rate": 0.0007095349680442661, + "loss": 0.86901605, + "num_input_tokens_seen": 164932896, + "router_z_loss_mlp": 0.36352539, + "step": 1983, + "time_per_iteration": 2.8454253673553467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075326, + "balance_loss_mlp": 1.03829932, + "epoch": 0.3816852635629088, + "flos": 570414196224.0, + "grad_norm": 0.07971741755252446, + "language_loss": 0.78927159, + "learning_rate": 0.0007092520613150188, + "loss": 0.80002487, + "num_input_tokens_seen": 165002896, + "router_z_loss_mlp": 0.37036133, + "step": 1984, + "time_per_iteration": 2.7279529571533203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081451, + "balance_loss_mlp": 1.04368556, + "epoch": 0.38187764524817236, + "flos": 565313029632.0, + "grad_norm": 0.06238598748372814, + "language_loss": 0.81304747, + "learning_rate": 0.0007089690733469165, + "loss": 0.82386196, + "num_input_tokens_seen": 165074704, + "router_z_loss_mlp": 0.37719727, + "step": 1985, + "time_per_iteration": 2.7343544960021973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077201, + "balance_loss_mlp": 1.04115212, + "epoch": 0.38207002693343595, + "flos": 630932751360.0, + "grad_norm": 0.07832972313002672, + "language_loss": 0.82561398, + "learning_rate": 0.000708686004249825, + "loss": 0.83638602, + "num_input_tokens_seen": 165149136, + "router_z_loss_mlp": 0.36035156, + "step": 1986, + "time_per_iteration": 2.7691054344177246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082228, + "balance_loss_mlp": 1.04572582, + "epoch": 0.3822624086186995, + "flos": 548507980800.0, + "grad_norm": 0.053849318526496194, + "language_loss": 0.9147824, + "learning_rate": 0.0007084028541336413, + "loss": 0.9256047, + "num_input_tokens_seen": 165220864, + "router_z_loss_mlp": 0.36499023, + "step": 1987, + "time_per_iteration": 2.7131056785583496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083994, + "balance_loss_mlp": 1.04753971, + "epoch": 0.38245479030396307, + "flos": 613571880960.0, + "grad_norm": 0.06787860515410171, + "language_loss": 0.86709088, + "learning_rate": 0.0007081196231082942, + "loss": 0.87793082, + "num_input_tokens_seen": 165301568, + "router_z_loss_mlp": 0.36450195, + "step": 1988, + "time_per_iteration": 2.7983548641204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083168, + "balance_loss_mlp": 1.04621339, + "epoch": 0.38264717198922665, + "flos": 667787466240.0, + "grad_norm": 0.05230973877590939, + "language_loss": 0.80107033, + "learning_rate": 0.0007078363112837436, + "loss": 0.81190205, + "num_input_tokens_seen": 165373152, + "router_z_loss_mlp": 0.36938477, + "step": 1989, + "time_per_iteration": 2.8133270740509033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087048, + "balance_loss_mlp": 1.04935408, + "epoch": 0.3828395536744902, + "flos": 454521922560.0, + "grad_norm": 0.077904410907181, + "language_loss": 0.84988701, + "learning_rate": 0.000707552918769981, + "loss": 0.86075753, + "num_input_tokens_seen": 165439136, + "router_z_loss_mlp": 0.37646484, + "step": 1990, + "time_per_iteration": 2.5100817680358887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089992, + "balance_loss_mlp": 1.05213106, + "epoch": 0.3830319353597538, + "flos": 499191330816.0, + "grad_norm": 0.06242573245055077, + "language_loss": 0.83457661, + "learning_rate": 0.000707269445677029, + "loss": 0.84547657, + "num_input_tokens_seen": 165514624, + "router_z_loss_mlp": 0.37817383, + "step": 1991, + "time_per_iteration": 2.7737526893615723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099946, + "balance_loss_mlp": 1.06327772, + "epoch": 0.3832243170450173, + "flos": 743787021312.0, + "grad_norm": 0.05437985066129539, + "language_loss": 0.84858984, + "learning_rate": 0.0007069858921149416, + "loss": 0.85958934, + "num_input_tokens_seen": 165594512, + "router_z_loss_mlp": 0.36694336, + "step": 1992, + "time_per_iteration": 2.9642581939697266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095684, + "balance_loss_mlp": 1.05868101, + "epoch": 0.3834166987302809, + "flos": 577937908224.0, + "grad_norm": 0.10762195872615073, + "language_loss": 0.85869837, + "learning_rate": 0.0007067022581938043, + "loss": 0.86965525, + "num_input_tokens_seen": 165673968, + "router_z_loss_mlp": 0.36987305, + "step": 1993, + "time_per_iteration": 2.805201292037964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101221, + "balance_loss_mlp": 1.06531525, + "epoch": 0.3836090804155444, + "flos": 536194432512.0, + "grad_norm": 0.06280477504596697, + "language_loss": 0.831635, + "learning_rate": 0.0007064185440237334, + "loss": 0.84264719, + "num_input_tokens_seen": 165747664, + "router_z_loss_mlp": 0.359375, + "step": 1994, + "time_per_iteration": 2.7297706604003906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101916, + "balance_loss_mlp": 1.06527066, + "epoch": 0.383801462100808, + "flos": 601587191808.0, + "grad_norm": 0.05513764490979663, + "language_loss": 0.84278905, + "learning_rate": 0.0007061347497148764, + "loss": 0.85380822, + "num_input_tokens_seen": 165824624, + "router_z_loss_mlp": 0.36621094, + "step": 1995, + "time_per_iteration": 2.725632429122925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094031, + "balance_loss_mlp": 1.05876923, + "epoch": 0.38399384378607154, + "flos": 572427896832.0, + "grad_norm": 0.06604776765413087, + "language_loss": 0.86282277, + "learning_rate": 0.0007058508753774122, + "loss": 0.87376308, + "num_input_tokens_seen": 165896304, + "router_z_loss_mlp": 0.35302734, + "step": 1996, + "time_per_iteration": 2.760045051574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092829, + "balance_loss_mlp": 1.05773425, + "epoch": 0.38418622547133513, + "flos": 536513117184.0, + "grad_norm": 0.058737109015633, + "language_loss": 0.86788458, + "learning_rate": 0.0007055669211215505, + "loss": 0.87881291, + "num_input_tokens_seen": 165961312, + "router_z_loss_mlp": 0.35131836, + "step": 1997, + "time_per_iteration": 2.6091408729553223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088772, + "balance_loss_mlp": 1.05238962, + "epoch": 0.3843786071565987, + "flos": 572673798144.0, + "grad_norm": 0.06483433205315106, + "language_loss": 0.77687544, + "learning_rate": 0.0007052828870575322, + "loss": 0.78776312, + "num_input_tokens_seen": 166028064, + "router_z_loss_mlp": 0.36376953, + "step": 1998, + "time_per_iteration": 2.671349048614502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109131, + "balance_loss_mlp": 1.0558331, + "epoch": 0.38457098884186225, + "flos": 728361275904.0, + "grad_norm": 0.05010154832824161, + "language_loss": 0.86955881, + "learning_rate": 0.0007049987732956291, + "loss": 0.88047194, + "num_input_tokens_seen": 166110272, + "router_z_loss_mlp": 0.35498047, + "step": 1999, + "time_per_iteration": 2.9918439388275146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108718, + "balance_loss_mlp": 1.05189395, + "epoch": 0.38476337052712584, + "flos": 583123442688.0, + "grad_norm": 0.047224279388360366, + "language_loss": 0.82623643, + "learning_rate": 0.0007047145799461439, + "loss": 0.83710825, + "num_input_tokens_seen": 166193088, + "router_z_loss_mlp": 0.35327148, + "step": 2000, + "time_per_iteration": 2.84328293800354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092374, + "balance_loss_mlp": 1.05656374, + "epoch": 0.38495575221238937, + "flos": 552787075584.0, + "grad_norm": 0.05254385134155795, + "language_loss": 0.82269979, + "learning_rate": 0.00070443030711941, + "loss": 0.83362353, + "num_input_tokens_seen": 166271776, + "router_z_loss_mlp": 0.3581543, + "step": 2001, + "time_per_iteration": 2.753903865814209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092226, + "balance_loss_mlp": 1.0559628, + "epoch": 0.38514813389765296, + "flos": 654173190144.0, + "grad_norm": 0.05896823149323879, + "language_loss": 0.8241961, + "learning_rate": 0.0007041459549257924, + "loss": 0.83511841, + "num_input_tokens_seen": 166350000, + "router_z_loss_mlp": 0.36254883, + "step": 2002, + "time_per_iteration": 2.85963773727417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086715, + "balance_loss_mlp": 1.05030835, + "epoch": 0.3853405155829165, + "flos": 867715158528.0, + "grad_norm": 0.0671523306708724, + "language_loss": 0.78569824, + "learning_rate": 0.0007038615234756859, + "loss": 0.79656541, + "num_input_tokens_seen": 166434336, + "router_z_loss_mlp": 0.36425781, + "step": 2003, + "time_per_iteration": 3.1452226638793945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080594, + "balance_loss_mlp": 1.04440188, + "epoch": 0.3855328972681801, + "flos": 546164011008.0, + "grad_norm": 0.05736478292188374, + "language_loss": 0.83675313, + "learning_rate": 0.000703577012879517, + "loss": 0.84755898, + "num_input_tokens_seen": 166503952, + "router_z_loss_mlp": 0.36230469, + "step": 2004, + "time_per_iteration": 2.6308705806732178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075931, + "balance_loss_mlp": 1.04040706, + "epoch": 0.3857252789534436, + "flos": 533819939328.0, + "grad_norm": 0.0602394573591363, + "language_loss": 0.8843599, + "learning_rate": 0.0007032924232477423, + "loss": 0.89511919, + "num_input_tokens_seen": 166575168, + "router_z_loss_mlp": 0.35595703, + "step": 2005, + "time_per_iteration": 2.6188220977783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079495, + "balance_loss_mlp": 1.04337406, + "epoch": 0.3859176606387072, + "flos": 491514849792.0, + "grad_norm": 0.055511202055775664, + "language_loss": 0.80448711, + "learning_rate": 0.0007030077546908493, + "loss": 0.81528199, + "num_input_tokens_seen": 166647552, + "router_z_loss_mlp": 0.36132812, + "step": 2006, + "time_per_iteration": 2.6309516429901123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088067, + "balance_loss_mlp": 1.07609844, + "epoch": 0.3861100423239708, + "flos": 1486278955008.0, + "grad_norm": 0.032522163132150485, + "language_loss": 0.83064663, + "learning_rate": 0.0007027230073193561, + "loss": 0.84152722, + "num_input_tokens_seen": 166875088, + "router_z_loss_mlp": 0.11962891, + "step": 2007, + "time_per_iteration": 4.736604452133179 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083286, + "balance_loss_mlp": 1.04816651, + "epoch": 0.3863024240092343, + "flos": 473493441024.0, + "grad_norm": 0.05514866045126494, + "language_loss": 0.79152983, + "learning_rate": 0.0007024381812438117, + "loss": 0.80236268, + "num_input_tokens_seen": 166939344, + "router_z_loss_mlp": 0.3515625, + "step": 2008, + "time_per_iteration": 2.5392396450042725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108868, + "balance_loss_mlp": 1.05306053, + "epoch": 0.3864948056944979, + "flos": 716258723328.0, + "grad_norm": 0.059806412844581394, + "language_loss": 0.83199877, + "learning_rate": 0.0007021532765747951, + "loss": 0.84288561, + "num_input_tokens_seen": 167014992, + "router_z_loss_mlp": 0.35668945, + "step": 2009, + "time_per_iteration": 2.9926528930664062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087876, + "balance_loss_mlp": 1.0511353, + "epoch": 0.38668718737976143, + "flos": 727302067200.0, + "grad_norm": 0.05631620148302912, + "language_loss": 0.7933259, + "learning_rate": 0.0007018682934229162, + "loss": 0.8042047, + "num_input_tokens_seen": 167092096, + "router_z_loss_mlp": 0.36743164, + "step": 2010, + "time_per_iteration": 2.924781322479248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087301, + "balance_loss_mlp": 1.05103779, + "epoch": 0.386879569065025, + "flos": 525218079744.0, + "grad_norm": 0.05794664731816873, + "language_loss": 0.82387936, + "learning_rate": 0.0007015832318988152, + "loss": 0.83475244, + "num_input_tokens_seen": 167162144, + "router_z_loss_mlp": 0.36328125, + "step": 2011, + "time_per_iteration": 2.668565511703491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032799, + "balance_loss_mlp": 1.02173615, + "epoch": 0.38707195075028855, + "flos": 1527036005376.0, + "grad_norm": 0.019732384975687786, + "language_loss": 0.73890078, + "learning_rate": 0.000701298092113163, + "loss": 0.74922872, + "num_input_tokens_seen": 167391536, + "router_z_loss_mlp": 0.11083984, + "step": 2012, + "time_per_iteration": 4.948081970214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085105, + "balance_loss_mlp": 1.04886508, + "epoch": 0.38726433243555214, + "flos": 557045821440.0, + "grad_norm": 0.049164425244227684, + "language_loss": 0.84333575, + "learning_rate": 0.0007010128741766604, + "loss": 0.85418677, + "num_input_tokens_seen": 167466000, + "router_z_loss_mlp": 0.36230469, + "step": 2013, + "time_per_iteration": 2.7263684272766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073123, + "balance_loss_mlp": 1.03695476, + "epoch": 0.38745671412081567, + "flos": 553431647232.0, + "grad_norm": 0.06787190277242791, + "language_loss": 0.84107876, + "learning_rate": 0.0007007275782000391, + "loss": 0.85181004, + "num_input_tokens_seen": 167536144, + "router_z_loss_mlp": 0.36181641, + "step": 2014, + "time_per_iteration": 2.6458756923675537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082824, + "balance_loss_mlp": 1.04679942, + "epoch": 0.38764909580607926, + "flos": 458175384576.0, + "grad_norm": 0.05583745089019265, + "language_loss": 0.85148585, + "learning_rate": 0.0007004422042940605, + "loss": 0.86231411, + "num_input_tokens_seen": 167600064, + "router_z_loss_mlp": 0.3605957, + "step": 2015, + "time_per_iteration": 2.5374679565429688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074881, + "balance_loss_mlp": 1.03847444, + "epoch": 0.38784147749134285, + "flos": 521973462528.0, + "grad_norm": 0.056017537147394686, + "language_loss": 0.89528251, + "learning_rate": 0.0007001567525695169, + "loss": 0.90603131, + "num_input_tokens_seen": 167666576, + "router_z_loss_mlp": 0.36425781, + "step": 2016, + "time_per_iteration": 2.5863571166992188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081449, + "balance_loss_mlp": 1.04504275, + "epoch": 0.3880338591766064, + "flos": 665696600064.0, + "grad_norm": 0.05583938490392423, + "language_loss": 0.839926, + "learning_rate": 0.0006998712231372303, + "loss": 0.85074055, + "num_input_tokens_seen": 167753296, + "router_z_loss_mlp": 0.36425781, + "step": 2017, + "time_per_iteration": 2.998652219772339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076964, + "balance_loss_mlp": 1.04103458, + "epoch": 0.38822624086186996, + "flos": 593660427264.0, + "grad_norm": 0.044278068469259586, + "language_loss": 0.86088806, + "learning_rate": 0.0006995856161080532, + "loss": 0.87165773, + "num_input_tokens_seen": 167834080, + "router_z_loss_mlp": 0.35961914, + "step": 2018, + "time_per_iteration": 2.870619297027588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077822, + "balance_loss_mlp": 1.03972268, + "epoch": 0.3884186225471335, + "flos": 612256596480.0, + "grad_norm": 0.10653426783792587, + "language_loss": 0.8221643, + "learning_rate": 0.0006992999315928679, + "loss": 0.83294249, + "num_input_tokens_seen": 167912368, + "router_z_loss_mlp": 0.38061523, + "step": 2019, + "time_per_iteration": 2.7867438793182373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074258, + "balance_loss_mlp": 1.03782773, + "epoch": 0.3886110042323971, + "flos": 606737820672.0, + "grad_norm": 0.05830260104080337, + "language_loss": 0.85476196, + "learning_rate": 0.0006990141697025871, + "loss": 0.8655045, + "num_input_tokens_seen": 167991968, + "router_z_loss_mlp": 0.36401367, + "step": 2020, + "time_per_iteration": 2.7722346782684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008633, + "balance_loss_mlp": 0.99642587, + "epoch": 0.3888033859176606, + "flos": 1527289108992.0, + "grad_norm": 0.011259985776032525, + "language_loss": 0.76359642, + "learning_rate": 0.0006987283305481533, + "loss": 0.77368271, + "num_input_tokens_seen": 168212128, + "router_z_loss_mlp": 0.12207031, + "step": 2021, + "time_per_iteration": 4.71975302696228 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069739, + "balance_loss_mlp": 1.03368998, + "epoch": 0.3889957676029242, + "flos": 692145340416.0, + "grad_norm": 0.08577921290538253, + "language_loss": 0.82040119, + "learning_rate": 0.0006984424142405392, + "loss": 0.83109856, + "num_input_tokens_seen": 168287440, + "router_z_loss_mlp": 0.36035156, + "step": 2022, + "time_per_iteration": 2.8003616333007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070636, + "balance_loss_mlp": 1.03413415, + "epoch": 0.3891881492881878, + "flos": 514937170944.0, + "grad_norm": 0.06357614890279897, + "language_loss": 0.81860286, + "learning_rate": 0.0006981564208907474, + "loss": 0.82930923, + "num_input_tokens_seen": 168354704, + "router_z_loss_mlp": 0.36499023, + "step": 2023, + "time_per_iteration": 2.581556558609009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074791, + "balance_loss_mlp": 1.03933799, + "epoch": 0.3893805309734513, + "flos": 628770663936.0, + "grad_norm": 0.04985256691663517, + "language_loss": 0.90055227, + "learning_rate": 0.0006978703506098102, + "loss": 0.91130018, + "num_input_tokens_seen": 168424272, + "router_z_loss_mlp": 0.35498047, + "step": 2024, + "time_per_iteration": 2.7220654487609863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080786, + "balance_loss_mlp": 1.04497564, + "epoch": 0.3895729126587149, + "flos": 543894234624.0, + "grad_norm": 0.06500254639996711, + "language_loss": 0.88078821, + "learning_rate": 0.00069758420350879, + "loss": 0.89159608, + "num_input_tokens_seen": 168488912, + "router_z_loss_mlp": 0.35839844, + "step": 2025, + "time_per_iteration": 2.6044023036956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086131, + "balance_loss_mlp": 1.05012965, + "epoch": 0.38976529434397844, + "flos": 617987778048.0, + "grad_norm": 0.06153368317516065, + "language_loss": 0.86008936, + "learning_rate": 0.000697297979698779, + "loss": 0.87095064, + "num_input_tokens_seen": 168563248, + "router_z_loss_mlp": 0.36010742, + "step": 2026, + "time_per_iteration": 2.7051053047180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091671, + "balance_loss_mlp": 1.05628932, + "epoch": 0.38995767602924203, + "flos": 834518287872.0, + "grad_norm": 0.05732441037152358, + "language_loss": 0.83766049, + "learning_rate": 0.0006970116792908992, + "loss": 0.84857726, + "num_input_tokens_seen": 168648272, + "router_z_loss_mlp": 0.35400391, + "step": 2027, + "time_per_iteration": 3.086228847503662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096114, + "balance_loss_mlp": 1.06032705, + "epoch": 0.39015005771450556, + "flos": 541343651328.0, + "grad_norm": 0.060477391230123065, + "language_loss": 0.8159399, + "learning_rate": 0.000696725302396302, + "loss": 0.82690096, + "num_input_tokens_seen": 168721760, + "router_z_loss_mlp": 0.3581543, + "step": 2028, + "time_per_iteration": 2.6521902084350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093769, + "balance_loss_mlp": 1.05867422, + "epoch": 0.39034243939976915, + "flos": 1007102536704.0, + "grad_norm": 0.04866281229524116, + "language_loss": 0.85781944, + "learning_rate": 0.0006964388491261692, + "loss": 0.86875713, + "num_input_tokens_seen": 168803664, + "router_z_loss_mlp": 0.35131836, + "step": 2029, + "time_per_iteration": 3.2338647842407227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099777, + "balance_loss_mlp": 1.06401408, + "epoch": 0.3905348210850327, + "flos": 678723121152.0, + "grad_norm": 0.05278281932643199, + "language_loss": 0.87335277, + "learning_rate": 0.0006961523195917114, + "loss": 0.88435054, + "num_input_tokens_seen": 168879184, + "router_z_loss_mlp": 0.35791016, + "step": 2030, + "time_per_iteration": 2.8414504528045654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098277, + "balance_loss_mlp": 1.06256151, + "epoch": 0.39072720277029627, + "flos": 548606905344.0, + "grad_norm": 0.05643291477722073, + "language_loss": 0.77850938, + "learning_rate": 0.0006958657139041696, + "loss": 0.78949213, + "num_input_tokens_seen": 168957808, + "router_z_loss_mlp": 0.35742188, + "step": 2031, + "time_per_iteration": 2.7278060913085938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091715, + "balance_loss_mlp": 1.07807708, + "epoch": 0.39091958445555985, + "flos": 1546912401408.0, + "grad_norm": 0.03426807627657635, + "language_loss": 0.76712966, + "learning_rate": 0.0006955790321748136, + "loss": 0.77804685, + "num_input_tokens_seen": 169194416, + "router_z_loss_mlp": 0.13671875, + "step": 2032, + "time_per_iteration": 4.927444696426392 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099209, + "balance_loss_mlp": 1.06399429, + "epoch": 0.3911119661408234, + "flos": 503741058048.0, + "grad_norm": 0.050822615130563034, + "language_loss": 0.78371578, + "learning_rate": 0.0006952922745149434, + "loss": 0.79470789, + "num_input_tokens_seen": 169263552, + "router_z_loss_mlp": 0.35253906, + "step": 2033, + "time_per_iteration": 2.6730306148529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095244, + "balance_loss_mlp": 1.05933857, + "epoch": 0.391304347826087, + "flos": 556967245824.0, + "grad_norm": 0.05118619150019999, + "language_loss": 0.87770367, + "learning_rate": 0.000695005441035888, + "loss": 0.88865614, + "num_input_tokens_seen": 169333696, + "router_z_loss_mlp": 0.359375, + "step": 2034, + "time_per_iteration": 2.6585283279418945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038632, + "balance_loss_mlp": 1.02461255, + "epoch": 0.3914967295113505, + "flos": 1499309858304.0, + "grad_norm": 0.00946210886057752, + "language_loss": 0.73723435, + "learning_rate": 0.0006947185318490064, + "loss": 0.74762058, + "num_input_tokens_seen": 169556416, + "router_z_loss_mlp": 0.140625, + "step": 2035, + "time_per_iteration": 4.8464648723602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087203, + "balance_loss_mlp": 1.05182171, + "epoch": 0.3916891111966141, + "flos": 706715518464.0, + "grad_norm": 0.07007748344060821, + "language_loss": 0.81416976, + "learning_rate": 0.0006944315470656863, + "loss": 0.82504177, + "num_input_tokens_seen": 169643312, + "router_z_loss_mlp": 0.35424805, + "step": 2036, + "time_per_iteration": 2.9289584159851074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010868, + "balance_loss_mlp": 1.05051255, + "epoch": 0.3918814928818776, + "flos": 556085537280.0, + "grad_norm": 0.05570869743183256, + "language_loss": 0.91007531, + "learning_rate": 0.000694144486797345, + "loss": 0.92094326, + "num_input_tokens_seen": 169712560, + "router_z_loss_mlp": 0.36303711, + "step": 2037, + "time_per_iteration": 0.0013861656188964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043662, + "balance_loss_mlp": 1.02954793, + "epoch": 0.3920738745671412, + "flos": 1537845032448.0, + "grad_norm": 0.018656318140729232, + "language_loss": 0.79520434, + "learning_rate": 0.0006938573511554296, + "loss": 0.80564094, + "num_input_tokens_seen": 169914912, + "router_z_loss_mlp": 0.14160156, + "step": 2038, + "time_per_iteration": 4.6249003410339355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091153, + "balance_loss_mlp": 1.05586696, + "epoch": 0.39226625625240474, + "flos": 498594811392.0, + "grad_norm": 0.06764177247916761, + "language_loss": 0.89479941, + "learning_rate": 0.0006935701402514156, + "loss": 0.90571094, + "num_input_tokens_seen": 169978848, + "router_z_loss_mlp": 0.35327148, + "step": 2039, + "time_per_iteration": 2.535269260406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033365, + "balance_loss_mlp": 1.01963174, + "epoch": 0.39245863793766833, + "flos": 1346465203200.0, + "grad_norm": 0.017448285120256823, + "language_loss": 0.73034894, + "learning_rate": 0.0006932828541968083, + "loss": 0.7406826, + "num_input_tokens_seen": 170211488, + "router_z_loss_mlp": 0.13769531, + "step": 2040, + "time_per_iteration": 4.913180589675903 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086444, + "balance_loss_mlp": 1.05072939, + "epoch": 0.3926510196229319, + "flos": 1345614474240.0, + "grad_norm": 0.055350347752650936, + "language_loss": 0.8453002, + "learning_rate": 0.0006929954931031422, + "loss": 0.85616457, + "num_input_tokens_seen": 170298528, + "router_z_loss_mlp": 0.35742188, + "step": 2041, + "time_per_iteration": 3.6796491146087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081967, + "balance_loss_mlp": 1.04722965, + "epoch": 0.39284340130819545, + "flos": 499333925376.0, + "grad_norm": 0.05434437059814268, + "language_loss": 0.88856936, + "learning_rate": 0.0006927080570819805, + "loss": 0.89938903, + "num_input_tokens_seen": 170365680, + "router_z_loss_mlp": 0.34790039, + "step": 2042, + "time_per_iteration": 2.634052038192749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087578, + "balance_loss_mlp": 1.05217266, + "epoch": 0.39303578299345904, + "flos": 520077625344.0, + "grad_norm": 0.06468620716873735, + "language_loss": 0.80649555, + "learning_rate": 0.0006924205462449161, + "loss": 0.81737131, + "num_input_tokens_seen": 170432224, + "router_z_loss_mlp": 0.35473633, + "step": 2043, + "time_per_iteration": 2.6021780967712402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078315, + "balance_loss_mlp": 1.04288566, + "epoch": 0.39322816467872257, + "flos": 907529301504.0, + "grad_norm": 0.05516365318311268, + "language_loss": 0.82013571, + "learning_rate": 0.0006921329607035702, + "loss": 0.83091891, + "num_input_tokens_seen": 170517920, + "router_z_loss_mlp": 0.35473633, + "step": 2044, + "time_per_iteration": 3.2195992469787598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078665, + "balance_loss_mlp": 1.04473805, + "epoch": 0.39342054636398616, + "flos": 517330603008.0, + "grad_norm": 0.046703626280748714, + "language_loss": 0.88374329, + "learning_rate": 0.0006918453005695938, + "loss": 0.89452994, + "num_input_tokens_seen": 170589072, + "router_z_loss_mlp": 0.33959961, + "step": 2045, + "time_per_iteration": 2.6319282054901123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080714, + "balance_loss_mlp": 1.04435515, + "epoch": 0.3936129280492497, + "flos": 547646621184.0, + "grad_norm": 0.04434497339872072, + "language_loss": 0.8422206, + "learning_rate": 0.0006915575659546662, + "loss": 0.8530277, + "num_input_tokens_seen": 170657856, + "router_z_loss_mlp": 0.36401367, + "step": 2046, + "time_per_iteration": 2.648895263671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081267, + "balance_loss_mlp": 1.04519427, + "epoch": 0.3938053097345133, + "flos": 525858269184.0, + "grad_norm": 0.0524234289418272, + "language_loss": 0.80648899, + "learning_rate": 0.0006912697569704959, + "loss": 0.81730163, + "num_input_tokens_seen": 170723696, + "router_z_loss_mlp": 0.36083984, + "step": 2047, + "time_per_iteration": 2.6330111026763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085798, + "balance_loss_mlp": 1.04934382, + "epoch": 0.39399769141977686, + "flos": 471390990336.0, + "grad_norm": 0.0542278175669412, + "language_loss": 0.86721706, + "learning_rate": 0.0006909818737288205, + "loss": 0.87807506, + "num_input_tokens_seen": 170789536, + "router_z_loss_mlp": 0.36450195, + "step": 2048, + "time_per_iteration": 2.537581205368042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090734, + "balance_loss_mlp": 1.05468488, + "epoch": 0.3941900731050404, + "flos": 501490220544.0, + "grad_norm": 0.056383256559611315, + "language_loss": 0.80660325, + "learning_rate": 0.000690693916341406, + "loss": 0.8175106, + "num_input_tokens_seen": 170859232, + "router_z_loss_mlp": 0.3605957, + "step": 2049, + "time_per_iteration": 2.622183084487915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096224, + "balance_loss_mlp": 1.05898309, + "epoch": 0.394382454790304, + "flos": 580577241600.0, + "grad_norm": 0.11468284139168772, + "language_loss": 0.82465422, + "learning_rate": 0.0006904058849200475, + "loss": 0.83561641, + "num_input_tokens_seen": 170931568, + "router_z_loss_mlp": 0.37255859, + "step": 2050, + "time_per_iteration": 2.7216436862945557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087551, + "balance_loss_mlp": 1.05088186, + "epoch": 0.3945748364755675, + "flos": 513563659776.0, + "grad_norm": 0.05187056217607278, + "language_loss": 0.84988606, + "learning_rate": 0.0006901177795765683, + "loss": 0.86076152, + "num_input_tokens_seen": 170999856, + "router_z_loss_mlp": 0.36694336, + "step": 2051, + "time_per_iteration": 2.5725293159484863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079371, + "balance_loss_mlp": 1.04291642, + "epoch": 0.3947672181608311, + "flos": 593683748352.0, + "grad_norm": 0.0518129521666432, + "language_loss": 0.8131091, + "learning_rate": 0.0006898296004228213, + "loss": 0.82390279, + "num_input_tokens_seen": 171072320, + "router_z_loss_mlp": 0.36450195, + "step": 2052, + "time_per_iteration": 2.6969447135925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050959, + "balance_loss_mlp": 1.0379895, + "epoch": 0.39495959984609463, + "flos": 1546829443584.0, + "grad_norm": 0.029989620736742544, + "language_loss": 0.7812674, + "learning_rate": 0.0006895413475706873, + "loss": 0.79177701, + "num_input_tokens_seen": 171304128, + "router_z_loss_mlp": 0.12988281, + "step": 2053, + "time_per_iteration": 4.8486199378967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078173, + "balance_loss_mlp": 1.04233825, + "epoch": 0.3951519815313582, + "flos": 496271190528.0, + "grad_norm": 0.06693197740080077, + "language_loss": 0.80026031, + "learning_rate": 0.0006892530211320763, + "loss": 0.81104207, + "num_input_tokens_seen": 171377392, + "router_z_loss_mlp": 0.35864258, + "step": 2054, + "time_per_iteration": 2.6731534004211426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082597, + "balance_loss_mlp": 1.04657161, + "epoch": 0.39534436321662175, + "flos": 530934704640.0, + "grad_norm": 0.06400198340094926, + "language_loss": 0.8367995, + "learning_rate": 0.000688964621218926, + "loss": 0.84762549, + "num_input_tokens_seen": 171447424, + "router_z_loss_mlp": 0.36010742, + "step": 2055, + "time_per_iteration": 2.623870611190796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107818, + "balance_loss_mlp": 1.0422982, + "epoch": 0.39553674490188534, + "flos": 702224017920.0, + "grad_norm": 0.05929287076568038, + "language_loss": 0.80154717, + "learning_rate": 0.0006886761479432037, + "loss": 0.81232893, + "num_input_tokens_seen": 171519920, + "router_z_loss_mlp": 0.35864258, + "step": 2056, + "time_per_iteration": 2.810593366622925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088441, + "balance_loss_mlp": 1.05263042, + "epoch": 0.3957291265871489, + "flos": 409552768512.0, + "grad_norm": 0.05784227470554994, + "language_loss": 0.84645867, + "learning_rate": 0.0006883876014169045, + "loss": 0.85734308, + "num_input_tokens_seen": 171583856, + "router_z_loss_mlp": 0.3581543, + "step": 2057, + "time_per_iteration": 2.464358329772949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088007, + "balance_loss_mlp": 1.05121863, + "epoch": 0.39592150827241246, + "flos": 618204566016.0, + "grad_norm": 0.05454135250908964, + "language_loss": 0.90161431, + "learning_rate": 0.000688098981752052, + "loss": 0.91249436, + "num_input_tokens_seen": 171656064, + "router_z_loss_mlp": 0.36791992, + "step": 2058, + "time_per_iteration": 2.742589235305786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094817, + "balance_loss_mlp": 1.05819607, + "epoch": 0.39611388995767605, + "flos": 820986969600.0, + "grad_norm": 0.05656267147709111, + "language_loss": 0.80105305, + "learning_rate": 0.0006878102890606982, + "loss": 0.81200117, + "num_input_tokens_seen": 171738800, + "router_z_loss_mlp": 0.36621094, + "step": 2059, + "time_per_iteration": 3.0725343227386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096787, + "balance_loss_mlp": 1.06018949, + "epoch": 0.3963062716429396, + "flos": 491977539072.0, + "grad_norm": 0.0710153527902746, + "language_loss": 0.81321216, + "learning_rate": 0.0006875215234549239, + "loss": 0.82418001, + "num_input_tokens_seen": 171803664, + "router_z_loss_mlp": 0.3659668, + "step": 2060, + "time_per_iteration": 2.542090654373169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097398, + "balance_loss_mlp": 1.06015694, + "epoch": 0.39649865332820317, + "flos": 584466430464.0, + "grad_norm": 0.08966956269211096, + "language_loss": 0.8554219, + "learning_rate": 0.0006872326850468376, + "loss": 0.86639589, + "num_input_tokens_seen": 171871968, + "router_z_loss_mlp": 0.37231445, + "step": 2061, + "time_per_iteration": 2.7194440364837646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010999, + "balance_loss_mlp": 1.06139588, + "epoch": 0.3966910350134667, + "flos": 458328153600.0, + "grad_norm": 0.05276871533818733, + "language_loss": 0.78609985, + "learning_rate": 0.0006869437739485762, + "loss": 0.79709888, + "num_input_tokens_seen": 171942368, + "router_z_loss_mlp": 0.38476562, + "step": 2062, + "time_per_iteration": 2.6032114028930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089079, + "balance_loss_mlp": 1.05281568, + "epoch": 0.3968834166987303, + "flos": 508388299776.0, + "grad_norm": 0.05735909750828215, + "language_loss": 0.93035084, + "learning_rate": 0.0006866547902723053, + "loss": 0.94124162, + "num_input_tokens_seen": 172012336, + "router_z_loss_mlp": 0.36279297, + "step": 2063, + "time_per_iteration": 2.666294813156128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097295, + "balance_loss_mlp": 1.06110287, + "epoch": 0.3970757983839938, + "flos": 572349321216.0, + "grad_norm": 0.05819495660266105, + "language_loss": 0.79961425, + "learning_rate": 0.000686365734130218, + "loss": 0.81058717, + "num_input_tokens_seen": 172084640, + "router_z_loss_mlp": 0.36206055, + "step": 2064, + "time_per_iteration": 2.667443037033081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093878, + "balance_loss_mlp": 1.05639839, + "epoch": 0.3972681800692574, + "flos": 481391092224.0, + "grad_norm": 0.051061390118103664, + "language_loss": 0.84029245, + "learning_rate": 0.000686076605634536, + "loss": 0.85123128, + "num_input_tokens_seen": 172152992, + "router_z_loss_mlp": 0.37475586, + "step": 2065, + "time_per_iteration": 2.605252981185913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091284, + "balance_loss_mlp": 1.05406737, + "epoch": 0.397460561754521, + "flos": 487683887616.0, + "grad_norm": 0.060621892107923396, + "language_loss": 0.84327424, + "learning_rate": 0.0006857874048975088, + "loss": 0.85418713, + "num_input_tokens_seen": 172219312, + "router_z_loss_mlp": 0.37207031, + "step": 2066, + "time_per_iteration": 2.5779786109924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090658, + "balance_loss_mlp": 1.05329823, + "epoch": 0.3976529434397845, + "flos": 421768802304.0, + "grad_norm": 0.05929679602335689, + "language_loss": 0.86975348, + "learning_rate": 0.0006854981320314142, + "loss": 0.88066006, + "num_input_tokens_seen": 172282112, + "router_z_loss_mlp": 0.37353516, + "step": 2067, + "time_per_iteration": 2.4723403453826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082754, + "balance_loss_mlp": 1.04591799, + "epoch": 0.3978453251250481, + "flos": 545331764736.0, + "grad_norm": 0.058605050617464606, + "language_loss": 0.86758339, + "learning_rate": 0.0006852087871485579, + "loss": 0.87841094, + "num_input_tokens_seen": 172347872, + "router_z_loss_mlp": 0.36816406, + "step": 2068, + "time_per_iteration": 2.6007602214813232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082239, + "balance_loss_mlp": 1.04602289, + "epoch": 0.39803770681031164, + "flos": 650548841472.0, + "grad_norm": 0.06821675645960798, + "language_loss": 0.81689966, + "learning_rate": 0.0006849193703612735, + "loss": 0.82772201, + "num_input_tokens_seen": 172418560, + "router_z_loss_mlp": 0.36206055, + "step": 2069, + "time_per_iteration": 2.7661337852478027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083569, + "balance_loss_mlp": 1.04661417, + "epoch": 0.39823008849557523, + "flos": 739734888960.0, + "grad_norm": 0.059947122372600754, + "language_loss": 0.77649361, + "learning_rate": 0.0006846298817819225, + "loss": 0.78732932, + "num_input_tokens_seen": 172497984, + "router_z_loss_mlp": 0.36987305, + "step": 2070, + "time_per_iteration": 2.9364843368530273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091151, + "balance_loss_mlp": 1.05436325, + "epoch": 0.39842247018083876, + "flos": 384825337344.0, + "grad_norm": 0.0736862776590319, + "language_loss": 0.80732799, + "learning_rate": 0.0006843403215228945, + "loss": 0.81823957, + "num_input_tokens_seen": 172560112, + "router_z_loss_mlp": 0.36767578, + "step": 2071, + "time_per_iteration": 2.4597537517547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078334, + "balance_loss_mlp": 1.04133189, + "epoch": 0.39861485186610235, + "flos": 533431443456.0, + "grad_norm": 0.052578385162892496, + "language_loss": 0.80366135, + "learning_rate": 0.0006840506896966065, + "loss": 0.81444472, + "num_input_tokens_seen": 172636192, + "router_z_loss_mlp": 0.36962891, + "step": 2072, + "time_per_iteration": 2.6826841831207275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081243, + "balance_loss_mlp": 1.04397774, + "epoch": 0.39880723355136594, + "flos": 642834482688.0, + "grad_norm": 0.055481383737447196, + "language_loss": 0.82090193, + "learning_rate": 0.0006837609864155038, + "loss": 0.83171439, + "num_input_tokens_seen": 172715264, + "router_z_loss_mlp": 0.37255859, + "step": 2073, + "time_per_iteration": 2.8541154861450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075844, + "balance_loss_mlp": 1.0408442, + "epoch": 0.39899961523662947, + "flos": 515587534848.0, + "grad_norm": 0.07686004257588779, + "language_loss": 0.83464944, + "learning_rate": 0.0006834712117920592, + "loss": 0.84540784, + "num_input_tokens_seen": 172783456, + "router_z_loss_mlp": 0.3503418, + "step": 2074, + "time_per_iteration": 2.6023800373077393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107731, + "balance_loss_mlp": 1.04025948, + "epoch": 0.39919199692189306, + "flos": 464148085248.0, + "grad_norm": 0.0625246810856132, + "language_loss": 0.85794407, + "learning_rate": 0.0006831813659387729, + "loss": 0.86871719, + "num_input_tokens_seen": 172848928, + "router_z_loss_mlp": 0.37036133, + "step": 2075, + "time_per_iteration": 2.5916242599487305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071695, + "balance_loss_mlp": 1.0353837, + "epoch": 0.3993843786071566, + "flos": 531382837248.0, + "grad_norm": 0.05588277312371317, + "language_loss": 0.84130096, + "learning_rate": 0.0006828914489681733, + "loss": 0.852018, + "num_input_tokens_seen": 172921152, + "router_z_loss_mlp": 0.36303711, + "step": 2076, + "time_per_iteration": 2.7014079093933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078543, + "balance_loss_mlp": 1.04168355, + "epoch": 0.3995767602924202, + "flos": 503701770240.0, + "grad_norm": 0.05616101270921505, + "language_loss": 0.85284638, + "learning_rate": 0.0006826014609928162, + "loss": 0.86363184, + "num_input_tokens_seen": 172998864, + "router_z_loss_mlp": 0.36816406, + "step": 2077, + "time_per_iteration": 2.6714975833892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089101, + "balance_loss_mlp": 1.07622623, + "epoch": 0.3997691419776837, + "flos": 1453780500480.0, + "grad_norm": 0.03492718818999835, + "language_loss": 0.83199388, + "learning_rate": 0.0006823114021252846, + "loss": 0.8428849, + "num_input_tokens_seen": 173219216, + "router_z_loss_mlp": 0.12890625, + "step": 2078, + "time_per_iteration": 4.8198041915893555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075675, + "balance_loss_mlp": 1.03943551, + "epoch": 0.3999615236629473, + "flos": 530418170880.0, + "grad_norm": 0.06060184252075253, + "language_loss": 0.79984158, + "learning_rate": 0.0006820212724781896, + "loss": 0.81059831, + "num_input_tokens_seen": 173292000, + "router_z_loss_mlp": 0.36279297, + "step": 2079, + "time_per_iteration": 2.725576400756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077383, + "balance_loss_mlp": 1.04209709, + "epoch": 0.4001539053482108, + "flos": 694823961600.0, + "grad_norm": 0.12722864956638674, + "language_loss": 0.83843565, + "learning_rate": 0.0006817310721641694, + "loss": 0.84920955, + "num_input_tokens_seen": 173365568, + "router_z_loss_mlp": 0.35302734, + "step": 2080, + "time_per_iteration": 2.808981418609619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083492, + "balance_loss_mlp": 1.04732358, + "epoch": 0.4003462870334744, + "flos": 520102356480.0, + "grad_norm": 0.09864886938158902, + "language_loss": 0.84025592, + "learning_rate": 0.00068144080129589, + "loss": 0.85109079, + "num_input_tokens_seen": 173430144, + "router_z_loss_mlp": 0.36181641, + "step": 2081, + "time_per_iteration": 2.61795973777771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090466, + "balance_loss_mlp": 1.05403543, + "epoch": 0.400538668718738, + "flos": 492272902656.0, + "grad_norm": 0.05814134634807872, + "language_loss": 0.83103502, + "learning_rate": 0.0006811504599860441, + "loss": 0.84193969, + "num_input_tokens_seen": 173494464, + "router_z_loss_mlp": 0.36450195, + "step": 2082, + "time_per_iteration": 2.5586163997650146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109661, + "balance_loss_mlp": 1.06161022, + "epoch": 0.40073105040400153, + "flos": 490083111936.0, + "grad_norm": 0.05292967428813452, + "language_loss": 0.85547149, + "learning_rate": 0.0006808600483473526, + "loss": 0.86643761, + "num_input_tokens_seen": 173577168, + "router_z_loss_mlp": 0.35058594, + "step": 2083, + "time_per_iteration": 2.8549985885620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110158, + "balance_loss_mlp": 1.06584144, + "epoch": 0.4009234320892651, + "flos": 562088761344.0, + "grad_norm": 0.051341860757237005, + "language_loss": 0.85926497, + "learning_rate": 0.0006805695664925629, + "loss": 0.87028074, + "num_input_tokens_seen": 173655632, + "router_z_loss_mlp": 0.35791016, + "step": 2084, + "time_per_iteration": 2.7807514667510986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111507, + "balance_loss_mlp": 1.07619727, + "epoch": 0.40111581377452865, + "flos": 425786029056.0, + "grad_norm": 0.07139972521672847, + "language_loss": 0.84098327, + "learning_rate": 0.0006802790145344506, + "loss": 0.85209835, + "num_input_tokens_seen": 173719040, + "router_z_loss_mlp": 0.35327148, + "step": 2085, + "time_per_iteration": 2.4653491973876953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106176, + "balance_loss_mlp": 1.07024658, + "epoch": 0.40130819545979224, + "flos": 612148907520.0, + "grad_norm": 0.09859033966702202, + "language_loss": 0.87080699, + "learning_rate": 0.0006799883925858176, + "loss": 0.88186872, + "num_input_tokens_seen": 173796704, + "router_z_loss_mlp": 0.35961914, + "step": 2086, + "time_per_iteration": 2.8432652950286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101686, + "balance_loss_mlp": 1.06580365, + "epoch": 0.40150057714505577, + "flos": 523179648000.0, + "grad_norm": 0.06735788816740666, + "language_loss": 0.85303611, + "learning_rate": 0.0006796977007594933, + "loss": 0.86405295, + "num_input_tokens_seen": 173862352, + "router_z_loss_mlp": 0.35913086, + "step": 2087, + "time_per_iteration": 2.597883701324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109903, + "balance_loss_mlp": 1.06240904, + "epoch": 0.40169295883031936, + "flos": 561143033856.0, + "grad_norm": 0.0524220318715257, + "language_loss": 0.86402881, + "learning_rate": 0.0006794069391683345, + "loss": 0.87501919, + "num_input_tokens_seen": 173935408, + "router_z_loss_mlp": 0.36621094, + "step": 2088, + "time_per_iteration": 2.7313365936279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101072, + "balance_loss_mlp": 1.06414104, + "epoch": 0.4018853405155829, + "flos": 518743401984.0, + "grad_norm": 0.056795041649419745, + "language_loss": 0.80919069, + "learning_rate": 0.0006791161079252248, + "loss": 0.8202014, + "num_input_tokens_seen": 174007152, + "router_z_loss_mlp": 0.36914062, + "step": 2089, + "time_per_iteration": 2.57450532913208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109207, + "balance_loss_mlp": 1.05652201, + "epoch": 0.4020777222008465, + "flos": 525957193728.0, + "grad_norm": 0.05166370887572794, + "language_loss": 0.82473212, + "learning_rate": 0.0006788252071430747, + "loss": 0.83565277, + "num_input_tokens_seen": 174074976, + "router_z_loss_mlp": 0.35546875, + "step": 2090, + "time_per_iteration": 2.6603012084960938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097817, + "balance_loss_mlp": 1.06100535, + "epoch": 0.40227010388611006, + "flos": 525494504448.0, + "grad_norm": 0.056931817338158205, + "language_loss": 0.86595076, + "learning_rate": 0.0006785342369348222, + "loss": 0.87692893, + "num_input_tokens_seen": 174149392, + "router_z_loss_mlp": 0.3684082, + "step": 2091, + "time_per_iteration": 2.807980537414551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091263, + "balance_loss_mlp": 1.05566692, + "epoch": 0.4024624855713736, + "flos": 432074442240.0, + "grad_norm": 0.0736357586886409, + "language_loss": 0.79799104, + "learning_rate": 0.0006782431974134316, + "loss": 0.80890369, + "num_input_tokens_seen": 174214656, + "router_z_loss_mlp": 0.35668945, + "step": 2092, + "time_per_iteration": 2.5331132411956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097919, + "balance_loss_mlp": 1.06044006, + "epoch": 0.4026548672566372, + "flos": 766304312832.0, + "grad_norm": 0.05288336614740697, + "language_loss": 0.89230573, + "learning_rate": 0.0006779520886918949, + "loss": 0.90328491, + "num_input_tokens_seen": 174296064, + "router_z_loss_mlp": 0.375, + "step": 2093, + "time_per_iteration": 3.014895439147949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093032, + "balance_loss_mlp": 1.0560298, + "epoch": 0.4028472489419007, + "flos": 642636633600.0, + "grad_norm": 0.05102527643704043, + "language_loss": 0.8125242, + "learning_rate": 0.0006776609108832301, + "loss": 0.8234545, + "num_input_tokens_seen": 174370896, + "router_z_loss_mlp": 0.36987305, + "step": 2094, + "time_per_iteration": 2.7778923511505127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089446, + "balance_loss_mlp": 1.05311072, + "epoch": 0.4030396306271643, + "flos": 491593425408.0, + "grad_norm": 0.053262929353227066, + "language_loss": 0.84942901, + "learning_rate": 0.0006773696641004828, + "loss": 0.86032349, + "num_input_tokens_seen": 174438448, + "router_z_loss_mlp": 0.36352539, + "step": 2095, + "time_per_iteration": 2.580313205718994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089479, + "balance_loss_mlp": 1.05238152, + "epoch": 0.40323201231242783, + "flos": 901363133952.0, + "grad_norm": 0.05931554649921985, + "language_loss": 0.77618563, + "learning_rate": 0.0006770783484567247, + "loss": 0.78708041, + "num_input_tokens_seen": 174525952, + "router_z_loss_mlp": 0.37109375, + "step": 2096, + "time_per_iteration": 3.0955684185028076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089916, + "balance_loss_mlp": 1.0536046, + "epoch": 0.4034243939976914, + "flos": 570267219456.0, + "grad_norm": 0.07944545156942663, + "language_loss": 0.8587091, + "learning_rate": 0.000676786964065055, + "loss": 0.86960828, + "num_input_tokens_seen": 174607200, + "router_z_loss_mlp": 0.36303711, + "step": 2097, + "time_per_iteration": 2.742293119430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084372, + "balance_loss_mlp": 1.04829895, + "epoch": 0.403616775682955, + "flos": 507206845440.0, + "grad_norm": 0.04869402927646331, + "language_loss": 0.78305566, + "learning_rate": 0.0006764955110385986, + "loss": 0.79389936, + "num_input_tokens_seen": 174680976, + "router_z_loss_mlp": 0.3605957, + "step": 2098, + "time_per_iteration": 2.708390235900879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086865, + "balance_loss_mlp": 1.05055428, + "epoch": 0.40380915736821854, + "flos": 519127515648.0, + "grad_norm": 0.06727344126892942, + "language_loss": 0.80247992, + "learning_rate": 0.0006762039894905083, + "loss": 0.81334853, + "num_input_tokens_seen": 174753152, + "router_z_loss_mlp": 0.36328125, + "step": 2099, + "time_per_iteration": 2.6428377628326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095654, + "balance_loss_mlp": 1.05812716, + "epoch": 0.40400153905348213, + "flos": 441686048256.0, + "grad_norm": 0.06575852305434472, + "language_loss": 0.80233693, + "learning_rate": 0.000675912399533962, + "loss": 0.81329346, + "num_input_tokens_seen": 174817184, + "router_z_loss_mlp": 0.375, + "step": 2100, + "time_per_iteration": 2.5560812950134277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088947, + "balance_loss_mlp": 1.05249298, + "epoch": 0.40419392073874566, + "flos": 771961300992.0, + "grad_norm": 0.1036114098840327, + "language_loss": 0.85183066, + "learning_rate": 0.0006756207412821656, + "loss": 0.86272013, + "num_input_tokens_seen": 174898128, + "router_z_loss_mlp": 0.36450195, + "step": 2101, + "time_per_iteration": 2.986583709716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086168, + "balance_loss_mlp": 1.05021429, + "epoch": 0.40438630242400925, + "flos": 766215562752.0, + "grad_norm": 0.06055449439143942, + "language_loss": 0.80025709, + "learning_rate": 0.0006753290148483505, + "loss": 0.81111872, + "num_input_tokens_seen": 174981872, + "router_z_loss_mlp": 0.36010742, + "step": 2102, + "time_per_iteration": 3.0076749324798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080415, + "balance_loss_mlp": 1.04491425, + "epoch": 0.4045786841092728, + "flos": 415013317632.0, + "grad_norm": 0.052033945118291625, + "language_loss": 0.7866869, + "learning_rate": 0.0006750372203457752, + "loss": 0.79749095, + "num_input_tokens_seen": 175044976, + "router_z_loss_mlp": 0.35546875, + "step": 2103, + "time_per_iteration": 2.490941286087036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083553, + "balance_loss_mlp": 1.04767144, + "epoch": 0.40477106579453637, + "flos": 538941454848.0, + "grad_norm": 0.07087529891902919, + "language_loss": 0.86455047, + "learning_rate": 0.0006747453578877242, + "loss": 0.875386, + "num_input_tokens_seen": 175121104, + "router_z_loss_mlp": 0.35864258, + "step": 2104, + "time_per_iteration": 2.6906399726867676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083688, + "balance_loss_mlp": 1.04766345, + "epoch": 0.4049634474797999, + "flos": 826358768640.0, + "grad_norm": 0.07644078595746046, + "language_loss": 0.82677126, + "learning_rate": 0.0006744534275875085, + "loss": 0.83760816, + "num_input_tokens_seen": 175194512, + "router_z_loss_mlp": 0.3605957, + "step": 2105, + "time_per_iteration": 2.9925642013549805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081927, + "balance_loss_mlp": 1.0459255, + "epoch": 0.4051558291650635, + "flos": 572417722368.0, + "grad_norm": 0.07127110995979934, + "language_loss": 0.8562066, + "learning_rate": 0.0006741614295584657, + "loss": 0.86702585, + "num_input_tokens_seen": 175264176, + "router_z_loss_mlp": 0.36010742, + "step": 2106, + "time_per_iteration": 2.6289658546447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078264, + "balance_loss_mlp": 1.04321659, + "epoch": 0.4053482108503271, + "flos": 731541874176.0, + "grad_norm": 0.07814638610947379, + "language_loss": 0.78334522, + "learning_rate": 0.0006738693639139595, + "loss": 0.79412782, + "num_input_tokens_seen": 175347488, + "router_z_loss_mlp": 0.35083008, + "step": 2107, + "time_per_iteration": 3.0381481647491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078707, + "balance_loss_mlp": 1.04234815, + "epoch": 0.4055405925355906, + "flos": 1212588292608.0, + "grad_norm": 0.05182127384415646, + "language_loss": 0.77652568, + "learning_rate": 0.0006735772307673796, + "loss": 0.78731275, + "num_input_tokens_seen": 175438336, + "router_z_loss_mlp": 0.36376953, + "step": 2108, + "time_per_iteration": 3.5424931049346924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075462, + "balance_loss_mlp": 1.03998494, + "epoch": 0.4057329742208542, + "flos": 715553104896.0, + "grad_norm": 0.0496802449600099, + "language_loss": 0.83129466, + "learning_rate": 0.0006732850302321421, + "loss": 0.84204924, + "num_input_tokens_seen": 175510912, + "router_z_loss_mlp": 0.35498047, + "step": 2109, + "time_per_iteration": 2.902758836746216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081445, + "balance_loss_mlp": 1.04506207, + "epoch": 0.4059253559061177, + "flos": 564623377920.0, + "grad_norm": 0.054690107844022846, + "language_loss": 0.84019876, + "learning_rate": 0.00067299276242169, + "loss": 0.85101312, + "num_input_tokens_seen": 175583040, + "router_z_loss_mlp": 0.36376953, + "step": 2110, + "time_per_iteration": 2.6453192234039307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108684, + "balance_loss_mlp": 1.07272601, + "epoch": 0.4061177375913813, + "flos": 1592886919680.0, + "grad_norm": 0.03852995701507201, + "language_loss": 0.74382168, + "learning_rate": 0.0006727004274494908, + "loss": 0.75469011, + "num_input_tokens_seen": 175817952, + "router_z_loss_mlp": 0.14160156, + "step": 2111, + "time_per_iteration": 4.936276197433472 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092328, + "balance_loss_mlp": 1.05587411, + "epoch": 0.40631011927664484, + "flos": 615122892288.0, + "grad_norm": 0.05227822307204106, + "language_loss": 0.77911901, + "learning_rate": 0.0006724080254290395, + "loss": 0.79004228, + "num_input_tokens_seen": 175896352, + "router_z_loss_mlp": 0.36425781, + "step": 2112, + "time_per_iteration": 2.804931402206421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085219, + "balance_loss_mlp": 1.04893136, + "epoch": 0.40650250096190843, + "flos": 557390647296.0, + "grad_norm": 0.056265148252134925, + "language_loss": 0.89716649, + "learning_rate": 0.0006721155564738566, + "loss": 0.90801871, + "num_input_tokens_seen": 175967152, + "router_z_loss_mlp": 0.36303711, + "step": 2113, + "time_per_iteration": 2.756901502609253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050781, + "balance_loss_mlp": 1.03676188, + "epoch": 0.40669488264717196, + "flos": 1579301756928.0, + "grad_norm": 0.015026311101099392, + "language_loss": 0.78622639, + "learning_rate": 0.0006718230206974884, + "loss": 0.79673421, + "num_input_tokens_seen": 176205248, + "router_z_loss_mlp": 0.140625, + "step": 2114, + "time_per_iteration": 4.975963354110718 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109673, + "balance_loss_mlp": 1.0599184, + "epoch": 0.40688726433243555, + "flos": 507398902272.0, + "grad_norm": 0.07464761746525102, + "language_loss": 0.85648221, + "learning_rate": 0.0006715304182135078, + "loss": 0.86744952, + "num_input_tokens_seen": 176276208, + "router_z_loss_mlp": 0.36816406, + "step": 2115, + "time_per_iteration": 2.5924360752105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104151, + "balance_loss_mlp": 1.06726742, + "epoch": 0.40707964601769914, + "flos": 588757109760.0, + "grad_norm": 0.06427267203463374, + "language_loss": 0.88647795, + "learning_rate": 0.0006712377491355127, + "loss": 0.89751947, + "num_input_tokens_seen": 176355072, + "router_z_loss_mlp": 0.36889648, + "step": 2116, + "time_per_iteration": 2.887439489364624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097518, + "balance_loss_mlp": 1.06135035, + "epoch": 0.40727202770296267, + "flos": 580134901248.0, + "grad_norm": 0.10612280790481599, + "language_loss": 0.81211627, + "learning_rate": 0.0006709450135771274, + "loss": 0.82309151, + "num_input_tokens_seen": 176444592, + "router_z_loss_mlp": 0.36206055, + "step": 2117, + "time_per_iteration": 2.9730725288391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101548, + "balance_loss_mlp": 1.06523705, + "epoch": 0.40746440938822626, + "flos": 503819633664.0, + "grad_norm": 0.05032701187252936, + "language_loss": 0.86683893, + "learning_rate": 0.0006706522116520023, + "loss": 0.87785447, + "num_input_tokens_seen": 176516144, + "router_z_loss_mlp": 0.36328125, + "step": 2118, + "time_per_iteration": 2.6400580406188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096769, + "balance_loss_mlp": 1.06122053, + "epoch": 0.4076567910734898, + "flos": 605323611648.0, + "grad_norm": 0.05658204986861598, + "language_loss": 0.82839441, + "learning_rate": 0.0006703593434738127, + "loss": 0.83936214, + "num_input_tokens_seen": 176585712, + "router_z_loss_mlp": 0.35571289, + "step": 2119, + "time_per_iteration": 2.77944016456604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091629, + "balance_loss_mlp": 1.05622339, + "epoch": 0.4078491727587534, + "flos": 479313372672.0, + "grad_norm": 0.0532477275953574, + "language_loss": 0.78150344, + "learning_rate": 0.0006700664091562604, + "loss": 0.79241967, + "num_input_tokens_seen": 176654736, + "router_z_loss_mlp": 0.35449219, + "step": 2120, + "time_per_iteration": 2.580658435821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093922, + "balance_loss_mlp": 1.05780149, + "epoch": 0.4080415544440169, + "flos": 510126985728.0, + "grad_norm": 0.045251762284626275, + "language_loss": 0.85188484, + "learning_rate": 0.0006697734088130725, + "loss": 0.86282408, + "num_input_tokens_seen": 176722800, + "router_z_loss_mlp": 0.36157227, + "step": 2121, + "time_per_iteration": 2.5990941524505615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108927, + "balance_loss_mlp": 1.05329287, + "epoch": 0.4082339361292805, + "flos": 734318009856.0, + "grad_norm": 0.06207508790269206, + "language_loss": 0.85326135, + "learning_rate": 0.0006694803425580018, + "loss": 0.86415404, + "num_input_tokens_seen": 176800320, + "router_z_loss_mlp": 0.36010742, + "step": 2122, + "time_per_iteration": 2.9514336585998535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093256, + "balance_loss_mlp": 1.05687356, + "epoch": 0.4084263178145441, + "flos": 457239831552.0, + "grad_norm": 0.08260422277145335, + "language_loss": 0.84467387, + "learning_rate": 0.0006691872105048268, + "loss": 0.85560644, + "num_input_tokens_seen": 176867440, + "router_z_loss_mlp": 0.36401367, + "step": 2123, + "time_per_iteration": 2.584765672683716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093552, + "balance_loss_mlp": 1.05762231, + "epoch": 0.4086186994998076, + "flos": 562659139584.0, + "grad_norm": 0.056985949085160005, + "language_loss": 0.84641832, + "learning_rate": 0.0006688940127673513, + "loss": 0.85735387, + "num_input_tokens_seen": 176942048, + "router_z_loss_mlp": 0.35961914, + "step": 2124, + "time_per_iteration": 2.698777675628662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100727, + "balance_loss_mlp": 1.06446397, + "epoch": 0.4088110811850712, + "flos": 573364859904.0, + "grad_norm": 0.04747345440626025, + "language_loss": 0.85754699, + "learning_rate": 0.0006686007494594049, + "loss": 0.86855423, + "num_input_tokens_seen": 177025104, + "router_z_loss_mlp": 0.36279297, + "step": 2125, + "time_per_iteration": 2.8035151958465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101938, + "balance_loss_mlp": 1.06538868, + "epoch": 0.40900346287033473, + "flos": 456702948864.0, + "grad_norm": 0.06322616011827766, + "language_loss": 0.80074888, + "learning_rate": 0.0006683074206948425, + "loss": 0.81176829, + "num_input_tokens_seen": 177089296, + "router_z_loss_mlp": 0.36547852, + "step": 2126, + "time_per_iteration": 2.4856953620910645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103432, + "balance_loss_mlp": 1.06697774, + "epoch": 0.4091958445555983, + "flos": 617097305088.0, + "grad_norm": 0.05684118517242104, + "language_loss": 0.8146261, + "learning_rate": 0.0006680140265875443, + "loss": 0.82566047, + "num_input_tokens_seen": 177163648, + "router_z_loss_mlp": 0.36474609, + "step": 2127, + "time_per_iteration": 2.772571325302124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111548, + "balance_loss_mlp": 1.07564259, + "epoch": 0.40938822624086185, + "flos": 472159217664.0, + "grad_norm": 0.051537767424008556, + "language_loss": 0.95483583, + "learning_rate": 0.0006677205672514162, + "loss": 0.96595132, + "num_input_tokens_seen": 177233856, + "router_z_loss_mlp": 0.35888672, + "step": 2128, + "time_per_iteration": 2.6006312370300293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114142, + "balance_loss_mlp": 1.07642448, + "epoch": 0.40958060792612544, + "flos": 569734718976.0, + "grad_norm": 0.04853999942998699, + "language_loss": 0.88646978, + "learning_rate": 0.000667427042800389, + "loss": 0.8976112, + "num_input_tokens_seen": 177309824, + "router_z_loss_mlp": 0.37670898, + "step": 2129, + "time_per_iteration": 2.742804765701294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107096, + "balance_loss_mlp": 1.07030797, + "epoch": 0.40977298961138897, + "flos": 609065823744.0, + "grad_norm": 0.053374560930054, + "language_loss": 0.8288517, + "learning_rate": 0.0006671334533484192, + "loss": 0.83992267, + "num_input_tokens_seen": 177380592, + "router_z_loss_mlp": 0.36767578, + "step": 2130, + "time_per_iteration": 2.7175474166870117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105432, + "balance_loss_mlp": 1.06854916, + "epoch": 0.40996537129665256, + "flos": 581463332352.0, + "grad_norm": 0.10187828374301312, + "language_loss": 0.83427989, + "learning_rate": 0.0006668397990094881, + "loss": 0.84533429, + "num_input_tokens_seen": 177454720, + "router_z_loss_mlp": 0.36889648, + "step": 2131, + "time_per_iteration": 2.718189239501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102513, + "balance_loss_mlp": 1.06438994, + "epoch": 0.41015775298191615, + "flos": 516296125440.0, + "grad_norm": 0.05088305967580112, + "language_loss": 0.84777439, + "learning_rate": 0.0006665460798976027, + "loss": 0.85879958, + "num_input_tokens_seen": 177528224, + "router_z_loss_mlp": 0.38134766, + "step": 2132, + "time_per_iteration": 2.754838228225708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103514, + "balance_loss_mlp": 1.06448531, + "epoch": 0.4103501346671797, + "flos": 510083315712.0, + "grad_norm": 0.04980971333778078, + "language_loss": 0.81075269, + "learning_rate": 0.0006662522961267947, + "loss": 0.82178783, + "num_input_tokens_seen": 177598176, + "router_z_loss_mlp": 0.38989258, + "step": 2133, + "time_per_iteration": 2.630645513534546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105465, + "balance_loss_mlp": 1.06514883, + "epoch": 0.41054251635244327, + "flos": 549459500544.0, + "grad_norm": 0.047627275091831754, + "language_loss": 0.87016159, + "learning_rate": 0.0006659584478111211, + "loss": 0.88121629, + "num_input_tokens_seen": 177675840, + "router_z_loss_mlp": 0.40307617, + "step": 2134, + "time_per_iteration": 2.7775702476501465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114637, + "balance_loss_mlp": 1.07408166, + "epoch": 0.4107348980377068, + "flos": 839549643264.0, + "grad_norm": 0.06581962625194586, + "language_loss": 0.82464856, + "learning_rate": 0.000665664535064664, + "loss": 0.83579493, + "num_input_tokens_seen": 177751376, + "router_z_loss_mlp": 0.40551758, + "step": 2135, + "time_per_iteration": 3.0234854221343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011149, + "balance_loss_mlp": 1.07501245, + "epoch": 0.4109272797229704, + "flos": 503445694464.0, + "grad_norm": 0.05498766410062668, + "language_loss": 0.82554698, + "learning_rate": 0.0006653705580015303, + "loss": 0.83669591, + "num_input_tokens_seen": 177825264, + "router_z_loss_mlp": 0.39892578, + "step": 2136, + "time_per_iteration": 2.740478992462158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110871, + "balance_loss_mlp": 1.06786942, + "epoch": 0.4111196614082339, + "flos": 610533877248.0, + "grad_norm": 0.1069583069182241, + "language_loss": 0.86098707, + "learning_rate": 0.0006650765167358523, + "loss": 0.87207425, + "num_input_tokens_seen": 177901680, + "router_z_loss_mlp": 0.40844727, + "step": 2137, + "time_per_iteration": 2.7766735553741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112768, + "balance_loss_mlp": 1.07264185, + "epoch": 0.4113120430934975, + "flos": 452931623424.0, + "grad_norm": 0.06240188984530218, + "language_loss": 0.8998509, + "learning_rate": 0.0006647824113817864, + "loss": 0.91097856, + "num_input_tokens_seen": 177965264, + "router_z_loss_mlp": 0.40112305, + "step": 2138, + "time_per_iteration": 2.558088779449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109501, + "balance_loss_mlp": 1.06992376, + "epoch": 0.41150442477876104, + "flos": 541324712448.0, + "grad_norm": 0.06351755199965968, + "language_loss": 0.81488299, + "learning_rate": 0.000664488242053515, + "loss": 0.82597804, + "num_input_tokens_seen": 178039712, + "router_z_loss_mlp": 0.39550781, + "step": 2139, + "time_per_iteration": 2.7064287662506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102585, + "balance_loss_mlp": 1.06405628, + "epoch": 0.4116968064640246, + "flos": 576017339904.0, + "grad_norm": 0.052717271070364294, + "language_loss": 0.8372525, + "learning_rate": 0.0006641940088652445, + "loss": 0.8482784, + "num_input_tokens_seen": 178114080, + "router_z_loss_mlp": 0.38500977, + "step": 2140, + "time_per_iteration": 2.8360941410064697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107136, + "balance_loss_mlp": 1.0685842, + "epoch": 0.4118891881492882, + "flos": 495857963520.0, + "grad_norm": 0.05632128251923113, + "language_loss": 0.82241237, + "learning_rate": 0.0006638997119312065, + "loss": 0.83348376, + "num_input_tokens_seen": 178188032, + "router_z_loss_mlp": 0.38500977, + "step": 2141, + "time_per_iteration": 2.695482015609741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01432807, + "balance_loss_mlp": 1.41773903, + "epoch": 0.41208156983455174, + "flos": 1537604923392.0, + "grad_norm": 0.12335560313674339, + "language_loss": 0.75063306, + "learning_rate": 0.0006636053513656568, + "loss": 0.76496112, + "num_input_tokens_seen": 178395328, + "router_z_loss_mlp": 0.15039062, + "step": 2142, + "time_per_iteration": 4.938086032867432 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096248, + "balance_loss_mlp": 1.05800605, + "epoch": 0.41227395151981533, + "flos": 584697775104.0, + "grad_norm": 0.06073263389064812, + "language_loss": 0.84852999, + "learning_rate": 0.000663310927282877, + "loss": 0.85949242, + "num_input_tokens_seen": 178471952, + "router_z_loss_mlp": 0.38208008, + "step": 2143, + "time_per_iteration": 2.776041269302368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098979, + "balance_loss_mlp": 1.06183362, + "epoch": 0.41246633320507886, + "flos": 442685620224.0, + "grad_norm": 0.05843533128868507, + "language_loss": 0.85999441, + "learning_rate": 0.000663016439797172, + "loss": 0.8709842, + "num_input_tokens_seen": 178542192, + "router_z_loss_mlp": 0.37109375, + "step": 2144, + "time_per_iteration": 2.6550843715667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099188, + "balance_loss_mlp": 1.06280541, + "epoch": 0.41265871489034245, + "flos": 579680976384.0, + "grad_norm": 0.05476235673703619, + "language_loss": 0.80718118, + "learning_rate": 0.0006627218890228724, + "loss": 0.81817305, + "num_input_tokens_seen": 178622736, + "router_z_loss_mlp": 0.36376953, + "step": 2145, + "time_per_iteration": 2.748966693878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098226, + "balance_loss_mlp": 1.06139088, + "epoch": 0.412851096575606, + "flos": 760906372608.0, + "grad_norm": 0.06511227414480983, + "language_loss": 0.83519912, + "learning_rate": 0.0006624272750743326, + "loss": 0.84618139, + "num_input_tokens_seen": 178705808, + "router_z_loss_mlp": 0.3684082, + "step": 2146, + "time_per_iteration": 2.987541913986206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098071, + "balance_loss_mlp": 1.05994785, + "epoch": 0.41304347826086957, + "flos": 555062644224.0, + "grad_norm": 0.04596756157996359, + "language_loss": 0.82878035, + "learning_rate": 0.0006621325980659322, + "loss": 0.83976108, + "num_input_tokens_seen": 178781200, + "router_z_loss_mlp": 0.38061523, + "step": 2147, + "time_per_iteration": 2.821556568145752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104625, + "balance_loss_mlp": 1.0655247, + "epoch": 0.41323585994613315, + "flos": 665418765312.0, + "grad_norm": 0.06740751064613239, + "language_loss": 0.8204211, + "learning_rate": 0.000661837858112075, + "loss": 0.83146733, + "num_input_tokens_seen": 178855072, + "router_z_loss_mlp": 0.390625, + "step": 2148, + "time_per_iteration": 2.7922754287719727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089584, + "balance_loss_mlp": 1.05136561, + "epoch": 0.4134282416313967, + "flos": 548429405184.0, + "grad_norm": 0.050771109286751076, + "language_loss": 0.88476944, + "learning_rate": 0.0006615430553271888, + "loss": 0.89566529, + "num_input_tokens_seen": 178927936, + "router_z_loss_mlp": 0.38208008, + "step": 2149, + "time_per_iteration": 2.7367136478424072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091169, + "balance_loss_mlp": 1.05326056, + "epoch": 0.4136206233166603, + "flos": 645951062016.0, + "grad_norm": 0.056682848656222896, + "language_loss": 0.85300201, + "learning_rate": 0.0006612481898257264, + "loss": 0.86391366, + "num_input_tokens_seen": 179007792, + "router_z_loss_mlp": 0.37866211, + "step": 2150, + "time_per_iteration": 2.862969160079956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082558, + "balance_loss_mlp": 1.04398179, + "epoch": 0.4138130050019238, + "flos": 517103640576.0, + "grad_norm": 0.07190872816549171, + "language_loss": 0.85216105, + "learning_rate": 0.000660953261722165, + "loss": 0.86298662, + "num_input_tokens_seen": 179075200, + "router_z_loss_mlp": 0.38549805, + "step": 2151, + "time_per_iteration": 2.608966588973999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072414, + "balance_loss_mlp": 1.03379023, + "epoch": 0.4140053866871874, + "flos": 608977073664.0, + "grad_norm": 0.05213877076699988, + "language_loss": 0.82764488, + "learning_rate": 0.0006606582711310055, + "loss": 0.83836901, + "num_input_tokens_seen": 179144448, + "router_z_loss_mlp": 0.38574219, + "step": 2152, + "time_per_iteration": 2.704941511154175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081545, + "balance_loss_mlp": 1.04287302, + "epoch": 0.4141977683724509, + "flos": 579493301760.0, + "grad_norm": 0.0573275470165796, + "language_loss": 0.83345616, + "learning_rate": 0.0006603632181667736, + "loss": 0.8442716, + "num_input_tokens_seen": 179215776, + "router_z_loss_mlp": 0.38671875, + "step": 2153, + "time_per_iteration": 2.670036792755127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157558, + "balance_loss_mlp": 1.14086878, + "epoch": 0.4143901500577145, + "flos": 1306598777856.0, + "grad_norm": 0.04466441147089705, + "language_loss": 0.78943324, + "learning_rate": 0.0006600681029440187, + "loss": 0.80100882, + "num_input_tokens_seen": 179436688, + "router_z_loss_mlp": 0.16699219, + "step": 2154, + "time_per_iteration": 4.936178684234619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088281, + "balance_loss_mlp": 1.04989576, + "epoch": 0.41458253174297804, + "flos": 459957740544.0, + "grad_norm": 0.05825483779723247, + "language_loss": 0.81504506, + "learning_rate": 0.0006597729255773153, + "loss": 0.82592785, + "num_input_tokens_seen": 179503264, + "router_z_loss_mlp": 0.38354492, + "step": 2155, + "time_per_iteration": 2.5436675548553467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095127, + "balance_loss_mlp": 1.056885, + "epoch": 0.41477491342824163, + "flos": 553096995840.0, + "grad_norm": 0.14369101348323118, + "language_loss": 0.82126498, + "learning_rate": 0.0006594776861812608, + "loss": 0.83221632, + "num_input_tokens_seen": 179574864, + "router_z_loss_mlp": 0.38183594, + "step": 2156, + "time_per_iteration": 2.6603870391845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102341, + "balance_loss_mlp": 1.06414664, + "epoch": 0.4149672951135052, + "flos": 697444356096.0, + "grad_norm": 0.09619651786969989, + "language_loss": 0.86957002, + "learning_rate": 0.0006591823848704776, + "loss": 0.88059342, + "num_input_tokens_seen": 179658208, + "router_z_loss_mlp": 0.38183594, + "step": 2157, + "time_per_iteration": 2.888523578643799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112154, + "balance_loss_mlp": 1.07362556, + "epoch": 0.41515967679876875, + "flos": 565480355328.0, + "grad_norm": 0.06180894820080996, + "language_loss": 0.81514823, + "learning_rate": 0.0006588870217596117, + "loss": 0.82626975, + "num_input_tokens_seen": 179732320, + "router_z_loss_mlp": 0.38500977, + "step": 2158, + "time_per_iteration": 2.7872376441955566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124674, + "balance_loss_mlp": 1.08497691, + "epoch": 0.41535205848403234, + "flos": 500938781184.0, + "grad_norm": 0.08519942481898463, + "language_loss": 0.85712391, + "learning_rate": 0.0006585915969633334, + "loss": 0.86837065, + "num_input_tokens_seen": 179801616, + "router_z_loss_mlp": 0.39672852, + "step": 2159, + "time_per_iteration": 2.5857338905334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135799, + "balance_loss_mlp": 1.09703159, + "epoch": 0.41554444016929587, + "flos": 607268911104.0, + "grad_norm": 0.06479316283343547, + "language_loss": 0.89294302, + "learning_rate": 0.0006582961105963366, + "loss": 0.90430105, + "num_input_tokens_seen": 179876112, + "router_z_loss_mlp": 0.38720703, + "step": 2160, + "time_per_iteration": 2.7831602096557617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153796, + "balance_loss_mlp": 1.11493373, + "epoch": 0.41573682185455946, + "flos": 528856985088.0, + "grad_norm": 0.06215124272048543, + "language_loss": 0.77626073, + "learning_rate": 0.0006580005627733395, + "loss": 0.7877987, + "num_input_tokens_seen": 179949936, + "router_z_loss_mlp": 0.38818359, + "step": 2161, + "time_per_iteration": 2.6620304584503174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152884, + "balance_loss_mlp": 1.11349678, + "epoch": 0.415929203539823, + "flos": 504686785536.0, + "grad_norm": 0.0577168801928891, + "language_loss": 0.81587994, + "learning_rate": 0.0006577049536090838, + "loss": 0.82740879, + "num_input_tokens_seen": 180023184, + "router_z_loss_mlp": 0.39355469, + "step": 2162, + "time_per_iteration": 2.6678874492645264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144655, + "balance_loss_mlp": 1.10693753, + "epoch": 0.4161215852250866, + "flos": 582467286528.0, + "grad_norm": 0.07160302952697103, + "language_loss": 0.85415941, + "learning_rate": 0.000657409283218335, + "loss": 0.86560595, + "num_input_tokens_seen": 180091728, + "router_z_loss_mlp": 0.37695312, + "step": 2163, + "time_per_iteration": 2.6405746936798096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134301, + "balance_loss_mlp": 1.09570062, + "epoch": 0.4163139669103501, + "flos": 490432320000.0, + "grad_norm": 0.051386242205519156, + "language_loss": 0.80774486, + "learning_rate": 0.0006571135517158829, + "loss": 0.81908786, + "num_input_tokens_seen": 180162096, + "router_z_loss_mlp": 0.38549805, + "step": 2164, + "time_per_iteration": 2.6496996879577637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218225, + "balance_loss_mlp": 1.20143986, + "epoch": 0.4165063485956137, + "flos": 1287445377024.0, + "grad_norm": 0.06520745435981959, + "language_loss": 0.76764059, + "learning_rate": 0.0006568177592165404, + "loss": 0.77982283, + "num_input_tokens_seen": 180380912, + "router_z_loss_mlp": 0.16796875, + "step": 2165, + "time_per_iteration": 4.76560640335083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127203, + "balance_loss_mlp": 1.09003401, + "epoch": 0.4166987302808773, + "flos": 495015542784.0, + "grad_norm": 0.07154886739030113, + "language_loss": 0.83213758, + "learning_rate": 0.0006565219058351444, + "loss": 0.8434096, + "num_input_tokens_seen": 180447424, + "router_z_loss_mlp": 0.37133789, + "step": 2166, + "time_per_iteration": 2.539856433868408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112149, + "balance_loss_mlp": 1.07397866, + "epoch": 0.4168911119661408, + "flos": 463823608320.0, + "grad_norm": 0.0764039854303378, + "language_loss": 0.83196324, + "learning_rate": 0.0006562259916865553, + "loss": 0.84308469, + "num_input_tokens_seen": 180516336, + "router_z_loss_mlp": 0.38110352, + "step": 2167, + "time_per_iteration": 2.5938220024108887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106062, + "balance_loss_mlp": 1.06939304, + "epoch": 0.4170834936514044, + "flos": 536499970560.0, + "grad_norm": 0.052882286550722295, + "language_loss": 0.7941224, + "learning_rate": 0.0006559300168856573, + "loss": 0.80518305, + "num_input_tokens_seen": 180589824, + "router_z_loss_mlp": 0.36694336, + "step": 2168, + "time_per_iteration": 2.7382309436798096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100629, + "balance_loss_mlp": 1.0633167, + "epoch": 0.41727587533666793, + "flos": 550418374656.0, + "grad_norm": 0.05257418188896324, + "language_loss": 0.85768378, + "learning_rate": 0.0006556339815473577, + "loss": 0.86869007, + "num_input_tokens_seen": 180661296, + "router_z_loss_mlp": 0.37280273, + "step": 2169, + "time_per_iteration": 2.6762564182281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110352, + "balance_loss_mlp": 1.06501567, + "epoch": 0.4174682570219315, + "flos": 630795949056.0, + "grad_norm": 0.0440641640787593, + "language_loss": 0.85913342, + "learning_rate": 0.000655337885786588, + "loss": 0.87016863, + "num_input_tokens_seen": 180744896, + "router_z_loss_mlp": 0.38452148, + "step": 2170, + "time_per_iteration": 2.8669848442077637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098716, + "balance_loss_mlp": 1.06068778, + "epoch": 0.41766063870719505, + "flos": 519501454848.0, + "grad_norm": 0.07103396575336611, + "language_loss": 0.84732234, + "learning_rate": 0.0006550417297183025, + "loss": 0.85830951, + "num_input_tokens_seen": 180813008, + "router_z_loss_mlp": 0.37988281, + "step": 2171, + "time_per_iteration": 2.6471290588378906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110284, + "balance_loss_mlp": 1.0640254, + "epoch": 0.41785302039245864, + "flos": 557656897536.0, + "grad_norm": 0.051327988161677204, + "language_loss": 0.8175863, + "learning_rate": 0.0006547455134574793, + "loss": 0.82861477, + "num_input_tokens_seen": 180886480, + "router_z_loss_mlp": 0.38793945, + "step": 2172, + "time_per_iteration": 2.71508526802063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100888, + "balance_loss_mlp": 1.06338453, + "epoch": 0.41804540207772223, + "flos": 788156683776.0, + "grad_norm": 0.052280747851499734, + "language_loss": 0.84377366, + "learning_rate": 0.0006544492371191198, + "loss": 0.85478258, + "num_input_tokens_seen": 180973776, + "router_z_loss_mlp": 0.37475586, + "step": 2173, + "time_per_iteration": 3.114607810974121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096475, + "balance_loss_mlp": 1.05775642, + "epoch": 0.41823778376298576, + "flos": 903944240640.0, + "grad_norm": 0.04972167781175626, + "language_loss": 0.83103442, + "learning_rate": 0.0006541529008182485, + "loss": 0.84199917, + "num_input_tokens_seen": 181062768, + "router_z_loss_mlp": 0.38696289, + "step": 2174, + "time_per_iteration": 3.165484666824341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094456, + "balance_loss_mlp": 1.0563333, + "epoch": 0.41843016544824935, + "flos": 511308440064.0, + "grad_norm": 0.05116159603840096, + "language_loss": 0.8702668, + "learning_rate": 0.0006538565046699136, + "loss": 0.88121128, + "num_input_tokens_seen": 181129872, + "router_z_loss_mlp": 0.38085938, + "step": 2175, + "time_per_iteration": 2.5701253414154053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101136, + "balance_loss_mlp": 1.06389487, + "epoch": 0.4186225471335129, + "flos": 652774947840.0, + "grad_norm": 0.05537675869017034, + "language_loss": 0.81610411, + "learning_rate": 0.0006535600487891862, + "loss": 0.82711548, + "num_input_tokens_seen": 181208112, + "router_z_loss_mlp": 0.37231445, + "step": 2176, + "time_per_iteration": 2.7980031967163086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096315, + "balance_loss_mlp": 1.05900216, + "epoch": 0.41881492881877647, + "flos": 568892298240.0, + "grad_norm": 0.05573219506936483, + "language_loss": 0.89184308, + "learning_rate": 0.0006532635332911603, + "loss": 0.90280616, + "num_input_tokens_seen": 181278736, + "router_z_loss_mlp": 0.37304688, + "step": 2177, + "time_per_iteration": 2.64104962348938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092004, + "balance_loss_mlp": 1.05495393, + "epoch": 0.41900731050404, + "flos": 911478127104.0, + "grad_norm": 0.05325324025552218, + "language_loss": 0.80538237, + "learning_rate": 0.0006529669582909541, + "loss": 0.81630242, + "num_input_tokens_seen": 181362512, + "router_z_loss_mlp": 0.37011719, + "step": 2178, + "time_per_iteration": 3.21323299407959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108647, + "balance_loss_mlp": 1.04896641, + "epoch": 0.4191996921893036, + "flos": 535498988544.0, + "grad_norm": 0.06510625194491998, + "language_loss": 0.85975909, + "learning_rate": 0.0006526703239037077, + "loss": 0.87062377, + "num_input_tokens_seen": 181432080, + "router_z_loss_mlp": 0.375, + "step": 2179, + "time_per_iteration": 2.630338430404663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086501, + "balance_loss_mlp": 1.0496887, + "epoch": 0.4193920738745671, + "flos": 582363979776.0, + "grad_norm": 0.04783092813648227, + "language_loss": 0.86411011, + "learning_rate": 0.0006523736302445851, + "loss": 0.8749752, + "num_input_tokens_seen": 181507296, + "router_z_loss_mlp": 0.36816406, + "step": 2180, + "time_per_iteration": 2.7710120677948 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083583, + "balance_loss_mlp": 1.04681921, + "epoch": 0.4195844555598307, + "flos": 1335279720960.0, + "grad_norm": 0.05415818779113344, + "language_loss": 0.77215266, + "learning_rate": 0.0006520768774287728, + "loss": 0.78298849, + "num_input_tokens_seen": 181599408, + "router_z_loss_mlp": 0.36743164, + "step": 2181, + "time_per_iteration": 3.738273859024048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082946, + "balance_loss_mlp": 1.04642057, + "epoch": 0.4197768372450943, + "flos": 598480786944.0, + "grad_norm": 0.04672312513315136, + "language_loss": 0.85467362, + "learning_rate": 0.0006517800655714806, + "loss": 0.86550307, + "num_input_tokens_seen": 181674944, + "router_z_loss_mlp": 0.36547852, + "step": 2182, + "time_per_iteration": 2.796132802963257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076263, + "balance_loss_mlp": 1.04016638, + "epoch": 0.4199692189303578, + "flos": 734929085952.0, + "grad_norm": 0.05966366646918548, + "language_loss": 0.84806752, + "learning_rate": 0.0006514831947879407, + "loss": 0.85883021, + "num_input_tokens_seen": 181756704, + "router_z_loss_mlp": 0.36132812, + "step": 2183, + "time_per_iteration": 2.9417624473571777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077956, + "balance_loss_mlp": 1.04243183, + "epoch": 0.4201616006156214, + "flos": 749854264320.0, + "grad_norm": 0.05811307518141115, + "language_loss": 0.78259802, + "learning_rate": 0.0006511862651934091, + "loss": 0.79337758, + "num_input_tokens_seen": 181837952, + "router_z_loss_mlp": 0.35522461, + "step": 2184, + "time_per_iteration": 3.0546512603759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082116, + "balance_loss_mlp": 1.04601932, + "epoch": 0.42035398230088494, + "flos": 546764912640.0, + "grad_norm": 0.041926600273946305, + "language_loss": 0.82459891, + "learning_rate": 0.0006508892769031638, + "loss": 0.83542007, + "num_input_tokens_seen": 181906896, + "router_z_loss_mlp": 0.36083984, + "step": 2185, + "time_per_iteration": 2.7021775245666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085419, + "balance_loss_mlp": 1.04972804, + "epoch": 0.42054636398614853, + "flos": 616628823552.0, + "grad_norm": 0.31605549573939495, + "language_loss": 0.86902821, + "learning_rate": 0.000650592230032506, + "loss": 0.87988245, + "num_input_tokens_seen": 181974976, + "router_z_loss_mlp": 0.35742188, + "step": 2186, + "time_per_iteration": 2.725625514984131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090024, + "balance_loss_mlp": 1.05175829, + "epoch": 0.42073874567141206, + "flos": 640077285888.0, + "grad_norm": 0.04878826269588872, + "language_loss": 0.84995645, + "learning_rate": 0.0006502951246967595, + "loss": 0.86085677, + "num_input_tokens_seen": 182054704, + "router_z_loss_mlp": 0.38256836, + "step": 2187, + "time_per_iteration": 2.8762335777282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092344, + "balance_loss_mlp": 1.05517459, + "epoch": 0.42093112735667565, + "flos": 493524168192.0, + "grad_norm": 0.05435264660880543, + "language_loss": 0.86905056, + "learning_rate": 0.0006499979610112706, + "loss": 0.87997395, + "num_input_tokens_seen": 182129696, + "router_z_loss_mlp": 0.37158203, + "step": 2188, + "time_per_iteration": 2.7210283279418945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105519, + "balance_loss_mlp": 1.06615603, + "epoch": 0.4211235090419392, + "flos": 542097321984.0, + "grad_norm": 0.05832158753777823, + "language_loss": 0.84076196, + "learning_rate": 0.000649700739091409, + "loss": 0.85181713, + "num_input_tokens_seen": 182203792, + "router_z_loss_mlp": 0.39331055, + "step": 2189, + "time_per_iteration": 2.70627498626709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109273, + "balance_loss_mlp": 1.09582591, + "epoch": 0.42131589072720277, + "flos": 1531342651392.0, + "grad_norm": 0.0317680876714807, + "language_loss": 0.73836273, + "learning_rate": 0.0006494034590525657, + "loss": 0.74945545, + "num_input_tokens_seen": 182432080, + "router_z_loss_mlp": 0.13476562, + "step": 2190, + "time_per_iteration": 4.8291919231414795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103729, + "balance_loss_mlp": 1.0656538, + "epoch": 0.42150827241246636, + "flos": 566583234048.0, + "grad_norm": 0.055290985630161965, + "language_loss": 0.85335857, + "learning_rate": 0.0006491061210101557, + "loss": 0.86439586, + "num_input_tokens_seen": 182500256, + "router_z_loss_mlp": 0.38037109, + "step": 2191, + "time_per_iteration": 2.669895887374878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096378, + "balance_loss_mlp": 1.05770612, + "epoch": 0.4217006540977299, + "flos": 707242226688.0, + "grad_norm": 0.050091435221191714, + "language_loss": 0.83998156, + "learning_rate": 0.0006488087250796157, + "loss": 0.85094529, + "num_input_tokens_seen": 182582912, + "router_z_loss_mlp": 0.38623047, + "step": 2192, + "time_per_iteration": 2.951594352722168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098297, + "balance_loss_mlp": 1.05864835, + "epoch": 0.4218930357829935, + "flos": 626975161344.0, + "grad_norm": 0.047618767001194696, + "language_loss": 0.81377089, + "learning_rate": 0.0006485112713764049, + "loss": 0.82475388, + "num_input_tokens_seen": 182670304, + "router_z_loss_mlp": 0.39624023, + "step": 2193, + "time_per_iteration": 2.943021535873413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095527, + "balance_loss_mlp": 1.05592585, + "epoch": 0.422085417468257, + "flos": 460110509568.0, + "grad_norm": 0.051159508672241207, + "language_loss": 0.83686495, + "learning_rate": 0.0006482137600160051, + "loss": 0.84782028, + "num_input_tokens_seen": 182735024, + "router_z_loss_mlp": 0.39575195, + "step": 2194, + "time_per_iteration": 2.5134236812591553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095144, + "balance_loss_mlp": 1.05590069, + "epoch": 0.4222777991535206, + "flos": 473788804608.0, + "grad_norm": 0.10490890222415104, + "language_loss": 0.84473735, + "learning_rate": 0.0006479161911139206, + "loss": 0.85568881, + "num_input_tokens_seen": 182805024, + "router_z_loss_mlp": 0.39208984, + "step": 2195, + "time_per_iteration": 2.577578544616699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096098, + "balance_loss_mlp": 1.05754566, + "epoch": 0.4224701808387841, + "flos": 470647494144.0, + "grad_norm": 0.0782943385788455, + "language_loss": 0.85684174, + "learning_rate": 0.0006476185647856778, + "loss": 0.86780274, + "num_input_tokens_seen": 182871360, + "router_z_loss_mlp": 0.38500977, + "step": 2196, + "time_per_iteration": 2.578495740890503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102208, + "balance_loss_mlp": 1.06286871, + "epoch": 0.4226625625240477, + "flos": 677202633216.0, + "grad_norm": 0.22187176821456261, + "language_loss": 0.81400013, + "learning_rate": 0.0006473208811468255, + "loss": 0.82502222, + "num_input_tokens_seen": 182952912, + "router_z_loss_mlp": 0.39306641, + "step": 2197, + "time_per_iteration": 2.870922088623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099017, + "balance_loss_mlp": 1.05984497, + "epoch": 0.4228549442093113, + "flos": 503268194304.0, + "grad_norm": 0.05214229642018916, + "language_loss": 0.8430717, + "learning_rate": 0.0006470231403129347, + "loss": 0.85406196, + "num_input_tokens_seen": 183022016, + "router_z_loss_mlp": 0.39135742, + "step": 2198, + "time_per_iteration": 2.5834295749664307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098125, + "balance_loss_mlp": 1.05959654, + "epoch": 0.42304732589457483, + "flos": 611543623680.0, + "grad_norm": 0.055955286861533095, + "language_loss": 0.81645906, + "learning_rate": 0.0006467253423995988, + "loss": 0.82744032, + "num_input_tokens_seen": 183101776, + "router_z_loss_mlp": 0.38500977, + "step": 2199, + "time_per_iteration": 2.8634603023529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097002, + "balance_loss_mlp": 1.05854511, + "epoch": 0.4232397075798384, + "flos": 515302345728.0, + "grad_norm": 0.05326479811347408, + "language_loss": 0.79026473, + "learning_rate": 0.000646427487522433, + "loss": 0.80123472, + "num_input_tokens_seen": 183171392, + "router_z_loss_mlp": 0.38452148, + "step": 2200, + "time_per_iteration": 2.649003744125366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102129, + "balance_loss_mlp": 1.063815, + "epoch": 0.42343208926510195, + "flos": 589513752576.0, + "grad_norm": 0.053706873495154336, + "language_loss": 0.83035368, + "learning_rate": 0.0006461295757970749, + "loss": 0.84137499, + "num_input_tokens_seen": 183253936, + "router_z_loss_mlp": 0.3828125, + "step": 2201, + "time_per_iteration": 2.8269903659820557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103583, + "balance_loss_mlp": 1.06379044, + "epoch": 0.42362447095036554, + "flos": 640342126080.0, + "grad_norm": 0.05615670023579285, + "language_loss": 0.8144629, + "learning_rate": 0.0006458316073391839, + "loss": 0.8254987, + "num_input_tokens_seen": 183333744, + "router_z_loss_mlp": 0.39770508, + "step": 2202, + "time_per_iteration": 2.9145257472991943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094508, + "balance_loss_mlp": 1.05595589, + "epoch": 0.42381685263562907, + "flos": 512421493248.0, + "grad_norm": 0.05176927409450969, + "language_loss": 0.87622833, + "learning_rate": 0.0006455335822644422, + "loss": 0.88717341, + "num_input_tokens_seen": 183401904, + "router_z_loss_mlp": 0.38525391, + "step": 2203, + "time_per_iteration": 2.596822500228882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099235, + "balance_loss_mlp": 1.06032515, + "epoch": 0.42400923432089266, + "flos": 546523393536.0, + "grad_norm": 0.08269999762480702, + "language_loss": 0.77441901, + "learning_rate": 0.0006452355006885527, + "loss": 0.78541136, + "num_input_tokens_seen": 183471312, + "router_z_loss_mlp": 0.38867188, + "step": 2204, + "time_per_iteration": 2.6238672733306885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105054, + "balance_loss_mlp": 1.06533396, + "epoch": 0.4242016160061562, + "flos": 621872584704.0, + "grad_norm": 0.06279334467905663, + "language_loss": 0.86963212, + "learning_rate": 0.0006449373627272412, + "loss": 0.88068271, + "num_input_tokens_seen": 183539184, + "router_z_loss_mlp": 0.39697266, + "step": 2205, + "time_per_iteration": 2.715792417526245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094037, + "balance_loss_mlp": 1.05515122, + "epoch": 0.4243939976914198, + "flos": 571649495040.0, + "grad_norm": 0.055815664393925046, + "language_loss": 0.82368463, + "learning_rate": 0.0006446391684962553, + "loss": 0.83462495, + "num_input_tokens_seen": 183607504, + "router_z_loss_mlp": 0.38867188, + "step": 2206, + "time_per_iteration": 2.642230987548828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096131, + "balance_loss_mlp": 1.05822253, + "epoch": 0.42458637937668336, + "flos": 448509934080.0, + "grad_norm": 0.05868479731789126, + "language_loss": 0.83175069, + "learning_rate": 0.000644340918111364, + "loss": 0.84271193, + "num_input_tokens_seen": 183674720, + "router_z_loss_mlp": 0.37841797, + "step": 2207, + "time_per_iteration": 2.5489144325256348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096536, + "balance_loss_mlp": 1.0566721, + "epoch": 0.4247787610619469, + "flos": 435176464896.0, + "grad_norm": 0.05469710752121124, + "language_loss": 0.84862429, + "learning_rate": 0.0006440426116883585, + "loss": 0.8595897, + "num_input_tokens_seen": 183740448, + "router_z_loss_mlp": 0.3984375, + "step": 2208, + "time_per_iteration": 2.5027823448181152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106001, + "balance_loss_mlp": 1.06563711, + "epoch": 0.4249711427472105, + "flos": 495818675712.0, + "grad_norm": 0.04694631121992161, + "language_loss": 0.86197406, + "learning_rate": 0.0006437442493430519, + "loss": 0.87303412, + "num_input_tokens_seen": 183812640, + "router_z_loss_mlp": 0.40356445, + "step": 2209, + "time_per_iteration": 2.624462127685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111613, + "balance_loss_mlp": 1.0711534, + "epoch": 0.425163524432474, + "flos": 655498649088.0, + "grad_norm": 0.06243114219893557, + "language_loss": 0.86437929, + "learning_rate": 0.000643445831191278, + "loss": 0.87549543, + "num_input_tokens_seen": 183895312, + "router_z_loss_mlp": 0.40454102, + "step": 2210, + "time_per_iteration": 2.883671760559082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110528, + "balance_loss_mlp": 1.06544065, + "epoch": 0.4253559061177376, + "flos": 650317496832.0, + "grad_norm": 0.059150918853506505, + "language_loss": 0.81800103, + "learning_rate": 0.0006431473573488937, + "loss": 0.82905388, + "num_input_tokens_seen": 183966384, + "router_z_loss_mlp": 0.39819336, + "step": 2211, + "time_per_iteration": 2.723308563232422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098753, + "balance_loss_mlp": 1.05807877, + "epoch": 0.42554828780300114, + "flos": 553894336512.0, + "grad_norm": 0.05841858860857517, + "language_loss": 0.84883767, + "learning_rate": 0.0006428488279317765, + "loss": 0.85982525, + "num_input_tokens_seen": 184031728, + "router_z_loss_mlp": 0.40673828, + "step": 2212, + "time_per_iteration": 2.628831148147583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098786, + "balance_loss_mlp": 1.05904126, + "epoch": 0.4257406694882647, + "flos": 514154386944.0, + "grad_norm": 0.056764121975701104, + "language_loss": 0.87647104, + "learning_rate": 0.0006425502430558259, + "loss": 0.88745892, + "num_input_tokens_seen": 184096160, + "router_z_loss_mlp": 0.39746094, + "step": 2213, + "time_per_iteration": 2.604146718978882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094618, + "balance_loss_mlp": 1.0550406, + "epoch": 0.42593305117352825, + "flos": 515380921344.0, + "grad_norm": 0.05046529876809897, + "language_loss": 0.84638417, + "learning_rate": 0.0006422516028369628, + "loss": 0.85733032, + "num_input_tokens_seen": 184169664, + "router_z_loss_mlp": 0.39550781, + "step": 2214, + "time_per_iteration": 2.6178741455078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088889, + "balance_loss_mlp": 1.04864407, + "epoch": 0.42612543285879184, + "flos": 587766302208.0, + "grad_norm": 0.04660283784017015, + "language_loss": 0.83496028, + "learning_rate": 0.0006419529073911296, + "loss": 0.84584916, + "num_input_tokens_seen": 184249152, + "router_z_loss_mlp": 0.40234375, + "step": 2215, + "time_per_iteration": 2.8105666637420654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085879, + "balance_loss_mlp": 1.04515672, + "epoch": 0.42631781454405543, + "flos": 635153619456.0, + "grad_norm": 0.05277435964401644, + "language_loss": 0.85660267, + "learning_rate": 0.0006416541568342901, + "loss": 0.86746144, + "num_input_tokens_seen": 184326816, + "router_z_loss_mlp": 0.40722656, + "step": 2216, + "time_per_iteration": 2.880662441253662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080832, + "balance_loss_mlp": 1.040277, + "epoch": 0.42651019622931896, + "flos": 540891136512.0, + "grad_norm": 0.04969535335028593, + "language_loss": 0.84409285, + "learning_rate": 0.0006413553512824297, + "loss": 0.85490113, + "num_input_tokens_seen": 184404336, + "router_z_loss_mlp": 0.40551758, + "step": 2217, + "time_per_iteration": 2.7169618606567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108871, + "balance_loss_mlp": 1.0485599, + "epoch": 0.42670257791458255, + "flos": 557892624384.0, + "grad_norm": 0.052410461022671016, + "language_loss": 0.84532559, + "learning_rate": 0.0006410564908515549, + "loss": 0.85621268, + "num_input_tokens_seen": 184472320, + "router_z_loss_mlp": 0.40136719, + "step": 2218, + "time_per_iteration": 2.657231092453003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077064, + "balance_loss_mlp": 1.03710461, + "epoch": 0.4268949595998461, + "flos": 621025781760.0, + "grad_norm": 0.054635208049088675, + "language_loss": 0.8539567, + "learning_rate": 0.0006407575756576935, + "loss": 0.86472738, + "num_input_tokens_seen": 184544704, + "router_z_loss_mlp": 0.39941406, + "step": 2219, + "time_per_iteration": 2.7336490154266357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089202, + "balance_loss_mlp": 1.04921913, + "epoch": 0.42708734128510967, + "flos": 537646519296.0, + "grad_norm": 0.04674173481591379, + "language_loss": 0.8770538, + "learning_rate": 0.0006404586058168951, + "loss": 0.88794577, + "num_input_tokens_seen": 184622544, + "router_z_loss_mlp": 0.3996582, + "step": 2220, + "time_per_iteration": 2.757380723953247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080306, + "balance_loss_mlp": 1.0401566, + "epoch": 0.4272797229703732, + "flos": 502617830400.0, + "grad_norm": 0.05080694298179496, + "language_loss": 0.86598134, + "learning_rate": 0.0006401595814452296, + "loss": 0.87678444, + "num_input_tokens_seen": 184692544, + "router_z_loss_mlp": 0.40136719, + "step": 2221, + "time_per_iteration": 2.583448886871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082627, + "balance_loss_mlp": 1.04252505, + "epoch": 0.4274721046556368, + "flos": 492208883712.0, + "grad_norm": 0.05244104927134987, + "language_loss": 0.80640519, + "learning_rate": 0.000639860502658789, + "loss": 0.81723142, + "num_input_tokens_seen": 184760480, + "router_z_loss_mlp": 0.40087891, + "step": 2222, + "time_per_iteration": 2.6454262733459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080551, + "balance_loss_mlp": 1.04149842, + "epoch": 0.4276644863409004, + "flos": 568094957568.0, + "grad_norm": 0.049852493850949496, + "language_loss": 0.84906983, + "learning_rate": 0.0006395613695736853, + "loss": 0.85987538, + "num_input_tokens_seen": 184834080, + "router_z_loss_mlp": 0.39038086, + "step": 2223, + "time_per_iteration": 2.6607768535614014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108883, + "balance_loss_mlp": 1.04841852, + "epoch": 0.4278568680261639, + "flos": 607155429888.0, + "grad_norm": 0.052366739862963044, + "language_loss": 0.8181783, + "learning_rate": 0.0006392621823060529, + "loss": 0.82906657, + "num_input_tokens_seen": 184905872, + "router_z_loss_mlp": 0.40405273, + "step": 2224, + "time_per_iteration": 2.7084245681762695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085727, + "balance_loss_mlp": 1.045434, + "epoch": 0.4280492497114275, + "flos": 560265707520.0, + "grad_norm": 0.062247479017330604, + "language_loss": 0.85044312, + "learning_rate": 0.0006389629409720465, + "loss": 0.86130041, + "num_input_tokens_seen": 184972320, + "router_z_loss_mlp": 0.40307617, + "step": 2225, + "time_per_iteration": 2.6494481563568115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083023, + "balance_loss_mlp": 1.04451835, + "epoch": 0.428241631396691, + "flos": 720334176768.0, + "grad_norm": 0.05784613309553924, + "language_loss": 0.88236213, + "learning_rate": 0.0006386636456878417, + "loss": 0.89319241, + "num_input_tokens_seen": 185051040, + "router_z_loss_mlp": 0.38452148, + "step": 2226, + "time_per_iteration": 2.8575398921966553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086633, + "balance_loss_mlp": 1.04643595, + "epoch": 0.4284340130819546, + "flos": 429243052032.0, + "grad_norm": 0.05660062263134159, + "language_loss": 0.9185167, + "learning_rate": 0.0006383642965696353, + "loss": 0.92938304, + "num_input_tokens_seen": 185113552, + "router_z_loss_mlp": 0.40185547, + "step": 2227, + "time_per_iteration": 2.436495065689087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093242, + "balance_loss_mlp": 1.05240059, + "epoch": 0.42862639476721814, + "flos": 524732069376.0, + "grad_norm": 0.06503204597883332, + "language_loss": 0.82736492, + "learning_rate": 0.000638064893733645, + "loss": 0.83829737, + "num_input_tokens_seen": 185185056, + "router_z_loss_mlp": 0.40844727, + "step": 2228, + "time_per_iteration": 2.737835645675659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097834, + "balance_loss_mlp": 1.05937719, + "epoch": 0.42881877645248173, + "flos": 465089430528.0, + "grad_norm": 0.05835798065495767, + "language_loss": 0.90023828, + "learning_rate": 0.000637765437296109, + "loss": 0.91121662, + "num_input_tokens_seen": 185257248, + "router_z_loss_mlp": 0.38427734, + "step": 2229, + "time_per_iteration": 2.6694185733795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102575, + "balance_loss_mlp": 1.06383204, + "epoch": 0.42901115813774526, + "flos": 560034362880.0, + "grad_norm": 0.048777417646368525, + "language_loss": 0.85443366, + "learning_rate": 0.000637465927373287, + "loss": 0.86545944, + "num_input_tokens_seen": 185324800, + "router_z_loss_mlp": 0.38720703, + "step": 2230, + "time_per_iteration": 2.608868360519409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097095, + "balance_loss_mlp": 1.05942452, + "epoch": 0.42920353982300885, + "flos": 561186703872.0, + "grad_norm": 0.058529600310023314, + "language_loss": 0.78994036, + "learning_rate": 0.000637166364081459, + "loss": 0.80091131, + "num_input_tokens_seen": 185393408, + "router_z_loss_mlp": 0.37670898, + "step": 2231, + "time_per_iteration": 2.6343741416931152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109752, + "balance_loss_mlp": 1.06089842, + "epoch": 0.42939592150827244, + "flos": 555982230528.0, + "grad_norm": 0.06635954042372831, + "language_loss": 0.84122705, + "learning_rate": 0.0006368667475369256, + "loss": 0.8522023, + "num_input_tokens_seen": 185467968, + "router_z_loss_mlp": 0.36621094, + "step": 2232, + "time_per_iteration": 2.719153881072998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01385097, + "balance_loss_mlp": 1.36373484, + "epoch": 0.42958830319353597, + "flos": 1520796902400.0, + "grad_norm": 0.10507214536659652, + "language_loss": 0.78527778, + "learning_rate": 0.0006365670778560084, + "loss": 0.79912877, + "num_input_tokens_seen": 185705232, + "router_z_loss_mlp": 0.21386719, + "step": 2233, + "time_per_iteration": 4.869459390640259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222541, + "balance_loss_mlp": 1.20547056, + "epoch": 0.42978068487879956, + "flos": 1495052522496.0, + "grad_norm": 0.06278147410173565, + "language_loss": 0.78895426, + "learning_rate": 0.0006362673551550494, + "loss": 0.80117965, + "num_input_tokens_seen": 185932672, + "router_z_loss_mlp": 0.17089844, + "step": 2234, + "time_per_iteration": 4.809493780136108 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101654, + "balance_loss_mlp": 1.06386471, + "epoch": 0.4299730665640631, + "flos": 546725624832.0, + "grad_norm": 0.047028007384334866, + "language_loss": 0.86220634, + "learning_rate": 0.0006359675795504112, + "loss": 0.87322283, + "num_input_tokens_seen": 186006288, + "router_z_loss_mlp": 0.37744141, + "step": 2235, + "time_per_iteration": 2.644548177719116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104888, + "balance_loss_mlp": 1.06671751, + "epoch": 0.4301654482493267, + "flos": 1128839473152.0, + "grad_norm": 0.053864842268977364, + "language_loss": 0.7475214, + "learning_rate": 0.0006356677511584775, + "loss": 0.75857025, + "num_input_tokens_seen": 186097168, + "router_z_loss_mlp": 0.38134766, + "step": 2236, + "time_per_iteration": 3.473637580871582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104941, + "balance_loss_mlp": 1.06784356, + "epoch": 0.4303578299345902, + "flos": 495502963200.0, + "grad_norm": 0.07035023985335077, + "language_loss": 0.8582648, + "learning_rate": 0.0006353678700956511, + "loss": 0.86931419, + "num_input_tokens_seen": 186163904, + "router_z_loss_mlp": 0.37084961, + "step": 2237, + "time_per_iteration": 2.5412683486938477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110161, + "balance_loss_mlp": 1.0728724, + "epoch": 0.4305502116198538, + "flos": 615472100352.0, + "grad_norm": 0.048926528615743585, + "language_loss": 0.83597398, + "learning_rate": 0.0006350679364783569, + "loss": 0.84707558, + "num_input_tokens_seen": 186233888, + "router_z_loss_mlp": 0.37255859, + "step": 2238, + "time_per_iteration": 2.7351441383361816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108038, + "balance_loss_mlp": 1.0704397, + "epoch": 0.4307425933051173, + "flos": 558995503104.0, + "grad_norm": 0.05635941331688695, + "language_loss": 0.85586011, + "learning_rate": 0.0006347679504230393, + "loss": 0.8669405, + "num_input_tokens_seen": 186301168, + "router_z_loss_mlp": 0.37573242, + "step": 2239, + "time_per_iteration": 2.628014326095581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108584, + "balance_loss_mlp": 1.06981754, + "epoch": 0.4309349749903809, + "flos": 971755163136.0, + "grad_norm": 0.06390031403556296, + "language_loss": 0.75844669, + "learning_rate": 0.0006344679120461632, + "loss": 0.76953256, + "num_input_tokens_seen": 186392096, + "router_z_loss_mlp": 0.38745117, + "step": 2240, + "time_per_iteration": 3.325970411300659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099219, + "balance_loss_mlp": 1.06123924, + "epoch": 0.4311273566756445, + "flos": 541663746048.0, + "grad_norm": 0.07957466882071795, + "language_loss": 0.79994094, + "learning_rate": 0.0006341678214642134, + "loss": 0.81093317, + "num_input_tokens_seen": 186458000, + "router_z_loss_mlp": 0.37963867, + "step": 2241, + "time_per_iteration": 2.598954916000366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098329, + "balance_loss_mlp": 1.06118321, + "epoch": 0.43131973836090803, + "flos": 761316627456.0, + "grad_norm": 0.06316124390987561, + "language_loss": 0.82909411, + "learning_rate": 0.0006338676787936963, + "loss": 0.8400774, + "num_input_tokens_seen": 186544992, + "router_z_loss_mlp": 0.37133789, + "step": 2242, + "time_per_iteration": 3.057990074157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092454, + "balance_loss_mlp": 1.0547359, + "epoch": 0.4315121200461716, + "flos": 554263893504.0, + "grad_norm": 0.058630582948494374, + "language_loss": 0.83799654, + "learning_rate": 0.0006335674841511367, + "loss": 0.84892106, + "num_input_tokens_seen": 186614960, + "router_z_loss_mlp": 0.37670898, + "step": 2243, + "time_per_iteration": 2.667917490005493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152602, + "balance_loss_mlp": 1.1380111, + "epoch": 0.43170450173143515, + "flos": 1484499419136.0, + "grad_norm": 0.03105866471095203, + "language_loss": 0.7918117, + "learning_rate": 0.000633267237653081, + "loss": 0.80333769, + "num_input_tokens_seen": 186854288, + "router_z_loss_mlp": 0.14550781, + "step": 2244, + "time_per_iteration": 4.996346473693848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147416, + "balance_loss_mlp": 1.13225269, + "epoch": 0.43189688341669874, + "flos": 1472897433600.0, + "grad_norm": 0.02634625536346193, + "language_loss": 0.77365553, + "learning_rate": 0.0006329669394160953, + "loss": 0.78512967, + "num_input_tokens_seen": 187090272, + "router_z_loss_mlp": 0.15136719, + "step": 2245, + "time_per_iteration": 4.925641775131226 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090243, + "balance_loss_mlp": 1.05293071, + "epoch": 0.43208926510196227, + "flos": 492677365248.0, + "grad_norm": 0.04832922480589342, + "language_loss": 0.82476389, + "learning_rate": 0.0006326665895567652, + "loss": 0.83566636, + "num_input_tokens_seen": 187157584, + "router_z_loss_mlp": 0.37304688, + "step": 2246, + "time_per_iteration": 2.6338651180267334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108663, + "balance_loss_mlp": 1.04876888, + "epoch": 0.43228164678722586, + "flos": 519969936384.0, + "grad_norm": 0.06353903654252775, + "language_loss": 0.86891162, + "learning_rate": 0.0006323661881916976, + "loss": 0.87977791, + "num_input_tokens_seen": 187229408, + "router_z_loss_mlp": 0.37841797, + "step": 2247, + "time_per_iteration": 2.7270143032073975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088946, + "balance_loss_mlp": 1.05082273, + "epoch": 0.4324740284724894, + "flos": 795722655744.0, + "grad_norm": 0.06655581665723238, + "language_loss": 0.81039822, + "learning_rate": 0.0006320657354375179, + "loss": 0.82128775, + "num_input_tokens_seen": 187304384, + "router_z_loss_mlp": 0.38134766, + "step": 2248, + "time_per_iteration": 2.9334113597869873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090387, + "balance_loss_mlp": 1.05183434, + "epoch": 0.432666410157753, + "flos": 481917800448.0, + "grad_norm": 0.05858711608638651, + "language_loss": 0.87308645, + "learning_rate": 0.0006317652314108726, + "loss": 0.88399029, + "num_input_tokens_seen": 187368064, + "router_z_loss_mlp": 0.38500977, + "step": 2249, + "time_per_iteration": 2.5155436992645264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083427, + "balance_loss_mlp": 1.04508948, + "epoch": 0.43285879184301657, + "flos": 499963940352.0, + "grad_norm": 0.06176153995331203, + "language_loss": 0.91197717, + "learning_rate": 0.0006314646762284277, + "loss": 0.92281145, + "num_input_tokens_seen": 187436320, + "router_z_loss_mlp": 0.38305664, + "step": 2250, + "time_per_iteration": 2.5938589572906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151081, + "balance_loss_mlp": 1.13324702, + "epoch": 0.4330511735282801, + "flos": 1509615346176.0, + "grad_norm": 0.03602865793169688, + "language_loss": 0.75425828, + "learning_rate": 0.0006311640700068691, + "loss": 0.76576912, + "num_input_tokens_seen": 187670912, + "router_z_loss_mlp": 0.17871094, + "step": 2251, + "time_per_iteration": 4.858763217926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082209, + "balance_loss_mlp": 1.04322791, + "epoch": 0.4332435552135437, + "flos": 699270382080.0, + "grad_norm": 0.07106828010915285, + "language_loss": 0.77364099, + "learning_rate": 0.0006308634128629022, + "loss": 0.78446311, + "num_input_tokens_seen": 187746432, + "router_z_loss_mlp": 0.3894043, + "step": 2252, + "time_per_iteration": 2.857311487197876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081116, + "balance_loss_mlp": 1.04163396, + "epoch": 0.4334359368988072, + "flos": 591995934720.0, + "grad_norm": 0.05494240381392999, + "language_loss": 0.87411273, + "learning_rate": 0.0006305627049132531, + "loss": 0.88492393, + "num_input_tokens_seen": 187820032, + "router_z_loss_mlp": 0.39453125, + "step": 2253, + "time_per_iteration": 2.7931392192840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074672, + "balance_loss_mlp": 1.03628647, + "epoch": 0.4336283185840708, + "flos": 842440670208.0, + "grad_norm": 0.045544810523015906, + "language_loss": 0.85602796, + "learning_rate": 0.0006302619462746662, + "loss": 0.86677468, + "num_input_tokens_seen": 187904400, + "router_z_loss_mlp": 0.38330078, + "step": 2254, + "time_per_iteration": 3.137031078338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072053, + "balance_loss_mlp": 1.03521752, + "epoch": 0.43382070026933434, + "flos": 625974179328.0, + "grad_norm": 0.05597321467051534, + "language_loss": 0.90273923, + "learning_rate": 0.0006299611370639069, + "loss": 0.91345972, + "num_input_tokens_seen": 187973264, + "router_z_loss_mlp": 0.36816406, + "step": 2255, + "time_per_iteration": 2.7370500564575195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078678, + "balance_loss_mlp": 1.04029226, + "epoch": 0.4340130819545979, + "flos": 590837801472.0, + "grad_norm": 0.05249156720482198, + "language_loss": 0.7960273, + "learning_rate": 0.0006296602773977593, + "loss": 0.80681407, + "num_input_tokens_seen": 188039984, + "router_z_loss_mlp": 0.38354492, + "step": 2256, + "time_per_iteration": 2.671543836593628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082288, + "balance_loss_mlp": 1.04387856, + "epoch": 0.4342054636398615, + "flos": 490624376832.0, + "grad_norm": 0.047941706130753194, + "language_loss": 0.87283635, + "learning_rate": 0.0006293593673930277, + "loss": 0.88365924, + "num_input_tokens_seen": 188113456, + "router_z_loss_mlp": 0.3840332, + "step": 2257, + "time_per_iteration": 2.622807741165161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084566, + "balance_loss_mlp": 1.04694366, + "epoch": 0.43439784532512504, + "flos": 698679654912.0, + "grad_norm": 0.05256563639723818, + "language_loss": 0.78625226, + "learning_rate": 0.0006290584071665358, + "loss": 0.79709792, + "num_input_tokens_seen": 188192480, + "router_z_loss_mlp": 0.3762207, + "step": 2258, + "time_per_iteration": 2.8814268112182617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084583, + "balance_loss_mlp": 1.0463171, + "epoch": 0.43459022701038863, + "flos": 485581436928.0, + "grad_norm": 0.05582719483060078, + "language_loss": 0.82315511, + "learning_rate": 0.0006287573968351266, + "loss": 0.83400095, + "num_input_tokens_seen": 188258784, + "router_z_loss_mlp": 0.38256836, + "step": 2259, + "time_per_iteration": 2.530107259750366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093075, + "balance_loss_mlp": 1.05585814, + "epoch": 0.43478260869565216, + "flos": 642818515968.0, + "grad_norm": 0.06362082652150813, + "language_loss": 0.82416236, + "learning_rate": 0.0006284563365156626, + "loss": 0.83509314, + "num_input_tokens_seen": 188331312, + "router_z_loss_mlp": 0.37182617, + "step": 2260, + "time_per_iteration": 2.798595905303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088803, + "balance_loss_mlp": 1.05103791, + "epoch": 0.43497499038091575, + "flos": 425870396928.0, + "grad_norm": 0.05655312611086985, + "language_loss": 0.87709838, + "learning_rate": 0.0006281552263250261, + "loss": 0.88798642, + "num_input_tokens_seen": 188393712, + "router_z_loss_mlp": 0.37719727, + "step": 2261, + "time_per_iteration": 2.452665090560913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160927, + "balance_loss_mlp": 1.14223516, + "epoch": 0.4351673720661793, + "flos": 1537594748928.0, + "grad_norm": 0.04176446008295971, + "language_loss": 0.80691534, + "learning_rate": 0.000627854066380118, + "loss": 0.8185246, + "num_input_tokens_seen": 188621152, + "router_z_loss_mlp": 0.18652344, + "step": 2262, + "time_per_iteration": 4.821255207061768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101716, + "balance_loss_mlp": 1.0650475, + "epoch": 0.43535975375144287, + "flos": 748828551168.0, + "grad_norm": 0.06957692587484587, + "language_loss": 0.81302369, + "learning_rate": 0.0006275528567978593, + "loss": 0.82404089, + "num_input_tokens_seen": 188697120, + "router_z_loss_mlp": 0.36669922, + "step": 2263, + "time_per_iteration": 2.9021594524383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105224, + "balance_loss_mlp": 1.06710052, + "epoch": 0.4355521354367064, + "flos": 860914593792.0, + "grad_norm": 0.05359116837259303, + "language_loss": 0.8251968, + "learning_rate": 0.0006272515976951898, + "loss": 0.83624899, + "num_input_tokens_seen": 188778480, + "router_z_loss_mlp": 0.38134766, + "step": 2264, + "time_per_iteration": 3.051140546798706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100567, + "balance_loss_mlp": 1.06160915, + "epoch": 0.43574451712197, + "flos": 734200146432.0, + "grad_norm": 0.04085362180640218, + "language_loss": 0.79003727, + "learning_rate": 0.0006269502891890687, + "loss": 0.80104291, + "num_input_tokens_seen": 188863616, + "router_z_loss_mlp": 0.38916016, + "step": 2265, + "time_per_iteration": 2.987435817718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109612, + "balance_loss_mlp": 1.05899858, + "epoch": 0.4359368988072336, + "flos": 570296332800.0, + "grad_norm": 0.04646658934269887, + "language_loss": 0.88059056, + "learning_rate": 0.0006266489313964743, + "loss": 0.89155173, + "num_input_tokens_seen": 188933984, + "router_z_loss_mlp": 0.37109375, + "step": 2266, + "time_per_iteration": 2.718259572982788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098621, + "balance_loss_mlp": 1.06040287, + "epoch": 0.4361292804924971, + "flos": 555244526592.0, + "grad_norm": 0.06168340797293566, + "language_loss": 0.85241735, + "learning_rate": 0.0006263475244344041, + "loss": 0.86340356, + "num_input_tokens_seen": 189012976, + "router_z_loss_mlp": 0.38183594, + "step": 2267, + "time_per_iteration": 2.822174072265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099998, + "balance_loss_mlp": 1.06232774, + "epoch": 0.4363216621777607, + "flos": 557021090304.0, + "grad_norm": 0.06545155195827496, + "language_loss": 0.84663981, + "learning_rate": 0.0006260460684198746, + "loss": 0.85763973, + "num_input_tokens_seen": 189079664, + "router_z_loss_mlp": 0.37646484, + "step": 2268, + "time_per_iteration": 2.652629852294922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092951, + "balance_loss_mlp": 1.05556679, + "epoch": 0.4365140438630242, + "flos": 477979149312.0, + "grad_norm": 0.06144025960698331, + "language_loss": 0.84485406, + "learning_rate": 0.0006257445634699213, + "loss": 0.85578358, + "num_input_tokens_seen": 189144688, + "router_z_loss_mlp": 0.3737793, + "step": 2269, + "time_per_iteration": 2.526547431945801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091306, + "balance_loss_mlp": 1.05506659, + "epoch": 0.4367064255482878, + "flos": 578646498816.0, + "grad_norm": 0.047950904811088546, + "language_loss": 0.82840669, + "learning_rate": 0.0006254430097015993, + "loss": 0.83931977, + "num_input_tokens_seen": 189213984, + "router_z_loss_mlp": 0.36279297, + "step": 2270, + "time_per_iteration": 2.6397740840911865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121077, + "balance_loss_mlp": 1.1094898, + "epoch": 0.43689880723355135, + "flos": 1458117669888.0, + "grad_norm": 0.029995875979849037, + "language_loss": 0.76479089, + "learning_rate": 0.0006251414072319815, + "loss": 0.77600169, + "num_input_tokens_seen": 189434416, + "router_z_loss_mlp": 0.11572266, + "step": 2271, + "time_per_iteration": 4.781012535095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093451, + "balance_loss_mlp": 1.0559721, + "epoch": 0.43709118891881493, + "flos": 667295663616.0, + "grad_norm": 0.05579821190743498, + "language_loss": 0.85169244, + "learning_rate": 0.0006248397561781609, + "loss": 0.86262697, + "num_input_tokens_seen": 189513248, + "router_z_loss_mlp": 0.37426758, + "step": 2272, + "time_per_iteration": 2.8750343322753906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109427, + "balance_loss_mlp": 1.05617118, + "epoch": 0.43728357060407846, + "flos": 544612999680.0, + "grad_norm": 0.06638881020832643, + "language_loss": 0.86299849, + "learning_rate": 0.0006245380566572482, + "loss": 0.87394118, + "num_input_tokens_seen": 189585392, + "router_z_loss_mlp": 0.38085938, + "step": 2273, + "time_per_iteration": 2.667287826538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095571, + "balance_loss_mlp": 1.05873561, + "epoch": 0.43747595228934205, + "flos": 746504930304.0, + "grad_norm": 0.06509502789500103, + "language_loss": 0.75652242, + "learning_rate": 0.0006242363087863744, + "loss": 0.76747811, + "num_input_tokens_seen": 189667552, + "router_z_loss_mlp": 0.36816406, + "step": 2274, + "time_per_iteration": 2.948168992996216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088988, + "balance_loss_mlp": 1.05060267, + "epoch": 0.43766833397460564, + "flos": 631060789248.0, + "grad_norm": 0.0773983629565932, + "language_loss": 0.85681164, + "learning_rate": 0.0006239345126826878, + "loss": 0.86770147, + "num_input_tokens_seen": 189742048, + "router_z_loss_mlp": 0.38354492, + "step": 2275, + "time_per_iteration": 2.7522637844085693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084857, + "balance_loss_mlp": 1.04682946, + "epoch": 0.43786071565986917, + "flos": 530709152256.0, + "grad_norm": 0.05397848209837344, + "language_loss": 0.84028137, + "learning_rate": 0.0006236326684633561, + "loss": 0.85112989, + "num_input_tokens_seen": 189817968, + "router_z_loss_mlp": 0.37988281, + "step": 2276, + "time_per_iteration": 2.8013172149658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083155, + "balance_loss_mlp": 1.04479384, + "epoch": 0.43805309734513276, + "flos": 538295473152.0, + "grad_norm": 0.057720697432170794, + "language_loss": 0.74613291, + "learning_rate": 0.0006233307762455658, + "loss": 0.75696445, + "num_input_tokens_seen": 189882608, + "router_z_loss_mlp": 0.38354492, + "step": 2277, + "time_per_iteration": 4.090092658996582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088607, + "balance_loss_mlp": 1.05057979, + "epoch": 0.4382454790303963, + "flos": 864188324352.0, + "grad_norm": 0.052083504639934525, + "language_loss": 0.83232701, + "learning_rate": 0.0006230288361465216, + "loss": 0.84321308, + "num_input_tokens_seen": 189960608, + "router_z_loss_mlp": 0.37988281, + "step": 2278, + "time_per_iteration": 3.0360679626464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092026, + "balance_loss_mlp": 1.05368817, + "epoch": 0.4384378607156599, + "flos": 765175292928.0, + "grad_norm": 0.0765632057362916, + "language_loss": 0.85051048, + "learning_rate": 0.0006227268482834473, + "loss": 0.86143076, + "num_input_tokens_seen": 190035472, + "router_z_loss_mlp": 0.38305664, + "step": 2279, + "time_per_iteration": 2.875603437423706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092125, + "balance_loss_mlp": 1.05369186, + "epoch": 0.4386302424009234, + "flos": 668260329984.0, + "grad_norm": 0.06746087226793605, + "language_loss": 0.87309432, + "learning_rate": 0.000622424812773585, + "loss": 0.88401562, + "num_input_tokens_seen": 190109312, + "router_z_loss_mlp": 0.3840332, + "step": 2280, + "time_per_iteration": 2.815737724304199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091003, + "balance_loss_mlp": 1.05335641, + "epoch": 0.438822624086187, + "flos": 484941247488.0, + "grad_norm": 0.06660247150401381, + "language_loss": 0.7952022, + "learning_rate": 0.000622122729734195, + "loss": 0.80611223, + "num_input_tokens_seen": 190174176, + "router_z_loss_mlp": 0.3762207, + "step": 2281, + "time_per_iteration": 2.528907060623169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010937, + "balance_loss_mlp": 1.05653024, + "epoch": 0.4390150057714506, + "flos": 498959986176.0, + "grad_norm": 0.07198447175498815, + "language_loss": 0.87400854, + "learning_rate": 0.0006218205992825566, + "loss": 0.88494551, + "num_input_tokens_seen": 190243888, + "router_z_loss_mlp": 0.37158203, + "step": 2282, + "time_per_iteration": 2.6437437534332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086909, + "balance_loss_mlp": 1.04895234, + "epoch": 0.4392073874567141, + "flos": 557937704448.0, + "grad_norm": 0.0537918663445124, + "language_loss": 0.81690598, + "learning_rate": 0.0006215184215359671, + "loss": 0.82777506, + "num_input_tokens_seen": 190317504, + "router_z_loss_mlp": 0.37939453, + "step": 2283, + "time_per_iteration": 2.7374680042266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082056, + "balance_loss_mlp": 1.04531598, + "epoch": 0.4393997691419777, + "flos": 605028248064.0, + "grad_norm": 0.053438963610997155, + "language_loss": 0.86718416, + "learning_rate": 0.0006212161966117425, + "loss": 0.87800473, + "num_input_tokens_seen": 190390160, + "router_z_loss_mlp": 0.36743164, + "step": 2284, + "time_per_iteration": 2.7031607627868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082719, + "balance_loss_mlp": 1.04476333, + "epoch": 0.43959215082724123, + "flos": 803812363776.0, + "grad_norm": 0.05414488390239245, + "language_loss": 0.81261152, + "learning_rate": 0.0006209139246272164, + "loss": 0.8234387, + "num_input_tokens_seen": 190467600, + "router_z_loss_mlp": 0.37915039, + "step": 2285, + "time_per_iteration": 2.942938804626465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081504, + "balance_loss_mlp": 1.04354775, + "epoch": 0.4397845325125048, + "flos": 487403080704.0, + "grad_norm": 0.06213580776851028, + "language_loss": 0.8193686, + "learning_rate": 0.0006206116056997421, + "loss": 0.83018363, + "num_input_tokens_seen": 190534192, + "router_z_loss_mlp": 0.37939453, + "step": 2286, + "time_per_iteration": 2.549246072769165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083431, + "balance_loss_mlp": 1.04671431, + "epoch": 0.43997691419776835, + "flos": 480569020416.0, + "grad_norm": 0.047189645190622125, + "language_loss": 0.82737786, + "learning_rate": 0.0006203092399466892, + "loss": 0.83821213, + "num_input_tokens_seen": 190601440, + "router_z_loss_mlp": 0.36694336, + "step": 2287, + "time_per_iteration": 2.533667802810669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079141, + "balance_loss_mlp": 1.04259157, + "epoch": 0.44016929588303194, + "flos": 482873702400.0, + "grad_norm": 0.04521232958061075, + "language_loss": 0.85280973, + "learning_rate": 0.0006200068274854473, + "loss": 0.86360115, + "num_input_tokens_seen": 190672528, + "router_z_loss_mlp": 0.36523438, + "step": 2288, + "time_per_iteration": 2.6336212158203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087714, + "balance_loss_mlp": 1.05013943, + "epoch": 0.4403616775682955, + "flos": 571562155008.0, + "grad_norm": 0.04238785738832165, + "language_loss": 0.85822582, + "learning_rate": 0.0006197043684334229, + "loss": 0.86910295, + "num_input_tokens_seen": 190750704, + "router_z_loss_mlp": 0.37548828, + "step": 2289, + "time_per_iteration": 2.7420616149902344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108734, + "balance_loss_mlp": 1.05028939, + "epoch": 0.44055405925355906, + "flos": 630563194368.0, + "grad_norm": 0.0573866619632787, + "language_loss": 0.79627317, + "learning_rate": 0.0006194018629080411, + "loss": 0.80714655, + "num_input_tokens_seen": 190821664, + "router_z_loss_mlp": 0.37036133, + "step": 2290, + "time_per_iteration": 2.7804791927337646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108993, + "balance_loss_mlp": 1.0514729, + "epoch": 0.44074644093882265, + "flos": 536523291648.0, + "grad_norm": 0.052070709818396434, + "language_loss": 0.81445479, + "learning_rate": 0.0006190993110267451, + "loss": 0.82535404, + "num_input_tokens_seen": 190893888, + "router_z_loss_mlp": 0.38427734, + "step": 2291, + "time_per_iteration": 2.6991255283355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079676, + "balance_loss_mlp": 1.04317451, + "epoch": 0.4409388226240862, + "flos": 462995744256.0, + "grad_norm": 0.05365602748785357, + "language_loss": 0.84155387, + "learning_rate": 0.0006187967129069958, + "loss": 0.85235059, + "num_input_tokens_seen": 190956800, + "router_z_loss_mlp": 0.36523438, + "step": 2292, + "time_per_iteration": 2.558609962463379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082437, + "balance_loss_mlp": 1.04569674, + "epoch": 0.44113120430934977, + "flos": 565717492224.0, + "grad_norm": 0.05065606510830679, + "language_loss": 0.87013716, + "learning_rate": 0.0006184940686662722, + "loss": 0.88096148, + "num_input_tokens_seen": 191032048, + "router_z_loss_mlp": 0.36743164, + "step": 2293, + "time_per_iteration": 2.753314733505249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078141, + "balance_loss_mlp": 1.04125786, + "epoch": 0.4413235859946133, + "flos": 543313681920.0, + "grad_norm": 0.05240936044313176, + "language_loss": 0.89929485, + "learning_rate": 0.0006181913784220714, + "loss": 0.91007626, + "num_input_tokens_seen": 191099952, + "router_z_loss_mlp": 0.36865234, + "step": 2294, + "time_per_iteration": 2.6420986652374268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111961, + "balance_loss_mlp": 1.09889555, + "epoch": 0.4415159676798769, + "flos": 1569016465920.0, + "grad_norm": 0.03544098021349555, + "language_loss": 0.80553782, + "learning_rate": 0.0006178886422919078, + "loss": 0.81665742, + "num_input_tokens_seen": 191335968, + "router_z_loss_mlp": 0.13085938, + "step": 2295, + "time_per_iteration": 4.864506483078003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085965, + "balance_loss_mlp": 1.04831886, + "epoch": 0.4417083493651404, + "flos": 658423171584.0, + "grad_norm": 0.06256258413724265, + "language_loss": 0.79847091, + "learning_rate": 0.0006175858603933146, + "loss": 0.80933058, + "num_input_tokens_seen": 191410112, + "router_z_loss_mlp": 0.3762207, + "step": 2296, + "time_per_iteration": 2.8739333152770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079624, + "balance_loss_mlp": 1.04328871, + "epoch": 0.441900731050404, + "flos": 740119002624.0, + "grad_norm": 0.05454759239937102, + "language_loss": 0.80644178, + "learning_rate": 0.0006172830328438416, + "loss": 0.81723803, + "num_input_tokens_seen": 191491552, + "router_z_loss_mlp": 0.36352539, + "step": 2297, + "time_per_iteration": 2.9661777019500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082208, + "balance_loss_mlp": 1.0437274, + "epoch": 0.44209311273566754, + "flos": 539153860608.0, + "grad_norm": 0.05386131456834753, + "language_loss": 0.87081188, + "learning_rate": 0.0006169801597610572, + "loss": 0.88163394, + "num_input_tokens_seen": 191567872, + "router_z_loss_mlp": 0.38452148, + "step": 2298, + "time_per_iteration": 2.732304573059082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107238, + "balance_loss_mlp": 1.03604531, + "epoch": 0.4422854944209311, + "flos": 621335702016.0, + "grad_norm": 0.07013675434202182, + "language_loss": 0.89663231, + "learning_rate": 0.0006166772412625469, + "loss": 0.90735614, + "num_input_tokens_seen": 191638032, + "router_z_loss_mlp": 0.36328125, + "step": 2299, + "time_per_iteration": 2.70890736579895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075195, + "balance_loss_mlp": 1.03793061, + "epoch": 0.4424778761061947, + "flos": 658516303872.0, + "grad_norm": 0.06419018913135732, + "language_loss": 0.81816357, + "learning_rate": 0.0006163742774659141, + "loss": 0.8289156, + "num_input_tokens_seen": 191709104, + "router_z_loss_mlp": 0.37255859, + "step": 2300, + "time_per_iteration": 2.830306053161621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081661, + "balance_loss_mlp": 1.0454216, + "epoch": 0.44267025779145824, + "flos": 568297188864.0, + "grad_norm": 0.05261241955347018, + "language_loss": 0.85695601, + "learning_rate": 0.0006160712684887801, + "loss": 0.86777264, + "num_input_tokens_seen": 191787072, + "router_z_loss_mlp": 0.36279297, + "step": 2301, + "time_per_iteration": 2.7931785583496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010826, + "balance_loss_mlp": 1.04600239, + "epoch": 0.44286263947672183, + "flos": 496469039616.0, + "grad_norm": 0.05340137710748247, + "language_loss": 0.81907189, + "learning_rate": 0.0006157682144487832, + "loss": 0.82989788, + "num_input_tokens_seen": 191863040, + "router_z_loss_mlp": 0.36572266, + "step": 2302, + "time_per_iteration": 2.7355551719665527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108501, + "balance_loss_mlp": 1.04793596, + "epoch": 0.44305502116198536, + "flos": 609096347136.0, + "grad_norm": 0.060309070663334345, + "language_loss": 0.82788789, + "learning_rate": 0.0006154651154635793, + "loss": 0.83873796, + "num_input_tokens_seen": 191940352, + "router_z_loss_mlp": 0.37084961, + "step": 2303, + "time_per_iteration": 2.8048007488250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088624, + "balance_loss_mlp": 1.05150199, + "epoch": 0.44324740284724895, + "flos": 470558744064.0, + "grad_norm": 0.05169590776144269, + "language_loss": 0.84867418, + "learning_rate": 0.0006151619716508421, + "loss": 0.85956049, + "num_input_tokens_seen": 192006896, + "router_z_loss_mlp": 0.37084961, + "step": 2304, + "time_per_iteration": 2.5419833660125732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087828, + "balance_loss_mlp": 1.05046785, + "epoch": 0.4434397845325125, + "flos": 578454441984.0, + "grad_norm": 0.05720417651641939, + "language_loss": 0.86974978, + "learning_rate": 0.0006148587831282625, + "loss": 0.88062799, + "num_input_tokens_seen": 192075312, + "router_z_loss_mlp": 0.37353516, + "step": 2305, + "time_per_iteration": 2.689751386642456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055473, + "balance_loss_mlp": 1.04326594, + "epoch": 0.44363216621777607, + "flos": 1495765343232.0, + "grad_norm": 0.012762307031937271, + "language_loss": 0.79176068, + "learning_rate": 0.0006145555500135483, + "loss": 0.80231541, + "num_input_tokens_seen": 192304816, + "router_z_loss_mlp": 0.12207031, + "step": 2306, + "time_per_iteration": 4.886535406112671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092659, + "balance_loss_mlp": 1.05699158, + "epoch": 0.44382454790303966, + "flos": 477082884096.0, + "grad_norm": 0.06286570611305137, + "language_loss": 0.86913157, + "learning_rate": 0.0006142522724244255, + "loss": 0.88005817, + "num_input_tokens_seen": 192369232, + "router_z_loss_mlp": 0.35693359, + "step": 2307, + "time_per_iteration": 2.499870777130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054077, + "balance_loss_mlp": 1.04177487, + "epoch": 0.4440169295883032, + "flos": 1543321548288.0, + "grad_norm": 0.013017387525484581, + "language_loss": 0.76484716, + "learning_rate": 0.0006139489504786368, + "loss": 0.775388, + "num_input_tokens_seen": 192600176, + "router_z_loss_mlp": 0.12255859, + "step": 2308, + "time_per_iteration": 4.8646886348724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087414, + "balance_loss_mlp": 1.05115092, + "epoch": 0.4442093112735668, + "flos": 590789749248.0, + "grad_norm": 0.050195382328210664, + "language_loss": 0.77274799, + "learning_rate": 0.000613645584293942, + "loss": 0.78362215, + "num_input_tokens_seen": 192675424, + "router_z_loss_mlp": 0.36279297, + "step": 2309, + "time_per_iteration": 2.877244472503662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087256, + "balance_loss_mlp": 1.05056334, + "epoch": 0.4444016929588303, + "flos": 530009326080.0, + "grad_norm": 0.047114011401622066, + "language_loss": 0.83068305, + "learning_rate": 0.0006133421739881185, + "loss": 0.8415556, + "num_input_tokens_seen": 192747552, + "router_z_loss_mlp": 0.36694336, + "step": 2310, + "time_per_iteration": 2.667240858078003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081979, + "balance_loss_mlp": 1.04557252, + "epoch": 0.4445940746440939, + "flos": 619947634176.0, + "grad_norm": 0.055208144480819774, + "language_loss": 0.82587862, + "learning_rate": 0.0006130387196789605, + "loss": 0.83669835, + "num_input_tokens_seen": 192819984, + "router_z_loss_mlp": 0.36425781, + "step": 2311, + "time_per_iteration": 2.7925667762756348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082597, + "balance_loss_mlp": 1.04704881, + "epoch": 0.4447864563293574, + "flos": 628782248448.0, + "grad_norm": 0.049856185775691036, + "language_loss": 0.83914995, + "learning_rate": 0.0006127352214842795, + "loss": 0.84997582, + "num_input_tokens_seen": 192906080, + "router_z_loss_mlp": 0.35571289, + "step": 2312, + "time_per_iteration": 2.9495813846588135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079657, + "balance_loss_mlp": 1.04236865, + "epoch": 0.444978838014621, + "flos": 650548841472.0, + "grad_norm": 0.0527905378587152, + "language_loss": 0.85049295, + "learning_rate": 0.0006124316795219041, + "loss": 0.8612895, + "num_input_tokens_seen": 192972336, + "router_z_loss_mlp": 0.37255859, + "step": 2313, + "time_per_iteration": 2.760117769241333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077597, + "balance_loss_mlp": 1.04119062, + "epoch": 0.44517121969988455, + "flos": 612153289728.0, + "grad_norm": 0.047764928605774304, + "language_loss": 0.82297838, + "learning_rate": 0.0006121280939096794, + "loss": 0.8337543, + "num_input_tokens_seen": 193045744, + "router_z_loss_mlp": 0.36401367, + "step": 2314, + "time_per_iteration": 2.737471580505371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075674, + "balance_loss_mlp": 1.0385046, + "epoch": 0.44536360138514813, + "flos": 488491402752.0, + "grad_norm": 0.07620217918322614, + "language_loss": 0.87685931, + "learning_rate": 0.000611824464765468, + "loss": 0.88761604, + "num_input_tokens_seen": 193115248, + "router_z_loss_mlp": 0.37133789, + "step": 2315, + "time_per_iteration": 2.5991926193237305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0103119, + "balance_loss_mlp": 1.01922143, + "epoch": 0.4455559830704117, + "flos": 1515425255424.0, + "grad_norm": 0.013293348061684912, + "language_loss": 0.78594941, + "learning_rate": 0.0006115207922071492, + "loss": 0.79626131, + "num_input_tokens_seen": 193330816, + "router_z_loss_mlp": 0.11962891, + "step": 2316, + "time_per_iteration": 4.652711391448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079239, + "balance_loss_mlp": 1.04335713, + "epoch": 0.44574836475567525, + "flos": 615314949120.0, + "grad_norm": 0.04747333782009751, + "language_loss": 0.85680878, + "learning_rate": 0.000611217076352619, + "loss": 0.86760116, + "num_input_tokens_seen": 193407616, + "router_z_loss_mlp": 0.35913086, + "step": 2317, + "time_per_iteration": 2.7729227542877197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077005, + "balance_loss_mlp": 1.04140949, + "epoch": 0.44594074644093884, + "flos": 506070471168.0, + "grad_norm": 0.2761075259266177, + "language_loss": 0.82980591, + "learning_rate": 0.0006109133173197905, + "loss": 0.84057599, + "num_input_tokens_seen": 193482624, + "router_z_loss_mlp": 0.35620117, + "step": 2318, + "time_per_iteration": 2.6684277057647705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087623, + "balance_loss_mlp": 1.05243218, + "epoch": 0.44613312812620237, + "flos": 726647321088.0, + "grad_norm": 0.057083346058123784, + "language_loss": 0.85251284, + "learning_rate": 0.0006106095152265935, + "loss": 0.86338907, + "num_input_tokens_seen": 193555952, + "router_z_loss_mlp": 0.35229492, + "step": 2319, + "time_per_iteration": 2.9197404384613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092234, + "balance_loss_mlp": 1.05604196, + "epoch": 0.44632550981146596, + "flos": 635419869696.0, + "grad_norm": 0.048967973341694476, + "language_loss": 0.8448627, + "learning_rate": 0.0006103056701909739, + "loss": 0.85578501, + "num_input_tokens_seen": 193636672, + "router_z_loss_mlp": 0.36230469, + "step": 2320, + "time_per_iteration": 2.885965347290039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101974, + "balance_loss_mlp": 1.06604421, + "epoch": 0.4465178914967295, + "flos": 826690447872.0, + "grad_norm": 0.04429440839494469, + "language_loss": 0.82779431, + "learning_rate": 0.0006100017823308956, + "loss": 0.83881408, + "num_input_tokens_seen": 193721728, + "router_z_loss_mlp": 0.35961914, + "step": 2321, + "time_per_iteration": 3.1523914337158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110877, + "balance_loss_mlp": 1.0737319, + "epoch": 0.4467102731819931, + "flos": 665532246528.0, + "grad_norm": 0.05773147459468349, + "language_loss": 0.79802787, + "learning_rate": 0.0006096978517643377, + "loss": 0.80913663, + "num_input_tokens_seen": 193795456, + "router_z_loss_mlp": 0.37158203, + "step": 2322, + "time_per_iteration": 2.8030614852905273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123327, + "balance_loss_mlp": 1.08668184, + "epoch": 0.4469026548672566, + "flos": 512692125696.0, + "grad_norm": 0.052696901781691036, + "language_loss": 0.83731532, + "learning_rate": 0.0006093938786092968, + "loss": 0.84854853, + "num_input_tokens_seen": 193865520, + "router_z_loss_mlp": 0.3659668, + "step": 2323, + "time_per_iteration": 2.6108593940734863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112741, + "balance_loss_mlp": 1.0761435, + "epoch": 0.4470950365525202, + "flos": 683774825472.0, + "grad_norm": 0.0683875942547517, + "language_loss": 0.89724207, + "learning_rate": 0.0006090898629837857, + "loss": 0.90836942, + "num_input_tokens_seen": 193935040, + "router_z_loss_mlp": 0.3659668, + "step": 2324, + "time_per_iteration": 2.8141510486602783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121866, + "balance_loss_mlp": 1.08515, + "epoch": 0.4472874182377838, + "flos": 627018831360.0, + "grad_norm": 0.05799026068482576, + "language_loss": 0.87375617, + "learning_rate": 0.0006087858050058337, + "loss": 0.88497484, + "num_input_tokens_seen": 194009120, + "router_z_loss_mlp": 0.3671875, + "step": 2325, + "time_per_iteration": 2.8174242973327637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106391, + "balance_loss_mlp": 1.07053268, + "epoch": 0.4474797999230473, + "flos": 546946795008.0, + "grad_norm": 0.06107345330372946, + "language_loss": 0.81985253, + "learning_rate": 0.0006084817047934866, + "loss": 0.8309164, + "num_input_tokens_seen": 194076672, + "router_z_loss_mlp": 0.35888672, + "step": 2326, + "time_per_iteration": 2.627870798110962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111871, + "balance_loss_mlp": 1.08211279, + "epoch": 0.4476721816083109, + "flos": 455585513472.0, + "grad_norm": 0.09021260210248909, + "language_loss": 0.89277744, + "learning_rate": 0.0006081775624648066, + "loss": 0.90396452, + "num_input_tokens_seen": 194142320, + "router_z_loss_mlp": 0.3659668, + "step": 2327, + "time_per_iteration": 2.517587900161743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107902, + "balance_loss_mlp": 1.07154357, + "epoch": 0.44786456329357444, + "flos": 481273228800.0, + "grad_norm": 0.05788938613905733, + "language_loss": 0.8277235, + "learning_rate": 0.0006078733781378721, + "loss": 0.83880252, + "num_input_tokens_seen": 194208560, + "router_z_loss_mlp": 0.36401367, + "step": 2328, + "time_per_iteration": 2.5216193199157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102085, + "balance_loss_mlp": 1.06579816, + "epoch": 0.448056944978838, + "flos": 551822409216.0, + "grad_norm": 0.05774471450654044, + "language_loss": 0.82095438, + "learning_rate": 0.0006075691519307781, + "loss": 0.83197522, + "num_input_tokens_seen": 194288080, + "router_z_loss_mlp": 0.36303711, + "step": 2329, + "time_per_iteration": 2.8394477367401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093614, + "balance_loss_mlp": 1.05551517, + "epoch": 0.44824932666410156, + "flos": 550571143680.0, + "grad_norm": 0.05485541452922095, + "language_loss": 0.82042563, + "learning_rate": 0.0006072648839616356, + "loss": 0.83136177, + "num_input_tokens_seen": 194358464, + "router_z_loss_mlp": 0.38061523, + "step": 2330, + "time_per_iteration": 2.650087594985962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089159, + "balance_loss_mlp": 1.05229926, + "epoch": 0.44844170834936514, + "flos": 988161541632.0, + "grad_norm": 0.0454185508799419, + "language_loss": 0.82814097, + "learning_rate": 0.0006069605743485718, + "loss": 0.83903253, + "num_input_tokens_seen": 194456112, + "router_z_loss_mlp": 0.3684082, + "step": 2331, + "time_per_iteration": 3.345179319381714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085878, + "balance_loss_mlp": 1.0494473, + "epoch": 0.44863409003462873, + "flos": 591040032768.0, + "grad_norm": 0.057018102026312835, + "language_loss": 0.83470714, + "learning_rate": 0.0006066562232097303, + "loss": 0.84556592, + "num_input_tokens_seen": 194526880, + "router_z_loss_mlp": 0.36425781, + "step": 2332, + "time_per_iteration": 2.7025153636932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089148, + "balance_loss_mlp": 1.0525744, + "epoch": 0.44882647171989226, + "flos": 724313525760.0, + "grad_norm": 0.055435808375502424, + "language_loss": 0.86104345, + "learning_rate": 0.0006063518306632708, + "loss": 0.87193495, + "num_input_tokens_seen": 194606800, + "router_z_loss_mlp": 0.36572266, + "step": 2333, + "time_per_iteration": 2.934469699859619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082178, + "balance_loss_mlp": 1.04465127, + "epoch": 0.44901885340515585, + "flos": 534662360064.0, + "grad_norm": 0.061394686563490536, + "language_loss": 0.82313985, + "learning_rate": 0.0006060473968273688, + "loss": 0.83396161, + "num_input_tokens_seen": 194679856, + "router_z_loss_mlp": 0.375, + "step": 2334, + "time_per_iteration": 2.6561286449432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139417, + "balance_loss_mlp": 1.12782979, + "epoch": 0.4492112350904194, + "flos": 1554456462336.0, + "grad_norm": 0.048192148717983975, + "language_loss": 0.77879542, + "learning_rate": 0.000605742921820216, + "loss": 0.79018956, + "num_input_tokens_seen": 194906320, + "router_z_loss_mlp": 0.11572266, + "step": 2335, + "time_per_iteration": 4.895314693450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092745, + "balance_loss_mlp": 1.08144426, + "epoch": 0.44940361677568297, + "flos": 1522525413888.0, + "grad_norm": 0.0355581806637232, + "language_loss": 0.81005216, + "learning_rate": 0.0006054384057600202, + "loss": 0.8209796, + "num_input_tokens_seen": 195129152, + "router_z_loss_mlp": 0.11279297, + "step": 2336, + "time_per_iteration": 4.86665940284729 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088054, + "balance_loss_mlp": 1.05064595, + "epoch": 0.4495959984609465, + "flos": 382289310720.0, + "grad_norm": 0.06064477802371089, + "language_loss": 0.88117951, + "learning_rate": 0.0006051338487650047, + "loss": 0.89206004, + "num_input_tokens_seen": 195189792, + "router_z_loss_mlp": 0.3737793, + "step": 2337, + "time_per_iteration": 2.4159162044525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086135, + "balance_loss_mlp": 1.04777336, + "epoch": 0.4497883801462101, + "flos": 497630145024.0, + "grad_norm": 0.058257925131248826, + "language_loss": 0.82456082, + "learning_rate": 0.0006048292509534095, + "loss": 0.83542222, + "num_input_tokens_seen": 195258640, + "router_z_loss_mlp": 0.38354492, + "step": 2338, + "time_per_iteration": 2.5835769176483154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081051, + "balance_loss_mlp": 1.04392958, + "epoch": 0.4499807618314736, + "flos": 614166990336.0, + "grad_norm": 0.053787147945734054, + "language_loss": 0.77580249, + "learning_rate": 0.0006045246124434895, + "loss": 0.78661299, + "num_input_tokens_seen": 195327984, + "router_z_loss_mlp": 0.37109375, + "step": 2339, + "time_per_iteration": 2.7258870601654053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080837, + "balance_loss_mlp": 1.04311895, + "epoch": 0.4501731435167372, + "flos": 1005122331648.0, + "grad_norm": 0.06446556175990359, + "language_loss": 0.86143219, + "learning_rate": 0.0006042199333535162, + "loss": 0.87224054, + "num_input_tokens_seen": 195409504, + "router_z_loss_mlp": 0.37695312, + "step": 2340, + "time_per_iteration": 3.2644054889678955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089021, + "balance_loss_mlp": 1.05132723, + "epoch": 0.4503655252020008, + "flos": 820519898112.0, + "grad_norm": 0.05440597484835576, + "language_loss": 0.8378191, + "learning_rate": 0.0006039152138017763, + "loss": 0.84870934, + "num_input_tokens_seen": 195489424, + "router_z_loss_mlp": 0.37695312, + "step": 2341, + "time_per_iteration": 3.0747756958007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082643, + "balance_loss_mlp": 1.04566467, + "epoch": 0.4505579068872643, + "flos": 486113937408.0, + "grad_norm": 0.06051531382505287, + "language_loss": 0.83470345, + "learning_rate": 0.0006036104539065726, + "loss": 0.84552985, + "num_input_tokens_seen": 195562128, + "router_z_loss_mlp": 0.36962891, + "step": 2342, + "time_per_iteration": 2.6581151485443115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076125, + "balance_loss_mlp": 1.03812099, + "epoch": 0.4507502885725279, + "flos": 884421282816.0, + "grad_norm": 0.05288539322407846, + "language_loss": 0.845487, + "learning_rate": 0.000603305653786223, + "loss": 0.85624826, + "num_input_tokens_seen": 195646800, + "router_z_loss_mlp": 0.37963867, + "step": 2343, + "time_per_iteration": 3.1298844814300537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079349, + "balance_loss_mlp": 1.04208446, + "epoch": 0.45094267025779144, + "flos": 578070328320.0, + "grad_norm": 0.04730162576611683, + "language_loss": 0.83859873, + "learning_rate": 0.0006030008135590622, + "loss": 0.84939224, + "num_input_tokens_seen": 195719648, + "router_z_loss_mlp": 0.37255859, + "step": 2344, + "time_per_iteration": 2.685067892074585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107533, + "balance_loss_mlp": 1.03799331, + "epoch": 0.45113505194305503, + "flos": 525124947456.0, + "grad_norm": 0.051192045733620226, + "language_loss": 0.80228901, + "learning_rate": 0.0006026959333434387, + "loss": 0.81304228, + "num_input_tokens_seen": 195794800, + "router_z_loss_mlp": 0.37353516, + "step": 2345, + "time_per_iteration": 2.783407688140869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107739, + "balance_loss_mlp": 1.04014897, + "epoch": 0.45132743362831856, + "flos": 501791376384.0, + "grad_norm": 0.05199160611628431, + "language_loss": 0.77699506, + "learning_rate": 0.0006023910132577181, + "loss": 0.78776896, + "num_input_tokens_seen": 195866848, + "router_z_loss_mlp": 0.37207031, + "step": 2346, + "time_per_iteration": 2.646801233291626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077398, + "balance_loss_mlp": 1.03968024, + "epoch": 0.45151981531358215, + "flos": 431690328576.0, + "grad_norm": 0.04922592508563583, + "language_loss": 0.84707314, + "learning_rate": 0.0006020860534202806, + "loss": 0.85784709, + "num_input_tokens_seen": 195930640, + "router_z_loss_mlp": 0.37670898, + "step": 2347, + "time_per_iteration": 2.4788920879364014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078489, + "balance_loss_mlp": 1.04036641, + "epoch": 0.4517121969988457, + "flos": 711826859520.0, + "grad_norm": 0.07725824631471088, + "language_loss": 0.80951411, + "learning_rate": 0.0006017810539495224, + "loss": 0.82029903, + "num_input_tokens_seen": 196014240, + "router_z_loss_mlp": 0.38110352, + "step": 2348, + "time_per_iteration": 3.013258934020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071848, + "balance_loss_mlp": 1.03587079, + "epoch": 0.45190457868410927, + "flos": 579197938176.0, + "grad_norm": 0.052394100693581906, + "language_loss": 0.82200068, + "learning_rate": 0.0006014760149638547, + "loss": 0.83271921, + "num_input_tokens_seen": 196083296, + "router_z_loss_mlp": 0.35986328, + "step": 2349, + "time_per_iteration": 2.6988728046417236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073469, + "balance_loss_mlp": 1.03823042, + "epoch": 0.45209696036937286, + "flos": 482415395328.0, + "grad_norm": 0.04812495303687425, + "language_loss": 0.88394493, + "learning_rate": 0.000601170936581704, + "loss": 0.89467961, + "num_input_tokens_seen": 196147840, + "router_z_loss_mlp": 0.35253906, + "step": 2350, + "time_per_iteration": 2.5537099838256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108294, + "balance_loss_mlp": 1.04617548, + "epoch": 0.4522893420546364, + "flos": 539945409024.0, + "grad_norm": 0.059990427154632556, + "language_loss": 0.84346575, + "learning_rate": 0.0006008658189215121, + "loss": 0.85429513, + "num_input_tokens_seen": 196219008, + "router_z_loss_mlp": 0.36767578, + "step": 2351, + "time_per_iteration": 2.649442434310913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084325, + "balance_loss_mlp": 1.04803789, + "epoch": 0.4524817237399, + "flos": 496423959552.0, + "grad_norm": 0.09153462549619036, + "language_loss": 0.7966159, + "learning_rate": 0.0006005606621017366, + "loss": 0.80745912, + "num_input_tokens_seen": 196287792, + "router_z_loss_mlp": 0.36328125, + "step": 2352, + "time_per_iteration": 2.55026912689209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086533, + "balance_loss_mlp": 1.04891062, + "epoch": 0.4526741054251635, + "flos": 652229300736.0, + "grad_norm": 0.05116414037173521, + "language_loss": 0.80266565, + "learning_rate": 0.0006002554662408496, + "loss": 0.81353092, + "num_input_tokens_seen": 196371776, + "router_z_loss_mlp": 0.3762207, + "step": 2353, + "time_per_iteration": 2.8708717823028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089479, + "balance_loss_mlp": 1.05259538, + "epoch": 0.4528664871104271, + "flos": 570674654208.0, + "grad_norm": 0.05934636879993742, + "language_loss": 0.91137719, + "learning_rate": 0.0005999502314573388, + "loss": 0.92227197, + "num_input_tokens_seen": 196441840, + "router_z_loss_mlp": 0.36865234, + "step": 2354, + "time_per_iteration": 2.636732339859009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091866, + "balance_loss_mlp": 1.05424321, + "epoch": 0.45305886879569063, + "flos": 458480922624.0, + "grad_norm": 0.06511026561582739, + "language_loss": 0.85993183, + "learning_rate": 0.0005996449578697066, + "loss": 0.87085044, + "num_input_tokens_seen": 196510464, + "router_z_loss_mlp": 0.3762207, + "step": 2355, + "time_per_iteration": 2.6497340202331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095767, + "balance_loss_mlp": 1.05916929, + "epoch": 0.4532512504809542, + "flos": 504922512384.0, + "grad_norm": 0.05408585590104452, + "language_loss": 0.81462455, + "learning_rate": 0.0005993396455964709, + "loss": 0.82558227, + "num_input_tokens_seen": 196583888, + "router_z_loss_mlp": 0.36621094, + "step": 2356, + "time_per_iteration": 2.67404842376709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090921, + "balance_loss_mlp": 1.05360866, + "epoch": 0.4534436321662178, + "flos": 581940578304.0, + "grad_norm": 0.046652791791384825, + "language_loss": 0.81415474, + "learning_rate": 0.0005990342947561647, + "loss": 0.82506394, + "num_input_tokens_seen": 196652816, + "router_z_loss_mlp": 0.37304688, + "step": 2357, + "time_per_iteration": 2.694093942642212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092731, + "balance_loss_mlp": 1.05577612, + "epoch": 0.45363601385148133, + "flos": 549458090496.0, + "grad_norm": 0.05811050095266086, + "language_loss": 0.77914369, + "learning_rate": 0.0005987289054673351, + "loss": 0.79007101, + "num_input_tokens_seen": 196720208, + "router_z_loss_mlp": 0.36987305, + "step": 2358, + "time_per_iteration": 2.6171157360076904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187917, + "balance_loss_mlp": 1.16912949, + "epoch": 0.4538283955367449, + "flos": 1473754411008.0, + "grad_norm": 0.03301673104438644, + "language_loss": 0.76575738, + "learning_rate": 0.0005984234778485451, + "loss": 0.77763653, + "num_input_tokens_seen": 196947696, + "router_z_loss_mlp": 0.1875, + "step": 2359, + "time_per_iteration": 4.821492910385132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096443, + "balance_loss_mlp": 1.05986929, + "epoch": 0.45402077722200845, + "flos": 584441699328.0, + "grad_norm": 0.059282629275687046, + "language_loss": 0.91217041, + "learning_rate": 0.0005981180120183722, + "loss": 0.92313486, + "num_input_tokens_seen": 197015712, + "router_z_loss_mlp": 0.36572266, + "step": 2360, + "time_per_iteration": 2.6678080558776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109692, + "balance_loss_mlp": 1.05901098, + "epoch": 0.45421315890727204, + "flos": 531462822912.0, + "grad_norm": 0.0444268091974553, + "language_loss": 0.85307455, + "learning_rate": 0.0005978125080954089, + "loss": 0.86404377, + "num_input_tokens_seen": 197094880, + "router_z_loss_mlp": 0.37915039, + "step": 2361, + "time_per_iteration": 2.7723591327667236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093514, + "balance_loss_mlp": 1.05651164, + "epoch": 0.4544055405925356, + "flos": 784890307584.0, + "grad_norm": 0.08031817047800895, + "language_loss": 0.7639026, + "learning_rate": 0.000597506966198262, + "loss": 0.77483773, + "num_input_tokens_seen": 197176448, + "router_z_loss_mlp": 0.36987305, + "step": 2362, + "time_per_iteration": 2.9897196292877197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109078, + "balance_loss_mlp": 1.05389667, + "epoch": 0.45459792227779916, + "flos": 517950443520.0, + "grad_norm": 0.07752194494873299, + "language_loss": 0.84128416, + "learning_rate": 0.0005972013864455536, + "loss": 0.85219198, + "num_input_tokens_seen": 197243520, + "router_z_loss_mlp": 0.36914062, + "step": 2363, + "time_per_iteration": 2.580357074737549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091271, + "balance_loss_mlp": 1.05515027, + "epoch": 0.4547903039630627, + "flos": 537306075648.0, + "grad_norm": 0.05808697989569881, + "language_loss": 0.85570788, + "learning_rate": 0.0005968957689559203, + "loss": 0.8666206, + "num_input_tokens_seen": 197311536, + "router_z_loss_mlp": 0.36132812, + "step": 2364, + "time_per_iteration": 2.64911150932312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095782, + "balance_loss_mlp": 1.05997205, + "epoch": 0.4549826856483263, + "flos": 528423409152.0, + "grad_norm": 0.05494979115149378, + "language_loss": 0.88544732, + "learning_rate": 0.0005965901138480131, + "loss": 0.8964051, + "num_input_tokens_seen": 197382752, + "router_z_loss_mlp": 0.35839844, + "step": 2365, + "time_per_iteration": 2.61967396736145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100343, + "balance_loss_mlp": 1.06379294, + "epoch": 0.45517506733358987, + "flos": 520649413632.0, + "grad_norm": 0.0583285525672419, + "language_loss": 0.87046576, + "learning_rate": 0.0005962844212404982, + "loss": 0.88146913, + "num_input_tokens_seen": 197456592, + "router_z_loss_mlp": 0.36547852, + "step": 2366, + "time_per_iteration": 2.663799524307251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108056, + "balance_loss_mlp": 1.07067156, + "epoch": 0.4553674490188534, + "flos": 450814616064.0, + "grad_norm": 0.06095483853323617, + "language_loss": 0.86969483, + "learning_rate": 0.0005959786912520558, + "loss": 0.88077545, + "num_input_tokens_seen": 197525408, + "router_z_loss_mlp": 0.37353516, + "step": 2367, + "time_per_iteration": 2.604011058807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104168, + "balance_loss_mlp": 1.06740427, + "epoch": 0.455559830704117, + "flos": 546308015616.0, + "grad_norm": 0.04613637765687707, + "language_loss": 0.83717126, + "learning_rate": 0.0005956729240013806, + "loss": 0.84821296, + "num_input_tokens_seen": 197608480, + "router_z_loss_mlp": 0.36743164, + "step": 2368, + "time_per_iteration": 2.7852706909179688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110558, + "balance_loss_mlp": 1.06917334, + "epoch": 0.4557522123893805, + "flos": 583491589632.0, + "grad_norm": 0.05161395773765414, + "language_loss": 0.91501808, + "learning_rate": 0.0005953671196071824, + "loss": 0.92607391, + "num_input_tokens_seen": 197678416, + "router_z_loss_mlp": 0.36401367, + "step": 2369, + "time_per_iteration": 2.7515223026275635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110279, + "balance_loss_mlp": 1.06681311, + "epoch": 0.4559445940746441, + "flos": 526149250560.0, + "grad_norm": 0.05240938085212211, + "language_loss": 0.80084532, + "learning_rate": 0.0005950612781881846, + "loss": 0.8118732, + "num_input_tokens_seen": 197753424, + "router_z_loss_mlp": 0.35986328, + "step": 2370, + "time_per_iteration": 2.6867175102233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096163, + "balance_loss_mlp": 1.05873156, + "epoch": 0.45613697575990764, + "flos": 651810281472.0, + "grad_norm": 0.06280114629685846, + "language_loss": 0.7594825, + "learning_rate": 0.0005947553998631259, + "loss": 0.77044415, + "num_input_tokens_seen": 197832080, + "router_z_loss_mlp": 0.37451172, + "step": 2371, + "time_per_iteration": 2.8399033546447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096425, + "balance_loss_mlp": 1.05985141, + "epoch": 0.4563293574451712, + "flos": 866744699904.0, + "grad_norm": 0.04396235367342953, + "language_loss": 0.78598678, + "learning_rate": 0.000594449484750758, + "loss": 0.79695106, + "num_input_tokens_seen": 197919536, + "router_z_loss_mlp": 0.36572266, + "step": 2372, + "time_per_iteration": 3.140890121459961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088429, + "balance_loss_mlp": 1.05140269, + "epoch": 0.45652173913043476, + "flos": 497817819648.0, + "grad_norm": 0.06709411136792778, + "language_loss": 0.82665753, + "learning_rate": 0.0005941435329698484, + "loss": 0.83754182, + "num_input_tokens_seen": 197991872, + "router_z_loss_mlp": 0.36987305, + "step": 2373, + "time_per_iteration": 2.6316027641296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089126, + "balance_loss_mlp": 1.05238533, + "epoch": 0.45671412081569834, + "flos": 560581420032.0, + "grad_norm": 0.05173954705628188, + "language_loss": 0.82881534, + "learning_rate": 0.0005938375446391778, + "loss": 0.83970654, + "num_input_tokens_seen": 198063392, + "router_z_loss_mlp": 0.36743164, + "step": 2374, + "time_per_iteration": 2.6999659538269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096506, + "balance_loss_mlp": 1.05823994, + "epoch": 0.45690650250096193, + "flos": 502873906176.0, + "grad_norm": 0.06488189122368912, + "language_loss": 0.88693655, + "learning_rate": 0.0005935315198775415, + "loss": 0.89790159, + "num_input_tokens_seen": 198131232, + "router_z_loss_mlp": 0.38232422, + "step": 2375, + "time_per_iteration": 2.584855556488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084023, + "balance_loss_mlp": 1.04675794, + "epoch": 0.45709888418622546, + "flos": 430473968640.0, + "grad_norm": 0.054054227258136585, + "language_loss": 0.86900407, + "learning_rate": 0.0005932254588037486, + "loss": 0.87984431, + "num_input_tokens_seen": 198194944, + "router_z_loss_mlp": 0.37207031, + "step": 2376, + "time_per_iteration": 2.4713377952575684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087126, + "balance_loss_mlp": 1.04907441, + "epoch": 0.45729126587148905, + "flos": 525395579904.0, + "grad_norm": 0.22673198102288197, + "language_loss": 0.86219609, + "learning_rate": 0.000592919361536623, + "loss": 0.87306732, + "num_input_tokens_seen": 198265728, + "router_z_loss_mlp": 0.38037109, + "step": 2377, + "time_per_iteration": 2.6324362754821777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074984, + "balance_loss_mlp": 1.03821993, + "epoch": 0.4574836475567526, + "flos": 637717349376.0, + "grad_norm": 0.06562895013351942, + "language_loss": 0.88980031, + "learning_rate": 0.0005926132281950017, + "loss": 0.90055019, + "num_input_tokens_seen": 198336640, + "router_z_loss_mlp": 0.36767578, + "step": 2378, + "time_per_iteration": 2.7336690425872803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079258, + "balance_loss_mlp": 1.04194546, + "epoch": 0.45767602924201617, + "flos": 649288811520.0, + "grad_norm": 0.05221471992659685, + "language_loss": 0.84916019, + "learning_rate": 0.0005923070588977367, + "loss": 0.85995281, + "num_input_tokens_seen": 198413552, + "router_z_loss_mlp": 0.37280273, + "step": 2379, + "time_per_iteration": 2.796694755554199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073672, + "balance_loss_mlp": 1.03745568, + "epoch": 0.4578684109272797, + "flos": 746356543488.0, + "grad_norm": 0.05948192069014845, + "language_loss": 0.86265379, + "learning_rate": 0.0005920008537636931, + "loss": 0.8733905, + "num_input_tokens_seen": 198490864, + "router_z_loss_mlp": 0.36230469, + "step": 2380, + "time_per_iteration": 2.919175863265991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073893, + "balance_loss_mlp": 1.03734303, + "epoch": 0.4580607926125433, + "flos": 641155433472.0, + "grad_norm": 0.07082348059879481, + "language_loss": 0.86767799, + "learning_rate": 0.0005916946129117504, + "loss": 0.8784169, + "num_input_tokens_seen": 198571200, + "router_z_loss_mlp": 0.36523438, + "step": 2381, + "time_per_iteration": 2.8834073543548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076603, + "balance_loss_mlp": 1.03983903, + "epoch": 0.4582531742978069, + "flos": 801513474048.0, + "grad_norm": 0.06015762492268947, + "language_loss": 0.80385733, + "learning_rate": 0.0005913883364608017, + "loss": 0.81462336, + "num_input_tokens_seen": 198658624, + "router_z_loss_mlp": 0.36791992, + "step": 2382, + "time_per_iteration": 3.05711030960083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077489, + "balance_loss_mlp": 1.03984237, + "epoch": 0.4584455559830704, + "flos": 683991613440.0, + "grad_norm": 0.05122280126715116, + "language_loss": 0.88575673, + "learning_rate": 0.0005910820245297542, + "loss": 0.89653164, + "num_input_tokens_seen": 198731312, + "router_z_loss_mlp": 0.37646484, + "step": 2383, + "time_per_iteration": 2.8739712238311768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107409, + "balance_loss_mlp": 1.03682566, + "epoch": 0.458637937668334, + "flos": 517902391296.0, + "grad_norm": 0.06830932289634356, + "language_loss": 0.80442882, + "learning_rate": 0.000590775677237529, + "loss": 0.81516975, + "num_input_tokens_seen": 198805296, + "router_z_loss_mlp": 0.37231445, + "step": 2384, + "time_per_iteration": 2.7162787914276123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083903, + "balance_loss_mlp": 1.04585159, + "epoch": 0.4588303193535975, + "flos": 505242607104.0, + "grad_norm": 0.06045305543182838, + "language_loss": 0.80110037, + "learning_rate": 0.0005904692947030601, + "loss": 0.81193942, + "num_input_tokens_seen": 198872112, + "router_z_loss_mlp": 0.38012695, + "step": 2385, + "time_per_iteration": 2.615645408630371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077094, + "balance_loss_mlp": 1.04054475, + "epoch": 0.4590227010388611, + "flos": 495655732224.0, + "grad_norm": 0.07817461665700527, + "language_loss": 0.89474368, + "learning_rate": 0.0005901628770452963, + "loss": 0.90551466, + "num_input_tokens_seen": 198938480, + "router_z_loss_mlp": 0.36572266, + "step": 2386, + "time_per_iteration": 2.545145273208618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076838, + "balance_loss_mlp": 1.03952503, + "epoch": 0.45921508272412465, + "flos": 493375781376.0, + "grad_norm": 0.05719900676000999, + "language_loss": 0.87518173, + "learning_rate": 0.000589856424383199, + "loss": 0.88595015, + "num_input_tokens_seen": 199008608, + "router_z_loss_mlp": 0.37280273, + "step": 2387, + "time_per_iteration": 2.5866873264312744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078911, + "balance_loss_mlp": 1.04000092, + "epoch": 0.45940746440938823, + "flos": 691096306176.0, + "grad_norm": 0.05272732350360167, + "language_loss": 0.82854474, + "learning_rate": 0.000589549936835744, + "loss": 0.83933389, + "num_input_tokens_seen": 199084592, + "router_z_loss_mlp": 0.38867188, + "step": 2388, + "time_per_iteration": 2.886815309524536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082474, + "balance_loss_mlp": 1.04485154, + "epoch": 0.45959984609465176, + "flos": 503489364480.0, + "grad_norm": 0.061476086167368736, + "language_loss": 0.79490817, + "learning_rate": 0.0005892434145219202, + "loss": 0.80573285, + "num_input_tokens_seen": 199151504, + "router_z_loss_mlp": 0.37597656, + "step": 2389, + "time_per_iteration": 2.669055461883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078287, + "balance_loss_mlp": 1.04035497, + "epoch": 0.45979222777991535, + "flos": 676339863552.0, + "grad_norm": 0.13998924312013794, + "language_loss": 0.82966721, + "learning_rate": 0.0005889368575607303, + "loss": 0.84045005, + "num_input_tokens_seen": 199224528, + "router_z_loss_mlp": 0.37890625, + "step": 2390, + "time_per_iteration": 2.8364429473876953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075252, + "balance_loss_mlp": 1.03941786, + "epoch": 0.45998460946517894, + "flos": 777308368896.0, + "grad_norm": 0.05472501976139028, + "language_loss": 0.78496212, + "learning_rate": 0.00058863026607119, + "loss": 0.79571462, + "num_input_tokens_seen": 199312512, + "router_z_loss_mlp": 0.35864258, + "step": 2391, + "time_per_iteration": 3.104703664779663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078215, + "balance_loss_mlp": 1.04059267, + "epoch": 0.46017699115044247, + "flos": 851073053184.0, + "grad_norm": 0.06149888926191146, + "language_loss": 0.79584855, + "learning_rate": 0.0005883236401723287, + "loss": 0.80663073, + "num_input_tokens_seen": 199397216, + "router_z_loss_mlp": 0.37597656, + "step": 2392, + "time_per_iteration": 3.1967198848724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074328, + "balance_loss_mlp": 1.03603745, + "epoch": 0.46036937283570606, + "flos": 575608495104.0, + "grad_norm": 0.05401888737183198, + "language_loss": 0.84525239, + "learning_rate": 0.0005880169799831893, + "loss": 0.85599566, + "num_input_tokens_seen": 199464288, + "router_z_loss_mlp": 0.3828125, + "step": 2393, + "time_per_iteration": 2.6700267791748047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078621, + "balance_loss_mlp": 1.04049826, + "epoch": 0.4605617545209696, + "flos": 611553798144.0, + "grad_norm": 0.04760801272162673, + "language_loss": 0.81405449, + "learning_rate": 0.0005877102856228278, + "loss": 0.82484066, + "num_input_tokens_seen": 199538096, + "router_z_loss_mlp": 0.38110352, + "step": 2394, + "time_per_iteration": 2.8472628593444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079988, + "balance_loss_mlp": 1.04100633, + "epoch": 0.4607541362062332, + "flos": 532884386304.0, + "grad_norm": 0.0583897063043048, + "language_loss": 0.84685498, + "learning_rate": 0.0005874035572103133, + "loss": 0.85765481, + "num_input_tokens_seen": 199609504, + "router_z_loss_mlp": 0.38964844, + "step": 2395, + "time_per_iteration": 2.6390676498413086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081925, + "balance_loss_mlp": 1.04437459, + "epoch": 0.4609465178914967, + "flos": 647023417344.0, + "grad_norm": 0.07571396195119524, + "language_loss": 0.82582867, + "learning_rate": 0.0005870967948647288, + "loss": 0.83664787, + "num_input_tokens_seen": 199678960, + "router_z_loss_mlp": 0.37573242, + "step": 2396, + "time_per_iteration": 2.7459003925323486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115963, + "balance_loss_mlp": 1.09889209, + "epoch": 0.4611388995767603, + "flos": 1465487202816.0, + "grad_norm": 0.025541481833947964, + "language_loss": 0.743083, + "learning_rate": 0.0005867899987051693, + "loss": 0.75424266, + "num_input_tokens_seen": 199903568, + "router_z_loss_mlp": 0.17089844, + "step": 2397, + "time_per_iteration": 5.318708896636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083182, + "balance_loss_mlp": 1.04446316, + "epoch": 0.46133128126202383, + "flos": 722772688896.0, + "grad_norm": 0.0770893227760576, + "language_loss": 0.8586902, + "learning_rate": 0.0005864831688507443, + "loss": 0.86952198, + "num_input_tokens_seen": 199988672, + "router_z_loss_mlp": 0.38696289, + "step": 2398, + "time_per_iteration": 3.0177690982818604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092841, + "balance_loss_mlp": 1.05266774, + "epoch": 0.4615236629472874, + "flos": 547735371264.0, + "grad_norm": 0.05577558539065206, + "language_loss": 0.74877977, + "learning_rate": 0.0005861763054205754, + "loss": 0.75970817, + "num_input_tokens_seen": 200062304, + "router_z_loss_mlp": 0.40161133, + "step": 2399, + "time_per_iteration": 4.235994815826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089549, + "balance_loss_mlp": 1.04885101, + "epoch": 0.461716044632551, + "flos": 601942192128.0, + "grad_norm": 0.04983292023279428, + "language_loss": 0.80479169, + "learning_rate": 0.0005858694085337976, + "loss": 0.81568718, + "num_input_tokens_seen": 200138464, + "router_z_loss_mlp": 0.40698242, + "step": 2400, + "time_per_iteration": 2.807819366455078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095586, + "balance_loss_mlp": 1.0549593, + "epoch": 0.46190842631781454, + "flos": 474236937216.0, + "grad_norm": 0.0664642499777789, + "language_loss": 0.8348912, + "learning_rate": 0.0005855624783095589, + "loss": 0.84584707, + "num_input_tokens_seen": 200205728, + "router_z_loss_mlp": 0.40625, + "step": 2401, + "time_per_iteration": 2.572861909866333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088871, + "balance_loss_mlp": 1.04848242, + "epoch": 0.4621008080030781, + "flos": 437254184448.0, + "grad_norm": 0.05436683283363487, + "language_loss": 0.85176182, + "learning_rate": 0.00058525551486702, + "loss": 0.86265051, + "num_input_tokens_seen": 200269824, + "router_z_loss_mlp": 0.40405273, + "step": 2402, + "time_per_iteration": 2.5116658210754395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091528, + "balance_loss_mlp": 1.05056739, + "epoch": 0.46229318968834165, + "flos": 525203523072.0, + "grad_norm": 0.06054832474170735, + "language_loss": 0.81057394, + "learning_rate": 0.0005849485183253548, + "loss": 0.82148921, + "num_input_tokens_seen": 200341264, + "router_z_loss_mlp": 0.40942383, + "step": 2403, + "time_per_iteration": 2.6135447025299072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109357, + "balance_loss_mlp": 1.05446947, + "epoch": 0.46248557137360524, + "flos": 439395922944.0, + "grad_norm": 0.05271308957386849, + "language_loss": 0.87085575, + "learning_rate": 0.0005846414888037501, + "loss": 0.88179141, + "num_input_tokens_seen": 200405632, + "router_z_loss_mlp": 0.39086914, + "step": 2404, + "time_per_iteration": 2.479233503341675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094001, + "balance_loss_mlp": 1.05513883, + "epoch": 0.4626779530588688, + "flos": 617318475264.0, + "grad_norm": 0.05681624365321511, + "language_loss": 0.82982111, + "learning_rate": 0.0005843344264214049, + "loss": 0.84076107, + "num_input_tokens_seen": 200479312, + "router_z_loss_mlp": 0.38818359, + "step": 2405, + "time_per_iteration": 2.8025927543640137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094528, + "balance_loss_mlp": 1.05478346, + "epoch": 0.46287033474413236, + "flos": 669796784640.0, + "grad_norm": 0.07573173665893672, + "language_loss": 0.84474289, + "learning_rate": 0.0005840273312975317, + "loss": 0.8556881, + "num_input_tokens_seen": 200552976, + "router_z_loss_mlp": 0.39746094, + "step": 2406, + "time_per_iteration": 2.880143642425537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096681, + "balance_loss_mlp": 1.05705631, + "epoch": 0.46306271642939595, + "flos": 479992849920.0, + "grad_norm": 0.09801123732991168, + "language_loss": 0.90446943, + "learning_rate": 0.0005837202035513555, + "loss": 0.91543621, + "num_input_tokens_seen": 200621088, + "router_z_loss_mlp": 0.39599609, + "step": 2407, + "time_per_iteration": 2.5880489349365234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109455, + "balance_loss_mlp": 1.05583048, + "epoch": 0.4632550981146595, + "flos": 580395359232.0, + "grad_norm": 0.057934056350582984, + "language_loss": 0.81573331, + "learning_rate": 0.0005834130433021136, + "loss": 0.82667881, + "num_input_tokens_seen": 200698400, + "router_z_loss_mlp": 0.38671875, + "step": 2408, + "time_per_iteration": 2.739018201828003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100791, + "balance_loss_mlp": 1.06121325, + "epoch": 0.46344747979992307, + "flos": 523701974016.0, + "grad_norm": 0.11568384778980019, + "language_loss": 0.73278892, + "learning_rate": 0.0005831058506690563, + "loss": 0.74379677, + "num_input_tokens_seen": 200767264, + "router_z_loss_mlp": 0.39550781, + "step": 2409, + "time_per_iteration": 2.6164803504943848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109281, + "balance_loss_mlp": 1.05513954, + "epoch": 0.4636398614851866, + "flos": 746174661120.0, + "grad_norm": 0.10585491609730635, + "language_loss": 0.85966945, + "learning_rate": 0.0005827986257714464, + "loss": 0.87059754, + "num_input_tokens_seen": 200841440, + "router_z_loss_mlp": 0.3762207, + "step": 2410, + "time_per_iteration": 2.9002575874328613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108768, + "balance_loss_mlp": 1.05008137, + "epoch": 0.4638322431704502, + "flos": 596273619456.0, + "grad_norm": 0.054458395819511424, + "language_loss": 0.88645154, + "learning_rate": 0.0005824913687285591, + "loss": 0.89732838, + "num_input_tokens_seen": 200911296, + "router_z_loss_mlp": 0.37597656, + "step": 2411, + "time_per_iteration": 2.65468168258667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108455, + "balance_loss_mlp": 1.046808, + "epoch": 0.4640246248557137, + "flos": 539172799488.0, + "grad_norm": 0.10537111148670983, + "language_loss": 0.81237781, + "learning_rate": 0.0005821840796596821, + "loss": 0.82322335, + "num_input_tokens_seen": 200981920, + "router_z_loss_mlp": 0.37744141, + "step": 2412, + "time_per_iteration": 2.64800763130188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086752, + "balance_loss_mlp": 1.04979706, + "epoch": 0.4642170065409773, + "flos": 562330280448.0, + "grad_norm": 0.05022524173963101, + "language_loss": 0.80493259, + "learning_rate": 0.0005818767586841158, + "loss": 0.81580019, + "num_input_tokens_seen": 201059392, + "router_z_loss_mlp": 0.36962891, + "step": 2413, + "time_per_iteration": 2.755119800567627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081928, + "balance_loss_mlp": 1.04657054, + "epoch": 0.46440938822624084, + "flos": 530684421120.0, + "grad_norm": 0.05374997972366647, + "language_loss": 0.86088538, + "learning_rate": 0.0005815694059211726, + "loss": 0.87170464, + "num_input_tokens_seen": 201130192, + "router_z_loss_mlp": 0.35400391, + "step": 2414, + "time_per_iteration": 2.6568868160247803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112565, + "balance_loss_mlp": 1.09606647, + "epoch": 0.4646017699115044, + "flos": 1525503780864.0, + "grad_norm": 0.029698276976430914, + "language_loss": 0.80873632, + "learning_rate": 0.0005812620214901778, + "loss": 0.81986189, + "num_input_tokens_seen": 201354720, + "router_z_loss_mlp": 0.16503906, + "step": 2415, + "time_per_iteration": 4.772961378097534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103885, + "balance_loss_mlp": 1.08795917, + "epoch": 0.464794151596768, + "flos": 1539999765504.0, + "grad_norm": 0.029205098078145548, + "language_loss": 0.7694506, + "learning_rate": 0.000580954605510468, + "loss": 0.78048944, + "num_input_tokens_seen": 201592096, + "router_z_loss_mlp": 0.15917969, + "step": 2416, + "time_per_iteration": 4.972976446151733 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085394, + "balance_loss_mlp": 1.04908264, + "epoch": 0.46498653328203154, + "flos": 501200649216.0, + "grad_norm": 0.04510206741076235, + "language_loss": 0.86396641, + "learning_rate": 0.0005806471581013931, + "loss": 0.87482029, + "num_input_tokens_seen": 201666160, + "router_z_loss_mlp": 0.36328125, + "step": 2417, + "time_per_iteration": 2.6620965003967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084664, + "balance_loss_mlp": 1.04806709, + "epoch": 0.46517891496729513, + "flos": 675856825344.0, + "grad_norm": 0.06302462590955567, + "language_loss": 0.78826416, + "learning_rate": 0.0005803396793823146, + "loss": 0.79911077, + "num_input_tokens_seen": 201733552, + "router_z_loss_mlp": 0.36572266, + "step": 2418, + "time_per_iteration": 2.7901804447174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108772, + "balance_loss_mlp": 1.05190992, + "epoch": 0.46537129665255866, + "flos": 585062949888.0, + "grad_norm": 0.06339234247272847, + "language_loss": 0.85623956, + "learning_rate": 0.0005800321694726065, + "loss": 0.86711681, + "num_input_tokens_seen": 201806128, + "router_z_loss_mlp": 0.35839844, + "step": 2419, + "time_per_iteration": 2.728811740875244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085796, + "balance_loss_mlp": 1.04836476, + "epoch": 0.46556367833782225, + "flos": 587425858560.0, + "grad_norm": 0.05222204092555794, + "language_loss": 0.8708874, + "learning_rate": 0.0005797246284916545, + "loss": 0.88174534, + "num_input_tokens_seen": 201874224, + "router_z_loss_mlp": 0.37402344, + "step": 2420, + "time_per_iteration": 2.6684653759002686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045596, + "balance_loss_mlp": 1.03043234, + "epoch": 0.4657560600230858, + "flos": 1484674099200.0, + "grad_norm": 0.011675297447767578, + "language_loss": 0.77505189, + "learning_rate": 0.0005794170565588569, + "loss": 0.78550786, + "num_input_tokens_seen": 202111648, + "router_z_loss_mlp": 0.15136719, + "step": 2421, + "time_per_iteration": 4.958959102630615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109154, + "balance_loss_mlp": 1.05506182, + "epoch": 0.46594844170834937, + "flos": 579961783296.0, + "grad_norm": 0.06275032464162542, + "language_loss": 0.88184166, + "learning_rate": 0.0005791094537936233, + "loss": 0.89275706, + "num_input_tokens_seen": 202183344, + "router_z_loss_mlp": 0.36499023, + "step": 2422, + "time_per_iteration": 2.682985782623291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085598, + "balance_loss_mlp": 1.04761815, + "epoch": 0.4661408233936129, + "flos": 512322568704.0, + "grad_norm": 0.05420418194823272, + "language_loss": 0.8170498, + "learning_rate": 0.0005788018203153762, + "loss": 0.82790577, + "num_input_tokens_seen": 202252512, + "router_z_loss_mlp": 0.37988281, + "step": 2423, + "time_per_iteration": 2.5706470012664795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085407, + "balance_loss_mlp": 1.04883409, + "epoch": 0.4663332050788765, + "flos": 490839754752.0, + "grad_norm": 0.06546291293651209, + "language_loss": 0.85642946, + "learning_rate": 0.000578494156243549, + "loss": 0.86728358, + "num_input_tokens_seen": 202320096, + "router_z_loss_mlp": 0.36572266, + "step": 2424, + "time_per_iteration": 2.578847646713257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085751, + "balance_loss_mlp": 1.04746079, + "epoch": 0.4665255867641401, + "flos": 512353092096.0, + "grad_norm": 0.059152702804089866, + "language_loss": 0.89097798, + "learning_rate": 0.0005781864616975878, + "loss": 0.90183544, + "num_input_tokens_seen": 202391552, + "router_z_loss_mlp": 0.38256836, + "step": 2425, + "time_per_iteration": 2.6408798694610596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085338, + "balance_loss_mlp": 1.04585552, + "epoch": 0.4667179684494036, + "flos": 424590018048.0, + "grad_norm": 0.07480086545967683, + "language_loss": 0.84123272, + "learning_rate": 0.0005778787367969502, + "loss": 0.85208613, + "num_input_tokens_seen": 202457328, + "router_z_loss_mlp": 0.39477539, + "step": 2426, + "time_per_iteration": 2.5963637828826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077247, + "balance_loss_mlp": 1.03910041, + "epoch": 0.4669103501346672, + "flos": 707640897024.0, + "grad_norm": 0.07167303988395164, + "language_loss": 0.80844486, + "learning_rate": 0.0005775709816611053, + "loss": 0.81921738, + "num_input_tokens_seen": 202535888, + "router_z_loss_mlp": 0.38134766, + "step": 2427, + "time_per_iteration": 2.971285581588745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079771, + "balance_loss_mlp": 1.04138589, + "epoch": 0.4671027318199307, + "flos": 554554874880.0, + "grad_norm": 0.05405801443852106, + "language_loss": 0.83748919, + "learning_rate": 0.0005772631964095346, + "loss": 0.84828693, + "num_input_tokens_seen": 202608400, + "router_z_loss_mlp": 0.38354492, + "step": 2428, + "time_per_iteration": 2.709364175796509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080886, + "balance_loss_mlp": 1.04271483, + "epoch": 0.4672951135051943, + "flos": 566839309824.0, + "grad_norm": 0.060777782070244445, + "language_loss": 0.8565498, + "learning_rate": 0.000576955381161731, + "loss": 0.86735862, + "num_input_tokens_seen": 202677712, + "router_z_loss_mlp": 0.38183594, + "step": 2429, + "time_per_iteration": 2.708270311355591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083121, + "balance_loss_mlp": 1.04452121, + "epoch": 0.46748749519045785, + "flos": 424294654464.0, + "grad_norm": 0.05633631430335825, + "language_loss": 0.85906339, + "learning_rate": 0.0005766475360371985, + "loss": 0.86989462, + "num_input_tokens_seen": 202743824, + "router_z_loss_mlp": 0.38574219, + "step": 2430, + "time_per_iteration": 2.617856740951538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089079, + "balance_loss_mlp": 1.05055118, + "epoch": 0.46767987687572143, + "flos": 538088859648.0, + "grad_norm": 0.05568735360450276, + "language_loss": 0.84486759, + "learning_rate": 0.0005763396611554536, + "loss": 0.85575831, + "num_input_tokens_seen": 202813072, + "router_z_loss_mlp": 0.38476562, + "step": 2431, + "time_per_iteration": 2.6460912227630615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093376, + "balance_loss_mlp": 1.0557059, + "epoch": 0.467872258560985, + "flos": 823360052736.0, + "grad_norm": 0.05823580457003032, + "language_loss": 0.80262822, + "learning_rate": 0.0005760317566360237, + "loss": 0.81356204, + "num_input_tokens_seen": 202886576, + "router_z_loss_mlp": 0.37646484, + "step": 2432, + "time_per_iteration": 3.010744094848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104411, + "balance_loss_mlp": 1.066836, + "epoch": 0.46806464024624855, + "flos": 661366632960.0, + "grad_norm": 0.07453415962543286, + "language_loss": 0.85120392, + "learning_rate": 0.000575723822598448, + "loss": 0.86224806, + "num_input_tokens_seen": 202956736, + "router_z_loss_mlp": 0.37573242, + "step": 2433, + "time_per_iteration": 2.7999444007873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100188, + "balance_loss_mlp": 1.06232667, + "epoch": 0.46825702193151214, + "flos": 755362865664.0, + "grad_norm": 0.08922556949000433, + "language_loss": 0.81824166, + "learning_rate": 0.0005754158591622773, + "loss": 0.82924354, + "num_input_tokens_seen": 203036432, + "router_z_loss_mlp": 0.37866211, + "step": 2434, + "time_per_iteration": 3.016101837158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089201, + "balance_loss_mlp": 1.05250812, + "epoch": 0.4684494036167757, + "flos": 439164578304.0, + "grad_norm": 0.06367410837717138, + "language_loss": 0.82359827, + "learning_rate": 0.0005751078664470732, + "loss": 0.8344903, + "num_input_tokens_seen": 203101904, + "router_z_loss_mlp": 0.36694336, + "step": 2435, + "time_per_iteration": 2.5870590209960938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095131, + "balance_loss_mlp": 1.05762815, + "epoch": 0.46864178530203926, + "flos": 532446428160.0, + "grad_norm": 0.059213993455869605, + "language_loss": 0.85874772, + "learning_rate": 0.0005747998445724094, + "loss": 0.86969906, + "num_input_tokens_seen": 203170272, + "router_z_loss_mlp": 0.375, + "step": 2436, + "time_per_iteration": 2.606999397277832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088052, + "balance_loss_mlp": 1.05135953, + "epoch": 0.4688341669873028, + "flos": 576328670208.0, + "grad_norm": 0.05282393784178956, + "language_loss": 0.89627349, + "learning_rate": 0.0005744917936578707, + "loss": 0.90715402, + "num_input_tokens_seen": 203243920, + "router_z_loss_mlp": 0.3671875, + "step": 2437, + "time_per_iteration": 2.7902729511260986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075719, + "balance_loss_mlp": 1.03978968, + "epoch": 0.4690265486725664, + "flos": 539296455168.0, + "grad_norm": 0.04430533887369339, + "language_loss": 0.84245884, + "learning_rate": 0.0005741837138230526, + "loss": 0.85321605, + "num_input_tokens_seen": 203321760, + "router_z_loss_mlp": 0.35913086, + "step": 2438, + "time_per_iteration": 2.726710319519043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082309, + "balance_loss_mlp": 1.04580677, + "epoch": 0.4692189303578299, + "flos": 770168770560.0, + "grad_norm": 0.06182369714878754, + "language_loss": 0.86213875, + "learning_rate": 0.0005738756051875627, + "loss": 0.87296176, + "num_input_tokens_seen": 203409088, + "router_z_loss_mlp": 0.36547852, + "step": 2439, + "time_per_iteration": 3.07755708694458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077719, + "balance_loss_mlp": 1.04178953, + "epoch": 0.4694113120430935, + "flos": 571118404608.0, + "grad_norm": 0.047772699497207846, + "language_loss": 0.82990217, + "learning_rate": 0.0005735674678710192, + "loss": 0.84067929, + "num_input_tokens_seen": 203481680, + "router_z_loss_mlp": 0.359375, + "step": 2440, + "time_per_iteration": 2.6625607013702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080403, + "balance_loss_mlp": 1.04423499, + "epoch": 0.4696036937283571, + "flos": 748498281984.0, + "grad_norm": 0.07690297936976162, + "language_loss": 0.81414962, + "learning_rate": 0.0005732593019930517, + "loss": 0.82495368, + "num_input_tokens_seen": 203554848, + "router_z_loss_mlp": 0.36181641, + "step": 2441, + "time_per_iteration": 2.918219566345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082294, + "balance_loss_mlp": 1.04669785, + "epoch": 0.4697960754136206, + "flos": 493208455680.0, + "grad_norm": 0.061105529929901724, + "language_loss": 0.87989414, + "learning_rate": 0.0005729511076733008, + "loss": 0.89071703, + "num_input_tokens_seen": 203624816, + "router_z_loss_mlp": 0.35620117, + "step": 2442, + "time_per_iteration": 2.6301560401916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085737, + "balance_loss_mlp": 1.04909194, + "epoch": 0.4699884570988842, + "flos": 724809710592.0, + "grad_norm": 0.0773152930313349, + "language_loss": 0.84905529, + "learning_rate": 0.000572642885031418, + "loss": 0.85991269, + "num_input_tokens_seen": 203698256, + "router_z_loss_mlp": 0.36645508, + "step": 2443, + "time_per_iteration": 2.8638129234313965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081528, + "balance_loss_mlp": 1.04619479, + "epoch": 0.47018083878414774, + "flos": 555141219840.0, + "grad_norm": 0.0470926044275737, + "language_loss": 0.80651355, + "learning_rate": 0.0005723346341870662, + "loss": 0.81732887, + "num_input_tokens_seen": 203772672, + "router_z_loss_mlp": 0.35351562, + "step": 2444, + "time_per_iteration": 2.7571544647216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093224, + "balance_loss_mlp": 1.05767596, + "epoch": 0.4703732204694113, + "flos": 423846521856.0, + "grad_norm": 0.060426187781859556, + "language_loss": 0.8612802, + "learning_rate": 0.0005720263552599188, + "loss": 0.87221241, + "num_input_tokens_seen": 203835904, + "router_z_loss_mlp": 0.35595703, + "step": 2445, + "time_per_iteration": 2.457702398300171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087133, + "balance_loss_mlp": 1.05003476, + "epoch": 0.47056560215467486, + "flos": 703179919872.0, + "grad_norm": 0.05103700331104036, + "language_loss": 0.79627156, + "learning_rate": 0.0005717180483696604, + "loss": 0.80714285, + "num_input_tokens_seen": 203914704, + "router_z_loss_mlp": 0.37084961, + "step": 2446, + "time_per_iteration": 2.851597785949707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096579, + "balance_loss_mlp": 1.05981517, + "epoch": 0.47075798383993844, + "flos": 554701851648.0, + "grad_norm": 0.05942499594418206, + "language_loss": 0.82931131, + "learning_rate": 0.0005714097136359862, + "loss": 0.84027708, + "num_input_tokens_seen": 203985072, + "router_z_loss_mlp": 0.36791992, + "step": 2447, + "time_per_iteration": 2.6262872219085693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088323, + "balance_loss_mlp": 1.05203617, + "epoch": 0.470950365525202, + "flos": 564009329664.0, + "grad_norm": 0.04849265524269106, + "language_loss": 0.86289024, + "learning_rate": 0.0005711013511786027, + "loss": 0.87377352, + "num_input_tokens_seen": 204061904, + "router_z_loss_mlp": 0.36303711, + "step": 2448, + "time_per_iteration": 2.7698192596435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087117, + "balance_loss_mlp": 1.05066276, + "epoch": 0.47114274721046556, + "flos": 534189496320.0, + "grad_norm": 0.0564117191668664, + "language_loss": 0.83740294, + "learning_rate": 0.0005707929611172263, + "loss": 0.84827411, + "num_input_tokens_seen": 204137392, + "router_z_loss_mlp": 0.36450195, + "step": 2449, + "time_per_iteration": 2.679288864135742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091785, + "balance_loss_mlp": 1.0557121, + "epoch": 0.47133512889572915, + "flos": 472877982720.0, + "grad_norm": 0.05809255973733416, + "language_loss": 0.83857393, + "learning_rate": 0.000570484543571585, + "loss": 0.84949178, + "num_input_tokens_seen": 204202752, + "router_z_loss_mlp": 0.3605957, + "step": 2450, + "time_per_iteration": 2.53946852684021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085264, + "balance_loss_mlp": 1.04914355, + "epoch": 0.4715275105809927, + "flos": 458776286208.0, + "grad_norm": 0.05957003441240347, + "language_loss": 0.83003706, + "learning_rate": 0.0005701760986614171, + "loss": 0.84088969, + "num_input_tokens_seen": 204266960, + "router_z_loss_mlp": 0.36132812, + "step": 2451, + "time_per_iteration": 2.578679323196411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108432, + "balance_loss_mlp": 1.04784179, + "epoch": 0.47171989226625627, + "flos": 421783358976.0, + "grad_norm": 0.04971859173266034, + "language_loss": 0.86998093, + "learning_rate": 0.0005698676265064714, + "loss": 0.88082415, + "num_input_tokens_seen": 204331216, + "router_z_loss_mlp": 0.36499023, + "step": 2452, + "time_per_iteration": 2.5178701877593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108693, + "balance_loss_mlp": 1.04887831, + "epoch": 0.4719122739515198, + "flos": 457200543744.0, + "grad_norm": 0.06455625952921856, + "language_loss": 0.89101571, + "learning_rate": 0.0005695591272265074, + "loss": 0.90188503, + "num_input_tokens_seen": 204397216, + "router_z_loss_mlp": 0.38037109, + "step": 2453, + "time_per_iteration": 2.527940511703491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094235, + "balance_loss_mlp": 1.05601645, + "epoch": 0.4721046556367834, + "flos": 514716000768.0, + "grad_norm": 0.05921175255811472, + "language_loss": 0.81955969, + "learning_rate": 0.0005692506009412954, + "loss": 0.83050203, + "num_input_tokens_seen": 204469952, + "router_z_loss_mlp": 0.3815918, + "step": 2454, + "time_per_iteration": 2.6692135334014893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152126, + "balance_loss_mlp": 1.13209891, + "epoch": 0.4722970373220469, + "flos": 1571399723520.0, + "grad_norm": 0.04281653423243919, + "language_loss": 0.7755127, + "learning_rate": 0.0005689420477706156, + "loss": 0.78703392, + "num_input_tokens_seen": 204701152, + "router_z_loss_mlp": 0.20019531, + "step": 2455, + "time_per_iteration": 4.940452337265015 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085506, + "balance_loss_mlp": 1.04731131, + "epoch": 0.4724894190073105, + "flos": 585919927296.0, + "grad_norm": 0.06574328103666784, + "language_loss": 0.89537692, + "learning_rate": 0.0005686334678342593, + "loss": 0.906232, + "num_input_tokens_seen": 204778144, + "router_z_loss_mlp": 0.38183594, + "step": 2456, + "time_per_iteration": 2.8626763820648193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085091, + "balance_loss_mlp": 1.04816043, + "epoch": 0.4726818006925741, + "flos": 867290347008.0, + "grad_norm": 0.053689359601525224, + "language_loss": 0.81760311, + "learning_rate": 0.0005683248612520274, + "loss": 0.82845408, + "num_input_tokens_seen": 204853376, + "router_z_loss_mlp": 0.36914062, + "step": 2457, + "time_per_iteration": 3.062195301055908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079889, + "balance_loss_mlp": 1.04300618, + "epoch": 0.4728741823778376, + "flos": 752653721088.0, + "grad_norm": 0.06424431420602757, + "language_loss": 0.83881927, + "learning_rate": 0.0005680162281437321, + "loss": 0.84961808, + "num_input_tokens_seen": 204925280, + "router_z_loss_mlp": 0.36865234, + "step": 2458, + "time_per_iteration": 4.24756932258606 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081915, + "balance_loss_mlp": 1.04474509, + "epoch": 0.4730665640631012, + "flos": 538301265408.0, + "grad_norm": 0.04398827684533395, + "language_loss": 0.84583557, + "learning_rate": 0.000567707568629195, + "loss": 0.8566547, + "num_input_tokens_seen": 205000592, + "router_z_loss_mlp": 0.37158203, + "step": 2459, + "time_per_iteration": 2.678410530090332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077333, + "balance_loss_mlp": 1.04104519, + "epoch": 0.47325894574836475, + "flos": 491396986368.0, + "grad_norm": 0.04729381274413396, + "language_loss": 0.82117784, + "learning_rate": 0.0005673988828282486, + "loss": 0.83195114, + "num_input_tokens_seen": 205073968, + "router_z_loss_mlp": 0.36303711, + "step": 2460, + "time_per_iteration": 2.6379287242889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080146, + "balance_loss_mlp": 1.04397774, + "epoch": 0.47345132743362833, + "flos": 764117494272.0, + "grad_norm": 0.048508898725252214, + "language_loss": 0.80703068, + "learning_rate": 0.0005670901708607352, + "loss": 0.81783217, + "num_input_tokens_seen": 205153536, + "router_z_loss_mlp": 0.36206055, + "step": 2461, + "time_per_iteration": 2.9682881832122803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079185, + "balance_loss_mlp": 1.04366088, + "epoch": 0.47364370911889186, + "flos": 539925060096.0, + "grad_norm": 0.06522156043574484, + "language_loss": 0.84211236, + "learning_rate": 0.0005667814328465076, + "loss": 0.8529042, + "num_input_tokens_seen": 205220944, + "router_z_loss_mlp": 0.35546875, + "step": 2462, + "time_per_iteration": 2.6927719116210938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074953, + "balance_loss_mlp": 1.04031122, + "epoch": 0.47383609080415545, + "flos": 406002613248.0, + "grad_norm": 0.06749328280555515, + "language_loss": 0.81615329, + "learning_rate": 0.0005664726689054285, + "loss": 0.82690287, + "num_input_tokens_seen": 205282688, + "router_z_loss_mlp": 0.34692383, + "step": 2463, + "time_per_iteration": 2.4384853839874268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078599, + "balance_loss_mlp": 1.04345584, + "epoch": 0.474028472489419, + "flos": 453237161472.0, + "grad_norm": 0.0467114590315811, + "language_loss": 0.81182402, + "learning_rate": 0.0005661638791573704, + "loss": 0.82261002, + "num_input_tokens_seen": 205357360, + "router_z_loss_mlp": 0.35180664, + "step": 2464, + "time_per_iteration": 2.695479154586792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108113, + "balance_loss_mlp": 1.04582047, + "epoch": 0.47422085417468257, + "flos": 491923694592.0, + "grad_norm": 0.04732653708909472, + "language_loss": 0.86637986, + "learning_rate": 0.0005658550637222164, + "loss": 0.87719119, + "num_input_tokens_seen": 205424352, + "router_z_loss_mlp": 0.35327148, + "step": 2465, + "time_per_iteration": 2.6167092323303223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079374, + "balance_loss_mlp": 1.04365873, + "epoch": 0.47441323585994616, + "flos": 738537467904.0, + "grad_norm": 0.057300064889236176, + "language_loss": 0.82372761, + "learning_rate": 0.0005655462227198592, + "loss": 0.83452135, + "num_input_tokens_seen": 205502912, + "router_z_loss_mlp": 0.35742188, + "step": 2466, + "time_per_iteration": 2.9023492336273193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080505, + "balance_loss_mlp": 1.04509962, + "epoch": 0.4746056175452097, + "flos": 484439270400.0, + "grad_norm": 0.05227273448390526, + "language_loss": 0.83720088, + "learning_rate": 0.0005652373562702016, + "loss": 0.84800589, + "num_input_tokens_seen": 205571168, + "router_z_loss_mlp": 0.35449219, + "step": 2467, + "time_per_iteration": 2.5808918476104736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082419, + "balance_loss_mlp": 1.04715681, + "epoch": 0.4747979992304733, + "flos": 460814717952.0, + "grad_norm": 0.05382206625072039, + "language_loss": 0.88037241, + "learning_rate": 0.000564928464493156, + "loss": 0.89119661, + "num_input_tokens_seen": 205639648, + "router_z_loss_mlp": 0.35302734, + "step": 2468, + "time_per_iteration": 2.5377156734466553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087106, + "balance_loss_mlp": 1.05198669, + "epoch": 0.4749903809157368, + "flos": 864070460928.0, + "grad_norm": 0.0577962749951369, + "language_loss": 0.81768191, + "learning_rate": 0.000564619547508645, + "loss": 0.82855296, + "num_input_tokens_seen": 205721536, + "router_z_loss_mlp": 0.3515625, + "step": 2469, + "time_per_iteration": 3.043691396713257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086191, + "balance_loss_mlp": 1.05042827, + "epoch": 0.4751827626010004, + "flos": 505296451584.0, + "grad_norm": 0.1751373121791138, + "language_loss": 0.83049238, + "learning_rate": 0.0005643106054366008, + "loss": 0.84135431, + "num_input_tokens_seen": 205788512, + "router_z_loss_mlp": 0.3581543, + "step": 2470, + "time_per_iteration": 2.6487743854522705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085754, + "balance_loss_mlp": 1.05118382, + "epoch": 0.47537514428626393, + "flos": 559123540992.0, + "grad_norm": 0.05689297252919276, + "language_loss": 0.79414684, + "learning_rate": 0.000564001638396965, + "loss": 0.80500442, + "num_input_tokens_seen": 205863104, + "router_z_loss_mlp": 0.34594727, + "step": 2471, + "time_per_iteration": 2.749767780303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087834, + "balance_loss_mlp": 1.05228639, + "epoch": 0.4755675259715275, + "flos": 833907211776.0, + "grad_norm": 0.05462179859190678, + "language_loss": 0.81897652, + "learning_rate": 0.0005636926465096897, + "loss": 0.82985491, + "num_input_tokens_seen": 205940688, + "router_z_loss_mlp": 0.35546875, + "step": 2472, + "time_per_iteration": 3.043703556060791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091887, + "balance_loss_mlp": 1.05569541, + "epoch": 0.47575990765679105, + "flos": 507989629440.0, + "grad_norm": 0.050841736172577985, + "language_loss": 0.87258822, + "learning_rate": 0.0005633836298947363, + "loss": 0.88350713, + "num_input_tokens_seen": 206008352, + "router_z_loss_mlp": 0.36206055, + "step": 2473, + "time_per_iteration": 2.564831018447876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098688, + "balance_loss_mlp": 1.06206715, + "epoch": 0.47595228934205464, + "flos": 591566740992.0, + "grad_norm": 0.05674114123782856, + "language_loss": 0.70767033, + "learning_rate": 0.000563074588672075, + "loss": 0.7186572, + "num_input_tokens_seen": 206078240, + "router_z_loss_mlp": 0.3659668, + "step": 2474, + "time_per_iteration": 2.6735401153564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095847, + "balance_loss_mlp": 1.05960727, + "epoch": 0.4761446710273182, + "flos": 580340104704.0, + "grad_norm": 0.055780063244739476, + "language_loss": 0.84891874, + "learning_rate": 0.0005627655229616868, + "loss": 0.85987723, + "num_input_tokens_seen": 206148896, + "router_z_loss_mlp": 0.36230469, + "step": 2475, + "time_per_iteration": 2.672621488571167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096569, + "balance_loss_mlp": 1.05899405, + "epoch": 0.47633705271258175, + "flos": 672597651456.0, + "grad_norm": 0.05102987049441457, + "language_loss": 0.90229654, + "learning_rate": 0.0005624564328835616, + "loss": 0.91326219, + "num_input_tokens_seen": 206223792, + "router_z_loss_mlp": 0.37524414, + "step": 2476, + "time_per_iteration": 2.8432443141937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102069, + "balance_loss_mlp": 1.0635407, + "epoch": 0.47652943439784534, + "flos": 541580788224.0, + "grad_norm": 0.0471064217807047, + "language_loss": 0.84254396, + "learning_rate": 0.0005621473185576986, + "loss": 0.85356462, + "num_input_tokens_seen": 206299376, + "router_z_loss_mlp": 0.38525391, + "step": 2477, + "time_per_iteration": 2.702977180480957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095957, + "balance_loss_mlp": 1.05826259, + "epoch": 0.4767218160831089, + "flos": 524563333632.0, + "grad_norm": 0.057656530584244435, + "language_loss": 0.87137967, + "learning_rate": 0.0005618381801041068, + "loss": 0.88233924, + "num_input_tokens_seen": 206367936, + "router_z_loss_mlp": 0.37670898, + "step": 2478, + "time_per_iteration": 2.603593111038208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098137, + "balance_loss_mlp": 1.05953729, + "epoch": 0.47691419776837246, + "flos": 567789419520.0, + "grad_norm": 0.11168904607405869, + "language_loss": 0.82855433, + "learning_rate": 0.0005615290176428044, + "loss": 0.83953571, + "num_input_tokens_seen": 206438864, + "router_z_loss_mlp": 0.38574219, + "step": 2479, + "time_per_iteration": 2.6339292526245117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109593, + "balance_loss_mlp": 1.05959523, + "epoch": 0.477106579453636, + "flos": 530659689984.0, + "grad_norm": 0.06204032147038535, + "language_loss": 0.85517442, + "learning_rate": 0.0005612198312938187, + "loss": 0.86613369, + "num_input_tokens_seen": 206516656, + "router_z_loss_mlp": 0.36328125, + "step": 2480, + "time_per_iteration": 2.727931261062622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096505, + "balance_loss_mlp": 1.05912077, + "epoch": 0.4772989611388996, + "flos": 593980521984.0, + "grad_norm": 0.07113059060466843, + "language_loss": 0.79093325, + "learning_rate": 0.0005609106211771868, + "loss": 0.80189824, + "num_input_tokens_seen": 206595040, + "router_z_loss_mlp": 0.37402344, + "step": 2481, + "time_per_iteration": 2.8239502906799316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091646, + "balance_loss_mlp": 1.05471444, + "epoch": 0.4774913428241631, + "flos": 544352541696.0, + "grad_norm": 0.07337307686737661, + "language_loss": 0.89208174, + "learning_rate": 0.0005606013874129543, + "loss": 0.90299821, + "num_input_tokens_seen": 206670192, + "router_z_loss_mlp": 0.36914062, + "step": 2482, + "time_per_iteration": 2.7480216026306152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088167, + "balance_loss_mlp": 1.05187941, + "epoch": 0.4776837245094267, + "flos": 539817371136.0, + "grad_norm": 0.16520730257770824, + "language_loss": 0.80029452, + "learning_rate": 0.0005602921301211768, + "loss": 0.81117618, + "num_input_tokens_seen": 206746992, + "router_z_loss_mlp": 0.36303711, + "step": 2483, + "time_per_iteration": 2.6802146434783936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096429, + "balance_loss_mlp": 1.06021321, + "epoch": 0.4778761061946903, + "flos": 471543759360.0, + "grad_norm": 0.07816325562851568, + "language_loss": 0.81835008, + "learning_rate": 0.0005599828494219185, + "loss": 0.82931435, + "num_input_tokens_seen": 206813584, + "router_z_loss_mlp": 0.36206055, + "step": 2484, + "time_per_iteration": 2.546365976333618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094667, + "balance_loss_mlp": 1.05923831, + "epoch": 0.4780684878799538, + "flos": 725769994752.0, + "grad_norm": 0.05627448694129284, + "language_loss": 0.88551247, + "learning_rate": 0.0005596735454352527, + "loss": 0.89645922, + "num_input_tokens_seen": 206885840, + "router_z_loss_mlp": 0.35498047, + "step": 2485, + "time_per_iteration": 2.862647771835327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106974, + "balance_loss_mlp": 1.07054353, + "epoch": 0.4782608695652174, + "flos": 548665132032.0, + "grad_norm": 0.07015146645765026, + "language_loss": 0.85657477, + "learning_rate": 0.0005593642182812619, + "loss": 0.86764455, + "num_input_tokens_seen": 206955104, + "router_z_loss_mlp": 0.36425781, + "step": 2486, + "time_per_iteration": 2.609184741973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101968, + "balance_loss_mlp": 1.06558526, + "epoch": 0.47845325125048094, + "flos": 829555333632.0, + "grad_norm": 0.061922125379274766, + "language_loss": 0.83543551, + "learning_rate": 0.0005590548680800378, + "loss": 0.84645522, + "num_input_tokens_seen": 207039792, + "router_z_loss_mlp": 0.36401367, + "step": 2487, + "time_per_iteration": 3.089769124984741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110265, + "balance_loss_mlp": 1.0746448, + "epoch": 0.4786456329357445, + "flos": 513889546752.0, + "grad_norm": 0.2189409026834594, + "language_loss": 0.76099992, + "learning_rate": 0.0005587454949516804, + "loss": 0.77210259, + "num_input_tokens_seen": 207115632, + "router_z_loss_mlp": 0.35644531, + "step": 2488, + "time_per_iteration": 2.751055955886841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110821, + "balance_loss_mlp": 1.07187533, + "epoch": 0.47883801462100806, + "flos": 564392033280.0, + "grad_norm": 0.10409544878795325, + "language_loss": 0.87659556, + "learning_rate": 0.0005584360990162993, + "loss": 0.88767767, + "num_input_tokens_seen": 207184336, + "router_z_loss_mlp": 0.36376953, + "step": 2489, + "time_per_iteration": 2.6652133464813232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113142, + "balance_loss_mlp": 1.07563877, + "epoch": 0.47903039630627164, + "flos": 579296862720.0, + "grad_norm": 0.09667813376582209, + "language_loss": 0.8484993, + "learning_rate": 0.0005581266803940124, + "loss": 0.8596307, + "num_input_tokens_seen": 207258720, + "router_z_loss_mlp": 0.375, + "step": 2490, + "time_per_iteration": 2.736374616622925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119685, + "balance_loss_mlp": 1.08206201, + "epoch": 0.47922277799153523, + "flos": 618667255296.0, + "grad_norm": 0.050098276566308, + "language_loss": 0.87162292, + "learning_rate": 0.0005578172392049471, + "loss": 0.88281971, + "num_input_tokens_seen": 207329216, + "router_z_loss_mlp": 0.37573242, + "step": 2491, + "time_per_iteration": 2.7753453254699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011097, + "balance_loss_mlp": 1.07307923, + "epoch": 0.47941515967679876, + "flos": 639352728576.0, + "grad_norm": 0.06461059150776577, + "language_loss": 0.83998954, + "learning_rate": 0.0005575077755692386, + "loss": 0.85108656, + "num_input_tokens_seen": 207403712, + "router_z_loss_mlp": 0.3659668, + "step": 2492, + "time_per_iteration": 2.788609266281128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113104, + "balance_loss_mlp": 1.07595801, + "epoch": 0.47960754136206235, + "flos": 519561091584.0, + "grad_norm": 0.0557937811773086, + "language_loss": 0.86232179, + "learning_rate": 0.0005571982896070316, + "loss": 0.87345278, + "num_input_tokens_seen": 207477120, + "router_z_loss_mlp": 0.37158203, + "step": 2493, + "time_per_iteration": 2.6394574642181396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107707, + "balance_loss_mlp": 1.07111025, + "epoch": 0.4797999230473259, + "flos": 474798551040.0, + "grad_norm": 0.0598408121702559, + "language_loss": 0.90174985, + "learning_rate": 0.0005568887814384792, + "loss": 0.9128269, + "num_input_tokens_seen": 207544592, + "router_z_loss_mlp": 0.3659668, + "step": 2494, + "time_per_iteration": 2.534224033355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111594, + "balance_loss_mlp": 1.0754025, + "epoch": 0.47999230473258947, + "flos": 531766950912.0, + "grad_norm": 0.07246176028888049, + "language_loss": 0.87038457, + "learning_rate": 0.000556579251183743, + "loss": 0.88150048, + "num_input_tokens_seen": 207613808, + "router_z_loss_mlp": 0.36230469, + "step": 2495, + "time_per_iteration": 2.6398251056671143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094859, + "balance_loss_mlp": 1.05802298, + "epoch": 0.480184686417853, + "flos": 601207460352.0, + "grad_norm": 0.06271692106547645, + "language_loss": 0.79938626, + "learning_rate": 0.0005562696989629936, + "loss": 0.8103348, + "num_input_tokens_seen": 207684464, + "router_z_loss_mlp": 0.3684082, + "step": 2496, + "time_per_iteration": 2.6642816066741943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093996, + "balance_loss_mlp": 1.05766106, + "epoch": 0.4803770681031166, + "flos": 527931606528.0, + "grad_norm": 0.05594777531112506, + "language_loss": 0.82110333, + "learning_rate": 0.0005559601248964095, + "loss": 0.83204329, + "num_input_tokens_seen": 207754016, + "router_z_loss_mlp": 0.36352539, + "step": 2497, + "time_per_iteration": 2.636070966720581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093639, + "balance_loss_mlp": 1.05739903, + "epoch": 0.4805694497883801, + "flos": 510934500864.0, + "grad_norm": 0.054324508936697755, + "language_loss": 0.85873795, + "learning_rate": 0.0005556505291041783, + "loss": 0.86967432, + "num_input_tokens_seen": 207827104, + "router_z_loss_mlp": 0.36254883, + "step": 2498, + "time_per_iteration": 2.7246336936950684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094125, + "balance_loss_mlp": 1.05757546, + "epoch": 0.4807618314736437, + "flos": 600027416064.0, + "grad_norm": 0.37566577491106196, + "language_loss": 0.84318507, + "learning_rate": 0.0005553409117064954, + "loss": 0.85412627, + "num_input_tokens_seen": 207907824, + "router_z_loss_mlp": 0.36547852, + "step": 2499, + "time_per_iteration": 2.8535146713256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105189, + "balance_loss_mlp": 1.06770992, + "epoch": 0.4809542131589073, + "flos": 568700241408.0, + "grad_norm": 0.05235544022747109, + "language_loss": 0.84675509, + "learning_rate": 0.0005550312728235654, + "loss": 0.85780698, + "num_input_tokens_seen": 207975632, + "router_z_loss_mlp": 0.37475586, + "step": 2500, + "time_per_iteration": 2.691314697265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118964, + "balance_loss_mlp": 1.08138871, + "epoch": 0.4811465948441708, + "flos": 575703037440.0, + "grad_norm": 0.0667425977867665, + "language_loss": 0.83709896, + "learning_rate": 0.0005547216125756003, + "loss": 0.84828854, + "num_input_tokens_seen": 208048000, + "router_z_loss_mlp": 0.37573242, + "step": 2501, + "time_per_iteration": 2.7381327152252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126097, + "balance_loss_mlp": 1.08754468, + "epoch": 0.4813389765294344, + "flos": 823508439552.0, + "grad_norm": 0.052606522983796165, + "language_loss": 0.82174253, + "learning_rate": 0.0005544119310828211, + "loss": 0.83300352, + "num_input_tokens_seen": 208132592, + "router_z_loss_mlp": 0.38549805, + "step": 2502, + "time_per_iteration": 3.072216272354126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134856, + "balance_loss_mlp": 1.09632754, + "epoch": 0.48153135821469795, + "flos": 635240959488.0, + "grad_norm": 0.048230358167368766, + "language_loss": 0.84706873, + "learning_rate": 0.0005541022284654568, + "loss": 0.85841727, + "num_input_tokens_seen": 208215824, + "router_z_loss_mlp": 0.38525391, + "step": 2503, + "time_per_iteration": 2.916139602661133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128897, + "balance_loss_mlp": 1.09051132, + "epoch": 0.48172373989996153, + "flos": 503450076672.0, + "grad_norm": 0.07897645884633452, + "language_loss": 0.84086657, + "learning_rate": 0.0005537925048437446, + "loss": 0.85215557, + "num_input_tokens_seen": 208284304, + "router_z_loss_mlp": 0.38354492, + "step": 2504, + "time_per_iteration": 2.5921871662139893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110906, + "balance_loss_mlp": 1.09278584, + "epoch": 0.48191612158522507, + "flos": 1531563821568.0, + "grad_norm": 0.0372588251023387, + "language_loss": 0.75751472, + "learning_rate": 0.00055348276033793, + "loss": 0.76862371, + "num_input_tokens_seen": 208510224, + "router_z_loss_mlp": 0.18164062, + "step": 2505, + "time_per_iteration": 4.9559855461120605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141303, + "balance_loss_mlp": 1.10132027, + "epoch": 0.48210850327048865, + "flos": 702078451200.0, + "grad_norm": 0.058816464552035166, + "language_loss": 0.88463128, + "learning_rate": 0.0005531729950682664, + "loss": 0.89604431, + "num_input_tokens_seen": 208596816, + "router_z_loss_mlp": 0.3996582, + "step": 2506, + "time_per_iteration": 3.0114240646362305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132181, + "balance_loss_mlp": 1.09353316, + "epoch": 0.4823008849557522, + "flos": 439548691968.0, + "grad_norm": 0.06626147096234755, + "language_loss": 0.84781104, + "learning_rate": 0.000552863209155015, + "loss": 0.85913289, + "num_input_tokens_seen": 208659616, + "router_z_loss_mlp": 0.38598633, + "step": 2507, + "time_per_iteration": 2.5784101486206055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113394, + "balance_loss_mlp": 1.09390914, + "epoch": 0.48249326664101577, + "flos": 471622334976.0, + "grad_norm": 0.05712589242287889, + "language_loss": 0.82110274, + "learning_rate": 0.0005525534027184461, + "loss": 0.83244216, + "num_input_tokens_seen": 208728080, + "router_z_loss_mlp": 0.40014648, + "step": 2508, + "time_per_iteration": 2.552065372467041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132678, + "balance_loss_mlp": 1.09395885, + "epoch": 0.48268564832627936, + "flos": 562954503168.0, + "grad_norm": 0.04979156125943264, + "language_loss": 0.82958996, + "learning_rate": 0.0005522435758788365, + "loss": 0.84091675, + "num_input_tokens_seen": 208803376, + "router_z_loss_mlp": 0.38696289, + "step": 2509, + "time_per_iteration": 2.727841854095459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121662, + "balance_loss_mlp": 1.08210802, + "epoch": 0.4828780300115429, + "flos": 629298782208.0, + "grad_norm": 0.054057791094232886, + "language_loss": 0.79695261, + "learning_rate": 0.0005519337287564721, + "loss": 0.80816925, + "num_input_tokens_seen": 208876656, + "router_z_loss_mlp": 0.39526367, + "step": 2510, + "time_per_iteration": 2.841032028198242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111392, + "balance_loss_mlp": 1.07582068, + "epoch": 0.4830704116968065, + "flos": 631562766336.0, + "grad_norm": 0.0770242195625866, + "language_loss": 0.83640802, + "learning_rate": 0.000551623861471646, + "loss": 0.84754717, + "num_input_tokens_seen": 208950224, + "router_z_loss_mlp": 0.38061523, + "step": 2511, + "time_per_iteration": 2.7330808639526367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051582, + "balance_loss_mlp": 1.03489304, + "epoch": 0.48326279338207, + "flos": 1568434503168.0, + "grad_norm": 0.02207943535017646, + "language_loss": 0.78818834, + "learning_rate": 0.0005513139741446594, + "loss": 0.79870415, + "num_input_tokens_seen": 209173984, + "router_z_loss_mlp": 0.16699219, + "step": 2512, + "time_per_iteration": 4.847305536270142 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119635, + "balance_loss_mlp": 1.08015239, + "epoch": 0.4834551750673336, + "flos": 508989201408.0, + "grad_norm": 0.07604353740704149, + "language_loss": 0.86230296, + "learning_rate": 0.0005510040668958211, + "loss": 0.87349927, + "num_input_tokens_seen": 209242832, + "router_z_loss_mlp": 0.39453125, + "step": 2513, + "time_per_iteration": 2.6358695030212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039681, + "balance_loss_mlp": 1.02423155, + "epoch": 0.48364755675259713, + "flos": 1527875453952.0, + "grad_norm": 0.016719139942629795, + "language_loss": 0.77760583, + "learning_rate": 0.0005506941398454483, + "loss": 0.78800267, + "num_input_tokens_seen": 209473520, + "router_z_loss_mlp": 0.15429688, + "step": 2514, + "time_per_iteration": 4.8266448974609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108341, + "balance_loss_mlp": 1.06895423, + "epoch": 0.4838399384378607, + "flos": 564726684672.0, + "grad_norm": 0.05692617769518991, + "language_loss": 0.8306818, + "learning_rate": 0.0005503841931138645, + "loss": 0.84176517, + "num_input_tokens_seen": 209544208, + "router_z_loss_mlp": 0.39355469, + "step": 2515, + "time_per_iteration": 4.18599271774292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108623, + "balance_loss_mlp": 1.07073843, + "epoch": 0.4840323201231243, + "flos": 387479227392.0, + "grad_norm": 0.0681425082817114, + "language_loss": 0.81703341, + "learning_rate": 0.0005500742268214025, + "loss": 0.82811964, + "num_input_tokens_seen": 209607408, + "router_z_loss_mlp": 0.37841797, + "step": 2516, + "time_per_iteration": 2.4660089015960693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109531, + "balance_loss_mlp": 1.07116938, + "epoch": 0.48422470180838784, + "flos": 630701406720.0, + "grad_norm": 0.09015941461472031, + "language_loss": 0.85304928, + "learning_rate": 0.0005497642410884014, + "loss": 0.86414456, + "num_input_tokens_seen": 209683392, + "router_z_loss_mlp": 0.38305664, + "step": 2517, + "time_per_iteration": 2.8147974014282227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108486, + "balance_loss_mlp": 1.06845522, + "epoch": 0.4844170834936514, + "flos": 498955603968.0, + "grad_norm": 0.05998889999991439, + "language_loss": 0.8499558, + "learning_rate": 0.0005494542360352085, + "loss": 0.86104071, + "num_input_tokens_seen": 209753184, + "router_z_loss_mlp": 0.40014648, + "step": 2518, + "time_per_iteration": 2.639248847961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101242, + "balance_loss_mlp": 1.06335747, + "epoch": 0.48460946517891496, + "flos": 550798106112.0, + "grad_norm": 0.04916831458391579, + "language_loss": 0.85637897, + "learning_rate": 0.0005491442117821783, + "loss": 0.86739141, + "num_input_tokens_seen": 209829568, + "router_z_loss_mlp": 0.37866211, + "step": 2519, + "time_per_iteration": 2.7024173736572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101572, + "balance_loss_mlp": 1.06275773, + "epoch": 0.48480184686417854, + "flos": 529123235328.0, + "grad_norm": 0.05557918275255021, + "language_loss": 0.87415975, + "learning_rate": 0.0005488341684496732, + "loss": 0.88517547, + "num_input_tokens_seen": 209902176, + "router_z_loss_mlp": 0.38793945, + "step": 2520, + "time_per_iteration": 2.6733944416046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094952, + "balance_loss_mlp": 1.05732954, + "epoch": 0.4849942285494421, + "flos": 531630148608.0, + "grad_norm": 0.049677430441928086, + "language_loss": 0.91897535, + "learning_rate": 0.0005485241061580624, + "loss": 0.92992491, + "num_input_tokens_seen": 209969168, + "router_z_loss_mlp": 0.37646484, + "step": 2521, + "time_per_iteration": 2.7186949253082275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087464, + "balance_loss_mlp": 1.04802954, + "epoch": 0.48518661023470566, + "flos": 722231424000.0, + "grad_norm": 0.05969395587297076, + "language_loss": 0.84698212, + "learning_rate": 0.0005482140250277228, + "loss": 0.85785675, + "num_input_tokens_seen": 210049616, + "router_z_loss_mlp": 0.39404297, + "step": 2522, + "time_per_iteration": 3.0005805492401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084269, + "balance_loss_mlp": 1.04664636, + "epoch": 0.4853789919199692, + "flos": 505843508736.0, + "grad_norm": 0.0576168536354582, + "language_loss": 0.87382847, + "learning_rate": 0.0005479039251790387, + "loss": 0.88467115, + "num_input_tokens_seen": 210118512, + "router_z_loss_mlp": 0.37597656, + "step": 2523, + "time_per_iteration": 2.612565517425537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083753, + "balance_loss_mlp": 1.04508114, + "epoch": 0.4855713736052328, + "flos": 660185178624.0, + "grad_norm": 0.05213001441745639, + "language_loss": 0.84754556, + "learning_rate": 0.0005475938067324014, + "loss": 0.85838306, + "num_input_tokens_seen": 210193728, + "router_z_loss_mlp": 0.38647461, + "step": 2524, + "time_per_iteration": 2.7874755859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084646, + "balance_loss_mlp": 1.04556894, + "epoch": 0.48576375529049637, + "flos": 436727476224.0, + "grad_norm": 0.04741211423020534, + "language_loss": 0.83422267, + "learning_rate": 0.0005472836698082098, + "loss": 0.84506917, + "num_input_tokens_seen": 210258832, + "router_z_loss_mlp": 0.39086914, + "step": 2525, + "time_per_iteration": 2.50516676902771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076506, + "balance_loss_mlp": 1.03764343, + "epoch": 0.4859561369757599, + "flos": 581424044544.0, + "grad_norm": 0.04357292691167825, + "language_loss": 0.84170592, + "learning_rate": 0.0005469735145268694, + "loss": 0.85247099, + "num_input_tokens_seen": 210335280, + "router_z_loss_mlp": 0.38818359, + "step": 2526, + "time_per_iteration": 2.7474558353424072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076384, + "balance_loss_mlp": 1.03723574, + "epoch": 0.4861485186610235, + "flos": 487723175424.0, + "grad_norm": 0.056946126423794464, + "language_loss": 0.80818385, + "learning_rate": 0.0005466633410087933, + "loss": 0.81894767, + "num_input_tokens_seen": 210407072, + "router_z_loss_mlp": 0.39111328, + "step": 2527, + "time_per_iteration": 2.690655469894409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080703, + "balance_loss_mlp": 1.06363261, + "epoch": 0.486340900346287, + "flos": 1556893564416.0, + "grad_norm": 0.03973044492620415, + "language_loss": 0.77260822, + "learning_rate": 0.0005463531493744017, + "loss": 0.78341526, + "num_input_tokens_seen": 210644544, + "router_z_loss_mlp": 0.17089844, + "step": 2528, + "time_per_iteration": 4.852689981460571 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076, + "balance_loss_mlp": 1.03723347, + "epoch": 0.4865332820315506, + "flos": 482760221184.0, + "grad_norm": 0.04657742417719492, + "language_loss": 0.88156307, + "learning_rate": 0.0005460429397441214, + "loss": 0.89232314, + "num_input_tokens_seen": 210711760, + "router_z_loss_mlp": 0.38720703, + "step": 2529, + "time_per_iteration": 2.55281662940979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079921, + "balance_loss_mlp": 1.04053402, + "epoch": 0.48672566371681414, + "flos": 535548450816.0, + "grad_norm": 0.06549810250084472, + "language_loss": 0.86653185, + "learning_rate": 0.0005457327122383866, + "loss": 0.87733108, + "num_input_tokens_seen": 210783040, + "router_z_loss_mlp": 0.39379883, + "step": 2530, + "time_per_iteration": 2.671656847000122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035467, + "balance_loss_mlp": 1.01963639, + "epoch": 0.4869180454020777, + "flos": 1411876901376.0, + "grad_norm": 0.025637836045087663, + "language_loss": 0.74636483, + "learning_rate": 0.0005454224669776385, + "loss": 0.75671959, + "num_input_tokens_seen": 211002128, + "router_z_loss_mlp": 0.15820312, + "step": 2531, + "time_per_iteration": 4.814793348312378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081396, + "balance_loss_mlp": 1.04322505, + "epoch": 0.48711042708734126, + "flos": 572836741632.0, + "grad_norm": 0.048652424424379774, + "language_loss": 0.7607469, + "learning_rate": 0.0005451122040823244, + "loss": 0.77156091, + "num_input_tokens_seen": 211080080, + "router_z_loss_mlp": 0.38134766, + "step": 2532, + "time_per_iteration": 2.7569382190704346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081141, + "balance_loss_mlp": 1.04246926, + "epoch": 0.48730280877260485, + "flos": 626231665152.0, + "grad_norm": 0.05261384345268123, + "language_loss": 0.76949328, + "learning_rate": 0.0005448019236728997, + "loss": 0.78030467, + "num_input_tokens_seen": 211162944, + "router_z_loss_mlp": 0.38647461, + "step": 2533, + "time_per_iteration": 2.8791191577911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082063, + "balance_loss_mlp": 1.04439306, + "epoch": 0.48749519045786843, + "flos": 512233818624.0, + "grad_norm": 0.05361284003065004, + "language_loss": 0.84639871, + "learning_rate": 0.0005444916258698255, + "loss": 0.85721934, + "num_input_tokens_seen": 211230448, + "router_z_loss_mlp": 0.37670898, + "step": 2534, + "time_per_iteration": 2.584188938140869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108354, + "balance_loss_mlp": 1.04548812, + "epoch": 0.48768757214313196, + "flos": 525149678592.0, + "grad_norm": 0.044479444876285516, + "language_loss": 0.85999918, + "learning_rate": 0.0005441813107935704, + "loss": 0.87083459, + "num_input_tokens_seen": 211301248, + "router_z_loss_mlp": 0.38037109, + "step": 2535, + "time_per_iteration": 2.63484787940979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089581, + "balance_loss_mlp": 1.05141044, + "epoch": 0.48787995382839555, + "flos": 504784300032.0, + "grad_norm": 0.05225590764746468, + "language_loss": 0.85801542, + "learning_rate": 0.0005438709785646091, + "loss": 0.86891127, + "num_input_tokens_seen": 211369888, + "router_z_loss_mlp": 0.38110352, + "step": 2536, + "time_per_iteration": 2.5857274532318115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087898, + "balance_loss_mlp": 1.0496794, + "epoch": 0.4880723355136591, + "flos": 574904286720.0, + "grad_norm": 0.05427082704851873, + "language_loss": 0.8654719, + "learning_rate": 0.0005435606293034234, + "loss": 0.87635088, + "num_input_tokens_seen": 211441808, + "router_z_loss_mlp": 0.3815918, + "step": 2537, + "time_per_iteration": 2.6441421508789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082535, + "balance_loss_mlp": 1.04498374, + "epoch": 0.48826471719892267, + "flos": 561172147200.0, + "grad_norm": 0.0666705066547564, + "language_loss": 0.84424317, + "learning_rate": 0.0005432502631305016, + "loss": 0.8550685, + "num_input_tokens_seen": 211511216, + "router_z_loss_mlp": 0.37548828, + "step": 2538, + "time_per_iteration": 2.657888174057007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081573, + "balance_loss_mlp": 1.04383135, + "epoch": 0.4884570988841862, + "flos": 725849980416.0, + "grad_norm": 0.04200092081923836, + "language_loss": 0.83068514, + "learning_rate": 0.0005429398801663386, + "loss": 0.84150088, + "num_input_tokens_seen": 211589264, + "router_z_loss_mlp": 0.37744141, + "step": 2539, + "time_per_iteration": 2.926213264465332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085607, + "balance_loss_mlp": 1.04726946, + "epoch": 0.4886494805694498, + "flos": 430794063360.0, + "grad_norm": 0.05775520457519848, + "language_loss": 0.82975113, + "learning_rate": 0.0005426294805314355, + "loss": 0.84060717, + "num_input_tokens_seen": 211652928, + "router_z_loss_mlp": 0.38305664, + "step": 2540, + "time_per_iteration": 2.476100444793701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087326, + "balance_loss_mlp": 1.0497514, + "epoch": 0.4888418622547134, + "flos": 672673254912.0, + "grad_norm": 0.050739997063638825, + "language_loss": 0.79934752, + "learning_rate": 0.0005423190643463003, + "loss": 0.81022084, + "num_input_tokens_seen": 211741664, + "router_z_loss_mlp": 0.37573242, + "step": 2541, + "time_per_iteration": 2.983567953109741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108794, + "balance_loss_mlp": 1.05005538, + "epoch": 0.4890342439399769, + "flos": 541639014912.0, + "grad_norm": 0.05834464255250002, + "language_loss": 0.82589471, + "learning_rate": 0.0005420086317314473, + "loss": 0.83677411, + "num_input_tokens_seen": 211809136, + "router_z_loss_mlp": 0.37841797, + "step": 2542, + "time_per_iteration": 2.6762986183166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088795, + "balance_loss_mlp": 1.04957485, + "epoch": 0.4892266256252405, + "flos": 590380904448.0, + "grad_norm": 0.056502349447813176, + "language_loss": 0.8105309, + "learning_rate": 0.0005416981828073971, + "loss": 0.82141888, + "num_input_tokens_seen": 211883136, + "router_z_loss_mlp": 0.39208984, + "step": 2543, + "time_per_iteration": 2.798063039779663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111053, + "balance_loss_mlp": 1.0975107, + "epoch": 0.48941900731050403, + "flos": 1515460008960.0, + "grad_norm": 0.049245887260565786, + "language_loss": 0.77115011, + "learning_rate": 0.0005413877176946765, + "loss": 0.78226066, + "num_input_tokens_seen": 212117488, + "router_z_loss_mlp": 0.13574219, + "step": 2544, + "time_per_iteration": 4.86514949798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084158, + "balance_loss_mlp": 1.04632151, + "epoch": 0.4896113889957676, + "flos": 470327399424.0, + "grad_norm": 0.0633775200016376, + "language_loss": 0.84418309, + "learning_rate": 0.000541077236513819, + "loss": 0.85502464, + "num_input_tokens_seen": 212181952, + "router_z_loss_mlp": 0.37792969, + "step": 2545, + "time_per_iteration": 2.590907335281372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085016, + "balance_loss_mlp": 1.04698849, + "epoch": 0.48980377068103115, + "flos": 496310478336.0, + "grad_norm": 0.05034497234802515, + "language_loss": 0.82352334, + "learning_rate": 0.0005407667393853638, + "loss": 0.83437347, + "num_input_tokens_seen": 212252608, + "router_z_loss_mlp": 0.37988281, + "step": 2546, + "time_per_iteration": 2.6386098861694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079303, + "balance_loss_mlp": 1.04187095, + "epoch": 0.48999615236629473, + "flos": 692539628544.0, + "grad_norm": 0.05625529240804266, + "language_loss": 0.83240199, + "learning_rate": 0.0005404562264298569, + "loss": 0.84319508, + "num_input_tokens_seen": 212328560, + "router_z_loss_mlp": 0.37426758, + "step": 2547, + "time_per_iteration": 2.8305716514587402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083931, + "balance_loss_mlp": 1.04459167, + "epoch": 0.49018853405155827, + "flos": 541432401408.0, + "grad_norm": 0.05508159523705553, + "language_loss": 0.83712828, + "learning_rate": 0.0005401456977678498, + "loss": 0.84796757, + "num_input_tokens_seen": 212399616, + "router_z_loss_mlp": 0.39306641, + "step": 2548, + "time_per_iteration": 2.647726058959961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079917, + "balance_loss_mlp": 1.0415554, + "epoch": 0.49038091573682185, + "flos": 695304027648.0, + "grad_norm": 0.06449580544702971, + "language_loss": 0.77341408, + "learning_rate": 0.0005398351535199008, + "loss": 0.7842133, + "num_input_tokens_seen": 212482352, + "router_z_loss_mlp": 0.38330078, + "step": 2549, + "time_per_iteration": 3.0876851081848145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087981, + "balance_loss_mlp": 1.04976225, + "epoch": 0.49057329742208544, + "flos": 596614063104.0, + "grad_norm": 0.053976289964032184, + "language_loss": 0.83800292, + "learning_rate": 0.0005395245938065735, + "loss": 0.84888279, + "num_input_tokens_seen": 212559504, + "router_z_loss_mlp": 0.38183594, + "step": 2550, + "time_per_iteration": 2.804429769515991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082681, + "balance_loss_mlp": 1.04372382, + "epoch": 0.490765679107349, + "flos": 513154814976.0, + "grad_norm": 0.06066311696873723, + "language_loss": 0.8244735, + "learning_rate": 0.0005392140187484379, + "loss": 0.83530027, + "num_input_tokens_seen": 212625664, + "router_z_loss_mlp": 0.38916016, + "step": 2551, + "time_per_iteration": 2.597642421722412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078468, + "balance_loss_mlp": 1.04001141, + "epoch": 0.49095806079261256, + "flos": 629298782208.0, + "grad_norm": 0.0491826620467597, + "language_loss": 0.89348012, + "learning_rate": 0.0005389034284660701, + "loss": 0.90426481, + "num_input_tokens_seen": 212702000, + "router_z_loss_mlp": 0.3840332, + "step": 2552, + "time_per_iteration": 2.7942707538604736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081847, + "balance_loss_mlp": 1.04231691, + "epoch": 0.4911504424778761, + "flos": 914938122240.0, + "grad_norm": 0.07682264853807555, + "language_loss": 0.82114685, + "learning_rate": 0.000538592823080052, + "loss": 0.83196527, + "num_input_tokens_seen": 212785376, + "router_z_loss_mlp": 0.39501953, + "step": 2553, + "time_per_iteration": 3.1190438270568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079314, + "balance_loss_mlp": 1.04154849, + "epoch": 0.4913428241631397, + "flos": 438716445696.0, + "grad_norm": 0.05210768805810414, + "language_loss": 0.85049736, + "learning_rate": 0.000538282202710971, + "loss": 0.86129045, + "num_input_tokens_seen": 212848176, + "router_z_loss_mlp": 0.37768555, + "step": 2554, + "time_per_iteration": 2.5379602909088135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073496, + "balance_loss_mlp": 1.03613555, + "epoch": 0.4915352058484032, + "flos": 635806955520.0, + "grad_norm": 0.06005848629390598, + "language_loss": 0.81770831, + "learning_rate": 0.000537971567479421, + "loss": 0.82844329, + "num_input_tokens_seen": 212917888, + "router_z_loss_mlp": 0.37329102, + "step": 2555, + "time_per_iteration": 2.7403476238250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107527, + "balance_loss_mlp": 1.0371232, + "epoch": 0.4917275875336668, + "flos": 504272148480.0, + "grad_norm": 0.05941814666543565, + "language_loss": 0.87821388, + "learning_rate": 0.0005376609175060011, + "loss": 0.88896656, + "num_input_tokens_seen": 212986288, + "router_z_loss_mlp": 0.38110352, + "step": 2556, + "time_per_iteration": 2.5817511081695557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069861, + "balance_loss_mlp": 1.03192806, + "epoch": 0.49191996921893033, + "flos": 654251765760.0, + "grad_norm": 0.06032782721564886, + "language_loss": 0.80381918, + "learning_rate": 0.0005373502529113162, + "loss": 0.81451786, + "num_input_tokens_seen": 213059504, + "router_z_loss_mlp": 0.37915039, + "step": 2557, + "time_per_iteration": 2.7871665954589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077426, + "balance_loss_mlp": 1.03939795, + "epoch": 0.4921123509041939, + "flos": 492101194752.0, + "grad_norm": 0.054204772274654804, + "language_loss": 0.81538296, + "learning_rate": 0.0005370395738159773, + "loss": 0.82615721, + "num_input_tokens_seen": 213129984, + "router_z_loss_mlp": 0.38012695, + "step": 2558, + "time_per_iteration": 2.667402744293213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071328, + "balance_loss_mlp": 1.03368151, + "epoch": 0.4923047325894575, + "flos": 545907935232.0, + "grad_norm": 0.05883600684350466, + "language_loss": 0.82952267, + "learning_rate": 0.0005367288803406003, + "loss": 0.84023595, + "num_input_tokens_seen": 213199184, + "router_z_loss_mlp": 0.3762207, + "step": 2559, + "time_per_iteration": 2.626527786254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078144, + "balance_loss_mlp": 1.03937757, + "epoch": 0.49249711427472104, + "flos": 596195043840.0, + "grad_norm": 0.05079842806629368, + "language_loss": 0.8133688, + "learning_rate": 0.0005364181726058073, + "loss": 0.82415026, + "num_input_tokens_seen": 213272480, + "router_z_loss_mlp": 0.38720703, + "step": 2560, + "time_per_iteration": 2.6742072105407715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079994, + "balance_loss_mlp": 1.0413698, + "epoch": 0.4926894959599846, + "flos": 497580682752.0, + "grad_norm": 0.07402195837362009, + "language_loss": 0.8230688, + "learning_rate": 0.0005361074507322261, + "loss": 0.83386874, + "num_input_tokens_seen": 213338704, + "router_z_loss_mlp": 0.38574219, + "step": 2561, + "time_per_iteration": 2.5911788940429688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079985, + "balance_loss_mlp": 1.04226756, + "epoch": 0.49288187764524816, + "flos": 535868545536.0, + "grad_norm": 0.051530448614758514, + "language_loss": 0.81235635, + "learning_rate": 0.000535796714840489, + "loss": 0.82315624, + "num_input_tokens_seen": 213406016, + "router_z_loss_mlp": 0.37695312, + "step": 2562, + "time_per_iteration": 2.607124090194702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108504, + "balance_loss_mlp": 1.04694033, + "epoch": 0.49307425933051174, + "flos": 641267504640.0, + "grad_norm": 0.0614534794373117, + "language_loss": 0.83895457, + "learning_rate": 0.0005354859650512348, + "loss": 0.84980506, + "num_input_tokens_seen": 213474016, + "router_z_loss_mlp": 0.38037109, + "step": 2563, + "time_per_iteration": 2.757147789001465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087423, + "balance_loss_mlp": 1.04889464, + "epoch": 0.4932666410157753, + "flos": 516000761856.0, + "grad_norm": 0.06049941260890761, + "language_loss": 0.87262708, + "learning_rate": 0.0005351752014851074, + "loss": 0.88350135, + "num_input_tokens_seen": 213539696, + "router_z_loss_mlp": 0.38500977, + "step": 2564, + "time_per_iteration": 2.546381711959839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090812, + "balance_loss_mlp": 1.05190217, + "epoch": 0.49345902270103886, + "flos": 601217634816.0, + "grad_norm": 0.06075916964602771, + "language_loss": 0.83327425, + "learning_rate": 0.0005348644242627553, + "loss": 0.84418237, + "num_input_tokens_seen": 213609504, + "router_z_loss_mlp": 0.38867188, + "step": 2565, + "time_per_iteration": 2.737234592437744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080753, + "balance_loss_mlp": 1.06368184, + "epoch": 0.49365140438630245, + "flos": 1492849585152.0, + "grad_norm": 0.03629255242441858, + "language_loss": 0.75286627, + "learning_rate": 0.0005345536335048336, + "loss": 0.76367378, + "num_input_tokens_seen": 213846064, + "router_z_loss_mlp": 0.17089844, + "step": 2566, + "time_per_iteration": 4.96724271774292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093722, + "balance_loss_mlp": 1.05462122, + "epoch": 0.493843786071566, + "flos": 629303164416.0, + "grad_norm": 0.05641611710897844, + "language_loss": 0.81215966, + "learning_rate": 0.0005342428293320013, + "loss": 0.82309687, + "num_input_tokens_seen": 213923216, + "router_z_loss_mlp": 0.390625, + "step": 2567, + "time_per_iteration": 2.75099778175354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085401, + "balance_loss_mlp": 1.04722989, + "epoch": 0.49403616775682957, + "flos": 617283569664.0, + "grad_norm": 0.05682733114828458, + "language_loss": 0.83676398, + "learning_rate": 0.0005339320118649238, + "loss": 0.84761798, + "num_input_tokens_seen": 213994096, + "router_z_loss_mlp": 0.3815918, + "step": 2568, + "time_per_iteration": 2.6829991340637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087145, + "balance_loss_mlp": 1.04945099, + "epoch": 0.4942285494420931, + "flos": 577357355520.0, + "grad_norm": 0.053270861905881636, + "language_loss": 0.86332101, + "learning_rate": 0.000533621181224271, + "loss": 0.87419248, + "num_input_tokens_seen": 214069104, + "router_z_loss_mlp": 0.37646484, + "step": 2569, + "time_per_iteration": 2.777698278427124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092012, + "balance_loss_mlp": 1.0536983, + "epoch": 0.4944209311273567, + "flos": 629899683840.0, + "grad_norm": 0.059449335887268515, + "language_loss": 0.81470358, + "learning_rate": 0.0005333103375307182, + "loss": 0.82562375, + "num_input_tokens_seen": 214150368, + "router_z_loss_mlp": 0.3828125, + "step": 2570, + "time_per_iteration": 2.866680145263672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087838, + "balance_loss_mlp": 1.0502398, + "epoch": 0.4946133128126202, + "flos": 587337108480.0, + "grad_norm": 0.04632852912872097, + "language_loss": 0.86004198, + "learning_rate": 0.0005329994809049451, + "loss": 0.8709203, + "num_input_tokens_seen": 214220112, + "router_z_loss_mlp": 0.37548828, + "step": 2571, + "time_per_iteration": 2.719249963760376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089339, + "balance_loss_mlp": 1.05147839, + "epoch": 0.4948056944978838, + "flos": 583437745152.0, + "grad_norm": 0.05131083950778726, + "language_loss": 0.87596244, + "learning_rate": 0.0005326886114676375, + "loss": 0.88685584, + "num_input_tokens_seen": 214294480, + "router_z_loss_mlp": 0.37866211, + "step": 2572, + "time_per_iteration": 2.7392373085021973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083396, + "balance_loss_mlp": 1.04524934, + "epoch": 0.49499807618314734, + "flos": 481583149056.0, + "grad_norm": 0.0472919496744071, + "language_loss": 0.87958217, + "learning_rate": 0.0005323777293394854, + "loss": 0.89041615, + "num_input_tokens_seen": 214359568, + "router_z_loss_mlp": 0.38110352, + "step": 2573, + "time_per_iteration": 2.531196355819702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078942, + "balance_loss_mlp": 1.04072404, + "epoch": 0.4951904578684109, + "flos": 518714288640.0, + "grad_norm": 0.0452048253819277, + "language_loss": 0.82375443, + "learning_rate": 0.000532066834641184, + "loss": 0.83454382, + "num_input_tokens_seen": 214432032, + "router_z_loss_mlp": 0.38183594, + "step": 2574, + "time_per_iteration": 2.6414644718170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076991, + "balance_loss_mlp": 1.03939271, + "epoch": 0.4953828395536745, + "flos": 535238530560.0, + "grad_norm": 0.0513606490930485, + "language_loss": 0.84946954, + "learning_rate": 0.0005317559274934334, + "loss": 0.86023939, + "num_input_tokens_seen": 214504096, + "router_z_loss_mlp": 0.37573242, + "step": 2575, + "time_per_iteration": 2.764742374420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075902, + "balance_loss_mlp": 1.03904271, + "epoch": 0.49557522123893805, + "flos": 528305545728.0, + "grad_norm": 0.0624025017343203, + "language_loss": 0.80560994, + "learning_rate": 0.0005314450080169382, + "loss": 0.816369, + "num_input_tokens_seen": 214575920, + "router_z_loss_mlp": 0.3684082, + "step": 2576, + "time_per_iteration": 2.594782590866089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078443, + "balance_loss_mlp": 1.04017663, + "epoch": 0.49576760292420163, + "flos": 427780790784.0, + "grad_norm": 0.059991931078834576, + "language_loss": 0.80652928, + "learning_rate": 0.0005311340763324083, + "loss": 0.81731379, + "num_input_tokens_seen": 214641664, + "router_z_loss_mlp": 0.38232422, + "step": 2577, + "time_per_iteration": 2.5488879680633545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107968, + "balance_loss_mlp": 1.04232025, + "epoch": 0.49595998460946517, + "flos": 564968203776.0, + "grad_norm": 0.04956045110382575, + "language_loss": 0.81899893, + "learning_rate": 0.0005308231325605578, + "loss": 0.82979578, + "num_input_tokens_seen": 214711744, + "router_z_loss_mlp": 0.37329102, + "step": 2578, + "time_per_iteration": 2.6677722930908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077103, + "balance_loss_mlp": 1.03905153, + "epoch": 0.49615236629472875, + "flos": 702161409024.0, + "grad_norm": 0.04106026216453222, + "language_loss": 0.76928478, + "learning_rate": 0.0005305121768221061, + "loss": 0.78005582, + "num_input_tokens_seen": 214802256, + "router_z_loss_mlp": 0.38012695, + "step": 2579, + "time_per_iteration": 3.070509672164917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024013, + "balance_loss_mlp": 1.00970817, + "epoch": 0.4963447479799923, + "flos": 1440896573952.0, + "grad_norm": 0.02117966265403326, + "language_loss": 0.75038326, + "learning_rate": 0.000530201209237777, + "loss": 0.76062334, + "num_input_tokens_seen": 215023648, + "router_z_loss_mlp": 0.14257812, + "step": 2580, + "time_per_iteration": 4.802190780639648 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084007, + "balance_loss_mlp": 1.04669428, + "epoch": 0.49653712966525587, + "flos": 537370094592.0, + "grad_norm": 0.04967277918174837, + "language_loss": 0.91594803, + "learning_rate": 0.0005298902299282984, + "loss": 0.92678809, + "num_input_tokens_seen": 215094080, + "router_z_loss_mlp": 0.37304688, + "step": 2581, + "time_per_iteration": 2.5916941165924072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075168, + "balance_loss_mlp": 1.03823721, + "epoch": 0.4967295113505194, + "flos": 607002660864.0, + "grad_norm": 0.058889996692992934, + "language_loss": 0.84090436, + "learning_rate": 0.0005295792390144033, + "loss": 0.85165608, + "num_input_tokens_seen": 215165456, + "router_z_loss_mlp": 0.36889648, + "step": 2582, + "time_per_iteration": 2.731971502304077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077994, + "balance_loss_mlp": 1.04065764, + "epoch": 0.496921893035783, + "flos": 474340243968.0, + "grad_norm": 0.06551304805839393, + "language_loss": 0.83421808, + "learning_rate": 0.0005292682366168294, + "loss": 0.844998, + "num_input_tokens_seen": 215229344, + "router_z_loss_mlp": 0.37304688, + "step": 2583, + "time_per_iteration": 2.575511932373047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071469, + "balance_loss_mlp": 1.03437066, + "epoch": 0.4971142747210466, + "flos": 597180059136.0, + "grad_norm": 0.09149919184070833, + "language_loss": 0.79965729, + "learning_rate": 0.0005289572228563181, + "loss": 0.81037199, + "num_input_tokens_seen": 215305616, + "router_z_loss_mlp": 0.37084961, + "step": 2584, + "time_per_iteration": 2.7206363677978516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107391, + "balance_loss_mlp": 1.03533435, + "epoch": 0.4973066564063101, + "flos": 599321797632.0, + "grad_norm": 0.052533233614156426, + "language_loss": 0.82869196, + "learning_rate": 0.000528646197853616, + "loss": 0.83943105, + "num_input_tokens_seen": 215378128, + "router_z_loss_mlp": 0.38549805, + "step": 2585, + "time_per_iteration": 2.6923370361328125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078928, + "balance_loss_mlp": 1.04097223, + "epoch": 0.4974990380915737, + "flos": 649152009216.0, + "grad_norm": 0.05229001766272028, + "language_loss": 0.85541296, + "learning_rate": 0.0005283351617294735, + "loss": 0.86620224, + "num_input_tokens_seen": 215453536, + "router_z_loss_mlp": 0.37939453, + "step": 2586, + "time_per_iteration": 2.929431915283203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021123, + "balance_loss_mlp": 1.00719905, + "epoch": 0.49769141977683723, + "flos": 1528490912256.0, + "grad_norm": 0.01235864360091676, + "language_loss": 0.7663666, + "learning_rate": 0.0005280241146046456, + "loss": 0.77657783, + "num_input_tokens_seen": 215689440, + "router_z_loss_mlp": 0.13964844, + "step": 2587, + "time_per_iteration": 5.021655082702637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077541, + "balance_loss_mlp": 1.03977549, + "epoch": 0.4978838014621008, + "flos": 536114446848.0, + "grad_norm": 0.05582319417935397, + "language_loss": 0.866669, + "learning_rate": 0.0005277130565998916, + "loss": 0.87744439, + "num_input_tokens_seen": 215759600, + "router_z_loss_mlp": 0.37719727, + "step": 2588, + "time_per_iteration": 2.729919195175171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079282, + "balance_loss_mlp": 1.04163599, + "epoch": 0.49807618314736435, + "flos": 539335742976.0, + "grad_norm": 0.05154521563335112, + "language_loss": 0.81850547, + "learning_rate": 0.0005274019878359748, + "loss": 0.82929826, + "num_input_tokens_seen": 215833920, + "router_z_loss_mlp": 0.3762207, + "step": 2589, + "time_per_iteration": 2.692312240600586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080128, + "balance_loss_mlp": 1.04243433, + "epoch": 0.49826856483262794, + "flos": 542215185408.0, + "grad_norm": 0.0590106194524904, + "language_loss": 0.87004912, + "learning_rate": 0.0005270909084336628, + "loss": 0.88085043, + "num_input_tokens_seen": 215903616, + "router_z_loss_mlp": 0.37695312, + "step": 2590, + "time_per_iteration": 2.684134006500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085832, + "balance_loss_mlp": 1.04637384, + "epoch": 0.4984609465178915, + "flos": 522062212608.0, + "grad_norm": 0.056922673879229405, + "language_loss": 0.89000517, + "learning_rate": 0.0005267798185137276, + "loss": 0.90086353, + "num_input_tokens_seen": 215974832, + "router_z_loss_mlp": 0.39428711, + "step": 2591, + "time_per_iteration": 2.6129040718078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087547, + "balance_loss_mlp": 1.04942417, + "epoch": 0.49865332820315506, + "flos": 574255332864.0, + "grad_norm": 0.05087809825508884, + "language_loss": 0.89274907, + "learning_rate": 0.0005264687181969444, + "loss": 0.90362453, + "num_input_tokens_seen": 216045024, + "router_z_loss_mlp": 0.38085938, + "step": 2592, + "time_per_iteration": 2.7253634929656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087493, + "balance_loss_mlp": 1.04891706, + "epoch": 0.49884570988841864, + "flos": 1013207657472.0, + "grad_norm": 0.06815052907107509, + "language_loss": 0.75056839, + "learning_rate": 0.0005261576076040937, + "loss": 0.76144326, + "num_input_tokens_seen": 216129024, + "router_z_loss_mlp": 0.38525391, + "step": 2593, + "time_per_iteration": 3.2982125282287598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086885, + "balance_loss_mlp": 1.04790401, + "epoch": 0.4990380915736822, + "flos": 559315597824.0, + "grad_norm": 0.05997761702509101, + "language_loss": 0.84464318, + "learning_rate": 0.0005258464868559591, + "loss": 0.85551196, + "num_input_tokens_seen": 216197648, + "router_z_loss_mlp": 0.38964844, + "step": 2594, + "time_per_iteration": 2.650743007659912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087007, + "balance_loss_mlp": 1.04819274, + "epoch": 0.49923047325894576, + "flos": 498708292608.0, + "grad_norm": 0.060987476024219604, + "language_loss": 0.88568228, + "learning_rate": 0.0005255353560733284, + "loss": 0.89655238, + "num_input_tokens_seen": 216263904, + "router_z_loss_mlp": 0.38793945, + "step": 2595, + "time_per_iteration": 2.5599913597106934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041145, + "balance_loss_mlp": 1.02760279, + "epoch": 0.4994228549442093, + "flos": 1495851273216.0, + "grad_norm": 0.01946244961408958, + "language_loss": 0.75578642, + "learning_rate": 0.0005252242153769931, + "loss": 0.76619792, + "num_input_tokens_seen": 216493152, + "router_z_loss_mlp": 0.13574219, + "step": 2596, + "time_per_iteration": 4.769503593444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108652, + "balance_loss_mlp": 1.0481348, + "epoch": 0.4996152366294729, + "flos": 557090901504.0, + "grad_norm": 0.052826274831603945, + "language_loss": 0.83429873, + "learning_rate": 0.0005249130648877492, + "loss": 0.84516394, + "num_input_tokens_seen": 216567216, + "router_z_loss_mlp": 0.38354492, + "step": 2597, + "time_per_iteration": 2.724168300628662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085761, + "balance_loss_mlp": 1.04785287, + "epoch": 0.4998076183147364, + "flos": 415372700160.0, + "grad_norm": 0.05706521232724688, + "language_loss": 0.84317046, + "learning_rate": 0.0005246019047263953, + "loss": 0.85402811, + "num_input_tokens_seen": 216630624, + "router_z_loss_mlp": 0.37841797, + "step": 2598, + "time_per_iteration": 2.4463517665863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081855, + "balance_loss_mlp": 1.04475701, + "epoch": 0.5, + "flos": 467107513344.0, + "grad_norm": 0.6792645039501298, + "language_loss": 0.82562613, + "learning_rate": 0.0005242907350137353, + "loss": 0.83644474, + "num_input_tokens_seen": 216696576, + "router_z_loss_mlp": 0.37060547, + "step": 2599, + "time_per_iteration": 2.560786008834839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099746, + "balance_loss_mlp": 1.06193328, + "epoch": 0.5001923816852636, + "flos": 482460475392.0, + "grad_norm": 0.06436348420044716, + "language_loss": 0.78717571, + "learning_rate": 0.0005239795558705754, + "loss": 0.79817319, + "num_input_tokens_seen": 216767584, + "router_z_loss_mlp": 0.37817383, + "step": 2600, + "time_per_iteration": 2.691749095916748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103949, + "balance_loss_mlp": 1.06613564, + "epoch": 0.5003847633705272, + "flos": 533534750208.0, + "grad_norm": 0.05701005713359991, + "language_loss": 0.89229304, + "learning_rate": 0.0005236683674177264, + "loss": 0.90333253, + "num_input_tokens_seen": 216834320, + "router_z_loss_mlp": 0.37744141, + "step": 2601, + "time_per_iteration": 2.6216700077056885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118846, + "balance_loss_mlp": 1.08053231, + "epoch": 0.5005771450557907, + "flos": 737473876992.0, + "grad_norm": 0.059257141019647214, + "language_loss": 0.82444715, + "learning_rate": 0.0005233571697760021, + "loss": 0.83563566, + "num_input_tokens_seen": 216907312, + "router_z_loss_mlp": 0.3828125, + "step": 2602, + "time_per_iteration": 2.856107473373413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127913, + "balance_loss_mlp": 1.08902669, + "epoch": 0.5007695267410542, + "flos": 778646974464.0, + "grad_norm": 0.08832305121279985, + "language_loss": 0.83020616, + "learning_rate": 0.0005230459630662203, + "loss": 0.84148532, + "num_input_tokens_seen": 216979872, + "router_z_loss_mlp": 0.38842773, + "step": 2603, + "time_per_iteration": 2.954914093017578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133998, + "balance_loss_mlp": 1.09563613, + "epoch": 0.5009619084263178, + "flos": 623192251392.0, + "grad_norm": 0.09845505678723535, + "language_loss": 0.81501806, + "learning_rate": 0.0005227347474092022, + "loss": 0.82635808, + "num_input_tokens_seen": 217054000, + "router_z_loss_mlp": 0.38354492, + "step": 2604, + "time_per_iteration": 2.7330713272094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132886, + "balance_loss_mlp": 1.09223533, + "epoch": 0.5011542901115814, + "flos": 530812459008.0, + "grad_norm": 0.044602380755084235, + "language_loss": 0.83597159, + "learning_rate": 0.0005224235229257724, + "loss": 0.84730041, + "num_input_tokens_seen": 217126784, + "router_z_loss_mlp": 0.40649414, + "step": 2605, + "time_per_iteration": 2.682590961456299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134983, + "balance_loss_mlp": 1.09485674, + "epoch": 0.5013466717968449, + "flos": 527262303744.0, + "grad_norm": 0.06172408458695075, + "language_loss": 0.86453664, + "learning_rate": 0.0005221122897367589, + "loss": 0.87588644, + "num_input_tokens_seen": 217203056, + "router_z_loss_mlp": 0.40136719, + "step": 2606, + "time_per_iteration": 2.7657558917999268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130017, + "balance_loss_mlp": 1.08970046, + "epoch": 0.5015390534821085, + "flos": 565750987776.0, + "grad_norm": 0.060573415362282904, + "language_loss": 0.80914944, + "learning_rate": 0.0005218010479629932, + "loss": 0.82044959, + "num_input_tokens_seen": 217273280, + "router_z_loss_mlp": 0.40332031, + "step": 2607, + "time_per_iteration": 2.650521755218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137201, + "balance_loss_mlp": 1.09564483, + "epoch": 0.5017314351673721, + "flos": 566430465024.0, + "grad_norm": 0.062462394429491495, + "language_loss": 0.82171839, + "learning_rate": 0.0005214897977253102, + "loss": 0.83309042, + "num_input_tokens_seen": 217345568, + "router_z_loss_mlp": 0.41552734, + "step": 2608, + "time_per_iteration": 2.679605484008789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135192, + "balance_loss_mlp": 1.09222913, + "epoch": 0.5019238168526357, + "flos": 522018542592.0, + "grad_norm": 0.04524020883908707, + "language_loss": 0.84520149, + "learning_rate": 0.0005211785391445473, + "loss": 0.85655344, + "num_input_tokens_seen": 217422848, + "router_z_loss_mlp": 0.4296875, + "step": 2609, + "time_per_iteration": 2.727029323577881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133676, + "balance_loss_mlp": 1.09128523, + "epoch": 0.5021161985378992, + "flos": 641135084544.0, + "grad_norm": 0.0754859849582408, + "language_loss": 0.79190326, + "learning_rate": 0.0005208672723415467, + "loss": 0.80324006, + "num_input_tokens_seen": 217502896, + "router_z_loss_mlp": 0.42358398, + "step": 2610, + "time_per_iteration": 2.7925145626068115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132046, + "balance_loss_mlp": 1.09058475, + "epoch": 0.5023085802231627, + "flos": 591000744960.0, + "grad_norm": 0.05557553185326306, + "language_loss": 0.78870118, + "learning_rate": 0.0005205559974371525, + "loss": 0.80002165, + "num_input_tokens_seen": 217575072, + "router_z_loss_mlp": 0.41455078, + "step": 2611, + "time_per_iteration": 2.7993710041046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129577, + "balance_loss_mlp": 1.08747184, + "epoch": 0.5025009619084263, + "flos": 472134486528.0, + "grad_norm": 0.05627981978612443, + "language_loss": 0.81993866, + "learning_rate": 0.0005202447145522123, + "loss": 0.83123446, + "num_input_tokens_seen": 217644976, + "router_z_loss_mlp": 0.42089844, + "step": 2612, + "time_per_iteration": 2.6950342655181885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120122, + "balance_loss_mlp": 1.0788281, + "epoch": 0.5026933435936899, + "flos": 454906036224.0, + "grad_norm": 0.05146182880646494, + "language_loss": 0.79119051, + "learning_rate": 0.0005199334238075769, + "loss": 0.80239171, + "num_input_tokens_seen": 217712816, + "router_z_loss_mlp": 0.4128418, + "step": 2613, + "time_per_iteration": 2.533280372619629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121533, + "balance_loss_mlp": 1.08064461, + "epoch": 0.5028857252789535, + "flos": 491504675328.0, + "grad_norm": 0.049706042989329166, + "language_loss": 0.91481262, + "learning_rate": 0.0005196221253241, + "loss": 0.92602801, + "num_input_tokens_seen": 217780256, + "router_z_loss_mlp": 0.40869141, + "step": 2614, + "time_per_iteration": 2.562459707260132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125484, + "balance_loss_mlp": 1.08271146, + "epoch": 0.503078106964217, + "flos": 625280145408.0, + "grad_norm": 0.05688830610190983, + "language_loss": 0.82597703, + "learning_rate": 0.0005193108192226383, + "loss": 0.83723187, + "num_input_tokens_seen": 217848496, + "router_z_loss_mlp": 0.42797852, + "step": 2615, + "time_per_iteration": 2.7700836658477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124223, + "balance_loss_mlp": 1.08054483, + "epoch": 0.5032704886494805, + "flos": 578774536704.0, + "grad_norm": 0.07123141067873749, + "language_loss": 0.87046134, + "learning_rate": 0.000518999505624052, + "loss": 0.88170362, + "num_input_tokens_seen": 217919216, + "router_z_loss_mlp": 0.43701172, + "step": 2616, + "time_per_iteration": 2.6920361518859863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110793, + "balance_loss_mlp": 1.06897473, + "epoch": 0.5034628703347441, + "flos": 471481150464.0, + "grad_norm": 0.07512500822512953, + "language_loss": 0.83250809, + "learning_rate": 0.000518688184649203, + "loss": 0.84361595, + "num_input_tokens_seen": 217996096, + "router_z_loss_mlp": 0.41845703, + "step": 2617, + "time_per_iteration": 2.8107755184173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109828, + "balance_loss_mlp": 1.06786621, + "epoch": 0.5036552520200077, + "flos": 489594281472.0, + "grad_norm": 0.05241889370213675, + "language_loss": 0.83636624, + "learning_rate": 0.0005183768564189577, + "loss": 0.84746444, + "num_input_tokens_seen": 218063072, + "router_z_loss_mlp": 0.41967773, + "step": 2618, + "time_per_iteration": 2.5401604175567627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117501, + "balance_loss_mlp": 1.07649279, + "epoch": 0.5038476337052713, + "flos": 493991239680.0, + "grad_norm": 0.05660213632560354, + "language_loss": 0.8184489, + "learning_rate": 0.0005180655210541838, + "loss": 0.82962382, + "num_input_tokens_seen": 218131056, + "router_z_loss_mlp": 0.40991211, + "step": 2619, + "time_per_iteration": 2.603214979171753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111785, + "balance_loss_mlp": 1.06829762, + "epoch": 0.5040400153905348, + "flos": 600321369600.0, + "grad_norm": 0.06441755274122189, + "language_loss": 0.83548617, + "learning_rate": 0.0005177541786757527, + "loss": 0.84660405, + "num_input_tokens_seen": 218203536, + "router_z_loss_mlp": 0.43481445, + "step": 2620, + "time_per_iteration": 2.760035276412964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122898, + "balance_loss_mlp": 1.07759881, + "epoch": 0.5042323970757984, + "flos": 811178924544.0, + "grad_norm": 0.05307882661131351, + "language_loss": 0.82779682, + "learning_rate": 0.000517442829404538, + "loss": 0.8390258, + "num_input_tokens_seen": 218283008, + "router_z_loss_mlp": 0.453125, + "step": 2621, + "time_per_iteration": 2.9839560985565186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110947, + "balance_loss_mlp": 1.06581521, + "epoch": 0.504424778761062, + "flos": 626985335808.0, + "grad_norm": 0.08823829105457728, + "language_loss": 0.87315869, + "learning_rate": 0.0005171314733614166, + "loss": 0.88425338, + "num_input_tokens_seen": 218362096, + "router_z_loss_mlp": 0.43676758, + "step": 2622, + "time_per_iteration": 2.901881456375122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101867, + "balance_loss_mlp": 1.05961967, + "epoch": 0.5046171604463255, + "flos": 515651553792.0, + "grad_norm": 0.052612789537889, + "language_loss": 0.78039354, + "learning_rate": 0.0005168201106672671, + "loss": 0.79141223, + "num_input_tokens_seen": 218439440, + "router_z_loss_mlp": 0.42236328, + "step": 2623, + "time_per_iteration": 2.7674055099487305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111898, + "balance_loss_mlp": 1.07046056, + "epoch": 0.504809542131589, + "flos": 527576606208.0, + "grad_norm": 0.08464756430959838, + "language_loss": 0.8495788, + "learning_rate": 0.0005165087414429717, + "loss": 0.86069775, + "num_input_tokens_seen": 218505936, + "router_z_loss_mlp": 0.41430664, + "step": 2624, + "time_per_iteration": 2.602158546447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117075, + "balance_loss_mlp": 1.07261038, + "epoch": 0.5050019238168526, + "flos": 553855048704.0, + "grad_norm": 0.23140620797494316, + "language_loss": 0.83667731, + "learning_rate": 0.0005161973658094144, + "loss": 0.84784812, + "num_input_tokens_seen": 218573824, + "router_z_loss_mlp": 0.44458008, + "step": 2625, + "time_per_iteration": 2.6992454528808594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108998, + "balance_loss_mlp": 1.06834817, + "epoch": 0.5051943055021162, + "flos": 574486677504.0, + "grad_norm": 0.05317382862924398, + "language_loss": 0.82239455, + "learning_rate": 0.000515885983887482, + "loss": 0.83348453, + "num_input_tokens_seen": 218648016, + "router_z_loss_mlp": 0.40649414, + "step": 2626, + "time_per_iteration": 2.7204251289367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110206, + "balance_loss_mlp": 1.06781507, + "epoch": 0.5053866871873798, + "flos": 496438516224.0, + "grad_norm": 0.08071327634258786, + "language_loss": 0.84119672, + "learning_rate": 0.0005155745957980636, + "loss": 0.85229874, + "num_input_tokens_seen": 218714128, + "router_z_loss_mlp": 0.42382812, + "step": 2627, + "time_per_iteration": 2.5813376903533936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118048, + "balance_loss_mlp": 1.0760628, + "epoch": 0.5055790688726434, + "flos": 501963084288.0, + "grad_norm": 0.04526623404133713, + "language_loss": 0.88577604, + "learning_rate": 0.000515263201662051, + "loss": 0.89695656, + "num_input_tokens_seen": 218784800, + "router_z_loss_mlp": 0.41992188, + "step": 2628, + "time_per_iteration": 2.6876380443573 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111719, + "balance_loss_mlp": 1.07625389, + "epoch": 0.5057714505579068, + "flos": 844844276736.0, + "grad_norm": 0.05588400488715087, + "language_loss": 0.82233381, + "learning_rate": 0.0005149518016003378, + "loss": 0.83350569, + "num_input_tokens_seen": 218868256, + "router_z_loss_mlp": 0.40942383, + "step": 2629, + "time_per_iteration": 3.1858632564544678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124651, + "balance_loss_mlp": 1.0810678, + "epoch": 0.5059638322431704, + "flos": 497580682752.0, + "grad_norm": 0.0555737706891176, + "language_loss": 0.82261145, + "learning_rate": 0.0005146403957338206, + "loss": 0.83385789, + "num_input_tokens_seen": 218932496, + "router_z_loss_mlp": 0.43603516, + "step": 2630, + "time_per_iteration": 2.548497438430786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118356, + "balance_loss_mlp": 1.07703853, + "epoch": 0.506156213928434, + "flos": 617526498816.0, + "grad_norm": 0.05055767229530262, + "language_loss": 0.82073247, + "learning_rate": 0.0005143289841833975, + "loss": 0.83191609, + "num_input_tokens_seen": 219010672, + "router_z_loss_mlp": 0.41308594, + "step": 2631, + "time_per_iteration": 2.847142457962036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116463, + "balance_loss_mlp": 1.07500172, + "epoch": 0.5063485956136976, + "flos": 424624923648.0, + "grad_norm": 0.06986911289391046, + "language_loss": 0.81789684, + "learning_rate": 0.0005140175670699696, + "loss": 0.82906151, + "num_input_tokens_seen": 219077104, + "router_z_loss_mlp": 0.41455078, + "step": 2632, + "time_per_iteration": 2.6268298625946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116361, + "balance_loss_mlp": 1.0729686, + "epoch": 0.5065409772989612, + "flos": 569641586688.0, + "grad_norm": 0.04802770333155415, + "language_loss": 0.8255887, + "learning_rate": 0.0005137061445144395, + "loss": 0.8367523, + "num_input_tokens_seen": 219164880, + "router_z_loss_mlp": 0.43383789, + "step": 2633, + "time_per_iteration": 2.93361759185791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106091, + "balance_loss_mlp": 1.06458259, + "epoch": 0.5067333589842247, + "flos": 628510205952.0, + "grad_norm": 0.0826873370301202, + "language_loss": 0.86646289, + "learning_rate": 0.000513394716637712, + "loss": 0.87752378, + "num_input_tokens_seen": 219237376, + "router_z_loss_mlp": 0.4152832, + "step": 2634, + "time_per_iteration": 2.8372714519500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083943, + "balance_loss_mlp": 1.06868434, + "epoch": 0.5069257406694883, + "flos": 1447062741504.0, + "grad_norm": 0.03147096823206272, + "language_loss": 0.79191709, + "learning_rate": 0.0005130832835606946, + "loss": 0.80275649, + "num_input_tokens_seen": 219467632, + "router_z_loss_mlp": 0.15234375, + "step": 2635, + "time_per_iteration": 4.893187046051025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109252, + "balance_loss_mlp": 1.06812489, + "epoch": 0.5071181223547518, + "flos": 638530656768.0, + "grad_norm": 0.046825638192595165, + "language_loss": 0.80415404, + "learning_rate": 0.0005127718454042958, + "loss": 0.81524646, + "num_input_tokens_seen": 219545392, + "router_z_loss_mlp": 0.41113281, + "step": 2636, + "time_per_iteration": 2.8583669662475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104227, + "balance_loss_mlp": 1.06250417, + "epoch": 0.5073105040400154, + "flos": 713239658496.0, + "grad_norm": 0.061804914120772665, + "language_loss": 0.84210312, + "learning_rate": 0.0005124604022894269, + "loss": 0.85314542, + "num_input_tokens_seen": 219623104, + "router_z_loss_mlp": 0.41723633, + "step": 2637, + "time_per_iteration": 2.924973726272583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047259, + "balance_loss_mlp": 1.03228605, + "epoch": 0.5075028857252789, + "flos": 1435658605056.0, + "grad_norm": 0.01918715016894911, + "language_loss": 0.77188224, + "learning_rate": 0.000512148954337001, + "loss": 0.78235483, + "num_input_tokens_seen": 219853328, + "router_z_loss_mlp": 0.14941406, + "step": 2638, + "time_per_iteration": 4.856257915496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103908, + "balance_loss_mlp": 1.06115913, + "epoch": 0.5076952674105425, + "flos": 570857946624.0, + "grad_norm": 0.0603044028086303, + "language_loss": 0.83185166, + "learning_rate": 0.0005118375016679325, + "loss": 0.84289074, + "num_input_tokens_seen": 219925024, + "router_z_loss_mlp": 0.42749023, + "step": 2639, + "time_per_iteration": 2.788266897201538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108523, + "balance_loss_mlp": 1.06651402, + "epoch": 0.5078876490958061, + "flos": 516463451136.0, + "grad_norm": 0.06423032366665075, + "language_loss": 0.8059274, + "learning_rate": 0.0005115260444031382, + "loss": 0.81701261, + "num_input_tokens_seen": 219992752, + "router_z_loss_mlp": 0.42016602, + "step": 2640, + "time_per_iteration": 2.5973188877105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036794, + "balance_loss_mlp": 1.02191687, + "epoch": 0.5080800307810697, + "flos": 1583378620416.0, + "grad_norm": 0.017407415587129545, + "language_loss": 0.78731823, + "learning_rate": 0.000511214582663537, + "loss": 0.7976861, + "num_input_tokens_seen": 220224160, + "router_z_loss_mlp": 0.1484375, + "step": 2641, + "time_per_iteration": 4.9824395179748535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107231, + "balance_loss_mlp": 1.06340933, + "epoch": 0.5082724124663333, + "flos": 484965978624.0, + "grad_norm": 0.05963770496992207, + "language_loss": 0.8711704, + "learning_rate": 0.0005109031165700483, + "loss": 0.88224268, + "num_input_tokens_seen": 220289504, + "router_z_loss_mlp": 0.43823242, + "step": 2642, + "time_per_iteration": 2.5530447959899902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103344, + "balance_loss_mlp": 1.05997539, + "epoch": 0.5084647941515967, + "flos": 681928450560.0, + "grad_norm": 0.05207490997788611, + "language_loss": 0.8334229, + "learning_rate": 0.0005105916462435945, + "loss": 0.84445643, + "num_input_tokens_seen": 220361376, + "router_z_loss_mlp": 0.43359375, + "step": 2643, + "time_per_iteration": 2.8092200756073 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101863, + "balance_loss_mlp": 1.05863762, + "epoch": 0.5086571758368603, + "flos": 548468692992.0, + "grad_norm": 0.0494294374601552, + "language_loss": 0.85464209, + "learning_rate": 0.0005102801718050989, + "loss": 0.86566073, + "num_input_tokens_seen": 220434720, + "router_z_loss_mlp": 0.43261719, + "step": 2644, + "time_per_iteration": 2.6660444736480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111917, + "balance_loss_mlp": 1.06735659, + "epoch": 0.5088495575221239, + "flos": 563751843840.0, + "grad_norm": 0.0695979688507087, + "language_loss": 0.88942361, + "learning_rate": 0.0005099686933754867, + "loss": 0.9005428, + "num_input_tokens_seen": 220506208, + "router_z_loss_mlp": 0.44580078, + "step": 2645, + "time_per_iteration": 2.673337697982788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108704, + "balance_loss_mlp": 1.06283236, + "epoch": 0.5090419392073875, + "flos": 551132757504.0, + "grad_norm": 0.05355859457172443, + "language_loss": 0.84209561, + "learning_rate": 0.0005096572110756845, + "loss": 0.85318267, + "num_input_tokens_seen": 220577456, + "router_z_loss_mlp": 0.45874023, + "step": 2646, + "time_per_iteration": 2.6638782024383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112209, + "balance_loss_mlp": 1.06686139, + "epoch": 0.509234320892651, + "flos": 567504230400.0, + "grad_norm": 0.04874041351849401, + "language_loss": 0.85460532, + "learning_rate": 0.0005093457250266205, + "loss": 0.86572737, + "num_input_tokens_seen": 220649648, + "router_z_loss_mlp": 0.45361328, + "step": 2647, + "time_per_iteration": 2.6637892723083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107252, + "balance_loss_mlp": 1.0633595, + "epoch": 0.5094267025779146, + "flos": 582339248640.0, + "grad_norm": 0.05998717956466229, + "language_loss": 0.8317883, + "learning_rate": 0.000509034235349224, + "loss": 0.84286082, + "num_input_tokens_seen": 220721168, + "router_z_loss_mlp": 0.43920898, + "step": 2648, + "time_per_iteration": 2.6878888607025146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100829, + "balance_loss_mlp": 1.05846214, + "epoch": 0.5096190842631781, + "flos": 591704953344.0, + "grad_norm": 0.05244355272630434, + "language_loss": 0.812711, + "learning_rate": 0.0005087227421644266, + "loss": 0.82371926, + "num_input_tokens_seen": 220796464, + "router_z_loss_mlp": 0.42407227, + "step": 2649, + "time_per_iteration": 2.7117576599121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106927, + "balance_loss_mlp": 1.06346333, + "epoch": 0.5098114659484417, + "flos": 513307584000.0, + "grad_norm": 0.052249476616985355, + "language_loss": 0.8603372, + "learning_rate": 0.0005084112455931602, + "loss": 0.87140644, + "num_input_tokens_seen": 220862976, + "router_z_loss_mlp": 0.43457031, + "step": 2650, + "time_per_iteration": 2.6070332527160645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106986, + "balance_loss_mlp": 1.06578696, + "epoch": 0.5100038476337053, + "flos": 484389808128.0, + "grad_norm": 0.053750245063259934, + "language_loss": 0.85138631, + "learning_rate": 0.0005080997457563586, + "loss": 0.8624562, + "num_input_tokens_seen": 220926432, + "router_z_loss_mlp": 0.41210938, + "step": 2651, + "time_per_iteration": 2.53045654296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105107, + "balance_loss_mlp": 1.06374109, + "epoch": 0.5101962293189688, + "flos": 461366157312.0, + "grad_norm": 0.06332454651149101, + "language_loss": 0.79166603, + "learning_rate": 0.0005077882427749569, + "loss": 0.80271709, + "num_input_tokens_seen": 220993008, + "router_z_loss_mlp": 0.41381836, + "step": 2652, + "time_per_iteration": 2.4946300983428955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113144, + "balance_loss_mlp": 1.07084906, + "epoch": 0.5103886110042324, + "flos": 586760937984.0, + "grad_norm": 0.06191877346451425, + "language_loss": 0.8487432, + "learning_rate": 0.0005074767367698913, + "loss": 0.85987473, + "num_input_tokens_seen": 221059248, + "router_z_loss_mlp": 0.42285156, + "step": 2653, + "time_per_iteration": 2.6763722896575928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105422, + "balance_loss_mlp": 1.06455684, + "epoch": 0.510580992689496, + "flos": 844906885632.0, + "grad_norm": 0.056937070163659766, + "language_loss": 0.83570945, + "learning_rate": 0.0005071652278620988, + "loss": 0.84676373, + "num_input_tokens_seen": 221133712, + "router_z_loss_mlp": 0.40869141, + "step": 2654, + "time_per_iteration": 3.0378835201263428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109149, + "balance_loss_mlp": 1.06706858, + "epoch": 0.5107733743747596, + "flos": 658328629248.0, + "grad_norm": 0.057649397656864075, + "language_loss": 0.83013982, + "learning_rate": 0.0005068537161725186, + "loss": 0.84123135, + "num_input_tokens_seen": 221202192, + "router_z_loss_mlp": 0.42041016, + "step": 2655, + "time_per_iteration": 2.7623610496520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105478, + "balance_loss_mlp": 1.06385016, + "epoch": 0.510965756060023, + "flos": 701426677248.0, + "grad_norm": 0.05708536741035134, + "language_loss": 0.8435111, + "learning_rate": 0.0005065422018220893, + "loss": 0.85456586, + "num_input_tokens_seen": 221277104, + "router_z_loss_mlp": 0.41601562, + "step": 2656, + "time_per_iteration": 2.823542833328247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102091, + "balance_loss_mlp": 1.06096351, + "epoch": 0.5111581377452866, + "flos": 559430489088.0, + "grad_norm": 0.05217113074905386, + "language_loss": 0.80225503, + "learning_rate": 0.0005062306849317521, + "loss": 0.81327593, + "num_input_tokens_seen": 221352320, + "router_z_loss_mlp": 0.41113281, + "step": 2657, + "time_per_iteration": 2.8275818824768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084461, + "balance_loss_mlp": 1.04314327, + "epoch": 0.5113505194305502, + "flos": 608745729024.0, + "grad_norm": 0.05701327198704139, + "language_loss": 0.83469534, + "learning_rate": 0.0005059191656224487, + "loss": 0.84553993, + "num_input_tokens_seen": 221421056, + "router_z_loss_mlp": 0.41308594, + "step": 2658, + "time_per_iteration": 2.7243552207946777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094832, + "balance_loss_mlp": 1.05158317, + "epoch": 0.5115429011158138, + "flos": 534214227456.0, + "grad_norm": 0.0458707137929394, + "language_loss": 0.89186656, + "learning_rate": 0.0005056076440151212, + "loss": 0.90281487, + "num_input_tokens_seen": 221492064, + "router_z_loss_mlp": 0.43237305, + "step": 2659, + "time_per_iteration": 2.663668632507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047323, + "balance_loss_mlp": 1.0349257, + "epoch": 0.5117352828010774, + "flos": 1361451580416.0, + "grad_norm": 0.020991592608455897, + "language_loss": 0.76288116, + "learning_rate": 0.0005052961202307133, + "loss": 0.77335441, + "num_input_tokens_seen": 221724672, + "router_z_loss_mlp": 0.12402344, + "step": 2660, + "time_per_iteration": 4.851064205169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095712, + "balance_loss_mlp": 1.05420339, + "epoch": 0.5119276644863409, + "flos": 633444046848.0, + "grad_norm": 0.05508509945890564, + "language_loss": 0.87153888, + "learning_rate": 0.0005049845943901691, + "loss": 0.882496, + "num_input_tokens_seen": 221800144, + "router_z_loss_mlp": 0.41479492, + "step": 2661, + "time_per_iteration": 2.827824831008911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085148, + "balance_loss_mlp": 1.04459286, + "epoch": 0.5121200461716044, + "flos": 585304468992.0, + "grad_norm": 0.05132624096148621, + "language_loss": 0.86219436, + "learning_rate": 0.0005046730666144338, + "loss": 0.8730458, + "num_input_tokens_seen": 221877168, + "router_z_loss_mlp": 0.40527344, + "step": 2662, + "time_per_iteration": 2.75281023979187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096524, + "balance_loss_mlp": 1.05542088, + "epoch": 0.512312427856868, + "flos": 1032081661440.0, + "grad_norm": 0.048177160037868025, + "language_loss": 0.87700105, + "learning_rate": 0.0005043615370244532, + "loss": 0.88796628, + "num_input_tokens_seen": 221964208, + "router_z_loss_mlp": 0.41113281, + "step": 2663, + "time_per_iteration": 3.3618671894073486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01028213, + "balance_loss_mlp": 1.01524341, + "epoch": 0.5125048095421316, + "flos": 1537257277440.0, + "grad_norm": 0.012858425268609664, + "language_loss": 0.78244388, + "learning_rate": 0.0005040500057411736, + "loss": 0.79272604, + "num_input_tokens_seen": 222179264, + "router_z_loss_mlp": 0.12988281, + "step": 2664, + "time_per_iteration": 4.658047914505005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093765, + "balance_loss_mlp": 1.05292368, + "epoch": 0.5126971912273951, + "flos": 590814480384.0, + "grad_norm": 0.04944817886166227, + "language_loss": 0.85279715, + "learning_rate": 0.0005037384728855425, + "loss": 0.86373478, + "num_input_tokens_seen": 222259504, + "router_z_loss_mlp": 0.40820312, + "step": 2665, + "time_per_iteration": 2.8461544513702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098547, + "balance_loss_mlp": 1.05620384, + "epoch": 0.5128895729126587, + "flos": 551393215488.0, + "grad_norm": 0.158979293172939, + "language_loss": 0.84343994, + "learning_rate": 0.0005034269385785075, + "loss": 0.85442543, + "num_input_tokens_seen": 222330512, + "router_z_loss_mlp": 0.42333984, + "step": 2666, + "time_per_iteration": 2.651714563369751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092703, + "balance_loss_mlp": 1.05222011, + "epoch": 0.5130819545979223, + "flos": 481031709696.0, + "grad_norm": 0.06506731950678159, + "language_loss": 0.84809029, + "learning_rate": 0.0005031154029410168, + "loss": 0.85901731, + "num_input_tokens_seen": 222394000, + "router_z_loss_mlp": 0.40478516, + "step": 2667, + "time_per_iteration": 2.5316364765167236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095809, + "balance_loss_mlp": 1.05368042, + "epoch": 0.5132743362831859, + "flos": 475556603904.0, + "grad_norm": 0.06903413954772, + "language_loss": 0.86695576, + "learning_rate": 0.0005028038660940197, + "loss": 0.87791383, + "num_input_tokens_seen": 222459344, + "router_z_loss_mlp": 0.42138672, + "step": 2668, + "time_per_iteration": 2.521328926086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090159, + "balance_loss_mlp": 1.04962766, + "epoch": 0.5134667179684494, + "flos": 503559175680.0, + "grad_norm": 0.047102953885103854, + "language_loss": 0.84545898, + "learning_rate": 0.0005024923281584648, + "loss": 0.85636055, + "num_input_tokens_seen": 222528912, + "router_z_loss_mlp": 0.4050293, + "step": 2669, + "time_per_iteration": 2.6462371349334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092047, + "balance_loss_mlp": 1.05330372, + "epoch": 0.5136590996537129, + "flos": 503647925760.0, + "grad_norm": 0.04719667862832961, + "language_loss": 0.82488692, + "learning_rate": 0.0005021807892553026, + "loss": 0.83580744, + "num_input_tokens_seen": 222604704, + "router_z_loss_mlp": 0.38696289, + "step": 2670, + "time_per_iteration": 2.732416868209839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094857, + "balance_loss_mlp": 1.05370605, + "epoch": 0.5138514813389765, + "flos": 624330035712.0, + "grad_norm": 0.05149766622145395, + "language_loss": 0.84497285, + "learning_rate": 0.0005018692495054828, + "loss": 0.85592139, + "num_input_tokens_seen": 222677888, + "router_z_loss_mlp": 0.41137695, + "step": 2671, + "time_per_iteration": 2.760014533996582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092038, + "balance_loss_mlp": 1.05174494, + "epoch": 0.5140438630242401, + "flos": 583274801664.0, + "grad_norm": 0.05511271146100304, + "language_loss": 0.80692601, + "learning_rate": 0.0005015577090299561, + "loss": 0.81784636, + "num_input_tokens_seen": 222751936, + "router_z_loss_mlp": 0.40283203, + "step": 2672, + "time_per_iteration": 2.6871819496154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102168, + "balance_loss_mlp": 1.06046844, + "epoch": 0.5142362447095037, + "flos": 487683887616.0, + "grad_norm": 0.05906966789334332, + "language_loss": 0.86718851, + "learning_rate": 0.0005012461679496729, + "loss": 0.87821019, + "num_input_tokens_seen": 222819616, + "router_z_loss_mlp": 0.41674805, + "step": 2673, + "time_per_iteration": 2.573075771331787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111889, + "balance_loss_mlp": 1.06968939, + "epoch": 0.5144286263947672, + "flos": 526601765376.0, + "grad_norm": 0.050226260736663565, + "language_loss": 0.87357539, + "learning_rate": 0.0005009346263855848, + "loss": 0.88469428, + "num_input_tokens_seen": 222888448, + "router_z_loss_mlp": 0.42211914, + "step": 2674, + "time_per_iteration": 2.6014504432678223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100642, + "balance_loss_mlp": 1.06106424, + "epoch": 0.5146210080800308, + "flos": 486252149760.0, + "grad_norm": 0.047502810841318265, + "language_loss": 0.8393209, + "learning_rate": 0.0005006230844586422, + "loss": 0.85032737, + "num_input_tokens_seen": 222964736, + "router_z_loss_mlp": 0.39599609, + "step": 2675, + "time_per_iteration": 2.7817234992980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102843, + "balance_loss_mlp": 1.06152487, + "epoch": 0.5148133897652943, + "flos": 515622440448.0, + "grad_norm": 0.04472754928085029, + "language_loss": 0.79101396, + "learning_rate": 0.0005003115422897968, + "loss": 0.80204242, + "num_input_tokens_seen": 223040944, + "router_z_loss_mlp": 0.4128418, + "step": 2676, + "time_per_iteration": 2.72664213180542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102609, + "balance_loss_mlp": 1.06243563, + "epoch": 0.5150057714505579, + "flos": 510963614208.0, + "grad_norm": 0.061230997357755966, + "language_loss": 0.86760038, + "learning_rate": 0.0005, + "loss": 0.87862647, + "num_input_tokens_seen": 223109632, + "router_z_loss_mlp": 0.40161133, + "step": 2677, + "time_per_iteration": 2.6518850326538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095319, + "balance_loss_mlp": 1.05648041, + "epoch": 0.5151981531358215, + "flos": 910541164032.0, + "grad_norm": 0.056847893934042666, + "language_loss": 0.79409456, + "learning_rate": 0.0004996884577102033, + "loss": 0.80504775, + "num_input_tokens_seen": 223191648, + "router_z_loss_mlp": 0.38818359, + "step": 2678, + "time_per_iteration": 3.0679850578308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096526, + "balance_loss_mlp": 1.05623293, + "epoch": 0.515390534821085, + "flos": 471599013888.0, + "grad_norm": 0.047432465044858714, + "language_loss": 0.8447082, + "learning_rate": 0.000499376915541358, + "loss": 0.85567349, + "num_input_tokens_seen": 223265920, + "router_z_loss_mlp": 0.40283203, + "step": 2679, + "time_per_iteration": 2.6785037517547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099943, + "balance_loss_mlp": 1.06086659, + "epoch": 0.5155829165063486, + "flos": 649811137536.0, + "grad_norm": 0.04795230358992159, + "language_loss": 0.81296241, + "learning_rate": 0.0004990653736144155, + "loss": 0.82396191, + "num_input_tokens_seen": 223340688, + "router_z_loss_mlp": 0.390625, + "step": 2680, + "time_per_iteration": 2.840188980102539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100494, + "balance_loss_mlp": 1.06072533, + "epoch": 0.5157752981916122, + "flos": 414038476800.0, + "grad_norm": 0.062126395708719404, + "language_loss": 0.86077356, + "learning_rate": 0.0004987538320503271, + "loss": 0.87177849, + "num_input_tokens_seen": 223404064, + "router_z_loss_mlp": 0.3972168, + "step": 2681, + "time_per_iteration": 2.4594664573669434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100598, + "balance_loss_mlp": 1.06054354, + "epoch": 0.5159676798768758, + "flos": 553569859584.0, + "grad_norm": 0.05537703124714055, + "language_loss": 0.82735646, + "learning_rate": 0.0004984422909700442, + "loss": 0.83836246, + "num_input_tokens_seen": 223476784, + "router_z_loss_mlp": 0.39990234, + "step": 2682, + "time_per_iteration": 2.66052508354187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091816, + "balance_loss_mlp": 1.05292952, + "epoch": 0.5161600615621393, + "flos": 586234229760.0, + "grad_norm": 0.051780542585777085, + "language_loss": 0.83951235, + "learning_rate": 0.0004981307504945173, + "loss": 0.85043043, + "num_input_tokens_seen": 223542832, + "router_z_loss_mlp": 0.38867188, + "step": 2683, + "time_per_iteration": 2.6698381900787354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109715, + "balance_loss_mlp": 1.05766809, + "epoch": 0.5163524432474028, + "flos": 588568025088.0, + "grad_norm": 0.05164690349476628, + "language_loss": 0.8939817, + "learning_rate": 0.0004978192107446976, + "loss": 0.90495312, + "num_input_tokens_seen": 223617968, + "router_z_loss_mlp": 0.39428711, + "step": 2684, + "time_per_iteration": 2.7249348163604736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095053, + "balance_loss_mlp": 1.05325842, + "epoch": 0.5165448249326664, + "flos": 503642133504.0, + "grad_norm": 0.05677264338484585, + "language_loss": 0.87172639, + "learning_rate": 0.0004975076718415353, + "loss": 0.8826769, + "num_input_tokens_seen": 223689504, + "router_z_loss_mlp": 0.41796875, + "step": 2685, + "time_per_iteration": 2.599235773086548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086892, + "balance_loss_mlp": 1.04676652, + "epoch": 0.51673720661793, + "flos": 416539597824.0, + "grad_norm": 0.05087662124677675, + "language_loss": 0.90954995, + "learning_rate": 0.0004971961339059806, + "loss": 0.92041892, + "num_input_tokens_seen": 223752288, + "router_z_loss_mlp": 0.40112305, + "step": 2686, + "time_per_iteration": 2.4647631645202637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091735, + "balance_loss_mlp": 1.04986906, + "epoch": 0.5169295883031936, + "flos": 598696164864.0, + "grad_norm": 0.1190187036629449, + "language_loss": 0.83923638, + "learning_rate": 0.0004968845970589832, + "loss": 0.85015374, + "num_input_tokens_seen": 223822304, + "router_z_loss_mlp": 0.41870117, + "step": 2687, + "time_per_iteration": 2.6631908416748047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087993, + "balance_loss_mlp": 1.04793859, + "epoch": 0.517121969988457, + "flos": 556543844352.0, + "grad_norm": 0.06869038553700607, + "language_loss": 0.8455354, + "learning_rate": 0.0004965730614214926, + "loss": 0.85641533, + "num_input_tokens_seen": 223888592, + "router_z_loss_mlp": 0.40039062, + "step": 2688, + "time_per_iteration": 2.628286361694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098374, + "balance_loss_mlp": 1.05576849, + "epoch": 0.5173143516737206, + "flos": 469214346240.0, + "grad_norm": 0.05001993876024353, + "language_loss": 0.85256827, + "learning_rate": 0.0004962615271144576, + "loss": 0.86355197, + "num_input_tokens_seen": 223952880, + "router_z_loss_mlp": 0.42602539, + "step": 2689, + "time_per_iteration": 2.5224428176879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091114, + "balance_loss_mlp": 1.05017805, + "epoch": 0.5175067333589842, + "flos": 719739067392.0, + "grad_norm": 0.0600896413832987, + "language_loss": 0.82435369, + "learning_rate": 0.0004959499942588264, + "loss": 0.8352648, + "num_input_tokens_seen": 224030000, + "router_z_loss_mlp": 0.40917969, + "step": 2690, + "time_per_iteration": 2.923792600631714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043722, + "balance_loss_mlp": 1.02932107, + "epoch": 0.5176991150442478, + "flos": 1465402834944.0, + "grad_norm": 0.02659438930583784, + "language_loss": 0.78200024, + "learning_rate": 0.0004956384629755469, + "loss": 0.79243743, + "num_input_tokens_seen": 224252384, + "router_z_loss_mlp": 0.14355469, + "step": 2691, + "time_per_iteration": 4.779648542404175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089552, + "balance_loss_mlp": 1.04863954, + "epoch": 0.5178914967295114, + "flos": 612345346560.0, + "grad_norm": 0.05374555179207371, + "language_loss": 0.85215712, + "learning_rate": 0.0004953269333855661, + "loss": 0.86305267, + "num_input_tokens_seen": 224324640, + "router_z_loss_mlp": 0.40917969, + "step": 2692, + "time_per_iteration": 2.7646090984344482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086121, + "balance_loss_mlp": 1.04604328, + "epoch": 0.5180838784147749, + "flos": 500663766528.0, + "grad_norm": 0.05670677168127033, + "language_loss": 0.84148359, + "learning_rate": 0.0004950154056098309, + "loss": 0.85234475, + "num_input_tokens_seen": 224398368, + "router_z_loss_mlp": 0.40039062, + "step": 2693, + "time_per_iteration": 2.7038145065307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088689, + "balance_loss_mlp": 1.0469892, + "epoch": 0.5182762601000385, + "flos": 688531166208.0, + "grad_norm": 0.05599909013755839, + "language_loss": 0.84343493, + "learning_rate": 0.0004947038797692867, + "loss": 0.85432184, + "num_input_tokens_seen": 224465456, + "router_z_loss_mlp": 0.41699219, + "step": 2694, + "time_per_iteration": 2.8155903816223145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092198, + "balance_loss_mlp": 1.05147612, + "epoch": 0.518468641785302, + "flos": 665315458560.0, + "grad_norm": 0.046372715162849826, + "language_loss": 0.77593923, + "learning_rate": 0.0004943923559848789, + "loss": 0.7868613, + "num_input_tokens_seen": 224540960, + "router_z_loss_mlp": 0.40698242, + "step": 2695, + "time_per_iteration": 2.787229061126709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086627, + "balance_loss_mlp": 1.04714453, + "epoch": 0.5186610234705656, + "flos": 566440639488.0, + "grad_norm": 0.05332286724917534, + "language_loss": 0.89972508, + "learning_rate": 0.0004940808343775515, + "loss": 0.9105913, + "num_input_tokens_seen": 224613200, + "router_z_loss_mlp": 0.39453125, + "step": 2696, + "time_per_iteration": 2.6648201942443848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083742, + "balance_loss_mlp": 1.04292464, + "epoch": 0.5188534051558291, + "flos": 428652324864.0, + "grad_norm": 0.055572994373314345, + "language_loss": 0.82251114, + "learning_rate": 0.0004937693150682479, + "loss": 0.83334857, + "num_input_tokens_seen": 224677456, + "router_z_loss_mlp": 0.40820312, + "step": 2697, + "time_per_iteration": 2.5013554096221924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089603, + "balance_loss_mlp": 1.04804635, + "epoch": 0.5190457868410927, + "flos": 546085435392.0, + "grad_norm": 0.05634548635888483, + "language_loss": 0.7652837, + "learning_rate": 0.0004934577981779107, + "loss": 0.77617967, + "num_input_tokens_seen": 224745600, + "router_z_loss_mlp": 0.41552734, + "step": 2698, + "time_per_iteration": 2.7512943744659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092838, + "balance_loss_mlp": 1.04958856, + "epoch": 0.5192381685263563, + "flos": 548321716224.0, + "grad_norm": 0.04670174030259061, + "language_loss": 0.81419832, + "learning_rate": 0.0004931462838274817, + "loss": 0.82512677, + "num_input_tokens_seen": 224826944, + "router_z_loss_mlp": 0.43237305, + "step": 2699, + "time_per_iteration": 2.8294084072113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082299, + "balance_loss_mlp": 1.04296041, + "epoch": 0.5194305502116199, + "flos": 574993036800.0, + "grad_norm": 0.05440575131052059, + "language_loss": 0.83835357, + "learning_rate": 0.0004928347721379011, + "loss": 0.84917653, + "num_input_tokens_seen": 224895280, + "router_z_loss_mlp": 0.39331055, + "step": 2700, + "time_per_iteration": 2.643941879272461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084407, + "balance_loss_mlp": 1.04485357, + "epoch": 0.5196229318968835, + "flos": 434019741696.0, + "grad_norm": 0.054958496552239416, + "language_loss": 0.81611145, + "learning_rate": 0.0004925232632301089, + "loss": 0.8269555, + "num_input_tokens_seen": 224961632, + "router_z_loss_mlp": 0.39526367, + "step": 2701, + "time_per_iteration": 2.5408122539520264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084434, + "balance_loss_mlp": 1.04638255, + "epoch": 0.5198153135821469, + "flos": 558607007232.0, + "grad_norm": 0.05193596738822722, + "language_loss": 0.79534626, + "learning_rate": 0.0004922117572250431, + "loss": 0.80619061, + "num_input_tokens_seen": 225032816, + "router_z_loss_mlp": 0.38037109, + "step": 2702, + "time_per_iteration": 2.6687467098236084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078961, + "balance_loss_mlp": 1.04152906, + "epoch": 0.5200076952674105, + "flos": 565397397504.0, + "grad_norm": 0.04814908286006495, + "language_loss": 0.80652344, + "learning_rate": 0.0004919002542436414, + "loss": 0.81731308, + "num_input_tokens_seen": 225112736, + "router_z_loss_mlp": 0.37451172, + "step": 2703, + "time_per_iteration": 2.811460256576538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085168, + "balance_loss_mlp": 1.04644859, + "epoch": 0.5202000769526741, + "flos": 570916173312.0, + "grad_norm": 0.05555982935463854, + "language_loss": 0.81149572, + "learning_rate": 0.0004915887544068399, + "loss": 0.8223474, + "num_input_tokens_seen": 225182672, + "router_z_loss_mlp": 0.38720703, + "step": 2704, + "time_per_iteration": 2.6499714851379395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093505, + "balance_loss_mlp": 1.05199671, + "epoch": 0.5203924586379377, + "flos": 693898583040.0, + "grad_norm": 0.050837486186397586, + "language_loss": 0.77994883, + "learning_rate": 0.0004912772578355736, + "loss": 0.7908839, + "num_input_tokens_seen": 225260272, + "router_z_loss_mlp": 0.41503906, + "step": 2705, + "time_per_iteration": 2.8637514114379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094973, + "balance_loss_mlp": 1.0555619, + "epoch": 0.5205848403232012, + "flos": 566215087104.0, + "grad_norm": 0.054100857686445215, + "language_loss": 0.8301729, + "learning_rate": 0.000490965764650776, + "loss": 0.84112263, + "num_input_tokens_seen": 225337120, + "router_z_loss_mlp": 0.39404297, + "step": 2706, + "time_per_iteration": 2.8644323348999023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085669, + "balance_loss_mlp": 1.04661632, + "epoch": 0.5207772220084648, + "flos": 1213775539200.0, + "grad_norm": 0.05228956126941533, + "language_loss": 0.82813179, + "learning_rate": 0.0004906542749733798, + "loss": 0.83898848, + "num_input_tokens_seen": 225433984, + "router_z_loss_mlp": 0.39013672, + "step": 2707, + "time_per_iteration": 3.6128242015838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084528, + "balance_loss_mlp": 1.04635715, + "epoch": 0.5209696036937284, + "flos": 592547374080.0, + "grad_norm": 0.12708447176708407, + "language_loss": 0.84871459, + "learning_rate": 0.0004903427889243156, + "loss": 0.85955989, + "num_input_tokens_seen": 225512112, + "router_z_loss_mlp": 0.38134766, + "step": 2708, + "time_per_iteration": 2.86226487159729 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109093, + "balance_loss_mlp": 1.05211544, + "epoch": 0.5211619853789919, + "flos": 522623826432.0, + "grad_norm": 0.05348625186790992, + "language_loss": 0.85548282, + "learning_rate": 0.0004900313066245134, + "loss": 0.86639208, + "num_input_tokens_seen": 225586944, + "router_z_loss_mlp": 0.38818359, + "step": 2709, + "time_per_iteration": 2.662485122680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081893, + "balance_loss_mlp": 1.0432452, + "epoch": 0.5213543670642555, + "flos": 502534872576.0, + "grad_norm": 0.050688452880556414, + "language_loss": 0.80490649, + "learning_rate": 0.0004897198281949012, + "loss": 0.81572545, + "num_input_tokens_seen": 225657184, + "router_z_loss_mlp": 0.38647461, + "step": 2710, + "time_per_iteration": 2.6449263095855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085994, + "balance_loss_mlp": 1.04636908, + "epoch": 0.521546748749519, + "flos": 585682790400.0, + "grad_norm": 0.05860885905894002, + "language_loss": 0.77534401, + "learning_rate": 0.0004894083537564057, + "loss": 0.78620392, + "num_input_tokens_seen": 225729968, + "router_z_loss_mlp": 0.39599609, + "step": 2711, + "time_per_iteration": 2.7473373413085938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083493, + "balance_loss_mlp": 1.04458284, + "epoch": 0.5217391304347826, + "flos": 569833643520.0, + "grad_norm": 0.04954385524753536, + "language_loss": 0.80801934, + "learning_rate": 0.0004890968834299519, + "loss": 0.81885427, + "num_input_tokens_seen": 225801808, + "router_z_loss_mlp": 0.38867188, + "step": 2712, + "time_per_iteration": 2.7709779739379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084865, + "balance_loss_mlp": 1.04621696, + "epoch": 0.5219315121200462, + "flos": 542501784576.0, + "grad_norm": 0.06807472429400872, + "language_loss": 0.78801876, + "learning_rate": 0.0004887854173364633, + "loss": 0.7988674, + "num_input_tokens_seen": 225878576, + "router_z_loss_mlp": 0.38623047, + "step": 2713, + "time_per_iteration": 2.710489273071289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084971, + "balance_loss_mlp": 1.04713416, + "epoch": 0.5221238938053098, + "flos": 550006557696.0, + "grad_norm": 0.048000843690728094, + "language_loss": 0.81816071, + "learning_rate": 0.0004884739555968617, + "loss": 0.82901043, + "num_input_tokens_seen": 225960096, + "router_z_loss_mlp": 0.37866211, + "step": 2714, + "time_per_iteration": 2.8097493648529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01030446, + "balance_loss_mlp": 1.01785719, + "epoch": 0.5223162754905732, + "flos": 1354373028864.0, + "grad_norm": 0.016208306264550634, + "language_loss": 0.78977054, + "learning_rate": 0.0004881624983320676, + "loss": 0.80007499, + "num_input_tokens_seen": 226184960, + "router_z_loss_mlp": 0.12597656, + "step": 2715, + "time_per_iteration": 4.9789557456970215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083164, + "balance_loss_mlp": 1.04444456, + "epoch": 0.5225086571758368, + "flos": 567441621504.0, + "grad_norm": 0.04806245104826077, + "language_loss": 0.86670554, + "learning_rate": 0.0004878510456629992, + "loss": 0.87753725, + "num_input_tokens_seen": 226271328, + "router_z_loss_mlp": 0.38696289, + "step": 2716, + "time_per_iteration": 3.015443801879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084237, + "balance_loss_mlp": 1.0459466, + "epoch": 0.5227010388611004, + "flos": 499914478080.0, + "grad_norm": 0.051081355886524536, + "language_loss": 0.85046101, + "learning_rate": 0.00048753959771057314, + "loss": 0.86130333, + "num_input_tokens_seen": 226340080, + "router_z_loss_mlp": 0.3828125, + "step": 2717, + "time_per_iteration": 2.623352289199829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084285, + "balance_loss_mlp": 1.04539871, + "epoch": 0.522893420546364, + "flos": 597372115968.0, + "grad_norm": 0.0531417340924391, + "language_loss": 0.82181746, + "learning_rate": 0.0004872281545957044, + "loss": 0.83266038, + "num_input_tokens_seen": 226415120, + "router_z_loss_mlp": 0.38842773, + "step": 2718, + "time_per_iteration": 2.7300612926483154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080864, + "balance_loss_mlp": 1.04154897, + "epoch": 0.5230858022316276, + "flos": 664278008832.0, + "grad_norm": 0.05093940259468129, + "language_loss": 0.85964847, + "learning_rate": 0.0004869167164393055, + "loss": 0.87045711, + "num_input_tokens_seen": 226501200, + "router_z_loss_mlp": 0.39306641, + "step": 2719, + "time_per_iteration": 2.9219412803649902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108043, + "balance_loss_mlp": 1.04206884, + "epoch": 0.5232781839168911, + "flos": 603547047936.0, + "grad_norm": 0.04294663688852852, + "language_loss": 0.89195794, + "learning_rate": 0.00048660528336228793, + "loss": 0.90276217, + "num_input_tokens_seen": 226582064, + "router_z_loss_mlp": 0.38330078, + "step": 2720, + "time_per_iteration": 2.7792000770568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076128, + "balance_loss_mlp": 1.03781438, + "epoch": 0.5234705656021547, + "flos": 550438723584.0, + "grad_norm": 0.04780199229625597, + "language_loss": 0.90052795, + "learning_rate": 0.0004862938554855606, + "loss": 0.91128922, + "num_input_tokens_seen": 226656448, + "router_z_loss_mlp": 0.3828125, + "step": 2721, + "time_per_iteration": 2.781075954437256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083526, + "balance_loss_mlp": 1.04509294, + "epoch": 0.5236629472874182, + "flos": 504026247168.0, + "grad_norm": 0.06026541291367098, + "language_loss": 0.85920995, + "learning_rate": 0.0004859824329300304, + "loss": 0.87004519, + "num_input_tokens_seen": 226725568, + "router_z_loss_mlp": 0.3840332, + "step": 2722, + "time_per_iteration": 2.5523464679718018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078682, + "balance_loss_mlp": 1.04043949, + "epoch": 0.5238553289726818, + "flos": 547394927616.0, + "grad_norm": 0.04759572809953804, + "language_loss": 0.83678633, + "learning_rate": 0.00048567101581660244, + "loss": 0.84757316, + "num_input_tokens_seen": 226795728, + "router_z_loss_mlp": 0.38208008, + "step": 2723, + "time_per_iteration": 2.62168288230896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081139, + "balance_loss_mlp": 1.04208636, + "epoch": 0.5240477106579453, + "flos": 531702931968.0, + "grad_norm": 0.060086559712579084, + "language_loss": 0.87061596, + "learning_rate": 0.00048535960426617956, + "loss": 0.88142729, + "num_input_tokens_seen": 226865344, + "router_z_loss_mlp": 0.39038086, + "step": 2724, + "time_per_iteration": 2.5913078784942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081015, + "balance_loss_mlp": 1.04208124, + "epoch": 0.5242400923432089, + "flos": 617653126656.0, + "grad_norm": 0.05554996608046291, + "language_loss": 0.81582165, + "learning_rate": 0.0004850481983996621, + "loss": 0.82663178, + "num_input_tokens_seen": 226936800, + "router_z_loss_mlp": 0.3894043, + "step": 2725, + "time_per_iteration": 2.744001865386963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083025, + "balance_loss_mlp": 1.04366207, + "epoch": 0.5244324740284725, + "flos": 416461022208.0, + "grad_norm": 0.051041166575027594, + "language_loss": 0.87690443, + "learning_rate": 0.0004847367983379492, + "loss": 0.88773465, + "num_input_tokens_seen": 226998448, + "router_z_loss_mlp": 0.39331055, + "step": 2726, + "time_per_iteration": 2.452622652053833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081299, + "balance_loss_mlp": 1.04327154, + "epoch": 0.5246248557137361, + "flos": 626113801728.0, + "grad_norm": 0.0465947896589182, + "language_loss": 0.7866348, + "learning_rate": 0.00048442540420193643, + "loss": 0.7974478, + "num_input_tokens_seen": 227081872, + "router_z_loss_mlp": 0.38012695, + "step": 2727, + "time_per_iteration": 2.8958897590637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085515, + "balance_loss_mlp": 1.04524565, + "epoch": 0.5248172373989997, + "flos": 1247980746240.0, + "grad_norm": 0.0639927904505779, + "language_loss": 0.79006433, + "learning_rate": 0.0004841140161125182, + "loss": 0.80091947, + "num_input_tokens_seen": 227167744, + "router_z_loss_mlp": 0.40234375, + "step": 2728, + "time_per_iteration": 3.5769736766815186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109207, + "balance_loss_mlp": 1.05370796, + "epoch": 0.5250096190842631, + "flos": 506616118272.0, + "grad_norm": 0.05909227072060698, + "language_loss": 0.84801137, + "learning_rate": 0.0004838026341905857, + "loss": 0.85893214, + "num_input_tokens_seen": 227239136, + "router_z_loss_mlp": 0.38354492, + "step": 2729, + "time_per_iteration": 2.6979076862335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082745, + "balance_loss_mlp": 1.04476523, + "epoch": 0.5252020007695267, + "flos": 611021297664.0, + "grad_norm": 0.0531469423300266, + "language_loss": 0.85391581, + "learning_rate": 0.00048349125855702844, + "loss": 0.86474323, + "num_input_tokens_seen": 227311968, + "router_z_loss_mlp": 0.37915039, + "step": 2730, + "time_per_iteration": 2.7757534980773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084625, + "balance_loss_mlp": 1.04669309, + "epoch": 0.5253943824547903, + "flos": 538970568192.0, + "grad_norm": 0.04649712268604906, + "language_loss": 0.81255782, + "learning_rate": 0.00048317988933273287, + "loss": 0.82340407, + "num_input_tokens_seen": 227385248, + "router_z_loss_mlp": 0.37939453, + "step": 2731, + "time_per_iteration": 2.7401769161224365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094943, + "balance_loss_mlp": 1.05476904, + "epoch": 0.5255867641400539, + "flos": 697714988544.0, + "grad_norm": 0.05136039584795155, + "language_loss": 0.82178587, + "learning_rate": 0.00048286852663858367, + "loss": 0.8327353, + "num_input_tokens_seen": 227464640, + "router_z_loss_mlp": 0.40161133, + "step": 2732, + "time_per_iteration": 2.9572720527648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088204, + "balance_loss_mlp": 1.05084419, + "epoch": 0.5257791458253175, + "flos": 666975568896.0, + "grad_norm": 0.08443038207797475, + "language_loss": 0.83823925, + "learning_rate": 0.000482557170595462, + "loss": 0.84912133, + "num_input_tokens_seen": 227542192, + "router_z_loss_mlp": 0.37304688, + "step": 2733, + "time_per_iteration": 2.881659746170044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092705, + "balance_loss_mlp": 1.05443931, + "epoch": 0.525971527510581, + "flos": 483375679488.0, + "grad_norm": 0.04826672636793544, + "language_loss": 0.87744856, + "learning_rate": 0.0004822458213242475, + "loss": 0.88837564, + "num_input_tokens_seen": 227606096, + "router_z_loss_mlp": 0.38232422, + "step": 2734, + "time_per_iteration": 2.5599043369293213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090689, + "balance_loss_mlp": 1.05270863, + "epoch": 0.5261639091958445, + "flos": 829559715840.0, + "grad_norm": 0.055467035242162094, + "language_loss": 0.85945731, + "learning_rate": 0.00048193447894581627, + "loss": 0.87036419, + "num_input_tokens_seen": 227689552, + "router_z_loss_mlp": 0.37988281, + "step": 2735, + "time_per_iteration": 3.1253552436828613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102448, + "balance_loss_mlp": 1.06258464, + "epoch": 0.5263562908811081, + "flos": 520461739008.0, + "grad_norm": 0.05936611315903256, + "language_loss": 0.87591684, + "learning_rate": 0.00048162314358104243, + "loss": 0.88694137, + "num_input_tokens_seen": 227760784, + "router_z_loss_mlp": 0.39868164, + "step": 2736, + "time_per_iteration": 2.5996334552764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094957, + "balance_loss_mlp": 1.05704832, + "epoch": 0.5265486725663717, + "flos": 574722404352.0, + "grad_norm": 0.047689297469847035, + "language_loss": 0.82871807, + "learning_rate": 0.0004813118153507969, + "loss": 0.83966762, + "num_input_tokens_seen": 227834304, + "router_z_loss_mlp": 0.37890625, + "step": 2737, + "time_per_iteration": 2.7455976009368896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058466, + "balance_loss_mlp": 1.04540098, + "epoch": 0.5267410542516352, + "flos": 1546439537664.0, + "grad_norm": 0.021507379855054985, + "language_loss": 0.82447124, + "learning_rate": 0.0004810004943759482, + "loss": 0.83505595, + "num_input_tokens_seen": 228057232, + "router_z_loss_mlp": 0.13085938, + "step": 2738, + "time_per_iteration": 4.774937629699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110016, + "balance_loss_mlp": 1.06184578, + "epoch": 0.5269334359368988, + "flos": 929576701440.0, + "grad_norm": 0.045277698895202834, + "language_loss": 0.83199632, + "learning_rate": 0.00048068918077736163, + "loss": 0.84299791, + "num_input_tokens_seen": 228140816, + "router_z_loss_mlp": 0.38305664, + "step": 2739, + "time_per_iteration": 3.253458261489868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102121, + "balance_loss_mlp": 1.06256771, + "epoch": 0.5271258176221624, + "flos": 655079629824.0, + "grad_norm": 0.05720476143842487, + "language_loss": 0.81167477, + "learning_rate": 0.0004803778746759001, + "loss": 0.82269597, + "num_input_tokens_seen": 228216208, + "router_z_loss_mlp": 0.39526367, + "step": 2740, + "time_per_iteration": 2.890253782272339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095422, + "balance_loss_mlp": 1.05777621, + "epoch": 0.527318199307426, + "flos": 542781181440.0, + "grad_norm": 0.064499445698322, + "language_loss": 0.81573081, + "learning_rate": 0.00048006657619242317, + "loss": 0.82668501, + "num_input_tokens_seen": 228283184, + "router_z_loss_mlp": 0.37646484, + "step": 2741, + "time_per_iteration": 2.696274518966675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104347, + "balance_loss_mlp": 1.06419694, + "epoch": 0.5275105809926895, + "flos": 447629635584.0, + "grad_norm": 0.05845576302131632, + "language_loss": 0.78272831, + "learning_rate": 0.00047975528544778775, + "loss": 0.79377174, + "num_input_tokens_seen": 228351328, + "router_z_loss_mlp": 0.40112305, + "step": 2742, + "time_per_iteration": 2.6140294075012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094189, + "balance_loss_mlp": 1.05508804, + "epoch": 0.527702962677953, + "flos": 578656673280.0, + "grad_norm": 0.058395918180573554, + "language_loss": 0.88265073, + "learning_rate": 0.00047944400256284754, + "loss": 0.89359266, + "num_input_tokens_seen": 228423632, + "router_z_loss_mlp": 0.39086914, + "step": 2743, + "time_per_iteration": 2.7063393592834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097827, + "balance_loss_mlp": 1.0614922, + "epoch": 0.5278953443632166, + "flos": 652465027584.0, + "grad_norm": 0.07282412653967131, + "language_loss": 0.79796684, + "learning_rate": 0.0004791327276584532, + "loss": 0.80894512, + "num_input_tokens_seen": 228498736, + "router_z_loss_mlp": 0.36352539, + "step": 2744, + "time_per_iteration": 2.8260412216186523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109844, + "balance_loss_mlp": 1.06031692, + "epoch": 0.5280877260484802, + "flos": 513741159936.0, + "grad_norm": 0.04991281876590649, + "language_loss": 0.80703586, + "learning_rate": 0.00047882146085545264, + "loss": 0.81802028, + "num_input_tokens_seen": 228569056, + "router_z_loss_mlp": 0.38061523, + "step": 2745, + "time_per_iteration": 2.6051464080810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018989, + "balance_loss_mlp": 1.00611436, + "epoch": 0.5282801077337438, + "flos": 1444650370560.0, + "grad_norm": 0.010819489631099216, + "language_loss": 0.75402379, + "learning_rate": 0.00047851020227469, + "loss": 0.76421368, + "num_input_tokens_seen": 228800560, + "router_z_loss_mlp": 0.12890625, + "step": 2746, + "time_per_iteration": 4.9944517612457275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084488, + "balance_loss_mlp": 1.0470562, + "epoch": 0.5284724894190073, + "flos": 604580115456.0, + "grad_norm": 0.058273426421755106, + "language_loss": 0.79290295, + "learning_rate": 0.00047819895203700684, + "loss": 0.80374789, + "num_input_tokens_seen": 228869216, + "router_z_loss_mlp": 0.37451172, + "step": 2747, + "time_per_iteration": 2.728018045425415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016603, + "balance_loss_mlp": 1.00410998, + "epoch": 0.5286648711042709, + "flos": 1494172224000.0, + "grad_norm": 0.012264329558562137, + "language_loss": 0.75512433, + "learning_rate": 0.0004778877102632412, + "loss": 0.76529038, + "num_input_tokens_seen": 229085520, + "router_z_loss_mlp": 0.125, + "step": 2748, + "time_per_iteration": 4.659038782119751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077352, + "balance_loss_mlp": 1.03860867, + "epoch": 0.5288572527895344, + "flos": 597313889280.0, + "grad_norm": 0.056212558578819974, + "language_loss": 0.88259304, + "learning_rate": 0.0004775764770742277, + "loss": 0.89336658, + "num_input_tokens_seen": 229160912, + "router_z_loss_mlp": 0.38720703, + "step": 2749, + "time_per_iteration": 2.845102548599243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086383, + "balance_loss_mlp": 1.04699659, + "epoch": 0.529049634474798, + "flos": 557041439232.0, + "grad_norm": 0.05924821658857843, + "language_loss": 0.86565638, + "learning_rate": 0.00047726525259079777, + "loss": 0.87652022, + "num_input_tokens_seen": 229235792, + "router_z_loss_mlp": 0.39404297, + "step": 2750, + "time_per_iteration": 2.773296356201172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085746, + "balance_loss_mlp": 1.04793251, + "epoch": 0.5292420161600616, + "flos": 580986086400.0, + "grad_norm": 0.05670035904014211, + "language_loss": 0.885436, + "learning_rate": 0.0004769540369337798, + "loss": 0.89629346, + "num_input_tokens_seen": 229309984, + "router_z_loss_mlp": 0.37792969, + "step": 2751, + "time_per_iteration": 2.715921401977539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084379, + "balance_loss_mlp": 1.04563594, + "epoch": 0.5294343978453251, + "flos": 607989086208.0, + "grad_norm": 0.05448198338431079, + "language_loss": 0.86051679, + "learning_rate": 0.00047664283022399794, + "loss": 0.87136054, + "num_input_tokens_seen": 229394000, + "router_z_loss_mlp": 0.38720703, + "step": 2752, + "time_per_iteration": 2.8683502674102783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078933, + "balance_loss_mlp": 1.04078627, + "epoch": 0.5296267795305887, + "flos": 646226076672.0, + "grad_norm": 0.05827570747642561, + "language_loss": 0.81129229, + "learning_rate": 0.00047633163258227376, + "loss": 0.82208163, + "num_input_tokens_seen": 229474320, + "router_z_loss_mlp": 0.38110352, + "step": 2753, + "time_per_iteration": 2.8427987098693848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084727, + "balance_loss_mlp": 1.04595971, + "epoch": 0.5298191612158523, + "flos": 559482923520.0, + "grad_norm": 0.14342502720880523, + "language_loss": 0.85232151, + "learning_rate": 0.0004760204441294247, + "loss": 0.86316884, + "num_input_tokens_seen": 229543072, + "router_z_loss_mlp": 0.38745117, + "step": 2754, + "time_per_iteration": 2.644049882888794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088138, + "balance_loss_mlp": 1.05096865, + "epoch": 0.5300115429011159, + "flos": 513776065536.0, + "grad_norm": 0.052931776937271004, + "language_loss": 0.86139393, + "learning_rate": 0.00047570926498626486, + "loss": 0.87227535, + "num_input_tokens_seen": 229615296, + "router_z_loss_mlp": 0.37133789, + "step": 2755, + "time_per_iteration": 2.6872901916503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092723, + "balance_loss_mlp": 1.05402756, + "epoch": 0.5302039245863793, + "flos": 672475405824.0, + "grad_norm": 0.0470441247054563, + "language_loss": 0.81654894, + "learning_rate": 0.00047539809527360474, + "loss": 0.82747614, + "num_input_tokens_seen": 229693728, + "router_z_loss_mlp": 0.38696289, + "step": 2756, + "time_per_iteration": 2.865553379058838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093267, + "balance_loss_mlp": 1.05488133, + "epoch": 0.5303963062716429, + "flos": 730507396608.0, + "grad_norm": 0.04188022637432273, + "language_loss": 0.82037127, + "learning_rate": 0.0004750869351122511, + "loss": 0.83130395, + "num_input_tokens_seen": 229772144, + "router_z_loss_mlp": 0.38330078, + "step": 2757, + "time_per_iteration": 2.9792187213897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093906, + "balance_loss_mlp": 1.0563792, + "epoch": 0.5305886879569065, + "flos": 573156836352.0, + "grad_norm": 0.0631181134246054, + "language_loss": 0.81604397, + "learning_rate": 0.00047477578462300685, + "loss": 0.82698298, + "num_input_tokens_seen": 229847024, + "router_z_loss_mlp": 0.37524414, + "step": 2758, + "time_per_iteration": 2.6986684799194336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093632, + "balance_loss_mlp": 1.05553293, + "epoch": 0.5307810696421701, + "flos": 694988315136.0, + "grad_norm": 0.050985358767642326, + "language_loss": 0.79166949, + "learning_rate": 0.0004744646439266718, + "loss": 0.80260581, + "num_input_tokens_seen": 229932416, + "router_z_loss_mlp": 0.38085938, + "step": 2759, + "time_per_iteration": 2.978621006011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091884, + "balance_loss_mlp": 1.05342746, + "epoch": 0.5309734513274337, + "flos": 648629683200.0, + "grad_norm": 0.042424952199748935, + "language_loss": 0.92400765, + "learning_rate": 0.000474153513144041, + "loss": 0.93492657, + "num_input_tokens_seen": 230010976, + "router_z_loss_mlp": 0.38427734, + "step": 2760, + "time_per_iteration": 2.8996803760528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109538, + "balance_loss_mlp": 1.05534935, + "epoch": 0.5311658330126972, + "flos": 604517506560.0, + "grad_norm": 0.048779343359875056, + "language_loss": 0.86932075, + "learning_rate": 0.00047384239239590633, + "loss": 0.88027459, + "num_input_tokens_seen": 230093344, + "router_z_loss_mlp": 0.39990234, + "step": 2761, + "time_per_iteration": 2.8649730682373047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090925, + "balance_loss_mlp": 1.05342138, + "epoch": 0.5313582146979607, + "flos": 557995931136.0, + "grad_norm": 0.062125162710189655, + "language_loss": 0.88300002, + "learning_rate": 0.0004735312818030556, + "loss": 0.89390922, + "num_input_tokens_seen": 230165520, + "router_z_loss_mlp": 0.37475586, + "step": 2762, + "time_per_iteration": 2.664534091949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108596, + "balance_loss_mlp": 1.04776537, + "epoch": 0.5315505963832243, + "flos": 508152572928.0, + "grad_norm": 0.04725442501000759, + "language_loss": 0.82514352, + "learning_rate": 0.0004732201814862727, + "loss": 0.83600307, + "num_input_tokens_seen": 230237808, + "router_z_loss_mlp": 0.38183594, + "step": 2763, + "time_per_iteration": 2.7150583267211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100901, + "balance_loss_mlp": 1.06113279, + "epoch": 0.5317429780684879, + "flos": 626132740608.0, + "grad_norm": 0.050347986684343975, + "language_loss": 0.81810606, + "learning_rate": 0.0004729090915663373, + "loss": 0.82911509, + "num_input_tokens_seen": 230321568, + "router_z_loss_mlp": 0.39746094, + "step": 2764, + "time_per_iteration": 2.837186336517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093533, + "balance_loss_mlp": 1.05509973, + "epoch": 0.5319353597537514, + "flos": 476506713600.0, + "grad_norm": 0.06358705333883939, + "language_loss": 0.85396516, + "learning_rate": 0.00047259801216402534, + "loss": 0.86490047, + "num_input_tokens_seen": 230385376, + "router_z_loss_mlp": 0.38427734, + "step": 2765, + "time_per_iteration": 2.5005743503570557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094119, + "balance_loss_mlp": 1.05592442, + "epoch": 0.532127741439015, + "flos": 501386913792.0, + "grad_norm": 0.06543180937467778, + "language_loss": 0.8612839, + "learning_rate": 0.00047228694340010845, + "loss": 0.87222505, + "num_input_tokens_seen": 230449760, + "router_z_loss_mlp": 0.38183594, + "step": 2766, + "time_per_iteration": 2.549018144607544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096891, + "balance_loss_mlp": 1.0578146, + "epoch": 0.5323201231242786, + "flos": 1164114063360.0, + "grad_norm": 0.04837235133211893, + "language_loss": 0.85614288, + "learning_rate": 0.0004719758853953544, + "loss": 0.8671118, + "num_input_tokens_seen": 230536592, + "router_z_loss_mlp": 0.390625, + "step": 2767, + "time_per_iteration": 3.568779468536377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095476, + "balance_loss_mlp": 1.05709052, + "epoch": 0.5325125048095422, + "flos": 378493254144.0, + "grad_norm": 0.06740098585195309, + "language_loss": 0.84098738, + "learning_rate": 0.00047166483827052645, + "loss": 0.85194218, + "num_input_tokens_seen": 230596688, + "router_z_loss_mlp": 0.38354492, + "step": 2768, + "time_per_iteration": 2.4389522075653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01031004, + "balance_loss_mlp": 1.01784337, + "epoch": 0.5327048864948057, + "flos": 1540507534848.0, + "grad_norm": 0.01937833439113787, + "language_loss": 0.77078491, + "learning_rate": 0.00047135380214638413, + "loss": 0.78109497, + "num_input_tokens_seen": 230829408, + "router_z_loss_mlp": 0.13183594, + "step": 2769, + "time_per_iteration": 4.967049837112427 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093166, + "balance_loss_mlp": 1.05320704, + "epoch": 0.5328972681800692, + "flos": 910877225472.0, + "grad_norm": 0.052506511923680964, + "language_loss": 0.83564013, + "learning_rate": 0.000471042777143682, + "loss": 0.8465718, + "num_input_tokens_seen": 230912528, + "router_z_loss_mlp": 0.3996582, + "step": 2770, + "time_per_iteration": 3.2065277099609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083074, + "balance_loss_mlp": 1.04530883, + "epoch": 0.5330896498653328, + "flos": 473660766720.0, + "grad_norm": 0.0519747156636442, + "language_loss": 0.79680347, + "learning_rate": 0.0004707317633831707, + "loss": 0.80763417, + "num_input_tokens_seen": 230979424, + "router_z_loss_mlp": 0.37744141, + "step": 2771, + "time_per_iteration": 2.5498273372650146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091325, + "balance_loss_mlp": 1.05408382, + "epoch": 0.5332820315505964, + "flos": 501386913792.0, + "grad_norm": 0.05598064533442757, + "language_loss": 0.77608013, + "learning_rate": 0.00047042076098559673, + "loss": 0.78699338, + "num_input_tokens_seen": 231046416, + "router_z_loss_mlp": 0.37231445, + "step": 2772, + "time_per_iteration": 2.5759775638580322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091384, + "balance_loss_mlp": 1.05323732, + "epoch": 0.53347441323586, + "flos": 924043368960.0, + "grad_norm": 0.060675625301583505, + "language_loss": 0.73884845, + "learning_rate": 0.00047010977007170174, + "loss": 0.7497623, + "num_input_tokens_seen": 231136064, + "router_z_loss_mlp": 0.38110352, + "step": 2773, + "time_per_iteration": 3.257833957672119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089952, + "balance_loss_mlp": 1.05099463, + "epoch": 0.5336667949211235, + "flos": 574185521664.0, + "grad_norm": 0.06246333407972351, + "language_loss": 0.82451814, + "learning_rate": 0.00046979879076222334, + "loss": 0.83541769, + "num_input_tokens_seen": 231203616, + "router_z_loss_mlp": 0.38916016, + "step": 2774, + "time_per_iteration": 2.6394476890563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091431, + "balance_loss_mlp": 1.05306923, + "epoch": 0.533859176606387, + "flos": 1064233880064.0, + "grad_norm": 0.044878758318980805, + "language_loss": 0.85063684, + "learning_rate": 0.0004694878231778939, + "loss": 0.86155117, + "num_input_tokens_seen": 231287008, + "router_z_loss_mlp": 0.38330078, + "step": 2775, + "time_per_iteration": 3.3668456077575684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085244, + "balance_loss_mlp": 1.04695392, + "epoch": 0.5340515582916506, + "flos": 746277967872.0, + "grad_norm": 0.04760082973405309, + "language_loss": 0.84270054, + "learning_rate": 0.0004691768674394423, + "loss": 0.85355294, + "num_input_tokens_seen": 231365296, + "router_z_loss_mlp": 0.38305664, + "step": 2776, + "time_per_iteration": 2.9580860137939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038557, + "balance_loss_mlp": 1.02644587, + "epoch": 0.5342439399769142, + "flos": 1444905036288.0, + "grad_norm": 0.01780260433895519, + "language_loss": 0.84484011, + "learning_rate": 0.0004688659236675918, + "loss": 0.85522568, + "num_input_tokens_seen": 231579040, + "router_z_loss_mlp": 0.12109375, + "step": 2777, + "time_per_iteration": 4.798782825469971 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036301, + "balance_loss_mlp": 1.02423704, + "epoch": 0.5344363216621778, + "flos": 1426790495232.0, + "grad_norm": 0.016806659478265918, + "language_loss": 0.76653534, + "learning_rate": 0.00046855499198306187, + "loss": 0.77689832, + "num_input_tokens_seen": 231812736, + "router_z_loss_mlp": 0.12060547, + "step": 2778, + "time_per_iteration": 4.971946477890015 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083696, + "balance_loss_mlp": 1.04650259, + "epoch": 0.5346287033474413, + "flos": 527355436032.0, + "grad_norm": 0.27028176168378437, + "language_loss": 0.79060376, + "learning_rate": 0.00046824407250656676, + "loss": 0.80144072, + "num_input_tokens_seen": 231883840, + "router_z_loss_mlp": 0.37158203, + "step": 2779, + "time_per_iteration": 2.639554738998413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083286, + "balance_loss_mlp": 1.04528189, + "epoch": 0.5348210850327049, + "flos": 510515481600.0, + "grad_norm": 0.04912000707376091, + "language_loss": 0.83288354, + "learning_rate": 0.0004679331653588161, + "loss": 0.84371638, + "num_input_tokens_seen": 231955360, + "router_z_loss_mlp": 0.37988281, + "step": 2780, + "time_per_iteration": 2.590897560119629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082747, + "balance_loss_mlp": 1.04388487, + "epoch": 0.5350134667179685, + "flos": 462429748224.0, + "grad_norm": 0.07636572739089499, + "language_loss": 0.8547262, + "learning_rate": 0.0004676222706605147, + "loss": 0.86555368, + "num_input_tokens_seen": 232027088, + "router_z_loss_mlp": 0.38867188, + "step": 2781, + "time_per_iteration": 2.606795310974121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088005, + "balance_loss_mlp": 1.04647303, + "epoch": 0.535205848403232, + "flos": 708566275584.0, + "grad_norm": 0.05667741573580048, + "language_loss": 0.84751678, + "learning_rate": 0.0004673113885323626, + "loss": 0.85839683, + "num_input_tokens_seen": 232099472, + "router_z_loss_mlp": 0.4152832, + "step": 2782, + "time_per_iteration": 2.813957691192627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084509, + "balance_loss_mlp": 1.04507411, + "epoch": 0.5353982300884956, + "flos": 893855388672.0, + "grad_norm": 0.04933634097838137, + "language_loss": 0.78395712, + "learning_rate": 0.00046700051909505494, + "loss": 0.79480219, + "num_input_tokens_seen": 232182528, + "router_z_loss_mlp": 0.39404297, + "step": 2783, + "time_per_iteration": 3.151244878768921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089347, + "balance_loss_mlp": 1.0476948, + "epoch": 0.5355906117737591, + "flos": 535701219840.0, + "grad_norm": 0.06378381527079717, + "language_loss": 0.83984947, + "learning_rate": 0.000466689662469282, + "loss": 0.85074294, + "num_input_tokens_seen": 232253344, + "router_z_loss_mlp": 0.41650391, + "step": 2784, + "time_per_iteration": 2.6275248527526855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081581, + "balance_loss_mlp": 1.04159856, + "epoch": 0.5357829934590227, + "flos": 868477593600.0, + "grad_norm": 0.05202541270375375, + "language_loss": 0.83895493, + "learning_rate": 0.00046637881877572917, + "loss": 0.84977078, + "num_input_tokens_seen": 232337232, + "router_z_loss_mlp": 0.3996582, + "step": 2785, + "time_per_iteration": 3.069645404815674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085059, + "balance_loss_mlp": 1.04481411, + "epoch": 0.5359753751442863, + "flos": 552999481344.0, + "grad_norm": 0.08844651025983005, + "language_loss": 0.8452431, + "learning_rate": 0.0004660679881350764, + "loss": 0.85609365, + "num_input_tokens_seen": 232412864, + "router_z_loss_mlp": 0.40234375, + "step": 2786, + "time_per_iteration": 2.7307839393615723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059616, + "balance_loss_mlp": 1.04531133, + "epoch": 0.5361677568295499, + "flos": 1479687823872.0, + "grad_norm": 0.02226240505672553, + "language_loss": 0.75608146, + "learning_rate": 0.0004657571706679988, + "loss": 0.76667762, + "num_input_tokens_seen": 232639888, + "router_z_loss_mlp": 0.14257812, + "step": 2787, + "time_per_iteration": 5.010236740112305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083366, + "balance_loss_mlp": 1.04352605, + "epoch": 0.5363601385148133, + "flos": 805910432256.0, + "grad_norm": 0.0562451411020875, + "language_loss": 0.78052628, + "learning_rate": 0.0004654463664951667, + "loss": 0.79135996, + "num_input_tokens_seen": 232719248, + "router_z_loss_mlp": 0.3984375, + "step": 2788, + "time_per_iteration": 2.9822394847869873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090776, + "balance_loss_mlp": 1.05076993, + "epoch": 0.5365525202000769, + "flos": 507630246912.0, + "grad_norm": 0.05204597911301594, + "language_loss": 0.82849109, + "learning_rate": 0.0004651355757372447, + "loss": 0.83939886, + "num_input_tokens_seen": 232788464, + "router_z_loss_mlp": 0.39990234, + "step": 2789, + "time_per_iteration": 2.615691900253296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089332, + "balance_loss_mlp": 1.04937315, + "epoch": 0.5367449018853405, + "flos": 528660546048.0, + "grad_norm": 0.0871364316310779, + "language_loss": 0.854258, + "learning_rate": 0.00046482479851489274, + "loss": 0.86515129, + "num_input_tokens_seen": 232859792, + "router_z_loss_mlp": 0.39941406, + "step": 2790, + "time_per_iteration": 2.7088706493377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089329, + "balance_loss_mlp": 1.04853582, + "epoch": 0.5369372835706041, + "flos": 649614698496.0, + "grad_norm": 0.059769288934836705, + "language_loss": 0.78002077, + "learning_rate": 0.00046451403494876525, + "loss": 0.79091412, + "num_input_tokens_seen": 232941472, + "router_z_loss_mlp": 0.40795898, + "step": 2791, + "time_per_iteration": 2.8624680042266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082264, + "balance_loss_mlp": 1.04254341, + "epoch": 0.5371296652558677, + "flos": 584205972480.0, + "grad_norm": 0.05423678017273499, + "language_loss": 0.84187895, + "learning_rate": 0.0004642032851595111, + "loss": 0.8527016, + "num_input_tokens_seen": 233017120, + "router_z_loss_mlp": 0.3972168, + "step": 2792, + "time_per_iteration": 2.7222046852111816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090422, + "balance_loss_mlp": 1.04877055, + "epoch": 0.5373220469411312, + "flos": 595570821120.0, + "grad_norm": 0.05596231110481221, + "language_loss": 0.84764576, + "learning_rate": 0.00046389254926777404, + "loss": 0.85855001, + "num_input_tokens_seen": 233095408, + "router_z_loss_mlp": 0.41674805, + "step": 2793, + "time_per_iteration": 2.8049495220184326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083638, + "balance_loss_mlp": 1.04286838, + "epoch": 0.5375144286263948, + "flos": 1113965167104.0, + "grad_norm": 0.05603938595076487, + "language_loss": 0.78227508, + "learning_rate": 0.0004635818273941926, + "loss": 0.79311144, + "num_input_tokens_seen": 233191056, + "router_z_loss_mlp": 0.4074707, + "step": 2794, + "time_per_iteration": 3.506617307662964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085514, + "balance_loss_mlp": 1.04495919, + "epoch": 0.5377068103116583, + "flos": 595319127552.0, + "grad_norm": 0.07610950885477011, + "language_loss": 0.81443048, + "learning_rate": 0.0004632711196593997, + "loss": 0.82528561, + "num_input_tokens_seen": 233265536, + "router_z_loss_mlp": 0.40527344, + "step": 2795, + "time_per_iteration": 2.7142324447631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083447, + "balance_loss_mlp": 1.04377437, + "epoch": 0.5378991919969219, + "flos": 883839320064.0, + "grad_norm": 0.061986224183990205, + "language_loss": 0.85229117, + "learning_rate": 0.00046296042618402297, + "loss": 0.86312562, + "num_input_tokens_seen": 233348224, + "router_z_loss_mlp": 0.39697266, + "step": 2796, + "time_per_iteration": 3.0699656009674072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077763, + "balance_loss_mlp": 1.03801823, + "epoch": 0.5380915736821854, + "flos": 710344249344.0, + "grad_norm": 0.04828732184108336, + "language_loss": 0.792054, + "learning_rate": 0.0004626497470886839, + "loss": 0.80283165, + "num_input_tokens_seen": 233429344, + "router_z_loss_mlp": 0.39746094, + "step": 2797, + "time_per_iteration": 2.9337801933288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085503, + "balance_loss_mlp": 1.04444742, + "epoch": 0.538283955367449, + "flos": 556721344512.0, + "grad_norm": 0.04667541599746409, + "language_loss": 0.8208226, + "learning_rate": 0.00046233908249399897, + "loss": 0.83167768, + "num_input_tokens_seen": 233504944, + "router_z_loss_mlp": 0.41040039, + "step": 2798, + "time_per_iteration": 2.736253023147583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086245, + "balance_loss_mlp": 1.04585731, + "epoch": 0.5384763370527126, + "flos": 513218833920.0, + "grad_norm": 0.05904964511977083, + "language_loss": 0.78162259, + "learning_rate": 0.00046202843252057905, + "loss": 0.79248506, + "num_input_tokens_seen": 233573072, + "router_z_loss_mlp": 0.40380859, + "step": 2799, + "time_per_iteration": 2.5839316844940186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085466, + "balance_loss_mlp": 1.04503012, + "epoch": 0.5386687187379762, + "flos": 489490974720.0, + "grad_norm": 0.06428119470797507, + "language_loss": 0.83220208, + "learning_rate": 0.00046171779728902896, + "loss": 0.8430568, + "num_input_tokens_seen": 233640896, + "router_z_loss_mlp": 0.40405273, + "step": 2800, + "time_per_iteration": 2.6141908168792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087168, + "balance_loss_mlp": 1.04801977, + "epoch": 0.5388611004232398, + "flos": 482415395328.0, + "grad_norm": 0.12344174959648258, + "language_loss": 0.86207569, + "learning_rate": 0.000461407176919948, + "loss": 0.87294734, + "num_input_tokens_seen": 233703904, + "router_z_loss_mlp": 0.39111328, + "step": 2801, + "time_per_iteration": 2.503673791885376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108035, + "balance_loss_mlp": 1.04158366, + "epoch": 0.5390534821085032, + "flos": 560709457920.0, + "grad_norm": 0.05013064620145656, + "language_loss": 0.85174656, + "learning_rate": 0.00046109657153392997, + "loss": 0.86255008, + "num_input_tokens_seen": 233779248, + "router_z_loss_mlp": 0.38720703, + "step": 2802, + "time_per_iteration": 2.6549510955810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086897, + "balance_loss_mlp": 1.04624677, + "epoch": 0.5392458637937668, + "flos": 488132020224.0, + "grad_norm": 0.05351248634305854, + "language_loss": 0.82771289, + "learning_rate": 0.0004607859812515622, + "loss": 0.8385818, + "num_input_tokens_seen": 233847520, + "router_z_loss_mlp": 0.40649414, + "step": 2803, + "time_per_iteration": 2.592742681503296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085335, + "balance_loss_mlp": 1.0456624, + "epoch": 0.5394382454790304, + "flos": 511810417152.0, + "grad_norm": 0.06156300752407298, + "language_loss": 0.87926197, + "learning_rate": 0.00046047540619342667, + "loss": 0.89011538, + "num_input_tokens_seen": 233911328, + "router_z_loss_mlp": 0.39648438, + "step": 2804, + "time_per_iteration": 2.566542863845825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108144, + "balance_loss_mlp": 1.04343605, + "epoch": 0.539630627164294, + "flos": 567312173568.0, + "grad_norm": 0.04852529488921132, + "language_loss": 0.7995888, + "learning_rate": 0.00046016484648009933, + "loss": 0.81040317, + "num_input_tokens_seen": 233987104, + "router_z_loss_mlp": 0.38012695, + "step": 2805, + "time_per_iteration": 2.693988561630249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108233, + "balance_loss_mlp": 1.04415882, + "epoch": 0.5398230088495575, + "flos": 526203095040.0, + "grad_norm": 0.058780411040176145, + "language_loss": 0.8077246, + "learning_rate": 0.0004598543022321501, + "loss": 0.81854796, + "num_input_tokens_seen": 234057216, + "router_z_loss_mlp": 0.38134766, + "step": 2806, + "time_per_iteration": 2.635873317718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079563, + "balance_loss_mlp": 1.0410111, + "epoch": 0.5400153905348211, + "flos": 538493322240.0, + "grad_norm": 0.05389643439716648, + "language_loss": 0.7979452, + "learning_rate": 0.0004595437735701433, + "loss": 0.80874085, + "num_input_tokens_seen": 234129984, + "router_z_loss_mlp": 0.38500977, + "step": 2807, + "time_per_iteration": 2.671004056930542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082107, + "balance_loss_mlp": 1.04252934, + "epoch": 0.5402077722200846, + "flos": 513259531776.0, + "grad_norm": 0.056977099557855106, + "language_loss": 0.83333278, + "learning_rate": 0.00045923326061463623, + "loss": 0.84415388, + "num_input_tokens_seen": 234203920, + "router_z_loss_mlp": 0.39575195, + "step": 2808, + "time_per_iteration": 2.748844861984253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108444, + "balance_loss_mlp": 1.04519629, + "epoch": 0.5404001539053482, + "flos": 675932428800.0, + "grad_norm": 0.053531678156081904, + "language_loss": 0.81448805, + "learning_rate": 0.00045892276348618113, + "loss": 0.82533252, + "num_input_tokens_seen": 234285440, + "router_z_loss_mlp": 0.39208984, + "step": 2809, + "time_per_iteration": 2.9712717533111572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01034069, + "balance_loss_mlp": 1.02195704, + "epoch": 0.5405925355906118, + "flos": 1553998155264.0, + "grad_norm": 0.02221665300745606, + "language_loss": 0.78260827, + "learning_rate": 0.0004586122823053235, + "loss": 0.79294896, + "num_input_tokens_seen": 234521424, + "router_z_loss_mlp": 0.12109375, + "step": 2810, + "time_per_iteration": 4.987140893936157 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085482, + "balance_loss_mlp": 1.04697728, + "epoch": 0.5407849172758753, + "flos": 647004478464.0, + "grad_norm": 0.050822756134718025, + "language_loss": 0.80942833, + "learning_rate": 0.000458301817192603, + "loss": 0.82028317, + "num_input_tokens_seen": 234601632, + "router_z_loss_mlp": 0.38500977, + "step": 2811, + "time_per_iteration": 2.826511859893799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01028161, + "balance_loss_mlp": 1.01576281, + "epoch": 0.5409772989611389, + "flos": 1406641904640.0, + "grad_norm": 0.017319914930323605, + "language_loss": 0.8084178, + "learning_rate": 0.00045799136826855263, + "loss": 0.81869948, + "num_input_tokens_seen": 234825776, + "router_z_loss_mlp": 0.12353516, + "step": 2812, + "time_per_iteration": 4.797938346862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083094, + "balance_loss_mlp": 1.04525733, + "epoch": 0.5411696806464025, + "flos": 554102360064.0, + "grad_norm": 0.08517188397837483, + "language_loss": 0.87214613, + "learning_rate": 0.00045768093565369983, + "loss": 0.88297707, + "num_input_tokens_seen": 234901504, + "router_z_loss_mlp": 0.37817383, + "step": 2813, + "time_per_iteration": 2.716890811920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082803, + "balance_loss_mlp": 1.04441762, + "epoch": 0.5413620623316661, + "flos": 527853030912.0, + "grad_norm": 0.05234072905155942, + "language_loss": 0.81825578, + "learning_rate": 0.0004573705194685646, + "loss": 0.8290838, + "num_input_tokens_seen": 234970288, + "router_z_loss_mlp": 0.38330078, + "step": 2814, + "time_per_iteration": 2.6517584323883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082477, + "balance_loss_mlp": 1.04380536, + "epoch": 0.5415544440169295, + "flos": 598464820224.0, + "grad_norm": 0.054888895455983605, + "language_loss": 0.84797984, + "learning_rate": 0.00045706011983366157, + "loss": 0.85880458, + "num_input_tokens_seen": 235039984, + "router_z_loss_mlp": 0.38623047, + "step": 2815, + "time_per_iteration": 2.670135974884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088358, + "balance_loss_mlp": 1.050807, + "epoch": 0.5417468257021931, + "flos": 470519456256.0, + "grad_norm": 0.06349065912195655, + "language_loss": 0.82603323, + "learning_rate": 0.00045674973686949847, + "loss": 0.8369168, + "num_input_tokens_seen": 235105232, + "router_z_loss_mlp": 0.37524414, + "step": 2816, + "time_per_iteration": 2.51487398147583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085537, + "balance_loss_mlp": 1.04710388, + "epoch": 0.5419392073874567, + "flos": 680477773824.0, + "grad_norm": 0.04802331030108417, + "language_loss": 0.85519576, + "learning_rate": 0.0004564393706965766, + "loss": 0.86605108, + "num_input_tokens_seen": 235192560, + "router_z_loss_mlp": 0.3840332, + "step": 2817, + "time_per_iteration": 2.9650819301605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088505, + "balance_loss_mlp": 1.05031061, + "epoch": 0.5421315890727203, + "flos": 462134384640.0, + "grad_norm": 0.11431790588446349, + "language_loss": 0.81361973, + "learning_rate": 0.00045612902143539116, + "loss": 0.82450485, + "num_input_tokens_seen": 235258448, + "router_z_loss_mlp": 0.3815918, + "step": 2818, + "time_per_iteration": 2.5874366760253906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083753, + "balance_loss_mlp": 1.04620242, + "epoch": 0.5423239707579839, + "flos": 436727476224.0, + "grad_norm": 0.06287409893753121, + "language_loss": 0.81734043, + "learning_rate": 0.00045581868920642986, + "loss": 0.82817793, + "num_input_tokens_seen": 235322176, + "router_z_loss_mlp": 0.375, + "step": 2819, + "time_per_iteration": 2.4778597354888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085621, + "balance_loss_mlp": 1.04818964, + "epoch": 0.5425163524432474, + "flos": 458067695616.0, + "grad_norm": 0.0556653381868651, + "language_loss": 0.79541689, + "learning_rate": 0.00045550837413017457, + "loss": 0.8062731, + "num_input_tokens_seen": 235390960, + "router_z_loss_mlp": 0.37402344, + "step": 2820, + "time_per_iteration": 2.653878688812256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087265, + "balance_loss_mlp": 1.04873669, + "epoch": 0.542708734128511, + "flos": 419267681280.0, + "grad_norm": 0.047652791336190936, + "language_loss": 0.85203838, + "learning_rate": 0.0004551980763271005, + "loss": 0.86291105, + "num_input_tokens_seen": 235460976, + "router_z_loss_mlp": 0.38500977, + "step": 2821, + "time_per_iteration": 2.6410272121429443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088978, + "balance_loss_mlp": 1.04942417, + "epoch": 0.5429011158137745, + "flos": 678142568448.0, + "grad_norm": 0.047512644994480734, + "language_loss": 0.83545935, + "learning_rate": 0.0004548877959176756, + "loss": 0.84634912, + "num_input_tokens_seen": 235540912, + "router_z_loss_mlp": 0.39550781, + "step": 2822, + "time_per_iteration": 2.8824410438537598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083878, + "balance_loss_mlp": 1.04542077, + "epoch": 0.5430934974990381, + "flos": 540664174080.0, + "grad_norm": 0.05440283794038225, + "language_loss": 0.8588357, + "learning_rate": 0.00045457753302236166, + "loss": 0.86967444, + "num_input_tokens_seen": 235608736, + "router_z_loss_mlp": 0.3840332, + "step": 2823, + "time_per_iteration": 2.665828227996826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078196, + "balance_loss_mlp": 1.04069233, + "epoch": 0.5432858791843016, + "flos": 658175860224.0, + "grad_norm": 0.053164692369765, + "language_loss": 0.86939847, + "learning_rate": 0.00045426728776161353, + "loss": 0.88018048, + "num_input_tokens_seen": 235678720, + "router_z_loss_mlp": 0.37475586, + "step": 2824, + "time_per_iteration": 2.79662823677063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082032, + "balance_loss_mlp": 1.04367089, + "epoch": 0.5434782608695652, + "flos": 531678200832.0, + "grad_norm": 0.051257131946256196, + "language_loss": 0.81339788, + "learning_rate": 0.00045395706025587863, + "loss": 0.82421821, + "num_input_tokens_seen": 235748704, + "router_z_loss_mlp": 0.38330078, + "step": 2825, + "time_per_iteration": 2.612839698791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083107, + "balance_loss_mlp": 1.04298067, + "epoch": 0.5436706425548288, + "flos": 608219020800.0, + "grad_norm": 0.0654215378261843, + "language_loss": 0.8246271, + "learning_rate": 0.00045364685062559843, + "loss": 0.83545816, + "num_input_tokens_seen": 235828224, + "router_z_loss_mlp": 0.40112305, + "step": 2826, + "time_per_iteration": 2.8304717540740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077545, + "balance_loss_mlp": 1.03863502, + "epoch": 0.5438630242400924, + "flos": 705081549312.0, + "grad_norm": 0.05153461088450525, + "language_loss": 0.91323566, + "learning_rate": 0.0004533366589912067, + "loss": 0.92401117, + "num_input_tokens_seen": 235909392, + "router_z_loss_mlp": 0.38891602, + "step": 2827, + "time_per_iteration": 2.9909794330596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083973, + "balance_loss_mlp": 1.04399014, + "epoch": 0.544055405925356, + "flos": 856073885184.0, + "grad_norm": 0.06162926864421369, + "language_loss": 0.77631354, + "learning_rate": 0.0004530264854731306, + "loss": 0.78715324, + "num_input_tokens_seen": 235983888, + "router_z_loss_mlp": 0.3996582, + "step": 2828, + "time_per_iteration": 3.0477852821350098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079886, + "balance_loss_mlp": 1.0402137, + "epoch": 0.5442477876106194, + "flos": 571483579392.0, + "grad_norm": 0.04880017685382554, + "language_loss": 0.83835936, + "learning_rate": 0.00045271633019179034, + "loss": 0.84915829, + "num_input_tokens_seen": 236063056, + "router_z_loss_mlp": 0.39648438, + "step": 2829, + "time_per_iteration": 2.8048830032348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108649, + "balance_loss_mlp": 1.04684114, + "epoch": 0.544440169295883, + "flos": 625246649856.0, + "grad_norm": 0.05731672371216008, + "language_loss": 0.87693858, + "learning_rate": 0.0004524061932675986, + "loss": 0.88780355, + "num_input_tokens_seen": 236141104, + "router_z_loss_mlp": 0.39624023, + "step": 2830, + "time_per_iteration": 2.880328893661499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081175, + "balance_loss_mlp": 1.0420748, + "epoch": 0.5446325509811466, + "flos": 835896181248.0, + "grad_norm": 0.061736377466748704, + "language_loss": 0.8659271, + "learning_rate": 0.00045209607482096125, + "loss": 0.87673885, + "num_input_tokens_seen": 236220320, + "router_z_loss_mlp": 0.390625, + "step": 2831, + "time_per_iteration": 2.9996933937072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080099, + "balance_loss_mlp": 1.04016387, + "epoch": 0.5448249326664102, + "flos": 483129778176.0, + "grad_norm": 0.057163759026562816, + "language_loss": 0.8399148, + "learning_rate": 0.0004517859749722772, + "loss": 0.85071582, + "num_input_tokens_seen": 236288208, + "router_z_loss_mlp": 0.39892578, + "step": 2832, + "time_per_iteration": 2.6431195735931396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085662, + "balance_loss_mlp": 1.04606068, + "epoch": 0.5450173143516738, + "flos": 560799618048.0, + "grad_norm": 0.061436781325619555, + "language_loss": 0.78688192, + "learning_rate": 0.0004514758938419376, + "loss": 0.79773855, + "num_input_tokens_seen": 236366864, + "router_z_loss_mlp": 0.39575195, + "step": 2833, + "time_per_iteration": 2.811894655227661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058831, + "balance_loss_mlp": 1.04280972, + "epoch": 0.5452096960369373, + "flos": 1469632467456.0, + "grad_norm": 0.020133642361800857, + "language_loss": 0.76920587, + "learning_rate": 0.0004511658315503268, + "loss": 0.77979416, + "num_input_tokens_seen": 236597120, + "router_z_loss_mlp": 0.16015625, + "step": 2834, + "time_per_iteration": 4.920469760894775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077415, + "balance_loss_mlp": 1.03798103, + "epoch": 0.5454020777222008, + "flos": 464827562496.0, + "grad_norm": 0.051503170745990534, + "language_loss": 0.83848447, + "learning_rate": 0.00045085578821782175, + "loss": 0.84925866, + "num_input_tokens_seen": 236664192, + "router_z_loss_mlp": 0.39404297, + "step": 2835, + "time_per_iteration": 2.523089647293091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048396, + "balance_loss_mlp": 1.03246999, + "epoch": 0.5455944594074644, + "flos": 1468921056768.0, + "grad_norm": 0.01613355837810212, + "language_loss": 0.76134741, + "learning_rate": 0.0004505457639647917, + "loss": 0.77183139, + "num_input_tokens_seen": 236888784, + "router_z_loss_mlp": 0.15917969, + "step": 2836, + "time_per_iteration": 4.865030288696289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082059, + "balance_loss_mlp": 1.0422194, + "epoch": 0.545786841092728, + "flos": 532900353024.0, + "grad_norm": 0.04532447535161293, + "language_loss": 0.81224561, + "learning_rate": 0.00045023575891159866, + "loss": 0.82306617, + "num_input_tokens_seen": 236962528, + "router_z_loss_mlp": 0.3984375, + "step": 2837, + "time_per_iteration": 2.7024872303009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038306, + "balance_loss_mlp": 1.02285683, + "epoch": 0.5459792227779915, + "flos": 1351633360896.0, + "grad_norm": 0.01633471064412587, + "language_loss": 0.74763811, + "learning_rate": 0.00044992577317859764, + "loss": 0.75802112, + "num_input_tokens_seen": 237179360, + "router_z_loss_mlp": 0.15429688, + "step": 2838, + "time_per_iteration": 4.88713812828064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072439, + "balance_loss_mlp": 1.03436387, + "epoch": 0.5461716044632551, + "flos": 637584929280.0, + "grad_norm": 0.044187924464620755, + "language_loss": 0.77777064, + "learning_rate": 0.0004496158068861354, + "loss": 0.788495, + "num_input_tokens_seen": 237256240, + "router_z_loss_mlp": 0.38037109, + "step": 2839, + "time_per_iteration": 2.7734854221343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083666, + "balance_loss_mlp": 1.04451799, + "epoch": 0.5463639861485187, + "flos": 602458725888.0, + "grad_norm": 0.04916115853202861, + "language_loss": 0.80780178, + "learning_rate": 0.00044930586015455207, + "loss": 0.81863844, + "num_input_tokens_seen": 237334272, + "router_z_loss_mlp": 0.39111328, + "step": 2840, + "time_per_iteration": 2.776756525039673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079059, + "balance_loss_mlp": 1.04105484, + "epoch": 0.5465563678337823, + "flos": 642208849920.0, + "grad_norm": 0.047638532734035705, + "language_loss": 0.89027333, + "learning_rate": 0.000448995933104179, + "loss": 0.90106392, + "num_input_tokens_seen": 237415408, + "router_z_loss_mlp": 0.37939453, + "step": 2841, + "time_per_iteration": 2.835770606994629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084722, + "balance_loss_mlp": 1.04526389, + "epoch": 0.5467487495190458, + "flos": 613852687872.0, + "grad_norm": 0.05241434980763647, + "language_loss": 0.79585081, + "learning_rate": 0.00044868602585534077, + "loss": 0.80669802, + "num_input_tokens_seen": 237493232, + "router_z_loss_mlp": 0.39428711, + "step": 2842, + "time_per_iteration": 2.8165202140808105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081214, + "balance_loss_mlp": 1.04297209, + "epoch": 0.5469411312043093, + "flos": 460957312512.0, + "grad_norm": 0.05377375824052972, + "language_loss": 0.88703167, + "learning_rate": 0.0004483761385283541, + "loss": 0.89784384, + "num_input_tokens_seen": 237556624, + "router_z_loss_mlp": 0.38183594, + "step": 2843, + "time_per_iteration": 2.5191187858581543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085655, + "balance_loss_mlp": 1.04705536, + "epoch": 0.5471335128895729, + "flos": 560930628096.0, + "grad_norm": 0.05339183941738246, + "language_loss": 0.82029176, + "learning_rate": 0.0004480662712435281, + "loss": 0.83114827, + "num_input_tokens_seen": 237632048, + "router_z_loss_mlp": 0.38549805, + "step": 2844, + "time_per_iteration": 2.7347452640533447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084571, + "balance_loss_mlp": 1.046996, + "epoch": 0.5473258945748365, + "flos": 518437863936.0, + "grad_norm": 0.05481278216627967, + "language_loss": 0.88263971, + "learning_rate": 0.0004477564241211635, + "loss": 0.89348543, + "num_input_tokens_seen": 237699840, + "router_z_loss_mlp": 0.37548828, + "step": 2845, + "time_per_iteration": 2.566675901412964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085991, + "balance_loss_mlp": 1.0476774, + "epoch": 0.5475182762601001, + "flos": 433600722432.0, + "grad_norm": 0.05360762168993706, + "language_loss": 0.87165999, + "learning_rate": 0.0004474465972815541, + "loss": 0.88251984, + "num_input_tokens_seen": 237762560, + "router_z_loss_mlp": 0.38256836, + "step": 2846, + "time_per_iteration": 2.458261489868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085193, + "balance_loss_mlp": 1.04754686, + "epoch": 0.5477106579453636, + "flos": 511308440064.0, + "grad_norm": 0.04786363547278841, + "language_loss": 0.87439841, + "learning_rate": 0.000447136790844985, + "loss": 0.88525033, + "num_input_tokens_seen": 237837152, + "router_z_loss_mlp": 0.37646484, + "step": 2847, + "time_per_iteration": 2.667609214782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108262, + "balance_loss_mlp": 1.04547465, + "epoch": 0.5479030396306271, + "flos": 675606541824.0, + "grad_norm": 0.050829406458998395, + "language_loss": 0.80589354, + "learning_rate": 0.00044682700493173385, + "loss": 0.81671977, + "num_input_tokens_seen": 237909488, + "router_z_loss_mlp": 0.37133789, + "step": 2848, + "time_per_iteration": 2.83048677444458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088336, + "balance_loss_mlp": 1.04978406, + "epoch": 0.5480954213158907, + "flos": 875720498688.0, + "grad_norm": 0.057674115143319986, + "language_loss": 0.80473161, + "learning_rate": 0.00044651723966207004, + "loss": 0.81561506, + "num_input_tokens_seen": 237991056, + "router_z_loss_mlp": 0.38500977, + "step": 2849, + "time_per_iteration": 3.1320085525512695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084454, + "balance_loss_mlp": 1.04780865, + "epoch": 0.5482878030011543, + "flos": 621715433472.0, + "grad_norm": 0.04900831188074684, + "language_loss": 0.78059959, + "learning_rate": 0.00044620749515625536, + "loss": 0.79144412, + "num_input_tokens_seen": 238064576, + "router_z_loss_mlp": 0.36669922, + "step": 2850, + "time_per_iteration": 2.784318447113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091667, + "balance_loss_mlp": 1.05404472, + "epoch": 0.5484801846864179, + "flos": 496946285568.0, + "grad_norm": 0.05697086220906577, + "language_loss": 0.84891641, + "learning_rate": 0.00044589777153454334, + "loss": 0.85983306, + "num_input_tokens_seen": 238136464, + "router_z_loss_mlp": 0.37597656, + "step": 2851, + "time_per_iteration": 2.7432825565338135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087741, + "balance_loss_mlp": 1.04973722, + "epoch": 0.5486725663716814, + "flos": 442202582016.0, + "grad_norm": 0.05425914558119235, + "language_loss": 0.83565009, + "learning_rate": 0.00044558806891717895, + "loss": 0.84652746, + "num_input_tokens_seen": 238198912, + "router_z_loss_mlp": 0.37963867, + "step": 2852, + "time_per_iteration": 2.486581563949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093078, + "balance_loss_mlp": 1.05528831, + "epoch": 0.548864948056945, + "flos": 654867224064.0, + "grad_norm": 0.04695408394518552, + "language_loss": 0.79779923, + "learning_rate": 0.0004452783874243998, + "loss": 0.80873001, + "num_input_tokens_seen": 238275184, + "router_z_loss_mlp": 0.37817383, + "step": 2853, + "time_per_iteration": 2.823004722595215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088751, + "balance_loss_mlp": 1.05246305, + "epoch": 0.5490573297422086, + "flos": 545760958464.0, + "grad_norm": 0.06406980317061135, + "language_loss": 0.84579176, + "learning_rate": 0.00044496872717643475, + "loss": 0.85667926, + "num_input_tokens_seen": 238348496, + "router_z_loss_mlp": 0.36279297, + "step": 2854, + "time_per_iteration": 2.6582207679748535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104165, + "balance_loss_mlp": 1.02906144, + "epoch": 0.5492497114274721, + "flos": 1589450245632.0, + "grad_norm": 0.019738925867794382, + "language_loss": 0.77089292, + "learning_rate": 0.00044465908829350453, + "loss": 0.78130943, + "num_input_tokens_seen": 238578464, + "router_z_loss_mlp": 0.12597656, + "step": 2855, + "time_per_iteration": 4.917479991912842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086161, + "balance_loss_mlp": 1.0507319, + "epoch": 0.5494420931127356, + "flos": 750567237120.0, + "grad_norm": 0.05097157568088764, + "language_loss": 0.82032043, + "learning_rate": 0.0004443494708958217, + "loss": 0.83118206, + "num_input_tokens_seen": 238660256, + "router_z_loss_mlp": 0.35473633, + "step": 2856, + "time_per_iteration": 2.944794178009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087153, + "balance_loss_mlp": 1.04860103, + "epoch": 0.5496344747979992, + "flos": 625704956928.0, + "grad_norm": 0.05077616299787212, + "language_loss": 0.80950212, + "learning_rate": 0.0004440398751035906, + "loss": 0.82037365, + "num_input_tokens_seen": 238745856, + "router_z_loss_mlp": 0.38549805, + "step": 2857, + "time_per_iteration": 2.8557775020599365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083742, + "balance_loss_mlp": 1.04707289, + "epoch": 0.5498268564832628, + "flos": 522859553280.0, + "grad_norm": 0.07234504005195413, + "language_loss": 0.83526963, + "learning_rate": 0.00044373030103700645, + "loss": 0.84610707, + "num_input_tokens_seen": 238813888, + "router_z_loss_mlp": 0.3671875, + "step": 2858, + "time_per_iteration": 2.5718507766723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079618, + "balance_loss_mlp": 1.04337823, + "epoch": 0.5500192381685264, + "flos": 604290544128.0, + "grad_norm": 0.05047837894946753, + "language_loss": 0.79457223, + "learning_rate": 0.000443420748816257, + "loss": 0.80536836, + "num_input_tokens_seen": 238885440, + "router_z_loss_mlp": 0.36279297, + "step": 2859, + "time_per_iteration": 2.791083335876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084468, + "balance_loss_mlp": 1.0475843, + "epoch": 0.55021161985379, + "flos": 520246361088.0, + "grad_norm": 0.05245161408681963, + "language_loss": 0.78267741, + "learning_rate": 0.0004431112185615208, + "loss": 0.79352212, + "num_input_tokens_seen": 238960944, + "router_z_loss_mlp": 0.36914062, + "step": 2860, + "time_per_iteration": 2.755300760269165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084785, + "balance_loss_mlp": 1.04873633, + "epoch": 0.5504040015390534, + "flos": 489426955776.0, + "grad_norm": 0.05433061967205067, + "language_loss": 0.79769695, + "learning_rate": 0.00044280171039296845, + "loss": 0.80854475, + "num_input_tokens_seen": 239030592, + "router_z_loss_mlp": 0.3605957, + "step": 2861, + "time_per_iteration": 2.611142873764038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086738, + "balance_loss_mlp": 1.04925907, + "epoch": 0.550596383224317, + "flos": 575519745024.0, + "grad_norm": 0.06168485457456991, + "language_loss": 0.88482428, + "learning_rate": 0.0004424922244307616, + "loss": 0.89569169, + "num_input_tokens_seen": 239097440, + "router_z_loss_mlp": 0.375, + "step": 2862, + "time_per_iteration": 2.673872470855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085076, + "balance_loss_mlp": 1.04750168, + "epoch": 0.5507887649095806, + "flos": 642149213184.0, + "grad_norm": 0.06448144785997337, + "language_loss": 0.82166171, + "learning_rate": 0.00044218276079505315, + "loss": 0.83251244, + "num_input_tokens_seen": 239179872, + "router_z_loss_mlp": 0.37524414, + "step": 2863, + "time_per_iteration": 2.8468000888824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088216, + "balance_loss_mlp": 1.05126143, + "epoch": 0.5509811465948442, + "flos": 531589450752.0, + "grad_norm": 0.050966073807123834, + "language_loss": 0.7469635, + "learning_rate": 0.0004418733196059876, + "loss": 0.7578457, + "num_input_tokens_seen": 239251264, + "router_z_loss_mlp": 0.36938477, + "step": 2864, + "time_per_iteration": 2.662949323654175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088219, + "balance_loss_mlp": 1.05174112, + "epoch": 0.5511735282801077, + "flos": 654439440384.0, + "grad_norm": 0.054186590964919915, + "language_loss": 0.79709429, + "learning_rate": 0.0004415639009837008, + "loss": 0.80797648, + "num_input_tokens_seen": 239326688, + "router_z_loss_mlp": 0.36474609, + "step": 2865, + "time_per_iteration": 2.8164796829223633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080612, + "balance_loss_mlp": 1.04503989, + "epoch": 0.5513659099653713, + "flos": 529222159872.0, + "grad_norm": 0.05095499883513892, + "language_loss": 0.81590974, + "learning_rate": 0.00044125450504831955, + "loss": 0.82671583, + "num_input_tokens_seen": 239401248, + "router_z_loss_mlp": 0.35620117, + "step": 2866, + "time_per_iteration": 2.7417778968811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088604, + "balance_loss_mlp": 1.05162513, + "epoch": 0.5515582916506349, + "flos": 554594162688.0, + "grad_norm": 0.05682958193324047, + "language_loss": 0.82243145, + "learning_rate": 0.0004409451319199622, + "loss": 0.83331752, + "num_input_tokens_seen": 239471600, + "router_z_loss_mlp": 0.36987305, + "step": 2867, + "time_per_iteration": 2.6530325412750244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082608, + "balance_loss_mlp": 1.04608202, + "epoch": 0.5517506733358984, + "flos": 735067298304.0, + "grad_norm": 0.04759427919913488, + "language_loss": 0.84027618, + "learning_rate": 0.0004406357817187381, + "loss": 0.85110223, + "num_input_tokens_seen": 239548592, + "router_z_loss_mlp": 0.36572266, + "step": 2868, + "time_per_iteration": 2.9475574493408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083501, + "balance_loss_mlp": 1.04590225, + "epoch": 0.551943055021162, + "flos": 1114861432320.0, + "grad_norm": 0.043872910920917114, + "language_loss": 0.80878294, + "learning_rate": 0.0004403264545647474, + "loss": 0.81961799, + "num_input_tokens_seen": 239644432, + "router_z_loss_mlp": 0.37597656, + "step": 2869, + "time_per_iteration": 3.5124435424804688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080703, + "balance_loss_mlp": 1.04422534, + "epoch": 0.5521354367064255, + "flos": 544092083712.0, + "grad_norm": 0.0550168733336382, + "language_loss": 0.84926724, + "learning_rate": 0.00044001715057808154, + "loss": 0.86007428, + "num_input_tokens_seen": 239723392, + "router_z_loss_mlp": 0.36499023, + "step": 2870, + "time_per_iteration": 2.7501060962677 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085896, + "balance_loss_mlp": 1.04855943, + "epoch": 0.5523278183916891, + "flos": 935889845760.0, + "grad_norm": 0.05461062340152541, + "language_loss": 0.81539249, + "learning_rate": 0.0004397078698788232, + "loss": 0.82625151, + "num_input_tokens_seen": 239806896, + "router_z_loss_mlp": 0.3737793, + "step": 2871, + "time_per_iteration": 3.2084577083587646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026675, + "balance_loss_mlp": 1.01427722, + "epoch": 0.5525202000769527, + "flos": 1465117645824.0, + "grad_norm": 0.012296141252344654, + "language_loss": 0.80442369, + "learning_rate": 0.0004393986125870456, + "loss": 0.81469035, + "num_input_tokens_seen": 240037824, + "router_z_loss_mlp": 0.12353516, + "step": 2872, + "time_per_iteration": 4.909080266952515 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087659, + "balance_loss_mlp": 1.05115747, + "epoch": 0.5527125817622163, + "flos": 489554993664.0, + "grad_norm": 0.06201182150044637, + "language_loss": 0.78260124, + "learning_rate": 0.00043908937882281343, + "loss": 0.79347777, + "num_input_tokens_seen": 240107952, + "router_z_loss_mlp": 0.36523438, + "step": 2873, + "time_per_iteration": 2.5999958515167236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078657, + "balance_loss_mlp": 1.0410111, + "epoch": 0.5529049634474797, + "flos": 634606562304.0, + "grad_norm": 0.05626101072807578, + "language_loss": 0.82624078, + "learning_rate": 0.0004387801687061814, + "loss": 0.83702731, + "num_input_tokens_seen": 240183824, + "router_z_loss_mlp": 0.37573242, + "step": 2874, + "time_per_iteration": 2.816607713699341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082085, + "balance_loss_mlp": 1.04310322, + "epoch": 0.5530973451327433, + "flos": 580986086400.0, + "grad_norm": 0.04886656520386433, + "language_loss": 0.80143493, + "learning_rate": 0.0004384709823571958, + "loss": 0.8122558, + "num_input_tokens_seen": 240259296, + "router_z_loss_mlp": 0.38964844, + "step": 2875, + "time_per_iteration": 2.7270736694335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079953, + "balance_loss_mlp": 1.04113841, + "epoch": 0.5532897268180069, + "flos": 1122030144000.0, + "grad_norm": 0.06103557908182598, + "language_loss": 0.83129716, + "learning_rate": 0.0004381618198958932, + "loss": 0.84209669, + "num_input_tokens_seen": 240346768, + "router_z_loss_mlp": 0.38793945, + "step": 2876, + "time_per_iteration": 3.4826347827911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085381, + "balance_loss_mlp": 1.04721045, + "epoch": 0.5534821085032705, + "flos": 636965088768.0, + "grad_norm": 0.05070554688334561, + "language_loss": 0.83524168, + "learning_rate": 0.00043785268144230137, + "loss": 0.84609544, + "num_input_tokens_seen": 240429344, + "router_z_loss_mlp": 0.38183594, + "step": 2877, + "time_per_iteration": 2.8850836753845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080685, + "balance_loss_mlp": 1.04332519, + "epoch": 0.5536744901885341, + "flos": 570837597696.0, + "grad_norm": 0.056027333180870484, + "language_loss": 0.82300985, + "learning_rate": 0.00043754356711643837, + "loss": 0.83381677, + "num_input_tokens_seen": 240497008, + "router_z_loss_mlp": 0.37353516, + "step": 2878, + "time_per_iteration": 2.6629955768585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079329, + "balance_loss_mlp": 1.04180145, + "epoch": 0.5538668718737976, + "flos": 595418052096.0, + "grad_norm": 0.051053801448504514, + "language_loss": 0.84143484, + "learning_rate": 0.0004372344770383132, + "loss": 0.85222816, + "num_input_tokens_seen": 240578432, + "router_z_loss_mlp": 0.37475586, + "step": 2879, + "time_per_iteration": 2.809924364089966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080427, + "balance_loss_mlp": 1.04220867, + "epoch": 0.5540592535590612, + "flos": 532324182528.0, + "grad_norm": 0.054354704442993965, + "language_loss": 0.83048761, + "learning_rate": 0.00043692541132792507, + "loss": 0.8412919, + "num_input_tokens_seen": 240649136, + "router_z_loss_mlp": 0.38183594, + "step": 2880, + "time_per_iteration": 2.6826112270355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076517, + "balance_loss_mlp": 1.03915703, + "epoch": 0.5542516352443247, + "flos": 412398715392.0, + "grad_norm": 0.060842521075957015, + "language_loss": 0.83359361, + "learning_rate": 0.00043661637010526384, + "loss": 0.84435874, + "num_input_tokens_seen": 240714240, + "router_z_loss_mlp": 0.37329102, + "step": 2881, + "time_per_iteration": 2.5412843227386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077156, + "balance_loss_mlp": 1.03946209, + "epoch": 0.5544440169295883, + "flos": 547341083136.0, + "grad_norm": 0.06506612292228302, + "language_loss": 0.82828653, + "learning_rate": 0.00043630735349031025, + "loss": 0.83905804, + "num_input_tokens_seen": 240786928, + "router_z_loss_mlp": 0.37646484, + "step": 2882, + "time_per_iteration": 2.6428792476654053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079595, + "balance_loss_mlp": 1.04132843, + "epoch": 0.5546363986148518, + "flos": 621518994432.0, + "grad_norm": 0.04746548389090053, + "language_loss": 0.8146224, + "learning_rate": 0.00043599836160303495, + "loss": 0.82541835, + "num_input_tokens_seen": 240865328, + "router_z_loss_mlp": 0.38232422, + "step": 2883, + "time_per_iteration": 2.836928367614746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076033, + "balance_loss_mlp": 1.03833902, + "epoch": 0.5548287803001154, + "flos": 704972450304.0, + "grad_norm": 0.05191443424956408, + "language_loss": 0.77216405, + "learning_rate": 0.0004356893945633995, + "loss": 0.78292441, + "num_input_tokens_seen": 240945680, + "router_z_loss_mlp": 0.37719727, + "step": 2884, + "time_per_iteration": 2.959998846054077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077354, + "balance_loss_mlp": 1.03877735, + "epoch": 0.555021161985379, + "flos": 503952053760.0, + "grad_norm": 0.04795057861891694, + "language_loss": 0.8143183, + "learning_rate": 0.0004353804524913551, + "loss": 0.82509184, + "num_input_tokens_seen": 241010800, + "router_z_loss_mlp": 0.38549805, + "step": 2885, + "time_per_iteration": 2.587458848953247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076676, + "balance_loss_mlp": 1.03960204, + "epoch": 0.5552135436706426, + "flos": 615782020608.0, + "grad_norm": 0.060100634137020215, + "language_loss": 0.81801999, + "learning_rate": 0.0004350715355068441, + "loss": 0.82878673, + "num_input_tokens_seen": 241085328, + "router_z_loss_mlp": 0.37109375, + "step": 2886, + "time_per_iteration": 2.739311933517456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080638, + "balance_loss_mlp": 1.04227662, + "epoch": 0.5554059253559062, + "flos": 463635933696.0, + "grad_norm": 0.06732751663430354, + "language_loss": 0.79759407, + "learning_rate": 0.00043476264372979847, + "loss": 0.80840045, + "num_input_tokens_seen": 241149600, + "router_z_loss_mlp": 0.38305664, + "step": 2887, + "time_per_iteration": 2.5322625637054443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081996, + "balance_loss_mlp": 1.04425478, + "epoch": 0.5555983070411696, + "flos": 1561923813888.0, + "grad_norm": 0.05205208802168105, + "language_loss": 0.78767329, + "learning_rate": 0.0004344537772801408, + "loss": 0.79849327, + "num_input_tokens_seen": 241244832, + "router_z_loss_mlp": 0.37744141, + "step": 2888, + "time_per_iteration": 3.8099794387817383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01022363, + "balance_loss_mlp": 1.00986981, + "epoch": 0.5557906887264332, + "flos": 1467093468672.0, + "grad_norm": 0.012872465654446894, + "language_loss": 0.73422456, + "learning_rate": 0.0004341449362777836, + "loss": 0.74444818, + "num_input_tokens_seen": 241479728, + "router_z_loss_mlp": 0.12451172, + "step": 2889, + "time_per_iteration": 4.8980872631073 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080645, + "balance_loss_mlp": 1.04373789, + "epoch": 0.5559830704116968, + "flos": 529575750144.0, + "grad_norm": 0.056518477254008576, + "language_loss": 0.83232135, + "learning_rate": 0.0004338361208426298, + "loss": 0.84312785, + "num_input_tokens_seen": 241545616, + "router_z_loss_mlp": 0.36889648, + "step": 2890, + "time_per_iteration": 2.596644163131714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108101, + "balance_loss_mlp": 1.04312527, + "epoch": 0.5561754520969604, + "flos": 650895077376.0, + "grad_norm": 0.04719414959796351, + "language_loss": 0.81189138, + "learning_rate": 0.00043352733109457164, + "loss": 0.82270145, + "num_input_tokens_seen": 241629040, + "router_z_loss_mlp": 0.37841797, + "step": 2891, + "time_per_iteration": 2.8776957988739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079084, + "balance_loss_mlp": 1.04158103, + "epoch": 0.556367833782224, + "flos": 733968801792.0, + "grad_norm": 0.04510399892940866, + "language_loss": 0.84577823, + "learning_rate": 0.00043321856715349244, + "loss": 0.85656911, + "num_input_tokens_seen": 241706272, + "router_z_loss_mlp": 0.37451172, + "step": 2892, + "time_per_iteration": 2.9247210025787354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107932, + "balance_loss_mlp": 1.04243708, + "epoch": 0.5565602154674875, + "flos": 672120405504.0, + "grad_norm": 0.04457708587394983, + "language_loss": 0.80344868, + "learning_rate": 0.00043290982913926466, + "loss": 0.81424183, + "num_input_tokens_seen": 241782304, + "router_z_loss_mlp": 0.36889648, + "step": 2893, + "time_per_iteration": 2.791151285171509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087078, + "balance_loss_mlp": 1.04919362, + "epoch": 0.556752597152751, + "flos": 585911162880.0, + "grad_norm": 0.05091942660655845, + "language_loss": 0.84425044, + "learning_rate": 0.0004326011171717514, + "loss": 0.8551212, + "num_input_tokens_seen": 241868576, + "router_z_loss_mlp": 0.37866211, + "step": 2894, + "time_per_iteration": 2.8832085132598877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085909, + "balance_loss_mlp": 1.04788101, + "epoch": 0.5569449788380146, + "flos": 437549548032.0, + "grad_norm": 0.04808991967010034, + "language_loss": 0.81074953, + "learning_rate": 0.0004322924313708051, + "loss": 0.82160866, + "num_input_tokens_seen": 241933696, + "router_z_loss_mlp": 0.38012695, + "step": 2895, + "time_per_iteration": 2.5033986568450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079423, + "balance_loss_mlp": 1.04315972, + "epoch": 0.5571373605232782, + "flos": 502002372096.0, + "grad_norm": 0.057289668121921454, + "language_loss": 0.84257507, + "learning_rate": 0.0004319837718562681, + "loss": 0.85336924, + "num_input_tokens_seen": 242003056, + "router_z_loss_mlp": 0.36254883, + "step": 2896, + "time_per_iteration": 2.55461049079895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086447, + "balance_loss_mlp": 1.04856229, + "epoch": 0.5573297422085417, + "flos": 577126010880.0, + "grad_norm": 0.05427319641394577, + "language_loss": 0.83001935, + "learning_rate": 0.0004316751387479726, + "loss": 0.84088391, + "num_input_tokens_seen": 242076368, + "router_z_loss_mlp": 0.37841797, + "step": 2897, + "time_per_iteration": 2.726621150970459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010828, + "balance_loss_mlp": 1.04622626, + "epoch": 0.5575221238938053, + "flos": 1343536754688.0, + "grad_norm": 0.07147882998338702, + "language_loss": 0.82389295, + "learning_rate": 0.0004313665321657409, + "loss": 0.83472097, + "num_input_tokens_seen": 242161600, + "router_z_loss_mlp": 0.36572266, + "step": 2898, + "time_per_iteration": 3.705557107925415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084994, + "balance_loss_mlp": 1.04756212, + "epoch": 0.5577145055790689, + "flos": 601680324096.0, + "grad_norm": 0.06263472170874507, + "language_loss": 0.80018216, + "learning_rate": 0.00043105795222938436, + "loss": 0.81103212, + "num_input_tokens_seen": 242237904, + "router_z_loss_mlp": 0.37451172, + "step": 2899, + "time_per_iteration": 2.7069091796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082839, + "balance_loss_mlp": 1.04500163, + "epoch": 0.5579068872643325, + "flos": 562353601536.0, + "grad_norm": 0.0921941925102754, + "language_loss": 0.78331131, + "learning_rate": 0.00043074939905870467, + "loss": 0.79413968, + "num_input_tokens_seen": 242306736, + "router_z_loss_mlp": 0.37817383, + "step": 2900, + "time_per_iteration": 2.6597537994384766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108264, + "balance_loss_mlp": 1.04468393, + "epoch": 0.558099268949596, + "flos": 544292904960.0, + "grad_norm": 0.05487003421557055, + "language_loss": 0.80032802, + "learning_rate": 0.0004304408727734927, + "loss": 0.81115448, + "num_input_tokens_seen": 242376000, + "router_z_loss_mlp": 0.37939453, + "step": 2901, + "time_per_iteration": 2.61590838432312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077829, + "balance_loss_mlp": 1.04120803, + "epoch": 0.5582916506348595, + "flos": 552520825344.0, + "grad_norm": 0.05406538300276566, + "language_loss": 0.88821226, + "learning_rate": 0.0004301323734935288, + "loss": 0.89899063, + "num_input_tokens_seen": 242447056, + "router_z_loss_mlp": 0.36645508, + "step": 2902, + "time_per_iteration": 2.6357102394104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082783, + "balance_loss_mlp": 1.04573286, + "epoch": 0.5584840323201231, + "flos": 543126007296.0, + "grad_norm": 0.054631389421551546, + "language_loss": 0.87217975, + "learning_rate": 0.000429823901338583, + "loss": 0.88300759, + "num_input_tokens_seen": 242514400, + "router_z_loss_mlp": 0.37011719, + "step": 2903, + "time_per_iteration": 2.6050922870635986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073691, + "balance_loss_mlp": 1.03678417, + "epoch": 0.5586764140053867, + "flos": 815212118016.0, + "grad_norm": 0.05529085617610277, + "language_loss": 0.86446041, + "learning_rate": 0.00042951545642841513, + "loss": 0.87519729, + "num_input_tokens_seen": 242601616, + "router_z_loss_mlp": 0.36914062, + "step": 2904, + "time_per_iteration": 3.0609569549560547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076695, + "balance_loss_mlp": 1.03981209, + "epoch": 0.5588687956906503, + "flos": 486196895232.0, + "grad_norm": 0.04557850009306157, + "language_loss": 0.86361349, + "learning_rate": 0.0004292070388827737, + "loss": 0.87438047, + "num_input_tokens_seen": 242669648, + "router_z_loss_mlp": 0.3684082, + "step": 2905, + "time_per_iteration": 2.5549428462982178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107784, + "balance_loss_mlp": 1.04017019, + "epoch": 0.5590611773759138, + "flos": 451809805824.0, + "grad_norm": 0.04842795237529701, + "language_loss": 0.8078168, + "learning_rate": 0.00042889864882139753, + "loss": 0.81859523, + "num_input_tokens_seen": 242737456, + "router_z_loss_mlp": 0.37646484, + "step": 2906, + "time_per_iteration": 2.6019363403320312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072955, + "balance_loss_mlp": 1.03662026, + "epoch": 0.5592535590611774, + "flos": 520693083648.0, + "grad_norm": 0.04884179046821603, + "language_loss": 0.81762469, + "learning_rate": 0.0004285902863640139, + "loss": 0.8283543, + "num_input_tokens_seen": 242807008, + "router_z_loss_mlp": 0.36352539, + "step": 2907, + "time_per_iteration": 2.5899524688720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072848, + "balance_loss_mlp": 1.03622651, + "epoch": 0.5594459407464409, + "flos": 552250192896.0, + "grad_norm": 0.048074009249812255, + "language_loss": 0.8615104, + "learning_rate": 0.00042828195163033966, + "loss": 0.87223887, + "num_input_tokens_seen": 242877328, + "router_z_loss_mlp": 0.36645508, + "step": 2908, + "time_per_iteration": 2.676518440246582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072859, + "balance_loss_mlp": 1.03585625, + "epoch": 0.5596383224317045, + "flos": 484596421632.0, + "grad_norm": 0.0512741694464887, + "language_loss": 0.79307508, + "learning_rate": 0.0004279736447400812, + "loss": 0.80380368, + "num_input_tokens_seen": 242943152, + "router_z_loss_mlp": 0.36987305, + "step": 2909, + "time_per_iteration": 2.590859889984131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074509, + "balance_loss_mlp": 1.03676748, + "epoch": 0.5598307041169681, + "flos": 610976217600.0, + "grad_norm": 0.05469922136848912, + "language_loss": 0.78325337, + "learning_rate": 0.00042766536581293385, + "loss": 0.79399848, + "num_input_tokens_seen": 243014656, + "router_z_loss_mlp": 0.37695312, + "step": 2910, + "time_per_iteration": 2.7034008502960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074194, + "balance_loss_mlp": 1.03654802, + "epoch": 0.5600230858022316, + "flos": 488585945088.0, + "grad_norm": 0.05207227245540468, + "language_loss": 0.79564762, + "learning_rate": 0.0004273571149685819, + "loss": 0.80638957, + "num_input_tokens_seen": 243089040, + "router_z_loss_mlp": 0.37597656, + "step": 2911, + "time_per_iteration": 2.7075796127319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074918, + "balance_loss_mlp": 1.03650868, + "epoch": 0.5602154674874952, + "flos": 598592858112.0, + "grad_norm": 0.04994756976596268, + "language_loss": 0.84006047, + "learning_rate": 0.00042704889232669937, + "loss": 0.85080969, + "num_input_tokens_seen": 243162480, + "router_z_loss_mlp": 0.38354492, + "step": 2912, + "time_per_iteration": 2.6922175884246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071101, + "balance_loss_mlp": 1.03431344, + "epoch": 0.5604078491727588, + "flos": 585697347072.0, + "grad_norm": 0.05437848146357707, + "language_loss": 0.85302234, + "learning_rate": 0.0004267406980069484, + "loss": 0.86373341, + "num_input_tokens_seen": 243232880, + "router_z_loss_mlp": 0.36791992, + "step": 2913, + "time_per_iteration": 2.70796537399292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067512, + "balance_loss_mlp": 1.03077149, + "epoch": 0.5606002308580224, + "flos": 540926042112.0, + "grad_norm": 0.045341959008097614, + "language_loss": 0.79753983, + "learning_rate": 0.0004264325321289808, + "loss": 0.80821496, + "num_input_tokens_seen": 243309168, + "router_z_loss_mlp": 0.3671875, + "step": 2914, + "time_per_iteration": 2.761362314224243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107061, + "balance_loss_mlp": 1.03241491, + "epoch": 0.5607926125432858, + "flos": 583654533120.0, + "grad_norm": 0.0532534560102953, + "language_loss": 0.85864502, + "learning_rate": 0.00042612439481243736, + "loss": 0.86935115, + "num_input_tokens_seen": 243382064, + "router_z_loss_mlp": 0.38183594, + "step": 2915, + "time_per_iteration": 2.745008945465088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073585, + "balance_loss_mlp": 1.03655863, + "epoch": 0.5609849942285494, + "flos": 627205095936.0, + "grad_norm": 0.06454697115510677, + "language_loss": 0.90024638, + "learning_rate": 0.00042581628617694735, + "loss": 0.91098225, + "num_input_tokens_seen": 243452064, + "router_z_loss_mlp": 0.37036133, + "step": 2916, + "time_per_iteration": 2.7654495239257812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072441, + "balance_loss_mlp": 1.0346992, + "epoch": 0.561177375913813, + "flos": 588095161344.0, + "grad_norm": 0.05235254168005436, + "language_loss": 0.81651318, + "learning_rate": 0.0004255082063421296, + "loss": 0.82723755, + "num_input_tokens_seen": 243525600, + "router_z_loss_mlp": 0.37719727, + "step": 2917, + "time_per_iteration": 2.674204111099243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107424, + "balance_loss_mlp": 1.03726149, + "epoch": 0.5613697575990766, + "flos": 526774883328.0, + "grad_norm": 0.05687183599046208, + "language_loss": 0.8481921, + "learning_rate": 0.00042520015542759065, + "loss": 0.85893452, + "num_input_tokens_seen": 243605536, + "router_z_loss_mlp": 0.36987305, + "step": 2918, + "time_per_iteration": 2.8309459686279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079966, + "balance_loss_mlp": 1.04134226, + "epoch": 0.5615621392843402, + "flos": 642351444480.0, + "grad_norm": 0.05024796403090353, + "language_loss": 0.88020825, + "learning_rate": 0.00042489213355292687, + "loss": 0.89100802, + "num_input_tokens_seen": 243684208, + "router_z_loss_mlp": 0.38598633, + "step": 2919, + "time_per_iteration": 2.8605942726135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083327, + "balance_loss_mlp": 1.04444087, + "epoch": 0.5617545209696037, + "flos": 427524715008.0, + "grad_norm": 0.05130722807003229, + "language_loss": 0.8097831, + "learning_rate": 0.00042458414083772276, + "loss": 0.82061636, + "num_input_tokens_seen": 243749376, + "router_z_loss_mlp": 0.38842773, + "step": 2920, + "time_per_iteration": 2.5186893939971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076823, + "balance_loss_mlp": 1.03920078, + "epoch": 0.5619469026548672, + "flos": 568140037632.0, + "grad_norm": 0.04280127072200588, + "language_loss": 0.84787017, + "learning_rate": 0.000424276177401552, + "loss": 0.85863835, + "num_input_tokens_seen": 243828096, + "router_z_loss_mlp": 0.37597656, + "step": 2921, + "time_per_iteration": 2.773881435394287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079464, + "balance_loss_mlp": 1.04203272, + "epoch": 0.5621392843401308, + "flos": 504947243520.0, + "grad_norm": 0.056711430924252765, + "language_loss": 0.85714108, + "learning_rate": 0.0004239682433639763, + "loss": 0.86793578, + "num_input_tokens_seen": 243896752, + "router_z_loss_mlp": 0.37426758, + "step": 2922, + "time_per_iteration": 2.714646816253662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081153, + "balance_loss_mlp": 1.04477036, + "epoch": 0.5623316660253944, + "flos": 516744258048.0, + "grad_norm": 0.060505090734525195, + "language_loss": 0.85348099, + "learning_rate": 0.0004236603388445467, + "loss": 0.8642925, + "num_input_tokens_seen": 243964592, + "router_z_loss_mlp": 0.36425781, + "step": 2923, + "time_per_iteration": 2.6141107082366943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075818, + "balance_loss_mlp": 1.03905368, + "epoch": 0.5625240477106579, + "flos": 605732456448.0, + "grad_norm": 0.05369747698254185, + "language_loss": 0.81871819, + "learning_rate": 0.00042335246396280166, + "loss": 0.82947636, + "num_input_tokens_seen": 244036656, + "router_z_loss_mlp": 0.3671875, + "step": 2924, + "time_per_iteration": 2.7129671573638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081443, + "balance_loss_mlp": 1.0438447, + "epoch": 0.5627164293959215, + "flos": 450203539968.0, + "grad_norm": 0.06323509209264203, + "language_loss": 0.89955974, + "learning_rate": 0.0004230446188382693, + "loss": 0.9103741, + "num_input_tokens_seen": 244102704, + "router_z_loss_mlp": 0.3762207, + "step": 2925, + "time_per_iteration": 2.5567660331726074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077212, + "balance_loss_mlp": 1.04101968, + "epoch": 0.5629088110811851, + "flos": 741734032896.0, + "grad_norm": 0.055420573846539395, + "language_loss": 0.80082184, + "learning_rate": 0.0004227368035904654, + "loss": 0.81159395, + "num_input_tokens_seen": 244186640, + "router_z_loss_mlp": 0.36181641, + "step": 2926, + "time_per_iteration": 2.947251319885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108299, + "balance_loss_mlp": 1.04610705, + "epoch": 0.5631011927664487, + "flos": 496719323136.0, + "grad_norm": 0.04719463019166682, + "language_loss": 0.82913107, + "learning_rate": 0.00042242901833889474, + "loss": 0.83996093, + "num_input_tokens_seen": 244257680, + "router_z_loss_mlp": 0.36889648, + "step": 2927, + "time_per_iteration": 2.6429412364959717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085596, + "balance_loss_mlp": 1.0498333, + "epoch": 0.5632935744517122, + "flos": 885774445056.0, + "grad_norm": 0.055780235249339845, + "language_loss": 0.85862845, + "learning_rate": 0.0004221212632030501, + "loss": 0.86948442, + "num_input_tokens_seen": 244331248, + "router_z_loss_mlp": 0.35791016, + "step": 2928, + "time_per_iteration": 3.0935142040252686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085595, + "balance_loss_mlp": 1.04897451, + "epoch": 0.5634859561369757, + "flos": 604516096512.0, + "grad_norm": 0.08179321361553939, + "language_loss": 0.80431306, + "learning_rate": 0.0004218135383024124, + "loss": 0.81516898, + "num_input_tokens_seen": 244403920, + "router_z_loss_mlp": 0.3659668, + "step": 2929, + "time_per_iteration": 2.688404083251953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079905, + "balance_loss_mlp": 1.04359436, + "epoch": 0.5636783378222393, + "flos": 453670737408.0, + "grad_norm": 0.05341288147748167, + "language_loss": 0.85107243, + "learning_rate": 0.0004215058437564511, + "loss": 0.86187148, + "num_input_tokens_seen": 244470464, + "router_z_loss_mlp": 0.36352539, + "step": 2930, + "time_per_iteration": 2.5591979026794434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083948, + "balance_loss_mlp": 1.04725528, + "epoch": 0.5638707195075029, + "flos": 518206519296.0, + "grad_norm": 0.06241038231461263, + "language_loss": 0.82415265, + "learning_rate": 0.00042119817968462397, + "loss": 0.83499211, + "num_input_tokens_seen": 244536864, + "router_z_loss_mlp": 0.36694336, + "step": 2931, + "time_per_iteration": 2.5755324363708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075664, + "balance_loss_mlp": 1.03916192, + "epoch": 0.5640631011927665, + "flos": 564632142336.0, + "grad_norm": 0.06755883510394861, + "language_loss": 0.87004125, + "learning_rate": 0.0004208905462063766, + "loss": 0.88079786, + "num_input_tokens_seen": 244603344, + "router_z_loss_mlp": 0.36499023, + "step": 2932, + "time_per_iteration": 2.6330130100250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077211, + "balance_loss_mlp": 1.04097116, + "epoch": 0.56425548287803, + "flos": 516783545856.0, + "grad_norm": 0.04875434703648171, + "language_loss": 0.84473455, + "learning_rate": 0.00042058294344114315, + "loss": 0.85550666, + "num_input_tokens_seen": 244671984, + "router_z_loss_mlp": 0.36254883, + "step": 2933, + "time_per_iteration": 2.60188627243042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081111, + "balance_loss_mlp": 1.04477572, + "epoch": 0.5644478645632935, + "flos": 853907415552.0, + "grad_norm": 0.05278955631679875, + "language_loss": 0.77495515, + "learning_rate": 0.0004202753715083456, + "loss": 0.78576624, + "num_input_tokens_seen": 244754000, + "router_z_loss_mlp": 0.36352539, + "step": 2934, + "time_per_iteration": 3.0625100135803223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084889, + "balance_loss_mlp": 1.04860175, + "epoch": 0.5646402462485571, + "flos": 553175571456.0, + "grad_norm": 0.05717629686508025, + "language_loss": 0.81433523, + "learning_rate": 0.0004199678305273936, + "loss": 0.82518411, + "num_input_tokens_seen": 244820896, + "router_z_loss_mlp": 0.36279297, + "step": 2935, + "time_per_iteration": 2.6390254497528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082496, + "balance_loss_mlp": 1.04587531, + "epoch": 0.5648326279338207, + "flos": 685661898240.0, + "grad_norm": 0.05411523361189988, + "language_loss": 0.81180829, + "learning_rate": 0.0004196603206176854, + "loss": 0.82263327, + "num_input_tokens_seen": 244904464, + "router_z_loss_mlp": 0.36669922, + "step": 2936, + "time_per_iteration": 2.9184954166412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079161, + "balance_loss_mlp": 1.04354107, + "epoch": 0.5650250096190843, + "flos": 802990291968.0, + "grad_norm": 0.04902014595353554, + "language_loss": 0.83833814, + "learning_rate": 0.000419352841898607, + "loss": 0.84912974, + "num_input_tokens_seen": 244983760, + "router_z_loss_mlp": 0.35644531, + "step": 2937, + "time_per_iteration": 2.963693618774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078443, + "balance_loss_mlp": 1.04248953, + "epoch": 0.5652173913043478, + "flos": 581787809280.0, + "grad_norm": 0.05926519799053672, + "language_loss": 0.77107543, + "learning_rate": 0.000419045394489532, + "loss": 0.78185987, + "num_input_tokens_seen": 245053184, + "router_z_loss_mlp": 0.359375, + "step": 2938, + "time_per_iteration": 2.727398633956909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076353, + "balance_loss_mlp": 1.03975606, + "epoch": 0.5654097729896114, + "flos": 820269614592.0, + "grad_norm": 0.053889258634032246, + "language_loss": 0.76768535, + "learning_rate": 0.0004187379785098224, + "loss": 0.77844894, + "num_input_tokens_seen": 245137408, + "router_z_loss_mlp": 0.3659668, + "step": 2939, + "time_per_iteration": 3.1188313961029053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079563, + "balance_loss_mlp": 1.04339492, + "epoch": 0.565602154674875, + "flos": 783826716672.0, + "grad_norm": 0.05512056097545077, + "language_loss": 0.83633238, + "learning_rate": 0.00041843059407882744, + "loss": 0.84712803, + "num_input_tokens_seen": 245215504, + "router_z_loss_mlp": 0.36206055, + "step": 2940, + "time_per_iteration": 2.983302116394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076269, + "balance_loss_mlp": 1.04072082, + "epoch": 0.5657945363601385, + "flos": 549418802688.0, + "grad_norm": 0.05159052201649483, + "language_loss": 0.82491434, + "learning_rate": 0.0004181232413158842, + "loss": 0.83567703, + "num_input_tokens_seen": 245286032, + "router_z_loss_mlp": 0.35571289, + "step": 2941, + "time_per_iteration": 2.6737120151519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076838, + "balance_loss_mlp": 1.04028893, + "epoch": 0.5659869180454021, + "flos": 667826754048.0, + "grad_norm": 0.06466569325042074, + "language_loss": 0.82093412, + "learning_rate": 0.0004178159203403179, + "loss": 0.83170253, + "num_input_tokens_seen": 245359040, + "router_z_loss_mlp": 0.36547852, + "step": 2942, + "time_per_iteration": 2.8263752460479736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077242, + "balance_loss_mlp": 1.0423857, + "epoch": 0.5661792997306656, + "flos": 499707864576.0, + "grad_norm": 0.05486974364690197, + "language_loss": 0.81532693, + "learning_rate": 0.0004175086312714409, + "loss": 0.82609934, + "num_input_tokens_seen": 245426384, + "router_z_loss_mlp": 0.34912109, + "step": 2943, + "time_per_iteration": 2.5581164360046387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084461, + "balance_loss_mlp": 1.04848337, + "epoch": 0.5663716814159292, + "flos": 600922271232.0, + "grad_norm": 0.04881995286740945, + "language_loss": 0.83686805, + "learning_rate": 0.00041720137422855366, + "loss": 0.84771264, + "num_input_tokens_seen": 245501216, + "router_z_loss_mlp": 0.35961914, + "step": 2944, + "time_per_iteration": 2.7574734687805176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080125, + "balance_loss_mlp": 1.04390931, + "epoch": 0.5665640631011928, + "flos": 540728193024.0, + "grad_norm": 0.05214507443979086, + "language_loss": 0.79004753, + "learning_rate": 0.00041689414933094383, + "loss": 0.80084872, + "num_input_tokens_seen": 245571600, + "router_z_loss_mlp": 0.36230469, + "step": 2945, + "time_per_iteration": 2.6470541954040527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080942, + "balance_loss_mlp": 1.0463953, + "epoch": 0.5667564447864564, + "flos": 601655592960.0, + "grad_norm": 0.06146311821637782, + "language_loss": 0.80673099, + "learning_rate": 0.00041658695669788653, + "loss": 0.81754035, + "num_input_tokens_seen": 245645632, + "router_z_loss_mlp": 0.34594727, + "step": 2946, + "time_per_iteration": 2.721078872680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083586, + "balance_loss_mlp": 1.04791868, + "epoch": 0.5669488264717198, + "flos": 659224894464.0, + "grad_norm": 0.05891401598443517, + "language_loss": 0.80939281, + "learning_rate": 0.00041627979644864453, + "loss": 0.82022864, + "num_input_tokens_seen": 245715776, + "router_z_loss_mlp": 0.35717773, + "step": 2947, + "time_per_iteration": 2.877037286758423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085181, + "balance_loss_mlp": 1.04941845, + "epoch": 0.5671412081569834, + "flos": 485158035456.0, + "grad_norm": 0.042998309327625356, + "language_loss": 0.809735, + "learning_rate": 0.0004159726687024683, + "loss": 0.8205868, + "num_input_tokens_seen": 245785328, + "router_z_loss_mlp": 0.35791016, + "step": 2948, + "time_per_iteration": 2.617147207260132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084185, + "balance_loss_mlp": 1.04832673, + "epoch": 0.567333589842247, + "flos": 729487475712.0, + "grad_norm": 0.049875608566737006, + "language_loss": 0.79203111, + "learning_rate": 0.00041566557357859506, + "loss": 0.80287302, + "num_input_tokens_seen": 245858000, + "router_z_loss_mlp": 0.35888672, + "step": 2949, + "time_per_iteration": 2.859217882156372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080493, + "balance_loss_mlp": 1.04494464, + "epoch": 0.5675259715275106, + "flos": 968471258112.0, + "grad_norm": 0.06410563873068757, + "language_loss": 0.79063594, + "learning_rate": 0.0004153585111962502, + "loss": 0.80144083, + "num_input_tokens_seen": 245950640, + "router_z_loss_mlp": 0.35571289, + "step": 2950, + "time_per_iteration": 3.3080387115478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084603, + "balance_loss_mlp": 1.04767203, + "epoch": 0.5677183532127742, + "flos": 564879453696.0, + "grad_norm": 0.058242755990822084, + "language_loss": 0.84030402, + "learning_rate": 0.0004150514816746453, + "loss": 0.85115004, + "num_input_tokens_seen": 246019568, + "router_z_loss_mlp": 0.36938477, + "step": 2951, + "time_per_iteration": 2.66630220413208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080246, + "balance_loss_mlp": 1.04517412, + "epoch": 0.5679107348980377, + "flos": 551432503296.0, + "grad_norm": 0.05117838990465897, + "language_loss": 0.85669959, + "learning_rate": 0.0004147444851329802, + "loss": 0.86750209, + "num_input_tokens_seen": 246089520, + "router_z_loss_mlp": 0.35107422, + "step": 2952, + "time_per_iteration": 2.645735502243042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108119, + "balance_loss_mlp": 1.04585648, + "epoch": 0.5681031165833013, + "flos": 819115863552.0, + "grad_norm": 0.04931619960622222, + "language_loss": 0.85395974, + "learning_rate": 0.00041443752169044126, + "loss": 0.8647716, + "num_input_tokens_seen": 246165920, + "router_z_loss_mlp": 0.35351562, + "step": 2953, + "time_per_iteration": 3.025468349456787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087019, + "balance_loss_mlp": 1.05116129, + "epoch": 0.5682954982685648, + "flos": 617731702272.0, + "grad_norm": 0.05138113495872943, + "language_loss": 0.84811544, + "learning_rate": 0.0004141305914662025, + "loss": 0.85898566, + "num_input_tokens_seen": 246238672, + "router_z_loss_mlp": 0.35888672, + "step": 2954, + "time_per_iteration": 2.7767860889434814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108515, + "balance_loss_mlp": 1.04848099, + "epoch": 0.5684878799538284, + "flos": 647625729024.0, + "grad_norm": 0.04880277930525614, + "language_loss": 0.80257368, + "learning_rate": 0.0004138236945794246, + "loss": 0.81342518, + "num_input_tokens_seen": 246320208, + "router_z_loss_mlp": 0.36645508, + "step": 2955, + "time_per_iteration": 2.9492557048797607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079722, + "balance_loss_mlp": 1.04434061, + "epoch": 0.5686802616390919, + "flos": 805615068672.0, + "grad_norm": 0.060523381383535066, + "language_loss": 0.83239132, + "learning_rate": 0.00041351683114925576, + "loss": 0.84318852, + "num_input_tokens_seen": 246406464, + "router_z_loss_mlp": 0.35424805, + "step": 2956, + "time_per_iteration": 3.0558693408966064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080356, + "balance_loss_mlp": 1.0441637, + "epoch": 0.5688726433243555, + "flos": 546882776064.0, + "grad_norm": 0.06102379875806974, + "language_loss": 0.86688364, + "learning_rate": 0.0004132100012948308, + "loss": 0.87768722, + "num_input_tokens_seen": 246477456, + "router_z_loss_mlp": 0.36230469, + "step": 2957, + "time_per_iteration": 2.6131510734558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084018, + "balance_loss_mlp": 1.04689598, + "epoch": 0.5690650250096191, + "flos": 486324933120.0, + "grad_norm": 0.05856765821562534, + "language_loss": 0.84111595, + "learning_rate": 0.00041290320513527145, + "loss": 0.85195613, + "num_input_tokens_seen": 246541744, + "router_z_loss_mlp": 0.37133789, + "step": 2958, + "time_per_iteration": 2.584434986114502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077095, + "balance_loss_mlp": 1.04154706, + "epoch": 0.5692574066948827, + "flos": 577184237568.0, + "grad_norm": 0.04674501738886335, + "language_loss": 0.85154927, + "learning_rate": 0.0004125964427896867, + "loss": 0.86232018, + "num_input_tokens_seen": 246611440, + "router_z_loss_mlp": 0.35571289, + "step": 2959, + "time_per_iteration": 2.6582295894622803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071399, + "balance_loss_mlp": 1.03551733, + "epoch": 0.5694497883801463, + "flos": 454005388800.0, + "grad_norm": 0.055082869163009494, + "language_loss": 0.79042369, + "learning_rate": 0.0004122897143771723, + "loss": 0.80113769, + "num_input_tokens_seen": 246676496, + "router_z_loss_mlp": 0.35888672, + "step": 2960, + "time_per_iteration": 2.555941104888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075778, + "balance_loss_mlp": 1.0394429, + "epoch": 0.5696421700654097, + "flos": 559251578880.0, + "grad_norm": 0.0498118595632428, + "language_loss": 0.81253064, + "learning_rate": 0.0004119830200168109, + "loss": 0.82328844, + "num_input_tokens_seen": 246746464, + "router_z_loss_mlp": 0.36376953, + "step": 2961, + "time_per_iteration": 2.6521012783050537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073119, + "balance_loss_mlp": 1.03780937, + "epoch": 0.5698345517506733, + "flos": 465314982912.0, + "grad_norm": 0.05616905034177488, + "language_loss": 0.8830415, + "learning_rate": 0.0004116763598276714, + "loss": 0.89377272, + "num_input_tokens_seen": 246811808, + "router_z_loss_mlp": 0.35327148, + "step": 2962, + "time_per_iteration": 2.5006790161132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073408, + "balance_loss_mlp": 1.03702545, + "epoch": 0.5700269334359369, + "flos": 605645116416.0, + "grad_norm": 0.05368070912324084, + "language_loss": 0.8055867, + "learning_rate": 0.00041136973392881017, + "loss": 0.81632078, + "num_input_tokens_seen": 246890432, + "router_z_loss_mlp": 0.36376953, + "step": 2963, + "time_per_iteration": 2.8011715412139893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073783, + "balance_loss_mlp": 1.03852105, + "epoch": 0.5702193151212005, + "flos": 562423412736.0, + "grad_norm": 0.05977105557008513, + "language_loss": 0.81818962, + "learning_rate": 0.00041106314243926983, + "loss": 0.82892752, + "num_input_tokens_seen": 246959616, + "router_z_loss_mlp": 0.35302734, + "step": 2964, + "time_per_iteration": 2.7296242713928223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070445, + "balance_loss_mlp": 1.03558779, + "epoch": 0.570411696806464, + "flos": 522983208960.0, + "grad_norm": 0.05693204807949615, + "language_loss": 0.87045705, + "learning_rate": 0.0004107565854780798, + "loss": 0.88116145, + "num_input_tokens_seen": 247030656, + "router_z_loss_mlp": 0.34887695, + "step": 2965, + "time_per_iteration": 2.5964605808258057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075214, + "balance_loss_mlp": 1.04002357, + "epoch": 0.5706040784917276, + "flos": 717911631360.0, + "grad_norm": 0.05031367362382368, + "language_loss": 0.80980343, + "learning_rate": 0.000410450063164256, + "loss": 0.82055557, + "num_input_tokens_seen": 247105872, + "router_z_loss_mlp": 0.35229492, + "step": 2966, + "time_per_iteration": 2.8248300552368164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076636, + "balance_loss_mlp": 1.04127812, + "epoch": 0.5707964601769911, + "flos": 476467425792.0, + "grad_norm": 0.059966750204006415, + "language_loss": 0.8167066, + "learning_rate": 0.00041014357561680115, + "loss": 0.82747293, + "num_input_tokens_seen": 247170448, + "router_z_loss_mlp": 0.35351562, + "step": 2967, + "time_per_iteration": 2.4996910095214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077241, + "balance_loss_mlp": 1.04278946, + "epoch": 0.5709888418622547, + "flos": 579823570944.0, + "grad_norm": 0.05891056148222195, + "language_loss": 0.85875672, + "learning_rate": 0.0004098371229547039, + "loss": 0.86952913, + "num_input_tokens_seen": 247240400, + "router_z_loss_mlp": 0.3449707, + "step": 2968, + "time_per_iteration": 2.6908459663391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131838, + "balance_loss_mlp": 1.11677039, + "epoch": 0.5711812235475183, + "flos": 1579108290048.0, + "grad_norm": 0.050443633584492734, + "language_loss": 0.80010808, + "learning_rate": 0.0004095307052969399, + "loss": 0.81142646, + "num_input_tokens_seen": 247469136, + "router_z_loss_mlp": 0.15039062, + "step": 2969, + "time_per_iteration": 4.709675550460815 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107846, + "balance_loss_mlp": 1.04233932, + "epoch": 0.5713736052327818, + "flos": 468259854336.0, + "grad_norm": 0.04864564090032181, + "language_loss": 0.80513656, + "learning_rate": 0.00040922432276247107, + "loss": 0.81592119, + "num_input_tokens_seen": 247537712, + "router_z_loss_mlp": 0.36132812, + "step": 2970, + "time_per_iteration": 2.554276466369629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078647, + "balance_loss_mlp": 1.04412448, + "epoch": 0.5715659869180454, + "flos": 537390443520.0, + "grad_norm": 0.06858717783230618, + "language_loss": 0.84265316, + "learning_rate": 0.0004089179754702457, + "loss": 0.85343957, + "num_input_tokens_seen": 247613872, + "router_z_loss_mlp": 0.34570312, + "step": 2971, + "time_per_iteration": 2.7972512245178223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072054, + "balance_loss_mlp": 1.0365299, + "epoch": 0.571758368603309, + "flos": 655778045952.0, + "grad_norm": 0.0710461233457747, + "language_loss": 0.79649973, + "learning_rate": 0.00040861166353919843, + "loss": 0.80722028, + "num_input_tokens_seen": 247686064, + "router_z_loss_mlp": 0.35546875, + "step": 2972, + "time_per_iteration": 2.7805516719818115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076554, + "balance_loss_mlp": 1.04076695, + "epoch": 0.5719507502885726, + "flos": 667609966080.0, + "grad_norm": 0.05192257726698222, + "language_loss": 0.81693333, + "learning_rate": 0.00040830538708824983, + "loss": 0.82769883, + "num_input_tokens_seen": 247760384, + "router_z_loss_mlp": 0.35839844, + "step": 2973, + "time_per_iteration": 2.8635294437408447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070547, + "balance_loss_mlp": 1.03507066, + "epoch": 0.572143131973836, + "flos": 476083312128.0, + "grad_norm": 0.060626408017241236, + "language_loss": 0.81790257, + "learning_rate": 0.000407999146236307, + "loss": 0.82860804, + "num_input_tokens_seen": 247824768, + "router_z_loss_mlp": 0.35498047, + "step": 2974, + "time_per_iteration": 2.5645899772644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074889, + "balance_loss_mlp": 1.03943634, + "epoch": 0.5723355136590996, + "flos": 539255757312.0, + "grad_norm": 0.06009071322865027, + "language_loss": 0.83246768, + "learning_rate": 0.0004076929411022634, + "loss": 0.84321654, + "num_input_tokens_seen": 247894448, + "router_z_loss_mlp": 0.35449219, + "step": 2975, + "time_per_iteration": 2.655545234680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075121, + "balance_loss_mlp": 1.0383811, + "epoch": 0.5725278953443632, + "flos": 823784864256.0, + "grad_norm": 0.053970809123607175, + "language_loss": 0.79314309, + "learning_rate": 0.0004073867718049982, + "loss": 0.80389434, + "num_input_tokens_seen": 247976432, + "router_z_loss_mlp": 0.36743164, + "step": 2976, + "time_per_iteration": 3.0664896965026855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078369, + "balance_loss_mlp": 1.0429157, + "epoch": 0.5727202770296268, + "flos": 587155226112.0, + "grad_norm": 0.05912475797179562, + "language_loss": 0.82244706, + "learning_rate": 0.00040708063846337704, + "loss": 0.83323073, + "num_input_tokens_seen": 248048800, + "router_z_loss_mlp": 0.35522461, + "step": 2977, + "time_per_iteration": 2.7131478786468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083864, + "balance_loss_mlp": 1.04800642, + "epoch": 0.5729126587148904, + "flos": 446723195904.0, + "grad_norm": 0.048537452765021645, + "language_loss": 0.80846637, + "learning_rate": 0.00040677454119625143, + "loss": 0.81930506, + "num_input_tokens_seen": 248116496, + "router_z_loss_mlp": 0.35864258, + "step": 2978, + "time_per_iteration": 2.6209888458251953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078154, + "balance_loss_mlp": 1.0418427, + "epoch": 0.5731050404001539, + "flos": 519206091264.0, + "grad_norm": 0.05702144714813726, + "language_loss": 0.82471335, + "learning_rate": 0.0004064684801224587, + "loss": 0.83549494, + "num_input_tokens_seen": 248184960, + "router_z_loss_mlp": 0.36328125, + "step": 2979, + "time_per_iteration": 2.5915722846984863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077909, + "balance_loss_mlp": 1.04197955, + "epoch": 0.5732974220854175, + "flos": 504528224256.0, + "grad_norm": 0.05171310351774622, + "language_loss": 0.80115962, + "learning_rate": 0.00040616245536082224, + "loss": 0.8119387, + "num_input_tokens_seen": 248252208, + "router_z_loss_mlp": 0.35961914, + "step": 2980, + "time_per_iteration": 2.6032769680023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076097, + "balance_loss_mlp": 1.04057276, + "epoch": 0.573489803770681, + "flos": 592187991552.0, + "grad_norm": 0.049753074122949235, + "language_loss": 0.80894011, + "learning_rate": 0.00040585646703015165, + "loss": 0.81970108, + "num_input_tokens_seen": 248333312, + "router_z_loss_mlp": 0.35522461, + "step": 2981, + "time_per_iteration": 2.79546856880188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074296, + "balance_loss_mlp": 1.03891444, + "epoch": 0.5736821854559446, + "flos": 489672857088.0, + "grad_norm": 0.06088968225358262, + "language_loss": 0.78612393, + "learning_rate": 0.0004055505152492419, + "loss": 0.79686689, + "num_input_tokens_seen": 248403808, + "router_z_loss_mlp": 0.35449219, + "step": 2982, + "time_per_iteration": 2.6494040489196777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078742, + "balance_loss_mlp": 1.04283655, + "epoch": 0.5738745671412081, + "flos": 457895987712.0, + "grad_norm": 0.05054468303814383, + "language_loss": 0.74372864, + "learning_rate": 0.00040524460013687425, + "loss": 0.75451601, + "num_input_tokens_seen": 248477184, + "router_z_loss_mlp": 0.359375, + "step": 2983, + "time_per_iteration": 2.7171366214752197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078136, + "balance_loss_mlp": 1.04294515, + "epoch": 0.5740669488264717, + "flos": 580012655616.0, + "grad_norm": 0.044553783792680594, + "language_loss": 0.80828458, + "learning_rate": 0.0004049387218118155, + "loss": 0.81906593, + "num_input_tokens_seen": 248565552, + "router_z_loss_mlp": 0.35229492, + "step": 2984, + "time_per_iteration": 2.995347738265991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073612, + "balance_loss_mlp": 1.03725314, + "epoch": 0.5742593305117353, + "flos": 524155898880.0, + "grad_norm": 0.05730874981758524, + "language_loss": 0.8475495, + "learning_rate": 0.00040463288039281777, + "loss": 0.85828567, + "num_input_tokens_seen": 248635456, + "router_z_loss_mlp": 0.36328125, + "step": 2985, + "time_per_iteration": 2.715092182159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102117, + "balance_loss_mlp": 1.0106324, + "epoch": 0.5744517121969989, + "flos": 1553033488896.0, + "grad_norm": 0.021440825644231668, + "language_loss": 0.77876419, + "learning_rate": 0.0004043270759986194, + "loss": 0.78897589, + "num_input_tokens_seen": 248870160, + "router_z_loss_mlp": 0.10546875, + "step": 2986, + "time_per_iteration": 4.936111211776733 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071312, + "balance_loss_mlp": 1.03588247, + "epoch": 0.5746440938822625, + "flos": 751600304640.0, + "grad_norm": 0.05668637583843988, + "language_loss": 0.81840217, + "learning_rate": 0.0004040213087479444, + "loss": 0.82911527, + "num_input_tokens_seen": 248946960, + "router_z_loss_mlp": 0.35449219, + "step": 2987, + "time_per_iteration": 2.949164628982544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074412, + "balance_loss_mlp": 1.03955531, + "epoch": 0.5748364755675259, + "flos": 501618258432.0, + "grad_norm": 0.05762088821448085, + "language_loss": 0.84999508, + "learning_rate": 0.0004037155787595018, + "loss": 0.86073923, + "num_input_tokens_seen": 249014128, + "router_z_loss_mlp": 0.34887695, + "step": 2988, + "time_per_iteration": 2.6570816040039062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010738, + "balance_loss_mlp": 1.03863311, + "epoch": 0.5750288572527895, + "flos": 503757024768.0, + "grad_norm": 0.17757642281187902, + "language_loss": 0.80609345, + "learning_rate": 0.000403409886151987, + "loss": 0.81683147, + "num_input_tokens_seen": 249090016, + "router_z_loss_mlp": 0.35205078, + "step": 2989, + "time_per_iteration": 2.913994073867798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014651, + "balance_loss_mlp": 1.00430369, + "epoch": 0.5752212389380531, + "flos": 1540541030400.0, + "grad_norm": 0.007550989320398048, + "language_loss": 0.81999105, + "learning_rate": 0.0004031042310440799, + "loss": 0.83013755, + "num_input_tokens_seen": 249305552, + "router_z_loss_mlp": 0.10351562, + "step": 2990, + "time_per_iteration": 4.7991979122161865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020765, + "balance_loss_mlp": 1.01027453, + "epoch": 0.5754136206233167, + "flos": 1566499378176.0, + "grad_norm": 0.009415259483784648, + "language_loss": 0.781986, + "learning_rate": 0.00040279861355444656, + "loss": 0.79219365, + "num_input_tokens_seen": 249523408, + "router_z_loss_mlp": 0.10498047, + "step": 2991, + "time_per_iteration": 4.760354280471802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076847, + "balance_loss_mlp": 1.04282451, + "epoch": 0.5756060023085803, + "flos": 797806167552.0, + "grad_norm": 0.05030181344669937, + "language_loss": 0.76800382, + "learning_rate": 0.00040249303380173807, + "loss": 0.77877235, + "num_input_tokens_seen": 249616624, + "router_z_loss_mlp": 0.34057617, + "step": 2992, + "time_per_iteration": 3.083129644393921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080903, + "balance_loss_mlp": 1.04573631, + "epoch": 0.5757983839938438, + "flos": 587588802048.0, + "grad_norm": 0.05896593059815975, + "language_loss": 0.78794599, + "learning_rate": 0.00040218749190459126, + "loss": 0.79875505, + "num_input_tokens_seen": 249689936, + "router_z_loss_mlp": 0.35229492, + "step": 2993, + "time_per_iteration": 2.763256788253784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083869, + "balance_loss_mlp": 1.04884517, + "epoch": 0.5759907656791073, + "flos": 516576932352.0, + "grad_norm": 0.05409710441005256, + "language_loss": 0.82655573, + "learning_rate": 0.00040188198798162775, + "loss": 0.83739436, + "num_input_tokens_seen": 249759984, + "router_z_loss_mlp": 0.35058594, + "step": 2994, + "time_per_iteration": 2.6000871658325195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078452, + "balance_loss_mlp": 1.04333293, + "epoch": 0.5761831473643709, + "flos": 586845305856.0, + "grad_norm": 0.05831918093224265, + "language_loss": 0.85334295, + "learning_rate": 0.000401576522151455, + "loss": 0.8641274, + "num_input_tokens_seen": 249837888, + "router_z_loss_mlp": 0.3515625, + "step": 2995, + "time_per_iteration": 2.808647871017456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081176, + "balance_loss_mlp": 1.04672456, + "epoch": 0.5763755290496345, + "flos": 543619219968.0, + "grad_norm": 0.04257335582462403, + "language_loss": 0.82291412, + "learning_rate": 0.0004012710945326651, + "loss": 0.83372593, + "num_input_tokens_seen": 249913584, + "router_z_loss_mlp": 0.34472656, + "step": 2996, + "time_per_iteration": 2.7611968517303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082283, + "balance_loss_mlp": 1.04749799, + "epoch": 0.576567910734898, + "flos": 625930509312.0, + "grad_norm": 0.050767561493079726, + "language_loss": 0.80952752, + "learning_rate": 0.0004009657052438355, + "loss": 0.82035035, + "num_input_tokens_seen": 249992144, + "router_z_loss_mlp": 0.34814453, + "step": 2997, + "time_per_iteration": 2.788496971130371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107931, + "balance_loss_mlp": 1.04392815, + "epoch": 0.5767602924201616, + "flos": 537985552896.0, + "grad_norm": 0.053276481047857226, + "language_loss": 0.85359365, + "learning_rate": 0.00040066035440352904, + "loss": 0.86438668, + "num_input_tokens_seen": 250060736, + "router_z_loss_mlp": 0.35400391, + "step": 2998, + "time_per_iteration": 2.6187028884887695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010656, + "balance_loss_mlp": 1.05358338, + "epoch": 0.5769526741054252, + "flos": 1558969873920.0, + "grad_norm": 0.027624435835290975, + "language_loss": 0.79293132, + "learning_rate": 0.0004003550421302934, + "loss": 0.80358732, + "num_input_tokens_seen": 250296864, + "router_z_loss_mlp": 0.12011719, + "step": 2999, + "time_per_iteration": 4.880754470825195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085333, + "balance_loss_mlp": 1.05071473, + "epoch": 0.5771450557906888, + "flos": 467939759616.0, + "grad_norm": 0.056203987299685475, + "language_loss": 0.7605744, + "learning_rate": 0.00040004976854266145, + "loss": 0.77142775, + "num_input_tokens_seen": 250362528, + "router_z_loss_mlp": 0.34667969, + "step": 3000, + "time_per_iteration": 2.537555694580078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079139, + "balance_loss_mlp": 1.043329, + "epoch": 0.5773374374759523, + "flos": 574288828416.0, + "grad_norm": 0.059637526980377456, + "language_loss": 0.81006908, + "learning_rate": 0.0003997445337591505, + "loss": 0.82086051, + "num_input_tokens_seen": 250432768, + "router_z_loss_mlp": 0.35839844, + "step": 3001, + "time_per_iteration": 2.637199878692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072855, + "balance_loss_mlp": 1.03756905, + "epoch": 0.5775298191612158, + "flos": 528216795648.0, + "grad_norm": 0.054057225734739034, + "language_loss": 0.73747128, + "learning_rate": 0.0003994393378982635, + "loss": 0.74819982, + "num_input_tokens_seen": 250501504, + "router_z_loss_mlp": 0.35327148, + "step": 3002, + "time_per_iteration": 2.605628490447998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042054, + "balance_loss_mlp": 1.03013277, + "epoch": 0.5777222008464794, + "flos": 1303178070528.0, + "grad_norm": 0.01828159888171313, + "language_loss": 0.79538, + "learning_rate": 0.00039913418107848786, + "loss": 0.80580056, + "num_input_tokens_seen": 250733632, + "router_z_loss_mlp": 0.11914062, + "step": 3003, + "time_per_iteration": 4.791952848434448 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072518, + "balance_loss_mlp": 1.03708899, + "epoch": 0.577914582531743, + "flos": 603344816640.0, + "grad_norm": 0.05129820562397971, + "language_loss": 0.88025165, + "learning_rate": 0.0003988290634182961, + "loss": 0.89097679, + "num_input_tokens_seen": 250809152, + "router_z_loss_mlp": 0.35449219, + "step": 3004, + "time_per_iteration": 2.7482082843780518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107598, + "balance_loss_mlp": 1.04162431, + "epoch": 0.5781069642170066, + "flos": 486537338880.0, + "grad_norm": 0.060845290060135546, + "language_loss": 0.80967325, + "learning_rate": 0.0003985239850361453, + "loss": 0.82043308, + "num_input_tokens_seen": 250879152, + "router_z_loss_mlp": 0.34399414, + "step": 3005, + "time_per_iteration": 2.577929735183716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074673, + "balance_loss_mlp": 1.03933978, + "epoch": 0.5782993459022701, + "flos": 506016626688.0, + "grad_norm": 0.06787324566679709, + "language_loss": 0.84799004, + "learning_rate": 0.0003982189460504777, + "loss": 0.85873681, + "num_input_tokens_seen": 250949904, + "router_z_loss_mlp": 0.35375977, + "step": 3006, + "time_per_iteration": 2.6993815898895264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077227, + "balance_loss_mlp": 1.04179859, + "epoch": 0.5784917275875336, + "flos": 601872380928.0, + "grad_norm": 0.06968716045875477, + "language_loss": 0.79860866, + "learning_rate": 0.00039791394657971935, + "loss": 0.80938095, + "num_input_tokens_seen": 251020976, + "router_z_loss_mlp": 0.35449219, + "step": 3007, + "time_per_iteration": 2.6929664611816406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070454, + "balance_loss_mlp": 1.03616893, + "epoch": 0.5786841092727972, + "flos": 521279428608.0, + "grad_norm": 0.07090711844515878, + "language_loss": 0.84396511, + "learning_rate": 0.00039760898674228205, + "loss": 0.85466969, + "num_input_tokens_seen": 251093280, + "router_z_loss_mlp": 0.34301758, + "step": 3008, + "time_per_iteration": 2.674983501434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074007, + "balance_loss_mlp": 1.03941262, + "epoch": 0.5788764909580608, + "flos": 767047809024.0, + "grad_norm": 0.04405411396785794, + "language_loss": 0.80589879, + "learning_rate": 0.0003973040666565613, + "loss": 0.81663889, + "num_input_tokens_seen": 251181376, + "router_z_loss_mlp": 0.34619141, + "step": 3009, + "time_per_iteration": 3.0445330142974854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068256, + "balance_loss_mlp": 1.03347063, + "epoch": 0.5790688726433244, + "flos": 598786324992.0, + "grad_norm": 0.0464228238066257, + "language_loss": 0.81778955, + "learning_rate": 0.000396999186440938, + "loss": 0.82847214, + "num_input_tokens_seen": 251256176, + "router_z_loss_mlp": 0.34814453, + "step": 3010, + "time_per_iteration": 2.837510585784912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107199, + "balance_loss_mlp": 1.03594089, + "epoch": 0.5792612543285879, + "flos": 522805708800.0, + "grad_norm": 0.06076952990047212, + "language_loss": 0.8482464, + "learning_rate": 0.000396694346213777, + "loss": 0.85896629, + "num_input_tokens_seen": 251325344, + "router_z_loss_mlp": 0.36083984, + "step": 3011, + "time_per_iteration": 2.630096197128296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071847, + "balance_loss_mlp": 1.03498721, + "epoch": 0.5794536360138515, + "flos": 876178805760.0, + "grad_norm": 0.045866643068031475, + "language_loss": 0.83350897, + "learning_rate": 0.0003963895460934276, + "loss": 0.84422737, + "num_input_tokens_seen": 251406656, + "router_z_loss_mlp": 0.3684082, + "step": 3012, + "time_per_iteration": 3.144862174987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107149, + "balance_loss_mlp": 1.03555989, + "epoch": 0.5796460176991151, + "flos": 401221541376.0, + "grad_norm": 0.0681769397078292, + "language_loss": 0.84421676, + "learning_rate": 0.00039608478619822376, + "loss": 0.85493165, + "num_input_tokens_seen": 251467760, + "router_z_loss_mlp": 0.35961914, + "step": 3013, + "time_per_iteration": 2.459653854370117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071668, + "balance_loss_mlp": 1.03545213, + "epoch": 0.5798383993843786, + "flos": 618229297152.0, + "grad_norm": 0.04312849012034037, + "language_loss": 0.82395273, + "learning_rate": 0.00039578006664648394, + "loss": 0.83466941, + "num_input_tokens_seen": 251542272, + "router_z_loss_mlp": 0.36206055, + "step": 3014, + "time_per_iteration": 2.759540557861328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068998, + "balance_loss_mlp": 1.0336163, + "epoch": 0.5800307810696421, + "flos": 843966950400.0, + "grad_norm": 0.05059644865737796, + "language_loss": 0.80954117, + "learning_rate": 0.0003954753875565105, + "loss": 0.82023108, + "num_input_tokens_seen": 251625584, + "router_z_loss_mlp": 0.35424805, + "step": 3015, + "time_per_iteration": 3.102818727493286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065155, + "balance_loss_mlp": 1.02970195, + "epoch": 0.5802231627549057, + "flos": 569005779456.0, + "grad_norm": 0.049284538826036076, + "language_loss": 0.82072717, + "learning_rate": 0.00039517074904659057, + "loss": 0.83137876, + "num_input_tokens_seen": 251696704, + "router_z_loss_mlp": 0.35498047, + "step": 3016, + "time_per_iteration": 2.6733109951019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074461, + "balance_loss_mlp": 1.03884125, + "epoch": 0.5804155444401693, + "flos": 660160447488.0, + "grad_norm": 0.0506827974734746, + "language_loss": 0.84573597, + "learning_rate": 0.00039486615123499535, + "loss": 0.8564806, + "num_input_tokens_seen": 251774784, + "router_z_loss_mlp": 0.35668945, + "step": 3017, + "time_per_iteration": 2.8088088035583496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071818, + "balance_loss_mlp": 1.0354352, + "epoch": 0.5806079261254329, + "flos": 513726603264.0, + "grad_norm": 0.053399367847764105, + "language_loss": 0.84808505, + "learning_rate": 0.00039456159423997996, + "loss": 0.85880327, + "num_input_tokens_seen": 251844768, + "router_z_loss_mlp": 0.36401367, + "step": 3018, + "time_per_iteration": 2.6254379749298096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074265, + "balance_loss_mlp": 1.03747678, + "epoch": 0.5808003078106965, + "flos": 528379739136.0, + "grad_norm": 0.059071353461068586, + "language_loss": 0.89337808, + "learning_rate": 0.00039425707817978406, + "loss": 0.90412068, + "num_input_tokens_seen": 251912736, + "router_z_loss_mlp": 0.36767578, + "step": 3019, + "time_per_iteration": 2.65867280960083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071634, + "balance_loss_mlp": 1.0357995, + "epoch": 0.58099268949596, + "flos": 476787520512.0, + "grad_norm": 0.06353889490099716, + "language_loss": 0.83356857, + "learning_rate": 0.00039395260317263124, + "loss": 0.84428501, + "num_input_tokens_seen": 251979328, + "router_z_loss_mlp": 0.35839844, + "step": 3020, + "time_per_iteration": 2.554124116897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074167, + "balance_loss_mlp": 1.03666329, + "epoch": 0.5811850711812235, + "flos": 517340777472.0, + "grad_norm": 0.05166922362438639, + "language_loss": 0.84975517, + "learning_rate": 0.0003936481693367291, + "loss": 0.86049688, + "num_input_tokens_seen": 252050928, + "router_z_loss_mlp": 0.37475586, + "step": 3021, + "time_per_iteration": 2.6460227966308594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075928, + "balance_loss_mlp": 1.03976023, + "epoch": 0.5813774528664871, + "flos": 616122464256.0, + "grad_norm": 0.06649500378390247, + "language_loss": 0.876212, + "learning_rate": 0.0003933437767902697, + "loss": 0.88697129, + "num_input_tokens_seen": 252126496, + "router_z_loss_mlp": 0.36206055, + "step": 3022, + "time_per_iteration": 2.8114941120147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071818, + "balance_loss_mlp": 1.03588879, + "epoch": 0.5815698345517507, + "flos": 567194310144.0, + "grad_norm": 0.06503214921944889, + "language_loss": 0.78287327, + "learning_rate": 0.00039303942565142825, + "loss": 0.7935915, + "num_input_tokens_seen": 252203008, + "router_z_loss_mlp": 0.35961914, + "step": 3023, + "time_per_iteration": 2.7259762287139893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071639, + "balance_loss_mlp": 1.03563786, + "epoch": 0.5817622162370142, + "flos": 562886102016.0, + "grad_norm": 0.05350887168996553, + "language_loss": 0.76429439, + "learning_rate": 0.0003927351160383644, + "loss": 0.77501082, + "num_input_tokens_seen": 252283440, + "router_z_loss_mlp": 0.36035156, + "step": 3024, + "time_per_iteration": 2.8155934810638428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071996, + "balance_loss_mlp": 1.03730595, + "epoch": 0.5819545979222778, + "flos": 458982899712.0, + "grad_norm": 0.05396860990467202, + "language_loss": 0.77624023, + "learning_rate": 0.000392430848069222, + "loss": 0.78696012, + "num_input_tokens_seen": 252351760, + "router_z_loss_mlp": 0.34741211, + "step": 3025, + "time_per_iteration": 2.5123956203460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069758, + "balance_loss_mlp": 1.03387606, + "epoch": 0.5821469796075414, + "flos": 541215613440.0, + "grad_norm": 0.05894861582094883, + "language_loss": 0.82395303, + "learning_rate": 0.00039212662186212795, + "loss": 0.83465064, + "num_input_tokens_seen": 252418480, + "router_z_loss_mlp": 0.35913086, + "step": 3026, + "time_per_iteration": 2.6423861980438232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075924, + "balance_loss_mlp": 1.03930306, + "epoch": 0.582339361292805, + "flos": 551994117120.0, + "grad_norm": 0.060293393109458415, + "language_loss": 0.77264106, + "learning_rate": 0.0003918224375351934, + "loss": 0.7834003, + "num_input_tokens_seen": 252493712, + "router_z_loss_mlp": 0.36621094, + "step": 3027, + "time_per_iteration": 2.691378593444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075199, + "balance_loss_mlp": 1.04029393, + "epoch": 0.5825317429780685, + "flos": 496138770432.0, + "grad_norm": 0.05191318265313257, + "language_loss": 0.78248543, + "learning_rate": 0.0003915182952065135, + "loss": 0.79323745, + "num_input_tokens_seen": 252566096, + "router_z_loss_mlp": 0.34936523, + "step": 3028, + "time_per_iteration": 2.718275308609009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073019, + "balance_loss_mlp": 1.03732777, + "epoch": 0.582724124663332, + "flos": 563890056192.0, + "grad_norm": 0.0482119369127772, + "language_loss": 0.87499475, + "learning_rate": 0.0003912141949941664, + "loss": 0.8857249, + "num_input_tokens_seen": 252639424, + "router_z_loss_mlp": 0.35766602, + "step": 3029, + "time_per_iteration": 2.6762070655822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075023, + "balance_loss_mlp": 1.03852117, + "epoch": 0.5829165063485956, + "flos": 491888788992.0, + "grad_norm": 0.06336756881053687, + "language_loss": 0.82355005, + "learning_rate": 0.0003909101370162143, + "loss": 0.83430028, + "num_input_tokens_seen": 252706672, + "router_z_loss_mlp": 0.36499023, + "step": 3030, + "time_per_iteration": 2.6055908203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035193, + "balance_loss_mlp": 1.02432156, + "epoch": 0.5831088880338592, + "flos": 1528134501888.0, + "grad_norm": 0.025423566517204055, + "language_loss": 0.72433889, + "learning_rate": 0.00039060612139070326, + "loss": 0.7346909, + "num_input_tokens_seen": 252932464, + "router_z_loss_mlp": 0.10888672, + "step": 3031, + "time_per_iteration": 4.88014817237854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071232, + "balance_loss_mlp": 1.03558815, + "epoch": 0.5833012697191228, + "flos": 617712763392.0, + "grad_norm": 0.04799878735573131, + "language_loss": 0.82774729, + "learning_rate": 0.0003903021482356622, + "loss": 0.83845961, + "num_input_tokens_seen": 253011920, + "router_z_loss_mlp": 0.35693359, + "step": 3032, + "time_per_iteration": 2.7778074741363525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071187, + "balance_loss_mlp": 1.03542447, + "epoch": 0.5834936514043862, + "flos": 767578899456.0, + "grad_norm": 0.04830091888101656, + "language_loss": 0.82788891, + "learning_rate": 0.00038999821766910465, + "loss": 0.83860075, + "num_input_tokens_seen": 253091552, + "router_z_loss_mlp": 0.35791016, + "step": 3033, + "time_per_iteration": 2.9640953540802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070566, + "balance_loss_mlp": 1.03496981, + "epoch": 0.5836860330896498, + "flos": 458136096768.0, + "grad_norm": 0.045708981442043065, + "language_loss": 0.85570675, + "learning_rate": 0.00038969432980902606, + "loss": 0.8664124, + "num_input_tokens_seen": 253158608, + "router_z_loss_mlp": 0.35620117, + "step": 3034, + "time_per_iteration": 2.520869255065918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01029447, + "balance_loss_mlp": 1.01819336, + "epoch": 0.5838784147749134, + "flos": 1360485504000.0, + "grad_norm": 0.023110513117977256, + "language_loss": 0.79784501, + "learning_rate": 0.0003893904847734068, + "loss": 0.80813944, + "num_input_tokens_seen": 253381184, + "router_z_loss_mlp": 0.11230469, + "step": 3035, + "time_per_iteration": 4.791047811508179 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076539, + "balance_loss_mlp": 1.04125297, + "epoch": 0.584070796460177, + "flos": 566942616576.0, + "grad_norm": 0.048603623386797364, + "language_loss": 0.82340151, + "learning_rate": 0.00038908668268020953, + "loss": 0.83416688, + "num_input_tokens_seen": 253452880, + "router_z_loss_mlp": 0.35302734, + "step": 3036, + "time_per_iteration": 2.6480767726898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073606, + "balance_loss_mlp": 1.03781927, + "epoch": 0.5842631781454406, + "flos": 611188623360.0, + "grad_norm": 0.04937423588772942, + "language_loss": 0.84850454, + "learning_rate": 0.00038878292364738097, + "loss": 0.85924065, + "num_input_tokens_seen": 253530000, + "router_z_loss_mlp": 0.3581543, + "step": 3037, + "time_per_iteration": 2.7739527225494385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070708, + "balance_loss_mlp": 1.03418183, + "epoch": 0.5844555598307041, + "flos": 463148513280.0, + "grad_norm": 0.05602443207387838, + "language_loss": 0.86980963, + "learning_rate": 0.0003884792077928508, + "loss": 0.88051671, + "num_input_tokens_seen": 253593504, + "router_z_loss_mlp": 0.36523438, + "step": 3038, + "time_per_iteration": 2.488044500350952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076481, + "balance_loss_mlp": 1.04083705, + "epoch": 0.5846479415159677, + "flos": 410005283328.0, + "grad_norm": 0.06107663121836191, + "language_loss": 0.76691568, + "learning_rate": 0.0003881755352345322, + "loss": 0.77768052, + "num_input_tokens_seen": 253657904, + "router_z_loss_mlp": 0.35644531, + "step": 3039, + "time_per_iteration": 2.4996848106384277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076016, + "balance_loss_mlp": 1.03944278, + "epoch": 0.5848403232012312, + "flos": 491056542720.0, + "grad_norm": 0.04475599589029588, + "language_loss": 0.86940634, + "learning_rate": 0.0003878719060903207, + "loss": 0.88016653, + "num_input_tokens_seen": 253725280, + "router_z_loss_mlp": 0.36572266, + "step": 3040, + "time_per_iteration": 2.5631661415100098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107371, + "balance_loss_mlp": 1.03823376, + "epoch": 0.5850327048864948, + "flos": 584146335744.0, + "grad_norm": 0.06623374989281658, + "language_loss": 0.82883763, + "learning_rate": 0.0003875683204780961, + "loss": 0.83957475, + "num_input_tokens_seen": 253795040, + "router_z_loss_mlp": 0.35522461, + "step": 3041, + "time_per_iteration": 2.7194101810455322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074225, + "balance_loss_mlp": 1.03765166, + "epoch": 0.5852250865717584, + "flos": 651253049856.0, + "grad_norm": 0.05546398592496706, + "language_loss": 0.84983653, + "learning_rate": 0.00038726477851572043, + "loss": 0.86057878, + "num_input_tokens_seen": 253866384, + "router_z_loss_mlp": 0.36572266, + "step": 3042, + "time_per_iteration": 2.809687376022339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072999, + "balance_loss_mlp": 1.03659296, + "epoch": 0.5854174682570219, + "flos": 534332090880.0, + "grad_norm": 0.07237686853447298, + "language_loss": 0.80418718, + "learning_rate": 0.0003869612803210395, + "loss": 0.81491715, + "num_input_tokens_seen": 253935712, + "router_z_loss_mlp": 0.36401367, + "step": 3043, + "time_per_iteration": 2.6141133308410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074965, + "balance_loss_mlp": 1.03872585, + "epoch": 0.5856098499422855, + "flos": 509501352960.0, + "grad_norm": 0.08321780378599658, + "language_loss": 0.83029413, + "learning_rate": 0.0003866578260118817, + "loss": 0.84104383, + "num_input_tokens_seen": 254003152, + "router_z_loss_mlp": 0.36254883, + "step": 3044, + "time_per_iteration": 2.5739400386810303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070627, + "balance_loss_mlp": 1.03438699, + "epoch": 0.5858022316275491, + "flos": 593619729408.0, + "grad_norm": 0.061750802810204855, + "language_loss": 0.83199847, + "learning_rate": 0.0003863544157060581, + "loss": 0.84270471, + "num_input_tokens_seen": 254072816, + "router_z_loss_mlp": 0.36254883, + "step": 3045, + "time_per_iteration": 2.662442207336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077309, + "balance_loss_mlp": 1.04083109, + "epoch": 0.5859946133128127, + "flos": 558829587456.0, + "grad_norm": 0.0566139046566934, + "language_loss": 0.82210046, + "learning_rate": 0.0003860510495213634, + "loss": 0.83287358, + "num_input_tokens_seen": 254152800, + "router_z_loss_mlp": 0.36499023, + "step": 3046, + "time_per_iteration": 2.817676305770874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086311, + "balance_loss_mlp": 1.04885542, + "epoch": 0.5861869949980761, + "flos": 553431647232.0, + "grad_norm": 0.06969052760403557, + "language_loss": 0.77781415, + "learning_rate": 0.0003857477275755746, + "loss": 0.78867728, + "num_input_tokens_seen": 254224384, + "router_z_loss_mlp": 0.37451172, + "step": 3047, + "time_per_iteration": 2.645547389984131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076852, + "balance_loss_mlp": 1.03994477, + "epoch": 0.5863793766833397, + "flos": 718321886208.0, + "grad_norm": 0.060152245737565335, + "language_loss": 0.83672923, + "learning_rate": 0.00038544444998645167, + "loss": 0.84749776, + "num_input_tokens_seen": 254310960, + "router_z_loss_mlp": 0.36914062, + "step": 3048, + "time_per_iteration": 2.995572090148926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080654, + "balance_loss_mlp": 1.04410434, + "epoch": 0.5865717583686033, + "flos": 472041354240.0, + "grad_norm": 0.05877541838315078, + "language_loss": 0.81869525, + "learning_rate": 0.00038514121687173767, + "loss": 0.82950181, + "num_input_tokens_seen": 254378336, + "router_z_loss_mlp": 0.36572266, + "step": 3049, + "time_per_iteration": 2.5653092861175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085484, + "balance_loss_mlp": 1.04819572, + "epoch": 0.5867641400538669, + "flos": 813143162880.0, + "grad_norm": 0.060327128014073625, + "language_loss": 0.82117838, + "learning_rate": 0.00038483802834915807, + "loss": 0.83203322, + "num_input_tokens_seen": 254454352, + "router_z_loss_mlp": 0.37280273, + "step": 3050, + "time_per_iteration": 2.9661922454833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074132, + "balance_loss_mlp": 1.03755879, + "epoch": 0.5869565217391305, + "flos": 486285645312.0, + "grad_norm": 0.05442603126978945, + "language_loss": 0.78767669, + "learning_rate": 0.00038453488453642074, + "loss": 0.79841799, + "num_input_tokens_seen": 254526352, + "router_z_loss_mlp": 0.36547852, + "step": 3051, + "time_per_iteration": 2.6421701908111572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076937, + "balance_loss_mlp": 1.0401963, + "epoch": 0.587148903424394, + "flos": 569104704000.0, + "grad_norm": 0.050403805084847125, + "language_loss": 0.86714828, + "learning_rate": 0.00038423178555121697, + "loss": 0.87791765, + "num_input_tokens_seen": 254598720, + "router_z_loss_mlp": 0.36743164, + "step": 3052, + "time_per_iteration": 2.689039945602417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079498, + "balance_loss_mlp": 1.04239988, + "epoch": 0.5873412851096576, + "flos": 746948680704.0, + "grad_norm": 0.04537735372020953, + "language_loss": 0.85335124, + "learning_rate": 0.00038392873151121994, + "loss": 0.86414617, + "num_input_tokens_seen": 254683664, + "router_z_loss_mlp": 0.37084961, + "step": 3053, + "time_per_iteration": 3.0252749919891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071586, + "balance_loss_mlp": 1.03510821, + "epoch": 0.5875336667949211, + "flos": 527882144256.0, + "grad_norm": 0.0531573443466337, + "language_loss": 0.82837141, + "learning_rate": 0.0003836257225340859, + "loss": 0.83908725, + "num_input_tokens_seen": 254754688, + "router_z_loss_mlp": 0.36474609, + "step": 3054, + "time_per_iteration": 2.6028475761413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074191, + "balance_loss_mlp": 1.03728426, + "epoch": 0.5877260484801847, + "flos": 823799420928.0, + "grad_norm": 0.057535155706969474, + "language_loss": 0.81870168, + "learning_rate": 0.00038332275873745336, + "loss": 0.82944363, + "num_input_tokens_seen": 254838976, + "router_z_loss_mlp": 0.36889648, + "step": 3055, + "time_per_iteration": 3.1007511615753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074496, + "balance_loss_mlp": 1.03682637, + "epoch": 0.5879184301654482, + "flos": 591325221888.0, + "grad_norm": 0.0460079349498171, + "language_loss": 0.82943761, + "learning_rate": 0.0003830198402389431, + "loss": 0.84018254, + "num_input_tokens_seen": 254912912, + "router_z_loss_mlp": 0.37646484, + "step": 3056, + "time_per_iteration": 2.6919126510620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010412, + "balance_loss_mlp": 1.02975643, + "epoch": 0.5881108118507118, + "flos": 1544953955328.0, + "grad_norm": 0.021887470100806234, + "language_loss": 0.77348936, + "learning_rate": 0.0003827169671561585, + "loss": 0.78390133, + "num_input_tokens_seen": 255151488, + "router_z_loss_mlp": 0.11425781, + "step": 3057, + "time_per_iteration": 4.971444368362427 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072291, + "balance_loss_mlp": 1.03576517, + "epoch": 0.5883031935359754, + "flos": 489348380160.0, + "grad_norm": 0.055950804718103285, + "language_loss": 0.82692897, + "learning_rate": 0.0003824141396066855, + "loss": 0.83765185, + "num_input_tokens_seen": 255218896, + "router_z_loss_mlp": 0.36572266, + "step": 3058, + "time_per_iteration": 2.5848963260650635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074876, + "balance_loss_mlp": 1.03842139, + "epoch": 0.588495575221239, + "flos": 582551654400.0, + "grad_norm": 0.05305150563857962, + "language_loss": 0.82647693, + "learning_rate": 0.000382111357708092, + "loss": 0.83722568, + "num_input_tokens_seen": 255287408, + "router_z_loss_mlp": 0.36499023, + "step": 3059, + "time_per_iteration": 2.750030279159546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071916, + "balance_loss_mlp": 1.03558111, + "epoch": 0.5886879569065026, + "flos": 660751174656.0, + "grad_norm": 0.05165433097502605, + "language_loss": 0.83451211, + "learning_rate": 0.00038180862157792864, + "loss": 0.84523129, + "num_input_tokens_seen": 255358432, + "router_z_loss_mlp": 0.36303711, + "step": 3060, + "time_per_iteration": 2.7654812335968018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070431, + "balance_loss_mlp": 1.03414369, + "epoch": 0.588880338591766, + "flos": 562392889344.0, + "grad_norm": 0.05703427459216956, + "language_loss": 0.82004499, + "learning_rate": 0.0003815059313337279, + "loss": 0.83074933, + "num_input_tokens_seen": 255425744, + "router_z_loss_mlp": 0.36279297, + "step": 3061, + "time_per_iteration": 2.659722089767456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072147, + "balance_loss_mlp": 1.03585935, + "epoch": 0.5890727202770296, + "flos": 554451568128.0, + "grad_norm": 0.04901881896382658, + "language_loss": 0.77886307, + "learning_rate": 0.00038120328709300436, + "loss": 0.78958452, + "num_input_tokens_seen": 255505808, + "router_z_loss_mlp": 0.36279297, + "step": 3062, + "time_per_iteration": 2.8264663219451904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076904, + "balance_loss_mlp": 1.04114151, + "epoch": 0.5892651019622932, + "flos": 655226606592.0, + "grad_norm": 0.057794453116502664, + "language_loss": 0.83449113, + "learning_rate": 0.0003809006889732549, + "loss": 0.84526014, + "num_input_tokens_seen": 255580160, + "router_z_loss_mlp": 0.35766602, + "step": 3063, + "time_per_iteration": 2.780714511871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073572, + "balance_loss_mlp": 1.03680801, + "epoch": 0.5894574836475568, + "flos": 452970911232.0, + "grad_norm": 0.048397381644471126, + "language_loss": 0.87604314, + "learning_rate": 0.0003805981370919589, + "loss": 0.88677883, + "num_input_tokens_seen": 255644016, + "router_z_loss_mlp": 0.36743164, + "step": 3064, + "time_per_iteration": 2.497511386871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077136, + "balance_loss_mlp": 1.03965652, + "epoch": 0.5896498653328203, + "flos": 518763750912.0, + "grad_norm": 0.05535483461806511, + "language_loss": 0.83910584, + "learning_rate": 0.0003802956315665771, + "loss": 0.84987724, + "num_input_tokens_seen": 255718192, + "router_z_loss_mlp": 0.37475586, + "step": 3065, + "time_per_iteration": 2.6540539264678955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075706, + "balance_loss_mlp": 1.03965688, + "epoch": 0.5898422470180839, + "flos": 548793169920.0, + "grad_norm": 0.06978967624296899, + "language_loss": 0.81621277, + "learning_rate": 0.0003799931725145529, + "loss": 0.82696986, + "num_input_tokens_seen": 255787696, + "router_z_loss_mlp": 0.3605957, + "step": 3066, + "time_per_iteration": 2.5999929904937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075756, + "balance_loss_mlp": 1.04015982, + "epoch": 0.5900346287033474, + "flos": 524046799872.0, + "grad_norm": 0.06178961053063138, + "language_loss": 0.85556895, + "learning_rate": 0.00037969076005331083, + "loss": 0.86632651, + "num_input_tokens_seen": 255862992, + "router_z_loss_mlp": 0.35571289, + "step": 3067, + "time_per_iteration": 2.7505955696105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080728, + "balance_loss_mlp": 1.04372525, + "epoch": 0.590227010388611, + "flos": 566893154304.0, + "grad_norm": 0.059517883137225745, + "language_loss": 0.88041914, + "learning_rate": 0.00037938839430025817, + "loss": 0.89122641, + "num_input_tokens_seen": 255931872, + "router_z_loss_mlp": 0.36962891, + "step": 3068, + "time_per_iteration": 2.6254634857177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072667, + "balance_loss_mlp": 1.03714228, + "epoch": 0.5904193920738746, + "flos": 583053631488.0, + "grad_norm": 0.05094647187222568, + "language_loss": 0.85285151, + "learning_rate": 0.0003790860753727835, + "loss": 0.8635782, + "num_input_tokens_seen": 256004656, + "router_z_loss_mlp": 0.35546875, + "step": 3069, + "time_per_iteration": 2.790996551513672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076132, + "balance_loss_mlp": 1.04056025, + "epoch": 0.5906117737591381, + "flos": 529428773376.0, + "grad_norm": 0.06487433034023032, + "language_loss": 0.82915914, + "learning_rate": 0.00037878380338825766, + "loss": 0.83992046, + "num_input_tokens_seen": 256076944, + "router_z_loss_mlp": 0.35644531, + "step": 3070, + "time_per_iteration": 2.6697611808776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078223, + "balance_loss_mlp": 1.04276967, + "epoch": 0.5908041554444017, + "flos": 683908655616.0, + "grad_norm": 0.053205750192721994, + "language_loss": 0.81560326, + "learning_rate": 0.00037848157846403287, + "loss": 0.8263855, + "num_input_tokens_seen": 256154768, + "router_z_loss_mlp": 0.35473633, + "step": 3071, + "time_per_iteration": 2.92523193359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077534, + "balance_loss_mlp": 1.04246306, + "epoch": 0.5909965371296653, + "flos": 549719958528.0, + "grad_norm": 0.04683417834560967, + "language_loss": 0.83405554, + "learning_rate": 0.0003781794007174435, + "loss": 0.84483093, + "num_input_tokens_seen": 256230896, + "router_z_loss_mlp": 0.35107422, + "step": 3072, + "time_per_iteration": 2.7881455421447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01022638, + "balance_loss_mlp": 1.01200461, + "epoch": 0.5911889188149289, + "flos": 1491544475136.0, + "grad_norm": 0.008695883247199268, + "language_loss": 0.74074531, + "learning_rate": 0.0003778772702658051, + "loss": 0.75097167, + "num_input_tokens_seen": 256462336, + "router_z_loss_mlp": 0.10644531, + "step": 3073, + "time_per_iteration": 4.864701509475708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078671, + "balance_loss_mlp": 1.04293227, + "epoch": 0.5913813005001923, + "flos": 487630043136.0, + "grad_norm": 0.053099165858615995, + "language_loss": 0.80592149, + "learning_rate": 0.0003775751872264152, + "loss": 0.81670815, + "num_input_tokens_seen": 256539376, + "router_z_loss_mlp": 0.35766602, + "step": 3074, + "time_per_iteration": 2.7932956218719482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079498, + "balance_loss_mlp": 1.04409289, + "epoch": 0.5915736821854559, + "flos": 573034590720.0, + "grad_norm": 0.04575078918426429, + "language_loss": 0.86981148, + "learning_rate": 0.0003772731517165527, + "loss": 0.88060653, + "num_input_tokens_seen": 256617728, + "router_z_loss_mlp": 0.35449219, + "step": 3075, + "time_per_iteration": 2.7613656520843506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075792, + "balance_loss_mlp": 1.04060149, + "epoch": 0.5917660638707195, + "flos": 789183959040.0, + "grad_norm": 0.06797753963070947, + "language_loss": 0.84194851, + "learning_rate": 0.0003769711638534784, + "loss": 0.85270643, + "num_input_tokens_seen": 256696032, + "router_z_loss_mlp": 0.35205078, + "step": 3076, + "time_per_iteration": 2.991854190826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076527, + "balance_loss_mlp": 1.04181361, + "epoch": 0.5919584455559831, + "flos": 528487428096.0, + "grad_norm": 0.06227325112589354, + "language_loss": 0.78677326, + "learning_rate": 0.00037666922375443446, + "loss": 0.79753852, + "num_input_tokens_seen": 256767360, + "router_z_loss_mlp": 0.34765625, + "step": 3077, + "time_per_iteration": 2.591597557067871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072268, + "balance_loss_mlp": 1.03757811, + "epoch": 0.5921508272412467, + "flos": 560320962048.0, + "grad_norm": 0.056716138151229355, + "language_loss": 0.81505013, + "learning_rate": 0.00037636733153664396, + "loss": 0.82577276, + "num_input_tokens_seen": 256844848, + "router_z_loss_mlp": 0.34716797, + "step": 3078, + "time_per_iteration": 2.854278802871704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075513, + "balance_loss_mlp": 1.04144311, + "epoch": 0.5923432089265102, + "flos": 563008347648.0, + "grad_norm": 0.061835614307010005, + "language_loss": 0.79824865, + "learning_rate": 0.0003760654873173124, + "loss": 0.80900383, + "num_input_tokens_seen": 256916688, + "router_z_loss_mlp": 0.34082031, + "step": 3079, + "time_per_iteration": 2.66091251373291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079327, + "balance_loss_mlp": 1.04387426, + "epoch": 0.5925355906117737, + "flos": 495488406528.0, + "grad_norm": 0.052514491856325576, + "language_loss": 0.81763887, + "learning_rate": 0.00037576369121362566, + "loss": 0.8284322, + "num_input_tokens_seen": 256985520, + "router_z_loss_mlp": 0.35498047, + "step": 3080, + "time_per_iteration": 2.5847787857055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079309, + "balance_loss_mlp": 1.04473865, + "epoch": 0.5927279722970373, + "flos": 565940072448.0, + "grad_norm": 0.05276703199883553, + "language_loss": 0.81885982, + "learning_rate": 0.0003754619433427516, + "loss": 0.82965291, + "num_input_tokens_seen": 257067552, + "router_z_loss_mlp": 0.34570312, + "step": 3081, + "time_per_iteration": 2.898594856262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108187, + "balance_loss_mlp": 1.04682267, + "epoch": 0.5929203539823009, + "flos": 666674413056.0, + "grad_norm": 0.06717854488830324, + "language_loss": 0.77682364, + "learning_rate": 0.0003751602438218392, + "loss": 0.78764236, + "num_input_tokens_seen": 257138896, + "router_z_loss_mlp": 0.35083008, + "step": 3082, + "time_per_iteration": 2.7553367614746094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083555, + "balance_loss_mlp": 1.0486505, + "epoch": 0.5931127356675644, + "flos": 555484635648.0, + "grad_norm": 0.05625551140275949, + "language_loss": 0.83254004, + "learning_rate": 0.0003748585927680186, + "loss": 0.84337556, + "num_input_tokens_seen": 257210592, + "router_z_loss_mlp": 0.34912109, + "step": 3083, + "time_per_iteration": 2.6493966579437256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089532, + "balance_loss_mlp": 1.0530777, + "epoch": 0.593305117352828, + "flos": 534932992512.0, + "grad_norm": 0.07512877248395429, + "language_loss": 0.82828176, + "learning_rate": 0.00037455699029840086, + "loss": 0.83917707, + "num_input_tokens_seen": 257276208, + "router_z_loss_mlp": 0.36450195, + "step": 3084, + "time_per_iteration": 2.674532890319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079674, + "balance_loss_mlp": 1.04488921, + "epoch": 0.5934974990380916, + "flos": 593683748352.0, + "grad_norm": 0.05984158390569505, + "language_loss": 0.84177965, + "learning_rate": 0.0003742554365300787, + "loss": 0.85257638, + "num_input_tokens_seen": 257351920, + "router_z_loss_mlp": 0.34838867, + "step": 3085, + "time_per_iteration": 2.712371587753296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085917, + "balance_loss_mlp": 1.05044067, + "epoch": 0.5936898807233552, + "flos": 712339011072.0, + "grad_norm": 0.05068184961629974, + "language_loss": 0.78978491, + "learning_rate": 0.0003739539315801255, + "loss": 0.80064404, + "num_input_tokens_seen": 257430016, + "router_z_loss_mlp": 0.35473633, + "step": 3086, + "time_per_iteration": 2.916006565093994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088952, + "balance_loss_mlp": 1.05345142, + "epoch": 0.5938822624086187, + "flos": 391684128768.0, + "grad_norm": 0.05263578767135529, + "language_loss": 0.9165324, + "learning_rate": 0.000373652475565596, + "loss": 0.92742193, + "num_input_tokens_seen": 257492224, + "router_z_loss_mlp": 0.35522461, + "step": 3087, + "time_per_iteration": 2.470960855484009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094024, + "balance_loss_mlp": 1.05900025, + "epoch": 0.5940746440938822, + "flos": 480023373312.0, + "grad_norm": 0.060850763929597464, + "language_loss": 0.81550741, + "learning_rate": 0.00037335106860352587, + "loss": 0.82644761, + "num_input_tokens_seen": 257567824, + "router_z_loss_mlp": 0.35083008, + "step": 3088, + "time_per_iteration": 2.666472911834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097243, + "balance_loss_mlp": 1.06100357, + "epoch": 0.5942670257791458, + "flos": 483094872576.0, + "grad_norm": 0.049324641114684424, + "language_loss": 0.83196813, + "learning_rate": 0.00037304971081093146, + "loss": 0.84294057, + "num_input_tokens_seen": 257635488, + "router_z_loss_mlp": 0.36230469, + "step": 3089, + "time_per_iteration": 2.521000862121582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093812, + "balance_loss_mlp": 1.05967069, + "epoch": 0.5944594074644094, + "flos": 547656795648.0, + "grad_norm": 0.0533670066305608, + "language_loss": 0.81061506, + "learning_rate": 0.00037274840230481024, + "loss": 0.82155317, + "num_input_tokens_seen": 257709552, + "router_z_loss_mlp": 0.34179688, + "step": 3090, + "time_per_iteration": 2.7134556770324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092625, + "balance_loss_mlp": 1.05700517, + "epoch": 0.594651789149673, + "flos": 448943510016.0, + "grad_norm": 0.055393993008082114, + "language_loss": 0.78753984, + "learning_rate": 0.00037244714320214077, + "loss": 0.79846609, + "num_input_tokens_seen": 257775520, + "router_z_loss_mlp": 0.35620117, + "step": 3091, + "time_per_iteration": 2.5576789379119873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092048, + "balance_loss_mlp": 1.05640459, + "epoch": 0.5948441708349365, + "flos": 595969491456.0, + "grad_norm": 0.050698130573270175, + "language_loss": 0.83444929, + "learning_rate": 0.000372145933619882, + "loss": 0.84536982, + "num_input_tokens_seen": 257858560, + "router_z_loss_mlp": 0.35668945, + "step": 3092, + "time_per_iteration": 2.8742141723632812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091606, + "balance_loss_mlp": 1.05636811, + "epoch": 0.5950365525202, + "flos": 548251905024.0, + "grad_norm": 0.05419961551348069, + "language_loss": 0.82168603, + "learning_rate": 0.000371844773674974, + "loss": 0.83260214, + "num_input_tokens_seen": 257928048, + "router_z_loss_mlp": 0.3527832, + "step": 3093, + "time_per_iteration": 2.6228530406951904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094358, + "balance_loss_mlp": 1.05890489, + "epoch": 0.5952289342054636, + "flos": 654385595904.0, + "grad_norm": 0.05844341434318606, + "language_loss": 0.81673229, + "learning_rate": 0.0003715436634843375, + "loss": 0.82767594, + "num_input_tokens_seen": 258003088, + "router_z_loss_mlp": 0.35498047, + "step": 3094, + "time_per_iteration": 2.8496577739715576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084873, + "balance_loss_mlp": 1.04951525, + "epoch": 0.5954213158907272, + "flos": 603055245312.0, + "grad_norm": 0.0455107572696148, + "language_loss": 0.80728281, + "learning_rate": 0.00037124260316487355, + "loss": 0.81813157, + "num_input_tokens_seen": 258084880, + "router_z_loss_mlp": 0.35375977, + "step": 3095, + "time_per_iteration": 2.83181095123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084995, + "balance_loss_mlp": 1.05044806, + "epoch": 0.5956136975759908, + "flos": 486097970688.0, + "grad_norm": 0.0493360128544523, + "language_loss": 0.89028478, + "learning_rate": 0.0003709415928334643, + "loss": 0.90113473, + "num_input_tokens_seen": 258152032, + "router_z_loss_mlp": 0.34570312, + "step": 3096, + "time_per_iteration": 2.5334527492523193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081969, + "balance_loss_mlp": 1.0465641, + "epoch": 0.5958060792612543, + "flos": 658462459392.0, + "grad_norm": 0.05334894182240255, + "language_loss": 0.80644953, + "learning_rate": 0.00037064063260697233, + "loss": 0.81726921, + "num_input_tokens_seen": 258228896, + "router_z_loss_mlp": 0.35424805, + "step": 3097, + "time_per_iteration": 2.868948221206665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085643, + "balance_loss_mlp": 1.05004668, + "epoch": 0.5959984609465179, + "flos": 723201882624.0, + "grad_norm": 0.05441892470065276, + "language_loss": 0.78413296, + "learning_rate": 0.0003703397226022407, + "loss": 0.79498935, + "num_input_tokens_seen": 258311152, + "router_z_loss_mlp": 0.35595703, + "step": 3098, + "time_per_iteration": 3.0486435890197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054277, + "balance_loss_mlp": 1.04254675, + "epoch": 0.5961908426317815, + "flos": 1519010164224.0, + "grad_norm": 0.031936086773479797, + "language_loss": 0.75499874, + "learning_rate": 0.00037003886293609335, + "loss": 0.76554149, + "num_input_tokens_seen": 258540656, + "router_z_loss_mlp": 0.1171875, + "step": 3099, + "time_per_iteration": 4.9141762256622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082532, + "balance_loss_mlp": 1.04822397, + "epoch": 0.596383224317045, + "flos": 532357678080.0, + "grad_norm": 0.04537931846822051, + "language_loss": 0.83096731, + "learning_rate": 0.0003697380537253339, + "loss": 0.84179258, + "num_input_tokens_seen": 258608960, + "router_z_loss_mlp": 0.34350586, + "step": 3100, + "time_per_iteration": 2.6156232357025146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082884, + "balance_loss_mlp": 1.04766929, + "epoch": 0.5965756060023086, + "flos": 590922169344.0, + "grad_norm": 0.060003355935897486, + "language_loss": 0.81679451, + "learning_rate": 0.0003694372950867471, + "loss": 0.82762337, + "num_input_tokens_seen": 258684304, + "router_z_loss_mlp": 0.3527832, + "step": 3101, + "time_per_iteration": 2.746100902557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084672, + "balance_loss_mlp": 1.04967189, + "epoch": 0.5967679876875721, + "flos": 861701760000.0, + "grad_norm": 0.05796500812003716, + "language_loss": 0.77373374, + "learning_rate": 0.0003691365871370976, + "loss": 0.78458047, + "num_input_tokens_seen": 258769472, + "router_z_loss_mlp": 0.3503418, + "step": 3102, + "time_per_iteration": 3.0448250770568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082291, + "balance_loss_mlp": 1.04710054, + "epoch": 0.5969603693728357, + "flos": 553574241792.0, + "grad_norm": 0.05791620467430745, + "language_loss": 0.854276, + "learning_rate": 0.00036883592999313093, + "loss": 0.86509889, + "num_input_tokens_seen": 258841696, + "router_z_loss_mlp": 0.35229492, + "step": 3103, + "time_per_iteration": 2.650810718536377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082187, + "balance_loss_mlp": 1.04666269, + "epoch": 0.5971527510580993, + "flos": 718345207296.0, + "grad_norm": 0.05277795957282848, + "language_loss": 0.79037023, + "learning_rate": 0.0003685353237715722, + "loss": 0.80119205, + "num_input_tokens_seen": 258915616, + "router_z_loss_mlp": 0.35546875, + "step": 3104, + "time_per_iteration": 2.87162184715271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082645, + "balance_loss_mlp": 1.04812241, + "epoch": 0.5973451327433629, + "flos": 647324573184.0, + "grad_norm": 0.05039525348103138, + "language_loss": 0.81437027, + "learning_rate": 0.0003682347685891274, + "loss": 0.82519674, + "num_input_tokens_seen": 258994080, + "router_z_loss_mlp": 0.34570312, + "step": 3105, + "time_per_iteration": 2.844632863998413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078322, + "balance_loss_mlp": 1.04284513, + "epoch": 0.5975375144286263, + "flos": 721374446592.0, + "grad_norm": 0.053848168408106474, + "language_loss": 0.80436707, + "learning_rate": 0.0003679342645624822, + "loss": 0.81515038, + "num_input_tokens_seen": 259075968, + "router_z_loss_mlp": 0.35498047, + "step": 3106, + "time_per_iteration": 2.961121082305908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079962, + "balance_loss_mlp": 1.04374671, + "epoch": 0.5977298961138899, + "flos": 750616699392.0, + "grad_norm": 0.04889819009677852, + "language_loss": 0.8164891, + "learning_rate": 0.0003676338118083025, + "loss": 0.82728875, + "num_input_tokens_seen": 259162512, + "router_z_loss_mlp": 0.36230469, + "step": 3107, + "time_per_iteration": 2.997671127319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076541, + "balance_loss_mlp": 1.04161251, + "epoch": 0.5979222777991535, + "flos": 530703360000.0, + "grad_norm": 0.05034919609110883, + "language_loss": 0.79592144, + "learning_rate": 0.0003673334104432347, + "loss": 0.80668688, + "num_input_tokens_seen": 259228752, + "router_z_loss_mlp": 0.34960938, + "step": 3108, + "time_per_iteration": 2.5946898460388184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079709, + "balance_loss_mlp": 1.04461432, + "epoch": 0.5981146594844171, + "flos": 621459357696.0, + "grad_norm": 0.04952863942356172, + "language_loss": 0.83337331, + "learning_rate": 0.0003670330605839048, + "loss": 0.84417045, + "num_input_tokens_seen": 259303440, + "router_z_loss_mlp": 0.35131836, + "step": 3109, + "time_per_iteration": 2.7955031394958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080275, + "balance_loss_mlp": 1.04470301, + "epoch": 0.5983070411696807, + "flos": 603309911040.0, + "grad_norm": 0.05233505638894281, + "language_loss": 0.76384044, + "learning_rate": 0.0003667327623469191, + "loss": 0.77464318, + "num_input_tokens_seen": 259378752, + "router_z_loss_mlp": 0.35571289, + "step": 3110, + "time_per_iteration": 2.7939095497131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080046, + "balance_loss_mlp": 1.04516506, + "epoch": 0.5984994228549442, + "flos": 633187971072.0, + "grad_norm": 0.05191698416970628, + "language_loss": 0.7765972, + "learning_rate": 0.00036643251584886333, + "loss": 0.78739762, + "num_input_tokens_seen": 259454336, + "router_z_loss_mlp": 0.34912109, + "step": 3111, + "time_per_iteration": 2.821956157684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076682, + "balance_loss_mlp": 1.0426122, + "epoch": 0.5986918045402078, + "flos": 525026022912.0, + "grad_norm": 0.05255438672232182, + "language_loss": 0.81679058, + "learning_rate": 0.00036613232120630393, + "loss": 0.82755744, + "num_input_tokens_seen": 259518960, + "router_z_loss_mlp": 0.34106445, + "step": 3112, + "time_per_iteration": 2.61639142036438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072562, + "balance_loss_mlp": 1.03751469, + "epoch": 0.5988841862254713, + "flos": 482942103552.0, + "grad_norm": 0.06309856820969045, + "language_loss": 0.8010537, + "learning_rate": 0.00036583217853578643, + "loss": 0.81177926, + "num_input_tokens_seen": 259584352, + "router_z_loss_mlp": 0.35083008, + "step": 3113, + "time_per_iteration": 2.544152021408081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076935, + "balance_loss_mlp": 1.04241252, + "epoch": 0.5990765679107349, + "flos": 1139658674688.0, + "grad_norm": 0.05746596179478014, + "language_loss": 0.7739538, + "learning_rate": 0.000365532087953837, + "loss": 0.78472316, + "num_input_tokens_seen": 259693152, + "router_z_loss_mlp": 0.34545898, + "step": 3114, + "time_per_iteration": 3.6210074424743652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074738, + "balance_loss_mlp": 1.04104948, + "epoch": 0.5992689495959984, + "flos": 516729701376.0, + "grad_norm": 0.0590793434639382, + "language_loss": 0.89283043, + "learning_rate": 0.00036523204957696065, + "loss": 0.9035778, + "num_input_tokens_seen": 259762048, + "router_z_loss_mlp": 0.3371582, + "step": 3115, + "time_per_iteration": 2.5835559368133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079472, + "balance_loss_mlp": 1.0447346, + "epoch": 0.599461331281262, + "flos": 744288998400.0, + "grad_norm": 0.05148674480480004, + "language_loss": 0.80590332, + "learning_rate": 0.00036493206352164324, + "loss": 0.81669807, + "num_input_tokens_seen": 259843184, + "router_z_loss_mlp": 0.34790039, + "step": 3116, + "time_per_iteration": 2.9135849475860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073646, + "balance_loss_mlp": 1.03960013, + "epoch": 0.5996537129665256, + "flos": 592078892544.0, + "grad_norm": 0.05828379622393402, + "language_loss": 0.85252976, + "learning_rate": 0.000364632129904349, + "loss": 0.86326623, + "num_input_tokens_seen": 259912720, + "router_z_loss_mlp": 0.34082031, + "step": 3117, + "time_per_iteration": 2.7019104957580566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107531, + "balance_loss_mlp": 1.03997648, + "epoch": 0.5998460946517892, + "flos": 558735045120.0, + "grad_norm": 0.05080253376139345, + "language_loss": 0.77507442, + "learning_rate": 0.00036433224884152283, + "loss": 0.78582752, + "num_input_tokens_seen": 259985472, + "router_z_loss_mlp": 0.35375977, + "step": 3118, + "time_per_iteration": 2.698032855987549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082073, + "balance_loss_mlp": 1.04814649, + "epoch": 0.6000384763370528, + "flos": 484325789184.0, + "grad_norm": 0.058104830427354655, + "language_loss": 0.77595496, + "learning_rate": 0.00036403242044958875, + "loss": 0.78677565, + "num_input_tokens_seen": 260050336, + "router_z_loss_mlp": 0.33959961, + "step": 3119, + "time_per_iteration": 2.5694661140441895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082708, + "balance_loss_mlp": 1.04763699, + "epoch": 0.6002308580223162, + "flos": 596490407424.0, + "grad_norm": 0.05350136271967441, + "language_loss": 0.91317761, + "learning_rate": 0.0003637326448449507, + "loss": 0.92400473, + "num_input_tokens_seen": 260120304, + "router_z_loss_mlp": 0.35083008, + "step": 3120, + "time_per_iteration": 2.7095799446105957 + } + ], + "logging_steps": 1.0, + "max_steps": 5198, + "num_input_tokens_seen": 260120304, + "num_train_epochs": 1, + "save_steps": 1040, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7074473936158720.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/training_args.bin b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..459663e238ea62a90da439e633388cc1e16cedb6 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63f07a99639c8908760dc7ac65f4d34d749c1861fc4b5a1f91cbdcc73581ce9e +size 7992 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/zero_to_fp32.py b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-3120/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/added_tokens.json b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/config.json b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/config.json new file mode 100644 index 0000000000000000000000000000000000000000..7929d4cdbe9bb7ee3537b93d161990a8caa422ec --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/config.json @@ -0,0 +1,200 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bal_comp_loss_coef": 0.01, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": false, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": false, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "smoe_sharev3", + "norm_softmax": false, + "normalization": false, + "num_attention_heads": 32, + "num_experts": 8, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 4, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": null, + "rate_compete": 0.2, + "rate_flip": 0.05, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": false, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": false, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/generation_config.json b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/global_step4160/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/global_step4160/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..893a43f2e008fa1dc8fd56c14b177d9ebaef635c --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/global_step4160/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84a282275215c825c6202e07f99e6d19d2f498947356db7480b3ea755c63572b +size 396575120 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/global_step4160/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/global_step4160/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5285d4bee1f783c3d0d52d2bcc443f9badd595a9 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/global_step4160/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5505d2f93aff6047fd70d6e17d75802c84f32196fc5e3047421591eccbdca95 +size 396575120 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/global_step4160/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/global_step4160/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7579435d26d861f299f9ad59e038b1e6d1d3df99 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/global_step4160/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e43edf63e5f6597b36f4f99150907da2e51886e107e23701ec152b3c1681d5b9 +size 396575120 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/global_step4160/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/global_step4160/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f38c27d7c173d29898228d23d04bcc068713703 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/global_step4160/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23a67f30688f18cdeba3f7d5262e554d44ba8b33294cc55d90a42f733d16f79e +size 396575120 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/global_step4160/zero_pp_rank_0_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/global_step4160/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f18deca0ae15f926a825788a8fa12ee2be79950b --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/global_step4160/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e0c2df3ac7371f435e427c6ed80a2ae0c55734dd6d8d014d9d22483fc8244e3 +size 2117321480 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/global_step4160/zero_pp_rank_1_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/global_step4160/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..35940b65314dcae551499ddff687e5696d273ff9 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/global_step4160/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbb1082d3338cc2b175f41ad34239c90eadfff79f907e577051db34f2ed181ef +size 2117321480 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/global_step4160/zero_pp_rank_2_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/global_step4160/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e0aabcc96e1f029272e2ec90c0340161f8093a5a --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/global_step4160/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a0dcb7591135986ed041e623cb589796cf7719903e4aa28ea37c7196bb48f0a +size 2117321480 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/global_step4160/zero_pp_rank_3_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/global_step4160/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..25dddf57348426295054920f28fad4a812f5b734 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/global_step4160/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15b3ad0bf6ee2e7792b833dbd0fce4cacc6dd490883f522b9342b5cec512369d +size 2117321480 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/latest b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/latest new file mode 100644 index 0000000000000000000000000000000000000000..ae01dfd535e9ee314b565695c1d61230ecf4c494 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/latest @@ -0,0 +1 @@ +global_step4160 \ No newline at end of file diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/model-00001-of-00002.safetensors b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29d76f5d80605301aab2bba59b53a5e2582094c4 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe6c4f6ef38e8993629091331e0bbf23484cc88bdfd038f0dd17b6ec2800d855 +size 4972489328 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/model-00002-of-00002.safetensors b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..734e2cf3cd89ed57e2a45383f220b027b6e85f6d --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:614e3b0837ae65223c8729c87137355c83c78ab9c1f942e76e8cf2e431b590ff +size 3759020544 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/model.safetensors.index.json b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..90a56cf0153ed9062ff35ee2b372f7ab9e796c6a --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/model.safetensors.index.json @@ -0,0 +1,672 @@ +{ + "metadata": { + "total_size": 8731420128 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.mm_projector.layer_norm.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.layer_norm.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00002-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/rng_state_0.pth b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ef4849062bcdc8ffd2246c07673ba196a8d61a6d --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fae2114fffe9b1eea30e28bbdb4ce59046b0079ea5b8dc4682079f609d49d787 +size 14960 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/rng_state_1.pth b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..2fcb2b640bc236c26aa841680d34a91240247970 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4ff5f3a53530ac868291e2667c8f824bfa1f4fa1ce880df8223a7165ef38e11 +size 14960 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/rng_state_2.pth b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..00c3f989de00e6d58ca7345ae6f65fee0afcbdcd --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91f80a7779b0034e70106ba6cb0e3e686052334c20ce54453ee3977cc0219d15 +size 14960 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/rng_state_3.pth b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..f289913854ee3fa52a86e282421da07d85b8a4c4 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ece3bc0d0e16c43ef245cc787cbd0d63d08d460f489c4cd52adf6501b9281a18 +size 14960 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/special_tokens_map.json b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/tokenizer.model b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/tokenizer_config.json b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/trainer_state.json b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..44e28bf00bff43282ddf839e7e01803daedc5257 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/trainer_state.json @@ -0,0 +1,62433 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8003078106964217, + "eval_steps": 500, + "global_step": 4160, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03964023, + "balance_loss_mlp": 3.01339984, + "epoch": 0.00019238168526356292, + "flos": 470464353792.0, + "grad_norm": 27.10233905437441, + "language_loss": 3.72295761, + "learning_rate": 0.0, + "loss": 2.48840261, + "num_input_tokens_seen": 67104, + "router_z_loss_mlp": 9.5, + "step": 1, + "time_per_iteration": 29.606513023376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01906453, + "balance_loss_mlp": 1.25642872, + "epoch": 0.00038476337052712584, + "flos": 504311436288.0, + "grad_norm": 2.874173750989579, + "language_loss": 1.79264998, + "learning_rate": 0.00013726078121135892, + "loss": 1.81171465, + "num_input_tokens_seen": 134080, + "router_z_loss_mlp": 6.5, + "step": 2, + "time_per_iteration": 2.7078208923339844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01915611, + "balance_loss_mlp": 1.2667315, + "epoch": 0.0005771450557906887, + "flos": 598869282816.0, + "grad_norm": 2.1141296462778643, + "language_loss": 1.61429811, + "learning_rate": 0.00021755319103969496, + "loss": 1.63345432, + "num_input_tokens_seen": 205152, + "router_z_loss_mlp": 6.48828125, + "step": 3, + "time_per_iteration": 3.010409116744995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01917319, + "balance_loss_mlp": 1.26309848, + "epoch": 0.0007695267410542517, + "flos": 580133491200.0, + "grad_norm": 1.255159247360545, + "language_loss": 1.49202251, + "learning_rate": 0.00027452156242271784, + "loss": 1.51119578, + "num_input_tokens_seen": 269664, + "router_z_loss_mlp": 6.54296875, + "step": 4, + "time_per_iteration": 2.7161622047424316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0185163, + "balance_loss_mlp": 1.22144234, + "epoch": 0.0009619084263178145, + "flos": 485857861632.0, + "grad_norm": 4.267520959606063, + "language_loss": 1.57359505, + "learning_rate": 0.0003187096642208417, + "loss": 1.59211147, + "num_input_tokens_seen": 338560, + "router_z_loss_mlp": 6.296875, + "step": 5, + "time_per_iteration": 2.718417167663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01828185, + "balance_loss_mlp": 1.21211123, + "epoch": 0.0011542901115813775, + "flos": 559744791552.0, + "grad_norm": 1.225349312557607, + "language_loss": 1.4752574, + "learning_rate": 0.0003548139722510539, + "loss": 1.49353933, + "num_input_tokens_seen": 410112, + "router_z_loss_mlp": 6.15234375, + "step": 6, + "time_per_iteration": 2.6827874183654785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01821666, + "balance_loss_mlp": 1.22428453, + "epoch": 0.0013466717968449403, + "flos": 533721014784.0, + "grad_norm": 0.5025899606895544, + "language_loss": 1.33846116, + "learning_rate": 0.00038533972973918044, + "loss": 1.35667801, + "num_input_tokens_seen": 477552, + "router_z_loss_mlp": 5.96875, + "step": 7, + "time_per_iteration": 2.6889517307281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01776667, + "balance_loss_mlp": 1.2090404, + "epoch": 0.0015390534821085034, + "flos": 492037175808.0, + "grad_norm": 0.1719820928967348, + "language_loss": 1.2814672, + "learning_rate": 0.0004117823436340768, + "loss": 1.29923391, + "num_input_tokens_seen": 549184, + "router_z_loss_mlp": 5.6875, + "step": 8, + "time_per_iteration": 2.7207248210906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0177577, + "balance_loss_mlp": 1.23217535, + "epoch": 0.0017314351673720662, + "flos": 564402207744.0, + "grad_norm": 0.6128716609675008, + "language_loss": 1.39861906, + "learning_rate": 0.00043510638207938993, + "loss": 1.41637683, + "num_input_tokens_seen": 622880, + "router_z_loss_mlp": 5.44140625, + "step": 9, + "time_per_iteration": 2.887538194656372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01823371, + "balance_loss_mlp": 1.31334615, + "epoch": 0.001923816852635629, + "flos": 593132308992.0, + "grad_norm": 0.480897383035181, + "language_loss": 1.25963569, + "learning_rate": 0.00045597044543220066, + "loss": 1.27786922, + "num_input_tokens_seen": 693584, + "router_z_loss_mlp": 5.09765625, + "step": 10, + "time_per_iteration": 2.7672832012176514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01930298, + "balance_loss_mlp": 1.44621277, + "epoch": 0.002116198537899192, + "flos": 609308752896.0, + "grad_norm": 0.21803247425844502, + "language_loss": 1.22959518, + "learning_rate": 0.00047484428652143135, + "loss": 1.24889803, + "num_input_tokens_seen": 774432, + "router_z_loss_mlp": 4.83203125, + "step": 11, + "time_per_iteration": 2.9771082401275635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02130152, + "balance_loss_mlp": 1.67772901, + "epoch": 0.002308580223162755, + "flos": 544869075456.0, + "grad_norm": 0.19847359144835577, + "language_loss": 1.28057694, + "learning_rate": 0.0004920747534624128, + "loss": 1.30187845, + "num_input_tokens_seen": 844304, + "router_z_loss_mlp": 4.52734375, + "step": 12, + "time_per_iteration": 2.6094090938568115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02177014, + "balance_loss_mlp": 1.7512939, + "epoch": 0.002500961908426318, + "flos": 644458277376.0, + "grad_norm": 0.3126355826019607, + "language_loss": 1.29235363, + "learning_rate": 0.0005079252465375872, + "loss": 1.31412375, + "num_input_tokens_seen": 915104, + "router_z_loss_mlp": 4.265625, + "step": 13, + "time_per_iteration": 2.841792345046997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02141221, + "balance_loss_mlp": 1.74411082, + "epoch": 0.0026933435936898806, + "flos": 487605312000.0, + "grad_norm": 0.282411779716686, + "language_loss": 1.17459798, + "learning_rate": 0.0005226005109505393, + "loss": 1.19601011, + "num_input_tokens_seen": 982720, + "router_z_loss_mlp": 3.96875, + "step": 14, + "time_per_iteration": 2.597313165664673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02024541, + "balance_loss_mlp": 1.65890288, + "epoch": 0.0028857252789534437, + "flos": 434368949760.0, + "grad_norm": 0.2583476739022616, + "language_loss": 1.22957516, + "learning_rate": 0.0005362628552605367, + "loss": 1.24982059, + "num_input_tokens_seen": 1050528, + "router_z_loss_mlp": 3.65234375, + "step": 15, + "time_per_iteration": 2.6388704776763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01790575, + "balance_loss_mlp": 1.44687057, + "epoch": 0.0030781069642170067, + "flos": 596465676288.0, + "grad_norm": 0.18613747071639053, + "language_loss": 1.27631426, + "learning_rate": 0.0005490431248454357, + "loss": 1.29421997, + "num_input_tokens_seen": 1116512, + "router_z_loss_mlp": 3.44140625, + "step": 16, + "time_per_iteration": 2.708346128463745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01779165, + "balance_loss_mlp": 1.46941185, + "epoch": 0.0032704886494805694, + "flos": 1537360432128.0, + "grad_norm": 0.2733785965407311, + "language_loss": 0.75705111, + "learning_rate": 0.0005610483427624225, + "loss": 0.77484274, + "num_input_tokens_seen": 1351216, + "router_z_loss_mlp": 3.09375, + "step": 17, + "time_per_iteration": 6.916250705718994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01553778, + "balance_loss_mlp": 1.24955583, + "epoch": 0.0034628703347441324, + "flos": 473720403456.0, + "grad_norm": 0.11658431553946913, + "language_loss": 1.14468098, + "learning_rate": 0.0005723671632907488, + "loss": 1.16021872, + "num_input_tokens_seen": 1420512, + "router_z_loss_mlp": 3.03710938, + "step": 18, + "time_per_iteration": 2.7716212272644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01490625, + "balance_loss_mlp": 1.21005416, + "epoch": 0.0036552520200076955, + "flos": 448303320576.0, + "grad_norm": 0.11552730485963776, + "language_loss": 1.19723654, + "learning_rate": 0.0005830738490244919, + "loss": 1.21214283, + "num_input_tokens_seen": 1484976, + "router_z_loss_mlp": 2.80859375, + "step": 19, + "time_per_iteration": 2.6067557334899902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0141948, + "balance_loss_mlp": 1.16103387, + "epoch": 0.003847633705271258, + "flos": 635881148928.0, + "grad_norm": 0.11977740619668105, + "language_loss": 1.21676993, + "learning_rate": 0.0005932312266435596, + "loss": 1.23096466, + "num_input_tokens_seen": 1557392, + "router_z_loss_mlp": 2.58398438, + "step": 20, + "time_per_iteration": 2.8545703887939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01364308, + "balance_loss_mlp": 1.13084817, + "epoch": 0.004040015390534821, + "flos": 589222771200.0, + "grad_norm": 0.09935322828728523, + "language_loss": 1.16681409, + "learning_rate": 0.0006028929207788754, + "loss": 1.18045723, + "num_input_tokens_seen": 1626064, + "router_z_loss_mlp": 2.33203125, + "step": 21, + "time_per_iteration": 2.7119524478912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319718, + "balance_loss_mlp": 1.11038613, + "epoch": 0.004232397075798384, + "flos": 756253338624.0, + "grad_norm": 0.09023283304690737, + "language_loss": 1.20250762, + "learning_rate": 0.0006121050677327902, + "loss": 1.21570492, + "num_input_tokens_seen": 1696528, + "router_z_loss_mlp": 2.09667969, + "step": 22, + "time_per_iteration": 2.884739398956299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01304467, + "balance_loss_mlp": 1.1184051, + "epoch": 0.004424778761061947, + "flos": 526434439680.0, + "grad_norm": 0.08559602389751407, + "language_loss": 1.10067201, + "learning_rate": 0.0006209076479463684, + "loss": 1.1137166, + "num_input_tokens_seen": 1765936, + "router_z_loss_mlp": 1.85839844, + "step": 23, + "time_per_iteration": 2.6616718769073486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01275434, + "balance_loss_mlp": 1.10787356, + "epoch": 0.00461716044632551, + "flos": 547907079168.0, + "grad_norm": 0.07141137445072718, + "language_loss": 1.2012924, + "learning_rate": 0.0006293355346737718, + "loss": 1.21404672, + "num_input_tokens_seen": 1841632, + "router_z_loss_mlp": 1.67675781, + "step": 24, + "time_per_iteration": 2.7025952339172363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252583, + "balance_loss_mlp": 1.10476315, + "epoch": 0.004809542131589073, + "flos": 567293234688.0, + "grad_norm": 0.08524381015789384, + "language_loss": 1.16738653, + "learning_rate": 0.0006374193284416834, + "loss": 1.17991233, + "num_input_tokens_seen": 1920256, + "router_z_loss_mlp": 1.47753906, + "step": 25, + "time_per_iteration": 2.827439069747925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223638, + "balance_loss_mlp": 1.0984205, + "epoch": 0.005001923816852636, + "flos": 470391418368.0, + "grad_norm": 0.08512374611478205, + "language_loss": 1.15399337, + "learning_rate": 0.0006451860277489461, + "loss": 1.16622972, + "num_input_tokens_seen": 1986528, + "router_z_loss_mlp": 1.25097656, + "step": 26, + "time_per_iteration": 2.6214864253997803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206141, + "balance_loss_mlp": 1.10009253, + "epoch": 0.005194305502116198, + "flos": 415283950080.0, + "grad_norm": 0.07774032731783902, + "language_loss": 1.23061514, + "learning_rate": 0.0006526595731190848, + "loss": 1.2426765, + "num_input_tokens_seen": 2048016, + "router_z_loss_mlp": 1.0625, + "step": 27, + "time_per_iteration": 2.5637125968933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117904, + "balance_loss_mlp": 1.09192181, + "epoch": 0.005386687187379761, + "flos": 628466535936.0, + "grad_norm": 0.05524077436438855, + "language_loss": 1.1626848, + "learning_rate": 0.0006598612921618983, + "loss": 1.17447519, + "num_input_tokens_seen": 2127664, + "router_z_loss_mlp": 0.87158203, + "step": 28, + "time_per_iteration": 2.8202784061431885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159441, + "balance_loss_mlp": 1.08772469, + "epoch": 0.005579068872643324, + "flos": 886100332032.0, + "grad_norm": 0.07386109802626846, + "language_loss": 1.08505416, + "learning_rate": 0.0006668102665011454, + "loss": 1.09664845, + "num_input_tokens_seen": 2213952, + "router_z_loss_mlp": 0.71728516, + "step": 29, + "time_per_iteration": 3.2254040241241455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154142, + "balance_loss_mlp": 1.09520459, + "epoch": 0.005771450557906887, + "flos": 547287238656.0, + "grad_norm": 0.0797557646441396, + "language_loss": 1.18077409, + "learning_rate": 0.0006735236364718957, + "loss": 1.19231534, + "num_input_tokens_seen": 2284736, + "router_z_loss_mlp": 0.58886719, + "step": 30, + "time_per_iteration": 2.6730945110321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140737, + "balance_loss_mlp": 1.09384, + "epoch": 0.00596383224317045, + "flos": 531766950912.0, + "grad_norm": 0.060827451674393726, + "language_loss": 1.1687839, + "learning_rate": 0.0006800168558381346, + "loss": 1.18019128, + "num_input_tokens_seen": 2354384, + "router_z_loss_mlp": 0.46875, + "step": 31, + "time_per_iteration": 2.649216651916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148736, + "balance_loss_mlp": 1.11166239, + "epoch": 0.0061562139284340135, + "flos": 588813926400.0, + "grad_norm": 0.10592463777190406, + "language_loss": 1.19211543, + "learning_rate": 0.0006863039060567947, + "loss": 1.20360279, + "num_input_tokens_seen": 2419440, + "router_z_loss_mlp": 0.37084961, + "step": 32, + "time_per_iteration": 2.6697018146514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132499, + "balance_loss_mlp": 1.10136151, + "epoch": 0.006348595613697576, + "flos": 617929551360.0, + "grad_norm": 0.09812744917576391, + "language_loss": 1.1217525, + "learning_rate": 0.0006923974775611263, + "loss": 1.13307738, + "num_input_tokens_seen": 2496368, + "router_z_loss_mlp": 0.3112793, + "step": 33, + "time_per_iteration": 2.770225763320923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137532, + "balance_loss_mlp": 1.11146092, + "epoch": 0.006540977298961139, + "flos": 777564444672.0, + "grad_norm": 0.06513543096417564, + "language_loss": 1.08375585, + "learning_rate": 0.0006983091239737814, + "loss": 1.09513116, + "num_input_tokens_seen": 2573280, + "router_z_loss_mlp": 0.26086426, + "step": 34, + "time_per_iteration": 2.99418306350708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128276, + "balance_loss_mlp": 1.10578084, + "epoch": 0.006733358984224702, + "flos": 666837356544.0, + "grad_norm": 0.06344935516817307, + "language_loss": 1.07062221, + "learning_rate": 0.0007040493939600222, + "loss": 1.08190489, + "num_input_tokens_seen": 2647248, + "router_z_loss_mlp": 0.22497559, + "step": 35, + "time_per_iteration": 2.9126057624816895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119708, + "balance_loss_mlp": 1.09892988, + "epoch": 0.006925740669488265, + "flos": 564092287488.0, + "grad_norm": 0.06579143759664555, + "language_loss": 1.07960629, + "learning_rate": 0.0007096279445021078, + "loss": 1.09080338, + "num_input_tokens_seen": 2720736, + "router_z_loss_mlp": 0.20788574, + "step": 36, + "time_per_iteration": 2.7079102993011475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114855, + "balance_loss_mlp": 1.09574544, + "epoch": 0.007118122354751828, + "flos": 549583156224.0, + "grad_norm": 0.14799474820221378, + "language_loss": 1.14634764, + "learning_rate": 0.0007150536386503726, + "loss": 1.15749621, + "num_input_tokens_seen": 2800336, + "router_z_loss_mlp": 0.19104004, + "step": 37, + "time_per_iteration": 2.8290467262268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104011, + "balance_loss_mlp": 1.08533084, + "epoch": 0.007310504040015391, + "flos": 702161409024.0, + "grad_norm": 0.2513092385422617, + "language_loss": 1.08396375, + "learning_rate": 0.0007203346302358509, + "loss": 1.09500384, + "num_input_tokens_seen": 2883184, + "router_z_loss_mlp": 0.18688965, + "step": 38, + "time_per_iteration": 2.961430311203003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121274, + "balance_loss_mlp": 1.10231924, + "epoch": 0.007502885725278953, + "flos": 599022051840.0, + "grad_norm": 0.0999674626629785, + "language_loss": 1.11391926, + "learning_rate": 0.000725478437577282, + "loss": 1.12513208, + "num_input_tokens_seen": 2960736, + "router_z_loss_mlp": 0.18945312, + "step": 39, + "time_per_iteration": 2.742088556289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146989, + "balance_loss_mlp": 1.12810588, + "epoch": 0.007695267410542516, + "flos": 560000867328.0, + "grad_norm": 0.3323772184023467, + "language_loss": 1.08355689, + "learning_rate": 0.0007304920078549186, + "loss": 1.09502685, + "num_input_tokens_seen": 3033472, + "router_z_loss_mlp": 0.18884277, + "step": 40, + "time_per_iteration": 2.66943621635437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116486, + "balance_loss_mlp": 1.1452378, + "epoch": 0.007887649095806078, + "flos": 507906671616.0, + "grad_norm": 0.11539272036457353, + "language_loss": 1.09356606, + "learning_rate": 0.0007353817735343603, + "loss": 1.1052146, + "num_input_tokens_seen": 3107824, + "router_z_loss_mlp": 0.19604492, + "step": 41, + "time_per_iteration": 2.7052595615386963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132998, + "balance_loss_mlp": 1.11293542, + "epoch": 0.008080030781069641, + "flos": 503642133504.0, + "grad_norm": 0.12251683576194117, + "language_loss": 1.04851842, + "learning_rate": 0.0007401537019902344, + "loss": 1.05984843, + "num_input_tokens_seen": 3176528, + "router_z_loss_mlp": 0.20056152, + "step": 42, + "time_per_iteration": 2.590432643890381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124507, + "balance_loss_mlp": 1.10198867, + "epoch": 0.008272412466333205, + "flos": 517764178944.0, + "grad_norm": 0.09393858903586973, + "language_loss": 1.08539796, + "learning_rate": 0.0007448133392900729, + "loss": 1.09664297, + "num_input_tokens_seen": 3254256, + "router_z_loss_mlp": 0.22521973, + "step": 43, + "time_per_iteration": 2.6619081497192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112544, + "balance_loss_mlp": 1.10156202, + "epoch": 0.008464794151596768, + "flos": 607673373696.0, + "grad_norm": 0.06822323064374927, + "language_loss": 1.03845203, + "learning_rate": 0.0007493658489441491, + "loss": 1.04970646, + "num_input_tokens_seen": 3340224, + "router_z_loss_mlp": 0.23864746, + "step": 44, + "time_per_iteration": 2.861008644104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128905, + "balance_loss_mlp": 1.10477662, + "epoch": 0.00865717583686033, + "flos": 537661075968.0, + "grad_norm": 0.1413166066405165, + "language_loss": 1.08820629, + "learning_rate": 0.0007538160463002316, + "loss": 1.09949529, + "num_input_tokens_seen": 3409216, + "router_z_loss_mlp": 0.2409668, + "step": 45, + "time_per_iteration": 2.643458604812622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115676, + "balance_loss_mlp": 1.13258433, + "epoch": 0.008849557522123894, + "flos": 507758284800.0, + "grad_norm": 0.08570115972640321, + "language_loss": 1.10720444, + "learning_rate": 0.0007581684291577274, + "loss": 1.11877203, + "num_input_tokens_seen": 3478352, + "router_z_loss_mlp": 0.24157715, + "step": 46, + "time_per_iteration": 2.5904788970947266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145761, + "balance_loss_mlp": 1.12085772, + "epoch": 0.009041939207387457, + "flos": 625048800768.0, + "grad_norm": 0.06636849455276843, + "language_loss": 1.14156199, + "learning_rate": 0.0007624272050891776, + "loss": 1.15301955, + "num_input_tokens_seen": 3555616, + "router_z_loss_mlp": 0.24902344, + "step": 47, + "time_per_iteration": 2.782179594039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154374, + "balance_loss_mlp": 1.12759995, + "epoch": 0.00923432089265102, + "flos": 549124849152.0, + "grad_norm": 0.09356522507451794, + "language_loss": 1.04615343, + "learning_rate": 0.0007665963158851307, + "loss": 1.05769718, + "num_input_tokens_seen": 3634512, + "router_z_loss_mlp": 0.26806641, + "step": 48, + "time_per_iteration": 2.824540138244629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174661, + "balance_loss_mlp": 1.14738548, + "epoch": 0.009426702577914583, + "flos": 562202242560.0, + "grad_norm": 0.059100241584136314, + "language_loss": 1.12381458, + "learning_rate": 0.0007706794594783609, + "loss": 1.13556111, + "num_input_tokens_seen": 3708480, + "router_z_loss_mlp": 0.27270508, + "step": 49, + "time_per_iteration": 2.790757894515991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192673, + "balance_loss_mlp": 1.16604137, + "epoch": 0.009619084263178146, + "flos": 616486228992.0, + "grad_norm": 0.08074806779925832, + "language_loss": 1.11280799, + "learning_rate": 0.0007746801096530423, + "loss": 1.12473488, + "num_input_tokens_seen": 3783472, + "router_z_loss_mlp": 0.2668457, + "step": 50, + "time_per_iteration": 2.7235305309295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116178, + "balance_loss_mlp": 1.135149, + "epoch": 0.009811465948441709, + "flos": 541176325632.0, + "grad_norm": 0.06558886342971224, + "language_loss": 1.16576111, + "learning_rate": 0.0007786015338021173, + "loss": 1.17737889, + "num_input_tokens_seen": 3851360, + "router_z_loss_mlp": 0.26672363, + "step": 51, + "time_per_iteration": 2.6817519664764404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134628, + "balance_loss_mlp": 1.1085453, + "epoch": 0.010003847633705272, + "flos": 535608087552.0, + "grad_norm": 0.06210449580458492, + "language_loss": 1.08870959, + "learning_rate": 0.0007824468089603051, + "loss": 1.10005593, + "num_input_tokens_seen": 3923056, + "router_z_loss_mlp": 0.26098633, + "step": 52, + "time_per_iteration": 2.644577980041504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125522, + "balance_loss_mlp": 1.09910512, + "epoch": 0.010196229318968833, + "flos": 908867907072.0, + "grad_norm": 0.05864822926220488, + "language_loss": 1.07807887, + "learning_rate": 0.0007862188363098669, + "loss": 1.08933413, + "num_input_tokens_seen": 4004528, + "router_z_loss_mlp": 0.26428223, + "step": 53, + "time_per_iteration": 3.1450047492980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126237, + "balance_loss_mlp": 1.10084558, + "epoch": 0.010388611004232396, + "flos": 585594040320.0, + "grad_norm": 0.07974065634267835, + "language_loss": 1.08295977, + "learning_rate": 0.0007899203543304438, + "loss": 1.09422219, + "num_input_tokens_seen": 4078704, + "router_z_loss_mlp": 0.25390625, + "step": 54, + "time_per_iteration": 2.6822280883789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155972, + "balance_loss_mlp": 1.13315582, + "epoch": 0.01058099268949596, + "flos": 502233716736.0, + "grad_norm": 0.07014139109577967, + "language_loss": 1.22212756, + "learning_rate": 0.0007935539507422731, + "loss": 1.23368728, + "num_input_tokens_seen": 4143600, + "router_z_loss_mlp": 0.22814941, + "step": 55, + "time_per_iteration": 2.5841405391693115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117516, + "balance_loss_mlp": 1.153512, + "epoch": 0.010773374374759523, + "flos": 544170659328.0, + "grad_norm": 0.07006342440897594, + "language_loss": 1.13914931, + "learning_rate": 0.0007971220733732573, + "loss": 1.15090084, + "num_input_tokens_seen": 4217904, + "router_z_loss_mlp": 0.21643066, + "step": 56, + "time_per_iteration": 2.697427988052368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193099, + "balance_loss_mlp": 1.17267895, + "epoch": 0.010965756060023086, + "flos": 525874235904.0, + "grad_norm": 0.08125896119424647, + "language_loss": 1.0764755, + "learning_rate": 0.0008006270400641869, + "loss": 1.08840656, + "num_input_tokens_seen": 4293920, + "router_z_loss_mlp": 0.2043457, + "step": 57, + "time_per_iteration": 2.723154306411743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174019, + "balance_loss_mlp": 1.15412247, + "epoch": 0.011158137745286649, + "flos": 576653147136.0, + "grad_norm": 0.07485866075688756, + "language_loss": 1.09104013, + "learning_rate": 0.0008040710477125043, + "loss": 1.10278034, + "num_input_tokens_seen": 4370080, + "router_z_loss_mlp": 0.19897461, + "step": 58, + "time_per_iteration": 2.703186273574829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153983, + "balance_loss_mlp": 1.13440859, + "epoch": 0.011350519430550212, + "flos": 529024310784.0, + "grad_norm": 0.06764829366941465, + "language_loss": 1.09780312, + "learning_rate": 0.0008074561805429771, + "loss": 1.10934305, + "num_input_tokens_seen": 4439792, + "router_z_loss_mlp": 0.19567871, + "step": 59, + "time_per_iteration": 2.6111674308776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136624, + "balance_loss_mlp": 1.11676335, + "epoch": 0.011542901115813775, + "flos": 555608291328.0, + "grad_norm": 0.06986870516034673, + "language_loss": 1.08079648, + "learning_rate": 0.0008107844176832545, + "loss": 1.09216261, + "num_input_tokens_seen": 4510800, + "router_z_loss_mlp": 0.19848633, + "step": 60, + "time_per_iteration": 2.682687997817993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125651, + "balance_loss_mlp": 1.1056236, + "epoch": 0.011735282801077338, + "flos": 571826995200.0, + "grad_norm": 0.061548073586970495, + "language_loss": 1.09071934, + "learning_rate": 0.0008140576401132568, + "loss": 1.10197592, + "num_input_tokens_seen": 4581136, + "router_z_loss_mlp": 0.20019531, + "step": 61, + "time_per_iteration": 2.639394760131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111743, + "balance_loss_mlp": 1.09838021, + "epoch": 0.0119276644863409, + "flos": 615309156864.0, + "grad_norm": 0.06273761556608791, + "language_loss": 1.10558033, + "learning_rate": 0.0008172776370494935, + "loss": 1.11675453, + "num_input_tokens_seen": 4650352, + "router_z_loss_mlp": 0.19030762, + "step": 62, + "time_per_iteration": 2.7110230922698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134294, + "balance_loss_mlp": 1.11483955, + "epoch": 0.012120046171604464, + "flos": 500835474432.0, + "grad_norm": 0.07391589684249159, + "language_loss": 1.17346644, + "learning_rate": 0.0008204461118185703, + "loss": 1.18480933, + "num_input_tokens_seen": 4716336, + "router_z_loss_mlp": 0.19445801, + "step": 63, + "time_per_iteration": 2.5490689277648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142708, + "balance_loss_mlp": 1.12420678, + "epoch": 0.012312427856868027, + "flos": 473109327360.0, + "grad_norm": 0.05825974543220343, + "language_loss": 1.06081367, + "learning_rate": 0.0008235646872681536, + "loss": 1.07224083, + "num_input_tokens_seen": 4781648, + "router_z_loss_mlp": 0.18505859, + "step": 64, + "time_per_iteration": 2.5874247550964355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139504, + "balance_loss_mlp": 1.12069249, + "epoch": 0.012504809542131588, + "flos": 538094651904.0, + "grad_norm": 0.1040066778051144, + "language_loss": 1.06503749, + "learning_rate": 0.0008266349107584288, + "loss": 1.07643247, + "num_input_tokens_seen": 4852320, + "router_z_loss_mlp": 0.18823242, + "step": 65, + "time_per_iteration": 2.678736925125122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123492, + "balance_loss_mlp": 1.10500288, + "epoch": 0.012697191227395151, + "flos": 608450365440.0, + "grad_norm": 0.09066354406474254, + "language_loss": 1.09410381, + "learning_rate": 0.0008296582587724851, + "loss": 1.10533869, + "num_input_tokens_seen": 4922016, + "router_z_loss_mlp": 0.18481445, + "step": 66, + "time_per_iteration": 2.6937255859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121105, + "balance_loss_mlp": 1.10255599, + "epoch": 0.012889572912658714, + "flos": 767750607360.0, + "grad_norm": 0.11790618145169461, + "language_loss": 1.07982886, + "learning_rate": 0.0008326361411800136, + "loss": 1.0910399, + "num_input_tokens_seen": 5000128, + "router_z_loss_mlp": 0.1854248, + "step": 67, + "time_per_iteration": 2.9377663135528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096346, + "balance_loss_mlp": 1.07871521, + "epoch": 0.013081954597922277, + "flos": 533604561408.0, + "grad_norm": 0.09153807632987658, + "language_loss": 1.08335972, + "learning_rate": 0.0008355699051851403, + "loss": 1.09432316, + "num_input_tokens_seen": 5074512, + "router_z_loss_mlp": 0.17651367, + "step": 68, + "time_per_iteration": 2.7278473377227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104023, + "balance_loss_mlp": 1.0865227, + "epoch": 0.01327433628318584, + "flos": 572826567168.0, + "grad_norm": 0.08317322449907456, + "language_loss": 1.14837921, + "learning_rate": 0.0008384608389860635, + "loss": 1.15941942, + "num_input_tokens_seen": 5141856, + "router_z_loss_mlp": 0.1751709, + "step": 69, + "time_per_iteration": 2.7238211631774902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111418, + "balance_loss_mlp": 1.09424019, + "epoch": 0.013466717968449404, + "flos": 497029243392.0, + "grad_norm": 0.08213812906773327, + "language_loss": 1.04970825, + "learning_rate": 0.000841310175171381, + "loss": 1.06082237, + "num_input_tokens_seen": 5209280, + "router_z_loss_mlp": 0.17199707, + "step": 70, + "time_per_iteration": 2.578726291656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101987, + "balance_loss_mlp": 1.08526158, + "epoch": 0.013659099653712967, + "flos": 565234454016.0, + "grad_norm": 0.06358988870017376, + "language_loss": 1.03380442, + "learning_rate": 0.000844119093875517, + "loss": 1.04482436, + "num_input_tokens_seen": 5285424, + "router_z_loss_mlp": 0.16723633, + "step": 71, + "time_per_iteration": 2.692791223526001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103533, + "balance_loss_mlp": 1.08689094, + "epoch": 0.01385148133897653, + "flos": 573540950016.0, + "grad_norm": 0.07461407963015444, + "language_loss": 1.08098376, + "learning_rate": 0.0008468887257134666, + "loss": 1.09201908, + "num_input_tokens_seen": 5358624, + "router_z_loss_mlp": 0.16650391, + "step": 72, + "time_per_iteration": 2.6599459648132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122889, + "balance_loss_mlp": 1.10587776, + "epoch": 0.014043863024240093, + "flos": 576539665920.0, + "grad_norm": 0.05931650266846123, + "language_loss": 1.10316896, + "learning_rate": 0.0008496201545131264, + "loss": 1.11439776, + "num_input_tokens_seen": 5429792, + "router_z_loss_mlp": 0.17028809, + "step": 73, + "time_per_iteration": 2.7093684673309326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126213, + "balance_loss_mlp": 1.10950017, + "epoch": 0.014236244709503656, + "flos": 938287660032.0, + "grad_norm": 0.060718352480344094, + "language_loss": 1.08902812, + "learning_rate": 0.0008523144198617317, + "loss": 1.1002903, + "num_input_tokens_seen": 5518608, + "router_z_loss_mlp": 0.16711426, + "step": 74, + "time_per_iteration": 3.1743276119232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125614, + "balance_loss_mlp": 1.10876918, + "epoch": 0.014428626394767219, + "flos": 528231352320.0, + "grad_norm": 0.07198154728214846, + "language_loss": 1.08249164, + "learning_rate": 0.0008549725194813783, + "loss": 1.09374774, + "num_input_tokens_seen": 5590576, + "router_z_loss_mlp": 0.1685791, + "step": 75, + "time_per_iteration": 2.630387783050537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106727, + "balance_loss_mlp": 1.09047866, + "epoch": 0.014621008080030782, + "flos": 803371433472.0, + "grad_norm": 0.07553700512989577, + "language_loss": 1.06998253, + "learning_rate": 0.0008575954114472099, + "loss": 1.0810498, + "num_input_tokens_seen": 5674224, + "router_z_loss_mlp": 0.16247559, + "step": 76, + "time_per_iteration": 3.134385347366333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109532, + "balance_loss_mlp": 1.0933075, + "epoch": 0.014813389765294343, + "flos": 696588788736.0, + "grad_norm": 0.053440596513601155, + "language_loss": 1.05069363, + "learning_rate": 0.0008601840162606118, + "loss": 1.06178904, + "num_input_tokens_seen": 5757648, + "router_z_loss_mlp": 0.16223145, + "step": 77, + "time_per_iteration": 3.039991855621338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123171, + "balance_loss_mlp": 1.10660076, + "epoch": 0.015005771450557906, + "flos": 596702813184.0, + "grad_norm": 0.07894951514499118, + "language_loss": 1.1143651, + "learning_rate": 0.000862739218788641, + "loss": 1.12559676, + "num_input_tokens_seen": 5837600, + "router_z_loss_mlp": 0.16577148, + "step": 78, + "time_per_iteration": 2.867741346359253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113228, + "balance_loss_mlp": 1.11553121, + "epoch": 0.01519815313582147, + "flos": 549148170240.0, + "grad_norm": 0.0893413961860561, + "language_loss": 1.07743871, + "learning_rate": 0.0008652618700799138, + "loss": 1.08876157, + "num_input_tokens_seen": 5907248, + "router_z_loss_mlp": 0.16760254, + "step": 79, + "time_per_iteration": 2.675795555114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160152, + "balance_loss_mlp": 1.14348662, + "epoch": 0.015390534821085032, + "flos": 430306642944.0, + "grad_norm": 0.06679936706529424, + "language_loss": 1.07125092, + "learning_rate": 0.0008677527890662774, + "loss": 1.08285248, + "num_input_tokens_seen": 5970864, + "router_z_loss_mlp": 0.16662598, + "step": 80, + "time_per_iteration": 2.4765963554382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196819, + "balance_loss_mlp": 1.17889023, + "epoch": 0.015582916506348595, + "flos": 523854743040.0, + "grad_norm": 0.12362960542988827, + "language_loss": 1.09903598, + "learning_rate": 0.0008702127641587799, + "loss": 1.11100423, + "num_input_tokens_seen": 6040800, + "router_z_loss_mlp": 0.17932129, + "step": 81, + "time_per_iteration": 2.636688470840454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180455, + "balance_loss_mlp": 1.16288388, + "epoch": 0.015775298191612157, + "flos": 575151598080.0, + "grad_norm": 0.08274533442322421, + "language_loss": 1.04032063, + "learning_rate": 0.0008726425547457192, + "loss": 1.05212522, + "num_input_tokens_seen": 6111840, + "router_z_loss_mlp": 0.17565918, + "step": 82, + "time_per_iteration": 2.765179395675659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157804, + "balance_loss_mlp": 1.14051914, + "epoch": 0.01596767987687572, + "flos": 610040664576.0, + "grad_norm": 0.07618339381967684, + "language_loss": 1.03921247, + "learning_rate": 0.0008750428925998964, + "loss": 1.05079055, + "num_input_tokens_seen": 6183872, + "router_z_loss_mlp": 0.1730957, + "step": 83, + "time_per_iteration": 2.7615418434143066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159673, + "balance_loss_mlp": 1.14280462, + "epoch": 0.016160061562139283, + "flos": 566864040960.0, + "grad_norm": 0.0706757922791228, + "language_loss": 1.09743476, + "learning_rate": 0.0008774144832015932, + "loss": 1.10903156, + "num_input_tokens_seen": 6255760, + "router_z_loss_mlp": 0.16882324, + "step": 84, + "time_per_iteration": 2.694364070892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01699218, + "balance_loss_mlp": 1.68252861, + "epoch": 0.016352443247402846, + "flos": 1410557234688.0, + "grad_norm": 0.23342967148410274, + "language_loss": 0.74774313, + "learning_rate": 0.0008797580069832641, + "loss": 0.76473522, + "num_input_tokens_seen": 6472960, + "router_z_loss_mlp": 0.16699219, + "step": 85, + "time_per_iteration": 4.599137306213379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212855, + "balance_loss_mlp": 1.19580793, + "epoch": 0.01654482493266641, + "flos": 730177127424.0, + "grad_norm": 0.09253845479208671, + "language_loss": 1.04518116, + "learning_rate": 0.0008820741205014318, + "loss": 1.05730963, + "num_input_tokens_seen": 6548912, + "router_z_loss_mlp": 0.1706543, + "step": 86, + "time_per_iteration": 2.8595266342163086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01246652, + "balance_loss_mlp": 1.22939014, + "epoch": 0.016737206617929972, + "flos": 536016932352.0, + "grad_norm": 0.10044068584300966, + "language_loss": 1.06437612, + "learning_rate": 0.0008843634575408404, + "loss": 1.07684278, + "num_input_tokens_seen": 6621520, + "router_z_loss_mlp": 0.17248535, + "step": 87, + "time_per_iteration": 2.690492630004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215448, + "balance_loss_mlp": 1.19887805, + "epoch": 0.016929588303193535, + "flos": 536706584064.0, + "grad_norm": 0.0661610487366718, + "language_loss": 1.07674646, + "learning_rate": 0.0008866266301555082, + "loss": 1.08890104, + "num_input_tokens_seen": 6698432, + "router_z_loss_mlp": 0.16577148, + "step": 88, + "time_per_iteration": 2.737339496612549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203027, + "balance_loss_mlp": 1.18706512, + "epoch": 0.017121969988457098, + "flos": 526498458624.0, + "grad_norm": 0.07897226836222233, + "language_loss": 1.08543992, + "learning_rate": 0.0008888642296509615, + "loss": 1.09747016, + "num_input_tokens_seen": 6764336, + "router_z_loss_mlp": 0.1595459, + "step": 89, + "time_per_iteration": 2.576819658279419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187655, + "balance_loss_mlp": 1.17131162, + "epoch": 0.01731435167372066, + "flos": 625304876544.0, + "grad_norm": 0.0740353605135553, + "language_loss": 1.13367987, + "learning_rate": 0.0008910768275115906, + "loss": 1.14555645, + "num_input_tokens_seen": 6839392, + "router_z_loss_mlp": 0.16345215, + "step": 90, + "time_per_iteration": 2.778571128845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173459, + "balance_loss_mlp": 1.15692425, + "epoch": 0.017506733358984224, + "flos": 496157709312.0, + "grad_norm": 0.07518713147028631, + "language_loss": 1.08794332, + "learning_rate": 0.0008932649762767675, + "loss": 1.0996778, + "num_input_tokens_seen": 6907344, + "router_z_loss_mlp": 0.16540527, + "step": 91, + "time_per_iteration": 2.5931665897369385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185544, + "balance_loss_mlp": 1.16881919, + "epoch": 0.017699115044247787, + "flos": 745613047296.0, + "grad_norm": 0.07711429280558382, + "language_loss": 1.11576343, + "learning_rate": 0.0008954292103690864, + "loss": 1.12761879, + "num_input_tokens_seen": 6982464, + "router_z_loss_mlp": 0.1673584, + "step": 92, + "time_per_iteration": 2.9129488468170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194769, + "balance_loss_mlp": 1.17854476, + "epoch": 0.01789149672951135, + "flos": 515257265664.0, + "grad_norm": 0.0669718610224715, + "language_loss": 1.1343056, + "learning_rate": 0.0008975700468778296, + "loss": 1.14625335, + "num_input_tokens_seen": 7049712, + "router_z_loss_mlp": 0.16223145, + "step": 93, + "time_per_iteration": 2.576620101928711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216953, + "balance_loss_mlp": 1.20076382, + "epoch": 0.018083878414774913, + "flos": 585850116096.0, + "grad_norm": 0.11698648494194364, + "language_loss": 1.0652318, + "learning_rate": 0.0008996879863005366, + "loss": 1.07740128, + "num_input_tokens_seen": 7120288, + "router_z_loss_mlp": 0.16186523, + "step": 94, + "time_per_iteration": 2.6751108169555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217025, + "balance_loss_mlp": 1.2013253, + "epoch": 0.018276260100038477, + "flos": 497103436800.0, + "grad_norm": 0.08327491501556071, + "language_loss": 1.06208014, + "learning_rate": 0.0009017835132453337, + "loss": 1.07425046, + "num_input_tokens_seen": 7188896, + "router_z_loss_mlp": 0.15686035, + "step": 95, + "time_per_iteration": 2.5971803665161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196463, + "balance_loss_mlp": 1.1804409, + "epoch": 0.01846864178530204, + "flos": 639765955584.0, + "grad_norm": 0.09756000368948786, + "language_loss": 1.06920743, + "learning_rate": 0.0009038570970964896, + "loss": 1.08117199, + "num_input_tokens_seen": 7259536, + "router_z_loss_mlp": 0.16027832, + "step": 96, + "time_per_iteration": 2.7428832054138184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173361, + "balance_loss_mlp": 1.15723228, + "epoch": 0.018661023470565603, + "flos": 511411746816.0, + "grad_norm": 0.07053433913024812, + "language_loss": 1.04343212, + "learning_rate": 0.0009059091926454854, + "loss": 1.05516577, + "num_input_tokens_seen": 7326752, + "router_z_loss_mlp": 0.16125488, + "step": 97, + "time_per_iteration": 2.570509433746338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178324, + "balance_loss_mlp": 1.16246903, + "epoch": 0.018853405155829166, + "flos": 930710103552.0, + "grad_norm": 0.08767892767743933, + "language_loss": 1.03389072, + "learning_rate": 0.0009079402406897198, + "loss": 1.04567385, + "num_input_tokens_seen": 7417488, + "router_z_loss_mlp": 0.15844727, + "step": 98, + "time_per_iteration": 3.202298164367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179075, + "balance_loss_mlp": 1.16296983, + "epoch": 0.01904578684109273, + "flos": 576209396736.0, + "grad_norm": 0.2639136557883628, + "language_loss": 1.0596242, + "learning_rate": 0.0009099506686008212, + "loss": 1.07141495, + "num_input_tokens_seen": 7493136, + "router_z_loss_mlp": 0.16101074, + "step": 99, + "time_per_iteration": 2.812368869781494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139923, + "balance_loss_mlp": 1.12423468, + "epoch": 0.019238168526356292, + "flos": 558173431296.0, + "grad_norm": 0.12311670746354397, + "language_loss": 1.08180976, + "learning_rate": 0.0009119408908644013, + "loss": 1.09320903, + "num_input_tokens_seen": 7560896, + "router_z_loss_mlp": 0.15673828, + "step": 100, + "time_per_iteration": 2.7063775062561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150815, + "balance_loss_mlp": 1.13574743, + "epoch": 0.019430550211619855, + "flos": 723539506176.0, + "grad_norm": 0.12127606313133317, + "language_loss": 1.14121008, + "learning_rate": 0.0009139113095929519, + "loss": 1.15271831, + "num_input_tokens_seen": 7629040, + "router_z_loss_mlp": 0.15039062, + "step": 101, + "time_per_iteration": 2.840913772583008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218173, + "balance_loss_mlp": 1.20243776, + "epoch": 0.019622931896883418, + "flos": 499235000832.0, + "grad_norm": 0.1104247345061639, + "language_loss": 1.0836457, + "learning_rate": 0.0009158623150134762, + "loss": 1.09582746, + "num_input_tokens_seen": 7694256, + "router_z_loss_mlp": 0.15722656, + "step": 102, + "time_per_iteration": 2.560464859008789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01357908, + "balance_loss_mlp": 1.34204173, + "epoch": 0.01981531358214698, + "flos": 508916418048.0, + "grad_norm": 0.15164768975642337, + "language_loss": 1.07661259, + "learning_rate": 0.000917794285931332, + "loss": 1.0901916, + "num_input_tokens_seen": 7762256, + "router_z_loss_mlp": 0.15856934, + "step": 103, + "time_per_iteration": 2.6684353351593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01381196, + "balance_loss_mlp": 1.36572242, + "epoch": 0.020007695267410544, + "flos": 521087371776.0, + "grad_norm": 0.10342928287682196, + "language_loss": 0.9971087, + "learning_rate": 0.0009197075901716639, + "loss": 1.01092052, + "num_input_tokens_seen": 7834400, + "router_z_loss_mlp": 0.15454102, + "step": 104, + "time_per_iteration": 2.7250871658325195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01356986, + "balance_loss_mlp": 1.34017777, + "epoch": 0.020200076952674107, + "flos": 533013834240.0, + "grad_norm": 0.1824265866479698, + "language_loss": 1.09647703, + "learning_rate": 0.0009216025849997171, + "loss": 1.11004686, + "num_input_tokens_seen": 7911184, + "router_z_loss_mlp": 0.16809082, + "step": 105, + "time_per_iteration": 2.776764154434204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01261961, + "balance_loss_mlp": 1.24583197, + "epoch": 0.020392458637937667, + "flos": 684430981632.0, + "grad_norm": 0.06376163654280764, + "language_loss": 1.0425086, + "learning_rate": 0.0009234796175212258, + "loss": 1.05512834, + "num_input_tokens_seen": 7985280, + "router_z_loss_mlp": 0.16125488, + "step": 106, + "time_per_iteration": 2.9174978733062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01269614, + "balance_loss_mlp": 1.25201869, + "epoch": 0.02058484032320123, + "flos": 701791852032.0, + "grad_norm": 0.060044663360548714, + "language_loss": 1.08808422, + "learning_rate": 0.000925339025064007, + "loss": 1.10078037, + "num_input_tokens_seen": 8068320, + "router_z_loss_mlp": 0.17590332, + "step": 107, + "time_per_iteration": 2.975735902786255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01324579, + "balance_loss_mlp": 1.30547023, + "epoch": 0.020777222008464793, + "flos": 638772175872.0, + "grad_norm": 0.12680512225677842, + "language_loss": 1.01262307, + "learning_rate": 0.0009271811355418027, + "loss": 1.02586877, + "num_input_tokens_seen": 8148144, + "router_z_loss_mlp": 0.19128418, + "step": 108, + "time_per_iteration": 2.8408150672912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01306621, + "balance_loss_mlp": 1.28755951, + "epoch": 0.020969603693728356, + "flos": 681785856000.0, + "grad_norm": 0.06997483982989385, + "language_loss": 1.08551693, + "learning_rate": 0.0009290062678013548, + "loss": 1.09858322, + "num_input_tokens_seen": 8222256, + "router_z_loss_mlp": 0.19055176, + "step": 109, + "time_per_iteration": 2.869980812072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01309468, + "balance_loss_mlp": 1.29159832, + "epoch": 0.02116198537899192, + "flos": 533140462080.0, + "grad_norm": 0.13190855435004306, + "language_loss": 1.06647623, + "learning_rate": 0.0009308147319536321, + "loss": 1.07957077, + "num_input_tokens_seen": 8292432, + "router_z_loss_mlp": 0.17895508, + "step": 110, + "time_per_iteration": 2.6270735263824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130688, + "balance_loss_mlp": 1.29067969, + "epoch": 0.021354367064255482, + "flos": 717168135168.0, + "grad_norm": 0.10963649287068344, + "language_loss": 1.1282903, + "learning_rate": 0.0009326068296900676, + "loss": 1.14135909, + "num_input_tokens_seen": 8365024, + "router_z_loss_mlp": 0.1619873, + "step": 111, + "time_per_iteration": 2.8845341205596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01388527, + "balance_loss_mlp": 1.37200487, + "epoch": 0.021546748749519045, + "flos": 519290459136.0, + "grad_norm": 0.12406482447985402, + "language_loss": 1.03902006, + "learning_rate": 0.0009343828545846161, + "loss": 1.05290532, + "num_input_tokens_seen": 8442448, + "router_z_loss_mlp": 0.16516113, + "step": 112, + "time_per_iteration": 2.8167102336883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01548404, + "balance_loss_mlp": 1.53109479, + "epoch": 0.021739130434782608, + "flos": 504912337920.0, + "grad_norm": 0.2528517188051562, + "language_loss": 1.0722419, + "learning_rate": 0.0009361430923823841, + "loss": 1.08772588, + "num_input_tokens_seen": 8508992, + "router_z_loss_mlp": 0.1730957, + "step": 113, + "time_per_iteration": 2.664581060409546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01441472, + "balance_loss_mlp": 1.42576015, + "epoch": 0.02193151212004617, + "flos": 463251820032.0, + "grad_norm": 0.1910881492312462, + "language_loss": 1.11420846, + "learning_rate": 0.0009378878212755459, + "loss": 1.12862325, + "num_input_tokens_seen": 8574048, + "router_z_loss_mlp": 0.15710449, + "step": 114, + "time_per_iteration": 2.4851133823394775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262203, + "balance_loss_mlp": 1.24767148, + "epoch": 0.022123893805309734, + "flos": 552008673792.0, + "grad_norm": 0.09004287588953173, + "language_loss": 1.0099957, + "learning_rate": 0.0009396173121672103, + "loss": 1.0226177, + "num_input_tokens_seen": 8647808, + "router_z_loss_mlp": 0.14538574, + "step": 115, + "time_per_iteration": 2.6535162925720215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215709, + "balance_loss_mlp": 1.20165396, + "epoch": 0.022316275490573297, + "flos": 635920436736.0, + "grad_norm": 0.07849561533847389, + "language_loss": 1.07122314, + "learning_rate": 0.0009413318289238633, + "loss": 1.08338022, + "num_input_tokens_seen": 8719760, + "router_z_loss_mlp": 0.14050293, + "step": 116, + "time_per_iteration": 2.7836899757385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203544, + "balance_loss_mlp": 1.18965602, + "epoch": 0.02250865717583686, + "flos": 798535107072.0, + "grad_norm": 0.07099947506123377, + "language_loss": 0.98912275, + "learning_rate": 0.0009430316286169771, + "loss": 1.00115824, + "num_input_tokens_seen": 8798752, + "router_z_loss_mlp": 0.13891602, + "step": 117, + "time_per_iteration": 3.049468517303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01263206, + "balance_loss_mlp": 1.24786401, + "epoch": 0.022701038861100423, + "flos": 455851763712.0, + "grad_norm": 0.18808502465815918, + "language_loss": 1.04843259, + "learning_rate": 0.0009447169617543361, + "loss": 1.06106472, + "num_input_tokens_seen": 8866848, + "router_z_loss_mlp": 0.15319824, + "step": 118, + "time_per_iteration": 2.5886504650115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121023, + "balance_loss_mlp": 1.19557953, + "epoch": 0.022893420546363986, + "flos": 582812112384.0, + "grad_norm": 0.09179634719817005, + "language_loss": 1.11139297, + "learning_rate": 0.0009463880725016029, + "loss": 1.12349522, + "num_input_tokens_seen": 8935488, + "router_z_loss_mlp": 0.14648438, + "step": 119, + "time_per_iteration": 2.6861259937286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169264, + "balance_loss_mlp": 1.15572226, + "epoch": 0.02308580223162755, + "flos": 561010613760.0, + "grad_norm": 0.09164108943144146, + "language_loss": 1.05675769, + "learning_rate": 0.0009480451988946134, + "loss": 1.06845045, + "num_input_tokens_seen": 9015344, + "router_z_loss_mlp": 0.13549805, + "step": 120, + "time_per_iteration": 2.8075129985809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217336, + "balance_loss_mlp": 1.2034359, + "epoch": 0.023278183916891113, + "flos": 770966111232.0, + "grad_norm": 0.1019945076921087, + "language_loss": 1.07486713, + "learning_rate": 0.0009496885730428627, + "loss": 1.08704054, + "num_input_tokens_seen": 9094672, + "router_z_loss_mlp": 0.13903809, + "step": 121, + "time_per_iteration": 3.0081264972686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291544, + "balance_loss_mlp": 1.27698815, + "epoch": 0.023470565602154676, + "flos": 553111552512.0, + "grad_norm": 0.08478902086488087, + "language_loss": 1.05369067, + "learning_rate": 0.0009513184213246156, + "loss": 1.06660616, + "num_input_tokens_seen": 9160608, + "router_z_loss_mlp": 0.14550781, + "step": 122, + "time_per_iteration": 2.654902696609497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01406128, + "balance_loss_mlp": 1.39054775, + "epoch": 0.02366294728741824, + "flos": 559744791552.0, + "grad_norm": 0.09837859270685317, + "language_loss": 1.09463692, + "learning_rate": 0.0009529349645740552, + "loss": 1.10869825, + "num_input_tokens_seen": 9228704, + "router_z_loss_mlp": 0.15563965, + "step": 123, + "time_per_iteration": 2.6837081909179688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01484693, + "balance_loss_mlp": 1.46961284, + "epoch": 0.0238553289726818, + "flos": 468313698816.0, + "grad_norm": 0.11388616458843728, + "language_loss": 1.07573724, + "learning_rate": 0.0009545384182608524, + "loss": 1.09058416, + "num_input_tokens_seen": 9294288, + "router_z_loss_mlp": 0.1505127, + "step": 124, + "time_per_iteration": 2.5069937705993652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01411359, + "balance_loss_mlp": 1.39688659, + "epoch": 0.024047710657945365, + "flos": 559763730432.0, + "grad_norm": 0.3429043048666504, + "language_loss": 1.05057025, + "learning_rate": 0.0009561289926625252, + "loss": 1.06468379, + "num_input_tokens_seen": 9368048, + "router_z_loss_mlp": 0.14465332, + "step": 125, + "time_per_iteration": 2.6802117824554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011688, + "balance_loss_mlp": 1.15507352, + "epoch": 0.024240092343208928, + "flos": 504528224256.0, + "grad_norm": 0.18048320350440872, + "language_loss": 1.09737623, + "learning_rate": 0.0009577068930299292, + "loss": 1.10906434, + "num_input_tokens_seen": 9434848, + "router_z_loss_mlp": 0.13739014, + "step": 126, + "time_per_iteration": 2.6096670627593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163735, + "balance_loss_mlp": 1.15040147, + "epoch": 0.02443247402847249, + "flos": 435516908544.0, + "grad_norm": 0.07278748671530755, + "language_loss": 1.05931616, + "learning_rate": 0.0009592723197462087, + "loss": 1.07095349, + "num_input_tokens_seen": 9504112, + "router_z_loss_mlp": 0.13360596, + "step": 127, + "time_per_iteration": 2.6409482955932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01248107, + "balance_loss_mlp": 1.23239577, + "epoch": 0.024624855713736054, + "flos": 683445966336.0, + "grad_norm": 0.0813490266373729, + "language_loss": 1.02871299, + "learning_rate": 0.0009608254684795125, + "loss": 1.04119396, + "num_input_tokens_seen": 9590032, + "router_z_loss_mlp": 0.15710449, + "step": 128, + "time_per_iteration": 2.940600872039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01265693, + "balance_loss_mlp": 1.24772859, + "epoch": 0.024817237398999614, + "flos": 524721894912.0, + "grad_norm": 0.0804185451989367, + "language_loss": 1.06161952, + "learning_rate": 0.0009623665303297678, + "loss": 1.07427645, + "num_input_tokens_seen": 9663040, + "router_z_loss_mlp": 0.1796875, + "step": 129, + "time_per_iteration": 2.7088472843170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01256284, + "balance_loss_mlp": 1.23668599, + "epoch": 0.025009619084263177, + "flos": 655350262272.0, + "grad_norm": 0.12369480901617341, + "language_loss": 1.10218048, + "learning_rate": 0.0009638956919697878, + "loss": 1.11474347, + "num_input_tokens_seen": 9736544, + "router_z_loss_mlp": 0.19592285, + "step": 130, + "time_per_iteration": 2.857571840286255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266475, + "balance_loss_mlp": 1.24420691, + "epoch": 0.02520200076952674, + "flos": 454187271168.0, + "grad_norm": 0.08293639348197612, + "language_loss": 1.02638018, + "learning_rate": 0.0009654131357809714, + "loss": 1.03904486, + "num_input_tokens_seen": 9804656, + "router_z_loss_mlp": 0.22253418, + "step": 131, + "time_per_iteration": 2.641470432281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0128644, + "balance_loss_mlp": 1.26142943, + "epoch": 0.025394382454790303, + "flos": 839427397632.0, + "grad_norm": 0.05741461740254168, + "language_loss": 1.11002767, + "learning_rate": 0.0009669190399838441, + "loss": 1.12289214, + "num_input_tokens_seen": 9888864, + "router_z_loss_mlp": 0.25036621, + "step": 132, + "time_per_iteration": 3.133596420288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01302533, + "balance_loss_mlp": 1.27633083, + "epoch": 0.025586764140053866, + "flos": 580725628416.0, + "grad_norm": 0.06987664196058198, + "language_loss": 1.0413487, + "learning_rate": 0.0009684135787636724, + "loss": 1.05437398, + "num_input_tokens_seen": 9968208, + "router_z_loss_mlp": 0.26208496, + "step": 133, + "time_per_iteration": 2.7968075275421143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01325396, + "balance_loss_mlp": 1.29710746, + "epoch": 0.02577914582531743, + "flos": 789893959680.0, + "grad_norm": 0.07551411578012862, + "language_loss": 1.07757604, + "learning_rate": 0.0009698969223913726, + "loss": 1.09083009, + "num_input_tokens_seen": 10049664, + "router_z_loss_mlp": 0.28283691, + "step": 134, + "time_per_iteration": 3.0058987140655518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0131126, + "balance_loss_mlp": 1.28212547, + "epoch": 0.025971527510580992, + "flos": 594683320320.0, + "grad_norm": 0.0731546450398535, + "language_loss": 1.10457921, + "learning_rate": 0.0009713692373399265, + "loss": 1.11769176, + "num_input_tokens_seen": 10120096, + "router_z_loss_mlp": 0.29125977, + "step": 135, + "time_per_iteration": 2.6654229164123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02152319, + "balance_loss_mlp": 1.95700705, + "epoch": 0.026163909195844555, + "flos": 1576771522560.0, + "grad_norm": 0.26755932757436196, + "language_loss": 0.79456228, + "learning_rate": 0.0009728306863964993, + "loss": 0.81608546, + "num_input_tokens_seen": 10348976, + "router_z_loss_mlp": 1.953125, + "step": 136, + "time_per_iteration": 6.531313896179199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01724331, + "balance_loss_mlp": 1.55266988, + "epoch": 0.026356290881108118, + "flos": 1501306030080.0, + "grad_norm": 0.1444935793983717, + "language_loss": 0.77811038, + "learning_rate": 0.0009742814287704512, + "loss": 0.79535371, + "num_input_tokens_seen": 10576512, + "router_z_loss_mlp": 1.71875, + "step": 137, + "time_per_iteration": 4.966995716094971 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01371776, + "balance_loss_mlp": 1.34284425, + "epoch": 0.02654867256637168, + "flos": 596841025536.0, + "grad_norm": 0.06823267419395149, + "language_loss": 1.03539467, + "learning_rate": 0.0009757216201974225, + "loss": 1.04911256, + "num_input_tokens_seen": 10659168, + "router_z_loss_mlp": 0.28918457, + "step": 138, + "time_per_iteration": 2.7901663780212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01396345, + "balance_loss_mlp": 1.36752045, + "epoch": 0.026741054251635244, + "flos": 544761386496.0, + "grad_norm": 0.08904352821745645, + "language_loss": 1.08793342, + "learning_rate": 0.0009771514130396581, + "loss": 1.10189688, + "num_input_tokens_seen": 10731584, + "router_z_loss_mlp": 0.28833008, + "step": 139, + "time_per_iteration": 2.664384603500366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01410566, + "balance_loss_mlp": 1.38171697, + "epoch": 0.026933435936898807, + "flos": 506591387136.0, + "grad_norm": 0.09467843708761726, + "language_loss": 1.08393478, + "learning_rate": 0.00097857095638274, + "loss": 1.09804034, + "num_input_tokens_seen": 10799456, + "router_z_loss_mlp": 0.28833008, + "step": 140, + "time_per_iteration": 2.5600626468658447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01399161, + "balance_loss_mlp": 1.37263703, + "epoch": 0.02712581762216237, + "flos": 740513290752.0, + "grad_norm": 0.06303030428856128, + "language_loss": 0.99670362, + "learning_rate": 0.0009799803961288726, + "loss": 1.01069522, + "num_input_tokens_seen": 10886416, + "router_z_loss_mlp": 0.26538086, + "step": 141, + "time_per_iteration": 2.984253168106079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01354082, + "balance_loss_mlp": 1.33143175, + "epoch": 0.027318199307425933, + "flos": 848023464960.0, + "grad_norm": 0.06264638149228761, + "language_loss": 1.05898559, + "learning_rate": 0.000981379875086876, + "loss": 1.07252645, + "num_input_tokens_seen": 10966064, + "router_z_loss_mlp": 0.22644043, + "step": 142, + "time_per_iteration": 3.032597064971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01323808, + "balance_loss_mlp": 1.30553341, + "epoch": 0.027510580992689496, + "flos": 575288400384.0, + "grad_norm": 0.07028220907739285, + "language_loss": 1.01752293, + "learning_rate": 0.0009827695330590185, + "loss": 1.030761, + "num_input_tokens_seen": 11039712, + "router_z_loss_mlp": 0.18273926, + "step": 143, + "time_per_iteration": 2.626483678817749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01303402, + "balance_loss_mlp": 1.28557992, + "epoch": 0.02770296267795306, + "flos": 772079164416.0, + "grad_norm": 0.05744811954937285, + "language_loss": 1.00619161, + "learning_rate": 0.0009841495069248256, + "loss": 1.0192256, + "num_input_tokens_seen": 11123984, + "router_z_loss_mlp": 0.17822266, + "step": 144, + "time_per_iteration": 2.9495198726654053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01316023, + "balance_loss_mlp": 1.29916632, + "epoch": 0.027895344363216622, + "flos": 569123642880.0, + "grad_norm": 0.04968902291069247, + "language_loss": 0.9920603, + "learning_rate": 0.0009855199307219871, + "loss": 1.00522041, + "num_input_tokens_seen": 11192864, + "router_z_loss_mlp": 0.1685791, + "step": 145, + "time_per_iteration": 2.6407721042633057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130391, + "balance_loss_mlp": 1.28731608, + "epoch": 0.028087726048480186, + "flos": 547099564032.0, + "grad_norm": 0.10723696528856613, + "language_loss": 1.01566505, + "learning_rate": 0.0009868809357244854, + "loss": 1.02870417, + "num_input_tokens_seen": 11261760, + "router_z_loss_mlp": 0.16589355, + "step": 146, + "time_per_iteration": 2.6262452602386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01287507, + "balance_loss_mlp": 1.27153277, + "epoch": 0.02828010773374375, + "flos": 524519663616.0, + "grad_norm": 0.06991830692152445, + "language_loss": 1.05632663, + "learning_rate": 0.0009882326505180556, + "loss": 1.06920183, + "num_input_tokens_seen": 11334736, + "router_z_loss_mlp": 0.15966797, + "step": 147, + "time_per_iteration": 2.6469435691833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01270213, + "balance_loss_mlp": 1.2534517, + "epoch": 0.02847248941900731, + "flos": 772108277760.0, + "grad_norm": 0.07309095407736986, + "language_loss": 1.04486537, + "learning_rate": 0.0009895752010730906, + "loss": 1.0575676, + "num_input_tokens_seen": 11409872, + "router_z_loss_mlp": 0.16748047, + "step": 148, + "time_per_iteration": 2.9457786083221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012724, + "balance_loss_mlp": 1.25667655, + "epoch": 0.028664871104270875, + "flos": 534150208512.0, + "grad_norm": 0.048334696317449924, + "language_loss": 1.10088921, + "learning_rate": 0.0009909087108150867, + "loss": 1.11361325, + "num_input_tokens_seen": 11481024, + "router_z_loss_mlp": 0.15710449, + "step": 149, + "time_per_iteration": 2.712559700012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01309133, + "balance_loss_mlp": 1.29286051, + "epoch": 0.028857252789534438, + "flos": 367557599232.0, + "grad_norm": 0.13115053493636905, + "language_loss": 1.11238122, + "learning_rate": 0.0009922333006927371, + "loss": 1.12547255, + "num_input_tokens_seen": 11544240, + "router_z_loss_mlp": 0.16247559, + "step": 150, + "time_per_iteration": 2.4607067108154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01329212, + "balance_loss_mlp": 1.31257081, + "epoch": 0.029049634474798, + "flos": 515232534528.0, + "grad_norm": 0.06948512606819708, + "language_loss": 1.04613614, + "learning_rate": 0.0009935490892437632, + "loss": 1.05942833, + "num_input_tokens_seen": 11610416, + "router_z_loss_mlp": 0.16650391, + "step": 151, + "time_per_iteration": 2.5460238456726074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01309109, + "balance_loss_mlp": 1.29317045, + "epoch": 0.029242016160061564, + "flos": 587840495616.0, + "grad_norm": 0.11257287432569656, + "language_loss": 1.03097093, + "learning_rate": 0.0009948561926585687, + "loss": 1.04406202, + "num_input_tokens_seen": 11687488, + "router_z_loss_mlp": 0.15930176, + "step": 152, + "time_per_iteration": 2.753009557723999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01300362, + "balance_loss_mlp": 1.28555596, + "epoch": 0.029434397845325123, + "flos": 551816616960.0, + "grad_norm": 0.062223246716750634, + "language_loss": 1.06524086, + "learning_rate": 0.0009961547248418122, + "loss": 1.07824445, + "num_input_tokens_seen": 11754576, + "router_z_loss_mlp": 0.14807129, + "step": 153, + "time_per_iteration": 2.630092144012451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01308008, + "balance_loss_mlp": 1.29357219, + "epoch": 0.029626779530588686, + "flos": 603221160960.0, + "grad_norm": 0.09420536563091944, + "language_loss": 1.03062868, + "learning_rate": 0.0009974447974719707, + "loss": 1.04370856, + "num_input_tokens_seen": 11831360, + "router_z_loss_mlp": 0.14440918, + "step": 154, + "time_per_iteration": 2.6962759494781494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01312448, + "balance_loss_mlp": 1.29745138, + "epoch": 0.02981916121585225, + "flos": 620808993792.0, + "grad_norm": 0.08558703297148447, + "language_loss": 1.04985213, + "learning_rate": 0.0009987265200589763, + "loss": 1.0629766, + "num_input_tokens_seen": 11902192, + "router_z_loss_mlp": 0.15002441, + "step": 155, + "time_per_iteration": 2.7059414386749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295882, + "balance_loss_mlp": 1.28057528, + "epoch": 0.030011542901115813, + "flos": 661322962944.0, + "grad_norm": 0.09731995783752632, + "language_loss": 1.04436159, + "learning_rate": 0.001, + "loss": 1.05732036, + "num_input_tokens_seen": 11979088, + "router_z_loss_mlp": 0.1529541, + "step": 156, + "time_per_iteration": 2.856968641281128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262708, + "balance_loss_mlp": 1.24682927, + "epoch": 0.030203924586379376, + "flos": 651258842112.0, + "grad_norm": 0.05966927829613408, + "language_loss": 1.02520585, + "learning_rate": 0.0009999999029413921, + "loss": 1.03783274, + "num_input_tokens_seen": 12059200, + "router_z_loss_mlp": 0.15856934, + "step": 157, + "time_per_iteration": 2.851480722427368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01268181, + "balance_loss_mlp": 1.25150406, + "epoch": 0.03039630627164294, + "flos": 531083091456.0, + "grad_norm": 0.1034311415514979, + "language_loss": 1.04085183, + "learning_rate": 0.0009999996117656068, + "loss": 1.05353379, + "num_input_tokens_seen": 12134944, + "router_z_loss_mlp": 0.16674805, + "step": 158, + "time_per_iteration": 2.707646369934082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262524, + "balance_loss_mlp": 1.24747968, + "epoch": 0.030588687956906502, + "flos": 585914135040.0, + "grad_norm": 0.12050944658187299, + "language_loss": 0.97824669, + "learning_rate": 0.0009999991264727564, + "loss": 0.99087203, + "num_input_tokens_seen": 12207936, + "router_z_loss_mlp": 0.15039062, + "step": 159, + "time_per_iteration": 2.7575390338897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272116, + "balance_loss_mlp": 1.25716722, + "epoch": 0.030781069642170065, + "flos": 513026777088.0, + "grad_norm": 0.07020206521781955, + "language_loss": 1.08316755, + "learning_rate": 0.0009999984470630296, + "loss": 1.09588861, + "num_input_tokens_seen": 12273200, + "router_z_loss_mlp": 0.14929199, + "step": 160, + "time_per_iteration": 2.62310528755188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0128559, + "balance_loss_mlp": 1.27058172, + "epoch": 0.030973451327433628, + "flos": 717766064640.0, + "grad_norm": 0.06068839125924313, + "language_loss": 0.96528012, + "learning_rate": 0.0009999975735366902, + "loss": 0.978136, + "num_input_tokens_seen": 12359600, + "router_z_loss_mlp": 0.15002441, + "step": 161, + "time_per_iteration": 3.0823376178741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01305752, + "balance_loss_mlp": 1.29055238, + "epoch": 0.03116583301269719, + "flos": 1109312133120.0, + "grad_norm": 0.09428930343360856, + "language_loss": 0.98546314, + "learning_rate": 0.0009999965058940775, + "loss": 0.99852067, + "num_input_tokens_seen": 12443936, + "router_z_loss_mlp": 0.1517334, + "step": 162, + "time_per_iteration": 3.486618995666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01315996, + "balance_loss_mlp": 1.3010118, + "epoch": 0.031358214697960754, + "flos": 450676403712.0, + "grad_norm": 0.09976775191278689, + "language_loss": 1.04580116, + "learning_rate": 0.0009999952441356057, + "loss": 1.05896115, + "num_input_tokens_seen": 12507488, + "router_z_loss_mlp": 0.1496582, + "step": 163, + "time_per_iteration": 2.537173271179199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01300744, + "balance_loss_mlp": 1.28654623, + "epoch": 0.031550596383224314, + "flos": 1254701325312.0, + "grad_norm": 0.0838197011845512, + "language_loss": 1.05903006, + "learning_rate": 0.000999993788261765, + "loss": 1.07203746, + "num_input_tokens_seen": 12594096, + "router_z_loss_mlp": 0.14196777, + "step": 164, + "time_per_iteration": 3.5638957023620605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01270584, + "balance_loss_mlp": 1.25625503, + "epoch": 0.03174297806848788, + "flos": 667841310720.0, + "grad_norm": 0.068717417443618, + "language_loss": 1.0642612, + "learning_rate": 0.00099999213827312, + "loss": 1.076967, + "num_input_tokens_seen": 12669424, + "router_z_loss_mlp": 0.14343262, + "step": 165, + "time_per_iteration": 2.8084213733673096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01255587, + "balance_loss_mlp": 1.24152076, + "epoch": 0.03193535975375144, + "flos": 551033832960.0, + "grad_norm": 0.06892139424853191, + "language_loss": 1.0208962, + "learning_rate": 0.000999990294170312, + "loss": 1.03345203, + "num_input_tokens_seen": 12740080, + "router_z_loss_mlp": 0.14074707, + "step": 166, + "time_per_iteration": 2.6247787475585938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01259954, + "balance_loss_mlp": 1.24549401, + "epoch": 0.032127741439015006, + "flos": 543377700864.0, + "grad_norm": 0.08292396830811857, + "language_loss": 1.05774951, + "learning_rate": 0.0009999882559540566, + "loss": 1.07034898, + "num_input_tokens_seen": 12810576, + "router_z_loss_mlp": 0.14465332, + "step": 167, + "time_per_iteration": 2.654036283493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291491, + "balance_loss_mlp": 1.27790117, + "epoch": 0.032320123124278566, + "flos": 548104928256.0, + "grad_norm": 0.07217909902530589, + "language_loss": 1.02104354, + "learning_rate": 0.000999986023625145, + "loss": 1.03395844, + "num_input_tokens_seen": 12887904, + "router_z_loss_mlp": 0.13598633, + "step": 168, + "time_per_iteration": 2.696866750717163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03738194, + "balance_loss_mlp": 3.61993837, + "epoch": 0.03251250480954213, + "flos": 1305156865536.0, + "grad_norm": 0.563981464368737, + "language_loss": 0.78924417, + "learning_rate": 0.0009999835971844441, + "loss": 0.82662606, + "num_input_tokens_seen": 13107344, + "router_z_loss_mlp": 1.1796875, + "step": 169, + "time_per_iteration": 4.971506834030151 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0134723, + "balance_loss_mlp": 1.33386648, + "epoch": 0.03270488649480569, + "flos": 560866609152.0, + "grad_norm": 0.12141219581883538, + "language_loss": 1.02540469, + "learning_rate": 0.0009999809766328958, + "loss": 1.03887701, + "num_input_tokens_seen": 13175552, + "router_z_loss_mlp": 0.13391113, + "step": 170, + "time_per_iteration": 2.646425724029541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01355192, + "balance_loss_mlp": 1.34039843, + "epoch": 0.03289726818006926, + "flos": 482120031744.0, + "grad_norm": 0.08046017426621577, + "language_loss": 1.05186188, + "learning_rate": 0.0009999781619715177, + "loss": 1.06541371, + "num_input_tokens_seen": 13242384, + "router_z_loss_mlp": 0.14770508, + "step": 171, + "time_per_iteration": 2.535360336303711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01381569, + "balance_loss_mlp": 1.36640596, + "epoch": 0.03308964986533282, + "flos": 674355276288.0, + "grad_norm": 0.08789680193074563, + "language_loss": 1.04250002, + "learning_rate": 0.000999975153201402, + "loss": 1.05631578, + "num_input_tokens_seen": 13316160, + "router_z_loss_mlp": 0.15161133, + "step": 172, + "time_per_iteration": 2.8205513954162598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01433883, + "balance_loss_mlp": 1.41711044, + "epoch": 0.033282031550596385, + "flos": 608937785856.0, + "grad_norm": 0.07610360898370483, + "language_loss": 1.02505267, + "learning_rate": 0.0009999719503237174, + "loss": 1.03939152, + "num_input_tokens_seen": 13387664, + "router_z_loss_mlp": 0.16760254, + "step": 173, + "time_per_iteration": 2.738676071166992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01451195, + "balance_loss_mlp": 1.43315864, + "epoch": 0.033474413235859944, + "flos": 467801547264.0, + "grad_norm": 0.07270846083900323, + "language_loss": 1.111094, + "learning_rate": 0.0009999685533397073, + "loss": 1.12560594, + "num_input_tokens_seen": 13454528, + "router_z_loss_mlp": 0.18029785, + "step": 174, + "time_per_iteration": 2.5556905269622803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01429898, + "balance_loss_mlp": 1.41368508, + "epoch": 0.03366679492112351, + "flos": 579365263872.0, + "grad_norm": 0.09196642879711979, + "language_loss": 1.03199494, + "learning_rate": 0.00099996496225069, + "loss": 1.04629397, + "num_input_tokens_seen": 13522528, + "router_z_loss_mlp": 0.16210938, + "step": 175, + "time_per_iteration": 2.6806485652923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01432234, + "balance_loss_mlp": 1.41513896, + "epoch": 0.03385917660638707, + "flos": 637378315776.0, + "grad_norm": 0.08705990667808558, + "language_loss": 1.05897307, + "learning_rate": 0.0009999611770580604, + "loss": 1.07329535, + "num_input_tokens_seen": 13601120, + "router_z_loss_mlp": 0.17102051, + "step": 176, + "time_per_iteration": 2.830826759338379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01415158, + "balance_loss_mlp": 1.39910054, + "epoch": 0.03405155829165064, + "flos": 441587123712.0, + "grad_norm": 0.08054669051038237, + "language_loss": 1.03868258, + "learning_rate": 0.0009999571977632876, + "loss": 1.05283427, + "num_input_tokens_seen": 13666384, + "router_z_loss_mlp": 0.16052246, + "step": 177, + "time_per_iteration": 2.623309850692749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01463141, + "balance_loss_mlp": 1.44573641, + "epoch": 0.034243939976914196, + "flos": 466097766912.0, + "grad_norm": 0.08089290506220445, + "language_loss": 1.06928194, + "learning_rate": 0.0009999530243679166, + "loss": 1.08391333, + "num_input_tokens_seen": 13733968, + "router_z_loss_mlp": 0.17407227, + "step": 178, + "time_per_iteration": 2.545133113861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01451423, + "balance_loss_mlp": 1.43560433, + "epoch": 0.03443632166217776, + "flos": 778919016960.0, + "grad_norm": 0.08468734735068614, + "language_loss": 1.01505899, + "learning_rate": 0.0009999486568735675, + "loss": 1.0295732, + "num_input_tokens_seen": 13818960, + "router_z_loss_mlp": 0.15808105, + "step": 179, + "time_per_iteration": 3.0384457111358643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01433641, + "balance_loss_mlp": 1.41778612, + "epoch": 0.03462870334744132, + "flos": 1263284246016.0, + "grad_norm": 0.06997324880309466, + "language_loss": 1.01388979, + "learning_rate": 0.0009999440952819362, + "loss": 1.02822614, + "num_input_tokens_seen": 13912448, + "router_z_loss_mlp": 0.15856934, + "step": 180, + "time_per_iteration": 3.6892786026000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01401308, + "balance_loss_mlp": 1.38610911, + "epoch": 0.03482108503270489, + "flos": 606899354112.0, + "grad_norm": 0.057831512038439566, + "language_loss": 1.02027512, + "learning_rate": 0.0009999393395947935, + "loss": 1.03428817, + "num_input_tokens_seen": 13990752, + "router_z_loss_mlp": 0.15185547, + "step": 181, + "time_per_iteration": 2.826353073120117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01381551, + "balance_loss_mlp": 1.36612535, + "epoch": 0.03501346671796845, + "flos": 538010284032.0, + "grad_norm": 0.05913415109875365, + "language_loss": 1.05361927, + "learning_rate": 0.0009999343898139858, + "loss": 1.06743479, + "num_input_tokens_seen": 14058608, + "router_z_loss_mlp": 0.1541748, + "step": 182, + "time_per_iteration": 2.593250036239624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01359754, + "balance_loss_mlp": 1.33988214, + "epoch": 0.035205848403232015, + "flos": 518231250432.0, + "grad_norm": 0.05898920665253376, + "language_loss": 1.04308426, + "learning_rate": 0.0009999292459414348, + "loss": 1.05668187, + "num_input_tokens_seen": 14126656, + "router_z_loss_mlp": 0.19909668, + "step": 183, + "time_per_iteration": 2.565936326980591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01311064, + "balance_loss_mlp": 1.296103, + "epoch": 0.035398230088495575, + "flos": 472134486528.0, + "grad_norm": 0.06373248491183749, + "language_loss": 1.08499169, + "learning_rate": 0.0009999239079791374, + "loss": 1.09810233, + "num_input_tokens_seen": 14195840, + "router_z_loss_mlp": 0.14953613, + "step": 184, + "time_per_iteration": 2.552949905395508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130912, + "balance_loss_mlp": 1.29237127, + "epoch": 0.03559061177375914, + "flos": 511820591616.0, + "grad_norm": 0.056329932736213485, + "language_loss": 1.01337337, + "learning_rate": 0.0009999183759291659, + "loss": 1.02646446, + "num_input_tokens_seen": 14269936, + "router_z_loss_mlp": 0.16748047, + "step": 185, + "time_per_iteration": 2.741727113723755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291511, + "balance_loss_mlp": 1.27575147, + "epoch": 0.0357829934590227, + "flos": 477146903040.0, + "grad_norm": 0.11224085577532149, + "language_loss": 1.03738213, + "learning_rate": 0.0009999126497936682, + "loss": 1.05029726, + "num_input_tokens_seen": 14334848, + "router_z_loss_mlp": 0.1574707, + "step": 186, + "time_per_iteration": 2.4901957511901855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291515, + "balance_loss_mlp": 1.27446783, + "epoch": 0.03597537514428627, + "flos": 644350588416.0, + "grad_norm": 0.06537030709871235, + "language_loss": 1.06735992, + "learning_rate": 0.0009999067295748676, + "loss": 1.08027506, + "num_input_tokens_seen": 14407888, + "router_z_loss_mlp": 0.1706543, + "step": 187, + "time_per_iteration": 2.7923052310943604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01327575, + "balance_loss_mlp": 1.30966997, + "epoch": 0.03616775682954983, + "flos": 580916275200.0, + "grad_norm": 0.06523062893181024, + "language_loss": 1.04418302, + "learning_rate": 0.000999900615275062, + "loss": 1.05745876, + "num_input_tokens_seen": 14479072, + "router_z_loss_mlp": 0.17919922, + "step": 188, + "time_per_iteration": 2.7248637676239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295421, + "balance_loss_mlp": 1.27722955, + "epoch": 0.03636013851481339, + "flos": 382210735104.0, + "grad_norm": 0.08035209765807474, + "language_loss": 1.10347509, + "learning_rate": 0.0009998943068966256, + "loss": 1.11642933, + "num_input_tokens_seen": 14540944, + "router_z_loss_mlp": 0.18188477, + "step": 189, + "time_per_iteration": 2.429497480392456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01279097, + "balance_loss_mlp": 1.26120377, + "epoch": 0.03655252020007695, + "flos": 582954706944.0, + "grad_norm": 0.07380481555246936, + "language_loss": 1.0506779, + "learning_rate": 0.0009998878044420072, + "loss": 1.06346881, + "num_input_tokens_seen": 14611392, + "router_z_loss_mlp": 0.17907715, + "step": 190, + "time_per_iteration": 2.6878626346588135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012863, + "balance_loss_mlp": 1.26773953, + "epoch": 0.03674490188534051, + "flos": 471376433664.0, + "grad_norm": 0.10484442400689244, + "language_loss": 1.01223493, + "learning_rate": 0.0009998811079137318, + "loss": 1.02509785, + "num_input_tokens_seen": 14679776, + "router_z_loss_mlp": 0.18566895, + "step": 191, + "time_per_iteration": 2.561494827270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281775, + "balance_loss_mlp": 1.26645625, + "epoch": 0.03693728357060408, + "flos": 528113488896.0, + "grad_norm": 0.0609431296621874, + "language_loss": 1.01984763, + "learning_rate": 0.0009998742173143987, + "loss": 1.03266537, + "num_input_tokens_seen": 14749712, + "router_z_loss_mlp": 0.1529541, + "step": 192, + "time_per_iteration": 2.59798264503479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01336751, + "balance_loss_mlp": 1.32157528, + "epoch": 0.03712966525586764, + "flos": 798657352704.0, + "grad_norm": 0.10186248006293357, + "language_loss": 1.02005363, + "learning_rate": 0.0009998671326466833, + "loss": 1.03342128, + "num_input_tokens_seen": 14827136, + "router_z_loss_mlp": 0.15185547, + "step": 193, + "time_per_iteration": 2.9510865211486816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01331057, + "balance_loss_mlp": 1.3157624, + "epoch": 0.037322046941131205, + "flos": 829628116992.0, + "grad_norm": 0.06375125184008373, + "language_loss": 1.02914846, + "learning_rate": 0.0009998598539133362, + "loss": 1.04245901, + "num_input_tokens_seen": 14902880, + "router_z_loss_mlp": 0.1529541, + "step": 194, + "time_per_iteration": 2.9981300830841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01337882, + "balance_loss_mlp": 1.3235296, + "epoch": 0.037514428626394765, + "flos": 437460797952.0, + "grad_norm": 0.10181133305516413, + "language_loss": 1.03936744, + "learning_rate": 0.0009998523811171828, + "loss": 1.0527463, + "num_input_tokens_seen": 14967264, + "router_z_loss_mlp": 0.14379883, + "step": 195, + "time_per_iteration": 2.501542568206787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01296215, + "balance_loss_mlp": 1.28125429, + "epoch": 0.03770681031165833, + "flos": 511372459008.0, + "grad_norm": 0.09414845611868274, + "language_loss": 1.04584992, + "learning_rate": 0.0009998447142611248, + "loss": 1.05881214, + "num_input_tokens_seen": 15039104, + "router_z_loss_mlp": 0.14941406, + "step": 196, + "time_per_iteration": 2.6247317790985107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0128144, + "balance_loss_mlp": 1.26702762, + "epoch": 0.03789919199692189, + "flos": 807102061056.0, + "grad_norm": 0.05831249070889761, + "language_loss": 0.97701526, + "learning_rate": 0.0009998368533481387, + "loss": 0.9898296, + "num_input_tokens_seen": 15124864, + "router_z_loss_mlp": 0.14422607, + "step": 197, + "time_per_iteration": 3.01912784576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01294999, + "balance_loss_mlp": 1.27945375, + "epoch": 0.03809157368218546, + "flos": 690274234368.0, + "grad_norm": 0.06656848410147823, + "language_loss": 1.00630498, + "learning_rate": 0.0009998287983812762, + "loss": 1.01925504, + "num_input_tokens_seen": 15199680, + "router_z_loss_mlp": 0.15551758, + "step": 198, + "time_per_iteration": 2.8252804279327393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0135387, + "balance_loss_mlp": 1.33592904, + "epoch": 0.03828395536744902, + "flos": 517675428864.0, + "grad_norm": 0.06988401379713739, + "language_loss": 1.06386423, + "learning_rate": 0.0009998205493636646, + "loss": 1.07740283, + "num_input_tokens_seen": 15270176, + "router_z_loss_mlp": 0.17944336, + "step": 199, + "time_per_iteration": 2.649765729904175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01339461, + "balance_loss_mlp": 1.32242572, + "epoch": 0.038476337052712584, + "flos": 581389138944.0, + "grad_norm": 0.07184113921580974, + "language_loss": 0.9925406, + "learning_rate": 0.0009998121062985063, + "loss": 1.00593519, + "num_input_tokens_seen": 15343168, + "router_z_loss_mlp": 0.17053223, + "step": 200, + "time_per_iteration": 2.6788320541381836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01328142, + "balance_loss_mlp": 1.31167912, + "epoch": 0.03866871873797614, + "flos": 576791359488.0, + "grad_norm": 0.059667024197710104, + "language_loss": 1.01260698, + "learning_rate": 0.0009998034691890794, + "loss": 1.02588844, + "num_input_tokens_seen": 15417328, + "router_z_loss_mlp": 0.16455078, + "step": 201, + "time_per_iteration": 2.753265380859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01297644, + "balance_loss_mlp": 1.28249288, + "epoch": 0.03886110042323971, + "flos": 540472117248.0, + "grad_norm": 0.07302515973387386, + "language_loss": 1.05948424, + "learning_rate": 0.0009997946380387369, + "loss": 1.07246065, + "num_input_tokens_seen": 15489488, + "router_z_loss_mlp": 0.15136719, + "step": 202, + "time_per_iteration": 2.618546485900879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262023, + "balance_loss_mlp": 1.24746776, + "epoch": 0.03905348210850327, + "flos": 717694843392.0, + "grad_norm": 0.0775452329378228, + "language_loss": 1.08266401, + "learning_rate": 0.0009997856128509076, + "loss": 1.09528422, + "num_input_tokens_seen": 15558944, + "router_z_loss_mlp": 0.14550781, + "step": 203, + "time_per_iteration": 2.8284859657287598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267878, + "balance_loss_mlp": 1.25321579, + "epoch": 0.039245863793766836, + "flos": 427268639232.0, + "grad_norm": 0.06664318613050589, + "language_loss": 1.02886617, + "learning_rate": 0.0009997763936290952, + "loss": 1.04154491, + "num_input_tokens_seen": 15625024, + "router_z_loss_mlp": 0.14660645, + "step": 204, + "time_per_iteration": 2.516263246536255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0129892, + "balance_loss_mlp": 1.28264785, + "epoch": 0.039438245479030395, + "flos": 662804163072.0, + "grad_norm": 0.07463685050771204, + "language_loss": 1.0815413, + "learning_rate": 0.0009997669803768789, + "loss": 1.09453046, + "num_input_tokens_seen": 15697120, + "router_z_loss_mlp": 0.16271973, + "step": 205, + "time_per_iteration": 2.7576606273651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291456, + "balance_loss_mlp": 1.27812803, + "epoch": 0.03963062716429396, + "flos": 635063459328.0, + "grad_norm": 0.055878982250893716, + "language_loss": 1.03253651, + "learning_rate": 0.0009997573730979134, + "loss": 1.04545116, + "num_input_tokens_seen": 15768752, + "router_z_loss_mlp": 0.13342285, + "step": 206, + "time_per_iteration": 2.716325521469116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.07279634, + "balance_loss_mlp": 4.65512276, + "epoch": 0.03982300884955752, + "flos": 1417813286400.0, + "grad_norm": 0.533603848118922, + "language_loss": 0.79193199, + "learning_rate": 0.0009997475717959284, + "loss": 0.86472833, + "num_input_tokens_seen": 15980624, + "router_z_loss_mlp": 26.25, + "step": 207, + "time_per_iteration": 4.635821342468262 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0137482, + "balance_loss_mlp": 1.35964513, + "epoch": 0.04001539053482109, + "flos": 688769713152.0, + "grad_norm": 0.1040721574676452, + "language_loss": 1.02094078, + "learning_rate": 0.0009997375764747294, + "loss": 1.03468895, + "num_input_tokens_seen": 16067232, + "router_z_loss_mlp": 0.1517334, + "step": 208, + "time_per_iteration": 2.974442481994629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01415285, + "balance_loss_mlp": 1.40052748, + "epoch": 0.04020777222008465, + "flos": 533363042304.0, + "grad_norm": 0.08111266742266361, + "language_loss": 0.99458027, + "learning_rate": 0.0009997273871381967, + "loss": 1.00873303, + "num_input_tokens_seen": 16139808, + "router_z_loss_mlp": 0.14758301, + "step": 209, + "time_per_iteration": 2.6802144050598145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01466201, + "balance_loss_mlp": 1.44989347, + "epoch": 0.040400153905348214, + "flos": 567661381632.0, + "grad_norm": 0.06875741115436663, + "language_loss": 1.05031717, + "learning_rate": 0.0009997170037902862, + "loss": 1.0649792, + "num_input_tokens_seen": 16210848, + "router_z_loss_mlp": 0.16308594, + "step": 210, + "time_per_iteration": 2.6975836753845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01531768, + "balance_loss_mlp": 1.51399446, + "epoch": 0.040592535590611774, + "flos": 713130559488.0, + "grad_norm": 0.07197690318934227, + "language_loss": 1.07202697, + "learning_rate": 0.0009997064264350292, + "loss": 1.08734465, + "num_input_tokens_seen": 16283984, + "router_z_loss_mlp": 0.17785645, + "step": 211, + "time_per_iteration": 2.836771011352539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01531925, + "balance_loss_mlp": 1.5154984, + "epoch": 0.04078491727587533, + "flos": 577824427008.0, + "grad_norm": 0.09120436996840299, + "language_loss": 1.0146966, + "learning_rate": 0.0009996956550765317, + "loss": 1.03001595, + "num_input_tokens_seen": 16353904, + "router_z_loss_mlp": 0.16430664, + "step": 212, + "time_per_iteration": 2.671292781829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01499293, + "balance_loss_mlp": 1.4817214, + "epoch": 0.0409772989611389, + "flos": 552033404928.0, + "grad_norm": 0.11449485477945152, + "language_loss": 0.96278083, + "learning_rate": 0.0009996846897189762, + "loss": 0.97777379, + "num_input_tokens_seen": 16425488, + "router_z_loss_mlp": 0.17565918, + "step": 213, + "time_per_iteration": 2.6231424808502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.014653, + "balance_loss_mlp": 1.44753814, + "epoch": 0.04116968064640246, + "flos": 555347833344.0, + "grad_norm": 0.09512793115916172, + "language_loss": 1.02356708, + "learning_rate": 0.0009996735303666193, + "loss": 1.03822017, + "num_input_tokens_seen": 16498016, + "router_z_loss_mlp": 0.1776123, + "step": 214, + "time_per_iteration": 2.6930177211761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01477298, + "balance_loss_mlp": 1.46134758, + "epoch": 0.041362062331666026, + "flos": 578204158464.0, + "grad_norm": 0.09141123477091552, + "language_loss": 1.04750729, + "learning_rate": 0.0009996621770237937, + "loss": 1.06228042, + "num_input_tokens_seen": 16573744, + "router_z_loss_mlp": 0.15942383, + "step": 215, + "time_per_iteration": 2.7448923587799072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01578462, + "balance_loss_mlp": 1.56013966, + "epoch": 0.041554444016929586, + "flos": 611130396672.0, + "grad_norm": 0.10233552268903827, + "language_loss": 0.99822551, + "learning_rate": 0.0009996506296949073, + "loss": 1.01401007, + "num_input_tokens_seen": 16655344, + "router_z_loss_mlp": 0.18334961, + "step": 216, + "time_per_iteration": 2.8548526763916016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01609008, + "balance_loss_mlp": 1.59156775, + "epoch": 0.04174682570219315, + "flos": 527857413120.0, + "grad_norm": 0.10522858499680945, + "language_loss": 0.99888742, + "learning_rate": 0.0009996388883844428, + "loss": 1.01497757, + "num_input_tokens_seen": 16726480, + "router_z_loss_mlp": 0.17456055, + "step": 217, + "time_per_iteration": 2.618546724319458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01557164, + "balance_loss_mlp": 1.54124999, + "epoch": 0.04193920738745671, + "flos": 511258977792.0, + "grad_norm": 0.09341741551851517, + "language_loss": 1.03841758, + "learning_rate": 0.0009996269530969588, + "loss": 1.05398929, + "num_input_tokens_seen": 16792112, + "router_z_loss_mlp": 0.15905762, + "step": 218, + "time_per_iteration": 2.6204636096954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01525903, + "balance_loss_mlp": 1.50927377, + "epoch": 0.04213158907272028, + "flos": 571226093568.0, + "grad_norm": 0.09609660813155754, + "language_loss": 1.02944803, + "learning_rate": 0.0009996148238370888, + "loss": 1.04470706, + "num_input_tokens_seen": 16862960, + "router_z_loss_mlp": 0.16625977, + "step": 219, + "time_per_iteration": 2.7071943283081055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0150071, + "balance_loss_mlp": 1.48340106, + "epoch": 0.04232397075798384, + "flos": 963803667456.0, + "grad_norm": 0.05454565212769997, + "language_loss": 0.9941752, + "learning_rate": 0.0009996025006095421, + "loss": 1.00918233, + "num_input_tokens_seen": 16950416, + "router_z_loss_mlp": 0.1730957, + "step": 220, + "time_per_iteration": 3.3006374835968018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.10944285, + "balance_loss_mlp": 6.84272289, + "epoch": 0.042516352443247404, + "flos": 1468814777856.0, + "grad_norm": 0.48497418398004566, + "language_loss": 0.77783144, + "learning_rate": 0.0009995899834191028, + "loss": 0.88727427, + "num_input_tokens_seen": 17180944, + "router_z_loss_mlp": 41.0, + "step": 221, + "time_per_iteration": 5.7136383056640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01601015, + "balance_loss_mlp": 1.58291924, + "epoch": 0.042708734128510964, + "flos": 654419091456.0, + "grad_norm": 0.10763646442297387, + "language_loss": 0.99765503, + "learning_rate": 0.0009995772722706307, + "loss": 1.0136652, + "num_input_tokens_seen": 17257792, + "router_z_loss_mlp": 0.1809082, + "step": 222, + "time_per_iteration": 2.8322792053222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01658843, + "balance_loss_mlp": 1.63811278, + "epoch": 0.04290111581377453, + "flos": 431601578496.0, + "grad_norm": 0.16393394652444138, + "language_loss": 1.13557565, + "learning_rate": 0.0009995643671690604, + "loss": 1.1521641, + "num_input_tokens_seen": 17320288, + "router_z_loss_mlp": 0.20739746, + "step": 223, + "time_per_iteration": 2.470729351043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0163871, + "balance_loss_mlp": 1.61504686, + "epoch": 0.04309349749903809, + "flos": 644379701760.0, + "grad_norm": 0.08733094203359489, + "language_loss": 1.00837708, + "learning_rate": 0.0009995512681194023, + "loss": 1.02476418, + "num_input_tokens_seen": 17396672, + "router_z_loss_mlp": 0.23632812, + "step": 224, + "time_per_iteration": 2.8274452686309814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01615568, + "balance_loss_mlp": 1.58755326, + "epoch": 0.04328587918430166, + "flos": 830861853696.0, + "grad_norm": 0.12001676841435771, + "language_loss": 0.98664522, + "learning_rate": 0.0009995379751267417, + "loss": 1.00280082, + "num_input_tokens_seen": 17488096, + "router_z_loss_mlp": 0.28027344, + "step": 225, + "time_per_iteration": 3.275660991668701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01617416, + "balance_loss_mlp": 1.58639741, + "epoch": 0.043478260869565216, + "flos": 524804852736.0, + "grad_norm": 0.1467276253632499, + "language_loss": 1.0007726, + "learning_rate": 0.0009995244881962398, + "loss": 1.01694679, + "num_input_tokens_seen": 17557632, + "router_z_loss_mlp": 0.30981445, + "step": 226, + "time_per_iteration": 2.6300203800201416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01601732, + "balance_loss_mlp": 1.56787658, + "epoch": 0.04367064255482878, + "flos": 439253328384.0, + "grad_norm": 0.095918638324787, + "language_loss": 1.01389623, + "learning_rate": 0.0009995108073331323, + "loss": 1.02991343, + "num_input_tokens_seen": 17626672, + "router_z_loss_mlp": 0.33862305, + "step": 227, + "time_per_iteration": 2.667628765106201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0158134, + "balance_loss_mlp": 1.5462923, + "epoch": 0.04386302424009234, + "flos": 507109330944.0, + "grad_norm": 0.08564981186298011, + "language_loss": 1.04024279, + "learning_rate": 0.0009994969325427309, + "loss": 1.05605614, + "num_input_tokens_seen": 17698624, + "router_z_loss_mlp": 0.35058594, + "step": 228, + "time_per_iteration": 2.6454501152038574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01558795, + "balance_loss_mlp": 1.52224541, + "epoch": 0.04405540592535591, + "flos": 540432829440.0, + "grad_norm": 0.07744391701114339, + "language_loss": 1.00052619, + "learning_rate": 0.0009994828638304218, + "loss": 1.016114, + "num_input_tokens_seen": 17767760, + "router_z_loss_mlp": 0.36547852, + "step": 229, + "time_per_iteration": 2.6468071937561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01516137, + "balance_loss_mlp": 1.47794271, + "epoch": 0.04424778761061947, + "flos": 446136850944.0, + "grad_norm": 0.08052263902742013, + "language_loss": 1.06763554, + "learning_rate": 0.0009994686012016675, + "loss": 1.08279693, + "num_input_tokens_seen": 17833664, + "router_z_loss_mlp": 0.3815918, + "step": 230, + "time_per_iteration": 2.5467634201049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01483037, + "balance_loss_mlp": 1.44515228, + "epoch": 0.044440169295883035, + "flos": 700383435264.0, + "grad_norm": 0.05918307184238542, + "language_loss": 1.0518043, + "learning_rate": 0.000999454144662005, + "loss": 1.06663465, + "num_input_tokens_seen": 17908880, + "router_z_loss_mlp": 0.37866211, + "step": 231, + "time_per_iteration": 2.8704099655151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01473358, + "balance_loss_mlp": 1.43549716, + "epoch": 0.044632550981146595, + "flos": 588055873536.0, + "grad_norm": 0.08626514264815018, + "language_loss": 0.99676436, + "learning_rate": 0.0009994394942170468, + "loss": 1.01149797, + "num_input_tokens_seen": 17978208, + "router_z_loss_mlp": 0.37866211, + "step": 232, + "time_per_iteration": 2.6578898429870605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01461415, + "balance_loss_mlp": 1.4258194, + "epoch": 0.04482493266641016, + "flos": 554534525952.0, + "grad_norm": 0.07124765242066121, + "language_loss": 0.96965969, + "learning_rate": 0.0009994246498724808, + "loss": 0.98427379, + "num_input_tokens_seen": 18049296, + "router_z_loss_mlp": 0.35620117, + "step": 233, + "time_per_iteration": 2.7764015197753906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01463645, + "balance_loss_mlp": 1.42790616, + "epoch": 0.04501731435167372, + "flos": 722500646400.0, + "grad_norm": 0.07759597622956232, + "language_loss": 0.99069166, + "learning_rate": 0.00099940961163407, + "loss": 1.00532806, + "num_input_tokens_seen": 18123296, + "router_z_loss_mlp": 0.35766602, + "step": 234, + "time_per_iteration": 2.8431143760681152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01454599, + "balance_loss_mlp": 1.42098188, + "epoch": 0.04520969603693728, + "flos": 511539784704.0, + "grad_norm": 0.05931413709293958, + "language_loss": 1.02564597, + "learning_rate": 0.0009993943795076528, + "loss": 1.04019189, + "num_input_tokens_seen": 18192784, + "router_z_loss_mlp": 0.33642578, + "step": 235, + "time_per_iteration": 2.645988702774048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01444244, + "balance_loss_mlp": 1.40936303, + "epoch": 0.04540207772220085, + "flos": 364854246912.0, + "grad_norm": 0.07280953320994132, + "language_loss": 1.04776168, + "learning_rate": 0.0009993789534991427, + "loss": 1.062204, + "num_input_tokens_seen": 18254064, + "router_z_loss_mlp": 0.34912109, + "step": 236, + "time_per_iteration": 2.4084837436676025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01418385, + "balance_loss_mlp": 1.38390946, + "epoch": 0.045594459407464406, + "flos": 522407038464.0, + "grad_norm": 0.060943880380569936, + "language_loss": 0.99500269, + "learning_rate": 0.0009993633336145287, + "loss": 1.00918651, + "num_input_tokens_seen": 18325728, + "router_z_loss_mlp": 0.34472656, + "step": 237, + "time_per_iteration": 2.6044533252716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01406135, + "balance_loss_mlp": 1.3730185, + "epoch": 0.04578684109272797, + "flos": 671442338304.0, + "grad_norm": 0.06747057459653658, + "language_loss": 1.03573179, + "learning_rate": 0.0009993475198598752, + "loss": 1.04979324, + "num_input_tokens_seen": 18408608, + "router_z_loss_mlp": 0.33129883, + "step": 238, + "time_per_iteration": 2.9948084354400635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01383668, + "balance_loss_mlp": 1.35164809, + "epoch": 0.04597922277799153, + "flos": 541387321344.0, + "grad_norm": 0.07135856148897902, + "language_loss": 0.99909985, + "learning_rate": 0.0009993315122413212, + "loss": 1.01293659, + "num_input_tokens_seen": 18471920, + "router_z_loss_mlp": 0.32006836, + "step": 239, + "time_per_iteration": 2.5848827362060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01369111, + "balance_loss_mlp": 1.33773541, + "epoch": 0.0461716044632551, + "flos": 458732616192.0, + "grad_norm": 0.056000088810755834, + "language_loss": 1.0008105, + "learning_rate": 0.0009993153107650818, + "loss": 1.01450157, + "num_input_tokens_seen": 18540496, + "router_z_loss_mlp": 0.31347656, + "step": 240, + "time_per_iteration": 2.5492687225341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01338815, + "balance_loss_mlp": 1.31015706, + "epoch": 0.04636398614851866, + "flos": 455009342976.0, + "grad_norm": 0.06491754001609312, + "language_loss": 0.99534512, + "learning_rate": 0.0009992989154374468, + "loss": 1.00873327, + "num_input_tokens_seen": 18606944, + "router_z_loss_mlp": 0.28662109, + "step": 241, + "time_per_iteration": 2.511237621307373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01294622, + "balance_loss_mlp": 1.26833653, + "epoch": 0.046556367833782225, + "flos": 556558401024.0, + "grad_norm": 0.07592069792168304, + "language_loss": 1.06626534, + "learning_rate": 0.0009992823262647817, + "loss": 1.07921147, + "num_input_tokens_seen": 18679520, + "router_z_loss_mlp": 0.26293945, + "step": 242, + "time_per_iteration": 2.7618701457977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249282, + "balance_loss_mlp": 1.22240043, + "epoch": 0.046748749519045785, + "flos": 592625949696.0, + "grad_norm": 0.0687662987323222, + "language_loss": 1.00893593, + "learning_rate": 0.0009992655432535264, + "loss": 1.0214287, + "num_input_tokens_seen": 18756656, + "router_z_loss_mlp": 0.26879883, + "step": 243, + "time_per_iteration": 2.7471935749053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01255015, + "balance_loss_mlp": 1.23083937, + "epoch": 0.04694113120430935, + "flos": 569596506624.0, + "grad_norm": 0.07373455055845594, + "language_loss": 1.0054853, + "learning_rate": 0.0009992485664101973, + "loss": 1.01803541, + "num_input_tokens_seen": 18829792, + "router_z_loss_mlp": 0.24169922, + "step": 244, + "time_per_iteration": 2.635344982147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295554, + "balance_loss_mlp": 1.27291572, + "epoch": 0.04713351288957291, + "flos": 863401158144.0, + "grad_norm": 0.10584905626659928, + "language_loss": 1.03312445, + "learning_rate": 0.000999231395741385, + "loss": 1.04607987, + "num_input_tokens_seen": 18906864, + "router_z_loss_mlp": 0.22631836, + "step": 245, + "time_per_iteration": 3.093386173248291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0128706, + "balance_loss_mlp": 1.26464868, + "epoch": 0.04732589457483648, + "flos": 536961249792.0, + "grad_norm": 0.08844420521863233, + "language_loss": 1.01371169, + "learning_rate": 0.0009992140312537557, + "loss": 1.02658224, + "num_input_tokens_seen": 18973632, + "router_z_loss_mlp": 0.22412109, + "step": 246, + "time_per_iteration": 2.667579412460327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01256359, + "balance_loss_mlp": 1.23515141, + "epoch": 0.04751827626010004, + "flos": 761566910976.0, + "grad_norm": 0.052835972446563725, + "language_loss": 0.9609164, + "learning_rate": 0.000999196472954051, + "loss": 0.97347999, + "num_input_tokens_seen": 19052944, + "router_z_loss_mlp": 0.2121582, + "step": 247, + "time_per_iteration": 2.9537084102630615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02369813, + "balance_loss_mlp": 2.16687083, + "epoch": 0.0477106579453636, + "flos": 1578961313280.0, + "grad_norm": 0.2151482568863758, + "language_loss": 0.79424852, + "learning_rate": 0.0009991787208490878, + "loss": 0.81794667, + "num_input_tokens_seen": 19286288, + "router_z_loss_mlp": 2.03125, + "step": 248, + "time_per_iteration": 5.758621454238892 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01289622, + "balance_loss_mlp": 1.27137113, + "epoch": 0.04790303963062716, + "flos": 457535195136.0, + "grad_norm": 0.10849969336884063, + "language_loss": 1.03316629, + "learning_rate": 0.0009991607749457578, + "loss": 1.04606247, + "num_input_tokens_seen": 19349296, + "router_z_loss_mlp": 0.18261719, + "step": 249, + "time_per_iteration": 2.5432913303375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01334566, + "balance_loss_mlp": 1.31724536, + "epoch": 0.04809542131589073, + "flos": 782079266304.0, + "grad_norm": 0.08264534697846654, + "language_loss": 1.01180637, + "learning_rate": 0.0009991426352510286, + "loss": 1.02515209, + "num_input_tokens_seen": 19428416, + "router_z_loss_mlp": 0.17321777, + "step": 250, + "time_per_iteration": 3.1542766094207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01351096, + "balance_loss_mlp": 1.33215368, + "epoch": 0.04828780300115429, + "flos": 558995503104.0, + "grad_norm": 0.06435857362074206, + "language_loss": 1.03307557, + "learning_rate": 0.0009991243017719422, + "loss": 1.04658651, + "num_input_tokens_seen": 19498688, + "router_z_loss_mlp": 0.18933105, + "step": 251, + "time_per_iteration": 2.693882942199707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01333217, + "balance_loss_mlp": 1.31485844, + "epoch": 0.048480184686417856, + "flos": 501682277376.0, + "grad_norm": 0.09276508096019526, + "language_loss": 0.97794825, + "learning_rate": 0.0009991057745156165, + "loss": 0.99128038, + "num_input_tokens_seen": 19567568, + "router_z_loss_mlp": 0.18347168, + "step": 252, + "time_per_iteration": 2.628873109817505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01867297, + "balance_loss_mlp": 1.75514495, + "epoch": 0.048672566371681415, + "flos": 1535585430528.0, + "grad_norm": 0.16359674361847032, + "language_loss": 0.81910986, + "learning_rate": 0.0009990870534892446, + "loss": 0.8377828, + "num_input_tokens_seen": 19796368, + "router_z_loss_mlp": 1.125, + "step": 253, + "time_per_iteration": 5.060615062713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01285031, + "balance_loss_mlp": 1.26567185, + "epoch": 0.04886494805694498, + "flos": 537665458176.0, + "grad_norm": 0.07164286827098729, + "language_loss": 1.06546187, + "learning_rate": 0.0009990681387000943, + "loss": 1.07831216, + "num_input_tokens_seen": 19870480, + "router_z_loss_mlp": 0.19384766, + "step": 254, + "time_per_iteration": 2.783367395401001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01275754, + "balance_loss_mlp": 1.25606036, + "epoch": 0.04905732974220854, + "flos": 679841966592.0, + "grad_norm": 0.06618046133348403, + "language_loss": 1.01404011, + "learning_rate": 0.0009990490301555093, + "loss": 1.02679765, + "num_input_tokens_seen": 19956288, + "router_z_loss_mlp": 0.19689941, + "step": 255, + "time_per_iteration": 2.9520761966705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01936632, + "balance_loss_mlp": 1.86796737, + "epoch": 0.04924971142747211, + "flos": 1420408949760.0, + "grad_norm": 0.31562964738653715, + "language_loss": 0.79215157, + "learning_rate": 0.0009990297278629078, + "loss": 0.81151783, + "num_input_tokens_seen": 20180080, + "router_z_loss_mlp": 0.6875, + "step": 256, + "time_per_iteration": 4.825209856033325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01615246, + "balance_loss_mlp": 1.55344784, + "epoch": 0.04944209311273567, + "flos": 1557202074624.0, + "grad_norm": 0.16937574338078817, + "language_loss": 0.79242742, + "learning_rate": 0.000999010231829784, + "loss": 0.80857986, + "num_input_tokens_seen": 20413456, + "router_z_loss_mlp": 0.6171875, + "step": 257, + "time_per_iteration": 4.995501518249512 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0163115, + "balance_loss_mlp": 1.58422887, + "epoch": 0.04963447479799923, + "flos": 1569985514496.0, + "grad_norm": 0.13524925240989144, + "language_loss": 0.69975883, + "learning_rate": 0.0009989905420637066, + "loss": 0.71607035, + "num_input_tokens_seen": 20644736, + "router_z_loss_mlp": 0.46875, + "step": 258, + "time_per_iteration": 4.841471910476685 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272668, + "balance_loss_mlp": 1.24463022, + "epoch": 0.049826856483262794, + "flos": 625063357440.0, + "grad_norm": 0.06365504505971183, + "language_loss": 0.95603192, + "learning_rate": 0.0009989706585723202, + "loss": 0.96875864, + "num_input_tokens_seen": 20719040, + "router_z_loss_mlp": 0.28076172, + "step": 259, + "time_per_iteration": 2.7635786533355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130022, + "balance_loss_mlp": 1.27020288, + "epoch": 0.05001923816852635, + "flos": 503912765952.0, + "grad_norm": 0.062257698278494894, + "language_loss": 1.01846027, + "learning_rate": 0.0009989505813633442, + "loss": 1.03146255, + "num_input_tokens_seen": 20789376, + "router_z_loss_mlp": 0.29980469, + "step": 260, + "time_per_iteration": 2.6451833248138428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0131611, + "balance_loss_mlp": 1.28101516, + "epoch": 0.05021161985378992, + "flos": 587066476032.0, + "grad_norm": 0.06290514068599455, + "language_loss": 1.01911807, + "learning_rate": 0.000998930310444573, + "loss": 1.03227913, + "num_input_tokens_seen": 20857856, + "router_z_loss_mlp": 0.35083008, + "step": 261, + "time_per_iteration": 2.6989662647247314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01324978, + "balance_loss_mlp": 1.2880708, + "epoch": 0.05040400153905348, + "flos": 633029409792.0, + "grad_norm": 0.0625839964239575, + "language_loss": 1.00387836, + "learning_rate": 0.0009989098458238765, + "loss": 1.01712811, + "num_input_tokens_seen": 20931232, + "router_z_loss_mlp": 0.36914062, + "step": 262, + "time_per_iteration": 2.7581043243408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319841, + "balance_loss_mlp": 1.28395867, + "epoch": 0.050596383224317046, + "flos": 553344307200.0, + "grad_norm": 0.06067150197267865, + "language_loss": 0.99905968, + "learning_rate": 0.0009988891875091998, + "loss": 1.01225805, + "num_input_tokens_seen": 21012672, + "router_z_loss_mlp": 0.35913086, + "step": 263, + "time_per_iteration": 2.7601842880249023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0131413, + "balance_loss_mlp": 1.27793837, + "epoch": 0.050788764909580605, + "flos": 549389689344.0, + "grad_norm": 0.07440292928735547, + "language_loss": 0.94277728, + "learning_rate": 0.0009988683355085636, + "loss": 0.95591855, + "num_input_tokens_seen": 21088592, + "router_z_loss_mlp": 0.36206055, + "step": 264, + "time_per_iteration": 2.7262909412384033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01277315, + "balance_loss_mlp": 1.24248254, + "epoch": 0.05098114659484417, + "flos": 604812870144.0, + "grad_norm": 0.06984595792035174, + "language_loss": 1.02861905, + "learning_rate": 0.000998847289830063, + "loss": 1.04139221, + "num_input_tokens_seen": 21169840, + "router_z_loss_mlp": 0.34838867, + "step": 265, + "time_per_iteration": 2.8318397998809814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01256298, + "balance_loss_mlp": 1.22272849, + "epoch": 0.05117352828010773, + "flos": 438317775360.0, + "grad_norm": 0.08677906198544101, + "language_loss": 0.95779377, + "learning_rate": 0.0009988260504818682, + "loss": 0.9703567, + "num_input_tokens_seen": 21236144, + "router_z_loss_mlp": 0.3359375, + "step": 266, + "time_per_iteration": 2.5212388038635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220367, + "balance_loss_mlp": 1.19046903, + "epoch": 0.0513659099653713, + "flos": 504784300032.0, + "grad_norm": 0.09456939977029206, + "language_loss": 1.01958096, + "learning_rate": 0.000998804617472226, + "loss": 1.03178465, + "num_input_tokens_seen": 21304864, + "router_z_loss_mlp": 0.29858398, + "step": 267, + "time_per_iteration": 2.649739980697632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169139, + "balance_loss_mlp": 1.14131606, + "epoch": 0.05155829165063486, + "flos": 695183344128.0, + "grad_norm": 0.07125411147685125, + "language_loss": 0.97574937, + "learning_rate": 0.0009987829908094568, + "loss": 0.98744082, + "num_input_tokens_seen": 21377504, + "router_z_loss_mlp": 0.27856445, + "step": 268, + "time_per_iteration": 2.816098690032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119703, + "balance_loss_mlp": 1.09379935, + "epoch": 0.051750673335898424, + "flos": 1347751830528.0, + "grad_norm": 0.06583247177587333, + "language_loss": 1.04151332, + "learning_rate": 0.0009987611705019569, + "loss": 1.05271029, + "num_input_tokens_seen": 21463840, + "router_z_loss_mlp": 0.25927734, + "step": 269, + "time_per_iteration": 4.478148460388184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103433, + "balance_loss_mlp": 1.08008027, + "epoch": 0.051943055021161984, + "flos": 489362936832.0, + "grad_norm": 0.06787757239342199, + "language_loss": 1.02481639, + "learning_rate": 0.0009987391565581978, + "loss": 1.03585076, + "num_input_tokens_seen": 21531184, + "router_z_loss_mlp": 0.23364258, + "step": 270, + "time_per_iteration": 2.5662009716033936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111004, + "balance_loss_mlp": 1.08859241, + "epoch": 0.05213543670642555, + "flos": 545504882688.0, + "grad_norm": 0.08198896814085149, + "language_loss": 0.9504528, + "learning_rate": 0.000998716948986726, + "loss": 0.96156287, + "num_input_tokens_seen": 21612224, + "router_z_loss_mlp": 0.22424316, + "step": 271, + "time_per_iteration": 2.7815349102020264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158552, + "balance_loss_mlp": 1.13697529, + "epoch": 0.05232781839168911, + "flos": 603285179904.0, + "grad_norm": 0.07646156534985457, + "language_loss": 0.97641921, + "learning_rate": 0.0009986945477961633, + "loss": 0.9880048, + "num_input_tokens_seen": 21681024, + "router_z_loss_mlp": 0.21569824, + "step": 272, + "time_per_iteration": 2.694547414779663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188724, + "balance_loss_mlp": 1.16735017, + "epoch": 0.052520200076952676, + "flos": 538218307584.0, + "grad_norm": 0.07381807258867126, + "language_loss": 1.02498066, + "learning_rate": 0.0009986719529952066, + "loss": 1.03686786, + "num_input_tokens_seen": 21761616, + "router_z_loss_mlp": 0.21386719, + "step": 273, + "time_per_iteration": 2.8339192867279053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121752, + "balance_loss_mlp": 1.19785035, + "epoch": 0.052712581762216236, + "flos": 463148513280.0, + "grad_norm": 0.0738352941440963, + "language_loss": 1.01808548, + "learning_rate": 0.000998649164592628, + "loss": 1.03026068, + "num_input_tokens_seen": 21828416, + "router_z_loss_mlp": 0.19677734, + "step": 274, + "time_per_iteration": 2.60577130317688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236713, + "balance_loss_mlp": 1.21763909, + "epoch": 0.0529049634474798, + "flos": 547749927936.0, + "grad_norm": 0.08134169766286939, + "language_loss": 0.99272913, + "learning_rate": 0.0009986261825972748, + "loss": 1.00509632, + "num_input_tokens_seen": 21901600, + "router_z_loss_mlp": 0.19055176, + "step": 275, + "time_per_iteration": 2.652561664581299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196689, + "balance_loss_mlp": 1.17834246, + "epoch": 0.05309734513274336, + "flos": 617727320064.0, + "grad_norm": 0.09111845604121613, + "language_loss": 1.01860571, + "learning_rate": 0.000998603007018069, + "loss": 1.03057253, + "num_input_tokens_seen": 21979312, + "router_z_loss_mlp": 0.18334961, + "step": 276, + "time_per_iteration": 2.8293774127960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011443, + "balance_loss_mlp": 1.1273365, + "epoch": 0.05328972681800693, + "flos": 605220304896.0, + "grad_norm": 0.07377841396756965, + "language_loss": 0.99345076, + "learning_rate": 0.0009985796378640089, + "loss": 1.00489378, + "num_input_tokens_seen": 22053776, + "router_z_loss_mlp": 0.16955566, + "step": 277, + "time_per_iteration": 2.694716215133667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098185, + "balance_loss_mlp": 1.08067346, + "epoch": 0.05348210850327049, + "flos": 604197411840.0, + "grad_norm": 0.07074934963985437, + "language_loss": 0.99532163, + "learning_rate": 0.0009985560751441665, + "loss": 1.00630355, + "num_input_tokens_seen": 22134304, + "router_z_loss_mlp": 0.1751709, + "step": 278, + "time_per_iteration": 2.798563241958618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095446, + "balance_loss_mlp": 1.07736206, + "epoch": 0.053674490188534055, + "flos": 630480236544.0, + "grad_norm": 0.054749659326078955, + "language_loss": 1.01733184, + "learning_rate": 0.00099853231886769, + "loss": 1.02828622, + "num_input_tokens_seen": 22212896, + "router_z_loss_mlp": 0.1809082, + "step": 279, + "time_per_iteration": 2.780940532684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134885, + "balance_loss_mlp": 1.11744475, + "epoch": 0.053866871873797614, + "flos": 478939433472.0, + "grad_norm": 0.06375435082524677, + "language_loss": 1.01461124, + "learning_rate": 0.0009985083690438024, + "loss": 1.02595997, + "num_input_tokens_seen": 22287216, + "router_z_loss_mlp": 0.17443848, + "step": 280, + "time_per_iteration": 2.68762469291687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145965, + "balance_loss_mlp": 1.12913251, + "epoch": 0.054059253559061174, + "flos": 787673645568.0, + "grad_norm": 0.07384801764192533, + "language_loss": 0.92380941, + "learning_rate": 0.0009984842256818016, + "loss": 0.93526906, + "num_input_tokens_seen": 22370864, + "router_z_loss_mlp": 0.16845703, + "step": 281, + "time_per_iteration": 3.054032325744629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114791, + "balance_loss_mlp": 1.13080359, + "epoch": 0.05425163524432474, + "flos": 628076630016.0, + "grad_norm": 0.082175996598207, + "language_loss": 1.0314945, + "learning_rate": 0.0009984598887910613, + "loss": 1.04297376, + "num_input_tokens_seen": 22440080, + "router_z_loss_mlp": 0.17114258, + "step": 282, + "time_per_iteration": 2.7095611095428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144379, + "balance_loss_mlp": 1.12627149, + "epoch": 0.0544440169295883, + "flos": 615453161472.0, + "grad_norm": 0.06813866095032944, + "language_loss": 0.9902432, + "learning_rate": 0.0009984353583810297, + "loss": 1.00168693, + "num_input_tokens_seen": 22517936, + "router_z_loss_mlp": 0.18103027, + "step": 283, + "time_per_iteration": 2.804438829421997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124262, + "balance_loss_mlp": 1.10624945, + "epoch": 0.05463639861485187, + "flos": 647471549952.0, + "grad_norm": 0.10003204141391345, + "language_loss": 1.01340103, + "learning_rate": 0.0009984106344612302, + "loss": 1.02464366, + "num_input_tokens_seen": 22590480, + "router_z_loss_mlp": 0.18017578, + "step": 284, + "time_per_iteration": 2.7521376609802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109552, + "balance_loss_mlp": 1.07819879, + "epoch": 0.054828780300115426, + "flos": 796845883392.0, + "grad_norm": 0.07143310654982075, + "language_loss": 0.96421391, + "learning_rate": 0.0009983857170412615, + "loss": 0.97516906, + "num_input_tokens_seen": 22668144, + "router_z_loss_mlp": 0.17321777, + "step": 285, + "time_per_iteration": 2.9796621799468994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089942, + "balance_loss_mlp": 1.07363439, + "epoch": 0.05502116198537899, + "flos": 549414420480.0, + "grad_norm": 0.05224422052371224, + "language_loss": 0.95713383, + "learning_rate": 0.000998360606130798, + "loss": 0.96803325, + "num_input_tokens_seen": 22749648, + "router_z_loss_mlp": 0.16308594, + "step": 286, + "time_per_iteration": 2.7950801849365234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02908189, + "balance_loss_mlp": 2.83799911, + "epoch": 0.05521354367064255, + "flos": 1406967791616.0, + "grad_norm": 0.233188183772104, + "language_loss": 0.69073117, + "learning_rate": 0.0009983353017395877, + "loss": 0.71981305, + "num_input_tokens_seen": 22982752, + "router_z_loss_mlp": 0.703125, + "step": 287, + "time_per_iteration": 4.876653432846069 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179574, + "balance_loss_mlp": 1.1627655, + "epoch": 0.05540592535590612, + "flos": 645123197952.0, + "grad_norm": 0.17417830683261867, + "language_loss": 1.0204829, + "learning_rate": 0.0009983098038774552, + "loss": 1.03227878, + "num_input_tokens_seen": 23053584, + "router_z_loss_mlp": 0.16821289, + "step": 288, + "time_per_iteration": 2.7781550884246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02101154, + "balance_loss_mlp": 2.07540464, + "epoch": 0.05559830704116968, + "flos": 1510293413376.0, + "grad_norm": 0.1730100464590254, + "language_loss": 0.78170228, + "learning_rate": 0.0009982841125542993, + "loss": 0.80271375, + "num_input_tokens_seen": 23280256, + "router_z_loss_mlp": 0.2578125, + "step": 289, + "time_per_iteration": 4.801970481872559 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01338926, + "balance_loss_mlp": 1.32332134, + "epoch": 0.055790688726433245, + "flos": 508078379520.0, + "grad_norm": 0.11288123874753296, + "language_loss": 0.99586821, + "learning_rate": 0.0009982582277800948, + "loss": 1.00925756, + "num_input_tokens_seen": 23345760, + "router_z_loss_mlp": 0.15588379, + "step": 290, + "time_per_iteration": 2.6019012928009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01376714, + "balance_loss_mlp": 1.36076403, + "epoch": 0.055983070411696804, + "flos": 657570576384.0, + "grad_norm": 0.11158393407579077, + "language_loss": 1.06464982, + "learning_rate": 0.0009982321495648908, + "loss": 1.07841706, + "num_input_tokens_seen": 23420720, + "router_z_loss_mlp": 0.15942383, + "step": 291, + "time_per_iteration": 2.7833075523376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281101, + "balance_loss_mlp": 1.26441216, + "epoch": 0.05617545209696037, + "flos": 587051919360.0, + "grad_norm": 0.091490024999748, + "language_loss": 0.97375935, + "learning_rate": 0.0009982058779188115, + "loss": 0.98657036, + "num_input_tokens_seen": 23492576, + "router_z_loss_mlp": 0.16699219, + "step": 292, + "time_per_iteration": 2.700998067855835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223751, + "balance_loss_mlp": 1.20634639, + "epoch": 0.05636783378222393, + "flos": 611331217920.0, + "grad_norm": 0.09093545163733599, + "language_loss": 1.05090272, + "learning_rate": 0.0009981794128520567, + "loss": 1.06314015, + "num_input_tokens_seen": 23569824, + "router_z_loss_mlp": 0.17431641, + "step": 293, + "time_per_iteration": 2.769562244415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172918, + "balance_loss_mlp": 1.15501237, + "epoch": 0.0565602154674875, + "flos": 667847102976.0, + "grad_norm": 0.08200667246549262, + "language_loss": 1.02219713, + "learning_rate": 0.000998152754374901, + "loss": 1.03392649, + "num_input_tokens_seen": 23649984, + "router_z_loss_mlp": 0.17919922, + "step": 294, + "time_per_iteration": 2.8421483039855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140496, + "balance_loss_mlp": 1.12121987, + "epoch": 0.05675259715275106, + "flos": 616963474944.0, + "grad_norm": 0.06298459153201627, + "language_loss": 0.97706711, + "learning_rate": 0.0009981259024976943, + "loss": 0.98847204, + "num_input_tokens_seen": 23722032, + "router_z_loss_mlp": 0.19250488, + "step": 295, + "time_per_iteration": 2.709536552429199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131247, + "balance_loss_mlp": 1.11139894, + "epoch": 0.05694497883801462, + "flos": 751424214528.0, + "grad_norm": 0.13011693222478776, + "language_loss": 0.96307456, + "learning_rate": 0.0009980988572308612, + "loss": 0.97438705, + "num_input_tokens_seen": 23797376, + "router_z_loss_mlp": 0.19848633, + "step": 296, + "time_per_iteration": 2.9606993198394775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125313, + "balance_loss_mlp": 1.10492802, + "epoch": 0.05713736052327818, + "flos": 711669708288.0, + "grad_norm": 0.06808560063607492, + "language_loss": 0.9959082, + "learning_rate": 0.0009980716185849015, + "loss": 1.00716126, + "num_input_tokens_seen": 23880496, + "router_z_loss_mlp": 0.20385742, + "step": 297, + "time_per_iteration": 2.952467203140259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133548, + "balance_loss_mlp": 1.11424804, + "epoch": 0.05732974220854175, + "flos": 468737100288.0, + "grad_norm": 0.05570922928007862, + "language_loss": 0.95103967, + "learning_rate": 0.0009980441865703904, + "loss": 0.9623751, + "num_input_tokens_seen": 23950016, + "router_z_loss_mlp": 0.19299316, + "step": 298, + "time_per_iteration": 2.6629996299743652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125947, + "balance_loss_mlp": 1.10630131, + "epoch": 0.05752212389380531, + "flos": 601143441408.0, + "grad_norm": 0.06175770353433084, + "language_loss": 1.038656, + "learning_rate": 0.000998016561197978, + "loss": 1.04991555, + "num_input_tokens_seen": 24020064, + "router_z_loss_mlp": 0.19628906, + "step": 299, + "time_per_iteration": 2.7027034759521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122899, + "balance_loss_mlp": 1.10499382, + "epoch": 0.057714505579068875, + "flos": 678344799744.0, + "grad_norm": 0.07709513760197055, + "language_loss": 0.95715761, + "learning_rate": 0.0009979887424783895, + "loss": 0.96838653, + "num_input_tokens_seen": 24095360, + "router_z_loss_mlp": 0.17907715, + "step": 300, + "time_per_iteration": 2.8467562198638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122592, + "balance_loss_mlp": 1.10369694, + "epoch": 0.057906887264332435, + "flos": 595604316672.0, + "grad_norm": 0.05754387138467597, + "language_loss": 0.94804943, + "learning_rate": 0.0009979607304224248, + "loss": 0.95927536, + "num_input_tokens_seen": 24164608, + "router_z_loss_mlp": 0.18908691, + "step": 301, + "time_per_iteration": 2.7457566261291504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135958, + "balance_loss_mlp": 1.11577594, + "epoch": 0.058099268949596, + "flos": 551855904768.0, + "grad_norm": 0.06951393564289957, + "language_loss": 1.02452385, + "learning_rate": 0.000997932525040959, + "loss": 1.03588343, + "num_input_tokens_seen": 24233840, + "router_z_loss_mlp": 0.20166016, + "step": 302, + "time_per_iteration": 2.670464038848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123814, + "balance_loss_mlp": 1.10513425, + "epoch": 0.05829165063485956, + "flos": 507906671616.0, + "grad_norm": 0.06408930588753382, + "language_loss": 1.04041958, + "learning_rate": 0.000997904126344943, + "loss": 1.05165768, + "num_input_tokens_seen": 24302928, + "router_z_loss_mlp": 0.18676758, + "step": 303, + "time_per_iteration": 2.654275417327881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122557, + "balance_loss_mlp": 1.10432982, + "epoch": 0.05848403232012313, + "flos": 614949774336.0, + "grad_norm": 0.10902949066110783, + "language_loss": 1.00108004, + "learning_rate": 0.0009978755343454018, + "loss": 1.0123055, + "num_input_tokens_seen": 24377024, + "router_z_loss_mlp": 0.18212891, + "step": 304, + "time_per_iteration": 2.7061922550201416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118016, + "balance_loss_mlp": 1.10034943, + "epoch": 0.05867641400538669, + "flos": 499835902464.0, + "grad_norm": 0.07196511907519268, + "language_loss": 1.01183403, + "learning_rate": 0.0009978467490534355, + "loss": 1.02301419, + "num_input_tokens_seen": 24442736, + "router_z_loss_mlp": 0.17663574, + "step": 305, + "time_per_iteration": 2.5658843517303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118418, + "balance_loss_mlp": 1.09971452, + "epoch": 0.05886879569065025, + "flos": 531019072512.0, + "grad_norm": 0.05577021807863236, + "language_loss": 0.98775607, + "learning_rate": 0.00099781777048022, + "loss": 0.99894023, + "num_input_tokens_seen": 24514800, + "router_z_loss_mlp": 0.18713379, + "step": 306, + "time_per_iteration": 2.688661813735962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112614, + "balance_loss_mlp": 1.10866416, + "epoch": 0.05906117737591381, + "flos": 488811497472.0, + "grad_norm": 0.06489613907432343, + "language_loss": 0.99682212, + "learning_rate": 0.0009977885986370057, + "loss": 1.00808358, + "num_input_tokens_seen": 24581648, + "router_z_loss_mlp": 0.17480469, + "step": 307, + "time_per_iteration": 2.527008056640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129188, + "balance_loss_mlp": 1.11242771, + "epoch": 0.05925355906117737, + "flos": 591213150720.0, + "grad_norm": 0.060579194597163814, + "language_loss": 0.94911426, + "learning_rate": 0.000997759233535118, + "loss": 0.96040612, + "num_input_tokens_seen": 24658864, + "router_z_loss_mlp": 0.16772461, + "step": 308, + "time_per_iteration": 2.768683433532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108276, + "balance_loss_mlp": 1.09052539, + "epoch": 0.05944594074644094, + "flos": 563373522432.0, + "grad_norm": 0.074144120767366, + "language_loss": 1.01706028, + "learning_rate": 0.0009977296751859576, + "loss": 1.02814317, + "num_input_tokens_seen": 24735808, + "router_z_loss_mlp": 0.17749023, + "step": 309, + "time_per_iteration": 2.710550308227539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109964, + "balance_loss_mlp": 1.0817585, + "epoch": 0.0596383224317045, + "flos": 538483147776.0, + "grad_norm": 0.1012520362466171, + "language_loss": 1.03562367, + "learning_rate": 0.0009976999236009998, + "loss": 1.04662001, + "num_input_tokens_seen": 24807744, + "router_z_loss_mlp": 0.17895508, + "step": 310, + "time_per_iteration": 2.7346065044403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095396, + "balance_loss_mlp": 1.07697809, + "epoch": 0.059830704116968066, + "flos": 560684726784.0, + "grad_norm": 0.05903807060939984, + "language_loss": 1.05193245, + "learning_rate": 0.0009976699787917955, + "loss": 1.06288636, + "num_input_tokens_seen": 24876640, + "router_z_loss_mlp": 0.18408203, + "step": 311, + "time_per_iteration": 2.737165689468384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04018029, + "balance_loss_mlp": 3.94440532, + "epoch": 0.060023085802231625, + "flos": 1569759962112.0, + "grad_norm": 0.34396821433057967, + "language_loss": 0.73442996, + "learning_rate": 0.00099763984076997, + "loss": 0.77461016, + "num_input_tokens_seen": 25110864, + "router_z_loss_mlp": 0.734375, + "step": 312, + "time_per_iteration": 4.990010976791382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130575, + "balance_loss_mlp": 1.11010623, + "epoch": 0.06021546748749519, + "flos": 482415395328.0, + "grad_norm": 0.18656347991450223, + "language_loss": 0.97164261, + "learning_rate": 0.0009976095095472243, + "loss": 0.98294836, + "num_input_tokens_seen": 25179328, + "router_z_loss_mlp": 0.20458984, + "step": 313, + "time_per_iteration": 2.5596373081207275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143867, + "balance_loss_mlp": 1.12198031, + "epoch": 0.06040784917275875, + "flos": 619889407488.0, + "grad_norm": 0.10017394493353984, + "language_loss": 0.98154747, + "learning_rate": 0.0009975789851353334, + "loss": 0.9929862, + "num_input_tokens_seen": 25254128, + "router_z_loss_mlp": 0.21911621, + "step": 314, + "time_per_iteration": 2.783092498779297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113993, + "balance_loss_mlp": 1.11832976, + "epoch": 0.06060023085802232, + "flos": 483292721664.0, + "grad_norm": 0.12837029886330253, + "language_loss": 1.00706339, + "learning_rate": 0.0009975482675461487, + "loss": 1.01846266, + "num_input_tokens_seen": 25324624, + "router_z_loss_mlp": 0.21594238, + "step": 315, + "time_per_iteration": 2.6375765800476074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128184, + "balance_loss_mlp": 1.10697675, + "epoch": 0.06079261254328588, + "flos": 581620483584.0, + "grad_norm": 0.07139597701291463, + "language_loss": 0.9800331, + "learning_rate": 0.0009975173567915952, + "loss": 0.99131489, + "num_input_tokens_seen": 25393648, + "router_z_loss_mlp": 0.21228027, + "step": 316, + "time_per_iteration": 2.680223226547241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116284, + "balance_loss_mlp": 1.09438515, + "epoch": 0.060984994228549444, + "flos": 687492306432.0, + "grad_norm": 0.12898022133672052, + "language_loss": 0.92624593, + "learning_rate": 0.000997486252883674, + "loss": 0.93740869, + "num_input_tokens_seen": 25469152, + "router_z_loss_mlp": 0.21887207, + "step": 317, + "time_per_iteration": 2.835162878036499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104615, + "balance_loss_mlp": 1.08325243, + "epoch": 0.061177375913813004, + "flos": 1314284327424.0, + "grad_norm": 0.06442728945451602, + "language_loss": 0.97186124, + "learning_rate": 0.0009974549558344602, + "loss": 0.98290741, + "num_input_tokens_seen": 25560944, + "router_z_loss_mlp": 0.21350098, + "step": 318, + "time_per_iteration": 3.6293551921844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105129, + "balance_loss_mlp": 1.08439815, + "epoch": 0.06136975759907657, + "flos": 574072040448.0, + "grad_norm": 0.08131052095693254, + "language_loss": 1.07145, + "learning_rate": 0.000997423465656105, + "loss": 1.08250129, + "num_input_tokens_seen": 25631424, + "router_z_loss_mlp": 0.20715332, + "step": 319, + "time_per_iteration": 2.7070071697235107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101786, + "balance_loss_mlp": 1.08168781, + "epoch": 0.06156213928434013, + "flos": 527281242624.0, + "grad_norm": 0.059301156484267634, + "language_loss": 1.04424822, + "learning_rate": 0.0009973917823608335, + "loss": 1.0552659, + "num_input_tokens_seen": 25698176, + "router_z_loss_mlp": 0.20092773, + "step": 320, + "time_per_iteration": 2.6225128173828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110531, + "balance_loss_mlp": 1.0897882, + "epoch": 0.061754520969603696, + "flos": 495238123008.0, + "grad_norm": 0.05387649814829365, + "language_loss": 0.98383266, + "learning_rate": 0.0009973599059609462, + "loss": 0.9949379, + "num_input_tokens_seen": 25773472, + "router_z_loss_mlp": 0.20739746, + "step": 321, + "time_per_iteration": 2.692152261734009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107693, + "balance_loss_mlp": 1.08798778, + "epoch": 0.061946902654867256, + "flos": 439839673344.0, + "grad_norm": 0.06112812680296507, + "language_loss": 0.9711749, + "learning_rate": 0.000997327836468819, + "loss": 0.98225188, + "num_input_tokens_seen": 25841088, + "router_z_loss_mlp": 0.19702148, + "step": 322, + "time_per_iteration": 2.5772383213043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110285, + "balance_loss_mlp": 1.0900557, + "epoch": 0.06213928434013082, + "flos": 598490961408.0, + "grad_norm": 0.0645434874295678, + "language_loss": 0.9942351, + "learning_rate": 0.000997295573896902, + "loss": 1.00533807, + "num_input_tokens_seen": 25919424, + "router_z_loss_mlp": 0.20239258, + "step": 323, + "time_per_iteration": 2.839282274246216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02259253, + "balance_loss_mlp": 2.20088792, + "epoch": 0.06233166602539438, + "flos": 1449393716736.0, + "grad_norm": 0.19547826226404627, + "language_loss": 0.8119604, + "learning_rate": 0.000997263118257721, + "loss": 0.83455294, + "num_input_tokens_seen": 26135504, + "router_z_loss_mlp": 0.58203125, + "step": 324, + "time_per_iteration": 4.67440938949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01888161, + "balance_loss_mlp": 1.83246601, + "epoch": 0.06252404771065795, + "flos": 1462504453632.0, + "grad_norm": 0.11962022052509429, + "language_loss": 0.78571939, + "learning_rate": 0.0009972304695638763, + "loss": 0.80460101, + "num_input_tokens_seen": 26358880, + "router_z_loss_mlp": 0.55859375, + "step": 325, + "time_per_iteration": 4.860283136367798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177486, + "balance_loss_mlp": 1.15595722, + "epoch": 0.06271642939592151, + "flos": 464059335168.0, + "grad_norm": 0.06272096910143152, + "language_loss": 0.93621421, + "learning_rate": 0.000997197627828043, + "loss": 0.94798911, + "num_input_tokens_seen": 26425888, + "router_z_loss_mlp": 0.2154541, + "step": 326, + "time_per_iteration": 2.5594961643218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205877, + "balance_loss_mlp": 1.18165386, + "epoch": 0.06290881108118507, + "flos": 532111776768.0, + "grad_norm": 0.08849931028565244, + "language_loss": 0.89414704, + "learning_rate": 0.0009971645930629716, + "loss": 0.90620589, + "num_input_tokens_seen": 26500656, + "router_z_loss_mlp": 0.2421875, + "step": 327, + "time_per_iteration": 2.7163310050964355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223238, + "balance_loss_mlp": 1.19748878, + "epoch": 0.06310119276644863, + "flos": 673262572032.0, + "grad_norm": 0.09892100413683627, + "language_loss": 1.02883804, + "learning_rate": 0.0009971313652814872, + "loss": 1.04107046, + "num_input_tokens_seen": 26577408, + "router_z_loss_mlp": 0.25769043, + "step": 328, + "time_per_iteration": 2.7786266803741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228803, + "balance_loss_mlp": 1.20175433, + "epoch": 0.0632935744517122, + "flos": 770404497408.0, + "grad_norm": 0.06852265531332852, + "language_loss": 0.99799907, + "learning_rate": 0.0009970979444964903, + "loss": 1.01028717, + "num_input_tokens_seen": 26652048, + "router_z_loss_mlp": 0.27050781, + "step": 329, + "time_per_iteration": 2.952498197555542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235649, + "balance_loss_mlp": 1.2062993, + "epoch": 0.06348595613697576, + "flos": 561649393152.0, + "grad_norm": 0.09680127661829774, + "language_loss": 1.0121367, + "learning_rate": 0.0009970643307209556, + "loss": 1.02449322, + "num_input_tokens_seen": 26728192, + "router_z_loss_mlp": 0.29296875, + "step": 330, + "time_per_iteration": 2.78190541267395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240935, + "balance_loss_mlp": 1.20970178, + "epoch": 0.06367833782223932, + "flos": 675891730944.0, + "grad_norm": 0.08786526055569537, + "language_loss": 0.9788332, + "learning_rate": 0.0009970305239679334, + "loss": 0.99124253, + "num_input_tokens_seen": 26798016, + "router_z_loss_mlp": 0.31201172, + "step": 331, + "time_per_iteration": 2.805845022201538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228576, + "balance_loss_mlp": 1.19891691, + "epoch": 0.06387071950750288, + "flos": 495035891712.0, + "grad_norm": 0.10390832636325384, + "language_loss": 1.03124022, + "learning_rate": 0.0009969965242505483, + "loss": 1.04352593, + "num_input_tokens_seen": 26867536, + "router_z_loss_mlp": 0.29614258, + "step": 332, + "time_per_iteration": 2.676711082458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207199, + "balance_loss_mlp": 1.1777302, + "epoch": 0.06406310119276645, + "flos": 533170985472.0, + "grad_norm": 0.07105898063788767, + "language_loss": 0.98331362, + "learning_rate": 0.0009969623315820007, + "loss": 0.99538565, + "num_input_tokens_seen": 26941216, + "router_z_loss_mlp": 0.29418945, + "step": 333, + "time_per_iteration": 2.6556739807128906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118815, + "balance_loss_mlp": 1.16106582, + "epoch": 0.06425548287803001, + "flos": 455940513792.0, + "grad_norm": 0.08067516684621483, + "language_loss": 0.99160993, + "learning_rate": 0.000996927945975565, + "loss": 1.0034914, + "num_input_tokens_seen": 27006560, + "router_z_loss_mlp": 0.27124023, + "step": 334, + "time_per_iteration": 2.5398526191711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147495, + "balance_loss_mlp": 1.1214596, + "epoch": 0.06444786456329357, + "flos": 559817574912.0, + "grad_norm": 0.08169715789363684, + "language_loss": 0.96174645, + "learning_rate": 0.0009968933674445906, + "loss": 0.97322142, + "num_input_tokens_seen": 27076400, + "router_z_loss_mlp": 0.26062012, + "step": 335, + "time_per_iteration": 2.6592860221862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112932, + "balance_loss_mlp": 1.0879097, + "epoch": 0.06464024624855713, + "flos": 665769383424.0, + "grad_norm": 0.07104021966044574, + "language_loss": 0.97756392, + "learning_rate": 0.0009968585960025028, + "loss": 0.98869324, + "num_input_tokens_seen": 27158672, + "router_z_loss_mlp": 0.25036621, + "step": 336, + "time_per_iteration": 2.9279658794403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01860024, + "balance_loss_mlp": 1.84323907, + "epoch": 0.0648326279338207, + "flos": 1520578704384.0, + "grad_norm": 0.14426901756633248, + "language_loss": 0.77653188, + "learning_rate": 0.0009968236316628006, + "loss": 0.7951321, + "num_input_tokens_seen": 27380592, + "router_z_loss_mlp": 0.16796875, + "step": 337, + "time_per_iteration": 4.810914993286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101739, + "balance_loss_mlp": 1.07948256, + "epoch": 0.06502500961908426, + "flos": 1142872768512.0, + "grad_norm": 0.058812216055980165, + "language_loss": 0.95864177, + "learning_rate": 0.0009967884744390583, + "loss": 0.96965921, + "num_input_tokens_seen": 27469984, + "router_z_loss_mlp": 0.22265625, + "step": 338, + "time_per_iteration": 3.512282371520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146504, + "balance_loss_mlp": 1.12267399, + "epoch": 0.06521739130434782, + "flos": 582339248640.0, + "grad_norm": 0.10793578588091769, + "language_loss": 0.97449529, + "learning_rate": 0.0009967531243449256, + "loss": 0.98596036, + "num_input_tokens_seen": 27543904, + "router_z_loss_mlp": 0.23828125, + "step": 339, + "time_per_iteration": 2.712907075881958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154087, + "balance_loss_mlp": 1.12950587, + "epoch": 0.06540977298961138, + "flos": 497398800384.0, + "grad_norm": 0.06396927661276222, + "language_loss": 1.04641414, + "learning_rate": 0.000996717581394126, + "loss": 1.05795503, + "num_input_tokens_seen": 27609888, + "router_z_loss_mlp": 0.24584961, + "step": 340, + "time_per_iteration": 2.5783133506774902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168509, + "balance_loss_mlp": 1.14584756, + "epoch": 0.06560215467487496, + "flos": 542613855744.0, + "grad_norm": 0.07568553531769329, + "language_loss": 1.05092287, + "learning_rate": 0.000996681845600459, + "loss": 1.062608, + "num_input_tokens_seen": 27683936, + "router_z_loss_mlp": 0.2265625, + "step": 341, + "time_per_iteration": 2.6543757915496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115357, + "balance_loss_mlp": 1.13118291, + "epoch": 0.06579453636013852, + "flos": 413230961664.0, + "grad_norm": 0.06593832485574395, + "language_loss": 0.97027373, + "learning_rate": 0.0009966459169777982, + "loss": 0.9818095, + "num_input_tokens_seen": 27747840, + "router_z_loss_mlp": 0.22387695, + "step": 342, + "time_per_iteration": 2.5120761394500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141132, + "balance_loss_mlp": 1.11848283, + "epoch": 0.06598691804540208, + "flos": 560354457600.0, + "grad_norm": 0.055115078659976495, + "language_loss": 1.05281377, + "learning_rate": 0.0009966097955400924, + "loss": 1.0642252, + "num_input_tokens_seen": 27819728, + "router_z_loss_mlp": 0.22644043, + "step": 343, + "time_per_iteration": 2.6954751014709473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111133, + "balance_loss_mlp": 1.08904982, + "epoch": 0.06617929973066564, + "flos": 571789117440.0, + "grad_norm": 0.06176008438986438, + "language_loss": 0.99064481, + "learning_rate": 0.0009965734813013652, + "loss": 1.00175822, + "num_input_tokens_seen": 27893536, + "router_z_loss_mlp": 0.22277832, + "step": 344, + "time_per_iteration": 2.8235929012298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090293, + "balance_loss_mlp": 1.06726193, + "epoch": 0.06637168141592921, + "flos": 490234470912.0, + "grad_norm": 0.05365164831273283, + "language_loss": 1.01308548, + "learning_rate": 0.0009965369742757151, + "loss": 1.02398837, + "num_input_tokens_seen": 27960976, + "router_z_loss_mlp": 0.23022461, + "step": 345, + "time_per_iteration": 2.5708556175231934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078086, + "balance_loss_mlp": 1.05727243, + "epoch": 0.06656406310119277, + "flos": 1078735656960.0, + "grad_norm": 0.04968829319439664, + "language_loss": 0.97902787, + "learning_rate": 0.0009965002744773152, + "loss": 0.98980874, + "num_input_tokens_seen": 28050864, + "router_z_loss_mlp": 0.20812988, + "step": 346, + "time_per_iteration": 3.4984121322631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086945, + "balance_loss_mlp": 1.06450987, + "epoch": 0.06675644478645633, + "flos": 513421065216.0, + "grad_norm": 0.06258978415695335, + "language_loss": 0.95138037, + "learning_rate": 0.0009964633819204139, + "loss": 0.96224982, + "num_input_tokens_seen": 28122448, + "router_z_loss_mlp": 0.22436523, + "step": 347, + "time_per_iteration": 2.6866109371185303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01752926, + "balance_loss_mlp": 1.73108697, + "epoch": 0.06694882647171989, + "flos": 1446359943168.0, + "grad_norm": 0.11694655230354783, + "language_loss": 0.81801116, + "learning_rate": 0.0009964262966193338, + "loss": 0.83554041, + "num_input_tokens_seen": 28350352, + "router_z_loss_mlp": 0.21875, + "step": 348, + "time_per_iteration": 4.935550928115845 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01583198, + "balance_loss_mlp": 1.56116796, + "epoch": 0.06714120815698346, + "flos": 1551230784000.0, + "grad_norm": 0.0989027294649474, + "language_loss": 0.75153887, + "learning_rate": 0.000996389018588473, + "loss": 0.76737082, + "num_input_tokens_seen": 28585584, + "router_z_loss_mlp": 0.22070312, + "step": 349, + "time_per_iteration": 4.891008615493774 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149858, + "balance_loss_mlp": 1.12826955, + "epoch": 0.06733358984224702, + "flos": 879689673216.0, + "grad_norm": 0.07075764146586616, + "language_loss": 0.94920838, + "learning_rate": 0.000996351547842304, + "loss": 0.96070701, + "num_input_tokens_seen": 28672512, + "router_z_loss_mlp": 0.21582031, + "step": 350, + "time_per_iteration": 3.156322717666626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192552, + "balance_loss_mlp": 1.17055774, + "epoch": 0.06752597152751058, + "flos": 518654651904.0, + "grad_norm": 0.09040238598346795, + "language_loss": 0.93423587, + "learning_rate": 0.0009963138843953744, + "loss": 0.94616139, + "num_input_tokens_seen": 28741520, + "router_z_loss_mlp": 0.2199707, + "step": 351, + "time_per_iteration": 2.610987663269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206077, + "balance_loss_mlp": 1.18405879, + "epoch": 0.06771835321277414, + "flos": 539366266368.0, + "grad_norm": 0.08658544591035036, + "language_loss": 0.97852194, + "learning_rate": 0.000996276028262306, + "loss": 0.9905827, + "num_input_tokens_seen": 28814912, + "router_z_loss_mlp": 0.22021484, + "step": 352, + "time_per_iteration": 2.8413686752319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166048, + "balance_loss_mlp": 1.14382768, + "epoch": 0.0679107348980377, + "flos": 460430604288.0, + "grad_norm": 0.09117082479319542, + "language_loss": 1.04269946, + "learning_rate": 0.0009962379794577964, + "loss": 1.05435991, + "num_input_tokens_seen": 28882192, + "router_z_loss_mlp": 0.22216797, + "step": 353, + "time_per_iteration": 2.591372489929199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114388, + "balance_loss_mlp": 1.12227976, + "epoch": 0.06810311658330127, + "flos": 635601752064.0, + "grad_norm": 0.05781909345233015, + "language_loss": 0.94169199, + "learning_rate": 0.000996199737996617, + "loss": 0.95313084, + "num_input_tokens_seen": 28968576, + "router_z_loss_mlp": 0.21630859, + "step": 354, + "time_per_iteration": 2.9088492393493652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125411, + "balance_loss_mlp": 1.10420346, + "epoch": 0.06829549826856483, + "flos": 464443448832.0, + "grad_norm": 0.06770201052263504, + "language_loss": 1.03043509, + "learning_rate": 0.0009961613038936149, + "loss": 1.04168916, + "num_input_tokens_seen": 29036160, + "router_z_loss_mlp": 0.2121582, + "step": 355, + "time_per_iteration": 2.571904420852661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110613, + "balance_loss_mlp": 1.08917904, + "epoch": 0.06848787995382839, + "flos": 634335929856.0, + "grad_norm": 0.06097004840688574, + "language_loss": 0.95565176, + "learning_rate": 0.000996122677163711, + "loss": 0.96675789, + "num_input_tokens_seen": 29112048, + "router_z_loss_mlp": 0.21435547, + "step": 356, + "time_per_iteration": 2.794982671737671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107296, + "balance_loss_mlp": 1.08667266, + "epoch": 0.06868026163909195, + "flos": 806023913472.0, + "grad_norm": 0.08020973782133771, + "language_loss": 1.01095176, + "learning_rate": 0.000996083857821902, + "loss": 1.02202487, + "num_input_tokens_seen": 29190960, + "router_z_loss_mlp": 0.20629883, + "step": 357, + "time_per_iteration": 3.007086753845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101637, + "balance_loss_mlp": 1.08076346, + "epoch": 0.06887264332435553, + "flos": 438997252608.0, + "grad_norm": 0.08125476198078858, + "language_loss": 0.99797714, + "learning_rate": 0.0009960448458832588, + "loss": 1.00899351, + "num_input_tokens_seen": 29262832, + "router_z_loss_mlp": 0.2088623, + "step": 358, + "time_per_iteration": 2.699530601501465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098146, + "balance_loss_mlp": 1.07872701, + "epoch": 0.06906502500961909, + "flos": 484513463808.0, + "grad_norm": 0.06827746260367892, + "language_loss": 0.99188638, + "learning_rate": 0.000996005641362927, + "loss": 1.00286782, + "num_input_tokens_seen": 29329552, + "router_z_loss_mlp": 0.1940918, + "step": 359, + "time_per_iteration": 2.5541014671325684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103345, + "balance_loss_mlp": 1.0841639, + "epoch": 0.06925740669488265, + "flos": 733293706752.0, + "grad_norm": 0.08731085845928575, + "language_loss": 1.02303529, + "learning_rate": 0.0009959662442761274, + "loss": 1.0340687, + "num_input_tokens_seen": 29410784, + "router_z_loss_mlp": 0.19189453, + "step": 360, + "time_per_iteration": 2.906623363494873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093844, + "balance_loss_mlp": 1.07268476, + "epoch": 0.0694497883801462, + "flos": 552127947264.0, + "grad_norm": 0.06697663210144707, + "language_loss": 0.9595629, + "learning_rate": 0.000995926654638155, + "loss": 0.97050136, + "num_input_tokens_seen": 29486992, + "router_z_loss_mlp": 0.21179199, + "step": 361, + "time_per_iteration": 2.793663501739502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082773, + "balance_loss_mlp": 1.06236482, + "epoch": 0.06964217006540978, + "flos": 677708992512.0, + "grad_norm": 0.06860924301964295, + "language_loss": 0.98198265, + "learning_rate": 0.00099588687246438, + "loss": 0.99281037, + "num_input_tokens_seen": 29557232, + "router_z_loss_mlp": 0.20410156, + "step": 362, + "time_per_iteration": 2.828139305114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085274, + "balance_loss_mlp": 1.06330371, + "epoch": 0.06983455175067334, + "flos": 523987163136.0, + "grad_norm": 0.08747541291209461, + "language_loss": 1.04803789, + "learning_rate": 0.0009958468977702471, + "loss": 1.0588907, + "num_input_tokens_seen": 29625344, + "router_z_loss_mlp": 0.21972656, + "step": 363, + "time_per_iteration": 2.5759966373443604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02224374, + "balance_loss_mlp": 2.20682669, + "epoch": 0.0700269334359369, + "flos": 1575943658496.0, + "grad_norm": 0.2746548069890379, + "language_loss": 0.79734707, + "learning_rate": 0.0009958067305712761, + "loss": 0.81959081, + "num_input_tokens_seen": 29843664, + "router_z_loss_mlp": 0.17578125, + "step": 364, + "time_per_iteration": 4.782835245132446 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134514, + "balance_loss_mlp": 1.11340213, + "epoch": 0.07021931512120046, + "flos": 1012848274944.0, + "grad_norm": 0.08586169827549085, + "language_loss": 0.93286598, + "learning_rate": 0.0009957663708830612, + "loss": 0.94421113, + "num_input_tokens_seen": 29927152, + "router_z_loss_mlp": 0.21105957, + "step": 365, + "time_per_iteration": 3.2484283447265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116189, + "balance_loss_mlp": 1.13884652, + "epoch": 0.07041169680646403, + "flos": 822622348800.0, + "grad_norm": 0.09941073368395695, + "language_loss": 0.97043455, + "learning_rate": 0.0009957258187212714, + "loss": 0.98205346, + "num_input_tokens_seen": 30004928, + "router_z_loss_mlp": 0.23034668, + "step": 366, + "time_per_iteration": 3.009479522705078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01756688, + "balance_loss_mlp": 1.7255981, + "epoch": 0.07060407849172759, + "flos": 1413670993920.0, + "grad_norm": 0.12374795181042475, + "language_loss": 0.79194862, + "learning_rate": 0.0009956850741016502, + "loss": 0.80951542, + "num_input_tokens_seen": 30230256, + "router_z_loss_mlp": 0.31054688, + "step": 367, + "time_per_iteration": 4.82874608039856 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152073, + "balance_loss_mlp": 1.13087749, + "epoch": 0.07079646017699115, + "flos": 512652837888.0, + "grad_norm": 0.06786716904588838, + "language_loss": 0.93450886, + "learning_rate": 0.0009956441370400167, + "loss": 0.94602954, + "num_input_tokens_seen": 30301200, + "router_z_loss_mlp": 0.21191406, + "step": 368, + "time_per_iteration": 2.6226603984832764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153965, + "balance_loss_mlp": 1.13158989, + "epoch": 0.07098884186225471, + "flos": 540240772608.0, + "grad_norm": 0.08343626294497461, + "language_loss": 0.99467343, + "learning_rate": 0.0009956030075522636, + "loss": 1.00621307, + "num_input_tokens_seen": 30377024, + "router_z_loss_mlp": 0.22375488, + "step": 369, + "time_per_iteration": 2.7128794193267822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142856, + "balance_loss_mlp": 1.12137485, + "epoch": 0.07118122354751828, + "flos": 548419230720.0, + "grad_norm": 0.07464528715750075, + "language_loss": 0.98955953, + "learning_rate": 0.0009955616856543587, + "loss": 1.00098813, + "num_input_tokens_seen": 30448896, + "router_z_loss_mlp": 0.21472168, + "step": 370, + "time_per_iteration": 2.613138198852539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118555, + "balance_loss_mlp": 1.0958215, + "epoch": 0.07137360523278184, + "flos": 620612554752.0, + "grad_norm": 0.056434914921328155, + "language_loss": 0.91880834, + "learning_rate": 0.0009955201713623448, + "loss": 0.92999387, + "num_input_tokens_seen": 30523584, + "router_z_loss_mlp": 0.22717285, + "step": 371, + "time_per_iteration": 2.747133255004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01746336, + "balance_loss_mlp": 1.72154021, + "epoch": 0.0715659869180454, + "flos": 1501850115072.0, + "grad_norm": 0.08669176596007007, + "language_loss": 0.76672721, + "learning_rate": 0.000995478464692339, + "loss": 0.78419054, + "num_input_tokens_seen": 30757920, + "router_z_loss_mlp": 0.24707031, + "step": 372, + "time_per_iteration": 4.931428670883179 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102855, + "balance_loss_mlp": 1.08040774, + "epoch": 0.07175836860330896, + "flos": 495246887424.0, + "grad_norm": 0.07044890130803105, + "language_loss": 1.05121827, + "learning_rate": 0.0009954365656605333, + "loss": 1.06224692, + "num_input_tokens_seen": 30824960, + "router_z_loss_mlp": 0.22436523, + "step": 373, + "time_per_iteration": 2.550243616104126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118244, + "balance_loss_mlp": 1.09438992, + "epoch": 0.07195075028857253, + "flos": 785387902464.0, + "grad_norm": 0.05415547127036835, + "language_loss": 0.98150015, + "learning_rate": 0.0009953944742831947, + "loss": 0.99268264, + "num_input_tokens_seen": 30902224, + "router_z_loss_mlp": 0.23864746, + "step": 374, + "time_per_iteration": 2.9659459590911865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125507, + "balance_loss_mlp": 1.10202336, + "epoch": 0.0721431319738361, + "flos": 592799067648.0, + "grad_norm": 0.07003669353380264, + "language_loss": 1.01441097, + "learning_rate": 0.0009953521905766642, + "loss": 1.02566612, + "num_input_tokens_seen": 30984784, + "router_z_loss_mlp": 0.23486328, + "step": 375, + "time_per_iteration": 2.942763566970825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117119, + "balance_loss_mlp": 1.09393334, + "epoch": 0.07233551365909965, + "flos": 547981272576.0, + "grad_norm": 0.06343477824222313, + "language_loss": 0.99901861, + "learning_rate": 0.0009953097145573577, + "loss": 1.01018989, + "num_input_tokens_seen": 31055376, + "router_z_loss_mlp": 0.23193359, + "step": 376, + "time_per_iteration": 2.6275272369384766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113711, + "balance_loss_mlp": 1.09023869, + "epoch": 0.07252789534436321, + "flos": 957170428416.0, + "grad_norm": 0.0678891965164594, + "language_loss": 0.97798675, + "learning_rate": 0.000995267046241766, + "loss": 0.98912394, + "num_input_tokens_seen": 31144944, + "router_z_loss_mlp": 0.23474121, + "step": 377, + "time_per_iteration": 3.2014975547790527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096997, + "balance_loss_mlp": 1.07496762, + "epoch": 0.07272027702962677, + "flos": 507398902272.0, + "grad_norm": 0.0806519998399971, + "language_loss": 0.97275257, + "learning_rate": 0.0009952241856464547, + "loss": 0.98372257, + "num_input_tokens_seen": 31213392, + "router_z_loss_mlp": 0.22045898, + "step": 378, + "time_per_iteration": 2.6189732551574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109641, + "balance_loss_mlp": 1.0746069, + "epoch": 0.07291265871489035, + "flos": 612128558592.0, + "grad_norm": 0.0691049335661606, + "language_loss": 1.04592681, + "learning_rate": 0.0009951811327880632, + "loss": 1.05689096, + "num_input_tokens_seen": 31289840, + "router_z_loss_mlp": 0.21826172, + "step": 379, + "time_per_iteration": 2.7411558628082275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092071, + "balance_loss_mlp": 1.07025611, + "epoch": 0.0731050404001539, + "flos": 495502963200.0, + "grad_norm": 0.05765504670581196, + "language_loss": 0.97682816, + "learning_rate": 0.0009951378876833063, + "loss": 0.98774892, + "num_input_tokens_seen": 31357600, + "router_z_loss_mlp": 0.21813965, + "step": 380, + "time_per_iteration": 2.6211278438568115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081575, + "balance_loss_mlp": 1.06068945, + "epoch": 0.07329742208541747, + "flos": 639677205504.0, + "grad_norm": 0.06809750593205881, + "language_loss": 1.04190159, + "learning_rate": 0.0009950944503489736, + "loss": 1.05271733, + "num_input_tokens_seen": 31428896, + "router_z_loss_mlp": 0.20898438, + "step": 381, + "time_per_iteration": 2.7533762454986572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081401, + "balance_loss_mlp": 1.0607307, + "epoch": 0.07348980377068103, + "flos": 815999284224.0, + "grad_norm": 0.06607035824886899, + "language_loss": 0.98459697, + "learning_rate": 0.0009950508208019285, + "loss": 0.99541104, + "num_input_tokens_seen": 31507424, + "router_z_loss_mlp": 0.20678711, + "step": 382, + "time_per_iteration": 2.9885637760162354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073667, + "balance_loss_mlp": 1.05369973, + "epoch": 0.0736821854559446, + "flos": 508383917568.0, + "grad_norm": 0.05970909775769663, + "language_loss": 1.02745128, + "learning_rate": 0.0009950069990591096, + "loss": 1.03818798, + "num_input_tokens_seen": 31576768, + "router_z_loss_mlp": 0.19958496, + "step": 383, + "time_per_iteration": 2.6111788749694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01835936, + "balance_loss_mlp": 1.8101871, + "epoch": 0.07387456714120816, + "flos": 1553801716224.0, + "grad_norm": 0.167122487372618, + "language_loss": 0.76401371, + "learning_rate": 0.0009949629851375302, + "loss": 0.78237301, + "num_input_tokens_seen": 31797312, + "router_z_loss_mlp": 0.2578125, + "step": 384, + "time_per_iteration": 4.859915494918823 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116619, + "balance_loss_mlp": 1.09575748, + "epoch": 0.07406694882647172, + "flos": 525219489792.0, + "grad_norm": 0.0799084124695288, + "language_loss": 0.96017051, + "learning_rate": 0.0009949187790542777, + "loss": 0.97133672, + "num_input_tokens_seen": 31869568, + "router_z_loss_mlp": 0.20861816, + "step": 385, + "time_per_iteration": 2.6976191997528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124239, + "balance_loss_mlp": 1.10322285, + "epoch": 0.07425933051173528, + "flos": 497468611584.0, + "grad_norm": 0.08753491640442414, + "language_loss": 0.91745877, + "learning_rate": 0.0009948743808265148, + "loss": 0.92870116, + "num_input_tokens_seen": 31941712, + "router_z_loss_mlp": 0.21020508, + "step": 386, + "time_per_iteration": 2.6870572566986084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113476, + "balance_loss_mlp": 1.09249496, + "epoch": 0.07445171219699885, + "flos": 504740630016.0, + "grad_norm": 0.05063210924529089, + "language_loss": 1.0156467, + "learning_rate": 0.0009948297904714782, + "loss": 1.02678132, + "num_input_tokens_seen": 32015232, + "router_z_loss_mlp": 0.20996094, + "step": 387, + "time_per_iteration": 2.668027639389038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097529, + "balance_loss_mlp": 1.07642913, + "epoch": 0.07464409388226241, + "flos": 553693515264.0, + "grad_norm": 0.06830922509793466, + "language_loss": 0.93493366, + "learning_rate": 0.0009947850080064796, + "loss": 0.9459089, + "num_input_tokens_seen": 32094640, + "router_z_loss_mlp": 0.21105957, + "step": 388, + "time_per_iteration": 2.79836106300354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098078, + "balance_loss_mlp": 1.07695365, + "epoch": 0.07483647556752597, + "flos": 776511028224.0, + "grad_norm": 0.06471398355705121, + "language_loss": 0.98276728, + "learning_rate": 0.0009947400334489047, + "loss": 0.99374807, + "num_input_tokens_seen": 32176640, + "router_z_loss_mlp": 0.21130371, + "step": 389, + "time_per_iteration": 3.0046355724334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095267, + "balance_loss_mlp": 1.07513261, + "epoch": 0.07502885725278953, + "flos": 612256596480.0, + "grad_norm": 0.0754939105077014, + "language_loss": 0.90272582, + "learning_rate": 0.0009946948668162145, + "loss": 0.91367853, + "num_input_tokens_seen": 32246704, + "router_z_loss_mlp": 0.20141602, + "step": 390, + "time_per_iteration": 2.724792003631592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091157, + "balance_loss_mlp": 1.06946135, + "epoch": 0.0752212389380531, + "flos": 688324552704.0, + "grad_norm": 0.05626120625508035, + "language_loss": 0.9463594, + "learning_rate": 0.0009946495081259441, + "loss": 0.95727098, + "num_input_tokens_seen": 32320032, + "router_z_loss_mlp": 0.21704102, + "step": 391, + "time_per_iteration": 2.8221397399902344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101684, + "balance_loss_mlp": 1.08008361, + "epoch": 0.07541362062331666, + "flos": 765362967552.0, + "grad_norm": 0.09729902751759628, + "language_loss": 0.97468722, + "learning_rate": 0.0009946039573957035, + "loss": 0.98570406, + "num_input_tokens_seen": 32398144, + "router_z_loss_mlp": 0.21606445, + "step": 392, + "time_per_iteration": 2.958655595779419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095785, + "balance_loss_mlp": 1.07572174, + "epoch": 0.07560600230858022, + "flos": 588460336128.0, + "grad_norm": 0.06468718689622391, + "language_loss": 0.94257009, + "learning_rate": 0.000994558214643177, + "loss": 0.95352793, + "num_input_tokens_seen": 32471984, + "router_z_loss_mlp": 0.20056152, + "step": 393, + "time_per_iteration": 2.752979040145874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101382, + "balance_loss_mlp": 1.08086586, + "epoch": 0.07579838399384378, + "flos": 749508028416.0, + "grad_norm": 0.06635223139616171, + "language_loss": 0.961483, + "learning_rate": 0.000994512279886123, + "loss": 0.97249681, + "num_input_tokens_seen": 32550176, + "router_z_loss_mlp": 0.20532227, + "step": 394, + "time_per_iteration": 3.055225133895874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104661, + "balance_loss_mlp": 1.08346581, + "epoch": 0.07599076567910736, + "flos": 523185440256.0, + "grad_norm": 0.06901630142642712, + "language_loss": 0.96749192, + "learning_rate": 0.0009944661531423758, + "loss": 0.97853857, + "num_input_tokens_seen": 32620768, + "router_z_loss_mlp": 0.2121582, + "step": 395, + "time_per_iteration": 2.6922085285186768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093271, + "balance_loss_mlp": 1.07248056, + "epoch": 0.07618314736437092, + "flos": 550812662784.0, + "grad_norm": 0.07064334209039194, + "language_loss": 0.95375401, + "learning_rate": 0.000994419834429843, + "loss": 0.96468663, + "num_input_tokens_seen": 32693472, + "router_z_loss_mlp": 0.20788574, + "step": 396, + "time_per_iteration": 2.6657333374023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092352, + "balance_loss_mlp": 1.0716933, + "epoch": 0.07637552904963447, + "flos": 697901253120.0, + "grad_norm": 0.07324881108467876, + "language_loss": 0.99580455, + "learning_rate": 0.0009943733237665069, + "loss": 1.00672793, + "num_input_tokens_seen": 32764976, + "router_z_loss_mlp": 0.20654297, + "step": 397, + "time_per_iteration": 2.8662500381469727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085112, + "balance_loss_mlp": 1.06454849, + "epoch": 0.07656791073489803, + "flos": 579066928128.0, + "grad_norm": 0.04790317238997088, + "language_loss": 0.98118353, + "learning_rate": 0.0009943266211704248, + "loss": 0.99203461, + "num_input_tokens_seen": 32853104, + "router_z_loss_mlp": 0.20568848, + "step": 398, + "time_per_iteration": 2.930741786956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094784, + "balance_loss_mlp": 1.07348132, + "epoch": 0.0767602924201616, + "flos": 416923711488.0, + "grad_norm": 0.09980331544781734, + "language_loss": 1.00422275, + "learning_rate": 0.000994279726659728, + "loss": 1.01517057, + "num_input_tokens_seen": 32919376, + "router_z_loss_mlp": 0.21325684, + "step": 399, + "time_per_iteration": 2.533738851547241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109036, + "balance_loss_mlp": 1.06970143, + "epoch": 0.07695267410542517, + "flos": 482671471104.0, + "grad_norm": 0.06967700921129397, + "language_loss": 0.97985041, + "learning_rate": 0.0009942326402526231, + "loss": 0.99075395, + "num_input_tokens_seen": 32988064, + "router_z_loss_mlp": 0.20666504, + "step": 400, + "time_per_iteration": 2.51460337638855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096542, + "balance_loss_mlp": 1.07526302, + "epoch": 0.07714505579068873, + "flos": 530742647808.0, + "grad_norm": 0.052652305799428985, + "language_loss": 0.96639109, + "learning_rate": 0.0009941853619673902, + "loss": 0.97735649, + "num_input_tokens_seen": 33059024, + "router_z_loss_mlp": 0.2130127, + "step": 401, + "time_per_iteration": 2.620939016342163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101036, + "balance_loss_mlp": 1.08012676, + "epoch": 0.07733743747595229, + "flos": 804635845632.0, + "grad_norm": 0.07273299487754427, + "language_loss": 0.99959278, + "learning_rate": 0.0009941378918223844, + "loss": 1.01060319, + "num_input_tokens_seen": 33137712, + "router_z_loss_mlp": 0.20910645, + "step": 402, + "time_per_iteration": 3.036839008331299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110477, + "balance_loss_mlp": 1.08423018, + "epoch": 0.07752981916121585, + "flos": 622192679424.0, + "grad_norm": 0.05767312217272775, + "language_loss": 0.93044209, + "learning_rate": 0.0009940902298360354, + "loss": 0.94148982, + "num_input_tokens_seen": 33211296, + "router_z_loss_mlp": 0.20544434, + "step": 403, + "time_per_iteration": 2.7703943252563477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097477, + "balance_loss_mlp": 1.07694876, + "epoch": 0.07772220084647942, + "flos": 727961195520.0, + "grad_norm": 0.0686344305115436, + "language_loss": 1.02037048, + "learning_rate": 0.0009940423760268473, + "loss": 1.03134525, + "num_input_tokens_seen": 33283632, + "router_z_loss_mlp": 0.2052002, + "step": 404, + "time_per_iteration": 2.8823602199554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085621, + "balance_loss_mlp": 1.06497431, + "epoch": 0.07791458253174298, + "flos": 555149984256.0, + "grad_norm": 0.10727031409308073, + "language_loss": 0.96142864, + "learning_rate": 0.0009939943304133982, + "loss": 0.97228479, + "num_input_tokens_seen": 33350704, + "router_z_loss_mlp": 0.20654297, + "step": 405, + "time_per_iteration": 2.63908314704895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078944, + "balance_loss_mlp": 1.05944133, + "epoch": 0.07810696421700654, + "flos": 552919495680.0, + "grad_norm": 0.08981509362846728, + "language_loss": 1.0302707, + "learning_rate": 0.0009939460930143416, + "loss": 1.04106021, + "num_input_tokens_seen": 33416272, + "router_z_loss_mlp": 0.19482422, + "step": 406, + "time_per_iteration": 2.63259220123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079269, + "balance_loss_mlp": 1.05927801, + "epoch": 0.0782993459022701, + "flos": 650323289088.0, + "grad_norm": 0.07212254231156982, + "language_loss": 0.96910775, + "learning_rate": 0.0009938976638484043, + "loss": 0.97990054, + "num_input_tokens_seen": 33501824, + "router_z_loss_mlp": 0.1998291, + "step": 407, + "time_per_iteration": 2.9489452838897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073065, + "balance_loss_mlp": 1.05239439, + "epoch": 0.07849172758753367, + "flos": 495926364672.0, + "grad_norm": 0.07302041560946317, + "language_loss": 0.9619081, + "learning_rate": 0.0009938490429343887, + "loss": 0.97263873, + "num_input_tokens_seen": 33571456, + "router_z_loss_mlp": 0.20666504, + "step": 408, + "time_per_iteration": 2.541293144226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078297, + "balance_loss_mlp": 1.05823374, + "epoch": 0.07868410927279723, + "flos": 577696389120.0, + "grad_norm": 0.06961121210328268, + "language_loss": 0.96404505, + "learning_rate": 0.0009938002302911709, + "loss": 0.974828, + "num_input_tokens_seen": 33646320, + "router_z_loss_mlp": 0.20056152, + "step": 409, + "time_per_iteration": 2.7890634536743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078628, + "balance_loss_mlp": 1.05869615, + "epoch": 0.07887649095806079, + "flos": 522698019840.0, + "grad_norm": 0.10283598941623227, + "language_loss": 0.99080813, + "learning_rate": 0.0009937512259377015, + "loss": 1.00159442, + "num_input_tokens_seen": 33717664, + "router_z_loss_mlp": 0.19921875, + "step": 410, + "time_per_iteration": 2.6631360054016113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076374, + "balance_loss_mlp": 1.05739617, + "epoch": 0.07906887264332435, + "flos": 556958481408.0, + "grad_norm": 0.07518465865945036, + "language_loss": 0.97744381, + "learning_rate": 0.000993702029893006, + "loss": 0.98820746, + "num_input_tokens_seen": 33794720, + "router_z_loss_mlp": 0.18981934, + "step": 411, + "time_per_iteration": 2.762937068939209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070708, + "balance_loss_mlp": 1.0512886, + "epoch": 0.07926125432858792, + "flos": 821641715712.0, + "grad_norm": 0.06547583340109177, + "language_loss": 0.97466588, + "learning_rate": 0.0009936526421761838, + "loss": 0.98537302, + "num_input_tokens_seen": 33868304, + "router_z_loss_mlp": 0.1940918, + "step": 412, + "time_per_iteration": 3.019529342651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070741, + "balance_loss_mlp": 1.05210841, + "epoch": 0.07945363601385148, + "flos": 562072794624.0, + "grad_norm": 0.06412617323579047, + "language_loss": 0.9993977, + "learning_rate": 0.000993603062806409, + "loss": 1.01010513, + "num_input_tokens_seen": 33937424, + "router_z_loss_mlp": 0.18615723, + "step": 413, + "time_per_iteration": 2.667893409729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078833, + "balance_loss_mlp": 1.05879402, + "epoch": 0.07964601769911504, + "flos": 517615792128.0, + "grad_norm": 0.0777298152120257, + "language_loss": 1.03187037, + "learning_rate": 0.0009935532918029298, + "loss": 1.04265857, + "num_input_tokens_seen": 34003984, + "router_z_loss_mlp": 0.20031738, + "step": 414, + "time_per_iteration": 2.628847122192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079604, + "balance_loss_mlp": 1.06020916, + "epoch": 0.0798383993843786, + "flos": 538956011520.0, + "grad_norm": 0.0762846382616791, + "language_loss": 0.96381676, + "learning_rate": 0.0009935033291850694, + "loss": 0.97461283, + "num_input_tokens_seen": 34072400, + "router_z_loss_mlp": 0.19384766, + "step": 415, + "time_per_iteration": 2.6874804496765137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078311, + "balance_loss_mlp": 1.05915451, + "epoch": 0.08003078106964218, + "flos": 484901959680.0, + "grad_norm": 0.07548152614126195, + "language_loss": 0.9874112, + "learning_rate": 0.0009934531749722247, + "loss": 0.9981944, + "num_input_tokens_seen": 34142448, + "router_z_loss_mlp": 0.19177246, + "step": 416, + "time_per_iteration": 2.5752930641174316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077721, + "balance_loss_mlp": 1.0581702, + "epoch": 0.08022316275490574, + "flos": 517999905792.0, + "grad_norm": 0.07373378819853486, + "language_loss": 0.97326815, + "learning_rate": 0.0009934028291838672, + "loss": 0.98404539, + "num_input_tokens_seen": 34214080, + "router_z_loss_mlp": 0.1953125, + "step": 417, + "time_per_iteration": 2.715142011642456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078885, + "balance_loss_mlp": 1.0593344, + "epoch": 0.0804155444401693, + "flos": 493755512832.0, + "grad_norm": 0.06878732968267398, + "language_loss": 0.9290086, + "learning_rate": 0.0009933522918395433, + "loss": 0.93979746, + "num_input_tokens_seen": 34288448, + "router_z_loss_mlp": 0.19555664, + "step": 418, + "time_per_iteration": 2.7008063793182373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01673141, + "balance_loss_mlp": 1.6505394, + "epoch": 0.08060792612543285, + "flos": 1580567579136.0, + "grad_norm": 0.10865535097535944, + "language_loss": 0.782511, + "learning_rate": 0.0009933015629588731, + "loss": 0.7992425, + "num_input_tokens_seen": 34521632, + "router_z_loss_mlp": 0.22558594, + "step": 419, + "time_per_iteration": 4.854820728302002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092516, + "balance_loss_mlp": 1.07238102, + "epoch": 0.08080030781069643, + "flos": 525090041856.0, + "grad_norm": 0.07888672823303539, + "language_loss": 1.11010027, + "learning_rate": 0.000993250642561551, + "loss": 1.12102532, + "num_input_tokens_seen": 34590080, + "router_z_loss_mlp": 0.20129395, + "step": 420, + "time_per_iteration": 2.6152822971343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102725, + "balance_loss_mlp": 1.08251905, + "epoch": 0.08099268949595999, + "flos": 546459374592.0, + "grad_norm": 0.06927423279576624, + "language_loss": 0.96781242, + "learning_rate": 0.0009931995306673466, + "loss": 0.97883964, + "num_input_tokens_seen": 34660512, + "router_z_loss_mlp": 0.20202637, + "step": 421, + "time_per_iteration": 2.8378820419311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107938, + "balance_loss_mlp": 1.08725524, + "epoch": 0.08118507118122355, + "flos": 510116811264.0, + "grad_norm": 0.07245841989657228, + "language_loss": 1.01691484, + "learning_rate": 0.000993148227296103, + "loss": 1.02799416, + "num_input_tokens_seen": 34732016, + "router_z_loss_mlp": 0.20678711, + "step": 422, + "time_per_iteration": 2.6234657764434814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109153, + "balance_loss_mlp": 1.08827925, + "epoch": 0.08137745286648711, + "flos": 720339969024.0, + "grad_norm": 0.06440268991377437, + "language_loss": 0.90059143, + "learning_rate": 0.000993096732467738, + "loss": 0.91168296, + "num_input_tokens_seen": 34810416, + "router_z_loss_mlp": 0.2088623, + "step": 423, + "time_per_iteration": 2.9789979457855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107042, + "balance_loss_mlp": 1.08620405, + "epoch": 0.08156983455175067, + "flos": 679313848320.0, + "grad_norm": 0.09430690436493987, + "language_loss": 0.97591221, + "learning_rate": 0.0009930450462022435, + "loss": 0.9869827, + "num_input_tokens_seen": 34879504, + "router_z_loss_mlp": 0.20837402, + "step": 424, + "time_per_iteration": 2.7870407104492188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01731933, + "balance_loss_mlp": 1.70847309, + "epoch": 0.08176221623701424, + "flos": 1452577135104.0, + "grad_norm": 0.13164555017172178, + "language_loss": 0.79189807, + "learning_rate": 0.0009929931685196862, + "loss": 0.80921739, + "num_input_tokens_seen": 35111584, + "router_z_loss_mlp": 0.234375, + "step": 425, + "time_per_iteration": 4.870323181152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095108, + "balance_loss_mlp": 1.07456827, + "epoch": 0.0819545979222778, + "flos": 1556034071040.0, + "grad_norm": 0.10298759083167684, + "language_loss": 0.95328236, + "learning_rate": 0.0009929410994402065, + "loss": 0.9642334, + "num_input_tokens_seen": 35205664, + "router_z_loss_mlp": 0.20544434, + "step": 426, + "time_per_iteration": 3.7942585945129395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093366, + "balance_loss_mlp": 1.07214665, + "epoch": 0.08214697960754136, + "flos": 512456398848.0, + "grad_norm": 0.069672302328133, + "language_loss": 0.99507213, + "learning_rate": 0.0009928888389840196, + "loss": 1.00600576, + "num_input_tokens_seen": 35280144, + "router_z_loss_mlp": 0.21240234, + "step": 427, + "time_per_iteration": 2.684760093688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073876, + "balance_loss_mlp": 1.05376494, + "epoch": 0.08233936129280492, + "flos": 594850646016.0, + "grad_norm": 0.07796900075206671, + "language_loss": 1.01749206, + "learning_rate": 0.0009928363871714147, + "loss": 1.02823079, + "num_input_tokens_seen": 35344768, + "router_z_loss_mlp": 0.20092773, + "step": 428, + "time_per_iteration": 2.6608195304870605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078126, + "balance_loss_mlp": 1.05796742, + "epoch": 0.08253174297806849, + "flos": 571758594048.0, + "grad_norm": 0.07341701057973313, + "language_loss": 0.95524251, + "learning_rate": 0.0009927837440227556, + "loss": 0.96602374, + "num_input_tokens_seen": 35425536, + "router_z_loss_mlp": 0.20153809, + "step": 429, + "time_per_iteration": 2.824958324432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083273, + "balance_loss_mlp": 1.06413972, + "epoch": 0.08272412466333205, + "flos": 623065623552.0, + "grad_norm": 0.06194570532237157, + "language_loss": 0.90308964, + "learning_rate": 0.0009927309095584798, + "loss": 0.91392243, + "num_input_tokens_seen": 35515440, + "router_z_loss_mlp": 0.19128418, + "step": 430, + "time_per_iteration": 2.9565205574035645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105878, + "balance_loss_mlp": 1.08643484, + "epoch": 0.08291650634859561, + "flos": 513745542144.0, + "grad_norm": 0.09375416706629437, + "language_loss": 1.0225904, + "learning_rate": 0.0009926778837991, + "loss": 1.03364921, + "num_input_tokens_seen": 35580192, + "router_z_loss_mlp": 0.19433594, + "step": 431, + "time_per_iteration": 2.5606777667999268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104802, + "balance_loss_mlp": 1.08521628, + "epoch": 0.08310888803385917, + "flos": 667073083392.0, + "grad_norm": 0.09022222071598751, + "language_loss": 1.00445497, + "learning_rate": 0.000992624666765202, + "loss": 1.01550293, + "num_input_tokens_seen": 35649472, + "router_z_loss_mlp": 0.19580078, + "step": 432, + "time_per_iteration": 2.763514995574951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112312, + "balance_loss_mlp": 1.09166527, + "epoch": 0.08330126971912274, + "flos": 582995404800.0, + "grad_norm": 0.07142121215748316, + "language_loss": 0.98131895, + "learning_rate": 0.000992571258477447, + "loss": 0.99244213, + "num_input_tokens_seen": 35722848, + "router_z_loss_mlp": 0.20654297, + "step": 433, + "time_per_iteration": 2.7823588848114014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086622, + "balance_loss_mlp": 1.06731021, + "epoch": 0.0834936514043863, + "flos": 561064458240.0, + "grad_norm": 0.06618743000622296, + "language_loss": 0.92206728, + "learning_rate": 0.0009925176589565695, + "loss": 0.93293345, + "num_input_tokens_seen": 35800944, + "router_z_loss_mlp": 0.1932373, + "step": 434, + "time_per_iteration": 2.7774362564086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109069, + "balance_loss_mlp": 1.07043648, + "epoch": 0.08368603308964986, + "flos": 494272046592.0, + "grad_norm": 0.07800081613857189, + "language_loss": 1.01949787, + "learning_rate": 0.0009924638682233791, + "loss": 1.03040481, + "num_input_tokens_seen": 35866288, + "router_z_loss_mlp": 0.20251465, + "step": 435, + "time_per_iteration": 2.574716091156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236801, + "balance_loss_mlp": 1.21505737, + "epoch": 0.08387841477491342, + "flos": 1388322312192.0, + "grad_norm": 0.08820287098199171, + "language_loss": 0.79564589, + "learning_rate": 0.0009924098862987589, + "loss": 0.80801398, + "num_input_tokens_seen": 36083040, + "router_z_loss_mlp": 0.21777344, + "step": 436, + "time_per_iteration": 4.521069049835205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087939, + "balance_loss_mlp": 1.06750691, + "epoch": 0.084070796460177, + "flos": 798642796032.0, + "grad_norm": 0.09737991847895365, + "language_loss": 0.92070073, + "learning_rate": 0.0009923557132036668, + "loss": 0.93158013, + "num_input_tokens_seen": 36158816, + "router_z_loss_mlp": 0.2043457, + "step": 437, + "time_per_iteration": 3.0401971340179443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106896, + "balance_loss_mlp": 1.08635592, + "epoch": 0.08426317814544056, + "flos": 558681200640.0, + "grad_norm": 0.07082709395687636, + "language_loss": 0.96077365, + "learning_rate": 0.0009923013489591345, + "loss": 0.97184265, + "num_input_tokens_seen": 36236432, + "router_z_loss_mlp": 0.20532227, + "step": 438, + "time_per_iteration": 2.7388038635253906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138911, + "balance_loss_mlp": 1.11965871, + "epoch": 0.08445555983070412, + "flos": 810057106944.0, + "grad_norm": 0.09946092642967543, + "language_loss": 0.94659293, + "learning_rate": 0.0009922467935862681, + "loss": 0.95798206, + "num_input_tokens_seen": 36327952, + "router_z_loss_mlp": 0.19250488, + "step": 439, + "time_per_iteration": 3.0827929973602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153103, + "balance_loss_mlp": 1.13278937, + "epoch": 0.08464794151596768, + "flos": 509939311104.0, + "grad_norm": 0.08658230076015333, + "language_loss": 0.97196984, + "learning_rate": 0.0009921920471062478, + "loss": 0.9835009, + "num_input_tokens_seen": 36394896, + "router_z_loss_mlp": 0.203125, + "step": 440, + "time_per_iteration": 2.5667247772216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109293, + "balance_loss_mlp": 1.08952785, + "epoch": 0.08484032320123125, + "flos": 556149556224.0, + "grad_norm": 0.0779492699350581, + "language_loss": 0.95526892, + "learning_rate": 0.0009921371095403281, + "loss": 0.96636182, + "num_input_tokens_seen": 36464656, + "router_z_loss_mlp": 0.19763184, + "step": 441, + "time_per_iteration": 2.6504476070404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081558, + "balance_loss_mlp": 1.06137586, + "epoch": 0.08503270488649481, + "flos": 527103742464.0, + "grad_norm": 0.0823758421396894, + "language_loss": 0.98291612, + "learning_rate": 0.0009920819809098379, + "loss": 0.99373174, + "num_input_tokens_seen": 36532208, + "router_z_loss_mlp": 0.20166016, + "step": 442, + "time_per_iteration": 2.5884947776794434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076633, + "balance_loss_mlp": 1.05612862, + "epoch": 0.08522508657175837, + "flos": 613989490176.0, + "grad_norm": 0.07828377396362728, + "language_loss": 0.94043314, + "learning_rate": 0.0009920266612361798, + "loss": 0.95119947, + "num_input_tokens_seen": 36607360, + "router_z_loss_mlp": 0.20507812, + "step": 443, + "time_per_iteration": 2.7464845180511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077144, + "balance_loss_mlp": 1.05650926, + "epoch": 0.08541746825702193, + "flos": 619495119360.0, + "grad_norm": 0.07442656272719532, + "language_loss": 0.94335687, + "learning_rate": 0.0009919711505408308, + "loss": 0.95412827, + "num_input_tokens_seen": 36680688, + "router_z_loss_mlp": 0.2064209, + "step": 444, + "time_per_iteration": 2.7615623474121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092391, + "balance_loss_mlp": 1.07126665, + "epoch": 0.08560984994228549, + "flos": 482671471104.0, + "grad_norm": 0.08601843511227286, + "language_loss": 0.92049706, + "learning_rate": 0.000991915448845342, + "loss": 0.93142092, + "num_input_tokens_seen": 36746288, + "router_z_loss_mlp": 0.21130371, + "step": 445, + "time_per_iteration": 2.519644260406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103035, + "balance_loss_mlp": 1.08145857, + "epoch": 0.08580223162754906, + "flos": 516897027072.0, + "grad_norm": 0.07781715705073443, + "language_loss": 1.01207459, + "learning_rate": 0.000991859556171339, + "loss": 1.02310491, + "num_input_tokens_seen": 36812528, + "router_z_loss_mlp": 0.21569824, + "step": 446, + "time_per_iteration": 2.5678694248199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116922, + "balance_loss_mlp": 1.09462976, + "epoch": 0.08599461331281262, + "flos": 531215511552.0, + "grad_norm": 0.11213971543052093, + "language_loss": 1.02931881, + "learning_rate": 0.000991803472540521, + "loss": 1.040488, + "num_input_tokens_seen": 36879248, + "router_z_loss_mlp": 0.22302246, + "step": 447, + "time_per_iteration": 2.6309196949005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124555, + "balance_loss_mlp": 1.10302639, + "epoch": 0.08618699499807618, + "flos": 789966743040.0, + "grad_norm": 0.07287006723198586, + "language_loss": 0.97443926, + "learning_rate": 0.0009917471979746615, + "loss": 0.98568487, + "num_input_tokens_seen": 36951376, + "router_z_loss_mlp": 0.21533203, + "step": 448, + "time_per_iteration": 2.9742491245269775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134564, + "balance_loss_mlp": 1.11266506, + "epoch": 0.08637937668333974, + "flos": 565707317760.0, + "grad_norm": 0.08202115093309782, + "language_loss": 0.97199845, + "learning_rate": 0.0009916907324956086, + "loss": 0.98334408, + "num_input_tokens_seen": 37025936, + "router_z_loss_mlp": 0.21923828, + "step": 449, + "time_per_iteration": 2.704089641571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151497, + "balance_loss_mlp": 1.12693954, + "epoch": 0.08657175836860331, + "flos": 444930665472.0, + "grad_norm": 0.09325215593581063, + "language_loss": 0.93441564, + "learning_rate": 0.0009916340761252837, + "loss": 0.9459306, + "num_input_tokens_seen": 37095872, + "router_z_loss_mlp": 0.24536133, + "step": 450, + "time_per_iteration": 2.5866575241088867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158359, + "balance_loss_mlp": 1.13567328, + "epoch": 0.08676414005386687, + "flos": 843789450240.0, + "grad_norm": 0.23711660967347972, + "language_loss": 0.90976942, + "learning_rate": 0.0009915772288856832, + "loss": 0.92135304, + "num_input_tokens_seen": 37179072, + "router_z_loss_mlp": 0.22668457, + "step": 451, + "time_per_iteration": 3.109010696411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118071, + "balance_loss_mlp": 1.15827537, + "epoch": 0.08695652173913043, + "flos": 602995608576.0, + "grad_norm": 0.08699490701012727, + "language_loss": 0.92036849, + "learning_rate": 0.000991520190798877, + "loss": 0.93217564, + "num_input_tokens_seen": 37260288, + "router_z_loss_mlp": 0.22424316, + "step": 452, + "time_per_iteration": 2.8523812294006348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181191, + "balance_loss_mlp": 1.15807629, + "epoch": 0.08714890342439399, + "flos": 730423028736.0, + "grad_norm": 0.09293440668835976, + "language_loss": 1.01637089, + "learning_rate": 0.0009914629618870089, + "loss": 1.02818286, + "num_input_tokens_seen": 37331136, + "router_z_loss_mlp": 0.23095703, + "step": 453, + "time_per_iteration": 2.882887125015259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142362, + "balance_loss_mlp": 1.12891519, + "epoch": 0.08734128510965757, + "flos": 1481518232064.0, + "grad_norm": 0.0645312523276542, + "language_loss": 0.78675872, + "learning_rate": 0.0009914055421722976, + "loss": 0.79818237, + "num_input_tokens_seen": 37559040, + "router_z_loss_mlp": 0.13476562, + "step": 454, + "time_per_iteration": 4.717878103256226 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103219, + "balance_loss_mlp": 1.09034455, + "epoch": 0.08753366679492113, + "flos": 1522214083584.0, + "grad_norm": 0.04274098512475534, + "language_loss": 0.81427962, + "learning_rate": 0.0009913479316770353, + "loss": 0.82531178, + "num_input_tokens_seen": 37785136, + "router_z_loss_mlp": 0.12890625, + "step": 455, + "time_per_iteration": 4.838243246078491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171645, + "balance_loss_mlp": 1.14951944, + "epoch": 0.08772604848018468, + "flos": 720935078400.0, + "grad_norm": 0.10543082910841049, + "language_loss": 0.94423014, + "learning_rate": 0.0009912901304235883, + "loss": 0.95594656, + "num_input_tokens_seen": 37858832, + "router_z_loss_mlp": 0.22131348, + "step": 456, + "time_per_iteration": 2.9432015419006348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150762, + "balance_loss_mlp": 1.12861252, + "epoch": 0.08791843016544824, + "flos": 707926086144.0, + "grad_norm": 0.10980567381029156, + "language_loss": 0.91300154, + "learning_rate": 0.000991232138434397, + "loss": 0.92450917, + "num_input_tokens_seen": 37931856, + "router_z_loss_mlp": 0.22143555, + "step": 457, + "time_per_iteration": 2.832761526107788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113929, + "balance_loss_mlp": 1.09195828, + "epoch": 0.08811081185071182, + "flos": 472799407104.0, + "grad_norm": 0.1324680836731367, + "language_loss": 0.97845554, + "learning_rate": 0.000991173955731976, + "loss": 0.98959482, + "num_input_tokens_seen": 38002432, + "router_z_loss_mlp": 0.21960449, + "step": 458, + "time_per_iteration": 2.660696506500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100778, + "balance_loss_mlp": 1.07958269, + "epoch": 0.08830319353597538, + "flos": 684647769600.0, + "grad_norm": 0.07138233575581546, + "language_loss": 1.0178268, + "learning_rate": 0.0009911155823389137, + "loss": 1.02883458, + "num_input_tokens_seen": 38081648, + "router_z_loss_mlp": 0.21203613, + "step": 459, + "time_per_iteration": 2.9878122806549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105128, + "balance_loss_mlp": 1.08344412, + "epoch": 0.08849557522123894, + "flos": 573235411968.0, + "grad_norm": 0.0735053314112025, + "language_loss": 0.9764787, + "learning_rate": 0.000991057018277873, + "loss": 0.98752999, + "num_input_tokens_seen": 38153424, + "router_z_loss_mlp": 0.21679688, + "step": 460, + "time_per_iteration": 2.707247018814087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116963, + "balance_loss_mlp": 1.09422946, + "epoch": 0.0886879569065025, + "flos": 564303283200.0, + "grad_norm": 0.10552034142073316, + "language_loss": 0.9759655, + "learning_rate": 0.0009909982635715898, + "loss": 0.98713505, + "num_input_tokens_seen": 38223008, + "router_z_loss_mlp": 0.22729492, + "step": 461, + "time_per_iteration": 2.609016180038452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120097, + "balance_loss_mlp": 1.09760189, + "epoch": 0.08888033859176607, + "flos": 563609249280.0, + "grad_norm": 0.09185893532484944, + "language_loss": 0.96625364, + "learning_rate": 0.0009909393182428751, + "loss": 0.97745454, + "num_input_tokens_seen": 38294592, + "router_z_loss_mlp": 0.22497559, + "step": 462, + "time_per_iteration": 2.682616949081421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116703, + "balance_loss_mlp": 1.09437466, + "epoch": 0.08907272027702963, + "flos": 465517214208.0, + "grad_norm": 0.08888403374641002, + "language_loss": 0.91300213, + "learning_rate": 0.000990880182314614, + "loss": 0.92416912, + "num_input_tokens_seen": 38365792, + "router_z_loss_mlp": 0.22314453, + "step": 463, + "time_per_iteration": 2.732579469680786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122985, + "balance_loss_mlp": 1.10014486, + "epoch": 0.08926510196229319, + "flos": 681200921088.0, + "grad_norm": 0.07408309604525525, + "language_loss": 0.92294347, + "learning_rate": 0.0009908208558097643, + "loss": 0.93417335, + "num_input_tokens_seen": 38447776, + "router_z_loss_mlp": 0.22839355, + "step": 464, + "time_per_iteration": 2.910313606262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137827, + "balance_loss_mlp": 1.115273, + "epoch": 0.08945748364755675, + "flos": 596411831808.0, + "grad_norm": 0.08673846989427919, + "language_loss": 0.93827909, + "learning_rate": 0.000990761338751359, + "loss": 0.94965738, + "num_input_tokens_seen": 38521632, + "router_z_loss_mlp": 0.22546387, + "step": 465, + "time_per_iteration": 2.827570676803589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133815, + "balance_loss_mlp": 1.12222791, + "epoch": 0.08964986533282032, + "flos": 1585082400768.0, + "grad_norm": 0.06082202694548154, + "language_loss": 0.73659623, + "learning_rate": 0.0009907016311625045, + "loss": 0.74793446, + "num_input_tokens_seen": 38760528, + "router_z_loss_mlp": 0.11572266, + "step": 466, + "time_per_iteration": 4.960917234420776 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177765, + "balance_loss_mlp": 1.15419745, + "epoch": 0.08984224701808388, + "flos": 533268499968.0, + "grad_norm": 0.4900596090566038, + "language_loss": 0.96587038, + "learning_rate": 0.0009906417330663815, + "loss": 0.97764802, + "num_input_tokens_seen": 38827200, + "router_z_loss_mlp": 0.23571777, + "step": 467, + "time_per_iteration": 2.5937299728393555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157084, + "balance_loss_mlp": 1.13383865, + "epoch": 0.09003462870334744, + "flos": 478702296576.0, + "grad_norm": 0.08613132202477504, + "language_loss": 0.92798859, + "learning_rate": 0.0009905816444862442, + "loss": 0.93955946, + "num_input_tokens_seen": 38891984, + "router_z_loss_mlp": 0.23217773, + "step": 468, + "time_per_iteration": 2.6012237071990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164868, + "balance_loss_mlp": 1.14150274, + "epoch": 0.090227010388611, + "flos": 653307448320.0, + "grad_norm": 0.08218040805372613, + "language_loss": 0.90769458, + "learning_rate": 0.0009905213654454216, + "loss": 0.91934329, + "num_input_tokens_seen": 38977136, + "router_z_loss_mlp": 0.23364258, + "step": 469, + "time_per_iteration": 2.8727760314941406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176439, + "balance_loss_mlp": 1.15152478, + "epoch": 0.09041939207387456, + "flos": 617894645760.0, + "grad_norm": 0.09256259391525869, + "language_loss": 0.97864139, + "learning_rate": 0.0009904608959673158, + "loss": 0.9904058, + "num_input_tokens_seen": 39052224, + "router_z_loss_mlp": 0.24938965, + "step": 470, + "time_per_iteration": 2.7991952896118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151805, + "balance_loss_mlp": 1.12671185, + "epoch": 0.09061177375913813, + "flos": 454137808896.0, + "grad_norm": 0.09693984756275055, + "language_loss": 0.97988749, + "learning_rate": 0.000990400236075403, + "loss": 0.99140555, + "num_input_tokens_seen": 39116832, + "router_z_loss_mlp": 0.25109863, + "step": 471, + "time_per_iteration": 2.523508310317993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125394, + "balance_loss_mlp": 1.10119498, + "epoch": 0.0908041554444017, + "flos": 543982984704.0, + "grad_norm": 0.09250187628709369, + "language_loss": 0.9490509, + "learning_rate": 0.0009903393857932338, + "loss": 0.96030486, + "num_input_tokens_seen": 39190528, + "router_z_loss_mlp": 0.24194336, + "step": 472, + "time_per_iteration": 2.7065584659576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124084, + "balance_loss_mlp": 1.09912193, + "epoch": 0.09099653712966525, + "flos": 564052999680.0, + "grad_norm": 0.10897832311722938, + "language_loss": 0.93660218, + "learning_rate": 0.0009902783451444317, + "loss": 0.94784307, + "num_input_tokens_seen": 39263168, + "router_z_loss_mlp": 0.24963379, + "step": 473, + "time_per_iteration": 2.7067277431488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108649, + "balance_loss_mlp": 1.08496177, + "epoch": 0.09118891881492881, + "flos": 474300956160.0, + "grad_norm": 0.09402902414949979, + "language_loss": 0.97273493, + "learning_rate": 0.0009902171141526956, + "loss": 0.98382139, + "num_input_tokens_seen": 39330784, + "router_z_loss_mlp": 0.23693848, + "step": 474, + "time_per_iteration": 2.5281569957733154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087186, + "balance_loss_mlp": 1.06240201, + "epoch": 0.09138130050019239, + "flos": 545579076096.0, + "grad_norm": 0.06728788346792411, + "language_loss": 0.85273343, + "learning_rate": 0.000990155692841797, + "loss": 0.86360526, + "num_input_tokens_seen": 39417472, + "router_z_loss_mlp": 0.2479248, + "step": 475, + "time_per_iteration": 2.970107316970825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010835, + "balance_loss_mlp": 1.0587163, + "epoch": 0.09157368218545595, + "flos": 732397441536.0, + "grad_norm": 0.07226189405033341, + "language_loss": 0.97062063, + "learning_rate": 0.0009900940812355818, + "loss": 0.98145562, + "num_input_tokens_seen": 39488656, + "router_z_loss_mlp": 0.24768066, + "step": 476, + "time_per_iteration": 2.959184169769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096233, + "balance_loss_mlp": 1.07208097, + "epoch": 0.0917660638707195, + "flos": 610709967360.0, + "grad_norm": 0.09034653129128065, + "language_loss": 0.92824447, + "learning_rate": 0.00099003227935797, + "loss": 0.93920678, + "num_input_tokens_seen": 39558224, + "router_z_loss_mlp": 0.24157715, + "step": 477, + "time_per_iteration": 2.7553765773773193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113263, + "balance_loss_mlp": 1.08839583, + "epoch": 0.09195844555598306, + "flos": 655561257984.0, + "grad_norm": 0.09830094540804109, + "language_loss": 0.95358098, + "learning_rate": 0.000989970287232955, + "loss": 0.96471357, + "num_input_tokens_seen": 39629856, + "router_z_loss_mlp": 0.2487793, + "step": 478, + "time_per_iteration": 2.7916457653045654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112064, + "balance_loss_mlp": 1.09633327, + "epoch": 0.09215082724124664, + "flos": 476339387904.0, + "grad_norm": 0.08054303064285366, + "language_loss": 0.93560576, + "learning_rate": 0.0009899081048846043, + "loss": 0.94681215, + "num_input_tokens_seen": 39695984, + "router_z_loss_mlp": 0.24267578, + "step": 479, + "time_per_iteration": 2.554161787033081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114732, + "balance_loss_mlp": 1.12177348, + "epoch": 0.0923432089265102, + "flos": 524051182080.0, + "grad_norm": 0.1186512856896222, + "language_loss": 0.97593725, + "learning_rate": 0.0009898457323370593, + "loss": 0.98741049, + "num_input_tokens_seen": 39760256, + "router_z_loss_mlp": 0.25549316, + "step": 480, + "time_per_iteration": 2.5794191360473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131558, + "balance_loss_mlp": 1.10608315, + "epoch": 0.09253559061177376, + "flos": 545302651392.0, + "grad_norm": 0.10688941209840569, + "language_loss": 0.96892118, + "learning_rate": 0.000989783169614535, + "loss": 0.98023689, + "num_input_tokens_seen": 39827984, + "router_z_loss_mlp": 0.25512695, + "step": 481, + "time_per_iteration": 2.6676101684570312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01336494, + "balance_loss_mlp": 1.32304764, + "epoch": 0.09272797229703732, + "flos": 1537222219776.0, + "grad_norm": 0.112558059824644, + "language_loss": 0.78752756, + "learning_rate": 0.0009897204167413206, + "loss": 0.80089253, + "num_input_tokens_seen": 40056688, + "router_z_loss_mlp": 0.13476562, + "step": 482, + "time_per_iteration": 4.910710096359253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121205, + "balance_loss_mlp": 1.09537172, + "epoch": 0.09292035398230089, + "flos": 689501624832.0, + "grad_norm": 0.08905484371867754, + "language_loss": 0.93989253, + "learning_rate": 0.000989657473741779, + "loss": 0.95110452, + "num_input_tokens_seen": 40133120, + "router_z_loss_mlp": 0.25866699, + "step": 483, + "time_per_iteration": 2.8736467361450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120092, + "balance_loss_mlp": 1.09219658, + "epoch": 0.09311273566756445, + "flos": 509482414080.0, + "grad_norm": 0.10011855628381364, + "language_loss": 0.94861096, + "learning_rate": 0.0009895943406403465, + "loss": 0.95981193, + "num_input_tokens_seen": 40206464, + "router_z_loss_mlp": 0.27905273, + "step": 484, + "time_per_iteration": 2.7233312129974365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114409, + "balance_loss_mlp": 1.08641887, + "epoch": 0.09330511735282801, + "flos": 659111413248.0, + "grad_norm": 0.10884122740481975, + "language_loss": 0.87602448, + "learning_rate": 0.0009895310174615338, + "loss": 0.88716859, + "num_input_tokens_seen": 40277744, + "router_z_loss_mlp": 0.2800293, + "step": 485, + "time_per_iteration": 2.7538061141967773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098211, + "balance_loss_mlp": 1.08533621, + "epoch": 0.09349749903809157, + "flos": 1452054809088.0, + "grad_norm": 0.04867374252302138, + "language_loss": 0.75718516, + "learning_rate": 0.0009894675042299251, + "loss": 0.76816726, + "num_input_tokens_seen": 40503664, + "router_z_loss_mlp": 0.12890625, + "step": 486, + "time_per_iteration": 4.681119441986084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121456, + "balance_loss_mlp": 1.09291732, + "epoch": 0.09368988072335514, + "flos": 520614508032.0, + "grad_norm": 0.07858969791005947, + "language_loss": 0.92458618, + "learning_rate": 0.0009894038009701782, + "loss": 0.93580067, + "num_input_tokens_seen": 40571376, + "router_z_loss_mlp": 0.28515625, + "step": 487, + "time_per_iteration": 2.6114649772644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153128, + "balance_loss_mlp": 1.12148952, + "epoch": 0.0938822624086187, + "flos": 497502107136.0, + "grad_norm": 0.11959755259003642, + "language_loss": 0.91595036, + "learning_rate": 0.0009893399077070253, + "loss": 0.92748165, + "num_input_tokens_seen": 40638096, + "router_z_loss_mlp": 0.31616211, + "step": 488, + "time_per_iteration": 2.5603692531585693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127952, + "balance_loss_mlp": 1.09845996, + "epoch": 0.09407464409388226, + "flos": 532948405248.0, + "grad_norm": 0.09098963794592498, + "language_loss": 0.89760649, + "learning_rate": 0.0009892758244652718, + "loss": 0.90888608, + "num_input_tokens_seen": 40710992, + "router_z_loss_mlp": 0.29516602, + "step": 489, + "time_per_iteration": 2.65938401222229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127724, + "balance_loss_mlp": 1.09568012, + "epoch": 0.09426702577914582, + "flos": 585736634880.0, + "grad_norm": 0.09102778373185845, + "language_loss": 0.94519842, + "learning_rate": 0.0009892115512697968, + "loss": 0.95647562, + "num_input_tokens_seen": 40778896, + "router_z_loss_mlp": 0.3203125, + "step": 490, + "time_per_iteration": 2.6538186073303223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120065, + "balance_loss_mlp": 1.08926105, + "epoch": 0.0944594074644094, + "flos": 503081929728.0, + "grad_norm": 0.07724049493821064, + "language_loss": 0.96624851, + "learning_rate": 0.0009891470881455537, + "loss": 0.97744912, + "num_input_tokens_seen": 40853376, + "router_z_loss_mlp": 0.30810547, + "step": 491, + "time_per_iteration": 2.699535608291626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122711, + "balance_loss_mlp": 1.09145451, + "epoch": 0.09465178914967295, + "flos": 570748847616.0, + "grad_norm": 0.0816499633869022, + "language_loss": 0.94510269, + "learning_rate": 0.0009890824351175692, + "loss": 0.95632982, + "num_input_tokens_seen": 40923776, + "router_z_loss_mlp": 0.31225586, + "step": 492, + "time_per_iteration": 2.678191661834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125893, + "balance_loss_mlp": 1.09418344, + "epoch": 0.09484417083493651, + "flos": 549098707968.0, + "grad_norm": 0.07977284094064935, + "language_loss": 0.98609412, + "learning_rate": 0.0009890175922109435, + "loss": 0.99735302, + "num_input_tokens_seen": 40996848, + "router_z_loss_mlp": 0.31689453, + "step": 493, + "time_per_iteration": 2.6466987133026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138627, + "balance_loss_mlp": 1.10534418, + "epoch": 0.09503655252020007, + "flos": 823552109568.0, + "grad_norm": 0.09331424233507904, + "language_loss": 0.96939492, + "learning_rate": 0.0009889525594508513, + "loss": 0.9807812, + "num_input_tokens_seen": 41071280, + "router_z_loss_mlp": 0.33300781, + "step": 494, + "time_per_iteration": 3.009894371032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153225, + "balance_loss_mlp": 1.12218332, + "epoch": 0.09522893420546363, + "flos": 404397757440.0, + "grad_norm": 0.08141129996203125, + "language_loss": 0.91043431, + "learning_rate": 0.0009888873368625404, + "loss": 0.92196655, + "num_input_tokens_seen": 41136304, + "router_z_loss_mlp": 0.31030273, + "step": 495, + "time_per_iteration": 2.4904890060424805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171726, + "balance_loss_mlp": 1.14025438, + "epoch": 0.0954213158907272, + "flos": 690707810304.0, + "grad_norm": 0.08256479818708104, + "language_loss": 0.94339681, + "learning_rate": 0.0009888219244713326, + "loss": 0.95511413, + "num_input_tokens_seen": 41212384, + "router_z_loss_mlp": 0.31445312, + "step": 496, + "time_per_iteration": 2.8060483932495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181664, + "balance_loss_mlp": 1.15033531, + "epoch": 0.09561369757599077, + "flos": 518739019776.0, + "grad_norm": 0.10472312979641793, + "language_loss": 0.94370055, + "learning_rate": 0.0009887563223026229, + "loss": 0.95551717, + "num_input_tokens_seen": 41282528, + "router_z_loss_mlp": 0.31323242, + "step": 497, + "time_per_iteration": 2.6536803245544434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228939, + "balance_loss_mlp": 1.21549225, + "epoch": 0.09580607926125433, + "flos": 1384825849344.0, + "grad_norm": 0.04877985805939708, + "language_loss": 0.7906816, + "learning_rate": 0.0009886905303818805, + "loss": 0.80297101, + "num_input_tokens_seen": 41512256, + "router_z_loss_mlp": 0.13476562, + "step": 498, + "time_per_iteration": 4.874605178833008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197245, + "balance_loss_mlp": 1.16455829, + "epoch": 0.09599846094651789, + "flos": 717090969600.0, + "grad_norm": 0.08863465655244346, + "language_loss": 0.93284124, + "learning_rate": 0.0009886245487346482, + "loss": 0.94481373, + "num_input_tokens_seen": 41596816, + "router_z_loss_mlp": 0.3269043, + "step": 499, + "time_per_iteration": 3.047938108444214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011865, + "balance_loss_mlp": 1.15474319, + "epoch": 0.09619084263178146, + "flos": 385824909312.0, + "grad_norm": 0.09673466805801513, + "language_loss": 0.96238041, + "learning_rate": 0.0009885583773865422, + "loss": 0.97424543, + "num_input_tokens_seen": 41658544, + "router_z_loss_mlp": 0.31762695, + "step": 500, + "time_per_iteration": 2.402763843536377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140705, + "balance_loss_mlp": 1.1099968, + "epoch": 0.09638322431704502, + "flos": 533869401600.0, + "grad_norm": 0.08556524095898377, + "language_loss": 0.93457472, + "learning_rate": 0.0009884920163632524, + "loss": 0.94598186, + "num_input_tokens_seen": 41730736, + "router_z_loss_mlp": 0.30688477, + "step": 501, + "time_per_iteration": 2.7420296669006348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155853, + "balance_loss_mlp": 1.12373805, + "epoch": 0.09657560600230858, + "flos": 500426629632.0, + "grad_norm": 0.08462195742795481, + "language_loss": 0.95688182, + "learning_rate": 0.000988425465690543, + "loss": 0.96844035, + "num_input_tokens_seen": 41797824, + "router_z_loss_mlp": 0.32104492, + "step": 502, + "time_per_iteration": 2.5425736904144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163304, + "balance_loss_mlp": 1.13099861, + "epoch": 0.09676798768757214, + "flos": 528995197440.0, + "grad_norm": 0.07192036847451248, + "language_loss": 0.92721838, + "learning_rate": 0.0009883587253942505, + "loss": 0.93885148, + "num_input_tokens_seen": 41875520, + "router_z_loss_mlp": 0.32324219, + "step": 503, + "time_per_iteration": 2.8340742588043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188959, + "balance_loss_mlp": 1.15598607, + "epoch": 0.09696036937283571, + "flos": 463379857920.0, + "grad_norm": 0.0888689340699796, + "language_loss": 0.99166393, + "learning_rate": 0.0009882917955002862, + "loss": 1.00355351, + "num_input_tokens_seen": 41942224, + "router_z_loss_mlp": 0.32983398, + "step": 504, + "time_per_iteration": 2.560448169708252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147535, + "balance_loss_mlp": 1.11606395, + "epoch": 0.09715275105809927, + "flos": 534716204544.0, + "grad_norm": 0.07251663236407552, + "language_loss": 0.9150176, + "learning_rate": 0.0009882246760346343, + "loss": 0.92649293, + "num_input_tokens_seen": 42007552, + "router_z_loss_mlp": 0.31420898, + "step": 505, + "time_per_iteration": 2.6460299491882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114081, + "balance_loss_mlp": 1.10714495, + "epoch": 0.09734513274336283, + "flos": 454713979392.0, + "grad_norm": 0.10061537251918176, + "language_loss": 0.96100289, + "learning_rate": 0.0009881573670233533, + "loss": 0.97241098, + "num_input_tokens_seen": 42071760, + "router_z_loss_mlp": 0.33666992, + "step": 506, + "time_per_iteration": 2.5137040615081787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109977, + "balance_loss_mlp": 1.08029366, + "epoch": 0.09753751442862639, + "flos": 508551243264.0, + "grad_norm": 0.0762964042901656, + "language_loss": 0.91185808, + "learning_rate": 0.0009880898684925747, + "loss": 0.92295784, + "num_input_tokens_seen": 42140688, + "router_z_loss_mlp": 0.29663086, + "step": 507, + "time_per_iteration": 2.6571738719940186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110119, + "balance_loss_mlp": 1.07133985, + "epoch": 0.09772989611388996, + "flos": 484030425600.0, + "grad_norm": 0.07531505250568626, + "language_loss": 0.89554358, + "learning_rate": 0.0009880221804685037, + "loss": 0.90655547, + "num_input_tokens_seen": 42208544, + "router_z_loss_mlp": 0.29882812, + "step": 508, + "time_per_iteration": 2.596289873123169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01404721, + "balance_loss_mlp": 1.39136958, + "epoch": 0.09792227779915352, + "flos": 1565306339328.0, + "grad_norm": 0.10151454340945995, + "language_loss": 0.79344422, + "learning_rate": 0.000987954302977419, + "loss": 0.80749142, + "num_input_tokens_seen": 42426624, + "router_z_loss_mlp": 0.13378906, + "step": 509, + "time_per_iteration": 4.724441051483154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116621, + "balance_loss_mlp": 1.08655643, + "epoch": 0.09811465948441708, + "flos": 587529165312.0, + "grad_norm": 0.08257009801201759, + "language_loss": 0.94708043, + "learning_rate": 0.0009878862360456733, + "loss": 0.95824659, + "num_input_tokens_seen": 42494592, + "router_z_loss_mlp": 0.30029297, + "step": 510, + "time_per_iteration": 2.703011989593506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122701, + "balance_loss_mlp": 1.09406662, + "epoch": 0.09830704116968064, + "flos": 612719285760.0, + "grad_norm": 0.06191460590209878, + "language_loss": 0.88457662, + "learning_rate": 0.0009878179796996922, + "loss": 0.89580369, + "num_input_tokens_seen": 42564944, + "router_z_loss_mlp": 0.28637695, + "step": 511, + "time_per_iteration": 2.7212226390838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128587, + "balance_loss_mlp": 1.09885597, + "epoch": 0.09849942285494422, + "flos": 538528227840.0, + "grad_norm": 0.06874751685339883, + "language_loss": 0.9199326, + "learning_rate": 0.0009877495339659754, + "loss": 0.9312185, + "num_input_tokens_seen": 42645616, + "router_z_loss_mlp": 0.29724121, + "step": 512, + "time_per_iteration": 2.7520575523376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111609, + "balance_loss_mlp": 1.08826661, + "epoch": 0.09869180454020778, + "flos": 620193535488.0, + "grad_norm": 0.06953003964378547, + "language_loss": 0.87301105, + "learning_rate": 0.000987680898871096, + "loss": 0.88417196, + "num_input_tokens_seen": 42713632, + "router_z_loss_mlp": 0.27832031, + "step": 513, + "time_per_iteration": 2.7121992111206055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134292, + "balance_loss_mlp": 1.10401261, + "epoch": 0.09888418622547133, + "flos": 811375363584.0, + "grad_norm": 0.1024184057853134, + "language_loss": 0.87763435, + "learning_rate": 0.0009876120744417, + "loss": 0.88897729, + "num_input_tokens_seen": 42789088, + "router_z_loss_mlp": 0.30273438, + "step": 514, + "time_per_iteration": 2.971573829650879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123143, + "balance_loss_mlp": 1.09267306, + "epoch": 0.0990765679107349, + "flos": 535548450816.0, + "grad_norm": 0.06764912074049458, + "language_loss": 0.95588082, + "learning_rate": 0.0009875430607045078, + "loss": 0.9671123, + "num_input_tokens_seen": 42861168, + "router_z_loss_mlp": 0.3046875, + "step": 515, + "time_per_iteration": 2.6630361080169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108813, + "balance_loss_mlp": 1.08072746, + "epoch": 0.09926894959599845, + "flos": 587607740928.0, + "grad_norm": 0.06593749006245919, + "language_loss": 0.92788792, + "learning_rate": 0.000987473857686313, + "loss": 0.93897605, + "num_input_tokens_seen": 42934112, + "router_z_loss_mlp": 0.28076172, + "step": 516, + "time_per_iteration": 2.710068702697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121556, + "balance_loss_mlp": 1.09039485, + "epoch": 0.09946133128126203, + "flos": 640947409920.0, + "grad_norm": 0.08862761474564218, + "language_loss": 0.9451825, + "learning_rate": 0.0009874044654139824, + "loss": 0.95639801, + "num_input_tokens_seen": 43005248, + "router_z_loss_mlp": 0.3112793, + "step": 517, + "time_per_iteration": 2.729975461959839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117034, + "balance_loss_mlp": 1.08520555, + "epoch": 0.09965371296652559, + "flos": 465546327552.0, + "grad_norm": 0.09157938746936445, + "language_loss": 0.9250825, + "learning_rate": 0.0009873348839144563, + "loss": 0.93625283, + "num_input_tokens_seen": 43070576, + "router_z_loss_mlp": 0.31811523, + "step": 518, + "time_per_iteration": 2.5117127895355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112516, + "balance_loss_mlp": 1.09540534, + "epoch": 0.09984609465178915, + "flos": 483365505024.0, + "grad_norm": 0.07736257304557469, + "language_loss": 0.9674046, + "learning_rate": 0.000987265113214749, + "loss": 0.97865617, + "num_input_tokens_seen": 43138048, + "router_z_loss_mlp": 0.29711914, + "step": 519, + "time_per_iteration": 2.5816774368286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147544, + "balance_loss_mlp": 1.11421299, + "epoch": 0.1000384763370527, + "flos": 568764260352.0, + "grad_norm": 0.08763817133734854, + "language_loss": 0.96583092, + "learning_rate": 0.0009871951533419476, + "loss": 0.97730637, + "num_input_tokens_seen": 43207600, + "router_z_loss_mlp": 0.33325195, + "step": 520, + "time_per_iteration": 2.638664484024048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140597, + "balance_loss_mlp": 1.108482, + "epoch": 0.10023085802231628, + "flos": 545515057152.0, + "grad_norm": 0.10925869968591369, + "language_loss": 0.88377398, + "learning_rate": 0.0009871250043232132, + "loss": 0.89517999, + "num_input_tokens_seen": 43285104, + "router_z_loss_mlp": 0.32104492, + "step": 521, + "time_per_iteration": 2.70491886138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136934, + "balance_loss_mlp": 1.10555792, + "epoch": 0.10042323970757984, + "flos": 503208557568.0, + "grad_norm": 0.07694864026409119, + "language_loss": 0.87725985, + "learning_rate": 0.0009870546661857797, + "loss": 0.8886292, + "num_input_tokens_seen": 43353312, + "router_z_loss_mlp": 0.31347656, + "step": 522, + "time_per_iteration": 2.653456211090088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126678, + "balance_loss_mlp": 1.09380031, + "epoch": 0.1006156213928434, + "flos": 770084402688.0, + "grad_norm": 0.08414569380370593, + "language_loss": 0.95787346, + "learning_rate": 0.0009869841389569553, + "loss": 0.96914017, + "num_input_tokens_seen": 43427680, + "router_z_loss_mlp": 0.32885742, + "step": 523, + "time_per_iteration": 2.9442663192749023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116557, + "balance_loss_mlp": 1.08625388, + "epoch": 0.10080800307810696, + "flos": 489786338304.0, + "grad_norm": 0.06587351152736676, + "language_loss": 0.88897854, + "learning_rate": 0.0009869134226641206, + "loss": 0.90014416, + "num_input_tokens_seen": 43495200, + "router_z_loss_mlp": 0.30297852, + "step": 524, + "time_per_iteration": 2.5559868812561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110225, + "balance_loss_mlp": 1.07746601, + "epoch": 0.10100038476337053, + "flos": 454478252544.0, + "grad_norm": 0.09167866019985617, + "language_loss": 0.88383424, + "learning_rate": 0.0009868425173347303, + "loss": 0.89493656, + "num_input_tokens_seen": 43566256, + "router_z_loss_mlp": 0.32788086, + "step": 525, + "time_per_iteration": 2.645116090774536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111349, + "balance_loss_mlp": 1.08216143, + "epoch": 0.10119276644863409, + "flos": 556155348480.0, + "grad_norm": 0.07288604326691553, + "language_loss": 0.96749896, + "learning_rate": 0.0009867714229963125, + "loss": 0.97863394, + "num_input_tokens_seen": 43639696, + "router_z_loss_mlp": 0.31323242, + "step": 526, + "time_per_iteration": 2.730703592300415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106354, + "balance_loss_mlp": 1.07540703, + "epoch": 0.10138514813389765, + "flos": 515990587392.0, + "grad_norm": 0.07095113284061857, + "language_loss": 0.93916923, + "learning_rate": 0.000986700139676468, + "loss": 0.95023274, + "num_input_tokens_seen": 43703872, + "router_z_loss_mlp": 0.30932617, + "step": 527, + "time_per_iteration": 2.5836338996887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110446, + "balance_loss_mlp": 1.07833052, + "epoch": 0.10157752981916121, + "flos": 500323322880.0, + "grad_norm": 0.06933811905919615, + "language_loss": 0.91673893, + "learning_rate": 0.0009866286674028717, + "loss": 0.92784333, + "num_input_tokens_seen": 43774416, + "router_z_loss_mlp": 0.32104492, + "step": 528, + "time_per_iteration": 2.7084739208221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101831, + "balance_loss_mlp": 1.07100391, + "epoch": 0.10176991150442478, + "flos": 656444376576.0, + "grad_norm": 0.07189407365130172, + "language_loss": 0.88586026, + "learning_rate": 0.0009865570062032717, + "loss": 0.8968786, + "num_input_tokens_seen": 43853376, + "router_z_loss_mlp": 0.30810547, + "step": 529, + "time_per_iteration": 2.9141628742218018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103952, + "balance_loss_mlp": 1.07443571, + "epoch": 0.10196229318968834, + "flos": 572974953984.0, + "grad_norm": 0.06841647032337263, + "language_loss": 0.93659967, + "learning_rate": 0.0009864851561054893, + "loss": 0.94763923, + "num_input_tokens_seen": 43929632, + "router_z_loss_mlp": 0.29516602, + "step": 530, + "time_per_iteration": 2.7539894580841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090977, + "balance_loss_mlp": 1.06110358, + "epoch": 0.1021546748749519, + "flos": 517946061312.0, + "grad_norm": 0.07340246055426732, + "language_loss": 0.91722125, + "learning_rate": 0.0009864131171374191, + "loss": 0.92813098, + "num_input_tokens_seen": 44002144, + "router_z_loss_mlp": 0.29882812, + "step": 531, + "time_per_iteration": 2.6921956539154053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109944, + "balance_loss_mlp": 1.06749225, + "epoch": 0.10234705656021546, + "flos": 609470286336.0, + "grad_norm": 0.07867637119915549, + "language_loss": 0.91107762, + "learning_rate": 0.0009863408893270292, + "loss": 0.92207205, + "num_input_tokens_seen": 44078272, + "router_z_loss_mlp": 0.31933594, + "step": 532, + "time_per_iteration": 2.7911570072174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106913, + "balance_loss_mlp": 1.07396317, + "epoch": 0.10253943824547904, + "flos": 601473710592.0, + "grad_norm": 0.08191923529880715, + "language_loss": 0.86522454, + "learning_rate": 0.0009862684727023605, + "loss": 0.87629366, + "num_input_tokens_seen": 44152304, + "router_z_loss_mlp": 0.3293457, + "step": 533, + "time_per_iteration": 2.7452800273895264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105875, + "balance_loss_mlp": 1.07466602, + "epoch": 0.1027318199307426, + "flos": 662647011840.0, + "grad_norm": 0.07282647554851075, + "language_loss": 0.90315968, + "learning_rate": 0.0009861958672915283, + "loss": 0.91421843, + "num_input_tokens_seen": 44226720, + "router_z_loss_mlp": 0.31201172, + "step": 534, + "time_per_iteration": 2.8041269779205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096602, + "balance_loss_mlp": 1.0673244, + "epoch": 0.10292420161600616, + "flos": 682962928128.0, + "grad_norm": 0.058349855756870184, + "language_loss": 0.90126884, + "learning_rate": 0.0009861230731227201, + "loss": 0.9122349, + "num_input_tokens_seen": 44303600, + "router_z_loss_mlp": 0.29248047, + "step": 535, + "time_per_iteration": 2.8627805709838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108033, + "balance_loss_mlp": 1.07615674, + "epoch": 0.10311658330126972, + "flos": 490042414080.0, + "grad_norm": 0.091555564896082, + "language_loss": 0.91954774, + "learning_rate": 0.0009860500902241973, + "loss": 0.93062806, + "num_input_tokens_seen": 44370960, + "router_z_loss_mlp": 0.31884766, + "step": 536, + "time_per_iteration": 2.6052157878875732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120335, + "balance_loss_mlp": 1.08800602, + "epoch": 0.10330896498653329, + "flos": 431508446208.0, + "grad_norm": 0.0585767653270487, + "language_loss": 0.96574026, + "learning_rate": 0.0009859769186242942, + "loss": 0.97694361, + "num_input_tokens_seen": 44435584, + "router_z_loss_mlp": 0.32324219, + "step": 537, + "time_per_iteration": 2.51180362701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116517, + "balance_loss_mlp": 1.08571362, + "epoch": 0.10350134667179685, + "flos": 549330052608.0, + "grad_norm": 0.0744119924563098, + "language_loss": 0.8926785, + "learning_rate": 0.0009859035583514187, + "loss": 0.90384364, + "num_input_tokens_seen": 44505456, + "router_z_loss_mlp": 0.30834961, + "step": 538, + "time_per_iteration": 2.6369993686676025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146613, + "balance_loss_mlp": 1.11380613, + "epoch": 0.10369372835706041, + "flos": 640327569408.0, + "grad_norm": 0.09976070350989504, + "language_loss": 0.90389431, + "learning_rate": 0.0009858300094340517, + "loss": 0.91536051, + "num_input_tokens_seen": 44580208, + "router_z_loss_mlp": 0.328125, + "step": 539, + "time_per_iteration": 2.7695086002349854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150737, + "balance_loss_mlp": 1.11838388, + "epoch": 0.10388611004232397, + "flos": 521500598784.0, + "grad_norm": 0.08771902350159133, + "language_loss": 0.85304511, + "learning_rate": 0.0009857562719007473, + "loss": 0.8645525, + "num_input_tokens_seen": 44646576, + "router_z_loss_mlp": 0.32324219, + "step": 540, + "time_per_iteration": 2.59881329536438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144681, + "balance_loss_mlp": 1.11320961, + "epoch": 0.10407849172758753, + "flos": 702111946752.0, + "grad_norm": 0.07496368213999542, + "language_loss": 0.88249481, + "learning_rate": 0.0009856823457801331, + "loss": 0.89394164, + "num_input_tokens_seen": 44726752, + "router_z_loss_mlp": 0.31494141, + "step": 541, + "time_per_iteration": 2.873481035232544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119735, + "balance_loss_mlp": 1.08738184, + "epoch": 0.1042708734128511, + "flos": 502652736000.0, + "grad_norm": 0.06973546911765124, + "language_loss": 0.94998306, + "learning_rate": 0.00098560823110091, + "loss": 0.96118045, + "num_input_tokens_seen": 44795824, + "router_z_loss_mlp": 0.32373047, + "step": 542, + "time_per_iteration": 2.661374807357788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118044, + "balance_loss_mlp": 1.08757377, + "epoch": 0.10446325509811466, + "flos": 485331153408.0, + "grad_norm": 0.0792045331206184, + "language_loss": 0.95517921, + "learning_rate": 0.000985533927891851, + "loss": 0.96635967, + "num_input_tokens_seen": 44868496, + "router_z_loss_mlp": 0.30419922, + "step": 543, + "time_per_iteration": 2.7264697551727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096256, + "balance_loss_mlp": 1.06502366, + "epoch": 0.10465563678337822, + "flos": 568365590016.0, + "grad_norm": 0.0919664039836503, + "language_loss": 0.93718112, + "learning_rate": 0.0009854594361818044, + "loss": 0.94814372, + "num_input_tokens_seen": 44939888, + "router_z_loss_mlp": 0.31201172, + "step": 544, + "time_per_iteration": 2.6869821548461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099422, + "balance_loss_mlp": 1.0683322, + "epoch": 0.10484801846864178, + "flos": 625806853632.0, + "grad_norm": 0.1054615502202609, + "language_loss": 0.927598, + "learning_rate": 0.0009853847559996897, + "loss": 0.9385922, + "num_input_tokens_seen": 45012720, + "router_z_loss_mlp": 0.31103516, + "step": 545, + "time_per_iteration": 2.7953526973724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100313, + "balance_loss_mlp": 1.06772113, + "epoch": 0.10504040015390535, + "flos": 743063874048.0, + "grad_norm": 0.0768702593450629, + "language_loss": 0.92008656, + "learning_rate": 0.0009853098873745, + "loss": 0.93108964, + "num_input_tokens_seen": 45093744, + "router_z_loss_mlp": 0.32592773, + "step": 546, + "time_per_iteration": 3.0344293117523193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106321, + "balance_loss_mlp": 1.07430172, + "epoch": 0.10523278183916891, + "flos": 586382616576.0, + "grad_norm": 0.072035501246702, + "language_loss": 0.90983582, + "learning_rate": 0.0009852348303353027, + "loss": 0.92089903, + "num_input_tokens_seen": 45172784, + "router_z_loss_mlp": 0.32006836, + "step": 547, + "time_per_iteration": 2.7647972106933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110403, + "balance_loss_mlp": 1.07100892, + "epoch": 0.10542516352443247, + "flos": 869270552064.0, + "grad_norm": 0.07817580313906373, + "language_loss": 0.84611928, + "learning_rate": 0.000985159584911237, + "loss": 0.85715961, + "num_input_tokens_seen": 45255600, + "router_z_loss_mlp": 0.33007812, + "step": 548, + "time_per_iteration": 3.143122434616089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104478, + "balance_loss_mlp": 1.07212472, + "epoch": 0.10561754520969603, + "flos": 505182970368.0, + "grad_norm": 0.08898596974063745, + "language_loss": 0.91126573, + "learning_rate": 0.0009850841511315162, + "loss": 0.92231047, + "num_input_tokens_seen": 45325072, + "router_z_loss_mlp": 0.32348633, + "step": 549, + "time_per_iteration": 2.6164846420288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112982, + "balance_loss_mlp": 1.07946038, + "epoch": 0.1058099268949596, + "flos": 559690947072.0, + "grad_norm": 0.06224197989448247, + "language_loss": 0.92054999, + "learning_rate": 0.0009850085290254256, + "loss": 0.93167984, + "num_input_tokens_seen": 45401440, + "router_z_loss_mlp": 0.33520508, + "step": 550, + "time_per_iteration": 2.7473480701446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110676, + "balance_loss_mlp": 1.07431078, + "epoch": 0.10600230858022316, + "flos": 561773048832.0, + "grad_norm": 0.05678957528127819, + "language_loss": 0.88957977, + "learning_rate": 0.0009849327186223246, + "loss": 0.90064728, + "num_input_tokens_seen": 45479264, + "router_z_loss_mlp": 0.32446289, + "step": 551, + "time_per_iteration": 2.805126905441284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094878, + "balance_loss_mlp": 1.06297779, + "epoch": 0.10619469026548672, + "flos": 494079989760.0, + "grad_norm": 0.07906939671673464, + "language_loss": 0.95596325, + "learning_rate": 0.000984856719951646, + "loss": 0.96691203, + "num_input_tokens_seen": 45547328, + "router_z_loss_mlp": 0.31860352, + "step": 552, + "time_per_iteration": 2.5688273906707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105536, + "balance_loss_mlp": 1.07370734, + "epoch": 0.10638707195075028, + "flos": 675843678720.0, + "grad_norm": 0.06469368191660979, + "language_loss": 0.93170857, + "learning_rate": 0.0009847805330428943, + "loss": 0.94276392, + "num_input_tokens_seen": 45631152, + "router_z_loss_mlp": 0.31811523, + "step": 553, + "time_per_iteration": 2.8858227729797363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105116, + "balance_loss_mlp": 1.07080746, + "epoch": 0.10657945363601386, + "flos": 487811925504.0, + "grad_norm": 0.07365688544553677, + "language_loss": 0.94454086, + "learning_rate": 0.0009847041579256481, + "loss": 0.95559192, + "num_input_tokens_seen": 45698208, + "router_z_loss_mlp": 0.34326172, + "step": 554, + "time_per_iteration": 2.5912039279937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114154, + "balance_loss_mlp": 1.08158636, + "epoch": 0.10677183532127742, + "flos": 482706376704.0, + "grad_norm": 0.06731486395760358, + "language_loss": 0.95310724, + "learning_rate": 0.0009846275946295592, + "loss": 0.96424878, + "num_input_tokens_seen": 45766640, + "router_z_loss_mlp": 0.32568359, + "step": 555, + "time_per_iteration": 2.6071619987487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120557, + "balance_loss_mlp": 1.08755958, + "epoch": 0.10696421700654098, + "flos": 655917668352.0, + "grad_norm": 0.06239681935918944, + "language_loss": 0.88169777, + "learning_rate": 0.0009845508431843518, + "loss": 0.89290333, + "num_input_tokens_seen": 45851408, + "router_z_loss_mlp": 0.32983398, + "step": 556, + "time_per_iteration": 2.9906973838806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122479, + "balance_loss_mlp": 1.08986306, + "epoch": 0.10715659869180454, + "flos": 567483881472.0, + "grad_norm": 0.06803394611182671, + "language_loss": 0.89010829, + "learning_rate": 0.0009844739036198233, + "loss": 0.90133309, + "num_input_tokens_seen": 45919824, + "router_z_loss_mlp": 0.32592773, + "step": 557, + "time_per_iteration": 2.6462793350219727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113246, + "balance_loss_mlp": 1.09927225, + "epoch": 0.10734898037706811, + "flos": 540432829440.0, + "grad_norm": 0.0683091886411484, + "language_loss": 0.96000761, + "learning_rate": 0.0009843967759658448, + "loss": 0.97133219, + "num_input_tokens_seen": 45991024, + "router_z_loss_mlp": 0.33203125, + "step": 558, + "time_per_iteration": 2.664320707321167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01369087, + "balance_loss_mlp": 1.3546865, + "epoch": 0.10754136206233167, + "flos": 1475870008320.0, + "grad_norm": 0.12144998025248735, + "language_loss": 0.72767758, + "learning_rate": 0.0009843194602523592, + "loss": 0.74136841, + "num_input_tokens_seen": 46212736, + "router_z_loss_mlp": 0.14355469, + "step": 559, + "time_per_iteration": 4.836310148239136 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124853, + "balance_loss_mlp": 1.0925231, + "epoch": 0.10773374374759523, + "flos": 512155243008.0, + "grad_norm": 0.06725764235558847, + "language_loss": 0.96045369, + "learning_rate": 0.000984241956509384, + "loss": 0.97170222, + "num_input_tokens_seen": 46283920, + "router_z_loss_mlp": 0.32324219, + "step": 560, + "time_per_iteration": 2.7409372329711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134795, + "balance_loss_mlp": 1.10005689, + "epoch": 0.10792612543285879, + "flos": 496261016064.0, + "grad_norm": 0.08502468521942065, + "language_loss": 0.91520619, + "learning_rate": 0.0009841642647670078, + "loss": 0.92655414, + "num_input_tokens_seen": 46349664, + "router_z_loss_mlp": 0.34741211, + "step": 561, + "time_per_iteration": 2.5360167026519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134435, + "balance_loss_mlp": 1.10050821, + "epoch": 0.10811850711812235, + "flos": 735131317248.0, + "grad_norm": 0.08550854990342285, + "language_loss": 0.86122006, + "learning_rate": 0.0009840863850553944, + "loss": 0.87256444, + "num_input_tokens_seen": 46432688, + "router_z_loss_mlp": 0.33911133, + "step": 562, + "time_per_iteration": 3.0013930797576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118751, + "balance_loss_mlp": 1.08604038, + "epoch": 0.10831088880338592, + "flos": 611257024512.0, + "grad_norm": 0.07414056330929218, + "language_loss": 0.92513216, + "learning_rate": 0.0009840083174047782, + "loss": 0.93631971, + "num_input_tokens_seen": 46507216, + "router_z_loss_mlp": 0.3269043, + "step": 563, + "time_per_iteration": 2.761746883392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125103, + "balance_loss_mlp": 1.09353685, + "epoch": 0.10850327048864948, + "flos": 556022928384.0, + "grad_norm": 0.06849160846851732, + "language_loss": 0.86520386, + "learning_rate": 0.0009839300618454685, + "loss": 0.87645483, + "num_input_tokens_seen": 46590464, + "router_z_loss_mlp": 0.31518555, + "step": 564, + "time_per_iteration": 2.833545684814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124691, + "balance_loss_mlp": 1.09291005, + "epoch": 0.10869565217391304, + "flos": 602902476288.0, + "grad_norm": 0.06688991061359367, + "language_loss": 0.92471159, + "learning_rate": 0.0009838516184078466, + "loss": 0.9359585, + "num_input_tokens_seen": 46666240, + "router_z_loss_mlp": 0.31762695, + "step": 565, + "time_per_iteration": 2.838482618331909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112559, + "balance_loss_mlp": 1.09345102, + "epoch": 0.1088880338591766, + "flos": 525922288128.0, + "grad_norm": 0.08266802783800845, + "language_loss": 0.89073956, + "learning_rate": 0.0009837729871223669, + "loss": 0.90199542, + "num_input_tokens_seen": 46734288, + "router_z_loss_mlp": 0.3215332, + "step": 566, + "time_per_iteration": 2.6670589447021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134729, + "balance_loss_mlp": 1.10073042, + "epoch": 0.10908041554444017, + "flos": 619986921984.0, + "grad_norm": 0.06816497946354988, + "language_loss": 0.89503658, + "learning_rate": 0.0009836941680195568, + "loss": 0.90638387, + "num_input_tokens_seen": 46809920, + "router_z_loss_mlp": 0.34033203, + "step": 567, + "time_per_iteration": 2.7894582748413086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131677, + "balance_loss_mlp": 1.09691525, + "epoch": 0.10927279722970373, + "flos": 897740195328.0, + "grad_norm": 0.07371226629870802, + "language_loss": 0.8534497, + "learning_rate": 0.0009836151611300166, + "loss": 0.86476642, + "num_input_tokens_seen": 46889984, + "router_z_loss_mlp": 0.34765625, + "step": 568, + "time_per_iteration": 3.204500913619995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116051, + "balance_loss_mlp": 1.08467555, + "epoch": 0.10946517891496729, + "flos": 528408852480.0, + "grad_norm": 0.061952855977424344, + "language_loss": 0.96103537, + "learning_rate": 0.0009835359664844194, + "loss": 0.97219586, + "num_input_tokens_seen": 46959536, + "router_z_loss_mlp": 0.3137207, + "step": 569, + "time_per_iteration": 2.6154720783233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124163, + "balance_loss_mlp": 1.11014414, + "epoch": 0.10965756060023085, + "flos": 1559944714752.0, + "grad_norm": 0.03358522647050957, + "language_loss": 0.81036806, + "learning_rate": 0.0009834565841135114, + "loss": 0.82160974, + "num_input_tokens_seen": 47196960, + "router_z_loss_mlp": 0.140625, + "step": 570, + "time_per_iteration": 4.907090187072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112487, + "balance_loss_mlp": 1.09406638, + "epoch": 0.10984994228549443, + "flos": 512820163584.0, + "grad_norm": 0.08674533322611513, + "language_loss": 0.9339065, + "learning_rate": 0.0009833770140481118, + "loss": 0.9451552, + "num_input_tokens_seen": 47266560, + "router_z_loss_mlp": 0.30786133, + "step": 571, + "time_per_iteration": 2.694821357727051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121358, + "balance_loss_mlp": 1.09072113, + "epoch": 0.11004232397075799, + "flos": 954314307072.0, + "grad_norm": 0.07582699316256973, + "language_loss": 0.84126109, + "learning_rate": 0.000983297256319112, + "loss": 0.85247469, + "num_input_tokens_seen": 47348512, + "router_z_loss_mlp": 0.30664062, + "step": 572, + "time_per_iteration": 3.208728313446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144326, + "balance_loss_mlp": 1.11097169, + "epoch": 0.11023470565602154, + "flos": 487921024512.0, + "grad_norm": 0.07530566153242002, + "language_loss": 0.8789041, + "learning_rate": 0.000983217310957477, + "loss": 0.89034736, + "num_input_tokens_seen": 47425392, + "router_z_loss_mlp": 0.33349609, + "step": 573, + "time_per_iteration": 2.7521331310272217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113474, + "balance_loss_mlp": 1.1014812, + "epoch": 0.1104270873412851, + "flos": 655521970176.0, + "grad_norm": 0.08427122985019045, + "language_loss": 0.91161472, + "learning_rate": 0.000983137177994244, + "loss": 0.92296207, + "num_input_tokens_seen": 47502336, + "router_z_loss_mlp": 0.33300781, + "step": 574, + "time_per_iteration": 2.869795083999634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105984, + "balance_loss_mlp": 1.0752039, + "epoch": 0.11061946902654868, + "flos": 723097165824.0, + "grad_norm": 0.0803000190442887, + "language_loss": 0.87202144, + "learning_rate": 0.0009830568574605235, + "loss": 0.88308132, + "num_input_tokens_seen": 47583552, + "router_z_loss_mlp": 0.30737305, + "step": 575, + "time_per_iteration": 2.952505111694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111674, + "balance_loss_mlp": 1.07963109, + "epoch": 0.11081185071181224, + "flos": 835113397248.0, + "grad_norm": 0.07764025760375837, + "language_loss": 0.89234924, + "learning_rate": 0.0009829763493874992, + "loss": 0.90346599, + "num_input_tokens_seen": 47663440, + "router_z_loss_mlp": 0.3203125, + "step": 576, + "time_per_iteration": 3.0367727279663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110641, + "balance_loss_mlp": 1.07508206, + "epoch": 0.1110042323970758, + "flos": 608776252416.0, + "grad_norm": 0.06795308301133055, + "language_loss": 0.94366598, + "learning_rate": 0.0009828956538064264, + "loss": 0.95473009, + "num_input_tokens_seen": 47741920, + "router_z_loss_mlp": 0.31347656, + "step": 577, + "time_per_iteration": 2.783268928527832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091394, + "balance_loss_mlp": 1.0610671, + "epoch": 0.11119661408233936, + "flos": 595643604480.0, + "grad_norm": 0.0662915232098912, + "language_loss": 0.9183138, + "learning_rate": 0.0009828147707486344, + "loss": 0.92922771, + "num_input_tokens_seen": 47815136, + "router_z_loss_mlp": 0.30297852, + "step": 578, + "time_per_iteration": 2.6628670692443848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092993, + "balance_loss_mlp": 1.06109214, + "epoch": 0.11138899576760293, + "flos": 555573385728.0, + "grad_norm": 0.07355059798421615, + "language_loss": 0.87444091, + "learning_rate": 0.0009827337002455245, + "loss": 0.88537085, + "num_input_tokens_seen": 47881360, + "router_z_loss_mlp": 0.31884766, + "step": 579, + "time_per_iteration": 2.616842031478882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087398, + "balance_loss_mlp": 1.05857313, + "epoch": 0.11158137745286649, + "flos": 689418667008.0, + "grad_norm": 0.05531737995895799, + "language_loss": 0.89474124, + "learning_rate": 0.0009826524423285712, + "loss": 0.90561521, + "num_input_tokens_seen": 47962720, + "router_z_loss_mlp": 0.28808594, + "step": 580, + "time_per_iteration": 2.896409749984741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093471, + "balance_loss_mlp": 1.06393051, + "epoch": 0.11177375913813005, + "flos": 762688728576.0, + "grad_norm": 0.06807232662928764, + "language_loss": 0.9046967, + "learning_rate": 0.0009825709970293218, + "loss": 0.91563141, + "num_input_tokens_seen": 48035472, + "router_z_loss_mlp": 0.2956543, + "step": 581, + "time_per_iteration": 2.8843319416046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096208, + "balance_loss_mlp": 1.0669775, + "epoch": 0.11196614082339361, + "flos": 806211588096.0, + "grad_norm": 0.07053725402235117, + "language_loss": 0.96166003, + "learning_rate": 0.0009824893643793956, + "loss": 0.9726221, + "num_input_tokens_seen": 48116944, + "router_z_loss_mlp": 0.29248047, + "step": 582, + "time_per_iteration": 3.04577898979187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104715, + "balance_loss_mlp": 1.07305288, + "epoch": 0.11215852250865718, + "flos": 558350931456.0, + "grad_norm": 0.10752491555358674, + "language_loss": 0.89033759, + "learning_rate": 0.0009824075444104857, + "loss": 0.90138471, + "num_input_tokens_seen": 48187808, + "router_z_loss_mlp": 0.31689453, + "step": 583, + "time_per_iteration": 2.682020902633667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125233, + "balance_loss_mlp": 1.09497714, + "epoch": 0.11235090419392074, + "flos": 513322140672.0, + "grad_norm": 0.06606619546840543, + "language_loss": 0.94941097, + "learning_rate": 0.000982325537154357, + "loss": 0.9606632, + "num_input_tokens_seen": 48254464, + "router_z_loss_mlp": 0.30224609, + "step": 584, + "time_per_iteration": 2.577632427215576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122311, + "balance_loss_mlp": 1.09045827, + "epoch": 0.1125432858791843, + "flos": 491209311744.0, + "grad_norm": 0.07452844115700766, + "language_loss": 0.95190644, + "learning_rate": 0.0009822433426428484, + "loss": 0.96312958, + "num_input_tokens_seen": 48318784, + "router_z_loss_mlp": 0.31860352, + "step": 585, + "time_per_iteration": 2.560591220855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126565, + "balance_loss_mlp": 1.09280539, + "epoch": 0.11273566756444786, + "flos": 510476193792.0, + "grad_norm": 0.11434848401200806, + "language_loss": 0.87964213, + "learning_rate": 0.0009821609609078697, + "loss": 0.89090776, + "num_input_tokens_seen": 48389248, + "router_z_loss_mlp": 0.3371582, + "step": 586, + "time_per_iteration": 2.633925437927246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118464, + "balance_loss_mlp": 1.08785152, + "epoch": 0.11292804924971142, + "flos": 622149009408.0, + "grad_norm": 0.08000190427267627, + "language_loss": 0.905334, + "learning_rate": 0.0009820783919814045, + "loss": 0.91651857, + "num_input_tokens_seen": 48463312, + "router_z_loss_mlp": 0.3059082, + "step": 587, + "time_per_iteration": 2.806704044342041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111289, + "balance_loss_mlp": 1.07857847, + "epoch": 0.113120430934975, + "flos": 477811823616.0, + "grad_norm": 0.09357252991594707, + "language_loss": 0.83955467, + "learning_rate": 0.0009819956358955095, + "loss": 0.8506676, + "num_input_tokens_seen": 48531856, + "router_z_loss_mlp": 0.32714844, + "step": 588, + "time_per_iteration": 2.5903711318969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109097, + "balance_loss_mlp": 1.07455039, + "epoch": 0.11331281262023855, + "flos": 466801975296.0, + "grad_norm": 0.06610764616840299, + "language_loss": 0.85348701, + "learning_rate": 0.0009819126926823127, + "loss": 0.86457801, + "num_input_tokens_seen": 48596640, + "router_z_loss_mlp": 0.34570312, + "step": 589, + "time_per_iteration": 2.5726494789123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108183, + "balance_loss_mlp": 1.07535291, + "epoch": 0.11350519430550211, + "flos": 650164727808.0, + "grad_norm": 0.06035980490561805, + "language_loss": 0.87806922, + "learning_rate": 0.000981829562374016, + "loss": 0.8891511, + "num_input_tokens_seen": 48669648, + "router_z_loss_mlp": 0.328125, + "step": 590, + "time_per_iteration": 2.7960643768310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112987, + "balance_loss_mlp": 1.08041859, + "epoch": 0.11369757599076567, + "flos": 557547798528.0, + "grad_norm": 0.08830164474684658, + "language_loss": 0.98550045, + "learning_rate": 0.0009817462450028933, + "loss": 0.99663031, + "num_input_tokens_seen": 48737392, + "router_z_loss_mlp": 0.32568359, + "step": 591, + "time_per_iteration": 2.654860734939575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107596, + "balance_loss_mlp": 1.07526684, + "epoch": 0.11388995767602925, + "flos": 570774988800.0, + "grad_norm": 0.06245390963608315, + "language_loss": 0.86587834, + "learning_rate": 0.0009816627406012916, + "loss": 0.87695432, + "num_input_tokens_seen": 48817136, + "router_z_loss_mlp": 0.32348633, + "step": 592, + "time_per_iteration": 2.8017733097076416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101808, + "balance_loss_mlp": 1.07074225, + "epoch": 0.1140823393612928, + "flos": 740069540352.0, + "grad_norm": 0.06581053360364857, + "language_loss": 0.8595314, + "learning_rate": 0.0009815790492016295, + "loss": 0.87054944, + "num_input_tokens_seen": 48895808, + "router_z_loss_mlp": 0.31030273, + "step": 593, + "time_per_iteration": 2.9602174758911133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097875, + "balance_loss_mlp": 1.06666636, + "epoch": 0.11427472104655637, + "flos": 698694211584.0, + "grad_norm": 0.07124053574400792, + "language_loss": 0.87982339, + "learning_rate": 0.0009814951708363993, + "loss": 0.89080215, + "num_input_tokens_seen": 48967456, + "router_z_loss_mlp": 0.31201172, + "step": 594, + "time_per_iteration": 2.818460702896118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167391, + "balance_loss_mlp": 1.15413451, + "epoch": 0.11446710273181993, + "flos": 1476387952128.0, + "grad_norm": 0.04038129773095179, + "language_loss": 0.77990985, + "learning_rate": 0.0009814111055381654, + "loss": 0.79158378, + "num_input_tokens_seen": 49193152, + "router_z_loss_mlp": 0.1328125, + "step": 595, + "time_per_iteration": 4.776912450790405 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110386, + "balance_loss_mlp": 1.07250798, + "epoch": 0.1146594844170835, + "flos": 494641603584.0, + "grad_norm": 0.1404346857169784, + "language_loss": 0.89489102, + "learning_rate": 0.0009813268533395648, + "loss": 0.90592968, + "num_input_tokens_seen": 49260960, + "router_z_loss_mlp": 0.3137207, + "step": 596, + "time_per_iteration": 2.562816858291626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115324, + "balance_loss_mlp": 1.08344746, + "epoch": 0.11485186610234706, + "flos": 474596319744.0, + "grad_norm": 0.07456374098915484, + "language_loss": 0.89145029, + "learning_rate": 0.0009812424142733073, + "loss": 0.90260351, + "num_input_tokens_seen": 49327616, + "router_z_loss_mlp": 0.31884766, + "step": 597, + "time_per_iteration": 2.5198655128479004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123971, + "balance_loss_mlp": 1.0946219, + "epoch": 0.11504424778761062, + "flos": 730858014720.0, + "grad_norm": 0.05033183127205697, + "language_loss": 0.86898923, + "learning_rate": 0.000981157788372175, + "loss": 0.88022888, + "num_input_tokens_seen": 49412864, + "router_z_loss_mlp": 0.29345703, + "step": 598, + "time_per_iteration": 3.004558563232422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155063, + "balance_loss_mlp": 1.12290049, + "epoch": 0.11523662947287418, + "flos": 545539788288.0, + "grad_norm": 0.07554757352201513, + "language_loss": 0.90216064, + "learning_rate": 0.0009810729756690223, + "loss": 0.91371131, + "num_input_tokens_seen": 49483584, + "router_z_loss_mlp": 0.3215332, + "step": 599, + "time_per_iteration": 2.7165520191192627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149643, + "balance_loss_mlp": 1.11790919, + "epoch": 0.11542901115813775, + "flos": 774737436672.0, + "grad_norm": 0.08801397326806587, + "language_loss": 0.92855275, + "learning_rate": 0.0009809879761967766, + "loss": 0.94004917, + "num_input_tokens_seen": 49563568, + "router_z_loss_mlp": 0.31738281, + "step": 600, + "time_per_iteration": 2.9548492431640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115619, + "balance_loss_mlp": 1.12004542, + "epoch": 0.11562139284340131, + "flos": 730585972224.0, + "grad_norm": 0.08285308963026158, + "language_loss": 0.87716347, + "learning_rate": 0.0009809027899884378, + "loss": 0.8887254, + "num_input_tokens_seen": 49640800, + "router_z_loss_mlp": 0.36157227, + "step": 601, + "time_per_iteration": 2.9107346534729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131924, + "balance_loss_mlp": 1.10085821, + "epoch": 0.11581377452866487, + "flos": 535589148672.0, + "grad_norm": 0.07059046613839054, + "language_loss": 0.89834028, + "learning_rate": 0.0009808174170770779, + "loss": 0.90965956, + "num_input_tokens_seen": 49721872, + "router_z_loss_mlp": 0.31079102, + "step": 602, + "time_per_iteration": 2.79127836227417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217718, + "balance_loss_mlp": 1.20541608, + "epoch": 0.11600615621392843, + "flos": 1554968613888.0, + "grad_norm": 0.07528653751738872, + "language_loss": 0.84898245, + "learning_rate": 0.0009807318574958418, + "loss": 0.86115962, + "num_input_tokens_seen": 49951472, + "router_z_loss_mlp": 0.12304688, + "step": 603, + "time_per_iteration": 4.862261772155762 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103115, + "balance_loss_mlp": 1.07238269, + "epoch": 0.116198537899192, + "flos": 537178037760.0, + "grad_norm": 0.08106568577848162, + "language_loss": 0.94434869, + "learning_rate": 0.0009806461112779462, + "loss": 0.95537978, + "num_input_tokens_seen": 50021136, + "router_z_loss_mlp": 0.30737305, + "step": 604, + "time_per_iteration": 2.600008249282837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097342, + "balance_loss_mlp": 1.06427336, + "epoch": 0.11639091958445556, + "flos": 453970483200.0, + "grad_norm": 0.09761910402267754, + "language_loss": 0.89590895, + "learning_rate": 0.0009805601784566814, + "loss": 0.90688241, + "num_input_tokens_seen": 50083888, + "router_z_loss_mlp": 0.33056641, + "step": 605, + "time_per_iteration": 2.4687013626098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097807, + "balance_loss_mlp": 1.06635928, + "epoch": 0.11658330126971912, + "flos": 554815332864.0, + "grad_norm": 0.0628453025897625, + "language_loss": 0.96235836, + "learning_rate": 0.0009804740590654089, + "loss": 0.97333646, + "num_input_tokens_seen": 50151744, + "router_z_loss_mlp": 0.31469727, + "step": 606, + "time_per_iteration": 2.654134750366211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109435, + "balance_loss_mlp": 1.0789417, + "epoch": 0.11677568295498268, + "flos": 716025968640.0, + "grad_norm": 0.07837472156111998, + "language_loss": 0.90884066, + "learning_rate": 0.0009803877531375635, + "loss": 0.91993499, + "num_input_tokens_seen": 50221248, + "router_z_loss_mlp": 0.30493164, + "step": 607, + "time_per_iteration": 2.825778007507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112529, + "balance_loss_mlp": 1.08074808, + "epoch": 0.11696806464024626, + "flos": 609474668544.0, + "grad_norm": 0.07263848878870109, + "language_loss": 0.91923869, + "learning_rate": 0.0009803012607066523, + "loss": 0.93036401, + "num_input_tokens_seen": 50293792, + "router_z_loss_mlp": 0.31787109, + "step": 608, + "time_per_iteration": 2.721005916595459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101062, + "balance_loss_mlp": 1.06980491, + "epoch": 0.11716044632550981, + "flos": 520127087616.0, + "grad_norm": 0.06980646294906427, + "language_loss": 0.9077962, + "learning_rate": 0.0009802145818062543, + "loss": 0.91880679, + "num_input_tokens_seen": 50367760, + "router_z_loss_mlp": 0.31225586, + "step": 609, + "time_per_iteration": 2.707643985748291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102401, + "balance_loss_mlp": 1.07035792, + "epoch": 0.11735282801077337, + "flos": 507246133248.0, + "grad_norm": 0.07162886221417876, + "language_loss": 0.9293434, + "learning_rate": 0.0009801277164700212, + "loss": 0.9403674, + "num_input_tokens_seen": 50435664, + "router_z_loss_mlp": 0.3203125, + "step": 610, + "time_per_iteration": 2.6389639377593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094537, + "balance_loss_mlp": 1.06323278, + "epoch": 0.11754520969603693, + "flos": 686339965440.0, + "grad_norm": 0.07220465483683103, + "language_loss": 0.90727574, + "learning_rate": 0.0009800406647316776, + "loss": 0.91822106, + "num_input_tokens_seen": 50514144, + "router_z_loss_mlp": 0.31274414, + "step": 611, + "time_per_iteration": 2.8033382892608643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066854, + "balance_loss_mlp": 1.05369329, + "epoch": 0.1177375913813005, + "flos": 1541673022464.0, + "grad_norm": 0.030783707978337852, + "language_loss": 0.76914459, + "learning_rate": 0.0009799534266250196, + "loss": 0.77981311, + "num_input_tokens_seen": 50738448, + "router_z_loss_mlp": 0.13183594, + "step": 612, + "time_per_iteration": 4.777275562286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116404, + "balance_loss_mlp": 1.08307314, + "epoch": 0.11792997306656407, + "flos": 520269682176.0, + "grad_norm": 0.07589987368124408, + "language_loss": 0.8961159, + "learning_rate": 0.000979866002183916, + "loss": 0.90727997, + "num_input_tokens_seen": 50809328, + "router_z_loss_mlp": 0.33325195, + "step": 613, + "time_per_iteration": 2.6848883628845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109453, + "balance_loss_mlp": 1.07719529, + "epoch": 0.11812235475182763, + "flos": 665980379136.0, + "grad_norm": 0.08667718058784188, + "language_loss": 0.91197205, + "learning_rate": 0.0009797783914423082, + "loss": 0.92306662, + "num_input_tokens_seen": 50887728, + "router_z_loss_mlp": 0.32250977, + "step": 614, + "time_per_iteration": 2.832414388656616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105189, + "balance_loss_mlp": 1.07140493, + "epoch": 0.11831473643709119, + "flos": 621021399552.0, + "grad_norm": 0.06050640051516142, + "language_loss": 0.85425436, + "learning_rate": 0.0009796905944342094, + "loss": 0.86530626, + "num_input_tokens_seen": 50966160, + "router_z_loss_mlp": 0.33813477, + "step": 615, + "time_per_iteration": 2.8220455646514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112849, + "balance_loss_mlp": 1.07913685, + "epoch": 0.11850711812235475, + "flos": 456438108672.0, + "grad_norm": 0.0714748534502384, + "language_loss": 0.893188, + "learning_rate": 0.0009796026111937057, + "loss": 0.90431643, + "num_input_tokens_seen": 51035712, + "router_z_loss_mlp": 0.3371582, + "step": 616, + "time_per_iteration": 2.590566873550415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102524, + "balance_loss_mlp": 1.07005119, + "epoch": 0.11869949980761832, + "flos": 513598565376.0, + "grad_norm": 0.06492309219220607, + "language_loss": 0.89778733, + "learning_rate": 0.0009795144417549552, + "loss": 0.90881252, + "num_input_tokens_seen": 51108656, + "router_z_loss_mlp": 0.32470703, + "step": 617, + "time_per_iteration": 2.672914505004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109626, + "balance_loss_mlp": 1.0773685, + "epoch": 0.11889188149288188, + "flos": 534732171264.0, + "grad_norm": 0.057544425945024125, + "language_loss": 0.90660846, + "learning_rate": 0.0009794260861521883, + "loss": 0.9177047, + "num_input_tokens_seen": 51185552, + "router_z_loss_mlp": 0.32250977, + "step": 618, + "time_per_iteration": 2.817354202270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102516, + "balance_loss_mlp": 1.07009149, + "epoch": 0.11908426317814544, + "flos": 498344527872.0, + "grad_norm": 0.0773697745436404, + "language_loss": 0.87738883, + "learning_rate": 0.0009793375444197075, + "loss": 0.88841403, + "num_input_tokens_seen": 51255808, + "router_z_loss_mlp": 0.32397461, + "step": 619, + "time_per_iteration": 2.607475996017456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011109, + "balance_loss_mlp": 1.07697332, + "epoch": 0.119276644863409, + "flos": 659598833664.0, + "grad_norm": 0.06767977381214116, + "language_loss": 0.86337721, + "learning_rate": 0.000979248816591888, + "loss": 0.87448615, + "num_input_tokens_seen": 51329408, + "router_z_loss_mlp": 0.33935547, + "step": 620, + "time_per_iteration": 2.758866548538208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098794, + "balance_loss_mlp": 1.06667948, + "epoch": 0.11946902654867257, + "flos": 758396487168.0, + "grad_norm": 0.06819106164994826, + "language_loss": 0.87032986, + "learning_rate": 0.0009791599027031766, + "loss": 0.88131785, + "num_input_tokens_seen": 51408784, + "router_z_loss_mlp": 0.32128906, + "step": 621, + "time_per_iteration": 3.029431104660034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088156, + "balance_loss_mlp": 1.05611241, + "epoch": 0.11966140823393613, + "flos": 680697533952.0, + "grad_norm": 0.0732554324646167, + "language_loss": 0.87112588, + "learning_rate": 0.0009790708027880932, + "loss": 0.88200748, + "num_input_tokens_seen": 51482592, + "router_z_loss_mlp": 0.32055664, + "step": 622, + "time_per_iteration": 2.855576992034912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056461, + "balance_loss_mlp": 1.04444504, + "epoch": 0.11985378991919969, + "flos": 1450268070912.0, + "grad_norm": 0.03732324883573809, + "language_loss": 0.77427292, + "learning_rate": 0.0009789815168812293, + "loss": 0.78483754, + "num_input_tokens_seen": 51712240, + "router_z_loss_mlp": 0.12011719, + "step": 623, + "time_per_iteration": 4.840993165969849 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108671, + "balance_loss_mlp": 1.0551914, + "epoch": 0.12004617160446325, + "flos": 527586780672.0, + "grad_norm": 0.07309096746678648, + "language_loss": 0.94236648, + "learning_rate": 0.0009788920450172487, + "loss": 0.9532336, + "num_input_tokens_seen": 51781440, + "router_z_loss_mlp": 0.31518555, + "step": 624, + "time_per_iteration": 2.6301677227020264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102663, + "balance_loss_mlp": 1.07023823, + "epoch": 0.12023855328972682, + "flos": 473980861440.0, + "grad_norm": 0.15739190650861204, + "language_loss": 0.91515559, + "learning_rate": 0.0009788023872308875, + "loss": 0.92618221, + "num_input_tokens_seen": 51845424, + "router_z_loss_mlp": 0.32421875, + "step": 625, + "time_per_iteration": 2.506446361541748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014454, + "balance_loss_mlp": 1.0033915, + "epoch": 0.12043093497499038, + "flos": 1530954155520.0, + "grad_norm": 0.02216054665264375, + "language_loss": 0.75428998, + "learning_rate": 0.0009787125435569539, + "loss": 0.76443458, + "num_input_tokens_seen": 52076496, + "router_z_loss_mlp": 0.11083984, + "step": 626, + "time_per_iteration": 4.713289260864258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114644, + "balance_loss_mlp": 1.08391225, + "epoch": 0.12062331666025394, + "flos": 539571469824.0, + "grad_norm": 0.0672242080300053, + "language_loss": 0.94766486, + "learning_rate": 0.0009786225140303285, + "loss": 0.95881128, + "num_input_tokens_seen": 52143072, + "router_z_loss_mlp": 0.30761719, + "step": 627, + "time_per_iteration": 2.61875057220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011267, + "balance_loss_mlp": 1.09503818, + "epoch": 0.1208156983455175, + "flos": 511634327040.0, + "grad_norm": 0.06510849521455, + "language_loss": 0.925771, + "learning_rate": 0.0009785322986859634, + "loss": 0.93703806, + "num_input_tokens_seen": 52211888, + "router_z_loss_mlp": 0.31640625, + "step": 628, + "time_per_iteration": 2.6567625999450684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141777, + "balance_loss_mlp": 1.11059177, + "epoch": 0.12100808003078108, + "flos": 596195043840.0, + "grad_norm": 0.06735600063735754, + "language_loss": 0.93719506, + "learning_rate": 0.0009784418975588838, + "loss": 0.94861281, + "num_input_tokens_seen": 52283696, + "router_z_loss_mlp": 0.31152344, + "step": 629, + "time_per_iteration": 2.697376012802124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122983, + "balance_loss_mlp": 1.09222674, + "epoch": 0.12120046171604464, + "flos": 522698019840.0, + "grad_norm": 0.47103484407124013, + "language_loss": 0.93927598, + "learning_rate": 0.0009783513106841862, + "loss": 0.95050573, + "num_input_tokens_seen": 52358624, + "router_z_loss_mlp": 0.30761719, + "step": 630, + "time_per_iteration": 2.7226808071136475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143332, + "balance_loss_mlp": 1.13179243, + "epoch": 0.1213928434013082, + "flos": 1553605277184.0, + "grad_norm": 0.056788624646596834, + "language_loss": 0.76732707, + "learning_rate": 0.00097826053809704, + "loss": 0.77876031, + "num_input_tokens_seen": 52591248, + "router_z_loss_mlp": 0.11523438, + "step": 631, + "time_per_iteration": 4.948111295700073 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228128, + "balance_loss_mlp": 1.19219875, + "epoch": 0.12158522508657175, + "flos": 495143580672.0, + "grad_norm": 0.06834333100250278, + "language_loss": 0.88515621, + "learning_rate": 0.0009781695798326854, + "loss": 0.89743745, + "num_input_tokens_seen": 52659920, + "router_z_loss_mlp": 0.35961914, + "step": 632, + "time_per_iteration": 2.5616555213928223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267845, + "balance_loss_mlp": 1.23050833, + "epoch": 0.12177760677183531, + "flos": 475335433728.0, + "grad_norm": 0.1009303482431908, + "language_loss": 0.88543177, + "learning_rate": 0.0009780784359264365, + "loss": 0.89811015, + "num_input_tokens_seen": 52728832, + "router_z_loss_mlp": 0.37329102, + "step": 633, + "time_per_iteration": 2.597935438156128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01265484, + "balance_loss_mlp": 1.25370574, + "epoch": 0.12196998845709889, + "flos": 1467630351360.0, + "grad_norm": 0.08843071113371018, + "language_loss": 0.74188697, + "learning_rate": 0.0009779871064136778, + "loss": 0.75454181, + "num_input_tokens_seen": 52949776, + "router_z_loss_mlp": 0.11767578, + "step": 634, + "time_per_iteration": 4.768415451049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235432, + "balance_loss_mlp": 1.19976473, + "epoch": 0.12216237014236245, + "flos": 586279309824.0, + "grad_norm": 0.0829698455775257, + "language_loss": 0.88074899, + "learning_rate": 0.000977895591329867, + "loss": 0.89310336, + "num_input_tokens_seen": 53027184, + "router_z_loss_mlp": 0.35668945, + "step": 635, + "time_per_iteration": 2.7918457984924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214994, + "balance_loss_mlp": 1.17720437, + "epoch": 0.12235475182762601, + "flos": 597721324032.0, + "grad_norm": 0.0916527997361875, + "language_loss": 0.87791145, + "learning_rate": 0.000977803890710533, + "loss": 0.89006138, + "num_input_tokens_seen": 53101072, + "router_z_loss_mlp": 0.37792969, + "step": 636, + "time_per_iteration": 2.7248313426971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186705, + "balance_loss_mlp": 1.1509428, + "epoch": 0.12254713351288957, + "flos": 497487550464.0, + "grad_norm": 0.0702522126388857, + "language_loss": 0.93856937, + "learning_rate": 0.0009777120045912774, + "loss": 0.95043641, + "num_input_tokens_seen": 53172992, + "router_z_loss_mlp": 0.35766602, + "step": 637, + "time_per_iteration": 2.6079726219177246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180704, + "balance_loss_mlp": 1.14236617, + "epoch": 0.12273951519815314, + "flos": 605565130752.0, + "grad_norm": 0.06645311005239844, + "language_loss": 0.90599251, + "learning_rate": 0.0009776199330077736, + "loss": 0.91779959, + "num_input_tokens_seen": 53248256, + "router_z_loss_mlp": 0.38330078, + "step": 638, + "time_per_iteration": 2.7671282291412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196025, + "balance_loss_mlp": 1.15940344, + "epoch": 0.1229318968834167, + "flos": 597578729472.0, + "grad_norm": 0.09015200479441979, + "language_loss": 0.93140519, + "learning_rate": 0.0009775276759957667, + "loss": 0.94336545, + "num_input_tokens_seen": 53318960, + "router_z_loss_mlp": 0.36621094, + "step": 639, + "time_per_iteration": 2.6990442276000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179898, + "balance_loss_mlp": 1.14265716, + "epoch": 0.12312427856868026, + "flos": 678082931712.0, + "grad_norm": 0.08188642922116089, + "language_loss": 0.90714514, + "learning_rate": 0.0009774352335910745, + "loss": 0.91894412, + "num_input_tokens_seen": 53389120, + "router_z_loss_mlp": 0.37280273, + "step": 640, + "time_per_iteration": 2.7950265407562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115004, + "balance_loss_mlp": 1.11658967, + "epoch": 0.12331666025394382, + "flos": 608656978944.0, + "grad_norm": 0.07361380744806716, + "language_loss": 0.95549798, + "learning_rate": 0.000977342605829586, + "loss": 0.96699834, + "num_input_tokens_seen": 53459056, + "router_z_loss_mlp": 0.3347168, + "step": 641, + "time_per_iteration": 2.6966538429260254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140018, + "balance_loss_mlp": 1.10497069, + "epoch": 0.12350904193920739, + "flos": 762172194816.0, + "grad_norm": 0.08211004604029591, + "language_loss": 0.86708105, + "learning_rate": 0.0009772497927472623, + "loss": 0.87848121, + "num_input_tokens_seen": 53541552, + "router_z_loss_mlp": 0.35083008, + "step": 642, + "time_per_iteration": 3.050595998764038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121507, + "balance_loss_mlp": 1.0852437, + "epoch": 0.12370142362447095, + "flos": 540699079680.0, + "grad_norm": 0.0716743258864478, + "language_loss": 0.85363436, + "learning_rate": 0.0009771567943801368, + "loss": 0.86484945, + "num_input_tokens_seen": 53611520, + "router_z_loss_mlp": 0.36254883, + "step": 643, + "time_per_iteration": 2.627019166946411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112067, + "balance_loss_mlp": 1.07744884, + "epoch": 0.12389380530973451, + "flos": 547848852480.0, + "grad_norm": 0.06992166814052157, + "language_loss": 0.89936745, + "learning_rate": 0.0009770636107643152, + "loss": 0.91048813, + "num_input_tokens_seen": 53683888, + "router_z_loss_mlp": 0.34643555, + "step": 644, + "time_per_iteration": 2.696233034133911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102963, + "balance_loss_mlp": 1.06846356, + "epoch": 0.12408618699499807, + "flos": 540048715776.0, + "grad_norm": 0.06268128655507912, + "language_loss": 0.88181639, + "learning_rate": 0.0009769702419359738, + "loss": 0.89284605, + "num_input_tokens_seen": 53751888, + "router_z_loss_mlp": 0.3449707, + "step": 645, + "time_per_iteration": 2.61401104927063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116268, + "balance_loss_mlp": 1.0810535, + "epoch": 0.12427856868026164, + "flos": 745451513856.0, + "grad_norm": 0.07610574883038115, + "language_loss": 0.89730537, + "learning_rate": 0.000976876687931362, + "loss": 0.90846807, + "num_input_tokens_seen": 53827648, + "router_z_loss_mlp": 0.35229492, + "step": 646, + "time_per_iteration": 2.999408721923828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131315, + "balance_loss_mlp": 1.09622002, + "epoch": 0.1244709503655252, + "flos": 533460556800.0, + "grad_norm": 0.19449531308307466, + "language_loss": 0.85410094, + "learning_rate": 0.0009767829487868005, + "loss": 0.86541414, + "num_input_tokens_seen": 53896400, + "router_z_loss_mlp": 0.35107422, + "step": 647, + "time_per_iteration": 2.617666721343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138117, + "balance_loss_mlp": 1.10159075, + "epoch": 0.12466333205078876, + "flos": 507847034880.0, + "grad_norm": 0.07509451505155453, + "language_loss": 0.89358151, + "learning_rate": 0.000976689024538682, + "loss": 0.90496266, + "num_input_tokens_seen": 53965904, + "router_z_loss_mlp": 0.36499023, + "step": 648, + "time_per_iteration": 2.5929009914398193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138062, + "balance_loss_mlp": 1.10110736, + "epoch": 0.12485571373605232, + "flos": 681023420928.0, + "grad_norm": 0.07057439208121223, + "language_loss": 0.87662494, + "learning_rate": 0.0009765949152234716, + "loss": 0.8880055, + "num_input_tokens_seen": 54049792, + "router_z_loss_mlp": 0.36962891, + "step": 649, + "time_per_iteration": 2.874701976776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147728, + "balance_loss_mlp": 1.13504386, + "epoch": 0.1250480954213159, + "flos": 1329402668544.0, + "grad_norm": 0.04527818124304351, + "language_loss": 0.78686082, + "learning_rate": 0.0009765006208777055, + "loss": 0.79833812, + "num_input_tokens_seen": 54262432, + "router_z_loss_mlp": 0.12695312, + "step": 650, + "time_per_iteration": 4.680933713912964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138039, + "balance_loss_mlp": 1.10287213, + "epoch": 0.12524047710657946, + "flos": 938140683264.0, + "grad_norm": 0.08375968037938068, + "language_loss": 0.82443976, + "learning_rate": 0.0009764061415379919, + "loss": 0.83582014, + "num_input_tokens_seen": 54351568, + "router_z_loss_mlp": 0.35205078, + "step": 651, + "time_per_iteration": 3.2550604343414307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135369, + "balance_loss_mlp": 1.09774697, + "epoch": 0.12543285879184302, + "flos": 513642235392.0, + "grad_norm": 0.07146085627000143, + "language_loss": 0.89363486, + "learning_rate": 0.0009763114772410109, + "loss": 0.90498853, + "num_input_tokens_seen": 54418944, + "router_z_loss_mlp": 0.3762207, + "step": 652, + "time_per_iteration": 2.5937142372131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139745, + "balance_loss_mlp": 1.10419679, + "epoch": 0.12562524047710658, + "flos": 717991617024.0, + "grad_norm": 0.07913079577836896, + "language_loss": 0.87230957, + "learning_rate": 0.0009762166280235146, + "loss": 0.88370705, + "num_input_tokens_seen": 54495312, + "router_z_loss_mlp": 0.35571289, + "step": 653, + "time_per_iteration": 2.96162748336792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147653, + "balance_loss_mlp": 1.10974443, + "epoch": 0.12581762216237014, + "flos": 563441923584.0, + "grad_norm": 0.06492259826928527, + "language_loss": 0.87890899, + "learning_rate": 0.0009761215939223267, + "loss": 0.89038551, + "num_input_tokens_seen": 54566832, + "router_z_loss_mlp": 0.37890625, + "step": 654, + "time_per_iteration": 2.714641809463501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145182, + "balance_loss_mlp": 1.1077261, + "epoch": 0.1260100038476337, + "flos": 481642785792.0, + "grad_norm": 0.07920721431290144, + "language_loss": 0.86875665, + "learning_rate": 0.0009760263749743428, + "loss": 0.88020849, + "num_input_tokens_seen": 54632128, + "router_z_loss_mlp": 0.37426758, + "step": 655, + "time_per_iteration": 2.547499179840088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145343, + "balance_loss_mlp": 1.11074805, + "epoch": 0.12620238553289725, + "flos": 575269461504.0, + "grad_norm": 0.06357383816141966, + "language_loss": 0.90176344, + "learning_rate": 0.0009759309712165299, + "loss": 0.91321695, + "num_input_tokens_seen": 54707600, + "router_z_loss_mlp": 0.34570312, + "step": 656, + "time_per_iteration": 2.693922996520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137509, + "balance_loss_mlp": 1.103248, + "epoch": 0.12639476721816084, + "flos": 530909973504.0, + "grad_norm": 0.07169490366111804, + "language_loss": 0.93258119, + "learning_rate": 0.0009758353826859272, + "loss": 0.94395626, + "num_input_tokens_seen": 54776704, + "router_z_loss_mlp": 0.34277344, + "step": 657, + "time_per_iteration": 2.5744612216949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139269, + "balance_loss_mlp": 1.10314822, + "epoch": 0.1265871489034244, + "flos": 689654393856.0, + "grad_norm": 0.06860158128637554, + "language_loss": 0.89679217, + "learning_rate": 0.0009757396094196456, + "loss": 0.90818477, + "num_input_tokens_seen": 54851744, + "router_z_loss_mlp": 0.36132812, + "step": 658, + "time_per_iteration": 2.851700782775879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143308, + "balance_loss_mlp": 1.10675859, + "epoch": 0.12677953058868796, + "flos": 536863735296.0, + "grad_norm": 0.0696485175834739, + "language_loss": 0.84555894, + "learning_rate": 0.0009756436514548673, + "loss": 0.85699201, + "num_input_tokens_seen": 54932576, + "router_z_loss_mlp": 0.36523438, + "step": 659, + "time_per_iteration": 2.7971351146698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122244, + "balance_loss_mlp": 1.08800757, + "epoch": 0.12697191227395152, + "flos": 518749194240.0, + "grad_norm": 0.05327633329409036, + "language_loss": 0.88343394, + "learning_rate": 0.0009755475088288466, + "loss": 0.89465636, + "num_input_tokens_seen": 55007296, + "router_z_loss_mlp": 0.34228516, + "step": 660, + "time_per_iteration": 2.670555353164673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103127, + "balance_loss_mlp": 1.06903291, + "epoch": 0.12716429395921508, + "flos": 566341714944.0, + "grad_norm": 0.06801254087798507, + "language_loss": 0.90210187, + "learning_rate": 0.0009754511815789095, + "loss": 0.91313314, + "num_input_tokens_seen": 55079312, + "router_z_loss_mlp": 0.34106445, + "step": 661, + "time_per_iteration": 2.748224973678589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102552, + "balance_loss_mlp": 1.06798172, + "epoch": 0.12735667564447864, + "flos": 513844466688.0, + "grad_norm": 0.06975204014846512, + "language_loss": 0.86245489, + "learning_rate": 0.0009753546697424533, + "loss": 0.87348044, + "num_input_tokens_seen": 55151824, + "router_z_loss_mlp": 0.34594727, + "step": 662, + "time_per_iteration": 2.664799213409424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092108, + "balance_loss_mlp": 1.05863369, + "epoch": 0.1275490573297422, + "flos": 541023556608.0, + "grad_norm": 0.05485824904298714, + "language_loss": 0.90572149, + "learning_rate": 0.0009752579733569475, + "loss": 0.91664255, + "num_input_tokens_seen": 55224368, + "router_z_loss_mlp": 0.3347168, + "step": 663, + "time_per_iteration": 2.679975748062134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267369, + "balance_loss_mlp": 1.2515384, + "epoch": 0.12774143901500576, + "flos": 1557872787456.0, + "grad_norm": 0.0685532780556388, + "language_loss": 0.74881387, + "learning_rate": 0.0009751610924599328, + "loss": 0.7614876, + "num_input_tokens_seen": 55453584, + "router_z_loss_mlp": 0.15820312, + "step": 664, + "time_per_iteration": 4.938101053237915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096151, + "balance_loss_mlp": 1.06177139, + "epoch": 0.12793382070026935, + "flos": 613462781952.0, + "grad_norm": 0.06920677464457729, + "language_loss": 0.90523887, + "learning_rate": 0.0009750640270890217, + "loss": 0.9162004, + "num_input_tokens_seen": 55528000, + "router_z_loss_mlp": 0.34375, + "step": 665, + "time_per_iteration": 2.6939845085144043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099083, + "balance_loss_mlp": 1.06563258, + "epoch": 0.1281262023855329, + "flos": 707386231296.0, + "grad_norm": 0.06773970450457005, + "language_loss": 0.96531481, + "learning_rate": 0.0009749667772818983, + "loss": 0.9763056, + "num_input_tokens_seen": 55612416, + "router_z_loss_mlp": 0.33447266, + "step": 666, + "time_per_iteration": 2.967853307723999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164762, + "balance_loss_mlp": 1.15131497, + "epoch": 0.12831858407079647, + "flos": 1424250086400.0, + "grad_norm": 0.045177828452490555, + "language_loss": 0.76935941, + "learning_rate": 0.0009748693430763185, + "loss": 0.78100705, + "num_input_tokens_seen": 55843664, + "router_z_loss_mlp": 0.13476562, + "step": 667, + "time_per_iteration": 4.85069465637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093582, + "balance_loss_mlp": 1.05958366, + "epoch": 0.12851096575606002, + "flos": 448869316608.0, + "grad_norm": 0.07778909975942494, + "language_loss": 0.95426726, + "learning_rate": 0.0009747717245101093, + "loss": 0.96520311, + "num_input_tokens_seen": 55909072, + "router_z_loss_mlp": 0.34008789, + "step": 668, + "time_per_iteration": 2.5234692096710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098998, + "balance_loss_mlp": 1.06519032, + "epoch": 0.12870334744132358, + "flos": 479697486336.0, + "grad_norm": 0.05465485885236262, + "language_loss": 0.84969366, + "learning_rate": 0.00097467392162117, + "loss": 0.86068368, + "num_input_tokens_seen": 55978544, + "router_z_loss_mlp": 0.33789062, + "step": 669, + "time_per_iteration": 2.601684808731079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096385, + "balance_loss_mlp": 1.06341171, + "epoch": 0.12889572912658714, + "flos": 638633963520.0, + "grad_norm": 0.05954757179165737, + "language_loss": 0.91292465, + "learning_rate": 0.0009745759344474708, + "loss": 0.92388856, + "num_input_tokens_seen": 56054144, + "router_z_loss_mlp": 0.32983398, + "step": 670, + "time_per_iteration": 2.8225347995758057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098806, + "balance_loss_mlp": 1.06411648, + "epoch": 0.1290881108118507, + "flos": 509693409792.0, + "grad_norm": 0.06976130099981656, + "language_loss": 0.89229816, + "learning_rate": 0.0009744777630270536, + "loss": 0.90328622, + "num_input_tokens_seen": 56120960, + "router_z_loss_mlp": 0.34692383, + "step": 671, + "time_per_iteration": 2.633571147918701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109875, + "balance_loss_mlp": 1.07435024, + "epoch": 0.12928049249711426, + "flos": 670746894336.0, + "grad_norm": 0.08011077975608555, + "language_loss": 0.93749923, + "learning_rate": 0.000974379407398032, + "loss": 0.94859791, + "num_input_tokens_seen": 56202560, + "router_z_loss_mlp": 0.35546875, + "step": 672, + "time_per_iteration": 2.875609874725342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093721, + "balance_loss_mlp": 1.06065273, + "epoch": 0.12947287418237785, + "flos": 793158925824.0, + "grad_norm": 0.05850057523774312, + "language_loss": 0.82016242, + "learning_rate": 0.0009742808675985913, + "loss": 0.83109969, + "num_input_tokens_seen": 56289456, + "router_z_loss_mlp": 0.33056641, + "step": 673, + "time_per_iteration": 3.087738275527954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101029, + "balance_loss_mlp": 1.0646224, + "epoch": 0.1296652558676414, + "flos": 485222054400.0, + "grad_norm": 0.08954381825883409, + "language_loss": 0.9153564, + "learning_rate": 0.0009741821436669876, + "loss": 0.92636657, + "num_input_tokens_seen": 56354480, + "router_z_loss_mlp": 0.36450195, + "step": 674, + "time_per_iteration": 2.539849281311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101326, + "balance_loss_mlp": 1.06673169, + "epoch": 0.12985763755290497, + "flos": 453226987008.0, + "grad_norm": 0.0648114016490977, + "language_loss": 0.9288274, + "learning_rate": 0.0009740832356415492, + "loss": 0.93984067, + "num_input_tokens_seen": 56418944, + "router_z_loss_mlp": 0.34619141, + "step": 675, + "time_per_iteration": 2.467801094055176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097673, + "balance_loss_mlp": 1.06315041, + "epoch": 0.13005001923816853, + "flos": 824719007232.0, + "grad_norm": 0.0735546441878898, + "language_loss": 0.8857609, + "learning_rate": 0.0009739841435606756, + "loss": 0.89673769, + "num_input_tokens_seen": 56492368, + "router_z_loss_mlp": 0.34545898, + "step": 676, + "time_per_iteration": 3.008781909942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109741, + "balance_loss_mlp": 1.06457949, + "epoch": 0.1302424009234321, + "flos": 531107822592.0, + "grad_norm": 0.07312926894822828, + "language_loss": 0.90675485, + "learning_rate": 0.0009738848674628377, + "loss": 0.9177289, + "num_input_tokens_seen": 56568128, + "router_z_loss_mlp": 0.328125, + "step": 677, + "time_per_iteration": 2.695338010787964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104607, + "balance_loss_mlp": 1.06955981, + "epoch": 0.13043478260869565, + "flos": 525626924544.0, + "grad_norm": 0.06033597827839572, + "language_loss": 0.89643902, + "learning_rate": 0.000973785407386578, + "loss": 0.90748513, + "num_input_tokens_seen": 56646448, + "router_z_loss_mlp": 0.35058594, + "step": 678, + "time_per_iteration": 2.7727599143981934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101976, + "balance_loss_mlp": 1.06714272, + "epoch": 0.1306271642939592, + "flos": 625862108160.0, + "grad_norm": 0.05570081952525763, + "language_loss": 0.87361526, + "learning_rate": 0.0009736857633705103, + "loss": 0.88463503, + "num_input_tokens_seen": 56732080, + "router_z_loss_mlp": 0.34814453, + "step": 679, + "time_per_iteration": 2.843129873275757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110176, + "balance_loss_mlp": 1.06630766, + "epoch": 0.13081954597922277, + "flos": 550438723584.0, + "grad_norm": 0.06405817655948583, + "language_loss": 0.93204647, + "learning_rate": 0.0009735859354533196, + "loss": 0.94306409, + "num_input_tokens_seen": 56804432, + "router_z_loss_mlp": 0.35473633, + "step": 680, + "time_per_iteration": 2.7122464179992676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093118, + "balance_loss_mlp": 1.05914354, + "epoch": 0.13101192766448633, + "flos": 536651329536.0, + "grad_norm": 0.06779912020183775, + "language_loss": 0.91948998, + "learning_rate": 0.0009734859236737628, + "loss": 0.93042123, + "num_input_tokens_seen": 56872512, + "router_z_loss_mlp": 0.33984375, + "step": 681, + "time_per_iteration": 2.594881296157837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093326, + "balance_loss_mlp": 1.0593034, + "epoch": 0.13120430934974991, + "flos": 503258019840.0, + "grad_norm": 0.06413082246497326, + "language_loss": 0.93904501, + "learning_rate": 0.0009733857280706678, + "loss": 0.94997829, + "num_input_tokens_seen": 56940928, + "router_z_loss_mlp": 0.34033203, + "step": 682, + "time_per_iteration": 2.5831425189971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010929, + "balance_loss_mlp": 1.05992687, + "epoch": 0.13139669103501347, + "flos": 614014221312.0, + "grad_norm": 0.06246118190021366, + "language_loss": 0.85051745, + "learning_rate": 0.000973285348682934, + "loss": 0.86144638, + "num_input_tokens_seen": 57012736, + "router_z_loss_mlp": 0.33007812, + "step": 683, + "time_per_iteration": 2.7236225605010986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226892, + "balance_loss_mlp": 1.21096563, + "epoch": 0.13158907272027703, + "flos": 1484163357696.0, + "grad_norm": 0.08359566880013784, + "language_loss": 0.77898371, + "learning_rate": 0.0009731847855495323, + "loss": 0.79125261, + "num_input_tokens_seen": 57243136, + "router_z_loss_mlp": 0.15917969, + "step": 684, + "time_per_iteration": 4.87854790687561 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087932, + "balance_loss_mlp": 1.05488706, + "epoch": 0.1317814544055406, + "flos": 985049344512.0, + "grad_norm": 0.07039095593234826, + "language_loss": 0.85449159, + "learning_rate": 0.0009730840387095046, + "loss": 0.86537099, + "num_input_tokens_seen": 57336160, + "router_z_loss_mlp": 0.33056641, + "step": 685, + "time_per_iteration": 3.30759596824646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096943, + "balance_loss_mlp": 1.06156158, + "epoch": 0.13197383609080415, + "flos": 611163892224.0, + "grad_norm": 0.05759402546544749, + "language_loss": 0.912597, + "learning_rate": 0.0009729831082019642, + "loss": 0.92356646, + "num_input_tokens_seen": 57418976, + "router_z_loss_mlp": 0.35351562, + "step": 686, + "time_per_iteration": 2.7965087890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093388, + "balance_loss_mlp": 1.0589608, + "epoch": 0.1321662177760677, + "flos": 494116305408.0, + "grad_norm": 0.058033147986452156, + "language_loss": 0.89668858, + "learning_rate": 0.0009728819940660958, + "loss": 0.90762246, + "num_input_tokens_seen": 57490288, + "router_z_loss_mlp": 0.34399414, + "step": 687, + "time_per_iteration": 2.7347469329833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110102, + "balance_loss_mlp": 1.0653528, + "epoch": 0.13235859946133127, + "flos": 495591713280.0, + "grad_norm": 0.07548862234195632, + "language_loss": 0.86088693, + "learning_rate": 0.0009727806963411557, + "loss": 0.87189722, + "num_input_tokens_seen": 57556064, + "router_z_loss_mlp": 0.35668945, + "step": 688, + "time_per_iteration": 2.621638774871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098222, + "balance_loss_mlp": 1.06279302, + "epoch": 0.13255098114659483, + "flos": 511417539072.0, + "grad_norm": 0.08656773393569435, + "language_loss": 0.88000298, + "learning_rate": 0.000972679215066471, + "loss": 0.89098513, + "num_input_tokens_seen": 57627248, + "router_z_loss_mlp": 0.35449219, + "step": 689, + "time_per_iteration": 2.6806418895721436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103388, + "balance_loss_mlp": 1.06900764, + "epoch": 0.13274336283185842, + "flos": 547114120704.0, + "grad_norm": 0.07064056682134613, + "language_loss": 0.99675226, + "learning_rate": 0.0009725775502814401, + "loss": 1.00778604, + "num_input_tokens_seen": 57694832, + "router_z_loss_mlp": 0.34350586, + "step": 690, + "time_per_iteration": 2.607179641723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121046, + "balance_loss_mlp": 1.08397222, + "epoch": 0.13293574451712198, + "flos": 640465781760.0, + "grad_norm": 0.08777481913975324, + "language_loss": 0.85673726, + "learning_rate": 0.0009724757020255327, + "loss": 0.86794776, + "num_input_tokens_seen": 57771776, + "router_z_loss_mlp": 0.37084961, + "step": 691, + "time_per_iteration": 2.81113338470459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111244, + "balance_loss_mlp": 1.07726967, + "epoch": 0.13312812620238554, + "flos": 491234042880.0, + "grad_norm": 0.09165524457583717, + "language_loss": 0.87811983, + "learning_rate": 0.0009723736703382902, + "loss": 0.88923222, + "num_input_tokens_seen": 57836272, + "router_z_loss_mlp": 0.33984375, + "step": 692, + "time_per_iteration": 2.548689603805542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110186, + "balance_loss_mlp": 1.0692203, + "epoch": 0.1333205078876491, + "flos": 508693837824.0, + "grad_norm": 0.061462060991887495, + "language_loss": 0.83746743, + "learning_rate": 0.0009722714552593244, + "loss": 0.84848601, + "num_input_tokens_seen": 57907232, + "router_z_loss_mlp": 0.32641602, + "step": 693, + "time_per_iteration": 2.6584513187408447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099112, + "balance_loss_mlp": 1.06358743, + "epoch": 0.13351288957291266, + "flos": 418474722816.0, + "grad_norm": 0.07144638741394425, + "language_loss": 0.94810003, + "learning_rate": 0.000972169056828319, + "loss": 0.95909119, + "num_input_tokens_seen": 57969808, + "router_z_loss_mlp": 0.35522461, + "step": 694, + "time_per_iteration": 2.461437702178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100772, + "balance_loss_mlp": 1.06751275, + "epoch": 0.13370527125817622, + "flos": 615614694912.0, + "grad_norm": 0.05672506947017021, + "language_loss": 0.87834966, + "learning_rate": 0.0009720664750850283, + "loss": 0.88935745, + "num_input_tokens_seen": 58042944, + "router_z_loss_mlp": 0.33251953, + "step": 695, + "time_per_iteration": 2.7716193199157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103519, + "balance_loss_mlp": 1.07085609, + "epoch": 0.13389765294343978, + "flos": 625757391360.0, + "grad_norm": 0.07304651625724701, + "language_loss": 0.93482703, + "learning_rate": 0.0009719637100692784, + "loss": 0.94586229, + "num_input_tokens_seen": 58116080, + "router_z_loss_mlp": 0.32666016, + "step": 696, + "time_per_iteration": 2.7741310596466064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111476, + "balance_loss_mlp": 1.08090401, + "epoch": 0.13409003462870334, + "flos": 609391710720.0, + "grad_norm": 0.06235589965882817, + "language_loss": 0.83759153, + "learning_rate": 0.0009718607618209661, + "loss": 0.84873915, + "num_input_tokens_seen": 58197616, + "router_z_loss_mlp": 0.33862305, + "step": 697, + "time_per_iteration": 2.869180202484131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128671, + "balance_loss_mlp": 1.09488726, + "epoch": 0.13428241631396692, + "flos": 683499810816.0, + "grad_norm": 0.0709058406100417, + "language_loss": 0.88053036, + "learning_rate": 0.0009717576303800595, + "loss": 0.89181709, + "num_input_tokens_seen": 58280480, + "router_z_loss_mlp": 0.33789062, + "step": 698, + "time_per_iteration": 3.007253408432007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122193, + "balance_loss_mlp": 1.08716917, + "epoch": 0.13447479799923048, + "flos": 508565799936.0, + "grad_norm": 0.07060238478807088, + "language_loss": 0.86057615, + "learning_rate": 0.0009716543157865975, + "loss": 0.87179804, + "num_input_tokens_seen": 58352464, + "router_z_loss_mlp": 0.35083008, + "step": 699, + "time_per_iteration": 2.6622114181518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112811, + "balance_loss_mlp": 1.07812154, + "epoch": 0.13466717968449404, + "flos": 897124737024.0, + "grad_norm": 0.06896685381510245, + "language_loss": 0.84149206, + "learning_rate": 0.0009715508180806907, + "loss": 0.85262012, + "num_input_tokens_seen": 58437216, + "router_z_loss_mlp": 0.34716797, + "step": 700, + "time_per_iteration": 3.175494909286499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106112, + "balance_loss_mlp": 1.07054055, + "epoch": 0.1348595613697576, + "flos": 989501557248.0, + "grad_norm": 0.07388845252403331, + "language_loss": 0.90260321, + "learning_rate": 0.0009714471373025202, + "loss": 0.91366434, + "num_input_tokens_seen": 58533152, + "router_z_loss_mlp": 0.35546875, + "step": 701, + "time_per_iteration": 3.3912835121154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090254, + "balance_loss_mlp": 1.05499172, + "epoch": 0.13505194305502116, + "flos": 487580580864.0, + "grad_norm": 0.07959074518459132, + "language_loss": 0.89355272, + "learning_rate": 0.0009713432734923386, + "loss": 0.9044553, + "num_input_tokens_seen": 58601376, + "router_z_loss_mlp": 0.35253906, + "step": 702, + "time_per_iteration": 2.6718733310699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090728, + "balance_loss_mlp": 1.05572796, + "epoch": 0.13524432474028472, + "flos": 613103399424.0, + "grad_norm": 0.06387437846302528, + "language_loss": 0.875036, + "learning_rate": 0.0009712392266904696, + "loss": 0.88594317, + "num_input_tokens_seen": 58676608, + "router_z_loss_mlp": 0.34985352, + "step": 703, + "time_per_iteration": 2.6985831260681152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010863, + "balance_loss_mlp": 1.0524683, + "epoch": 0.13543670642554828, + "flos": 904425868800.0, + "grad_norm": 0.06666466963859687, + "language_loss": 0.86250496, + "learning_rate": 0.0009711349969373076, + "loss": 0.87336791, + "num_input_tokens_seen": 58759264, + "router_z_loss_mlp": 0.33862305, + "step": 704, + "time_per_iteration": 3.1328465938568115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095762, + "balance_loss_mlp": 1.0610956, + "epoch": 0.13562908811081184, + "flos": 550335416832.0, + "grad_norm": 0.0628446006314887, + "language_loss": 0.80944061, + "learning_rate": 0.0009710305842733178, + "loss": 0.82039821, + "num_input_tokens_seen": 58834800, + "router_z_loss_mlp": 0.34667969, + "step": 705, + "time_per_iteration": 2.7668187618255615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093976, + "balance_loss_mlp": 1.06147909, + "epoch": 0.1358214697960754, + "flos": 507797572608.0, + "grad_norm": 0.06635154625105166, + "language_loss": 0.90133065, + "learning_rate": 0.0009709259887390373, + "loss": 0.91227043, + "num_input_tokens_seen": 58901712, + "router_z_loss_mlp": 0.32519531, + "step": 706, + "time_per_iteration": 2.656233072280884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096924, + "balance_loss_mlp": 1.06390333, + "epoch": 0.136013851481339, + "flos": 528640197120.0, + "grad_norm": 0.09290535615143355, + "language_loss": 0.91425377, + "learning_rate": 0.0009708212103750737, + "loss": 0.92522299, + "num_input_tokens_seen": 58967824, + "router_z_loss_mlp": 0.33007812, + "step": 707, + "time_per_iteration": 2.569655656814575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101147, + "balance_loss_mlp": 1.06812644, + "epoch": 0.13620623316660255, + "flos": 658772379648.0, + "grad_norm": 0.06731423560591156, + "language_loss": 0.87756282, + "learning_rate": 0.0009707162492221051, + "loss": 0.88857424, + "num_input_tokens_seen": 59045040, + "router_z_loss_mlp": 0.33007812, + "step": 708, + "time_per_iteration": 2.880669593811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103707, + "balance_loss_mlp": 1.07009029, + "epoch": 0.1363986148518661, + "flos": 671583522816.0, + "grad_norm": 0.07312175328849302, + "language_loss": 0.88322687, + "learning_rate": 0.0009706111053208815, + "loss": 0.89426386, + "num_input_tokens_seen": 59117216, + "router_z_loss_mlp": 0.33642578, + "step": 709, + "time_per_iteration": 2.7878787517547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097257, + "balance_loss_mlp": 1.06342554, + "epoch": 0.13659099653712967, + "flos": 472828520448.0, + "grad_norm": 0.06741688104713542, + "language_loss": 0.86067665, + "learning_rate": 0.0009705057787122232, + "loss": 0.87164921, + "num_input_tokens_seen": 59183056, + "router_z_loss_mlp": 0.33862305, + "step": 710, + "time_per_iteration": 2.528298854827881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105446, + "balance_loss_mlp": 1.07190061, + "epoch": 0.13678337822239323, + "flos": 452483490816.0, + "grad_norm": 0.05706590332145298, + "language_loss": 0.91653168, + "learning_rate": 0.0009704002694370216, + "loss": 0.92758614, + "num_input_tokens_seen": 59247312, + "router_z_loss_mlp": 0.33569336, + "step": 711, + "time_per_iteration": 2.5201761722564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114394, + "balance_loss_mlp": 1.0794661, + "epoch": 0.13697575990765679, + "flos": 519373416960.0, + "grad_norm": 0.06387130477766731, + "language_loss": 0.86892813, + "learning_rate": 0.0009702945775362388, + "loss": 0.88007212, + "num_input_tokens_seen": 59317968, + "router_z_loss_mlp": 0.34960938, + "step": 712, + "time_per_iteration": 2.661848783493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130562, + "balance_loss_mlp": 1.0947994, + "epoch": 0.13716814159292035, + "flos": 480145618944.0, + "grad_norm": 0.06038249383316015, + "language_loss": 0.87339497, + "learning_rate": 0.0009701887030509086, + "loss": 0.8847006, + "num_input_tokens_seen": 59387936, + "router_z_loss_mlp": 0.35766602, + "step": 713, + "time_per_iteration": 2.6068434715270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125148, + "balance_loss_mlp": 1.0908401, + "epoch": 0.1373605232781839, + "flos": 545376844800.0, + "grad_norm": 0.06924339631343991, + "language_loss": 0.92127877, + "learning_rate": 0.0009700826460221346, + "loss": 0.93253028, + "num_input_tokens_seen": 59460624, + "router_z_loss_mlp": 0.34301758, + "step": 714, + "time_per_iteration": 2.653224468231201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145818, + "balance_loss_mlp": 1.11050797, + "epoch": 0.1375529049634475, + "flos": 708473143296.0, + "grad_norm": 0.0682346884445605, + "language_loss": 0.93435562, + "learning_rate": 0.0009699764064910921, + "loss": 0.94581378, + "num_input_tokens_seen": 59536752, + "router_z_loss_mlp": 0.35302734, + "step": 715, + "time_per_iteration": 2.878445625305176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130305, + "balance_loss_mlp": 1.09542441, + "epoch": 0.13774528664871105, + "flos": 486452971008.0, + "grad_norm": 0.07091873756636237, + "language_loss": 0.87931371, + "learning_rate": 0.0009698699844990268, + "loss": 0.89061677, + "num_input_tokens_seen": 59608128, + "router_z_loss_mlp": 0.34863281, + "step": 716, + "time_per_iteration": 2.6278092861175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124133, + "balance_loss_mlp": 1.09070659, + "epoch": 0.1379376683339746, + "flos": 679885636608.0, + "grad_norm": 0.0686032560828043, + "language_loss": 0.88731855, + "learning_rate": 0.0009697633800872555, + "loss": 0.89855987, + "num_input_tokens_seen": 59685120, + "router_z_loss_mlp": 0.33422852, + "step": 717, + "time_per_iteration": 2.888576030731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112997, + "balance_loss_mlp": 1.07825947, + "epoch": 0.13813005001923817, + "flos": 610628419584.0, + "grad_norm": 0.07907714555147631, + "language_loss": 0.9128629, + "learning_rate": 0.0009696565932971655, + "loss": 0.92399287, + "num_input_tokens_seen": 59763376, + "router_z_loss_mlp": 0.34741211, + "step": 718, + "time_per_iteration": 2.8937225341796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110249, + "balance_loss_mlp": 1.06837237, + "epoch": 0.13832243170450173, + "flos": 588431222784.0, + "grad_norm": 0.05947825646897862, + "language_loss": 0.9001984, + "learning_rate": 0.0009695496241702153, + "loss": 0.91122329, + "num_input_tokens_seen": 59836800, + "router_z_loss_mlp": 0.34155273, + "step": 719, + "time_per_iteration": 2.791111469268799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094313, + "balance_loss_mlp": 1.06093454, + "epoch": 0.1385148133897653, + "flos": 699674844672.0, + "grad_norm": 0.07440757355955382, + "language_loss": 0.86308432, + "learning_rate": 0.0009694424727479339, + "loss": 0.87402749, + "num_input_tokens_seen": 59914720, + "router_z_loss_mlp": 0.33398438, + "step": 720, + "time_per_iteration": 2.8781325817108154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088445, + "balance_loss_mlp": 1.05475688, + "epoch": 0.13870719507502885, + "flos": 597977399808.0, + "grad_norm": 0.059872525751604476, + "language_loss": 0.90073895, + "learning_rate": 0.0009693351390719213, + "loss": 0.91162348, + "num_input_tokens_seen": 59984544, + "router_z_loss_mlp": 0.3371582, + "step": 721, + "time_per_iteration": 2.691493272781372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095999, + "balance_loss_mlp": 1.06242967, + "epoch": 0.1388995767602924, + "flos": 586279309824.0, + "grad_norm": 0.07792099406652078, + "language_loss": 0.91640067, + "learning_rate": 0.000969227623183848, + "loss": 0.92736065, + "num_input_tokens_seen": 60057056, + "router_z_loss_mlp": 0.33569336, + "step": 722, + "time_per_iteration": 2.768209218978882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086676, + "balance_loss_mlp": 1.05475235, + "epoch": 0.139091958445556, + "flos": 650810709504.0, + "grad_norm": 0.07717859695455091, + "language_loss": 0.91485119, + "learning_rate": 0.0009691199251254554, + "loss": 0.92571795, + "num_input_tokens_seen": 60133232, + "router_z_loss_mlp": 0.3190918, + "step": 723, + "time_per_iteration": 2.813594102859497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093708, + "balance_loss_mlp": 1.06159282, + "epoch": 0.13928434013081956, + "flos": 575446961664.0, + "grad_norm": 0.06414169604653322, + "language_loss": 0.8718468, + "learning_rate": 0.0009690120449385555, + "loss": 0.88278389, + "num_input_tokens_seen": 60207104, + "router_z_loss_mlp": 0.32104492, + "step": 724, + "time_per_iteration": 2.732372999191284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110008, + "balance_loss_mlp": 1.06574821, + "epoch": 0.13947672181608312, + "flos": 562954503168.0, + "grad_norm": 0.07538454681544235, + "language_loss": 0.93399024, + "learning_rate": 0.0009689039826650312, + "loss": 0.94499099, + "num_input_tokens_seen": 60277920, + "router_z_loss_mlp": 0.34375, + "step": 725, + "time_per_iteration": 2.769481658935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111743, + "balance_loss_mlp": 1.09967864, + "epoch": 0.13966910350134668, + "flos": 1520699387904.0, + "grad_norm": 0.042030956775344956, + "language_loss": 0.76523066, + "learning_rate": 0.000968795738346836, + "loss": 0.77634799, + "num_input_tokens_seen": 60494224, + "router_z_loss_mlp": 0.12060547, + "step": 726, + "time_per_iteration": 4.903716802597046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101674, + "balance_loss_mlp": 1.06619751, + "epoch": 0.13986148518661023, + "flos": 499604557824.0, + "grad_norm": 0.07361028590256702, + "language_loss": 0.88265646, + "learning_rate": 0.0009686873120259941, + "loss": 0.89367324, + "num_input_tokens_seen": 60562176, + "router_z_loss_mlp": 0.35522461, + "step": 727, + "time_per_iteration": 2.639673948287964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099007, + "balance_loss_mlp": 1.06612897, + "epoch": 0.1400538668718738, + "flos": 598381862400.0, + "grad_norm": 0.053177263225715844, + "language_loss": 0.87612498, + "learning_rate": 0.0009685787037446004, + "loss": 0.88711506, + "num_input_tokens_seen": 60631472, + "router_z_loss_mlp": 0.32885742, + "step": 728, + "time_per_iteration": 2.7457332611083984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095755, + "balance_loss_mlp": 1.06135106, + "epoch": 0.14024624855713735, + "flos": 593757941760.0, + "grad_norm": 0.0730266030670127, + "language_loss": 0.88032103, + "learning_rate": 0.0009684699135448201, + "loss": 0.89127851, + "num_input_tokens_seen": 60703488, + "router_z_loss_mlp": 0.34423828, + "step": 729, + "time_per_iteration": 2.6995558738708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091636, + "balance_loss_mlp": 1.05940139, + "epoch": 0.1404386302424009, + "flos": 506335311360.0, + "grad_norm": 0.06378774069808751, + "language_loss": 0.93033969, + "learning_rate": 0.0009683609414688895, + "loss": 0.94125605, + "num_input_tokens_seen": 60773936, + "router_z_loss_mlp": 0.32226562, + "step": 730, + "time_per_iteration": 2.6648926734924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097348, + "balance_loss_mlp": 1.06175184, + "epoch": 0.14063101192766447, + "flos": 573132105216.0, + "grad_norm": 0.05452232030629634, + "language_loss": 0.86945236, + "learning_rate": 0.0009682517875591154, + "loss": 0.88042581, + "num_input_tokens_seen": 60851120, + "router_z_loss_mlp": 0.35620117, + "step": 731, + "time_per_iteration": 2.7333967685699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099629, + "balance_loss_mlp": 1.06656027, + "epoch": 0.14082339361292806, + "flos": 564333806592.0, + "grad_norm": 0.06482276791137384, + "language_loss": 0.87207299, + "learning_rate": 0.0009681424518578749, + "loss": 0.88306928, + "num_input_tokens_seen": 60924896, + "router_z_loss_mlp": 0.33081055, + "step": 732, + "time_per_iteration": 2.706704616546631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100389, + "balance_loss_mlp": 1.06734443, + "epoch": 0.14101577529819162, + "flos": 463336187904.0, + "grad_norm": 0.05411989278901109, + "language_loss": 0.88122302, + "learning_rate": 0.000968032934407616, + "loss": 0.89222693, + "num_input_tokens_seen": 60996016, + "router_z_loss_mlp": 0.33056641, + "step": 733, + "time_per_iteration": 2.5904436111450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100997, + "balance_loss_mlp": 1.06766593, + "epoch": 0.14120815698345518, + "flos": 595791991296.0, + "grad_norm": 0.06321555834593343, + "language_loss": 0.82077157, + "learning_rate": 0.0009679232352508571, + "loss": 0.83178151, + "num_input_tokens_seen": 61072016, + "router_z_loss_mlp": 0.33349609, + "step": 734, + "time_per_iteration": 2.758493423461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102299, + "balance_loss_mlp": 1.06992185, + "epoch": 0.14140053866871874, + "flos": 534864591360.0, + "grad_norm": 0.05697576898708014, + "language_loss": 0.81442666, + "learning_rate": 0.0009678133544301871, + "loss": 0.82544965, + "num_input_tokens_seen": 61144528, + "router_z_loss_mlp": 0.32373047, + "step": 735, + "time_per_iteration": 2.6508195400238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092208, + "balance_loss_mlp": 1.06006956, + "epoch": 0.1415929203539823, + "flos": 520013606400.0, + "grad_norm": 0.0400187761209974, + "language_loss": 0.91843486, + "learning_rate": 0.0009677032919882658, + "loss": 0.92935699, + "num_input_tokens_seen": 61216960, + "router_z_loss_mlp": 0.32128906, + "step": 736, + "time_per_iteration": 2.705019474029541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100492, + "balance_loss_mlp": 1.06975937, + "epoch": 0.14178530203924586, + "flos": 482095300608.0, + "grad_norm": 0.07179339183341249, + "language_loss": 0.92199683, + "learning_rate": 0.000967593047967823, + "loss": 0.93300164, + "num_input_tokens_seen": 61281312, + "router_z_loss_mlp": 0.30712891, + "step": 737, + "time_per_iteration": 2.55415415763855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109678, + "balance_loss_mlp": 1.07577443, + "epoch": 0.14197768372450942, + "flos": 676339863552.0, + "grad_norm": 0.08640894081958116, + "language_loss": 0.87084705, + "learning_rate": 0.0009674826224116593, + "loss": 0.88194382, + "num_input_tokens_seen": 61355888, + "router_z_loss_mlp": 0.33911133, + "step": 738, + "time_per_iteration": 2.819878101348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097633, + "balance_loss_mlp": 1.06544614, + "epoch": 0.14217006540977298, + "flos": 445802199552.0, + "grad_norm": 0.06953952980021996, + "language_loss": 0.8713401, + "learning_rate": 0.0009673720153626455, + "loss": 0.88231641, + "num_input_tokens_seen": 61424288, + "router_z_loss_mlp": 0.32177734, + "step": 739, + "time_per_iteration": 2.5987422466278076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096281, + "balance_loss_mlp": 1.06385565, + "epoch": 0.14236244709503657, + "flos": 496261016064.0, + "grad_norm": 0.08400230511878481, + "language_loss": 0.87465405, + "learning_rate": 0.0009672612268637235, + "loss": 0.88561684, + "num_input_tokens_seen": 61493344, + "router_z_loss_mlp": 0.32421875, + "step": 740, + "time_per_iteration": 2.6148736476898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098472, + "balance_loss_mlp": 1.06669128, + "epoch": 0.14255482878030012, + "flos": 648022989312.0, + "grad_norm": 0.0806935070673247, + "language_loss": 0.846753, + "learning_rate": 0.0009671502569579048, + "loss": 0.85773772, + "num_input_tokens_seen": 61565216, + "router_z_loss_mlp": 0.31762695, + "step": 741, + "time_per_iteration": 2.7533769607543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089599, + "balance_loss_mlp": 1.05774641, + "epoch": 0.14274721046556368, + "flos": 535888894464.0, + "grad_norm": 0.06572551706098649, + "language_loss": 0.90748239, + "learning_rate": 0.0009670391056882719, + "loss": 0.91837835, + "num_input_tokens_seen": 61640928, + "router_z_loss_mlp": 0.31835938, + "step": 742, + "time_per_iteration": 2.698690176010132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088511, + "balance_loss_mlp": 1.0565629, + "epoch": 0.14293959215082724, + "flos": 956677215744.0, + "grad_norm": 0.07291469749344824, + "language_loss": 0.89417249, + "learning_rate": 0.0009669277730979776, + "loss": 0.90505755, + "num_input_tokens_seen": 61717552, + "router_z_loss_mlp": 0.31958008, + "step": 743, + "time_per_iteration": 3.1728732585906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108649, + "balance_loss_mlp": 1.05408931, + "epoch": 0.1431319738360908, + "flos": 692766590976.0, + "grad_norm": 0.06693583917292938, + "language_loss": 0.85588205, + "learning_rate": 0.0009668162592302449, + "loss": 0.86674696, + "num_input_tokens_seen": 61800016, + "router_z_loss_mlp": 0.32397461, + "step": 744, + "time_per_iteration": 2.896467685699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099426, + "balance_loss_mlp": 1.06673896, + "epoch": 0.14332435552135436, + "flos": 565174817280.0, + "grad_norm": 0.0717564206721674, + "language_loss": 0.86683381, + "learning_rate": 0.0009667045641283676, + "loss": 0.877828, + "num_input_tokens_seen": 61865904, + "router_z_loss_mlp": 0.3269043, + "step": 745, + "time_per_iteration": 2.6326427459716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095955, + "balance_loss_mlp": 1.06336319, + "epoch": 0.14351673720661792, + "flos": 738045665280.0, + "grad_norm": 0.07083856064802352, + "language_loss": 0.95545924, + "learning_rate": 0.0009665926878357092, + "loss": 0.96641874, + "num_input_tokens_seen": 61945728, + "router_z_loss_mlp": 0.32592773, + "step": 746, + "time_per_iteration": 2.902628183364868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108393, + "balance_loss_mlp": 1.07565856, + "epoch": 0.14370911889188148, + "flos": 548951731200.0, + "grad_norm": 0.08672542857876225, + "language_loss": 0.91510898, + "learning_rate": 0.0009664806303957043, + "loss": 0.92619288, + "num_input_tokens_seen": 62016288, + "router_z_loss_mlp": 0.32714844, + "step": 747, + "time_per_iteration": 2.678656578063965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107271, + "balance_loss_mlp": 1.07448816, + "epoch": 0.14390150057714507, + "flos": 589973469696.0, + "grad_norm": 0.06575006445724518, + "language_loss": 0.87633115, + "learning_rate": 0.0009663683918518571, + "loss": 0.88740385, + "num_input_tokens_seen": 62097904, + "router_z_loss_mlp": 0.32788086, + "step": 748, + "time_per_iteration": 2.894339084625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116744, + "balance_loss_mlp": 1.08226848, + "epoch": 0.14409388226240863, + "flos": 590773782528.0, + "grad_norm": 0.06412555003569581, + "language_loss": 0.86334193, + "learning_rate": 0.0009662559722477428, + "loss": 0.87450933, + "num_input_tokens_seen": 62166736, + "router_z_loss_mlp": 0.3449707, + "step": 749, + "time_per_iteration": 2.6673357486724854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116866, + "balance_loss_mlp": 1.15397346, + "epoch": 0.1442862639476722, + "flos": 1510418479104.0, + "grad_norm": 0.05654081816866197, + "language_loss": 0.7616297, + "learning_rate": 0.0009661433716270062, + "loss": 0.77331638, + "num_input_tokens_seen": 62402512, + "router_z_loss_mlp": 0.14648438, + "step": 750, + "time_per_iteration": 4.97744607925415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111461, + "balance_loss_mlp": 1.0782733, + "epoch": 0.14447864563293575, + "flos": 496493770752.0, + "grad_norm": 0.05840496998451829, + "language_loss": 0.89989787, + "learning_rate": 0.0009660305900333632, + "loss": 0.91101241, + "num_input_tokens_seen": 62473408, + "router_z_loss_mlp": 0.33203125, + "step": 751, + "time_per_iteration": 2.6919631958007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108129, + "balance_loss_mlp": 1.07513142, + "epoch": 0.1446710273181993, + "flos": 589400271360.0, + "grad_norm": 0.0663289310880325, + "language_loss": 0.83084202, + "learning_rate": 0.0009659176275105992, + "loss": 0.8419233, + "num_input_tokens_seen": 62547440, + "router_z_loss_mlp": 0.33007812, + "step": 752, + "time_per_iteration": 2.702003240585327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097403, + "balance_loss_mlp": 1.0634284, + "epoch": 0.14486340900346287, + "flos": 585521256960.0, + "grad_norm": 0.05748666507804042, + "language_loss": 0.86628646, + "learning_rate": 0.0009658044841025701, + "loss": 0.87726045, + "num_input_tokens_seen": 62620224, + "router_z_loss_mlp": 0.34008789, + "step": 753, + "time_per_iteration": 2.7666702270507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106626, + "balance_loss_mlp": 1.07114923, + "epoch": 0.14505579068872643, + "flos": 504405978624.0, + "grad_norm": 0.07320865998852653, + "language_loss": 0.81996346, + "learning_rate": 0.0009656911598532021, + "loss": 0.83102977, + "num_input_tokens_seen": 62690464, + "router_z_loss_mlp": 0.35498047, + "step": 754, + "time_per_iteration": 2.6273839473724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094053, + "balance_loss_mlp": 1.05936301, + "epoch": 0.14524817237399, + "flos": 486566452224.0, + "grad_norm": 0.05776902712696923, + "language_loss": 0.90229332, + "learning_rate": 0.0009655776548064917, + "loss": 0.91323388, + "num_input_tokens_seen": 62762240, + "router_z_loss_mlp": 0.34667969, + "step": 755, + "time_per_iteration": 2.6639461517333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092368, + "balance_loss_mlp": 1.05867922, + "epoch": 0.14544055405925355, + "flos": 727857888768.0, + "grad_norm": 0.059694446461720084, + "language_loss": 0.88762641, + "learning_rate": 0.0009654639690065054, + "loss": 0.89855003, + "num_input_tokens_seen": 62839760, + "router_z_loss_mlp": 0.33691406, + "step": 756, + "time_per_iteration": 2.881164789199829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092737, + "balance_loss_mlp": 1.05981112, + "epoch": 0.14563293574451713, + "flos": 593359271424.0, + "grad_norm": 0.0719411984245977, + "language_loss": 0.88362074, + "learning_rate": 0.00096535010249738, + "loss": 0.89454818, + "num_input_tokens_seen": 62910336, + "router_z_loss_mlp": 0.3293457, + "step": 757, + "time_per_iteration": 2.703355312347412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092325, + "balance_loss_mlp": 1.05925632, + "epoch": 0.1458253174297807, + "flos": 560192924160.0, + "grad_norm": 0.09095988428785044, + "language_loss": 0.8300786, + "learning_rate": 0.0009652360553233224, + "loss": 0.84100187, + "num_input_tokens_seen": 62988160, + "router_z_loss_mlp": 0.33081055, + "step": 758, + "time_per_iteration": 2.7321062088012695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062711, + "balance_loss_mlp": 1.04821551, + "epoch": 0.14601769911504425, + "flos": 1557025984512.0, + "grad_norm": 0.03493248396843453, + "language_loss": 0.73773748, + "learning_rate": 0.0009651218275286093, + "loss": 0.74836457, + "num_input_tokens_seen": 63224704, + "router_z_loss_mlp": 0.14453125, + "step": 759, + "time_per_iteration": 4.917184591293335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099151, + "balance_loss_mlp": 1.06605887, + "epoch": 0.1462100808003078, + "flos": 865922628096.0, + "grad_norm": 0.05465610046720203, + "language_loss": 0.8166393, + "learning_rate": 0.0009650074191575883, + "loss": 0.82763088, + "num_input_tokens_seen": 63312400, + "router_z_loss_mlp": 0.33105469, + "step": 760, + "time_per_iteration": 3.2009472846984863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097005, + "balance_loss_mlp": 1.06341171, + "epoch": 0.14640246248557137, + "flos": 522673288704.0, + "grad_norm": 0.07890258703475667, + "language_loss": 0.86329532, + "learning_rate": 0.0009648928302546766, + "loss": 0.87426543, + "num_input_tokens_seen": 63387792, + "router_z_loss_mlp": 0.3359375, + "step": 761, + "time_per_iteration": 2.6858482360839844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087335, + "balance_loss_mlp": 1.05340791, + "epoch": 0.14659484417083493, + "flos": 1030121805312.0, + "grad_norm": 0.05505233607608704, + "language_loss": 0.8584463, + "learning_rate": 0.0009647780608643613, + "loss": 0.86931968, + "num_input_tokens_seen": 63475632, + "router_z_loss_mlp": 0.33935547, + "step": 762, + "time_per_iteration": 3.3784618377685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087006, + "balance_loss_mlp": 1.05365133, + "epoch": 0.1467872258560985, + "flos": 500426629632.0, + "grad_norm": 0.083565321416964, + "language_loss": 0.88299912, + "learning_rate": 0.0009646631110312001, + "loss": 0.89386916, + "num_input_tokens_seen": 63546080, + "router_z_loss_mlp": 0.33349609, + "step": 763, + "time_per_iteration": 2.642038345336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096574, + "balance_loss_mlp": 1.06465006, + "epoch": 0.14697960754136205, + "flos": 547514201088.0, + "grad_norm": 0.05646167170610495, + "language_loss": 0.88908124, + "learning_rate": 0.0009645479807998203, + "loss": 0.900047, + "num_input_tokens_seen": 63622464, + "router_z_loss_mlp": 0.3190918, + "step": 764, + "time_per_iteration": 2.7709102630615234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093321, + "balance_loss_mlp": 1.0614922, + "epoch": 0.14717198922662564, + "flos": 517586678784.0, + "grad_norm": 0.06731397985108602, + "language_loss": 0.93233657, + "learning_rate": 0.0009644326702149196, + "loss": 0.94326979, + "num_input_tokens_seen": 63694736, + "router_z_loss_mlp": 0.31811523, + "step": 765, + "time_per_iteration": 2.691761016845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098194, + "balance_loss_mlp": 1.06472015, + "epoch": 0.1473643709118892, + "flos": 731661147648.0, + "grad_norm": 0.08664060064789567, + "language_loss": 0.85604531, + "learning_rate": 0.0009643171793212653, + "loss": 0.86702728, + "num_input_tokens_seen": 63779072, + "router_z_loss_mlp": 0.33496094, + "step": 766, + "time_per_iteration": 3.0578510761260986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095374, + "balance_loss_mlp": 1.06190002, + "epoch": 0.14755675259715276, + "flos": 620257554432.0, + "grad_norm": 0.06875066800131625, + "language_loss": 0.90379435, + "learning_rate": 0.0009642015081636952, + "loss": 0.91474807, + "num_input_tokens_seen": 63847472, + "router_z_loss_mlp": 0.33496094, + "step": 767, + "time_per_iteration": 2.690892219543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091039, + "balance_loss_mlp": 1.05830407, + "epoch": 0.14774913428241632, + "flos": 451981513728.0, + "grad_norm": 0.06617868208271054, + "language_loss": 0.88812423, + "learning_rate": 0.0009640856567871166, + "loss": 0.89903462, + "num_input_tokens_seen": 63912496, + "router_z_loss_mlp": 0.32714844, + "step": 768, + "time_per_iteration": 2.5108768939971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086849, + "balance_loss_mlp": 1.05316067, + "epoch": 0.14794151596767988, + "flos": 836881196544.0, + "grad_norm": 0.06813910901976611, + "language_loss": 0.89643073, + "learning_rate": 0.0009639696252365072, + "loss": 0.90729922, + "num_input_tokens_seen": 63990832, + "router_z_loss_mlp": 0.33691406, + "step": 769, + "time_per_iteration": 3.036872386932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087914, + "balance_loss_mlp": 1.05546558, + "epoch": 0.14813389765294344, + "flos": 685765204992.0, + "grad_norm": 0.06952898718112278, + "language_loss": 0.82433641, + "learning_rate": 0.0009638534135569144, + "loss": 0.83521557, + "num_input_tokens_seen": 64067552, + "router_z_loss_mlp": 0.32446289, + "step": 770, + "time_per_iteration": 2.920228958129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096521, + "balance_loss_mlp": 1.06395316, + "epoch": 0.148326279338207, + "flos": 509625008640.0, + "grad_norm": 0.05850145176667806, + "language_loss": 0.90417981, + "learning_rate": 0.0009637370217934554, + "loss": 0.91514498, + "num_input_tokens_seen": 64140336, + "router_z_loss_mlp": 0.32568359, + "step": 771, + "time_per_iteration": 2.6692943572998047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094476, + "balance_loss_mlp": 1.0624088, + "epoch": 0.14851866102347056, + "flos": 587869608960.0, + "grad_norm": 0.06374792966079154, + "language_loss": 0.83362675, + "learning_rate": 0.0009636204499913175, + "loss": 0.84457153, + "num_input_tokens_seen": 64223472, + "router_z_loss_mlp": 0.32055664, + "step": 772, + "time_per_iteration": 2.9103784561157227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101595, + "balance_loss_mlp": 1.07129157, + "epoch": 0.14871104270873411, + "flos": 690722366976.0, + "grad_norm": 0.05784692032564958, + "language_loss": 0.8891257, + "learning_rate": 0.0009635036981957581, + "loss": 0.90014172, + "num_input_tokens_seen": 64299872, + "router_z_loss_mlp": 0.30273438, + "step": 773, + "time_per_iteration": 2.840233087539673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109331, + "balance_loss_mlp": 1.06112361, + "epoch": 0.1489034243939977, + "flos": 654803205120.0, + "grad_norm": 0.06091674471201955, + "language_loss": 0.9126395, + "learning_rate": 0.0009633867664521043, + "loss": 0.9235726, + "num_input_tokens_seen": 64377152, + "router_z_loss_mlp": 0.32202148, + "step": 774, + "time_per_iteration": 2.8467912673950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098219, + "balance_loss_mlp": 1.0643878, + "epoch": 0.14909580607926126, + "flos": 475595891712.0, + "grad_norm": 0.06395321005815084, + "language_loss": 0.87366414, + "learning_rate": 0.0009632696548057527, + "loss": 0.8846463, + "num_input_tokens_seen": 64443008, + "router_z_loss_mlp": 0.33862305, + "step": 775, + "time_per_iteration": 2.55267596244812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090534, + "balance_loss_mlp": 1.05729866, + "epoch": 0.14928818776452482, + "flos": 610789953024.0, + "grad_norm": 0.07257335679926562, + "language_loss": 0.85489643, + "learning_rate": 0.0009631523633021704, + "loss": 0.86580181, + "num_input_tokens_seen": 64519776, + "router_z_loss_mlp": 0.33251953, + "step": 776, + "time_per_iteration": 2.800656795501709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090063, + "balance_loss_mlp": 1.05694628, + "epoch": 0.14948056944978838, + "flos": 561487859712.0, + "grad_norm": 0.058446141184189525, + "language_loss": 0.88943005, + "learning_rate": 0.0009630348919868936, + "loss": 0.90033066, + "num_input_tokens_seen": 64593712, + "router_z_loss_mlp": 0.33129883, + "step": 777, + "time_per_iteration": 2.7306644916534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088502, + "balance_loss_mlp": 1.05397916, + "epoch": 0.14967295113505194, + "flos": 448972623360.0, + "grad_norm": 0.08136314957760014, + "language_loss": 0.81536144, + "learning_rate": 0.0009629172409055293, + "loss": 0.82624644, + "num_input_tokens_seen": 64658448, + "router_z_loss_mlp": 0.34545898, + "step": 778, + "time_per_iteration": 2.532480239868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091534, + "balance_loss_mlp": 1.05937171, + "epoch": 0.1498653328203155, + "flos": 571000541184.0, + "grad_norm": 0.06865521140792329, + "language_loss": 0.88039231, + "learning_rate": 0.0009627994101037531, + "loss": 0.89130771, + "num_input_tokens_seen": 64734144, + "router_z_loss_mlp": 0.3215332, + "step": 779, + "time_per_iteration": 2.7336056232452393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091586, + "balance_loss_mlp": 1.05811191, + "epoch": 0.15005771450557906, + "flos": 630918194688.0, + "grad_norm": 0.06277485509918372, + "language_loss": 0.8981787, + "learning_rate": 0.0009626813996273114, + "loss": 0.90909451, + "num_input_tokens_seen": 64813456, + "router_z_loss_mlp": 0.3347168, + "step": 780, + "time_per_iteration": 2.8651859760284424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092654, + "balance_loss_mlp": 1.06018162, + "epoch": 0.15025009619084262, + "flos": 577633780224.0, + "grad_norm": 0.06737111741199381, + "language_loss": 0.89359641, + "learning_rate": 0.0009625632095220198, + "loss": 0.90452296, + "num_input_tokens_seen": 64896816, + "router_z_loss_mlp": 0.32470703, + "step": 781, + "time_per_iteration": 2.910163640975952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093984, + "balance_loss_mlp": 1.06041455, + "epoch": 0.1504424778761062, + "flos": 483646311936.0, + "grad_norm": 0.06188715182302237, + "language_loss": 0.87568116, + "learning_rate": 0.0009624448398337637, + "loss": 0.88662094, + "num_input_tokens_seen": 64964176, + "router_z_loss_mlp": 0.3359375, + "step": 782, + "time_per_iteration": 2.532055616378784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100397, + "balance_loss_mlp": 1.06751907, + "epoch": 0.15063485956136977, + "flos": 762167812608.0, + "grad_norm": 0.06229794960735175, + "language_loss": 0.89905757, + "learning_rate": 0.0009623262906084984, + "loss": 0.91006154, + "num_input_tokens_seen": 65042592, + "router_z_loss_mlp": 0.32861328, + "step": 783, + "time_per_iteration": 2.9851605892181396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104855, + "balance_loss_mlp": 1.0712378, + "epoch": 0.15082724124663333, + "flos": 497369687040.0, + "grad_norm": 0.060596744514248076, + "language_loss": 0.90796679, + "learning_rate": 0.0009622075618922486, + "loss": 0.91901541, + "num_input_tokens_seen": 65114576, + "router_z_loss_mlp": 0.33642578, + "step": 784, + "time_per_iteration": 2.6796786785125732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102349, + "balance_loss_mlp": 1.06928015, + "epoch": 0.15101962293189689, + "flos": 509476621824.0, + "grad_norm": 0.06389342174673626, + "language_loss": 0.87423813, + "learning_rate": 0.0009620886537311091, + "loss": 0.8852616, + "num_input_tokens_seen": 65186640, + "router_z_loss_mlp": 0.33081055, + "step": 785, + "time_per_iteration": 2.6153056621551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113163, + "balance_loss_mlp": 1.07685184, + "epoch": 0.15121200461716044, + "flos": 457520638464.0, + "grad_norm": 0.06793935281312648, + "language_loss": 0.85492945, + "learning_rate": 0.000961969566171244, + "loss": 0.86606109, + "num_input_tokens_seen": 65252112, + "router_z_loss_mlp": 0.36303711, + "step": 786, + "time_per_iteration": 2.506267786026001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122272, + "balance_loss_mlp": 1.08703363, + "epoch": 0.151404386302424, + "flos": 537729477120.0, + "grad_norm": 0.0670602351843582, + "language_loss": 0.90370345, + "learning_rate": 0.0009618502992588873, + "loss": 0.91492617, + "num_input_tokens_seen": 65318912, + "router_z_loss_mlp": 0.35253906, + "step": 787, + "time_per_iteration": 2.623457670211792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141844, + "balance_loss_mlp": 1.10658193, + "epoch": 0.15159676798768756, + "flos": 687858891264.0, + "grad_norm": 0.06543467559167064, + "language_loss": 0.88581872, + "learning_rate": 0.0009617308530403424, + "loss": 0.89723718, + "num_input_tokens_seen": 65395424, + "router_z_loss_mlp": 0.35302734, + "step": 788, + "time_per_iteration": 2.975861072540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149381, + "balance_loss_mlp": 1.11371326, + "epoch": 0.15178914967295112, + "flos": 545042193408.0, + "grad_norm": 0.059566397417978756, + "language_loss": 0.87806541, + "learning_rate": 0.0009616112275619825, + "loss": 0.88955921, + "num_input_tokens_seen": 65470480, + "router_z_loss_mlp": 0.35668945, + "step": 789, + "time_per_iteration": 2.683262348175049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152452, + "balance_loss_mlp": 1.1169517, + "epoch": 0.1519815313582147, + "flos": 511510671360.0, + "grad_norm": 0.05728483560240697, + "language_loss": 0.84466863, + "learning_rate": 0.0009614914228702503, + "loss": 0.85619313, + "num_input_tokens_seen": 65544720, + "router_z_loss_mlp": 0.35498047, + "step": 790, + "time_per_iteration": 2.6616339683532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142719, + "balance_loss_mlp": 1.10850596, + "epoch": 0.15217391304347827, + "flos": 683747122176.0, + "grad_norm": 0.057799273493116435, + "language_loss": 0.89279461, + "learning_rate": 0.0009613714390116581, + "loss": 0.90422177, + "num_input_tokens_seen": 65627872, + "router_z_loss_mlp": 0.34204102, + "step": 791, + "time_per_iteration": 2.947608470916748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133841, + "balance_loss_mlp": 1.0997231, + "epoch": 0.15236629472874183, + "flos": 643873342464.0, + "grad_norm": 0.06413295296627212, + "language_loss": 0.86589968, + "learning_rate": 0.0009612512760327879, + "loss": 0.87723809, + "num_input_tokens_seen": 65705264, + "router_z_loss_mlp": 0.34155273, + "step": 792, + "time_per_iteration": 2.8261189460754395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124449, + "balance_loss_mlp": 1.08727932, + "epoch": 0.1525586764140054, + "flos": 412654791168.0, + "grad_norm": 0.06095846853214657, + "language_loss": 0.85749042, + "learning_rate": 0.0009611309339802909, + "loss": 0.86873484, + "num_input_tokens_seen": 65768592, + "router_z_loss_mlp": 0.37182617, + "step": 793, + "time_per_iteration": 2.438474178314209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113041, + "balance_loss_mlp": 1.07811236, + "epoch": 0.15275105809926895, + "flos": 802444644864.0, + "grad_norm": 0.04691390558901254, + "language_loss": 0.84620011, + "learning_rate": 0.0009610104129008881, + "loss": 0.85733056, + "num_input_tokens_seen": 65852432, + "router_z_loss_mlp": 0.34985352, + "step": 794, + "time_per_iteration": 3.1149892807006836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092099, + "balance_loss_mlp": 1.05786228, + "epoch": 0.1529434397845325, + "flos": 612143115264.0, + "grad_norm": 0.06446455819394356, + "language_loss": 0.88995111, + "learning_rate": 0.0009608897128413701, + "loss": 0.90087205, + "num_input_tokens_seen": 65927904, + "router_z_loss_mlp": 0.3425293, + "step": 795, + "time_per_iteration": 2.7310965061187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085841, + "balance_loss_mlp": 1.05160367, + "epoch": 0.15313582146979607, + "flos": 614941009920.0, + "grad_norm": 0.04580320827636504, + "language_loss": 0.8595438, + "learning_rate": 0.0009607688338485965, + "loss": 0.87040222, + "num_input_tokens_seen": 66006800, + "router_z_loss_mlp": 0.3425293, + "step": 796, + "time_per_iteration": 2.8534584045410156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088916, + "balance_loss_mlp": 1.05427384, + "epoch": 0.15332820315505963, + "flos": 793256440320.0, + "grad_norm": 0.053101967265095064, + "language_loss": 0.91128695, + "learning_rate": 0.0009606477759694969, + "loss": 0.92217612, + "num_input_tokens_seen": 66088608, + "router_z_loss_mlp": 0.34643555, + "step": 797, + "time_per_iteration": 3.0544466972351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098724, + "balance_loss_mlp": 1.06441545, + "epoch": 0.1535205848403232, + "flos": 549945510912.0, + "grad_norm": 0.0662794157411924, + "language_loss": 0.87591946, + "learning_rate": 0.0009605265392510703, + "loss": 0.88690674, + "num_input_tokens_seen": 66153616, + "router_z_loss_mlp": 0.34350586, + "step": 798, + "time_per_iteration": 2.6120660305023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011091, + "balance_loss_mlp": 1.07417202, + "epoch": 0.15371296652558677, + "flos": 535691045376.0, + "grad_norm": 0.07220239734969772, + "language_loss": 0.92342889, + "learning_rate": 0.0009604051237403846, + "loss": 0.93451989, + "num_input_tokens_seen": 66219472, + "router_z_loss_mlp": 0.34960938, + "step": 799, + "time_per_iteration": 2.640749216079712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111728, + "balance_loss_mlp": 1.07808757, + "epoch": 0.15390534821085033, + "flos": 395002939392.0, + "grad_norm": 0.06314402273456009, + "language_loss": 0.86126584, + "learning_rate": 0.0009602835294845776, + "loss": 0.87238312, + "num_input_tokens_seen": 66281456, + "router_z_loss_mlp": 0.33666992, + "step": 800, + "time_per_iteration": 2.44914174079895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117351, + "balance_loss_mlp": 1.08254242, + "epoch": 0.1540977298961139, + "flos": 535587738624.0, + "grad_norm": 0.057636094576239, + "language_loss": 0.91100746, + "learning_rate": 0.0009601617565308565, + "loss": 0.92218101, + "num_input_tokens_seen": 66348160, + "router_z_loss_mlp": 0.34790039, + "step": 801, + "time_per_iteration": 2.599679470062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119062, + "balance_loss_mlp": 1.08511138, + "epoch": 0.15429011158137745, + "flos": 723388147200.0, + "grad_norm": 0.05961266019354579, + "language_loss": 0.86783326, + "learning_rate": 0.0009600398049264977, + "loss": 0.87902391, + "num_input_tokens_seen": 66430576, + "router_z_loss_mlp": 0.33935547, + "step": 802, + "time_per_iteration": 2.9514007568359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121787, + "balance_loss_mlp": 1.08735943, + "epoch": 0.154482493266641, + "flos": 620209502208.0, + "grad_norm": 0.06366105456569557, + "language_loss": 0.92098475, + "learning_rate": 0.0009599176747188469, + "loss": 0.9322027, + "num_input_tokens_seen": 66506480, + "router_z_loss_mlp": 0.34448242, + "step": 803, + "time_per_iteration": 2.8068411350250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114024, + "balance_loss_mlp": 1.08012128, + "epoch": 0.15467487495190457, + "flos": 525351909888.0, + "grad_norm": 0.08101366702111423, + "language_loss": 0.83651662, + "learning_rate": 0.0009597953659553196, + "loss": 0.84765685, + "num_input_tokens_seen": 66577680, + "router_z_loss_mlp": 0.33911133, + "step": 804, + "time_per_iteration": 2.7075448036193848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098377, + "balance_loss_mlp": 1.06616712, + "epoch": 0.15486725663716813, + "flos": 527473299456.0, + "grad_norm": 0.07377431927286832, + "language_loss": 0.89624304, + "learning_rate": 0.0009596728786833997, + "loss": 0.90722686, + "num_input_tokens_seen": 66648496, + "router_z_loss_mlp": 0.32202148, + "step": 805, + "time_per_iteration": 2.6376051902770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088085, + "balance_loss_mlp": 1.05420554, + "epoch": 0.1550596383224317, + "flos": 1048118482944.0, + "grad_norm": 0.06708822771662253, + "language_loss": 0.90018022, + "learning_rate": 0.0009595502129506415, + "loss": 0.91106105, + "num_input_tokens_seen": 66735216, + "router_z_loss_mlp": 0.33911133, + "step": 806, + "time_per_iteration": 3.3391284942626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092582, + "balance_loss_mlp": 1.05903625, + "epoch": 0.15525202000769528, + "flos": 613438050816.0, + "grad_norm": 0.06052700763637142, + "language_loss": 0.83084035, + "learning_rate": 0.0009594273688046678, + "loss": 0.84176612, + "num_input_tokens_seen": 66810672, + "router_z_loss_mlp": 0.33544922, + "step": 807, + "time_per_iteration": 2.7136006355285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088184, + "balance_loss_mlp": 1.05273128, + "epoch": 0.15544440169295884, + "flos": 532805810688.0, + "grad_norm": 0.07048562468234597, + "language_loss": 0.86048424, + "learning_rate": 0.000959304346293171, + "loss": 0.87136608, + "num_input_tokens_seen": 66879824, + "router_z_loss_mlp": 0.35473633, + "step": 808, + "time_per_iteration": 2.6744906902313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097573, + "balance_loss_mlp": 1.06254935, + "epoch": 0.1556367833782224, + "flos": 644433546240.0, + "grad_norm": 0.06803397985071584, + "language_loss": 0.88331544, + "learning_rate": 0.0009591811454639125, + "loss": 0.89429116, + "num_input_tokens_seen": 66949424, + "router_z_loss_mlp": 0.3503418, + "step": 809, + "time_per_iteration": 2.730431079864502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094476, + "balance_loss_mlp": 1.0610261, + "epoch": 0.15582916506348596, + "flos": 543540644352.0, + "grad_norm": 0.06204685505428811, + "language_loss": 0.88227659, + "learning_rate": 0.0009590577663647234, + "loss": 0.89322132, + "num_input_tokens_seen": 67024000, + "router_z_loss_mlp": 0.3347168, + "step": 810, + "time_per_iteration": 2.71469783782959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104684, + "balance_loss_mlp": 1.07078123, + "epoch": 0.15602154674874952, + "flos": 579740613120.0, + "grad_norm": 0.05672341894910533, + "language_loss": 0.86610442, + "learning_rate": 0.0009589342090435036, + "loss": 0.87715125, + "num_input_tokens_seen": 67100672, + "router_z_loss_mlp": 0.33935547, + "step": 811, + "time_per_iteration": 2.799246072769165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110387, + "balance_loss_mlp": 1.06918025, + "epoch": 0.15621392843401308, + "flos": 534982454784.0, + "grad_norm": 0.0647852675732537, + "language_loss": 0.87778354, + "learning_rate": 0.0009588104735482223, + "loss": 0.8888222, + "num_input_tokens_seen": 67171584, + "router_z_loss_mlp": 0.34692383, + "step": 812, + "time_per_iteration": 2.6684510707855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126921, + "balance_loss_mlp": 1.09106326, + "epoch": 0.15640631011927664, + "flos": 550635162624.0, + "grad_norm": 0.08222618986335321, + "language_loss": 0.84280443, + "learning_rate": 0.0009586865599269177, + "loss": 0.85407358, + "num_input_tokens_seen": 67240640, + "router_z_loss_mlp": 0.35864258, + "step": 813, + "time_per_iteration": 2.6293816566467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131277, + "balance_loss_mlp": 1.09651566, + "epoch": 0.1565986918045402, + "flos": 637190641152.0, + "grad_norm": 0.05945515562529824, + "language_loss": 0.88725412, + "learning_rate": 0.0009585624682276977, + "loss": 0.89856684, + "num_input_tokens_seen": 67312976, + "router_z_loss_mlp": 0.34814453, + "step": 814, + "time_per_iteration": 2.744253158569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137563, + "balance_loss_mlp": 1.10113239, + "epoch": 0.15679107348980378, + "flos": 490569122304.0, + "grad_norm": 0.09591637295165127, + "language_loss": 0.87945771, + "learning_rate": 0.0009584381984987386, + "loss": 0.89083332, + "num_input_tokens_seen": 67378528, + "router_z_loss_mlp": 0.36474609, + "step": 815, + "time_per_iteration": 2.5264036655426025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124613, + "balance_loss_mlp": 1.0911386, + "epoch": 0.15698345517506734, + "flos": 529689231360.0, + "grad_norm": 0.05838460881618622, + "language_loss": 0.90277314, + "learning_rate": 0.0009583137507882864, + "loss": 0.91401929, + "num_input_tokens_seen": 67449728, + "router_z_loss_mlp": 0.3347168, + "step": 816, + "time_per_iteration": 2.6488330364227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109898, + "balance_loss_mlp": 1.07418323, + "epoch": 0.1571758368603309, + "flos": 545779897344.0, + "grad_norm": 0.07313796537718548, + "language_loss": 0.81262791, + "learning_rate": 0.000958189125144656, + "loss": 0.82372689, + "num_input_tokens_seen": 67520512, + "router_z_loss_mlp": 0.35766602, + "step": 817, + "time_per_iteration": 2.7040657997131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101746, + "balance_loss_mlp": 1.06672239, + "epoch": 0.15736821854559446, + "flos": 565377048576.0, + "grad_norm": 0.067694528538076, + "language_loss": 0.88558215, + "learning_rate": 0.0009580643216162313, + "loss": 0.89659959, + "num_input_tokens_seen": 67592464, + "router_z_loss_mlp": 0.3503418, + "step": 818, + "time_per_iteration": 2.6538634300231934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096156, + "balance_loss_mlp": 1.06110835, + "epoch": 0.15756060023085802, + "flos": 500707436544.0, + "grad_norm": 0.05957146674366314, + "language_loss": 0.79884583, + "learning_rate": 0.0009579393402514652, + "loss": 0.80980736, + "num_input_tokens_seen": 67658928, + "router_z_loss_mlp": 0.35107422, + "step": 819, + "time_per_iteration": 2.5606625080108643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082975, + "balance_loss_mlp": 1.048738, + "epoch": 0.15775298191612158, + "flos": 519014034432.0, + "grad_norm": 0.06194437160070725, + "language_loss": 0.91126758, + "learning_rate": 0.0009578141810988801, + "loss": 0.92209733, + "num_input_tokens_seen": 67727936, + "router_z_loss_mlp": 0.34228516, + "step": 820, + "time_per_iteration": 2.55538010597229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082194, + "balance_loss_mlp": 1.04712272, + "epoch": 0.15794536360138514, + "flos": 465891153408.0, + "grad_norm": 0.060184436438788555, + "language_loss": 0.91010749, + "learning_rate": 0.0009576888442070668, + "loss": 0.92092943, + "num_input_tokens_seen": 67795488, + "router_z_loss_mlp": 0.35083008, + "step": 821, + "time_per_iteration": 2.6139276027679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094225, + "balance_loss_mlp": 1.05982161, + "epoch": 0.1581377452866487, + "flos": 516911583744.0, + "grad_norm": 0.06832586535724347, + "language_loss": 0.92820144, + "learning_rate": 0.0009575633296246854, + "loss": 0.93914366, + "num_input_tokens_seen": 67858896, + "router_z_loss_mlp": 0.34423828, + "step": 822, + "time_per_iteration": 2.557404041290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096972, + "balance_loss_mlp": 1.06242526, + "epoch": 0.15833012697191226, + "flos": 549522109440.0, + "grad_norm": 0.06257557491721027, + "language_loss": 0.83520567, + "learning_rate": 0.0009574376374004652, + "loss": 0.84617537, + "num_input_tokens_seen": 67924864, + "router_z_loss_mlp": 0.34570312, + "step": 823, + "time_per_iteration": 2.673220157623291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100077, + "balance_loss_mlp": 1.06395626, + "epoch": 0.15852250865717585, + "flos": 487206641664.0, + "grad_norm": 0.07116075590187526, + "language_loss": 0.81073487, + "learning_rate": 0.000957311767583204, + "loss": 0.82173562, + "num_input_tokens_seen": 67992912, + "router_z_loss_mlp": 0.36132812, + "step": 824, + "time_per_iteration": 2.605074882507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159484, + "balance_loss_mlp": 1.14126849, + "epoch": 0.1587148903424394, + "flos": 1309041672192.0, + "grad_norm": 0.051809649393169656, + "language_loss": 0.8207159, + "learning_rate": 0.0009571857202217691, + "loss": 0.83231074, + "num_input_tokens_seen": 68207408, + "router_z_loss_mlp": 0.18261719, + "step": 825, + "time_per_iteration": 4.726073265075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111019, + "balance_loss_mlp": 1.07349157, + "epoch": 0.15890727202770297, + "flos": 466634649600.0, + "grad_norm": 0.07947222616221912, + "language_loss": 0.92132723, + "learning_rate": 0.0009570594953650961, + "loss": 0.93243748, + "num_input_tokens_seen": 68270864, + "router_z_loss_mlp": 0.37524414, + "step": 826, + "time_per_iteration": 2.5146830081939697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109067, + "balance_loss_mlp": 1.07208848, + "epoch": 0.15909965371296653, + "flos": 776733608448.0, + "grad_norm": 0.06013225990958685, + "language_loss": 0.80852252, + "learning_rate": 0.00095693309306219, + "loss": 0.81961316, + "num_input_tokens_seen": 68355408, + "router_z_loss_mlp": 0.36962891, + "step": 827, + "time_per_iteration": 3.095632553100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117102, + "balance_loss_mlp": 1.07945621, + "epoch": 0.1592920353982301, + "flos": 1077852538368.0, + "grad_norm": 0.05984978885312211, + "language_loss": 0.88600951, + "learning_rate": 0.0009568065133621244, + "loss": 0.89718056, + "num_input_tokens_seen": 68437072, + "router_z_loss_mlp": 0.37646484, + "step": 828, + "time_per_iteration": 3.3153574466705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111269, + "balance_loss_mlp": 1.07584, + "epoch": 0.15948441708349365, + "flos": 725307305472.0, + "grad_norm": 0.0632864692280333, + "language_loss": 0.85493571, + "learning_rate": 0.0009566797563140422, + "loss": 0.86604846, + "num_input_tokens_seen": 68511696, + "router_z_loss_mlp": 0.35449219, + "step": 829, + "time_per_iteration": 2.8705785274505615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121543, + "balance_loss_mlp": 1.08470702, + "epoch": 0.1596767987687572, + "flos": 578447087616.0, + "grad_norm": 0.06433687205870958, + "language_loss": 0.88630873, + "learning_rate": 0.0009565528219671547, + "loss": 0.89752412, + "num_input_tokens_seen": 68587488, + "router_z_loss_mlp": 0.36816406, + "step": 830, + "time_per_iteration": 2.8890771865844727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137333, + "balance_loss_mlp": 1.10049748, + "epoch": 0.15986918045402077, + "flos": 528728947200.0, + "grad_norm": 0.04994246668943954, + "language_loss": 0.85232639, + "learning_rate": 0.0009564257103707418, + "loss": 0.86369967, + "num_input_tokens_seen": 68655760, + "router_z_loss_mlp": 0.36816406, + "step": 831, + "time_per_iteration": 2.5870308876037598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133852, + "balance_loss_mlp": 1.09632492, + "epoch": 0.16006156213928435, + "flos": 574313559552.0, + "grad_norm": 0.0648316290803925, + "language_loss": 0.91675746, + "learning_rate": 0.0009562984215741533, + "loss": 0.92809594, + "num_input_tokens_seen": 68724560, + "router_z_loss_mlp": 0.37524414, + "step": 832, + "time_per_iteration": 2.655066967010498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117496, + "balance_loss_mlp": 1.08170903, + "epoch": 0.1602539438245479, + "flos": 515258675712.0, + "grad_norm": 0.14271195523272245, + "language_loss": 0.82911491, + "learning_rate": 0.0009561709556268065, + "loss": 0.84028995, + "num_input_tokens_seen": 68795440, + "router_z_loss_mlp": 0.35839844, + "step": 833, + "time_per_iteration": 2.69999098777771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119914, + "balance_loss_mlp": 1.08419931, + "epoch": 0.16044632550981147, + "flos": 620730418176.0, + "grad_norm": 0.05962773238435596, + "language_loss": 0.95060706, + "learning_rate": 0.0009560433125781884, + "loss": 0.96180618, + "num_input_tokens_seen": 68868176, + "router_z_loss_mlp": 0.35693359, + "step": 834, + "time_per_iteration": 2.711109161376953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126421, + "balance_loss_mlp": 1.08977628, + "epoch": 0.16063870719507503, + "flos": 560817146880.0, + "grad_norm": 0.06388697234939344, + "language_loss": 0.92829657, + "learning_rate": 0.0009559154924778544, + "loss": 0.93956077, + "num_input_tokens_seen": 68939616, + "router_z_loss_mlp": 0.36621094, + "step": 835, + "time_per_iteration": 2.695260763168335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121916, + "balance_loss_mlp": 1.08789361, + "epoch": 0.1608310888803386, + "flos": 804778440192.0, + "grad_norm": 0.05750453212643973, + "language_loss": 0.85217482, + "learning_rate": 0.0009557874953754284, + "loss": 0.86339402, + "num_input_tokens_seen": 69016192, + "router_z_loss_mlp": 0.34057617, + "step": 836, + "time_per_iteration": 3.002013921737671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126166, + "balance_loss_mlp": 1.09204817, + "epoch": 0.16102347056560215, + "flos": 600311195136.0, + "grad_norm": 0.06332628409766573, + "language_loss": 0.84060842, + "learning_rate": 0.0009556593213206038, + "loss": 0.85187006, + "num_input_tokens_seen": 69089360, + "router_z_loss_mlp": 0.34130859, + "step": 837, + "time_per_iteration": 2.698716163635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125003, + "balance_loss_mlp": 1.09102869, + "epoch": 0.1612158522508657, + "flos": 553235208192.0, + "grad_norm": 0.07524747482874264, + "language_loss": 0.87844718, + "learning_rate": 0.0009555309703631414, + "loss": 0.88969719, + "num_input_tokens_seen": 69161952, + "router_z_loss_mlp": 0.33984375, + "step": 838, + "time_per_iteration": 2.669588327407837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133813, + "balance_loss_mlp": 1.09752607, + "epoch": 0.16140823393612927, + "flos": 555701423616.0, + "grad_norm": 0.07144746672945328, + "language_loss": 0.87685311, + "learning_rate": 0.0009554024425528722, + "loss": 0.88819122, + "num_input_tokens_seen": 69232432, + "router_z_loss_mlp": 0.36279297, + "step": 839, + "time_per_iteration": 2.6809709072113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112028, + "balance_loss_mlp": 1.08737814, + "epoch": 0.16160061562139286, + "flos": 543613427712.0, + "grad_norm": 0.06970106087394082, + "language_loss": 0.8929134, + "learning_rate": 0.0009552737379396948, + "loss": 0.90411627, + "num_input_tokens_seen": 69297696, + "router_z_loss_mlp": 0.32885742, + "step": 840, + "time_per_iteration": 2.6100995540618896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102587, + "balance_loss_mlp": 1.06920815, + "epoch": 0.16179299730665642, + "flos": 603590717952.0, + "grad_norm": 0.06131687325166246, + "language_loss": 0.87945604, + "learning_rate": 0.0009551448565735767, + "loss": 0.89048195, + "num_input_tokens_seen": 69373888, + "router_z_loss_mlp": 0.33398438, + "step": 841, + "time_per_iteration": 2.7360360622406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095736, + "balance_loss_mlp": 1.06168985, + "epoch": 0.16198537899191998, + "flos": 786821050368.0, + "grad_norm": 0.07162496841720159, + "language_loss": 0.8519845, + "learning_rate": 0.0009550157985045543, + "loss": 0.86294186, + "num_input_tokens_seen": 69449984, + "router_z_loss_mlp": 0.34082031, + "step": 842, + "time_per_iteration": 3.0436456203460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087632, + "balance_loss_mlp": 1.05413389, + "epoch": 0.16217776067718354, + "flos": 519550917120.0, + "grad_norm": 0.060562390499230526, + "language_loss": 0.89622426, + "learning_rate": 0.0009548865637827321, + "loss": 0.90710062, + "num_input_tokens_seen": 69522736, + "router_z_loss_mlp": 0.33496094, + "step": 843, + "time_per_iteration": 2.6422221660614014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086448, + "balance_loss_mlp": 1.05342698, + "epoch": 0.1623701423624471, + "flos": 505015644672.0, + "grad_norm": 0.07097995853224412, + "language_loss": 0.90216166, + "learning_rate": 0.0009547571524582838, + "loss": 0.91302609, + "num_input_tokens_seen": 69587184, + "router_z_loss_mlp": 0.33032227, + "step": 844, + "time_per_iteration": 2.6082894802093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095057, + "balance_loss_mlp": 1.06031966, + "epoch": 0.16256252404771065, + "flos": 496940493312.0, + "grad_norm": 0.06932052947515681, + "language_loss": 0.92511153, + "learning_rate": 0.0009546275645814512, + "loss": 0.9360621, + "num_input_tokens_seen": 69656560, + "router_z_loss_mlp": 0.34765625, + "step": 845, + "time_per_iteration": 2.5985872745513916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100013, + "balance_loss_mlp": 1.065418, + "epoch": 0.16275490573297421, + "flos": 502110061056.0, + "grad_norm": 0.07540183512891604, + "language_loss": 0.90294898, + "learning_rate": 0.0009544978002025446, + "loss": 0.91394913, + "num_input_tokens_seen": 69723872, + "router_z_loss_mlp": 0.34619141, + "step": 846, + "time_per_iteration": 2.5778391361236572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096289, + "balance_loss_mlp": 1.06174231, + "epoch": 0.16294728741823777, + "flos": 506952179712.0, + "grad_norm": 0.06018935314915502, + "language_loss": 0.87532055, + "learning_rate": 0.0009543678593719434, + "loss": 0.8862834, + "num_input_tokens_seen": 69795504, + "router_z_loss_mlp": 0.34570312, + "step": 847, + "time_per_iteration": 2.697566270828247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102624, + "balance_loss_mlp": 1.06824434, + "epoch": 0.16313966910350133, + "flos": 509418395136.0, + "grad_norm": 0.054217985504269955, + "language_loss": 0.8754853, + "learning_rate": 0.0009542377421400945, + "loss": 0.88651162, + "num_input_tokens_seen": 69873408, + "router_z_loss_mlp": 0.34375, + "step": 848, + "time_per_iteration": 2.786766290664673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104457, + "balance_loss_mlp": 1.06847942, + "epoch": 0.16333205078876492, + "flos": 543712352256.0, + "grad_norm": 0.06122856356214084, + "language_loss": 0.83524954, + "learning_rate": 0.0009541074485575145, + "loss": 0.84629411, + "num_input_tokens_seen": 69944112, + "router_z_loss_mlp": 0.35986328, + "step": 849, + "time_per_iteration": 2.713759183883667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098701, + "balance_loss_mlp": 1.06346297, + "epoch": 0.16352443247402848, + "flos": 507477477888.0, + "grad_norm": 0.06331477383231503, + "language_loss": 0.92240757, + "learning_rate": 0.0009539769786747874, + "loss": 0.93339461, + "num_input_tokens_seen": 70012288, + "router_z_loss_mlp": 0.35253906, + "step": 850, + "time_per_iteration": 2.589945077896118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100894, + "balance_loss_mlp": 1.06584692, + "epoch": 0.16371681415929204, + "flos": 541851420672.0, + "grad_norm": 0.06704648725492578, + "language_loss": 0.81567919, + "learning_rate": 0.0009538463325425665, + "loss": 0.82668811, + "num_input_tokens_seen": 70086560, + "router_z_loss_mlp": 0.35083008, + "step": 851, + "time_per_iteration": 2.6779844760894775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105544, + "balance_loss_mlp": 1.07042515, + "epoch": 0.1639091958445556, + "flos": 520501026816.0, + "grad_norm": 0.058426853420895056, + "language_loss": 0.8673842, + "learning_rate": 0.0009537155102115728, + "loss": 0.87843966, + "num_input_tokens_seen": 70153968, + "router_z_loss_mlp": 0.35131836, + "step": 852, + "time_per_iteration": 2.5614206790924072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106136, + "balance_loss_mlp": 1.07175565, + "epoch": 0.16410157752981916, + "flos": 547149026304.0, + "grad_norm": 0.06460558975646845, + "language_loss": 0.83482397, + "learning_rate": 0.0009535845117325961, + "loss": 0.84588534, + "num_input_tokens_seen": 70222496, + "router_z_loss_mlp": 0.34423828, + "step": 853, + "time_per_iteration": 2.6453073024749756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098632, + "balance_loss_mlp": 1.06470513, + "epoch": 0.16429395921508272, + "flos": 582561828864.0, + "grad_norm": 0.052152281018199936, + "language_loss": 0.93584174, + "learning_rate": 0.0009534533371564946, + "loss": 0.94682807, + "num_input_tokens_seen": 70301680, + "router_z_loss_mlp": 0.33959961, + "step": 854, + "time_per_iteration": 2.75186824798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111206, + "balance_loss_mlp": 1.07670665, + "epoch": 0.16448634090034628, + "flos": 530678628864.0, + "grad_norm": 0.06475772966833339, + "language_loss": 0.8907218, + "learning_rate": 0.0009533219865341949, + "loss": 0.90183383, + "num_input_tokens_seen": 70371152, + "router_z_loss_mlp": 0.3449707, + "step": 855, + "time_per_iteration": 2.581479787826538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108852, + "balance_loss_mlp": 1.07285094, + "epoch": 0.16467872258560984, + "flos": 491623948800.0, + "grad_norm": 0.06378602693040462, + "language_loss": 0.87287533, + "learning_rate": 0.0009531904599166916, + "loss": 0.88396388, + "num_input_tokens_seen": 70440832, + "router_z_loss_mlp": 0.36035156, + "step": 856, + "time_per_iteration": 2.6429831981658936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107008, + "balance_loss_mlp": 1.07141232, + "epoch": 0.16487110427087343, + "flos": 506015216640.0, + "grad_norm": 0.07162133431974482, + "language_loss": 0.85139728, + "learning_rate": 0.0009530587573550478, + "loss": 0.86246729, + "num_input_tokens_seen": 70507424, + "router_z_loss_mlp": 0.35620117, + "step": 857, + "time_per_iteration": 2.5667338371276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071319, + "balance_loss_mlp": 1.05434394, + "epoch": 0.16506348595613698, + "flos": 1432006553088.0, + "grad_norm": 0.02717136097410494, + "language_loss": 0.74319386, + "learning_rate": 0.0009529268789003953, + "loss": 0.75390708, + "num_input_tokens_seen": 70742320, + "router_z_loss_mlp": 0.16992188, + "step": 858, + "time_per_iteration": 5.02930474281311 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107487, + "balance_loss_mlp": 1.0740366, + "epoch": 0.16525586764140054, + "flos": 476890827264.0, + "grad_norm": 0.06438670275364486, + "language_loss": 0.90481895, + "learning_rate": 0.0009527948246039337, + "loss": 0.91589379, + "num_input_tokens_seen": 70808400, + "router_z_loss_mlp": 0.33447266, + "step": 859, + "time_per_iteration": 2.5222055912017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109782, + "balance_loss_mlp": 1.07618856, + "epoch": 0.1654482493266641, + "flos": 880737297408.0, + "grad_norm": 0.058857893791213665, + "language_loss": 0.88361865, + "learning_rate": 0.000952662594516931, + "loss": 0.8947165, + "num_input_tokens_seen": 70886192, + "router_z_loss_mlp": 0.33618164, + "step": 860, + "time_per_iteration": 3.065053701400757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109979, + "balance_loss_mlp": 1.07497942, + "epoch": 0.16564063101192766, + "flos": 626527028736.0, + "grad_norm": 0.058557043780191484, + "language_loss": 0.86803752, + "learning_rate": 0.0009525301886907234, + "loss": 0.87913728, + "num_input_tokens_seen": 70964816, + "router_z_loss_mlp": 0.34985352, + "step": 861, + "time_per_iteration": 2.873415470123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121945, + "balance_loss_mlp": 1.08537149, + "epoch": 0.16583301269719122, + "flos": 561250722816.0, + "grad_norm": 0.0761086770239273, + "language_loss": 0.8825953, + "learning_rate": 0.0009523976071767155, + "loss": 0.8938148, + "num_input_tokens_seen": 71037456, + "router_z_loss_mlp": 0.36572266, + "step": 862, + "time_per_iteration": 2.71508526802063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115561, + "balance_loss_mlp": 1.07994115, + "epoch": 0.16602539438245478, + "flos": 567510022656.0, + "grad_norm": 0.05388299317844869, + "language_loss": 0.88433009, + "learning_rate": 0.00095226485002638, + "loss": 0.8954857, + "num_input_tokens_seen": 71111872, + "router_z_loss_mlp": 0.35620117, + "step": 863, + "time_per_iteration": 2.7524497509002686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111173, + "balance_loss_mlp": 1.07617354, + "epoch": 0.16621777606771834, + "flos": 574589984256.0, + "grad_norm": 0.05833582522103205, + "language_loss": 0.89311475, + "learning_rate": 0.0009521319172912576, + "loss": 0.90422642, + "num_input_tokens_seen": 71187808, + "router_z_loss_mlp": 0.35009766, + "step": 864, + "time_per_iteration": 2.717493772506714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134798, + "balance_loss_mlp": 1.09846306, + "epoch": 0.16641015775298193, + "flos": 514292599296.0, + "grad_norm": 0.05644176285984134, + "language_loss": 0.94990546, + "learning_rate": 0.0009519988090229579, + "loss": 0.96125346, + "num_input_tokens_seen": 71261728, + "router_z_loss_mlp": 0.36352539, + "step": 865, + "time_per_iteration": 2.6850624084472656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133116, + "balance_loss_mlp": 1.09668565, + "epoch": 0.1666025394382455, + "flos": 621395338752.0, + "grad_norm": 0.05816643645022503, + "language_loss": 0.88684535, + "learning_rate": 0.0009518655252731576, + "loss": 0.89817655, + "num_input_tokens_seen": 71338352, + "router_z_loss_mlp": 0.36450195, + "step": 866, + "time_per_iteration": 2.7240021228790283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124082, + "balance_loss_mlp": 1.0882715, + "epoch": 0.16679492112350905, + "flos": 548528329728.0, + "grad_norm": 0.06128727898968579, + "language_loss": 0.9070124, + "learning_rate": 0.0009517320660936022, + "loss": 0.91825324, + "num_input_tokens_seen": 71416544, + "router_z_loss_mlp": 0.35839844, + "step": 867, + "time_per_iteration": 2.6959731578826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118134, + "balance_loss_mlp": 1.08260965, + "epoch": 0.1669873028087726, + "flos": 665379477504.0, + "grad_norm": 0.05857722537468161, + "language_loss": 0.83557463, + "learning_rate": 0.0009515984315361051, + "loss": 0.84675598, + "num_input_tokens_seen": 71494080, + "router_z_loss_mlp": 0.35546875, + "step": 868, + "time_per_iteration": 2.813674211502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122458, + "balance_loss_mlp": 1.08638549, + "epoch": 0.16717968449403617, + "flos": 538305647616.0, + "grad_norm": 0.06553445455365839, + "language_loss": 0.87103701, + "learning_rate": 0.000951464621652548, + "loss": 0.88226151, + "num_input_tokens_seen": 71562672, + "router_z_loss_mlp": 0.36083984, + "step": 869, + "time_per_iteration": 2.674333333969116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111253, + "balance_loss_mlp": 1.07757819, + "epoch": 0.16737206617929973, + "flos": 529833235968.0, + "grad_norm": 0.059309523866322815, + "language_loss": 0.78951609, + "learning_rate": 0.0009513306364948804, + "loss": 0.80064136, + "num_input_tokens_seen": 71641904, + "router_z_loss_mlp": 0.34985352, + "step": 870, + "time_per_iteration": 2.7519431114196777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011143, + "balance_loss_mlp": 1.07953846, + "epoch": 0.1675644478645633, + "flos": 480529732608.0, + "grad_norm": 0.06711563999134491, + "language_loss": 0.89559376, + "learning_rate": 0.0009511964761151197, + "loss": 0.90673673, + "num_input_tokens_seen": 71709616, + "router_z_loss_mlp": 0.34814453, + "step": 871, + "time_per_iteration": 2.544520854949951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113298, + "balance_loss_mlp": 1.07820272, + "epoch": 0.16775682954982685, + "flos": 494311334400.0, + "grad_norm": 0.06484202096701225, + "language_loss": 0.9050945, + "learning_rate": 0.0009510621405653521, + "loss": 0.91622752, + "num_input_tokens_seen": 71776592, + "router_z_loss_mlp": 0.35131836, + "step": 872, + "time_per_iteration": 2.594224452972412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106918, + "balance_loss_mlp": 1.07265687, + "epoch": 0.1679492112350904, + "flos": 751694846976.0, + "grad_norm": 0.060317450015561574, + "language_loss": 0.847211, + "learning_rate": 0.0009509276298977309, + "loss": 0.85828018, + "num_input_tokens_seen": 71856352, + "router_z_loss_mlp": 0.34277344, + "step": 873, + "time_per_iteration": 2.9428915977478027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110568, + "balance_loss_mlp": 1.07261181, + "epoch": 0.168141592920354, + "flos": 1135413075456.0, + "grad_norm": 0.05441785661992682, + "language_loss": 0.81867516, + "learning_rate": 0.0009507929441644778, + "loss": 0.82978088, + "num_input_tokens_seen": 71948480, + "router_z_loss_mlp": 0.37939453, + "step": 874, + "time_per_iteration": 3.52008318901062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101336, + "balance_loss_mlp": 1.06640816, + "epoch": 0.16833397460561755, + "flos": 632114205696.0, + "grad_norm": 0.06557720885733571, + "language_loss": 0.86201179, + "learning_rate": 0.0009506580834178826, + "loss": 0.87302518, + "num_input_tokens_seen": 72019200, + "router_z_loss_mlp": 0.34936523, + "step": 875, + "time_per_iteration": 2.7744014263153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110472, + "balance_loss_mlp": 1.06817079, + "epoch": 0.1685263562908811, + "flos": 541171943424.0, + "grad_norm": 0.06007828909359903, + "language_loss": 0.91612709, + "learning_rate": 0.0009505230477103028, + "loss": 0.92717427, + "num_input_tokens_seen": 72088672, + "router_z_loss_mlp": 0.36547852, + "step": 876, + "time_per_iteration": 2.6593635082244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103279, + "balance_loss_mlp": 1.06703997, + "epoch": 0.16871873797614467, + "flos": 619036812288.0, + "grad_norm": 0.08702038824672748, + "language_loss": 0.81312418, + "learning_rate": 0.0009503878370941641, + "loss": 0.824157, + "num_input_tokens_seen": 72159952, + "router_z_loss_mlp": 0.36206055, + "step": 877, + "time_per_iteration": 2.7511024475097656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094199, + "balance_loss_mlp": 1.05986643, + "epoch": 0.16891111966140823, + "flos": 606067107840.0, + "grad_norm": 0.06953183101172467, + "language_loss": 0.88841844, + "learning_rate": 0.0009502524516219595, + "loss": 0.89936042, + "num_input_tokens_seen": 72231648, + "router_z_loss_mlp": 0.34375, + "step": 878, + "time_per_iteration": 2.697455406188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091575, + "balance_loss_mlp": 1.05757689, + "epoch": 0.1691035013466718, + "flos": 552058136064.0, + "grad_norm": 0.0721678347454753, + "language_loss": 0.89980447, + "learning_rate": 0.0009501168913462506, + "loss": 0.91072023, + "num_input_tokens_seen": 72298608, + "router_z_loss_mlp": 0.34008789, + "step": 879, + "time_per_iteration": 2.6825287342071533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080465, + "balance_loss_mlp": 1.06263125, + "epoch": 0.16929588303193535, + "flos": 1475544121344.0, + "grad_norm": 0.044515803528062385, + "language_loss": 0.79121923, + "learning_rate": 0.0009499811563196665, + "loss": 0.80202389, + "num_input_tokens_seen": 72525312, + "router_z_loss_mlp": 0.17871094, + "step": 880, + "time_per_iteration": 4.825777769088745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081319, + "balance_loss_mlp": 1.0464623, + "epoch": 0.1694882647171989, + "flos": 925850456064.0, + "grad_norm": 0.06491790696384477, + "language_loss": 0.85360616, + "learning_rate": 0.0009498452465949042, + "loss": 0.86441934, + "num_input_tokens_seen": 72612976, + "router_z_loss_mlp": 0.34887695, + "step": 881, + "time_per_iteration": 3.2700376510620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086319, + "balance_loss_mlp": 1.05227244, + "epoch": 0.1696806464024625, + "flos": 545829359616.0, + "grad_norm": 0.057533624801199786, + "language_loss": 0.916857, + "learning_rate": 0.0009497091622247285, + "loss": 0.92772019, + "num_input_tokens_seen": 72686800, + "router_z_loss_mlp": 0.34082031, + "step": 882, + "time_per_iteration": 2.711721181869507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085559, + "balance_loss_mlp": 1.05184698, + "epoch": 0.16987302808772606, + "flos": 528970466304.0, + "grad_norm": 0.08384615451013337, + "language_loss": 0.93744707, + "learning_rate": 0.0009495729032619723, + "loss": 0.94830269, + "num_input_tokens_seen": 72759360, + "router_z_loss_mlp": 0.33740234, + "step": 883, + "time_per_iteration": 2.688525438308716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084621, + "balance_loss_mlp": 1.05062199, + "epoch": 0.17006540977298962, + "flos": 754855096320.0, + "grad_norm": 0.06073677328113264, + "language_loss": 0.84419179, + "learning_rate": 0.0009494364697595354, + "loss": 0.85503805, + "num_input_tokens_seen": 72831424, + "router_z_loss_mlp": 0.34033203, + "step": 884, + "time_per_iteration": 2.9112000465393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092434, + "balance_loss_mlp": 1.05750597, + "epoch": 0.17025779145825318, + "flos": 558532813824.0, + "grad_norm": 0.06728326387015754, + "language_loss": 0.89818925, + "learning_rate": 0.0009492998617703867, + "loss": 0.90911365, + "num_input_tokens_seen": 72901536, + "router_z_loss_mlp": 0.34936523, + "step": 885, + "time_per_iteration": 2.6760926246643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093981, + "balance_loss_mlp": 1.06045985, + "epoch": 0.17045017314351674, + "flos": 511963186176.0, + "grad_norm": 0.0687386743468794, + "language_loss": 0.87971282, + "learning_rate": 0.0009491630793475619, + "loss": 0.89065266, + "num_input_tokens_seen": 72970480, + "router_z_loss_mlp": 0.33520508, + "step": 886, + "time_per_iteration": 2.59726619720459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094476, + "balance_loss_mlp": 1.06011951, + "epoch": 0.1706425548287803, + "flos": 508674898944.0, + "grad_norm": 0.058204707286146434, + "language_loss": 0.85722017, + "learning_rate": 0.0009490261225441643, + "loss": 0.8681649, + "num_input_tokens_seen": 73053376, + "router_z_loss_mlp": 0.34350586, + "step": 887, + "time_per_iteration": 2.900501012802124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087689, + "balance_loss_mlp": 1.0545013, + "epoch": 0.17083493651404386, + "flos": 717016776192.0, + "grad_norm": 0.05310353290702558, + "language_loss": 0.90775931, + "learning_rate": 0.0009488889914133656, + "loss": 0.9186362, + "num_input_tokens_seen": 73136032, + "router_z_loss_mlp": 0.33203125, + "step": 888, + "time_per_iteration": 2.992532968521118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089684, + "balance_loss_mlp": 1.05520868, + "epoch": 0.17102731819930742, + "flos": 558852908544.0, + "grad_norm": 0.047287767355612194, + "language_loss": 0.88680297, + "learning_rate": 0.0009487516860084047, + "loss": 0.89769983, + "num_input_tokens_seen": 73208544, + "router_z_loss_mlp": 0.34472656, + "step": 889, + "time_per_iteration": 2.7029643058776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082798, + "balance_loss_mlp": 1.04858518, + "epoch": 0.17121969988457098, + "flos": 494542679040.0, + "grad_norm": 0.0765590367769256, + "language_loss": 0.88680983, + "learning_rate": 0.0009486142063825884, + "loss": 0.89763772, + "num_input_tokens_seen": 73274336, + "router_z_loss_mlp": 0.34228516, + "step": 890, + "time_per_iteration": 2.5640931129455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038716, + "balance_loss_mlp": 1.02402985, + "epoch": 0.17141208156983456, + "flos": 1548088063488.0, + "grad_norm": 0.02832238153451814, + "language_loss": 0.72426212, + "learning_rate": 0.0009484765525892909, + "loss": 0.7346493, + "num_input_tokens_seen": 73506320, + "router_z_loss_mlp": 0.14648438, + "step": 891, + "time_per_iteration": 4.979609251022339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108867, + "balance_loss_mlp": 1.0540278, + "epoch": 0.17160446325509812, + "flos": 619282713600.0, + "grad_norm": 0.06449268303648867, + "language_loss": 0.90758598, + "learning_rate": 0.0009483387246819542, + "loss": 0.91847265, + "num_input_tokens_seen": 73578048, + "router_z_loss_mlp": 0.34667969, + "step": 892, + "time_per_iteration": 2.731332540512085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032556, + "balance_loss_mlp": 1.01767898, + "epoch": 0.17179684494036168, + "flos": 1381026972672.0, + "grad_norm": 0.016720826063608682, + "language_loss": 0.82285583, + "learning_rate": 0.0009482007227140877, + "loss": 0.83318138, + "num_input_tokens_seen": 73798640, + "router_z_loss_mlp": 0.1484375, + "step": 893, + "time_per_iteration": 4.641844987869263 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097484, + "balance_loss_mlp": 1.06386662, + "epoch": 0.17198922662562524, + "flos": 492386383872.0, + "grad_norm": 0.05711411270468228, + "language_loss": 0.89587665, + "learning_rate": 0.0009480625467392688, + "loss": 0.90685147, + "num_input_tokens_seen": 73867328, + "router_z_loss_mlp": 0.33642578, + "step": 894, + "time_per_iteration": 2.6310250759124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033123, + "balance_loss_mlp": 1.01795936, + "epoch": 0.1721816083108888, + "flos": 1457529914880.0, + "grad_norm": 0.013728573618451478, + "language_loss": 0.77994668, + "learning_rate": 0.0009479241968111421, + "loss": 0.79027796, + "num_input_tokens_seen": 74093376, + "router_z_loss_mlp": 0.15136719, + "step": 895, + "time_per_iteration": 4.7525646686553955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132534, + "balance_loss_mlp": 1.09834456, + "epoch": 0.17237398999615236, + "flos": 527853030912.0, + "grad_norm": 0.05821127752563967, + "language_loss": 0.87793648, + "learning_rate": 0.0009477856729834196, + "loss": 0.88926184, + "num_input_tokens_seen": 74169136, + "router_z_loss_mlp": 0.34228516, + "step": 896, + "time_per_iteration": 2.7015438079833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132108, + "balance_loss_mlp": 1.09901524, + "epoch": 0.17256637168141592, + "flos": 603644562432.0, + "grad_norm": 0.08337200045302615, + "language_loss": 0.9056648, + "learning_rate": 0.0009476469753098809, + "loss": 0.91698587, + "num_input_tokens_seen": 74236912, + "router_z_loss_mlp": 0.33105469, + "step": 897, + "time_per_iteration": 2.7035813331604004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125108, + "balance_loss_mlp": 1.08922589, + "epoch": 0.17275875336667948, + "flos": 509437334016.0, + "grad_norm": 0.05742024530278536, + "language_loss": 0.874506, + "learning_rate": 0.0009475081038443738, + "loss": 0.88575709, + "num_input_tokens_seen": 74305968, + "router_z_loss_mlp": 0.35913086, + "step": 898, + "time_per_iteration": 2.584437370300293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115669, + "balance_loss_mlp": 1.07971573, + "epoch": 0.17295113505194307, + "flos": 664951693824.0, + "grad_norm": 0.06535241228499304, + "language_loss": 0.85809892, + "learning_rate": 0.0009473690586408124, + "loss": 0.86925566, + "num_input_tokens_seen": 74384144, + "router_z_loss_mlp": 0.35986328, + "step": 899, + "time_per_iteration": 2.83156418800354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116393, + "balance_loss_mlp": 1.08084452, + "epoch": 0.17314351673720663, + "flos": 555125253120.0, + "grad_norm": 0.0683413775827569, + "language_loss": 0.86639923, + "learning_rate": 0.0009472298397531792, + "loss": 0.87756318, + "num_input_tokens_seen": 74455040, + "router_z_loss_mlp": 0.35546875, + "step": 900, + "time_per_iteration": 2.6944193840026855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123337, + "balance_loss_mlp": 1.08635855, + "epoch": 0.17333589842247019, + "flos": 503361326592.0, + "grad_norm": 0.09670394547256775, + "language_loss": 0.87118709, + "learning_rate": 0.0009470904472355235, + "loss": 0.88242042, + "num_input_tokens_seen": 74525248, + "router_z_loss_mlp": 0.36987305, + "step": 901, + "time_per_iteration": 2.637882709503174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114178, + "balance_loss_mlp": 1.07982159, + "epoch": 0.17352828010773375, + "flos": 555924003840.0, + "grad_norm": 0.06358596699153923, + "language_loss": 0.79912066, + "learning_rate": 0.0009469508811419626, + "loss": 0.81026244, + "num_input_tokens_seen": 74597328, + "router_z_loss_mlp": 0.34399414, + "step": 902, + "time_per_iteration": 2.726072311401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077408, + "balance_loss_mlp": 1.06453359, + "epoch": 0.1737206617929973, + "flos": 1553711556096.0, + "grad_norm": 0.030950293127884103, + "language_loss": 0.7161383, + "learning_rate": 0.0009468111415266806, + "loss": 0.72691238, + "num_input_tokens_seen": 74819664, + "router_z_loss_mlp": 0.12890625, + "step": 903, + "time_per_iteration": 4.791790723800659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109546, + "balance_loss_mlp": 1.07445073, + "epoch": 0.17391304347826086, + "flos": 516390667776.0, + "grad_norm": 0.06883251868009001, + "language_loss": 0.84220147, + "learning_rate": 0.0009466712284439292, + "loss": 0.85329694, + "num_input_tokens_seen": 74896224, + "router_z_loss_mlp": 0.35131836, + "step": 904, + "time_per_iteration": 2.7648017406463623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104504, + "balance_loss_mlp": 1.06995738, + "epoch": 0.17410542516352442, + "flos": 540773273088.0, + "grad_norm": 0.06988851938988141, + "language_loss": 0.8903957, + "learning_rate": 0.0009465311419480276, + "loss": 0.90144074, + "num_input_tokens_seen": 74966560, + "router_z_loss_mlp": 0.34545898, + "step": 905, + "time_per_iteration": 2.725829601287842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098399, + "balance_loss_mlp": 1.0637325, + "epoch": 0.17429780684878798, + "flos": 623542869504.0, + "grad_norm": 0.06312030659776342, + "language_loss": 0.88624233, + "learning_rate": 0.0009463908820933622, + "loss": 0.89722633, + "num_input_tokens_seen": 75045248, + "router_z_loss_mlp": 0.34692383, + "step": 906, + "time_per_iteration": 2.8389482498168945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093859, + "balance_loss_mlp": 1.05900264, + "epoch": 0.17449018853405157, + "flos": 575368386048.0, + "grad_norm": 0.056066721215551084, + "language_loss": 0.83138871, + "learning_rate": 0.0009462504489343868, + "loss": 0.84232736, + "num_input_tokens_seen": 75123952, + "router_z_loss_mlp": 0.34863281, + "step": 907, + "time_per_iteration": 2.863349437713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086469, + "balance_loss_mlp": 1.05199337, + "epoch": 0.17468257021931513, + "flos": 533499844608.0, + "grad_norm": 0.07604190500703253, + "language_loss": 0.894853, + "learning_rate": 0.0009461098425256222, + "loss": 0.90571761, + "num_input_tokens_seen": 75191728, + "router_z_loss_mlp": 0.3449707, + "step": 908, + "time_per_iteration": 2.5941011905670166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108834, + "balance_loss_mlp": 1.05345941, + "epoch": 0.1748749519045787, + "flos": 540496848384.0, + "grad_norm": 0.050694136543679796, + "language_loss": 0.85873353, + "learning_rate": 0.0009459690629216567, + "loss": 0.86961693, + "num_input_tokens_seen": 75262224, + "router_z_loss_mlp": 0.34887695, + "step": 909, + "time_per_iteration": 2.6097571849823 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109032, + "balance_loss_mlp": 1.0570612, + "epoch": 0.17506733358984225, + "flos": 498373641216.0, + "grad_norm": 0.0569262349138849, + "language_loss": 0.88138729, + "learning_rate": 0.0009458281101771457, + "loss": 0.89229047, + "num_input_tokens_seen": 75329760, + "router_z_loss_mlp": 0.33276367, + "step": 910, + "time_per_iteration": 2.5904784202575684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091588, + "balance_loss_mlp": 1.05744696, + "epoch": 0.1752597152751058, + "flos": 622621873152.0, + "grad_norm": 0.06350455217589325, + "language_loss": 0.8266046, + "learning_rate": 0.0009456869843468122, + "loss": 0.83752048, + "num_input_tokens_seen": 75407920, + "router_z_loss_mlp": 0.34179688, + "step": 911, + "time_per_iteration": 2.8930556774139404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090023, + "balance_loss_mlp": 1.05476046, + "epoch": 0.17545209696036937, + "flos": 520717814784.0, + "grad_norm": 0.07844481886152296, + "language_loss": 0.79097009, + "learning_rate": 0.0009455456854854459, + "loss": 0.80187035, + "num_input_tokens_seen": 75476752, + "router_z_loss_mlp": 0.35302734, + "step": 912, + "time_per_iteration": 2.5984511375427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096028, + "balance_loss_mlp": 1.0631026, + "epoch": 0.17564447864563293, + "flos": 461750270976.0, + "grad_norm": 0.05516798292623818, + "language_loss": 0.84505737, + "learning_rate": 0.0009454042136479039, + "loss": 0.85601771, + "num_input_tokens_seen": 75542944, + "router_z_loss_mlp": 0.3293457, + "step": 913, + "time_per_iteration": 2.586195945739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085981, + "balance_loss_mlp": 1.05286503, + "epoch": 0.1758368603308965, + "flos": 480416251392.0, + "grad_norm": 0.05301404729603274, + "language_loss": 0.83308446, + "learning_rate": 0.0009452625688891103, + "loss": 0.84394431, + "num_input_tokens_seen": 75609840, + "router_z_loss_mlp": 0.33129883, + "step": 914, + "time_per_iteration": 2.5374929904937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052517, + "balance_loss_mlp": 1.038975, + "epoch": 0.17602924201616005, + "flos": 1478160133632.0, + "grad_norm": 0.03507986977902886, + "language_loss": 0.78734738, + "learning_rate": 0.0009451207512640567, + "loss": 0.79787254, + "num_input_tokens_seen": 75819312, + "router_z_loss_mlp": 0.13574219, + "step": 915, + "time_per_iteration": 4.5561583042144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097389, + "balance_loss_mlp": 1.06226993, + "epoch": 0.17622162370142364, + "flos": 602010593280.0, + "grad_norm": 0.06815502965849334, + "language_loss": 0.93451321, + "learning_rate": 0.0009449787608278015, + "loss": 0.94548714, + "num_input_tokens_seen": 75893984, + "router_z_loss_mlp": 0.35131836, + "step": 916, + "time_per_iteration": 2.7807908058166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092918, + "balance_loss_mlp": 1.0588007, + "epoch": 0.1764140053866872, + "flos": 442473214464.0, + "grad_norm": 0.0637680644109211, + "language_loss": 0.92700857, + "learning_rate": 0.0009448365976354704, + "loss": 0.93793774, + "num_input_tokens_seen": 75958944, + "router_z_loss_mlp": 0.34130859, + "step": 917, + "time_per_iteration": 2.4800124168395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105643, + "balance_loss_mlp": 1.06909323, + "epoch": 0.17660638707195075, + "flos": 500362610688.0, + "grad_norm": 0.07080486598597346, + "language_loss": 0.90158784, + "learning_rate": 0.0009446942617422558, + "loss": 0.91264427, + "num_input_tokens_seen": 76024240, + "router_z_loss_mlp": 0.36547852, + "step": 918, + "time_per_iteration": 2.5415430068969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101469, + "balance_loss_mlp": 1.06766129, + "epoch": 0.17679876875721431, + "flos": 538621360128.0, + "grad_norm": 0.060000223973742446, + "language_loss": 0.86201262, + "learning_rate": 0.0009445517532034176, + "loss": 0.87302732, + "num_input_tokens_seen": 76095264, + "router_z_loss_mlp": 0.33789062, + "step": 919, + "time_per_iteration": 2.6849868297576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121669, + "balance_loss_mlp": 1.08569145, + "epoch": 0.17699115044247787, + "flos": 497477376000.0, + "grad_norm": 0.08221632690768264, + "language_loss": 0.89522099, + "learning_rate": 0.0009444090720742824, + "loss": 0.9064377, + "num_input_tokens_seen": 76163520, + "router_z_loss_mlp": 0.35986328, + "step": 920, + "time_per_iteration": 2.6034600734710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113572, + "balance_loss_mlp": 1.07883418, + "epoch": 0.17718353212774143, + "flos": 662444780544.0, + "grad_norm": 0.08029288241638204, + "language_loss": 0.88040781, + "learning_rate": 0.0009442662184102439, + "loss": 0.89154357, + "num_input_tokens_seen": 76233760, + "router_z_loss_mlp": 0.34741211, + "step": 921, + "time_per_iteration": 2.767352342605591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105947, + "balance_loss_mlp": 1.07309294, + "epoch": 0.177375913813005, + "flos": 582340658688.0, + "grad_norm": 0.0705507668945597, + "language_loss": 0.87951338, + "learning_rate": 0.000944123192266763, + "loss": 0.89057279, + "num_input_tokens_seen": 76310704, + "router_z_loss_mlp": 0.32836914, + "step": 922, + "time_per_iteration": 2.789315700531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108289, + "balance_loss_mlp": 1.0727644, + "epoch": 0.17756829549826855, + "flos": 552285098496.0, + "grad_norm": 0.06115562628552814, + "language_loss": 0.83835006, + "learning_rate": 0.0009439799936993671, + "loss": 0.84943295, + "num_input_tokens_seen": 76386992, + "router_z_loss_mlp": 0.35546875, + "step": 923, + "time_per_iteration": 2.7160987854003906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090669, + "balance_loss_mlp": 1.05733824, + "epoch": 0.17776067718353214, + "flos": 556060806144.0, + "grad_norm": 0.07059184324253498, + "language_loss": 0.88508618, + "learning_rate": 0.0009438366227636511, + "loss": 0.89599288, + "num_input_tokens_seen": 76453328, + "router_z_loss_mlp": 0.33349609, + "step": 924, + "time_per_iteration": 2.6319191455841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084897, + "balance_loss_mlp": 1.05163789, + "epoch": 0.1779530588687957, + "flos": 658161303552.0, + "grad_norm": 0.06263940487075517, + "language_loss": 0.86677843, + "learning_rate": 0.0009436930795152763, + "loss": 0.87762737, + "num_input_tokens_seen": 76529040, + "router_z_loss_mlp": 0.33276367, + "step": 925, + "time_per_iteration": 2.8063783645629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084326, + "balance_loss_mlp": 1.05159163, + "epoch": 0.17814544055405926, + "flos": 644187644928.0, + "grad_norm": 0.06448697412821461, + "language_loss": 0.8710525, + "learning_rate": 0.0009435493640099713, + "loss": 0.88189578, + "num_input_tokens_seen": 76604080, + "router_z_loss_mlp": 0.32739258, + "step": 926, + "time_per_iteration": 2.7599081993103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080787, + "balance_loss_mlp": 1.04664516, + "epoch": 0.17833782223932282, + "flos": 460672123392.0, + "grad_norm": 0.06497730431564504, + "language_loss": 0.84328961, + "learning_rate": 0.0009434054763035314, + "loss": 0.85409749, + "num_input_tokens_seen": 76674096, + "router_z_loss_mlp": 0.34155273, + "step": 927, + "time_per_iteration": 2.612910032272339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081381, + "balance_loss_mlp": 1.04740596, + "epoch": 0.17853020392458638, + "flos": 759212766720.0, + "grad_norm": 0.04594292129212818, + "language_loss": 0.85898727, + "learning_rate": 0.0009432614164518185, + "loss": 0.8698011, + "num_input_tokens_seen": 76752144, + "router_z_loss_mlp": 0.33984375, + "step": 928, + "time_per_iteration": 2.926981210708618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086846, + "balance_loss_mlp": 1.05153632, + "epoch": 0.17872258560984994, + "flos": 782320785408.0, + "grad_norm": 0.055185850673896385, + "language_loss": 0.84792197, + "learning_rate": 0.000943117184510762, + "loss": 0.85879046, + "num_input_tokens_seen": 76830240, + "router_z_loss_mlp": 0.35327148, + "step": 929, + "time_per_iteration": 2.995514154434204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039861, + "balance_loss_mlp": 1.02660513, + "epoch": 0.1789149672951135, + "flos": 1459095482880.0, + "grad_norm": 0.021362691821678215, + "language_loss": 0.78789961, + "learning_rate": 0.0009429727805363575, + "loss": 0.79829824, + "num_input_tokens_seen": 77062464, + "router_z_loss_mlp": 0.1328125, + "step": 930, + "time_per_iteration": 4.99839448928833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091678, + "balance_loss_mlp": 1.05739331, + "epoch": 0.17910734898037706, + "flos": 503598463488.0, + "grad_norm": 0.05761618473313655, + "language_loss": 0.88773429, + "learning_rate": 0.0009428282045846674, + "loss": 0.89865112, + "num_input_tokens_seen": 77136672, + "router_z_loss_mlp": 0.34301758, + "step": 931, + "time_per_iteration": 2.6966652870178223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087828, + "balance_loss_mlp": 1.05452061, + "epoch": 0.17929973066564064, + "flos": 745895264256.0, + "grad_norm": 0.05798282919409206, + "language_loss": 0.89983928, + "learning_rate": 0.0009426834567118214, + "loss": 0.91071755, + "num_input_tokens_seen": 77227040, + "router_z_loss_mlp": 0.33300781, + "step": 932, + "time_per_iteration": 3.072160482406616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092765, + "balance_loss_mlp": 1.05907631, + "epoch": 0.1794921123509042, + "flos": 712875893760.0, + "grad_norm": 0.055390897890994044, + "language_loss": 0.80879378, + "learning_rate": 0.0009425385369740155, + "loss": 0.81972146, + "num_input_tokens_seen": 77319392, + "router_z_loss_mlp": 0.3371582, + "step": 933, + "time_per_iteration": 3.0337042808532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092731, + "balance_loss_mlp": 1.05825567, + "epoch": 0.17968449403616776, + "flos": 632838763008.0, + "grad_norm": 0.0687685702394307, + "language_loss": 0.87443584, + "learning_rate": 0.0009423934454275125, + "loss": 0.8853631, + "num_input_tokens_seen": 77394688, + "router_z_loss_mlp": 0.3449707, + "step": 934, + "time_per_iteration": 2.7970879077911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089195, + "balance_loss_mlp": 1.05526757, + "epoch": 0.17987687572143132, + "flos": 536060602368.0, + "grad_norm": 0.08214865293258214, + "language_loss": 0.92215371, + "learning_rate": 0.0009422481821286418, + "loss": 0.93304563, + "num_input_tokens_seen": 77468288, + "router_z_loss_mlp": 0.33959961, + "step": 935, + "time_per_iteration": 2.7134642601013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091708, + "balance_loss_mlp": 1.05914021, + "epoch": 0.18006925740669488, + "flos": 537818227200.0, + "grad_norm": 0.0718764173736199, + "language_loss": 0.87967253, + "learning_rate": 0.0009421027471337998, + "loss": 0.89058959, + "num_input_tokens_seen": 77535840, + "router_z_loss_mlp": 0.32568359, + "step": 936, + "time_per_iteration": 2.608764171600342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098474, + "balance_loss_mlp": 1.06333113, + "epoch": 0.18026163909195844, + "flos": 539255757312.0, + "grad_norm": 0.06697051800305152, + "language_loss": 0.82882118, + "learning_rate": 0.0009419571404994493, + "loss": 0.83980596, + "num_input_tokens_seen": 77604000, + "router_z_loss_mlp": 0.3515625, + "step": 937, + "time_per_iteration": 2.620296001434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096306, + "balance_loss_mlp": 1.06240284, + "epoch": 0.180454020777222, + "flos": 500382959616.0, + "grad_norm": 0.08555714620461663, + "language_loss": 0.90948844, + "learning_rate": 0.00094181136228212, + "loss": 0.92045152, + "num_input_tokens_seen": 77671488, + "router_z_loss_mlp": 0.33935547, + "step": 938, + "time_per_iteration": 2.62837290763855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109732, + "balance_loss_mlp": 1.06415629, + "epoch": 0.18064640246248556, + "flos": 498689353728.0, + "grad_norm": 0.06983123921060745, + "language_loss": 0.86323059, + "learning_rate": 0.0009416654125384077, + "loss": 0.8742038, + "num_input_tokens_seen": 77746240, + "router_z_loss_mlp": 0.33154297, + "step": 939, + "time_per_iteration": 2.715686321258545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054723, + "balance_loss_mlp": 1.04242051, + "epoch": 0.18083878414774912, + "flos": 1518572358144.0, + "grad_norm": 0.027679884047562747, + "language_loss": 0.79772377, + "learning_rate": 0.0009415192913249752, + "loss": 0.80827093, + "num_input_tokens_seen": 77966080, + "router_z_loss_mlp": 0.12304688, + "step": 940, + "time_per_iteration": 4.941875219345093 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090642, + "balance_loss_mlp": 1.05728722, + "epoch": 0.1810311658330127, + "flos": 727006703616.0, + "grad_norm": 0.07011009980003599, + "language_loss": 0.84326053, + "learning_rate": 0.000941372998698552, + "loss": 0.85416698, + "num_input_tokens_seen": 78049200, + "router_z_loss_mlp": 0.33374023, + "step": 941, + "time_per_iteration": 2.931520938873291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094264, + "balance_loss_mlp": 1.0597409, + "epoch": 0.18122354751827627, + "flos": 564643726848.0, + "grad_norm": 0.08254502738164117, + "language_loss": 0.8207435, + "learning_rate": 0.0009412265347159336, + "loss": 0.8316862, + "num_input_tokens_seen": 78122752, + "router_z_loss_mlp": 0.34570312, + "step": 942, + "time_per_iteration": 2.696354627609253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091238, + "balance_loss_mlp": 1.05869377, + "epoch": 0.18141592920353983, + "flos": 519024208896.0, + "grad_norm": 0.05729066672306875, + "language_loss": 0.85217965, + "learning_rate": 0.0009410798994339829, + "loss": 0.86309201, + "num_input_tokens_seen": 78194064, + "router_z_loss_mlp": 0.32543945, + "step": 943, + "time_per_iteration": 2.6009600162506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088013, + "balance_loss_mlp": 1.0545156, + "epoch": 0.1816083108888034, + "flos": 512219261952.0, + "grad_norm": 0.05342615519744699, + "language_loss": 0.88234782, + "learning_rate": 0.000940933092909628, + "loss": 0.89322793, + "num_input_tokens_seen": 78262048, + "router_z_loss_mlp": 0.33520508, + "step": 944, + "time_per_iteration": 2.618419647216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095526, + "balance_loss_mlp": 1.06286263, + "epoch": 0.18180069257406695, + "flos": 492144864768.0, + "grad_norm": 0.053227732023653135, + "language_loss": 0.8393383, + "learning_rate": 0.0009407861151998649, + "loss": 0.85029352, + "num_input_tokens_seen": 78330624, + "router_z_loss_mlp": 0.32666016, + "step": 945, + "time_per_iteration": 2.5718705654144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097774, + "balance_loss_mlp": 1.06406188, + "epoch": 0.1819930742593305, + "flos": 569891870208.0, + "grad_norm": 0.05775782434029923, + "language_loss": 0.86156505, + "learning_rate": 0.0009406389663617552, + "loss": 0.87254274, + "num_input_tokens_seen": 78400672, + "router_z_loss_mlp": 0.33740234, + "step": 946, + "time_per_iteration": 2.66513729095459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097522, + "balance_loss_mlp": 1.06433463, + "epoch": 0.18218545594459407, + "flos": 605693168640.0, + "grad_norm": 0.06350431386522506, + "language_loss": 0.85736459, + "learning_rate": 0.000940491646452427, + "loss": 0.86833978, + "num_input_tokens_seen": 78467952, + "router_z_loss_mlp": 0.33203125, + "step": 947, + "time_per_iteration": 2.715071201324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103916, + "balance_loss_mlp": 1.07010818, + "epoch": 0.18237783762985763, + "flos": 548419230720.0, + "grad_norm": 0.06277969821047595, + "language_loss": 0.91195452, + "learning_rate": 0.000940344155529075, + "loss": 0.92299366, + "num_input_tokens_seen": 78538928, + "router_z_loss_mlp": 0.33837891, + "step": 948, + "time_per_iteration": 2.6502938270568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099574, + "balance_loss_mlp": 1.06550407, + "epoch": 0.1825702193151212, + "flos": 450509078016.0, + "grad_norm": 0.06933176029299125, + "language_loss": 0.87683523, + "learning_rate": 0.0009401964936489605, + "loss": 0.88783091, + "num_input_tokens_seen": 78602144, + "router_z_loss_mlp": 0.34106445, + "step": 949, + "time_per_iteration": 2.5181798934936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084135, + "balance_loss_mlp": 1.05247355, + "epoch": 0.18276260100038477, + "flos": 588962313216.0, + "grad_norm": 0.07980064544074586, + "language_loss": 0.85422635, + "learning_rate": 0.0009400486608694108, + "loss": 0.86506772, + "num_input_tokens_seen": 78673152, + "router_z_loss_mlp": 0.31640625, + "step": 950, + "time_per_iteration": 2.7189955711364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087871, + "balance_loss_mlp": 1.05384839, + "epoch": 0.18295498268564833, + "flos": 786988376064.0, + "grad_norm": 0.05265351460276348, + "language_loss": 0.87225658, + "learning_rate": 0.0009399006572478195, + "loss": 0.88313532, + "num_input_tokens_seen": 78753872, + "router_z_loss_mlp": 0.34033203, + "step": 951, + "time_per_iteration": 3.0805773735046387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086089, + "balance_loss_mlp": 1.05218577, + "epoch": 0.1831473643709119, + "flos": 577878271488.0, + "grad_norm": 0.059447924131550096, + "language_loss": 0.91242015, + "learning_rate": 0.0009397524828416468, + "loss": 0.92328107, + "num_input_tokens_seen": 78822640, + "router_z_loss_mlp": 0.33935547, + "step": 952, + "time_per_iteration": 2.6567108631134033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082155, + "balance_loss_mlp": 1.04801321, + "epoch": 0.18333974605617545, + "flos": 566622521856.0, + "grad_norm": 0.05513512337372911, + "language_loss": 0.96184212, + "learning_rate": 0.0009396041377084192, + "loss": 0.97266364, + "num_input_tokens_seen": 78893792, + "router_z_loss_mlp": 0.34179688, + "step": 953, + "time_per_iteration": 2.6937921047210693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079403, + "balance_loss_mlp": 1.04478431, + "epoch": 0.183532127741439, + "flos": 526725421056.0, + "grad_norm": 0.07204875194033089, + "language_loss": 0.87840325, + "learning_rate": 0.0009394556219057295, + "loss": 0.88919723, + "num_input_tokens_seen": 78964752, + "router_z_loss_mlp": 0.34667969, + "step": 954, + "time_per_iteration": 2.6962215900421143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107777, + "balance_loss_mlp": 1.04272258, + "epoch": 0.18372450942670257, + "flos": 594259918848.0, + "grad_norm": 0.07227161235955501, + "language_loss": 0.83883446, + "learning_rate": 0.0009393069354912362, + "loss": 0.84961212, + "num_input_tokens_seen": 79034400, + "router_z_loss_mlp": 0.35058594, + "step": 955, + "time_per_iteration": 2.718308925628662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081248, + "balance_loss_mlp": 1.04677236, + "epoch": 0.18391689111196613, + "flos": 644720145408.0, + "grad_norm": 0.07091738302891186, + "language_loss": 0.82511717, + "learning_rate": 0.0009391580785226649, + "loss": 0.83592963, + "num_input_tokens_seen": 79109488, + "router_z_loss_mlp": 0.34521484, + "step": 956, + "time_per_iteration": 2.907367467880249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077991, + "balance_loss_mlp": 1.06216049, + "epoch": 0.18410927279722972, + "flos": 1456246563840.0, + "grad_norm": 0.048423099914415325, + "language_loss": 0.79340446, + "learning_rate": 0.0009390090510578067, + "loss": 0.80418444, + "num_input_tokens_seen": 79327712, + "router_z_loss_mlp": 0.15820312, + "step": 957, + "time_per_iteration": 4.78663969039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091525, + "balance_loss_mlp": 1.05702567, + "epoch": 0.18430165448249328, + "flos": 658437728256.0, + "grad_norm": 0.09319397884021513, + "language_loss": 0.86484683, + "learning_rate": 0.0009388598531545196, + "loss": 0.8757621, + "num_input_tokens_seen": 79401504, + "router_z_loss_mlp": 0.34545898, + "step": 958, + "time_per_iteration": 2.8470118045806885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087326, + "balance_loss_mlp": 1.05285025, + "epoch": 0.18449403616775684, + "flos": 517679811072.0, + "grad_norm": 0.07377492103556435, + "language_loss": 0.86076611, + "learning_rate": 0.000938710484870727, + "loss": 0.87163937, + "num_input_tokens_seen": 79466688, + "router_z_loss_mlp": 0.3449707, + "step": 959, + "time_per_iteration": 2.5930731296539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090986, + "balance_loss_mlp": 1.05672574, + "epoch": 0.1846864178530204, + "flos": 552481537536.0, + "grad_norm": 0.06589557505977534, + "language_loss": 0.86379164, + "learning_rate": 0.0009385609462644189, + "loss": 0.8747015, + "num_input_tokens_seen": 79540288, + "router_z_loss_mlp": 0.34277344, + "step": 960, + "time_per_iteration": 2.688706636428833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096456, + "balance_loss_mlp": 1.06212378, + "epoch": 0.18487879953828396, + "flos": 465930441216.0, + "grad_norm": 0.0643439417949763, + "language_loss": 0.86035949, + "learning_rate": 0.0009384112373936514, + "loss": 0.871324, + "num_input_tokens_seen": 79611872, + "router_z_loss_mlp": 0.34326172, + "step": 961, + "time_per_iteration": 4.0801496505737305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095895, + "balance_loss_mlp": 1.06132412, + "epoch": 0.18507118122354752, + "flos": 648200489472.0, + "grad_norm": 0.0614591664996872, + "language_loss": 0.91820455, + "learning_rate": 0.0009382613583165467, + "loss": 0.92916346, + "num_input_tokens_seen": 79689504, + "router_z_loss_mlp": 0.34594727, + "step": 962, + "time_per_iteration": 2.790069341659546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113098, + "balance_loss_mlp": 1.07921863, + "epoch": 0.18526356290881107, + "flos": 626486330880.0, + "grad_norm": 0.06374556186760763, + "language_loss": 0.89594233, + "learning_rate": 0.0009381113090912928, + "loss": 0.90707326, + "num_input_tokens_seen": 79759264, + "router_z_loss_mlp": 0.33886719, + "step": 963, + "time_per_iteration": 2.6891098022460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117951, + "balance_loss_mlp": 1.08559799, + "epoch": 0.18545594459407463, + "flos": 432497843712.0, + "grad_norm": 0.06491910119233056, + "language_loss": 0.90103394, + "learning_rate": 0.000937961089776144, + "loss": 0.91221344, + "num_input_tokens_seen": 79824464, + "router_z_loss_mlp": 0.32348633, + "step": 964, + "time_per_iteration": 2.5821962356567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124554, + "balance_loss_mlp": 1.08926833, + "epoch": 0.1856483262793382, + "flos": 748720862208.0, + "grad_norm": 0.06849062336391444, + "language_loss": 0.829036, + "learning_rate": 0.0009378107004294208, + "loss": 0.84028149, + "num_input_tokens_seen": 79907152, + "router_z_loss_mlp": 0.35302734, + "step": 965, + "time_per_iteration": 2.9898061752319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115255, + "balance_loss_mlp": 1.081972, + "epoch": 0.18584070796460178, + "flos": 530058788352.0, + "grad_norm": 0.08647217477609576, + "language_loss": 0.91352308, + "learning_rate": 0.0009376601411095096, + "loss": 0.92467564, + "num_input_tokens_seen": 79976944, + "router_z_loss_mlp": 0.33300781, + "step": 966, + "time_per_iteration": 2.6415059566497803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093436, + "balance_loss_mlp": 1.06196475, + "epoch": 0.18603308964986534, + "flos": 482863527936.0, + "grad_norm": 0.05783783242438048, + "language_loss": 0.8708145, + "learning_rate": 0.0009375094118748622, + "loss": 0.88174886, + "num_input_tokens_seen": 80042112, + "router_z_loss_mlp": 0.31445312, + "step": 967, + "time_per_iteration": 2.5149550437927246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089094, + "balance_loss_mlp": 1.05650234, + "epoch": 0.1862254713351289, + "flos": 800976591360.0, + "grad_norm": 0.0756042683078202, + "language_loss": 0.9083451, + "learning_rate": 0.0009373585127839976, + "loss": 0.91923606, + "num_input_tokens_seen": 80118896, + "router_z_loss_mlp": 0.32592773, + "step": 968, + "time_per_iteration": 2.9569485187530518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084434, + "balance_loss_mlp": 1.05250978, + "epoch": 0.18641785302039246, + "flos": 478082456064.0, + "grad_norm": 0.06160067145414361, + "language_loss": 0.91074634, + "learning_rate": 0.0009372074438954994, + "loss": 0.92159069, + "num_input_tokens_seen": 80183360, + "router_z_loss_mlp": 0.3190918, + "step": 969, + "time_per_iteration": 2.508530378341675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083542, + "balance_loss_mlp": 1.05040169, + "epoch": 0.18661023470565602, + "flos": 388695587328.0, + "grad_norm": 0.07517959095695621, + "language_loss": 0.91676056, + "learning_rate": 0.0009370562052680181, + "loss": 0.92759597, + "num_input_tokens_seen": 80247024, + "router_z_loss_mlp": 0.33154297, + "step": 970, + "time_per_iteration": 2.4572672843933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087332, + "balance_loss_mlp": 1.05400109, + "epoch": 0.18680261639091958, + "flos": 564402207744.0, + "grad_norm": 0.052448577146131624, + "language_loss": 0.89610398, + "learning_rate": 0.0009369047969602695, + "loss": 0.90697736, + "num_input_tokens_seen": 80318256, + "router_z_loss_mlp": 0.33349609, + "step": 971, + "time_per_iteration": 2.714704751968384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101064, + "balance_loss_mlp": 1.06556404, + "epoch": 0.18699499807618314, + "flos": 479018009088.0, + "grad_norm": 0.06595213007116614, + "language_loss": 0.8674072, + "learning_rate": 0.0009367532190310357, + "loss": 0.87841785, + "num_input_tokens_seen": 80384848, + "router_z_loss_mlp": 0.35498047, + "step": 972, + "time_per_iteration": 2.589667558670044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111914, + "balance_loss_mlp": 1.07660413, + "epoch": 0.1871873797614467, + "flos": 553022802432.0, + "grad_norm": 0.0720295199384638, + "language_loss": 0.88701892, + "learning_rate": 0.0009366014715391644, + "loss": 0.89813805, + "num_input_tokens_seen": 80453088, + "router_z_loss_mlp": 0.35327148, + "step": 973, + "time_per_iteration": 2.634023904800415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107724, + "balance_loss_mlp": 1.07389259, + "epoch": 0.18737976144671029, + "flos": 552526617600.0, + "grad_norm": 0.05153911900793568, + "language_loss": 0.8432554, + "learning_rate": 0.0009364495545435693, + "loss": 0.85433269, + "num_input_tokens_seen": 80528608, + "router_z_loss_mlp": 0.33837891, + "step": 974, + "time_per_iteration": 2.7729458808898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107099, + "balance_loss_mlp": 1.07281494, + "epoch": 0.18757214313197385, + "flos": 502002372096.0, + "grad_norm": 0.05815108638233015, + "language_loss": 0.88620323, + "learning_rate": 0.0009362974681032297, + "loss": 0.8972742, + "num_input_tokens_seen": 80599600, + "router_z_loss_mlp": 0.34326172, + "step": 975, + "time_per_iteration": 2.631744623184204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105498, + "balance_loss_mlp": 1.07130909, + "epoch": 0.1877645248172374, + "flos": 674691337728.0, + "grad_norm": 0.06841603134690444, + "language_loss": 0.88265896, + "learning_rate": 0.0009361452122771907, + "loss": 0.89371395, + "num_input_tokens_seen": 80677264, + "router_z_loss_mlp": 0.34204102, + "step": 976, + "time_per_iteration": 2.8427281379699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094167, + "balance_loss_mlp": 1.06012082, + "epoch": 0.18795690650250096, + "flos": 404771696640.0, + "grad_norm": 0.07319435948671522, + "language_loss": 0.8377496, + "learning_rate": 0.0009359927871245635, + "loss": 0.84869128, + "num_input_tokens_seen": 80739776, + "router_z_loss_mlp": 0.34057617, + "step": 977, + "time_per_iteration": 2.4665186405181885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090091, + "balance_loss_mlp": 1.0565697, + "epoch": 0.18814928818776452, + "flos": 637599485952.0, + "grad_norm": 0.05986452276683665, + "language_loss": 0.86337954, + "learning_rate": 0.0009358401927045246, + "loss": 0.87428045, + "num_input_tokens_seen": 80815200, + "router_z_loss_mlp": 0.33520508, + "step": 978, + "time_per_iteration": 2.8037781715393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090672, + "balance_loss_mlp": 1.05707908, + "epoch": 0.18834166987302808, + "flos": 1137825446400.0, + "grad_norm": 0.054509582003230646, + "language_loss": 0.88314402, + "learning_rate": 0.0009356874290763166, + "loss": 0.89405078, + "num_input_tokens_seen": 80905024, + "router_z_loss_mlp": 0.33618164, + "step": 979, + "time_per_iteration": 3.456723213195801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097981, + "balance_loss_mlp": 1.06481671, + "epoch": 0.18853405155829164, + "flos": 504538398720.0, + "grad_norm": 0.06366920756378494, + "language_loss": 0.8866874, + "learning_rate": 0.0009355344962992474, + "loss": 0.89766723, + "num_input_tokens_seen": 80976704, + "router_z_loss_mlp": 0.33154297, + "step": 980, + "time_per_iteration": 2.6105339527130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109825, + "balance_loss_mlp": 1.06494308, + "epoch": 0.1887264332435552, + "flos": 607879987200.0, + "grad_norm": 0.05130215804193928, + "language_loss": 0.88147485, + "learning_rate": 0.0009353813944326908, + "loss": 0.89245737, + "num_input_tokens_seen": 81057152, + "router_z_loss_mlp": 0.33325195, + "step": 981, + "time_per_iteration": 2.882836103439331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109809, + "balance_loss_mlp": 1.0758822, + "epoch": 0.1889188149288188, + "flos": 552264749568.0, + "grad_norm": 0.07032712681879846, + "language_loss": 0.83146608, + "learning_rate": 0.0009352281235360863, + "loss": 0.84256417, + "num_input_tokens_seen": 81131520, + "router_z_loss_mlp": 0.33959961, + "step": 982, + "time_per_iteration": 2.695748805999756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120418, + "balance_loss_mlp": 1.08775461, + "epoch": 0.18911119661408235, + "flos": 418332128256.0, + "grad_norm": 0.06033753714629359, + "language_loss": 0.84987485, + "learning_rate": 0.0009350746836689389, + "loss": 0.86107904, + "num_input_tokens_seen": 81195952, + "router_z_loss_mlp": 0.32666016, + "step": 983, + "time_per_iteration": 2.5073440074920654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260435, + "balance_loss_mlp": 1.23916793, + "epoch": 0.1893035782993459, + "flos": 1481141320704.0, + "grad_norm": 0.0731593378732656, + "language_loss": 0.81439221, + "learning_rate": 0.0009349210748908193, + "loss": 0.82699656, + "num_input_tokens_seen": 81427312, + "router_z_loss_mlp": 0.21289062, + "step": 984, + "time_per_iteration": 5.065609931945801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133244, + "balance_loss_mlp": 1.09831583, + "epoch": 0.18949595998460947, + "flos": 508220974080.0, + "grad_norm": 0.09166419018528392, + "language_loss": 0.83211792, + "learning_rate": 0.0009347672972613634, + "loss": 0.84345031, + "num_input_tokens_seen": 81494256, + "router_z_loss_mlp": 0.34936523, + "step": 985, + "time_per_iteration": 2.580009937286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115864, + "balance_loss_mlp": 1.08270001, + "epoch": 0.18968834166987303, + "flos": 530812459008.0, + "grad_norm": 0.0668772854373454, + "language_loss": 0.85875785, + "learning_rate": 0.0009346133508402735, + "loss": 0.8699165, + "num_input_tokens_seen": 81569312, + "router_z_loss_mlp": 0.33178711, + "step": 986, + "time_per_iteration": 2.6872711181640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111031, + "balance_loss_mlp": 1.07724667, + "epoch": 0.1898807233551366, + "flos": 499515807744.0, + "grad_norm": 0.11088649382938841, + "language_loss": 0.8420769, + "learning_rate": 0.0009344592356873166, + "loss": 0.8531872, + "num_input_tokens_seen": 81637024, + "router_z_loss_mlp": 0.33813477, + "step": 987, + "time_per_iteration": 2.6347994804382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098007, + "balance_loss_mlp": 1.06462848, + "epoch": 0.19007310504040015, + "flos": 601936399872.0, + "grad_norm": 0.05765681888892058, + "language_loss": 0.78527796, + "learning_rate": 0.0009343049518623255, + "loss": 0.79625803, + "num_input_tokens_seen": 81709488, + "router_z_loss_mlp": 0.33398438, + "step": 988, + "time_per_iteration": 2.696929693222046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082914, + "balance_loss_mlp": 1.05029869, + "epoch": 0.1902654867256637, + "flos": 601374786048.0, + "grad_norm": 0.05732720380572914, + "language_loss": 0.83250153, + "learning_rate": 0.0009341504994251985, + "loss": 0.84333068, + "num_input_tokens_seen": 81787152, + "router_z_loss_mlp": 0.32617188, + "step": 989, + "time_per_iteration": 2.8399016857147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095582, + "balance_loss_mlp": 1.07841623, + "epoch": 0.19045786841092727, + "flos": 1574925147648.0, + "grad_norm": 0.03888561388969961, + "language_loss": 0.73520499, + "learning_rate": 0.0009339958784358994, + "loss": 0.74616081, + "num_input_tokens_seen": 82030608, + "router_z_loss_mlp": 0.171875, + "step": 990, + "time_per_iteration": 5.072636842727661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109747, + "balance_loss_mlp": 1.06394839, + "epoch": 0.19065025009619085, + "flos": 681280906752.0, + "grad_norm": 0.135211113906906, + "language_loss": 0.818295, + "learning_rate": 0.0009338410889544574, + "loss": 0.82926977, + "num_input_tokens_seen": 82119872, + "router_z_loss_mlp": 0.33544922, + "step": 991, + "time_per_iteration": 3.050665855407715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101867, + "balance_loss_mlp": 1.06786811, + "epoch": 0.1908426317814544, + "flos": 601971305472.0, + "grad_norm": 0.06286082016671143, + "language_loss": 0.87738663, + "learning_rate": 0.000933686131040967, + "loss": 0.88840532, + "num_input_tokens_seen": 82195552, + "router_z_loss_mlp": 0.34033203, + "step": 992, + "time_per_iteration": 2.7589659690856934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089273, + "balance_loss_mlp": 1.05672884, + "epoch": 0.19103501346671797, + "flos": 586027616256.0, + "grad_norm": 0.0561482479745879, + "language_loss": 0.90427077, + "learning_rate": 0.0009335310047555883, + "loss": 0.91516346, + "num_input_tokens_seen": 82267040, + "router_z_loss_mlp": 0.32543945, + "step": 993, + "time_per_iteration": 2.7133467197418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108709, + "balance_loss_mlp": 1.0532825, + "epoch": 0.19122739515198153, + "flos": 545494708224.0, + "grad_norm": 0.06221036652136981, + "language_loss": 0.88114065, + "learning_rate": 0.0009333757101585467, + "loss": 0.89201152, + "num_input_tokens_seen": 82337680, + "router_z_loss_mlp": 0.33837891, + "step": 994, + "time_per_iteration": 2.6733241081237793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083527, + "balance_loss_mlp": 1.05105424, + "epoch": 0.1914197768372451, + "flos": 521171739648.0, + "grad_norm": 0.05606370206634765, + "language_loss": 0.93617988, + "learning_rate": 0.0009332202473101329, + "loss": 0.94701517, + "num_input_tokens_seen": 82409600, + "router_z_loss_mlp": 0.32470703, + "step": 995, + "time_per_iteration": 2.6689558029174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079312, + "balance_loss_mlp": 1.04536152, + "epoch": 0.19161215852250865, + "flos": 610961660928.0, + "grad_norm": 0.05986652691328414, + "language_loss": 0.83121806, + "learning_rate": 0.0009330646162707028, + "loss": 0.84201121, + "num_input_tokens_seen": 82480288, + "router_z_loss_mlp": 0.33984375, + "step": 996, + "time_per_iteration": 2.7264511585235596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081823, + "balance_loss_mlp": 1.04849207, + "epoch": 0.1918045402077722, + "flos": 846281806848.0, + "grad_norm": 0.05485586532204223, + "language_loss": 0.84800065, + "learning_rate": 0.0009329088171006779, + "loss": 0.85881883, + "num_input_tokens_seen": 82568960, + "router_z_loss_mlp": 0.33349609, + "step": 997, + "time_per_iteration": 3.1486315727233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097705, + "balance_loss_mlp": 1.06220424, + "epoch": 0.19199692189303577, + "flos": 465699096576.0, + "grad_norm": 0.06540772430376247, + "language_loss": 0.84963006, + "learning_rate": 0.0009327528498605446, + "loss": 0.86060709, + "num_input_tokens_seen": 82634128, + "router_z_loss_mlp": 0.35522461, + "step": 998, + "time_per_iteration": 2.532460927963257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087828, + "balance_loss_mlp": 1.0542109, + "epoch": 0.19218930357829936, + "flos": 531318818304.0, + "grad_norm": 0.06065225266474605, + "language_loss": 0.89716202, + "learning_rate": 0.0009325967146108548, + "loss": 0.90804029, + "num_input_tokens_seen": 82707472, + "router_z_loss_mlp": 0.33642578, + "step": 999, + "time_per_iteration": 2.6381072998046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108792, + "balance_loss_mlp": 1.05334902, + "epoch": 0.19238168526356292, + "flos": 601350054912.0, + "grad_norm": 0.06318510310852068, + "language_loss": 0.87984866, + "learning_rate": 0.0009324404114122258, + "loss": 0.89072788, + "num_input_tokens_seen": 82775232, + "router_z_loss_mlp": 0.34594727, + "step": 1000, + "time_per_iteration": 2.7017252445220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088105, + "balance_loss_mlp": 1.0544883, + "epoch": 0.19257406694882648, + "flos": 571690192896.0, + "grad_norm": 0.05361295189234855, + "language_loss": 0.87132722, + "learning_rate": 0.0009322839403253397, + "loss": 0.88220823, + "num_input_tokens_seen": 82850032, + "router_z_loss_mlp": 0.33642578, + "step": 1001, + "time_per_iteration": 2.7725350856781006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091645, + "balance_loss_mlp": 1.05759907, + "epoch": 0.19276644863409004, + "flos": 801478568448.0, + "grad_norm": 0.0661765462165054, + "language_loss": 0.84038174, + "learning_rate": 0.0009321273014109439, + "loss": 0.85129815, + "num_input_tokens_seen": 82926080, + "router_z_loss_mlp": 0.34082031, + "step": 1002, + "time_per_iteration": 2.9275383949279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089997, + "balance_loss_mlp": 1.05676103, + "epoch": 0.1929588303193536, + "flos": 563024314368.0, + "grad_norm": 0.05133430998282463, + "language_loss": 0.85232604, + "learning_rate": 0.0009319704947298513, + "loss": 0.863226, + "num_input_tokens_seen": 83005200, + "router_z_loss_mlp": 0.33251953, + "step": 1003, + "time_per_iteration": 2.9198272228240967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083204, + "balance_loss_mlp": 1.05120838, + "epoch": 0.19315121200461716, + "flos": 626550349824.0, + "grad_norm": 0.04652496586479965, + "language_loss": 0.88737059, + "learning_rate": 0.0009318135203429393, + "loss": 0.8982026, + "num_input_tokens_seen": 83077280, + "router_z_loss_mlp": 0.31982422, + "step": 1004, + "time_per_iteration": 2.7145965099334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094807, + "balance_loss_mlp": 1.06116605, + "epoch": 0.19334359368988072, + "flos": 517169069568.0, + "grad_norm": 0.06711221272981459, + "language_loss": 0.88228458, + "learning_rate": 0.0009316563783111511, + "loss": 0.8932327, + "num_input_tokens_seen": 83145456, + "router_z_loss_mlp": 0.33642578, + "step": 1005, + "time_per_iteration": 2.68135404586792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095243, + "balance_loss_mlp": 1.06050563, + "epoch": 0.19353597537514428, + "flos": 693751606272.0, + "grad_norm": 0.04947727679523619, + "language_loss": 0.82323831, + "learning_rate": 0.0009314990686954943, + "loss": 0.83419079, + "num_input_tokens_seen": 83225392, + "router_z_loss_mlp": 0.34765625, + "step": 1006, + "time_per_iteration": 2.9068872928619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098932, + "balance_loss_mlp": 1.06495738, + "epoch": 0.19372835706040784, + "flos": 1209665180160.0, + "grad_norm": 0.05336104081377929, + "language_loss": 0.80917025, + "learning_rate": 0.000931341591557042, + "loss": 0.82015955, + "num_input_tokens_seen": 83331296, + "router_z_loss_mlp": 0.34008789, + "step": 1007, + "time_per_iteration": 3.759119749069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098415, + "balance_loss_mlp": 1.06291509, + "epoch": 0.19392073874567142, + "flos": 520368606720.0, + "grad_norm": 0.06549831272650784, + "language_loss": 0.87757689, + "learning_rate": 0.0009311839469569325, + "loss": 0.88856107, + "num_input_tokens_seen": 83399952, + "router_z_loss_mlp": 0.35522461, + "step": 1008, + "time_per_iteration": 2.6298930644989014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100893, + "balance_loss_mlp": 1.06620264, + "epoch": 0.19411312043093498, + "flos": 588543293952.0, + "grad_norm": 0.06763315162421418, + "language_loss": 0.8732397, + "learning_rate": 0.0009310261349563687, + "loss": 0.88424855, + "num_input_tokens_seen": 83468384, + "router_z_loss_mlp": 0.34692383, + "step": 1009, + "time_per_iteration": 2.6843061447143555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110066, + "balance_loss_mlp": 1.06718588, + "epoch": 0.19430550211619854, + "flos": 579085867008.0, + "grad_norm": 0.05371296475785438, + "language_loss": 0.8534441, + "learning_rate": 0.0009308681556166186, + "loss": 0.86445075, + "num_input_tokens_seen": 83547952, + "router_z_loss_mlp": 0.33496094, + "step": 1010, + "time_per_iteration": 2.8197336196899414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107606, + "balance_loss_mlp": 1.07291579, + "epoch": 0.1944978838014621, + "flos": 620848281600.0, + "grad_norm": 0.08312668477716535, + "language_loss": 0.87206143, + "learning_rate": 0.0009307100089990152, + "loss": 0.88313752, + "num_input_tokens_seen": 83615712, + "router_z_loss_mlp": 0.34716797, + "step": 1011, + "time_per_iteration": 2.7118990421295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101924, + "balance_loss_mlp": 1.0672822, + "epoch": 0.19469026548672566, + "flos": 598440089088.0, + "grad_norm": 0.061832865854500894, + "language_loss": 0.83946323, + "learning_rate": 0.0009305516951649568, + "loss": 0.85048252, + "num_input_tokens_seen": 83687296, + "router_z_loss_mlp": 0.34667969, + "step": 1012, + "time_per_iteration": 2.667672872543335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096764, + "balance_loss_mlp": 1.06314659, + "epoch": 0.19488264717198922, + "flos": 551890810368.0, + "grad_norm": 0.04827143175142062, + "language_loss": 0.87187612, + "learning_rate": 0.0009303932141759057, + "loss": 0.88284373, + "num_input_tokens_seen": 83763168, + "router_z_loss_mlp": 0.33642578, + "step": 1013, + "time_per_iteration": 2.7321088314056396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101502, + "balance_loss_mlp": 1.06705046, + "epoch": 0.19507502885725278, + "flos": 665842166784.0, + "grad_norm": 0.05715794205563071, + "language_loss": 0.84201366, + "learning_rate": 0.0009302345660933902, + "loss": 0.85302866, + "num_input_tokens_seen": 83837312, + "router_z_loss_mlp": 0.3449707, + "step": 1014, + "time_per_iteration": 2.7699263095855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109904, + "balance_loss_mlp": 1.07616735, + "epoch": 0.19526741054251634, + "flos": 670771625472.0, + "grad_norm": 0.05949834877265084, + "language_loss": 0.84866655, + "learning_rate": 0.0009300757509790026, + "loss": 0.85976553, + "num_input_tokens_seen": 83917120, + "router_z_loss_mlp": 0.33764648, + "step": 1015, + "time_per_iteration": 2.8250515460968018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110813, + "balance_loss_mlp": 1.0766474, + "epoch": 0.19545979222777993, + "flos": 446983653888.0, + "grad_norm": 0.0671511226198219, + "language_loss": 0.90974069, + "learning_rate": 0.0009299167688944005, + "loss": 0.92084885, + "num_input_tokens_seen": 83982992, + "router_z_loss_mlp": 0.34204102, + "step": 1016, + "time_per_iteration": 2.545133590698242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111157, + "balance_loss_mlp": 1.07778645, + "epoch": 0.1956521739130435, + "flos": 568813722624.0, + "grad_norm": 0.06338586690579641, + "language_loss": 0.85958129, + "learning_rate": 0.0009297576199013063, + "loss": 0.87069696, + "num_input_tokens_seen": 84057296, + "router_z_loss_mlp": 0.33813477, + "step": 1017, + "time_per_iteration": 2.668503761291504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148218, + "balance_loss_mlp": 1.13295972, + "epoch": 0.19584455559830705, + "flos": 1454969157120.0, + "grad_norm": 0.047651466398381144, + "language_loss": 0.73002136, + "learning_rate": 0.0009295983040615071, + "loss": 0.74150348, + "num_input_tokens_seen": 84292640, + "router_z_loss_mlp": 0.15234375, + "step": 1018, + "time_per_iteration": 4.920944929122925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104842, + "balance_loss_mlp": 1.09015501, + "epoch": 0.1960369372835706, + "flos": 1590320369664.0, + "grad_norm": 0.036993279908541045, + "language_loss": 0.79426301, + "learning_rate": 0.0009294388214368547, + "loss": 0.80531144, + "num_input_tokens_seen": 84524448, + "router_z_loss_mlp": 0.14648438, + "step": 1019, + "time_per_iteration": 6.0059425830841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118502, + "balance_loss_mlp": 1.08505166, + "epoch": 0.19622931896883417, + "flos": 615709237248.0, + "grad_norm": 0.05240041234704895, + "language_loss": 0.86600977, + "learning_rate": 0.0009292791720892659, + "loss": 0.87719476, + "num_input_tokens_seen": 84600208, + "router_z_loss_mlp": 0.3347168, + "step": 1020, + "time_per_iteration": 2.995192527770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113873, + "balance_loss_mlp": 1.07930255, + "epoch": 0.19642170065409773, + "flos": 465950790144.0, + "grad_norm": 0.0657036282835547, + "language_loss": 0.88724279, + "learning_rate": 0.0009291193560807218, + "loss": 0.89838147, + "num_input_tokens_seen": 84668032, + "router_z_loss_mlp": 0.34594727, + "step": 1021, + "time_per_iteration": 2.633256196975708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114293, + "balance_loss_mlp": 1.07962656, + "epoch": 0.19661408233936128, + "flos": 515040477696.0, + "grad_norm": 0.054836200403870924, + "language_loss": 0.87439638, + "learning_rate": 0.0009289593734732688, + "loss": 0.88553929, + "num_input_tokens_seen": 84738176, + "router_z_loss_mlp": 0.34716797, + "step": 1022, + "time_per_iteration": 2.622284173965454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107262, + "balance_loss_mlp": 1.0736922, + "epoch": 0.19680646402462484, + "flos": 392427624960.0, + "grad_norm": 0.053036961045345866, + "language_loss": 0.94139373, + "learning_rate": 0.0009287992243290175, + "loss": 0.95246631, + "num_input_tokens_seen": 84799936, + "router_z_loss_mlp": 0.3359375, + "step": 1023, + "time_per_iteration": 2.4402668476104736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108975, + "balance_loss_mlp": 1.07247353, + "epoch": 0.19699884570988843, + "flos": 626122566144.0, + "grad_norm": 0.056904835680118435, + "language_loss": 0.90850759, + "learning_rate": 0.0009286389087101435, + "loss": 0.91959733, + "num_input_tokens_seen": 84877216, + "router_z_loss_mlp": 0.36523438, + "step": 1024, + "time_per_iteration": 2.762068271636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110519, + "balance_loss_mlp": 1.06957078, + "epoch": 0.197191227395152, + "flos": 557710742016.0, + "grad_norm": 0.05298833269370499, + "language_loss": 0.88575542, + "learning_rate": 0.0009284784266788864, + "loss": 0.89680731, + "num_input_tokens_seen": 84952464, + "router_z_loss_mlp": 0.35668945, + "step": 1025, + "time_per_iteration": 4.087035417556763 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109606, + "balance_loss_mlp": 1.07565546, + "epoch": 0.19738360908041555, + "flos": 664681061376.0, + "grad_norm": 0.0565537913278748, + "language_loss": 0.92494339, + "learning_rate": 0.0009283177782975512, + "loss": 0.93603945, + "num_input_tokens_seen": 85031488, + "router_z_loss_mlp": 0.33984375, + "step": 1026, + "time_per_iteration": 2.948167562484741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095626, + "balance_loss_mlp": 1.06117415, + "epoch": 0.1975759907656791, + "flos": 522244094976.0, + "grad_norm": 0.06218898027866582, + "language_loss": 0.88052273, + "learning_rate": 0.000928156963628507, + "loss": 0.89147896, + "num_input_tokens_seen": 85098384, + "router_z_loss_mlp": 0.3449707, + "step": 1027, + "time_per_iteration": 2.564019203186035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091019, + "balance_loss_mlp": 1.05694866, + "epoch": 0.19776837245094267, + "flos": 462233309184.0, + "grad_norm": 0.056114928823487176, + "language_loss": 0.8826099, + "learning_rate": 0.0009279959827341877, + "loss": 0.89352006, + "num_input_tokens_seen": 85172944, + "router_z_loss_mlp": 0.34082031, + "step": 1028, + "time_per_iteration": 2.7226340770721436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090946, + "balance_loss_mlp": 1.05699515, + "epoch": 0.19796075413620623, + "flos": 502809887232.0, + "grad_norm": 0.05507551359640612, + "language_loss": 0.88204837, + "learning_rate": 0.0009278348356770915, + "loss": 0.89295781, + "num_input_tokens_seen": 85241632, + "router_z_loss_mlp": 0.33984375, + "step": 1029, + "time_per_iteration": 2.592756748199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085455, + "balance_loss_mlp": 1.05093157, + "epoch": 0.1981531358214698, + "flos": 507281038848.0, + "grad_norm": 0.061172366255401664, + "language_loss": 0.85939109, + "learning_rate": 0.0009276735225197814, + "loss": 0.87024558, + "num_input_tokens_seen": 85308992, + "router_z_loss_mlp": 0.34570312, + "step": 1030, + "time_per_iteration": 2.598607063293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088832, + "balance_loss_mlp": 1.05495238, + "epoch": 0.19834551750673335, + "flos": 531275148288.0, + "grad_norm": 0.0802549423316463, + "language_loss": 0.86293721, + "learning_rate": 0.0009275120433248847, + "loss": 0.87382561, + "num_input_tokens_seen": 85381936, + "router_z_loss_mlp": 0.33886719, + "step": 1031, + "time_per_iteration": 2.7143311500549316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090216, + "balance_loss_mlp": 1.05726683, + "epoch": 0.1985378991919969, + "flos": 775147691520.0, + "grad_norm": 0.05308511447166053, + "language_loss": 0.86272347, + "learning_rate": 0.0009273503981550931, + "loss": 0.87362564, + "num_input_tokens_seen": 85474352, + "router_z_loss_mlp": 0.32958984, + "step": 1032, + "time_per_iteration": 3.0616648197174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087082, + "balance_loss_mlp": 1.05351269, + "epoch": 0.1987302808772605, + "flos": 434063411712.0, + "grad_norm": 0.059916166081832097, + "language_loss": 0.8703599, + "learning_rate": 0.0009271885870731626, + "loss": 0.88123071, + "num_input_tokens_seen": 85538416, + "router_z_loss_mlp": 0.3359375, + "step": 1033, + "time_per_iteration": 2.487316131591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092715, + "balance_loss_mlp": 1.05921745, + "epoch": 0.19892266256252406, + "flos": 553342897152.0, + "grad_norm": 0.06168947094446192, + "language_loss": 0.88599998, + "learning_rate": 0.0009270266101419143, + "loss": 0.89692712, + "num_input_tokens_seen": 85604416, + "router_z_loss_mlp": 0.33520508, + "step": 1034, + "time_per_iteration": 2.5978119373321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085912, + "balance_loss_mlp": 1.05403578, + "epoch": 0.19911504424778761, + "flos": 549596302848.0, + "grad_norm": 0.06019117447906982, + "language_loss": 0.85564321, + "learning_rate": 0.0009268644674242328, + "loss": 0.86650234, + "num_input_tokens_seen": 85677008, + "router_z_loss_mlp": 0.31860352, + "step": 1035, + "time_per_iteration": 2.7259163856506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097892, + "balance_loss_mlp": 1.0645138, + "epoch": 0.19930742593305117, + "flos": 518024636928.0, + "grad_norm": 0.05869793462101787, + "language_loss": 0.81141233, + "learning_rate": 0.0009267021589830678, + "loss": 0.82239127, + "num_input_tokens_seen": 85745200, + "router_z_loss_mlp": 0.33398438, + "step": 1036, + "time_per_iteration": 2.597724199295044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161292, + "balance_loss_mlp": 1.14507985, + "epoch": 0.19949980761831473, + "flos": 1508516849664.0, + "grad_norm": 0.04621309141147155, + "language_loss": 0.77627081, + "learning_rate": 0.0009265396848814328, + "loss": 0.78788376, + "num_input_tokens_seen": 85980608, + "router_z_loss_mlp": 0.16210938, + "step": 1037, + "time_per_iteration": 4.918612241744995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093993, + "balance_loss_mlp": 1.06044722, + "epoch": 0.1996921893035783, + "flos": 697803738624.0, + "grad_norm": 0.061892224045152405, + "language_loss": 0.93283784, + "learning_rate": 0.000926377045182406, + "loss": 0.94377768, + "num_input_tokens_seen": 86055952, + "router_z_loss_mlp": 0.33569336, + "step": 1038, + "time_per_iteration": 2.8800160884857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096412, + "balance_loss_mlp": 1.06334293, + "epoch": 0.19988457098884185, + "flos": 726682226688.0, + "grad_norm": 0.0613562398808313, + "language_loss": 0.87972045, + "learning_rate": 0.0009262142399491296, + "loss": 0.89068449, + "num_input_tokens_seen": 86145536, + "router_z_loss_mlp": 0.33081055, + "step": 1039, + "time_per_iteration": 3.0561435222625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097284, + "balance_loss_mlp": 1.06345224, + "epoch": 0.2000769526741054, + "flos": 560275881984.0, + "grad_norm": 0.06364175085873486, + "language_loss": 0.87837642, + "learning_rate": 0.0009260512692448105, + "loss": 0.88934934, + "num_input_tokens_seen": 86214480, + "router_z_loss_mlp": 0.33862305, + "step": 1040, + "time_per_iteration": 2.7037088871002197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090351, + "balance_loss_mlp": 1.05697203, + "epoch": 0.200269334359369, + "flos": 571758594048.0, + "grad_norm": 0.05851279903795688, + "language_loss": 0.84325236, + "learning_rate": 0.000925888133132719, + "loss": 0.85415584, + "num_input_tokens_seen": 86289824, + "router_z_loss_mlp": 0.33398438, + "step": 1041, + "time_per_iteration": 2.836177110671997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082089, + "balance_loss_mlp": 1.06730711, + "epoch": 0.20046171604463256, + "flos": 1485362340864.0, + "grad_norm": 0.029405325300647274, + "language_loss": 0.79610431, + "learning_rate": 0.0009257248316761906, + "loss": 0.80692518, + "num_input_tokens_seen": 86516384, + "router_z_loss_mlp": 0.14746094, + "step": 1042, + "time_per_iteration": 4.901337146759033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094947, + "balance_loss_mlp": 1.06140149, + "epoch": 0.20065409772989612, + "flos": 496266808320.0, + "grad_norm": 0.07205728359427886, + "language_loss": 0.81256473, + "learning_rate": 0.0009255613649386244, + "loss": 0.82351422, + "num_input_tokens_seen": 86587296, + "router_z_loss_mlp": 0.33544922, + "step": 1043, + "time_per_iteration": 2.6198885440826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089381, + "balance_loss_mlp": 1.05686069, + "epoch": 0.20084647941515968, + "flos": 579094631424.0, + "grad_norm": 0.06625931968059934, + "language_loss": 0.79017001, + "learning_rate": 0.0009253977329834838, + "loss": 0.80106384, + "num_input_tokens_seen": 86662656, + "router_z_loss_mlp": 0.32519531, + "step": 1044, + "time_per_iteration": 2.6872498989105225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111234, + "balance_loss_mlp": 1.07745981, + "epoch": 0.20103886110042324, + "flos": 641775273984.0, + "grad_norm": 0.06628294367657735, + "language_loss": 0.86666185, + "learning_rate": 0.0009252339358742965, + "loss": 0.87778521, + "num_input_tokens_seen": 86734704, + "router_z_loss_mlp": 0.34912109, + "step": 1045, + "time_per_iteration": 2.7749996185302734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116952, + "balance_loss_mlp": 1.08219087, + "epoch": 0.2012312427856868, + "flos": 441720953856.0, + "grad_norm": 0.05401214919341486, + "language_loss": 0.83449644, + "learning_rate": 0.000925069973674654, + "loss": 0.84566593, + "num_input_tokens_seen": 86806512, + "router_z_loss_mlp": 0.34814453, + "step": 1046, + "time_per_iteration": 2.662992477416992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114258, + "balance_loss_mlp": 1.08116508, + "epoch": 0.20142362447095036, + "flos": 554135855616.0, + "grad_norm": 0.049297877184233195, + "language_loss": 0.88960069, + "learning_rate": 0.000924905846448212, + "loss": 0.90074325, + "num_input_tokens_seen": 86883440, + "router_z_loss_mlp": 0.33105469, + "step": 1047, + "time_per_iteration": 2.8164925575256348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127137, + "balance_loss_mlp": 1.09306693, + "epoch": 0.20161600615621392, + "flos": 669988841472.0, + "grad_norm": 0.07365230282100185, + "language_loss": 0.85615861, + "learning_rate": 0.0009247415542586906, + "loss": 0.86742997, + "num_input_tokens_seen": 86960208, + "router_z_loss_mlp": 0.34106445, + "step": 1048, + "time_per_iteration": 2.858611822128296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115399, + "balance_loss_mlp": 1.08130527, + "epoch": 0.2018083878414775, + "flos": 572788689408.0, + "grad_norm": 0.05223287600750505, + "language_loss": 0.83514655, + "learning_rate": 0.0009245770971698735, + "loss": 0.84630048, + "num_input_tokens_seen": 87044144, + "router_z_loss_mlp": 0.34106445, + "step": 1049, + "time_per_iteration": 2.8758163452148438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103626, + "balance_loss_mlp": 1.07036686, + "epoch": 0.20200076952674106, + "flos": 425624495616.0, + "grad_norm": 0.061140118103518055, + "language_loss": 0.88792473, + "learning_rate": 0.0009244124752456087, + "loss": 0.89896095, + "num_input_tokens_seen": 87109136, + "router_z_loss_mlp": 0.33276367, + "step": 1050, + "time_per_iteration": 2.501565456390381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097765, + "balance_loss_mlp": 1.06457758, + "epoch": 0.20219315121200462, + "flos": 536326852608.0, + "grad_norm": 0.049507299183714965, + "language_loss": 0.85344577, + "learning_rate": 0.0009242476885498081, + "loss": 0.86442339, + "num_input_tokens_seen": 87184320, + "router_z_loss_mlp": 0.33203125, + "step": 1051, + "time_per_iteration": 2.698791027069092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095962, + "balance_loss_mlp": 1.06222594, + "epoch": 0.20238553289726818, + "flos": 477634323456.0, + "grad_norm": 0.07140169421024865, + "language_loss": 0.8134433, + "learning_rate": 0.0009240827371464474, + "loss": 0.82440293, + "num_input_tokens_seen": 87248224, + "router_z_loss_mlp": 0.33764648, + "step": 1052, + "time_per_iteration": 2.5603079795837402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082157, + "balance_loss_mlp": 1.04958868, + "epoch": 0.20257791458253174, + "flos": 1151611430400.0, + "grad_norm": 0.06069279327125781, + "language_loss": 0.84372044, + "learning_rate": 0.0009239176210995666, + "loss": 0.85454196, + "num_input_tokens_seen": 87333088, + "router_z_loss_mlp": 0.32568359, + "step": 1053, + "time_per_iteration": 3.4549684524536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088056, + "balance_loss_mlp": 1.05393791, + "epoch": 0.2027702962677953, + "flos": 666606011904.0, + "grad_norm": 0.06066867592012189, + "language_loss": 0.93657684, + "learning_rate": 0.0009237523404732695, + "loss": 0.94745743, + "num_input_tokens_seen": 87413840, + "router_z_loss_mlp": 0.34130859, + "step": 1054, + "time_per_iteration": 4.344247817993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078384, + "balance_loss_mlp": 1.04567289, + "epoch": 0.20296267795305886, + "flos": 641011428864.0, + "grad_norm": 0.0678922331878557, + "language_loss": 0.84289086, + "learning_rate": 0.0009235868953317235, + "loss": 0.85367465, + "num_input_tokens_seen": 87487168, + "router_z_loss_mlp": 0.32714844, + "step": 1055, + "time_per_iteration": 2.7755184173583984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086857, + "balance_loss_mlp": 1.05321646, + "epoch": 0.20315505963832242, + "flos": 930187777536.0, + "grad_norm": 0.06816541670806936, + "language_loss": 0.85603452, + "learning_rate": 0.0009234212857391602, + "loss": 0.86690307, + "num_input_tokens_seen": 87573184, + "router_z_loss_mlp": 0.33642578, + "step": 1056, + "time_per_iteration": 3.1736087799072266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089034, + "balance_loss_mlp": 1.05477369, + "epoch": 0.20334744132358598, + "flos": 561818128896.0, + "grad_norm": 0.05209348313890264, + "language_loss": 0.88978589, + "learning_rate": 0.000923255511759875, + "loss": 0.90067613, + "num_input_tokens_seen": 87651968, + "router_z_loss_mlp": 0.34301758, + "step": 1057, + "time_per_iteration": 2.7823617458343506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093097, + "balance_loss_mlp": 1.05945587, + "epoch": 0.20353982300884957, + "flos": 643902455808.0, + "grad_norm": 0.061337083912670884, + "language_loss": 0.85219932, + "learning_rate": 0.000923089573458227, + "loss": 0.86313027, + "num_input_tokens_seen": 87727792, + "router_z_loss_mlp": 0.33666992, + "step": 1058, + "time_per_iteration": 2.8398988246917725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092738, + "balance_loss_mlp": 1.05952644, + "epoch": 0.20373220469411313, + "flos": 651101690880.0, + "grad_norm": 0.0713114334987562, + "language_loss": 0.84425724, + "learning_rate": 0.0009229234708986392, + "loss": 0.85518456, + "num_input_tokens_seen": 87806048, + "router_z_loss_mlp": 0.33203125, + "step": 1059, + "time_per_iteration": 2.891934394836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069957, + "balance_loss_mlp": 1.05603302, + "epoch": 0.2039245863793767, + "flos": 1436939136000.0, + "grad_norm": 0.037855568460977755, + "language_loss": 0.81666899, + "learning_rate": 0.0009227572041455982, + "loss": 0.8273685, + "num_input_tokens_seen": 88018160, + "router_z_loss_mlp": 0.13964844, + "step": 1060, + "time_per_iteration": 4.673142194747925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092929, + "balance_loss_mlp": 1.05985999, + "epoch": 0.20411696806464025, + "flos": 596678082048.0, + "grad_norm": 0.07190006801568614, + "language_loss": 0.85404283, + "learning_rate": 0.0009225907732636548, + "loss": 0.86497211, + "num_input_tokens_seen": 88090864, + "router_z_loss_mlp": 0.33081055, + "step": 1061, + "time_per_iteration": 2.74110746383667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091714, + "balance_loss_mlp": 1.0585742, + "epoch": 0.2043093497499038, + "flos": 573530775552.0, + "grad_norm": 0.06271161302412134, + "language_loss": 0.86991799, + "learning_rate": 0.0009224241783174227, + "loss": 0.88083506, + "num_input_tokens_seen": 88161360, + "router_z_loss_mlp": 0.33154297, + "step": 1062, + "time_per_iteration": 2.6885697841644287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084619, + "balance_loss_mlp": 1.05233693, + "epoch": 0.20450173143516737, + "flos": 630061217280.0, + "grad_norm": 0.055816021094363524, + "language_loss": 0.85842204, + "learning_rate": 0.0009222574193715802, + "loss": 0.86926818, + "num_input_tokens_seen": 88234960, + "router_z_loss_mlp": 0.32275391, + "step": 1063, + "time_per_iteration": 2.7569899559020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087796, + "balance_loss_mlp": 1.05522823, + "epoch": 0.20469411312043093, + "flos": 573718450176.0, + "grad_norm": 0.051897822989382614, + "language_loss": 0.8621105, + "learning_rate": 0.000922090496490869, + "loss": 0.87298846, + "num_input_tokens_seen": 88308176, + "router_z_loss_mlp": 0.32568359, + "step": 1064, + "time_per_iteration": 2.7099597454071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082485, + "balance_loss_mlp": 1.05025065, + "epoch": 0.20488649480569449, + "flos": 636748300800.0, + "grad_norm": 0.04962250787968228, + "language_loss": 0.90165728, + "learning_rate": 0.0009219234097400937, + "loss": 0.91248214, + "num_input_tokens_seen": 88386768, + "router_z_loss_mlp": 0.32226562, + "step": 1065, + "time_per_iteration": 2.8492319583892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108042, + "balance_loss_mlp": 1.04806709, + "epoch": 0.20507887649095807, + "flos": 975383894016.0, + "grad_norm": 0.051536979552593745, + "language_loss": 0.83029723, + "learning_rate": 0.0009217561591841237, + "loss": 0.84110147, + "num_input_tokens_seen": 88476576, + "router_z_loss_mlp": 0.32348633, + "step": 1066, + "time_per_iteration": 3.267207145690918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082085, + "balance_loss_mlp": 1.04951739, + "epoch": 0.20527125817622163, + "flos": 485940819456.0, + "grad_norm": 0.09661652793466288, + "language_loss": 0.81334901, + "learning_rate": 0.0009215887448878913, + "loss": 0.82416987, + "num_input_tokens_seen": 88541968, + "router_z_loss_mlp": 0.32568359, + "step": 1067, + "time_per_iteration": 2.5429391860961914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088998, + "balance_loss_mlp": 1.05552411, + "epoch": 0.2054636398614852, + "flos": 526921860096.0, + "grad_norm": 0.09953641970782799, + "language_loss": 0.85144234, + "learning_rate": 0.0009214211669163922, + "loss": 0.86233234, + "num_input_tokens_seen": 88615296, + "router_z_loss_mlp": 0.33496094, + "step": 1068, + "time_per_iteration": 2.7006540298461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085875, + "balance_loss_mlp": 1.05428481, + "epoch": 0.20565602154674875, + "flos": 557898416640.0, + "grad_norm": 0.048729379907622286, + "language_loss": 0.93896133, + "learning_rate": 0.0009212534253346862, + "loss": 0.94982004, + "num_input_tokens_seen": 88691584, + "router_z_loss_mlp": 0.31567383, + "step": 1069, + "time_per_iteration": 2.7544496059417725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098993, + "balance_loss_mlp": 1.06713986, + "epoch": 0.2058484032320123, + "flos": 503976784896.0, + "grad_norm": 0.06649355865978995, + "language_loss": 0.8497259, + "learning_rate": 0.0009210855202078964, + "loss": 0.86071587, + "num_input_tokens_seen": 88756592, + "router_z_loss_mlp": 0.31835938, + "step": 1070, + "time_per_iteration": 2.59660005569458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113348, + "balance_loss_mlp": 1.07975471, + "epoch": 0.20604078491727587, + "flos": 432950358528.0, + "grad_norm": 0.06315152856471482, + "language_loss": 0.87476587, + "learning_rate": 0.0009209174516012091, + "loss": 0.88589936, + "num_input_tokens_seen": 88820928, + "router_z_loss_mlp": 0.3359375, + "step": 1071, + "time_per_iteration": 2.498087167739868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110052, + "balance_loss_mlp": 1.07624412, + "epoch": 0.20623316660253943, + "flos": 608421252096.0, + "grad_norm": 0.06211591839104366, + "language_loss": 0.89244497, + "learning_rate": 0.0009207492195798747, + "loss": 0.9035455, + "num_input_tokens_seen": 88895440, + "router_z_loss_mlp": 0.33837891, + "step": 1072, + "time_per_iteration": 2.760019063949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116237, + "balance_loss_mlp": 1.08142757, + "epoch": 0.206425548287803, + "flos": 480184906752.0, + "grad_norm": 0.07379229384440758, + "language_loss": 0.84887302, + "learning_rate": 0.0009205808242092061, + "loss": 0.86003542, + "num_input_tokens_seen": 88964400, + "router_z_loss_mlp": 0.34838867, + "step": 1073, + "time_per_iteration": 2.6034529209136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118683, + "balance_loss_mlp": 1.08423102, + "epoch": 0.20661792997306658, + "flos": 949007937024.0, + "grad_norm": 0.0763588165275792, + "language_loss": 0.82845032, + "learning_rate": 0.0009204122655545808, + "loss": 0.83963716, + "num_input_tokens_seen": 89049600, + "router_z_loss_mlp": 0.34472656, + "step": 1074, + "time_per_iteration": 3.3222029209136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111187, + "balance_loss_mlp": 1.07759392, + "epoch": 0.20681031165833014, + "flos": 603206604288.0, + "grad_norm": 0.05592396046249817, + "language_loss": 0.80705297, + "learning_rate": 0.0009202435436814388, + "loss": 0.81816483, + "num_input_tokens_seen": 89119024, + "router_z_loss_mlp": 0.33618164, + "step": 1075, + "time_per_iteration": 2.721888780593872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114903, + "balance_loss_mlp": 1.08121455, + "epoch": 0.2070026933435937, + "flos": 708665200128.0, + "grad_norm": 0.07630450069092473, + "language_loss": 0.89700603, + "learning_rate": 0.0009200746586552836, + "loss": 0.90815508, + "num_input_tokens_seen": 89197344, + "router_z_loss_mlp": 0.3371582, + "step": 1076, + "time_per_iteration": 2.8797900676727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107662, + "balance_loss_mlp": 1.07416463, + "epoch": 0.20719507502885726, + "flos": 829456409088.0, + "grad_norm": 0.06176881640488279, + "language_loss": 0.84210765, + "learning_rate": 0.0009199056105416825, + "loss": 0.85318428, + "num_input_tokens_seen": 89280464, + "router_z_loss_mlp": 0.33520508, + "step": 1077, + "time_per_iteration": 3.120950698852539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107166, + "balance_loss_mlp": 1.07312012, + "epoch": 0.20738745671412082, + "flos": 637993774080.0, + "grad_norm": 0.055893649084458805, + "language_loss": 0.86594802, + "learning_rate": 0.0009197363994062654, + "loss": 0.87701964, + "num_input_tokens_seen": 89353344, + "router_z_loss_mlp": 0.34057617, + "step": 1078, + "time_per_iteration": 2.8197755813598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086727, + "balance_loss_mlp": 1.05480289, + "epoch": 0.20757983839938438, + "flos": 685258845696.0, + "grad_norm": 0.054433441748304986, + "language_loss": 0.84861732, + "learning_rate": 0.0009195670253147262, + "loss": 0.85948461, + "num_input_tokens_seen": 89439328, + "router_z_loss_mlp": 0.3190918, + "step": 1079, + "time_per_iteration": 2.966987133026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096354, + "balance_loss_mlp": 1.06340468, + "epoch": 0.20777222008464794, + "flos": 519024208896.0, + "grad_norm": 0.07868801214896702, + "language_loss": 0.82301188, + "learning_rate": 0.0009193974883328216, + "loss": 0.83397532, + "num_input_tokens_seen": 89510160, + "router_z_loss_mlp": 0.32958984, + "step": 1080, + "time_per_iteration": 2.620704174041748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091209, + "balance_loss_mlp": 1.05725837, + "epoch": 0.2079646017699115, + "flos": 511136732160.0, + "grad_norm": 0.09961486538628272, + "language_loss": 0.87482947, + "learning_rate": 0.0009192277885263718, + "loss": 0.88574153, + "num_input_tokens_seen": 89582960, + "router_z_loss_mlp": 0.33984375, + "step": 1081, + "time_per_iteration": 2.6247479915618896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087114, + "balance_loss_mlp": 1.05352044, + "epoch": 0.20815698345517505, + "flos": 931409929728.0, + "grad_norm": 0.05448561879445608, + "language_loss": 0.86255312, + "learning_rate": 0.0009190579259612602, + "loss": 0.87342417, + "num_input_tokens_seen": 89675488, + "router_z_loss_mlp": 0.33618164, + "step": 1082, + "time_per_iteration": 3.2661428451538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085916, + "balance_loss_mlp": 1.05187023, + "epoch": 0.20834936514043864, + "flos": 632114205696.0, + "grad_norm": 0.059638645169798186, + "language_loss": 0.86669636, + "learning_rate": 0.000918887900703433, + "loss": 0.87755549, + "num_input_tokens_seen": 89747872, + "router_z_loss_mlp": 0.34082031, + "step": 1083, + "time_per_iteration": 2.7930080890655518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083976, + "balance_loss_mlp": 1.05038285, + "epoch": 0.2085417468257022, + "flos": 394170693120.0, + "grad_norm": 0.06326041775418027, + "language_loss": 0.90427065, + "learning_rate": 0.0009187177128188999, + "loss": 0.91511047, + "num_input_tokens_seen": 89810176, + "router_z_loss_mlp": 0.33618164, + "step": 1084, + "time_per_iteration": 2.431358814239502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054848, + "balance_loss_mlp": 1.04159176, + "epoch": 0.20873412851096576, + "flos": 1401387969024.0, + "grad_norm": 0.04127554175786628, + "language_loss": 0.77156538, + "learning_rate": 0.0009185473623737339, + "loss": 0.78211385, + "num_input_tokens_seen": 90038432, + "router_z_loss_mlp": 0.1328125, + "step": 1085, + "time_per_iteration": 6.352816343307495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080772, + "balance_loss_mlp": 1.04686832, + "epoch": 0.20892651019622932, + "flos": 447599112192.0, + "grad_norm": 0.06234370040412467, + "language_loss": 0.8612783, + "learning_rate": 0.000918376849434071, + "loss": 0.87208605, + "num_input_tokens_seen": 90101568, + "router_z_loss_mlp": 0.33935547, + "step": 1086, + "time_per_iteration": 2.5168843269348145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084999, + "balance_loss_mlp": 1.05040443, + "epoch": 0.20911889188149288, + "flos": 492863629824.0, + "grad_norm": 0.07820142019527274, + "language_loss": 0.90828383, + "learning_rate": 0.0009182061740661098, + "loss": 0.91913384, + "num_input_tokens_seen": 90169344, + "router_z_loss_mlp": 0.34643555, + "step": 1087, + "time_per_iteration": 2.5461525917053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083648, + "balance_loss_mlp": 1.0494113, + "epoch": 0.20931127356675644, + "flos": 840928946688.0, + "grad_norm": 0.05821627614551514, + "language_loss": 0.85034752, + "learning_rate": 0.0009180353363361127, + "loss": 0.86118406, + "num_input_tokens_seen": 90252416, + "router_z_loss_mlp": 0.3425293, + "step": 1088, + "time_per_iteration": 3.11942982673645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084605, + "balance_loss_mlp": 1.05036855, + "epoch": 0.20950365525202, + "flos": 756796013568.0, + "grad_norm": 0.06471498550944753, + "language_loss": 0.82101512, + "learning_rate": 0.0009178643363104044, + "loss": 0.83186114, + "num_input_tokens_seen": 90337952, + "router_z_loss_mlp": 0.34277344, + "step": 1089, + "time_per_iteration": 3.0986390113830566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107905, + "balance_loss_mlp": 1.04543328, + "epoch": 0.20969603693728356, + "flos": 472301812224.0, + "grad_norm": 0.07091461504319575, + "language_loss": 0.91050649, + "learning_rate": 0.0009176931740553735, + "loss": 0.92129695, + "num_input_tokens_seen": 90401488, + "router_z_loss_mlp": 0.33642578, + "step": 1090, + "time_per_iteration": 2.4965460300445557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108219, + "balance_loss_mlp": 1.04845381, + "epoch": 0.20988841862254715, + "flos": 976507121664.0, + "grad_norm": 0.05967441428812083, + "language_loss": 0.82829833, + "learning_rate": 0.0009175218496374708, + "loss": 0.83912027, + "num_input_tokens_seen": 90486144, + "router_z_loss_mlp": 0.33740234, + "step": 1091, + "time_per_iteration": 3.325467348098755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082454, + "balance_loss_mlp": 1.04917121, + "epoch": 0.2100808003078107, + "flos": 1092697731072.0, + "grad_norm": 0.06552872916111846, + "language_loss": 0.85816884, + "learning_rate": 0.0009173503631232103, + "loss": 0.86899334, + "num_input_tokens_seen": 90571504, + "router_z_loss_mlp": 0.33300781, + "step": 1092, + "time_per_iteration": 3.3492543697357178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080972, + "balance_loss_mlp": 1.04761767, + "epoch": 0.21027318199307427, + "flos": 1012567468032.0, + "grad_norm": 0.06864870254184631, + "language_loss": 0.8205356, + "learning_rate": 0.0009171787145791691, + "loss": 0.83134532, + "num_input_tokens_seen": 90646016, + "router_z_loss_mlp": 0.33374023, + "step": 1093, + "time_per_iteration": 3.229302167892456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083111, + "balance_loss_mlp": 1.04975629, + "epoch": 0.21046556367833782, + "flos": 521141216256.0, + "grad_norm": 0.08362122797221107, + "language_loss": 0.80208671, + "learning_rate": 0.000917006904071987, + "loss": 0.81291783, + "num_input_tokens_seen": 90713440, + "router_z_loss_mlp": 0.33374023, + "step": 1094, + "time_per_iteration": 2.5901217460632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091484, + "balance_loss_mlp": 1.05843902, + "epoch": 0.21065794536360138, + "flos": 603437948928.0, + "grad_norm": 0.05523679641811596, + "language_loss": 0.87209588, + "learning_rate": 0.0009168349316683669, + "loss": 0.88301063, + "num_input_tokens_seen": 90788208, + "router_z_loss_mlp": 0.33056641, + "step": 1095, + "time_per_iteration": 2.67250919342041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093141, + "balance_loss_mlp": 1.06081104, + "epoch": 0.21085032704886494, + "flos": 603045070848.0, + "grad_norm": 0.05347318487829757, + "language_loss": 0.82685143, + "learning_rate": 0.0009166627974350741, + "loss": 0.83778286, + "num_input_tokens_seen": 90873776, + "router_z_loss_mlp": 0.32324219, + "step": 1096, + "time_per_iteration": 2.9291882514953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097603, + "balance_loss_mlp": 1.06362867, + "epoch": 0.2110427087341285, + "flos": 637382697984.0, + "grad_norm": 0.059512513015867, + "language_loss": 0.89716321, + "learning_rate": 0.0009164905014389373, + "loss": 0.90813923, + "num_input_tokens_seen": 90945872, + "router_z_loss_mlp": 0.34008789, + "step": 1097, + "time_per_iteration": 2.7384700775146484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100621, + "balance_loss_mlp": 1.06798196, + "epoch": 0.21123509041939206, + "flos": 522667496448.0, + "grad_norm": 0.08051519151754843, + "language_loss": 0.87020361, + "learning_rate": 0.0009163180437468476, + "loss": 0.88120985, + "num_input_tokens_seen": 91016224, + "router_z_loss_mlp": 0.32641602, + "step": 1098, + "time_per_iteration": 2.584890365600586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109632, + "balance_loss_mlp": 1.0635848, + "epoch": 0.21142747210465565, + "flos": 450938271744.0, + "grad_norm": 0.05811835985780437, + "language_loss": 0.86184567, + "learning_rate": 0.000916145424425759, + "loss": 0.87280893, + "num_input_tokens_seen": 91086752, + "router_z_loss_mlp": 0.32739258, + "step": 1099, + "time_per_iteration": 2.6362791061401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104023, + "balance_loss_mlp": 1.07059634, + "epoch": 0.2116198537899192, + "flos": 875813630976.0, + "grad_norm": 0.07623729144092387, + "language_loss": 0.9082064, + "learning_rate": 0.0009159726435426885, + "loss": 0.91924655, + "num_input_tokens_seen": 91162960, + "router_z_loss_mlp": 0.33447266, + "step": 1100, + "time_per_iteration": 3.0668158531188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108924, + "balance_loss_mlp": 1.0554564, + "epoch": 0.21181223547518277, + "flos": 523410992640.0, + "grad_norm": 0.06029059005678133, + "language_loss": 0.90809137, + "learning_rate": 0.0009157997011647154, + "loss": 0.91898382, + "num_input_tokens_seen": 91229840, + "router_z_loss_mlp": 0.33813477, + "step": 1101, + "time_per_iteration": 2.5932393074035645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110663, + "balance_loss_mlp": 1.07327497, + "epoch": 0.21200461716044633, + "flos": 572014669824.0, + "grad_norm": 0.05812758027328986, + "language_loss": 0.86378956, + "learning_rate": 0.0009156265973589817, + "loss": 0.87485588, + "num_input_tokens_seen": 91307936, + "router_z_loss_mlp": 0.33374023, + "step": 1102, + "time_per_iteration": 2.79496431350708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110266, + "balance_loss_mlp": 1.07672012, + "epoch": 0.2121969988457099, + "flos": 544869075456.0, + "grad_norm": 0.0704183859149776, + "language_loss": 0.89789248, + "learning_rate": 0.0009154533321926926, + "loss": 0.90899515, + "num_input_tokens_seen": 91372848, + "router_z_loss_mlp": 0.33569336, + "step": 1103, + "time_per_iteration": 2.5982048511505127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107149, + "balance_loss_mlp": 1.07393694, + "epoch": 0.21238938053097345, + "flos": 843489704448.0, + "grad_norm": 0.06399101868010165, + "language_loss": 0.87705767, + "learning_rate": 0.0009152799057331156, + "loss": 0.88812917, + "num_input_tokens_seen": 91452768, + "router_z_loss_mlp": 0.33227539, + "step": 1104, + "time_per_iteration": 3.088672637939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100871, + "balance_loss_mlp": 1.06768262, + "epoch": 0.212581762216237, + "flos": 445984081920.0, + "grad_norm": 0.064105004549741, + "language_loss": 0.91047186, + "learning_rate": 0.0009151063180475805, + "loss": 0.9214806, + "num_input_tokens_seen": 91519888, + "router_z_loss_mlp": 0.33203125, + "step": 1105, + "time_per_iteration": 2.569998025894165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090698, + "balance_loss_mlp": 1.05798697, + "epoch": 0.21277414390150057, + "flos": 514129655808.0, + "grad_norm": 0.05967324045126681, + "language_loss": 0.84732658, + "learning_rate": 0.0009149325692034803, + "loss": 0.85823357, + "num_input_tokens_seen": 91585744, + "router_z_loss_mlp": 0.32714844, + "step": 1106, + "time_per_iteration": 2.6009016036987305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149918, + "balance_loss_mlp": 1.13380063, + "epoch": 0.21296652558676413, + "flos": 1484790552576.0, + "grad_norm": 0.04210654195905191, + "language_loss": 0.79203427, + "learning_rate": 0.0009147586592682702, + "loss": 0.80353343, + "num_input_tokens_seen": 91805840, + "router_z_loss_mlp": 0.16113281, + "step": 1107, + "time_per_iteration": 4.820629596710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082056, + "balance_loss_mlp": 1.04994082, + "epoch": 0.21315890727202771, + "flos": 845689669632.0, + "grad_norm": 0.07454945953507684, + "language_loss": 0.87513995, + "learning_rate": 0.0009145845883094678, + "loss": 0.88596046, + "num_input_tokens_seen": 91885936, + "router_z_loss_mlp": 0.32080078, + "step": 1108, + "time_per_iteration": 3.0311591625213623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083334, + "balance_loss_mlp": 1.04971695, + "epoch": 0.21335128895729127, + "flos": 629086376448.0, + "grad_norm": 0.07212897946446892, + "language_loss": 0.85387337, + "learning_rate": 0.000914410356394654, + "loss": 0.86470675, + "num_input_tokens_seen": 91959888, + "router_z_loss_mlp": 0.33642578, + "step": 1109, + "time_per_iteration": 2.7746968269348145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085071, + "balance_loss_mlp": 1.05102468, + "epoch": 0.21354367064255483, + "flos": 710649787392.0, + "grad_norm": 0.053148069764317206, + "language_loss": 0.85104829, + "learning_rate": 0.0009142359635914709, + "loss": 0.86189902, + "num_input_tokens_seen": 92043728, + "router_z_loss_mlp": 0.34057617, + "step": 1110, + "time_per_iteration": 3.109018564224243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083884, + "balance_loss_mlp": 1.05067194, + "epoch": 0.2137360523278184, + "flos": 455950688256.0, + "grad_norm": 0.07113647076789116, + "language_loss": 0.84692943, + "learning_rate": 0.0009140614099676245, + "loss": 0.8577683, + "num_input_tokens_seen": 92114096, + "router_z_loss_mlp": 0.33203125, + "step": 1111, + "time_per_iteration": 2.5607409477233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083673, + "balance_loss_mlp": 1.05072355, + "epoch": 0.21392843401308195, + "flos": 665749034496.0, + "grad_norm": 0.059219994241997045, + "language_loss": 0.82997137, + "learning_rate": 0.0009138866955908821, + "loss": 0.84080815, + "num_input_tokens_seen": 92193552, + "router_z_loss_mlp": 0.32958984, + "step": 1112, + "time_per_iteration": 2.901376724243164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086583, + "balance_loss_mlp": 1.05327559, + "epoch": 0.2141208156983455, + "flos": 748656843264.0, + "grad_norm": 0.06302145936378449, + "language_loss": 0.80617785, + "learning_rate": 0.0009137118205290738, + "loss": 0.81704366, + "num_input_tokens_seen": 92279248, + "router_z_loss_mlp": 0.33325195, + "step": 1113, + "time_per_iteration": 2.9629132747650146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085777, + "balance_loss_mlp": 1.05142069, + "epoch": 0.21431319738360907, + "flos": 418898124288.0, + "grad_norm": 0.06913372638273338, + "language_loss": 0.90778732, + "learning_rate": 0.0009135367848500924, + "loss": 0.91864502, + "num_input_tokens_seen": 92344064, + "router_z_loss_mlp": 0.34399414, + "step": 1114, + "time_per_iteration": 2.5860419273376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087087, + "balance_loss_mlp": 1.05406582, + "epoch": 0.21450557906887263, + "flos": 608849035776.0, + "grad_norm": 0.07370492115341919, + "language_loss": 0.86567605, + "learning_rate": 0.0009133615886218927, + "loss": 0.87654686, + "num_input_tokens_seen": 92410544, + "router_z_loss_mlp": 0.33032227, + "step": 1115, + "time_per_iteration": 2.6986265182495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095845, + "balance_loss_mlp": 1.06082106, + "epoch": 0.21469796075413622, + "flos": 561649393152.0, + "grad_norm": 0.0682239504380638, + "language_loss": 0.88444531, + "learning_rate": 0.0009131862319124917, + "loss": 0.89540386, + "num_input_tokens_seen": 92480272, + "router_z_loss_mlp": 0.3503418, + "step": 1116, + "time_per_iteration": 2.644977569580078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086331, + "balance_loss_mlp": 1.0540725, + "epoch": 0.21489034243939978, + "flos": 594363225600.0, + "grad_norm": 0.06937847326766512, + "language_loss": 0.8429122, + "learning_rate": 0.0009130107147899691, + "loss": 0.85377544, + "num_input_tokens_seen": 92555584, + "router_z_loss_mlp": 0.32250977, + "step": 1117, + "time_per_iteration": 2.768064498901367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084886, + "balance_loss_mlp": 1.05148315, + "epoch": 0.21508272412466334, + "flos": 441661317120.0, + "grad_norm": 0.09911577685113587, + "language_loss": 0.85504615, + "learning_rate": 0.0009128350373224665, + "loss": 0.86589503, + "num_input_tokens_seen": 92623136, + "router_z_loss_mlp": 0.33422852, + "step": 1118, + "time_per_iteration": 2.5369865894317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033841, + "balance_loss_mlp": 1.02206326, + "epoch": 0.2152751058099269, + "flos": 1495397348352.0, + "grad_norm": 0.028624916140058014, + "language_loss": 0.81456429, + "learning_rate": 0.0009126591995781883, + "loss": 0.82490271, + "num_input_tokens_seen": 92842608, + "router_z_loss_mlp": 0.11767578, + "step": 1119, + "time_per_iteration": 4.634536266326904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109091, + "balance_loss_mlp": 1.05741262, + "epoch": 0.21546748749519046, + "flos": 493759895040.0, + "grad_norm": 0.057336284262766976, + "language_loss": 0.85470641, + "learning_rate": 0.0009124832016254005, + "loss": 0.86561549, + "num_input_tokens_seen": 92912960, + "router_z_loss_mlp": 0.33520508, + "step": 1120, + "time_per_iteration": 2.57099986076355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097048, + "balance_loss_mlp": 1.06245303, + "epoch": 0.21565986918045402, + "flos": 634241387520.0, + "grad_norm": 0.0556622286599547, + "language_loss": 0.8842063, + "learning_rate": 0.0009123070435324316, + "loss": 0.89517677, + "num_input_tokens_seen": 92982272, + "router_z_loss_mlp": 0.34619141, + "step": 1121, + "time_per_iteration": 2.73698091506958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010392, + "balance_loss_mlp": 1.02780366, + "epoch": 0.21585225086571758, + "flos": 1582502704128.0, + "grad_norm": 0.024824935431588098, + "language_loss": 0.77875781, + "learning_rate": 0.0009121307253676722, + "loss": 0.78914982, + "num_input_tokens_seen": 93218752, + "router_z_loss_mlp": 0.11376953, + "step": 1122, + "time_per_iteration": 4.960963010787964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093163, + "balance_loss_mlp": 1.05897415, + "epoch": 0.21604463255098114, + "flos": 683799556608.0, + "grad_norm": 0.06637115500638362, + "language_loss": 0.86772728, + "learning_rate": 0.0009119542471995752, + "loss": 0.87865889, + "num_input_tokens_seen": 93293968, + "router_z_loss_mlp": 0.34204102, + "step": 1123, + "time_per_iteration": 2.819042205810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109659, + "balance_loss_mlp": 1.06228209, + "epoch": 0.2162370142362447, + "flos": 780660675072.0, + "grad_norm": 0.06221084946299637, + "language_loss": 0.81623554, + "learning_rate": 0.0009117776090966554, + "loss": 0.82720149, + "num_input_tokens_seen": 93367088, + "router_z_loss_mlp": 0.34326172, + "step": 1124, + "time_per_iteration": 2.9435975551605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090546, + "balance_loss_mlp": 1.05473578, + "epoch": 0.21642939592150828, + "flos": 1001745294336.0, + "grad_norm": 0.06219513600405685, + "language_loss": 0.86821365, + "learning_rate": 0.0009116008111274899, + "loss": 0.8791191, + "num_input_tokens_seen": 93452944, + "router_z_loss_mlp": 0.35839844, + "step": 1125, + "time_per_iteration": 3.250828504562378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01023544, + "balance_loss_mlp": 1.01271951, + "epoch": 0.21662177760677184, + "flos": 1481867440128.0, + "grad_norm": 0.013492425774453086, + "language_loss": 0.79106927, + "learning_rate": 0.0009114238533607176, + "loss": 0.8013047, + "num_input_tokens_seen": 93677328, + "router_z_loss_mlp": 0.10839844, + "step": 1126, + "time_per_iteration": 4.836662530899048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078043, + "balance_loss_mlp": 1.0431627, + "epoch": 0.2168141592920354, + "flos": 887030092800.0, + "grad_norm": 0.06408405180788145, + "language_loss": 0.84878719, + "learning_rate": 0.0009112467358650396, + "loss": 0.85956764, + "num_input_tokens_seen": 93756848, + "router_z_loss_mlp": 0.34912109, + "step": 1127, + "time_per_iteration": 3.118460178375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087154, + "balance_loss_mlp": 1.05205846, + "epoch": 0.21700654097729896, + "flos": 545682382848.0, + "grad_norm": 0.06014422622436645, + "language_loss": 0.86521864, + "learning_rate": 0.0009110694587092192, + "loss": 0.87609017, + "num_input_tokens_seen": 93834704, + "router_z_loss_mlp": 0.35131836, + "step": 1128, + "time_per_iteration": 2.736814022064209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080165, + "balance_loss_mlp": 1.0446167, + "epoch": 0.21719892266256252, + "flos": 509270008320.0, + "grad_norm": 0.06606219668196793, + "language_loss": 0.81429344, + "learning_rate": 0.0009108920219620815, + "loss": 0.82509506, + "num_input_tokens_seen": 93904448, + "router_z_loss_mlp": 0.35571289, + "step": 1129, + "time_per_iteration": 2.6214489936828613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083138, + "balance_loss_mlp": 1.04782772, + "epoch": 0.21739130434782608, + "flos": 543150738432.0, + "grad_norm": 0.060577581075914995, + "language_loss": 0.89903116, + "learning_rate": 0.0009107144256925133, + "loss": 0.90986252, + "num_input_tokens_seen": 93979312, + "router_z_loss_mlp": 0.35302734, + "step": 1130, + "time_per_iteration": 2.6337971687316895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079633, + "balance_loss_mlp": 1.04489541, + "epoch": 0.21758368603308964, + "flos": 616564804608.0, + "grad_norm": 0.0674499307688184, + "language_loss": 0.82610142, + "learning_rate": 0.0009105366699694638, + "loss": 0.83689773, + "num_input_tokens_seen": 94052032, + "router_z_loss_mlp": 0.34790039, + "step": 1131, + "time_per_iteration": 2.6984267234802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085371, + "balance_loss_mlp": 1.04979873, + "epoch": 0.2177760677183532, + "flos": 634813175808.0, + "grad_norm": 0.051829013054278075, + "language_loss": 0.8159321, + "learning_rate": 0.0009103587548619439, + "loss": 0.8267858, + "num_input_tokens_seen": 94124944, + "router_z_loss_mlp": 0.35571289, + "step": 1132, + "time_per_iteration": 2.8308732509613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083319, + "balance_loss_mlp": 1.04850996, + "epoch": 0.2179684494036168, + "flos": 532181587968.0, + "grad_norm": 0.06772780520844247, + "language_loss": 0.86115086, + "learning_rate": 0.0009101806804390261, + "loss": 0.87198412, + "num_input_tokens_seen": 94200384, + "router_z_loss_mlp": 0.34863281, + "step": 1133, + "time_per_iteration": 2.7745282649993896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081903, + "balance_loss_mlp": 1.04671264, + "epoch": 0.21816083108888035, + "flos": 474980433408.0, + "grad_norm": 0.05911376481567057, + "language_loss": 0.90451765, + "learning_rate": 0.0009100024467698453, + "loss": 0.91533667, + "num_input_tokens_seen": 94266992, + "router_z_loss_mlp": 0.35205078, + "step": 1134, + "time_per_iteration": 2.551278829574585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086865, + "balance_loss_mlp": 1.051054, + "epoch": 0.2183532127741439, + "flos": 577198794240.0, + "grad_norm": 0.07962415284192025, + "language_loss": 0.83050048, + "learning_rate": 0.0009098240539235981, + "loss": 0.84136909, + "num_input_tokens_seen": 94334304, + "router_z_loss_mlp": 0.3581543, + "step": 1135, + "time_per_iteration": 2.660019636154175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086508, + "balance_loss_mlp": 1.05172312, + "epoch": 0.21854559445940747, + "flos": 593832135168.0, + "grad_norm": 0.05867668726509775, + "language_loss": 0.87679315, + "learning_rate": 0.0009096455019695423, + "loss": 0.88765824, + "num_input_tokens_seen": 94413296, + "router_z_loss_mlp": 0.34838867, + "step": 1136, + "time_per_iteration": 2.756463050842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090029, + "balance_loss_mlp": 1.05426645, + "epoch": 0.21873797614467103, + "flos": 408464446464.0, + "grad_norm": 0.06290439978907646, + "language_loss": 0.90092266, + "learning_rate": 0.000909466790976998, + "loss": 0.91182297, + "num_input_tokens_seen": 94475840, + "router_z_loss_mlp": 0.35791016, + "step": 1137, + "time_per_iteration": 2.5046186447143555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089401, + "balance_loss_mlp": 1.05373359, + "epoch": 0.21893035782993459, + "flos": 893824865280.0, + "grad_norm": 0.05253297698454947, + "language_loss": 0.83030021, + "learning_rate": 0.0009092879210153473, + "loss": 0.84119421, + "num_input_tokens_seen": 94555184, + "router_z_loss_mlp": 0.35668945, + "step": 1138, + "time_per_iteration": 3.1294023990631104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092735, + "balance_loss_mlp": 1.05835557, + "epoch": 0.21912273951519814, + "flos": 467392702464.0, + "grad_norm": 0.05516730570048504, + "language_loss": 0.88930631, + "learning_rate": 0.0009091088921540333, + "loss": 0.90023363, + "num_input_tokens_seen": 94622656, + "router_z_loss_mlp": 0.34399414, + "step": 1139, + "time_per_iteration": 2.5161380767822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081896, + "balance_loss_mlp": 1.06921172, + "epoch": 0.2193151212004617, + "flos": 1531262665728.0, + "grad_norm": 0.036356034107047845, + "language_loss": 0.75508678, + "learning_rate": 0.0009089297044625615, + "loss": 0.76590574, + "num_input_tokens_seen": 94856496, + "router_z_loss_mlp": 0.12695312, + "step": 1140, + "time_per_iteration": 4.929131984710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087847, + "balance_loss_mlp": 1.05306172, + "epoch": 0.2195075028857253, + "flos": 590901820416.0, + "grad_norm": 0.07364984820319191, + "language_loss": 0.8488574, + "learning_rate": 0.0009087503580104985, + "loss": 0.85973585, + "num_input_tokens_seen": 94926880, + "router_z_loss_mlp": 0.34814453, + "step": 1141, + "time_per_iteration": 2.676321029663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087752, + "balance_loss_mlp": 1.05287158, + "epoch": 0.21969988457098885, + "flos": 636033917952.0, + "grad_norm": 0.0662048159418312, + "language_loss": 0.79610777, + "learning_rate": 0.0009085708528674728, + "loss": 0.80698538, + "num_input_tokens_seen": 95000528, + "router_z_loss_mlp": 0.34912109, + "step": 1142, + "time_per_iteration": 2.7667393684387207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088746, + "balance_loss_mlp": 1.05202913, + "epoch": 0.2198922662562524, + "flos": 911974311936.0, + "grad_norm": 0.07907290305355467, + "language_loss": 0.86086833, + "learning_rate": 0.0009083911891031745, + "loss": 0.87175578, + "num_input_tokens_seen": 95081040, + "router_z_loss_mlp": 0.36743164, + "step": 1143, + "time_per_iteration": 3.1026079654693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093334, + "balance_loss_mlp": 1.05809617, + "epoch": 0.22008464794151597, + "flos": 822603409920.0, + "grad_norm": 0.06284406217527433, + "language_loss": 0.91362917, + "learning_rate": 0.0009082113667873553, + "loss": 0.92456251, + "num_input_tokens_seen": 95167328, + "router_z_loss_mlp": 0.3527832, + "step": 1144, + "time_per_iteration": 3.098741292953491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107482, + "balance_loss_mlp": 1.07188582, + "epoch": 0.22027702962677953, + "flos": 459416475648.0, + "grad_norm": 0.06625631151069579, + "language_loss": 0.90562177, + "learning_rate": 0.0009080313859898283, + "loss": 0.91669661, + "num_input_tokens_seen": 95230304, + "router_z_loss_mlp": 0.35620117, + "step": 1145, + "time_per_iteration": 2.4998207092285156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111259, + "balance_loss_mlp": 1.07606888, + "epoch": 0.2204694113120431, + "flos": 530998723584.0, + "grad_norm": 0.05051092763003013, + "language_loss": 0.91815794, + "learning_rate": 0.0009078512467804684, + "loss": 0.92927051, + "num_input_tokens_seen": 95299520, + "router_z_loss_mlp": 0.35180664, + "step": 1146, + "time_per_iteration": 2.569073438644409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117194, + "balance_loss_mlp": 1.08162224, + "epoch": 0.22066179299730665, + "flos": 522382307328.0, + "grad_norm": 0.06837547739928014, + "language_loss": 0.90610331, + "learning_rate": 0.0009076709492292119, + "loss": 0.91727525, + "num_input_tokens_seen": 95368912, + "router_z_loss_mlp": 0.35571289, + "step": 1147, + "time_per_iteration": 2.614039659500122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114459, + "balance_loss_mlp": 1.07969809, + "epoch": 0.2208541746825702, + "flos": 546188742144.0, + "grad_norm": 0.06837160959472317, + "language_loss": 0.89193797, + "learning_rate": 0.0009074904934060562, + "loss": 0.90308249, + "num_input_tokens_seen": 95440800, + "router_z_loss_mlp": 0.34790039, + "step": 1148, + "time_per_iteration": 2.6419012546539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112121, + "balance_loss_mlp": 1.08578134, + "epoch": 0.22104655636783377, + "flos": 708404742144.0, + "grad_norm": 0.07108081727062696, + "language_loss": 0.84988266, + "learning_rate": 0.0009073098793810607, + "loss": 0.86109483, + "num_input_tokens_seen": 95519904, + "router_z_loss_mlp": 0.35473633, + "step": 1149, + "time_per_iteration": 2.909515142440796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116065, + "balance_loss_mlp": 1.08061242, + "epoch": 0.22123893805309736, + "flos": 584594468352.0, + "grad_norm": 0.07695680382665727, + "language_loss": 0.88374794, + "learning_rate": 0.000907129107224346, + "loss": 0.89490861, + "num_input_tokens_seen": 95591568, + "router_z_loss_mlp": 0.35522461, + "step": 1150, + "time_per_iteration": 2.7029008865356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099143, + "balance_loss_mlp": 1.06369042, + "epoch": 0.22143131973836092, + "flos": 492002270208.0, + "grad_norm": 0.049049579749502144, + "language_loss": 0.88305712, + "learning_rate": 0.0009069481770060939, + "loss": 0.89404863, + "num_input_tokens_seen": 95664480, + "router_z_loss_mlp": 0.35498047, + "step": 1151, + "time_per_iteration": 2.65167236328125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097606, + "balance_loss_mlp": 1.06248736, + "epoch": 0.22162370142362448, + "flos": 1079227459584.0, + "grad_norm": 0.054063738490033035, + "language_loss": 0.84240663, + "learning_rate": 0.000906767088796548, + "loss": 0.85338271, + "num_input_tokens_seen": 95754400, + "router_z_loss_mlp": 0.35180664, + "step": 1152, + "time_per_iteration": 3.423985004425049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110736, + "balance_loss_mlp": 1.07300401, + "epoch": 0.22181608310888803, + "flos": 492258345984.0, + "grad_norm": 0.057939830998464815, + "language_loss": 0.87012136, + "learning_rate": 0.0009065858426660127, + "loss": 0.88119501, + "num_input_tokens_seen": 95826944, + "router_z_loss_mlp": 0.34399414, + "step": 1153, + "time_per_iteration": 2.5987319946289062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108178, + "balance_loss_mlp": 1.07355952, + "epoch": 0.2220084647941516, + "flos": 723687892992.0, + "grad_norm": 0.0653796708212952, + "language_loss": 0.84926325, + "learning_rate": 0.0009064044386848543, + "loss": 0.86034507, + "num_input_tokens_seen": 95902688, + "router_z_loss_mlp": 0.34667969, + "step": 1154, + "time_per_iteration": 2.9024224281311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112372, + "balance_loss_mlp": 1.0753932, + "epoch": 0.22220084647941515, + "flos": 488988997632.0, + "grad_norm": 0.06606878403176955, + "language_loss": 0.88905716, + "learning_rate": 0.0009062228769234997, + "loss": 0.90018088, + "num_input_tokens_seen": 95969952, + "router_z_loss_mlp": 0.36987305, + "step": 1155, + "time_per_iteration": 2.5483920574188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104305, + "balance_loss_mlp": 1.06868565, + "epoch": 0.2223932281646787, + "flos": 536025696768.0, + "grad_norm": 0.06185680569649912, + "language_loss": 0.81360811, + "learning_rate": 0.0009060411574524376, + "loss": 0.82465118, + "num_input_tokens_seen": 96037344, + "router_z_loss_mlp": 0.35644531, + "step": 1156, + "time_per_iteration": 2.629166841506958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114341, + "balance_loss_mlp": 1.0794363, + "epoch": 0.22258560984994227, + "flos": 931034580480.0, + "grad_norm": 0.06288530021121215, + "language_loss": 0.88191485, + "learning_rate": 0.0009058592803422178, + "loss": 0.8930583, + "num_input_tokens_seen": 96115616, + "router_z_loss_mlp": 0.34936523, + "step": 1157, + "time_per_iteration": 3.133453845977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219449, + "balance_loss_mlp": 1.20495331, + "epoch": 0.22277799153520586, + "flos": 1198998443520.0, + "grad_norm": 0.06392494715081258, + "language_loss": 0.78710288, + "learning_rate": 0.0009056772456634512, + "loss": 0.79929739, + "num_input_tokens_seen": 96333600, + "router_z_loss_mlp": 0.14453125, + "step": 1158, + "time_per_iteration": 4.838433027267456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095231, + "balance_loss_mlp": 1.0620904, + "epoch": 0.22297037322046942, + "flos": 501052262400.0, + "grad_norm": 0.059439708082357066, + "language_loss": 0.90095651, + "learning_rate": 0.00090549505348681, + "loss": 0.91190875, + "num_input_tokens_seen": 96402544, + "router_z_loss_mlp": 0.33154297, + "step": 1159, + "time_per_iteration": 2.561887264251709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083603, + "balance_loss_mlp": 1.04977143, + "epoch": 0.22316275490573298, + "flos": 752413612032.0, + "grad_norm": 0.05610915875378834, + "language_loss": 0.84254742, + "learning_rate": 0.0009053127038830275, + "loss": 0.85338354, + "num_input_tokens_seen": 96487600, + "router_z_loss_mlp": 0.33862305, + "step": 1160, + "time_per_iteration": 2.9465925693511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082896, + "balance_loss_mlp": 1.04844403, + "epoch": 0.22335513659099654, + "flos": 514553057280.0, + "grad_norm": 0.06657410760601727, + "language_loss": 0.87182009, + "learning_rate": 0.000905130196922898, + "loss": 0.88264906, + "num_input_tokens_seen": 96554912, + "router_z_loss_mlp": 0.3449707, + "step": 1161, + "time_per_iteration": 2.597325325012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076013, + "balance_loss_mlp": 1.04213357, + "epoch": 0.2235475182762601, + "flos": 484286501376.0, + "grad_norm": 0.057467913173926514, + "language_loss": 0.87228084, + "learning_rate": 0.0009049475326772769, + "loss": 0.88304096, + "num_input_tokens_seen": 96624192, + "router_z_loss_mlp": 0.33911133, + "step": 1162, + "time_per_iteration": 2.591592788696289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107232, + "balance_loss_mlp": 1.0379163, + "epoch": 0.22373989996152366, + "flos": 469698794496.0, + "grad_norm": 0.05481831884816676, + "language_loss": 0.83362567, + "learning_rate": 0.0009047647112170811, + "loss": 0.84434885, + "num_input_tokens_seen": 96701040, + "router_z_loss_mlp": 0.34448242, + "step": 1163, + "time_per_iteration": 2.7466936111450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080512, + "balance_loss_mlp": 1.04503536, + "epoch": 0.22393228164678722, + "flos": 1270512594432.0, + "grad_norm": 0.0775991801606853, + "language_loss": 0.87402856, + "learning_rate": 0.0009045817326132876, + "loss": 0.88483369, + "num_input_tokens_seen": 96791200, + "router_z_loss_mlp": 0.35498047, + "step": 1164, + "time_per_iteration": 3.6615524291992188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082563, + "balance_loss_mlp": 1.04665732, + "epoch": 0.22412466333205078, + "flos": 596052449280.0, + "grad_norm": 0.05603114612800397, + "language_loss": 0.83484542, + "learning_rate": 0.0009043985969369357, + "loss": 0.84567106, + "num_input_tokens_seen": 96869360, + "router_z_loss_mlp": 0.35913086, + "step": 1165, + "time_per_iteration": 2.800389528274536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084793, + "balance_loss_mlp": 1.047647, + "epoch": 0.22431704501731436, + "flos": 608136062976.0, + "grad_norm": 0.052919924442321326, + "language_loss": 0.84423298, + "learning_rate": 0.0009042153042591245, + "loss": 0.85508084, + "num_input_tokens_seen": 96945840, + "router_z_loss_mlp": 0.37158203, + "step": 1166, + "time_per_iteration": 2.7848384380340576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080872, + "balance_loss_mlp": 1.04622972, + "epoch": 0.22450942670257792, + "flos": 906203842560.0, + "grad_norm": 0.054053491984114646, + "language_loss": 0.85318398, + "learning_rate": 0.0009040318546510146, + "loss": 0.86399269, + "num_input_tokens_seen": 97029296, + "router_z_loss_mlp": 0.34667969, + "step": 1167, + "time_per_iteration": 3.1406538486480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080871, + "balance_loss_mlp": 1.04529881, + "epoch": 0.22470180838784148, + "flos": 565032222720.0, + "grad_norm": 0.06590224184570584, + "language_loss": 0.85490131, + "learning_rate": 0.0009038482481838275, + "loss": 0.86571002, + "num_input_tokens_seen": 97097776, + "router_z_loss_mlp": 0.35620117, + "step": 1168, + "time_per_iteration": 2.6623363494873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107832, + "balance_loss_mlp": 1.04265296, + "epoch": 0.22489419007310504, + "flos": 834109443072.0, + "grad_norm": 0.05295244004415107, + "language_loss": 0.87364161, + "learning_rate": 0.0009036644849288455, + "loss": 0.88442481, + "num_input_tokens_seen": 97181424, + "router_z_loss_mlp": 0.35668945, + "step": 1169, + "time_per_iteration": 3.096397638320923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082872, + "balance_loss_mlp": 1.04567838, + "epoch": 0.2250865717583686, + "flos": 580788237312.0, + "grad_norm": 0.06189616009675637, + "language_loss": 0.85257494, + "learning_rate": 0.0009034805649574118, + "loss": 0.86340362, + "num_input_tokens_seen": 97252128, + "router_z_loss_mlp": 0.37207031, + "step": 1170, + "time_per_iteration": 2.655629873275757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081931, + "balance_loss_mlp": 1.04669285, + "epoch": 0.22527895344363216, + "flos": 600091435008.0, + "grad_norm": 0.0574349936504533, + "language_loss": 0.85081124, + "learning_rate": 0.0009032964883409308, + "loss": 0.86163056, + "num_input_tokens_seen": 97326640, + "router_z_loss_mlp": 0.35253906, + "step": 1171, + "time_per_iteration": 2.9228479862213135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096453, + "balance_loss_mlp": 1.08109915, + "epoch": 0.22547133512889572, + "flos": 1440009073152.0, + "grad_norm": 0.03435009764288223, + "language_loss": 0.73050535, + "learning_rate": 0.000903112255150867, + "loss": 0.74146986, + "num_input_tokens_seen": 97553952, + "router_z_loss_mlp": 0.15332031, + "step": 1172, + "time_per_iteration": 4.9870195388793945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099751, + "balance_loss_mlp": 1.06365418, + "epoch": 0.22566371681415928, + "flos": 490377065472.0, + "grad_norm": 0.06750207251725504, + "language_loss": 0.87597418, + "learning_rate": 0.0009029278654587462, + "loss": 0.88697171, + "num_input_tokens_seen": 97623584, + "router_z_loss_mlp": 0.36108398, + "step": 1173, + "time_per_iteration": 2.545078754425049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098309, + "balance_loss_mlp": 1.06156898, + "epoch": 0.22585609849942284, + "flos": 604334214144.0, + "grad_norm": 0.06244795934891309, + "language_loss": 0.82517409, + "learning_rate": 0.0009027433193361548, + "loss": 0.8361572, + "num_input_tokens_seen": 97695952, + "router_z_loss_mlp": 0.36767578, + "step": 1174, + "time_per_iteration": 2.69753098487854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100346, + "balance_loss_mlp": 1.06305695, + "epoch": 0.22604848018468643, + "flos": 635280247296.0, + "grad_norm": 0.06854123529633785, + "language_loss": 0.87138826, + "learning_rate": 0.00090255861685474, + "loss": 0.88239175, + "num_input_tokens_seen": 97764544, + "router_z_loss_mlp": 0.37280273, + "step": 1175, + "time_per_iteration": 2.7199149131774902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094451, + "balance_loss_mlp": 1.05744886, + "epoch": 0.22624086186995, + "flos": 479633467392.0, + "grad_norm": 0.06836538183258173, + "language_loss": 0.91474092, + "learning_rate": 0.0009023737580862095, + "loss": 0.92568541, + "num_input_tokens_seen": 97830976, + "router_z_loss_mlp": 0.36962891, + "step": 1176, + "time_per_iteration": 2.51255464553833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092025, + "balance_loss_mlp": 1.0570724, + "epoch": 0.22643324355521355, + "flos": 495566982144.0, + "grad_norm": 0.05906016973995859, + "language_loss": 0.83066601, + "learning_rate": 0.0009021887431023321, + "loss": 0.84158623, + "num_input_tokens_seen": 97898800, + "router_z_loss_mlp": 0.34985352, + "step": 1177, + "time_per_iteration": 2.5783960819244385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084144, + "balance_loss_mlp": 1.04842877, + "epoch": 0.2266256252404771, + "flos": 561271071744.0, + "grad_norm": 0.05542928649781209, + "language_loss": 0.87720597, + "learning_rate": 0.0009020035719749369, + "loss": 0.8880474, + "num_input_tokens_seen": 97974112, + "router_z_loss_mlp": 0.35742188, + "step": 1178, + "time_per_iteration": 2.7076900005340576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088743, + "balance_loss_mlp": 1.05259871, + "epoch": 0.22681800692574067, + "flos": 579353527296.0, + "grad_norm": 0.05892405405909356, + "language_loss": 0.77506709, + "learning_rate": 0.0009018182447759136, + "loss": 0.78595448, + "num_input_tokens_seen": 98056640, + "router_z_loss_mlp": 0.36157227, + "step": 1179, + "time_per_iteration": 2.974362373352051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083275, + "balance_loss_mlp": 1.04798961, + "epoch": 0.22701038861100423, + "flos": 739842577920.0, + "grad_norm": 0.0555118465290956, + "language_loss": 0.80168724, + "learning_rate": 0.0009016327615772126, + "loss": 0.81252003, + "num_input_tokens_seen": 98135952, + "router_z_loss_mlp": 0.35327148, + "step": 1180, + "time_per_iteration": 2.9207658767700195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082522, + "balance_loss_mlp": 1.04776096, + "epoch": 0.2272027702962678, + "flos": 576996562944.0, + "grad_norm": 0.06857059729731818, + "language_loss": 0.88146389, + "learning_rate": 0.0009014471224508451, + "loss": 0.8922891, + "num_input_tokens_seen": 98204288, + "router_z_loss_mlp": 0.34790039, + "step": 1181, + "time_per_iteration": 2.6884429454803467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080826, + "balance_loss_mlp": 1.0466603, + "epoch": 0.22739515198153135, + "flos": 544012098048.0, + "grad_norm": 0.07386093909180869, + "language_loss": 0.83020878, + "learning_rate": 0.0009012613274688823, + "loss": 0.84101701, + "num_input_tokens_seen": 98269856, + "router_z_loss_mlp": 0.34204102, + "step": 1182, + "time_per_iteration": 2.625973701477051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082925, + "balance_loss_mlp": 1.04735291, + "epoch": 0.22758753366679493, + "flos": 439932805632.0, + "grad_norm": 0.06621157637783351, + "language_loss": 0.87839937, + "learning_rate": 0.0009010753767034565, + "loss": 0.88922858, + "num_input_tokens_seen": 98335632, + "router_z_loss_mlp": 0.35571289, + "step": 1183, + "time_per_iteration": 2.545569658279419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086712, + "balance_loss_mlp": 1.05030489, + "epoch": 0.2277799153520585, + "flos": 729104772096.0, + "grad_norm": 0.07242159959797279, + "language_loss": 0.79501748, + "learning_rate": 0.0009008892702267599, + "loss": 0.80588454, + "num_input_tokens_seen": 98420592, + "router_z_loss_mlp": 0.36425781, + "step": 1184, + "time_per_iteration": 2.9862120151519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089174, + "balance_loss_mlp": 1.05322075, + "epoch": 0.22797229703732205, + "flos": 526641053184.0, + "grad_norm": 0.0740207336504876, + "language_loss": 0.89059424, + "learning_rate": 0.0009007030081110457, + "loss": 0.90148592, + "num_input_tokens_seen": 98488096, + "router_z_loss_mlp": 0.35961914, + "step": 1185, + "time_per_iteration": 2.6184284687042236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083236, + "balance_loss_mlp": 1.04783034, + "epoch": 0.2281646787225856, + "flos": 535159954944.0, + "grad_norm": 0.06479663876551665, + "language_loss": 0.84969211, + "learning_rate": 0.000900516590428627, + "loss": 0.86052454, + "num_input_tokens_seen": 98561664, + "router_z_loss_mlp": 0.35449219, + "step": 1186, + "time_per_iteration": 2.724161386489868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083897, + "balance_loss_mlp": 1.049088, + "epoch": 0.22835706040784917, + "flos": 541107924480.0, + "grad_norm": 0.052728830082858405, + "language_loss": 0.89177948, + "learning_rate": 0.0009003300172518778, + "loss": 0.90261841, + "num_input_tokens_seen": 98634336, + "router_z_loss_mlp": 0.34790039, + "step": 1187, + "time_per_iteration": 2.6810121536254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108576, + "balance_loss_mlp": 1.05018783, + "epoch": 0.22854944209311273, + "flos": 790297012224.0, + "grad_norm": 0.05376177869775473, + "language_loss": 0.84676045, + "learning_rate": 0.0009001432886532321, + "loss": 0.85761803, + "num_input_tokens_seen": 98709600, + "router_z_loss_mlp": 0.35571289, + "step": 1188, + "time_per_iteration": 2.977433919906616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109735, + "balance_loss_mlp": 1.0614686, + "epoch": 0.2287418237783763, + "flos": 469047020544.0, + "grad_norm": 0.06589135500726684, + "language_loss": 0.86752445, + "learning_rate": 0.0008999564047051843, + "loss": 0.87849802, + "num_input_tokens_seen": 98775024, + "router_z_loss_mlp": 0.35913086, + "step": 1189, + "time_per_iteration": 2.5126237869262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094774, + "balance_loss_mlp": 1.06017935, + "epoch": 0.22893420546363985, + "flos": 467786990592.0, + "grad_norm": 0.061551577223012334, + "language_loss": 0.84713042, + "learning_rate": 0.0008997693654802894, + "loss": 0.85807812, + "num_input_tokens_seen": 98845248, + "router_z_loss_mlp": 0.34643555, + "step": 1190, + "time_per_iteration": 2.6570322513580322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088276, + "balance_loss_mlp": 1.05318046, + "epoch": 0.22912658714890344, + "flos": 625974179328.0, + "grad_norm": 0.05326512300588333, + "language_loss": 0.86549705, + "learning_rate": 0.0008995821710511625, + "loss": 0.87637979, + "num_input_tokens_seen": 98913584, + "router_z_loss_mlp": 0.35107422, + "step": 1191, + "time_per_iteration": 2.8115806579589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108036, + "balance_loss_mlp": 1.04731488, + "epoch": 0.229318968834167, + "flos": 502785156096.0, + "grad_norm": 0.06330680163661413, + "language_loss": 0.8511278, + "learning_rate": 0.0008993948214904786, + "loss": 0.86193144, + "num_input_tokens_seen": 98978608, + "router_z_loss_mlp": 0.33056641, + "step": 1192, + "time_per_iteration": 2.546410083770752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125589, + "balance_loss_mlp": 1.11023474, + "epoch": 0.22951135051943056, + "flos": 1374108544512.0, + "grad_norm": 0.06153086464986019, + "language_loss": 0.78422213, + "learning_rate": 0.0008992073168709733, + "loss": 0.79547799, + "num_input_tokens_seen": 99207424, + "router_z_loss_mlp": 0.15332031, + "step": 1193, + "time_per_iteration": 4.891269207000732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099997, + "balance_loss_mlp": 1.06306624, + "epoch": 0.22970373220469412, + "flos": 644045050368.0, + "grad_norm": 0.06658536787009234, + "language_loss": 0.79028845, + "learning_rate": 0.0008990196572654427, + "loss": 0.80128849, + "num_input_tokens_seen": 99290592, + "router_z_loss_mlp": 0.36914062, + "step": 1194, + "time_per_iteration": 2.8504366874694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100467, + "balance_loss_mlp": 1.06582475, + "epoch": 0.22989611388995768, + "flos": 499945001472.0, + "grad_norm": 0.048025217626556156, + "language_loss": 0.87748766, + "learning_rate": 0.0008988318427467426, + "loss": 0.88849235, + "num_input_tokens_seen": 99366096, + "router_z_loss_mlp": 0.34667969, + "step": 1195, + "time_per_iteration": 2.735084056854248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099859, + "balance_loss_mlp": 1.06524014, + "epoch": 0.23008849557522124, + "flos": 1096071796224.0, + "grad_norm": 0.06731751876810108, + "language_loss": 0.86263168, + "learning_rate": 0.0008986438733877887, + "loss": 0.87363023, + "num_input_tokens_seen": 99456768, + "router_z_loss_mlp": 0.34667969, + "step": 1196, + "time_per_iteration": 3.435035228729248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100924, + "balance_loss_mlp": 1.06606746, + "epoch": 0.2302808772604848, + "flos": 683313546240.0, + "grad_norm": 0.04733445604251135, + "language_loss": 0.84099567, + "learning_rate": 0.0008984557492615576, + "loss": 0.85200489, + "num_input_tokens_seen": 99539616, + "router_z_loss_mlp": 0.34887695, + "step": 1197, + "time_per_iteration": 2.927668809890747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107534, + "balance_loss_mlp": 1.07317793, + "epoch": 0.23047325894574835, + "flos": 528664928256.0, + "grad_norm": 0.0630370564804667, + "language_loss": 0.89949608, + "learning_rate": 0.0008982674704410854, + "loss": 0.91057146, + "num_input_tokens_seen": 99612064, + "router_z_loss_mlp": 0.34399414, + "step": 1198, + "time_per_iteration": 2.691016435623169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107479, + "balance_loss_mlp": 1.07228875, + "epoch": 0.23066564063101191, + "flos": 682427455488.0, + "grad_norm": 0.06209648084563375, + "language_loss": 0.77829844, + "learning_rate": 0.0008980790369994682, + "loss": 0.78937328, + "num_input_tokens_seen": 99691040, + "router_z_loss_mlp": 0.35205078, + "step": 1199, + "time_per_iteration": 2.9320883750915527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100492, + "balance_loss_mlp": 1.06525421, + "epoch": 0.2308580223162755, + "flos": 558247624704.0, + "grad_norm": 0.09180159748966574, + "language_loss": 0.87396461, + "learning_rate": 0.000897890449009863, + "loss": 0.88496947, + "num_input_tokens_seen": 99762016, + "router_z_loss_mlp": 0.3527832, + "step": 1200, + "time_per_iteration": 2.6804869174957275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096306, + "balance_loss_mlp": 1.06183052, + "epoch": 0.23105040400153906, + "flos": 555406060032.0, + "grad_norm": 0.05982856494430897, + "language_loss": 0.90313268, + "learning_rate": 0.0008977017065454853, + "loss": 0.9140957, + "num_input_tokens_seen": 99835552, + "router_z_loss_mlp": 0.3449707, + "step": 1201, + "time_per_iteration": 2.639636754989624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090282, + "balance_loss_mlp": 1.05556786, + "epoch": 0.23124278568680262, + "flos": 704474855424.0, + "grad_norm": 0.06077351963181601, + "language_loss": 0.80804175, + "learning_rate": 0.0008975128096796121, + "loss": 0.81894457, + "num_input_tokens_seen": 99910784, + "router_z_loss_mlp": 0.34765625, + "step": 1202, + "time_per_iteration": 2.8410260677337646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078695, + "balance_loss_mlp": 1.04431474, + "epoch": 0.23143516737206618, + "flos": 612469002240.0, + "grad_norm": 0.07413481943536562, + "language_loss": 0.85940087, + "learning_rate": 0.0008973237584855794, + "loss": 0.87018776, + "num_input_tokens_seen": 99991120, + "router_z_loss_mlp": 0.34423828, + "step": 1203, + "time_per_iteration": 2.898423671722412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077022, + "balance_loss_mlp": 1.04233205, + "epoch": 0.23162754905732974, + "flos": 389030238720.0, + "grad_norm": 0.06038618944932519, + "language_loss": 0.8201915, + "learning_rate": 0.0008971345530367832, + "loss": 0.83096182, + "num_input_tokens_seen": 100053888, + "router_z_loss_mlp": 0.34716797, + "step": 1204, + "time_per_iteration": 2.486668586730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082339, + "balance_loss_mlp": 1.04738712, + "epoch": 0.2318199307425933, + "flos": 667481928192.0, + "grad_norm": 0.05427081728260985, + "language_loss": 0.85029405, + "learning_rate": 0.0008969451934066799, + "loss": 0.86111748, + "num_input_tokens_seen": 100124176, + "router_z_loss_mlp": 0.34960938, + "step": 1205, + "time_per_iteration": 2.771306276321411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091653, + "balance_loss_mlp": 1.05662966, + "epoch": 0.23201231242785686, + "flos": 666093860352.0, + "grad_norm": 0.0707913589572404, + "language_loss": 0.80143172, + "learning_rate": 0.0008967556796687854, + "loss": 0.81234825, + "num_input_tokens_seen": 100205296, + "router_z_loss_mlp": 0.35058594, + "step": 1206, + "time_per_iteration": 2.8904309272766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087698, + "balance_loss_mlp": 1.05353224, + "epoch": 0.23220469411312042, + "flos": 748498281984.0, + "grad_norm": 0.05559113870944949, + "language_loss": 0.83954245, + "learning_rate": 0.0008965660118966752, + "loss": 0.8504194, + "num_input_tokens_seen": 100279440, + "router_z_loss_mlp": 0.34204102, + "step": 1207, + "time_per_iteration": 2.9140615463256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108846, + "balance_loss_mlp": 1.05529559, + "epoch": 0.232397075798384, + "flos": 666763163136.0, + "grad_norm": 0.04975334384076733, + "language_loss": 0.90441763, + "learning_rate": 0.0008963761901639851, + "loss": 0.91530222, + "num_input_tokens_seen": 100354512, + "router_z_loss_mlp": 0.33154297, + "step": 1208, + "time_per_iteration": 2.8032286167144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094486, + "balance_loss_mlp": 1.06008244, + "epoch": 0.23258945748364757, + "flos": 609937357824.0, + "grad_norm": 0.05840728669351643, + "language_loss": 0.83201033, + "learning_rate": 0.0008961862145444103, + "loss": 0.84295517, + "num_input_tokens_seen": 100426848, + "router_z_loss_mlp": 0.34399414, + "step": 1209, + "time_per_iteration": 2.7161943912506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094305, + "balance_loss_mlp": 1.05954397, + "epoch": 0.23278183916891113, + "flos": 489397842432.0, + "grad_norm": 0.06743904317466738, + "language_loss": 0.85216832, + "learning_rate": 0.0008959960851117059, + "loss": 0.86311138, + "num_input_tokens_seen": 100496176, + "router_z_loss_mlp": 0.34790039, + "step": 1210, + "time_per_iteration": 2.5817031860351562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095047, + "balance_loss_mlp": 1.06014228, + "epoch": 0.23297422085417469, + "flos": 511314232320.0, + "grad_norm": 0.057575168534165826, + "language_loss": 0.84338427, + "learning_rate": 0.0008958058019396868, + "loss": 0.85433477, + "num_input_tokens_seen": 100575072, + "router_z_loss_mlp": 0.34936523, + "step": 1211, + "time_per_iteration": 2.7788164615631104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091213, + "balance_loss_mlp": 1.05730987, + "epoch": 0.23316660253943824, + "flos": 546145072128.0, + "grad_norm": 0.057082370400879795, + "language_loss": 0.86897939, + "learning_rate": 0.0008956153651022274, + "loss": 0.87989151, + "num_input_tokens_seen": 100648304, + "router_z_loss_mlp": 0.33935547, + "step": 1212, + "time_per_iteration": 2.7309062480926514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090204, + "balance_loss_mlp": 1.05608642, + "epoch": 0.2333589842247018, + "flos": 509998947840.0, + "grad_norm": 0.06317396982696966, + "language_loss": 0.84641176, + "learning_rate": 0.0008954247746732618, + "loss": 0.85731381, + "num_input_tokens_seen": 100717616, + "router_z_loss_mlp": 0.34155273, + "step": 1213, + "time_per_iteration": 2.619058609008789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084199, + "balance_loss_mlp": 1.05072522, + "epoch": 0.23355136590996536, + "flos": 662834686464.0, + "grad_norm": 0.09780220222501788, + "language_loss": 0.90954423, + "learning_rate": 0.0008952340307267837, + "loss": 0.9203862, + "num_input_tokens_seen": 100797056, + "router_z_loss_mlp": 0.33496094, + "step": 1214, + "time_per_iteration": 2.869351387023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108593, + "balance_loss_mlp": 1.05128753, + "epoch": 0.23374374759522892, + "flos": 508206417408.0, + "grad_norm": 0.061496426320555984, + "language_loss": 0.83373952, + "learning_rate": 0.0008950431333368468, + "loss": 0.84459883, + "num_input_tokens_seen": 100863632, + "router_z_loss_mlp": 0.34667969, + "step": 1215, + "time_per_iteration": 2.5557806491851807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093645, + "balance_loss_mlp": 1.05928898, + "epoch": 0.2339361292804925, + "flos": 1293964028928.0, + "grad_norm": 0.062331860667319446, + "language_loss": 0.84730738, + "learning_rate": 0.0008948520825775634, + "loss": 0.85824382, + "num_input_tokens_seen": 100950272, + "router_z_loss_mlp": 0.34399414, + "step": 1216, + "time_per_iteration": 3.6050164699554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098293, + "balance_loss_mlp": 1.06343639, + "epoch": 0.23412851096575607, + "flos": 705617021952.0, + "grad_norm": 0.06500023378725601, + "language_loss": 0.84162283, + "learning_rate": 0.0008946608785231067, + "loss": 0.8526057, + "num_input_tokens_seen": 101031008, + "router_z_loss_mlp": 0.34863281, + "step": 1217, + "time_per_iteration": 2.858696699142456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109995, + "balance_loss_mlp": 1.065045, + "epoch": 0.23432089265101963, + "flos": 438036968448.0, + "grad_norm": 0.06356573317347913, + "language_loss": 0.84325957, + "learning_rate": 0.0008944695212477084, + "loss": 0.85425907, + "num_input_tokens_seen": 101094688, + "router_z_loss_mlp": 0.34912109, + "step": 1218, + "time_per_iteration": 2.4787168502807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107192, + "balance_loss_mlp": 1.07190585, + "epoch": 0.2345132743362832, + "flos": 480697058304.0, + "grad_norm": 0.05460931090439532, + "language_loss": 0.86098325, + "learning_rate": 0.0008942780108256599, + "loss": 0.87205517, + "num_input_tokens_seen": 101163744, + "router_z_loss_mlp": 0.35327148, + "step": 1219, + "time_per_iteration": 2.574692726135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105521, + "balance_loss_mlp": 1.06966305, + "epoch": 0.23470565602154675, + "flos": 411231817728.0, + "grad_norm": 0.05853057396081394, + "language_loss": 0.86360055, + "learning_rate": 0.0008940863473313121, + "loss": 0.87465572, + "num_input_tokens_seen": 101226480, + "router_z_loss_mlp": 0.35839844, + "step": 1220, + "time_per_iteration": 2.4849462509155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115361, + "balance_loss_mlp": 1.08024168, + "epoch": 0.2348980377068103, + "flos": 545189170176.0, + "grad_norm": 0.0745659618807548, + "language_loss": 0.87691534, + "learning_rate": 0.0008938945308390756, + "loss": 0.88806891, + "num_input_tokens_seen": 101291824, + "router_z_loss_mlp": 0.3515625, + "step": 1221, + "time_per_iteration": 2.6100285053253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107677, + "balance_loss_mlp": 1.07248664, + "epoch": 0.23509041939207387, + "flos": 575465900544.0, + "grad_norm": 0.055913245264753976, + "language_loss": 0.87316763, + "learning_rate": 0.00089370256142342, + "loss": 0.88424438, + "num_input_tokens_seen": 101367216, + "router_z_loss_mlp": 0.35205078, + "step": 1222, + "time_per_iteration": 2.726897716522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109541, + "balance_loss_mlp": 1.06007659, + "epoch": 0.23528280107733743, + "flos": 588568025088.0, + "grad_norm": 0.04976165943815558, + "language_loss": 0.85095507, + "learning_rate": 0.0008935104391588746, + "loss": 0.86190915, + "num_input_tokens_seen": 101438992, + "router_z_loss_mlp": 0.35351562, + "step": 1223, + "time_per_iteration": 2.7249879837036133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108966, + "balance_loss_mlp": 1.07313156, + "epoch": 0.235475182762601, + "flos": 822948235776.0, + "grad_norm": 0.05651852634602403, + "language_loss": 0.82749176, + "learning_rate": 0.0008933181641200276, + "loss": 0.83858138, + "num_input_tokens_seen": 101534464, + "router_z_loss_mlp": 0.35839844, + "step": 1224, + "time_per_iteration": 3.1651737689971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094197, + "balance_loss_mlp": 1.06017447, + "epoch": 0.23566756444786457, + "flos": 679865287680.0, + "grad_norm": 0.06356150049585653, + "language_loss": 0.8609674, + "learning_rate": 0.0008931257363815271, + "loss": 0.87190938, + "num_input_tokens_seen": 101616496, + "router_z_loss_mlp": 0.34033203, + "step": 1225, + "time_per_iteration": 2.891789674758911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091395, + "balance_loss_mlp": 1.05711007, + "epoch": 0.23585994613312813, + "flos": 701481931776.0, + "grad_norm": 0.04853721262867189, + "language_loss": 0.89892405, + "learning_rate": 0.0008929331560180798, + "loss": 0.90983796, + "num_input_tokens_seen": 101694496, + "router_z_loss_mlp": 0.34277344, + "step": 1226, + "time_per_iteration": 2.934101104736328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093973, + "balance_loss_mlp": 1.06002271, + "epoch": 0.2360523278183917, + "flos": 523923144192.0, + "grad_norm": 0.06491881814379113, + "language_loss": 0.91129786, + "learning_rate": 0.0008927404231044525, + "loss": 0.92223763, + "num_input_tokens_seen": 101766160, + "router_z_loss_mlp": 0.33984375, + "step": 1227, + "time_per_iteration": 2.682377815246582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083665, + "balance_loss_mlp": 1.0493325, + "epoch": 0.23624470950365525, + "flos": 524027860992.0, + "grad_norm": 0.053423388326064705, + "language_loss": 0.81944436, + "learning_rate": 0.0008925475377154703, + "loss": 0.83028102, + "num_input_tokens_seen": 101844160, + "router_z_loss_mlp": 0.34375, + "step": 1228, + "time_per_iteration": 2.7511117458343506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077291, + "balance_loss_mlp": 1.04169512, + "epoch": 0.2364370911889188, + "flos": 596525313024.0, + "grad_norm": 0.05836717970983508, + "language_loss": 0.8241868, + "learning_rate": 0.0008923544999260183, + "loss": 0.83495975, + "num_input_tokens_seen": 101917968, + "router_z_loss_mlp": 0.35644531, + "step": 1229, + "time_per_iteration": 2.7915079593658447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087923, + "balance_loss_mlp": 1.05194569, + "epoch": 0.23662947287418237, + "flos": 756519588864.0, + "grad_norm": 0.08156392485297027, + "language_loss": 0.91757852, + "learning_rate": 0.00089216130981104, + "loss": 0.92845774, + "num_input_tokens_seen": 101996880, + "router_z_loss_mlp": 0.35986328, + "step": 1230, + "time_per_iteration": 3.037900924682617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089884, + "balance_loss_mlp": 1.0531199, + "epoch": 0.23682185455944593, + "flos": 545907935232.0, + "grad_norm": 0.05473268619072285, + "language_loss": 0.82659578, + "learning_rate": 0.000891967967445539, + "loss": 0.83749461, + "num_input_tokens_seen": 102067936, + "router_z_loss_mlp": 0.36743164, + "step": 1231, + "time_per_iteration": 2.6595497131347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088497, + "balance_loss_mlp": 1.05201924, + "epoch": 0.2370142362447095, + "flos": 661977709056.0, + "grad_norm": 0.04604146434030928, + "language_loss": 0.88502473, + "learning_rate": 0.0008917744729045772, + "loss": 0.89590967, + "num_input_tokens_seen": 102147552, + "router_z_loss_mlp": 0.36499023, + "step": 1232, + "time_per_iteration": 2.851651668548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095833, + "balance_loss_mlp": 1.05868709, + "epoch": 0.23720661792997308, + "flos": 683361598464.0, + "grad_norm": 0.06835104069372223, + "language_loss": 0.84165156, + "learning_rate": 0.0008915808262632757, + "loss": 0.85260987, + "num_input_tokens_seen": 102224480, + "router_z_loss_mlp": 0.37133789, + "step": 1233, + "time_per_iteration": 2.8114235401153564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095976, + "balance_loss_mlp": 1.05892599, + "epoch": 0.23739899961523664, + "flos": 558631738368.0, + "grad_norm": 0.055258261409357204, + "language_loss": 0.92769438, + "learning_rate": 0.0008913870275968148, + "loss": 0.93865418, + "num_input_tokens_seen": 102297392, + "router_z_loss_mlp": 0.37036133, + "step": 1234, + "time_per_iteration": 2.705349922180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092916, + "balance_loss_mlp": 1.05629516, + "epoch": 0.2375913813005002, + "flos": 889144128000.0, + "grad_norm": 0.12876854850300654, + "language_loss": 0.87540263, + "learning_rate": 0.0008911930769804342, + "loss": 0.8863318, + "num_input_tokens_seen": 102386032, + "router_z_loss_mlp": 0.36621094, + "step": 1235, + "time_per_iteration": 3.2342941761016846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091192, + "balance_loss_mlp": 1.05492854, + "epoch": 0.23778376298576376, + "flos": 640810607616.0, + "grad_norm": 0.044375832072417805, + "language_loss": 0.91481459, + "learning_rate": 0.0008909989744894318, + "loss": 0.92572653, + "num_input_tokens_seen": 102463504, + "router_z_loss_mlp": 0.36303711, + "step": 1236, + "time_per_iteration": 2.8858232498168945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089597, + "balance_loss_mlp": 1.05333364, + "epoch": 0.23797614467102732, + "flos": 616540073472.0, + "grad_norm": 0.05892762197337364, + "language_loss": 0.81707233, + "learning_rate": 0.0008908047201991649, + "loss": 0.82796836, + "num_input_tokens_seen": 102529632, + "router_z_loss_mlp": 0.36279297, + "step": 1237, + "time_per_iteration": 2.785226583480835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085624, + "balance_loss_mlp": 1.05071974, + "epoch": 0.23816852635629088, + "flos": 623941539840.0, + "grad_norm": 0.051487502947417364, + "language_loss": 0.86561942, + "learning_rate": 0.0008906103141850502, + "loss": 0.87647569, + "num_input_tokens_seen": 102610192, + "router_z_loss_mlp": 0.34960938, + "step": 1238, + "time_per_iteration": 2.868241310119629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095513, + "balance_loss_mlp": 1.05901158, + "epoch": 0.23836090804155444, + "flos": 521180504064.0, + "grad_norm": 0.07170300234131513, + "language_loss": 0.88119614, + "learning_rate": 0.0008904157565225621, + "loss": 0.89215136, + "num_input_tokens_seen": 102681216, + "router_z_loss_mlp": 0.36499023, + "step": 1239, + "time_per_iteration": 2.610048294067383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092952, + "balance_loss_mlp": 1.05716562, + "epoch": 0.238553289726818, + "flos": 1153527616512.0, + "grad_norm": 0.07764557472008667, + "language_loss": 0.82042629, + "learning_rate": 0.000890221047287235, + "loss": 0.83135581, + "num_input_tokens_seen": 102777184, + "router_z_loss_mlp": 0.3581543, + "step": 1240, + "time_per_iteration": 3.547147274017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102634, + "balance_loss_mlp": 1.06772995, + "epoch": 0.23874567141208156, + "flos": 499600175616.0, + "grad_norm": 0.07123563443936186, + "language_loss": 0.91052604, + "learning_rate": 0.0008900261865546615, + "loss": 0.92155242, + "num_input_tokens_seen": 102845744, + "router_z_loss_mlp": 0.34936523, + "step": 1241, + "time_per_iteration": 2.6277406215667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110411, + "balance_loss_mlp": 1.06768012, + "epoch": 0.23893805309734514, + "flos": 556657325568.0, + "grad_norm": 0.08027126565183675, + "language_loss": 0.84991688, + "learning_rate": 0.0008898311744004936, + "loss": 0.86095798, + "num_input_tokens_seen": 102918064, + "router_z_loss_mlp": 0.36425781, + "step": 1242, + "time_per_iteration": 2.687009811401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112655, + "balance_loss_mlp": 1.07708287, + "epoch": 0.2391304347826087, + "flos": 549009957888.0, + "grad_norm": 0.05686617926086787, + "language_loss": 0.86918116, + "learning_rate": 0.0008896360109004414, + "loss": 0.88030773, + "num_input_tokens_seen": 102983920, + "router_z_loss_mlp": 0.35595703, + "step": 1243, + "time_per_iteration": 2.6292564868927 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111871, + "balance_loss_mlp": 1.07629931, + "epoch": 0.23932281646787226, + "flos": 515794148352.0, + "grad_norm": 0.05075175877282041, + "language_loss": 0.84481502, + "learning_rate": 0.0008894406961302742, + "loss": 0.85593379, + "num_input_tokens_seen": 103053328, + "router_z_loss_mlp": 0.35595703, + "step": 1244, + "time_per_iteration": 2.5960640907287598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122594, + "balance_loss_mlp": 1.08737969, + "epoch": 0.23951519815313582, + "flos": 743353445376.0, + "grad_norm": 0.06488001286924965, + "language_loss": 0.84053004, + "learning_rate": 0.0008892452301658201, + "loss": 0.85175598, + "num_input_tokens_seen": 103128208, + "router_z_loss_mlp": 0.35253906, + "step": 1245, + "time_per_iteration": 2.9320998191833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123528, + "balance_loss_mlp": 1.08728814, + "epoch": 0.23970757983839938, + "flos": 553855048704.0, + "grad_norm": 0.05553543969160018, + "language_loss": 0.83631629, + "learning_rate": 0.0008890496130829653, + "loss": 0.84755158, + "num_input_tokens_seen": 103197392, + "router_z_loss_mlp": 0.36230469, + "step": 1246, + "time_per_iteration": 2.6420071125030518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123934, + "balance_loss_mlp": 1.08802795, + "epoch": 0.23989996152366294, + "flos": 480416251392.0, + "grad_norm": 0.0595921721906752, + "language_loss": 0.85551775, + "learning_rate": 0.0008888538449576555, + "loss": 0.86675715, + "num_input_tokens_seen": 103265328, + "router_z_loss_mlp": 0.359375, + "step": 1247, + "time_per_iteration": 2.544706344604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123543, + "balance_loss_mlp": 1.08687472, + "epoch": 0.2400923432089265, + "flos": 485069285376.0, + "grad_norm": 0.06973867138143126, + "language_loss": 0.82958472, + "learning_rate": 0.0008886579258658944, + "loss": 0.84082007, + "num_input_tokens_seen": 103331632, + "router_z_loss_mlp": 0.36669922, + "step": 1248, + "time_per_iteration": 2.5460424423217773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108744, + "balance_loss_mlp": 1.0724808, + "epoch": 0.24028472489419006, + "flos": 623247505920.0, + "grad_norm": 0.04817062293972818, + "language_loss": 0.85353303, + "learning_rate": 0.0008884618558837446, + "loss": 0.86462045, + "num_input_tokens_seen": 103405408, + "router_z_loss_mlp": 0.36279297, + "step": 1249, + "time_per_iteration": 2.80222487449646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125107, + "balance_loss_mlp": 1.08765173, + "epoch": 0.24047710657945365, + "flos": 601302002688.0, + "grad_norm": 0.052194699096834656, + "language_loss": 0.86387813, + "learning_rate": 0.0008882656350873273, + "loss": 0.87512922, + "num_input_tokens_seen": 103487216, + "router_z_loss_mlp": 0.37426758, + "step": 1250, + "time_per_iteration": 2.830887794494629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136265, + "balance_loss_mlp": 1.09911942, + "epoch": 0.2406694882647172, + "flos": 841199579136.0, + "grad_norm": 0.07156482775024936, + "language_loss": 0.86951184, + "learning_rate": 0.0008880692635528219, + "loss": 0.88087451, + "num_input_tokens_seen": 103568640, + "router_z_loss_mlp": 0.37109375, + "step": 1251, + "time_per_iteration": 3.0439021587371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140454, + "balance_loss_mlp": 1.10450029, + "epoch": 0.24086186994998077, + "flos": 526789440000.0, + "grad_norm": 0.062254670736574515, + "language_loss": 0.89187038, + "learning_rate": 0.0008878727413564669, + "loss": 0.90327489, + "num_input_tokens_seen": 103640784, + "router_z_loss_mlp": 0.36010742, + "step": 1252, + "time_per_iteration": 2.7363240718841553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094248, + "balance_loss_mlp": 1.07717752, + "epoch": 0.24105425163524433, + "flos": 1337464673280.0, + "grad_norm": 0.032126183170312766, + "language_loss": 0.80135596, + "learning_rate": 0.0008876760685745588, + "loss": 0.81229842, + "num_input_tokens_seen": 103865824, + "router_z_loss_mlp": 0.17089844, + "step": 1253, + "time_per_iteration": 4.8370680809021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175937, + "balance_loss_mlp": 1.13755202, + "epoch": 0.24124663332050789, + "flos": 613822164480.0, + "grad_norm": 0.06436318886622608, + "language_loss": 0.78452635, + "learning_rate": 0.0008874792452834528, + "loss": 0.79628575, + "num_input_tokens_seen": 103939872, + "router_z_loss_mlp": 0.38354492, + "step": 1254, + "time_per_iteration": 2.724947452545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151033, + "balance_loss_mlp": 1.11381602, + "epoch": 0.24143901500577145, + "flos": 575278225920.0, + "grad_norm": 0.08996846201845816, + "language_loss": 0.87516546, + "learning_rate": 0.0008872822715595626, + "loss": 0.88667583, + "num_input_tokens_seen": 104011120, + "router_z_loss_mlp": 0.37207031, + "step": 1255, + "time_per_iteration": 2.676539659500122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118396, + "balance_loss_mlp": 1.08275259, + "epoch": 0.241631396691035, + "flos": 494941349376.0, + "grad_norm": 0.06475486920780314, + "language_loss": 0.87080252, + "learning_rate": 0.0008870851474793598, + "loss": 0.88198644, + "num_input_tokens_seen": 104077040, + "router_z_loss_mlp": 0.35668945, + "step": 1256, + "time_per_iteration": 2.5523862838745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108714, + "balance_loss_mlp": 1.07287991, + "epoch": 0.24182377837629856, + "flos": 635891323392.0, + "grad_norm": 0.0627898868455093, + "language_loss": 0.89724898, + "learning_rate": 0.0008868878731193752, + "loss": 0.90833616, + "num_input_tokens_seen": 104150880, + "router_z_loss_mlp": 0.35888672, + "step": 1257, + "time_per_iteration": 2.8152451515197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094242, + "balance_loss_mlp": 1.05933762, + "epoch": 0.24201616006156215, + "flos": 514938580992.0, + "grad_norm": 0.06450256361572139, + "language_loss": 0.89708877, + "learning_rate": 0.0008866904485561973, + "loss": 0.90803117, + "num_input_tokens_seen": 104223696, + "router_z_loss_mlp": 0.34936523, + "step": 1258, + "time_per_iteration": 2.7304298877716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095529, + "balance_loss_mlp": 1.05945659, + "epoch": 0.2422085417468257, + "flos": 614837703168.0, + "grad_norm": 0.05809143078881904, + "language_loss": 0.83024096, + "learning_rate": 0.000886492873866473, + "loss": 0.8411963, + "num_input_tokens_seen": 104301728, + "router_z_loss_mlp": 0.36108398, + "step": 1259, + "time_per_iteration": 2.828904628753662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090865, + "balance_loss_mlp": 1.05576968, + "epoch": 0.24240092343208927, + "flos": 585515464704.0, + "grad_norm": 0.07568124142212555, + "language_loss": 0.84760439, + "learning_rate": 0.000886295149126908, + "loss": 0.85851306, + "num_input_tokens_seen": 104374480, + "router_z_loss_mlp": 0.35131836, + "step": 1260, + "time_per_iteration": 2.7011313438415527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109068, + "balance_loss_mlp": 1.05537009, + "epoch": 0.24259330511735283, + "flos": 761930675712.0, + "grad_norm": 0.05459059834864095, + "language_loss": 0.85652769, + "learning_rate": 0.0008860972744142655, + "loss": 0.8674345, + "num_input_tokens_seen": 104452384, + "router_z_loss_mlp": 0.35327148, + "step": 1261, + "time_per_iteration": 2.9039082527160645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084953, + "balance_loss_mlp": 1.05028725, + "epoch": 0.2427856868026164, + "flos": 626566316544.0, + "grad_norm": 0.06267274795834049, + "language_loss": 0.8183161, + "learning_rate": 0.0008858992498053671, + "loss": 0.82916564, + "num_input_tokens_seen": 104532576, + "router_z_loss_mlp": 0.34692383, + "step": 1262, + "time_per_iteration": 2.8293697834014893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080455, + "balance_loss_mlp": 1.06586385, + "epoch": 0.24297806848787995, + "flos": 1510840470528.0, + "grad_norm": 0.02756761643082338, + "language_loss": 0.7658875, + "learning_rate": 0.0008857010753770934, + "loss": 0.77669203, + "num_input_tokens_seen": 104765216, + "router_z_loss_mlp": 0.14550781, + "step": 1263, + "time_per_iteration": 4.8116748332977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087756, + "balance_loss_mlp": 1.05328119, + "epoch": 0.2431704501731435, + "flos": 541669538304.0, + "grad_norm": 0.05501719814044903, + "language_loss": 0.83684969, + "learning_rate": 0.0008855027512063817, + "loss": 0.8477273, + "num_input_tokens_seen": 104836912, + "router_z_loss_mlp": 0.3449707, + "step": 1264, + "time_per_iteration": 2.6995394229888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084053, + "balance_loss_mlp": 1.0493865, + "epoch": 0.24336283185840707, + "flos": 523588492800.0, + "grad_norm": 0.06804757776515359, + "language_loss": 0.85974693, + "learning_rate": 0.0008853042773702292, + "loss": 0.87058747, + "num_input_tokens_seen": 104909280, + "router_z_loss_mlp": 0.34692383, + "step": 1265, + "time_per_iteration": 2.683969497680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086337, + "balance_loss_mlp": 1.05002618, + "epoch": 0.24355521354367063, + "flos": 536839004160.0, + "grad_norm": 0.05444938358074035, + "language_loss": 0.87678754, + "learning_rate": 0.0008851056539456896, + "loss": 0.88765097, + "num_input_tokens_seen": 104982560, + "router_z_loss_mlp": 0.36303711, + "step": 1266, + "time_per_iteration": 2.674891471862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010835, + "balance_loss_mlp": 1.04830909, + "epoch": 0.24374759522893422, + "flos": 930050975232.0, + "grad_norm": 0.04940136823280911, + "language_loss": 0.82195789, + "learning_rate": 0.0008849068810098755, + "loss": 0.83279288, + "num_input_tokens_seen": 105075056, + "router_z_loss_mlp": 0.35229492, + "step": 1267, + "time_per_iteration": 3.27172589302063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083691, + "balance_loss_mlp": 1.04859626, + "epoch": 0.24393997691419778, + "flos": 427564002816.0, + "grad_norm": 0.07591960175092535, + "language_loss": 0.83287823, + "learning_rate": 0.0008847079586399575, + "loss": 0.84371519, + "num_input_tokens_seen": 105137536, + "router_z_loss_mlp": 0.35131836, + "step": 1268, + "time_per_iteration": 2.4539763927459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010876, + "balance_loss_mlp": 1.05281472, + "epoch": 0.24413235859946134, + "flos": 578582479872.0, + "grad_norm": 0.059755639557228325, + "language_loss": 0.86095846, + "learning_rate": 0.0008845088869131641, + "loss": 0.87183452, + "num_input_tokens_seen": 105204848, + "router_z_loss_mlp": 0.34790039, + "step": 1269, + "time_per_iteration": 2.651010274887085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094661, + "balance_loss_mlp": 1.058851, + "epoch": 0.2443247402847249, + "flos": 529600481280.0, + "grad_norm": 0.07776240560166553, + "language_loss": 0.89366186, + "learning_rate": 0.0008843096659067818, + "loss": 0.90460849, + "num_input_tokens_seen": 105273456, + "router_z_loss_mlp": 0.35839844, + "step": 1270, + "time_per_iteration": 2.61082124710083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108773, + "balance_loss_mlp": 1.05292046, + "epoch": 0.24451712196998845, + "flos": 695996651520.0, + "grad_norm": 0.05083497592617014, + "language_loss": 0.86395383, + "learning_rate": 0.000884110295698155, + "loss": 0.87483108, + "num_input_tokens_seen": 105355488, + "router_z_loss_mlp": 0.34863281, + "step": 1271, + "time_per_iteration": 2.930372476577759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085133, + "balance_loss_mlp": 1.05089653, + "epoch": 0.24470950365525201, + "flos": 529575750144.0, + "grad_norm": 0.05520811698213447, + "language_loss": 0.86009014, + "learning_rate": 0.0008839107763646861, + "loss": 0.87094152, + "num_input_tokens_seen": 105421568, + "router_z_loss_mlp": 0.34277344, + "step": 1272, + "time_per_iteration": 2.576322078704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088964, + "balance_loss_mlp": 1.05324888, + "epoch": 0.24490188534051557, + "flos": 491091448320.0, + "grad_norm": 0.0616556586287024, + "language_loss": 0.9024111, + "learning_rate": 0.0008837111079838353, + "loss": 0.91330075, + "num_input_tokens_seen": 105493072, + "router_z_loss_mlp": 0.35742188, + "step": 1273, + "time_per_iteration": 2.6859118938446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109117, + "balance_loss_mlp": 1.05631351, + "epoch": 0.24509426702577913, + "flos": 473916842496.0, + "grad_norm": 0.05704478566457949, + "language_loss": 0.89869869, + "learning_rate": 0.000883511290633121, + "loss": 0.90961039, + "num_input_tokens_seen": 105559840, + "router_z_loss_mlp": 0.34887695, + "step": 1274, + "time_per_iteration": 2.5262861251831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096408, + "balance_loss_mlp": 1.06152773, + "epoch": 0.24528664871104272, + "flos": 550329624576.0, + "grad_norm": 0.04914382449005864, + "language_loss": 0.92288065, + "learning_rate": 0.000883311324390119, + "loss": 0.93384475, + "num_input_tokens_seen": 105634448, + "router_z_loss_mlp": 0.34887695, + "step": 1275, + "time_per_iteration": 2.6791441440582275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100279, + "balance_loss_mlp": 1.0631578, + "epoch": 0.24547903039630628, + "flos": 825546871296.0, + "grad_norm": 0.0705624444694786, + "language_loss": 0.81542301, + "learning_rate": 0.0008831112093324629, + "loss": 0.82642579, + "num_input_tokens_seen": 105711936, + "router_z_loss_mlp": 0.37060547, + "step": 1276, + "time_per_iteration": 3.0612823963165283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100002, + "balance_loss_mlp": 1.06419206, + "epoch": 0.24567141208156984, + "flos": 591325221888.0, + "grad_norm": 0.0822852621967946, + "language_loss": 0.89184481, + "learning_rate": 0.0008829109455378444, + "loss": 0.90284485, + "num_input_tokens_seen": 105780240, + "router_z_loss_mlp": 0.35839844, + "step": 1277, + "time_per_iteration": 2.6601858139038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091196, + "balance_loss_mlp": 1.05567181, + "epoch": 0.2458637937668334, + "flos": 547611715584.0, + "grad_norm": 0.05101212903881184, + "language_loss": 0.86474031, + "learning_rate": 0.000882710533084013, + "loss": 0.87565225, + "num_input_tokens_seen": 105849840, + "router_z_loss_mlp": 0.35546875, + "step": 1278, + "time_per_iteration": 2.6333553791046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087707, + "balance_loss_mlp": 1.05158627, + "epoch": 0.24605617545209696, + "flos": 515641379328.0, + "grad_norm": 0.04855931692812416, + "language_loss": 0.89387107, + "learning_rate": 0.0008825099720487755, + "loss": 0.9047482, + "num_input_tokens_seen": 105921488, + "router_z_loss_mlp": 0.36108398, + "step": 1279, + "time_per_iteration": 2.6388816833496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071735, + "balance_loss_mlp": 1.05943298, + "epoch": 0.24624855713736052, + "flos": 1510953951744.0, + "grad_norm": 0.03612446278815301, + "language_loss": 0.7526114, + "learning_rate": 0.0008823092625099967, + "loss": 0.76332873, + "num_input_tokens_seen": 106146816, + "router_z_loss_mlp": 0.12304688, + "step": 1280, + "time_per_iteration": 4.837193727493286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044143, + "balance_loss_mlp": 1.03145874, + "epoch": 0.24644093882262408, + "flos": 1526826419712.0, + "grad_norm": 0.020354868078157083, + "language_loss": 0.77944112, + "learning_rate": 0.0008821084045455987, + "loss": 0.78988254, + "num_input_tokens_seen": 106361568, + "router_z_loss_mlp": 0.12695312, + "step": 1281, + "time_per_iteration": 4.7473485469818115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078971, + "balance_loss_mlp": 1.04418564, + "epoch": 0.24663332050788764, + "flos": 658811667456.0, + "grad_norm": 0.060866999123497585, + "language_loss": 0.89327228, + "learning_rate": 0.0008819073982335619, + "loss": 0.90406203, + "num_input_tokens_seen": 106435296, + "router_z_loss_mlp": 0.34838867, + "step": 1282, + "time_per_iteration": 2.839691162109375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107951, + "balance_loss_mlp": 1.0446527, + "epoch": 0.24682570219315123, + "flos": 541510977024.0, + "grad_norm": 0.05752783194209404, + "language_loss": 0.84339237, + "learning_rate": 0.0008817062436519235, + "loss": 0.85418749, + "num_input_tokens_seen": 106507184, + "router_z_loss_mlp": 0.34887695, + "step": 1283, + "time_per_iteration": 2.6106019020080566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080546, + "balance_loss_mlp": 1.04459274, + "epoch": 0.24701808387841478, + "flos": 440455131648.0, + "grad_norm": 0.05999718389674832, + "language_loss": 0.89926815, + "learning_rate": 0.0008815049408787788, + "loss": 0.91007358, + "num_input_tokens_seen": 106571472, + "router_z_loss_mlp": 0.36010742, + "step": 1284, + "time_per_iteration": 2.5186686515808105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079282, + "balance_loss_mlp": 1.04518795, + "epoch": 0.24721046556367834, + "flos": 467826278400.0, + "grad_norm": 0.054777388157378364, + "language_loss": 0.8565737, + "learning_rate": 0.0008813034899922805, + "loss": 0.86736655, + "num_input_tokens_seen": 106638368, + "router_z_loss_mlp": 0.34106445, + "step": 1285, + "time_per_iteration": 2.5217878818511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082636, + "balance_loss_mlp": 1.04730225, + "epoch": 0.2474028472489419, + "flos": 504183398400.0, + "grad_norm": 0.06351521025868076, + "language_loss": 0.90182853, + "learning_rate": 0.0008811018910706387, + "loss": 0.91265488, + "num_input_tokens_seen": 106705312, + "router_z_loss_mlp": 0.35375977, + "step": 1286, + "time_per_iteration": 2.549523115158081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010823, + "balance_loss_mlp": 1.04787278, + "epoch": 0.24759522893420546, + "flos": 479707660800.0, + "grad_norm": 0.06857789842871208, + "language_loss": 0.81978023, + "learning_rate": 0.0008809001441921211, + "loss": 0.83060318, + "num_input_tokens_seen": 106778624, + "router_z_loss_mlp": 0.34448242, + "step": 1287, + "time_per_iteration": 2.7147598266601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083728, + "balance_loss_mlp": 1.04984844, + "epoch": 0.24778761061946902, + "flos": 533446000128.0, + "grad_norm": 0.05733880184845353, + "language_loss": 0.85523212, + "learning_rate": 0.0008806982494350528, + "loss": 0.86606944, + "num_input_tokens_seen": 106847744, + "router_z_loss_mlp": 0.33911133, + "step": 1288, + "time_per_iteration": 2.6304967403411865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086884, + "balance_loss_mlp": 1.05197978, + "epoch": 0.24797999230473258, + "flos": 559513446912.0, + "grad_norm": 0.04849910181782432, + "language_loss": 0.90370154, + "learning_rate": 0.0008804962068778161, + "loss": 0.91457039, + "num_input_tokens_seen": 106927584, + "router_z_loss_mlp": 0.34936523, + "step": 1289, + "time_per_iteration": 2.8194985389709473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087315, + "balance_loss_mlp": 1.05291104, + "epoch": 0.24817237398999614, + "flos": 623912426496.0, + "grad_norm": 0.05410640942937228, + "language_loss": 0.80728722, + "learning_rate": 0.0008802940165988511, + "loss": 0.81816041, + "num_input_tokens_seen": 107006656, + "router_z_loss_mlp": 0.34423828, + "step": 1290, + "time_per_iteration": 2.8703298568725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096846, + "balance_loss_mlp": 1.06225193, + "epoch": 0.2483647556752597, + "flos": 611981581824.0, + "grad_norm": 0.06277561181530684, + "language_loss": 0.88376027, + "learning_rate": 0.000880091678676655, + "loss": 0.89472872, + "num_input_tokens_seen": 107084352, + "router_z_loss_mlp": 0.34619141, + "step": 1291, + "time_per_iteration": 2.7943451404571533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088363, + "balance_loss_mlp": 1.05496097, + "epoch": 0.2485571373605233, + "flos": 583270419456.0, + "grad_norm": 0.061640996967182685, + "language_loss": 0.89207399, + "learning_rate": 0.0008798891931897821, + "loss": 0.90295762, + "num_input_tokens_seen": 107158368, + "router_z_loss_mlp": 0.33422852, + "step": 1292, + "time_per_iteration": 2.7013609409332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088304, + "balance_loss_mlp": 1.05463946, + "epoch": 0.24874951904578685, + "flos": 494503391232.0, + "grad_norm": 0.0568342609101268, + "language_loss": 0.84605837, + "learning_rate": 0.0008796865602168447, + "loss": 0.8569414, + "num_input_tokens_seen": 107224256, + "router_z_loss_mlp": 0.33691406, + "step": 1293, + "time_per_iteration": 2.517571210861206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108751, + "balance_loss_mlp": 1.05448937, + "epoch": 0.2489419007310504, + "flos": 455925957120.0, + "grad_norm": 0.05011975537228715, + "language_loss": 0.88745099, + "learning_rate": 0.0008794837798365115, + "loss": 0.8983261, + "num_input_tokens_seen": 107292720, + "router_z_loss_mlp": 0.33032227, + "step": 1294, + "time_per_iteration": 2.6243135929107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093031, + "balance_loss_mlp": 1.05958056, + "epoch": 0.24913428241631397, + "flos": 485198733312.0, + "grad_norm": 0.05031013210073455, + "language_loss": 0.88537574, + "learning_rate": 0.0008792808521275089, + "loss": 0.89630604, + "num_input_tokens_seen": 107368576, + "router_z_loss_mlp": 0.3347168, + "step": 1295, + "time_per_iteration": 2.743821144104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090644, + "balance_loss_mlp": 1.05664551, + "epoch": 0.24932666410157753, + "flos": 518654651904.0, + "grad_norm": 0.0628198177294759, + "language_loss": 0.87554896, + "learning_rate": 0.0008790777771686206, + "loss": 0.8864553, + "num_input_tokens_seen": 107433856, + "router_z_loss_mlp": 0.34033203, + "step": 1296, + "time_per_iteration": 2.55996036529541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091195, + "balance_loss_mlp": 1.05819798, + "epoch": 0.2495190457868411, + "flos": 472365831168.0, + "grad_norm": 0.05367084005526609, + "language_loss": 0.85479438, + "learning_rate": 0.0008788745550386872, + "loss": 0.86570632, + "num_input_tokens_seen": 107500944, + "router_z_loss_mlp": 0.33007812, + "step": 1297, + "time_per_iteration": 2.555238723754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099543, + "balance_loss_mlp": 1.06494844, + "epoch": 0.24971142747210465, + "flos": 745559202816.0, + "grad_norm": 0.05557204977607519, + "language_loss": 0.80045742, + "learning_rate": 0.0008786711858166063, + "loss": 0.81145287, + "num_input_tokens_seen": 107580000, + "router_z_loss_mlp": 0.34643555, + "step": 1298, + "time_per_iteration": 2.940908670425415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090601, + "balance_loss_mlp": 1.05757999, + "epoch": 0.2499038091573682, + "flos": 749222839296.0, + "grad_norm": 0.08262860681241094, + "language_loss": 0.83490336, + "learning_rate": 0.0008784676695813332, + "loss": 0.84580934, + "num_input_tokens_seen": 107660384, + "router_z_loss_mlp": 0.33032227, + "step": 1299, + "time_per_iteration": 2.966646432876587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092772, + "balance_loss_mlp": 1.05870187, + "epoch": 0.2500961908426318, + "flos": 744741513216.0, + "grad_norm": 0.04756275395178792, + "language_loss": 0.84761405, + "learning_rate": 0.0008782640064118796, + "loss": 0.85854173, + "num_input_tokens_seen": 107736320, + "router_z_loss_mlp": 0.34082031, + "step": 1300, + "time_per_iteration": 2.8889827728271484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078115, + "balance_loss_mlp": 1.06447709, + "epoch": 0.2502885725278953, + "flos": 1416652180992.0, + "grad_norm": 0.036683670441934005, + "language_loss": 0.7618475, + "learning_rate": 0.0008780601963873149, + "loss": 0.77262866, + "num_input_tokens_seen": 107972608, + "router_z_loss_mlp": 0.13671875, + "step": 1301, + "time_per_iteration": 4.988169431686401 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094633, + "balance_loss_mlp": 1.06196928, + "epoch": 0.2504809542131589, + "flos": 514961902080.0, + "grad_norm": 0.05923567857946263, + "language_loss": 0.86476314, + "learning_rate": 0.0008778562395867648, + "loss": 0.87570941, + "num_input_tokens_seen": 108043312, + "router_z_loss_mlp": 0.32666016, + "step": 1302, + "time_per_iteration": 2.5900919437408447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087269, + "balance_loss_mlp": 1.05436766, + "epoch": 0.25067333589842244, + "flos": 525562905600.0, + "grad_norm": 0.06049595368492962, + "language_loss": 0.83774143, + "learning_rate": 0.0008776521360894127, + "loss": 0.8486141, + "num_input_tokens_seen": 108114144, + "router_z_loss_mlp": 0.32910156, + "step": 1303, + "time_per_iteration": 2.6029298305511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035824, + "balance_loss_mlp": 1.02275884, + "epoch": 0.25086571758368603, + "flos": 1473085108224.0, + "grad_norm": 0.024331867442101186, + "language_loss": 0.78962064, + "learning_rate": 0.0008774478859744984, + "loss": 0.79997885, + "num_input_tokens_seen": 108338720, + "router_z_loss_mlp": 0.13085938, + "step": 1304, + "time_per_iteration": 4.800757646560669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096353, + "balance_loss_mlp": 1.063833, + "epoch": 0.2510580992689496, + "flos": 528128045568.0, + "grad_norm": 0.053799887970574674, + "language_loss": 0.90341735, + "learning_rate": 0.0008772434893213186, + "loss": 0.91438091, + "num_input_tokens_seen": 108405456, + "router_z_loss_mlp": 0.32519531, + "step": 1305, + "time_per_iteration": 2.5816421508789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104465, + "balance_loss_mlp": 1.07123005, + "epoch": 0.25125048095421315, + "flos": 517192390656.0, + "grad_norm": 0.058690449713219205, + "language_loss": 0.84433925, + "learning_rate": 0.0008770389462092276, + "loss": 0.85538393, + "num_input_tokens_seen": 108474368, + "router_z_loss_mlp": 0.33251953, + "step": 1306, + "time_per_iteration": 2.6747090816497803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106931, + "balance_loss_mlp": 1.07309926, + "epoch": 0.25144286263947674, + "flos": 620160039936.0, + "grad_norm": 0.16660488736040688, + "language_loss": 0.86719346, + "learning_rate": 0.0008768342567176357, + "loss": 0.87826276, + "num_input_tokens_seen": 108548864, + "router_z_loss_mlp": 0.33862305, + "step": 1307, + "time_per_iteration": 2.7788002490997314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098824, + "balance_loss_mlp": 1.06425333, + "epoch": 0.25163524432474027, + "flos": 503534444544.0, + "grad_norm": 0.04824933548887647, + "language_loss": 0.90589297, + "learning_rate": 0.0008766294209260107, + "loss": 0.91688126, + "num_input_tokens_seen": 108623072, + "router_z_loss_mlp": 0.34619141, + "step": 1308, + "time_per_iteration": 2.6300241947174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010929, + "balance_loss_mlp": 1.05852032, + "epoch": 0.25182762601000386, + "flos": 508821875712.0, + "grad_norm": 0.0633327884934456, + "language_loss": 0.91549027, + "learning_rate": 0.0008764244389138767, + "loss": 0.92641926, + "num_input_tokens_seen": 108690128, + "router_z_loss_mlp": 0.34399414, + "step": 1309, + "time_per_iteration": 2.574056625366211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088516, + "balance_loss_mlp": 1.05306351, + "epoch": 0.2520200076952674, + "flos": 633596815872.0, + "grad_norm": 0.05898934519456769, + "language_loss": 0.8269434, + "learning_rate": 0.000876219310760815, + "loss": 0.83782852, + "num_input_tokens_seen": 108770272, + "router_z_loss_mlp": 0.35449219, + "step": 1310, + "time_per_iteration": 2.87404465675354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010989, + "balance_loss_mlp": 1.06299448, + "epoch": 0.252212389380531, + "flos": 494385527808.0, + "grad_norm": 0.05968729718727878, + "language_loss": 0.8144334, + "learning_rate": 0.0008760140365464631, + "loss": 0.82542241, + "num_input_tokens_seen": 108840592, + "router_z_loss_mlp": 0.35913086, + "step": 1311, + "time_per_iteration": 2.599480390548706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105508, + "balance_loss_mlp": 1.07062793, + "epoch": 0.2524047710657945, + "flos": 490298489856.0, + "grad_norm": 0.06557576312810307, + "language_loss": 0.87226975, + "learning_rate": 0.0008758086163505156, + "loss": 0.88332486, + "num_input_tokens_seen": 108910064, + "router_z_loss_mlp": 0.34912109, + "step": 1312, + "time_per_iteration": 2.6165666580200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112941, + "balance_loss_mlp": 1.07698762, + "epoch": 0.2525971527510581, + "flos": 647136898560.0, + "grad_norm": 0.06425852435188892, + "language_loss": 0.89039612, + "learning_rate": 0.0008756030502527239, + "loss": 0.90152562, + "num_input_tokens_seen": 108986336, + "router_z_loss_mlp": 0.36010742, + "step": 1313, + "time_per_iteration": 2.794595956802368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112418, + "balance_loss_mlp": 1.07636952, + "epoch": 0.2527895344363217, + "flos": 568991222784.0, + "grad_norm": 0.05792474282671988, + "language_loss": 0.90396988, + "learning_rate": 0.0008753973383328954, + "loss": 0.91509414, + "num_input_tokens_seen": 109059712, + "router_z_loss_mlp": 0.36108398, + "step": 1314, + "time_per_iteration": 2.66343355178833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110344, + "balance_loss_mlp": 1.07491553, + "epoch": 0.2529819161215852, + "flos": 513795004416.0, + "grad_norm": 0.10488361484557306, + "language_loss": 0.84231019, + "learning_rate": 0.0008751914806708952, + "loss": 0.8534137, + "num_input_tokens_seen": 109127504, + "router_z_loss_mlp": 0.35449219, + "step": 1315, + "time_per_iteration": 2.5714006423950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099121, + "balance_loss_mlp": 1.06357241, + "epoch": 0.2531742978068488, + "flos": 530979784704.0, + "grad_norm": 0.0646255116041034, + "language_loss": 0.81763697, + "learning_rate": 0.0008749854773466439, + "loss": 0.82862812, + "num_input_tokens_seen": 109198080, + "router_z_loss_mlp": 0.35571289, + "step": 1316, + "time_per_iteration": 2.6507568359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093995, + "balance_loss_mlp": 1.05892396, + "epoch": 0.25336667949211233, + "flos": 596362369536.0, + "grad_norm": 0.11519177634747009, + "language_loss": 0.84297431, + "learning_rate": 0.0008747793284401192, + "loss": 0.8539142, + "num_input_tokens_seen": 109268368, + "router_z_loss_mlp": 0.35107422, + "step": 1317, + "time_per_iteration": 2.6708261966705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109538, + "balance_loss_mlp": 1.05966473, + "epoch": 0.2535590611773759, + "flos": 601764691968.0, + "grad_norm": 0.05376009268762157, + "language_loss": 0.86145389, + "learning_rate": 0.0008745730340313551, + "loss": 0.87240773, + "num_input_tokens_seen": 109344112, + "router_z_loss_mlp": 0.35742188, + "step": 1318, + "time_per_iteration": 2.7465810775756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100043, + "balance_loss_mlp": 1.06504369, + "epoch": 0.25375144286263945, + "flos": 495079561728.0, + "grad_norm": 0.053440140598651036, + "language_loss": 0.8468703, + "learning_rate": 0.0008743665942004422, + "loss": 0.85787076, + "num_input_tokens_seen": 109414112, + "router_z_loss_mlp": 0.35009766, + "step": 1319, + "time_per_iteration": 2.632645606994629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109509, + "balance_loss_mlp": 1.05908918, + "epoch": 0.25394382454790304, + "flos": 512219261952.0, + "grad_norm": 0.050076364746318706, + "language_loss": 0.92529714, + "learning_rate": 0.0008741600090275277, + "loss": 0.93624806, + "num_input_tokens_seen": 109484336, + "router_z_loss_mlp": 0.35986328, + "step": 1320, + "time_per_iteration": 2.564730405807495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097426, + "balance_loss_mlp": 1.06092453, + "epoch": 0.25413620623316663, + "flos": 958586047488.0, + "grad_norm": 0.058049172943507095, + "language_loss": 0.83939385, + "learning_rate": 0.0008739532785928151, + "loss": 0.85036814, + "num_input_tokens_seen": 109590128, + "router_z_loss_mlp": 0.36474609, + "step": 1321, + "time_per_iteration": 3.4496617317199707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056798, + "balance_loss_mlp": 1.04439986, + "epoch": 0.25432858791843016, + "flos": 1576445635584.0, + "grad_norm": 0.03297734471592195, + "language_loss": 0.74893582, + "learning_rate": 0.0008737464029765639, + "loss": 0.75950378, + "num_input_tokens_seen": 109816592, + "router_z_loss_mlp": 0.12353516, + "step": 1322, + "time_per_iteration": 4.803644418716431 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102711, + "balance_loss_mlp": 1.06706691, + "epoch": 0.25452096960369375, + "flos": 583530877440.0, + "grad_norm": 0.056711392027496164, + "language_loss": 0.83213425, + "learning_rate": 0.0008735393822590908, + "loss": 0.84316134, + "num_input_tokens_seen": 109890464, + "router_z_loss_mlp": 0.35668945, + "step": 1323, + "time_per_iteration": 2.6643528938293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099364, + "balance_loss_mlp": 1.06434083, + "epoch": 0.2547133512889573, + "flos": 508344629760.0, + "grad_norm": 0.06006943476027706, + "language_loss": 0.87018919, + "learning_rate": 0.0008733322165207681, + "loss": 0.88118285, + "num_input_tokens_seen": 109963408, + "router_z_loss_mlp": 0.35083008, + "step": 1324, + "time_per_iteration": 2.627495765686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110409, + "balance_loss_mlp": 1.06863689, + "epoch": 0.25490573297422087, + "flos": 782266940928.0, + "grad_norm": 0.05604709920606865, + "language_loss": 0.83055937, + "learning_rate": 0.0008731249058420247, + "loss": 0.8416003, + "num_input_tokens_seen": 110048800, + "router_z_loss_mlp": 0.35498047, + "step": 1325, + "time_per_iteration": 3.0361831188201904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100095, + "balance_loss_mlp": 1.06468964, + "epoch": 0.2550981146594844, + "flos": 509610451968.0, + "grad_norm": 0.06314633870869373, + "language_loss": 0.90780556, + "learning_rate": 0.0008729174503033459, + "loss": 0.91880649, + "num_input_tokens_seen": 110118096, + "router_z_loss_mlp": 0.35424805, + "step": 1326, + "time_per_iteration": 2.639625072479248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109522, + "balance_loss_mlp": 1.06007695, + "epoch": 0.255290496344748, + "flos": 676360212480.0, + "grad_norm": 0.06489195741671011, + "language_loss": 0.82650065, + "learning_rate": 0.0008727098499852728, + "loss": 0.83745289, + "num_input_tokens_seen": 110190160, + "router_z_loss_mlp": 0.35180664, + "step": 1327, + "time_per_iteration": 2.830500602722168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109753, + "balance_loss_mlp": 1.06231546, + "epoch": 0.2554828780300115, + "flos": 537524273664.0, + "grad_norm": 0.06666455638552511, + "language_loss": 0.89945138, + "learning_rate": 0.0008725021049684034, + "loss": 0.91042662, + "num_input_tokens_seen": 110268000, + "router_z_loss_mlp": 0.35253906, + "step": 1328, + "time_per_iteration": 2.747800350189209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097741, + "balance_loss_mlp": 1.06240726, + "epoch": 0.2556752597152751, + "flos": 823828534272.0, + "grad_norm": 0.052131047599379726, + "language_loss": 0.82919741, + "learning_rate": 0.000872294215333391, + "loss": 0.84017479, + "num_input_tokens_seen": 110354816, + "router_z_loss_mlp": 0.35400391, + "step": 1329, + "time_per_iteration": 3.1658926010131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096694, + "balance_loss_mlp": 1.06176591, + "epoch": 0.2558676414005387, + "flos": 570517502976.0, + "grad_norm": 0.05425014800623288, + "language_loss": 0.82993001, + "learning_rate": 0.0008720861811609457, + "loss": 0.84089696, + "num_input_tokens_seen": 110427968, + "router_z_loss_mlp": 0.34985352, + "step": 1330, + "time_per_iteration": 2.709085702896118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101009, + "balance_loss_mlp": 1.06448317, + "epoch": 0.2560600230858022, + "flos": 486419475456.0, + "grad_norm": 0.05425594622111712, + "language_loss": 0.83756936, + "learning_rate": 0.0008718780025318338, + "loss": 0.84857947, + "num_input_tokens_seen": 110501184, + "router_z_loss_mlp": 0.36523438, + "step": 1331, + "time_per_iteration": 2.7126388549804688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097427, + "balance_loss_mlp": 1.06280875, + "epoch": 0.2562524047710658, + "flos": 512874008064.0, + "grad_norm": 0.06594145834934585, + "language_loss": 0.8406449, + "learning_rate": 0.0008716696795268771, + "loss": 0.85161918, + "num_input_tokens_seen": 110573008, + "router_z_loss_mlp": 0.34667969, + "step": 1332, + "time_per_iteration": 2.6650350093841553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089329, + "balance_loss_mlp": 1.05308938, + "epoch": 0.25644478645632934, + "flos": 634498873344.0, + "grad_norm": 0.051413439896644035, + "language_loss": 0.85076845, + "learning_rate": 0.0008714612122269538, + "loss": 0.86166173, + "num_input_tokens_seen": 110646704, + "router_z_loss_mlp": 0.36279297, + "step": 1333, + "time_per_iteration": 2.8611392974853516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109443, + "balance_loss_mlp": 1.05754697, + "epoch": 0.25663716814159293, + "flos": 436353537024.0, + "grad_norm": 0.0705935369031189, + "language_loss": 0.89120972, + "learning_rate": 0.0008712526007129982, + "loss": 0.90215403, + "num_input_tokens_seen": 110712208, + "router_z_loss_mlp": 0.36889648, + "step": 1334, + "time_per_iteration": 2.5217065811157227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109442, + "balance_loss_mlp": 1.05813241, + "epoch": 0.25682954982685646, + "flos": 497892013056.0, + "grad_norm": 0.06578019441075163, + "language_loss": 0.90784955, + "learning_rate": 0.0008710438450660003, + "loss": 0.91879368, + "num_input_tokens_seen": 110783936, + "router_z_loss_mlp": 0.36303711, + "step": 1335, + "time_per_iteration": 2.651367425918579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093579, + "balance_loss_mlp": 1.05643392, + "epoch": 0.25702193151212005, + "flos": 457471176192.0, + "grad_norm": 0.07087944464696884, + "language_loss": 0.8744905, + "learning_rate": 0.0008708349453670064, + "loss": 0.88542628, + "num_input_tokens_seen": 110848560, + "router_z_loss_mlp": 0.37133789, + "step": 1336, + "time_per_iteration": 2.51411771774292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090754, + "balance_loss_mlp": 1.0543952, + "epoch": 0.2572143131973836, + "flos": 598002130944.0, + "grad_norm": 0.06329524505646734, + "language_loss": 0.91480416, + "learning_rate": 0.0008706259016971185, + "loss": 0.92571175, + "num_input_tokens_seen": 110922672, + "router_z_loss_mlp": 0.36401367, + "step": 1337, + "time_per_iteration": 2.754173517227173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096487, + "balance_loss_mlp": 1.0589596, + "epoch": 0.25740669488264717, + "flos": 698004559872.0, + "grad_norm": 0.06697174190166053, + "language_loss": 0.83331275, + "learning_rate": 0.0008704167141374944, + "loss": 0.84427762, + "num_input_tokens_seen": 110995456, + "router_z_loss_mlp": 0.375, + "step": 1338, + "time_per_iteration": 2.795552968978882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011015, + "balance_loss_mlp": 1.06385398, + "epoch": 0.25759907656791076, + "flos": 502130409984.0, + "grad_norm": 0.06639008708045263, + "language_loss": 0.88657552, + "learning_rate": 0.0008702073827693482, + "loss": 0.89759052, + "num_input_tokens_seen": 111069568, + "router_z_loss_mlp": 0.3762207, + "step": 1339, + "time_per_iteration": 2.6935572624206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103743, + "balance_loss_mlp": 1.065763, + "epoch": 0.2577914582531743, + "flos": 773541425664.0, + "grad_norm": 0.06917089880544881, + "language_loss": 0.88938046, + "learning_rate": 0.0008699979076739494, + "loss": 0.90041792, + "num_input_tokens_seen": 111142608, + "router_z_loss_mlp": 0.37963867, + "step": 1340, + "time_per_iteration": 2.951148509979248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102218, + "balance_loss_mlp": 1.06552505, + "epoch": 0.2579838399384379, + "flos": 459431032320.0, + "grad_norm": 0.07085954822691051, + "language_loss": 0.88831556, + "learning_rate": 0.0008697882889326234, + "loss": 0.89933777, + "num_input_tokens_seen": 111206336, + "router_z_loss_mlp": 0.36669922, + "step": 1341, + "time_per_iteration": 2.492182731628418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105931, + "balance_loss_mlp": 1.06916654, + "epoch": 0.2581762216237014, + "flos": 568917029376.0, + "grad_norm": 0.060702491086151805, + "language_loss": 0.86630756, + "learning_rate": 0.0008695785266267515, + "loss": 0.8773669, + "num_input_tokens_seen": 111276736, + "router_z_loss_mlp": 0.36816406, + "step": 1342, + "time_per_iteration": 2.6635031700134277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111038, + "balance_loss_mlp": 1.07448828, + "epoch": 0.258368603308965, + "flos": 603906430464.0, + "grad_norm": 0.06467765584173796, + "language_loss": 0.83112109, + "learning_rate": 0.0008693686208377704, + "loss": 0.84223145, + "num_input_tokens_seen": 111353856, + "router_z_loss_mlp": 0.36547852, + "step": 1343, + "time_per_iteration": 2.7769596576690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105659, + "balance_loss_mlp": 1.06975329, + "epoch": 0.2585609849942285, + "flos": 491204929536.0, + "grad_norm": 0.06376456739082713, + "language_loss": 0.88889539, + "learning_rate": 0.0008691585716471733, + "loss": 0.89995199, + "num_input_tokens_seen": 111424960, + "router_z_loss_mlp": 0.35913086, + "step": 1344, + "time_per_iteration": 2.6467716693878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104044, + "balance_loss_mlp": 1.06809044, + "epoch": 0.2587533666794921, + "flos": 640455607296.0, + "grad_norm": 0.057733681270749564, + "language_loss": 0.85255873, + "learning_rate": 0.0008689483791365079, + "loss": 0.86359918, + "num_input_tokens_seen": 111505248, + "router_z_loss_mlp": 0.35961914, + "step": 1345, + "time_per_iteration": 2.8041999340057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099237, + "balance_loss_mlp": 1.06380773, + "epoch": 0.2589457483647557, + "flos": 576564397056.0, + "grad_norm": 0.05015471530609978, + "language_loss": 0.89365089, + "learning_rate": 0.0008687380433873786, + "loss": 0.9046433, + "num_input_tokens_seen": 111581936, + "router_z_loss_mlp": 0.35473633, + "step": 1346, + "time_per_iteration": 2.7955591678619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101447, + "balance_loss_mlp": 1.06630445, + "epoch": 0.25913813005001923, + "flos": 535164337152.0, + "grad_norm": 0.06074647569776127, + "language_loss": 0.82164252, + "learning_rate": 0.0008685275644814448, + "loss": 0.83265698, + "num_input_tokens_seen": 111651456, + "router_z_loss_mlp": 0.3515625, + "step": 1347, + "time_per_iteration": 2.6922154426574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100414, + "balance_loss_mlp": 1.06419861, + "epoch": 0.2593305117352828, + "flos": 720713908224.0, + "grad_norm": 0.05981927153656866, + "language_loss": 0.8445859, + "learning_rate": 0.0008683169425004216, + "loss": 0.85558999, + "num_input_tokens_seen": 111731712, + "router_z_loss_mlp": 0.36230469, + "step": 1348, + "time_per_iteration": 2.8701395988464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108751, + "balance_loss_mlp": 1.05186677, + "epoch": 0.25952289342054635, + "flos": 709782635520.0, + "grad_norm": 0.06994851779161643, + "language_loss": 0.83445579, + "learning_rate": 0.0008681061775260799, + "loss": 0.84533083, + "num_input_tokens_seen": 111800752, + "router_z_loss_mlp": 0.35644531, + "step": 1349, + "time_per_iteration": 2.8206968307495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096473, + "balance_loss_mlp": 1.06032848, + "epoch": 0.25971527510580994, + "flos": 455688820224.0, + "grad_norm": 0.06118298275127208, + "language_loss": 0.91987318, + "learning_rate": 0.0008678952696402458, + "loss": 0.93083793, + "num_input_tokens_seen": 111866752, + "router_z_loss_mlp": 0.36132812, + "step": 1350, + "time_per_iteration": 2.5547540187835693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108511, + "balance_loss_mlp": 1.04932308, + "epoch": 0.25990765679107347, + "flos": 612223100928.0, + "grad_norm": 0.04808004024566397, + "language_loss": 0.86496055, + "learning_rate": 0.000867684218924801, + "loss": 0.8758117, + "num_input_tokens_seen": 111951328, + "router_z_loss_mlp": 0.35791016, + "step": 1351, + "time_per_iteration": 2.8406949043273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110301, + "balance_loss_mlp": 1.09857082, + "epoch": 0.26010003847633706, + "flos": 1537105766400.0, + "grad_norm": 0.059206679514604114, + "language_loss": 0.78947091, + "learning_rate": 0.0008674730254616827, + "loss": 0.80057395, + "num_input_tokens_seen": 112182272, + "router_z_loss_mlp": 0.1171875, + "step": 1352, + "time_per_iteration": 4.8775153160095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082749, + "balance_loss_mlp": 1.04753494, + "epoch": 0.2602924201616006, + "flos": 715947393024.0, + "grad_norm": 0.046134849134736367, + "language_loss": 0.85103661, + "learning_rate": 0.0008672616893328834, + "loss": 0.86186409, + "num_input_tokens_seen": 112261760, + "router_z_loss_mlp": 0.35253906, + "step": 1353, + "time_per_iteration": 2.98063588142395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108322, + "balance_loss_mlp": 1.04764819, + "epoch": 0.2604848018468642, + "flos": 643241917440.0, + "grad_norm": 0.060512322591449175, + "language_loss": 0.9000203, + "learning_rate": 0.0008670502106204512, + "loss": 0.91085243, + "num_input_tokens_seen": 112339136, + "router_z_loss_mlp": 0.35595703, + "step": 1354, + "time_per_iteration": 2.832679271697998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087805, + "balance_loss_mlp": 1.05073047, + "epoch": 0.26067718353212777, + "flos": 516783545856.0, + "grad_norm": 0.05860289542603218, + "language_loss": 0.8165139, + "learning_rate": 0.0008668385894064892, + "loss": 0.82739192, + "num_input_tokens_seen": 112409872, + "router_z_loss_mlp": 0.37084961, + "step": 1355, + "time_per_iteration": 2.6204822063446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086317, + "balance_loss_mlp": 1.05143666, + "epoch": 0.2608695652173913, + "flos": 822361890816.0, + "grad_norm": 0.0623840657908754, + "language_loss": 0.88803548, + "learning_rate": 0.0008666268257731562, + "loss": 0.8988986, + "num_input_tokens_seen": 112495616, + "router_z_loss_mlp": 0.34912109, + "step": 1356, + "time_per_iteration": 3.113147735595703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109295, + "balance_loss_mlp": 1.0566628, + "epoch": 0.2610619469026549, + "flos": 1007451744768.0, + "grad_norm": 0.056693012024963345, + "language_loss": 0.85794425, + "learning_rate": 0.0008664149198026662, + "loss": 0.86887372, + "num_input_tokens_seen": 112575168, + "router_z_loss_mlp": 0.36279297, + "step": 1357, + "time_per_iteration": 3.2569541931152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095714, + "balance_loss_mlp": 1.06040418, + "epoch": 0.2612543285879184, + "flos": 536523291648.0, + "grad_norm": 0.061594313952015485, + "language_loss": 0.88599586, + "learning_rate": 0.0008662028715772883, + "loss": 0.89695299, + "num_input_tokens_seen": 112648480, + "router_z_loss_mlp": 0.35351562, + "step": 1358, + "time_per_iteration": 2.6102256774902344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100258, + "balance_loss_mlp": 1.06475711, + "epoch": 0.261446710273182, + "flos": 519166803456.0, + "grad_norm": 0.04975036534081278, + "language_loss": 0.85662109, + "learning_rate": 0.0008659906811793467, + "loss": 0.86762363, + "num_input_tokens_seen": 112719856, + "router_z_loss_mlp": 0.35546875, + "step": 1359, + "time_per_iteration": 2.6921935081481934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101082, + "balance_loss_mlp": 1.06543839, + "epoch": 0.26163909195844554, + "flos": 582975055872.0, + "grad_norm": 0.06646109128582675, + "language_loss": 0.89397144, + "learning_rate": 0.0008657783486912215, + "loss": 0.90498233, + "num_input_tokens_seen": 112795088, + "router_z_loss_mlp": 0.35693359, + "step": 1360, + "time_per_iteration": 2.7003283500671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110494, + "balance_loss_mlp": 1.06920147, + "epoch": 0.2618314736437091, + "flos": 958362057216.0, + "grad_norm": 0.06344844215605515, + "language_loss": 0.89840877, + "learning_rate": 0.0008655658741953472, + "loss": 0.90945816, + "num_input_tokens_seen": 112879888, + "router_z_loss_mlp": 0.35742188, + "step": 1361, + "time_per_iteration": 3.207960844039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101357, + "balance_loss_mlp": 1.0664053, + "epoch": 0.26202385532897265, + "flos": 574530347520.0, + "grad_norm": 0.04606923720206454, + "language_loss": 0.88105857, + "learning_rate": 0.0008653532577742136, + "loss": 0.89207214, + "num_input_tokens_seen": 112952208, + "router_z_loss_mlp": 0.34960938, + "step": 1362, + "time_per_iteration": 2.69209885597229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091565, + "balance_loss_mlp": 1.05744767, + "epoch": 0.26221623701423624, + "flos": 445240585728.0, + "grad_norm": 0.05480512848555835, + "language_loss": 0.87200153, + "learning_rate": 0.0008651404995103659, + "loss": 0.88291717, + "num_input_tokens_seen": 113017472, + "router_z_loss_mlp": 0.34155273, + "step": 1363, + "time_per_iteration": 2.5325255393981934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095927, + "balance_loss_mlp": 1.06164205, + "epoch": 0.26240861869949983, + "flos": 535459700736.0, + "grad_norm": 0.04992660146640532, + "language_loss": 0.870713, + "learning_rate": 0.0008649275994864041, + "loss": 0.8816722, + "num_input_tokens_seen": 113090000, + "router_z_loss_mlp": 0.34301758, + "step": 1364, + "time_per_iteration": 2.682365894317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098846, + "balance_loss_mlp": 1.06267846, + "epoch": 0.26260100038476336, + "flos": 564940500480.0, + "grad_norm": 0.05369640644722127, + "language_loss": 0.83917898, + "learning_rate": 0.0008647145577849834, + "loss": 0.85016745, + "num_input_tokens_seen": 113169424, + "router_z_loss_mlp": 0.36157227, + "step": 1365, + "time_per_iteration": 2.8129918575286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102304, + "balance_loss_mlp": 1.06701851, + "epoch": 0.26279338207002695, + "flos": 612745426944.0, + "grad_norm": 0.045782565775991005, + "language_loss": 0.82809973, + "learning_rate": 0.0008645013744888139, + "loss": 0.83912277, + "num_input_tokens_seen": 113256752, + "router_z_loss_mlp": 0.35327148, + "step": 1366, + "time_per_iteration": 2.8523411750793457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101615, + "balance_loss_mlp": 1.06730664, + "epoch": 0.2629857637552905, + "flos": 522555425280.0, + "grad_norm": 0.0597350257589219, + "language_loss": 0.87579656, + "learning_rate": 0.0008642880496806607, + "loss": 0.88681269, + "num_input_tokens_seen": 113330512, + "router_z_loss_mlp": 0.34350586, + "step": 1367, + "time_per_iteration": 2.766350507736206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105811, + "balance_loss_mlp": 1.0706687, + "epoch": 0.26317814544055407, + "flos": 534273864192.0, + "grad_norm": 0.05812227598952832, + "language_loss": 0.84219468, + "learning_rate": 0.0008640745834433437, + "loss": 0.85325277, + "num_input_tokens_seen": 113409088, + "router_z_loss_mlp": 0.35205078, + "step": 1368, + "time_per_iteration": 2.7220964431762695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100459, + "balance_loss_mlp": 1.06553102, + "epoch": 0.2633705271258176, + "flos": 555235762176.0, + "grad_norm": 0.06954601812969684, + "language_loss": 0.86862296, + "learning_rate": 0.000863860975859738, + "loss": 0.87962759, + "num_input_tokens_seen": 113486624, + "router_z_loss_mlp": 0.34960938, + "step": 1369, + "time_per_iteration": 2.8985280990600586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094199, + "balance_loss_mlp": 1.05931866, + "epoch": 0.2635629088110812, + "flos": 552136711680.0, + "grad_norm": 0.06493737783890446, + "language_loss": 0.88711715, + "learning_rate": 0.0008636472270127733, + "loss": 0.89805913, + "num_input_tokens_seen": 113555776, + "router_z_loss_mlp": 0.34936523, + "step": 1370, + "time_per_iteration": 2.634615182876587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090389, + "balance_loss_mlp": 1.05498338, + "epoch": 0.2637552904963448, + "flos": 455752839168.0, + "grad_norm": 0.062476231294863314, + "language_loss": 0.89913595, + "learning_rate": 0.0008634333369854345, + "loss": 0.91003978, + "num_input_tokens_seen": 113624208, + "router_z_loss_mlp": 0.35424805, + "step": 1371, + "time_per_iteration": 2.5908331871032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082759, + "balance_loss_mlp": 1.04818797, + "epoch": 0.2639476721816083, + "flos": 612847323648.0, + "grad_norm": 0.05509554660574217, + "language_loss": 0.87495965, + "learning_rate": 0.0008632193058607608, + "loss": 0.88578725, + "num_input_tokens_seen": 113698544, + "router_z_loss_mlp": 0.34594727, + "step": 1372, + "time_per_iteration": 2.6963188648223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108436, + "balance_loss_mlp": 1.04878759, + "epoch": 0.2641400538668719, + "flos": 571645112832.0, + "grad_norm": 0.05982264925210271, + "language_loss": 0.81028771, + "learning_rate": 0.0008630051337218466, + "loss": 0.82113135, + "num_input_tokens_seen": 113769024, + "router_z_loss_mlp": 0.35595703, + "step": 1373, + "time_per_iteration": 2.644540786743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079985, + "balance_loss_mlp": 1.04582, + "epoch": 0.2643324355521354, + "flos": 581979866112.0, + "grad_norm": 0.08561984623812412, + "language_loss": 0.82428128, + "learning_rate": 0.0008627908206518409, + "loss": 0.8350811, + "num_input_tokens_seen": 113836320, + "router_z_loss_mlp": 0.34179688, + "step": 1374, + "time_per_iteration": 2.660578966140747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110354, + "balance_loss_mlp": 1.08904421, + "epoch": 0.264524817237399, + "flos": 1543845284352.0, + "grad_norm": 0.03725698642258328, + "language_loss": 0.75151253, + "learning_rate": 0.0008625763667339472, + "loss": 0.76254791, + "num_input_tokens_seen": 114065040, + "router_z_loss_mlp": 0.14453125, + "step": 1375, + "time_per_iteration": 4.9595324993133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079461, + "balance_loss_mlp": 1.04474711, + "epoch": 0.26471719892266254, + "flos": 517783117824.0, + "grad_norm": 0.05493851972821551, + "language_loss": 0.91330564, + "learning_rate": 0.0008623617720514241, + "loss": 0.92410028, + "num_input_tokens_seen": 114133488, + "router_z_loss_mlp": 0.34741211, + "step": 1376, + "time_per_iteration": 2.5929205417633057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090244, + "balance_loss_mlp": 1.05498242, + "epoch": 0.26490958060792613, + "flos": 516936314880.0, + "grad_norm": 0.08106601153347975, + "language_loss": 0.84946424, + "learning_rate": 0.0008621470366875848, + "loss": 0.8603667, + "num_input_tokens_seen": 114200704, + "router_z_loss_mlp": 0.3527832, + "step": 1377, + "time_per_iteration": 2.5729684829711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084439, + "balance_loss_mlp": 1.0497725, + "epoch": 0.26510196229318966, + "flos": 596298350592.0, + "grad_norm": 0.05588669268878349, + "language_loss": 0.87771004, + "learning_rate": 0.0008619321607257966, + "loss": 0.88855445, + "num_input_tokens_seen": 114272160, + "router_z_loss_mlp": 0.34692383, + "step": 1378, + "time_per_iteration": 2.6708004474639893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082921, + "balance_loss_mlp": 1.0483501, + "epoch": 0.26529434397845325, + "flos": 685488780288.0, + "grad_norm": 0.051774701706919043, + "language_loss": 0.82311988, + "learning_rate": 0.000861717144249482, + "loss": 0.83394915, + "num_input_tokens_seen": 114347904, + "router_z_loss_mlp": 0.34594727, + "step": 1379, + "time_per_iteration": 2.8249831199645996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081979, + "balance_loss_mlp": 1.0468595, + "epoch": 0.26548672566371684, + "flos": 424127328768.0, + "grad_norm": 0.06288210815556809, + "language_loss": 0.90205348, + "learning_rate": 0.0008615019873421175, + "loss": 0.91287327, + "num_input_tokens_seen": 114409952, + "router_z_loss_mlp": 0.35131836, + "step": 1380, + "time_per_iteration": 2.455320358276367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108606, + "balance_loss_mlp": 1.05108428, + "epoch": 0.26567910734898037, + "flos": 489619012608.0, + "grad_norm": 0.05393715583789803, + "language_loss": 0.8609767, + "learning_rate": 0.0008612866900872349, + "loss": 0.87183726, + "num_input_tokens_seen": 114474832, + "router_z_loss_mlp": 0.35009766, + "step": 1381, + "time_per_iteration": 2.54070782661438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083002, + "balance_loss_mlp": 1.04895568, + "epoch": 0.26587148903424396, + "flos": 533947977216.0, + "grad_norm": 0.05290962754614328, + "language_loss": 0.88052452, + "learning_rate": 0.0008610712525684197, + "loss": 0.8913545, + "num_input_tokens_seen": 114545152, + "router_z_loss_mlp": 0.34082031, + "step": 1382, + "time_per_iteration": 2.6350595951080322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084878, + "balance_loss_mlp": 1.05049801, + "epoch": 0.2660638707195075, + "flos": 1017067732992.0, + "grad_norm": 0.06267977315545337, + "language_loss": 0.84534729, + "learning_rate": 0.0008608556748693121, + "loss": 0.85619605, + "num_input_tokens_seen": 114626512, + "router_z_loss_mlp": 0.34423828, + "step": 1383, + "time_per_iteration": 3.231172561645508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086499, + "balance_loss_mlp": 1.05216646, + "epoch": 0.2662562524047711, + "flos": 523712148480.0, + "grad_norm": 0.0585640776606728, + "language_loss": 0.86247015, + "learning_rate": 0.000860639957073607, + "loss": 0.87333512, + "num_input_tokens_seen": 114701008, + "router_z_loss_mlp": 0.34375, + "step": 1384, + "time_per_iteration": 2.72265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108718, + "balance_loss_mlp": 1.05280018, + "epoch": 0.2664486340900346, + "flos": 552107598336.0, + "grad_norm": 0.07312693577598182, + "language_loss": 0.87888551, + "learning_rate": 0.0008604240992650534, + "loss": 0.88975734, + "num_input_tokens_seen": 114771984, + "router_z_loss_mlp": 0.34423828, + "step": 1385, + "time_per_iteration": 2.6524593830108643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079729, + "balance_loss_mlp": 1.0455637, + "epoch": 0.2666410157752982, + "flos": 469895233536.0, + "grad_norm": 0.058731941016447735, + "language_loss": 0.89070451, + "learning_rate": 0.0008602081015274545, + "loss": 0.90150183, + "num_input_tokens_seen": 114844800, + "router_z_loss_mlp": 0.34179688, + "step": 1386, + "time_per_iteration": 2.7026963233947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083917, + "balance_loss_mlp": 1.04953694, + "epoch": 0.2668333974605617, + "flos": 569645968896.0, + "grad_norm": 0.04572049987167494, + "language_loss": 0.83031899, + "learning_rate": 0.0008599919639446684, + "loss": 0.84115815, + "num_input_tokens_seen": 114918544, + "router_z_loss_mlp": 0.34423828, + "step": 1387, + "time_per_iteration": 2.6891515254974365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083945, + "balance_loss_mlp": 1.04920745, + "epoch": 0.2670257791458253, + "flos": 398755325952.0, + "grad_norm": 0.06113709644372323, + "language_loss": 0.80263156, + "learning_rate": 0.000859775686600607, + "loss": 0.81347102, + "num_input_tokens_seen": 114984272, + "router_z_loss_mlp": 0.34765625, + "step": 1388, + "time_per_iteration": 2.5367043018341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088195, + "balance_loss_mlp": 1.05400586, + "epoch": 0.2672181608310889, + "flos": 515587534848.0, + "grad_norm": 0.07715457421599592, + "language_loss": 0.85045016, + "learning_rate": 0.0008595592695792367, + "loss": 0.86133218, + "num_input_tokens_seen": 115054800, + "router_z_loss_mlp": 0.34228516, + "step": 1389, + "time_per_iteration": 2.653684139251709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097788, + "balance_loss_mlp": 1.06348014, + "epoch": 0.26741054251635243, + "flos": 507270864384.0, + "grad_norm": 0.05276083290405683, + "language_loss": 0.9085412, + "learning_rate": 0.0008593427129645778, + "loss": 0.91951907, + "num_input_tokens_seen": 115120928, + "router_z_loss_mlp": 0.34350586, + "step": 1390, + "time_per_iteration": 2.5497426986694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100169, + "balance_loss_mlp": 1.06512117, + "epoch": 0.267602924201616, + "flos": 576357783552.0, + "grad_norm": 0.0907689109524766, + "language_loss": 0.85371816, + "learning_rate": 0.0008591260168407052, + "loss": 0.86471987, + "num_input_tokens_seen": 115196688, + "router_z_loss_mlp": 0.35083008, + "step": 1391, + "time_per_iteration": 2.752777576446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099505, + "balance_loss_mlp": 1.06598353, + "epoch": 0.26779530588687955, + "flos": 523731087360.0, + "grad_norm": 0.05201595058269412, + "language_loss": 0.83216429, + "learning_rate": 0.0008589091812917479, + "loss": 0.84315932, + "num_input_tokens_seen": 115264912, + "router_z_loss_mlp": 0.33544922, + "step": 1392, + "time_per_iteration": 2.602858781814575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092875, + "balance_loss_mlp": 1.05897164, + "epoch": 0.26798768757214314, + "flos": 556508938752.0, + "grad_norm": 0.054199587407170555, + "language_loss": 0.85476619, + "learning_rate": 0.0008586922064018887, + "loss": 0.86569488, + "num_input_tokens_seen": 115334672, + "router_z_loss_mlp": 0.33911133, + "step": 1393, + "time_per_iteration": 2.663135528564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110003, + "balance_loss_mlp": 1.0641005, + "epoch": 0.2681800692574067, + "flos": 930246004224.0, + "grad_norm": 0.05606615550920643, + "language_loss": 0.89228028, + "learning_rate": 0.0008584750922553651, + "loss": 0.90328062, + "num_input_tokens_seen": 115420032, + "router_z_loss_mlp": 0.35961914, + "step": 1394, + "time_per_iteration": 3.126030206680298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094414, + "balance_loss_mlp": 1.06020141, + "epoch": 0.26837245094267026, + "flos": 700771931136.0, + "grad_norm": 0.055333821001128054, + "language_loss": 0.83724457, + "learning_rate": 0.0008582578389364677, + "loss": 0.84818876, + "num_input_tokens_seen": 115492576, + "router_z_loss_mlp": 0.3425293, + "step": 1395, + "time_per_iteration": 2.858774423599243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096924, + "balance_loss_mlp": 1.06187642, + "epoch": 0.26856483262793385, + "flos": 592892199936.0, + "grad_norm": 0.04773968262798697, + "language_loss": 0.9195987, + "learning_rate": 0.0008580404465295422, + "loss": 0.93056792, + "num_input_tokens_seen": 115568368, + "router_z_loss_mlp": 0.35058594, + "step": 1396, + "time_per_iteration": 2.7737646102905273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096258, + "balance_loss_mlp": 1.06190252, + "epoch": 0.2687572143131974, + "flos": 713943866880.0, + "grad_norm": 0.07288281155022573, + "language_loss": 0.88208908, + "learning_rate": 0.0008578229151189876, + "loss": 0.89305162, + "num_input_tokens_seen": 115651536, + "router_z_loss_mlp": 0.34375, + "step": 1397, + "time_per_iteration": 2.9974029064178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087674, + "balance_loss_mlp": 1.05441451, + "epoch": 0.26894959599846097, + "flos": 467481452544.0, + "grad_norm": 0.0581153622766974, + "language_loss": 0.81433654, + "learning_rate": 0.0008576052447892573, + "loss": 0.82521319, + "num_input_tokens_seen": 115715696, + "router_z_loss_mlp": 0.33276367, + "step": 1398, + "time_per_iteration": 2.586427688598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090085, + "balance_loss_mlp": 1.05551457, + "epoch": 0.2691419776837245, + "flos": 468470850048.0, + "grad_norm": 0.08083264737918114, + "language_loss": 0.86589479, + "learning_rate": 0.000857387435624858, + "loss": 0.87679559, + "num_input_tokens_seen": 115780928, + "router_z_loss_mlp": 0.34619141, + "step": 1399, + "time_per_iteration": 2.5227789878845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096966, + "balance_loss_mlp": 1.06239533, + "epoch": 0.2693343593689881, + "flos": 937244418048.0, + "grad_norm": 0.0443934808912178, + "language_loss": 0.88525635, + "learning_rate": 0.0008571694877103513, + "loss": 0.89622605, + "num_input_tokens_seen": 115874432, + "router_z_loss_mlp": 0.34594727, + "step": 1400, + "time_per_iteration": 3.252573251724243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095335, + "balance_loss_mlp": 1.06064546, + "epoch": 0.2695267410542516, + "flos": 577303511040.0, + "grad_norm": 0.05297583192015558, + "language_loss": 0.87603962, + "learning_rate": 0.0008569514011303515, + "loss": 0.88699305, + "num_input_tokens_seen": 115956608, + "router_z_loss_mlp": 0.34716797, + "step": 1401, + "time_per_iteration": 2.7824223041534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092716, + "balance_loss_mlp": 1.0577879, + "epoch": 0.2697191227395152, + "flos": 556539462144.0, + "grad_norm": 0.06414718170709632, + "language_loss": 0.87859815, + "learning_rate": 0.0008567331759695277, + "loss": 0.88952529, + "num_input_tokens_seen": 116031728, + "router_z_loss_mlp": 0.34985352, + "step": 1402, + "time_per_iteration": 2.7109498977661133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098142, + "balance_loss_mlp": 1.06183052, + "epoch": 0.26991150442477874, + "flos": 529024310784.0, + "grad_norm": 0.05462837975359106, + "language_loss": 0.86148876, + "learning_rate": 0.0008565148123126023, + "loss": 0.87247014, + "num_input_tokens_seen": 116104288, + "router_z_loss_mlp": 0.36328125, + "step": 1403, + "time_per_iteration": 2.6526310443878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088301, + "balance_loss_mlp": 1.05289555, + "epoch": 0.2701038861100423, + "flos": 531737837568.0, + "grad_norm": 0.12276519374226595, + "language_loss": 0.86177158, + "learning_rate": 0.0008562963102443516, + "loss": 0.87265456, + "num_input_tokens_seen": 116177920, + "router_z_loss_mlp": 0.35400391, + "step": 1404, + "time_per_iteration": 2.6809849739074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092814, + "balance_loss_mlp": 1.05757618, + "epoch": 0.2702962677953059, + "flos": 734908737024.0, + "grad_norm": 0.05743337882617235, + "language_loss": 0.85265231, + "learning_rate": 0.0008560776698496056, + "loss": 0.86358047, + "num_input_tokens_seen": 116251680, + "router_z_loss_mlp": 0.3527832, + "step": 1405, + "time_per_iteration": 2.9008774757385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099947, + "balance_loss_mlp": 1.06420779, + "epoch": 0.27048864948056944, + "flos": 574453181952.0, + "grad_norm": 0.06281004106283315, + "language_loss": 0.85864103, + "learning_rate": 0.0008558588912132481, + "loss": 0.86964047, + "num_input_tokens_seen": 116327664, + "router_z_loss_mlp": 0.35742188, + "step": 1406, + "time_per_iteration": 2.8967840671539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057025, + "balance_loss_mlp": 1.04519963, + "epoch": 0.27068103116583303, + "flos": 1423091953152.0, + "grad_norm": 0.03126478371356873, + "language_loss": 0.76458991, + "learning_rate": 0.0008556399744202163, + "loss": 0.77516007, + "num_input_tokens_seen": 116555152, + "router_z_loss_mlp": 0.11816406, + "step": 1407, + "time_per_iteration": 4.933698892593384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109874, + "balance_loss_mlp": 1.06400251, + "epoch": 0.27087341285109656, + "flos": 531742219776.0, + "grad_norm": 0.050597666424933845, + "language_loss": 0.82942683, + "learning_rate": 0.0008554209195555016, + "loss": 0.84041423, + "num_input_tokens_seen": 116626016, + "router_z_loss_mlp": 0.34765625, + "step": 1408, + "time_per_iteration": 2.6599888801574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093614, + "balance_loss_mlp": 1.0582329, + "epoch": 0.27106579453636015, + "flos": 581108332032.0, + "grad_norm": 0.058412744436649705, + "language_loss": 0.88199335, + "learning_rate": 0.0008552017267041483, + "loss": 0.89292949, + "num_input_tokens_seen": 116699152, + "router_z_loss_mlp": 0.35375977, + "step": 1409, + "time_per_iteration": 2.673828363418579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091257, + "balance_loss_mlp": 1.05563736, + "epoch": 0.2712581762216237, + "flos": 506533160448.0, + "grad_norm": 0.05246666988179206, + "language_loss": 0.83264577, + "learning_rate": 0.0008549823959512549, + "loss": 0.84355831, + "num_input_tokens_seen": 116770912, + "router_z_loss_mlp": 0.35644531, + "step": 1410, + "time_per_iteration": 2.634523868560791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091517, + "balance_loss_mlp": 1.05451441, + "epoch": 0.27145055790688727, + "flos": 997019476992.0, + "grad_norm": 0.050905982394943275, + "language_loss": 0.86668658, + "learning_rate": 0.0008547629273819728, + "loss": 0.87760168, + "num_input_tokens_seen": 116863088, + "router_z_loss_mlp": 0.36987305, + "step": 1411, + "time_per_iteration": 3.3559322357177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094566, + "balance_loss_mlp": 1.05875564, + "epoch": 0.2716429395921508, + "flos": 546420086784.0, + "grad_norm": 0.06363965087638479, + "language_loss": 0.83773881, + "learning_rate": 0.0008545433210815074, + "loss": 0.84868449, + "num_input_tokens_seen": 116929504, + "router_z_loss_mlp": 0.35839844, + "step": 1412, + "time_per_iteration": 2.607379913330078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109226, + "balance_loss_mlp": 1.05606771, + "epoch": 0.2718353212774144, + "flos": 572954605056.0, + "grad_norm": 0.05881941163427475, + "language_loss": 0.87753916, + "learning_rate": 0.0008543235771351176, + "loss": 0.88846171, + "num_input_tokens_seen": 117004064, + "router_z_loss_mlp": 0.36230469, + "step": 1413, + "time_per_iteration": 2.722318649291992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096323, + "balance_loss_mlp": 1.06118035, + "epoch": 0.272027702962678, + "flos": 643986823680.0, + "grad_norm": 0.044269909609048815, + "language_loss": 0.84649068, + "learning_rate": 0.0008541036956281154, + "loss": 0.85745388, + "num_input_tokens_seen": 117081328, + "router_z_loss_mlp": 0.3515625, + "step": 1414, + "time_per_iteration": 2.8785104751586914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100326, + "balance_loss_mlp": 1.0645628, + "epoch": 0.2722200846479415, + "flos": 653410755072.0, + "grad_norm": 0.0658433573318433, + "language_loss": 0.82281864, + "learning_rate": 0.0008538836766458665, + "loss": 0.83382189, + "num_input_tokens_seen": 117156544, + "router_z_loss_mlp": 0.35791016, + "step": 1415, + "time_per_iteration": 2.834148645401001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098566, + "balance_loss_mlp": 1.06311321, + "epoch": 0.2724124663332051, + "flos": 579346324992.0, + "grad_norm": 0.07330345680392343, + "language_loss": 0.85275221, + "learning_rate": 0.0008536635202737897, + "loss": 0.86373788, + "num_input_tokens_seen": 117230208, + "router_z_loss_mlp": 0.35473633, + "step": 1416, + "time_per_iteration": 2.7886626720428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099993, + "balance_loss_mlp": 1.06513667, + "epoch": 0.2726048480184686, + "flos": 537178037760.0, + "grad_norm": 0.06667202152625065, + "language_loss": 0.82212454, + "learning_rate": 0.0008534432265973573, + "loss": 0.8331244, + "num_input_tokens_seen": 117298080, + "router_z_loss_mlp": 0.34912109, + "step": 1417, + "time_per_iteration": 2.604626417160034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095149, + "balance_loss_mlp": 1.05931497, + "epoch": 0.2727972297037322, + "flos": 995360776704.0, + "grad_norm": 0.08172035912068172, + "language_loss": 0.88052338, + "learning_rate": 0.000853222795702095, + "loss": 0.8914749, + "num_input_tokens_seen": 117396256, + "router_z_loss_mlp": 0.35839844, + "step": 1418, + "time_per_iteration": 3.391664505004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095275, + "balance_loss_mlp": 1.06125307, + "epoch": 0.27298961138899575, + "flos": 605924513280.0, + "grad_norm": 0.05480231780963067, + "language_loss": 0.83608377, + "learning_rate": 0.0008530022276735813, + "loss": 0.84703648, + "num_input_tokens_seen": 117467936, + "router_z_loss_mlp": 0.34057617, + "step": 1419, + "time_per_iteration": 2.705235004425049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107985, + "balance_loss_mlp": 1.04666257, + "epoch": 0.27318199307425933, + "flos": 529059216384.0, + "grad_norm": 0.054542785174291425, + "language_loss": 0.85957551, + "learning_rate": 0.0008527815225974489, + "loss": 0.87037402, + "num_input_tokens_seen": 117538256, + "router_z_loss_mlp": 0.33203125, + "step": 1420, + "time_per_iteration": 2.654003620147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087931, + "balance_loss_mlp": 1.05395651, + "epoch": 0.2733743747595229, + "flos": 408809272320.0, + "grad_norm": 0.06460584893454492, + "language_loss": 0.88538897, + "learning_rate": 0.0008525606805593829, + "loss": 0.89626825, + "num_input_tokens_seen": 117599488, + "router_z_loss_mlp": 0.33984375, + "step": 1421, + "time_per_iteration": 2.4287912845611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087097, + "balance_loss_mlp": 1.05233574, + "epoch": 0.27356675644478645, + "flos": 515976030720.0, + "grad_norm": 0.055753761808712644, + "language_loss": 0.82379127, + "learning_rate": 0.0008523397016451213, + "loss": 0.8346622, + "num_input_tokens_seen": 117664240, + "router_z_loss_mlp": 0.34814453, + "step": 1422, + "time_per_iteration": 2.5620808601379395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096714, + "balance_loss_mlp": 1.0617379, + "epoch": 0.27375913813005004, + "flos": 1051914539520.0, + "grad_norm": 0.0481984630724129, + "language_loss": 0.87272507, + "learning_rate": 0.0008521185859404564, + "loss": 0.88369215, + "num_input_tokens_seen": 117754768, + "router_z_loss_mlp": 0.34985352, + "step": 1423, + "time_per_iteration": 3.361171245574951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085067, + "balance_loss_mlp": 1.05037737, + "epoch": 0.27395151981531357, + "flos": 624507535872.0, + "grad_norm": 0.05502068897485729, + "language_loss": 0.89717311, + "learning_rate": 0.0008518973335312326, + "loss": 0.90802383, + "num_input_tokens_seen": 117832816, + "router_z_loss_mlp": 0.34716797, + "step": 1424, + "time_per_iteration": 2.7964961528778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088932, + "balance_loss_mlp": 1.05390823, + "epoch": 0.27414390150057716, + "flos": 550112836608.0, + "grad_norm": 0.056708357312241935, + "language_loss": 0.83878243, + "learning_rate": 0.0008516759445033477, + "loss": 0.84967172, + "num_input_tokens_seen": 117899168, + "router_z_loss_mlp": 0.35058594, + "step": 1425, + "time_per_iteration": 2.6100170612335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090578, + "balance_loss_mlp": 1.05603087, + "epoch": 0.2743362831858407, + "flos": 539596200960.0, + "grad_norm": 0.061048707375716146, + "language_loss": 0.84983361, + "learning_rate": 0.0008514544189427526, + "loss": 0.86073935, + "num_input_tokens_seen": 117972384, + "router_z_loss_mlp": 0.34594727, + "step": 1426, + "time_per_iteration": 2.6465015411376953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088002, + "balance_loss_mlp": 1.05347919, + "epoch": 0.2745286648711043, + "flos": 468352986624.0, + "grad_norm": 0.061383055639382046, + "language_loss": 0.8704657, + "learning_rate": 0.0008512327569354511, + "loss": 0.88134569, + "num_input_tokens_seen": 118039584, + "router_z_loss_mlp": 0.34570312, + "step": 1427, + "time_per_iteration": 2.5696229934692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087297, + "balance_loss_mlp": 1.05212998, + "epoch": 0.2747210465563678, + "flos": 472617524736.0, + "grad_norm": 0.05941983459852813, + "language_loss": 0.8349936, + "learning_rate": 0.0008510109585675001, + "loss": 0.84586656, + "num_input_tokens_seen": 118108352, + "router_z_loss_mlp": 0.35180664, + "step": 1428, + "time_per_iteration": 2.6195123195648193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086947, + "balance_loss_mlp": 1.07245111, + "epoch": 0.2749134282416314, + "flos": 1314345070080.0, + "grad_norm": 0.037284634304165044, + "language_loss": 0.81153345, + "learning_rate": 0.0008507890239250093, + "loss": 0.82240289, + "num_input_tokens_seen": 118331120, + "router_z_loss_mlp": 0.14453125, + "step": 1429, + "time_per_iteration": 4.714681625366211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083808, + "balance_loss_mlp": 1.04938078, + "epoch": 0.275105809926895, + "flos": 970445670912.0, + "grad_norm": 0.07686972857934649, + "language_loss": 0.80942416, + "learning_rate": 0.0008505669530941415, + "loss": 0.82026225, + "num_input_tokens_seen": 118415872, + "router_z_loss_mlp": 0.34472656, + "step": 1430, + "time_per_iteration": 3.2975006103515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080869, + "balance_loss_mlp": 1.04715657, + "epoch": 0.2752981916121585, + "flos": 527089185792.0, + "grad_norm": 0.061626933195079385, + "language_loss": 0.84357536, + "learning_rate": 0.000850344746161112, + "loss": 0.85438406, + "num_input_tokens_seen": 118483008, + "router_z_loss_mlp": 0.33740234, + "step": 1431, + "time_per_iteration": 2.596623182296753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079512, + "balance_loss_mlp": 1.04487014, + "epoch": 0.2754905732974221, + "flos": 453487444992.0, + "grad_norm": 0.05883177646218185, + "language_loss": 0.87880194, + "learning_rate": 0.0008501224032121894, + "loss": 0.88959706, + "num_input_tokens_seen": 118545840, + "router_z_loss_mlp": 0.34692383, + "step": 1432, + "time_per_iteration": 2.5134201049804688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082699, + "balance_loss_mlp": 1.04817569, + "epoch": 0.27568295498268564, + "flos": 497216918016.0, + "grad_norm": 0.05235854639463291, + "language_loss": 0.82002538, + "learning_rate": 0.0008498999243336946, + "loss": 0.83085239, + "num_input_tokens_seen": 118615168, + "router_z_loss_mlp": 0.34570312, + "step": 1433, + "time_per_iteration": 2.601771593093872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095225, + "balance_loss_mlp": 1.06060696, + "epoch": 0.2758753366679492, + "flos": 607890161664.0, + "grad_norm": 0.05891633941102979, + "language_loss": 0.87444806, + "learning_rate": 0.0008496773096120021, + "loss": 0.8854003, + "num_input_tokens_seen": 118690384, + "router_z_loss_mlp": 0.34643555, + "step": 1434, + "time_per_iteration": 2.788516044616699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096763, + "balance_loss_mlp": 1.06212115, + "epoch": 0.27606771835321275, + "flos": 739803290112.0, + "grad_norm": 0.06770297286276174, + "language_loss": 0.84185004, + "learning_rate": 0.0008494545591335381, + "loss": 0.85281765, + "num_input_tokens_seen": 118763024, + "router_z_loss_mlp": 0.34667969, + "step": 1435, + "time_per_iteration": 2.8759751319885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110127, + "balance_loss_mlp": 1.06736696, + "epoch": 0.27626010003847634, + "flos": 554279860224.0, + "grad_norm": 0.04450223786838935, + "language_loss": 0.87244952, + "learning_rate": 0.0008492316729847823, + "loss": 0.88346225, + "num_input_tokens_seen": 118845536, + "router_z_loss_mlp": 0.33935547, + "step": 1436, + "time_per_iteration": 2.781244993209839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094118, + "balance_loss_mlp": 1.06023908, + "epoch": 0.2764524817237399, + "flos": 542270439936.0, + "grad_norm": 0.055325808882979444, + "language_loss": 0.79874223, + "learning_rate": 0.0008490086512522664, + "loss": 0.80968338, + "num_input_tokens_seen": 118919008, + "router_z_loss_mlp": 0.33911133, + "step": 1437, + "time_per_iteration": 2.7197165489196777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093138, + "balance_loss_mlp": 1.05913925, + "epoch": 0.27664486340900346, + "flos": 406027344384.0, + "grad_norm": 0.0539948754920925, + "language_loss": 0.90713239, + "learning_rate": 0.0008487854940225755, + "loss": 0.91806382, + "num_input_tokens_seen": 118981376, + "router_z_loss_mlp": 0.34008789, + "step": 1438, + "time_per_iteration": 2.438218593597412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094602, + "balance_loss_mlp": 1.06017423, + "epoch": 0.27683724509426705, + "flos": 521884712448.0, + "grad_norm": 0.06140365718889793, + "language_loss": 0.90140885, + "learning_rate": 0.0008485622013823466, + "loss": 0.91235483, + "num_input_tokens_seen": 119050560, + "router_z_loss_mlp": 0.34448242, + "step": 1439, + "time_per_iteration": 2.653393268585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102393, + "balance_loss_mlp": 1.06770289, + "epoch": 0.2770296267795306, + "flos": 535085761536.0, + "grad_norm": 0.07554461134761571, + "language_loss": 0.8283006, + "learning_rate": 0.00084833877341827, + "loss": 0.83932453, + "num_input_tokens_seen": 119121104, + "router_z_loss_mlp": 0.34692383, + "step": 1440, + "time_per_iteration": 2.6312928199768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109947, + "balance_loss_mlp": 1.06587648, + "epoch": 0.27722200846479417, + "flos": 487747906560.0, + "grad_norm": 0.12145939933268801, + "language_loss": 0.8064183, + "learning_rate": 0.000848115210217088, + "loss": 0.81741297, + "num_input_tokens_seen": 119187712, + "router_z_loss_mlp": 0.33618164, + "step": 1441, + "time_per_iteration": 2.5490710735321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086774, + "balance_loss_mlp": 1.05167925, + "epoch": 0.2774143901500577, + "flos": 618012509184.0, + "grad_norm": 0.05766268366580332, + "language_loss": 0.82057106, + "learning_rate": 0.0008478915118655952, + "loss": 0.83143878, + "num_input_tokens_seen": 119259264, + "router_z_loss_mlp": 0.35131836, + "step": 1442, + "time_per_iteration": 2.710240602493286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086605, + "balance_loss_mlp": 1.05160558, + "epoch": 0.2776067718353213, + "flos": 513563659776.0, + "grad_norm": 0.05774564569051742, + "language_loss": 0.86505657, + "learning_rate": 0.0008476676784506393, + "loss": 0.87592262, + "num_input_tokens_seen": 119328304, + "router_z_loss_mlp": 0.35009766, + "step": 1443, + "time_per_iteration": 2.636622667312622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108686, + "balance_loss_mlp": 1.0526228, + "epoch": 0.2777991535205848, + "flos": 1003985957376.0, + "grad_norm": 0.10311001825576924, + "language_loss": 0.82024419, + "learning_rate": 0.0008474437100591201, + "loss": 0.8311128, + "num_input_tokens_seen": 119412352, + "router_z_loss_mlp": 0.34277344, + "step": 1444, + "time_per_iteration": 3.282383918762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091274, + "balance_loss_mlp": 1.05517697, + "epoch": 0.2779915352058484, + "flos": 550005147648.0, + "grad_norm": 0.05151300271624721, + "language_loss": 0.85496646, + "learning_rate": 0.0008472196067779898, + "loss": 0.86587918, + "num_input_tokens_seen": 119484464, + "router_z_loss_mlp": 0.36108398, + "step": 1445, + "time_per_iteration": 2.703263998031616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092087, + "balance_loss_mlp": 1.05591917, + "epoch": 0.278183916891112, + "flos": 873444930048.0, + "grad_norm": 0.06736388569990436, + "language_loss": 0.85432607, + "learning_rate": 0.0008469953686942531, + "loss": 0.86524689, + "num_input_tokens_seen": 119557280, + "router_z_loss_mlp": 0.36181641, + "step": 1446, + "time_per_iteration": 3.0834743976593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087224, + "balance_loss_mlp": 1.05305839, + "epoch": 0.2783762985763755, + "flos": 623782978560.0, + "grad_norm": 0.06536474240751361, + "language_loss": 0.83167183, + "learning_rate": 0.0008467709958949668, + "loss": 0.84254414, + "num_input_tokens_seen": 119631232, + "router_z_loss_mlp": 0.34204102, + "step": 1447, + "time_per_iteration": 2.737135887145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087199, + "balance_loss_mlp": 1.05362952, + "epoch": 0.2785686802616391, + "flos": 581571021312.0, + "grad_norm": 0.057056565872365954, + "language_loss": 0.85917461, + "learning_rate": 0.0008465464884672403, + "loss": 0.87004662, + "num_input_tokens_seen": 119700224, + "router_z_loss_mlp": 0.33569336, + "step": 1448, + "time_per_iteration": 2.6771810054779053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087224, + "balance_loss_mlp": 1.05346394, + "epoch": 0.27876106194690264, + "flos": 587032980480.0, + "grad_norm": 0.06237565084734976, + "language_loss": 0.85356677, + "learning_rate": 0.0008463218464982348, + "loss": 0.86443901, + "num_input_tokens_seen": 119781376, + "router_z_loss_mlp": 0.33789062, + "step": 1449, + "time_per_iteration": 2.799407720565796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086336, + "balance_loss_mlp": 1.05228984, + "epoch": 0.27895344363216623, + "flos": 875621574144.0, + "grad_norm": 0.06450477685794259, + "language_loss": 0.87800258, + "learning_rate": 0.0008460970700751645, + "loss": 0.88886595, + "num_input_tokens_seen": 119856672, + "router_z_loss_mlp": 0.34057617, + "step": 1450, + "time_per_iteration": 3.0517759323120117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086942, + "balance_loss_mlp": 1.05322921, + "epoch": 0.27914582531742976, + "flos": 603630005760.0, + "grad_norm": 0.06893143963997089, + "language_loss": 0.87761652, + "learning_rate": 0.000845872159285295, + "loss": 0.88848597, + "num_input_tokens_seen": 119929008, + "router_z_loss_mlp": 0.33740234, + "step": 1451, + "time_per_iteration": 2.6964316368103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042481, + "balance_loss_mlp": 1.0276041, + "epoch": 0.27933820700269335, + "flos": 1496892953088.0, + "grad_norm": 0.0242162718076618, + "language_loss": 0.77766848, + "learning_rate": 0.0008456471142159447, + "loss": 0.78809333, + "num_input_tokens_seen": 120164032, + "router_z_loss_mlp": 0.1484375, + "step": 1452, + "time_per_iteration": 4.936378717422485 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085289, + "balance_loss_mlp": 1.05162406, + "epoch": 0.2795305886879569, + "flos": 1031445854208.0, + "grad_norm": 0.05721806240601363, + "language_loss": 0.86067116, + "learning_rate": 0.0008454219349544836, + "loss": 0.87152404, + "num_input_tokens_seen": 120246784, + "router_z_loss_mlp": 0.33691406, + "step": 1453, + "time_per_iteration": 3.336336135864258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086797, + "balance_loss_mlp": 1.053442, + "epoch": 0.27972297037322047, + "flos": 606766934016.0, + "grad_norm": 0.056433536115445035, + "language_loss": 0.81829166, + "learning_rate": 0.000845196621588334, + "loss": 0.82915968, + "num_input_tokens_seen": 120318208, + "router_z_loss_mlp": 0.33374023, + "step": 1454, + "time_per_iteration": 2.7415192127227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088727, + "balance_loss_mlp": 1.05475271, + "epoch": 0.27991535205848406, + "flos": 630085948416.0, + "grad_norm": 0.05700257056170363, + "language_loss": 0.76605666, + "learning_rate": 0.0008449711742049706, + "loss": 0.77694392, + "num_input_tokens_seen": 120393248, + "router_z_loss_mlp": 0.33984375, + "step": 1455, + "time_per_iteration": 2.755082130432129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093567, + "balance_loss_mlp": 1.06035542, + "epoch": 0.2801077337437476, + "flos": 549034689024.0, + "grad_norm": 0.056412270826162, + "language_loss": 0.83750427, + "learning_rate": 0.0008447455928919196, + "loss": 0.84843993, + "num_input_tokens_seen": 120461040, + "router_z_loss_mlp": 0.33227539, + "step": 1456, + "time_per_iteration": 2.601306438446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097544, + "balance_loss_mlp": 1.06423688, + "epoch": 0.2803001154290112, + "flos": 486516989952.0, + "grad_norm": 0.08664389404831466, + "language_loss": 0.86875856, + "learning_rate": 0.0008445198777367595, + "loss": 0.87973404, + "num_input_tokens_seen": 120530400, + "router_z_loss_mlp": 0.33325195, + "step": 1457, + "time_per_iteration": 2.5534915924072266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106361, + "balance_loss_mlp": 1.07267249, + "epoch": 0.2804924971142747, + "flos": 521820693504.0, + "grad_norm": 0.060155581105879694, + "language_loss": 0.8096568, + "learning_rate": 0.0008442940288271208, + "loss": 0.82072043, + "num_input_tokens_seen": 120598304, + "router_z_loss_mlp": 0.33691406, + "step": 1458, + "time_per_iteration": 2.6646370887756348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108311, + "balance_loss_mlp": 1.07459903, + "epoch": 0.2806848787995383, + "flos": 527410690560.0, + "grad_norm": 0.05492641307724046, + "language_loss": 0.86995763, + "learning_rate": 0.0008440680462506856, + "loss": 0.88104069, + "num_input_tokens_seen": 120675712, + "router_z_loss_mlp": 0.33740234, + "step": 1459, + "time_per_iteration": 2.793306589126587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103222, + "balance_loss_mlp": 1.06927133, + "epoch": 0.2808772604848018, + "flos": 485246785536.0, + "grad_norm": 0.053370474172872176, + "language_loss": 0.86799729, + "learning_rate": 0.0008438419300951883, + "loss": 0.87902945, + "num_input_tokens_seen": 120746544, + "router_z_loss_mlp": 0.33984375, + "step": 1460, + "time_per_iteration": 2.6732945442199707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098403, + "balance_loss_mlp": 1.06488156, + "epoch": 0.2810696421700654, + "flos": 617840801280.0, + "grad_norm": 0.06081455520295947, + "language_loss": 0.86599934, + "learning_rate": 0.0008436156804484148, + "loss": 0.87698334, + "num_input_tokens_seen": 120823520, + "router_z_loss_mlp": 0.33544922, + "step": 1461, + "time_per_iteration": 2.768385410308838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087899, + "balance_loss_mlp": 1.05397177, + "epoch": 0.28126202385532895, + "flos": 454521922560.0, + "grad_norm": 0.061036272851527865, + "language_loss": 0.88221931, + "learning_rate": 0.0008433892973982031, + "loss": 0.89309829, + "num_input_tokens_seen": 120889568, + "router_z_loss_mlp": 0.33959961, + "step": 1462, + "time_per_iteration": 2.502269983291626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108859, + "balance_loss_mlp": 1.0546149, + "epoch": 0.28145440554059253, + "flos": 530447284224.0, + "grad_norm": 0.06533100110399645, + "language_loss": 0.85006338, + "learning_rate": 0.0008431627810324431, + "loss": 0.86094928, + "num_input_tokens_seen": 120958480, + "router_z_loss_mlp": 0.34008789, + "step": 1463, + "time_per_iteration": 2.6481637954711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097854, + "balance_loss_mlp": 1.06344974, + "epoch": 0.2816467872258561, + "flos": 451996070400.0, + "grad_norm": 0.053948569536927254, + "language_loss": 0.81259125, + "learning_rate": 0.000842936131439076, + "loss": 0.82356977, + "num_input_tokens_seen": 121028032, + "router_z_loss_mlp": 0.34423828, + "step": 1464, + "time_per_iteration": 2.598619222640991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100723, + "balance_loss_mlp": 1.06665313, + "epoch": 0.28183916891111965, + "flos": 472464755712.0, + "grad_norm": 0.06117554261618067, + "language_loss": 0.88043475, + "learning_rate": 0.0008427093487060951, + "loss": 0.89144206, + "num_input_tokens_seen": 121099280, + "router_z_loss_mlp": 0.34106445, + "step": 1465, + "time_per_iteration": 2.611067533493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092883, + "balance_loss_mlp": 1.05917072, + "epoch": 0.28203155059638324, + "flos": 556770806784.0, + "grad_norm": 0.05001896034653533, + "language_loss": 0.84742111, + "learning_rate": 0.000842482432921545, + "loss": 0.85834992, + "num_input_tokens_seen": 121180240, + "router_z_loss_mlp": 0.33740234, + "step": 1466, + "time_per_iteration": 2.8155059814453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109283, + "balance_loss_mlp": 1.05990458, + "epoch": 0.28222393228164677, + "flos": 416756385792.0, + "grad_norm": 0.06017010781955974, + "language_loss": 0.87132335, + "learning_rate": 0.0008422553841735225, + "loss": 0.88225162, + "num_input_tokens_seen": 121242736, + "router_z_loss_mlp": 0.3293457, + "step": 1467, + "time_per_iteration": 2.459348201751709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091932, + "balance_loss_mlp": 1.05731392, + "epoch": 0.28241631396691036, + "flos": 604629577728.0, + "grad_norm": 0.060074700521020694, + "language_loss": 0.84810078, + "learning_rate": 0.0008420282025501757, + "loss": 0.85902011, + "num_input_tokens_seen": 121319248, + "router_z_loss_mlp": 0.34643555, + "step": 1468, + "time_per_iteration": 2.7499678134918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086806, + "balance_loss_mlp": 1.05275989, + "epoch": 0.2826086956521739, + "flos": 572698529280.0, + "grad_norm": 0.05717030328031113, + "language_loss": 0.854882, + "learning_rate": 0.0008418008881397043, + "loss": 0.86575013, + "num_input_tokens_seen": 121392064, + "router_z_loss_mlp": 0.34082031, + "step": 1469, + "time_per_iteration": 2.6512861251831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080943, + "balance_loss_mlp": 1.04677796, + "epoch": 0.2828010773374375, + "flos": 842367886848.0, + "grad_norm": 0.05184982716140645, + "language_loss": 0.82590878, + "learning_rate": 0.0008415734410303595, + "loss": 0.8367182, + "num_input_tokens_seen": 121475984, + "router_z_loss_mlp": 0.34204102, + "step": 1470, + "time_per_iteration": 3.1787467002868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085547, + "balance_loss_mlp": 1.05214453, + "epoch": 0.28299345902270107, + "flos": 542402860032.0, + "grad_norm": 0.04590644458835405, + "language_loss": 0.90709066, + "learning_rate": 0.0008413458613104444, + "loss": 0.9179461, + "num_input_tokens_seen": 121551024, + "router_z_loss_mlp": 0.33447266, + "step": 1471, + "time_per_iteration": 2.6650707721710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092285, + "balance_loss_mlp": 1.05780995, + "epoch": 0.2831858407079646, + "flos": 571320635904.0, + "grad_norm": 0.05367648266979066, + "language_loss": 0.82737631, + "learning_rate": 0.0008411181490683129, + "loss": 0.83829916, + "num_input_tokens_seen": 121624528, + "router_z_loss_mlp": 0.34472656, + "step": 1472, + "time_per_iteration": 2.7423322200775146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104233, + "balance_loss_mlp": 1.06909025, + "epoch": 0.2833782223932282, + "flos": 763491861504.0, + "grad_norm": 0.05498194123694656, + "language_loss": 0.82467097, + "learning_rate": 0.0008408903043923707, + "loss": 0.83571333, + "num_input_tokens_seen": 121706736, + "router_z_loss_mlp": 0.35180664, + "step": 1473, + "time_per_iteration": 2.991787910461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114011, + "balance_loss_mlp": 1.07810485, + "epoch": 0.2835706040784917, + "flos": 538793068032.0, + "grad_norm": 0.05681509946110844, + "language_loss": 0.81401253, + "learning_rate": 0.0008406623273710754, + "loss": 0.82515264, + "num_input_tokens_seen": 121773008, + "router_z_loss_mlp": 0.35913086, + "step": 1474, + "time_per_iteration": 2.58866286277771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112011, + "balance_loss_mlp": 1.07732141, + "epoch": 0.2837629857637553, + "flos": 530329420800.0, + "grad_norm": 0.06008709036576614, + "language_loss": 0.82883334, + "learning_rate": 0.0008404342180929351, + "loss": 0.83995342, + "num_input_tokens_seen": 121840016, + "router_z_loss_mlp": 0.34716797, + "step": 1475, + "time_per_iteration": 2.636383295059204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112842, + "balance_loss_mlp": 1.07834268, + "epoch": 0.28395536744901884, + "flos": 539763526656.0, + "grad_norm": 0.06514959519071023, + "language_loss": 0.81725156, + "learning_rate": 0.00084020597664651, + "loss": 0.82837999, + "num_input_tokens_seen": 121915008, + "router_z_loss_mlp": 0.34521484, + "step": 1476, + "time_per_iteration": 2.7587718963623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106069, + "balance_loss_mlp": 1.0697813, + "epoch": 0.2841477491342824, + "flos": 573344510976.0, + "grad_norm": 0.0608139165355994, + "language_loss": 0.84204602, + "learning_rate": 0.0008399776031204111, + "loss": 0.85310674, + "num_input_tokens_seen": 121987456, + "router_z_loss_mlp": 0.36303711, + "step": 1477, + "time_per_iteration": 2.7376203536987305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097711, + "balance_loss_mlp": 1.06263912, + "epoch": 0.28434013081954596, + "flos": 571802264064.0, + "grad_norm": 0.06275845169868324, + "language_loss": 0.8026123, + "learning_rate": 0.0008397490976033009, + "loss": 0.81358939, + "num_input_tokens_seen": 122058720, + "router_z_loss_mlp": 0.35083008, + "step": 1478, + "time_per_iteration": 2.6391618251800537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0103776, + "balance_loss_mlp": 1.02412283, + "epoch": 0.28453251250480954, + "flos": 1552554832896.0, + "grad_norm": 0.016614249421738093, + "language_loss": 0.77879643, + "learning_rate": 0.000839520460183893, + "loss": 0.78917408, + "num_input_tokens_seen": 122285792, + "router_z_loss_mlp": 0.13671875, + "step": 1479, + "time_per_iteration": 4.730362176895142 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079858, + "balance_loss_mlp": 1.04550207, + "epoch": 0.28472489419007313, + "flos": 748720862208.0, + "grad_norm": 0.04873312803653651, + "language_loss": 0.85529596, + "learning_rate": 0.0008392916909509525, + "loss": 0.86609453, + "num_input_tokens_seen": 122366608, + "router_z_loss_mlp": 0.34399414, + "step": 1480, + "time_per_iteration": 3.0429892539978027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083924, + "balance_loss_mlp": 1.0495913, + "epoch": 0.28491727587533666, + "flos": 489914376192.0, + "grad_norm": 0.056617404906403615, + "language_loss": 0.85149431, + "learning_rate": 0.0008390627899932954, + "loss": 0.86233348, + "num_input_tokens_seen": 122435536, + "router_z_loss_mlp": 0.34375, + "step": 1481, + "time_per_iteration": 2.6355843544006348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083301, + "balance_loss_mlp": 1.04818201, + "epoch": 0.28510965756060025, + "flos": 728671196160.0, + "grad_norm": 0.06013951169928809, + "language_loss": 0.88358414, + "learning_rate": 0.000838833757399789, + "loss": 0.89441717, + "num_input_tokens_seen": 122515584, + "router_z_loss_mlp": 0.3515625, + "step": 1482, + "time_per_iteration": 2.9198856353759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088409, + "balance_loss_mlp": 1.05357623, + "epoch": 0.2853020392458638, + "flos": 551300083200.0, + "grad_norm": 0.06378715850004578, + "language_loss": 0.80512154, + "learning_rate": 0.0008386045932593515, + "loss": 0.81600565, + "num_input_tokens_seen": 122585552, + "router_z_loss_mlp": 0.34887695, + "step": 1483, + "time_per_iteration": 2.665919065475464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087283, + "balance_loss_mlp": 1.05233121, + "epoch": 0.28549442093112737, + "flos": 754456425984.0, + "grad_norm": 0.06049898751226662, + "language_loss": 0.86304945, + "learning_rate": 0.0008383752976609525, + "loss": 0.87392229, + "num_input_tokens_seen": 122658928, + "router_z_loss_mlp": 0.34960938, + "step": 1484, + "time_per_iteration": 2.9113876819610596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081674, + "balance_loss_mlp": 1.04586315, + "epoch": 0.2856868026163909, + "flos": 538311439872.0, + "grad_norm": 0.05363431349597561, + "language_loss": 0.79897112, + "learning_rate": 0.0008381458706936123, + "loss": 0.80978787, + "num_input_tokens_seen": 122729056, + "router_z_loss_mlp": 0.3581543, + "step": 1485, + "time_per_iteration": 2.715182065963745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082792, + "balance_loss_mlp": 1.0470289, + "epoch": 0.2858791843016545, + "flos": 583487207424.0, + "grad_norm": 0.055785658857036256, + "language_loss": 0.8776381, + "learning_rate": 0.0008379163124464025, + "loss": 0.888466, + "num_input_tokens_seen": 122802832, + "router_z_loss_mlp": 0.35766602, + "step": 1486, + "time_per_iteration": 2.713412284851074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083174, + "balance_loss_mlp": 1.04881775, + "epoch": 0.286071565986918, + "flos": 644503357440.0, + "grad_norm": 0.05967072286491994, + "language_loss": 0.76593089, + "learning_rate": 0.0008376866230084452, + "loss": 0.7767626, + "num_input_tokens_seen": 122881328, + "router_z_loss_mlp": 0.34399414, + "step": 1487, + "time_per_iteration": 2.812953472137451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091632, + "balance_loss_mlp": 1.05522537, + "epoch": 0.2862639476721816, + "flos": 491120561664.0, + "grad_norm": 0.06413337589788286, + "language_loss": 0.85965335, + "learning_rate": 0.000837456802468914, + "loss": 0.87056965, + "num_input_tokens_seen": 122949680, + "router_z_loss_mlp": 0.36401367, + "step": 1488, + "time_per_iteration": 2.5974318981170654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096408, + "balance_loss_mlp": 1.06050217, + "epoch": 0.2864563293574452, + "flos": 521363796480.0, + "grad_norm": 0.06049840128310572, + "language_loss": 0.85439187, + "learning_rate": 0.0008372268509170331, + "loss": 0.86535597, + "num_input_tokens_seen": 123024736, + "router_z_loss_mlp": 0.35888672, + "step": 1489, + "time_per_iteration": 2.6646056175231934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100143, + "balance_loss_mlp": 1.06478548, + "epoch": 0.2866487110427087, + "flos": 546834723840.0, + "grad_norm": 0.05582745965585505, + "language_loss": 0.84845203, + "learning_rate": 0.0008369967684420779, + "loss": 0.85945344, + "num_input_tokens_seen": 123097344, + "router_z_loss_mlp": 0.35424805, + "step": 1490, + "time_per_iteration": 2.737180471420288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094742, + "balance_loss_mlp": 1.05902719, + "epoch": 0.2868410927279723, + "flos": 481977437184.0, + "grad_norm": 0.0702351654670911, + "language_loss": 0.84684229, + "learning_rate": 0.0008367665551333736, + "loss": 0.85778964, + "num_input_tokens_seen": 123166240, + "router_z_loss_mlp": 0.35717773, + "step": 1491, + "time_per_iteration": 2.591179847717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095437, + "balance_loss_mlp": 1.05807662, + "epoch": 0.28703347441323585, + "flos": 724578365952.0, + "grad_norm": 0.0690733570245185, + "language_loss": 0.85732669, + "learning_rate": 0.0008365362110802977, + "loss": 0.86828107, + "num_input_tokens_seen": 123238160, + "router_z_loss_mlp": 0.37329102, + "step": 1492, + "time_per_iteration": 2.8586251735687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083578, + "balance_loss_mlp": 1.04733849, + "epoch": 0.28722585609849943, + "flos": 634670581248.0, + "grad_norm": 0.059898504183233336, + "language_loss": 0.82604659, + "learning_rate": 0.0008363057363722773, + "loss": 0.83688229, + "num_input_tokens_seen": 123319504, + "router_z_loss_mlp": 0.36254883, + "step": 1493, + "time_per_iteration": 2.8491427898406982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076927, + "balance_loss_mlp": 1.04264212, + "epoch": 0.28741823778376296, + "flos": 509974216704.0, + "grad_norm": 0.05796804627405179, + "language_loss": 0.84095198, + "learning_rate": 0.0008360751310987906, + "loss": 0.85172129, + "num_input_tokens_seen": 123387008, + "router_z_loss_mlp": 0.34301758, + "step": 1494, + "time_per_iteration": 2.5735158920288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077369, + "balance_loss_mlp": 1.04294157, + "epoch": 0.28761061946902655, + "flos": 603458297856.0, + "grad_norm": 0.07368534281083228, + "language_loss": 0.85552645, + "learning_rate": 0.0008358443953493666, + "loss": 0.86630011, + "num_input_tokens_seen": 123471056, + "router_z_loss_mlp": 0.34472656, + "step": 1495, + "time_per_iteration": 2.8492400646209717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078466, + "balance_loss_mlp": 1.04260767, + "epoch": 0.28780300115429014, + "flos": 406977454080.0, + "grad_norm": 0.061458136593000166, + "language_loss": 0.88553399, + "learning_rate": 0.0008356135292135851, + "loss": 0.89631861, + "num_input_tokens_seen": 123535024, + "router_z_loss_mlp": 0.35864258, + "step": 1496, + "time_per_iteration": 2.499234676361084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109055, + "balance_loss_mlp": 1.05428672, + "epoch": 0.28799538283955367, + "flos": 374726310912.0, + "grad_norm": 0.06023187099093886, + "language_loss": 0.92244387, + "learning_rate": 0.0008353825327810758, + "loss": 0.93334937, + "num_input_tokens_seen": 123596224, + "router_z_loss_mlp": 0.36230469, + "step": 1497, + "time_per_iteration": 2.4068801403045654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083025, + "balance_loss_mlp": 1.04761958, + "epoch": 0.28818776452481726, + "flos": 591645316608.0, + "grad_norm": 0.050935971597675156, + "language_loss": 0.81914794, + "learning_rate": 0.00083515140614152, + "loss": 0.82997811, + "num_input_tokens_seen": 123668640, + "router_z_loss_mlp": 0.35473633, + "step": 1498, + "time_per_iteration": 2.7172293663024902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085079, + "balance_loss_mlp": 1.05012727, + "epoch": 0.2883801462100808, + "flos": 534819511296.0, + "grad_norm": 0.055380500747477406, + "language_loss": 0.86671853, + "learning_rate": 0.0008349201493846485, + "loss": 0.87756932, + "num_input_tokens_seen": 123740816, + "router_z_loss_mlp": 0.34985352, + "step": 1499, + "time_per_iteration": 2.666877508163452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088163, + "balance_loss_mlp": 1.05268669, + "epoch": 0.2885725278953444, + "flos": 479850255360.0, + "grad_norm": 0.06392675802491345, + "language_loss": 0.89344347, + "learning_rate": 0.0008346887626002432, + "loss": 0.90432513, + "num_input_tokens_seen": 123805968, + "router_z_loss_mlp": 0.35473633, + "step": 1500, + "time_per_iteration": 2.547353744506836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081111, + "balance_loss_mlp": 1.04613519, + "epoch": 0.2887649095806079, + "flos": 463798877184.0, + "grad_norm": 0.050375470508879826, + "language_loss": 0.86195928, + "learning_rate": 0.000834457245878137, + "loss": 0.87277037, + "num_input_tokens_seen": 123876576, + "router_z_loss_mlp": 0.35009766, + "step": 1501, + "time_per_iteration": 2.6108102798461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076981, + "balance_loss_mlp": 1.04317355, + "epoch": 0.2889572912658715, + "flos": 930631527936.0, + "grad_norm": 0.05668037017333152, + "language_loss": 0.81365681, + "learning_rate": 0.000834225599308212, + "loss": 0.82442665, + "num_input_tokens_seen": 123967664, + "router_z_loss_mlp": 0.33837891, + "step": 1502, + "time_per_iteration": 3.2222447395324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085232, + "balance_loss_mlp": 1.04994583, + "epoch": 0.28914967295113503, + "flos": 569848200192.0, + "grad_norm": 0.05132223508893719, + "language_loss": 0.85018057, + "learning_rate": 0.0008339938229804016, + "loss": 0.8610329, + "num_input_tokens_seen": 124039680, + "router_z_loss_mlp": 0.35327148, + "step": 1503, + "time_per_iteration": 2.698528289794922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032992, + "balance_loss_mlp": 1.01945031, + "epoch": 0.2893420546363986, + "flos": 1485803119104.0, + "grad_norm": 0.02573157997511775, + "language_loss": 0.75434822, + "learning_rate": 0.0008337619169846895, + "loss": 0.76467812, + "num_input_tokens_seen": 124278848, + "router_z_loss_mlp": 0.13574219, + "step": 1504, + "time_per_iteration": 4.950274467468262 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083767, + "balance_loss_mlp": 1.04793239, + "epoch": 0.2895344363216622, + "flos": 469938903552.0, + "grad_norm": 0.06568085425348943, + "language_loss": 0.84119928, + "learning_rate": 0.0008335298814111094, + "loss": 0.85203701, + "num_input_tokens_seen": 124346736, + "router_z_loss_mlp": 0.35864258, + "step": 1505, + "time_per_iteration": 2.542043924331665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085418, + "balance_loss_mlp": 1.05089498, + "epoch": 0.28972681800692573, + "flos": 647909508096.0, + "grad_norm": 0.06591449016003405, + "language_loss": 0.87860626, + "learning_rate": 0.0008332977163497455, + "loss": 0.88946044, + "num_input_tokens_seen": 124420816, + "router_z_loss_mlp": 0.34570312, + "step": 1506, + "time_per_iteration": 2.810399293899536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087351, + "balance_loss_mlp": 1.05163622, + "epoch": 0.2899191996921893, + "flos": 571955033088.0, + "grad_norm": 0.054529888005095714, + "language_loss": 0.83185649, + "learning_rate": 0.0008330654218907325, + "loss": 0.84272999, + "num_input_tokens_seen": 124490480, + "router_z_loss_mlp": 0.35742188, + "step": 1507, + "time_per_iteration": 2.6968414783477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084817, + "balance_loss_mlp": 1.04981744, + "epoch": 0.29011158137745285, + "flos": 661037773824.0, + "grad_norm": 0.1280653735040032, + "language_loss": 0.81777966, + "learning_rate": 0.0008328329981242548, + "loss": 0.82862782, + "num_input_tokens_seen": 124564960, + "router_z_loss_mlp": 0.3503418, + "step": 1508, + "time_per_iteration": 2.886396884918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089679, + "balance_loss_mlp": 1.05441689, + "epoch": 0.29030396306271644, + "flos": 535933974528.0, + "grad_norm": 0.060234790533374126, + "language_loss": 0.87937206, + "learning_rate": 0.0008326004451405475, + "loss": 0.89026886, + "num_input_tokens_seen": 124637424, + "router_z_loss_mlp": 0.3527832, + "step": 1509, + "time_per_iteration": 2.7797772884368896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096033, + "balance_loss_mlp": 1.06148589, + "epoch": 0.29049634474798, + "flos": 511707110400.0, + "grad_norm": 0.05385470227261208, + "language_loss": 0.82548428, + "learning_rate": 0.0008323677630298957, + "loss": 0.83644462, + "num_input_tokens_seen": 124704832, + "router_z_loss_mlp": 0.34594727, + "step": 1510, + "time_per_iteration": 2.5542855262756348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093546, + "balance_loss_mlp": 1.05892766, + "epoch": 0.29068872643324356, + "flos": 613454017536.0, + "grad_norm": 0.05556182475666109, + "language_loss": 0.85001689, + "learning_rate": 0.0008321349518826345, + "loss": 0.86095232, + "num_input_tokens_seen": 124779600, + "router_z_loss_mlp": 0.34643555, + "step": 1511, + "time_per_iteration": 2.7849388122558594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093876, + "balance_loss_mlp": 1.05878115, + "epoch": 0.2908811081185071, + "flos": 546164011008.0, + "grad_norm": 0.07046084545113683, + "language_loss": 0.94823933, + "learning_rate": 0.0008319020117891491, + "loss": 0.95917809, + "num_input_tokens_seen": 124844128, + "router_z_loss_mlp": 0.35131836, + "step": 1512, + "time_per_iteration": 2.5936031341552734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090975, + "balance_loss_mlp": 1.05485463, + "epoch": 0.2910734898037707, + "flos": 604516096512.0, + "grad_norm": 0.06307487928884016, + "language_loss": 0.87063539, + "learning_rate": 0.0008316689428398751, + "loss": 0.88154513, + "num_input_tokens_seen": 124915376, + "router_z_loss_mlp": 0.36108398, + "step": 1513, + "time_per_iteration": 2.6774067878723145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082761, + "balance_loss_mlp": 1.04792798, + "epoch": 0.29126587148903427, + "flos": 574383370752.0, + "grad_norm": 0.043578668947666564, + "language_loss": 0.88254529, + "learning_rate": 0.0008314357451252979, + "loss": 0.89337289, + "num_input_tokens_seen": 124995504, + "router_z_loss_mlp": 0.34887695, + "step": 1514, + "time_per_iteration": 2.7561941146850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086548, + "balance_loss_mlp": 1.05092812, + "epoch": 0.2914582531742978, + "flos": 570802692096.0, + "grad_norm": 0.06240160889449628, + "language_loss": 0.87564558, + "learning_rate": 0.0008312024187359527, + "loss": 0.88651109, + "num_input_tokens_seen": 125064192, + "router_z_loss_mlp": 0.35644531, + "step": 1515, + "time_per_iteration": 2.636056900024414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084088, + "balance_loss_mlp": 1.04942179, + "epoch": 0.2916506348595614, + "flos": 730523363328.0, + "grad_norm": 0.06185972429295104, + "language_loss": 0.87361014, + "learning_rate": 0.000830968963762425, + "loss": 0.88445103, + "num_input_tokens_seen": 125150560, + "router_z_loss_mlp": 0.34716797, + "step": 1516, + "time_per_iteration": 3.021732807159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091474, + "balance_loss_mlp": 1.05528235, + "epoch": 0.2918430165448249, + "flos": 510220118016.0, + "grad_norm": 0.05583453201751925, + "language_loss": 0.83947027, + "learning_rate": 0.0008307353802953497, + "loss": 0.85038507, + "num_input_tokens_seen": 125219264, + "router_z_loss_mlp": 0.36206055, + "step": 1517, + "time_per_iteration": 2.6659955978393555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100836, + "balance_loss_mlp": 1.06318951, + "epoch": 0.2920353982300885, + "flos": 630096122880.0, + "grad_norm": 0.04472517729516854, + "language_loss": 0.86110896, + "learning_rate": 0.0008305016684254125, + "loss": 0.87211728, + "num_input_tokens_seen": 125301904, + "router_z_loss_mlp": 0.37646484, + "step": 1518, + "time_per_iteration": 2.7896409034729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098986, + "balance_loss_mlp": 1.06153059, + "epoch": 0.29222777991535204, + "flos": 501411644928.0, + "grad_norm": 0.055409034097420505, + "language_loss": 0.86932153, + "learning_rate": 0.0008302678282433479, + "loss": 0.88031137, + "num_input_tokens_seen": 125367712, + "router_z_loss_mlp": 0.37451172, + "step": 1519, + "time_per_iteration": 2.585256814956665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095676, + "balance_loss_mlp": 1.05891216, + "epoch": 0.2924201616006156, + "flos": 486522782208.0, + "grad_norm": 0.057505891705300044, + "language_loss": 0.85011005, + "learning_rate": 0.0008300338598399411, + "loss": 0.86106682, + "num_input_tokens_seen": 125437648, + "router_z_loss_mlp": 0.36791992, + "step": 1520, + "time_per_iteration": 2.6471352577209473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109825, + "balance_loss_mlp": 1.07210708, + "epoch": 0.2926125432858792, + "flos": 476211350016.0, + "grad_norm": 0.05302442020038178, + "language_loss": 0.9456166, + "learning_rate": 0.0008297997633060263, + "loss": 0.95671487, + "num_input_tokens_seen": 125502432, + "router_z_loss_mlp": 0.37719727, + "step": 1521, + "time_per_iteration": 2.547457695007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106499, + "balance_loss_mlp": 1.06987774, + "epoch": 0.29280492497114274, + "flos": 676379151360.0, + "grad_norm": 0.054888704647412474, + "language_loss": 0.85310549, + "learning_rate": 0.0008295655387324883, + "loss": 0.86417043, + "num_input_tokens_seen": 125575424, + "router_z_loss_mlp": 0.36645508, + "step": 1522, + "time_per_iteration": 2.822557210922241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105716, + "balance_loss_mlp": 1.0686183, + "epoch": 0.29299730665640633, + "flos": 458175384576.0, + "grad_norm": 0.055715580232585875, + "language_loss": 0.85025144, + "learning_rate": 0.0008293311862102609, + "loss": 0.86130863, + "num_input_tokens_seen": 125639040, + "router_z_loss_mlp": 0.37084961, + "step": 1523, + "time_per_iteration": 2.5033791065216064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103076, + "balance_loss_mlp": 1.06795669, + "epoch": 0.29318968834166986, + "flos": 446343464448.0, + "grad_norm": 0.0584953499596263, + "language_loss": 0.88722956, + "learning_rate": 0.0008290967058303275, + "loss": 0.89826035, + "num_input_tokens_seen": 125701712, + "router_z_loss_mlp": 0.3515625, + "step": 1524, + "time_per_iteration": 2.5981156826019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102477, + "balance_loss_mlp": 1.06630898, + "epoch": 0.29338207002693345, + "flos": 450085676544.0, + "grad_norm": 0.05072610752657829, + "language_loss": 0.86932707, + "learning_rate": 0.0008288620976837219, + "loss": 0.88035178, + "num_input_tokens_seen": 125765088, + "router_z_loss_mlp": 0.36181641, + "step": 1525, + "time_per_iteration": 2.522019863128662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092716, + "balance_loss_mlp": 1.05623853, + "epoch": 0.293574451712197, + "flos": 502027103232.0, + "grad_norm": 0.05230718040210392, + "language_loss": 0.83001733, + "learning_rate": 0.000828627361861527, + "loss": 0.84094453, + "num_input_tokens_seen": 125831328, + "router_z_loss_mlp": 0.36474609, + "step": 1526, + "time_per_iteration": 2.559201955795288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085779, + "balance_loss_mlp": 1.05051661, + "epoch": 0.29376683339746057, + "flos": 696158184960.0, + "grad_norm": 0.071892180297548, + "language_loss": 0.8465147, + "learning_rate": 0.0008283924984548752, + "loss": 0.85737246, + "num_input_tokens_seen": 125903664, + "router_z_loss_mlp": 0.3527832, + "step": 1527, + "time_per_iteration": 2.8108816146850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079347, + "balance_loss_mlp": 1.04494321, + "epoch": 0.2939592150827241, + "flos": 478353088512.0, + "grad_norm": 0.05128355551395112, + "language_loss": 0.85087478, + "learning_rate": 0.0008281575075549485, + "loss": 0.86166823, + "num_input_tokens_seen": 125971856, + "router_z_loss_mlp": 0.34399414, + "step": 1528, + "time_per_iteration": 2.576814889907837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051452, + "balance_loss_mlp": 1.04057992, + "epoch": 0.2941515967679877, + "flos": 1484482042368.0, + "grad_norm": 0.031732851839211505, + "language_loss": 0.77352691, + "learning_rate": 0.000827922389252979, + "loss": 0.7840414, + "num_input_tokens_seen": 126183968, + "router_z_loss_mlp": 0.10888672, + "step": 1529, + "time_per_iteration": 4.662023067474365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089321, + "balance_loss_mlp": 1.0547502, + "epoch": 0.2943439784532513, + "flos": 673848916992.0, + "grad_norm": 0.06453398347295829, + "language_loss": 0.90086716, + "learning_rate": 0.0008276871436402469, + "loss": 0.91176039, + "num_input_tokens_seen": 126254448, + "router_z_loss_mlp": 0.34619141, + "step": 1530, + "time_per_iteration": 2.795783758163452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097742, + "balance_loss_mlp": 1.06460166, + "epoch": 0.2945363601385148, + "flos": 576031896576.0, + "grad_norm": 0.05195467848041957, + "language_loss": 0.87790835, + "learning_rate": 0.000827451770808083, + "loss": 0.88888574, + "num_input_tokens_seen": 126328208, + "router_z_loss_mlp": 0.33154297, + "step": 1531, + "time_per_iteration": 2.6522414684295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100953, + "balance_loss_mlp": 1.06628692, + "epoch": 0.2947287418237784, + "flos": 480416251392.0, + "grad_norm": 0.05572078055736918, + "language_loss": 0.83276248, + "learning_rate": 0.0008272162708478674, + "loss": 0.84377199, + "num_input_tokens_seen": 126396464, + "router_z_loss_mlp": 0.34692383, + "step": 1532, + "time_per_iteration": 2.5960874557495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098518, + "balance_loss_mlp": 1.06459141, + "epoch": 0.2949211235090419, + "flos": 557917355520.0, + "grad_norm": 0.05232404820193651, + "language_loss": 0.86136103, + "learning_rate": 0.000826980643851029, + "loss": 0.87234622, + "num_input_tokens_seen": 126468960, + "router_z_loss_mlp": 0.33959961, + "step": 1533, + "time_per_iteration": 2.671867609024048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106353, + "balance_loss_mlp": 1.07147205, + "epoch": 0.2951135051943055, + "flos": 483646311936.0, + "grad_norm": 0.06650262295584625, + "language_loss": 0.84864676, + "learning_rate": 0.0008267448899090464, + "loss": 0.85971034, + "num_input_tokens_seen": 126536496, + "router_z_loss_mlp": 0.34887695, + "step": 1534, + "time_per_iteration": 2.5133543014526367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095921, + "balance_loss_mlp": 1.0604682, + "epoch": 0.29530588687956905, + "flos": 550015322112.0, + "grad_norm": 0.05711998360561463, + "language_loss": 0.80980158, + "learning_rate": 0.0008265090091134473, + "loss": 0.82076073, + "num_input_tokens_seen": 126614048, + "router_z_loss_mlp": 0.35473633, + "step": 1535, + "time_per_iteration": 2.8528778553009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085228, + "balance_loss_mlp": 1.05072904, + "epoch": 0.29549826856483263, + "flos": 672731481600.0, + "grad_norm": 0.047870597747086484, + "language_loss": 0.80150926, + "learning_rate": 0.0008262730015558088, + "loss": 0.8123616, + "num_input_tokens_seen": 126697248, + "router_z_loss_mlp": 0.34521484, + "step": 1536, + "time_per_iteration": 2.8849382400512695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076376, + "balance_loss_mlp": 1.04192495, + "epoch": 0.29569065025009617, + "flos": 764300786688.0, + "grad_norm": 0.06331525049863725, + "language_loss": 0.82269859, + "learning_rate": 0.0008260368673277574, + "loss": 0.8334623, + "num_input_tokens_seen": 126782496, + "router_z_loss_mlp": 0.34472656, + "step": 1537, + "time_per_iteration": 3.12172269821167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078873, + "balance_loss_mlp": 1.04480314, + "epoch": 0.29588303193535975, + "flos": 543398049792.0, + "grad_norm": 0.05107262607685598, + "language_loss": 0.84019077, + "learning_rate": 0.0008258006065209682, + "loss": 0.85097957, + "num_input_tokens_seen": 126857328, + "router_z_loss_mlp": 0.34106445, + "step": 1538, + "time_per_iteration": 2.7388381958007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082306, + "balance_loss_mlp": 1.04744971, + "epoch": 0.29607541362062334, + "flos": 596648968704.0, + "grad_norm": 0.06469434822608365, + "language_loss": 0.80634302, + "learning_rate": 0.0008255642192271657, + "loss": 0.81716609, + "num_input_tokens_seen": 126932608, + "router_z_loss_mlp": 0.34863281, + "step": 1539, + "time_per_iteration": 2.7957324981689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082434, + "balance_loss_mlp": 1.04774427, + "epoch": 0.29626779530588687, + "flos": 609588149760.0, + "grad_norm": 0.06097977692176942, + "language_loss": 0.83830953, + "learning_rate": 0.0008253277055381241, + "loss": 0.84913385, + "num_input_tokens_seen": 127008928, + "router_z_loss_mlp": 0.34741211, + "step": 1540, + "time_per_iteration": 2.8428521156311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085866, + "balance_loss_mlp": 1.05146217, + "epoch": 0.29646017699115046, + "flos": 867050237952.0, + "grad_norm": 0.06407432486539091, + "language_loss": 0.8580029, + "learning_rate": 0.0008250910655456658, + "loss": 0.8688615, + "num_input_tokens_seen": 127097104, + "router_z_loss_mlp": 0.34448242, + "step": 1541, + "time_per_iteration": 3.1741185188293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081587, + "balance_loss_mlp": 1.04696846, + "epoch": 0.296652558676414, + "flos": 495616444416.0, + "grad_norm": 0.06683547404256097, + "language_loss": 0.83703458, + "learning_rate": 0.0008248542993416625, + "loss": 0.84785044, + "num_input_tokens_seen": 127165264, + "router_z_loss_mlp": 0.34643555, + "step": 1542, + "time_per_iteration": 2.5634429454803467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083482, + "balance_loss_mlp": 1.04914963, + "epoch": 0.2968449403616776, + "flos": 571275555840.0, + "grad_norm": 0.054805025189504364, + "language_loss": 0.83645159, + "learning_rate": 0.0008246174070180352, + "loss": 0.84728634, + "num_input_tokens_seen": 127238992, + "router_z_loss_mlp": 0.34375, + "step": 1543, + "time_per_iteration": 2.664029121398926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108698, + "balance_loss_mlp": 1.05226684, + "epoch": 0.2970373220469411, + "flos": 793799115264.0, + "grad_norm": 0.06369286414713611, + "language_loss": 0.84087443, + "learning_rate": 0.0008243803886667537, + "loss": 0.85174423, + "num_input_tokens_seen": 127328160, + "router_z_loss_mlp": 0.34765625, + "step": 1544, + "time_per_iteration": 3.129185199737549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082817, + "balance_loss_mlp": 1.04793644, + "epoch": 0.2972297037322047, + "flos": 660736617984.0, + "grad_norm": 0.0569938777400986, + "language_loss": 0.79051471, + "learning_rate": 0.0008241432443798364, + "loss": 0.80134284, + "num_input_tokens_seen": 127407328, + "router_z_loss_mlp": 0.34936523, + "step": 1545, + "time_per_iteration": 2.7968478202819824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076425, + "balance_loss_mlp": 1.04225969, + "epoch": 0.29742208541746823, + "flos": 596849789952.0, + "grad_norm": 0.05185676674228935, + "language_loss": 0.85634965, + "learning_rate": 0.0008239059742493512, + "loss": 0.86711389, + "num_input_tokens_seen": 127477136, + "router_z_loss_mlp": 0.34204102, + "step": 1546, + "time_per_iteration": 2.730803966522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079044, + "balance_loss_mlp": 1.0448308, + "epoch": 0.2976144671027318, + "flos": 769519816704.0, + "grad_norm": 0.049935350225070424, + "language_loss": 0.87424839, + "learning_rate": 0.0008236685783674142, + "loss": 0.88503873, + "num_input_tokens_seen": 127565680, + "router_z_loss_mlp": 0.34228516, + "step": 1547, + "time_per_iteration": 3.0735998153686523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060573, + "balance_loss_mlp": 1.04903388, + "epoch": 0.2978068487879954, + "flos": 1483980065280.0, + "grad_norm": 0.022808758650826922, + "language_loss": 0.76221192, + "learning_rate": 0.0008234310568261911, + "loss": 0.77281767, + "num_input_tokens_seen": 127791584, + "router_z_loss_mlp": 0.11523438, + "step": 1548, + "time_per_iteration": 4.902673959732056 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088892, + "balance_loss_mlp": 1.05460715, + "epoch": 0.29799923047325894, + "flos": 475079357952.0, + "grad_norm": 0.07696298762455249, + "language_loss": 0.82568306, + "learning_rate": 0.0008231934097178955, + "loss": 0.83657193, + "num_input_tokens_seen": 127860112, + "router_z_loss_mlp": 0.34326172, + "step": 1549, + "time_per_iteration": 2.59600567817688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087503, + "balance_loss_mlp": 1.05173981, + "epoch": 0.2981916121585225, + "flos": 759464460288.0, + "grad_norm": 0.05308820200633048, + "language_loss": 0.8525809, + "learning_rate": 0.0008229556371347903, + "loss": 0.86345589, + "num_input_tokens_seen": 127938752, + "router_z_loss_mlp": 0.35791016, + "step": 1550, + "time_per_iteration": 2.955467939376831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080247, + "balance_loss_mlp": 1.04593909, + "epoch": 0.29838399384378606, + "flos": 874642351104.0, + "grad_norm": 0.058723621398699785, + "language_loss": 0.79088616, + "learning_rate": 0.0008227177391691874, + "loss": 0.80168855, + "num_input_tokens_seen": 128022192, + "router_z_loss_mlp": 0.34350586, + "step": 1551, + "time_per_iteration": 3.1204521656036377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084967, + "balance_loss_mlp": 1.05001473, + "epoch": 0.29857637552904964, + "flos": 579389995008.0, + "grad_norm": 0.060980844602782615, + "language_loss": 0.89576113, + "learning_rate": 0.0008224797159134463, + "loss": 0.90661073, + "num_input_tokens_seen": 128097776, + "router_z_loss_mlp": 0.34985352, + "step": 1552, + "time_per_iteration": 2.7535347938537598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075057, + "balance_loss_mlp": 1.04132128, + "epoch": 0.2987687572143132, + "flos": 836048950272.0, + "grad_norm": 0.05791718796165568, + "language_loss": 0.83571118, + "learning_rate": 0.0008222415674599765, + "loss": 0.84646177, + "num_input_tokens_seen": 128179888, + "router_z_loss_mlp": 0.33764648, + "step": 1553, + "time_per_iteration": 3.0609707832336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077849, + "balance_loss_mlp": 1.0417521, + "epoch": 0.29896113889957676, + "flos": 566800022016.0, + "grad_norm": 0.05477323920870417, + "language_loss": 0.83255476, + "learning_rate": 0.0008220032939012349, + "loss": 0.84333324, + "num_input_tokens_seen": 128251152, + "router_z_loss_mlp": 0.36108398, + "step": 1554, + "time_per_iteration": 2.6683521270751953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077555, + "balance_loss_mlp": 1.04310393, + "epoch": 0.29915352058484035, + "flos": 498370669056.0, + "grad_norm": 0.049159177960894807, + "language_loss": 0.87956095, + "learning_rate": 0.0008217648953297277, + "loss": 0.89033645, + "num_input_tokens_seen": 128327600, + "router_z_loss_mlp": 0.34472656, + "step": 1555, + "time_per_iteration": 2.82114315032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109245, + "balance_loss_mlp": 1.05711639, + "epoch": 0.2993459022701039, + "flos": 591837373440.0, + "grad_norm": 0.06210935096260163, + "language_loss": 0.7799179, + "learning_rate": 0.0008215263718380095, + "loss": 0.79084241, + "num_input_tokens_seen": 128398432, + "router_z_loss_mlp": 0.35327148, + "step": 1556, + "time_per_iteration": 2.6806485652923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104132, + "balance_loss_mlp": 1.06815481, + "epoch": 0.29953828395536747, + "flos": 572107802112.0, + "grad_norm": 0.051501670996139066, + "language_loss": 0.8437115, + "learning_rate": 0.0008212877235186833, + "loss": 0.8547529, + "num_input_tokens_seen": 128469696, + "router_z_loss_mlp": 0.36010742, + "step": 1557, + "time_per_iteration": 2.706531286239624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075275, + "balance_loss_mlp": 1.06321061, + "epoch": 0.299730665640631, + "flos": 1503855051264.0, + "grad_norm": 0.03618665962020262, + "language_loss": 0.77737558, + "learning_rate": 0.0008210489504644005, + "loss": 0.78812838, + "num_input_tokens_seen": 128698560, + "router_z_loss_mlp": 0.12060547, + "step": 1558, + "time_per_iteration": 4.914030313491821 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102225, + "balance_loss_mlp": 1.06641483, + "epoch": 0.2999230473258946, + "flos": 513538928640.0, + "grad_norm": 0.06717328469529676, + "language_loss": 0.80777293, + "learning_rate": 0.0008208100527678611, + "loss": 0.8187952, + "num_input_tokens_seen": 128765952, + "router_z_loss_mlp": 0.3581543, + "step": 1559, + "time_per_iteration": 2.5862650871276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097511, + "balance_loss_mlp": 1.06294012, + "epoch": 0.3001154290111581, + "flos": 834128381952.0, + "grad_norm": 0.05731533213860712, + "language_loss": 0.78337204, + "learning_rate": 0.0008205710305218135, + "loss": 0.79434717, + "num_input_tokens_seen": 128840048, + "router_z_loss_mlp": 0.34594727, + "step": 1560, + "time_per_iteration": 3.00581693649292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094947, + "balance_loss_mlp": 1.06149733, + "epoch": 0.3003078106964217, + "flos": 556485617664.0, + "grad_norm": 0.051151635719759364, + "language_loss": 0.89917201, + "learning_rate": 0.0008203318838190541, + "loss": 0.9101215, + "num_input_tokens_seen": 128912496, + "router_z_loss_mlp": 0.3347168, + "step": 1561, + "time_per_iteration": 2.730187177658081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087932, + "balance_loss_mlp": 1.05345702, + "epoch": 0.30050019238168524, + "flos": 525897556992.0, + "grad_norm": 0.07455466191279551, + "language_loss": 0.85053575, + "learning_rate": 0.0008200926127524281, + "loss": 0.86141509, + "num_input_tokens_seen": 128980624, + "router_z_loss_mlp": 0.34521484, + "step": 1562, + "time_per_iteration": 2.6252634525299072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082824, + "balance_loss_mlp": 1.04837239, + "epoch": 0.3006925740669488, + "flos": 577582907904.0, + "grad_norm": 0.08578868432126639, + "language_loss": 0.83193934, + "learning_rate": 0.0008198532174148289, + "loss": 0.84276754, + "num_input_tokens_seen": 129050576, + "router_z_loss_mlp": 0.3449707, + "step": 1563, + "time_per_iteration": 2.784132957458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010668, + "balance_loss_mlp": 0.99912882, + "epoch": 0.3008849557522124, + "flos": 1489408528896.0, + "grad_norm": 0.006418694176289122, + "language_loss": 0.8068617, + "learning_rate": 0.0008196136978991977, + "loss": 0.81696838, + "num_input_tokens_seen": 129278880, + "router_z_loss_mlp": 0.11523438, + "step": 1564, + "time_per_iteration": 4.826026678085327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079569, + "balance_loss_mlp": 1.04607105, + "epoch": 0.30107733743747594, + "flos": 509565371904.0, + "grad_norm": 0.057361266050022765, + "language_loss": 0.88701093, + "learning_rate": 0.0008193740542985244, + "loss": 0.89780664, + "num_input_tokens_seen": 129346560, + "router_z_loss_mlp": 0.33520508, + "step": 1565, + "time_per_iteration": 2.5685722827911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082334, + "balance_loss_mlp": 1.04881263, + "epoch": 0.30126971912273953, + "flos": 587425858560.0, + "grad_norm": 0.055549771382925904, + "language_loss": 0.86598676, + "learning_rate": 0.0008191342867058467, + "loss": 0.87681007, + "num_input_tokens_seen": 129420448, + "router_z_loss_mlp": 0.33520508, + "step": 1566, + "time_per_iteration": 2.7413527965545654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088116, + "balance_loss_mlp": 1.05423677, + "epoch": 0.30146210080800306, + "flos": 601822918656.0, + "grad_norm": 0.054174391750340056, + "language_loss": 0.83411789, + "learning_rate": 0.0008188943952142509, + "loss": 0.84499902, + "num_input_tokens_seen": 129494032, + "router_z_loss_mlp": 0.33911133, + "step": 1567, + "time_per_iteration": 2.816777229309082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098385, + "balance_loss_mlp": 1.06376624, + "epoch": 0.30165448249326665, + "flos": 917424686592.0, + "grad_norm": 0.057308899380469513, + "language_loss": 0.81973398, + "learning_rate": 0.0008186543799168711, + "loss": 0.8307178, + "num_input_tokens_seen": 129569088, + "router_z_loss_mlp": 0.34643555, + "step": 1568, + "time_per_iteration": 3.138439655303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094575, + "balance_loss_mlp": 1.060076, + "epoch": 0.3018468641785302, + "flos": 776953368576.0, + "grad_norm": 0.06314470525088989, + "language_loss": 0.88671768, + "learning_rate": 0.0008184142409068892, + "loss": 0.89766341, + "num_input_tokens_seen": 129647968, + "router_z_loss_mlp": 0.34545898, + "step": 1569, + "time_per_iteration": 3.0061678886413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104914, + "balance_loss_mlp": 1.07134473, + "epoch": 0.30203924586379377, + "flos": 522101500416.0, + "grad_norm": 0.05000282823150535, + "language_loss": 0.86630476, + "learning_rate": 0.000818173978277536, + "loss": 0.87735385, + "num_input_tokens_seen": 129718928, + "router_z_loss_mlp": 0.3359375, + "step": 1570, + "time_per_iteration": 2.7171432971954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101367, + "balance_loss_mlp": 1.06779718, + "epoch": 0.3022316275490573, + "flos": 524288318976.0, + "grad_norm": 0.052630401262377564, + "language_loss": 0.83781934, + "learning_rate": 0.000817933592122089, + "loss": 0.84883296, + "num_input_tokens_seen": 129790128, + "router_z_loss_mlp": 0.3359375, + "step": 1571, + "time_per_iteration": 2.7346580028533936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105149, + "balance_loss_mlp": 1.0710789, + "epoch": 0.3024240092343209, + "flos": 479672755200.0, + "grad_norm": 0.05357670269103591, + "language_loss": 0.83451247, + "learning_rate": 0.0008176930825338749, + "loss": 0.84556395, + "num_input_tokens_seen": 129857536, + "router_z_loss_mlp": 0.34106445, + "step": 1572, + "time_per_iteration": 2.5449459552764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095582, + "balance_loss_mlp": 1.06110692, + "epoch": 0.3026163909195845, + "flos": 686901579264.0, + "grad_norm": 0.06283664606524127, + "language_loss": 0.8873198, + "learning_rate": 0.0008174524496062679, + "loss": 0.89827561, + "num_input_tokens_seen": 129931440, + "router_z_loss_mlp": 0.3449707, + "step": 1573, + "time_per_iteration": 2.8826043605804443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108869, + "balance_loss_mlp": 1.05380964, + "epoch": 0.302808772604848, + "flos": 542654553600.0, + "grad_norm": 0.05929060654319276, + "language_loss": 0.85444844, + "learning_rate": 0.0008172116934326894, + "loss": 0.86533535, + "num_input_tokens_seen": 130005200, + "router_z_loss_mlp": 0.34912109, + "step": 1574, + "time_per_iteration": 2.7539572715759277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088334, + "balance_loss_mlp": 1.05435979, + "epoch": 0.3030011542901116, + "flos": 474852395520.0, + "grad_norm": 0.051325587648683973, + "language_loss": 0.87683225, + "learning_rate": 0.0008169708141066097, + "loss": 0.88771558, + "num_input_tokens_seen": 130069136, + "router_z_loss_mlp": 0.34008789, + "step": 1575, + "time_per_iteration": 2.5635225772857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086472, + "balance_loss_mlp": 1.05199683, + "epoch": 0.30319353597537513, + "flos": 481233940992.0, + "grad_norm": 0.06106098638193731, + "language_loss": 0.90820259, + "learning_rate": 0.0008167298117215465, + "loss": 0.91906732, + "num_input_tokens_seen": 130135456, + "router_z_loss_mlp": 0.3449707, + "step": 1576, + "time_per_iteration": 2.5388035774230957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087257, + "balance_loss_mlp": 1.05242443, + "epoch": 0.3033859176606387, + "flos": 704455916544.0, + "grad_norm": 0.06728579874610481, + "language_loss": 0.88300574, + "learning_rate": 0.0008164886863710649, + "loss": 0.89387834, + "num_input_tokens_seen": 130213712, + "router_z_loss_mlp": 0.34887695, + "step": 1577, + "time_per_iteration": 2.8935675621032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088541, + "balance_loss_mlp": 1.05554342, + "epoch": 0.30357829934590225, + "flos": 764344456704.0, + "grad_norm": 0.04642698643554312, + "language_loss": 0.86113924, + "learning_rate": 0.0008162474381487783, + "loss": 0.87202466, + "num_input_tokens_seen": 130290928, + "router_z_loss_mlp": 0.33007812, + "step": 1578, + "time_per_iteration": 3.0151257514953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088208, + "balance_loss_mlp": 1.05401909, + "epoch": 0.30377068103116583, + "flos": 532082663424.0, + "grad_norm": 0.05691489249418783, + "language_loss": 0.84894794, + "learning_rate": 0.0008160060671483475, + "loss": 0.85983002, + "num_input_tokens_seen": 130362672, + "router_z_loss_mlp": 0.34228516, + "step": 1579, + "time_per_iteration": 2.6575984954833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097191, + "balance_loss_mlp": 1.06338358, + "epoch": 0.3039630627164294, + "flos": 509934928896.0, + "grad_norm": 0.07240450604386858, + "language_loss": 0.83520651, + "learning_rate": 0.0008157645734634809, + "loss": 0.84617841, + "num_input_tokens_seen": 130428848, + "router_z_loss_mlp": 0.33837891, + "step": 1580, + "time_per_iteration": 2.5869438648223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061274, + "balance_loss_mlp": 1.04992568, + "epoch": 0.30415544440169295, + "flos": 1505206803456.0, + "grad_norm": 0.030998653754179664, + "language_loss": 0.76896489, + "learning_rate": 0.000815522957187935, + "loss": 0.77957761, + "num_input_tokens_seen": 130665440, + "router_z_loss_mlp": 0.11328125, + "step": 1581, + "time_per_iteration": 4.907174348831177 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01027749, + "balance_loss_mlp": 1.01606703, + "epoch": 0.30434782608695654, + "flos": 1457976637440.0, + "grad_norm": 0.012214555664928241, + "language_loss": 0.73214495, + "learning_rate": 0.0008152812184155132, + "loss": 0.74242246, + "num_input_tokens_seen": 130895248, + "router_z_loss_mlp": 0.11669922, + "step": 1582, + "time_per_iteration": 4.8717710971832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102415, + "balance_loss_mlp": 1.0685122, + "epoch": 0.3045402077722201, + "flos": 482312088576.0, + "grad_norm": 0.05813255519406619, + "language_loss": 0.83851862, + "learning_rate": 0.000815039357240067, + "loss": 0.84954274, + "num_input_tokens_seen": 130964544, + "router_z_loss_mlp": 0.33935547, + "step": 1583, + "time_per_iteration": 2.640148401260376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102551, + "balance_loss_mlp": 1.06879056, + "epoch": 0.30473258945748366, + "flos": 543220549632.0, + "grad_norm": 0.06312099992380371, + "language_loss": 0.85312426, + "learning_rate": 0.0008147973737554952, + "loss": 0.86414981, + "num_input_tokens_seen": 131041744, + "router_z_loss_mlp": 0.33789062, + "step": 1584, + "time_per_iteration": 2.772367000579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098103, + "balance_loss_mlp": 1.06443787, + "epoch": 0.3049249711427472, + "flos": 566789847552.0, + "grad_norm": 0.054268296030043885, + "language_loss": 0.85613728, + "learning_rate": 0.000814555268055744, + "loss": 0.86711836, + "num_input_tokens_seen": 131108864, + "router_z_loss_mlp": 0.33691406, + "step": 1585, + "time_per_iteration": 2.616687536239624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109794, + "balance_loss_mlp": 1.06441879, + "epoch": 0.3051173528280108, + "flos": 527970894336.0, + "grad_norm": 0.05527556644311566, + "language_loss": 0.87648201, + "learning_rate": 0.0008143130402348073, + "loss": 0.88746148, + "num_input_tokens_seen": 131181104, + "router_z_loss_mlp": 0.33544922, + "step": 1586, + "time_per_iteration": 2.635103940963745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094771, + "balance_loss_mlp": 1.06141627, + "epoch": 0.3053097345132743, + "flos": 586097427456.0, + "grad_norm": 0.052385807505719764, + "language_loss": 0.79520649, + "learning_rate": 0.0008140706903867265, + "loss": 0.80615419, + "num_input_tokens_seen": 131258704, + "router_z_loss_mlp": 0.33349609, + "step": 1587, + "time_per_iteration": 2.7922940254211426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087687, + "balance_loss_mlp": 1.05263984, + "epoch": 0.3055021161985379, + "flos": 606810604032.0, + "grad_norm": 0.054380951058352583, + "language_loss": 0.90043247, + "learning_rate": 0.0008138282186055897, + "loss": 0.9113093, + "num_input_tokens_seen": 131325712, + "router_z_loss_mlp": 0.35058594, + "step": 1588, + "time_per_iteration": 2.683783769607544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085545, + "balance_loss_mlp": 1.05290556, + "epoch": 0.3056944978838015, + "flos": 573594794496.0, + "grad_norm": 0.05235756550943364, + "language_loss": 0.8193745, + "learning_rate": 0.0008135856249855331, + "loss": 0.83023, + "num_input_tokens_seen": 131397568, + "router_z_loss_mlp": 0.32641602, + "step": 1589, + "time_per_iteration": 2.6717309951782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081523, + "balance_loss_mlp": 1.04900289, + "epoch": 0.305886879569065, + "flos": 633640485888.0, + "grad_norm": 0.06284243799371535, + "language_loss": 0.89691997, + "learning_rate": 0.0008133429096207398, + "loss": 0.90773523, + "num_input_tokens_seen": 131467632, + "router_z_loss_mlp": 0.32519531, + "step": 1590, + "time_per_iteration": 2.757962465286255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01037916, + "balance_loss_mlp": 1.02561319, + "epoch": 0.3060792612543286, + "flos": 1368227414016.0, + "grad_norm": 0.023218914608202516, + "language_loss": 0.75312257, + "learning_rate": 0.0008131000726054403, + "loss": 0.76350176, + "num_input_tokens_seen": 131702224, + "router_z_loss_mlp": 0.12304688, + "step": 1591, + "time_per_iteration": 4.927033185958862 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078048, + "balance_loss_mlp": 1.0450511, + "epoch": 0.30627164293959214, + "flos": 518290887168.0, + "grad_norm": 0.05132667013942606, + "language_loss": 0.86601979, + "learning_rate": 0.0008128571140339123, + "loss": 0.87680024, + "num_input_tokens_seen": 131774608, + "router_z_loss_mlp": 0.33007812, + "step": 1592, + "time_per_iteration": 2.627272367477417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075862, + "balance_loss_mlp": 1.0423162, + "epoch": 0.3064640246248557, + "flos": 455354168832.0, + "grad_norm": 0.054345541641725725, + "language_loss": 0.87405455, + "learning_rate": 0.0008126140340004805, + "loss": 0.88481319, + "num_input_tokens_seen": 131841216, + "router_z_loss_mlp": 0.33569336, + "step": 1593, + "time_per_iteration": 2.5047686100006104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076578, + "balance_loss_mlp": 1.04355717, + "epoch": 0.30665640631011926, + "flos": 849718480896.0, + "grad_norm": 0.04925367115496714, + "language_loss": 0.82262254, + "learning_rate": 0.0008123708325995172, + "loss": 0.83338827, + "num_input_tokens_seen": 131937584, + "router_z_loss_mlp": 0.33032227, + "step": 1594, + "time_per_iteration": 3.1693196296691895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107923, + "balance_loss_mlp": 1.04582715, + "epoch": 0.30684878799538284, + "flos": 757996406784.0, + "grad_norm": 0.04977841797679214, + "language_loss": 0.79901791, + "learning_rate": 0.0008121275099254414, + "loss": 0.80981016, + "num_input_tokens_seen": 132012656, + "router_z_loss_mlp": 0.33422852, + "step": 1595, + "time_per_iteration": 2.9197185039520264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108089, + "balance_loss_mlp": 1.04758275, + "epoch": 0.3070411696806464, + "flos": 517320428544.0, + "grad_norm": 0.05488318824662342, + "language_loss": 0.88300943, + "learning_rate": 0.0008118840660727194, + "loss": 0.89381832, + "num_input_tokens_seen": 132083728, + "router_z_loss_mlp": 0.33325195, + "step": 1596, + "time_per_iteration": 2.6452150344848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079954, + "balance_loss_mlp": 1.04788685, + "epoch": 0.30723355136590996, + "flos": 843883992576.0, + "grad_norm": 0.05612425557740203, + "language_loss": 0.87403214, + "learning_rate": 0.0008116405011358644, + "loss": 0.88483167, + "num_input_tokens_seen": 132170896, + "router_z_loss_mlp": 0.32055664, + "step": 1597, + "time_per_iteration": 3.135666608810425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084185, + "balance_loss_mlp": 1.05092525, + "epoch": 0.30742593305117355, + "flos": 465905710080.0, + "grad_norm": 0.05391343675647517, + "language_loss": 0.80005342, + "learning_rate": 0.0008113968152094369, + "loss": 0.81089526, + "num_input_tokens_seen": 132234592, + "router_z_loss_mlp": 0.33276367, + "step": 1598, + "time_per_iteration": 2.5122313499450684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076054, + "balance_loss_mlp": 1.04331923, + "epoch": 0.3076183147364371, + "flos": 686286120960.0, + "grad_norm": 0.04979397496165333, + "language_loss": 0.82305032, + "learning_rate": 0.0008111530083880438, + "loss": 0.83381081, + "num_input_tokens_seen": 132314720, + "router_z_loss_mlp": 0.32739258, + "step": 1599, + "time_per_iteration": 2.8883755207061768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072089, + "balance_loss_mlp": 1.03949702, + "epoch": 0.30781069642170067, + "flos": 613729032192.0, + "grad_norm": 0.059000164712882774, + "language_loss": 0.86657357, + "learning_rate": 0.0008109090807663399, + "loss": 0.87729448, + "num_input_tokens_seen": 132388768, + "router_z_loss_mlp": 0.32592773, + "step": 1600, + "time_per_iteration": 2.7799928188323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075122, + "balance_loss_mlp": 1.04260147, + "epoch": 0.3080030781069642, + "flos": 590021521920.0, + "grad_norm": 0.046450828420206536, + "language_loss": 0.88735926, + "learning_rate": 0.0008106650324390257, + "loss": 0.89811045, + "num_input_tokens_seen": 132472544, + "router_z_loss_mlp": 0.32519531, + "step": 1601, + "time_per_iteration": 2.7887444496154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071489, + "balance_loss_mlp": 1.03913534, + "epoch": 0.3081954597922278, + "flos": 562353601536.0, + "grad_norm": 0.06865077604181559, + "language_loss": 0.81335884, + "learning_rate": 0.0008104208635008493, + "loss": 0.82407373, + "num_input_tokens_seen": 132541968, + "router_z_loss_mlp": 0.32348633, + "step": 1602, + "time_per_iteration": 2.6526358127593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077173, + "balance_loss_mlp": 1.04448628, + "epoch": 0.3083878414774913, + "flos": 447599112192.0, + "grad_norm": 0.053973671264543166, + "language_loss": 0.81925714, + "learning_rate": 0.0008101765740466058, + "loss": 0.83002889, + "num_input_tokens_seen": 132606976, + "router_z_loss_mlp": 0.3269043, + "step": 1603, + "time_per_iteration": 2.5337142944335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073512, + "balance_loss_mlp": 1.04020488, + "epoch": 0.3085802231627549, + "flos": 493297205760.0, + "grad_norm": 0.05670542728571842, + "language_loss": 0.84135199, + "learning_rate": 0.0008099321641711364, + "loss": 0.85208714, + "num_input_tokens_seen": 132677984, + "router_z_loss_mlp": 0.33325195, + "step": 1604, + "time_per_iteration": 2.6616737842559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071402, + "balance_loss_mlp": 1.03804755, + "epoch": 0.3087726048480185, + "flos": 487437986304.0, + "grad_norm": 0.0517354770696361, + "language_loss": 0.8343811, + "learning_rate": 0.0008096876339693295, + "loss": 0.8450951, + "num_input_tokens_seen": 132749136, + "router_z_loss_mlp": 0.33374023, + "step": 1605, + "time_per_iteration": 2.6034042835235596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078765, + "balance_loss_mlp": 1.04412317, + "epoch": 0.308964986533282, + "flos": 730265877504.0, + "grad_norm": 0.0630488444124333, + "language_loss": 0.8123467, + "learning_rate": 0.0008094429835361206, + "loss": 0.8231343, + "num_input_tokens_seen": 132823824, + "router_z_loss_mlp": 0.34667969, + "step": 1606, + "time_per_iteration": 2.9442811012268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107155, + "balance_loss_mlp": 1.03788495, + "epoch": 0.3091573682185456, + "flos": 605131554816.0, + "grad_norm": 0.0490228515497239, + "language_loss": 0.85833865, + "learning_rate": 0.0008091982129664908, + "loss": 0.8690542, + "num_input_tokens_seen": 132895936, + "router_z_loss_mlp": 0.33691406, + "step": 1607, + "time_per_iteration": 2.734976053237915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077455, + "balance_loss_mlp": 1.04290783, + "epoch": 0.30934974990380915, + "flos": 460081396224.0, + "grad_norm": 0.04772079658934369, + "language_loss": 0.82646394, + "learning_rate": 0.0008089533223554687, + "loss": 0.83723843, + "num_input_tokens_seen": 132968960, + "router_z_loss_mlp": 0.34594727, + "step": 1608, + "time_per_iteration": 2.756741762161255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080172, + "balance_loss_mlp": 1.04669785, + "epoch": 0.30954213158907273, + "flos": 553142075904.0, + "grad_norm": 0.05499274022240881, + "language_loss": 0.8525604, + "learning_rate": 0.0008087083117981294, + "loss": 0.86336207, + "num_input_tokens_seen": 133048448, + "router_z_loss_mlp": 0.33496094, + "step": 1609, + "time_per_iteration": 2.9062788486480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081142, + "balance_loss_mlp": 1.04676199, + "epoch": 0.30973451327433627, + "flos": 552776901120.0, + "grad_norm": 0.0512798930400947, + "language_loss": 0.87996054, + "learning_rate": 0.0008084631813895943, + "loss": 0.89077199, + "num_input_tokens_seen": 133121680, + "router_z_loss_mlp": 0.34375, + "step": 1610, + "time_per_iteration": 2.789893627166748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079299, + "balance_loss_mlp": 1.04575384, + "epoch": 0.30992689495959985, + "flos": 565430893056.0, + "grad_norm": 0.07403274033744815, + "language_loss": 0.83632123, + "learning_rate": 0.0008082179312250315, + "loss": 0.84711421, + "num_input_tokens_seen": 133190176, + "router_z_loss_mlp": 0.33544922, + "step": 1611, + "time_per_iteration": 2.713533878326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040085, + "balance_loss_mlp": 1.02864099, + "epoch": 0.3101192766448634, + "flos": 1441621131264.0, + "grad_norm": 0.023040649208851512, + "language_loss": 0.79855847, + "learning_rate": 0.0008079725613996555, + "loss": 0.80895925, + "num_input_tokens_seen": 133420512, + "router_z_loss_mlp": 0.11425781, + "step": 1612, + "time_per_iteration": 4.866560459136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035595, + "balance_loss_mlp": 1.02448523, + "epoch": 0.31031165833012697, + "flos": 1531086575616.0, + "grad_norm": 0.021256554447441355, + "language_loss": 0.76629329, + "learning_rate": 0.0008077270720087273, + "loss": 0.77664924, + "num_input_tokens_seen": 133651984, + "router_z_loss_mlp": 0.11132812, + "step": 1613, + "time_per_iteration": 5.01593279838562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010886, + "balance_loss_mlp": 1.05483997, + "epoch": 0.31050404001539056, + "flos": 991534196736.0, + "grad_norm": 0.06253960188626659, + "language_loss": 0.81937206, + "learning_rate": 0.0008074814631475545, + "loss": 0.83025801, + "num_input_tokens_seen": 133741648, + "router_z_loss_mlp": 0.33789062, + "step": 1614, + "time_per_iteration": 3.3154871463775635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092644, + "balance_loss_mlp": 1.05940843, + "epoch": 0.3106964217006541, + "flos": 445748355072.0, + "grad_norm": 0.0719929788966035, + "language_loss": 0.78655052, + "learning_rate": 0.0008072357349114907, + "loss": 0.79747701, + "num_input_tokens_seen": 133813344, + "router_z_loss_mlp": 0.33251953, + "step": 1615, + "time_per_iteration": 2.6783502101898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084302, + "balance_loss_mlp": 1.05063736, + "epoch": 0.3108888033859177, + "flos": 510259405824.0, + "grad_norm": 0.06269338504314155, + "language_loss": 0.88523185, + "learning_rate": 0.0008069898873959363, + "loss": 0.89607489, + "num_input_tokens_seen": 133884192, + "router_z_loss_mlp": 0.33691406, + "step": 1616, + "time_per_iteration": 2.6805779933929443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092165, + "balance_loss_mlp": 1.05952573, + "epoch": 0.3110811850711812, + "flos": 520471913472.0, + "grad_norm": 0.06669389997650658, + "language_loss": 0.85964763, + "learning_rate": 0.0008067439206963375, + "loss": 0.87056935, + "num_input_tokens_seen": 133954848, + "router_z_loss_mlp": 0.32641602, + "step": 1617, + "time_per_iteration": 2.6084542274475098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091143, + "balance_loss_mlp": 1.05797851, + "epoch": 0.3112735667564448, + "flos": 686085299712.0, + "grad_norm": 0.06020913179087489, + "language_loss": 0.86049557, + "learning_rate": 0.0008064978349081873, + "loss": 0.87140703, + "num_input_tokens_seen": 134031824, + "router_z_loss_mlp": 0.33178711, + "step": 1618, + "time_per_iteration": 2.9622116088867188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089777, + "balance_loss_mlp": 1.05554032, + "epoch": 0.31146594844170833, + "flos": 532786871808.0, + "grad_norm": 0.04562356821057988, + "language_loss": 0.86218596, + "learning_rate": 0.0008062516301270245, + "loss": 0.87308377, + "num_input_tokens_seen": 134104480, + "router_z_loss_mlp": 0.3425293, + "step": 1619, + "time_per_iteration": 2.691589593887329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091127, + "balance_loss_mlp": 1.0575099, + "epoch": 0.3116583301269719, + "flos": 679187220480.0, + "grad_norm": 0.05429224886242875, + "language_loss": 0.88343138, + "learning_rate": 0.0008060053064484343, + "loss": 0.89434266, + "num_input_tokens_seen": 134185632, + "router_z_loss_mlp": 0.33642578, + "step": 1620, + "time_per_iteration": 2.936244487762451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096127, + "balance_loss_mlp": 1.06277251, + "epoch": 0.31185071181223545, + "flos": 585855908352.0, + "grad_norm": 0.05040245512912965, + "language_loss": 0.85009742, + "learning_rate": 0.0008057588639680482, + "loss": 0.86105865, + "num_input_tokens_seen": 134261600, + "router_z_loss_mlp": 0.33374023, + "step": 1621, + "time_per_iteration": 2.7633163928985596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107807, + "balance_loss_mlp": 1.07309282, + "epoch": 0.31204309349749904, + "flos": 725090517504.0, + "grad_norm": 0.06801147163116106, + "language_loss": 0.82624507, + "learning_rate": 0.0008055123027815434, + "loss": 0.83732307, + "num_input_tokens_seen": 134334368, + "router_z_loss_mlp": 0.34741211, + "step": 1622, + "time_per_iteration": 2.946943521499634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105084, + "balance_loss_mlp": 1.07156253, + "epoch": 0.3122354751827626, + "flos": 576558604800.0, + "grad_norm": 0.0611005921730787, + "language_loss": 0.85109818, + "learning_rate": 0.0008052656229846436, + "loss": 0.862149, + "num_input_tokens_seen": 134403824, + "router_z_loss_mlp": 0.33544922, + "step": 1623, + "time_per_iteration": 2.6431145668029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106208, + "balance_loss_mlp": 1.07106483, + "epoch": 0.31242785686802615, + "flos": 575672514048.0, + "grad_norm": 0.055122717603047884, + "language_loss": 0.90674621, + "learning_rate": 0.0008050188246731182, + "loss": 0.91780829, + "num_input_tokens_seen": 134471296, + "router_z_loss_mlp": 0.35180664, + "step": 1624, + "time_per_iteration": 2.6666738986968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101645, + "balance_loss_mlp": 1.06745625, + "epoch": 0.31262023855328974, + "flos": 736490271744.0, + "grad_norm": 0.05430344032768667, + "language_loss": 0.81962687, + "learning_rate": 0.0008047719079427834, + "loss": 0.8306433, + "num_input_tokens_seen": 134551360, + "router_z_loss_mlp": 0.34204102, + "step": 1625, + "time_per_iteration": 2.978775978088379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095181, + "balance_loss_mlp": 1.07791936, + "epoch": 0.3128126202385533, + "flos": 1558395113472.0, + "grad_norm": 0.034550759669135796, + "language_loss": 0.74351704, + "learning_rate": 0.0008045248728895, + "loss": 0.75446886, + "num_input_tokens_seen": 134761328, + "router_z_loss_mlp": 0.17285156, + "step": 1626, + "time_per_iteration": 4.800370931625366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109181, + "balance_loss_mlp": 1.05695319, + "epoch": 0.31300500192381686, + "flos": 514666538496.0, + "grad_norm": 0.04752817769408696, + "language_loss": 0.86259782, + "learning_rate": 0.0008042777196091757, + "loss": 0.87351596, + "num_input_tokens_seen": 134833136, + "router_z_loss_mlp": 0.34863281, + "step": 1627, + "time_per_iteration": 2.695350408554077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088728, + "balance_loss_mlp": 1.05301261, + "epoch": 0.3131973836090804, + "flos": 526370420736.0, + "grad_norm": 0.06407391520506579, + "language_loss": 0.8214981, + "learning_rate": 0.0008040304481977643, + "loss": 0.83238542, + "num_input_tokens_seen": 134904352, + "router_z_loss_mlp": 0.35742188, + "step": 1628, + "time_per_iteration": 2.6652393341064453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108342, + "balance_loss_mlp": 1.05030346, + "epoch": 0.313389765294344, + "flos": 822473961984.0, + "grad_norm": 0.08346342139557943, + "language_loss": 0.86950874, + "learning_rate": 0.0008037830587512649, + "loss": 0.88034296, + "num_input_tokens_seen": 134984880, + "router_z_loss_mlp": 0.33129883, + "step": 1629, + "time_per_iteration": 3.0668327808380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090413, + "balance_loss_mlp": 1.05651021, + "epoch": 0.31358214697960757, + "flos": 393604697088.0, + "grad_norm": 0.061409761762948115, + "language_loss": 0.78720629, + "learning_rate": 0.0008035355513657224, + "loss": 0.79811049, + "num_input_tokens_seen": 135047456, + "router_z_loss_mlp": 0.33935547, + "step": 1630, + "time_per_iteration": 2.5013740062713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087918, + "balance_loss_mlp": 1.05449188, + "epoch": 0.3137745286648711, + "flos": 571611617280.0, + "grad_norm": 0.049199842100191564, + "language_loss": 0.93020999, + "learning_rate": 0.0008032879261372279, + "loss": 0.94108921, + "num_input_tokens_seen": 135124256, + "router_z_loss_mlp": 0.33447266, + "step": 1631, + "time_per_iteration": 2.8559622764587402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088241, + "balance_loss_mlp": 1.07612944, + "epoch": 0.3139669103501347, + "flos": 1497614690304.0, + "grad_norm": 0.04267228885339989, + "language_loss": 0.79635841, + "learning_rate": 0.0008030401831619178, + "loss": 0.80724084, + "num_input_tokens_seen": 135353024, + "router_z_loss_mlp": 0.12109375, + "step": 1632, + "time_per_iteration": 5.726024627685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081932, + "balance_loss_mlp": 1.04886341, + "epoch": 0.3141592920353982, + "flos": 525090041856.0, + "grad_norm": 0.04986838794009694, + "language_loss": 0.87459773, + "learning_rate": 0.0008027923225359748, + "loss": 0.88541704, + "num_input_tokens_seen": 135422464, + "router_z_loss_mlp": 0.33081055, + "step": 1633, + "time_per_iteration": 2.599775791168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078913, + "balance_loss_mlp": 1.04465246, + "epoch": 0.3143516737206618, + "flos": 592989714432.0, + "grad_norm": 0.05680374588643473, + "language_loss": 0.8835839, + "learning_rate": 0.0008025443443556267, + "loss": 0.89437306, + "num_input_tokens_seen": 135490928, + "router_z_loss_mlp": 0.34301758, + "step": 1634, + "time_per_iteration": 2.7439024448394775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010804, + "balance_loss_mlp": 1.04776073, + "epoch": 0.31454405540592534, + "flos": 648034573824.0, + "grad_norm": 0.04764849369773053, + "language_loss": 0.88161099, + "learning_rate": 0.000802296248717147, + "loss": 0.89241499, + "num_input_tokens_seen": 135576288, + "router_z_loss_mlp": 0.32641602, + "step": 1635, + "time_per_iteration": 2.902290105819702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082457, + "balance_loss_mlp": 1.04850602, + "epoch": 0.3147364370911889, + "flos": 642543501312.0, + "grad_norm": 0.05380775409858787, + "language_loss": 0.79150212, + "learning_rate": 0.0008020480357168554, + "loss": 0.80232668, + "num_input_tokens_seen": 135652320, + "router_z_loss_mlp": 0.33984375, + "step": 1636, + "time_per_iteration": 2.7940564155578613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107486, + "balance_loss_mlp": 1.04176795, + "epoch": 0.31492881877645246, + "flos": 471607778304.0, + "grad_norm": 0.05509564816324918, + "language_loss": 0.88341308, + "learning_rate": 0.0008017997054511165, + "loss": 0.89416164, + "num_input_tokens_seen": 135719632, + "router_z_loss_mlp": 0.33105469, + "step": 1637, + "time_per_iteration": 2.596212148666382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075188, + "balance_loss_mlp": 1.04157114, + "epoch": 0.31512120046171604, + "flos": 629135838720.0, + "grad_norm": 0.0536589952194777, + "language_loss": 0.8549943, + "learning_rate": 0.0008015512580163407, + "loss": 0.86574614, + "num_input_tokens_seen": 135796544, + "router_z_loss_mlp": 0.33642578, + "step": 1638, + "time_per_iteration": 2.763343334197998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107569, + "balance_loss_mlp": 1.0416913, + "epoch": 0.31531358214697963, + "flos": 703460726784.0, + "grad_norm": 0.0636877441873346, + "language_loss": 0.80888116, + "learning_rate": 0.0008013026935089838, + "loss": 0.81963813, + "num_input_tokens_seen": 135871344, + "router_z_loss_mlp": 0.34033203, + "step": 1639, + "time_per_iteration": 2.9786083698272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070933, + "balance_loss_mlp": 1.03798366, + "epoch": 0.31550596383224316, + "flos": 572275127808.0, + "grad_norm": 0.055086353977466425, + "language_loss": 0.83909047, + "learning_rate": 0.0008010540120255472, + "loss": 0.84979975, + "num_input_tokens_seen": 135944320, + "router_z_loss_mlp": 0.32958984, + "step": 1640, + "time_per_iteration": 2.666520357131958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075633, + "balance_loss_mlp": 1.04196858, + "epoch": 0.31569834551750675, + "flos": 658047822336.0, + "grad_norm": 0.06483249406864507, + "language_loss": 0.86052895, + "learning_rate": 0.0008008052136625774, + "loss": 0.8712852, + "num_input_tokens_seen": 136019456, + "router_z_loss_mlp": 0.33691406, + "step": 1641, + "time_per_iteration": 2.8062589168548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078165, + "balance_loss_mlp": 1.04407096, + "epoch": 0.3158907272027703, + "flos": 566002681344.0, + "grad_norm": 0.05792040128516231, + "language_loss": 0.86837387, + "learning_rate": 0.0008005562985166666, + "loss": 0.87915552, + "num_input_tokens_seen": 136091232, + "router_z_loss_mlp": 0.34130859, + "step": 1642, + "time_per_iteration": 2.6996512413024902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081029, + "balance_loss_mlp": 1.04576707, + "epoch": 0.31608310888803387, + "flos": 536622216192.0, + "grad_norm": 0.04642534602938139, + "language_loss": 0.84936821, + "learning_rate": 0.0008003072666844524, + "loss": 0.86017853, + "num_input_tokens_seen": 136165088, + "router_z_loss_mlp": 0.35302734, + "step": 1643, + "time_per_iteration": 2.6999776363372803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078239, + "balance_loss_mlp": 1.04292917, + "epoch": 0.3162754905732974, + "flos": 486428239872.0, + "grad_norm": 0.08271259063406261, + "language_loss": 0.82613683, + "learning_rate": 0.0008000581182626173, + "loss": 0.83691919, + "num_input_tokens_seen": 136230368, + "router_z_loss_mlp": 0.35302734, + "step": 1644, + "time_per_iteration": 2.541093111038208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082389, + "balance_loss_mlp": 1.04893875, + "epoch": 0.316467872258561, + "flos": 529792538112.0, + "grad_norm": 0.058359985905672214, + "language_loss": 0.86275887, + "learning_rate": 0.0007998088533478894, + "loss": 0.87358278, + "num_input_tokens_seen": 136302512, + "router_z_loss_mlp": 0.33447266, + "step": 1645, + "time_per_iteration": 2.641402006149292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077931, + "balance_loss_mlp": 1.04309845, + "epoch": 0.3166602539438245, + "flos": 443197771776.0, + "grad_norm": 0.07387441321187599, + "language_loss": 0.84062803, + "learning_rate": 0.000799559472037042, + "loss": 0.85140741, + "num_input_tokens_seen": 136368064, + "router_z_loss_mlp": 0.34887695, + "step": 1646, + "time_per_iteration": 2.5274438858032227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076892, + "balance_loss_mlp": 1.04222584, + "epoch": 0.3168526356290881, + "flos": 645513103872.0, + "grad_norm": 0.053861363144643716, + "language_loss": 0.87875295, + "learning_rate": 0.0007993099744268932, + "loss": 0.8895219, + "num_input_tokens_seen": 136451520, + "router_z_loss_mlp": 0.34716797, + "step": 1647, + "time_per_iteration": 2.8893649578094482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070585, + "balance_loss_mlp": 1.03646684, + "epoch": 0.3170450173143517, + "flos": 585889403904.0, + "grad_norm": 0.05841982976759713, + "language_loss": 0.87792766, + "learning_rate": 0.000799060360614307, + "loss": 0.88863349, + "num_input_tokens_seen": 136521184, + "router_z_loss_mlp": 0.34155273, + "step": 1648, + "time_per_iteration": 2.6867480278015137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076826, + "balance_loss_mlp": 1.04273248, + "epoch": 0.3172373989996152, + "flos": 826763231232.0, + "grad_norm": 0.05654214693871822, + "language_loss": 0.83848637, + "learning_rate": 0.0007988106306961917, + "loss": 0.84925467, + "num_input_tokens_seen": 136612592, + "router_z_loss_mlp": 0.34130859, + "step": 1649, + "time_per_iteration": 3.1321003437042236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080036, + "balance_loss_mlp": 1.04577541, + "epoch": 0.3174297806848788, + "flos": 527153204736.0, + "grad_norm": 0.060794493166337976, + "language_loss": 0.84529203, + "learning_rate": 0.0007985607847695014, + "loss": 0.85609239, + "num_input_tokens_seen": 136684336, + "router_z_loss_mlp": 0.34301758, + "step": 1650, + "time_per_iteration": 2.6306049823760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081772, + "balance_loss_mlp": 1.04851258, + "epoch": 0.31762216237014235, + "flos": 712855544832.0, + "grad_norm": 0.05325998456044798, + "language_loss": 0.82638443, + "learning_rate": 0.0007983108229312345, + "loss": 0.83720207, + "num_input_tokens_seen": 136766400, + "router_z_loss_mlp": 0.33276367, + "step": 1651, + "time_per_iteration": 2.909571647644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108965, + "balance_loss_mlp": 1.05567503, + "epoch": 0.31781454405540593, + "flos": 483567736320.0, + "grad_norm": 0.0653784528473409, + "language_loss": 0.86672306, + "learning_rate": 0.0007980607452784351, + "loss": 0.87761962, + "num_input_tokens_seen": 136834016, + "router_z_loss_mlp": 0.33984375, + "step": 1652, + "time_per_iteration": 2.5339765548706055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100389, + "balance_loss_mlp": 1.06639075, + "epoch": 0.31800692574066947, + "flos": 548483249664.0, + "grad_norm": 0.0685029555550019, + "language_loss": 0.90562367, + "learning_rate": 0.0007978105519081919, + "loss": 0.91662765, + "num_input_tokens_seen": 136906288, + "router_z_loss_mlp": 0.34008789, + "step": 1653, + "time_per_iteration": 2.6916213035583496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096542, + "balance_loss_mlp": 1.06213784, + "epoch": 0.31819930742593305, + "flos": 516640951296.0, + "grad_norm": 0.07941193091019123, + "language_loss": 0.87969935, + "learning_rate": 0.0007975602429176385, + "loss": 0.89066482, + "num_input_tokens_seen": 136972416, + "router_z_loss_mlp": 0.34423828, + "step": 1654, + "time_per_iteration": 2.5997297763824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100835, + "balance_loss_mlp": 1.06695616, + "epoch": 0.31839168911119664, + "flos": 455748456960.0, + "grad_norm": 0.07129171690745044, + "language_loss": 0.81582803, + "learning_rate": 0.0007973098184039536, + "loss": 0.82683635, + "num_input_tokens_seen": 137044576, + "router_z_loss_mlp": 0.33911133, + "step": 1655, + "time_per_iteration": 2.654914140701294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096724, + "balance_loss_mlp": 1.06284511, + "epoch": 0.3185840707964602, + "flos": 625719513600.0, + "grad_norm": 0.05658637496419385, + "language_loss": 0.86710656, + "learning_rate": 0.0007970592784643602, + "loss": 0.87807381, + "num_input_tokens_seen": 137125120, + "router_z_loss_mlp": 0.33911133, + "step": 1656, + "time_per_iteration": 2.846390962600708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090807, + "balance_loss_mlp": 1.05719042, + "epoch": 0.31877645248172376, + "flos": 567213249024.0, + "grad_norm": 0.058346793379709355, + "language_loss": 0.85032123, + "learning_rate": 0.0007968086231961272, + "loss": 0.8612293, + "num_input_tokens_seen": 137195344, + "router_z_loss_mlp": 0.33642578, + "step": 1657, + "time_per_iteration": 2.652986526489258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094073, + "balance_loss_mlp": 1.0593828, + "epoch": 0.3189688341669873, + "flos": 489338205696.0, + "grad_norm": 0.08644842740903268, + "language_loss": 0.836254, + "learning_rate": 0.0007965578526965671, + "loss": 0.84719473, + "num_input_tokens_seen": 137261040, + "router_z_loss_mlp": 0.34741211, + "step": 1658, + "time_per_iteration": 2.5607872009277344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092531, + "balance_loss_mlp": 1.05860353, + "epoch": 0.3191612158522509, + "flos": 575948938752.0, + "grad_norm": 0.04707712809776705, + "language_loss": 0.86020696, + "learning_rate": 0.0007963069670630377, + "loss": 0.87113225, + "num_input_tokens_seen": 137334400, + "router_z_loss_mlp": 0.33959961, + "step": 1659, + "time_per_iteration": 2.7435247898101807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097687, + "balance_loss_mlp": 1.06447566, + "epoch": 0.3193535975375144, + "flos": 537867689472.0, + "grad_norm": 0.062321727217464123, + "language_loss": 0.87956834, + "learning_rate": 0.0007960559663929416, + "loss": 0.89054519, + "num_input_tokens_seen": 137405344, + "router_z_loss_mlp": 0.33227539, + "step": 1660, + "time_per_iteration": 2.6282846927642822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096682, + "balance_loss_mlp": 1.06265998, + "epoch": 0.319545979222778, + "flos": 733954245120.0, + "grad_norm": 0.07201541894751945, + "language_loss": 0.87465358, + "learning_rate": 0.0007958048507837259, + "loss": 0.88562042, + "num_input_tokens_seen": 137486016, + "router_z_loss_mlp": 0.34057617, + "step": 1661, + "time_per_iteration": 2.9250974655151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099132, + "balance_loss_mlp": 1.0647999, + "epoch": 0.31973836090804153, + "flos": 764136433152.0, + "grad_norm": 0.0721917610669121, + "language_loss": 0.87230003, + "learning_rate": 0.0007955536203328822, + "loss": 0.8832913, + "num_input_tokens_seen": 137562304, + "router_z_loss_mlp": 0.34375, + "step": 1662, + "time_per_iteration": 2.899735450744629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109485, + "balance_loss_mlp": 1.06109047, + "epoch": 0.3199307425933051, + "flos": 560252560896.0, + "grad_norm": 0.06666532975578916, + "language_loss": 0.83308822, + "learning_rate": 0.0007953022751379469, + "loss": 0.84403676, + "num_input_tokens_seen": 137639248, + "router_z_loss_mlp": 0.33789062, + "step": 1663, + "time_per_iteration": 2.7743418216705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093375, + "balance_loss_mlp": 1.05899549, + "epoch": 0.3201231242785687, + "flos": 751019751936.0, + "grad_norm": 0.058114271957014456, + "language_loss": 0.81677037, + "learning_rate": 0.000795050815296501, + "loss": 0.82770407, + "num_input_tokens_seen": 137718256, + "router_z_loss_mlp": 0.34399414, + "step": 1664, + "time_per_iteration": 2.9620323181152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091229, + "balance_loss_mlp": 1.05768323, + "epoch": 0.32031550596383224, + "flos": 496157709312.0, + "grad_norm": 0.061791342299560625, + "language_loss": 0.93274921, + "learning_rate": 0.0007947992409061695, + "loss": 0.94366151, + "num_input_tokens_seen": 137785216, + "router_z_loss_mlp": 0.33569336, + "step": 1665, + "time_per_iteration": 2.6097099781036377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083934, + "balance_loss_mlp": 1.05022132, + "epoch": 0.3205078876490958, + "flos": 731294562816.0, + "grad_norm": 0.05133774923717053, + "language_loss": 0.86471802, + "learning_rate": 0.0007945475520646226, + "loss": 0.8755573, + "num_input_tokens_seen": 137863424, + "router_z_loss_mlp": 0.33740234, + "step": 1666, + "time_per_iteration": 2.9224231243133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088823, + "balance_loss_mlp": 1.05487204, + "epoch": 0.32070026933435936, + "flos": 549177283584.0, + "grad_norm": 0.1345109768982335, + "language_loss": 0.8496111, + "learning_rate": 0.0007942957488695743, + "loss": 0.86049932, + "num_input_tokens_seen": 137930384, + "router_z_loss_mlp": 0.33959961, + "step": 1667, + "time_per_iteration": 2.6267666816711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086537, + "balance_loss_mlp": 1.05265749, + "epoch": 0.32089265101962294, + "flos": 744949536768.0, + "grad_norm": 0.061316479944915916, + "language_loss": 0.80963373, + "learning_rate": 0.0007940438314187833, + "loss": 0.82049918, + "num_input_tokens_seen": 138017200, + "router_z_loss_mlp": 0.33886719, + "step": 1668, + "time_per_iteration": 3.00421142578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089646, + "balance_loss_mlp": 1.05638647, + "epoch": 0.3210850327048865, + "flos": 493937395200.0, + "grad_norm": 0.05864654089818211, + "language_loss": 0.80047274, + "learning_rate": 0.0007937917998100529, + "loss": 0.81136918, + "num_input_tokens_seen": 138084048, + "router_z_loss_mlp": 0.33276367, + "step": 1669, + "time_per_iteration": 2.607917070388794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100772, + "balance_loss_mlp": 1.06610548, + "epoch": 0.32127741439015006, + "flos": 530383265280.0, + "grad_norm": 0.060159342011431034, + "language_loss": 0.78680766, + "learning_rate": 0.0007935396541412302, + "loss": 0.79781532, + "num_input_tokens_seen": 138153280, + "router_z_loss_mlp": 0.34692383, + "step": 1670, + "time_per_iteration": 2.6022346019744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108678, + "balance_loss_mlp": 1.07458389, + "epoch": 0.3214697960754136, + "flos": 500948955648.0, + "grad_norm": 0.07085567852213893, + "language_loss": 0.85879421, + "learning_rate": 0.0007932873945102068, + "loss": 0.86988097, + "num_input_tokens_seen": 138222320, + "router_z_loss_mlp": 0.34130859, + "step": 1671, + "time_per_iteration": 2.581815719604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120776, + "balance_loss_mlp": 1.10942781, + "epoch": 0.3216621777606772, + "flos": 1382579394048.0, + "grad_norm": 0.04969555951860313, + "language_loss": 0.75761777, + "learning_rate": 0.0007930350210149188, + "loss": 0.76882553, + "num_input_tokens_seen": 138449488, + "router_z_loss_mlp": 0.11328125, + "step": 1672, + "time_per_iteration": 4.821724891662598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106649, + "balance_loss_mlp": 1.07193518, + "epoch": 0.32185455944594077, + "flos": 571260999168.0, + "grad_norm": 0.05896773993357689, + "language_loss": 0.86527109, + "learning_rate": 0.0007927825337533461, + "loss": 0.87633765, + "num_input_tokens_seen": 138522496, + "router_z_loss_mlp": 0.34765625, + "step": 1673, + "time_per_iteration": 2.6640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105888, + "balance_loss_mlp": 1.07184219, + "epoch": 0.3220469411312043, + "flos": 543652715520.0, + "grad_norm": 0.06618360944756078, + "language_loss": 0.84761298, + "learning_rate": 0.0007925299328235131, + "loss": 0.85867184, + "num_input_tokens_seen": 138590096, + "router_z_loss_mlp": 0.34057617, + "step": 1674, + "time_per_iteration": 2.6524405479431152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102416, + "balance_loss_mlp": 1.06705832, + "epoch": 0.3222393228164679, + "flos": 490884834816.0, + "grad_norm": 0.05872681692102293, + "language_loss": 0.85148364, + "learning_rate": 0.000792277218323488, + "loss": 0.86250782, + "num_input_tokens_seen": 138658224, + "router_z_loss_mlp": 0.35424805, + "step": 1675, + "time_per_iteration": 2.557460069656372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100116, + "balance_loss_mlp": 1.06523526, + "epoch": 0.3224317045017314, + "flos": 490145720832.0, + "grad_norm": 0.05188137415598196, + "language_loss": 0.84647608, + "learning_rate": 0.0007920243903513833, + "loss": 0.85747719, + "num_input_tokens_seen": 138722864, + "router_z_loss_mlp": 0.34912109, + "step": 1676, + "time_per_iteration": 2.5208208560943604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092752, + "balance_loss_mlp": 1.05813313, + "epoch": 0.322624086186995, + "flos": 575505188352.0, + "grad_norm": 0.06429192544800656, + "language_loss": 0.83992624, + "learning_rate": 0.0007917714490053556, + "loss": 0.8508538, + "num_input_tokens_seen": 138791472, + "router_z_loss_mlp": 0.34667969, + "step": 1677, + "time_per_iteration": 2.653686761856079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109123, + "balance_loss_mlp": 1.05668271, + "epoch": 0.32281646787225854, + "flos": 628974305280.0, + "grad_norm": 0.048890211607645416, + "language_loss": 0.86094737, + "learning_rate": 0.0007915183943836055, + "loss": 0.87185967, + "num_input_tokens_seen": 138873424, + "router_z_loss_mlp": 0.34594727, + "step": 1678, + "time_per_iteration": 2.852612018585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083818, + "balance_loss_mlp": 1.04950905, + "epoch": 0.3230088495575221, + "flos": 781036024320.0, + "grad_norm": 0.05620908679364121, + "language_loss": 0.83880055, + "learning_rate": 0.0007912652265843773, + "loss": 0.8496387, + "num_input_tokens_seen": 138956880, + "router_z_loss_mlp": 0.34350586, + "step": 1679, + "time_per_iteration": 3.006805419921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077767, + "balance_loss_mlp": 1.0433867, + "epoch": 0.3232012312427857, + "flos": 535839432192.0, + "grad_norm": 0.04762836982551939, + "language_loss": 0.81587136, + "learning_rate": 0.0007910119457059597, + "loss": 0.82664907, + "num_input_tokens_seen": 139031296, + "router_z_loss_mlp": 0.34423828, + "step": 1680, + "time_per_iteration": 2.6930737495422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076411, + "balance_loss_mlp": 1.04129148, + "epoch": 0.32339361292804925, + "flos": 704515553280.0, + "grad_norm": 0.06110281418881611, + "language_loss": 0.80031025, + "learning_rate": 0.0007907585518466849, + "loss": 0.81107438, + "num_input_tokens_seen": 139109776, + "router_z_loss_mlp": 0.35180664, + "step": 1681, + "time_per_iteration": 2.940950870513916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081204, + "balance_loss_mlp": 1.04603744, + "epoch": 0.32358599461331283, + "flos": 452099377152.0, + "grad_norm": 0.0474614445796137, + "language_loss": 0.90124965, + "learning_rate": 0.000790505045104929, + "loss": 0.91206169, + "num_input_tokens_seen": 139174736, + "router_z_loss_mlp": 0.35205078, + "step": 1682, + "time_per_iteration": 2.4919469356536865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078264, + "balance_loss_mlp": 1.0435977, + "epoch": 0.32377837629857636, + "flos": 600597794304.0, + "grad_norm": 0.057051782898604, + "language_loss": 0.86989701, + "learning_rate": 0.0007902514255791125, + "loss": 0.88067961, + "num_input_tokens_seen": 139252064, + "router_z_loss_mlp": 0.34692383, + "step": 1683, + "time_per_iteration": 2.7545859813690186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108269, + "balance_loss_mlp": 1.04721308, + "epoch": 0.32397075798383995, + "flos": 807180636672.0, + "grad_norm": 0.05145240385219177, + "language_loss": 0.87981123, + "learning_rate": 0.0007899976933676986, + "loss": 0.89063811, + "num_input_tokens_seen": 139333328, + "router_z_loss_mlp": 0.35498047, + "step": 1684, + "time_per_iteration": 2.97807240486145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081365, + "balance_loss_mlp": 1.04638934, + "epoch": 0.3241631396691035, + "flos": 601414073856.0, + "grad_norm": 0.06429290680378846, + "language_loss": 0.8767072, + "learning_rate": 0.0007897438485691955, + "loss": 0.88752091, + "num_input_tokens_seen": 139400976, + "router_z_loss_mlp": 0.3503418, + "step": 1685, + "time_per_iteration": 2.704326868057251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083548, + "balance_loss_mlp": 1.04826176, + "epoch": 0.32435552135436707, + "flos": 473980861440.0, + "grad_norm": 0.058364951070402814, + "language_loss": 0.82023847, + "learning_rate": 0.0007894898912821542, + "loss": 0.831074, + "num_input_tokens_seen": 139465664, + "router_z_loss_mlp": 0.3527832, + "step": 1686, + "time_per_iteration": 2.5206680297851562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076257, + "balance_loss_mlp": 1.04178166, + "epoch": 0.3245479030396306, + "flos": 537824019456.0, + "grad_norm": 0.04476181031616706, + "language_loss": 0.86661267, + "learning_rate": 0.0007892358216051695, + "loss": 0.87737525, + "num_input_tokens_seen": 139541984, + "router_z_loss_mlp": 0.3449707, + "step": 1687, + "time_per_iteration": 2.7332026958465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075747, + "balance_loss_mlp": 1.04108071, + "epoch": 0.3247402847248942, + "flos": 547394927616.0, + "grad_norm": 0.05643246072623682, + "language_loss": 0.92275292, + "learning_rate": 0.0007889816396368803, + "loss": 0.93351042, + "num_input_tokens_seen": 139607408, + "router_z_loss_mlp": 0.34692383, + "step": 1688, + "time_per_iteration": 2.6158432960510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077832, + "balance_loss_mlp": 1.04388082, + "epoch": 0.3249326664101578, + "flos": 377941814784.0, + "grad_norm": 0.04953960067471088, + "language_loss": 0.85575634, + "learning_rate": 0.0007887273454759687, + "loss": 0.86653465, + "num_input_tokens_seen": 139670000, + "router_z_loss_mlp": 0.33984375, + "step": 1689, + "time_per_iteration": 2.4958317279815674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075156, + "balance_loss_mlp": 1.04051399, + "epoch": 0.3251250480954213, + "flos": 527818125312.0, + "grad_norm": 0.050587956688220255, + "language_loss": 0.82717729, + "learning_rate": 0.0007884729392211603, + "loss": 0.83792883, + "num_input_tokens_seen": 139739872, + "router_z_loss_mlp": 0.34692383, + "step": 1690, + "time_per_iteration": 2.6325736045837402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075634, + "balance_loss_mlp": 1.04113472, + "epoch": 0.3253174297806849, + "flos": 449435312640.0, + "grad_norm": 0.06211432544239721, + "language_loss": 0.85214412, + "learning_rate": 0.0007882184209712245, + "loss": 0.8629005, + "num_input_tokens_seen": 139802032, + "router_z_loss_mlp": 0.34545898, + "step": 1691, + "time_per_iteration": 2.5199012756347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076936, + "balance_loss_mlp": 1.04303288, + "epoch": 0.32550981146594843, + "flos": 703855014912.0, + "grad_norm": 0.0444021152083115, + "language_loss": 0.85646939, + "learning_rate": 0.000787963790824974, + "loss": 0.86723876, + "num_input_tokens_seen": 139885648, + "router_z_loss_mlp": 0.33935547, + "step": 1692, + "time_per_iteration": 2.9585483074188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076864, + "balance_loss_mlp": 1.04217362, + "epoch": 0.325702193151212, + "flos": 392491643904.0, + "grad_norm": 0.06035071191190156, + "language_loss": 0.89588344, + "learning_rate": 0.0007877090488812651, + "loss": 0.90665203, + "num_input_tokens_seen": 139947920, + "router_z_loss_mlp": 0.34716797, + "step": 1693, + "time_per_iteration": 2.4247992038726807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073167, + "balance_loss_mlp": 1.0379529, + "epoch": 0.32589457483647555, + "flos": 577223525376.0, + "grad_norm": 0.051335929306222446, + "language_loss": 0.83377099, + "learning_rate": 0.0007874541952389973, + "loss": 0.84450269, + "num_input_tokens_seen": 140020048, + "router_z_loss_mlp": 0.35253906, + "step": 1694, + "time_per_iteration": 2.679868459701538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074307, + "balance_loss_mlp": 1.03947401, + "epoch": 0.32608695652173914, + "flos": 498092834304.0, + "grad_norm": 0.051580366849716015, + "language_loss": 0.86795568, + "learning_rate": 0.0007871992299971136, + "loss": 0.87869877, + "num_input_tokens_seen": 140085600, + "router_z_loss_mlp": 0.34887695, + "step": 1695, + "time_per_iteration": 2.6005072593688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079265, + "balance_loss_mlp": 1.04431272, + "epoch": 0.32627933820700267, + "flos": 590858150400.0, + "grad_norm": 0.054409905067417906, + "language_loss": 0.84529006, + "learning_rate": 0.0007869441532546001, + "loss": 0.85608268, + "num_input_tokens_seen": 140155152, + "router_z_loss_mlp": 0.34985352, + "step": 1696, + "time_per_iteration": 2.7373292446136475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071229, + "balance_loss_mlp": 1.03749299, + "epoch": 0.32647171989226625, + "flos": 608790809088.0, + "grad_norm": 0.05196776598603691, + "language_loss": 0.79551816, + "learning_rate": 0.0007866889651104867, + "loss": 0.80623043, + "num_input_tokens_seen": 140228560, + "router_z_loss_mlp": 0.33764648, + "step": 1697, + "time_per_iteration": 2.768869638442993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069598, + "balance_loss_mlp": 1.03464603, + "epoch": 0.32666410157752984, + "flos": 476896619520.0, + "grad_norm": 0.05699082390473629, + "language_loss": 0.83313, + "learning_rate": 0.000786433665663846, + "loss": 0.84382606, + "num_input_tokens_seen": 140297952, + "router_z_loss_mlp": 0.34985352, + "step": 1698, + "time_per_iteration": 2.6574184894561768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070595, + "balance_loss_mlp": 1.03664398, + "epoch": 0.3268564832627934, + "flos": 718060018176.0, + "grad_norm": 0.0499104315286651, + "language_loss": 0.86441195, + "learning_rate": 0.0007861782550137942, + "loss": 0.8751179, + "num_input_tokens_seen": 140373408, + "router_z_loss_mlp": 0.33984375, + "step": 1699, + "time_per_iteration": 2.8897016048431396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071541, + "balance_loss_mlp": 1.0379957, + "epoch": 0.32704886494805696, + "flos": 768469372416.0, + "grad_norm": 0.05892131453680714, + "language_loss": 0.85990739, + "learning_rate": 0.0007859227332594901, + "loss": 0.87062275, + "num_input_tokens_seen": 140451840, + "router_z_loss_mlp": 0.33569336, + "step": 1700, + "time_per_iteration": 2.8941755294799805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080492, + "balance_loss_mlp": 1.046803, + "epoch": 0.3272412466333205, + "flos": 849540980736.0, + "grad_norm": 0.0647173620985618, + "language_loss": 0.84537613, + "learning_rate": 0.0007856671005001365, + "loss": 0.85618103, + "num_input_tokens_seen": 140537696, + "router_z_loss_mlp": 0.3371582, + "step": 1701, + "time_per_iteration": 3.1362555027008057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107778, + "balance_loss_mlp": 1.04373336, + "epoch": 0.3274336283185841, + "flos": 831224208384.0, + "grad_norm": 0.055785838120560656, + "language_loss": 0.81608075, + "learning_rate": 0.0007854113568349787, + "loss": 0.82685852, + "num_input_tokens_seen": 140623536, + "router_z_loss_mlp": 0.34082031, + "step": 1702, + "time_per_iteration": 3.0684425830841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108348, + "balance_loss_mlp": 1.04900455, + "epoch": 0.3276260100038476, + "flos": 691721938944.0, + "grad_norm": 0.059478075679183354, + "language_loss": 0.80008304, + "learning_rate": 0.0007851555023633052, + "loss": 0.81091785, + "num_input_tokens_seen": 140700688, + "router_z_loss_mlp": 0.34521484, + "step": 1703, + "time_per_iteration": 2.829838991165161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083551, + "balance_loss_mlp": 1.04974365, + "epoch": 0.3278183916891112, + "flos": 435831211008.0, + "grad_norm": 0.05938301584715095, + "language_loss": 0.82290888, + "learning_rate": 0.0007848995371844474, + "loss": 0.83374435, + "num_input_tokens_seen": 140765808, + "router_z_loss_mlp": 0.33837891, + "step": 1704, + "time_per_iteration": 2.498462200164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108751, + "balance_loss_mlp": 1.0532254, + "epoch": 0.3280107733743748, + "flos": 460883119104.0, + "grad_norm": 0.06015932024064871, + "language_loss": 0.80622214, + "learning_rate": 0.0007846434613977801, + "loss": 0.81709725, + "num_input_tokens_seen": 140830512, + "router_z_loss_mlp": 0.34326172, + "step": 1705, + "time_per_iteration": 2.4933369159698486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087045, + "balance_loss_mlp": 1.05330932, + "epoch": 0.3282031550596383, + "flos": 679018484736.0, + "grad_norm": 0.05558890685700398, + "language_loss": 0.78265798, + "learning_rate": 0.0007843872751027203, + "loss": 0.7935285, + "num_input_tokens_seen": 140902816, + "router_z_loss_mlp": 0.33764648, + "step": 1706, + "time_per_iteration": 2.81091046333313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090191, + "balance_loss_mlp": 1.05621648, + "epoch": 0.3283955367449019, + "flos": 544821023232.0, + "grad_norm": 0.10097233050810657, + "language_loss": 0.87312186, + "learning_rate": 0.0007841309783987287, + "loss": 0.88402379, + "num_input_tokens_seen": 140975488, + "router_z_loss_mlp": 0.34008789, + "step": 1707, + "time_per_iteration": 2.7456212043762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083118, + "balance_loss_mlp": 1.0490005, + "epoch": 0.32858791843016544, + "flos": 481017153024.0, + "grad_norm": 0.06288690811568091, + "language_loss": 0.89185357, + "learning_rate": 0.0007838745713853084, + "loss": 0.90268475, + "num_input_tokens_seen": 141043248, + "router_z_loss_mlp": 0.34155273, + "step": 1708, + "time_per_iteration": 2.5734565258026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086944, + "balance_loss_mlp": 1.0529933, + "epoch": 0.328780300115429, + "flos": 566529389568.0, + "grad_norm": 0.059735917623485235, + "language_loss": 0.84101981, + "learning_rate": 0.0007836180541620053, + "loss": 0.85188925, + "num_input_tokens_seen": 141119408, + "router_z_loss_mlp": 0.33984375, + "step": 1709, + "time_per_iteration": 2.6734848022460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082239, + "balance_loss_mlp": 1.04843152, + "epoch": 0.32897268180069256, + "flos": 475787948544.0, + "grad_norm": 0.06557165815913592, + "language_loss": 0.86666405, + "learning_rate": 0.0007833614268284082, + "loss": 0.87748647, + "num_input_tokens_seen": 141184112, + "router_z_loss_mlp": 0.33813477, + "step": 1710, + "time_per_iteration": 2.5004236698150635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109204, + "balance_loss_mlp": 1.07611382, + "epoch": 0.32916506348595614, + "flos": 1576517008896.0, + "grad_norm": 0.028486921929439343, + "language_loss": 0.74109769, + "learning_rate": 0.0007831046894841489, + "loss": 0.75201809, + "num_input_tokens_seen": 141414960, + "router_z_loss_mlp": 0.15917969, + "step": 1711, + "time_per_iteration": 4.857421159744263 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084065, + "balance_loss_mlp": 1.05011439, + "epoch": 0.3293574451712197, + "flos": 482646739968.0, + "grad_norm": 0.05383776069577274, + "language_loss": 0.78376174, + "learning_rate": 0.0007828478422289016, + "loss": 0.79460239, + "num_input_tokens_seen": 141485744, + "router_z_loss_mlp": 0.33984375, + "step": 1712, + "time_per_iteration": 2.5763661861419678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089524, + "balance_loss_mlp": 1.05438161, + "epoch": 0.32954982685648326, + "flos": 622266872832.0, + "grad_norm": 0.05220026625301518, + "language_loss": 0.89185119, + "learning_rate": 0.0007825908851623833, + "loss": 0.90274644, + "num_input_tokens_seen": 141560592, + "router_z_loss_mlp": 0.35205078, + "step": 1713, + "time_per_iteration": 2.7262768745422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081531, + "balance_loss_mlp": 1.04648352, + "epoch": 0.32974220854174685, + "flos": 544697367552.0, + "grad_norm": 0.06806070360888057, + "language_loss": 0.85360491, + "learning_rate": 0.0007823338183843533, + "loss": 0.86442018, + "num_input_tokens_seen": 141630400, + "router_z_loss_mlp": 0.35083008, + "step": 1714, + "time_per_iteration": 2.652278184890747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078166, + "balance_loss_mlp": 1.0447638, + "epoch": 0.3299345902270104, + "flos": 981740708352.0, + "grad_norm": 0.05603975865081876, + "language_loss": 0.8075726, + "learning_rate": 0.0007820766419946141, + "loss": 0.81835425, + "num_input_tokens_seen": 141721552, + "router_z_loss_mlp": 0.33422852, + "step": 1715, + "time_per_iteration": 3.282278537750244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087722, + "balance_loss_mlp": 1.07227242, + "epoch": 0.33012697191227397, + "flos": 1402857432576.0, + "grad_norm": 0.02753251532821737, + "language_loss": 0.7967248, + "learning_rate": 0.0007818193560930102, + "loss": 0.80760199, + "num_input_tokens_seen": 141956464, + "router_z_loss_mlp": 0.15429688, + "step": 1716, + "time_per_iteration": 4.925649881362915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081747, + "balance_loss_mlp": 1.04789162, + "epoch": 0.3303193535975375, + "flos": 504897781248.0, + "grad_norm": 0.09582479469105179, + "language_loss": 0.75968826, + "learning_rate": 0.0007815619607794288, + "loss": 0.77050573, + "num_input_tokens_seen": 142029552, + "router_z_loss_mlp": 0.33886719, + "step": 1717, + "time_per_iteration": 2.653355598449707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077451, + "balance_loss_mlp": 1.04423952, + "epoch": 0.3305117352828011, + "flos": 937602390528.0, + "grad_norm": 0.059316474336830904, + "language_loss": 0.8254683, + "learning_rate": 0.0007813044561538001, + "loss": 0.83624279, + "num_input_tokens_seen": 142117344, + "router_z_loss_mlp": 0.33227539, + "step": 1718, + "time_per_iteration": 3.1251535415649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084283, + "balance_loss_mlp": 1.05035567, + "epoch": 0.3307041169680646, + "flos": 721176597504.0, + "grad_norm": 0.08429030846847434, + "language_loss": 0.88411027, + "learning_rate": 0.0007810468423160958, + "loss": 0.89495313, + "num_input_tokens_seen": 142190096, + "router_z_loss_mlp": 0.33959961, + "step": 1719, + "time_per_iteration": 2.8598783016204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090007, + "balance_loss_mlp": 1.05760598, + "epoch": 0.3308964986533282, + "flos": 583315499520.0, + "grad_norm": 0.04547421634197757, + "language_loss": 0.81920642, + "learning_rate": 0.0007807891193663306, + "loss": 0.8301065, + "num_input_tokens_seen": 142265584, + "router_z_loss_mlp": 0.32397461, + "step": 1720, + "time_per_iteration": 2.7775802612304688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092448, + "balance_loss_mlp": 1.0582583, + "epoch": 0.33108888033859174, + "flos": 473340672000.0, + "grad_norm": 0.07591254280459368, + "language_loss": 0.82440275, + "learning_rate": 0.0007805312874045614, + "loss": 0.83532727, + "num_input_tokens_seen": 142330352, + "router_z_loss_mlp": 0.34228516, + "step": 1721, + "time_per_iteration": 2.5138351917266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100808, + "balance_loss_mlp": 1.06657076, + "epoch": 0.3312812620238553, + "flos": 385913659392.0, + "grad_norm": 0.08101052667778896, + "language_loss": 0.86919391, + "learning_rate": 0.0007802733465308874, + "loss": 0.88020205, + "num_input_tokens_seen": 142392208, + "router_z_loss_mlp": 0.34277344, + "step": 1722, + "time_per_iteration": 2.440809726715088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106056, + "balance_loss_mlp": 1.07074606, + "epoch": 0.3314736437091189, + "flos": 494292395520.0, + "grad_norm": 0.0567806329299034, + "language_loss": 0.8509872, + "learning_rate": 0.0007800152968454501, + "loss": 0.86204773, + "num_input_tokens_seen": 142462112, + "router_z_loss_mlp": 0.35375977, + "step": 1723, + "time_per_iteration": 2.61602520942688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090428, + "balance_loss_mlp": 1.056072, + "epoch": 0.33166602539438245, + "flos": 653346736128.0, + "grad_norm": 0.038882210578800376, + "language_loss": 0.90476918, + "learning_rate": 0.0007797571384484334, + "loss": 0.91567349, + "num_input_tokens_seen": 142539120, + "router_z_loss_mlp": 0.34399414, + "step": 1724, + "time_per_iteration": 2.857562780380249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090291, + "balance_loss_mlp": 1.05586314, + "epoch": 0.33185840707964603, + "flos": 520550489088.0, + "grad_norm": 0.04870849772261114, + "language_loss": 0.91599178, + "learning_rate": 0.0007794988714400633, + "loss": 0.92689478, + "num_input_tokens_seen": 142611520, + "router_z_loss_mlp": 0.34448242, + "step": 1725, + "time_per_iteration": 2.5884361267089844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097227, + "balance_loss_mlp": 1.06077266, + "epoch": 0.33205078876490957, + "flos": 436712919552.0, + "grad_norm": 0.05260760436426434, + "language_loss": 0.85199809, + "learning_rate": 0.0007792404959206079, + "loss": 0.86297035, + "num_input_tokens_seen": 142676064, + "router_z_loss_mlp": 0.36474609, + "step": 1726, + "time_per_iteration": 2.4855122566223145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095942, + "balance_loss_mlp": 1.05965447, + "epoch": 0.33224317045017315, + "flos": 768400971264.0, + "grad_norm": 0.052329818141719754, + "language_loss": 0.81527805, + "learning_rate": 0.0007789820119903774, + "loss": 0.82623744, + "num_input_tokens_seen": 142750944, + "router_z_loss_mlp": 0.36279297, + "step": 1727, + "time_per_iteration": 2.945405960083008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105991, + "balance_loss_mlp": 1.04579556, + "epoch": 0.3324355521354367, + "flos": 1465656090624.0, + "grad_norm": 0.02932642968329903, + "language_loss": 0.78492665, + "learning_rate": 0.0007787234197497242, + "loss": 0.79552573, + "num_input_tokens_seen": 142974032, + "router_z_loss_mlp": 0.14160156, + "step": 1728, + "time_per_iteration": 4.810296297073364 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084477, + "balance_loss_mlp": 1.04880977, + "epoch": 0.3326279338207003, + "flos": 496415195136.0, + "grad_norm": 0.05339720943334808, + "language_loss": 0.83919221, + "learning_rate": 0.0007784647192990428, + "loss": 0.85003698, + "num_input_tokens_seen": 143047280, + "router_z_loss_mlp": 0.35693359, + "step": 1729, + "time_per_iteration": 2.6848132610321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083753, + "balance_loss_mlp": 1.04844344, + "epoch": 0.33282031550596386, + "flos": 635600342016.0, + "grad_norm": 0.05212570885713578, + "language_loss": 0.80661625, + "learning_rate": 0.0007782059107387696, + "loss": 0.81745386, + "num_input_tokens_seen": 143124224, + "router_z_loss_mlp": 0.35351562, + "step": 1730, + "time_per_iteration": 2.874356269836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078294, + "balance_loss_mlp": 1.04329371, + "epoch": 0.3330126971912274, + "flos": 689210643456.0, + "grad_norm": 0.05636936917064103, + "language_loss": 0.88407743, + "learning_rate": 0.0007779469941693826, + "loss": 0.89486033, + "num_input_tokens_seen": 143194048, + "router_z_loss_mlp": 0.3503418, + "step": 1731, + "time_per_iteration": 2.7914862632751465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079351, + "balance_loss_mlp": 1.04511368, + "epoch": 0.333205078876491, + "flos": 566184563712.0, + "grad_norm": 0.05730145040609657, + "language_loss": 0.77017218, + "learning_rate": 0.0007776879696914029, + "loss": 0.78096569, + "num_input_tokens_seen": 143272976, + "router_z_loss_mlp": 0.34277344, + "step": 1732, + "time_per_iteration": 2.8158769607543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081, + "balance_loss_mlp": 1.04666734, + "epoch": 0.3333974605617545, + "flos": 640618550784.0, + "grad_norm": 0.044212495165629015, + "language_loss": 0.8903594, + "learning_rate": 0.000777428837405392, + "loss": 0.90116942, + "num_input_tokens_seen": 143346496, + "router_z_loss_mlp": 0.34375, + "step": 1733, + "time_per_iteration": 2.8417906761169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079717, + "balance_loss_mlp": 1.04536092, + "epoch": 0.3335898422470181, + "flos": 461597501952.0, + "grad_norm": 0.05109390766697835, + "language_loss": 0.87070495, + "learning_rate": 0.0007771695974119544, + "loss": 0.88150203, + "num_input_tokens_seen": 143410448, + "router_z_loss_mlp": 0.34399414, + "step": 1734, + "time_per_iteration": 2.4995057582855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078764, + "balance_loss_mlp": 1.04514742, + "epoch": 0.33378222393228163, + "flos": 852504791040.0, + "grad_norm": 0.05825672376588237, + "language_loss": 0.75576115, + "learning_rate": 0.0007769102498117359, + "loss": 0.76654887, + "num_input_tokens_seen": 143492416, + "router_z_loss_mlp": 0.33642578, + "step": 1735, + "time_per_iteration": 3.0868663787841797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083295, + "balance_loss_mlp": 1.04991651, + "epoch": 0.3339746056175452, + "flos": 954256080384.0, + "grad_norm": 0.05069255593645712, + "language_loss": 0.79858601, + "learning_rate": 0.000776650794705424, + "loss": 0.80941892, + "num_input_tokens_seen": 143590096, + "router_z_loss_mlp": 0.33398438, + "step": 1736, + "time_per_iteration": 3.2665328979492188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108809, + "balance_loss_mlp": 1.05387688, + "epoch": 0.33416698730280875, + "flos": 544559155200.0, + "grad_norm": 0.045819605067785145, + "language_loss": 0.82160866, + "learning_rate": 0.0007763912321937483, + "loss": 0.83248949, + "num_input_tokens_seen": 143663344, + "router_z_loss_mlp": 0.34228516, + "step": 1737, + "time_per_iteration": 2.677316665649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081816, + "balance_loss_mlp": 1.04798412, + "epoch": 0.33435936898807234, + "flos": 1013652817920.0, + "grad_norm": 0.053421386657792044, + "language_loss": 0.82471478, + "learning_rate": 0.0007761315623774799, + "loss": 0.8355329, + "num_input_tokens_seen": 143753072, + "router_z_loss_mlp": 0.33862305, + "step": 1738, + "time_per_iteration": 3.4182283878326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089571, + "balance_loss_mlp": 1.05554879, + "epoch": 0.3345517506733359, + "flos": 614935217664.0, + "grad_norm": 0.051536505858366714, + "language_loss": 0.87671852, + "learning_rate": 0.0007758717853574313, + "loss": 0.88761419, + "num_input_tokens_seen": 143827280, + "router_z_loss_mlp": 0.34057617, + "step": 1739, + "time_per_iteration": 2.7348380088806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084561, + "balance_loss_mlp": 1.05125391, + "epoch": 0.33474413235859946, + "flos": 494350622208.0, + "grad_norm": 0.06141180611747274, + "language_loss": 0.9002257, + "learning_rate": 0.0007756119012344571, + "loss": 0.91107136, + "num_input_tokens_seen": 143895072, + "router_z_loss_mlp": 0.33325195, + "step": 1740, + "time_per_iteration": 2.536121129989624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091135, + "balance_loss_mlp": 1.05754209, + "epoch": 0.33493651404386304, + "flos": 628105743360.0, + "grad_norm": 0.06662069566578578, + "language_loss": 0.84404671, + "learning_rate": 0.0007753519101094535, + "loss": 0.85495806, + "num_input_tokens_seen": 143965728, + "router_z_loss_mlp": 0.33618164, + "step": 1741, + "time_per_iteration": 2.753371238708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082833, + "balance_loss_mlp": 1.04945421, + "epoch": 0.3351288957291266, + "flos": 513474909696.0, + "grad_norm": 0.05750412427252262, + "language_loss": 0.86366677, + "learning_rate": 0.0007750918120833575, + "loss": 0.87449515, + "num_input_tokens_seen": 144030272, + "router_z_loss_mlp": 0.33398438, + "step": 1742, + "time_per_iteration": 2.56093168258667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082276, + "balance_loss_mlp": 1.05037546, + "epoch": 0.33532127741439016, + "flos": 647008860672.0, + "grad_norm": 0.0676260973392943, + "language_loss": 0.87342101, + "learning_rate": 0.0007748316072571485, + "loss": 0.88424373, + "num_input_tokens_seen": 144104048, + "router_z_loss_mlp": 0.31884766, + "step": 1743, + "time_per_iteration": 2.7759556770324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086123, + "balance_loss_mlp": 1.05193388, + "epoch": 0.3355136590996537, + "flos": 768134721024.0, + "grad_norm": 0.047185436483198326, + "language_loss": 0.79306734, + "learning_rate": 0.0007745712957318467, + "loss": 0.80392861, + "num_input_tokens_seen": 144180432, + "router_z_loss_mlp": 0.34228516, + "step": 1744, + "time_per_iteration": 2.959686756134033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085284, + "balance_loss_mlp": 1.05119014, + "epoch": 0.3357060407849173, + "flos": 595259490816.0, + "grad_norm": 0.046948111550021425, + "language_loss": 0.86506951, + "learning_rate": 0.0007743108776085141, + "loss": 0.87592232, + "num_input_tokens_seen": 144258704, + "router_z_loss_mlp": 0.34106445, + "step": 1745, + "time_per_iteration": 2.7391204833984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089169, + "balance_loss_mlp": 1.05462217, + "epoch": 0.3358984224701808, + "flos": 598288730112.0, + "grad_norm": 0.04983419543630797, + "language_loss": 0.82728243, + "learning_rate": 0.0007740503529882543, + "loss": 0.8381741, + "num_input_tokens_seen": 144335104, + "router_z_loss_mlp": 0.34594727, + "step": 1746, + "time_per_iteration": 2.788041114807129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108953, + "balance_loss_mlp": 1.05474496, + "epoch": 0.3360908041554444, + "flos": 578055771648.0, + "grad_norm": 0.05677254755827829, + "language_loss": 0.91252941, + "learning_rate": 0.0007737897219722114, + "loss": 0.92342472, + "num_input_tokens_seen": 144402912, + "router_z_loss_mlp": 0.34790039, + "step": 1747, + "time_per_iteration": 2.6752376556396484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083715, + "balance_loss_mlp": 1.04945374, + "epoch": 0.336283185840708, + "flos": 513332315136.0, + "grad_norm": 0.05427874766165502, + "language_loss": 0.81146061, + "learning_rate": 0.0007735289846615716, + "loss": 0.82229781, + "num_input_tokens_seen": 144475328, + "router_z_loss_mlp": 0.34301758, + "step": 1748, + "time_per_iteration": 2.670315742492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083181, + "balance_loss_mlp": 1.04984999, + "epoch": 0.3364755675259715, + "flos": 524716102656.0, + "grad_norm": 0.05445380235157479, + "language_loss": 0.81899059, + "learning_rate": 0.0007732681411575621, + "loss": 0.82982242, + "num_input_tokens_seen": 144548288, + "router_z_loss_mlp": 0.33349609, + "step": 1749, + "time_per_iteration": 2.644740104675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079502, + "balance_loss_mlp": 1.04567027, + "epoch": 0.3366679492112351, + "flos": 554594162688.0, + "grad_norm": 0.05291201013717534, + "language_loss": 0.87517959, + "learning_rate": 0.0007730071915614514, + "loss": 0.88597459, + "num_input_tokens_seen": 144619488, + "router_z_loss_mlp": 0.33862305, + "step": 1750, + "time_per_iteration": 2.6605777740478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082694, + "balance_loss_mlp": 1.04874277, + "epoch": 0.33686033089649864, + "flos": 427051851264.0, + "grad_norm": 0.07867660779661921, + "language_loss": 0.88976741, + "learning_rate": 0.0007727461359745489, + "loss": 0.90059435, + "num_input_tokens_seen": 144682560, + "router_z_loss_mlp": 0.33984375, + "step": 1751, + "time_per_iteration": 2.4562768936157227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082471, + "balance_loss_mlp": 1.04987907, + "epoch": 0.3370527125817622, + "flos": 541452750336.0, + "grad_norm": 0.05472309390748721, + "language_loss": 0.86156446, + "learning_rate": 0.0007724849744982056, + "loss": 0.87238914, + "num_input_tokens_seen": 144753328, + "router_z_loss_mlp": 0.32592773, + "step": 1752, + "time_per_iteration": 2.683575391769409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086284, + "balance_loss_mlp": 1.05295336, + "epoch": 0.33724509426702576, + "flos": 541836864000.0, + "grad_norm": 0.052181206472060114, + "language_loss": 0.81578106, + "learning_rate": 0.0007722237072338131, + "loss": 0.82664388, + "num_input_tokens_seen": 144827312, + "router_z_loss_mlp": 0.33349609, + "step": 1753, + "time_per_iteration": 2.7059788703918457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108949, + "balance_loss_mlp": 1.05563486, + "epoch": 0.33743747595228935, + "flos": 472557888000.0, + "grad_norm": 0.063588606701447, + "language_loss": 0.85402888, + "learning_rate": 0.0007719623342828046, + "loss": 0.86492383, + "num_input_tokens_seen": 144893488, + "router_z_loss_mlp": 0.33886719, + "step": 1754, + "time_per_iteration": 2.5117459297180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090728, + "balance_loss_mlp": 1.05708706, + "epoch": 0.33762985763755293, + "flos": 469564964352.0, + "grad_norm": 0.05602573096673387, + "language_loss": 0.84115714, + "learning_rate": 0.000771700855746654, + "loss": 0.85206437, + "num_input_tokens_seen": 144961152, + "router_z_loss_mlp": 0.33666992, + "step": 1755, + "time_per_iteration": 2.5685064792633057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085423, + "balance_loss_mlp": 1.05214, + "epoch": 0.33782223932281646, + "flos": 492002270208.0, + "grad_norm": 0.05352941428578995, + "language_loss": 0.88329422, + "learning_rate": 0.0007714392717268763, + "loss": 0.89414847, + "num_input_tokens_seen": 145030576, + "router_z_loss_mlp": 0.33300781, + "step": 1756, + "time_per_iteration": 2.568432569503784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084365, + "balance_loss_mlp": 1.04981852, + "epoch": 0.33801462100808005, + "flos": 464827562496.0, + "grad_norm": 0.051807056092833426, + "language_loss": 0.86368155, + "learning_rate": 0.0007711775823250273, + "loss": 0.87452519, + "num_input_tokens_seen": 145095648, + "router_z_loss_mlp": 0.34594727, + "step": 1757, + "time_per_iteration": 2.5542263984680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010902, + "balance_loss_mlp": 1.05660665, + "epoch": 0.3382070026933436, + "flos": 795319603200.0, + "grad_norm": 0.05510084593487172, + "language_loss": 0.83019066, + "learning_rate": 0.0007709157876427039, + "loss": 0.84109271, + "num_input_tokens_seen": 145181248, + "router_z_loss_mlp": 0.33618164, + "step": 1758, + "time_per_iteration": 3.07852840423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082149, + "balance_loss_mlp": 1.04903245, + "epoch": 0.33839938437860717, + "flos": 508181686272.0, + "grad_norm": 0.0524958838474987, + "language_loss": 0.85602981, + "learning_rate": 0.0007706538877815439, + "loss": 0.86685127, + "num_input_tokens_seen": 145252944, + "router_z_loss_mlp": 0.33129883, + "step": 1759, + "time_per_iteration": 2.6002085208892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082716, + "balance_loss_mlp": 1.05021918, + "epoch": 0.3385917660638707, + "flos": 483986755584.0, + "grad_norm": 0.05079207863068971, + "language_loss": 0.83150595, + "learning_rate": 0.0007703918828432259, + "loss": 0.84233308, + "num_input_tokens_seen": 145323168, + "router_z_loss_mlp": 0.32495117, + "step": 1760, + "time_per_iteration": 2.5961215496063232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086286, + "balance_loss_mlp": 1.05297899, + "epoch": 0.3387841477491343, + "flos": 545071306752.0, + "grad_norm": 0.0542286668270813, + "language_loss": 0.89021361, + "learning_rate": 0.000770129772929469, + "loss": 0.9010765, + "num_input_tokens_seen": 145395776, + "router_z_loss_mlp": 0.33325195, + "step": 1761, + "time_per_iteration": 2.6393394470214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076289, + "balance_loss_mlp": 1.04264784, + "epoch": 0.3389765294343978, + "flos": 719487373824.0, + "grad_norm": 0.057381721603975526, + "language_loss": 0.88803625, + "learning_rate": 0.0007698675581420334, + "loss": 0.89879912, + "num_input_tokens_seen": 145470576, + "router_z_loss_mlp": 0.33666992, + "step": 1762, + "time_per_iteration": 2.8959014415740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073065, + "balance_loss_mlp": 1.03968656, + "epoch": 0.3391689111196614, + "flos": 699596269056.0, + "grad_norm": 0.05381480837735757, + "language_loss": 0.78922743, + "learning_rate": 0.0007696052385827199, + "loss": 0.79995811, + "num_input_tokens_seen": 145548896, + "router_z_loss_mlp": 0.33398438, + "step": 1763, + "time_per_iteration": 2.9110584259033203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068767, + "balance_loss_mlp": 1.03519773, + "epoch": 0.339361292804925, + "flos": 626806425600.0, + "grad_norm": 0.05521588721088573, + "language_loss": 0.78407156, + "learning_rate": 0.00076934281435337, + "loss": 0.79475927, + "num_input_tokens_seen": 145617136, + "router_z_loss_mlp": 0.33569336, + "step": 1764, + "time_per_iteration": 2.7673115730285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073935, + "balance_loss_mlp": 1.04043674, + "epoch": 0.33955367449018853, + "flos": 609302960640.0, + "grad_norm": 0.0615155635578628, + "language_loss": 0.85995364, + "learning_rate": 0.0007690802855558658, + "loss": 0.87069303, + "num_input_tokens_seen": 145696416, + "router_z_loss_mlp": 0.33520508, + "step": 1765, + "time_per_iteration": 2.871255397796631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083714, + "balance_loss_mlp": 1.07131636, + "epoch": 0.3397460561754521, + "flos": 1452494177280.0, + "grad_norm": 0.03113174858532202, + "language_loss": 0.76374954, + "learning_rate": 0.0007688176522921302, + "loss": 0.77458668, + "num_input_tokens_seen": 145919680, + "router_z_loss_mlp": 0.12402344, + "step": 1766, + "time_per_iteration": 4.906975746154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089831, + "balance_loss_mlp": 1.05485463, + "epoch": 0.33993843786071565, + "flos": 487068429312.0, + "grad_norm": 0.059784397062932884, + "language_loss": 0.89060128, + "learning_rate": 0.0007685549146641262, + "loss": 0.90149957, + "num_input_tokens_seen": 145984272, + "router_z_loss_mlp": 0.35009766, + "step": 1767, + "time_per_iteration": 2.5172948837280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081834, + "balance_loss_mlp": 1.04683375, + "epoch": 0.34013081954597923, + "flos": 417115768320.0, + "grad_norm": 0.05470212710373979, + "language_loss": 0.88085568, + "learning_rate": 0.0007682920727738579, + "loss": 0.89167398, + "num_input_tokens_seen": 146047248, + "router_z_loss_mlp": 0.35058594, + "step": 1768, + "time_per_iteration": 2.4423539638519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084324, + "balance_loss_mlp": 1.04939604, + "epoch": 0.34032320123124277, + "flos": 437293472256.0, + "grad_norm": 0.06228189549734304, + "language_loss": 0.84428132, + "learning_rate": 0.000768029126723369, + "loss": 0.85512453, + "num_input_tokens_seen": 146111872, + "router_z_loss_mlp": 0.34960938, + "step": 1769, + "time_per_iteration": 2.5054280757904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082733, + "balance_loss_mlp": 1.04811513, + "epoch": 0.34051558291650635, + "flos": 457353312768.0, + "grad_norm": 0.058774755629116764, + "language_loss": 0.81489038, + "learning_rate": 0.0007677660766147447, + "loss": 0.82571769, + "num_input_tokens_seen": 146172608, + "router_z_loss_mlp": 0.34667969, + "step": 1770, + "time_per_iteration": 2.524327039718628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01030223, + "balance_loss_mlp": 1.01844561, + "epoch": 0.3407079646017699, + "flos": 1558029938688.0, + "grad_norm": 0.017684799672329513, + "language_loss": 0.72470945, + "learning_rate": 0.0007675029225501102, + "loss": 0.7350117, + "num_input_tokens_seen": 146413584, + "router_z_loss_mlp": 0.11767578, + "step": 1771, + "time_per_iteration": 4.924427032470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081114, + "balance_loss_mlp": 1.04666233, + "epoch": 0.3409003462870335, + "flos": 492312190464.0, + "grad_norm": 0.06375677891517043, + "language_loss": 0.79619604, + "learning_rate": 0.0007672396646316306, + "loss": 0.80700719, + "num_input_tokens_seen": 146476992, + "router_z_loss_mlp": 0.3449707, + "step": 1772, + "time_per_iteration": 2.5239012241363525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081305, + "balance_loss_mlp": 1.04589987, + "epoch": 0.34109272797229706, + "flos": 808145303040.0, + "grad_norm": 0.06003817873980187, + "language_loss": 0.80518734, + "learning_rate": 0.000766976302961512, + "loss": 0.81600046, + "num_input_tokens_seen": 146552848, + "router_z_loss_mlp": 0.35424805, + "step": 1773, + "time_per_iteration": 2.9730074405670166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083157, + "balance_loss_mlp": 1.04834807, + "epoch": 0.3412851096575606, + "flos": 469903997952.0, + "grad_norm": 0.05958263274361502, + "language_loss": 0.81420594, + "learning_rate": 0.0007667128376420003, + "loss": 0.82503754, + "num_input_tokens_seen": 146617504, + "router_z_loss_mlp": 0.34863281, + "step": 1774, + "time_per_iteration": 2.521329879760742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073378, + "balance_loss_mlp": 1.03842556, + "epoch": 0.3414774913428242, + "flos": 595402085376.0, + "grad_norm": 0.09709010294240925, + "language_loss": 0.84563607, + "learning_rate": 0.0007664492687753817, + "loss": 0.85636985, + "num_input_tokens_seen": 146691568, + "router_z_loss_mlp": 0.34985352, + "step": 1775, + "time_per_iteration": 2.6744766235351562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072901, + "balance_loss_mlp": 1.03976059, + "epoch": 0.3416698730280877, + "flos": 527202667008.0, + "grad_norm": 0.05030413358353647, + "language_loss": 0.81513566, + "learning_rate": 0.000766185596463983, + "loss": 0.82586467, + "num_input_tokens_seen": 146764208, + "router_z_loss_mlp": 0.33154297, + "step": 1776, + "time_per_iteration": 2.6050221920013428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073672, + "balance_loss_mlp": 1.03962612, + "epoch": 0.3418622547133513, + "flos": 874272794112.0, + "grad_norm": 0.05039515754698922, + "language_loss": 0.76683038, + "learning_rate": 0.0007659218208101706, + "loss": 0.77756709, + "num_input_tokens_seen": 146847744, + "router_z_loss_mlp": 0.34082031, + "step": 1777, + "time_per_iteration": 3.0900516510009766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079189, + "balance_loss_mlp": 1.04504752, + "epoch": 0.34205463639861483, + "flos": 603462680064.0, + "grad_norm": 0.04915159817243754, + "language_loss": 0.84680861, + "learning_rate": 0.0007656579419163515, + "loss": 0.85760045, + "num_input_tokens_seen": 146918336, + "router_z_loss_mlp": 0.34179688, + "step": 1778, + "time_per_iteration": 2.7328884601593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081812, + "balance_loss_mlp": 1.04774237, + "epoch": 0.3422470180838784, + "flos": 463547183616.0, + "grad_norm": 0.05230649511498847, + "language_loss": 0.76939148, + "learning_rate": 0.0007653939598849724, + "loss": 0.7802096, + "num_input_tokens_seen": 146982496, + "router_z_loss_mlp": 0.34106445, + "step": 1779, + "time_per_iteration": 2.5020573139190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102719, + "balance_loss_mlp": 1.01441097, + "epoch": 0.34243939976914195, + "flos": 1585584377856.0, + "grad_norm": 0.019842751190116498, + "language_loss": 0.82880205, + "learning_rate": 0.0007651298748185204, + "loss": 0.83907396, + "num_input_tokens_seen": 147213600, + "router_z_loss_mlp": 0.12792969, + "step": 1780, + "time_per_iteration": 4.919352054595947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107992, + "balance_loss_mlp": 1.04656482, + "epoch": 0.34263178145440554, + "flos": 872662146048.0, + "grad_norm": 0.0514393238831889, + "language_loss": 0.80344206, + "learning_rate": 0.000764865686819522, + "loss": 0.81424129, + "num_input_tokens_seen": 147287664, + "router_z_loss_mlp": 0.33374023, + "step": 1781, + "time_per_iteration": 3.059682846069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089318, + "balance_loss_mlp": 1.05546236, + "epoch": 0.3428241631396691, + "flos": 506630674944.0, + "grad_norm": 0.04318417455303755, + "language_loss": 0.85701579, + "learning_rate": 0.0007646013959905449, + "loss": 0.86790895, + "num_input_tokens_seen": 147356800, + "router_z_loss_mlp": 0.33886719, + "step": 1782, + "time_per_iteration": 2.572772741317749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085097, + "balance_loss_mlp": 1.05162311, + "epoch": 0.34301654482493266, + "flos": 879669324288.0, + "grad_norm": 0.05640606275212692, + "language_loss": 0.80626374, + "learning_rate": 0.0007643370024341949, + "loss": 0.81711471, + "num_input_tokens_seen": 147432496, + "router_z_loss_mlp": 0.33496094, + "step": 1783, + "time_per_iteration": 3.0805578231811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089012, + "balance_loss_mlp": 1.05472708, + "epoch": 0.34320892651019624, + "flos": 431537559552.0, + "grad_norm": 0.05116039291223259, + "language_loss": 0.82947731, + "learning_rate": 0.0007640725062531195, + "loss": 0.84036732, + "num_input_tokens_seen": 147495856, + "router_z_loss_mlp": 0.34326172, + "step": 1784, + "time_per_iteration": 2.497859239578247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083577, + "balance_loss_mlp": 1.05010295, + "epoch": 0.3434013081954598, + "flos": 463404589056.0, + "grad_norm": 0.06763804466989645, + "language_loss": 0.86272931, + "learning_rate": 0.0007638079075500047, + "loss": 0.87356508, + "num_input_tokens_seen": 147559632, + "router_z_loss_mlp": 0.33496094, + "step": 1785, + "time_per_iteration": 2.516842842102051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017655, + "balance_loss_mlp": 1.0058769, + "epoch": 0.34359368988072336, + "flos": 1556499276288.0, + "grad_norm": 0.01279941843938601, + "language_loss": 0.75180668, + "learning_rate": 0.0007635432064275772, + "loss": 0.76198322, + "num_input_tokens_seen": 147794576, + "router_z_loss_mlp": 0.11767578, + "step": 1786, + "time_per_iteration": 4.979317665100098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077073, + "balance_loss_mlp": 1.04417157, + "epoch": 0.3437860715659869, + "flos": 495267236352.0, + "grad_norm": 0.04590480874587016, + "language_loss": 0.83035767, + "learning_rate": 0.0007632784029886026, + "loss": 0.84112841, + "num_input_tokens_seen": 147866960, + "router_z_loss_mlp": 0.32910156, + "step": 1787, + "time_per_iteration": 2.6075103282928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079505, + "balance_loss_mlp": 1.04617453, + "epoch": 0.3439784532512505, + "flos": 717942154752.0, + "grad_norm": 0.04559278353066439, + "language_loss": 0.85611933, + "learning_rate": 0.0007630134973358873, + "loss": 0.86691439, + "num_input_tokens_seen": 147947808, + "router_z_loss_mlp": 0.33349609, + "step": 1788, + "time_per_iteration": 2.917405366897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086657, + "balance_loss_mlp": 1.05327868, + "epoch": 0.34417083493651407, + "flos": 565598218752.0, + "grad_norm": 0.05301353071730806, + "language_loss": 0.86864436, + "learning_rate": 0.0007627484895722763, + "loss": 0.879511, + "num_input_tokens_seen": 148015936, + "router_z_loss_mlp": 0.33398438, + "step": 1789, + "time_per_iteration": 2.6353273391723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081939, + "balance_loss_mlp": 1.04834569, + "epoch": 0.3443632166217776, + "flos": 795988905984.0, + "grad_norm": 0.057022653970397005, + "language_loss": 0.80155563, + "learning_rate": 0.0007624833798006552, + "loss": 0.81237495, + "num_input_tokens_seen": 148099776, + "router_z_loss_mlp": 0.3359375, + "step": 1790, + "time_per_iteration": 3.039126396179199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090848, + "balance_loss_mlp": 1.05692101, + "epoch": 0.3445555983070412, + "flos": 569045067264.0, + "grad_norm": 0.05940117534987587, + "language_loss": 0.84113955, + "learning_rate": 0.0007622181681239483, + "loss": 0.85204804, + "num_input_tokens_seen": 148169616, + "router_z_loss_mlp": 0.33935547, + "step": 1791, + "time_per_iteration": 2.6392083168029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083434, + "balance_loss_mlp": 1.04903054, + "epoch": 0.3447479799923047, + "flos": 568524151296.0, + "grad_norm": 0.04492792711883196, + "language_loss": 0.84501636, + "learning_rate": 0.0007619528546451202, + "loss": 0.8558507, + "num_input_tokens_seen": 148247824, + "router_z_loss_mlp": 0.34448242, + "step": 1792, + "time_per_iteration": 2.776982069015503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080197, + "balance_loss_mlp": 1.04708052, + "epoch": 0.3449403616775683, + "flos": 967323299328.0, + "grad_norm": 0.05878857203246004, + "language_loss": 0.8358798, + "learning_rate": 0.0007616874394671745, + "loss": 0.84668171, + "num_input_tokens_seen": 148333040, + "router_z_loss_mlp": 0.33129883, + "step": 1793, + "time_per_iteration": 3.3427693843841553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074615, + "balance_loss_mlp": 1.04128361, + "epoch": 0.34513274336283184, + "flos": 568340858880.0, + "grad_norm": 0.05893035372227358, + "language_loss": 0.84961653, + "learning_rate": 0.0007614219226931547, + "loss": 0.86036265, + "num_input_tokens_seen": 148401840, + "router_z_loss_mlp": 0.33349609, + "step": 1794, + "time_per_iteration": 2.6591315269470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070169, + "balance_loss_mlp": 1.03783977, + "epoch": 0.3453251250480954, + "flos": 460715793408.0, + "grad_norm": 0.06432823181520617, + "language_loss": 0.84724808, + "learning_rate": 0.0007611563044261435, + "loss": 0.85794979, + "num_input_tokens_seen": 148466576, + "router_z_loss_mlp": 0.32324219, + "step": 1795, + "time_per_iteration": 2.51755690574646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078106, + "balance_loss_mlp": 1.0443697, + "epoch": 0.34551750673335896, + "flos": 415397431296.0, + "grad_norm": 0.0640589434438139, + "language_loss": 0.87120652, + "learning_rate": 0.0007608905847692631, + "loss": 0.88198757, + "num_input_tokens_seen": 148530016, + "router_z_loss_mlp": 0.33764648, + "step": 1796, + "time_per_iteration": 2.47190260887146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074227, + "balance_loss_mlp": 1.04103947, + "epoch": 0.34570988841862255, + "flos": 587540749824.0, + "grad_norm": 0.04642061059617041, + "language_loss": 0.86689866, + "learning_rate": 0.0007606247638256749, + "loss": 0.8776409, + "num_input_tokens_seen": 148610064, + "router_z_loss_mlp": 0.33203125, + "step": 1797, + "time_per_iteration": 2.956444025039673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094984, + "balance_loss_mlp": 1.08373046, + "epoch": 0.34590227010388613, + "flos": 1566835439616.0, + "grad_norm": 0.041839887126655914, + "language_loss": 0.78170294, + "learning_rate": 0.0007603588416985798, + "loss": 0.79265279, + "num_input_tokens_seen": 148835872, + "router_z_loss_mlp": 0.11230469, + "step": 1798, + "time_per_iteration": 4.9134039878845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064584, + "balance_loss_mlp": 1.05352104, + "epoch": 0.34609465178914967, + "flos": 1536950177280.0, + "grad_norm": 0.029939636480755576, + "language_loss": 0.79327202, + "learning_rate": 0.0007600928184912179, + "loss": 0.80391788, + "num_input_tokens_seen": 149066864, + "router_z_loss_mlp": 0.11083984, + "step": 1799, + "time_per_iteration": 4.743322849273682 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083924, + "balance_loss_mlp": 1.05054486, + "epoch": 0.34628703347441325, + "flos": 609075998208.0, + "grad_norm": 0.0564087129204809, + "language_loss": 0.85731971, + "learning_rate": 0.0007598266943068686, + "loss": 0.86815894, + "num_input_tokens_seen": 149141600, + "router_z_loss_mlp": 0.33398438, + "step": 1800, + "time_per_iteration": 2.7374043464660645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077797, + "balance_loss_mlp": 1.04603946, + "epoch": 0.3464794151596768, + "flos": 473084596224.0, + "grad_norm": 0.06346922489791823, + "language_loss": 0.83911705, + "learning_rate": 0.0007595604692488507, + "loss": 0.849895, + "num_input_tokens_seen": 149205888, + "router_z_loss_mlp": 0.31738281, + "step": 1801, + "time_per_iteration": 2.5296249389648438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074116, + "balance_loss_mlp": 1.04147625, + "epoch": 0.34667179684494037, + "flos": 605397805056.0, + "grad_norm": 0.05750507090521113, + "language_loss": 0.83014846, + "learning_rate": 0.0007592941434205215, + "loss": 0.84088963, + "num_input_tokens_seen": 149281280, + "router_z_loss_mlp": 0.32641602, + "step": 1802, + "time_per_iteration": 2.758260488510132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017015, + "balance_loss_mlp": 1.00628662, + "epoch": 0.3468641785302039, + "flos": 1564053511680.0, + "grad_norm": 0.014489769518708178, + "language_loss": 0.73571062, + "learning_rate": 0.0007590277169252782, + "loss": 0.74588072, + "num_input_tokens_seen": 149525008, + "router_z_loss_mlp": 0.10742188, + "step": 1803, + "time_per_iteration": 5.078529119491577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069541, + "balance_loss_mlp": 1.0363059, + "epoch": 0.3470565602154675, + "flos": 906902258688.0, + "grad_norm": 0.0666829693597375, + "language_loss": 0.79937375, + "learning_rate": 0.0007587611898665566, + "loss": 0.81006914, + "num_input_tokens_seen": 149600624, + "router_z_loss_mlp": 0.33251953, + "step": 1804, + "time_per_iteration": 3.05087947845459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078664, + "balance_loss_mlp": 1.04621565, + "epoch": 0.347248941900731, + "flos": 638613614592.0, + "grad_norm": 0.050247612363814816, + "language_loss": 0.8218019, + "learning_rate": 0.0007584945623478315, + "loss": 0.83258855, + "num_input_tokens_seen": 149674224, + "router_z_loss_mlp": 0.32446289, + "step": 1805, + "time_per_iteration": 2.8188822269439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071735, + "balance_loss_mlp": 1.03940511, + "epoch": 0.3474413235859946, + "flos": 847009336320.0, + "grad_norm": 0.06830759319763476, + "language_loss": 0.81376302, + "learning_rate": 0.000758227834472617, + "loss": 0.82448041, + "num_input_tokens_seen": 149758688, + "router_z_loss_mlp": 0.32324219, + "step": 1806, + "time_per_iteration": 3.049736976623535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080854, + "balance_loss_mlp": 1.04771423, + "epoch": 0.3476337052712582, + "flos": 515395478016.0, + "grad_norm": 0.0580200838122141, + "language_loss": 0.77365351, + "learning_rate": 0.0007579610063444664, + "loss": 0.78446203, + "num_input_tokens_seen": 149831648, + "router_z_loss_mlp": 0.33154297, + "step": 1807, + "time_per_iteration": 2.768986701965332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072697, + "balance_loss_mlp": 1.03993857, + "epoch": 0.34782608695652173, + "flos": 913161558528.0, + "grad_norm": 0.05804810611861273, + "language_loss": 0.8735044, + "learning_rate": 0.0007576940780669712, + "loss": 0.88423139, + "num_input_tokens_seen": 149919440, + "router_z_loss_mlp": 0.32763672, + "step": 1808, + "time_per_iteration": 3.200984477996826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073414, + "balance_loss_mlp": 1.04041636, + "epoch": 0.3480184686417853, + "flos": 773374099968.0, + "grad_norm": 0.05336970886803796, + "language_loss": 0.84611619, + "learning_rate": 0.0007574270497437624, + "loss": 0.85685027, + "num_input_tokens_seen": 150001632, + "router_z_loss_mlp": 0.33007812, + "step": 1809, + "time_per_iteration": 2.9432260990142822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069124, + "balance_loss_mlp": 1.03619814, + "epoch": 0.34821085032704885, + "flos": 576549840384.0, + "grad_norm": 0.04930975616190813, + "language_loss": 0.87883413, + "learning_rate": 0.000757159921478509, + "loss": 0.88952535, + "num_input_tokens_seen": 150077552, + "router_z_loss_mlp": 0.3293457, + "step": 1810, + "time_per_iteration": 2.769669771194458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079936, + "balance_loss_mlp": 1.06887364, + "epoch": 0.34840323201231244, + "flos": 1524176911872.0, + "grad_norm": 0.03214902088053901, + "language_loss": 0.74450636, + "learning_rate": 0.0007568926933749201, + "loss": 0.75530577, + "num_input_tokens_seen": 150295328, + "router_z_loss_mlp": 0.11083984, + "step": 1811, + "time_per_iteration": 4.764174222946167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107769, + "balance_loss_mlp": 1.04469275, + "epoch": 0.34859561369757597, + "flos": 508910625792.0, + "grad_norm": 0.059132347701423886, + "language_loss": 0.87255216, + "learning_rate": 0.0007566253655367423, + "loss": 0.88332909, + "num_input_tokens_seen": 150360496, + "router_z_loss_mlp": 0.33007812, + "step": 1812, + "time_per_iteration": 2.578930616378784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073116, + "balance_loss_mlp": 1.04014218, + "epoch": 0.34878799538283956, + "flos": 548390117376.0, + "grad_norm": 0.051501554075800156, + "language_loss": 0.89574003, + "learning_rate": 0.000756357938067762, + "loss": 0.90647119, + "num_input_tokens_seen": 150432064, + "router_z_loss_mlp": 0.32983398, + "step": 1813, + "time_per_iteration": 2.6508982181549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079394, + "balance_loss_mlp": 1.04673076, + "epoch": 0.34898037706810314, + "flos": 983251021824.0, + "grad_norm": 0.051360492330316726, + "language_loss": 0.82609868, + "learning_rate": 0.0007560904110718033, + "loss": 0.8368926, + "num_input_tokens_seen": 150512176, + "router_z_loss_mlp": 0.32666016, + "step": 1814, + "time_per_iteration": 3.236894130706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075398, + "balance_loss_mlp": 1.04185271, + "epoch": 0.3491727587533667, + "flos": 681298435584.0, + "grad_norm": 0.05446392192761228, + "language_loss": 0.83478653, + "learning_rate": 0.0007558227846527297, + "loss": 0.84554052, + "num_input_tokens_seen": 150586416, + "router_z_loss_mlp": 0.33569336, + "step": 1815, + "time_per_iteration": 2.8674044609069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074987, + "balance_loss_mlp": 1.04175162, + "epoch": 0.34936514043863026, + "flos": 393811310592.0, + "grad_norm": 0.0691488486506015, + "language_loss": 0.83195454, + "learning_rate": 0.0007555550589144429, + "loss": 0.84270442, + "num_input_tokens_seen": 150648944, + "router_z_loss_mlp": 0.33251953, + "step": 1816, + "time_per_iteration": 2.421494722366333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071292, + "balance_loss_mlp": 1.03917694, + "epoch": 0.3495575221238938, + "flos": 461120256000.0, + "grad_norm": 0.07868701205222765, + "language_loss": 0.8463372, + "learning_rate": 0.000755287233960883, + "loss": 0.85705012, + "num_input_tokens_seen": 150717200, + "router_z_loss_mlp": 0.32104492, + "step": 1817, + "time_per_iteration": 2.54315185546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072544, + "balance_loss_mlp": 1.04023862, + "epoch": 0.3497499038091574, + "flos": 723859600896.0, + "grad_norm": 0.06602653795060065, + "language_loss": 0.77636009, + "learning_rate": 0.0007550193098960292, + "loss": 0.78708553, + "num_input_tokens_seen": 150790368, + "router_z_loss_mlp": 0.32299805, + "step": 1818, + "time_per_iteration": 2.848236560821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076763, + "balance_loss_mlp": 1.04452837, + "epoch": 0.3499422854944209, + "flos": 827364132864.0, + "grad_norm": 0.049816715297611704, + "language_loss": 0.86387616, + "learning_rate": 0.0007547512868238988, + "loss": 0.8746438, + "num_input_tokens_seen": 150879872, + "router_z_loss_mlp": 0.32226562, + "step": 1819, + "time_per_iteration": 3.140552043914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076553, + "balance_loss_mlp": 1.0441277, + "epoch": 0.3501346671796845, + "flos": 493214247936.0, + "grad_norm": 0.049810070694169546, + "language_loss": 0.83196282, + "learning_rate": 0.0007544831648485473, + "loss": 0.84272838, + "num_input_tokens_seen": 150953712, + "router_z_loss_mlp": 0.32421875, + "step": 1820, + "time_per_iteration": 2.7085797786712646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074414, + "balance_loss_mlp": 1.04179859, + "epoch": 0.35032704886494803, + "flos": 578479173120.0, + "grad_norm": 0.05987447994889705, + "language_loss": 0.81237, + "learning_rate": 0.0007542149440740694, + "loss": 0.82311416, + "num_input_tokens_seen": 151026192, + "router_z_loss_mlp": 0.32617188, + "step": 1821, + "time_per_iteration": 2.6648108959198 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075377, + "balance_loss_mlp": 1.0426898, + "epoch": 0.3505194305502116, + "flos": 584383472640.0, + "grad_norm": 0.06285767185927299, + "language_loss": 0.85454488, + "learning_rate": 0.000753946624604597, + "loss": 0.86529863, + "num_input_tokens_seen": 151100720, + "router_z_loss_mlp": 0.3269043, + "step": 1822, + "time_per_iteration": 2.7114102840423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080366, + "balance_loss_mlp": 1.04722571, + "epoch": 0.3507118122354752, + "flos": 526705072128.0, + "grad_norm": 0.056571758739544044, + "language_loss": 0.88259315, + "learning_rate": 0.0007536782065443015, + "loss": 0.89339685, + "num_input_tokens_seen": 151166032, + "router_z_loss_mlp": 0.33154297, + "step": 1823, + "time_per_iteration": 2.6336190700531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077518, + "balance_loss_mlp": 1.04576099, + "epoch": 0.35090419392073874, + "flos": 511269152256.0, + "grad_norm": 0.06612506998948281, + "language_loss": 0.74917412, + "learning_rate": 0.0007534096899973919, + "loss": 0.75994933, + "num_input_tokens_seen": 151232208, + "router_z_loss_mlp": 0.31738281, + "step": 1824, + "time_per_iteration": 2.5683584213256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108279, + "balance_loss_mlp": 1.05069852, + "epoch": 0.3510965756060023, + "flos": 563728522752.0, + "grad_norm": 0.05207355522992398, + "language_loss": 0.82511663, + "learning_rate": 0.0007531410750681154, + "loss": 0.83594453, + "num_input_tokens_seen": 151308128, + "router_z_loss_mlp": 0.32080078, + "step": 1825, + "time_per_iteration": 2.7370071411132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108428, + "balance_loss_mlp": 1.05207014, + "epoch": 0.35128895729126586, + "flos": 1020107146752.0, + "grad_norm": 0.05855996344544413, + "language_loss": 0.86223209, + "learning_rate": 0.0007528723618607575, + "loss": 0.87307489, + "num_input_tokens_seen": 151402560, + "router_z_loss_mlp": 0.32202148, + "step": 1826, + "time_per_iteration": 3.4230828285217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080919, + "balance_loss_mlp": 1.04889941, + "epoch": 0.35148133897652944, + "flos": 587972915712.0, + "grad_norm": 0.06514472806491804, + "language_loss": 0.82370871, + "learning_rate": 0.0007526035504796422, + "loss": 0.8345179, + "num_input_tokens_seen": 151478816, + "router_z_loss_mlp": 0.32006836, + "step": 1827, + "time_per_iteration": 2.7472023963928223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083394, + "balance_loss_mlp": 1.05011046, + "epoch": 0.351673720661793, + "flos": 495054830592.0, + "grad_norm": 0.13631276803870807, + "language_loss": 0.86120903, + "learning_rate": 0.0007523346410291312, + "loss": 0.87204289, + "num_input_tokens_seen": 151554528, + "router_z_loss_mlp": 0.33300781, + "step": 1828, + "time_per_iteration": 2.7665555477142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080213, + "balance_loss_mlp": 1.04757404, + "epoch": 0.35186610234705656, + "flos": 762339520512.0, + "grad_norm": 0.04983334941453678, + "language_loss": 0.85021639, + "learning_rate": 0.0007520656336136245, + "loss": 0.86101854, + "num_input_tokens_seen": 151629440, + "router_z_loss_mlp": 0.32641602, + "step": 1829, + "time_per_iteration": 2.9405102729797363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080442, + "balance_loss_mlp": 1.04847038, + "epoch": 0.3520584840323201, + "flos": 625822820352.0, + "grad_norm": 0.049266285647792965, + "language_loss": 0.87560928, + "learning_rate": 0.0007517965283375599, + "loss": 0.88641369, + "num_input_tokens_seen": 151708544, + "router_z_loss_mlp": 0.31958008, + "step": 1830, + "time_per_iteration": 2.8742456436157227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076907, + "balance_loss_mlp": 1.04429162, + "epoch": 0.3522508657175837, + "flos": 537124193280.0, + "grad_norm": 0.05152278098600794, + "language_loss": 0.8913554, + "learning_rate": 0.0007515273253054132, + "loss": 0.90212452, + "num_input_tokens_seen": 151779152, + "router_z_loss_mlp": 0.32617188, + "step": 1831, + "time_per_iteration": 2.647270917892456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085251, + "balance_loss_mlp": 1.0506562, + "epoch": 0.35244324740284727, + "flos": 567105560064.0, + "grad_norm": 0.052396269804254075, + "language_loss": 0.82697165, + "learning_rate": 0.0007512580246216988, + "loss": 0.83782411, + "num_input_tokens_seen": 151853216, + "router_z_loss_mlp": 0.34643555, + "step": 1832, + "time_per_iteration": 2.6887552738189697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079591, + "balance_loss_mlp": 1.04673672, + "epoch": 0.3526356290881108, + "flos": 512809989120.0, + "grad_norm": 0.05749675796225481, + "language_loss": 0.85263908, + "learning_rate": 0.000750988626390968, + "loss": 0.86343497, + "num_input_tokens_seen": 151920416, + "router_z_loss_mlp": 0.32861328, + "step": 1833, + "time_per_iteration": 2.6013457775115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080641, + "balance_loss_mlp": 1.04781032, + "epoch": 0.3528280107733744, + "flos": 595496627712.0, + "grad_norm": 0.05344239959905588, + "language_loss": 0.84880912, + "learning_rate": 0.0007507191307178108, + "loss": 0.8596155, + "num_input_tokens_seen": 151990848, + "router_z_loss_mlp": 0.32836914, + "step": 1834, + "time_per_iteration": 2.7490363121032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080747, + "balance_loss_mlp": 1.04682052, + "epoch": 0.3530203924586379, + "flos": 550969814016.0, + "grad_norm": 0.06515988244691072, + "language_loss": 0.74431223, + "learning_rate": 0.0007504495377068543, + "loss": 0.75511968, + "num_input_tokens_seen": 152064864, + "router_z_loss_mlp": 0.33959961, + "step": 1835, + "time_per_iteration": 2.729029417037964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084115, + "balance_loss_mlp": 1.04925871, + "epoch": 0.3532127741439015, + "flos": 652662876672.0, + "grad_norm": 0.06759605529963146, + "language_loss": 0.81589389, + "learning_rate": 0.0007501798474627642, + "loss": 0.82673502, + "num_input_tokens_seen": 152150096, + "router_z_loss_mlp": 0.34912109, + "step": 1836, + "time_per_iteration": 2.9048030376434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080775, + "balance_loss_mlp": 1.04708695, + "epoch": 0.35340515582916504, + "flos": 722452594176.0, + "grad_norm": 0.055893281392717674, + "language_loss": 0.83221173, + "learning_rate": 0.0007499100600902433, + "loss": 0.84301955, + "num_input_tokens_seen": 152232528, + "router_z_loss_mlp": 0.3371582, + "step": 1837, + "time_per_iteration": 2.9900574684143066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080937, + "balance_loss_mlp": 1.0464375, + "epoch": 0.35359753751442863, + "flos": 594619301376.0, + "grad_norm": 0.06113982905710786, + "language_loss": 0.84191763, + "learning_rate": 0.0007496401756940324, + "loss": 0.852727, + "num_input_tokens_seen": 152299584, + "router_z_loss_mlp": 0.34545898, + "step": 1838, + "time_per_iteration": 2.6746203899383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079632, + "balance_loss_mlp": 1.04575253, + "epoch": 0.3537899191996922, + "flos": 632384838144.0, + "grad_norm": 0.05956961248716192, + "language_loss": 0.82392603, + "learning_rate": 0.0007493701943789098, + "loss": 0.8347224, + "num_input_tokens_seen": 152370368, + "router_z_loss_mlp": 0.33886719, + "step": 1839, + "time_per_iteration": 2.773550510406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089423, + "balance_loss_mlp": 1.05590141, + "epoch": 0.35398230088495575, + "flos": 506118523392.0, + "grad_norm": 0.05410374630174333, + "language_loss": 0.82311571, + "learning_rate": 0.000749100116249692, + "loss": 0.83400989, + "num_input_tokens_seen": 152436928, + "router_z_loss_mlp": 0.33544922, + "step": 1840, + "time_per_iteration": 2.6255862712860107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089923, + "balance_loss_mlp": 1.05649722, + "epoch": 0.35417468257021933, + "flos": 507783015936.0, + "grad_norm": 0.06109989504264522, + "language_loss": 0.86315167, + "learning_rate": 0.0007488299414112321, + "loss": 0.87405092, + "num_input_tokens_seen": 152505952, + "router_z_loss_mlp": 0.33447266, + "step": 1841, + "time_per_iteration": 2.5875229835510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088685, + "balance_loss_mlp": 1.05552149, + "epoch": 0.35436706425548287, + "flos": 656133046272.0, + "grad_norm": 0.05742985112465967, + "language_loss": 0.77833533, + "learning_rate": 0.0007485596699684215, + "loss": 0.78922212, + "num_input_tokens_seen": 152577408, + "router_z_loss_mlp": 0.33178711, + "step": 1842, + "time_per_iteration": 2.819591760635376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092886, + "balance_loss_mlp": 1.05903101, + "epoch": 0.35455944594074645, + "flos": 652322433024.0, + "grad_norm": 0.047878329403948795, + "language_loss": 0.85455877, + "learning_rate": 0.000748289302026189, + "loss": 0.86548758, + "num_input_tokens_seen": 152654480, + "router_z_loss_mlp": 0.33886719, + "step": 1843, + "time_per_iteration": 2.829897880554199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088118, + "balance_loss_mlp": 1.05569351, + "epoch": 0.35475182762601, + "flos": 848240252928.0, + "grad_norm": 0.06279452498251797, + "language_loss": 0.85658133, + "learning_rate": 0.0007480188376895004, + "loss": 0.86746252, + "num_input_tokens_seen": 152732304, + "router_z_loss_mlp": 0.32421875, + "step": 1844, + "time_per_iteration": 3.067828893661499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085552, + "balance_loss_mlp": 1.07358336, + "epoch": 0.3549442093112736, + "flos": 1520644133376.0, + "grad_norm": 0.027370210450034033, + "language_loss": 0.7381134, + "learning_rate": 0.0007477482770633596, + "loss": 0.7489689, + "num_input_tokens_seen": 152965952, + "router_z_loss_mlp": 0.11962891, + "step": 1845, + "time_per_iteration": 4.860119342803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087227, + "balance_loss_mlp": 1.05401564, + "epoch": 0.3551365909965371, + "flos": 651087134208.0, + "grad_norm": 0.057022057365061586, + "language_loss": 0.7840451, + "learning_rate": 0.0007474776202528074, + "loss": 0.79491735, + "num_input_tokens_seen": 153053088, + "router_z_loss_mlp": 0.33227539, + "step": 1846, + "time_per_iteration": 2.924600601196289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081626, + "balance_loss_mlp": 1.04877198, + "epoch": 0.3553289726818007, + "flos": 897094213632.0, + "grad_norm": 0.05655103479540665, + "language_loss": 0.81245291, + "learning_rate": 0.000747206867362922, + "loss": 0.82326913, + "num_input_tokens_seen": 153129216, + "router_z_loss_mlp": 0.32861328, + "step": 1847, + "time_per_iteration": 3.0635437965393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078587, + "balance_loss_mlp": 1.0454942, + "epoch": 0.3555213543670643, + "flos": 688181958144.0, + "grad_norm": 0.057996459019562165, + "language_loss": 0.83748043, + "learning_rate": 0.0007469360184988194, + "loss": 0.84826624, + "num_input_tokens_seen": 153199360, + "router_z_loss_mlp": 0.33105469, + "step": 1848, + "time_per_iteration": 2.816774606704712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072989, + "balance_loss_mlp": 1.03977752, + "epoch": 0.3557137360523278, + "flos": 538305647616.0, + "grad_norm": 0.0578078380794177, + "language_loss": 0.87284935, + "learning_rate": 0.0007466650737656518, + "loss": 0.88357925, + "num_input_tokens_seen": 153269168, + "router_z_loss_mlp": 0.33203125, + "step": 1849, + "time_per_iteration": 2.611743927001953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075842, + "balance_loss_mlp": 1.04208231, + "epoch": 0.3559061177375914, + "flos": 402039230976.0, + "grad_norm": 0.05251231214094578, + "language_loss": 0.90093362, + "learning_rate": 0.0007463940332686098, + "loss": 0.91169202, + "num_input_tokens_seen": 153333120, + "router_z_loss_mlp": 0.33789062, + "step": 1850, + "time_per_iteration": 2.4692726135253906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073866, + "balance_loss_mlp": 1.04017735, + "epoch": 0.35609849942285493, + "flos": 696238170624.0, + "grad_norm": 0.04795835093932571, + "language_loss": 0.84167922, + "learning_rate": 0.0007461228971129205, + "loss": 0.85241795, + "num_input_tokens_seen": 153407600, + "router_z_loss_mlp": 0.33691406, + "step": 1851, + "time_per_iteration": 2.894505023956299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106997, + "balance_loss_mlp": 1.0372591, + "epoch": 0.3562908811081185, + "flos": 568660953600.0, + "grad_norm": 0.055081415669052246, + "language_loss": 0.85513294, + "learning_rate": 0.0007458516654038483, + "loss": 0.86583257, + "num_input_tokens_seen": 153477408, + "router_z_loss_mlp": 0.32714844, + "step": 1852, + "time_per_iteration": 2.678018569946289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076636, + "balance_loss_mlp": 1.0421133, + "epoch": 0.35648326279338205, + "flos": 682081219584.0, + "grad_norm": 0.04842584798560518, + "language_loss": 0.8668319, + "learning_rate": 0.0007455803382466946, + "loss": 0.87759829, + "num_input_tokens_seen": 153551888, + "router_z_loss_mlp": 0.34545898, + "step": 1853, + "time_per_iteration": 2.795799493789673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081318, + "balance_loss_mlp": 1.04674757, + "epoch": 0.35667564447864564, + "flos": 628840475136.0, + "grad_norm": 0.04891463031827082, + "language_loss": 0.87319207, + "learning_rate": 0.0007453089157467979, + "loss": 0.88400525, + "num_input_tokens_seen": 153626912, + "router_z_loss_mlp": 0.34594727, + "step": 1854, + "time_per_iteration": 2.7683348655700684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084525, + "balance_loss_mlp": 1.05000162, + "epoch": 0.35686802616390917, + "flos": 813685837824.0, + "grad_norm": 0.04901692214928195, + "language_loss": 0.81941634, + "learning_rate": 0.0007450373980095341, + "loss": 0.83026159, + "num_input_tokens_seen": 153711312, + "router_z_loss_mlp": 0.34545898, + "step": 1855, + "time_per_iteration": 3.069664716720581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088761, + "balance_loss_mlp": 1.0541904, + "epoch": 0.35706040784917276, + "flos": 525922288128.0, + "grad_norm": 0.06393454459125486, + "language_loss": 0.86792582, + "learning_rate": 0.0007447657851403155, + "loss": 0.87881339, + "num_input_tokens_seen": 153780208, + "router_z_loss_mlp": 0.34619141, + "step": 1856, + "time_per_iteration": 2.6120662689208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081268, + "balance_loss_mlp": 1.04793692, + "epoch": 0.35725278953443634, + "flos": 511698345984.0, + "grad_norm": 0.060959809088696394, + "language_loss": 0.78963649, + "learning_rate": 0.0007444940772445915, + "loss": 0.80044913, + "num_input_tokens_seen": 153853152, + "router_z_loss_mlp": 0.33349609, + "step": 1857, + "time_per_iteration": 2.802053689956665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079861, + "balance_loss_mlp": 1.04653037, + "epoch": 0.3574451712196999, + "flos": 487162971648.0, + "grad_norm": 0.06448223618511208, + "language_loss": 0.80338144, + "learning_rate": 0.0007442222744278484, + "loss": 0.81418002, + "num_input_tokens_seen": 153924160, + "router_z_loss_mlp": 0.33349609, + "step": 1858, + "time_per_iteration": 2.660689353942871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080529, + "balance_loss_mlp": 1.04765141, + "epoch": 0.35763755290496346, + "flos": 550384879104.0, + "grad_norm": 0.061962253699798436, + "language_loss": 0.84126002, + "learning_rate": 0.0007439503767956099, + "loss": 0.85206527, + "num_input_tokens_seen": 153998688, + "router_z_loss_mlp": 0.32885742, + "step": 1859, + "time_per_iteration": 2.7479875087738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080547, + "balance_loss_mlp": 1.06767237, + "epoch": 0.357829934590227, + "flos": 1503300791808.0, + "grad_norm": 0.035903025748828234, + "language_loss": 0.79671603, + "learning_rate": 0.0007436783844534352, + "loss": 0.80752152, + "num_input_tokens_seen": 154230960, + "router_z_loss_mlp": 0.12890625, + "step": 1860, + "time_per_iteration": 4.900041580200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077433, + "balance_loss_mlp": 1.04479325, + "epoch": 0.3580223162754906, + "flos": 568410670080.0, + "grad_norm": 0.040558802150678905, + "language_loss": 0.85799539, + "learning_rate": 0.000743406297506922, + "loss": 0.86876976, + "num_input_tokens_seen": 154309104, + "router_z_loss_mlp": 0.32641602, + "step": 1861, + "time_per_iteration": 2.701162576675415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084428, + "balance_loss_mlp": 1.05107355, + "epoch": 0.3582146979607541, + "flos": 626153089536.0, + "grad_norm": 0.04686630584337546, + "language_loss": 0.8419295, + "learning_rate": 0.0007431341160617031, + "loss": 0.85277379, + "num_input_tokens_seen": 154387424, + "router_z_loss_mlp": 0.33374023, + "step": 1862, + "time_per_iteration": 2.860173463821411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085309, + "balance_loss_mlp": 1.05266929, + "epoch": 0.3584070796460177, + "flos": 507010406400.0, + "grad_norm": 0.04939599291948986, + "language_loss": 0.88143289, + "learning_rate": 0.0007428618402234491, + "loss": 0.89228594, + "num_input_tokens_seen": 154459952, + "router_z_loss_mlp": 0.32641602, + "step": 1863, + "time_per_iteration": 2.62233567237854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083488, + "balance_loss_mlp": 1.05030036, + "epoch": 0.3585994613312813, + "flos": 606190763520.0, + "grad_norm": 0.051497717495276533, + "language_loss": 0.80248374, + "learning_rate": 0.0007425894700978668, + "loss": 0.81331861, + "num_input_tokens_seen": 154535456, + "router_z_loss_mlp": 0.33203125, + "step": 1864, + "time_per_iteration": 2.711484670639038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087216, + "balance_loss_mlp": 1.05424261, + "epoch": 0.3587918430165448, + "flos": 1412338484736.0, + "grad_norm": 0.047877863134497434, + "language_loss": 0.79232943, + "learning_rate": 0.0007423170057906996, + "loss": 0.80320162, + "num_input_tokens_seen": 154627568, + "router_z_loss_mlp": 0.32983398, + "step": 1865, + "time_per_iteration": 3.852776527404785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091453, + "balance_loss_mlp": 1.05821717, + "epoch": 0.3589842247018084, + "flos": 478313800704.0, + "grad_norm": 0.06431447428318769, + "language_loss": 0.85827845, + "learning_rate": 0.0007420444474077275, + "loss": 0.86919296, + "num_input_tokens_seen": 154694640, + "router_z_loss_mlp": 0.33251953, + "step": 1866, + "time_per_iteration": 2.6104037761688232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101796, + "balance_loss_mlp": 1.06829846, + "epoch": 0.35917660638707194, + "flos": 504464205312.0, + "grad_norm": 0.06438653143979521, + "language_loss": 0.89830631, + "learning_rate": 0.0007417717950547671, + "loss": 0.90932429, + "num_input_tokens_seen": 154762048, + "router_z_loss_mlp": 0.33520508, + "step": 1867, + "time_per_iteration": 2.5619330406188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108848, + "balance_loss_mlp": 1.09687901, + "epoch": 0.3593689880723355, + "flos": 1491294191616.0, + "grad_norm": 0.037524520389889536, + "language_loss": 0.75996608, + "learning_rate": 0.0007414990488376713, + "loss": 0.77105457, + "num_input_tokens_seen": 154989952, + "router_z_loss_mlp": 0.11962891, + "step": 1868, + "time_per_iteration": 4.971943378448486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096427, + "balance_loss_mlp": 1.06388319, + "epoch": 0.35956136975759906, + "flos": 528369564672.0, + "grad_norm": 0.050983088733796166, + "language_loss": 0.84612024, + "learning_rate": 0.0007412262088623299, + "loss": 0.85708451, + "num_input_tokens_seen": 155066992, + "router_z_loss_mlp": 0.32543945, + "step": 1869, + "time_per_iteration": 2.7295072078704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104656, + "balance_loss_mlp": 1.07142007, + "epoch": 0.35975375144286265, + "flos": 534647803392.0, + "grad_norm": 0.057848782745497714, + "language_loss": 0.79012549, + "learning_rate": 0.0007409532752346684, + "loss": 0.80117208, + "num_input_tokens_seen": 155137616, + "router_z_loss_mlp": 0.33251953, + "step": 1870, + "time_per_iteration": 2.74664568901062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107858, + "balance_loss_mlp": 1.07464623, + "epoch": 0.3599461331281262, + "flos": 504695549952.0, + "grad_norm": 0.054621035664709404, + "language_loss": 0.88661271, + "learning_rate": 0.0007406802480606491, + "loss": 0.89769125, + "num_input_tokens_seen": 155209248, + "router_z_loss_mlp": 0.33227539, + "step": 1871, + "time_per_iteration": 2.636101722717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102516, + "balance_loss_mlp": 1.06923246, + "epoch": 0.36013851481338977, + "flos": 511283708928.0, + "grad_norm": 0.05849515281409536, + "language_loss": 0.90592384, + "learning_rate": 0.0007404071274462707, + "loss": 0.91694903, + "num_input_tokens_seen": 155274176, + "router_z_loss_mlp": 0.33300781, + "step": 1872, + "time_per_iteration": 2.559588670730591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098641, + "balance_loss_mlp": 1.06533384, + "epoch": 0.36033089649865335, + "flos": 547330908672.0, + "grad_norm": 0.06237198940644659, + "language_loss": 0.8363173, + "learning_rate": 0.0007401339134975682, + "loss": 0.84730369, + "num_input_tokens_seen": 155343232, + "router_z_loss_mlp": 0.33325195, + "step": 1873, + "time_per_iteration": 2.6156845092773438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100197, + "balance_loss_mlp": 1.06617522, + "epoch": 0.3605232781839169, + "flos": 458416903680.0, + "grad_norm": 0.05108892475659159, + "language_loss": 0.84187275, + "learning_rate": 0.0007398606063206122, + "loss": 0.85287476, + "num_input_tokens_seen": 155410080, + "router_z_loss_mlp": 0.34033203, + "step": 1874, + "time_per_iteration": 2.6152756214141846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090341, + "balance_loss_mlp": 1.05729628, + "epoch": 0.36071565986918047, + "flos": 509309296128.0, + "grad_norm": 0.05589329807105905, + "language_loss": 0.7857852, + "learning_rate": 0.0007395872060215101, + "loss": 0.79668868, + "num_input_tokens_seen": 155476240, + "router_z_loss_mlp": 0.33056641, + "step": 1875, + "time_per_iteration": 2.592906951904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095107, + "balance_loss_mlp": 1.06230044, + "epoch": 0.360908041554444, + "flos": 558931484160.0, + "grad_norm": 0.12468103825296885, + "language_loss": 0.88329792, + "learning_rate": 0.0007393137127064056, + "loss": 0.89424896, + "num_input_tokens_seen": 155543392, + "router_z_loss_mlp": 0.328125, + "step": 1876, + "time_per_iteration": 2.629368782043457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109653, + "balance_loss_mlp": 1.06300879, + "epoch": 0.3611004232397076, + "flos": 523588492800.0, + "grad_norm": 0.05189881397754868, + "language_loss": 0.84167802, + "learning_rate": 0.0007390401264814779, + "loss": 0.85264337, + "num_input_tokens_seen": 155613264, + "router_z_loss_mlp": 0.33544922, + "step": 1877, + "time_per_iteration": 2.644322156906128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084049, + "balance_loss_mlp": 1.05179131, + "epoch": 0.3612928049249711, + "flos": 540728193024.0, + "grad_norm": 0.07312313725982984, + "language_loss": 0.84472072, + "learning_rate": 0.0007387664474529427, + "loss": 0.8555612, + "num_input_tokens_seen": 155683712, + "router_z_loss_mlp": 0.32250977, + "step": 1878, + "time_per_iteration": 2.6105034351348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094149, + "balance_loss_mlp": 1.06131935, + "epoch": 0.3614851866102347, + "flos": 552289480704.0, + "grad_norm": 0.06338398309504269, + "language_loss": 0.9129535, + "learning_rate": 0.0007384926757270518, + "loss": 0.923895, + "num_input_tokens_seen": 155751760, + "router_z_loss_mlp": 0.32836914, + "step": 1879, + "time_per_iteration": 2.6268200874328613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082708, + "balance_loss_mlp": 1.05023539, + "epoch": 0.36167756829549824, + "flos": 771734338560.0, + "grad_norm": 0.048672507925477976, + "language_loss": 0.79680419, + "learning_rate": 0.0007382188114100924, + "loss": 0.80763125, + "num_input_tokens_seen": 155830464, + "router_z_loss_mlp": 0.32470703, + "step": 1880, + "time_per_iteration": 2.9548373222351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078144, + "balance_loss_mlp": 1.04509938, + "epoch": 0.36186994998076183, + "flos": 711560609280.0, + "grad_norm": 0.04804943389379678, + "language_loss": 0.81544787, + "learning_rate": 0.0007379448546083884, + "loss": 0.82622933, + "num_input_tokens_seen": 155906208, + "router_z_loss_mlp": 0.33056641, + "step": 1881, + "time_per_iteration": 2.900480031967163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082413, + "balance_loss_mlp": 1.04884315, + "epoch": 0.3620623316660254, + "flos": 747209138688.0, + "grad_norm": 0.049920719635936736, + "language_loss": 0.88019323, + "learning_rate": 0.0007376708054282992, + "loss": 0.89101738, + "num_input_tokens_seen": 155983584, + "router_z_loss_mlp": 0.3359375, + "step": 1882, + "time_per_iteration": 2.9482829570770264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075385, + "balance_loss_mlp": 1.04286492, + "epoch": 0.36225471335128895, + "flos": 482312088576.0, + "grad_norm": 0.04692483307288239, + "language_loss": 0.83908749, + "learning_rate": 0.0007373966639762201, + "loss": 0.84984136, + "num_input_tokens_seen": 156052464, + "router_z_loss_mlp": 0.32519531, + "step": 1883, + "time_per_iteration": 2.597809076309204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078049, + "balance_loss_mlp": 1.0448606, + "epoch": 0.36244709503655254, + "flos": 506655406080.0, + "grad_norm": 0.0703007209611724, + "language_loss": 0.8835175, + "learning_rate": 0.0007371224303585822, + "loss": 0.89429802, + "num_input_tokens_seen": 156121424, + "router_z_loss_mlp": 0.33203125, + "step": 1884, + "time_per_iteration": 2.5686471462249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046189, + "balance_loss_mlp": 1.03360081, + "epoch": 0.36263947672181607, + "flos": 1393302643200.0, + "grad_norm": 0.020620128786032376, + "language_loss": 0.80357069, + "learning_rate": 0.0007368481046818524, + "loss": 0.81403255, + "num_input_tokens_seen": 156346144, + "router_z_loss_mlp": 0.12597656, + "step": 1885, + "time_per_iteration": 4.68831205368042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073343, + "balance_loss_mlp": 1.03939199, + "epoch": 0.36283185840707965, + "flos": 652991735808.0, + "grad_norm": 0.05943029236677907, + "language_loss": 0.82845902, + "learning_rate": 0.0007365736870525335, + "loss": 0.83919251, + "num_input_tokens_seen": 156420880, + "router_z_loss_mlp": 0.33959961, + "step": 1886, + "time_per_iteration": 2.8566346168518066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070323, + "balance_loss_mlp": 1.0373739, + "epoch": 0.3630242400923432, + "flos": 488619440640.0, + "grad_norm": 0.06703223685064427, + "language_loss": 0.82574463, + "learning_rate": 0.000736299177577164, + "loss": 0.83644783, + "num_input_tokens_seen": 156485616, + "router_z_loss_mlp": 0.32958984, + "step": 1887, + "time_per_iteration": 2.5848894119262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074233, + "balance_loss_mlp": 1.04130709, + "epoch": 0.3632166217776068, + "flos": 516892644864.0, + "grad_norm": 0.0626455482667494, + "language_loss": 0.83844066, + "learning_rate": 0.0007360245763623174, + "loss": 0.84918302, + "num_input_tokens_seen": 156557840, + "router_z_loss_mlp": 0.3293457, + "step": 1888, + "time_per_iteration": 2.6179397106170654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067525, + "balance_loss_mlp": 1.03564882, + "epoch": 0.36340900346287036, + "flos": 645881250816.0, + "grad_norm": 0.06111369810549259, + "language_loss": 0.89794236, + "learning_rate": 0.0007357498835146039, + "loss": 0.90861762, + "num_input_tokens_seen": 156632496, + "router_z_loss_mlp": 0.31860352, + "step": 1889, + "time_per_iteration": 2.8311662673950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070281, + "balance_loss_mlp": 1.03854752, + "epoch": 0.3636013851481339, + "flos": 553057708032.0, + "grad_norm": 0.0568549422608731, + "language_loss": 0.86402494, + "learning_rate": 0.0007354750991406684, + "loss": 0.87472773, + "num_input_tokens_seen": 156705296, + "router_z_loss_mlp": 0.31713867, + "step": 1890, + "time_per_iteration": 2.6922197341918945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072041, + "balance_loss_mlp": 1.03952074, + "epoch": 0.3637937668333975, + "flos": 546395355648.0, + "grad_norm": 0.053455628499382915, + "language_loss": 0.80957252, + "learning_rate": 0.0007352002233471919, + "loss": 0.82029295, + "num_input_tokens_seen": 156773376, + "router_z_loss_mlp": 0.32519531, + "step": 1891, + "time_per_iteration": 2.621241569519043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066281, + "balance_loss_mlp": 1.03371286, + "epoch": 0.363986148518661, + "flos": 537838576128.0, + "grad_norm": 0.07508945751401845, + "language_loss": 0.79549944, + "learning_rate": 0.0007349252562408906, + "loss": 0.80616224, + "num_input_tokens_seen": 156844336, + "router_z_loss_mlp": 0.32543945, + "step": 1892, + "time_per_iteration": 2.674318552017212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069175, + "balance_loss_mlp": 1.0372504, + "epoch": 0.3641785302039246, + "flos": 659895607296.0, + "grad_norm": 0.04761623500947703, + "language_loss": 0.81258041, + "learning_rate": 0.0007346501979285158, + "loss": 0.82327211, + "num_input_tokens_seen": 156918848, + "router_z_loss_mlp": 0.3190918, + "step": 1893, + "time_per_iteration": 2.8671226501464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035364, + "balance_loss_mlp": 1.0238719, + "epoch": 0.36437091188918813, + "flos": 1467911158272.0, + "grad_norm": 0.02143179240630706, + "language_loss": 0.80539101, + "learning_rate": 0.0007343750485168551, + "loss": 0.81574464, + "num_input_tokens_seen": 157134736, + "router_z_loss_mlp": 0.11474609, + "step": 1894, + "time_per_iteration": 4.7720019817352295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075531, + "balance_loss_mlp": 1.04248571, + "epoch": 0.3645632935744517, + "flos": 597012733440.0, + "grad_norm": 0.049808711864839754, + "language_loss": 0.85850054, + "learning_rate": 0.0007340998081127308, + "loss": 0.8692559, + "num_input_tokens_seen": 157211920, + "router_z_loss_mlp": 0.33056641, + "step": 1895, + "time_per_iteration": 2.753730058670044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081037, + "balance_loss_mlp": 1.0486834, + "epoch": 0.36475567525971525, + "flos": 599214108672.0, + "grad_norm": 0.05384470863640996, + "language_loss": 0.9063257, + "learning_rate": 0.0007338244768230007, + "loss": 0.91713607, + "num_input_tokens_seen": 157284224, + "router_z_loss_mlp": 0.32348633, + "step": 1896, + "time_per_iteration": 2.749844551086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084345, + "balance_loss_mlp": 1.05189669, + "epoch": 0.36494805694497884, + "flos": 798047686656.0, + "grad_norm": 0.12041633701688108, + "language_loss": 0.88843018, + "learning_rate": 0.0007335490547545578, + "loss": 0.89927363, + "num_input_tokens_seen": 157367920, + "router_z_loss_mlp": 0.32446289, + "step": 1897, + "time_per_iteration": 3.0181519985198975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089996, + "balance_loss_mlp": 1.05795288, + "epoch": 0.3651404386302424, + "flos": 637023315456.0, + "grad_norm": 0.06340749789439089, + "language_loss": 0.82377589, + "learning_rate": 0.0007332735420143308, + "loss": 0.83467579, + "num_input_tokens_seen": 157438672, + "router_z_loss_mlp": 0.3203125, + "step": 1898, + "time_per_iteration": 2.7370855808258057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077681, + "balance_loss_mlp": 1.04442132, + "epoch": 0.36533282031550596, + "flos": 491337349632.0, + "grad_norm": 0.05751458989837244, + "language_loss": 0.8663426, + "learning_rate": 0.0007329979387092826, + "loss": 0.87711942, + "num_input_tokens_seen": 157505888, + "router_z_loss_mlp": 0.33276367, + "step": 1899, + "time_per_iteration": 2.552072048187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076766, + "balance_loss_mlp": 1.0440551, + "epoch": 0.36552520200076954, + "flos": 855587874816.0, + "grad_norm": 0.050366197091212025, + "language_loss": 0.83863711, + "learning_rate": 0.0007327222449464124, + "loss": 0.84940481, + "num_input_tokens_seen": 157601568, + "router_z_loss_mlp": 0.32714844, + "step": 1900, + "time_per_iteration": 3.2450594902038574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072697, + "balance_loss_mlp": 1.03936601, + "epoch": 0.3657175836860331, + "flos": 483449872896.0, + "grad_norm": 0.053278478248789174, + "language_loss": 0.88864619, + "learning_rate": 0.0007324464608327538, + "loss": 0.89937317, + "num_input_tokens_seen": 157670992, + "router_z_loss_mlp": 0.33349609, + "step": 1901, + "time_per_iteration": 2.6027348041534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072333, + "balance_loss_mlp": 1.03957391, + "epoch": 0.36590996537129666, + "flos": 434561006592.0, + "grad_norm": 0.058664113400220264, + "language_loss": 0.88440275, + "learning_rate": 0.0007321705864753758, + "loss": 0.8951261, + "num_input_tokens_seen": 157743616, + "router_z_loss_mlp": 0.32763672, + "step": 1902, + "time_per_iteration": 2.668935537338257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073172, + "balance_loss_mlp": 1.03950715, + "epoch": 0.3661023470565602, + "flos": 711880704000.0, + "grad_norm": 0.047699393186438684, + "language_loss": 0.84307706, + "learning_rate": 0.0007318946219813823, + "loss": 0.85380876, + "num_input_tokens_seen": 157823520, + "router_z_loss_mlp": 0.33691406, + "step": 1903, + "time_per_iteration": 3.025866985321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074477, + "balance_loss_mlp": 1.04100275, + "epoch": 0.3662947287418238, + "flos": 564495340032.0, + "grad_norm": 0.05797091317965262, + "language_loss": 0.90078342, + "learning_rate": 0.000731618567457912, + "loss": 0.91152817, + "num_input_tokens_seen": 157893248, + "router_z_loss_mlp": 0.3347168, + "step": 1904, + "time_per_iteration": 2.6391115188598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073585, + "balance_loss_mlp": 1.03937197, + "epoch": 0.3664871104270873, + "flos": 789391982592.0, + "grad_norm": 0.05925410463566973, + "language_loss": 0.87083924, + "learning_rate": 0.000731342423012139, + "loss": 0.88157511, + "num_input_tokens_seen": 157973216, + "router_z_loss_mlp": 0.3425293, + "step": 1905, + "time_per_iteration": 3.020660400390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070848, + "balance_loss_mlp": 1.03682566, + "epoch": 0.3666794921123509, + "flos": 752202616320.0, + "grad_norm": 0.06601024748857935, + "language_loss": 0.82244205, + "learning_rate": 0.0007310661887512722, + "loss": 0.83315057, + "num_input_tokens_seen": 158051088, + "router_z_loss_mlp": 0.34033203, + "step": 1906, + "time_per_iteration": 3.0128185749053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068568, + "balance_loss_mlp": 1.03571391, + "epoch": 0.3668718737976145, + "flos": 523264015872.0, + "grad_norm": 0.04853340441162438, + "language_loss": 0.82115662, + "learning_rate": 0.0007307898647825549, + "loss": 0.8318423, + "num_input_tokens_seen": 158124368, + "router_z_loss_mlp": 0.32861328, + "step": 1907, + "time_per_iteration": 2.6610257625579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075142, + "balance_loss_mlp": 1.04126275, + "epoch": 0.367064255482878, + "flos": 571698957312.0, + "grad_norm": 0.05773956677348378, + "language_loss": 0.89470363, + "learning_rate": 0.0007305134512132659, + "loss": 0.90545505, + "num_input_tokens_seen": 158191472, + "router_z_loss_mlp": 0.33886719, + "step": 1908, + "time_per_iteration": 2.706658124923706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076976, + "balance_loss_mlp": 1.04309678, + "epoch": 0.3672566371681416, + "flos": 446880347136.0, + "grad_norm": 0.0894454707503668, + "language_loss": 0.83388865, + "learning_rate": 0.0007302369481507183, + "loss": 0.84465849, + "num_input_tokens_seen": 158254384, + "router_z_loss_mlp": 0.33911133, + "step": 1909, + "time_per_iteration": 2.499483346939087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049259, + "balance_loss_mlp": 1.03838694, + "epoch": 0.36744901885340514, + "flos": 1539275208192.0, + "grad_norm": 0.032302576214162944, + "language_loss": 0.79961759, + "learning_rate": 0.00072996035570226, + "loss": 0.81011015, + "num_input_tokens_seen": 158486160, + "router_z_loss_mlp": 0.10888672, + "step": 1910, + "time_per_iteration": 4.86514949798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088721, + "balance_loss_mlp": 1.05476999, + "epoch": 0.36764140053866873, + "flos": 563417192448.0, + "grad_norm": 0.061805914783829616, + "language_loss": 0.85575247, + "learning_rate": 0.000729683673975274, + "loss": 0.86663967, + "num_input_tokens_seen": 158555616, + "router_z_loss_mlp": 0.33984375, + "step": 1911, + "time_per_iteration": 2.6907522678375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091619, + "balance_loss_mlp": 1.05709589, + "epoch": 0.36783378222393226, + "flos": 1216168971264.0, + "grad_norm": 0.04498413319979697, + "language_loss": 0.82746279, + "learning_rate": 0.0007294069030771774, + "loss": 0.83837891, + "num_input_tokens_seen": 158653984, + "router_z_loss_mlp": 0.34570312, + "step": 1912, + "time_per_iteration": 3.6445353031158447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109623, + "balance_loss_mlp": 1.06196952, + "epoch": 0.36802616390919585, + "flos": 498476947968.0, + "grad_norm": 0.055898807174015214, + "language_loss": 0.90671504, + "learning_rate": 0.0007291300431154224, + "loss": 0.9176774, + "num_input_tokens_seen": 158719728, + "router_z_loss_mlp": 0.34301758, + "step": 1913, + "time_per_iteration": 2.5600366592407227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01025736, + "balance_loss_mlp": 1.0155319, + "epoch": 0.36821854559445943, + "flos": 1581281961984.0, + "grad_norm": 0.015307788275572325, + "language_loss": 0.70389736, + "learning_rate": 0.0007288530941974955, + "loss": 0.71415472, + "num_input_tokens_seen": 158952544, + "router_z_loss_mlp": 0.10205078, + "step": 1914, + "time_per_iteration": 4.9577555656433105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110633, + "balance_loss_mlp": 1.07287991, + "epoch": 0.36841092727972297, + "flos": 835261784064.0, + "grad_norm": 0.06223209716338702, + "language_loss": 0.79735458, + "learning_rate": 0.0007285760564309179, + "loss": 0.80841786, + "num_input_tokens_seen": 159039680, + "router_z_loss_mlp": 0.3347168, + "step": 1915, + "time_per_iteration": 3.1251590251922607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101251, + "balance_loss_mlp": 1.06811082, + "epoch": 0.36860330896498655, + "flos": 689517591552.0, + "grad_norm": 0.05672479428696366, + "language_loss": 0.85010201, + "learning_rate": 0.0007282989299232448, + "loss": 0.8611145, + "num_input_tokens_seen": 159128128, + "router_z_loss_mlp": 0.33154297, + "step": 1916, + "time_per_iteration": 3.062971353530884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096553, + "balance_loss_mlp": 1.06381774, + "epoch": 0.3687956906502501, + "flos": 553919067648.0, + "grad_norm": 0.05955658020637064, + "language_loss": 0.83600092, + "learning_rate": 0.0007280217147820668, + "loss": 0.84696645, + "num_input_tokens_seen": 159193248, + "router_z_loss_mlp": 0.32739258, + "step": 1917, + "time_per_iteration": 2.6169495582580566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097845, + "balance_loss_mlp": 1.06465673, + "epoch": 0.3689880723355137, + "flos": 576426184704.0, + "grad_norm": 0.05443515430960571, + "language_loss": 0.79137111, + "learning_rate": 0.0007277444111150079, + "loss": 0.80234957, + "num_input_tokens_seen": 159265824, + "router_z_loss_mlp": 0.33203125, + "step": 1918, + "time_per_iteration": 2.672696828842163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101204, + "balance_loss_mlp": 1.06820679, + "epoch": 0.3691804540207772, + "flos": 528615465984.0, + "grad_norm": 0.06564490716140688, + "language_loss": 0.84340626, + "learning_rate": 0.0007274670190297272, + "loss": 0.85441828, + "num_input_tokens_seen": 159332992, + "router_z_loss_mlp": 0.33007812, + "step": 1919, + "time_per_iteration": 2.569920539855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091279, + "balance_loss_mlp": 1.0575906, + "epoch": 0.3693728357060408, + "flos": 560729806848.0, + "grad_norm": 0.06475948680742319, + "language_loss": 0.81988895, + "learning_rate": 0.0007271895386339179, + "loss": 0.83080173, + "num_input_tokens_seen": 159409808, + "router_z_loss_mlp": 0.3371582, + "step": 1920, + "time_per_iteration": 2.765470027923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086642, + "balance_loss_mlp": 1.05350161, + "epoch": 0.3695652173913043, + "flos": 579488919552.0, + "grad_norm": 0.0536525739451854, + "language_loss": 0.82950377, + "learning_rate": 0.0007269119700353073, + "loss": 0.84037018, + "num_input_tokens_seen": 159486128, + "router_z_loss_mlp": 0.33154297, + "step": 1921, + "time_per_iteration": 2.703117847442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089729, + "balance_loss_mlp": 1.05756688, + "epoch": 0.3697575990765679, + "flos": 512629516800.0, + "grad_norm": 0.04104943724396866, + "language_loss": 0.84983069, + "learning_rate": 0.0007266343133416571, + "loss": 0.86072791, + "num_input_tokens_seen": 159562224, + "router_z_loss_mlp": 0.3215332, + "step": 1922, + "time_per_iteration": 2.7371909618377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060973, + "balance_loss_mlp": 1.04967153, + "epoch": 0.3699499807618315, + "flos": 1569826953216.0, + "grad_norm": 0.023907464900796205, + "language_loss": 0.77116919, + "learning_rate": 0.0007263565686607632, + "loss": 0.78177893, + "num_input_tokens_seen": 159784768, + "router_z_loss_mlp": 0.11279297, + "step": 1923, + "time_per_iteration": 4.827981233596802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084577, + "balance_loss_mlp": 1.05136538, + "epoch": 0.37014236244709503, + "flos": 497093262336.0, + "grad_norm": 0.06739223877154035, + "language_loss": 0.84575641, + "learning_rate": 0.0007260787361004556, + "loss": 0.85660219, + "num_input_tokens_seen": 159848608, + "router_z_loss_mlp": 0.33227539, + "step": 1924, + "time_per_iteration": 2.6221601963043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048708, + "balance_loss_mlp": 1.03764546, + "epoch": 0.3703347441323586, + "flos": 1443562048512.0, + "grad_norm": 0.02017040526472397, + "language_loss": 0.73761505, + "learning_rate": 0.0007258008157685987, + "loss": 0.74810213, + "num_input_tokens_seen": 160080928, + "router_z_loss_mlp": 0.11083984, + "step": 1925, + "time_per_iteration": 4.909639120101929 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083689, + "balance_loss_mlp": 1.04997683, + "epoch": 0.37052712581762215, + "flos": 563324060160.0, + "grad_norm": 0.19489786122972683, + "language_loss": 0.87265027, + "learning_rate": 0.0007255228077730903, + "loss": 0.88348716, + "num_input_tokens_seen": 160148976, + "router_z_loss_mlp": 0.33740234, + "step": 1926, + "time_per_iteration": 2.726412773132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092339, + "balance_loss_mlp": 1.05850744, + "epoch": 0.37071950750288574, + "flos": 925706451456.0, + "grad_norm": 0.06639539702607969, + "language_loss": 0.81730163, + "learning_rate": 0.0007252447122218632, + "loss": 0.82822502, + "num_input_tokens_seen": 160233504, + "router_z_loss_mlp": 0.33862305, + "step": 1927, + "time_per_iteration": 3.157439708709717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090678, + "balance_loss_mlp": 1.05710912, + "epoch": 0.37091188918814927, + "flos": 418090609152.0, + "grad_norm": 0.06586667444600991, + "language_loss": 0.87736213, + "learning_rate": 0.0007249665292228834, + "loss": 0.88826889, + "num_input_tokens_seen": 160299696, + "router_z_loss_mlp": 0.3359375, + "step": 1928, + "time_per_iteration": 2.569284677505493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086968, + "balance_loss_mlp": 1.05208778, + "epoch": 0.37110427087341286, + "flos": 462941899776.0, + "grad_norm": 0.056849308308669105, + "language_loss": 0.83676869, + "learning_rate": 0.000724688258884151, + "loss": 0.84763837, + "num_input_tokens_seen": 160367904, + "router_z_loss_mlp": 0.34912109, + "step": 1929, + "time_per_iteration": 2.522596597671509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085634, + "balance_loss_mlp": 1.05204105, + "epoch": 0.3712966525586764, + "flos": 849303843840.0, + "grad_norm": 0.0484214736208702, + "language_loss": 0.86208755, + "learning_rate": 0.0007244099013137002, + "loss": 0.87294388, + "num_input_tokens_seen": 160453600, + "router_z_loss_mlp": 0.33618164, + "step": 1930, + "time_per_iteration": 3.055302619934082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089345, + "balance_loss_mlp": 1.05370176, + "epoch": 0.37148903424394, + "flos": 925555092480.0, + "grad_norm": 0.05147814185741214, + "language_loss": 0.88918859, + "learning_rate": 0.0007241314566195993, + "loss": 0.90008199, + "num_input_tokens_seen": 160543472, + "router_z_loss_mlp": 0.35693359, + "step": 1931, + "time_per_iteration": 3.249950408935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108542, + "balance_loss_mlp": 1.05020559, + "epoch": 0.37168141592920356, + "flos": 519565473792.0, + "grad_norm": 0.061459583473066896, + "language_loss": 0.85347825, + "learning_rate": 0.0007238529249099496, + "loss": 0.86433244, + "num_input_tokens_seen": 160614016, + "router_z_loss_mlp": 0.35253906, + "step": 1932, + "time_per_iteration": 2.603287696838379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068675, + "balance_loss_mlp": 1.05599129, + "epoch": 0.3718737976144671, + "flos": 1445107267584.0, + "grad_norm": 0.02721294021284605, + "language_loss": 0.77856874, + "learning_rate": 0.0007235743062928872, + "loss": 0.7892555, + "num_input_tokens_seen": 160828640, + "router_z_loss_mlp": 0.12695312, + "step": 1933, + "time_per_iteration": 4.850685358047485 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089729, + "balance_loss_mlp": 1.05346537, + "epoch": 0.3720661792997307, + "flos": 759218558976.0, + "grad_norm": 0.08735029164116491, + "language_loss": 0.80658156, + "learning_rate": 0.000723295600876581, + "loss": 0.81747884, + "num_input_tokens_seen": 160913088, + "router_z_loss_mlp": 0.36279297, + "step": 1934, + "time_per_iteration": 3.0082099437713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092646, + "balance_loss_mlp": 1.05690694, + "epoch": 0.3722585609849942, + "flos": 516686031360.0, + "grad_norm": 0.1760204301041219, + "language_loss": 0.8798061, + "learning_rate": 0.0007230168087692344, + "loss": 0.89073259, + "num_input_tokens_seen": 160982960, + "router_z_loss_mlp": 0.35791016, + "step": 1935, + "time_per_iteration": 2.7076756954193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095918, + "balance_loss_mlp": 1.06070328, + "epoch": 0.3724509426702578, + "flos": 782114171904.0, + "grad_norm": 0.058450977170247324, + "language_loss": 0.82290804, + "learning_rate": 0.0007227379300790839, + "loss": 0.83386725, + "num_input_tokens_seen": 161066000, + "router_z_loss_mlp": 0.35205078, + "step": 1936, + "time_per_iteration": 3.0381107330322266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108913, + "balance_loss_mlp": 1.07262528, + "epoch": 0.37264332435552133, + "flos": 391502246400.0, + "grad_norm": 0.062314619417064634, + "language_loss": 0.85779369, + "learning_rate": 0.0007224589649143997, + "loss": 0.86888283, + "num_input_tokens_seen": 161131040, + "router_z_loss_mlp": 0.36328125, + "step": 1937, + "time_per_iteration": 2.5413918495178223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118094, + "balance_loss_mlp": 1.08223581, + "epoch": 0.3728357060407849, + "flos": 542599299072.0, + "grad_norm": 0.08241458585549921, + "language_loss": 0.80921531, + "learning_rate": 0.0007221799133834861, + "loss": 0.82039624, + "num_input_tokens_seen": 161201248, + "router_z_loss_mlp": 0.35839844, + "step": 1938, + "time_per_iteration": 2.620593309402466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122902, + "balance_loss_mlp": 1.08682895, + "epoch": 0.3730280877260485, + "flos": 433344646656.0, + "grad_norm": 0.05702640818290307, + "language_loss": 0.81373966, + "learning_rate": 0.00072190077559468, + "loss": 0.8249687, + "num_input_tokens_seen": 161266288, + "router_z_loss_mlp": 0.36083984, + "step": 1939, + "time_per_iteration": 2.512871026992798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133095, + "balance_loss_mlp": 1.09587836, + "epoch": 0.37322046941131204, + "flos": 531230068224.0, + "grad_norm": 0.0616329871980105, + "language_loss": 0.89228082, + "learning_rate": 0.0007216215516563527, + "loss": 0.90361178, + "num_input_tokens_seen": 161335648, + "router_z_loss_mlp": 0.37207031, + "step": 1940, + "time_per_iteration": 2.6655144691467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113311, + "balance_loss_mlp": 1.09534478, + "epoch": 0.3734128510965756, + "flos": 531294087168.0, + "grad_norm": 0.05659412158312536, + "language_loss": 0.83479297, + "learning_rate": 0.0007213422416769083, + "loss": 0.84612405, + "num_input_tokens_seen": 161403440, + "router_z_loss_mlp": 0.37744141, + "step": 1941, + "time_per_iteration": 2.6088144779205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127109, + "balance_loss_mlp": 1.09022546, + "epoch": 0.37360523278183916, + "flos": 500195284992.0, + "grad_norm": 0.05910558413712496, + "language_loss": 0.74991721, + "learning_rate": 0.0007210628457647849, + "loss": 0.76118833, + "num_input_tokens_seen": 161472864, + "router_z_loss_mlp": 0.36889648, + "step": 1942, + "time_per_iteration": 2.57867169380188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131479, + "balance_loss_mlp": 1.09366596, + "epoch": 0.37379761446710275, + "flos": 547652413440.0, + "grad_norm": 0.06364761456819781, + "language_loss": 0.79148316, + "learning_rate": 0.000720783364028453, + "loss": 0.80279785, + "num_input_tokens_seen": 161548096, + "router_z_loss_mlp": 0.37768555, + "step": 1943, + "time_per_iteration": 2.744575023651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121556, + "balance_loss_mlp": 1.0834806, + "epoch": 0.3739899961523663, + "flos": 475517316096.0, + "grad_norm": 0.05406366318559307, + "language_loss": 0.87411249, + "learning_rate": 0.0007205037965764177, + "loss": 0.88532799, + "num_input_tokens_seen": 161615600, + "router_z_loss_mlp": 0.38061523, + "step": 1944, + "time_per_iteration": 2.5253238677978516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122588, + "balance_loss_mlp": 1.0851568, + "epoch": 0.37418237783762986, + "flos": 611626581504.0, + "grad_norm": 0.05571778703090581, + "language_loss": 0.85614675, + "learning_rate": 0.0007202241435172161, + "loss": 0.86737263, + "num_input_tokens_seen": 161687408, + "router_z_loss_mlp": 0.37426758, + "step": 1945, + "time_per_iteration": 2.7462716102600098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117651, + "balance_loss_mlp": 1.07931328, + "epoch": 0.3743747595228934, + "flos": 765953694720.0, + "grad_norm": 0.05225192391609906, + "language_loss": 0.88148731, + "learning_rate": 0.0007199444049594198, + "loss": 0.89266384, + "num_input_tokens_seen": 161764224, + "router_z_loss_mlp": 0.38330078, + "step": 1946, + "time_per_iteration": 2.9533469676971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116752, + "balance_loss_mlp": 1.07798529, + "epoch": 0.374567141208157, + "flos": 524120993280.0, + "grad_norm": 0.06150523549490838, + "language_loss": 0.83402771, + "learning_rate": 0.0007196645810116322, + "loss": 0.84519523, + "num_input_tokens_seen": 161835520, + "router_z_loss_mlp": 0.38720703, + "step": 1947, + "time_per_iteration": 2.709965705871582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106001, + "balance_loss_mlp": 1.06735349, + "epoch": 0.37475952289342057, + "flos": 681067090944.0, + "grad_norm": 0.05833074938802531, + "language_loss": 0.83909506, + "learning_rate": 0.0007193846717824912, + "loss": 0.850155, + "num_input_tokens_seen": 161912000, + "router_z_loss_mlp": 0.38598633, + "step": 1948, + "time_per_iteration": 2.854522705078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094783, + "balance_loss_mlp": 1.05682671, + "epoch": 0.3749519045786841, + "flos": 460061047296.0, + "grad_norm": 0.06673844071937801, + "language_loss": 0.88263041, + "learning_rate": 0.0007191046773806669, + "loss": 0.89357823, + "num_input_tokens_seen": 161977296, + "router_z_loss_mlp": 0.37915039, + "step": 1949, + "time_per_iteration": 2.575989007949829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085738, + "balance_loss_mlp": 1.04682803, + "epoch": 0.3751442862639477, + "flos": 954471458304.0, + "grad_norm": 0.06638817682476543, + "language_loss": 0.83010924, + "learning_rate": 0.0007188245979148631, + "loss": 0.84096658, + "num_input_tokens_seen": 162051888, + "router_z_loss_mlp": 0.38867188, + "step": 1950, + "time_per_iteration": 3.1386518478393555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082888, + "balance_loss_mlp": 1.04462171, + "epoch": 0.3753366679492112, + "flos": 527483473920.0, + "grad_norm": 0.05996025340147905, + "language_loss": 0.8766306, + "learning_rate": 0.0007185444334938157, + "loss": 0.88745946, + "num_input_tokens_seen": 162124384, + "router_z_loss_mlp": 0.38232422, + "step": 1951, + "time_per_iteration": 2.644848585128784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082635, + "balance_loss_mlp": 1.04501283, + "epoch": 0.3755290496344748, + "flos": 521535504384.0, + "grad_norm": 0.05938829335869994, + "language_loss": 0.84891546, + "learning_rate": 0.0007182641842262947, + "loss": 0.85974181, + "num_input_tokens_seen": 162191440, + "router_z_loss_mlp": 0.3762207, + "step": 1952, + "time_per_iteration": 2.6239395141601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081192, + "balance_loss_mlp": 1.04361689, + "epoch": 0.37572143131973834, + "flos": 620810403840.0, + "grad_norm": 0.06544951097265184, + "language_loss": 0.77827752, + "learning_rate": 0.0007179838502211022, + "loss": 0.78908944, + "num_input_tokens_seen": 162268480, + "router_z_loss_mlp": 0.37524414, + "step": 1953, + "time_per_iteration": 2.8444712162017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076992, + "balance_loss_mlp": 1.03958416, + "epoch": 0.37591381300500193, + "flos": 770635842048.0, + "grad_norm": 0.05616797515781331, + "language_loss": 0.86183697, + "learning_rate": 0.0007177034315870738, + "loss": 0.87260687, + "num_input_tokens_seen": 162346752, + "router_z_loss_mlp": 0.37402344, + "step": 1954, + "time_per_iteration": 2.9628727436065674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078673, + "balance_loss_mlp": 1.0411936, + "epoch": 0.37610619469026546, + "flos": 520191106560.0, + "grad_norm": 0.05872311076525267, + "language_loss": 0.9098376, + "learning_rate": 0.0007174229284330773, + "loss": 0.92062426, + "num_input_tokens_seen": 162415120, + "router_z_loss_mlp": 0.37402344, + "step": 1955, + "time_per_iteration": 2.579792022705078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010784, + "balance_loss_mlp": 1.0412302, + "epoch": 0.37629857637552905, + "flos": 598524456960.0, + "grad_norm": 0.050284285498010506, + "language_loss": 0.86896843, + "learning_rate": 0.0007171423408680141, + "loss": 0.8797524, + "num_input_tokens_seen": 162493280, + "router_z_loss_mlp": 0.37133789, + "step": 1956, + "time_per_iteration": 2.7764384746551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079026, + "balance_loss_mlp": 1.04211903, + "epoch": 0.37649095806079264, + "flos": 564687396864.0, + "grad_norm": 0.058102078307858664, + "language_loss": 0.89614129, + "learning_rate": 0.0007168616690008176, + "loss": 0.90693152, + "num_input_tokens_seen": 162560736, + "router_z_loss_mlp": 0.36889648, + "step": 1957, + "time_per_iteration": 2.646986246109009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083688, + "balance_loss_mlp": 1.04606569, + "epoch": 0.37668333974605617, + "flos": 592196755968.0, + "grad_norm": 0.10223927136981294, + "language_loss": 0.86142451, + "learning_rate": 0.0007165809129404545, + "loss": 0.8722614, + "num_input_tokens_seen": 162630688, + "router_z_loss_mlp": 0.37573242, + "step": 1958, + "time_per_iteration": 2.756287097930908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081219, + "balance_loss_mlp": 1.04440713, + "epoch": 0.37687572143131975, + "flos": 419257506816.0, + "grad_norm": 0.0560584683493853, + "language_loss": 0.85760534, + "learning_rate": 0.0007163000727959239, + "loss": 0.8684175, + "num_input_tokens_seen": 162694304, + "router_z_loss_mlp": 0.36791992, + "step": 1959, + "time_per_iteration": 2.5415151119232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105237, + "balance_loss_mlp": 1.03587127, + "epoch": 0.3770681031165833, + "flos": 1356484243968.0, + "grad_norm": 0.028402472736143748, + "language_loss": 0.77959073, + "learning_rate": 0.0007160191486762575, + "loss": 0.7901144, + "num_input_tokens_seen": 162920336, + "router_z_loss_mlp": 0.16503906, + "step": 1960, + "time_per_iteration": 4.892657518386841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086261, + "balance_loss_mlp": 1.04892445, + "epoch": 0.3772604848018469, + "flos": 644592107520.0, + "grad_norm": 0.04530874218926827, + "language_loss": 0.84377986, + "learning_rate": 0.00071573814069052, + "loss": 0.85464251, + "num_input_tokens_seen": 163000720, + "router_z_loss_mlp": 0.37329102, + "step": 1961, + "time_per_iteration": 2.898301839828491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088156, + "balance_loss_mlp": 1.05098641, + "epoch": 0.3774528664871104, + "flos": 901265619456.0, + "grad_norm": 0.052585227845940184, + "language_loss": 0.87987518, + "learning_rate": 0.0007154570489478081, + "loss": 0.89075673, + "num_input_tokens_seen": 163085680, + "router_z_loss_mlp": 0.37158203, + "step": 1962, + "time_per_iteration": 3.1638717651367188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088198, + "balance_loss_mlp": 1.05048013, + "epoch": 0.377645248172374, + "flos": 787717315584.0, + "grad_norm": 0.047624248218528294, + "language_loss": 0.864995, + "learning_rate": 0.0007151758735572514, + "loss": 0.87587702, + "num_input_tokens_seen": 163162224, + "router_z_loss_mlp": 0.37695312, + "step": 1963, + "time_per_iteration": 2.985558271408081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090796, + "balance_loss_mlp": 1.05236292, + "epoch": 0.3778376298576376, + "flos": 586417522176.0, + "grad_norm": 0.06598015027050642, + "language_loss": 0.80448836, + "learning_rate": 0.0007148946146280119, + "loss": 0.81539631, + "num_input_tokens_seen": 163237920, + "router_z_loss_mlp": 0.38427734, + "step": 1964, + "time_per_iteration": 2.784947395324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053975, + "balance_loss_mlp": 1.03938425, + "epoch": 0.3780300115429011, + "flos": 1396014759936.0, + "grad_norm": 0.018109037433210438, + "language_loss": 0.72192144, + "learning_rate": 0.000714613272269284, + "loss": 0.73246121, + "num_input_tokens_seen": 163455760, + "router_z_loss_mlp": 0.14550781, + "step": 1965, + "time_per_iteration": 4.85606050491333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052214, + "balance_loss_mlp": 1.03838599, + "epoch": 0.3782223932281647, + "flos": 1356935348736.0, + "grad_norm": 0.019183634636191996, + "language_loss": 0.75341946, + "learning_rate": 0.0007143318465902943, + "loss": 0.76394159, + "num_input_tokens_seen": 163678064, + "router_z_loss_mlp": 0.13867188, + "step": 1966, + "time_per_iteration": 4.946903467178345 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082797, + "balance_loss_mlp": 1.04517484, + "epoch": 0.37841477491342823, + "flos": 703811344896.0, + "grad_norm": 0.05921890511558738, + "language_loss": 0.83782387, + "learning_rate": 0.0007140503377003022, + "loss": 0.84865183, + "num_input_tokens_seen": 163764320, + "router_z_loss_mlp": 0.3762207, + "step": 1967, + "time_per_iteration": 2.984163761138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089165, + "balance_loss_mlp": 1.0504458, + "epoch": 0.3786071565986918, + "flos": 528856985088.0, + "grad_norm": 0.047303083725180994, + "language_loss": 0.84754062, + "learning_rate": 0.000713768745708599, + "loss": 0.85843223, + "num_input_tokens_seen": 163831808, + "router_z_loss_mlp": 0.38696289, + "step": 1968, + "time_per_iteration": 2.6251039505004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086627, + "balance_loss_mlp": 1.04881418, + "epoch": 0.37879953828395535, + "flos": 992872802304.0, + "grad_norm": 0.053091209869219315, + "language_loss": 0.76740122, + "learning_rate": 0.0007134870707245085, + "loss": 0.7782675, + "num_input_tokens_seen": 163918128, + "router_z_loss_mlp": 0.37792969, + "step": 1969, + "time_per_iteration": 3.252840995788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082313, + "balance_loss_mlp": 1.04435682, + "epoch": 0.37899191996921894, + "flos": 626358292992.0, + "grad_norm": 0.06088981396741891, + "language_loss": 0.8454808, + "learning_rate": 0.0007132053128573864, + "loss": 0.85630393, + "num_input_tokens_seen": 163987552, + "router_z_loss_mlp": 0.37915039, + "step": 1970, + "time_per_iteration": 2.739210844039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078955, + "balance_loss_mlp": 1.0417614, + "epoch": 0.37918430165448247, + "flos": 686005314048.0, + "grad_norm": 0.05972304110224919, + "language_loss": 0.83631253, + "learning_rate": 0.0007129234722166211, + "loss": 0.84710205, + "num_input_tokens_seen": 164063248, + "router_z_loss_mlp": 0.37182617, + "step": 1971, + "time_per_iteration": 2.814235210418701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086171, + "balance_loss_mlp": 1.05012178, + "epoch": 0.37937668333974606, + "flos": 475374721536.0, + "grad_norm": 0.05230765101952506, + "language_loss": 0.91063309, + "learning_rate": 0.0007126415489116328, + "loss": 0.92149478, + "num_input_tokens_seen": 164133776, + "router_z_loss_mlp": 0.3605957, + "step": 1972, + "time_per_iteration": 2.657435178756714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091568, + "balance_loss_mlp": 1.05413604, + "epoch": 0.37956906502500964, + "flos": 707271340032.0, + "grad_norm": 0.05210329015751025, + "language_loss": 0.81174934, + "learning_rate": 0.0007123595430518736, + "loss": 0.82266498, + "num_input_tokens_seen": 164206672, + "router_z_loss_mlp": 0.37402344, + "step": 1973, + "time_per_iteration": 2.832801103591919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108628, + "balance_loss_mlp": 1.04856205, + "epoch": 0.3797614467102732, + "flos": 426421836288.0, + "grad_norm": 0.07403044475037865, + "language_loss": 0.8602494, + "learning_rate": 0.0007120774547468282, + "loss": 0.87111217, + "num_input_tokens_seen": 164271968, + "router_z_loss_mlp": 0.37695312, + "step": 1974, + "time_per_iteration": 2.523059844970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087165, + "balance_loss_mlp": 1.05016232, + "epoch": 0.37995382839553676, + "flos": 481588941312.0, + "grad_norm": 0.05250431859571283, + "language_loss": 0.81228226, + "learning_rate": 0.0007117952841060128, + "loss": 0.82315391, + "num_input_tokens_seen": 164342800, + "router_z_loss_mlp": 0.36962891, + "step": 1975, + "time_per_iteration": 2.648947238922119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080927, + "balance_loss_mlp": 1.04409158, + "epoch": 0.3801462100808003, + "flos": 560286056448.0, + "grad_norm": 0.0511194255012935, + "language_loss": 0.83466387, + "learning_rate": 0.0007115130312389756, + "loss": 0.84547317, + "num_input_tokens_seen": 164414928, + "router_z_loss_mlp": 0.3684082, + "step": 1976, + "time_per_iteration": 2.6648154258728027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086722, + "balance_loss_mlp": 1.0505538, + "epoch": 0.3803385917660639, + "flos": 464699524608.0, + "grad_norm": 0.06028169205400359, + "language_loss": 0.79143679, + "learning_rate": 0.0007112306962552973, + "loss": 0.80230403, + "num_input_tokens_seen": 164483312, + "router_z_loss_mlp": 0.36181641, + "step": 1977, + "time_per_iteration": 2.5715434551239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086468, + "balance_loss_mlp": 1.04934657, + "epoch": 0.3805309734513274, + "flos": 521614080000.0, + "grad_norm": 0.055719330197324175, + "language_loss": 0.8517288, + "learning_rate": 0.0007109482792645896, + "loss": 0.86259341, + "num_input_tokens_seen": 164555760, + "router_z_loss_mlp": 0.37084961, + "step": 1978, + "time_per_iteration": 2.6932663917541504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088226, + "balance_loss_mlp": 1.05222487, + "epoch": 0.380723355136591, + "flos": 591128782848.0, + "grad_norm": 0.06665517257748008, + "language_loss": 0.83491528, + "learning_rate": 0.0007106657803764969, + "loss": 0.84579754, + "num_input_tokens_seen": 164626768, + "router_z_loss_mlp": 0.36010742, + "step": 1979, + "time_per_iteration": 2.710658311843872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079049, + "balance_loss_mlp": 1.04204643, + "epoch": 0.38091573682185453, + "flos": 622394910720.0, + "grad_norm": 0.05735071115872701, + "language_loss": 0.81648314, + "learning_rate": 0.0007103831997006948, + "loss": 0.82727367, + "num_input_tokens_seen": 164698016, + "router_z_loss_mlp": 0.36987305, + "step": 1980, + "time_per_iteration": 2.7589223384857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107835, + "balance_loss_mlp": 1.04168153, + "epoch": 0.3811081185071181, + "flos": 568716208128.0, + "grad_norm": 0.047165366669346453, + "language_loss": 0.85731214, + "learning_rate": 0.0007101005373468908, + "loss": 0.86809564, + "num_input_tokens_seen": 164780320, + "router_z_loss_mlp": 0.36669922, + "step": 1981, + "time_per_iteration": 2.85588002204895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077478, + "balance_loss_mlp": 1.04161954, + "epoch": 0.3813005001923817, + "flos": 584550798336.0, + "grad_norm": 0.055048826019740454, + "language_loss": 0.86394024, + "learning_rate": 0.0007098177934248242, + "loss": 0.87471503, + "num_input_tokens_seen": 164854400, + "router_z_loss_mlp": 0.35888672, + "step": 1982, + "time_per_iteration": 2.7226805686950684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077944, + "balance_loss_mlp": 1.04160953, + "epoch": 0.38149288187764524, + "flos": 621287649792.0, + "grad_norm": 0.056689743602043985, + "language_loss": 0.85823661, + "learning_rate": 0.0007095349680442661, + "loss": 0.86901605, + "num_input_tokens_seen": 164932896, + "router_z_loss_mlp": 0.36352539, + "step": 1983, + "time_per_iteration": 2.8454253673553467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075326, + "balance_loss_mlp": 1.03829932, + "epoch": 0.3816852635629088, + "flos": 570414196224.0, + "grad_norm": 0.07971741755252446, + "language_loss": 0.78927159, + "learning_rate": 0.0007092520613150188, + "loss": 0.80002487, + "num_input_tokens_seen": 165002896, + "router_z_loss_mlp": 0.37036133, + "step": 1984, + "time_per_iteration": 2.7279529571533203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081451, + "balance_loss_mlp": 1.04368556, + "epoch": 0.38187764524817236, + "flos": 565313029632.0, + "grad_norm": 0.06238598748372814, + "language_loss": 0.81304747, + "learning_rate": 0.0007089690733469165, + "loss": 0.82386196, + "num_input_tokens_seen": 165074704, + "router_z_loss_mlp": 0.37719727, + "step": 1985, + "time_per_iteration": 2.7343544960021973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077201, + "balance_loss_mlp": 1.04115212, + "epoch": 0.38207002693343595, + "flos": 630932751360.0, + "grad_norm": 0.07832972313002672, + "language_loss": 0.82561398, + "learning_rate": 0.000708686004249825, + "loss": 0.83638602, + "num_input_tokens_seen": 165149136, + "router_z_loss_mlp": 0.36035156, + "step": 1986, + "time_per_iteration": 2.7691054344177246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082228, + "balance_loss_mlp": 1.04572582, + "epoch": 0.3822624086186995, + "flos": 548507980800.0, + "grad_norm": 0.053849318526496194, + "language_loss": 0.9147824, + "learning_rate": 0.0007084028541336413, + "loss": 0.9256047, + "num_input_tokens_seen": 165220864, + "router_z_loss_mlp": 0.36499023, + "step": 1987, + "time_per_iteration": 2.7131056785583496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083994, + "balance_loss_mlp": 1.04753971, + "epoch": 0.38245479030396307, + "flos": 613571880960.0, + "grad_norm": 0.06787860515410171, + "language_loss": 0.86709088, + "learning_rate": 0.0007081196231082942, + "loss": 0.87793082, + "num_input_tokens_seen": 165301568, + "router_z_loss_mlp": 0.36450195, + "step": 1988, + "time_per_iteration": 2.7983548641204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083168, + "balance_loss_mlp": 1.04621339, + "epoch": 0.38264717198922665, + "flos": 667787466240.0, + "grad_norm": 0.05230973877590939, + "language_loss": 0.80107033, + "learning_rate": 0.0007078363112837436, + "loss": 0.81190205, + "num_input_tokens_seen": 165373152, + "router_z_loss_mlp": 0.36938477, + "step": 1989, + "time_per_iteration": 2.8133270740509033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087048, + "balance_loss_mlp": 1.04935408, + "epoch": 0.3828395536744902, + "flos": 454521922560.0, + "grad_norm": 0.077904410907181, + "language_loss": 0.84988701, + "learning_rate": 0.000707552918769981, + "loss": 0.86075753, + "num_input_tokens_seen": 165439136, + "router_z_loss_mlp": 0.37646484, + "step": 1990, + "time_per_iteration": 2.5100817680358887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089992, + "balance_loss_mlp": 1.05213106, + "epoch": 0.3830319353597538, + "flos": 499191330816.0, + "grad_norm": 0.06242573245055077, + "language_loss": 0.83457661, + "learning_rate": 0.000707269445677029, + "loss": 0.84547657, + "num_input_tokens_seen": 165514624, + "router_z_loss_mlp": 0.37817383, + "step": 1991, + "time_per_iteration": 2.7737526893615723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099946, + "balance_loss_mlp": 1.06327772, + "epoch": 0.3832243170450173, + "flos": 743787021312.0, + "grad_norm": 0.05437985066129539, + "language_loss": 0.84858984, + "learning_rate": 0.0007069858921149416, + "loss": 0.85958934, + "num_input_tokens_seen": 165594512, + "router_z_loss_mlp": 0.36694336, + "step": 1992, + "time_per_iteration": 2.9642581939697266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095684, + "balance_loss_mlp": 1.05868101, + "epoch": 0.3834166987302809, + "flos": 577937908224.0, + "grad_norm": 0.10762195872615073, + "language_loss": 0.85869837, + "learning_rate": 0.0007067022581938043, + "loss": 0.86965525, + "num_input_tokens_seen": 165673968, + "router_z_loss_mlp": 0.36987305, + "step": 1993, + "time_per_iteration": 2.805201292037964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101221, + "balance_loss_mlp": 1.06531525, + "epoch": 0.3836090804155444, + "flos": 536194432512.0, + "grad_norm": 0.06280477504596697, + "language_loss": 0.831635, + "learning_rate": 0.0007064185440237334, + "loss": 0.84264719, + "num_input_tokens_seen": 165747664, + "router_z_loss_mlp": 0.359375, + "step": 1994, + "time_per_iteration": 2.7297706604003906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101916, + "balance_loss_mlp": 1.06527066, + "epoch": 0.383801462100808, + "flos": 601587191808.0, + "grad_norm": 0.05513764490979663, + "language_loss": 0.84278905, + "learning_rate": 0.0007061347497148764, + "loss": 0.85380822, + "num_input_tokens_seen": 165824624, + "router_z_loss_mlp": 0.36621094, + "step": 1995, + "time_per_iteration": 2.725632429122925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094031, + "balance_loss_mlp": 1.05876923, + "epoch": 0.38399384378607154, + "flos": 572427896832.0, + "grad_norm": 0.06604776765413087, + "language_loss": 0.86282277, + "learning_rate": 0.0007058508753774122, + "loss": 0.87376308, + "num_input_tokens_seen": 165896304, + "router_z_loss_mlp": 0.35302734, + "step": 1996, + "time_per_iteration": 2.760045051574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092829, + "balance_loss_mlp": 1.05773425, + "epoch": 0.38418622547133513, + "flos": 536513117184.0, + "grad_norm": 0.058737109015633, + "language_loss": 0.86788458, + "learning_rate": 0.0007055669211215505, + "loss": 0.87881291, + "num_input_tokens_seen": 165961312, + "router_z_loss_mlp": 0.35131836, + "step": 1997, + "time_per_iteration": 2.6091408729553223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088772, + "balance_loss_mlp": 1.05238962, + "epoch": 0.3843786071565987, + "flos": 572673798144.0, + "grad_norm": 0.06483433205315106, + "language_loss": 0.77687544, + "learning_rate": 0.0007052828870575322, + "loss": 0.78776312, + "num_input_tokens_seen": 166028064, + "router_z_loss_mlp": 0.36376953, + "step": 1998, + "time_per_iteration": 2.671349048614502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109131, + "balance_loss_mlp": 1.0558331, + "epoch": 0.38457098884186225, + "flos": 728361275904.0, + "grad_norm": 0.05010154832824161, + "language_loss": 0.86955881, + "learning_rate": 0.0007049987732956291, + "loss": 0.88047194, + "num_input_tokens_seen": 166110272, + "router_z_loss_mlp": 0.35498047, + "step": 1999, + "time_per_iteration": 2.9918439388275146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108718, + "balance_loss_mlp": 1.05189395, + "epoch": 0.38476337052712584, + "flos": 583123442688.0, + "grad_norm": 0.047224279388360366, + "language_loss": 0.82623643, + "learning_rate": 0.0007047145799461439, + "loss": 0.83710825, + "num_input_tokens_seen": 166193088, + "router_z_loss_mlp": 0.35327148, + "step": 2000, + "time_per_iteration": 2.84328293800354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092374, + "balance_loss_mlp": 1.05656374, + "epoch": 0.38495575221238937, + "flos": 552787075584.0, + "grad_norm": 0.05254385134155795, + "language_loss": 0.82269979, + "learning_rate": 0.00070443030711941, + "loss": 0.83362353, + "num_input_tokens_seen": 166271776, + "router_z_loss_mlp": 0.3581543, + "step": 2001, + "time_per_iteration": 2.753903865814209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092226, + "balance_loss_mlp": 1.0559628, + "epoch": 0.38514813389765296, + "flos": 654173190144.0, + "grad_norm": 0.05896823149323879, + "language_loss": 0.8241961, + "learning_rate": 0.0007041459549257924, + "loss": 0.83511841, + "num_input_tokens_seen": 166350000, + "router_z_loss_mlp": 0.36254883, + "step": 2002, + "time_per_iteration": 2.85963773727417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086715, + "balance_loss_mlp": 1.05030835, + "epoch": 0.3853405155829165, + "flos": 867715158528.0, + "grad_norm": 0.0671523306708724, + "language_loss": 0.78569824, + "learning_rate": 0.0007038615234756859, + "loss": 0.79656541, + "num_input_tokens_seen": 166434336, + "router_z_loss_mlp": 0.36425781, + "step": 2003, + "time_per_iteration": 3.1452226638793945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080594, + "balance_loss_mlp": 1.04440188, + "epoch": 0.3855328972681801, + "flos": 546164011008.0, + "grad_norm": 0.05736478292188374, + "language_loss": 0.83675313, + "learning_rate": 0.000703577012879517, + "loss": 0.84755898, + "num_input_tokens_seen": 166503952, + "router_z_loss_mlp": 0.36230469, + "step": 2004, + "time_per_iteration": 2.6308705806732178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075931, + "balance_loss_mlp": 1.04040706, + "epoch": 0.3857252789534436, + "flos": 533819939328.0, + "grad_norm": 0.0602394573591363, + "language_loss": 0.8843599, + "learning_rate": 0.0007032924232477423, + "loss": 0.89511919, + "num_input_tokens_seen": 166575168, + "router_z_loss_mlp": 0.35595703, + "step": 2005, + "time_per_iteration": 2.6188220977783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079495, + "balance_loss_mlp": 1.04337406, + "epoch": 0.3859176606387072, + "flos": 491514849792.0, + "grad_norm": 0.055511202055775664, + "language_loss": 0.80448711, + "learning_rate": 0.0007030077546908493, + "loss": 0.81528199, + "num_input_tokens_seen": 166647552, + "router_z_loss_mlp": 0.36132812, + "step": 2006, + "time_per_iteration": 2.6309516429901123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088067, + "balance_loss_mlp": 1.07609844, + "epoch": 0.3861100423239708, + "flos": 1486278955008.0, + "grad_norm": 0.032522163132150485, + "language_loss": 0.83064663, + "learning_rate": 0.0007027230073193561, + "loss": 0.84152722, + "num_input_tokens_seen": 166875088, + "router_z_loss_mlp": 0.11962891, + "step": 2007, + "time_per_iteration": 4.736604452133179 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083286, + "balance_loss_mlp": 1.04816651, + "epoch": 0.3863024240092343, + "flos": 473493441024.0, + "grad_norm": 0.05514866045126494, + "language_loss": 0.79152983, + "learning_rate": 0.0007024381812438117, + "loss": 0.80236268, + "num_input_tokens_seen": 166939344, + "router_z_loss_mlp": 0.3515625, + "step": 2008, + "time_per_iteration": 2.5392396450042725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108868, + "balance_loss_mlp": 1.05306053, + "epoch": 0.3864948056944979, + "flos": 716258723328.0, + "grad_norm": 0.059806412844581394, + "language_loss": 0.83199877, + "learning_rate": 0.0007021532765747951, + "loss": 0.84288561, + "num_input_tokens_seen": 167014992, + "router_z_loss_mlp": 0.35668945, + "step": 2009, + "time_per_iteration": 2.9926528930664062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087876, + "balance_loss_mlp": 1.0511353, + "epoch": 0.38668718737976143, + "flos": 727302067200.0, + "grad_norm": 0.05631620148302912, + "language_loss": 0.7933259, + "learning_rate": 0.0007018682934229162, + "loss": 0.8042047, + "num_input_tokens_seen": 167092096, + "router_z_loss_mlp": 0.36743164, + "step": 2010, + "time_per_iteration": 2.924781322479248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087301, + "balance_loss_mlp": 1.05103779, + "epoch": 0.386879569065025, + "flos": 525218079744.0, + "grad_norm": 0.05794664731816873, + "language_loss": 0.82387936, + "learning_rate": 0.0007015832318988152, + "loss": 0.83475244, + "num_input_tokens_seen": 167162144, + "router_z_loss_mlp": 0.36328125, + "step": 2011, + "time_per_iteration": 2.668565511703491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032799, + "balance_loss_mlp": 1.02173615, + "epoch": 0.38707195075028855, + "flos": 1527036005376.0, + "grad_norm": 0.019732384975687786, + "language_loss": 0.73890078, + "learning_rate": 0.000701298092113163, + "loss": 0.74922872, + "num_input_tokens_seen": 167391536, + "router_z_loss_mlp": 0.11083984, + "step": 2012, + "time_per_iteration": 4.948081970214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085105, + "balance_loss_mlp": 1.04886508, + "epoch": 0.38726433243555214, + "flos": 557045821440.0, + "grad_norm": 0.049164425244227684, + "language_loss": 0.84333575, + "learning_rate": 0.0007010128741766604, + "loss": 0.85418677, + "num_input_tokens_seen": 167466000, + "router_z_loss_mlp": 0.36230469, + "step": 2013, + "time_per_iteration": 2.7263684272766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073123, + "balance_loss_mlp": 1.03695476, + "epoch": 0.38745671412081567, + "flos": 553431647232.0, + "grad_norm": 0.06787190277242791, + "language_loss": 0.84107876, + "learning_rate": 0.0007007275782000391, + "loss": 0.85181004, + "num_input_tokens_seen": 167536144, + "router_z_loss_mlp": 0.36181641, + "step": 2014, + "time_per_iteration": 2.6458756923675537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082824, + "balance_loss_mlp": 1.04679942, + "epoch": 0.38764909580607926, + "flos": 458175384576.0, + "grad_norm": 0.05583745089019265, + "language_loss": 0.85148585, + "learning_rate": 0.0007004422042940605, + "loss": 0.86231411, + "num_input_tokens_seen": 167600064, + "router_z_loss_mlp": 0.3605957, + "step": 2015, + "time_per_iteration": 2.5374679565429688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074881, + "balance_loss_mlp": 1.03847444, + "epoch": 0.38784147749134285, + "flos": 521973462528.0, + "grad_norm": 0.056017537147394686, + "language_loss": 0.89528251, + "learning_rate": 0.0007001567525695169, + "loss": 0.90603131, + "num_input_tokens_seen": 167666576, + "router_z_loss_mlp": 0.36425781, + "step": 2016, + "time_per_iteration": 2.5863571166992188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081449, + "balance_loss_mlp": 1.04504275, + "epoch": 0.3880338591766064, + "flos": 665696600064.0, + "grad_norm": 0.05583938490392423, + "language_loss": 0.839926, + "learning_rate": 0.0006998712231372303, + "loss": 0.85074055, + "num_input_tokens_seen": 167753296, + "router_z_loss_mlp": 0.36425781, + "step": 2017, + "time_per_iteration": 2.998652219772339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076964, + "balance_loss_mlp": 1.04103458, + "epoch": 0.38822624086186996, + "flos": 593660427264.0, + "grad_norm": 0.044278068469259586, + "language_loss": 0.86088806, + "learning_rate": 0.0006995856161080532, + "loss": 0.87165773, + "num_input_tokens_seen": 167834080, + "router_z_loss_mlp": 0.35961914, + "step": 2018, + "time_per_iteration": 2.870619297027588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077822, + "balance_loss_mlp": 1.03972268, + "epoch": 0.3884186225471335, + "flos": 612256596480.0, + "grad_norm": 0.10653426783792587, + "language_loss": 0.8221643, + "learning_rate": 0.0006992999315928679, + "loss": 0.83294249, + "num_input_tokens_seen": 167912368, + "router_z_loss_mlp": 0.38061523, + "step": 2019, + "time_per_iteration": 2.7867438793182373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074258, + "balance_loss_mlp": 1.03782773, + "epoch": 0.3886110042323971, + "flos": 606737820672.0, + "grad_norm": 0.05830260104080337, + "language_loss": 0.85476196, + "learning_rate": 0.0006990141697025871, + "loss": 0.8655045, + "num_input_tokens_seen": 167991968, + "router_z_loss_mlp": 0.36401367, + "step": 2020, + "time_per_iteration": 2.7722346782684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008633, + "balance_loss_mlp": 0.99642587, + "epoch": 0.3888033859176606, + "flos": 1527289108992.0, + "grad_norm": 0.011259985776032525, + "language_loss": 0.76359642, + "learning_rate": 0.0006987283305481533, + "loss": 0.77368271, + "num_input_tokens_seen": 168212128, + "router_z_loss_mlp": 0.12207031, + "step": 2021, + "time_per_iteration": 4.71975302696228 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069739, + "balance_loss_mlp": 1.03368998, + "epoch": 0.3889957676029242, + "flos": 692145340416.0, + "grad_norm": 0.08577921290538253, + "language_loss": 0.82040119, + "learning_rate": 0.0006984424142405392, + "loss": 0.83109856, + "num_input_tokens_seen": 168287440, + "router_z_loss_mlp": 0.36035156, + "step": 2022, + "time_per_iteration": 2.8003616333007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070636, + "balance_loss_mlp": 1.03413415, + "epoch": 0.3891881492881878, + "flos": 514937170944.0, + "grad_norm": 0.06357614890279897, + "language_loss": 0.81860286, + "learning_rate": 0.0006981564208907474, + "loss": 0.82930923, + "num_input_tokens_seen": 168354704, + "router_z_loss_mlp": 0.36499023, + "step": 2023, + "time_per_iteration": 2.581556558609009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074791, + "balance_loss_mlp": 1.03933799, + "epoch": 0.3893805309734513, + "flos": 628770663936.0, + "grad_norm": 0.04985256691663517, + "language_loss": 0.90055227, + "learning_rate": 0.0006978703506098102, + "loss": 0.91130018, + "num_input_tokens_seen": 168424272, + "router_z_loss_mlp": 0.35498047, + "step": 2024, + "time_per_iteration": 2.7220654487609863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080786, + "balance_loss_mlp": 1.04497564, + "epoch": 0.3895729126587149, + "flos": 543894234624.0, + "grad_norm": 0.06500254639996711, + "language_loss": 0.88078821, + "learning_rate": 0.00069758420350879, + "loss": 0.89159608, + "num_input_tokens_seen": 168488912, + "router_z_loss_mlp": 0.35839844, + "step": 2025, + "time_per_iteration": 2.6044023036956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086131, + "balance_loss_mlp": 1.05012965, + "epoch": 0.38976529434397844, + "flos": 617987778048.0, + "grad_norm": 0.06153368317516065, + "language_loss": 0.86008936, + "learning_rate": 0.000697297979698779, + "loss": 0.87095064, + "num_input_tokens_seen": 168563248, + "router_z_loss_mlp": 0.36010742, + "step": 2026, + "time_per_iteration": 2.7051053047180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091671, + "balance_loss_mlp": 1.05628932, + "epoch": 0.38995767602924203, + "flos": 834518287872.0, + "grad_norm": 0.05732441037152358, + "language_loss": 0.83766049, + "learning_rate": 0.0006970116792908992, + "loss": 0.84857726, + "num_input_tokens_seen": 168648272, + "router_z_loss_mlp": 0.35400391, + "step": 2027, + "time_per_iteration": 3.086228847503662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096114, + "balance_loss_mlp": 1.06032705, + "epoch": 0.39015005771450556, + "flos": 541343651328.0, + "grad_norm": 0.060477391230123065, + "language_loss": 0.8159399, + "learning_rate": 0.000696725302396302, + "loss": 0.82690096, + "num_input_tokens_seen": 168721760, + "router_z_loss_mlp": 0.3581543, + "step": 2028, + "time_per_iteration": 2.6521902084350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093769, + "balance_loss_mlp": 1.05867422, + "epoch": 0.39034243939976915, + "flos": 1007102536704.0, + "grad_norm": 0.04866281229524116, + "language_loss": 0.85781944, + "learning_rate": 0.0006964388491261692, + "loss": 0.86875713, + "num_input_tokens_seen": 168803664, + "router_z_loss_mlp": 0.35131836, + "step": 2029, + "time_per_iteration": 3.2338647842407227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099777, + "balance_loss_mlp": 1.06401408, + "epoch": 0.3905348210850327, + "flos": 678723121152.0, + "grad_norm": 0.05278281932643199, + "language_loss": 0.87335277, + "learning_rate": 0.0006961523195917114, + "loss": 0.88435054, + "num_input_tokens_seen": 168879184, + "router_z_loss_mlp": 0.35791016, + "step": 2030, + "time_per_iteration": 2.8414504528045654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098277, + "balance_loss_mlp": 1.06256151, + "epoch": 0.39072720277029627, + "flos": 548606905344.0, + "grad_norm": 0.05643291477722073, + "language_loss": 0.77850938, + "learning_rate": 0.0006958657139041696, + "loss": 0.78949213, + "num_input_tokens_seen": 168957808, + "router_z_loss_mlp": 0.35742188, + "step": 2031, + "time_per_iteration": 2.7278060913085938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091715, + "balance_loss_mlp": 1.07807708, + "epoch": 0.39091958445555985, + "flos": 1546912401408.0, + "grad_norm": 0.03426807627657635, + "language_loss": 0.76712966, + "learning_rate": 0.0006955790321748136, + "loss": 0.77804685, + "num_input_tokens_seen": 169194416, + "router_z_loss_mlp": 0.13671875, + "step": 2032, + "time_per_iteration": 4.927444696426392 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099209, + "balance_loss_mlp": 1.06399429, + "epoch": 0.3911119661408234, + "flos": 503741058048.0, + "grad_norm": 0.050822615130563034, + "language_loss": 0.78371578, + "learning_rate": 0.0006952922745149434, + "loss": 0.79470789, + "num_input_tokens_seen": 169263552, + "router_z_loss_mlp": 0.35253906, + "step": 2033, + "time_per_iteration": 2.6730306148529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095244, + "balance_loss_mlp": 1.05933857, + "epoch": 0.391304347826087, + "flos": 556967245824.0, + "grad_norm": 0.05118619150019999, + "language_loss": 0.87770367, + "learning_rate": 0.000695005441035888, + "loss": 0.88865614, + "num_input_tokens_seen": 169333696, + "router_z_loss_mlp": 0.359375, + "step": 2034, + "time_per_iteration": 2.6585283279418945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038632, + "balance_loss_mlp": 1.02461255, + "epoch": 0.3914967295113505, + "flos": 1499309858304.0, + "grad_norm": 0.00946210886057752, + "language_loss": 0.73723435, + "learning_rate": 0.0006947185318490064, + "loss": 0.74762058, + "num_input_tokens_seen": 169556416, + "router_z_loss_mlp": 0.140625, + "step": 2035, + "time_per_iteration": 4.8464648723602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087203, + "balance_loss_mlp": 1.05182171, + "epoch": 0.3916891111966141, + "flos": 706715518464.0, + "grad_norm": 0.07007748344060821, + "language_loss": 0.81416976, + "learning_rate": 0.0006944315470656863, + "loss": 0.82504177, + "num_input_tokens_seen": 169643312, + "router_z_loss_mlp": 0.35424805, + "step": 2036, + "time_per_iteration": 2.9289584159851074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010868, + "balance_loss_mlp": 1.05051255, + "epoch": 0.3918814928818776, + "flos": 556085537280.0, + "grad_norm": 0.05570869743183256, + "language_loss": 0.91007531, + "learning_rate": 0.000694144486797345, + "loss": 0.92094326, + "num_input_tokens_seen": 169712560, + "router_z_loss_mlp": 0.36303711, + "step": 2037, + "time_per_iteration": 0.0013861656188964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043662, + "balance_loss_mlp": 1.02954793, + "epoch": 0.3920738745671412, + "flos": 1537845032448.0, + "grad_norm": 0.018656318140729232, + "language_loss": 0.79520434, + "learning_rate": 0.0006938573511554296, + "loss": 0.80564094, + "num_input_tokens_seen": 169914912, + "router_z_loss_mlp": 0.14160156, + "step": 2038, + "time_per_iteration": 4.6249003410339355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091153, + "balance_loss_mlp": 1.05586696, + "epoch": 0.39226625625240474, + "flos": 498594811392.0, + "grad_norm": 0.06764177247916761, + "language_loss": 0.89479941, + "learning_rate": 0.0006935701402514156, + "loss": 0.90571094, + "num_input_tokens_seen": 169978848, + "router_z_loss_mlp": 0.35327148, + "step": 2039, + "time_per_iteration": 2.535269260406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033365, + "balance_loss_mlp": 1.01963174, + "epoch": 0.39245863793766833, + "flos": 1346465203200.0, + "grad_norm": 0.017448285120256823, + "language_loss": 0.73034894, + "learning_rate": 0.0006932828541968083, + "loss": 0.7406826, + "num_input_tokens_seen": 170211488, + "router_z_loss_mlp": 0.13769531, + "step": 2040, + "time_per_iteration": 4.913180589675903 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086444, + "balance_loss_mlp": 1.05072939, + "epoch": 0.3926510196229319, + "flos": 1345614474240.0, + "grad_norm": 0.055350347752650936, + "language_loss": 0.8453002, + "learning_rate": 0.0006929954931031422, + "loss": 0.85616457, + "num_input_tokens_seen": 170298528, + "router_z_loss_mlp": 0.35742188, + "step": 2041, + "time_per_iteration": 3.6796491146087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081967, + "balance_loss_mlp": 1.04722965, + "epoch": 0.39284340130819545, + "flos": 499333925376.0, + "grad_norm": 0.05434437059814268, + "language_loss": 0.88856936, + "learning_rate": 0.0006927080570819805, + "loss": 0.89938903, + "num_input_tokens_seen": 170365680, + "router_z_loss_mlp": 0.34790039, + "step": 2042, + "time_per_iteration": 2.634052038192749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087578, + "balance_loss_mlp": 1.05217266, + "epoch": 0.39303578299345904, + "flos": 520077625344.0, + "grad_norm": 0.06468620716873735, + "language_loss": 0.80649555, + "learning_rate": 0.0006924205462449161, + "loss": 0.81737131, + "num_input_tokens_seen": 170432224, + "router_z_loss_mlp": 0.35473633, + "step": 2043, + "time_per_iteration": 2.6021780967712402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078315, + "balance_loss_mlp": 1.04288566, + "epoch": 0.39322816467872257, + "flos": 907529301504.0, + "grad_norm": 0.05516365318311268, + "language_loss": 0.82013571, + "learning_rate": 0.0006921329607035702, + "loss": 0.83091891, + "num_input_tokens_seen": 170517920, + "router_z_loss_mlp": 0.35473633, + "step": 2044, + "time_per_iteration": 3.2195992469787598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078665, + "balance_loss_mlp": 1.04473805, + "epoch": 0.39342054636398616, + "flos": 517330603008.0, + "grad_norm": 0.046703626280748714, + "language_loss": 0.88374329, + "learning_rate": 0.0006918453005695938, + "loss": 0.89452994, + "num_input_tokens_seen": 170589072, + "router_z_loss_mlp": 0.33959961, + "step": 2045, + "time_per_iteration": 2.6319282054901123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080714, + "balance_loss_mlp": 1.04435515, + "epoch": 0.3936129280492497, + "flos": 547646621184.0, + "grad_norm": 0.04434497339872072, + "language_loss": 0.8422206, + "learning_rate": 0.0006915575659546662, + "loss": 0.8530277, + "num_input_tokens_seen": 170657856, + "router_z_loss_mlp": 0.36401367, + "step": 2046, + "time_per_iteration": 2.648895263671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081267, + "balance_loss_mlp": 1.04519427, + "epoch": 0.3938053097345133, + "flos": 525858269184.0, + "grad_norm": 0.0524234289418272, + "language_loss": 0.80648899, + "learning_rate": 0.0006912697569704959, + "loss": 0.81730163, + "num_input_tokens_seen": 170723696, + "router_z_loss_mlp": 0.36083984, + "step": 2047, + "time_per_iteration": 2.6330111026763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085798, + "balance_loss_mlp": 1.04934382, + "epoch": 0.39399769141977686, + "flos": 471390990336.0, + "grad_norm": 0.0542278175669412, + "language_loss": 0.86721706, + "learning_rate": 0.0006909818737288205, + "loss": 0.87807506, + "num_input_tokens_seen": 170789536, + "router_z_loss_mlp": 0.36450195, + "step": 2048, + "time_per_iteration": 2.537581205368042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090734, + "balance_loss_mlp": 1.05468488, + "epoch": 0.3941900731050404, + "flos": 501490220544.0, + "grad_norm": 0.056383256559611315, + "language_loss": 0.80660325, + "learning_rate": 0.000690693916341406, + "loss": 0.8175106, + "num_input_tokens_seen": 170859232, + "router_z_loss_mlp": 0.3605957, + "step": 2049, + "time_per_iteration": 2.622183084487915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096224, + "balance_loss_mlp": 1.05898309, + "epoch": 0.394382454790304, + "flos": 580577241600.0, + "grad_norm": 0.11468284139168772, + "language_loss": 0.82465422, + "learning_rate": 0.0006904058849200475, + "loss": 0.83561641, + "num_input_tokens_seen": 170931568, + "router_z_loss_mlp": 0.37255859, + "step": 2050, + "time_per_iteration": 2.7216436862945557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087551, + "balance_loss_mlp": 1.05088186, + "epoch": 0.3945748364755675, + "flos": 513563659776.0, + "grad_norm": 0.05187056217607278, + "language_loss": 0.84988606, + "learning_rate": 0.0006901177795765683, + "loss": 0.86076152, + "num_input_tokens_seen": 170999856, + "router_z_loss_mlp": 0.36694336, + "step": 2051, + "time_per_iteration": 2.5725293159484863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079371, + "balance_loss_mlp": 1.04291642, + "epoch": 0.3947672181608311, + "flos": 593683748352.0, + "grad_norm": 0.0518129521666432, + "language_loss": 0.8131091, + "learning_rate": 0.0006898296004228213, + "loss": 0.82390279, + "num_input_tokens_seen": 171072320, + "router_z_loss_mlp": 0.36450195, + "step": 2052, + "time_per_iteration": 2.6969447135925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050959, + "balance_loss_mlp": 1.0379895, + "epoch": 0.39495959984609463, + "flos": 1546829443584.0, + "grad_norm": 0.029989620736742544, + "language_loss": 0.7812674, + "learning_rate": 0.0006895413475706873, + "loss": 0.79177701, + "num_input_tokens_seen": 171304128, + "router_z_loss_mlp": 0.12988281, + "step": 2053, + "time_per_iteration": 4.8486199378967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078173, + "balance_loss_mlp": 1.04233825, + "epoch": 0.3951519815313582, + "flos": 496271190528.0, + "grad_norm": 0.06693197740080077, + "language_loss": 0.80026031, + "learning_rate": 0.0006892530211320763, + "loss": 0.81104207, + "num_input_tokens_seen": 171377392, + "router_z_loss_mlp": 0.35864258, + "step": 2054, + "time_per_iteration": 2.6731534004211426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082597, + "balance_loss_mlp": 1.04657161, + "epoch": 0.39534436321662175, + "flos": 530934704640.0, + "grad_norm": 0.06400198340094926, + "language_loss": 0.8367995, + "learning_rate": 0.000688964621218926, + "loss": 0.84762549, + "num_input_tokens_seen": 171447424, + "router_z_loss_mlp": 0.36010742, + "step": 2055, + "time_per_iteration": 2.623870611190796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107818, + "balance_loss_mlp": 1.0422982, + "epoch": 0.39553674490188534, + "flos": 702224017920.0, + "grad_norm": 0.05929287076568038, + "language_loss": 0.80154717, + "learning_rate": 0.0006886761479432037, + "loss": 0.81232893, + "num_input_tokens_seen": 171519920, + "router_z_loss_mlp": 0.35864258, + "step": 2056, + "time_per_iteration": 2.810593366622925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088441, + "balance_loss_mlp": 1.05263042, + "epoch": 0.3957291265871489, + "flos": 409552768512.0, + "grad_norm": 0.05784227470554994, + "language_loss": 0.84645867, + "learning_rate": 0.0006883876014169045, + "loss": 0.85734308, + "num_input_tokens_seen": 171583856, + "router_z_loss_mlp": 0.3581543, + "step": 2057, + "time_per_iteration": 2.464358329772949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088007, + "balance_loss_mlp": 1.05121863, + "epoch": 0.39592150827241246, + "flos": 618204566016.0, + "grad_norm": 0.05454135250908964, + "language_loss": 0.90161431, + "learning_rate": 0.000688098981752052, + "loss": 0.91249436, + "num_input_tokens_seen": 171656064, + "router_z_loss_mlp": 0.36791992, + "step": 2058, + "time_per_iteration": 2.742589235305786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094817, + "balance_loss_mlp": 1.05819607, + "epoch": 0.39611388995767605, + "flos": 820986969600.0, + "grad_norm": 0.05656267147709111, + "language_loss": 0.80105305, + "learning_rate": 0.0006878102890606982, + "loss": 0.81200117, + "num_input_tokens_seen": 171738800, + "router_z_loss_mlp": 0.36621094, + "step": 2059, + "time_per_iteration": 3.0725343227386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096787, + "balance_loss_mlp": 1.06018949, + "epoch": 0.3963062716429396, + "flos": 491977539072.0, + "grad_norm": 0.0710153527902746, + "language_loss": 0.81321216, + "learning_rate": 0.0006875215234549239, + "loss": 0.82418001, + "num_input_tokens_seen": 171803664, + "router_z_loss_mlp": 0.3659668, + "step": 2060, + "time_per_iteration": 2.542090654373169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097398, + "balance_loss_mlp": 1.06015694, + "epoch": 0.39649865332820317, + "flos": 584466430464.0, + "grad_norm": 0.08966956269211096, + "language_loss": 0.8554219, + "learning_rate": 0.0006872326850468376, + "loss": 0.86639589, + "num_input_tokens_seen": 171871968, + "router_z_loss_mlp": 0.37231445, + "step": 2061, + "time_per_iteration": 2.7194440364837646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010999, + "balance_loss_mlp": 1.06139588, + "epoch": 0.3966910350134667, + "flos": 458328153600.0, + "grad_norm": 0.05276871533818733, + "language_loss": 0.78609985, + "learning_rate": 0.0006869437739485762, + "loss": 0.79709888, + "num_input_tokens_seen": 171942368, + "router_z_loss_mlp": 0.38476562, + "step": 2062, + "time_per_iteration": 2.6032114028930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089079, + "balance_loss_mlp": 1.05281568, + "epoch": 0.3968834166987303, + "flos": 508388299776.0, + "grad_norm": 0.05735909750828215, + "language_loss": 0.93035084, + "learning_rate": 0.0006866547902723053, + "loss": 0.94124162, + "num_input_tokens_seen": 172012336, + "router_z_loss_mlp": 0.36279297, + "step": 2063, + "time_per_iteration": 2.666294813156128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097295, + "balance_loss_mlp": 1.06110287, + "epoch": 0.3970757983839938, + "flos": 572349321216.0, + "grad_norm": 0.05819495660266105, + "language_loss": 0.79961425, + "learning_rate": 0.000686365734130218, + "loss": 0.81058717, + "num_input_tokens_seen": 172084640, + "router_z_loss_mlp": 0.36206055, + "step": 2064, + "time_per_iteration": 2.667443037033081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093878, + "balance_loss_mlp": 1.05639839, + "epoch": 0.3972681800692574, + "flos": 481391092224.0, + "grad_norm": 0.051061390118103664, + "language_loss": 0.84029245, + "learning_rate": 0.000686076605634536, + "loss": 0.85123128, + "num_input_tokens_seen": 172152992, + "router_z_loss_mlp": 0.37475586, + "step": 2065, + "time_per_iteration": 2.605252981185913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091284, + "balance_loss_mlp": 1.05406737, + "epoch": 0.397460561754521, + "flos": 487683887616.0, + "grad_norm": 0.060621892107923396, + "language_loss": 0.84327424, + "learning_rate": 0.0006857874048975088, + "loss": 0.85418713, + "num_input_tokens_seen": 172219312, + "router_z_loss_mlp": 0.37207031, + "step": 2066, + "time_per_iteration": 2.5779786109924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090658, + "balance_loss_mlp": 1.05329823, + "epoch": 0.3976529434397845, + "flos": 421768802304.0, + "grad_norm": 0.05929679602335689, + "language_loss": 0.86975348, + "learning_rate": 0.0006854981320314142, + "loss": 0.88066006, + "num_input_tokens_seen": 172282112, + "router_z_loss_mlp": 0.37353516, + "step": 2067, + "time_per_iteration": 2.4723403453826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082754, + "balance_loss_mlp": 1.04591799, + "epoch": 0.3978453251250481, + "flos": 545331764736.0, + "grad_norm": 0.058605050617464606, + "language_loss": 0.86758339, + "learning_rate": 0.0006852087871485579, + "loss": 0.87841094, + "num_input_tokens_seen": 172347872, + "router_z_loss_mlp": 0.36816406, + "step": 2068, + "time_per_iteration": 2.6007602214813232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082239, + "balance_loss_mlp": 1.04602289, + "epoch": 0.39803770681031164, + "flos": 650548841472.0, + "grad_norm": 0.06821675645960798, + "language_loss": 0.81689966, + "learning_rate": 0.0006849193703612735, + "loss": 0.82772201, + "num_input_tokens_seen": 172418560, + "router_z_loss_mlp": 0.36206055, + "step": 2069, + "time_per_iteration": 2.7661337852478027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083569, + "balance_loss_mlp": 1.04661417, + "epoch": 0.39823008849557523, + "flos": 739734888960.0, + "grad_norm": 0.059947122372600754, + "language_loss": 0.77649361, + "learning_rate": 0.0006846298817819225, + "loss": 0.78732932, + "num_input_tokens_seen": 172497984, + "router_z_loss_mlp": 0.36987305, + "step": 2070, + "time_per_iteration": 2.9364843368530273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091151, + "balance_loss_mlp": 1.05436325, + "epoch": 0.39842247018083876, + "flos": 384825337344.0, + "grad_norm": 0.0736862776590319, + "language_loss": 0.80732799, + "learning_rate": 0.0006843403215228945, + "loss": 0.81823957, + "num_input_tokens_seen": 172560112, + "router_z_loss_mlp": 0.36767578, + "step": 2071, + "time_per_iteration": 2.4597537517547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078334, + "balance_loss_mlp": 1.04133189, + "epoch": 0.39861485186610235, + "flos": 533431443456.0, + "grad_norm": 0.052578385162892496, + "language_loss": 0.80366135, + "learning_rate": 0.0006840506896966065, + "loss": 0.81444472, + "num_input_tokens_seen": 172636192, + "router_z_loss_mlp": 0.36962891, + "step": 2072, + "time_per_iteration": 2.6826841831207275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081243, + "balance_loss_mlp": 1.04397774, + "epoch": 0.39880723355136594, + "flos": 642834482688.0, + "grad_norm": 0.055481383737447196, + "language_loss": 0.82090193, + "learning_rate": 0.0006837609864155038, + "loss": 0.83171439, + "num_input_tokens_seen": 172715264, + "router_z_loss_mlp": 0.37255859, + "step": 2073, + "time_per_iteration": 2.8541154861450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075844, + "balance_loss_mlp": 1.0408442, + "epoch": 0.39899961523662947, + "flos": 515587534848.0, + "grad_norm": 0.07686004257588779, + "language_loss": 0.83464944, + "learning_rate": 0.0006834712117920592, + "loss": 0.84540784, + "num_input_tokens_seen": 172783456, + "router_z_loss_mlp": 0.3503418, + "step": 2074, + "time_per_iteration": 2.6023800373077393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107731, + "balance_loss_mlp": 1.04025948, + "epoch": 0.39919199692189306, + "flos": 464148085248.0, + "grad_norm": 0.0625246810856132, + "language_loss": 0.85794407, + "learning_rate": 0.0006831813659387729, + "loss": 0.86871719, + "num_input_tokens_seen": 172848928, + "router_z_loss_mlp": 0.37036133, + "step": 2075, + "time_per_iteration": 2.5916242599487305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071695, + "balance_loss_mlp": 1.0353837, + "epoch": 0.3993843786071566, + "flos": 531382837248.0, + "grad_norm": 0.05588277312371317, + "language_loss": 0.84130096, + "learning_rate": 0.0006828914489681733, + "loss": 0.852018, + "num_input_tokens_seen": 172921152, + "router_z_loss_mlp": 0.36303711, + "step": 2076, + "time_per_iteration": 2.7014079093933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078543, + "balance_loss_mlp": 1.04168355, + "epoch": 0.3995767602924202, + "flos": 503701770240.0, + "grad_norm": 0.05616101270921505, + "language_loss": 0.85284638, + "learning_rate": 0.0006826014609928162, + "loss": 0.86363184, + "num_input_tokens_seen": 172998864, + "router_z_loss_mlp": 0.36816406, + "step": 2077, + "time_per_iteration": 2.6714975833892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089101, + "balance_loss_mlp": 1.07622623, + "epoch": 0.3997691419776837, + "flos": 1453780500480.0, + "grad_norm": 0.03492718818999835, + "language_loss": 0.83199388, + "learning_rate": 0.0006823114021252846, + "loss": 0.8428849, + "num_input_tokens_seen": 173219216, + "router_z_loss_mlp": 0.12890625, + "step": 2078, + "time_per_iteration": 4.8198041915893555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075675, + "balance_loss_mlp": 1.03943551, + "epoch": 0.3999615236629473, + "flos": 530418170880.0, + "grad_norm": 0.06060184252075253, + "language_loss": 0.79984158, + "learning_rate": 0.0006820212724781896, + "loss": 0.81059831, + "num_input_tokens_seen": 173292000, + "router_z_loss_mlp": 0.36279297, + "step": 2079, + "time_per_iteration": 2.725576400756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077383, + "balance_loss_mlp": 1.04209709, + "epoch": 0.4001539053482108, + "flos": 694823961600.0, + "grad_norm": 0.12722864956638674, + "language_loss": 0.83843565, + "learning_rate": 0.0006817310721641694, + "loss": 0.84920955, + "num_input_tokens_seen": 173365568, + "router_z_loss_mlp": 0.35302734, + "step": 2080, + "time_per_iteration": 2.808981418609619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083492, + "balance_loss_mlp": 1.04732358, + "epoch": 0.4003462870334744, + "flos": 520102356480.0, + "grad_norm": 0.09864886938158902, + "language_loss": 0.84025592, + "learning_rate": 0.00068144080129589, + "loss": 0.85109079, + "num_input_tokens_seen": 173430144, + "router_z_loss_mlp": 0.36181641, + "step": 2081, + "time_per_iteration": 2.61795973777771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090466, + "balance_loss_mlp": 1.05403543, + "epoch": 0.400538668718738, + "flos": 492272902656.0, + "grad_norm": 0.05814134634807872, + "language_loss": 0.83103502, + "learning_rate": 0.0006811504599860441, + "loss": 0.84193969, + "num_input_tokens_seen": 173494464, + "router_z_loss_mlp": 0.36450195, + "step": 2082, + "time_per_iteration": 2.5586163997650146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109661, + "balance_loss_mlp": 1.06161022, + "epoch": 0.40073105040400153, + "flos": 490083111936.0, + "grad_norm": 0.05292967428813452, + "language_loss": 0.85547149, + "learning_rate": 0.0006808600483473526, + "loss": 0.86643761, + "num_input_tokens_seen": 173577168, + "router_z_loss_mlp": 0.35058594, + "step": 2083, + "time_per_iteration": 2.8549985885620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110158, + "balance_loss_mlp": 1.06584144, + "epoch": 0.4009234320892651, + "flos": 562088761344.0, + "grad_norm": 0.051341860757237005, + "language_loss": 0.85926497, + "learning_rate": 0.0006805695664925629, + "loss": 0.87028074, + "num_input_tokens_seen": 173655632, + "router_z_loss_mlp": 0.35791016, + "step": 2084, + "time_per_iteration": 2.7807514667510986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111507, + "balance_loss_mlp": 1.07619727, + "epoch": 0.40111581377452865, + "flos": 425786029056.0, + "grad_norm": 0.07139972521672847, + "language_loss": 0.84098327, + "learning_rate": 0.0006802790145344506, + "loss": 0.85209835, + "num_input_tokens_seen": 173719040, + "router_z_loss_mlp": 0.35327148, + "step": 2085, + "time_per_iteration": 2.4653491973876953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106176, + "balance_loss_mlp": 1.07024658, + "epoch": 0.40130819545979224, + "flos": 612148907520.0, + "grad_norm": 0.09859033966702202, + "language_loss": 0.87080699, + "learning_rate": 0.0006799883925858176, + "loss": 0.88186872, + "num_input_tokens_seen": 173796704, + "router_z_loss_mlp": 0.35961914, + "step": 2086, + "time_per_iteration": 2.8432652950286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101686, + "balance_loss_mlp": 1.06580365, + "epoch": 0.40150057714505577, + "flos": 523179648000.0, + "grad_norm": 0.06735788816740666, + "language_loss": 0.85303611, + "learning_rate": 0.0006796977007594933, + "loss": 0.86405295, + "num_input_tokens_seen": 173862352, + "router_z_loss_mlp": 0.35913086, + "step": 2087, + "time_per_iteration": 2.597883701324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109903, + "balance_loss_mlp": 1.06240904, + "epoch": 0.40169295883031936, + "flos": 561143033856.0, + "grad_norm": 0.0524220318715257, + "language_loss": 0.86402881, + "learning_rate": 0.0006794069391683345, + "loss": 0.87501919, + "num_input_tokens_seen": 173935408, + "router_z_loss_mlp": 0.36621094, + "step": 2088, + "time_per_iteration": 2.7313365936279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101072, + "balance_loss_mlp": 1.06414104, + "epoch": 0.4018853405155829, + "flos": 518743401984.0, + "grad_norm": 0.056795041649419745, + "language_loss": 0.80919069, + "learning_rate": 0.0006791161079252248, + "loss": 0.8202014, + "num_input_tokens_seen": 174007152, + "router_z_loss_mlp": 0.36914062, + "step": 2089, + "time_per_iteration": 2.57450532913208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109207, + "balance_loss_mlp": 1.05652201, + "epoch": 0.4020777222008465, + "flos": 525957193728.0, + "grad_norm": 0.05166370887572794, + "language_loss": 0.82473212, + "learning_rate": 0.0006788252071430747, + "loss": 0.83565277, + "num_input_tokens_seen": 174074976, + "router_z_loss_mlp": 0.35546875, + "step": 2090, + "time_per_iteration": 2.6603012084960938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097817, + "balance_loss_mlp": 1.06100535, + "epoch": 0.40227010388611006, + "flos": 525494504448.0, + "grad_norm": 0.056931817338158205, + "language_loss": 0.86595076, + "learning_rate": 0.0006785342369348222, + "loss": 0.87692893, + "num_input_tokens_seen": 174149392, + "router_z_loss_mlp": 0.3684082, + "step": 2091, + "time_per_iteration": 2.807980537414551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091263, + "balance_loss_mlp": 1.05566692, + "epoch": 0.4024624855713736, + "flos": 432074442240.0, + "grad_norm": 0.0736357586886409, + "language_loss": 0.79799104, + "learning_rate": 0.0006782431974134316, + "loss": 0.80890369, + "num_input_tokens_seen": 174214656, + "router_z_loss_mlp": 0.35668945, + "step": 2092, + "time_per_iteration": 2.5331132411956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097919, + "balance_loss_mlp": 1.06044006, + "epoch": 0.4026548672566372, + "flos": 766304312832.0, + "grad_norm": 0.05288336614740697, + "language_loss": 0.89230573, + "learning_rate": 0.0006779520886918949, + "loss": 0.90328491, + "num_input_tokens_seen": 174296064, + "router_z_loss_mlp": 0.375, + "step": 2093, + "time_per_iteration": 3.014895439147949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093032, + "balance_loss_mlp": 1.0560298, + "epoch": 0.4028472489419007, + "flos": 642636633600.0, + "grad_norm": 0.05102527643704043, + "language_loss": 0.8125242, + "learning_rate": 0.0006776609108832301, + "loss": 0.8234545, + "num_input_tokens_seen": 174370896, + "router_z_loss_mlp": 0.36987305, + "step": 2094, + "time_per_iteration": 2.7778923511505127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089446, + "balance_loss_mlp": 1.05311072, + "epoch": 0.4030396306271643, + "flos": 491593425408.0, + "grad_norm": 0.053262929353227066, + "language_loss": 0.84942901, + "learning_rate": 0.0006773696641004828, + "loss": 0.86032349, + "num_input_tokens_seen": 174438448, + "router_z_loss_mlp": 0.36352539, + "step": 2095, + "time_per_iteration": 2.580313205718994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089479, + "balance_loss_mlp": 1.05238152, + "epoch": 0.40323201231242783, + "flos": 901363133952.0, + "grad_norm": 0.05931554649921985, + "language_loss": 0.77618563, + "learning_rate": 0.0006770783484567247, + "loss": 0.78708041, + "num_input_tokens_seen": 174525952, + "router_z_loss_mlp": 0.37109375, + "step": 2096, + "time_per_iteration": 3.0955684185028076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089916, + "balance_loss_mlp": 1.0536046, + "epoch": 0.4034243939976914, + "flos": 570267219456.0, + "grad_norm": 0.07944545156942663, + "language_loss": 0.8587091, + "learning_rate": 0.000676786964065055, + "loss": 0.86960828, + "num_input_tokens_seen": 174607200, + "router_z_loss_mlp": 0.36303711, + "step": 2097, + "time_per_iteration": 2.742293119430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084372, + "balance_loss_mlp": 1.04829895, + "epoch": 0.403616775682955, + "flos": 507206845440.0, + "grad_norm": 0.04869402927646331, + "language_loss": 0.78305566, + "learning_rate": 0.0006764955110385986, + "loss": 0.79389936, + "num_input_tokens_seen": 174680976, + "router_z_loss_mlp": 0.3605957, + "step": 2098, + "time_per_iteration": 2.708390235900879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086865, + "balance_loss_mlp": 1.05055428, + "epoch": 0.40380915736821854, + "flos": 519127515648.0, + "grad_norm": 0.06727344126892942, + "language_loss": 0.80247992, + "learning_rate": 0.0006762039894905083, + "loss": 0.81334853, + "num_input_tokens_seen": 174753152, + "router_z_loss_mlp": 0.36328125, + "step": 2099, + "time_per_iteration": 2.6428377628326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095654, + "balance_loss_mlp": 1.05812716, + "epoch": 0.40400153905348213, + "flos": 441686048256.0, + "grad_norm": 0.06575852305434472, + "language_loss": 0.80233693, + "learning_rate": 0.000675912399533962, + "loss": 0.81329346, + "num_input_tokens_seen": 174817184, + "router_z_loss_mlp": 0.375, + "step": 2100, + "time_per_iteration": 2.5560812950134277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088947, + "balance_loss_mlp": 1.05249298, + "epoch": 0.40419392073874566, + "flos": 771961300992.0, + "grad_norm": 0.1036114098840327, + "language_loss": 0.85183066, + "learning_rate": 0.0006756207412821656, + "loss": 0.86272013, + "num_input_tokens_seen": 174898128, + "router_z_loss_mlp": 0.36450195, + "step": 2101, + "time_per_iteration": 2.986583709716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086168, + "balance_loss_mlp": 1.05021429, + "epoch": 0.40438630242400925, + "flos": 766215562752.0, + "grad_norm": 0.06055449439143942, + "language_loss": 0.80025709, + "learning_rate": 0.0006753290148483505, + "loss": 0.81111872, + "num_input_tokens_seen": 174981872, + "router_z_loss_mlp": 0.36010742, + "step": 2102, + "time_per_iteration": 3.0076749324798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080415, + "balance_loss_mlp": 1.04491425, + "epoch": 0.4045786841092728, + "flos": 415013317632.0, + "grad_norm": 0.052033945118291625, + "language_loss": 0.7866869, + "learning_rate": 0.0006750372203457752, + "loss": 0.79749095, + "num_input_tokens_seen": 175044976, + "router_z_loss_mlp": 0.35546875, + "step": 2103, + "time_per_iteration": 2.490941286087036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083553, + "balance_loss_mlp": 1.04767144, + "epoch": 0.40477106579453637, + "flos": 538941454848.0, + "grad_norm": 0.07087529891902919, + "language_loss": 0.86455047, + "learning_rate": 0.0006747453578877242, + "loss": 0.875386, + "num_input_tokens_seen": 175121104, + "router_z_loss_mlp": 0.35864258, + "step": 2104, + "time_per_iteration": 2.6906399726867676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083688, + "balance_loss_mlp": 1.04766345, + "epoch": 0.4049634474797999, + "flos": 826358768640.0, + "grad_norm": 0.07644078595746046, + "language_loss": 0.82677126, + "learning_rate": 0.0006744534275875085, + "loss": 0.83760816, + "num_input_tokens_seen": 175194512, + "router_z_loss_mlp": 0.3605957, + "step": 2105, + "time_per_iteration": 2.9925642013549805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081927, + "balance_loss_mlp": 1.0459255, + "epoch": 0.4051558291650635, + "flos": 572417722368.0, + "grad_norm": 0.07127110995979934, + "language_loss": 0.8562066, + "learning_rate": 0.0006741614295584657, + "loss": 0.86702585, + "num_input_tokens_seen": 175264176, + "router_z_loss_mlp": 0.36010742, + "step": 2106, + "time_per_iteration": 2.6289658546447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078264, + "balance_loss_mlp": 1.04321659, + "epoch": 0.4053482108503271, + "flos": 731541874176.0, + "grad_norm": 0.07814638610947379, + "language_loss": 0.78334522, + "learning_rate": 0.0006738693639139595, + "loss": 0.79412782, + "num_input_tokens_seen": 175347488, + "router_z_loss_mlp": 0.35083008, + "step": 2107, + "time_per_iteration": 3.0381481647491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078707, + "balance_loss_mlp": 1.04234815, + "epoch": 0.4055405925355906, + "flos": 1212588292608.0, + "grad_norm": 0.05182127384415646, + "language_loss": 0.77652568, + "learning_rate": 0.0006735772307673796, + "loss": 0.78731275, + "num_input_tokens_seen": 175438336, + "router_z_loss_mlp": 0.36376953, + "step": 2108, + "time_per_iteration": 3.5424931049346924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075462, + "balance_loss_mlp": 1.03998494, + "epoch": 0.4057329742208542, + "flos": 715553104896.0, + "grad_norm": 0.0496802449600099, + "language_loss": 0.83129466, + "learning_rate": 0.0006732850302321421, + "loss": 0.84204924, + "num_input_tokens_seen": 175510912, + "router_z_loss_mlp": 0.35498047, + "step": 2109, + "time_per_iteration": 2.902758836746216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081445, + "balance_loss_mlp": 1.04506207, + "epoch": 0.4059253559061177, + "flos": 564623377920.0, + "grad_norm": 0.054690107844022846, + "language_loss": 0.84019876, + "learning_rate": 0.00067299276242169, + "loss": 0.85101312, + "num_input_tokens_seen": 175583040, + "router_z_loss_mlp": 0.36376953, + "step": 2110, + "time_per_iteration": 2.6453192234039307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108684, + "balance_loss_mlp": 1.07272601, + "epoch": 0.4061177375913813, + "flos": 1592886919680.0, + "grad_norm": 0.03852995701507201, + "language_loss": 0.74382168, + "learning_rate": 0.0006727004274494908, + "loss": 0.75469011, + "num_input_tokens_seen": 175817952, + "router_z_loss_mlp": 0.14160156, + "step": 2111, + "time_per_iteration": 4.936276197433472 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092328, + "balance_loss_mlp": 1.05587411, + "epoch": 0.40631011927664484, + "flos": 615122892288.0, + "grad_norm": 0.05227822307204106, + "language_loss": 0.77911901, + "learning_rate": 0.0006724080254290395, + "loss": 0.79004228, + "num_input_tokens_seen": 175896352, + "router_z_loss_mlp": 0.36425781, + "step": 2112, + "time_per_iteration": 2.804931402206421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085219, + "balance_loss_mlp": 1.04893136, + "epoch": 0.40650250096190843, + "flos": 557390647296.0, + "grad_norm": 0.056265148252134925, + "language_loss": 0.89716649, + "learning_rate": 0.0006721155564738566, + "loss": 0.90801871, + "num_input_tokens_seen": 175967152, + "router_z_loss_mlp": 0.36303711, + "step": 2113, + "time_per_iteration": 2.756901502609253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050781, + "balance_loss_mlp": 1.03676188, + "epoch": 0.40669488264717196, + "flos": 1579301756928.0, + "grad_norm": 0.015026311101099392, + "language_loss": 0.78622639, + "learning_rate": 0.0006718230206974884, + "loss": 0.79673421, + "num_input_tokens_seen": 176205248, + "router_z_loss_mlp": 0.140625, + "step": 2114, + "time_per_iteration": 4.975963354110718 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109673, + "balance_loss_mlp": 1.0599184, + "epoch": 0.40688726433243555, + "flos": 507398902272.0, + "grad_norm": 0.07464761746525102, + "language_loss": 0.85648221, + "learning_rate": 0.0006715304182135078, + "loss": 0.86744952, + "num_input_tokens_seen": 176276208, + "router_z_loss_mlp": 0.36816406, + "step": 2115, + "time_per_iteration": 2.5924360752105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104151, + "balance_loss_mlp": 1.06726742, + "epoch": 0.40707964601769914, + "flos": 588757109760.0, + "grad_norm": 0.06427267203463374, + "language_loss": 0.88647795, + "learning_rate": 0.0006712377491355127, + "loss": 0.89751947, + "num_input_tokens_seen": 176355072, + "router_z_loss_mlp": 0.36889648, + "step": 2116, + "time_per_iteration": 2.887439489364624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097518, + "balance_loss_mlp": 1.06135035, + "epoch": 0.40727202770296267, + "flos": 580134901248.0, + "grad_norm": 0.10612280790481599, + "language_loss": 0.81211627, + "learning_rate": 0.0006709450135771274, + "loss": 0.82309151, + "num_input_tokens_seen": 176444592, + "router_z_loss_mlp": 0.36206055, + "step": 2117, + "time_per_iteration": 2.9730725288391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101548, + "balance_loss_mlp": 1.06523705, + "epoch": 0.40746440938822626, + "flos": 503819633664.0, + "grad_norm": 0.05032701187252936, + "language_loss": 0.86683893, + "learning_rate": 0.0006706522116520023, + "loss": 0.87785447, + "num_input_tokens_seen": 176516144, + "router_z_loss_mlp": 0.36328125, + "step": 2118, + "time_per_iteration": 2.6400580406188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096769, + "balance_loss_mlp": 1.06122053, + "epoch": 0.4076567910734898, + "flos": 605323611648.0, + "grad_norm": 0.05658204986861598, + "language_loss": 0.82839441, + "learning_rate": 0.0006703593434738127, + "loss": 0.83936214, + "num_input_tokens_seen": 176585712, + "router_z_loss_mlp": 0.35571289, + "step": 2119, + "time_per_iteration": 2.77944016456604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091629, + "balance_loss_mlp": 1.05622339, + "epoch": 0.4078491727587534, + "flos": 479313372672.0, + "grad_norm": 0.0532477275953574, + "language_loss": 0.78150344, + "learning_rate": 0.0006700664091562604, + "loss": 0.79241967, + "num_input_tokens_seen": 176654736, + "router_z_loss_mlp": 0.35449219, + "step": 2120, + "time_per_iteration": 2.580658435821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093922, + "balance_loss_mlp": 1.05780149, + "epoch": 0.4080415544440169, + "flos": 510126985728.0, + "grad_norm": 0.045251762284626275, + "language_loss": 0.85188484, + "learning_rate": 0.0006697734088130725, + "loss": 0.86282408, + "num_input_tokens_seen": 176722800, + "router_z_loss_mlp": 0.36157227, + "step": 2121, + "time_per_iteration": 2.5990941524505615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108927, + "balance_loss_mlp": 1.05329287, + "epoch": 0.4082339361292805, + "flos": 734318009856.0, + "grad_norm": 0.06207508790269206, + "language_loss": 0.85326135, + "learning_rate": 0.0006694803425580018, + "loss": 0.86415404, + "num_input_tokens_seen": 176800320, + "router_z_loss_mlp": 0.36010742, + "step": 2122, + "time_per_iteration": 2.9514336585998535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093256, + "balance_loss_mlp": 1.05687356, + "epoch": 0.4084263178145441, + "flos": 457239831552.0, + "grad_norm": 0.08260422277145335, + "language_loss": 0.84467387, + "learning_rate": 0.0006691872105048268, + "loss": 0.85560644, + "num_input_tokens_seen": 176867440, + "router_z_loss_mlp": 0.36401367, + "step": 2123, + "time_per_iteration": 2.584765672683716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093552, + "balance_loss_mlp": 1.05762231, + "epoch": 0.4086186994998076, + "flos": 562659139584.0, + "grad_norm": 0.056985949085160005, + "language_loss": 0.84641832, + "learning_rate": 0.0006688940127673513, + "loss": 0.85735387, + "num_input_tokens_seen": 176942048, + "router_z_loss_mlp": 0.35961914, + "step": 2124, + "time_per_iteration": 2.698777675628662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100727, + "balance_loss_mlp": 1.06446397, + "epoch": 0.4088110811850712, + "flos": 573364859904.0, + "grad_norm": 0.04747345440626025, + "language_loss": 0.85754699, + "learning_rate": 0.0006686007494594049, + "loss": 0.86855423, + "num_input_tokens_seen": 177025104, + "router_z_loss_mlp": 0.36279297, + "step": 2125, + "time_per_iteration": 2.8035151958465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101938, + "balance_loss_mlp": 1.06538868, + "epoch": 0.40900346287033473, + "flos": 456702948864.0, + "grad_norm": 0.06322616011827766, + "language_loss": 0.80074888, + "learning_rate": 0.0006683074206948425, + "loss": 0.81176829, + "num_input_tokens_seen": 177089296, + "router_z_loss_mlp": 0.36547852, + "step": 2126, + "time_per_iteration": 2.4856953620910645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103432, + "balance_loss_mlp": 1.06697774, + "epoch": 0.4091958445555983, + "flos": 617097305088.0, + "grad_norm": 0.05684118517242104, + "language_loss": 0.8146261, + "learning_rate": 0.0006680140265875443, + "loss": 0.82566047, + "num_input_tokens_seen": 177163648, + "router_z_loss_mlp": 0.36474609, + "step": 2127, + "time_per_iteration": 2.772571325302124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111548, + "balance_loss_mlp": 1.07564259, + "epoch": 0.40938822624086185, + "flos": 472159217664.0, + "grad_norm": 0.051537767424008556, + "language_loss": 0.95483583, + "learning_rate": 0.0006677205672514162, + "loss": 0.96595132, + "num_input_tokens_seen": 177233856, + "router_z_loss_mlp": 0.35888672, + "step": 2128, + "time_per_iteration": 2.6006312370300293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114142, + "balance_loss_mlp": 1.07642448, + "epoch": 0.40958060792612544, + "flos": 569734718976.0, + "grad_norm": 0.04853999942998699, + "language_loss": 0.88646978, + "learning_rate": 0.000667427042800389, + "loss": 0.8976112, + "num_input_tokens_seen": 177309824, + "router_z_loss_mlp": 0.37670898, + "step": 2129, + "time_per_iteration": 2.742804765701294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107096, + "balance_loss_mlp": 1.07030797, + "epoch": 0.40977298961138897, + "flos": 609065823744.0, + "grad_norm": 0.053374560930054, + "language_loss": 0.8288517, + "learning_rate": 0.0006671334533484192, + "loss": 0.83992267, + "num_input_tokens_seen": 177380592, + "router_z_loss_mlp": 0.36767578, + "step": 2130, + "time_per_iteration": 2.7175474166870117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105432, + "balance_loss_mlp": 1.06854916, + "epoch": 0.40996537129665256, + "flos": 581463332352.0, + "grad_norm": 0.10187828374301312, + "language_loss": 0.83427989, + "learning_rate": 0.0006668397990094881, + "loss": 0.84533429, + "num_input_tokens_seen": 177454720, + "router_z_loss_mlp": 0.36889648, + "step": 2131, + "time_per_iteration": 2.718189239501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102513, + "balance_loss_mlp": 1.06438994, + "epoch": 0.41015775298191615, + "flos": 516296125440.0, + "grad_norm": 0.05088305967580112, + "language_loss": 0.84777439, + "learning_rate": 0.0006665460798976027, + "loss": 0.85879958, + "num_input_tokens_seen": 177528224, + "router_z_loss_mlp": 0.38134766, + "step": 2132, + "time_per_iteration": 2.754838228225708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103514, + "balance_loss_mlp": 1.06448531, + "epoch": 0.4103501346671797, + "flos": 510083315712.0, + "grad_norm": 0.04980971333778078, + "language_loss": 0.81075269, + "learning_rate": 0.0006662522961267947, + "loss": 0.82178783, + "num_input_tokens_seen": 177598176, + "router_z_loss_mlp": 0.38989258, + "step": 2133, + "time_per_iteration": 2.630645513534546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105465, + "balance_loss_mlp": 1.06514883, + "epoch": 0.41054251635244327, + "flos": 549459500544.0, + "grad_norm": 0.047627275091831754, + "language_loss": 0.87016159, + "learning_rate": 0.0006659584478111211, + "loss": 0.88121629, + "num_input_tokens_seen": 177675840, + "router_z_loss_mlp": 0.40307617, + "step": 2134, + "time_per_iteration": 2.7775702476501465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114637, + "balance_loss_mlp": 1.07408166, + "epoch": 0.4107348980377068, + "flos": 839549643264.0, + "grad_norm": 0.06581962625194586, + "language_loss": 0.82464856, + "learning_rate": 0.000665664535064664, + "loss": 0.83579493, + "num_input_tokens_seen": 177751376, + "router_z_loss_mlp": 0.40551758, + "step": 2135, + "time_per_iteration": 3.0234854221343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011149, + "balance_loss_mlp": 1.07501245, + "epoch": 0.4109272797229704, + "flos": 503445694464.0, + "grad_norm": 0.05498766410062668, + "language_loss": 0.82554698, + "learning_rate": 0.0006653705580015303, + "loss": 0.83669591, + "num_input_tokens_seen": 177825264, + "router_z_loss_mlp": 0.39892578, + "step": 2136, + "time_per_iteration": 2.740478992462158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110871, + "balance_loss_mlp": 1.06786942, + "epoch": 0.4111196614082339, + "flos": 610533877248.0, + "grad_norm": 0.1069583069182241, + "language_loss": 0.86098707, + "learning_rate": 0.0006650765167358523, + "loss": 0.87207425, + "num_input_tokens_seen": 177901680, + "router_z_loss_mlp": 0.40844727, + "step": 2137, + "time_per_iteration": 2.7766735553741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112768, + "balance_loss_mlp": 1.07264185, + "epoch": 0.4113120430934975, + "flos": 452931623424.0, + "grad_norm": 0.06240188984530218, + "language_loss": 0.8998509, + "learning_rate": 0.0006647824113817864, + "loss": 0.91097856, + "num_input_tokens_seen": 177965264, + "router_z_loss_mlp": 0.40112305, + "step": 2138, + "time_per_iteration": 2.558088779449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109501, + "balance_loss_mlp": 1.06992376, + "epoch": 0.41150442477876104, + "flos": 541324712448.0, + "grad_norm": 0.06351755199965968, + "language_loss": 0.81488299, + "learning_rate": 0.000664488242053515, + "loss": 0.82597804, + "num_input_tokens_seen": 178039712, + "router_z_loss_mlp": 0.39550781, + "step": 2139, + "time_per_iteration": 2.7064287662506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102585, + "balance_loss_mlp": 1.06405628, + "epoch": 0.4116968064640246, + "flos": 576017339904.0, + "grad_norm": 0.052717271070364294, + "language_loss": 0.8372525, + "learning_rate": 0.0006641940088652445, + "loss": 0.8482784, + "num_input_tokens_seen": 178114080, + "router_z_loss_mlp": 0.38500977, + "step": 2140, + "time_per_iteration": 2.8360941410064697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107136, + "balance_loss_mlp": 1.0685842, + "epoch": 0.4118891881492882, + "flos": 495857963520.0, + "grad_norm": 0.05632128251923113, + "language_loss": 0.82241237, + "learning_rate": 0.0006638997119312065, + "loss": 0.83348376, + "num_input_tokens_seen": 178188032, + "router_z_loss_mlp": 0.38500977, + "step": 2141, + "time_per_iteration": 2.695482015609741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01432807, + "balance_loss_mlp": 1.41773903, + "epoch": 0.41208156983455174, + "flos": 1537604923392.0, + "grad_norm": 0.12335560313674339, + "language_loss": 0.75063306, + "learning_rate": 0.0006636053513656568, + "loss": 0.76496112, + "num_input_tokens_seen": 178395328, + "router_z_loss_mlp": 0.15039062, + "step": 2142, + "time_per_iteration": 4.938086032867432 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096248, + "balance_loss_mlp": 1.05800605, + "epoch": 0.41227395151981533, + "flos": 584697775104.0, + "grad_norm": 0.06073263389064812, + "language_loss": 0.84852999, + "learning_rate": 0.000663310927282877, + "loss": 0.85949242, + "num_input_tokens_seen": 178471952, + "router_z_loss_mlp": 0.38208008, + "step": 2143, + "time_per_iteration": 2.776041269302368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098979, + "balance_loss_mlp": 1.06183362, + "epoch": 0.41246633320507886, + "flos": 442685620224.0, + "grad_norm": 0.05843533128868507, + "language_loss": 0.85999441, + "learning_rate": 0.000663016439797172, + "loss": 0.8709842, + "num_input_tokens_seen": 178542192, + "router_z_loss_mlp": 0.37109375, + "step": 2144, + "time_per_iteration": 2.6550843715667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099188, + "balance_loss_mlp": 1.06280541, + "epoch": 0.41265871489034245, + "flos": 579680976384.0, + "grad_norm": 0.05476235673703619, + "language_loss": 0.80718118, + "learning_rate": 0.0006627218890228724, + "loss": 0.81817305, + "num_input_tokens_seen": 178622736, + "router_z_loss_mlp": 0.36376953, + "step": 2145, + "time_per_iteration": 2.748966693878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098226, + "balance_loss_mlp": 1.06139088, + "epoch": 0.412851096575606, + "flos": 760906372608.0, + "grad_norm": 0.06511227414480983, + "language_loss": 0.83519912, + "learning_rate": 0.0006624272750743326, + "loss": 0.84618139, + "num_input_tokens_seen": 178705808, + "router_z_loss_mlp": 0.3684082, + "step": 2146, + "time_per_iteration": 2.987541913986206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098071, + "balance_loss_mlp": 1.05994785, + "epoch": 0.41304347826086957, + "flos": 555062644224.0, + "grad_norm": 0.04596756157996359, + "language_loss": 0.82878035, + "learning_rate": 0.0006621325980659322, + "loss": 0.83976108, + "num_input_tokens_seen": 178781200, + "router_z_loss_mlp": 0.38061523, + "step": 2147, + "time_per_iteration": 2.821556568145752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104625, + "balance_loss_mlp": 1.0655247, + "epoch": 0.41323585994613315, + "flos": 665418765312.0, + "grad_norm": 0.06740751064613239, + "language_loss": 0.8204211, + "learning_rate": 0.000661837858112075, + "loss": 0.83146733, + "num_input_tokens_seen": 178855072, + "router_z_loss_mlp": 0.390625, + "step": 2148, + "time_per_iteration": 2.7922754287719727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089584, + "balance_loss_mlp": 1.05136561, + "epoch": 0.4134282416313967, + "flos": 548429405184.0, + "grad_norm": 0.050771109286751076, + "language_loss": 0.88476944, + "learning_rate": 0.0006615430553271888, + "loss": 0.89566529, + "num_input_tokens_seen": 178927936, + "router_z_loss_mlp": 0.38208008, + "step": 2149, + "time_per_iteration": 2.7367136478424072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091169, + "balance_loss_mlp": 1.05326056, + "epoch": 0.4136206233166603, + "flos": 645951062016.0, + "grad_norm": 0.056682848656222896, + "language_loss": 0.85300201, + "learning_rate": 0.0006612481898257264, + "loss": 0.86391366, + "num_input_tokens_seen": 179007792, + "router_z_loss_mlp": 0.37866211, + "step": 2150, + "time_per_iteration": 2.862969160079956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082558, + "balance_loss_mlp": 1.04398179, + "epoch": 0.4138130050019238, + "flos": 517103640576.0, + "grad_norm": 0.07190872816549171, + "language_loss": 0.85216105, + "learning_rate": 0.000660953261722165, + "loss": 0.86298662, + "num_input_tokens_seen": 179075200, + "router_z_loss_mlp": 0.38549805, + "step": 2151, + "time_per_iteration": 2.608966588973999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072414, + "balance_loss_mlp": 1.03379023, + "epoch": 0.4140053866871874, + "flos": 608977073664.0, + "grad_norm": 0.05213877076699988, + "language_loss": 0.82764488, + "learning_rate": 0.0006606582711310055, + "loss": 0.83836901, + "num_input_tokens_seen": 179144448, + "router_z_loss_mlp": 0.38574219, + "step": 2152, + "time_per_iteration": 2.704941511154175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081545, + "balance_loss_mlp": 1.04287302, + "epoch": 0.4141977683724509, + "flos": 579493301760.0, + "grad_norm": 0.0573275470165796, + "language_loss": 0.83345616, + "learning_rate": 0.0006603632181667736, + "loss": 0.8442716, + "num_input_tokens_seen": 179215776, + "router_z_loss_mlp": 0.38671875, + "step": 2153, + "time_per_iteration": 2.670036792755127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157558, + "balance_loss_mlp": 1.14086878, + "epoch": 0.4143901500577145, + "flos": 1306598777856.0, + "grad_norm": 0.04466441147089705, + "language_loss": 0.78943324, + "learning_rate": 0.0006600681029440187, + "loss": 0.80100882, + "num_input_tokens_seen": 179436688, + "router_z_loss_mlp": 0.16699219, + "step": 2154, + "time_per_iteration": 4.936178684234619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088281, + "balance_loss_mlp": 1.04989576, + "epoch": 0.41458253174297804, + "flos": 459957740544.0, + "grad_norm": 0.05825483779723247, + "language_loss": 0.81504506, + "learning_rate": 0.0006597729255773153, + "loss": 0.82592785, + "num_input_tokens_seen": 179503264, + "router_z_loss_mlp": 0.38354492, + "step": 2155, + "time_per_iteration": 2.5436675548553467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095127, + "balance_loss_mlp": 1.056885, + "epoch": 0.41477491342824163, + "flos": 553096995840.0, + "grad_norm": 0.14369101348323118, + "language_loss": 0.82126498, + "learning_rate": 0.0006594776861812608, + "loss": 0.83221632, + "num_input_tokens_seen": 179574864, + "router_z_loss_mlp": 0.38183594, + "step": 2156, + "time_per_iteration": 2.6603870391845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102341, + "balance_loss_mlp": 1.06414664, + "epoch": 0.4149672951135052, + "flos": 697444356096.0, + "grad_norm": 0.09619651786969989, + "language_loss": 0.86957002, + "learning_rate": 0.0006591823848704776, + "loss": 0.88059342, + "num_input_tokens_seen": 179658208, + "router_z_loss_mlp": 0.38183594, + "step": 2157, + "time_per_iteration": 2.888523578643799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112154, + "balance_loss_mlp": 1.07362556, + "epoch": 0.41515967679876875, + "flos": 565480355328.0, + "grad_norm": 0.06180894820080996, + "language_loss": 0.81514823, + "learning_rate": 0.0006588870217596117, + "loss": 0.82626975, + "num_input_tokens_seen": 179732320, + "router_z_loss_mlp": 0.38500977, + "step": 2158, + "time_per_iteration": 2.7872376441955566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124674, + "balance_loss_mlp": 1.08497691, + "epoch": 0.41535205848403234, + "flos": 500938781184.0, + "grad_norm": 0.08519942481898463, + "language_loss": 0.85712391, + "learning_rate": 0.0006585915969633334, + "loss": 0.86837065, + "num_input_tokens_seen": 179801616, + "router_z_loss_mlp": 0.39672852, + "step": 2159, + "time_per_iteration": 2.5857338905334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135799, + "balance_loss_mlp": 1.09703159, + "epoch": 0.41554444016929587, + "flos": 607268911104.0, + "grad_norm": 0.06479316283343547, + "language_loss": 0.89294302, + "learning_rate": 0.0006582961105963366, + "loss": 0.90430105, + "num_input_tokens_seen": 179876112, + "router_z_loss_mlp": 0.38720703, + "step": 2160, + "time_per_iteration": 2.7831602096557617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153796, + "balance_loss_mlp": 1.11493373, + "epoch": 0.41573682185455946, + "flos": 528856985088.0, + "grad_norm": 0.06215124272048543, + "language_loss": 0.77626073, + "learning_rate": 0.0006580005627733395, + "loss": 0.7877987, + "num_input_tokens_seen": 179949936, + "router_z_loss_mlp": 0.38818359, + "step": 2161, + "time_per_iteration": 2.6620304584503174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152884, + "balance_loss_mlp": 1.11349678, + "epoch": 0.415929203539823, + "flos": 504686785536.0, + "grad_norm": 0.0577168801928891, + "language_loss": 0.81587994, + "learning_rate": 0.0006577049536090838, + "loss": 0.82740879, + "num_input_tokens_seen": 180023184, + "router_z_loss_mlp": 0.39355469, + "step": 2162, + "time_per_iteration": 2.6678874492645264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144655, + "balance_loss_mlp": 1.10693753, + "epoch": 0.4161215852250866, + "flos": 582467286528.0, + "grad_norm": 0.07160302952697103, + "language_loss": 0.85415941, + "learning_rate": 0.000657409283218335, + "loss": 0.86560595, + "num_input_tokens_seen": 180091728, + "router_z_loss_mlp": 0.37695312, + "step": 2163, + "time_per_iteration": 2.6405746936798096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134301, + "balance_loss_mlp": 1.09570062, + "epoch": 0.4163139669103501, + "flos": 490432320000.0, + "grad_norm": 0.051386242205519156, + "language_loss": 0.80774486, + "learning_rate": 0.0006571135517158829, + "loss": 0.81908786, + "num_input_tokens_seen": 180162096, + "router_z_loss_mlp": 0.38549805, + "step": 2164, + "time_per_iteration": 2.6496996879577637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218225, + "balance_loss_mlp": 1.20143986, + "epoch": 0.4165063485956137, + "flos": 1287445377024.0, + "grad_norm": 0.06520745435981959, + "language_loss": 0.76764059, + "learning_rate": 0.0006568177592165404, + "loss": 0.77982283, + "num_input_tokens_seen": 180380912, + "router_z_loss_mlp": 0.16796875, + "step": 2165, + "time_per_iteration": 4.76560640335083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127203, + "balance_loss_mlp": 1.09003401, + "epoch": 0.4166987302808773, + "flos": 495015542784.0, + "grad_norm": 0.07154886739030113, + "language_loss": 0.83213758, + "learning_rate": 0.0006565219058351444, + "loss": 0.8434096, + "num_input_tokens_seen": 180447424, + "router_z_loss_mlp": 0.37133789, + "step": 2166, + "time_per_iteration": 2.539856433868408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112149, + "balance_loss_mlp": 1.07397866, + "epoch": 0.4168911119661408, + "flos": 463823608320.0, + "grad_norm": 0.0764039854303378, + "language_loss": 0.83196324, + "learning_rate": 0.0006562259916865553, + "loss": 0.84308469, + "num_input_tokens_seen": 180516336, + "router_z_loss_mlp": 0.38110352, + "step": 2167, + "time_per_iteration": 2.5938220024108887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106062, + "balance_loss_mlp": 1.06939304, + "epoch": 0.4170834936514044, + "flos": 536499970560.0, + "grad_norm": 0.052882286550722295, + "language_loss": 0.7941224, + "learning_rate": 0.0006559300168856573, + "loss": 0.80518305, + "num_input_tokens_seen": 180589824, + "router_z_loss_mlp": 0.36694336, + "step": 2168, + "time_per_iteration": 2.7382309436798096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100629, + "balance_loss_mlp": 1.0633167, + "epoch": 0.41727587533666793, + "flos": 550418374656.0, + "grad_norm": 0.05257418188896324, + "language_loss": 0.85768378, + "learning_rate": 0.0006556339815473577, + "loss": 0.86869007, + "num_input_tokens_seen": 180661296, + "router_z_loss_mlp": 0.37280273, + "step": 2169, + "time_per_iteration": 2.6762564182281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110352, + "balance_loss_mlp": 1.06501567, + "epoch": 0.4174682570219315, + "flos": 630795949056.0, + "grad_norm": 0.0440641640787593, + "language_loss": 0.85913342, + "learning_rate": 0.000655337885786588, + "loss": 0.87016863, + "num_input_tokens_seen": 180744896, + "router_z_loss_mlp": 0.38452148, + "step": 2170, + "time_per_iteration": 2.8669848442077637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098716, + "balance_loss_mlp": 1.06068778, + "epoch": 0.41766063870719505, + "flos": 519501454848.0, + "grad_norm": 0.07103396575336611, + "language_loss": 0.84732234, + "learning_rate": 0.0006550417297183025, + "loss": 0.85830951, + "num_input_tokens_seen": 180813008, + "router_z_loss_mlp": 0.37988281, + "step": 2171, + "time_per_iteration": 2.6471290588378906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110284, + "balance_loss_mlp": 1.0640254, + "epoch": 0.41785302039245864, + "flos": 557656897536.0, + "grad_norm": 0.051327988161677204, + "language_loss": 0.8175863, + "learning_rate": 0.0006547455134574793, + "loss": 0.82861477, + "num_input_tokens_seen": 180886480, + "router_z_loss_mlp": 0.38793945, + "step": 2172, + "time_per_iteration": 2.71508526802063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100888, + "balance_loss_mlp": 1.06338453, + "epoch": 0.41804540207772223, + "flos": 788156683776.0, + "grad_norm": 0.052280747851499734, + "language_loss": 0.84377366, + "learning_rate": 0.0006544492371191198, + "loss": 0.85478258, + "num_input_tokens_seen": 180973776, + "router_z_loss_mlp": 0.37475586, + "step": 2173, + "time_per_iteration": 3.114607810974121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096475, + "balance_loss_mlp": 1.05775642, + "epoch": 0.41823778376298576, + "flos": 903944240640.0, + "grad_norm": 0.04972167781175626, + "language_loss": 0.83103442, + "learning_rate": 0.0006541529008182485, + "loss": 0.84199917, + "num_input_tokens_seen": 181062768, + "router_z_loss_mlp": 0.38696289, + "step": 2174, + "time_per_iteration": 3.165484666824341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094456, + "balance_loss_mlp": 1.0563333, + "epoch": 0.41843016544824935, + "flos": 511308440064.0, + "grad_norm": 0.05116159603840096, + "language_loss": 0.8702668, + "learning_rate": 0.0006538565046699136, + "loss": 0.88121128, + "num_input_tokens_seen": 181129872, + "router_z_loss_mlp": 0.38085938, + "step": 2175, + "time_per_iteration": 2.5701253414154053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101136, + "balance_loss_mlp": 1.06389487, + "epoch": 0.4186225471335129, + "flos": 652774947840.0, + "grad_norm": 0.05537675869017034, + "language_loss": 0.81610411, + "learning_rate": 0.0006535600487891862, + "loss": 0.82711548, + "num_input_tokens_seen": 181208112, + "router_z_loss_mlp": 0.37231445, + "step": 2176, + "time_per_iteration": 2.7980031967163086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096315, + "balance_loss_mlp": 1.05900216, + "epoch": 0.41881492881877647, + "flos": 568892298240.0, + "grad_norm": 0.05573219506936483, + "language_loss": 0.89184308, + "learning_rate": 0.0006532635332911603, + "loss": 0.90280616, + "num_input_tokens_seen": 181278736, + "router_z_loss_mlp": 0.37304688, + "step": 2177, + "time_per_iteration": 2.64104962348938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092004, + "balance_loss_mlp": 1.05495393, + "epoch": 0.41900731050404, + "flos": 911478127104.0, + "grad_norm": 0.05325324025552218, + "language_loss": 0.80538237, + "learning_rate": 0.0006529669582909541, + "loss": 0.81630242, + "num_input_tokens_seen": 181362512, + "router_z_loss_mlp": 0.37011719, + "step": 2178, + "time_per_iteration": 3.21323299407959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108647, + "balance_loss_mlp": 1.04896641, + "epoch": 0.4191996921893036, + "flos": 535498988544.0, + "grad_norm": 0.06510625194491998, + "language_loss": 0.85975909, + "learning_rate": 0.0006526703239037077, + "loss": 0.87062377, + "num_input_tokens_seen": 181432080, + "router_z_loss_mlp": 0.375, + "step": 2179, + "time_per_iteration": 2.630338430404663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086501, + "balance_loss_mlp": 1.0496887, + "epoch": 0.4193920738745671, + "flos": 582363979776.0, + "grad_norm": 0.04783092813648227, + "language_loss": 0.86411011, + "learning_rate": 0.0006523736302445851, + "loss": 0.8749752, + "num_input_tokens_seen": 181507296, + "router_z_loss_mlp": 0.36816406, + "step": 2180, + "time_per_iteration": 2.7710120677948 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083583, + "balance_loss_mlp": 1.04681921, + "epoch": 0.4195844555598307, + "flos": 1335279720960.0, + "grad_norm": 0.05415818779113344, + "language_loss": 0.77215266, + "learning_rate": 0.0006520768774287728, + "loss": 0.78298849, + "num_input_tokens_seen": 181599408, + "router_z_loss_mlp": 0.36743164, + "step": 2181, + "time_per_iteration": 3.738273859024048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082946, + "balance_loss_mlp": 1.04642057, + "epoch": 0.4197768372450943, + "flos": 598480786944.0, + "grad_norm": 0.04672312513315136, + "language_loss": 0.85467362, + "learning_rate": 0.0006517800655714806, + "loss": 0.86550307, + "num_input_tokens_seen": 181674944, + "router_z_loss_mlp": 0.36547852, + "step": 2182, + "time_per_iteration": 2.796132802963257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076263, + "balance_loss_mlp": 1.04016638, + "epoch": 0.4199692189303578, + "flos": 734929085952.0, + "grad_norm": 0.05966366646918548, + "language_loss": 0.84806752, + "learning_rate": 0.0006514831947879407, + "loss": 0.85883021, + "num_input_tokens_seen": 181756704, + "router_z_loss_mlp": 0.36132812, + "step": 2183, + "time_per_iteration": 2.9417624473571777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077956, + "balance_loss_mlp": 1.04243183, + "epoch": 0.4201616006156214, + "flos": 749854264320.0, + "grad_norm": 0.05811307518141115, + "language_loss": 0.78259802, + "learning_rate": 0.0006511862651934091, + "loss": 0.79337758, + "num_input_tokens_seen": 181837952, + "router_z_loss_mlp": 0.35522461, + "step": 2184, + "time_per_iteration": 3.0546512603759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082116, + "balance_loss_mlp": 1.04601932, + "epoch": 0.42035398230088494, + "flos": 546764912640.0, + "grad_norm": 0.041926600273946305, + "language_loss": 0.82459891, + "learning_rate": 0.0006508892769031638, + "loss": 0.83542007, + "num_input_tokens_seen": 181906896, + "router_z_loss_mlp": 0.36083984, + "step": 2185, + "time_per_iteration": 2.7021775245666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085419, + "balance_loss_mlp": 1.04972804, + "epoch": 0.42054636398614853, + "flos": 616628823552.0, + "grad_norm": 0.31605549573939495, + "language_loss": 0.86902821, + "learning_rate": 0.000650592230032506, + "loss": 0.87988245, + "num_input_tokens_seen": 181974976, + "router_z_loss_mlp": 0.35742188, + "step": 2186, + "time_per_iteration": 2.725625514984131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090024, + "balance_loss_mlp": 1.05175829, + "epoch": 0.42073874567141206, + "flos": 640077285888.0, + "grad_norm": 0.04878826269588872, + "language_loss": 0.84995645, + "learning_rate": 0.0006502951246967595, + "loss": 0.86085677, + "num_input_tokens_seen": 182054704, + "router_z_loss_mlp": 0.38256836, + "step": 2187, + "time_per_iteration": 2.8762335777282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092344, + "balance_loss_mlp": 1.05517459, + "epoch": 0.42093112735667565, + "flos": 493524168192.0, + "grad_norm": 0.05435264660880543, + "language_loss": 0.86905056, + "learning_rate": 0.0006499979610112706, + "loss": 0.87997395, + "num_input_tokens_seen": 182129696, + "router_z_loss_mlp": 0.37158203, + "step": 2188, + "time_per_iteration": 2.7210283279418945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105519, + "balance_loss_mlp": 1.06615603, + "epoch": 0.4211235090419392, + "flos": 542097321984.0, + "grad_norm": 0.05832158753777823, + "language_loss": 0.84076196, + "learning_rate": 0.000649700739091409, + "loss": 0.85181713, + "num_input_tokens_seen": 182203792, + "router_z_loss_mlp": 0.39331055, + "step": 2189, + "time_per_iteration": 2.70627498626709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109273, + "balance_loss_mlp": 1.09582591, + "epoch": 0.42131589072720277, + "flos": 1531342651392.0, + "grad_norm": 0.0317680876714807, + "language_loss": 0.73836273, + "learning_rate": 0.0006494034590525657, + "loss": 0.74945545, + "num_input_tokens_seen": 182432080, + "router_z_loss_mlp": 0.13476562, + "step": 2190, + "time_per_iteration": 4.8291919231414795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103729, + "balance_loss_mlp": 1.0656538, + "epoch": 0.42150827241246636, + "flos": 566583234048.0, + "grad_norm": 0.055290985630161965, + "language_loss": 0.85335857, + "learning_rate": 0.0006491061210101557, + "loss": 0.86439586, + "num_input_tokens_seen": 182500256, + "router_z_loss_mlp": 0.38037109, + "step": 2191, + "time_per_iteration": 2.669895887374878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096378, + "balance_loss_mlp": 1.05770612, + "epoch": 0.4217006540977299, + "flos": 707242226688.0, + "grad_norm": 0.050091435221191714, + "language_loss": 0.83998156, + "learning_rate": 0.0006488087250796157, + "loss": 0.85094529, + "num_input_tokens_seen": 182582912, + "router_z_loss_mlp": 0.38623047, + "step": 2192, + "time_per_iteration": 2.951594352722168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098297, + "balance_loss_mlp": 1.05864835, + "epoch": 0.4218930357829935, + "flos": 626975161344.0, + "grad_norm": 0.047618767001194696, + "language_loss": 0.81377089, + "learning_rate": 0.0006485112713764049, + "loss": 0.82475388, + "num_input_tokens_seen": 182670304, + "router_z_loss_mlp": 0.39624023, + "step": 2193, + "time_per_iteration": 2.943021535873413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095527, + "balance_loss_mlp": 1.05592585, + "epoch": 0.422085417468257, + "flos": 460110509568.0, + "grad_norm": 0.051159508672241207, + "language_loss": 0.83686495, + "learning_rate": 0.0006482137600160051, + "loss": 0.84782028, + "num_input_tokens_seen": 182735024, + "router_z_loss_mlp": 0.39575195, + "step": 2194, + "time_per_iteration": 2.5134236812591553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095144, + "balance_loss_mlp": 1.05590069, + "epoch": 0.4222777991535206, + "flos": 473788804608.0, + "grad_norm": 0.10490890222415104, + "language_loss": 0.84473735, + "learning_rate": 0.0006479161911139206, + "loss": 0.85568881, + "num_input_tokens_seen": 182805024, + "router_z_loss_mlp": 0.39208984, + "step": 2195, + "time_per_iteration": 2.577578544616699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096098, + "balance_loss_mlp": 1.05754566, + "epoch": 0.4224701808387841, + "flos": 470647494144.0, + "grad_norm": 0.0782943385788455, + "language_loss": 0.85684174, + "learning_rate": 0.0006476185647856778, + "loss": 0.86780274, + "num_input_tokens_seen": 182871360, + "router_z_loss_mlp": 0.38500977, + "step": 2196, + "time_per_iteration": 2.578495740890503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102208, + "balance_loss_mlp": 1.06286871, + "epoch": 0.4226625625240477, + "flos": 677202633216.0, + "grad_norm": 0.22187176821456261, + "language_loss": 0.81400013, + "learning_rate": 0.0006473208811468255, + "loss": 0.82502222, + "num_input_tokens_seen": 182952912, + "router_z_loss_mlp": 0.39306641, + "step": 2197, + "time_per_iteration": 2.870922088623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099017, + "balance_loss_mlp": 1.05984497, + "epoch": 0.4228549442093113, + "flos": 503268194304.0, + "grad_norm": 0.05214229642018916, + "language_loss": 0.8430717, + "learning_rate": 0.0006470231403129347, + "loss": 0.85406196, + "num_input_tokens_seen": 183022016, + "router_z_loss_mlp": 0.39135742, + "step": 2198, + "time_per_iteration": 2.5834295749664307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098125, + "balance_loss_mlp": 1.05959654, + "epoch": 0.42304732589457483, + "flos": 611543623680.0, + "grad_norm": 0.055955286861533095, + "language_loss": 0.81645906, + "learning_rate": 0.0006467253423995988, + "loss": 0.82744032, + "num_input_tokens_seen": 183101776, + "router_z_loss_mlp": 0.38500977, + "step": 2199, + "time_per_iteration": 2.8634603023529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097002, + "balance_loss_mlp": 1.05854511, + "epoch": 0.4232397075798384, + "flos": 515302345728.0, + "grad_norm": 0.05326479811347408, + "language_loss": 0.79026473, + "learning_rate": 0.000646427487522433, + "loss": 0.80123472, + "num_input_tokens_seen": 183171392, + "router_z_loss_mlp": 0.38452148, + "step": 2200, + "time_per_iteration": 2.649003744125366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102129, + "balance_loss_mlp": 1.063815, + "epoch": 0.42343208926510195, + "flos": 589513752576.0, + "grad_norm": 0.053706873495154336, + "language_loss": 0.83035368, + "learning_rate": 0.0006461295757970749, + "loss": 0.84137499, + "num_input_tokens_seen": 183253936, + "router_z_loss_mlp": 0.3828125, + "step": 2201, + "time_per_iteration": 2.8269903659820557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103583, + "balance_loss_mlp": 1.06379044, + "epoch": 0.42362447095036554, + "flos": 640342126080.0, + "grad_norm": 0.05615670023579285, + "language_loss": 0.8144629, + "learning_rate": 0.0006458316073391839, + "loss": 0.8254987, + "num_input_tokens_seen": 183333744, + "router_z_loss_mlp": 0.39770508, + "step": 2202, + "time_per_iteration": 2.9145257472991943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094508, + "balance_loss_mlp": 1.05595589, + "epoch": 0.42381685263562907, + "flos": 512421493248.0, + "grad_norm": 0.05176927409450969, + "language_loss": 0.87622833, + "learning_rate": 0.0006455335822644422, + "loss": 0.88717341, + "num_input_tokens_seen": 183401904, + "router_z_loss_mlp": 0.38525391, + "step": 2203, + "time_per_iteration": 2.596822500228882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099235, + "balance_loss_mlp": 1.06032515, + "epoch": 0.42400923432089266, + "flos": 546523393536.0, + "grad_norm": 0.08269999762480702, + "language_loss": 0.77441901, + "learning_rate": 0.0006452355006885527, + "loss": 0.78541136, + "num_input_tokens_seen": 183471312, + "router_z_loss_mlp": 0.38867188, + "step": 2204, + "time_per_iteration": 2.6238672733306885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105054, + "balance_loss_mlp": 1.06533396, + "epoch": 0.4242016160061562, + "flos": 621872584704.0, + "grad_norm": 0.06279334467905663, + "language_loss": 0.86963212, + "learning_rate": 0.0006449373627272412, + "loss": 0.88068271, + "num_input_tokens_seen": 183539184, + "router_z_loss_mlp": 0.39697266, + "step": 2205, + "time_per_iteration": 2.715792417526245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094037, + "balance_loss_mlp": 1.05515122, + "epoch": 0.4243939976914198, + "flos": 571649495040.0, + "grad_norm": 0.055815664393925046, + "language_loss": 0.82368463, + "learning_rate": 0.0006446391684962553, + "loss": 0.83462495, + "num_input_tokens_seen": 183607504, + "router_z_loss_mlp": 0.38867188, + "step": 2206, + "time_per_iteration": 2.642230987548828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096131, + "balance_loss_mlp": 1.05822253, + "epoch": 0.42458637937668336, + "flos": 448509934080.0, + "grad_norm": 0.05868479731789126, + "language_loss": 0.83175069, + "learning_rate": 0.000644340918111364, + "loss": 0.84271193, + "num_input_tokens_seen": 183674720, + "router_z_loss_mlp": 0.37841797, + "step": 2207, + "time_per_iteration": 2.5489144325256348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096536, + "balance_loss_mlp": 1.0566721, + "epoch": 0.4247787610619469, + "flos": 435176464896.0, + "grad_norm": 0.05469710752121124, + "language_loss": 0.84862429, + "learning_rate": 0.0006440426116883585, + "loss": 0.8595897, + "num_input_tokens_seen": 183740448, + "router_z_loss_mlp": 0.3984375, + "step": 2208, + "time_per_iteration": 2.5027823448181152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106001, + "balance_loss_mlp": 1.06563711, + "epoch": 0.4249711427472105, + "flos": 495818675712.0, + "grad_norm": 0.04694631121992161, + "language_loss": 0.86197406, + "learning_rate": 0.0006437442493430519, + "loss": 0.87303412, + "num_input_tokens_seen": 183812640, + "router_z_loss_mlp": 0.40356445, + "step": 2209, + "time_per_iteration": 2.624462127685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111613, + "balance_loss_mlp": 1.0711534, + "epoch": 0.425163524432474, + "flos": 655498649088.0, + "grad_norm": 0.06243114219893557, + "language_loss": 0.86437929, + "learning_rate": 0.000643445831191278, + "loss": 0.87549543, + "num_input_tokens_seen": 183895312, + "router_z_loss_mlp": 0.40454102, + "step": 2210, + "time_per_iteration": 2.883671760559082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110528, + "balance_loss_mlp": 1.06544065, + "epoch": 0.4253559061177376, + "flos": 650317496832.0, + "grad_norm": 0.059150918853506505, + "language_loss": 0.81800103, + "learning_rate": 0.0006431473573488937, + "loss": 0.82905388, + "num_input_tokens_seen": 183966384, + "router_z_loss_mlp": 0.39819336, + "step": 2211, + "time_per_iteration": 2.723308563232422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098753, + "balance_loss_mlp": 1.05807877, + "epoch": 0.42554828780300114, + "flos": 553894336512.0, + "grad_norm": 0.05841858860857517, + "language_loss": 0.84883767, + "learning_rate": 0.0006428488279317765, + "loss": 0.85982525, + "num_input_tokens_seen": 184031728, + "router_z_loss_mlp": 0.40673828, + "step": 2212, + "time_per_iteration": 2.628831148147583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098786, + "balance_loss_mlp": 1.05904126, + "epoch": 0.4257406694882647, + "flos": 514154386944.0, + "grad_norm": 0.056764121975701104, + "language_loss": 0.87647104, + "learning_rate": 0.0006425502430558259, + "loss": 0.88745892, + "num_input_tokens_seen": 184096160, + "router_z_loss_mlp": 0.39746094, + "step": 2213, + "time_per_iteration": 2.604146718978882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094618, + "balance_loss_mlp": 1.0550406, + "epoch": 0.42593305117352825, + "flos": 515380921344.0, + "grad_norm": 0.05046529876809897, + "language_loss": 0.84638417, + "learning_rate": 0.0006422516028369628, + "loss": 0.85733032, + "num_input_tokens_seen": 184169664, + "router_z_loss_mlp": 0.39550781, + "step": 2214, + "time_per_iteration": 2.6178741455078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088889, + "balance_loss_mlp": 1.04864407, + "epoch": 0.42612543285879184, + "flos": 587766302208.0, + "grad_norm": 0.04660283784017015, + "language_loss": 0.83496028, + "learning_rate": 0.0006419529073911296, + "loss": 0.84584916, + "num_input_tokens_seen": 184249152, + "router_z_loss_mlp": 0.40234375, + "step": 2215, + "time_per_iteration": 2.8105666637420654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085879, + "balance_loss_mlp": 1.04515672, + "epoch": 0.42631781454405543, + "flos": 635153619456.0, + "grad_norm": 0.05277435964401644, + "language_loss": 0.85660267, + "learning_rate": 0.0006416541568342901, + "loss": 0.86746144, + "num_input_tokens_seen": 184326816, + "router_z_loss_mlp": 0.40722656, + "step": 2216, + "time_per_iteration": 2.880662441253662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080832, + "balance_loss_mlp": 1.040277, + "epoch": 0.42651019622931896, + "flos": 540891136512.0, + "grad_norm": 0.04969535335028593, + "language_loss": 0.84409285, + "learning_rate": 0.0006413553512824297, + "loss": 0.85490113, + "num_input_tokens_seen": 184404336, + "router_z_loss_mlp": 0.40551758, + "step": 2217, + "time_per_iteration": 2.7169618606567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108871, + "balance_loss_mlp": 1.0485599, + "epoch": 0.42670257791458255, + "flos": 557892624384.0, + "grad_norm": 0.052410461022671016, + "language_loss": 0.84532559, + "learning_rate": 0.0006410564908515549, + "loss": 0.85621268, + "num_input_tokens_seen": 184472320, + "router_z_loss_mlp": 0.40136719, + "step": 2218, + "time_per_iteration": 2.657231092453003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077064, + "balance_loss_mlp": 1.03710461, + "epoch": 0.4268949595998461, + "flos": 621025781760.0, + "grad_norm": 0.054635208049088675, + "language_loss": 0.8539567, + "learning_rate": 0.0006407575756576935, + "loss": 0.86472738, + "num_input_tokens_seen": 184544704, + "router_z_loss_mlp": 0.39941406, + "step": 2219, + "time_per_iteration": 2.7336490154266357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089202, + "balance_loss_mlp": 1.04921913, + "epoch": 0.42708734128510967, + "flos": 537646519296.0, + "grad_norm": 0.04674173481591379, + "language_loss": 0.8770538, + "learning_rate": 0.0006404586058168951, + "loss": 0.88794577, + "num_input_tokens_seen": 184622544, + "router_z_loss_mlp": 0.3996582, + "step": 2220, + "time_per_iteration": 2.757380723953247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080306, + "balance_loss_mlp": 1.0401566, + "epoch": 0.4272797229703732, + "flos": 502617830400.0, + "grad_norm": 0.05080694298179496, + "language_loss": 0.86598134, + "learning_rate": 0.0006401595814452296, + "loss": 0.87678444, + "num_input_tokens_seen": 184692544, + "router_z_loss_mlp": 0.40136719, + "step": 2221, + "time_per_iteration": 2.583448886871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082627, + "balance_loss_mlp": 1.04252505, + "epoch": 0.4274721046556368, + "flos": 492208883712.0, + "grad_norm": 0.05244104927134987, + "language_loss": 0.80640519, + "learning_rate": 0.000639860502658789, + "loss": 0.81723142, + "num_input_tokens_seen": 184760480, + "router_z_loss_mlp": 0.40087891, + "step": 2222, + "time_per_iteration": 2.6454262733459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080551, + "balance_loss_mlp": 1.04149842, + "epoch": 0.4276644863409004, + "flos": 568094957568.0, + "grad_norm": 0.049852493850949496, + "language_loss": 0.84906983, + "learning_rate": 0.0006395613695736853, + "loss": 0.85987538, + "num_input_tokens_seen": 184834080, + "router_z_loss_mlp": 0.39038086, + "step": 2223, + "time_per_iteration": 2.6607768535614014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108883, + "balance_loss_mlp": 1.04841852, + "epoch": 0.4278568680261639, + "flos": 607155429888.0, + "grad_norm": 0.052366739862963044, + "language_loss": 0.8181783, + "learning_rate": 0.0006392621823060529, + "loss": 0.82906657, + "num_input_tokens_seen": 184905872, + "router_z_loss_mlp": 0.40405273, + "step": 2224, + "time_per_iteration": 2.7084245681762695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085727, + "balance_loss_mlp": 1.045434, + "epoch": 0.4280492497114275, + "flos": 560265707520.0, + "grad_norm": 0.062247479017330604, + "language_loss": 0.85044312, + "learning_rate": 0.0006389629409720465, + "loss": 0.86130041, + "num_input_tokens_seen": 184972320, + "router_z_loss_mlp": 0.40307617, + "step": 2225, + "time_per_iteration": 2.6494481563568115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083023, + "balance_loss_mlp": 1.04451835, + "epoch": 0.428241631396691, + "flos": 720334176768.0, + "grad_norm": 0.05784613309553924, + "language_loss": 0.88236213, + "learning_rate": 0.0006386636456878417, + "loss": 0.89319241, + "num_input_tokens_seen": 185051040, + "router_z_loss_mlp": 0.38452148, + "step": 2226, + "time_per_iteration": 2.8575398921966553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086633, + "balance_loss_mlp": 1.04643595, + "epoch": 0.4284340130819546, + "flos": 429243052032.0, + "grad_norm": 0.05660062263134159, + "language_loss": 0.9185167, + "learning_rate": 0.0006383642965696353, + "loss": 0.92938304, + "num_input_tokens_seen": 185113552, + "router_z_loss_mlp": 0.40185547, + "step": 2227, + "time_per_iteration": 2.436495065689087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093242, + "balance_loss_mlp": 1.05240059, + "epoch": 0.42862639476721814, + "flos": 524732069376.0, + "grad_norm": 0.06503204597883332, + "language_loss": 0.82736492, + "learning_rate": 0.000638064893733645, + "loss": 0.83829737, + "num_input_tokens_seen": 185185056, + "router_z_loss_mlp": 0.40844727, + "step": 2228, + "time_per_iteration": 2.737835645675659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097834, + "balance_loss_mlp": 1.05937719, + "epoch": 0.42881877645248173, + "flos": 465089430528.0, + "grad_norm": 0.05835798065495767, + "language_loss": 0.90023828, + "learning_rate": 0.000637765437296109, + "loss": 0.91121662, + "num_input_tokens_seen": 185257248, + "router_z_loss_mlp": 0.38427734, + "step": 2229, + "time_per_iteration": 2.6694185733795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102575, + "balance_loss_mlp": 1.06383204, + "epoch": 0.42901115813774526, + "flos": 560034362880.0, + "grad_norm": 0.048777417646368525, + "language_loss": 0.85443366, + "learning_rate": 0.000637465927373287, + "loss": 0.86545944, + "num_input_tokens_seen": 185324800, + "router_z_loss_mlp": 0.38720703, + "step": 2230, + "time_per_iteration": 2.608868360519409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097095, + "balance_loss_mlp": 1.05942452, + "epoch": 0.42920353982300885, + "flos": 561186703872.0, + "grad_norm": 0.058529600310023314, + "language_loss": 0.78994036, + "learning_rate": 0.000637166364081459, + "loss": 0.80091131, + "num_input_tokens_seen": 185393408, + "router_z_loss_mlp": 0.37670898, + "step": 2231, + "time_per_iteration": 2.6343741416931152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109752, + "balance_loss_mlp": 1.06089842, + "epoch": 0.42939592150827244, + "flos": 555982230528.0, + "grad_norm": 0.06635954042372831, + "language_loss": 0.84122705, + "learning_rate": 0.0006368667475369256, + "loss": 0.8522023, + "num_input_tokens_seen": 185467968, + "router_z_loss_mlp": 0.36621094, + "step": 2232, + "time_per_iteration": 2.719153881072998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01385097, + "balance_loss_mlp": 1.36373484, + "epoch": 0.42958830319353597, + "flos": 1520796902400.0, + "grad_norm": 0.10507214536659652, + "language_loss": 0.78527778, + "learning_rate": 0.0006365670778560084, + "loss": 0.79912877, + "num_input_tokens_seen": 185705232, + "router_z_loss_mlp": 0.21386719, + "step": 2233, + "time_per_iteration": 4.869459390640259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222541, + "balance_loss_mlp": 1.20547056, + "epoch": 0.42978068487879956, + "flos": 1495052522496.0, + "grad_norm": 0.06278147410173565, + "language_loss": 0.78895426, + "learning_rate": 0.0006362673551550494, + "loss": 0.80117965, + "num_input_tokens_seen": 185932672, + "router_z_loss_mlp": 0.17089844, + "step": 2234, + "time_per_iteration": 4.809493780136108 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101654, + "balance_loss_mlp": 1.06386471, + "epoch": 0.4299730665640631, + "flos": 546725624832.0, + "grad_norm": 0.047028007384334866, + "language_loss": 0.86220634, + "learning_rate": 0.0006359675795504112, + "loss": 0.87322283, + "num_input_tokens_seen": 186006288, + "router_z_loss_mlp": 0.37744141, + "step": 2235, + "time_per_iteration": 2.644548177719116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104888, + "balance_loss_mlp": 1.06671751, + "epoch": 0.4301654482493267, + "flos": 1128839473152.0, + "grad_norm": 0.053864842268977364, + "language_loss": 0.7475214, + "learning_rate": 0.0006356677511584775, + "loss": 0.75857025, + "num_input_tokens_seen": 186097168, + "router_z_loss_mlp": 0.38134766, + "step": 2236, + "time_per_iteration": 3.473637580871582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104941, + "balance_loss_mlp": 1.06784356, + "epoch": 0.4303578299345902, + "flos": 495502963200.0, + "grad_norm": 0.07035023985335077, + "language_loss": 0.8582648, + "learning_rate": 0.0006353678700956511, + "loss": 0.86931419, + "num_input_tokens_seen": 186163904, + "router_z_loss_mlp": 0.37084961, + "step": 2237, + "time_per_iteration": 2.5412683486938477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110161, + "balance_loss_mlp": 1.0728724, + "epoch": 0.4305502116198538, + "flos": 615472100352.0, + "grad_norm": 0.048926528615743585, + "language_loss": 0.83597398, + "learning_rate": 0.0006350679364783569, + "loss": 0.84707558, + "num_input_tokens_seen": 186233888, + "router_z_loss_mlp": 0.37255859, + "step": 2238, + "time_per_iteration": 2.7351441383361816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108038, + "balance_loss_mlp": 1.0704397, + "epoch": 0.4307425933051173, + "flos": 558995503104.0, + "grad_norm": 0.05635941331688695, + "language_loss": 0.85586011, + "learning_rate": 0.0006347679504230393, + "loss": 0.8669405, + "num_input_tokens_seen": 186301168, + "router_z_loss_mlp": 0.37573242, + "step": 2239, + "time_per_iteration": 2.628014326095581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108584, + "balance_loss_mlp": 1.06981754, + "epoch": 0.4309349749903809, + "flos": 971755163136.0, + "grad_norm": 0.06390031403556296, + "language_loss": 0.75844669, + "learning_rate": 0.0006344679120461632, + "loss": 0.76953256, + "num_input_tokens_seen": 186392096, + "router_z_loss_mlp": 0.38745117, + "step": 2240, + "time_per_iteration": 3.325970411300659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099219, + "balance_loss_mlp": 1.06123924, + "epoch": 0.4311273566756445, + "flos": 541663746048.0, + "grad_norm": 0.07957466882071795, + "language_loss": 0.79994094, + "learning_rate": 0.0006341678214642134, + "loss": 0.81093317, + "num_input_tokens_seen": 186458000, + "router_z_loss_mlp": 0.37963867, + "step": 2241, + "time_per_iteration": 2.598954916000366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098329, + "balance_loss_mlp": 1.06118321, + "epoch": 0.43131973836090803, + "flos": 761316627456.0, + "grad_norm": 0.06316124390987561, + "language_loss": 0.82909411, + "learning_rate": 0.0006338676787936963, + "loss": 0.8400774, + "num_input_tokens_seen": 186544992, + "router_z_loss_mlp": 0.37133789, + "step": 2242, + "time_per_iteration": 3.057990074157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092454, + "balance_loss_mlp": 1.0547359, + "epoch": 0.4315121200461716, + "flos": 554263893504.0, + "grad_norm": 0.058630582948494374, + "language_loss": 0.83799654, + "learning_rate": 0.0006335674841511367, + "loss": 0.84892106, + "num_input_tokens_seen": 186614960, + "router_z_loss_mlp": 0.37670898, + "step": 2243, + "time_per_iteration": 2.667917490005493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152602, + "balance_loss_mlp": 1.1380111, + "epoch": 0.43170450173143515, + "flos": 1484499419136.0, + "grad_norm": 0.03105866471095203, + "language_loss": 0.7918117, + "learning_rate": 0.000633267237653081, + "loss": 0.80333769, + "num_input_tokens_seen": 186854288, + "router_z_loss_mlp": 0.14550781, + "step": 2244, + "time_per_iteration": 4.996346473693848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147416, + "balance_loss_mlp": 1.13225269, + "epoch": 0.43189688341669874, + "flos": 1472897433600.0, + "grad_norm": 0.02634625536346193, + "language_loss": 0.77365553, + "learning_rate": 0.0006329669394160953, + "loss": 0.78512967, + "num_input_tokens_seen": 187090272, + "router_z_loss_mlp": 0.15136719, + "step": 2245, + "time_per_iteration": 4.925641775131226 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090243, + "balance_loss_mlp": 1.05293071, + "epoch": 0.43208926510196227, + "flos": 492677365248.0, + "grad_norm": 0.04832922480589342, + "language_loss": 0.82476389, + "learning_rate": 0.0006326665895567652, + "loss": 0.83566636, + "num_input_tokens_seen": 187157584, + "router_z_loss_mlp": 0.37304688, + "step": 2246, + "time_per_iteration": 2.6338651180267334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108663, + "balance_loss_mlp": 1.04876888, + "epoch": 0.43228164678722586, + "flos": 519969936384.0, + "grad_norm": 0.06353903654252775, + "language_loss": 0.86891162, + "learning_rate": 0.0006323661881916976, + "loss": 0.87977791, + "num_input_tokens_seen": 187229408, + "router_z_loss_mlp": 0.37841797, + "step": 2247, + "time_per_iteration": 2.7270143032073975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088946, + "balance_loss_mlp": 1.05082273, + "epoch": 0.4324740284724894, + "flos": 795722655744.0, + "grad_norm": 0.06655581665723238, + "language_loss": 0.81039822, + "learning_rate": 0.0006320657354375179, + "loss": 0.82128775, + "num_input_tokens_seen": 187304384, + "router_z_loss_mlp": 0.38134766, + "step": 2248, + "time_per_iteration": 2.9334113597869873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090387, + "balance_loss_mlp": 1.05183434, + "epoch": 0.432666410157753, + "flos": 481917800448.0, + "grad_norm": 0.05858711608638651, + "language_loss": 0.87308645, + "learning_rate": 0.0006317652314108726, + "loss": 0.88399029, + "num_input_tokens_seen": 187368064, + "router_z_loss_mlp": 0.38500977, + "step": 2249, + "time_per_iteration": 2.5155436992645264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083427, + "balance_loss_mlp": 1.04508948, + "epoch": 0.43285879184301657, + "flos": 499963940352.0, + "grad_norm": 0.06176153995331203, + "language_loss": 0.91197717, + "learning_rate": 0.0006314646762284277, + "loss": 0.92281145, + "num_input_tokens_seen": 187436320, + "router_z_loss_mlp": 0.38305664, + "step": 2250, + "time_per_iteration": 2.5938589572906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151081, + "balance_loss_mlp": 1.13324702, + "epoch": 0.4330511735282801, + "flos": 1509615346176.0, + "grad_norm": 0.03602865793169688, + "language_loss": 0.75425828, + "learning_rate": 0.0006311640700068691, + "loss": 0.76576912, + "num_input_tokens_seen": 187670912, + "router_z_loss_mlp": 0.17871094, + "step": 2251, + "time_per_iteration": 4.858763217926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082209, + "balance_loss_mlp": 1.04322791, + "epoch": 0.4332435552135437, + "flos": 699270382080.0, + "grad_norm": 0.07106828010915285, + "language_loss": 0.77364099, + "learning_rate": 0.0006308634128629022, + "loss": 0.78446311, + "num_input_tokens_seen": 187746432, + "router_z_loss_mlp": 0.3894043, + "step": 2252, + "time_per_iteration": 2.857311487197876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081116, + "balance_loss_mlp": 1.04163396, + "epoch": 0.4334359368988072, + "flos": 591995934720.0, + "grad_norm": 0.05494240381392999, + "language_loss": 0.87411273, + "learning_rate": 0.0006305627049132531, + "loss": 0.88492393, + "num_input_tokens_seen": 187820032, + "router_z_loss_mlp": 0.39453125, + "step": 2253, + "time_per_iteration": 2.7931392192840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074672, + "balance_loss_mlp": 1.03628647, + "epoch": 0.4336283185840708, + "flos": 842440670208.0, + "grad_norm": 0.045544810523015906, + "language_loss": 0.85602796, + "learning_rate": 0.0006302619462746662, + "loss": 0.86677468, + "num_input_tokens_seen": 187904400, + "router_z_loss_mlp": 0.38330078, + "step": 2254, + "time_per_iteration": 3.137031078338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072053, + "balance_loss_mlp": 1.03521752, + "epoch": 0.43382070026933434, + "flos": 625974179328.0, + "grad_norm": 0.05597321467051534, + "language_loss": 0.90273923, + "learning_rate": 0.0006299611370639069, + "loss": 0.91345972, + "num_input_tokens_seen": 187973264, + "router_z_loss_mlp": 0.36816406, + "step": 2255, + "time_per_iteration": 2.7370500564575195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078678, + "balance_loss_mlp": 1.04029226, + "epoch": 0.4340130819545979, + "flos": 590837801472.0, + "grad_norm": 0.05249156720482198, + "language_loss": 0.7960273, + "learning_rate": 0.0006296602773977593, + "loss": 0.80681407, + "num_input_tokens_seen": 188039984, + "router_z_loss_mlp": 0.38354492, + "step": 2256, + "time_per_iteration": 2.671543836593628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082288, + "balance_loss_mlp": 1.04387856, + "epoch": 0.4342054636398615, + "flos": 490624376832.0, + "grad_norm": 0.047941706130753194, + "language_loss": 0.87283635, + "learning_rate": 0.0006293593673930277, + "loss": 0.88365924, + "num_input_tokens_seen": 188113456, + "router_z_loss_mlp": 0.3840332, + "step": 2257, + "time_per_iteration": 2.622807741165161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084566, + "balance_loss_mlp": 1.04694366, + "epoch": 0.43439784532512504, + "flos": 698679654912.0, + "grad_norm": 0.05256563639723818, + "language_loss": 0.78625226, + "learning_rate": 0.0006290584071665358, + "loss": 0.79709792, + "num_input_tokens_seen": 188192480, + "router_z_loss_mlp": 0.3762207, + "step": 2258, + "time_per_iteration": 2.8814268112182617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084583, + "balance_loss_mlp": 1.0463171, + "epoch": 0.43459022701038863, + "flos": 485581436928.0, + "grad_norm": 0.05582719483060078, + "language_loss": 0.82315511, + "learning_rate": 0.0006287573968351266, + "loss": 0.83400095, + "num_input_tokens_seen": 188258784, + "router_z_loss_mlp": 0.38256836, + "step": 2259, + "time_per_iteration": 2.530107259750366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093075, + "balance_loss_mlp": 1.05585814, + "epoch": 0.43478260869565216, + "flos": 642818515968.0, + "grad_norm": 0.06362082652150813, + "language_loss": 0.82416236, + "learning_rate": 0.0006284563365156626, + "loss": 0.83509314, + "num_input_tokens_seen": 188331312, + "router_z_loss_mlp": 0.37182617, + "step": 2260, + "time_per_iteration": 2.798595905303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088803, + "balance_loss_mlp": 1.05103791, + "epoch": 0.43497499038091575, + "flos": 425870396928.0, + "grad_norm": 0.05655312611086985, + "language_loss": 0.87709838, + "learning_rate": 0.0006281552263250261, + "loss": 0.88798642, + "num_input_tokens_seen": 188393712, + "router_z_loss_mlp": 0.37719727, + "step": 2261, + "time_per_iteration": 2.452665090560913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160927, + "balance_loss_mlp": 1.14223516, + "epoch": 0.4351673720661793, + "flos": 1537594748928.0, + "grad_norm": 0.04176446008295971, + "language_loss": 0.80691534, + "learning_rate": 0.000627854066380118, + "loss": 0.8185246, + "num_input_tokens_seen": 188621152, + "router_z_loss_mlp": 0.18652344, + "step": 2262, + "time_per_iteration": 4.821255207061768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101716, + "balance_loss_mlp": 1.0650475, + "epoch": 0.43535975375144287, + "flos": 748828551168.0, + "grad_norm": 0.06957692587484587, + "language_loss": 0.81302369, + "learning_rate": 0.0006275528567978593, + "loss": 0.82404089, + "num_input_tokens_seen": 188697120, + "router_z_loss_mlp": 0.36669922, + "step": 2263, + "time_per_iteration": 2.9021594524383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105224, + "balance_loss_mlp": 1.06710052, + "epoch": 0.4355521354367064, + "flos": 860914593792.0, + "grad_norm": 0.05359116837259303, + "language_loss": 0.8251968, + "learning_rate": 0.0006272515976951898, + "loss": 0.83624899, + "num_input_tokens_seen": 188778480, + "router_z_loss_mlp": 0.38134766, + "step": 2264, + "time_per_iteration": 3.051140546798706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100567, + "balance_loss_mlp": 1.06160915, + "epoch": 0.43574451712197, + "flos": 734200146432.0, + "grad_norm": 0.04085362180640218, + "language_loss": 0.79003727, + "learning_rate": 0.0006269502891890687, + "loss": 0.80104291, + "num_input_tokens_seen": 188863616, + "router_z_loss_mlp": 0.38916016, + "step": 2265, + "time_per_iteration": 2.987435817718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109612, + "balance_loss_mlp": 1.05899858, + "epoch": 0.4359368988072336, + "flos": 570296332800.0, + "grad_norm": 0.04646658934269887, + "language_loss": 0.88059056, + "learning_rate": 0.0006266489313964743, + "loss": 0.89155173, + "num_input_tokens_seen": 188933984, + "router_z_loss_mlp": 0.37109375, + "step": 2266, + "time_per_iteration": 2.718259572982788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098621, + "balance_loss_mlp": 1.06040287, + "epoch": 0.4361292804924971, + "flos": 555244526592.0, + "grad_norm": 0.06168340797293566, + "language_loss": 0.85241735, + "learning_rate": 0.0006263475244344041, + "loss": 0.86340356, + "num_input_tokens_seen": 189012976, + "router_z_loss_mlp": 0.38183594, + "step": 2267, + "time_per_iteration": 2.822174072265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099998, + "balance_loss_mlp": 1.06232774, + "epoch": 0.4363216621777607, + "flos": 557021090304.0, + "grad_norm": 0.06545155195827496, + "language_loss": 0.84663981, + "learning_rate": 0.0006260460684198746, + "loss": 0.85763973, + "num_input_tokens_seen": 189079664, + "router_z_loss_mlp": 0.37646484, + "step": 2268, + "time_per_iteration": 2.652629852294922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092951, + "balance_loss_mlp": 1.05556679, + "epoch": 0.4365140438630242, + "flos": 477979149312.0, + "grad_norm": 0.06144025960698331, + "language_loss": 0.84485406, + "learning_rate": 0.0006257445634699213, + "loss": 0.85578358, + "num_input_tokens_seen": 189144688, + "router_z_loss_mlp": 0.3737793, + "step": 2269, + "time_per_iteration": 2.526547431945801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091306, + "balance_loss_mlp": 1.05506659, + "epoch": 0.4367064255482878, + "flos": 578646498816.0, + "grad_norm": 0.047950904811088546, + "language_loss": 0.82840669, + "learning_rate": 0.0006254430097015993, + "loss": 0.83931977, + "num_input_tokens_seen": 189213984, + "router_z_loss_mlp": 0.36279297, + "step": 2270, + "time_per_iteration": 2.6397740840911865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121077, + "balance_loss_mlp": 1.1094898, + "epoch": 0.43689880723355135, + "flos": 1458117669888.0, + "grad_norm": 0.029995875979849037, + "language_loss": 0.76479089, + "learning_rate": 0.0006251414072319815, + "loss": 0.77600169, + "num_input_tokens_seen": 189434416, + "router_z_loss_mlp": 0.11572266, + "step": 2271, + "time_per_iteration": 4.781012535095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093451, + "balance_loss_mlp": 1.0559721, + "epoch": 0.43709118891881493, + "flos": 667295663616.0, + "grad_norm": 0.05579821190743498, + "language_loss": 0.85169244, + "learning_rate": 0.0006248397561781609, + "loss": 0.86262697, + "num_input_tokens_seen": 189513248, + "router_z_loss_mlp": 0.37426758, + "step": 2272, + "time_per_iteration": 2.8750343322753906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109427, + "balance_loss_mlp": 1.05617118, + "epoch": 0.43728357060407846, + "flos": 544612999680.0, + "grad_norm": 0.06638881020832643, + "language_loss": 0.86299849, + "learning_rate": 0.0006245380566572482, + "loss": 0.87394118, + "num_input_tokens_seen": 189585392, + "router_z_loss_mlp": 0.38085938, + "step": 2273, + "time_per_iteration": 2.667287826538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095571, + "balance_loss_mlp": 1.05873561, + "epoch": 0.43747595228934205, + "flos": 746504930304.0, + "grad_norm": 0.06509502789500103, + "language_loss": 0.75652242, + "learning_rate": 0.0006242363087863744, + "loss": 0.76747811, + "num_input_tokens_seen": 189667552, + "router_z_loss_mlp": 0.36816406, + "step": 2274, + "time_per_iteration": 2.948168992996216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088988, + "balance_loss_mlp": 1.05060267, + "epoch": 0.43766833397460564, + "flos": 631060789248.0, + "grad_norm": 0.0773983629565932, + "language_loss": 0.85681164, + "learning_rate": 0.0006239345126826878, + "loss": 0.86770147, + "num_input_tokens_seen": 189742048, + "router_z_loss_mlp": 0.38354492, + "step": 2275, + "time_per_iteration": 2.7522637844085693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084857, + "balance_loss_mlp": 1.04682946, + "epoch": 0.43786071565986917, + "flos": 530709152256.0, + "grad_norm": 0.05397848209837344, + "language_loss": 0.84028137, + "learning_rate": 0.0006236326684633561, + "loss": 0.85112989, + "num_input_tokens_seen": 189817968, + "router_z_loss_mlp": 0.37988281, + "step": 2276, + "time_per_iteration": 2.8013172149658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083155, + "balance_loss_mlp": 1.04479384, + "epoch": 0.43805309734513276, + "flos": 538295473152.0, + "grad_norm": 0.057720697432170794, + "language_loss": 0.74613291, + "learning_rate": 0.0006233307762455658, + "loss": 0.75696445, + "num_input_tokens_seen": 189882608, + "router_z_loss_mlp": 0.38354492, + "step": 2277, + "time_per_iteration": 4.090092658996582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088607, + "balance_loss_mlp": 1.05057979, + "epoch": 0.4382454790303963, + "flos": 864188324352.0, + "grad_norm": 0.052083504639934525, + "language_loss": 0.83232701, + "learning_rate": 0.0006230288361465216, + "loss": 0.84321308, + "num_input_tokens_seen": 189960608, + "router_z_loss_mlp": 0.37988281, + "step": 2278, + "time_per_iteration": 3.0360679626464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092026, + "balance_loss_mlp": 1.05368817, + "epoch": 0.4384378607156599, + "flos": 765175292928.0, + "grad_norm": 0.0765632057362916, + "language_loss": 0.85051048, + "learning_rate": 0.0006227268482834473, + "loss": 0.86143076, + "num_input_tokens_seen": 190035472, + "router_z_loss_mlp": 0.38305664, + "step": 2279, + "time_per_iteration": 2.875603437423706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092125, + "balance_loss_mlp": 1.05369186, + "epoch": 0.4386302424009234, + "flos": 668260329984.0, + "grad_norm": 0.06746087226793605, + "language_loss": 0.87309432, + "learning_rate": 0.000622424812773585, + "loss": 0.88401562, + "num_input_tokens_seen": 190109312, + "router_z_loss_mlp": 0.3840332, + "step": 2280, + "time_per_iteration": 2.815737724304199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091003, + "balance_loss_mlp": 1.05335641, + "epoch": 0.438822624086187, + "flos": 484941247488.0, + "grad_norm": 0.06660247150401381, + "language_loss": 0.7952022, + "learning_rate": 0.000622122729734195, + "loss": 0.80611223, + "num_input_tokens_seen": 190174176, + "router_z_loss_mlp": 0.3762207, + "step": 2281, + "time_per_iteration": 2.528907060623169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010937, + "balance_loss_mlp": 1.05653024, + "epoch": 0.4390150057714506, + "flos": 498959986176.0, + "grad_norm": 0.07198447175498815, + "language_loss": 0.87400854, + "learning_rate": 0.0006218205992825566, + "loss": 0.88494551, + "num_input_tokens_seen": 190243888, + "router_z_loss_mlp": 0.37158203, + "step": 2282, + "time_per_iteration": 2.6437437534332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086909, + "balance_loss_mlp": 1.04895234, + "epoch": 0.4392073874567141, + "flos": 557937704448.0, + "grad_norm": 0.0537918663445124, + "language_loss": 0.81690598, + "learning_rate": 0.0006215184215359671, + "loss": 0.82777506, + "num_input_tokens_seen": 190317504, + "router_z_loss_mlp": 0.37939453, + "step": 2283, + "time_per_iteration": 2.7374680042266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082056, + "balance_loss_mlp": 1.04531598, + "epoch": 0.4393997691419777, + "flos": 605028248064.0, + "grad_norm": 0.053438963610997155, + "language_loss": 0.86718416, + "learning_rate": 0.0006212161966117425, + "loss": 0.87800473, + "num_input_tokens_seen": 190390160, + "router_z_loss_mlp": 0.36743164, + "step": 2284, + "time_per_iteration": 2.7031607627868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082719, + "balance_loss_mlp": 1.04476333, + "epoch": 0.43959215082724123, + "flos": 803812363776.0, + "grad_norm": 0.05414488390239245, + "language_loss": 0.81261152, + "learning_rate": 0.0006209139246272164, + "loss": 0.8234387, + "num_input_tokens_seen": 190467600, + "router_z_loss_mlp": 0.37915039, + "step": 2285, + "time_per_iteration": 2.942938804626465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081504, + "balance_loss_mlp": 1.04354775, + "epoch": 0.4397845325125048, + "flos": 487403080704.0, + "grad_norm": 0.06213580776851028, + "language_loss": 0.8193686, + "learning_rate": 0.0006206116056997421, + "loss": 0.83018363, + "num_input_tokens_seen": 190534192, + "router_z_loss_mlp": 0.37939453, + "step": 2286, + "time_per_iteration": 2.549246072769165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083431, + "balance_loss_mlp": 1.04671431, + "epoch": 0.43997691419776835, + "flos": 480569020416.0, + "grad_norm": 0.047189645190622125, + "language_loss": 0.82737786, + "learning_rate": 0.0006203092399466892, + "loss": 0.83821213, + "num_input_tokens_seen": 190601440, + "router_z_loss_mlp": 0.36694336, + "step": 2287, + "time_per_iteration": 2.533667802810669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079141, + "balance_loss_mlp": 1.04259157, + "epoch": 0.44016929588303194, + "flos": 482873702400.0, + "grad_norm": 0.04521232958061075, + "language_loss": 0.85280973, + "learning_rate": 0.0006200068274854473, + "loss": 0.86360115, + "num_input_tokens_seen": 190672528, + "router_z_loss_mlp": 0.36523438, + "step": 2288, + "time_per_iteration": 2.6336212158203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087714, + "balance_loss_mlp": 1.05013943, + "epoch": 0.4403616775682955, + "flos": 571562155008.0, + "grad_norm": 0.04238785738832165, + "language_loss": 0.85822582, + "learning_rate": 0.0006197043684334229, + "loss": 0.86910295, + "num_input_tokens_seen": 190750704, + "router_z_loss_mlp": 0.37548828, + "step": 2289, + "time_per_iteration": 2.7420616149902344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108734, + "balance_loss_mlp": 1.05028939, + "epoch": 0.44055405925355906, + "flos": 630563194368.0, + "grad_norm": 0.0573866619632787, + "language_loss": 0.79627317, + "learning_rate": 0.0006194018629080411, + "loss": 0.80714655, + "num_input_tokens_seen": 190821664, + "router_z_loss_mlp": 0.37036133, + "step": 2290, + "time_per_iteration": 2.7804791927337646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108993, + "balance_loss_mlp": 1.0514729, + "epoch": 0.44074644093882265, + "flos": 536523291648.0, + "grad_norm": 0.052070709818396434, + "language_loss": 0.81445479, + "learning_rate": 0.0006190993110267451, + "loss": 0.82535404, + "num_input_tokens_seen": 190893888, + "router_z_loss_mlp": 0.38427734, + "step": 2291, + "time_per_iteration": 2.6991255283355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079676, + "balance_loss_mlp": 1.04317451, + "epoch": 0.4409388226240862, + "flos": 462995744256.0, + "grad_norm": 0.05365602748785357, + "language_loss": 0.84155387, + "learning_rate": 0.0006187967129069958, + "loss": 0.85235059, + "num_input_tokens_seen": 190956800, + "router_z_loss_mlp": 0.36523438, + "step": 2292, + "time_per_iteration": 2.558609962463379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082437, + "balance_loss_mlp": 1.04569674, + "epoch": 0.44113120430934977, + "flos": 565717492224.0, + "grad_norm": 0.05065606510830679, + "language_loss": 0.87013716, + "learning_rate": 0.0006184940686662722, + "loss": 0.88096148, + "num_input_tokens_seen": 191032048, + "router_z_loss_mlp": 0.36743164, + "step": 2293, + "time_per_iteration": 2.753314733505249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078141, + "balance_loss_mlp": 1.04125786, + "epoch": 0.4413235859946133, + "flos": 543313681920.0, + "grad_norm": 0.05240936044313176, + "language_loss": 0.89929485, + "learning_rate": 0.0006181913784220714, + "loss": 0.91007626, + "num_input_tokens_seen": 191099952, + "router_z_loss_mlp": 0.36865234, + "step": 2294, + "time_per_iteration": 2.6420986652374268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111961, + "balance_loss_mlp": 1.09889555, + "epoch": 0.4415159676798769, + "flos": 1569016465920.0, + "grad_norm": 0.03544098021349555, + "language_loss": 0.80553782, + "learning_rate": 0.0006178886422919078, + "loss": 0.81665742, + "num_input_tokens_seen": 191335968, + "router_z_loss_mlp": 0.13085938, + "step": 2295, + "time_per_iteration": 4.864506483078003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085965, + "balance_loss_mlp": 1.04831886, + "epoch": 0.4417083493651404, + "flos": 658423171584.0, + "grad_norm": 0.06256258413724265, + "language_loss": 0.79847091, + "learning_rate": 0.0006175858603933146, + "loss": 0.80933058, + "num_input_tokens_seen": 191410112, + "router_z_loss_mlp": 0.3762207, + "step": 2296, + "time_per_iteration": 2.8739333152770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079624, + "balance_loss_mlp": 1.04328871, + "epoch": 0.441900731050404, + "flos": 740119002624.0, + "grad_norm": 0.05454759239937102, + "language_loss": 0.80644178, + "learning_rate": 0.0006172830328438416, + "loss": 0.81723803, + "num_input_tokens_seen": 191491552, + "router_z_loss_mlp": 0.36352539, + "step": 2297, + "time_per_iteration": 2.9661777019500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082208, + "balance_loss_mlp": 1.0437274, + "epoch": 0.44209311273566754, + "flos": 539153860608.0, + "grad_norm": 0.05386131456834753, + "language_loss": 0.87081188, + "learning_rate": 0.0006169801597610572, + "loss": 0.88163394, + "num_input_tokens_seen": 191567872, + "router_z_loss_mlp": 0.38452148, + "step": 2298, + "time_per_iteration": 2.732304573059082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107238, + "balance_loss_mlp": 1.03604531, + "epoch": 0.4422854944209311, + "flos": 621335702016.0, + "grad_norm": 0.07013675434202182, + "language_loss": 0.89663231, + "learning_rate": 0.0006166772412625469, + "loss": 0.90735614, + "num_input_tokens_seen": 191638032, + "router_z_loss_mlp": 0.36328125, + "step": 2299, + "time_per_iteration": 2.70890736579895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075195, + "balance_loss_mlp": 1.03793061, + "epoch": 0.4424778761061947, + "flos": 658516303872.0, + "grad_norm": 0.06419018913135732, + "language_loss": 0.81816357, + "learning_rate": 0.0006163742774659141, + "loss": 0.8289156, + "num_input_tokens_seen": 191709104, + "router_z_loss_mlp": 0.37255859, + "step": 2300, + "time_per_iteration": 2.830306053161621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081661, + "balance_loss_mlp": 1.0454216, + "epoch": 0.44267025779145824, + "flos": 568297188864.0, + "grad_norm": 0.05261241955347018, + "language_loss": 0.85695601, + "learning_rate": 0.0006160712684887801, + "loss": 0.86777264, + "num_input_tokens_seen": 191787072, + "router_z_loss_mlp": 0.36279297, + "step": 2301, + "time_per_iteration": 2.7931785583496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010826, + "balance_loss_mlp": 1.04600239, + "epoch": 0.44286263947672183, + "flos": 496469039616.0, + "grad_norm": 0.05340137710748247, + "language_loss": 0.81907189, + "learning_rate": 0.0006157682144487832, + "loss": 0.82989788, + "num_input_tokens_seen": 191863040, + "router_z_loss_mlp": 0.36572266, + "step": 2302, + "time_per_iteration": 2.7355551719665527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108501, + "balance_loss_mlp": 1.04793596, + "epoch": 0.44305502116198536, + "flos": 609096347136.0, + "grad_norm": 0.060309070663334345, + "language_loss": 0.82788789, + "learning_rate": 0.0006154651154635793, + "loss": 0.83873796, + "num_input_tokens_seen": 191940352, + "router_z_loss_mlp": 0.37084961, + "step": 2303, + "time_per_iteration": 2.8048007488250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088624, + "balance_loss_mlp": 1.05150199, + "epoch": 0.44324740284724895, + "flos": 470558744064.0, + "grad_norm": 0.05169590776144269, + "language_loss": 0.84867418, + "learning_rate": 0.0006151619716508421, + "loss": 0.85956049, + "num_input_tokens_seen": 192006896, + "router_z_loss_mlp": 0.37084961, + "step": 2304, + "time_per_iteration": 2.5419833660125732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087828, + "balance_loss_mlp": 1.05046785, + "epoch": 0.4434397845325125, + "flos": 578454441984.0, + "grad_norm": 0.05720417651641939, + "language_loss": 0.86974978, + "learning_rate": 0.0006148587831282625, + "loss": 0.88062799, + "num_input_tokens_seen": 192075312, + "router_z_loss_mlp": 0.37353516, + "step": 2305, + "time_per_iteration": 2.689751386642456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055473, + "balance_loss_mlp": 1.04326594, + "epoch": 0.44363216621777607, + "flos": 1495765343232.0, + "grad_norm": 0.012762307031937271, + "language_loss": 0.79176068, + "learning_rate": 0.0006145555500135483, + "loss": 0.80231541, + "num_input_tokens_seen": 192304816, + "router_z_loss_mlp": 0.12207031, + "step": 2306, + "time_per_iteration": 4.886535406112671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092659, + "balance_loss_mlp": 1.05699158, + "epoch": 0.44382454790303966, + "flos": 477082884096.0, + "grad_norm": 0.06286570611305137, + "language_loss": 0.86913157, + "learning_rate": 0.0006142522724244255, + "loss": 0.88005817, + "num_input_tokens_seen": 192369232, + "router_z_loss_mlp": 0.35693359, + "step": 2307, + "time_per_iteration": 2.499870777130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054077, + "balance_loss_mlp": 1.04177487, + "epoch": 0.4440169295883032, + "flos": 1543321548288.0, + "grad_norm": 0.013017387525484581, + "language_loss": 0.76484716, + "learning_rate": 0.0006139489504786368, + "loss": 0.775388, + "num_input_tokens_seen": 192600176, + "router_z_loss_mlp": 0.12255859, + "step": 2308, + "time_per_iteration": 4.8646886348724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087414, + "balance_loss_mlp": 1.05115092, + "epoch": 0.4442093112735668, + "flos": 590789749248.0, + "grad_norm": 0.050195382328210664, + "language_loss": 0.77274799, + "learning_rate": 0.000613645584293942, + "loss": 0.78362215, + "num_input_tokens_seen": 192675424, + "router_z_loss_mlp": 0.36279297, + "step": 2309, + "time_per_iteration": 2.877244472503662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087256, + "balance_loss_mlp": 1.05056334, + "epoch": 0.4444016929588303, + "flos": 530009326080.0, + "grad_norm": 0.047114011401622066, + "language_loss": 0.83068305, + "learning_rate": 0.0006133421739881185, + "loss": 0.8415556, + "num_input_tokens_seen": 192747552, + "router_z_loss_mlp": 0.36694336, + "step": 2310, + "time_per_iteration": 2.667240858078003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081979, + "balance_loss_mlp": 1.04557252, + "epoch": 0.4445940746440939, + "flos": 619947634176.0, + "grad_norm": 0.055208144480819774, + "language_loss": 0.82587862, + "learning_rate": 0.0006130387196789605, + "loss": 0.83669835, + "num_input_tokens_seen": 192819984, + "router_z_loss_mlp": 0.36425781, + "step": 2311, + "time_per_iteration": 2.7925667762756348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082597, + "balance_loss_mlp": 1.04704881, + "epoch": 0.4447864563293574, + "flos": 628782248448.0, + "grad_norm": 0.049856185775691036, + "language_loss": 0.83914995, + "learning_rate": 0.0006127352214842795, + "loss": 0.84997582, + "num_input_tokens_seen": 192906080, + "router_z_loss_mlp": 0.35571289, + "step": 2312, + "time_per_iteration": 2.9495813846588135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079657, + "balance_loss_mlp": 1.04236865, + "epoch": 0.444978838014621, + "flos": 650548841472.0, + "grad_norm": 0.0527905378587152, + "language_loss": 0.85049295, + "learning_rate": 0.0006124316795219041, + "loss": 0.8612895, + "num_input_tokens_seen": 192972336, + "router_z_loss_mlp": 0.37255859, + "step": 2313, + "time_per_iteration": 2.760117769241333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077597, + "balance_loss_mlp": 1.04119062, + "epoch": 0.44517121969988455, + "flos": 612153289728.0, + "grad_norm": 0.047764928605774304, + "language_loss": 0.82297838, + "learning_rate": 0.0006121280939096794, + "loss": 0.8337543, + "num_input_tokens_seen": 193045744, + "router_z_loss_mlp": 0.36401367, + "step": 2314, + "time_per_iteration": 2.737471580505371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075674, + "balance_loss_mlp": 1.0385046, + "epoch": 0.44536360138514813, + "flos": 488491402752.0, + "grad_norm": 0.07620217918322614, + "language_loss": 0.87685931, + "learning_rate": 0.000611824464765468, + "loss": 0.88761604, + "num_input_tokens_seen": 193115248, + "router_z_loss_mlp": 0.37133789, + "step": 2315, + "time_per_iteration": 2.5991926193237305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0103119, + "balance_loss_mlp": 1.01922143, + "epoch": 0.4455559830704117, + "flos": 1515425255424.0, + "grad_norm": 0.013293348061684912, + "language_loss": 0.78594941, + "learning_rate": 0.0006115207922071492, + "loss": 0.79626131, + "num_input_tokens_seen": 193330816, + "router_z_loss_mlp": 0.11962891, + "step": 2316, + "time_per_iteration": 4.652711391448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079239, + "balance_loss_mlp": 1.04335713, + "epoch": 0.44574836475567525, + "flos": 615314949120.0, + "grad_norm": 0.04747333782009751, + "language_loss": 0.85680878, + "learning_rate": 0.000611217076352619, + "loss": 0.86760116, + "num_input_tokens_seen": 193407616, + "router_z_loss_mlp": 0.35913086, + "step": 2317, + "time_per_iteration": 2.7729227542877197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077005, + "balance_loss_mlp": 1.04140949, + "epoch": 0.44594074644093884, + "flos": 506070471168.0, + "grad_norm": 0.2761075259266177, + "language_loss": 0.82980591, + "learning_rate": 0.0006109133173197905, + "loss": 0.84057599, + "num_input_tokens_seen": 193482624, + "router_z_loss_mlp": 0.35620117, + "step": 2318, + "time_per_iteration": 2.6684277057647705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087623, + "balance_loss_mlp": 1.05243218, + "epoch": 0.44613312812620237, + "flos": 726647321088.0, + "grad_norm": 0.057083346058123784, + "language_loss": 0.85251284, + "learning_rate": 0.0006106095152265935, + "loss": 0.86338907, + "num_input_tokens_seen": 193555952, + "router_z_loss_mlp": 0.35229492, + "step": 2319, + "time_per_iteration": 2.9197404384613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092234, + "balance_loss_mlp": 1.05604196, + "epoch": 0.44632550981146596, + "flos": 635419869696.0, + "grad_norm": 0.048967973341694476, + "language_loss": 0.8448627, + "learning_rate": 0.0006103056701909739, + "loss": 0.85578501, + "num_input_tokens_seen": 193636672, + "router_z_loss_mlp": 0.36230469, + "step": 2320, + "time_per_iteration": 2.885965347290039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101974, + "balance_loss_mlp": 1.06604421, + "epoch": 0.4465178914967295, + "flos": 826690447872.0, + "grad_norm": 0.04429440839494469, + "language_loss": 0.82779431, + "learning_rate": 0.0006100017823308956, + "loss": 0.83881408, + "num_input_tokens_seen": 193721728, + "router_z_loss_mlp": 0.35961914, + "step": 2321, + "time_per_iteration": 3.1523914337158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110877, + "balance_loss_mlp": 1.0737319, + "epoch": 0.4467102731819931, + "flos": 665532246528.0, + "grad_norm": 0.05773147459468349, + "language_loss": 0.79802787, + "learning_rate": 0.0006096978517643377, + "loss": 0.80913663, + "num_input_tokens_seen": 193795456, + "router_z_loss_mlp": 0.37158203, + "step": 2322, + "time_per_iteration": 2.8030614852905273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123327, + "balance_loss_mlp": 1.08668184, + "epoch": 0.4469026548672566, + "flos": 512692125696.0, + "grad_norm": 0.052696901781691036, + "language_loss": 0.83731532, + "learning_rate": 0.0006093938786092968, + "loss": 0.84854853, + "num_input_tokens_seen": 193865520, + "router_z_loss_mlp": 0.3659668, + "step": 2323, + "time_per_iteration": 2.6108593940734863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112741, + "balance_loss_mlp": 1.0761435, + "epoch": 0.4470950365525202, + "flos": 683774825472.0, + "grad_norm": 0.0683875942547517, + "language_loss": 0.89724207, + "learning_rate": 0.0006090898629837857, + "loss": 0.90836942, + "num_input_tokens_seen": 193935040, + "router_z_loss_mlp": 0.3659668, + "step": 2324, + "time_per_iteration": 2.8141510486602783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121866, + "balance_loss_mlp": 1.08515, + "epoch": 0.4472874182377838, + "flos": 627018831360.0, + "grad_norm": 0.05799026068482576, + "language_loss": 0.87375617, + "learning_rate": 0.0006087858050058337, + "loss": 0.88497484, + "num_input_tokens_seen": 194009120, + "router_z_loss_mlp": 0.3671875, + "step": 2325, + "time_per_iteration": 2.8174242973327637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106391, + "balance_loss_mlp": 1.07053268, + "epoch": 0.4474797999230473, + "flos": 546946795008.0, + "grad_norm": 0.06107345330372946, + "language_loss": 0.81985253, + "learning_rate": 0.0006084817047934866, + "loss": 0.8309164, + "num_input_tokens_seen": 194076672, + "router_z_loss_mlp": 0.35888672, + "step": 2326, + "time_per_iteration": 2.627870798110962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111871, + "balance_loss_mlp": 1.08211279, + "epoch": 0.4476721816083109, + "flos": 455585513472.0, + "grad_norm": 0.09021260210248909, + "language_loss": 0.89277744, + "learning_rate": 0.0006081775624648066, + "loss": 0.90396452, + "num_input_tokens_seen": 194142320, + "router_z_loss_mlp": 0.3659668, + "step": 2327, + "time_per_iteration": 2.517587900161743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107902, + "balance_loss_mlp": 1.07154357, + "epoch": 0.44786456329357444, + "flos": 481273228800.0, + "grad_norm": 0.05788938613905733, + "language_loss": 0.8277235, + "learning_rate": 0.0006078733781378721, + "loss": 0.83880252, + "num_input_tokens_seen": 194208560, + "router_z_loss_mlp": 0.36401367, + "step": 2328, + "time_per_iteration": 2.5216193199157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102085, + "balance_loss_mlp": 1.06579816, + "epoch": 0.448056944978838, + "flos": 551822409216.0, + "grad_norm": 0.05774471450654044, + "language_loss": 0.82095438, + "learning_rate": 0.0006075691519307781, + "loss": 0.83197522, + "num_input_tokens_seen": 194288080, + "router_z_loss_mlp": 0.36303711, + "step": 2329, + "time_per_iteration": 2.8394477367401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093614, + "balance_loss_mlp": 1.05551517, + "epoch": 0.44824932666410156, + "flos": 550571143680.0, + "grad_norm": 0.05485541452922095, + "language_loss": 0.82042563, + "learning_rate": 0.0006072648839616356, + "loss": 0.83136177, + "num_input_tokens_seen": 194358464, + "router_z_loss_mlp": 0.38061523, + "step": 2330, + "time_per_iteration": 2.650087594985962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089159, + "balance_loss_mlp": 1.05229926, + "epoch": 0.44844170834936514, + "flos": 988161541632.0, + "grad_norm": 0.0454185508799419, + "language_loss": 0.82814097, + "learning_rate": 0.0006069605743485718, + "loss": 0.83903253, + "num_input_tokens_seen": 194456112, + "router_z_loss_mlp": 0.3684082, + "step": 2331, + "time_per_iteration": 3.345179319381714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085878, + "balance_loss_mlp": 1.0494473, + "epoch": 0.44863409003462873, + "flos": 591040032768.0, + "grad_norm": 0.057018102026312835, + "language_loss": 0.83470714, + "learning_rate": 0.0006066562232097303, + "loss": 0.84556592, + "num_input_tokens_seen": 194526880, + "router_z_loss_mlp": 0.36425781, + "step": 2332, + "time_per_iteration": 2.7025153636932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089148, + "balance_loss_mlp": 1.0525744, + "epoch": 0.44882647171989226, + "flos": 724313525760.0, + "grad_norm": 0.055435808375502424, + "language_loss": 0.86104345, + "learning_rate": 0.0006063518306632708, + "loss": 0.87193495, + "num_input_tokens_seen": 194606800, + "router_z_loss_mlp": 0.36572266, + "step": 2333, + "time_per_iteration": 2.934469699859619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082178, + "balance_loss_mlp": 1.04465127, + "epoch": 0.44901885340515585, + "flos": 534662360064.0, + "grad_norm": 0.061394686563490536, + "language_loss": 0.82313985, + "learning_rate": 0.0006060473968273688, + "loss": 0.83396161, + "num_input_tokens_seen": 194679856, + "router_z_loss_mlp": 0.375, + "step": 2334, + "time_per_iteration": 2.6561286449432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139417, + "balance_loss_mlp": 1.12782979, + "epoch": 0.4492112350904194, + "flos": 1554456462336.0, + "grad_norm": 0.048192148717983975, + "language_loss": 0.77879542, + "learning_rate": 0.000605742921820216, + "loss": 0.79018956, + "num_input_tokens_seen": 194906320, + "router_z_loss_mlp": 0.11572266, + "step": 2335, + "time_per_iteration": 4.895314693450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092745, + "balance_loss_mlp": 1.08144426, + "epoch": 0.44940361677568297, + "flos": 1522525413888.0, + "grad_norm": 0.0355581806637232, + "language_loss": 0.81005216, + "learning_rate": 0.0006054384057600202, + "loss": 0.8209796, + "num_input_tokens_seen": 195129152, + "router_z_loss_mlp": 0.11279297, + "step": 2336, + "time_per_iteration": 4.86665940284729 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088054, + "balance_loss_mlp": 1.05064595, + "epoch": 0.4495959984609465, + "flos": 382289310720.0, + "grad_norm": 0.06064477802371089, + "language_loss": 0.88117951, + "learning_rate": 0.0006051338487650047, + "loss": 0.89206004, + "num_input_tokens_seen": 195189792, + "router_z_loss_mlp": 0.3737793, + "step": 2337, + "time_per_iteration": 2.4159162044525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086135, + "balance_loss_mlp": 1.04777336, + "epoch": 0.4497883801462101, + "flos": 497630145024.0, + "grad_norm": 0.058257925131248826, + "language_loss": 0.82456082, + "learning_rate": 0.0006048292509534095, + "loss": 0.83542222, + "num_input_tokens_seen": 195258640, + "router_z_loss_mlp": 0.38354492, + "step": 2338, + "time_per_iteration": 2.5835769176483154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081051, + "balance_loss_mlp": 1.04392958, + "epoch": 0.4499807618314736, + "flos": 614166990336.0, + "grad_norm": 0.053787147945734054, + "language_loss": 0.77580249, + "learning_rate": 0.0006045246124434895, + "loss": 0.78661299, + "num_input_tokens_seen": 195327984, + "router_z_loss_mlp": 0.37109375, + "step": 2339, + "time_per_iteration": 2.7258870601654053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080837, + "balance_loss_mlp": 1.04311895, + "epoch": 0.4501731435167372, + "flos": 1005122331648.0, + "grad_norm": 0.06446556175990359, + "language_loss": 0.86143219, + "learning_rate": 0.0006042199333535162, + "loss": 0.87224054, + "num_input_tokens_seen": 195409504, + "router_z_loss_mlp": 0.37695312, + "step": 2340, + "time_per_iteration": 3.2644054889678955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089021, + "balance_loss_mlp": 1.05132723, + "epoch": 0.4503655252020008, + "flos": 820519898112.0, + "grad_norm": 0.05440597484835576, + "language_loss": 0.8378191, + "learning_rate": 0.0006039152138017763, + "loss": 0.84870934, + "num_input_tokens_seen": 195489424, + "router_z_loss_mlp": 0.37695312, + "step": 2341, + "time_per_iteration": 3.0747756958007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082643, + "balance_loss_mlp": 1.04566467, + "epoch": 0.4505579068872643, + "flos": 486113937408.0, + "grad_norm": 0.06051531382505287, + "language_loss": 0.83470345, + "learning_rate": 0.0006036104539065726, + "loss": 0.84552985, + "num_input_tokens_seen": 195562128, + "router_z_loss_mlp": 0.36962891, + "step": 2342, + "time_per_iteration": 2.6581151485443115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076125, + "balance_loss_mlp": 1.03812099, + "epoch": 0.4507502885725279, + "flos": 884421282816.0, + "grad_norm": 0.05288539322407846, + "language_loss": 0.845487, + "learning_rate": 0.000603305653786223, + "loss": 0.85624826, + "num_input_tokens_seen": 195646800, + "router_z_loss_mlp": 0.37963867, + "step": 2343, + "time_per_iteration": 3.1298844814300537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079349, + "balance_loss_mlp": 1.04208446, + "epoch": 0.45094267025779144, + "flos": 578070328320.0, + "grad_norm": 0.04730162576611683, + "language_loss": 0.83859873, + "learning_rate": 0.0006030008135590622, + "loss": 0.84939224, + "num_input_tokens_seen": 195719648, + "router_z_loss_mlp": 0.37255859, + "step": 2344, + "time_per_iteration": 2.685067892074585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107533, + "balance_loss_mlp": 1.03799331, + "epoch": 0.45113505194305503, + "flos": 525124947456.0, + "grad_norm": 0.051192045733620226, + "language_loss": 0.80228901, + "learning_rate": 0.0006026959333434387, + "loss": 0.81304228, + "num_input_tokens_seen": 195794800, + "router_z_loss_mlp": 0.37353516, + "step": 2345, + "time_per_iteration": 2.783407688140869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107739, + "balance_loss_mlp": 1.04014897, + "epoch": 0.45132743362831856, + "flos": 501791376384.0, + "grad_norm": 0.05199160611628431, + "language_loss": 0.77699506, + "learning_rate": 0.0006023910132577181, + "loss": 0.78776896, + "num_input_tokens_seen": 195866848, + "router_z_loss_mlp": 0.37207031, + "step": 2346, + "time_per_iteration": 2.646801233291626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077398, + "balance_loss_mlp": 1.03968024, + "epoch": 0.45151981531358215, + "flos": 431690328576.0, + "grad_norm": 0.04922592508563583, + "language_loss": 0.84707314, + "learning_rate": 0.0006020860534202806, + "loss": 0.85784709, + "num_input_tokens_seen": 195930640, + "router_z_loss_mlp": 0.37670898, + "step": 2347, + "time_per_iteration": 2.4788920879364014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078489, + "balance_loss_mlp": 1.04036641, + "epoch": 0.4517121969988457, + "flos": 711826859520.0, + "grad_norm": 0.07725824631471088, + "language_loss": 0.80951411, + "learning_rate": 0.0006017810539495224, + "loss": 0.82029903, + "num_input_tokens_seen": 196014240, + "router_z_loss_mlp": 0.38110352, + "step": 2348, + "time_per_iteration": 3.013258934020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071848, + "balance_loss_mlp": 1.03587079, + "epoch": 0.45190457868410927, + "flos": 579197938176.0, + "grad_norm": 0.052394100693581906, + "language_loss": 0.82200068, + "learning_rate": 0.0006014760149638547, + "loss": 0.83271921, + "num_input_tokens_seen": 196083296, + "router_z_loss_mlp": 0.35986328, + "step": 2349, + "time_per_iteration": 2.6988728046417236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073469, + "balance_loss_mlp": 1.03823042, + "epoch": 0.45209696036937286, + "flos": 482415395328.0, + "grad_norm": 0.04812495303687425, + "language_loss": 0.88394493, + "learning_rate": 0.000601170936581704, + "loss": 0.89467961, + "num_input_tokens_seen": 196147840, + "router_z_loss_mlp": 0.35253906, + "step": 2350, + "time_per_iteration": 2.5537099838256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108294, + "balance_loss_mlp": 1.04617548, + "epoch": 0.4522893420546364, + "flos": 539945409024.0, + "grad_norm": 0.059990427154632556, + "language_loss": 0.84346575, + "learning_rate": 0.0006008658189215121, + "loss": 0.85429513, + "num_input_tokens_seen": 196219008, + "router_z_loss_mlp": 0.36767578, + "step": 2351, + "time_per_iteration": 2.649442434310913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084325, + "balance_loss_mlp": 1.04803789, + "epoch": 0.4524817237399, + "flos": 496423959552.0, + "grad_norm": 0.09153462549619036, + "language_loss": 0.7966159, + "learning_rate": 0.0006005606621017366, + "loss": 0.80745912, + "num_input_tokens_seen": 196287792, + "router_z_loss_mlp": 0.36328125, + "step": 2352, + "time_per_iteration": 2.55026912689209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086533, + "balance_loss_mlp": 1.04891062, + "epoch": 0.4526741054251635, + "flos": 652229300736.0, + "grad_norm": 0.05116414037173521, + "language_loss": 0.80266565, + "learning_rate": 0.0006002554662408496, + "loss": 0.81353092, + "num_input_tokens_seen": 196371776, + "router_z_loss_mlp": 0.3762207, + "step": 2353, + "time_per_iteration": 2.8708717823028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089479, + "balance_loss_mlp": 1.05259538, + "epoch": 0.4528664871104271, + "flos": 570674654208.0, + "grad_norm": 0.05934636879993742, + "language_loss": 0.91137719, + "learning_rate": 0.0005999502314573388, + "loss": 0.92227197, + "num_input_tokens_seen": 196441840, + "router_z_loss_mlp": 0.36865234, + "step": 2354, + "time_per_iteration": 2.636732339859009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091866, + "balance_loss_mlp": 1.05424321, + "epoch": 0.45305886879569063, + "flos": 458480922624.0, + "grad_norm": 0.06511026561582739, + "language_loss": 0.85993183, + "learning_rate": 0.0005996449578697066, + "loss": 0.87085044, + "num_input_tokens_seen": 196510464, + "router_z_loss_mlp": 0.3762207, + "step": 2355, + "time_per_iteration": 2.6497340202331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095767, + "balance_loss_mlp": 1.05916929, + "epoch": 0.4532512504809542, + "flos": 504922512384.0, + "grad_norm": 0.05408585590104452, + "language_loss": 0.81462455, + "learning_rate": 0.0005993396455964709, + "loss": 0.82558227, + "num_input_tokens_seen": 196583888, + "router_z_loss_mlp": 0.36621094, + "step": 2356, + "time_per_iteration": 2.67404842376709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090921, + "balance_loss_mlp": 1.05360866, + "epoch": 0.4534436321662178, + "flos": 581940578304.0, + "grad_norm": 0.046652791791384825, + "language_loss": 0.81415474, + "learning_rate": 0.0005990342947561647, + "loss": 0.82506394, + "num_input_tokens_seen": 196652816, + "router_z_loss_mlp": 0.37304688, + "step": 2357, + "time_per_iteration": 2.694093942642212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092731, + "balance_loss_mlp": 1.05577612, + "epoch": 0.45363601385148133, + "flos": 549458090496.0, + "grad_norm": 0.05811050095266086, + "language_loss": 0.77914369, + "learning_rate": 0.0005987289054673351, + "loss": 0.79007101, + "num_input_tokens_seen": 196720208, + "router_z_loss_mlp": 0.36987305, + "step": 2358, + "time_per_iteration": 2.6171157360076904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187917, + "balance_loss_mlp": 1.16912949, + "epoch": 0.4538283955367449, + "flos": 1473754411008.0, + "grad_norm": 0.03301673104438644, + "language_loss": 0.76575738, + "learning_rate": 0.0005984234778485451, + "loss": 0.77763653, + "num_input_tokens_seen": 196947696, + "router_z_loss_mlp": 0.1875, + "step": 2359, + "time_per_iteration": 4.821492910385132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096443, + "balance_loss_mlp": 1.05986929, + "epoch": 0.45402077722200845, + "flos": 584441699328.0, + "grad_norm": 0.059282629275687046, + "language_loss": 0.91217041, + "learning_rate": 0.0005981180120183722, + "loss": 0.92313486, + "num_input_tokens_seen": 197015712, + "router_z_loss_mlp": 0.36572266, + "step": 2360, + "time_per_iteration": 2.6678080558776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109692, + "balance_loss_mlp": 1.05901098, + "epoch": 0.45421315890727204, + "flos": 531462822912.0, + "grad_norm": 0.0444268091974553, + "language_loss": 0.85307455, + "learning_rate": 0.0005978125080954089, + "loss": 0.86404377, + "num_input_tokens_seen": 197094880, + "router_z_loss_mlp": 0.37915039, + "step": 2361, + "time_per_iteration": 2.7723591327667236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093514, + "balance_loss_mlp": 1.05651164, + "epoch": 0.4544055405925356, + "flos": 784890307584.0, + "grad_norm": 0.08031817047800895, + "language_loss": 0.7639026, + "learning_rate": 0.000597506966198262, + "loss": 0.77483773, + "num_input_tokens_seen": 197176448, + "router_z_loss_mlp": 0.36987305, + "step": 2362, + "time_per_iteration": 2.9897196292877197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109078, + "balance_loss_mlp": 1.05389667, + "epoch": 0.45459792227779916, + "flos": 517950443520.0, + "grad_norm": 0.07752194494873299, + "language_loss": 0.84128416, + "learning_rate": 0.0005972013864455536, + "loss": 0.85219198, + "num_input_tokens_seen": 197243520, + "router_z_loss_mlp": 0.36914062, + "step": 2363, + "time_per_iteration": 2.580357074737549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091271, + "balance_loss_mlp": 1.05515027, + "epoch": 0.4547903039630627, + "flos": 537306075648.0, + "grad_norm": 0.05808697989569881, + "language_loss": 0.85570788, + "learning_rate": 0.0005968957689559203, + "loss": 0.8666206, + "num_input_tokens_seen": 197311536, + "router_z_loss_mlp": 0.36132812, + "step": 2364, + "time_per_iteration": 2.64911150932312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095782, + "balance_loss_mlp": 1.05997205, + "epoch": 0.4549826856483263, + "flos": 528423409152.0, + "grad_norm": 0.05494979115149378, + "language_loss": 0.88544732, + "learning_rate": 0.0005965901138480131, + "loss": 0.8964051, + "num_input_tokens_seen": 197382752, + "router_z_loss_mlp": 0.35839844, + "step": 2365, + "time_per_iteration": 2.61967396736145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100343, + "balance_loss_mlp": 1.06379294, + "epoch": 0.45517506733358987, + "flos": 520649413632.0, + "grad_norm": 0.0583285525672419, + "language_loss": 0.87046576, + "learning_rate": 0.0005962844212404982, + "loss": 0.88146913, + "num_input_tokens_seen": 197456592, + "router_z_loss_mlp": 0.36547852, + "step": 2366, + "time_per_iteration": 2.663799524307251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108056, + "balance_loss_mlp": 1.07067156, + "epoch": 0.4553674490188534, + "flos": 450814616064.0, + "grad_norm": 0.06095483853323617, + "language_loss": 0.86969483, + "learning_rate": 0.0005959786912520558, + "loss": 0.88077545, + "num_input_tokens_seen": 197525408, + "router_z_loss_mlp": 0.37353516, + "step": 2367, + "time_per_iteration": 2.604011058807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104168, + "balance_loss_mlp": 1.06740427, + "epoch": 0.455559830704117, + "flos": 546308015616.0, + "grad_norm": 0.04613637765687707, + "language_loss": 0.83717126, + "learning_rate": 0.0005956729240013806, + "loss": 0.84821296, + "num_input_tokens_seen": 197608480, + "router_z_loss_mlp": 0.36743164, + "step": 2368, + "time_per_iteration": 2.7852706909179688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110558, + "balance_loss_mlp": 1.06917334, + "epoch": 0.4557522123893805, + "flos": 583491589632.0, + "grad_norm": 0.05161395773765414, + "language_loss": 0.91501808, + "learning_rate": 0.0005953671196071824, + "loss": 0.92607391, + "num_input_tokens_seen": 197678416, + "router_z_loss_mlp": 0.36401367, + "step": 2369, + "time_per_iteration": 2.7515223026275635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110279, + "balance_loss_mlp": 1.06681311, + "epoch": 0.4559445940746441, + "flos": 526149250560.0, + "grad_norm": 0.05240938085212211, + "language_loss": 0.80084532, + "learning_rate": 0.0005950612781881846, + "loss": 0.8118732, + "num_input_tokens_seen": 197753424, + "router_z_loss_mlp": 0.35986328, + "step": 2370, + "time_per_iteration": 2.6867175102233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096163, + "balance_loss_mlp": 1.05873156, + "epoch": 0.45613697575990764, + "flos": 651810281472.0, + "grad_norm": 0.06280114629685846, + "language_loss": 0.7594825, + "learning_rate": 0.0005947553998631259, + "loss": 0.77044415, + "num_input_tokens_seen": 197832080, + "router_z_loss_mlp": 0.37451172, + "step": 2371, + "time_per_iteration": 2.8399033546447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096425, + "balance_loss_mlp": 1.05985141, + "epoch": 0.4563293574451712, + "flos": 866744699904.0, + "grad_norm": 0.04396235367342953, + "language_loss": 0.78598678, + "learning_rate": 0.000594449484750758, + "loss": 0.79695106, + "num_input_tokens_seen": 197919536, + "router_z_loss_mlp": 0.36572266, + "step": 2372, + "time_per_iteration": 3.140890121459961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088429, + "balance_loss_mlp": 1.05140269, + "epoch": 0.45652173913043476, + "flos": 497817819648.0, + "grad_norm": 0.06709411136792778, + "language_loss": 0.82665753, + "learning_rate": 0.0005941435329698484, + "loss": 0.83754182, + "num_input_tokens_seen": 197991872, + "router_z_loss_mlp": 0.36987305, + "step": 2373, + "time_per_iteration": 2.6316027641296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089126, + "balance_loss_mlp": 1.05238533, + "epoch": 0.45671412081569834, + "flos": 560581420032.0, + "grad_norm": 0.05173954705628188, + "language_loss": 0.82881534, + "learning_rate": 0.0005938375446391778, + "loss": 0.83970654, + "num_input_tokens_seen": 198063392, + "router_z_loss_mlp": 0.36743164, + "step": 2374, + "time_per_iteration": 2.6999659538269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096506, + "balance_loss_mlp": 1.05823994, + "epoch": 0.45690650250096193, + "flos": 502873906176.0, + "grad_norm": 0.06488189122368912, + "language_loss": 0.88693655, + "learning_rate": 0.0005935315198775415, + "loss": 0.89790159, + "num_input_tokens_seen": 198131232, + "router_z_loss_mlp": 0.38232422, + "step": 2375, + "time_per_iteration": 2.584855556488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084023, + "balance_loss_mlp": 1.04675794, + "epoch": 0.45709888418622546, + "flos": 430473968640.0, + "grad_norm": 0.054054227258136585, + "language_loss": 0.86900407, + "learning_rate": 0.0005932254588037486, + "loss": 0.87984431, + "num_input_tokens_seen": 198194944, + "router_z_loss_mlp": 0.37207031, + "step": 2376, + "time_per_iteration": 2.4713377952575684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087126, + "balance_loss_mlp": 1.04907441, + "epoch": 0.45729126587148905, + "flos": 525395579904.0, + "grad_norm": 0.22673198102288197, + "language_loss": 0.86219609, + "learning_rate": 0.000592919361536623, + "loss": 0.87306732, + "num_input_tokens_seen": 198265728, + "router_z_loss_mlp": 0.38037109, + "step": 2377, + "time_per_iteration": 2.6324362754821777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074984, + "balance_loss_mlp": 1.03821993, + "epoch": 0.4574836475567526, + "flos": 637717349376.0, + "grad_norm": 0.06562895013351942, + "language_loss": 0.88980031, + "learning_rate": 0.0005926132281950017, + "loss": 0.90055019, + "num_input_tokens_seen": 198336640, + "router_z_loss_mlp": 0.36767578, + "step": 2378, + "time_per_iteration": 2.7336690425872803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079258, + "balance_loss_mlp": 1.04194546, + "epoch": 0.45767602924201617, + "flos": 649288811520.0, + "grad_norm": 0.05221471992659685, + "language_loss": 0.84916019, + "learning_rate": 0.0005923070588977367, + "loss": 0.85995281, + "num_input_tokens_seen": 198413552, + "router_z_loss_mlp": 0.37280273, + "step": 2379, + "time_per_iteration": 2.796694755554199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073672, + "balance_loss_mlp": 1.03745568, + "epoch": 0.4578684109272797, + "flos": 746356543488.0, + "grad_norm": 0.05948192069014845, + "language_loss": 0.86265379, + "learning_rate": 0.0005920008537636931, + "loss": 0.8733905, + "num_input_tokens_seen": 198490864, + "router_z_loss_mlp": 0.36230469, + "step": 2380, + "time_per_iteration": 2.919175863265991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073893, + "balance_loss_mlp": 1.03734303, + "epoch": 0.4580607926125433, + "flos": 641155433472.0, + "grad_norm": 0.07082348059879481, + "language_loss": 0.86767799, + "learning_rate": 0.0005916946129117504, + "loss": 0.8784169, + "num_input_tokens_seen": 198571200, + "router_z_loss_mlp": 0.36523438, + "step": 2381, + "time_per_iteration": 2.8834073543548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076603, + "balance_loss_mlp": 1.03983903, + "epoch": 0.4582531742978069, + "flos": 801513474048.0, + "grad_norm": 0.06015762492268947, + "language_loss": 0.80385733, + "learning_rate": 0.0005913883364608017, + "loss": 0.81462336, + "num_input_tokens_seen": 198658624, + "router_z_loss_mlp": 0.36791992, + "step": 2382, + "time_per_iteration": 3.05711030960083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077489, + "balance_loss_mlp": 1.03984237, + "epoch": 0.4584455559830704, + "flos": 683991613440.0, + "grad_norm": 0.05122280126715116, + "language_loss": 0.88575673, + "learning_rate": 0.0005910820245297542, + "loss": 0.89653164, + "num_input_tokens_seen": 198731312, + "router_z_loss_mlp": 0.37646484, + "step": 2383, + "time_per_iteration": 2.8739712238311768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107409, + "balance_loss_mlp": 1.03682566, + "epoch": 0.458637937668334, + "flos": 517902391296.0, + "grad_norm": 0.06830932289634356, + "language_loss": 0.80442882, + "learning_rate": 0.000590775677237529, + "loss": 0.81516975, + "num_input_tokens_seen": 198805296, + "router_z_loss_mlp": 0.37231445, + "step": 2384, + "time_per_iteration": 2.7162787914276123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083903, + "balance_loss_mlp": 1.04585159, + "epoch": 0.4588303193535975, + "flos": 505242607104.0, + "grad_norm": 0.06045305543182838, + "language_loss": 0.80110037, + "learning_rate": 0.0005904692947030601, + "loss": 0.81193942, + "num_input_tokens_seen": 198872112, + "router_z_loss_mlp": 0.38012695, + "step": 2385, + "time_per_iteration": 2.615645408630371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077094, + "balance_loss_mlp": 1.04054475, + "epoch": 0.4590227010388611, + "flos": 495655732224.0, + "grad_norm": 0.07817461665700527, + "language_loss": 0.89474368, + "learning_rate": 0.0005901628770452963, + "loss": 0.90551466, + "num_input_tokens_seen": 198938480, + "router_z_loss_mlp": 0.36572266, + "step": 2386, + "time_per_iteration": 2.545145273208618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076838, + "balance_loss_mlp": 1.03952503, + "epoch": 0.45921508272412465, + "flos": 493375781376.0, + "grad_norm": 0.05719900676000999, + "language_loss": 0.87518173, + "learning_rate": 0.000589856424383199, + "loss": 0.88595015, + "num_input_tokens_seen": 199008608, + "router_z_loss_mlp": 0.37280273, + "step": 2387, + "time_per_iteration": 2.5866873264312744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078911, + "balance_loss_mlp": 1.04000092, + "epoch": 0.45940746440938823, + "flos": 691096306176.0, + "grad_norm": 0.05272732350360167, + "language_loss": 0.82854474, + "learning_rate": 0.000589549936835744, + "loss": 0.83933389, + "num_input_tokens_seen": 199084592, + "router_z_loss_mlp": 0.38867188, + "step": 2388, + "time_per_iteration": 2.886815309524536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082474, + "balance_loss_mlp": 1.04485154, + "epoch": 0.45959984609465176, + "flos": 503489364480.0, + "grad_norm": 0.061476086167368736, + "language_loss": 0.79490817, + "learning_rate": 0.0005892434145219202, + "loss": 0.80573285, + "num_input_tokens_seen": 199151504, + "router_z_loss_mlp": 0.37597656, + "step": 2389, + "time_per_iteration": 2.669055461883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078287, + "balance_loss_mlp": 1.04035497, + "epoch": 0.45979222777991535, + "flos": 676339863552.0, + "grad_norm": 0.13998924312013794, + "language_loss": 0.82966721, + "learning_rate": 0.0005889368575607303, + "loss": 0.84045005, + "num_input_tokens_seen": 199224528, + "router_z_loss_mlp": 0.37890625, + "step": 2390, + "time_per_iteration": 2.8364429473876953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075252, + "balance_loss_mlp": 1.03941786, + "epoch": 0.45998460946517894, + "flos": 777308368896.0, + "grad_norm": 0.05472501976139028, + "language_loss": 0.78496212, + "learning_rate": 0.00058863026607119, + "loss": 0.79571462, + "num_input_tokens_seen": 199312512, + "router_z_loss_mlp": 0.35864258, + "step": 2391, + "time_per_iteration": 3.104703664779663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078215, + "balance_loss_mlp": 1.04059267, + "epoch": 0.46017699115044247, + "flos": 851073053184.0, + "grad_norm": 0.06149888926191146, + "language_loss": 0.79584855, + "learning_rate": 0.0005883236401723287, + "loss": 0.80663073, + "num_input_tokens_seen": 199397216, + "router_z_loss_mlp": 0.37597656, + "step": 2392, + "time_per_iteration": 3.1967198848724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074328, + "balance_loss_mlp": 1.03603745, + "epoch": 0.46036937283570606, + "flos": 575608495104.0, + "grad_norm": 0.05401888737183198, + "language_loss": 0.84525239, + "learning_rate": 0.0005880169799831893, + "loss": 0.85599566, + "num_input_tokens_seen": 199464288, + "router_z_loss_mlp": 0.3828125, + "step": 2393, + "time_per_iteration": 2.6700267791748047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078621, + "balance_loss_mlp": 1.04049826, + "epoch": 0.4605617545209696, + "flos": 611553798144.0, + "grad_norm": 0.04760801272162673, + "language_loss": 0.81405449, + "learning_rate": 0.0005877102856228278, + "loss": 0.82484066, + "num_input_tokens_seen": 199538096, + "router_z_loss_mlp": 0.38110352, + "step": 2394, + "time_per_iteration": 2.8472628593444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079988, + "balance_loss_mlp": 1.04100633, + "epoch": 0.4607541362062332, + "flos": 532884386304.0, + "grad_norm": 0.0583897063043048, + "language_loss": 0.84685498, + "learning_rate": 0.0005874035572103133, + "loss": 0.85765481, + "num_input_tokens_seen": 199609504, + "router_z_loss_mlp": 0.38964844, + "step": 2395, + "time_per_iteration": 2.6390676498413086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081925, + "balance_loss_mlp": 1.04437459, + "epoch": 0.4609465178914967, + "flos": 647023417344.0, + "grad_norm": 0.07571396195119524, + "language_loss": 0.82582867, + "learning_rate": 0.0005870967948647288, + "loss": 0.83664787, + "num_input_tokens_seen": 199678960, + "router_z_loss_mlp": 0.37573242, + "step": 2396, + "time_per_iteration": 2.7459003925323486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115963, + "balance_loss_mlp": 1.09889209, + "epoch": 0.4611388995767603, + "flos": 1465487202816.0, + "grad_norm": 0.025541481833947964, + "language_loss": 0.743083, + "learning_rate": 0.0005867899987051693, + "loss": 0.75424266, + "num_input_tokens_seen": 199903568, + "router_z_loss_mlp": 0.17089844, + "step": 2397, + "time_per_iteration": 5.318708896636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083182, + "balance_loss_mlp": 1.04446316, + "epoch": 0.46133128126202383, + "flos": 722772688896.0, + "grad_norm": 0.0770893227760576, + "language_loss": 0.8586902, + "learning_rate": 0.0005864831688507443, + "loss": 0.86952198, + "num_input_tokens_seen": 199988672, + "router_z_loss_mlp": 0.38696289, + "step": 2398, + "time_per_iteration": 3.0177690982818604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092841, + "balance_loss_mlp": 1.05266774, + "epoch": 0.4615236629472874, + "flos": 547735371264.0, + "grad_norm": 0.05577558539065206, + "language_loss": 0.74877977, + "learning_rate": 0.0005861763054205754, + "loss": 0.75970817, + "num_input_tokens_seen": 200062304, + "router_z_loss_mlp": 0.40161133, + "step": 2399, + "time_per_iteration": 4.235994815826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089549, + "balance_loss_mlp": 1.04885101, + "epoch": 0.461716044632551, + "flos": 601942192128.0, + "grad_norm": 0.04983292023279428, + "language_loss": 0.80479169, + "learning_rate": 0.0005858694085337976, + "loss": 0.81568718, + "num_input_tokens_seen": 200138464, + "router_z_loss_mlp": 0.40698242, + "step": 2400, + "time_per_iteration": 2.807819366455078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095586, + "balance_loss_mlp": 1.0549593, + "epoch": 0.46190842631781454, + "flos": 474236937216.0, + "grad_norm": 0.0664642499777789, + "language_loss": 0.8348912, + "learning_rate": 0.0005855624783095589, + "loss": 0.84584707, + "num_input_tokens_seen": 200205728, + "router_z_loss_mlp": 0.40625, + "step": 2401, + "time_per_iteration": 2.572861909866333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088871, + "balance_loss_mlp": 1.04848242, + "epoch": 0.4621008080030781, + "flos": 437254184448.0, + "grad_norm": 0.05436683283363487, + "language_loss": 0.85176182, + "learning_rate": 0.00058525551486702, + "loss": 0.86265051, + "num_input_tokens_seen": 200269824, + "router_z_loss_mlp": 0.40405273, + "step": 2402, + "time_per_iteration": 2.5116658210754395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091528, + "balance_loss_mlp": 1.05056739, + "epoch": 0.46229318968834165, + "flos": 525203523072.0, + "grad_norm": 0.06054832474170735, + "language_loss": 0.81057394, + "learning_rate": 0.0005849485183253548, + "loss": 0.82148921, + "num_input_tokens_seen": 200341264, + "router_z_loss_mlp": 0.40942383, + "step": 2403, + "time_per_iteration": 2.6135447025299072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109357, + "balance_loss_mlp": 1.05446947, + "epoch": 0.46248557137360524, + "flos": 439395922944.0, + "grad_norm": 0.05271308957386849, + "language_loss": 0.87085575, + "learning_rate": 0.0005846414888037501, + "loss": 0.88179141, + "num_input_tokens_seen": 200405632, + "router_z_loss_mlp": 0.39086914, + "step": 2404, + "time_per_iteration": 2.479233503341675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094001, + "balance_loss_mlp": 1.05513883, + "epoch": 0.4626779530588688, + "flos": 617318475264.0, + "grad_norm": 0.05681624365321511, + "language_loss": 0.82982111, + "learning_rate": 0.0005843344264214049, + "loss": 0.84076107, + "num_input_tokens_seen": 200479312, + "router_z_loss_mlp": 0.38818359, + "step": 2405, + "time_per_iteration": 2.8025927543640137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094528, + "balance_loss_mlp": 1.05478346, + "epoch": 0.46287033474413236, + "flos": 669796784640.0, + "grad_norm": 0.07573173665893672, + "language_loss": 0.84474289, + "learning_rate": 0.0005840273312975317, + "loss": 0.8556881, + "num_input_tokens_seen": 200552976, + "router_z_loss_mlp": 0.39746094, + "step": 2406, + "time_per_iteration": 2.880143642425537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096681, + "balance_loss_mlp": 1.05705631, + "epoch": 0.46306271642939595, + "flos": 479992849920.0, + "grad_norm": 0.09801123732991168, + "language_loss": 0.90446943, + "learning_rate": 0.0005837202035513555, + "loss": 0.91543621, + "num_input_tokens_seen": 200621088, + "router_z_loss_mlp": 0.39599609, + "step": 2407, + "time_per_iteration": 2.5880489349365234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109455, + "balance_loss_mlp": 1.05583048, + "epoch": 0.4632550981146595, + "flos": 580395359232.0, + "grad_norm": 0.057934056350582984, + "language_loss": 0.81573331, + "learning_rate": 0.0005834130433021136, + "loss": 0.82667881, + "num_input_tokens_seen": 200698400, + "router_z_loss_mlp": 0.38671875, + "step": 2408, + "time_per_iteration": 2.739018201828003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100791, + "balance_loss_mlp": 1.06121325, + "epoch": 0.46344747979992307, + "flos": 523701974016.0, + "grad_norm": 0.11568384778980019, + "language_loss": 0.73278892, + "learning_rate": 0.0005831058506690563, + "loss": 0.74379677, + "num_input_tokens_seen": 200767264, + "router_z_loss_mlp": 0.39550781, + "step": 2409, + "time_per_iteration": 2.6164803504943848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109281, + "balance_loss_mlp": 1.05513954, + "epoch": 0.4636398614851866, + "flos": 746174661120.0, + "grad_norm": 0.10585491609730635, + "language_loss": 0.85966945, + "learning_rate": 0.0005827986257714464, + "loss": 0.87059754, + "num_input_tokens_seen": 200841440, + "router_z_loss_mlp": 0.3762207, + "step": 2410, + "time_per_iteration": 2.9002575874328613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108768, + "balance_loss_mlp": 1.05008137, + "epoch": 0.4638322431704502, + "flos": 596273619456.0, + "grad_norm": 0.054458395819511424, + "language_loss": 0.88645154, + "learning_rate": 0.0005824913687285591, + "loss": 0.89732838, + "num_input_tokens_seen": 200911296, + "router_z_loss_mlp": 0.37597656, + "step": 2411, + "time_per_iteration": 2.65468168258667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108455, + "balance_loss_mlp": 1.046808, + "epoch": 0.4640246248557137, + "flos": 539172799488.0, + "grad_norm": 0.10537111148670983, + "language_loss": 0.81237781, + "learning_rate": 0.0005821840796596821, + "loss": 0.82322335, + "num_input_tokens_seen": 200981920, + "router_z_loss_mlp": 0.37744141, + "step": 2412, + "time_per_iteration": 2.64800763130188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086752, + "balance_loss_mlp": 1.04979706, + "epoch": 0.4642170065409773, + "flos": 562330280448.0, + "grad_norm": 0.05022524173963101, + "language_loss": 0.80493259, + "learning_rate": 0.0005818767586841158, + "loss": 0.81580019, + "num_input_tokens_seen": 201059392, + "router_z_loss_mlp": 0.36962891, + "step": 2413, + "time_per_iteration": 2.755119800567627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081928, + "balance_loss_mlp": 1.04657054, + "epoch": 0.46440938822624084, + "flos": 530684421120.0, + "grad_norm": 0.05374997972366647, + "language_loss": 0.86088538, + "learning_rate": 0.0005815694059211726, + "loss": 0.87170464, + "num_input_tokens_seen": 201130192, + "router_z_loss_mlp": 0.35400391, + "step": 2414, + "time_per_iteration": 2.6568868160247803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112565, + "balance_loss_mlp": 1.09606647, + "epoch": 0.4646017699115044, + "flos": 1525503780864.0, + "grad_norm": 0.029698276976430914, + "language_loss": 0.80873632, + "learning_rate": 0.0005812620214901778, + "loss": 0.81986189, + "num_input_tokens_seen": 201354720, + "router_z_loss_mlp": 0.16503906, + "step": 2415, + "time_per_iteration": 4.772961378097534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103885, + "balance_loss_mlp": 1.08795917, + "epoch": 0.464794151596768, + "flos": 1539999765504.0, + "grad_norm": 0.029205098078145548, + "language_loss": 0.7694506, + "learning_rate": 0.000580954605510468, + "loss": 0.78048944, + "num_input_tokens_seen": 201592096, + "router_z_loss_mlp": 0.15917969, + "step": 2416, + "time_per_iteration": 4.972976446151733 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085394, + "balance_loss_mlp": 1.04908264, + "epoch": 0.46498653328203154, + "flos": 501200649216.0, + "grad_norm": 0.04510206741076235, + "language_loss": 0.86396641, + "learning_rate": 0.0005806471581013931, + "loss": 0.87482029, + "num_input_tokens_seen": 201666160, + "router_z_loss_mlp": 0.36328125, + "step": 2417, + "time_per_iteration": 2.6620965003967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084664, + "balance_loss_mlp": 1.04806709, + "epoch": 0.46517891496729513, + "flos": 675856825344.0, + "grad_norm": 0.06302462590955567, + "language_loss": 0.78826416, + "learning_rate": 0.0005803396793823146, + "loss": 0.79911077, + "num_input_tokens_seen": 201733552, + "router_z_loss_mlp": 0.36572266, + "step": 2418, + "time_per_iteration": 2.7901804447174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108772, + "balance_loss_mlp": 1.05190992, + "epoch": 0.46537129665255866, + "flos": 585062949888.0, + "grad_norm": 0.06339234247272847, + "language_loss": 0.85623956, + "learning_rate": 0.0005800321694726065, + "loss": 0.86711681, + "num_input_tokens_seen": 201806128, + "router_z_loss_mlp": 0.35839844, + "step": 2419, + "time_per_iteration": 2.728811740875244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085796, + "balance_loss_mlp": 1.04836476, + "epoch": 0.46556367833782225, + "flos": 587425858560.0, + "grad_norm": 0.05222204092555794, + "language_loss": 0.8708874, + "learning_rate": 0.0005797246284916545, + "loss": 0.88174534, + "num_input_tokens_seen": 201874224, + "router_z_loss_mlp": 0.37402344, + "step": 2420, + "time_per_iteration": 2.6684653759002686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045596, + "balance_loss_mlp": 1.03043234, + "epoch": 0.4657560600230858, + "flos": 1484674099200.0, + "grad_norm": 0.011675297447767578, + "language_loss": 0.77505189, + "learning_rate": 0.0005794170565588569, + "loss": 0.78550786, + "num_input_tokens_seen": 202111648, + "router_z_loss_mlp": 0.15136719, + "step": 2421, + "time_per_iteration": 4.958959102630615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109154, + "balance_loss_mlp": 1.05506182, + "epoch": 0.46594844170834937, + "flos": 579961783296.0, + "grad_norm": 0.06275032464162542, + "language_loss": 0.88184166, + "learning_rate": 0.0005791094537936233, + "loss": 0.89275706, + "num_input_tokens_seen": 202183344, + "router_z_loss_mlp": 0.36499023, + "step": 2422, + "time_per_iteration": 2.682985782623291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085598, + "balance_loss_mlp": 1.04761815, + "epoch": 0.4661408233936129, + "flos": 512322568704.0, + "grad_norm": 0.05420418194823272, + "language_loss": 0.8170498, + "learning_rate": 0.0005788018203153762, + "loss": 0.82790577, + "num_input_tokens_seen": 202252512, + "router_z_loss_mlp": 0.37988281, + "step": 2423, + "time_per_iteration": 2.5706470012664795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085407, + "balance_loss_mlp": 1.04883409, + "epoch": 0.4663332050788765, + "flos": 490839754752.0, + "grad_norm": 0.06546291293651209, + "language_loss": 0.85642946, + "learning_rate": 0.000578494156243549, + "loss": 0.86728358, + "num_input_tokens_seen": 202320096, + "router_z_loss_mlp": 0.36572266, + "step": 2424, + "time_per_iteration": 2.578847646713257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085751, + "balance_loss_mlp": 1.04746079, + "epoch": 0.4665255867641401, + "flos": 512353092096.0, + "grad_norm": 0.059152702804089866, + "language_loss": 0.89097798, + "learning_rate": 0.0005781864616975878, + "loss": 0.90183544, + "num_input_tokens_seen": 202391552, + "router_z_loss_mlp": 0.38256836, + "step": 2425, + "time_per_iteration": 2.6408798694610596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085338, + "balance_loss_mlp": 1.04585552, + "epoch": 0.4667179684494036, + "flos": 424590018048.0, + "grad_norm": 0.07480086545967683, + "language_loss": 0.84123272, + "learning_rate": 0.0005778787367969502, + "loss": 0.85208613, + "num_input_tokens_seen": 202457328, + "router_z_loss_mlp": 0.39477539, + "step": 2426, + "time_per_iteration": 2.5963637828826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077247, + "balance_loss_mlp": 1.03910041, + "epoch": 0.4669103501346672, + "flos": 707640897024.0, + "grad_norm": 0.07167303988395164, + "language_loss": 0.80844486, + "learning_rate": 0.0005775709816611053, + "loss": 0.81921738, + "num_input_tokens_seen": 202535888, + "router_z_loss_mlp": 0.38134766, + "step": 2427, + "time_per_iteration": 2.971285581588745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079771, + "balance_loss_mlp": 1.04138589, + "epoch": 0.4671027318199307, + "flos": 554554874880.0, + "grad_norm": 0.05405801443852106, + "language_loss": 0.83748919, + "learning_rate": 0.0005772631964095346, + "loss": 0.84828693, + "num_input_tokens_seen": 202608400, + "router_z_loss_mlp": 0.38354492, + "step": 2428, + "time_per_iteration": 2.709364175796509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080886, + "balance_loss_mlp": 1.04271483, + "epoch": 0.4672951135051943, + "flos": 566839309824.0, + "grad_norm": 0.060777782070244445, + "language_loss": 0.8565498, + "learning_rate": 0.000576955381161731, + "loss": 0.86735862, + "num_input_tokens_seen": 202677712, + "router_z_loss_mlp": 0.38183594, + "step": 2429, + "time_per_iteration": 2.708270311355591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083121, + "balance_loss_mlp": 1.04452121, + "epoch": 0.46748749519045785, + "flos": 424294654464.0, + "grad_norm": 0.05633631430335825, + "language_loss": 0.85906339, + "learning_rate": 0.0005766475360371985, + "loss": 0.86989462, + "num_input_tokens_seen": 202743824, + "router_z_loss_mlp": 0.38574219, + "step": 2430, + "time_per_iteration": 2.617856740951538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089079, + "balance_loss_mlp": 1.05055118, + "epoch": 0.46767987687572143, + "flos": 538088859648.0, + "grad_norm": 0.05568735360450276, + "language_loss": 0.84486759, + "learning_rate": 0.0005763396611554536, + "loss": 0.85575831, + "num_input_tokens_seen": 202813072, + "router_z_loss_mlp": 0.38476562, + "step": 2431, + "time_per_iteration": 2.6460912227630615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093376, + "balance_loss_mlp": 1.0557059, + "epoch": 0.467872258560985, + "flos": 823360052736.0, + "grad_norm": 0.05823580457003032, + "language_loss": 0.80262822, + "learning_rate": 0.0005760317566360237, + "loss": 0.81356204, + "num_input_tokens_seen": 202886576, + "router_z_loss_mlp": 0.37646484, + "step": 2432, + "time_per_iteration": 3.010744094848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104411, + "balance_loss_mlp": 1.066836, + "epoch": 0.46806464024624855, + "flos": 661366632960.0, + "grad_norm": 0.07453415962543286, + "language_loss": 0.85120392, + "learning_rate": 0.000575723822598448, + "loss": 0.86224806, + "num_input_tokens_seen": 202956736, + "router_z_loss_mlp": 0.37573242, + "step": 2433, + "time_per_iteration": 2.7999444007873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100188, + "balance_loss_mlp": 1.06232667, + "epoch": 0.46825702193151214, + "flos": 755362865664.0, + "grad_norm": 0.08922556949000433, + "language_loss": 0.81824166, + "learning_rate": 0.0005754158591622773, + "loss": 0.82924354, + "num_input_tokens_seen": 203036432, + "router_z_loss_mlp": 0.37866211, + "step": 2434, + "time_per_iteration": 3.016101837158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089201, + "balance_loss_mlp": 1.05250812, + "epoch": 0.4684494036167757, + "flos": 439164578304.0, + "grad_norm": 0.06367410837717138, + "language_loss": 0.82359827, + "learning_rate": 0.0005751078664470732, + "loss": 0.8344903, + "num_input_tokens_seen": 203101904, + "router_z_loss_mlp": 0.36694336, + "step": 2435, + "time_per_iteration": 2.5870590209960938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095131, + "balance_loss_mlp": 1.05762815, + "epoch": 0.46864178530203926, + "flos": 532446428160.0, + "grad_norm": 0.059213993455869605, + "language_loss": 0.85874772, + "learning_rate": 0.0005747998445724094, + "loss": 0.86969906, + "num_input_tokens_seen": 203170272, + "router_z_loss_mlp": 0.375, + "step": 2436, + "time_per_iteration": 2.606999397277832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088052, + "balance_loss_mlp": 1.05135953, + "epoch": 0.4688341669873028, + "flos": 576328670208.0, + "grad_norm": 0.05282393784178956, + "language_loss": 0.89627349, + "learning_rate": 0.0005744917936578707, + "loss": 0.90715402, + "num_input_tokens_seen": 203243920, + "router_z_loss_mlp": 0.3671875, + "step": 2437, + "time_per_iteration": 2.7902729511260986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075719, + "balance_loss_mlp": 1.03978968, + "epoch": 0.4690265486725664, + "flos": 539296455168.0, + "grad_norm": 0.04430533887369339, + "language_loss": 0.84245884, + "learning_rate": 0.0005741837138230526, + "loss": 0.85321605, + "num_input_tokens_seen": 203321760, + "router_z_loss_mlp": 0.35913086, + "step": 2438, + "time_per_iteration": 2.726710319519043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082309, + "balance_loss_mlp": 1.04580677, + "epoch": 0.4692189303578299, + "flos": 770168770560.0, + "grad_norm": 0.06182369714878754, + "language_loss": 0.86213875, + "learning_rate": 0.0005738756051875627, + "loss": 0.87296176, + "num_input_tokens_seen": 203409088, + "router_z_loss_mlp": 0.36547852, + "step": 2439, + "time_per_iteration": 3.07755708694458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077719, + "balance_loss_mlp": 1.04178953, + "epoch": 0.4694113120430935, + "flos": 571118404608.0, + "grad_norm": 0.047772699497207846, + "language_loss": 0.82990217, + "learning_rate": 0.0005735674678710192, + "loss": 0.84067929, + "num_input_tokens_seen": 203481680, + "router_z_loss_mlp": 0.359375, + "step": 2440, + "time_per_iteration": 2.6625607013702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080403, + "balance_loss_mlp": 1.04423499, + "epoch": 0.4696036937283571, + "flos": 748498281984.0, + "grad_norm": 0.07690297936976162, + "language_loss": 0.81414962, + "learning_rate": 0.0005732593019930517, + "loss": 0.82495368, + "num_input_tokens_seen": 203554848, + "router_z_loss_mlp": 0.36181641, + "step": 2441, + "time_per_iteration": 2.918219566345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082294, + "balance_loss_mlp": 1.04669785, + "epoch": 0.4697960754136206, + "flos": 493208455680.0, + "grad_norm": 0.061105529929901724, + "language_loss": 0.87989414, + "learning_rate": 0.0005729511076733008, + "loss": 0.89071703, + "num_input_tokens_seen": 203624816, + "router_z_loss_mlp": 0.35620117, + "step": 2442, + "time_per_iteration": 2.6301560401916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085737, + "balance_loss_mlp": 1.04909194, + "epoch": 0.4699884570988842, + "flos": 724809710592.0, + "grad_norm": 0.0773152930313349, + "language_loss": 0.84905529, + "learning_rate": 0.000572642885031418, + "loss": 0.85991269, + "num_input_tokens_seen": 203698256, + "router_z_loss_mlp": 0.36645508, + "step": 2443, + "time_per_iteration": 2.8638129234313965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081528, + "balance_loss_mlp": 1.04619479, + "epoch": 0.47018083878414774, + "flos": 555141219840.0, + "grad_norm": 0.0470926044275737, + "language_loss": 0.80651355, + "learning_rate": 0.0005723346341870662, + "loss": 0.81732887, + "num_input_tokens_seen": 203772672, + "router_z_loss_mlp": 0.35351562, + "step": 2444, + "time_per_iteration": 2.7571544647216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093224, + "balance_loss_mlp": 1.05767596, + "epoch": 0.4703732204694113, + "flos": 423846521856.0, + "grad_norm": 0.060426187781859556, + "language_loss": 0.8612802, + "learning_rate": 0.0005720263552599188, + "loss": 0.87221241, + "num_input_tokens_seen": 203835904, + "router_z_loss_mlp": 0.35595703, + "step": 2445, + "time_per_iteration": 2.457702398300171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087133, + "balance_loss_mlp": 1.05003476, + "epoch": 0.47056560215467486, + "flos": 703179919872.0, + "grad_norm": 0.05103700331104036, + "language_loss": 0.79627156, + "learning_rate": 0.0005717180483696604, + "loss": 0.80714285, + "num_input_tokens_seen": 203914704, + "router_z_loss_mlp": 0.37084961, + "step": 2446, + "time_per_iteration": 2.851597785949707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096579, + "balance_loss_mlp": 1.05981517, + "epoch": 0.47075798383993844, + "flos": 554701851648.0, + "grad_norm": 0.05942499594418206, + "language_loss": 0.82931131, + "learning_rate": 0.0005714097136359862, + "loss": 0.84027708, + "num_input_tokens_seen": 203985072, + "router_z_loss_mlp": 0.36791992, + "step": 2447, + "time_per_iteration": 2.6262872219085693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088323, + "balance_loss_mlp": 1.05203617, + "epoch": 0.470950365525202, + "flos": 564009329664.0, + "grad_norm": 0.04849265524269106, + "language_loss": 0.86289024, + "learning_rate": 0.0005711013511786027, + "loss": 0.87377352, + "num_input_tokens_seen": 204061904, + "router_z_loss_mlp": 0.36303711, + "step": 2448, + "time_per_iteration": 2.7698192596435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087117, + "balance_loss_mlp": 1.05066276, + "epoch": 0.47114274721046556, + "flos": 534189496320.0, + "grad_norm": 0.0564117191668664, + "language_loss": 0.83740294, + "learning_rate": 0.0005707929611172263, + "loss": 0.84827411, + "num_input_tokens_seen": 204137392, + "router_z_loss_mlp": 0.36450195, + "step": 2449, + "time_per_iteration": 2.679288864135742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091785, + "balance_loss_mlp": 1.0557121, + "epoch": 0.47133512889572915, + "flos": 472877982720.0, + "grad_norm": 0.05809255973733416, + "language_loss": 0.83857393, + "learning_rate": 0.000570484543571585, + "loss": 0.84949178, + "num_input_tokens_seen": 204202752, + "router_z_loss_mlp": 0.3605957, + "step": 2450, + "time_per_iteration": 2.53946852684021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085264, + "balance_loss_mlp": 1.04914355, + "epoch": 0.4715275105809927, + "flos": 458776286208.0, + "grad_norm": 0.05957003441240347, + "language_loss": 0.83003706, + "learning_rate": 0.0005701760986614171, + "loss": 0.84088969, + "num_input_tokens_seen": 204266960, + "router_z_loss_mlp": 0.36132812, + "step": 2451, + "time_per_iteration": 2.578679323196411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108432, + "balance_loss_mlp": 1.04784179, + "epoch": 0.47171989226625627, + "flos": 421783358976.0, + "grad_norm": 0.04971859173266034, + "language_loss": 0.86998093, + "learning_rate": 0.0005698676265064714, + "loss": 0.88082415, + "num_input_tokens_seen": 204331216, + "router_z_loss_mlp": 0.36499023, + "step": 2452, + "time_per_iteration": 2.5178701877593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108693, + "balance_loss_mlp": 1.04887831, + "epoch": 0.4719122739515198, + "flos": 457200543744.0, + "grad_norm": 0.06455625952921856, + "language_loss": 0.89101571, + "learning_rate": 0.0005695591272265074, + "loss": 0.90188503, + "num_input_tokens_seen": 204397216, + "router_z_loss_mlp": 0.38037109, + "step": 2453, + "time_per_iteration": 2.527940511703491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094235, + "balance_loss_mlp": 1.05601645, + "epoch": 0.4721046556367834, + "flos": 514716000768.0, + "grad_norm": 0.05921175255811472, + "language_loss": 0.81955969, + "learning_rate": 0.0005692506009412954, + "loss": 0.83050203, + "num_input_tokens_seen": 204469952, + "router_z_loss_mlp": 0.3815918, + "step": 2454, + "time_per_iteration": 2.6692135334014893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152126, + "balance_loss_mlp": 1.13209891, + "epoch": 0.4722970373220469, + "flos": 1571399723520.0, + "grad_norm": 0.04281653423243919, + "language_loss": 0.7755127, + "learning_rate": 0.0005689420477706156, + "loss": 0.78703392, + "num_input_tokens_seen": 204701152, + "router_z_loss_mlp": 0.20019531, + "step": 2455, + "time_per_iteration": 4.940452337265015 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085506, + "balance_loss_mlp": 1.04731131, + "epoch": 0.4724894190073105, + "flos": 585919927296.0, + "grad_norm": 0.06574328103666784, + "language_loss": 0.89537692, + "learning_rate": 0.0005686334678342593, + "loss": 0.906232, + "num_input_tokens_seen": 204778144, + "router_z_loss_mlp": 0.38183594, + "step": 2456, + "time_per_iteration": 2.8626763820648193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085091, + "balance_loss_mlp": 1.04816043, + "epoch": 0.4726818006925741, + "flos": 867290347008.0, + "grad_norm": 0.053689359601525224, + "language_loss": 0.81760311, + "learning_rate": 0.0005683248612520274, + "loss": 0.82845408, + "num_input_tokens_seen": 204853376, + "router_z_loss_mlp": 0.36914062, + "step": 2457, + "time_per_iteration": 3.062195301055908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079889, + "balance_loss_mlp": 1.04300618, + "epoch": 0.4728741823778376, + "flos": 752653721088.0, + "grad_norm": 0.06424431420602757, + "language_loss": 0.83881927, + "learning_rate": 0.0005680162281437321, + "loss": 0.84961808, + "num_input_tokens_seen": 204925280, + "router_z_loss_mlp": 0.36865234, + "step": 2458, + "time_per_iteration": 4.24756932258606 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081915, + "balance_loss_mlp": 1.04474509, + "epoch": 0.4730665640631012, + "flos": 538301265408.0, + "grad_norm": 0.04398827684533395, + "language_loss": 0.84583557, + "learning_rate": 0.000567707568629195, + "loss": 0.8566547, + "num_input_tokens_seen": 205000592, + "router_z_loss_mlp": 0.37158203, + "step": 2459, + "time_per_iteration": 2.678410530090332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077333, + "balance_loss_mlp": 1.04104519, + "epoch": 0.47325894574836475, + "flos": 491396986368.0, + "grad_norm": 0.04729381274413396, + "language_loss": 0.82117784, + "learning_rate": 0.0005673988828282486, + "loss": 0.83195114, + "num_input_tokens_seen": 205073968, + "router_z_loss_mlp": 0.36303711, + "step": 2460, + "time_per_iteration": 2.6379287242889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080146, + "balance_loss_mlp": 1.04397774, + "epoch": 0.47345132743362833, + "flos": 764117494272.0, + "grad_norm": 0.048508898725252214, + "language_loss": 0.80703068, + "learning_rate": 0.0005670901708607352, + "loss": 0.81783217, + "num_input_tokens_seen": 205153536, + "router_z_loss_mlp": 0.36206055, + "step": 2461, + "time_per_iteration": 2.9682881832122803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079185, + "balance_loss_mlp": 1.04366088, + "epoch": 0.47364370911889186, + "flos": 539925060096.0, + "grad_norm": 0.06522156043574484, + "language_loss": 0.84211236, + "learning_rate": 0.0005667814328465076, + "loss": 0.8529042, + "num_input_tokens_seen": 205220944, + "router_z_loss_mlp": 0.35546875, + "step": 2462, + "time_per_iteration": 2.6927719116210938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074953, + "balance_loss_mlp": 1.04031122, + "epoch": 0.47383609080415545, + "flos": 406002613248.0, + "grad_norm": 0.06749328280555515, + "language_loss": 0.81615329, + "learning_rate": 0.0005664726689054285, + "loss": 0.82690287, + "num_input_tokens_seen": 205282688, + "router_z_loss_mlp": 0.34692383, + "step": 2463, + "time_per_iteration": 2.4384853839874268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078599, + "balance_loss_mlp": 1.04345584, + "epoch": 0.474028472489419, + "flos": 453237161472.0, + "grad_norm": 0.0467114590315811, + "language_loss": 0.81182402, + "learning_rate": 0.0005661638791573704, + "loss": 0.82261002, + "num_input_tokens_seen": 205357360, + "router_z_loss_mlp": 0.35180664, + "step": 2464, + "time_per_iteration": 2.695479154586792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108113, + "balance_loss_mlp": 1.04582047, + "epoch": 0.47422085417468257, + "flos": 491923694592.0, + "grad_norm": 0.04732653708909472, + "language_loss": 0.86637986, + "learning_rate": 0.0005658550637222164, + "loss": 0.87719119, + "num_input_tokens_seen": 205424352, + "router_z_loss_mlp": 0.35327148, + "step": 2465, + "time_per_iteration": 2.6167092323303223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079374, + "balance_loss_mlp": 1.04365873, + "epoch": 0.47441323585994616, + "flos": 738537467904.0, + "grad_norm": 0.057300064889236176, + "language_loss": 0.82372761, + "learning_rate": 0.0005655462227198592, + "loss": 0.83452135, + "num_input_tokens_seen": 205502912, + "router_z_loss_mlp": 0.35742188, + "step": 2466, + "time_per_iteration": 2.9023492336273193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080505, + "balance_loss_mlp": 1.04509962, + "epoch": 0.4746056175452097, + "flos": 484439270400.0, + "grad_norm": 0.05227273448390526, + "language_loss": 0.83720088, + "learning_rate": 0.0005652373562702016, + "loss": 0.84800589, + "num_input_tokens_seen": 205571168, + "router_z_loss_mlp": 0.35449219, + "step": 2467, + "time_per_iteration": 2.5808918476104736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082419, + "balance_loss_mlp": 1.04715681, + "epoch": 0.4747979992304733, + "flos": 460814717952.0, + "grad_norm": 0.05382206625072039, + "language_loss": 0.88037241, + "learning_rate": 0.000564928464493156, + "loss": 0.89119661, + "num_input_tokens_seen": 205639648, + "router_z_loss_mlp": 0.35302734, + "step": 2468, + "time_per_iteration": 2.5377156734466553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087106, + "balance_loss_mlp": 1.05198669, + "epoch": 0.4749903809157368, + "flos": 864070460928.0, + "grad_norm": 0.0577962749951369, + "language_loss": 0.81768191, + "learning_rate": 0.000564619547508645, + "loss": 0.82855296, + "num_input_tokens_seen": 205721536, + "router_z_loss_mlp": 0.3515625, + "step": 2469, + "time_per_iteration": 3.043691396713257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086191, + "balance_loss_mlp": 1.05042827, + "epoch": 0.4751827626010004, + "flos": 505296451584.0, + "grad_norm": 0.1751373121791138, + "language_loss": 0.83049238, + "learning_rate": 0.0005643106054366008, + "loss": 0.84135431, + "num_input_tokens_seen": 205788512, + "router_z_loss_mlp": 0.3581543, + "step": 2470, + "time_per_iteration": 2.6487743854522705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085754, + "balance_loss_mlp": 1.05118382, + "epoch": 0.47537514428626393, + "flos": 559123540992.0, + "grad_norm": 0.05689297252919276, + "language_loss": 0.79414684, + "learning_rate": 0.000564001638396965, + "loss": 0.80500442, + "num_input_tokens_seen": 205863104, + "router_z_loss_mlp": 0.34594727, + "step": 2471, + "time_per_iteration": 2.749767780303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087834, + "balance_loss_mlp": 1.05228639, + "epoch": 0.4755675259715275, + "flos": 833907211776.0, + "grad_norm": 0.05462179859190678, + "language_loss": 0.81897652, + "learning_rate": 0.0005636926465096897, + "loss": 0.82985491, + "num_input_tokens_seen": 205940688, + "router_z_loss_mlp": 0.35546875, + "step": 2472, + "time_per_iteration": 3.043703556060791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091887, + "balance_loss_mlp": 1.05569541, + "epoch": 0.47575990765679105, + "flos": 507989629440.0, + "grad_norm": 0.050841736172577985, + "language_loss": 0.87258822, + "learning_rate": 0.0005633836298947363, + "loss": 0.88350713, + "num_input_tokens_seen": 206008352, + "router_z_loss_mlp": 0.36206055, + "step": 2473, + "time_per_iteration": 2.564831018447876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098688, + "balance_loss_mlp": 1.06206715, + "epoch": 0.47595228934205464, + "flos": 591566740992.0, + "grad_norm": 0.05674114123782856, + "language_loss": 0.70767033, + "learning_rate": 0.000563074588672075, + "loss": 0.7186572, + "num_input_tokens_seen": 206078240, + "router_z_loss_mlp": 0.3659668, + "step": 2474, + "time_per_iteration": 2.6735401153564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095847, + "balance_loss_mlp": 1.05960727, + "epoch": 0.4761446710273182, + "flos": 580340104704.0, + "grad_norm": 0.055780063244739476, + "language_loss": 0.84891874, + "learning_rate": 0.0005627655229616868, + "loss": 0.85987723, + "num_input_tokens_seen": 206148896, + "router_z_loss_mlp": 0.36230469, + "step": 2475, + "time_per_iteration": 2.672621488571167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096569, + "balance_loss_mlp": 1.05899405, + "epoch": 0.47633705271258175, + "flos": 672597651456.0, + "grad_norm": 0.05102987049441457, + "language_loss": 0.90229654, + "learning_rate": 0.0005624564328835616, + "loss": 0.91326219, + "num_input_tokens_seen": 206223792, + "router_z_loss_mlp": 0.37524414, + "step": 2476, + "time_per_iteration": 2.8432443141937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102069, + "balance_loss_mlp": 1.0635407, + "epoch": 0.47652943439784534, + "flos": 541580788224.0, + "grad_norm": 0.0471064217807047, + "language_loss": 0.84254396, + "learning_rate": 0.0005621473185576986, + "loss": 0.85356462, + "num_input_tokens_seen": 206299376, + "router_z_loss_mlp": 0.38525391, + "step": 2477, + "time_per_iteration": 2.702977180480957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095957, + "balance_loss_mlp": 1.05826259, + "epoch": 0.4767218160831089, + "flos": 524563333632.0, + "grad_norm": 0.057656530584244435, + "language_loss": 0.87137967, + "learning_rate": 0.0005618381801041068, + "loss": 0.88233924, + "num_input_tokens_seen": 206367936, + "router_z_loss_mlp": 0.37670898, + "step": 2478, + "time_per_iteration": 2.603593111038208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098137, + "balance_loss_mlp": 1.05953729, + "epoch": 0.47691419776837246, + "flos": 567789419520.0, + "grad_norm": 0.11168904607405869, + "language_loss": 0.82855433, + "learning_rate": 0.0005615290176428044, + "loss": 0.83953571, + "num_input_tokens_seen": 206438864, + "router_z_loss_mlp": 0.38574219, + "step": 2479, + "time_per_iteration": 2.6339292526245117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109593, + "balance_loss_mlp": 1.05959523, + "epoch": 0.477106579453636, + "flos": 530659689984.0, + "grad_norm": 0.06204032147038535, + "language_loss": 0.85517442, + "learning_rate": 0.0005612198312938187, + "loss": 0.86613369, + "num_input_tokens_seen": 206516656, + "router_z_loss_mlp": 0.36328125, + "step": 2480, + "time_per_iteration": 2.727931261062622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096505, + "balance_loss_mlp": 1.05912077, + "epoch": 0.4772989611388996, + "flos": 593980521984.0, + "grad_norm": 0.07113059060466843, + "language_loss": 0.79093325, + "learning_rate": 0.0005609106211771868, + "loss": 0.80189824, + "num_input_tokens_seen": 206595040, + "router_z_loss_mlp": 0.37402344, + "step": 2481, + "time_per_iteration": 2.8239502906799316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091646, + "balance_loss_mlp": 1.05471444, + "epoch": 0.4774913428241631, + "flos": 544352541696.0, + "grad_norm": 0.07337307686737661, + "language_loss": 0.89208174, + "learning_rate": 0.0005606013874129543, + "loss": 0.90299821, + "num_input_tokens_seen": 206670192, + "router_z_loss_mlp": 0.36914062, + "step": 2482, + "time_per_iteration": 2.7480216026306152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088167, + "balance_loss_mlp": 1.05187941, + "epoch": 0.4776837245094267, + "flos": 539817371136.0, + "grad_norm": 0.16520730257770824, + "language_loss": 0.80029452, + "learning_rate": 0.0005602921301211768, + "loss": 0.81117618, + "num_input_tokens_seen": 206746992, + "router_z_loss_mlp": 0.36303711, + "step": 2483, + "time_per_iteration": 2.6802146434783936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096429, + "balance_loss_mlp": 1.06021321, + "epoch": 0.4778761061946903, + "flos": 471543759360.0, + "grad_norm": 0.07816325562851568, + "language_loss": 0.81835008, + "learning_rate": 0.0005599828494219185, + "loss": 0.82931435, + "num_input_tokens_seen": 206813584, + "router_z_loss_mlp": 0.36206055, + "step": 2484, + "time_per_iteration": 2.546365976333618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094667, + "balance_loss_mlp": 1.05923831, + "epoch": 0.4780684878799538, + "flos": 725769994752.0, + "grad_norm": 0.05627448694129284, + "language_loss": 0.88551247, + "learning_rate": 0.0005596735454352527, + "loss": 0.89645922, + "num_input_tokens_seen": 206885840, + "router_z_loss_mlp": 0.35498047, + "step": 2485, + "time_per_iteration": 2.862647771835327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106974, + "balance_loss_mlp": 1.07054353, + "epoch": 0.4782608695652174, + "flos": 548665132032.0, + "grad_norm": 0.07015146645765026, + "language_loss": 0.85657477, + "learning_rate": 0.0005593642182812619, + "loss": 0.86764455, + "num_input_tokens_seen": 206955104, + "router_z_loss_mlp": 0.36425781, + "step": 2486, + "time_per_iteration": 2.609184741973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101968, + "balance_loss_mlp": 1.06558526, + "epoch": 0.47845325125048094, + "flos": 829555333632.0, + "grad_norm": 0.061922125379274766, + "language_loss": 0.83543551, + "learning_rate": 0.0005590548680800378, + "loss": 0.84645522, + "num_input_tokens_seen": 207039792, + "router_z_loss_mlp": 0.36401367, + "step": 2487, + "time_per_iteration": 3.089769124984741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110265, + "balance_loss_mlp": 1.0746448, + "epoch": 0.4786456329357445, + "flos": 513889546752.0, + "grad_norm": 0.2189409026834594, + "language_loss": 0.76099992, + "learning_rate": 0.0005587454949516804, + "loss": 0.77210259, + "num_input_tokens_seen": 207115632, + "router_z_loss_mlp": 0.35644531, + "step": 2488, + "time_per_iteration": 2.751055955886841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110821, + "balance_loss_mlp": 1.07187533, + "epoch": 0.47883801462100806, + "flos": 564392033280.0, + "grad_norm": 0.10409544878795325, + "language_loss": 0.87659556, + "learning_rate": 0.0005584360990162993, + "loss": 0.88767767, + "num_input_tokens_seen": 207184336, + "router_z_loss_mlp": 0.36376953, + "step": 2489, + "time_per_iteration": 2.6652133464813232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113142, + "balance_loss_mlp": 1.07563877, + "epoch": 0.47903039630627164, + "flos": 579296862720.0, + "grad_norm": 0.09667813376582209, + "language_loss": 0.8484993, + "learning_rate": 0.0005581266803940124, + "loss": 0.8596307, + "num_input_tokens_seen": 207258720, + "router_z_loss_mlp": 0.375, + "step": 2490, + "time_per_iteration": 2.736374616622925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119685, + "balance_loss_mlp": 1.08206201, + "epoch": 0.47922277799153523, + "flos": 618667255296.0, + "grad_norm": 0.050098276566308, + "language_loss": 0.87162292, + "learning_rate": 0.0005578172392049471, + "loss": 0.88281971, + "num_input_tokens_seen": 207329216, + "router_z_loss_mlp": 0.37573242, + "step": 2491, + "time_per_iteration": 2.7753453254699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011097, + "balance_loss_mlp": 1.07307923, + "epoch": 0.47941515967679876, + "flos": 639352728576.0, + "grad_norm": 0.06461059150776577, + "language_loss": 0.83998954, + "learning_rate": 0.0005575077755692386, + "loss": 0.85108656, + "num_input_tokens_seen": 207403712, + "router_z_loss_mlp": 0.3659668, + "step": 2492, + "time_per_iteration": 2.788609266281128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113104, + "balance_loss_mlp": 1.07595801, + "epoch": 0.47960754136206235, + "flos": 519561091584.0, + "grad_norm": 0.0557937811773086, + "language_loss": 0.86232179, + "learning_rate": 0.0005571982896070316, + "loss": 0.87345278, + "num_input_tokens_seen": 207477120, + "router_z_loss_mlp": 0.37158203, + "step": 2493, + "time_per_iteration": 2.6394574642181396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107707, + "balance_loss_mlp": 1.07111025, + "epoch": 0.4797999230473259, + "flos": 474798551040.0, + "grad_norm": 0.0598408121702559, + "language_loss": 0.90174985, + "learning_rate": 0.0005568887814384792, + "loss": 0.9128269, + "num_input_tokens_seen": 207544592, + "router_z_loss_mlp": 0.3659668, + "step": 2494, + "time_per_iteration": 2.534224033355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111594, + "balance_loss_mlp": 1.0754025, + "epoch": 0.47999230473258947, + "flos": 531766950912.0, + "grad_norm": 0.07246176028888049, + "language_loss": 0.87038457, + "learning_rate": 0.000556579251183743, + "loss": 0.88150048, + "num_input_tokens_seen": 207613808, + "router_z_loss_mlp": 0.36230469, + "step": 2495, + "time_per_iteration": 2.6398251056671143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094859, + "balance_loss_mlp": 1.05802298, + "epoch": 0.480184686417853, + "flos": 601207460352.0, + "grad_norm": 0.06271692106547645, + "language_loss": 0.79938626, + "learning_rate": 0.0005562696989629936, + "loss": 0.8103348, + "num_input_tokens_seen": 207684464, + "router_z_loss_mlp": 0.3684082, + "step": 2496, + "time_per_iteration": 2.6642816066741943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093996, + "balance_loss_mlp": 1.05766106, + "epoch": 0.4803770681031166, + "flos": 527931606528.0, + "grad_norm": 0.05594777531112506, + "language_loss": 0.82110333, + "learning_rate": 0.0005559601248964095, + "loss": 0.83204329, + "num_input_tokens_seen": 207754016, + "router_z_loss_mlp": 0.36352539, + "step": 2497, + "time_per_iteration": 2.636070966720581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093639, + "balance_loss_mlp": 1.05739903, + "epoch": 0.4805694497883801, + "flos": 510934500864.0, + "grad_norm": 0.054324508936697755, + "language_loss": 0.85873795, + "learning_rate": 0.0005556505291041783, + "loss": 0.86967432, + "num_input_tokens_seen": 207827104, + "router_z_loss_mlp": 0.36254883, + "step": 2498, + "time_per_iteration": 2.7246336936950684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094125, + "balance_loss_mlp": 1.05757546, + "epoch": 0.4807618314736437, + "flos": 600027416064.0, + "grad_norm": 0.37566577491106196, + "language_loss": 0.84318507, + "learning_rate": 0.0005553409117064954, + "loss": 0.85412627, + "num_input_tokens_seen": 207907824, + "router_z_loss_mlp": 0.36547852, + "step": 2499, + "time_per_iteration": 2.8535146713256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105189, + "balance_loss_mlp": 1.06770992, + "epoch": 0.4809542131589073, + "flos": 568700241408.0, + "grad_norm": 0.05235544022747109, + "language_loss": 0.84675509, + "learning_rate": 0.0005550312728235654, + "loss": 0.85780698, + "num_input_tokens_seen": 207975632, + "router_z_loss_mlp": 0.37475586, + "step": 2500, + "time_per_iteration": 2.691314697265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118964, + "balance_loss_mlp": 1.08138871, + "epoch": 0.4811465948441708, + "flos": 575703037440.0, + "grad_norm": 0.0667425977867665, + "language_loss": 0.83709896, + "learning_rate": 0.0005547216125756003, + "loss": 0.84828854, + "num_input_tokens_seen": 208048000, + "router_z_loss_mlp": 0.37573242, + "step": 2501, + "time_per_iteration": 2.7381327152252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126097, + "balance_loss_mlp": 1.08754468, + "epoch": 0.4813389765294344, + "flos": 823508439552.0, + "grad_norm": 0.052606522983796165, + "language_loss": 0.82174253, + "learning_rate": 0.0005544119310828211, + "loss": 0.83300352, + "num_input_tokens_seen": 208132592, + "router_z_loss_mlp": 0.38549805, + "step": 2502, + "time_per_iteration": 3.072216272354126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134856, + "balance_loss_mlp": 1.09632754, + "epoch": 0.48153135821469795, + "flos": 635240959488.0, + "grad_norm": 0.048230358167368766, + "language_loss": 0.84706873, + "learning_rate": 0.0005541022284654568, + "loss": 0.85841727, + "num_input_tokens_seen": 208215824, + "router_z_loss_mlp": 0.38525391, + "step": 2503, + "time_per_iteration": 2.916139602661133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128897, + "balance_loss_mlp": 1.09051132, + "epoch": 0.48172373989996153, + "flos": 503450076672.0, + "grad_norm": 0.07897645884633452, + "language_loss": 0.84086657, + "learning_rate": 0.0005537925048437446, + "loss": 0.85215557, + "num_input_tokens_seen": 208284304, + "router_z_loss_mlp": 0.38354492, + "step": 2504, + "time_per_iteration": 2.5921871662139893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110906, + "balance_loss_mlp": 1.09278584, + "epoch": 0.48191612158522507, + "flos": 1531563821568.0, + "grad_norm": 0.0372588251023387, + "language_loss": 0.75751472, + "learning_rate": 0.00055348276033793, + "loss": 0.76862371, + "num_input_tokens_seen": 208510224, + "router_z_loss_mlp": 0.18164062, + "step": 2505, + "time_per_iteration": 4.9559855461120605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141303, + "balance_loss_mlp": 1.10132027, + "epoch": 0.48210850327048865, + "flos": 702078451200.0, + "grad_norm": 0.058816464552035166, + "language_loss": 0.88463128, + "learning_rate": 0.0005531729950682664, + "loss": 0.89604431, + "num_input_tokens_seen": 208596816, + "router_z_loss_mlp": 0.3996582, + "step": 2506, + "time_per_iteration": 3.0114240646362305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132181, + "balance_loss_mlp": 1.09353316, + "epoch": 0.4823008849557522, + "flos": 439548691968.0, + "grad_norm": 0.06626147096234755, + "language_loss": 0.84781104, + "learning_rate": 0.000552863209155015, + "loss": 0.85913289, + "num_input_tokens_seen": 208659616, + "router_z_loss_mlp": 0.38598633, + "step": 2507, + "time_per_iteration": 2.5784101486206055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113394, + "balance_loss_mlp": 1.09390914, + "epoch": 0.48249326664101577, + "flos": 471622334976.0, + "grad_norm": 0.05712589242287889, + "language_loss": 0.82110274, + "learning_rate": 0.0005525534027184461, + "loss": 0.83244216, + "num_input_tokens_seen": 208728080, + "router_z_loss_mlp": 0.40014648, + "step": 2508, + "time_per_iteration": 2.552065372467041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132678, + "balance_loss_mlp": 1.09395885, + "epoch": 0.48268564832627936, + "flos": 562954503168.0, + "grad_norm": 0.04979156125943264, + "language_loss": 0.82958996, + "learning_rate": 0.0005522435758788365, + "loss": 0.84091675, + "num_input_tokens_seen": 208803376, + "router_z_loss_mlp": 0.38696289, + "step": 2509, + "time_per_iteration": 2.727841854095459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121662, + "balance_loss_mlp": 1.08210802, + "epoch": 0.4828780300115429, + "flos": 629298782208.0, + "grad_norm": 0.054057791094232886, + "language_loss": 0.79695261, + "learning_rate": 0.0005519337287564721, + "loss": 0.80816925, + "num_input_tokens_seen": 208876656, + "router_z_loss_mlp": 0.39526367, + "step": 2510, + "time_per_iteration": 2.841032028198242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111392, + "balance_loss_mlp": 1.07582068, + "epoch": 0.4830704116968065, + "flos": 631562766336.0, + "grad_norm": 0.0770242195625866, + "language_loss": 0.83640802, + "learning_rate": 0.000551623861471646, + "loss": 0.84754717, + "num_input_tokens_seen": 208950224, + "router_z_loss_mlp": 0.38061523, + "step": 2511, + "time_per_iteration": 2.7330808639526367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051582, + "balance_loss_mlp": 1.03489304, + "epoch": 0.48326279338207, + "flos": 1568434503168.0, + "grad_norm": 0.02207943535017646, + "language_loss": 0.78818834, + "learning_rate": 0.0005513139741446594, + "loss": 0.79870415, + "num_input_tokens_seen": 209173984, + "router_z_loss_mlp": 0.16699219, + "step": 2512, + "time_per_iteration": 4.847305536270142 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119635, + "balance_loss_mlp": 1.08015239, + "epoch": 0.4834551750673336, + "flos": 508989201408.0, + "grad_norm": 0.07604353740704149, + "language_loss": 0.86230296, + "learning_rate": 0.0005510040668958211, + "loss": 0.87349927, + "num_input_tokens_seen": 209242832, + "router_z_loss_mlp": 0.39453125, + "step": 2513, + "time_per_iteration": 2.6358695030212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039681, + "balance_loss_mlp": 1.02423155, + "epoch": 0.48364755675259713, + "flos": 1527875453952.0, + "grad_norm": 0.016719139942629795, + "language_loss": 0.77760583, + "learning_rate": 0.0005506941398454483, + "loss": 0.78800267, + "num_input_tokens_seen": 209473520, + "router_z_loss_mlp": 0.15429688, + "step": 2514, + "time_per_iteration": 4.8266448974609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108341, + "balance_loss_mlp": 1.06895423, + "epoch": 0.4838399384378607, + "flos": 564726684672.0, + "grad_norm": 0.05692617769518991, + "language_loss": 0.8306818, + "learning_rate": 0.0005503841931138645, + "loss": 0.84176517, + "num_input_tokens_seen": 209544208, + "router_z_loss_mlp": 0.39355469, + "step": 2515, + "time_per_iteration": 4.18599271774292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108623, + "balance_loss_mlp": 1.07073843, + "epoch": 0.4840323201231243, + "flos": 387479227392.0, + "grad_norm": 0.0681425082817114, + "language_loss": 0.81703341, + "learning_rate": 0.0005500742268214025, + "loss": 0.82811964, + "num_input_tokens_seen": 209607408, + "router_z_loss_mlp": 0.37841797, + "step": 2516, + "time_per_iteration": 2.4660089015960693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109531, + "balance_loss_mlp": 1.07116938, + "epoch": 0.48422470180838784, + "flos": 630701406720.0, + "grad_norm": 0.09015941461472031, + "language_loss": 0.85304928, + "learning_rate": 0.0005497642410884014, + "loss": 0.86414456, + "num_input_tokens_seen": 209683392, + "router_z_loss_mlp": 0.38305664, + "step": 2517, + "time_per_iteration": 2.8147974014282227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108486, + "balance_loss_mlp": 1.06845522, + "epoch": 0.4844170834936514, + "flos": 498955603968.0, + "grad_norm": 0.05998889999991439, + "language_loss": 0.8499558, + "learning_rate": 0.0005494542360352085, + "loss": 0.86104071, + "num_input_tokens_seen": 209753184, + "router_z_loss_mlp": 0.40014648, + "step": 2518, + "time_per_iteration": 2.639248847961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101242, + "balance_loss_mlp": 1.06335747, + "epoch": 0.48460946517891496, + "flos": 550798106112.0, + "grad_norm": 0.04916831458391579, + "language_loss": 0.85637897, + "learning_rate": 0.0005491442117821783, + "loss": 0.86739141, + "num_input_tokens_seen": 209829568, + "router_z_loss_mlp": 0.37866211, + "step": 2519, + "time_per_iteration": 2.7024173736572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101572, + "balance_loss_mlp": 1.06275773, + "epoch": 0.48480184686417854, + "flos": 529123235328.0, + "grad_norm": 0.05557918275255021, + "language_loss": 0.87415975, + "learning_rate": 0.0005488341684496732, + "loss": 0.88517547, + "num_input_tokens_seen": 209902176, + "router_z_loss_mlp": 0.38793945, + "step": 2520, + "time_per_iteration": 2.6733944416046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094952, + "balance_loss_mlp": 1.05732954, + "epoch": 0.4849942285494421, + "flos": 531630148608.0, + "grad_norm": 0.049677430441928086, + "language_loss": 0.91897535, + "learning_rate": 0.0005485241061580624, + "loss": 0.92992491, + "num_input_tokens_seen": 209969168, + "router_z_loss_mlp": 0.37646484, + "step": 2521, + "time_per_iteration": 2.7186949253082275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087464, + "balance_loss_mlp": 1.04802954, + "epoch": 0.48518661023470566, + "flos": 722231424000.0, + "grad_norm": 0.05969395587297076, + "language_loss": 0.84698212, + "learning_rate": 0.0005482140250277228, + "loss": 0.85785675, + "num_input_tokens_seen": 210049616, + "router_z_loss_mlp": 0.39404297, + "step": 2522, + "time_per_iteration": 3.0005805492401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084269, + "balance_loss_mlp": 1.04664636, + "epoch": 0.4853789919199692, + "flos": 505843508736.0, + "grad_norm": 0.0576168536354582, + "language_loss": 0.87382847, + "learning_rate": 0.0005479039251790387, + "loss": 0.88467115, + "num_input_tokens_seen": 210118512, + "router_z_loss_mlp": 0.37597656, + "step": 2523, + "time_per_iteration": 2.612565517425537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083753, + "balance_loss_mlp": 1.04508114, + "epoch": 0.4855713736052328, + "flos": 660185178624.0, + "grad_norm": 0.05213001441745639, + "language_loss": 0.84754556, + "learning_rate": 0.0005475938067324014, + "loss": 0.85838306, + "num_input_tokens_seen": 210193728, + "router_z_loss_mlp": 0.38647461, + "step": 2524, + "time_per_iteration": 2.7874755859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084646, + "balance_loss_mlp": 1.04556894, + "epoch": 0.48576375529049637, + "flos": 436727476224.0, + "grad_norm": 0.04741211423020534, + "language_loss": 0.83422267, + "learning_rate": 0.0005472836698082098, + "loss": 0.84506917, + "num_input_tokens_seen": 210258832, + "router_z_loss_mlp": 0.39086914, + "step": 2525, + "time_per_iteration": 2.50516676902771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076506, + "balance_loss_mlp": 1.03764343, + "epoch": 0.4859561369757599, + "flos": 581424044544.0, + "grad_norm": 0.04357292691167825, + "language_loss": 0.84170592, + "learning_rate": 0.0005469735145268694, + "loss": 0.85247099, + "num_input_tokens_seen": 210335280, + "router_z_loss_mlp": 0.38818359, + "step": 2526, + "time_per_iteration": 2.7474558353424072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076384, + "balance_loss_mlp": 1.03723574, + "epoch": 0.4861485186610235, + "flos": 487723175424.0, + "grad_norm": 0.056946126423794464, + "language_loss": 0.80818385, + "learning_rate": 0.0005466633410087933, + "loss": 0.81894767, + "num_input_tokens_seen": 210407072, + "router_z_loss_mlp": 0.39111328, + "step": 2527, + "time_per_iteration": 2.690655469894409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080703, + "balance_loss_mlp": 1.06363261, + "epoch": 0.486340900346287, + "flos": 1556893564416.0, + "grad_norm": 0.03973044492620415, + "language_loss": 0.77260822, + "learning_rate": 0.0005463531493744017, + "loss": 0.78341526, + "num_input_tokens_seen": 210644544, + "router_z_loss_mlp": 0.17089844, + "step": 2528, + "time_per_iteration": 4.852689981460571 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076, + "balance_loss_mlp": 1.03723347, + "epoch": 0.4865332820315506, + "flos": 482760221184.0, + "grad_norm": 0.04657742417719492, + "language_loss": 0.88156307, + "learning_rate": 0.0005460429397441214, + "loss": 0.89232314, + "num_input_tokens_seen": 210711760, + "router_z_loss_mlp": 0.38720703, + "step": 2529, + "time_per_iteration": 2.55281662940979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079921, + "balance_loss_mlp": 1.04053402, + "epoch": 0.48672566371681414, + "flos": 535548450816.0, + "grad_norm": 0.06549810250084472, + "language_loss": 0.86653185, + "learning_rate": 0.0005457327122383866, + "loss": 0.87733108, + "num_input_tokens_seen": 210783040, + "router_z_loss_mlp": 0.39379883, + "step": 2530, + "time_per_iteration": 2.671656847000122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035467, + "balance_loss_mlp": 1.01963639, + "epoch": 0.4869180454020777, + "flos": 1411876901376.0, + "grad_norm": 0.025637836045087663, + "language_loss": 0.74636483, + "learning_rate": 0.0005454224669776385, + "loss": 0.75671959, + "num_input_tokens_seen": 211002128, + "router_z_loss_mlp": 0.15820312, + "step": 2531, + "time_per_iteration": 4.814793348312378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081396, + "balance_loss_mlp": 1.04322505, + "epoch": 0.48711042708734126, + "flos": 572836741632.0, + "grad_norm": 0.048652424424379774, + "language_loss": 0.7607469, + "learning_rate": 0.0005451122040823244, + "loss": 0.77156091, + "num_input_tokens_seen": 211080080, + "router_z_loss_mlp": 0.38134766, + "step": 2532, + "time_per_iteration": 2.7569382190704346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081141, + "balance_loss_mlp": 1.04246926, + "epoch": 0.48730280877260485, + "flos": 626231665152.0, + "grad_norm": 0.05261384345268123, + "language_loss": 0.76949328, + "learning_rate": 0.0005448019236728997, + "loss": 0.78030467, + "num_input_tokens_seen": 211162944, + "router_z_loss_mlp": 0.38647461, + "step": 2533, + "time_per_iteration": 2.8791191577911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082063, + "balance_loss_mlp": 1.04439306, + "epoch": 0.48749519045786843, + "flos": 512233818624.0, + "grad_norm": 0.05361284003065004, + "language_loss": 0.84639871, + "learning_rate": 0.0005444916258698255, + "loss": 0.85721934, + "num_input_tokens_seen": 211230448, + "router_z_loss_mlp": 0.37670898, + "step": 2534, + "time_per_iteration": 2.584188938140869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108354, + "balance_loss_mlp": 1.04548812, + "epoch": 0.48768757214313196, + "flos": 525149678592.0, + "grad_norm": 0.044479444876285516, + "language_loss": 0.85999918, + "learning_rate": 0.0005441813107935704, + "loss": 0.87083459, + "num_input_tokens_seen": 211301248, + "router_z_loss_mlp": 0.38037109, + "step": 2535, + "time_per_iteration": 2.63484787940979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089581, + "balance_loss_mlp": 1.05141044, + "epoch": 0.48787995382839555, + "flos": 504784300032.0, + "grad_norm": 0.05225590764746468, + "language_loss": 0.85801542, + "learning_rate": 0.0005438709785646091, + "loss": 0.86891127, + "num_input_tokens_seen": 211369888, + "router_z_loss_mlp": 0.38110352, + "step": 2536, + "time_per_iteration": 2.5857274532318115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087898, + "balance_loss_mlp": 1.0496794, + "epoch": 0.4880723355136591, + "flos": 574904286720.0, + "grad_norm": 0.05427082704851873, + "language_loss": 0.8654719, + "learning_rate": 0.0005435606293034234, + "loss": 0.87635088, + "num_input_tokens_seen": 211441808, + "router_z_loss_mlp": 0.3815918, + "step": 2537, + "time_per_iteration": 2.6441421508789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082535, + "balance_loss_mlp": 1.04498374, + "epoch": 0.48826471719892267, + "flos": 561172147200.0, + "grad_norm": 0.0666705066547564, + "language_loss": 0.84424317, + "learning_rate": 0.0005432502631305016, + "loss": 0.8550685, + "num_input_tokens_seen": 211511216, + "router_z_loss_mlp": 0.37548828, + "step": 2538, + "time_per_iteration": 2.657888174057007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081573, + "balance_loss_mlp": 1.04383135, + "epoch": 0.4884570988841862, + "flos": 725849980416.0, + "grad_norm": 0.04200092081923836, + "language_loss": 0.83068514, + "learning_rate": 0.0005429398801663386, + "loss": 0.84150088, + "num_input_tokens_seen": 211589264, + "router_z_loss_mlp": 0.37744141, + "step": 2539, + "time_per_iteration": 2.926213264465332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085607, + "balance_loss_mlp": 1.04726946, + "epoch": 0.4886494805694498, + "flos": 430794063360.0, + "grad_norm": 0.05775520457519848, + "language_loss": 0.82975113, + "learning_rate": 0.0005426294805314355, + "loss": 0.84060717, + "num_input_tokens_seen": 211652928, + "router_z_loss_mlp": 0.38305664, + "step": 2540, + "time_per_iteration": 2.476100444793701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087326, + "balance_loss_mlp": 1.0497514, + "epoch": 0.4888418622547134, + "flos": 672673254912.0, + "grad_norm": 0.050739997063638825, + "language_loss": 0.79934752, + "learning_rate": 0.0005423190643463003, + "loss": 0.81022084, + "num_input_tokens_seen": 211741664, + "router_z_loss_mlp": 0.37573242, + "step": 2541, + "time_per_iteration": 2.983567953109741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108794, + "balance_loss_mlp": 1.05005538, + "epoch": 0.4890342439399769, + "flos": 541639014912.0, + "grad_norm": 0.05834464255250002, + "language_loss": 0.82589471, + "learning_rate": 0.0005420086317314473, + "loss": 0.83677411, + "num_input_tokens_seen": 211809136, + "router_z_loss_mlp": 0.37841797, + "step": 2542, + "time_per_iteration": 2.6762986183166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088795, + "balance_loss_mlp": 1.04957485, + "epoch": 0.4892266256252405, + "flos": 590380904448.0, + "grad_norm": 0.056502349447813176, + "language_loss": 0.8105309, + "learning_rate": 0.0005416981828073971, + "loss": 0.82141888, + "num_input_tokens_seen": 211883136, + "router_z_loss_mlp": 0.39208984, + "step": 2543, + "time_per_iteration": 2.798063039779663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111053, + "balance_loss_mlp": 1.0975107, + "epoch": 0.48941900731050403, + "flos": 1515460008960.0, + "grad_norm": 0.049245887260565786, + "language_loss": 0.77115011, + "learning_rate": 0.0005413877176946765, + "loss": 0.78226066, + "num_input_tokens_seen": 212117488, + "router_z_loss_mlp": 0.13574219, + "step": 2544, + "time_per_iteration": 4.86514949798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084158, + "balance_loss_mlp": 1.04632151, + "epoch": 0.4896113889957676, + "flos": 470327399424.0, + "grad_norm": 0.0633775200016376, + "language_loss": 0.84418309, + "learning_rate": 0.000541077236513819, + "loss": 0.85502464, + "num_input_tokens_seen": 212181952, + "router_z_loss_mlp": 0.37792969, + "step": 2545, + "time_per_iteration": 2.590907335281372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085016, + "balance_loss_mlp": 1.04698849, + "epoch": 0.48980377068103115, + "flos": 496310478336.0, + "grad_norm": 0.05034497234802515, + "language_loss": 0.82352334, + "learning_rate": 0.0005407667393853638, + "loss": 0.83437347, + "num_input_tokens_seen": 212252608, + "router_z_loss_mlp": 0.37988281, + "step": 2546, + "time_per_iteration": 2.6386098861694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079303, + "balance_loss_mlp": 1.04187095, + "epoch": 0.48999615236629473, + "flos": 692539628544.0, + "grad_norm": 0.05625529240804266, + "language_loss": 0.83240199, + "learning_rate": 0.0005404562264298569, + "loss": 0.84319508, + "num_input_tokens_seen": 212328560, + "router_z_loss_mlp": 0.37426758, + "step": 2547, + "time_per_iteration": 2.8305716514587402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083931, + "balance_loss_mlp": 1.04459167, + "epoch": 0.49018853405155827, + "flos": 541432401408.0, + "grad_norm": 0.05508159523705553, + "language_loss": 0.83712828, + "learning_rate": 0.0005401456977678498, + "loss": 0.84796757, + "num_input_tokens_seen": 212399616, + "router_z_loss_mlp": 0.39306641, + "step": 2548, + "time_per_iteration": 2.647726058959961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079917, + "balance_loss_mlp": 1.0415554, + "epoch": 0.49038091573682185, + "flos": 695304027648.0, + "grad_norm": 0.06449580544702971, + "language_loss": 0.77341408, + "learning_rate": 0.0005398351535199008, + "loss": 0.7842133, + "num_input_tokens_seen": 212482352, + "router_z_loss_mlp": 0.38330078, + "step": 2549, + "time_per_iteration": 3.0876851081848145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087981, + "balance_loss_mlp": 1.04976225, + "epoch": 0.49057329742208544, + "flos": 596614063104.0, + "grad_norm": 0.053976289964032184, + "language_loss": 0.83800292, + "learning_rate": 0.0005395245938065735, + "loss": 0.84888279, + "num_input_tokens_seen": 212559504, + "router_z_loss_mlp": 0.38183594, + "step": 2550, + "time_per_iteration": 2.804429769515991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082681, + "balance_loss_mlp": 1.04372382, + "epoch": 0.490765679107349, + "flos": 513154814976.0, + "grad_norm": 0.06066311696873723, + "language_loss": 0.8244735, + "learning_rate": 0.0005392140187484379, + "loss": 0.83530027, + "num_input_tokens_seen": 212625664, + "router_z_loss_mlp": 0.38916016, + "step": 2551, + "time_per_iteration": 2.597642421722412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078468, + "balance_loss_mlp": 1.04001141, + "epoch": 0.49095806079261256, + "flos": 629298782208.0, + "grad_norm": 0.0491826620467597, + "language_loss": 0.89348012, + "learning_rate": 0.0005389034284660701, + "loss": 0.90426481, + "num_input_tokens_seen": 212702000, + "router_z_loss_mlp": 0.3840332, + "step": 2552, + "time_per_iteration": 2.7942707538604736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081847, + "balance_loss_mlp": 1.04231691, + "epoch": 0.4911504424778761, + "flos": 914938122240.0, + "grad_norm": 0.07682264853807555, + "language_loss": 0.82114685, + "learning_rate": 0.000538592823080052, + "loss": 0.83196527, + "num_input_tokens_seen": 212785376, + "router_z_loss_mlp": 0.39501953, + "step": 2553, + "time_per_iteration": 3.1190438270568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079314, + "balance_loss_mlp": 1.04154849, + "epoch": 0.4913428241631397, + "flos": 438716445696.0, + "grad_norm": 0.05210768805810414, + "language_loss": 0.85049736, + "learning_rate": 0.000538282202710971, + "loss": 0.86129045, + "num_input_tokens_seen": 212848176, + "router_z_loss_mlp": 0.37768555, + "step": 2554, + "time_per_iteration": 2.5379602909088135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073496, + "balance_loss_mlp": 1.03613555, + "epoch": 0.4915352058484032, + "flos": 635806955520.0, + "grad_norm": 0.06005848629390598, + "language_loss": 0.81770831, + "learning_rate": 0.000537971567479421, + "loss": 0.82844329, + "num_input_tokens_seen": 212917888, + "router_z_loss_mlp": 0.37329102, + "step": 2555, + "time_per_iteration": 2.7403476238250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107527, + "balance_loss_mlp": 1.0371232, + "epoch": 0.4917275875336668, + "flos": 504272148480.0, + "grad_norm": 0.05941814666543565, + "language_loss": 0.87821388, + "learning_rate": 0.0005376609175060011, + "loss": 0.88896656, + "num_input_tokens_seen": 212986288, + "router_z_loss_mlp": 0.38110352, + "step": 2556, + "time_per_iteration": 2.5817511081695557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069861, + "balance_loss_mlp": 1.03192806, + "epoch": 0.49191996921893033, + "flos": 654251765760.0, + "grad_norm": 0.06032782721564886, + "language_loss": 0.80381918, + "learning_rate": 0.0005373502529113162, + "loss": 0.81451786, + "num_input_tokens_seen": 213059504, + "router_z_loss_mlp": 0.37915039, + "step": 2557, + "time_per_iteration": 2.7871665954589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077426, + "balance_loss_mlp": 1.03939795, + "epoch": 0.4921123509041939, + "flos": 492101194752.0, + "grad_norm": 0.054204772274654804, + "language_loss": 0.81538296, + "learning_rate": 0.0005370395738159773, + "loss": 0.82615721, + "num_input_tokens_seen": 213129984, + "router_z_loss_mlp": 0.38012695, + "step": 2558, + "time_per_iteration": 2.667402744293213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071328, + "balance_loss_mlp": 1.03368151, + "epoch": 0.4923047325894575, + "flos": 545907935232.0, + "grad_norm": 0.05883600684350466, + "language_loss": 0.82952267, + "learning_rate": 0.0005367288803406003, + "loss": 0.84023595, + "num_input_tokens_seen": 213199184, + "router_z_loss_mlp": 0.3762207, + "step": 2559, + "time_per_iteration": 2.626527786254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078144, + "balance_loss_mlp": 1.03937757, + "epoch": 0.49249711427472104, + "flos": 596195043840.0, + "grad_norm": 0.05079842806629368, + "language_loss": 0.8133688, + "learning_rate": 0.0005364181726058073, + "loss": 0.82415026, + "num_input_tokens_seen": 213272480, + "router_z_loss_mlp": 0.38720703, + "step": 2560, + "time_per_iteration": 2.6742072105407715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079994, + "balance_loss_mlp": 1.0413698, + "epoch": 0.4926894959599846, + "flos": 497580682752.0, + "grad_norm": 0.07402195837362009, + "language_loss": 0.8230688, + "learning_rate": 0.0005361074507322261, + "loss": 0.83386874, + "num_input_tokens_seen": 213338704, + "router_z_loss_mlp": 0.38574219, + "step": 2561, + "time_per_iteration": 2.5911788940429688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079985, + "balance_loss_mlp": 1.04226756, + "epoch": 0.49288187764524816, + "flos": 535868545536.0, + "grad_norm": 0.051530448614758514, + "language_loss": 0.81235635, + "learning_rate": 0.000535796714840489, + "loss": 0.82315624, + "num_input_tokens_seen": 213406016, + "router_z_loss_mlp": 0.37695312, + "step": 2562, + "time_per_iteration": 2.607124090194702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108504, + "balance_loss_mlp": 1.04694033, + "epoch": 0.49307425933051174, + "flos": 641267504640.0, + "grad_norm": 0.0614534794373117, + "language_loss": 0.83895457, + "learning_rate": 0.0005354859650512348, + "loss": 0.84980506, + "num_input_tokens_seen": 213474016, + "router_z_loss_mlp": 0.38037109, + "step": 2563, + "time_per_iteration": 2.757147789001465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087423, + "balance_loss_mlp": 1.04889464, + "epoch": 0.4932666410157753, + "flos": 516000761856.0, + "grad_norm": 0.06049941260890761, + "language_loss": 0.87262708, + "learning_rate": 0.0005351752014851074, + "loss": 0.88350135, + "num_input_tokens_seen": 213539696, + "router_z_loss_mlp": 0.38500977, + "step": 2564, + "time_per_iteration": 2.546381711959839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090812, + "balance_loss_mlp": 1.05190217, + "epoch": 0.49345902270103886, + "flos": 601217634816.0, + "grad_norm": 0.06075916964602771, + "language_loss": 0.83327425, + "learning_rate": 0.0005348644242627553, + "loss": 0.84418237, + "num_input_tokens_seen": 213609504, + "router_z_loss_mlp": 0.38867188, + "step": 2565, + "time_per_iteration": 2.737234592437744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080753, + "balance_loss_mlp": 1.06368184, + "epoch": 0.49365140438630245, + "flos": 1492849585152.0, + "grad_norm": 0.03629255242441858, + "language_loss": 0.75286627, + "learning_rate": 0.0005345536335048336, + "loss": 0.76367378, + "num_input_tokens_seen": 213846064, + "router_z_loss_mlp": 0.17089844, + "step": 2566, + "time_per_iteration": 4.96724271774292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093722, + "balance_loss_mlp": 1.05462122, + "epoch": 0.493843786071566, + "flos": 629303164416.0, + "grad_norm": 0.05641611710897844, + "language_loss": 0.81215966, + "learning_rate": 0.0005342428293320013, + "loss": 0.82309687, + "num_input_tokens_seen": 213923216, + "router_z_loss_mlp": 0.390625, + "step": 2567, + "time_per_iteration": 2.75099778175354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085401, + "balance_loss_mlp": 1.04722989, + "epoch": 0.49403616775682957, + "flos": 617283569664.0, + "grad_norm": 0.05682733114828458, + "language_loss": 0.83676398, + "learning_rate": 0.0005339320118649238, + "loss": 0.84761798, + "num_input_tokens_seen": 213994096, + "router_z_loss_mlp": 0.3815918, + "step": 2568, + "time_per_iteration": 2.6829991340637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087145, + "balance_loss_mlp": 1.04945099, + "epoch": 0.4942285494420931, + "flos": 577357355520.0, + "grad_norm": 0.053270861905881636, + "language_loss": 0.86332101, + "learning_rate": 0.000533621181224271, + "loss": 0.87419248, + "num_input_tokens_seen": 214069104, + "router_z_loss_mlp": 0.37646484, + "step": 2569, + "time_per_iteration": 2.777698278427124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092012, + "balance_loss_mlp": 1.0536983, + "epoch": 0.4944209311273567, + "flos": 629899683840.0, + "grad_norm": 0.059449335887268515, + "language_loss": 0.81470358, + "learning_rate": 0.0005333103375307182, + "loss": 0.82562375, + "num_input_tokens_seen": 214150368, + "router_z_loss_mlp": 0.3828125, + "step": 2570, + "time_per_iteration": 2.866680145263672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087838, + "balance_loss_mlp": 1.0502398, + "epoch": 0.4946133128126202, + "flos": 587337108480.0, + "grad_norm": 0.04632852912872097, + "language_loss": 0.86004198, + "learning_rate": 0.0005329994809049451, + "loss": 0.8709203, + "num_input_tokens_seen": 214220112, + "router_z_loss_mlp": 0.37548828, + "step": 2571, + "time_per_iteration": 2.719249963760376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089339, + "balance_loss_mlp": 1.05147839, + "epoch": 0.4948056944978838, + "flos": 583437745152.0, + "grad_norm": 0.05131083950778726, + "language_loss": 0.87596244, + "learning_rate": 0.0005326886114676375, + "loss": 0.88685584, + "num_input_tokens_seen": 214294480, + "router_z_loss_mlp": 0.37866211, + "step": 2572, + "time_per_iteration": 2.7392373085021973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083396, + "balance_loss_mlp": 1.04524934, + "epoch": 0.49499807618314734, + "flos": 481583149056.0, + "grad_norm": 0.0472919496744071, + "language_loss": 0.87958217, + "learning_rate": 0.0005323777293394854, + "loss": 0.89041615, + "num_input_tokens_seen": 214359568, + "router_z_loss_mlp": 0.38110352, + "step": 2573, + "time_per_iteration": 2.531196355819702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078942, + "balance_loss_mlp": 1.04072404, + "epoch": 0.4951904578684109, + "flos": 518714288640.0, + "grad_norm": 0.0452048253819277, + "language_loss": 0.82375443, + "learning_rate": 0.000532066834641184, + "loss": 0.83454382, + "num_input_tokens_seen": 214432032, + "router_z_loss_mlp": 0.38183594, + "step": 2574, + "time_per_iteration": 2.6414644718170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076991, + "balance_loss_mlp": 1.03939271, + "epoch": 0.4953828395536745, + "flos": 535238530560.0, + "grad_norm": 0.0513606490930485, + "language_loss": 0.84946954, + "learning_rate": 0.0005317559274934334, + "loss": 0.86023939, + "num_input_tokens_seen": 214504096, + "router_z_loss_mlp": 0.37573242, + "step": 2575, + "time_per_iteration": 2.764742374420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075902, + "balance_loss_mlp": 1.03904271, + "epoch": 0.49557522123893805, + "flos": 528305545728.0, + "grad_norm": 0.0624025017343203, + "language_loss": 0.80560994, + "learning_rate": 0.0005314450080169382, + "loss": 0.816369, + "num_input_tokens_seen": 214575920, + "router_z_loss_mlp": 0.3684082, + "step": 2576, + "time_per_iteration": 2.594782590866089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078443, + "balance_loss_mlp": 1.04017663, + "epoch": 0.49576760292420163, + "flos": 427780790784.0, + "grad_norm": 0.059991931078834576, + "language_loss": 0.80652928, + "learning_rate": 0.0005311340763324083, + "loss": 0.81731379, + "num_input_tokens_seen": 214641664, + "router_z_loss_mlp": 0.38232422, + "step": 2577, + "time_per_iteration": 2.5488879680633545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107968, + "balance_loss_mlp": 1.04232025, + "epoch": 0.49595998460946517, + "flos": 564968203776.0, + "grad_norm": 0.04956045110382575, + "language_loss": 0.81899893, + "learning_rate": 0.0005308231325605578, + "loss": 0.82979578, + "num_input_tokens_seen": 214711744, + "router_z_loss_mlp": 0.37329102, + "step": 2578, + "time_per_iteration": 2.6677722930908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077103, + "balance_loss_mlp": 1.03905153, + "epoch": 0.49615236629472875, + "flos": 702161409024.0, + "grad_norm": 0.04106026216453222, + "language_loss": 0.76928478, + "learning_rate": 0.0005305121768221061, + "loss": 0.78005582, + "num_input_tokens_seen": 214802256, + "router_z_loss_mlp": 0.38012695, + "step": 2579, + "time_per_iteration": 3.070509672164917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024013, + "balance_loss_mlp": 1.00970817, + "epoch": 0.4963447479799923, + "flos": 1440896573952.0, + "grad_norm": 0.02117966265403326, + "language_loss": 0.75038326, + "learning_rate": 0.000530201209237777, + "loss": 0.76062334, + "num_input_tokens_seen": 215023648, + "router_z_loss_mlp": 0.14257812, + "step": 2580, + "time_per_iteration": 4.802190780639648 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084007, + "balance_loss_mlp": 1.04669428, + "epoch": 0.49653712966525587, + "flos": 537370094592.0, + "grad_norm": 0.04967277918174837, + "language_loss": 0.91594803, + "learning_rate": 0.0005298902299282984, + "loss": 0.92678809, + "num_input_tokens_seen": 215094080, + "router_z_loss_mlp": 0.37304688, + "step": 2581, + "time_per_iteration": 2.5916941165924072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075168, + "balance_loss_mlp": 1.03823721, + "epoch": 0.4967295113505194, + "flos": 607002660864.0, + "grad_norm": 0.058889996692992934, + "language_loss": 0.84090436, + "learning_rate": 0.0005295792390144033, + "loss": 0.85165608, + "num_input_tokens_seen": 215165456, + "router_z_loss_mlp": 0.36889648, + "step": 2582, + "time_per_iteration": 2.731971502304077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077994, + "balance_loss_mlp": 1.04065764, + "epoch": 0.496921893035783, + "flos": 474340243968.0, + "grad_norm": 0.06551304805839393, + "language_loss": 0.83421808, + "learning_rate": 0.0005292682366168294, + "loss": 0.844998, + "num_input_tokens_seen": 215229344, + "router_z_loss_mlp": 0.37304688, + "step": 2583, + "time_per_iteration": 2.575511932373047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071469, + "balance_loss_mlp": 1.03437066, + "epoch": 0.4971142747210466, + "flos": 597180059136.0, + "grad_norm": 0.09149919184070833, + "language_loss": 0.79965729, + "learning_rate": 0.0005289572228563181, + "loss": 0.81037199, + "num_input_tokens_seen": 215305616, + "router_z_loss_mlp": 0.37084961, + "step": 2584, + "time_per_iteration": 2.7206363677978516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107391, + "balance_loss_mlp": 1.03533435, + "epoch": 0.4973066564063101, + "flos": 599321797632.0, + "grad_norm": 0.052533233614156426, + "language_loss": 0.82869196, + "learning_rate": 0.000528646197853616, + "loss": 0.83943105, + "num_input_tokens_seen": 215378128, + "router_z_loss_mlp": 0.38549805, + "step": 2585, + "time_per_iteration": 2.6923370361328125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078928, + "balance_loss_mlp": 1.04097223, + "epoch": 0.4974990380915737, + "flos": 649152009216.0, + "grad_norm": 0.05229001766272028, + "language_loss": 0.85541296, + "learning_rate": 0.0005283351617294735, + "loss": 0.86620224, + "num_input_tokens_seen": 215453536, + "router_z_loss_mlp": 0.37939453, + "step": 2586, + "time_per_iteration": 2.929431915283203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021123, + "balance_loss_mlp": 1.00719905, + "epoch": 0.49769141977683723, + "flos": 1528490912256.0, + "grad_norm": 0.01235864360091676, + "language_loss": 0.7663666, + "learning_rate": 0.0005280241146046456, + "loss": 0.77657783, + "num_input_tokens_seen": 215689440, + "router_z_loss_mlp": 0.13964844, + "step": 2587, + "time_per_iteration": 5.021655082702637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077541, + "balance_loss_mlp": 1.03977549, + "epoch": 0.4978838014621008, + "flos": 536114446848.0, + "grad_norm": 0.05582319417935397, + "language_loss": 0.866669, + "learning_rate": 0.0005277130565998916, + "loss": 0.87744439, + "num_input_tokens_seen": 215759600, + "router_z_loss_mlp": 0.37719727, + "step": 2588, + "time_per_iteration": 2.729919195175171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079282, + "balance_loss_mlp": 1.04163599, + "epoch": 0.49807618314736435, + "flos": 539335742976.0, + "grad_norm": 0.05154521563335112, + "language_loss": 0.81850547, + "learning_rate": 0.0005274019878359748, + "loss": 0.82929826, + "num_input_tokens_seen": 215833920, + "router_z_loss_mlp": 0.3762207, + "step": 2589, + "time_per_iteration": 2.692312240600586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080128, + "balance_loss_mlp": 1.04243433, + "epoch": 0.49826856483262794, + "flos": 542215185408.0, + "grad_norm": 0.0590106194524904, + "language_loss": 0.87004912, + "learning_rate": 0.0005270909084336628, + "loss": 0.88085043, + "num_input_tokens_seen": 215903616, + "router_z_loss_mlp": 0.37695312, + "step": 2590, + "time_per_iteration": 2.684134006500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085832, + "balance_loss_mlp": 1.04637384, + "epoch": 0.4984609465178915, + "flos": 522062212608.0, + "grad_norm": 0.056922673879229405, + "language_loss": 0.89000517, + "learning_rate": 0.0005267798185137276, + "loss": 0.90086353, + "num_input_tokens_seen": 215974832, + "router_z_loss_mlp": 0.39428711, + "step": 2591, + "time_per_iteration": 2.6129040718078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087547, + "balance_loss_mlp": 1.04942417, + "epoch": 0.49865332820315506, + "flos": 574255332864.0, + "grad_norm": 0.05087809825508884, + "language_loss": 0.89274907, + "learning_rate": 0.0005264687181969444, + "loss": 0.90362453, + "num_input_tokens_seen": 216045024, + "router_z_loss_mlp": 0.38085938, + "step": 2592, + "time_per_iteration": 2.7253634929656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087493, + "balance_loss_mlp": 1.04891706, + "epoch": 0.49884570988841864, + "flos": 1013207657472.0, + "grad_norm": 0.06815052907107509, + "language_loss": 0.75056839, + "learning_rate": 0.0005261576076040937, + "loss": 0.76144326, + "num_input_tokens_seen": 216129024, + "router_z_loss_mlp": 0.38525391, + "step": 2593, + "time_per_iteration": 3.2982125282287598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086885, + "balance_loss_mlp": 1.04790401, + "epoch": 0.4990380915736822, + "flos": 559315597824.0, + "grad_norm": 0.05997761702509101, + "language_loss": 0.84464318, + "learning_rate": 0.0005258464868559591, + "loss": 0.85551196, + "num_input_tokens_seen": 216197648, + "router_z_loss_mlp": 0.38964844, + "step": 2594, + "time_per_iteration": 2.650743007659912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087007, + "balance_loss_mlp": 1.04819274, + "epoch": 0.49923047325894576, + "flos": 498708292608.0, + "grad_norm": 0.060987476024219604, + "language_loss": 0.88568228, + "learning_rate": 0.0005255353560733284, + "loss": 0.89655238, + "num_input_tokens_seen": 216263904, + "router_z_loss_mlp": 0.38793945, + "step": 2595, + "time_per_iteration": 2.5599913597106934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041145, + "balance_loss_mlp": 1.02760279, + "epoch": 0.4994228549442093, + "flos": 1495851273216.0, + "grad_norm": 0.01946244961408958, + "language_loss": 0.75578642, + "learning_rate": 0.0005252242153769931, + "loss": 0.76619792, + "num_input_tokens_seen": 216493152, + "router_z_loss_mlp": 0.13574219, + "step": 2596, + "time_per_iteration": 4.769503593444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108652, + "balance_loss_mlp": 1.0481348, + "epoch": 0.4996152366294729, + "flos": 557090901504.0, + "grad_norm": 0.052826274831603945, + "language_loss": 0.83429873, + "learning_rate": 0.0005249130648877492, + "loss": 0.84516394, + "num_input_tokens_seen": 216567216, + "router_z_loss_mlp": 0.38354492, + "step": 2597, + "time_per_iteration": 2.724168300628662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085761, + "balance_loss_mlp": 1.04785287, + "epoch": 0.4998076183147364, + "flos": 415372700160.0, + "grad_norm": 0.05706521232724688, + "language_loss": 0.84317046, + "learning_rate": 0.0005246019047263953, + "loss": 0.85402811, + "num_input_tokens_seen": 216630624, + "router_z_loss_mlp": 0.37841797, + "step": 2598, + "time_per_iteration": 2.4463517665863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081855, + "balance_loss_mlp": 1.04475701, + "epoch": 0.5, + "flos": 467107513344.0, + "grad_norm": 0.6792645039501298, + "language_loss": 0.82562613, + "learning_rate": 0.0005242907350137353, + "loss": 0.83644474, + "num_input_tokens_seen": 216696576, + "router_z_loss_mlp": 0.37060547, + "step": 2599, + "time_per_iteration": 2.560786008834839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099746, + "balance_loss_mlp": 1.06193328, + "epoch": 0.5001923816852636, + "flos": 482460475392.0, + "grad_norm": 0.06436348420044716, + "language_loss": 0.78717571, + "learning_rate": 0.0005239795558705754, + "loss": 0.79817319, + "num_input_tokens_seen": 216767584, + "router_z_loss_mlp": 0.37817383, + "step": 2600, + "time_per_iteration": 2.691749095916748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103949, + "balance_loss_mlp": 1.06613564, + "epoch": 0.5003847633705272, + "flos": 533534750208.0, + "grad_norm": 0.05701005713359991, + "language_loss": 0.89229304, + "learning_rate": 0.0005236683674177264, + "loss": 0.90333253, + "num_input_tokens_seen": 216834320, + "router_z_loss_mlp": 0.37744141, + "step": 2601, + "time_per_iteration": 2.6216700077056885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118846, + "balance_loss_mlp": 1.08053231, + "epoch": 0.5005771450557907, + "flos": 737473876992.0, + "grad_norm": 0.059257141019647214, + "language_loss": 0.82444715, + "learning_rate": 0.0005233571697760021, + "loss": 0.83563566, + "num_input_tokens_seen": 216907312, + "router_z_loss_mlp": 0.3828125, + "step": 2602, + "time_per_iteration": 2.856107473373413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127913, + "balance_loss_mlp": 1.08902669, + "epoch": 0.5007695267410542, + "flos": 778646974464.0, + "grad_norm": 0.08832305121279985, + "language_loss": 0.83020616, + "learning_rate": 0.0005230459630662203, + "loss": 0.84148532, + "num_input_tokens_seen": 216979872, + "router_z_loss_mlp": 0.38842773, + "step": 2603, + "time_per_iteration": 2.954914093017578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133998, + "balance_loss_mlp": 1.09563613, + "epoch": 0.5009619084263178, + "flos": 623192251392.0, + "grad_norm": 0.09845505678723535, + "language_loss": 0.81501806, + "learning_rate": 0.0005227347474092022, + "loss": 0.82635808, + "num_input_tokens_seen": 217054000, + "router_z_loss_mlp": 0.38354492, + "step": 2604, + "time_per_iteration": 2.7330713272094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132886, + "balance_loss_mlp": 1.09223533, + "epoch": 0.5011542901115814, + "flos": 530812459008.0, + "grad_norm": 0.044602380755084235, + "language_loss": 0.83597159, + "learning_rate": 0.0005224235229257724, + "loss": 0.84730041, + "num_input_tokens_seen": 217126784, + "router_z_loss_mlp": 0.40649414, + "step": 2605, + "time_per_iteration": 2.682590961456299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134983, + "balance_loss_mlp": 1.09485674, + "epoch": 0.5013466717968449, + "flos": 527262303744.0, + "grad_norm": 0.06172408458695075, + "language_loss": 0.86453664, + "learning_rate": 0.0005221122897367589, + "loss": 0.87588644, + "num_input_tokens_seen": 217203056, + "router_z_loss_mlp": 0.40136719, + "step": 2606, + "time_per_iteration": 2.7657558917999268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130017, + "balance_loss_mlp": 1.08970046, + "epoch": 0.5015390534821085, + "flos": 565750987776.0, + "grad_norm": 0.060573415362282904, + "language_loss": 0.80914944, + "learning_rate": 0.0005218010479629932, + "loss": 0.82044959, + "num_input_tokens_seen": 217273280, + "router_z_loss_mlp": 0.40332031, + "step": 2607, + "time_per_iteration": 2.650521755218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137201, + "balance_loss_mlp": 1.09564483, + "epoch": 0.5017314351673721, + "flos": 566430465024.0, + "grad_norm": 0.062462394429491495, + "language_loss": 0.82171839, + "learning_rate": 0.0005214897977253102, + "loss": 0.83309042, + "num_input_tokens_seen": 217345568, + "router_z_loss_mlp": 0.41552734, + "step": 2608, + "time_per_iteration": 2.679605484008789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135192, + "balance_loss_mlp": 1.09222913, + "epoch": 0.5019238168526357, + "flos": 522018542592.0, + "grad_norm": 0.04524020883908707, + "language_loss": 0.84520149, + "learning_rate": 0.0005211785391445473, + "loss": 0.85655344, + "num_input_tokens_seen": 217422848, + "router_z_loss_mlp": 0.4296875, + "step": 2609, + "time_per_iteration": 2.727029323577881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133676, + "balance_loss_mlp": 1.09128523, + "epoch": 0.5021161985378992, + "flos": 641135084544.0, + "grad_norm": 0.0754859849582408, + "language_loss": 0.79190326, + "learning_rate": 0.0005208672723415467, + "loss": 0.80324006, + "num_input_tokens_seen": 217502896, + "router_z_loss_mlp": 0.42358398, + "step": 2610, + "time_per_iteration": 2.7925145626068115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132046, + "balance_loss_mlp": 1.09058475, + "epoch": 0.5023085802231627, + "flos": 591000744960.0, + "grad_norm": 0.05557553185326306, + "language_loss": 0.78870118, + "learning_rate": 0.0005205559974371525, + "loss": 0.80002165, + "num_input_tokens_seen": 217575072, + "router_z_loss_mlp": 0.41455078, + "step": 2611, + "time_per_iteration": 2.7993710041046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129577, + "balance_loss_mlp": 1.08747184, + "epoch": 0.5025009619084263, + "flos": 472134486528.0, + "grad_norm": 0.05627981978612443, + "language_loss": 0.81993866, + "learning_rate": 0.0005202447145522123, + "loss": 0.83123446, + "num_input_tokens_seen": 217644976, + "router_z_loss_mlp": 0.42089844, + "step": 2612, + "time_per_iteration": 2.6950342655181885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120122, + "balance_loss_mlp": 1.0788281, + "epoch": 0.5026933435936899, + "flos": 454906036224.0, + "grad_norm": 0.05146182880646494, + "language_loss": 0.79119051, + "learning_rate": 0.0005199334238075769, + "loss": 0.80239171, + "num_input_tokens_seen": 217712816, + "router_z_loss_mlp": 0.4128418, + "step": 2613, + "time_per_iteration": 2.533280372619629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121533, + "balance_loss_mlp": 1.08064461, + "epoch": 0.5028857252789535, + "flos": 491504675328.0, + "grad_norm": 0.049706042989329166, + "language_loss": 0.91481262, + "learning_rate": 0.0005196221253241, + "loss": 0.92602801, + "num_input_tokens_seen": 217780256, + "router_z_loss_mlp": 0.40869141, + "step": 2614, + "time_per_iteration": 2.562459707260132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125484, + "balance_loss_mlp": 1.08271146, + "epoch": 0.503078106964217, + "flos": 625280145408.0, + "grad_norm": 0.05688830610190983, + "language_loss": 0.82597703, + "learning_rate": 0.0005193108192226383, + "loss": 0.83723187, + "num_input_tokens_seen": 217848496, + "router_z_loss_mlp": 0.42797852, + "step": 2615, + "time_per_iteration": 2.7700836658477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124223, + "balance_loss_mlp": 1.08054483, + "epoch": 0.5032704886494805, + "flos": 578774536704.0, + "grad_norm": 0.07123141067873749, + "language_loss": 0.87046134, + "learning_rate": 0.000518999505624052, + "loss": 0.88170362, + "num_input_tokens_seen": 217919216, + "router_z_loss_mlp": 0.43701172, + "step": 2616, + "time_per_iteration": 2.6920361518859863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110793, + "balance_loss_mlp": 1.06897473, + "epoch": 0.5034628703347441, + "flos": 471481150464.0, + "grad_norm": 0.07512500822512953, + "language_loss": 0.83250809, + "learning_rate": 0.000518688184649203, + "loss": 0.84361595, + "num_input_tokens_seen": 217996096, + "router_z_loss_mlp": 0.41845703, + "step": 2617, + "time_per_iteration": 2.8107755184173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109828, + "balance_loss_mlp": 1.06786621, + "epoch": 0.5036552520200077, + "flos": 489594281472.0, + "grad_norm": 0.05241889370213675, + "language_loss": 0.83636624, + "learning_rate": 0.0005183768564189577, + "loss": 0.84746444, + "num_input_tokens_seen": 218063072, + "router_z_loss_mlp": 0.41967773, + "step": 2618, + "time_per_iteration": 2.5401604175567627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117501, + "balance_loss_mlp": 1.07649279, + "epoch": 0.5038476337052713, + "flos": 493991239680.0, + "grad_norm": 0.05660213632560354, + "language_loss": 0.8184489, + "learning_rate": 0.0005180655210541838, + "loss": 0.82962382, + "num_input_tokens_seen": 218131056, + "router_z_loss_mlp": 0.40991211, + "step": 2619, + "time_per_iteration": 2.603214979171753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111785, + "balance_loss_mlp": 1.06829762, + "epoch": 0.5040400153905348, + "flos": 600321369600.0, + "grad_norm": 0.06441755274122189, + "language_loss": 0.83548617, + "learning_rate": 0.0005177541786757527, + "loss": 0.84660405, + "num_input_tokens_seen": 218203536, + "router_z_loss_mlp": 0.43481445, + "step": 2620, + "time_per_iteration": 2.760035276412964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122898, + "balance_loss_mlp": 1.07759881, + "epoch": 0.5042323970757984, + "flos": 811178924544.0, + "grad_norm": 0.05307882661131351, + "language_loss": 0.82779682, + "learning_rate": 0.000517442829404538, + "loss": 0.8390258, + "num_input_tokens_seen": 218283008, + "router_z_loss_mlp": 0.453125, + "step": 2621, + "time_per_iteration": 2.9839560985565186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110947, + "balance_loss_mlp": 1.06581521, + "epoch": 0.504424778761062, + "flos": 626985335808.0, + "grad_norm": 0.08823829105457728, + "language_loss": 0.87315869, + "learning_rate": 0.0005171314733614166, + "loss": 0.88425338, + "num_input_tokens_seen": 218362096, + "router_z_loss_mlp": 0.43676758, + "step": 2622, + "time_per_iteration": 2.901881456375122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101867, + "balance_loss_mlp": 1.05961967, + "epoch": 0.5046171604463255, + "flos": 515651553792.0, + "grad_norm": 0.052612789537889, + "language_loss": 0.78039354, + "learning_rate": 0.0005168201106672671, + "loss": 0.79141223, + "num_input_tokens_seen": 218439440, + "router_z_loss_mlp": 0.42236328, + "step": 2623, + "time_per_iteration": 2.7674055099487305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111898, + "balance_loss_mlp": 1.07046056, + "epoch": 0.504809542131589, + "flos": 527576606208.0, + "grad_norm": 0.08464756430959838, + "language_loss": 0.8495788, + "learning_rate": 0.0005165087414429717, + "loss": 0.86069775, + "num_input_tokens_seen": 218505936, + "router_z_loss_mlp": 0.41430664, + "step": 2624, + "time_per_iteration": 2.602158546447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117075, + "balance_loss_mlp": 1.07261038, + "epoch": 0.5050019238168526, + "flos": 553855048704.0, + "grad_norm": 0.23140620797494316, + "language_loss": 0.83667731, + "learning_rate": 0.0005161973658094144, + "loss": 0.84784812, + "num_input_tokens_seen": 218573824, + "router_z_loss_mlp": 0.44458008, + "step": 2625, + "time_per_iteration": 2.6992454528808594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108998, + "balance_loss_mlp": 1.06834817, + "epoch": 0.5051943055021162, + "flos": 574486677504.0, + "grad_norm": 0.05317382862924398, + "language_loss": 0.82239455, + "learning_rate": 0.000515885983887482, + "loss": 0.83348453, + "num_input_tokens_seen": 218648016, + "router_z_loss_mlp": 0.40649414, + "step": 2626, + "time_per_iteration": 2.7204251289367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110206, + "balance_loss_mlp": 1.06781507, + "epoch": 0.5053866871873798, + "flos": 496438516224.0, + "grad_norm": 0.08071327634258786, + "language_loss": 0.84119672, + "learning_rate": 0.0005155745957980636, + "loss": 0.85229874, + "num_input_tokens_seen": 218714128, + "router_z_loss_mlp": 0.42382812, + "step": 2627, + "time_per_iteration": 2.5813376903533936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118048, + "balance_loss_mlp": 1.0760628, + "epoch": 0.5055790688726434, + "flos": 501963084288.0, + "grad_norm": 0.04526623404133713, + "language_loss": 0.88577604, + "learning_rate": 0.000515263201662051, + "loss": 0.89695656, + "num_input_tokens_seen": 218784800, + "router_z_loss_mlp": 0.41992188, + "step": 2628, + "time_per_iteration": 2.6876380443573 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111719, + "balance_loss_mlp": 1.07625389, + "epoch": 0.5057714505579068, + "flos": 844844276736.0, + "grad_norm": 0.05588400488715087, + "language_loss": 0.82233381, + "learning_rate": 0.0005149518016003378, + "loss": 0.83350569, + "num_input_tokens_seen": 218868256, + "router_z_loss_mlp": 0.40942383, + "step": 2629, + "time_per_iteration": 3.1858632564544678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124651, + "balance_loss_mlp": 1.0810678, + "epoch": 0.5059638322431704, + "flos": 497580682752.0, + "grad_norm": 0.0555737706891176, + "language_loss": 0.82261145, + "learning_rate": 0.0005146403957338206, + "loss": 0.83385789, + "num_input_tokens_seen": 218932496, + "router_z_loss_mlp": 0.43603516, + "step": 2630, + "time_per_iteration": 2.548497438430786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118356, + "balance_loss_mlp": 1.07703853, + "epoch": 0.506156213928434, + "flos": 617526498816.0, + "grad_norm": 0.05055767229530262, + "language_loss": 0.82073247, + "learning_rate": 0.0005143289841833975, + "loss": 0.83191609, + "num_input_tokens_seen": 219010672, + "router_z_loss_mlp": 0.41308594, + "step": 2631, + "time_per_iteration": 2.847142457962036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116463, + "balance_loss_mlp": 1.07500172, + "epoch": 0.5063485956136976, + "flos": 424624923648.0, + "grad_norm": 0.06986911289391046, + "language_loss": 0.81789684, + "learning_rate": 0.0005140175670699696, + "loss": 0.82906151, + "num_input_tokens_seen": 219077104, + "router_z_loss_mlp": 0.41455078, + "step": 2632, + "time_per_iteration": 2.6268298625946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116361, + "balance_loss_mlp": 1.0729686, + "epoch": 0.5065409772989612, + "flos": 569641586688.0, + "grad_norm": 0.04802770333155415, + "language_loss": 0.8255887, + "learning_rate": 0.0005137061445144395, + "loss": 0.8367523, + "num_input_tokens_seen": 219164880, + "router_z_loss_mlp": 0.43383789, + "step": 2633, + "time_per_iteration": 2.93361759185791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106091, + "balance_loss_mlp": 1.06458259, + "epoch": 0.5067333589842247, + "flos": 628510205952.0, + "grad_norm": 0.0826873370301202, + "language_loss": 0.86646289, + "learning_rate": 0.000513394716637712, + "loss": 0.87752378, + "num_input_tokens_seen": 219237376, + "router_z_loss_mlp": 0.4152832, + "step": 2634, + "time_per_iteration": 2.8372714519500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083943, + "balance_loss_mlp": 1.06868434, + "epoch": 0.5069257406694883, + "flos": 1447062741504.0, + "grad_norm": 0.03147096823206272, + "language_loss": 0.79191709, + "learning_rate": 0.0005130832835606946, + "loss": 0.80275649, + "num_input_tokens_seen": 219467632, + "router_z_loss_mlp": 0.15234375, + "step": 2635, + "time_per_iteration": 4.893187046051025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109252, + "balance_loss_mlp": 1.06812489, + "epoch": 0.5071181223547518, + "flos": 638530656768.0, + "grad_norm": 0.046825638192595165, + "language_loss": 0.80415404, + "learning_rate": 0.0005127718454042958, + "loss": 0.81524646, + "num_input_tokens_seen": 219545392, + "router_z_loss_mlp": 0.41113281, + "step": 2636, + "time_per_iteration": 2.8583669662475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104227, + "balance_loss_mlp": 1.06250417, + "epoch": 0.5073105040400154, + "flos": 713239658496.0, + "grad_norm": 0.061804914120772665, + "language_loss": 0.84210312, + "learning_rate": 0.0005124604022894269, + "loss": 0.85314542, + "num_input_tokens_seen": 219623104, + "router_z_loss_mlp": 0.41723633, + "step": 2637, + "time_per_iteration": 2.924973726272583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047259, + "balance_loss_mlp": 1.03228605, + "epoch": 0.5075028857252789, + "flos": 1435658605056.0, + "grad_norm": 0.01918715016894911, + "language_loss": 0.77188224, + "learning_rate": 0.000512148954337001, + "loss": 0.78235483, + "num_input_tokens_seen": 219853328, + "router_z_loss_mlp": 0.14941406, + "step": 2638, + "time_per_iteration": 4.856257915496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103908, + "balance_loss_mlp": 1.06115913, + "epoch": 0.5076952674105425, + "flos": 570857946624.0, + "grad_norm": 0.0603044028086303, + "language_loss": 0.83185166, + "learning_rate": 0.0005118375016679325, + "loss": 0.84289074, + "num_input_tokens_seen": 219925024, + "router_z_loss_mlp": 0.42749023, + "step": 2639, + "time_per_iteration": 2.788266897201538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108523, + "balance_loss_mlp": 1.06651402, + "epoch": 0.5078876490958061, + "flos": 516463451136.0, + "grad_norm": 0.06423032366665075, + "language_loss": 0.8059274, + "learning_rate": 0.0005115260444031382, + "loss": 0.81701261, + "num_input_tokens_seen": 219992752, + "router_z_loss_mlp": 0.42016602, + "step": 2640, + "time_per_iteration": 2.5973188877105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036794, + "balance_loss_mlp": 1.02191687, + "epoch": 0.5080800307810697, + "flos": 1583378620416.0, + "grad_norm": 0.017407415587129545, + "language_loss": 0.78731823, + "learning_rate": 0.000511214582663537, + "loss": 0.7976861, + "num_input_tokens_seen": 220224160, + "router_z_loss_mlp": 0.1484375, + "step": 2641, + "time_per_iteration": 4.9824395179748535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107231, + "balance_loss_mlp": 1.06340933, + "epoch": 0.5082724124663333, + "flos": 484965978624.0, + "grad_norm": 0.05963770496992207, + "language_loss": 0.8711704, + "learning_rate": 0.0005109031165700483, + "loss": 0.88224268, + "num_input_tokens_seen": 220289504, + "router_z_loss_mlp": 0.43823242, + "step": 2642, + "time_per_iteration": 2.5530447959899902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103344, + "balance_loss_mlp": 1.05997539, + "epoch": 0.5084647941515967, + "flos": 681928450560.0, + "grad_norm": 0.05207490997788611, + "language_loss": 0.8334229, + "learning_rate": 0.0005105916462435945, + "loss": 0.84445643, + "num_input_tokens_seen": 220361376, + "router_z_loss_mlp": 0.43359375, + "step": 2643, + "time_per_iteration": 2.8092200756073 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101863, + "balance_loss_mlp": 1.05863762, + "epoch": 0.5086571758368603, + "flos": 548468692992.0, + "grad_norm": 0.0494294374601552, + "language_loss": 0.85464209, + "learning_rate": 0.0005102801718050989, + "loss": 0.86566073, + "num_input_tokens_seen": 220434720, + "router_z_loss_mlp": 0.43261719, + "step": 2644, + "time_per_iteration": 2.6660444736480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111917, + "balance_loss_mlp": 1.06735659, + "epoch": 0.5088495575221239, + "flos": 563751843840.0, + "grad_norm": 0.0695979688507087, + "language_loss": 0.88942361, + "learning_rate": 0.0005099686933754867, + "loss": 0.9005428, + "num_input_tokens_seen": 220506208, + "router_z_loss_mlp": 0.44580078, + "step": 2645, + "time_per_iteration": 2.673337697982788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108704, + "balance_loss_mlp": 1.06283236, + "epoch": 0.5090419392073875, + "flos": 551132757504.0, + "grad_norm": 0.05355859457172443, + "language_loss": 0.84209561, + "learning_rate": 0.0005096572110756845, + "loss": 0.85318267, + "num_input_tokens_seen": 220577456, + "router_z_loss_mlp": 0.45874023, + "step": 2646, + "time_per_iteration": 2.6638782024383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112209, + "balance_loss_mlp": 1.06686139, + "epoch": 0.509234320892651, + "flos": 567504230400.0, + "grad_norm": 0.04874041351849401, + "language_loss": 0.85460532, + "learning_rate": 0.0005093457250266205, + "loss": 0.86572737, + "num_input_tokens_seen": 220649648, + "router_z_loss_mlp": 0.45361328, + "step": 2647, + "time_per_iteration": 2.6637892723083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107252, + "balance_loss_mlp": 1.0633595, + "epoch": 0.5094267025779146, + "flos": 582339248640.0, + "grad_norm": 0.05998717956466229, + "language_loss": 0.8317883, + "learning_rate": 0.000509034235349224, + "loss": 0.84286082, + "num_input_tokens_seen": 220721168, + "router_z_loss_mlp": 0.43920898, + "step": 2648, + "time_per_iteration": 2.6878888607025146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100829, + "balance_loss_mlp": 1.05846214, + "epoch": 0.5096190842631781, + "flos": 591704953344.0, + "grad_norm": 0.05244355272630434, + "language_loss": 0.812711, + "learning_rate": 0.0005087227421644266, + "loss": 0.82371926, + "num_input_tokens_seen": 220796464, + "router_z_loss_mlp": 0.42407227, + "step": 2649, + "time_per_iteration": 2.7117576599121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106927, + "balance_loss_mlp": 1.06346333, + "epoch": 0.5098114659484417, + "flos": 513307584000.0, + "grad_norm": 0.052249476616985355, + "language_loss": 0.8603372, + "learning_rate": 0.0005084112455931602, + "loss": 0.87140644, + "num_input_tokens_seen": 220862976, + "router_z_loss_mlp": 0.43457031, + "step": 2650, + "time_per_iteration": 2.6070332527160645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106986, + "balance_loss_mlp": 1.06578696, + "epoch": 0.5100038476337053, + "flos": 484389808128.0, + "grad_norm": 0.053750245063259934, + "language_loss": 0.85138631, + "learning_rate": 0.0005080997457563586, + "loss": 0.8624562, + "num_input_tokens_seen": 220926432, + "router_z_loss_mlp": 0.41210938, + "step": 2651, + "time_per_iteration": 2.53045654296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105107, + "balance_loss_mlp": 1.06374109, + "epoch": 0.5101962293189688, + "flos": 461366157312.0, + "grad_norm": 0.06332454651149101, + "language_loss": 0.79166603, + "learning_rate": 0.0005077882427749569, + "loss": 0.80271709, + "num_input_tokens_seen": 220993008, + "router_z_loss_mlp": 0.41381836, + "step": 2652, + "time_per_iteration": 2.4946300983428955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113144, + "balance_loss_mlp": 1.07084906, + "epoch": 0.5103886110042324, + "flos": 586760937984.0, + "grad_norm": 0.06191877346451425, + "language_loss": 0.8487432, + "learning_rate": 0.0005074767367698913, + "loss": 0.85987473, + "num_input_tokens_seen": 221059248, + "router_z_loss_mlp": 0.42285156, + "step": 2653, + "time_per_iteration": 2.6763722896575928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105422, + "balance_loss_mlp": 1.06455684, + "epoch": 0.510580992689496, + "flos": 844906885632.0, + "grad_norm": 0.056937070163659766, + "language_loss": 0.83570945, + "learning_rate": 0.0005071652278620988, + "loss": 0.84676373, + "num_input_tokens_seen": 221133712, + "router_z_loss_mlp": 0.40869141, + "step": 2654, + "time_per_iteration": 3.0378835201263428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109149, + "balance_loss_mlp": 1.06706858, + "epoch": 0.5107733743747596, + "flos": 658328629248.0, + "grad_norm": 0.057649397656864075, + "language_loss": 0.83013982, + "learning_rate": 0.0005068537161725186, + "loss": 0.84123135, + "num_input_tokens_seen": 221202192, + "router_z_loss_mlp": 0.42041016, + "step": 2655, + "time_per_iteration": 2.7623610496520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105478, + "balance_loss_mlp": 1.06385016, + "epoch": 0.510965756060023, + "flos": 701426677248.0, + "grad_norm": 0.05708536741035134, + "language_loss": 0.8435111, + "learning_rate": 0.0005065422018220893, + "loss": 0.85456586, + "num_input_tokens_seen": 221277104, + "router_z_loss_mlp": 0.41601562, + "step": 2656, + "time_per_iteration": 2.823542833328247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102091, + "balance_loss_mlp": 1.06096351, + "epoch": 0.5111581377452866, + "flos": 559430489088.0, + "grad_norm": 0.05217113074905386, + "language_loss": 0.80225503, + "learning_rate": 0.0005062306849317521, + "loss": 0.81327593, + "num_input_tokens_seen": 221352320, + "router_z_loss_mlp": 0.41113281, + "step": 2657, + "time_per_iteration": 2.8275818824768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084461, + "balance_loss_mlp": 1.04314327, + "epoch": 0.5113505194305502, + "flos": 608745729024.0, + "grad_norm": 0.05701327198704139, + "language_loss": 0.83469534, + "learning_rate": 0.0005059191656224487, + "loss": 0.84553993, + "num_input_tokens_seen": 221421056, + "router_z_loss_mlp": 0.41308594, + "step": 2658, + "time_per_iteration": 2.7243552207946777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094832, + "balance_loss_mlp": 1.05158317, + "epoch": 0.5115429011158138, + "flos": 534214227456.0, + "grad_norm": 0.0458707137929394, + "language_loss": 0.89186656, + "learning_rate": 0.0005056076440151212, + "loss": 0.90281487, + "num_input_tokens_seen": 221492064, + "router_z_loss_mlp": 0.43237305, + "step": 2659, + "time_per_iteration": 2.663668632507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047323, + "balance_loss_mlp": 1.0349257, + "epoch": 0.5117352828010774, + "flos": 1361451580416.0, + "grad_norm": 0.020991592608455897, + "language_loss": 0.76288116, + "learning_rate": 0.0005052961202307133, + "loss": 0.77335441, + "num_input_tokens_seen": 221724672, + "router_z_loss_mlp": 0.12402344, + "step": 2660, + "time_per_iteration": 4.851064205169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095712, + "balance_loss_mlp": 1.05420339, + "epoch": 0.5119276644863409, + "flos": 633444046848.0, + "grad_norm": 0.05508509945890564, + "language_loss": 0.87153888, + "learning_rate": 0.0005049845943901691, + "loss": 0.882496, + "num_input_tokens_seen": 221800144, + "router_z_loss_mlp": 0.41479492, + "step": 2661, + "time_per_iteration": 2.827824831008911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085148, + "balance_loss_mlp": 1.04459286, + "epoch": 0.5121200461716044, + "flos": 585304468992.0, + "grad_norm": 0.05132624096148621, + "language_loss": 0.86219436, + "learning_rate": 0.0005046730666144338, + "loss": 0.8730458, + "num_input_tokens_seen": 221877168, + "router_z_loss_mlp": 0.40527344, + "step": 2662, + "time_per_iteration": 2.75281023979187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096524, + "balance_loss_mlp": 1.05542088, + "epoch": 0.512312427856868, + "flos": 1032081661440.0, + "grad_norm": 0.048177160037868025, + "language_loss": 0.87700105, + "learning_rate": 0.0005043615370244532, + "loss": 0.88796628, + "num_input_tokens_seen": 221964208, + "router_z_loss_mlp": 0.41113281, + "step": 2663, + "time_per_iteration": 3.3618671894073486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01028213, + "balance_loss_mlp": 1.01524341, + "epoch": 0.5125048095421316, + "flos": 1537257277440.0, + "grad_norm": 0.012858425268609664, + "language_loss": 0.78244388, + "learning_rate": 0.0005040500057411736, + "loss": 0.79272604, + "num_input_tokens_seen": 222179264, + "router_z_loss_mlp": 0.12988281, + "step": 2664, + "time_per_iteration": 4.658047914505005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093765, + "balance_loss_mlp": 1.05292368, + "epoch": 0.5126971912273951, + "flos": 590814480384.0, + "grad_norm": 0.04944817886166227, + "language_loss": 0.85279715, + "learning_rate": 0.0005037384728855425, + "loss": 0.86373478, + "num_input_tokens_seen": 222259504, + "router_z_loss_mlp": 0.40820312, + "step": 2665, + "time_per_iteration": 2.8461544513702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098547, + "balance_loss_mlp": 1.05620384, + "epoch": 0.5128895729126587, + "flos": 551393215488.0, + "grad_norm": 0.158979293172939, + "language_loss": 0.84343994, + "learning_rate": 0.0005034269385785075, + "loss": 0.85442543, + "num_input_tokens_seen": 222330512, + "router_z_loss_mlp": 0.42333984, + "step": 2666, + "time_per_iteration": 2.651714563369751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092703, + "balance_loss_mlp": 1.05222011, + "epoch": 0.5130819545979223, + "flos": 481031709696.0, + "grad_norm": 0.06506731950678159, + "language_loss": 0.84809029, + "learning_rate": 0.0005031154029410168, + "loss": 0.85901731, + "num_input_tokens_seen": 222394000, + "router_z_loss_mlp": 0.40478516, + "step": 2667, + "time_per_iteration": 2.5316364765167236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095809, + "balance_loss_mlp": 1.05368042, + "epoch": 0.5132743362831859, + "flos": 475556603904.0, + "grad_norm": 0.06903413954772, + "language_loss": 0.86695576, + "learning_rate": 0.0005028038660940197, + "loss": 0.87791383, + "num_input_tokens_seen": 222459344, + "router_z_loss_mlp": 0.42138672, + "step": 2668, + "time_per_iteration": 2.521328926086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090159, + "balance_loss_mlp": 1.04962766, + "epoch": 0.5134667179684494, + "flos": 503559175680.0, + "grad_norm": 0.047102953885103854, + "language_loss": 0.84545898, + "learning_rate": 0.0005024923281584648, + "loss": 0.85636055, + "num_input_tokens_seen": 222528912, + "router_z_loss_mlp": 0.4050293, + "step": 2669, + "time_per_iteration": 2.6462371349334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092047, + "balance_loss_mlp": 1.05330372, + "epoch": 0.5136590996537129, + "flos": 503647925760.0, + "grad_norm": 0.04719667862832961, + "language_loss": 0.82488692, + "learning_rate": 0.0005021807892553026, + "loss": 0.83580744, + "num_input_tokens_seen": 222604704, + "router_z_loss_mlp": 0.38696289, + "step": 2670, + "time_per_iteration": 2.732416868209839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094857, + "balance_loss_mlp": 1.05370605, + "epoch": 0.5138514813389765, + "flos": 624330035712.0, + "grad_norm": 0.05149766622145395, + "language_loss": 0.84497285, + "learning_rate": 0.0005018692495054828, + "loss": 0.85592139, + "num_input_tokens_seen": 222677888, + "router_z_loss_mlp": 0.41137695, + "step": 2671, + "time_per_iteration": 2.760014533996582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092038, + "balance_loss_mlp": 1.05174494, + "epoch": 0.5140438630242401, + "flos": 583274801664.0, + "grad_norm": 0.05511271146100304, + "language_loss": 0.80692601, + "learning_rate": 0.0005015577090299561, + "loss": 0.81784636, + "num_input_tokens_seen": 222751936, + "router_z_loss_mlp": 0.40283203, + "step": 2672, + "time_per_iteration": 2.6871819496154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102168, + "balance_loss_mlp": 1.06046844, + "epoch": 0.5142362447095037, + "flos": 487683887616.0, + "grad_norm": 0.05906966789334332, + "language_loss": 0.86718851, + "learning_rate": 0.0005012461679496729, + "loss": 0.87821019, + "num_input_tokens_seen": 222819616, + "router_z_loss_mlp": 0.41674805, + "step": 2673, + "time_per_iteration": 2.573075771331787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111889, + "balance_loss_mlp": 1.06968939, + "epoch": 0.5144286263947672, + "flos": 526601765376.0, + "grad_norm": 0.050226260736663565, + "language_loss": 0.87357539, + "learning_rate": 0.0005009346263855848, + "loss": 0.88469428, + "num_input_tokens_seen": 222888448, + "router_z_loss_mlp": 0.42211914, + "step": 2674, + "time_per_iteration": 2.6014504432678223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100642, + "balance_loss_mlp": 1.06106424, + "epoch": 0.5146210080800308, + "flos": 486252149760.0, + "grad_norm": 0.047502810841318265, + "language_loss": 0.8393209, + "learning_rate": 0.0005006230844586422, + "loss": 0.85032737, + "num_input_tokens_seen": 222964736, + "router_z_loss_mlp": 0.39599609, + "step": 2675, + "time_per_iteration": 2.7817234992980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102843, + "balance_loss_mlp": 1.06152487, + "epoch": 0.5148133897652943, + "flos": 515622440448.0, + "grad_norm": 0.04472754928085029, + "language_loss": 0.79101396, + "learning_rate": 0.0005003115422897968, + "loss": 0.80204242, + "num_input_tokens_seen": 223040944, + "router_z_loss_mlp": 0.4128418, + "step": 2676, + "time_per_iteration": 2.72664213180542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102609, + "balance_loss_mlp": 1.06243563, + "epoch": 0.5150057714505579, + "flos": 510963614208.0, + "grad_norm": 0.061230997357755966, + "language_loss": 0.86760038, + "learning_rate": 0.0005, + "loss": 0.87862647, + "num_input_tokens_seen": 223109632, + "router_z_loss_mlp": 0.40161133, + "step": 2677, + "time_per_iteration": 2.6518850326538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095319, + "balance_loss_mlp": 1.05648041, + "epoch": 0.5151981531358215, + "flos": 910541164032.0, + "grad_norm": 0.056847893934042666, + "language_loss": 0.79409456, + "learning_rate": 0.0004996884577102033, + "loss": 0.80504775, + "num_input_tokens_seen": 223191648, + "router_z_loss_mlp": 0.38818359, + "step": 2678, + "time_per_iteration": 3.0679850578308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096526, + "balance_loss_mlp": 1.05623293, + "epoch": 0.515390534821085, + "flos": 471599013888.0, + "grad_norm": 0.047432465044858714, + "language_loss": 0.8447082, + "learning_rate": 0.000499376915541358, + "loss": 0.85567349, + "num_input_tokens_seen": 223265920, + "router_z_loss_mlp": 0.40283203, + "step": 2679, + "time_per_iteration": 2.6785037517547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099943, + "balance_loss_mlp": 1.06086659, + "epoch": 0.5155829165063486, + "flos": 649811137536.0, + "grad_norm": 0.04795230358992159, + "language_loss": 0.81296241, + "learning_rate": 0.0004990653736144155, + "loss": 0.82396191, + "num_input_tokens_seen": 223340688, + "router_z_loss_mlp": 0.390625, + "step": 2680, + "time_per_iteration": 2.840188980102539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100494, + "balance_loss_mlp": 1.06072533, + "epoch": 0.5157752981916122, + "flos": 414038476800.0, + "grad_norm": 0.062126395708719404, + "language_loss": 0.86077356, + "learning_rate": 0.0004987538320503271, + "loss": 0.87177849, + "num_input_tokens_seen": 223404064, + "router_z_loss_mlp": 0.3972168, + "step": 2681, + "time_per_iteration": 2.4594664573669434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100598, + "balance_loss_mlp": 1.06054354, + "epoch": 0.5159676798768758, + "flos": 553569859584.0, + "grad_norm": 0.05537703124714055, + "language_loss": 0.82735646, + "learning_rate": 0.0004984422909700442, + "loss": 0.83836246, + "num_input_tokens_seen": 223476784, + "router_z_loss_mlp": 0.39990234, + "step": 2682, + "time_per_iteration": 2.66052508354187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091816, + "balance_loss_mlp": 1.05292952, + "epoch": 0.5161600615621393, + "flos": 586234229760.0, + "grad_norm": 0.051780542585777085, + "language_loss": 0.83951235, + "learning_rate": 0.0004981307504945173, + "loss": 0.85043043, + "num_input_tokens_seen": 223542832, + "router_z_loss_mlp": 0.38867188, + "step": 2683, + "time_per_iteration": 2.6698381900787354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109715, + "balance_loss_mlp": 1.05766809, + "epoch": 0.5163524432474028, + "flos": 588568025088.0, + "grad_norm": 0.05164690349476628, + "language_loss": 0.8939817, + "learning_rate": 0.0004978192107446976, + "loss": 0.90495312, + "num_input_tokens_seen": 223617968, + "router_z_loss_mlp": 0.39428711, + "step": 2684, + "time_per_iteration": 2.7249348163604736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095053, + "balance_loss_mlp": 1.05325842, + "epoch": 0.5165448249326664, + "flos": 503642133504.0, + "grad_norm": 0.05677264338484585, + "language_loss": 0.87172639, + "learning_rate": 0.0004975076718415353, + "loss": 0.8826769, + "num_input_tokens_seen": 223689504, + "router_z_loss_mlp": 0.41796875, + "step": 2685, + "time_per_iteration": 2.599235773086548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086892, + "balance_loss_mlp": 1.04676652, + "epoch": 0.51673720661793, + "flos": 416539597824.0, + "grad_norm": 0.05087662124677675, + "language_loss": 0.90954995, + "learning_rate": 0.0004971961339059806, + "loss": 0.92041892, + "num_input_tokens_seen": 223752288, + "router_z_loss_mlp": 0.40112305, + "step": 2686, + "time_per_iteration": 2.4647631645202637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091735, + "balance_loss_mlp": 1.04986906, + "epoch": 0.5169295883031936, + "flos": 598696164864.0, + "grad_norm": 0.1190187036629449, + "language_loss": 0.83923638, + "learning_rate": 0.0004968845970589832, + "loss": 0.85015374, + "num_input_tokens_seen": 223822304, + "router_z_loss_mlp": 0.41870117, + "step": 2687, + "time_per_iteration": 2.6631908416748047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087993, + "balance_loss_mlp": 1.04793859, + "epoch": 0.517121969988457, + "flos": 556543844352.0, + "grad_norm": 0.06869038553700607, + "language_loss": 0.8455354, + "learning_rate": 0.0004965730614214926, + "loss": 0.85641533, + "num_input_tokens_seen": 223888592, + "router_z_loss_mlp": 0.40039062, + "step": 2688, + "time_per_iteration": 2.628286361694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098374, + "balance_loss_mlp": 1.05576849, + "epoch": 0.5173143516737206, + "flos": 469214346240.0, + "grad_norm": 0.05001993876024353, + "language_loss": 0.85256827, + "learning_rate": 0.0004962615271144576, + "loss": 0.86355197, + "num_input_tokens_seen": 223952880, + "router_z_loss_mlp": 0.42602539, + "step": 2689, + "time_per_iteration": 2.5224428176879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091114, + "balance_loss_mlp": 1.05017805, + "epoch": 0.5175067333589842, + "flos": 719739067392.0, + "grad_norm": 0.0600896413832987, + "language_loss": 0.82435369, + "learning_rate": 0.0004959499942588264, + "loss": 0.8352648, + "num_input_tokens_seen": 224030000, + "router_z_loss_mlp": 0.40917969, + "step": 2690, + "time_per_iteration": 2.923792600631714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043722, + "balance_loss_mlp": 1.02932107, + "epoch": 0.5176991150442478, + "flos": 1465402834944.0, + "grad_norm": 0.02659438930583784, + "language_loss": 0.78200024, + "learning_rate": 0.0004956384629755469, + "loss": 0.79243743, + "num_input_tokens_seen": 224252384, + "router_z_loss_mlp": 0.14355469, + "step": 2691, + "time_per_iteration": 4.779648542404175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089552, + "balance_loss_mlp": 1.04863954, + "epoch": 0.5178914967295114, + "flos": 612345346560.0, + "grad_norm": 0.05374555179207371, + "language_loss": 0.85215712, + "learning_rate": 0.0004953269333855661, + "loss": 0.86305267, + "num_input_tokens_seen": 224324640, + "router_z_loss_mlp": 0.40917969, + "step": 2692, + "time_per_iteration": 2.7646090984344482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086121, + "balance_loss_mlp": 1.04604328, + "epoch": 0.5180838784147749, + "flos": 500663766528.0, + "grad_norm": 0.05670677168127033, + "language_loss": 0.84148359, + "learning_rate": 0.0004950154056098309, + "loss": 0.85234475, + "num_input_tokens_seen": 224398368, + "router_z_loss_mlp": 0.40039062, + "step": 2693, + "time_per_iteration": 2.7038145065307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088689, + "balance_loss_mlp": 1.0469892, + "epoch": 0.5182762601000385, + "flos": 688531166208.0, + "grad_norm": 0.05599909013755839, + "language_loss": 0.84343493, + "learning_rate": 0.0004947038797692867, + "loss": 0.85432184, + "num_input_tokens_seen": 224465456, + "router_z_loss_mlp": 0.41699219, + "step": 2694, + "time_per_iteration": 2.8155903816223145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092198, + "balance_loss_mlp": 1.05147612, + "epoch": 0.518468641785302, + "flos": 665315458560.0, + "grad_norm": 0.046372715162849826, + "language_loss": 0.77593923, + "learning_rate": 0.0004943923559848789, + "loss": 0.7868613, + "num_input_tokens_seen": 224540960, + "router_z_loss_mlp": 0.40698242, + "step": 2695, + "time_per_iteration": 2.787229061126709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086627, + "balance_loss_mlp": 1.04714453, + "epoch": 0.5186610234705656, + "flos": 566440639488.0, + "grad_norm": 0.05332286724917534, + "language_loss": 0.89972508, + "learning_rate": 0.0004940808343775515, + "loss": 0.9105913, + "num_input_tokens_seen": 224613200, + "router_z_loss_mlp": 0.39453125, + "step": 2696, + "time_per_iteration": 2.6648201942443848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083742, + "balance_loss_mlp": 1.04292464, + "epoch": 0.5188534051558291, + "flos": 428652324864.0, + "grad_norm": 0.055572994373314345, + "language_loss": 0.82251114, + "learning_rate": 0.0004937693150682479, + "loss": 0.83334857, + "num_input_tokens_seen": 224677456, + "router_z_loss_mlp": 0.40820312, + "step": 2697, + "time_per_iteration": 2.5013554096221924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089603, + "balance_loss_mlp": 1.04804635, + "epoch": 0.5190457868410927, + "flos": 546085435392.0, + "grad_norm": 0.05634548635888483, + "language_loss": 0.7652837, + "learning_rate": 0.0004934577981779107, + "loss": 0.77617967, + "num_input_tokens_seen": 224745600, + "router_z_loss_mlp": 0.41552734, + "step": 2698, + "time_per_iteration": 2.7512943744659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092838, + "balance_loss_mlp": 1.04958856, + "epoch": 0.5192381685263563, + "flos": 548321716224.0, + "grad_norm": 0.04670174030259061, + "language_loss": 0.81419832, + "learning_rate": 0.0004931462838274817, + "loss": 0.82512677, + "num_input_tokens_seen": 224826944, + "router_z_loss_mlp": 0.43237305, + "step": 2699, + "time_per_iteration": 2.8294084072113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082299, + "balance_loss_mlp": 1.04296041, + "epoch": 0.5194305502116199, + "flos": 574993036800.0, + "grad_norm": 0.05440575131052059, + "language_loss": 0.83835357, + "learning_rate": 0.0004928347721379011, + "loss": 0.84917653, + "num_input_tokens_seen": 224895280, + "router_z_loss_mlp": 0.39331055, + "step": 2700, + "time_per_iteration": 2.643941879272461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084407, + "balance_loss_mlp": 1.04485357, + "epoch": 0.5196229318968835, + "flos": 434019741696.0, + "grad_norm": 0.054958496552239416, + "language_loss": 0.81611145, + "learning_rate": 0.0004925232632301089, + "loss": 0.8269555, + "num_input_tokens_seen": 224961632, + "router_z_loss_mlp": 0.39526367, + "step": 2701, + "time_per_iteration": 2.5408122539520264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084434, + "balance_loss_mlp": 1.04638255, + "epoch": 0.5198153135821469, + "flos": 558607007232.0, + "grad_norm": 0.05193596738822722, + "language_loss": 0.79534626, + "learning_rate": 0.0004922117572250431, + "loss": 0.80619061, + "num_input_tokens_seen": 225032816, + "router_z_loss_mlp": 0.38037109, + "step": 2702, + "time_per_iteration": 2.6687467098236084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078961, + "balance_loss_mlp": 1.04152906, + "epoch": 0.5200076952674105, + "flos": 565397397504.0, + "grad_norm": 0.04814908286006495, + "language_loss": 0.80652344, + "learning_rate": 0.0004919002542436414, + "loss": 0.81731308, + "num_input_tokens_seen": 225112736, + "router_z_loss_mlp": 0.37451172, + "step": 2703, + "time_per_iteration": 2.811460256576538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085168, + "balance_loss_mlp": 1.04644859, + "epoch": 0.5202000769526741, + "flos": 570916173312.0, + "grad_norm": 0.05555982935463854, + "language_loss": 0.81149572, + "learning_rate": 0.0004915887544068399, + "loss": 0.8223474, + "num_input_tokens_seen": 225182672, + "router_z_loss_mlp": 0.38720703, + "step": 2704, + "time_per_iteration": 2.6499714851379395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093505, + "balance_loss_mlp": 1.05199671, + "epoch": 0.5203924586379377, + "flos": 693898583040.0, + "grad_norm": 0.050837486186397586, + "language_loss": 0.77994883, + "learning_rate": 0.0004912772578355736, + "loss": 0.7908839, + "num_input_tokens_seen": 225260272, + "router_z_loss_mlp": 0.41503906, + "step": 2705, + "time_per_iteration": 2.8637514114379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094973, + "balance_loss_mlp": 1.0555619, + "epoch": 0.5205848403232012, + "flos": 566215087104.0, + "grad_norm": 0.054100857686445215, + "language_loss": 0.8301729, + "learning_rate": 0.000490965764650776, + "loss": 0.84112263, + "num_input_tokens_seen": 225337120, + "router_z_loss_mlp": 0.39404297, + "step": 2706, + "time_per_iteration": 2.8644323348999023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085669, + "balance_loss_mlp": 1.04661632, + "epoch": 0.5207772220084648, + "flos": 1213775539200.0, + "grad_norm": 0.05228956126941533, + "language_loss": 0.82813179, + "learning_rate": 0.0004906542749733798, + "loss": 0.83898848, + "num_input_tokens_seen": 225433984, + "router_z_loss_mlp": 0.39013672, + "step": 2707, + "time_per_iteration": 3.6128242015838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084528, + "balance_loss_mlp": 1.04635715, + "epoch": 0.5209696036937284, + "flos": 592547374080.0, + "grad_norm": 0.12708447176708407, + "language_loss": 0.84871459, + "learning_rate": 0.0004903427889243156, + "loss": 0.85955989, + "num_input_tokens_seen": 225512112, + "router_z_loss_mlp": 0.38134766, + "step": 2708, + "time_per_iteration": 2.86226487159729 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109093, + "balance_loss_mlp": 1.05211544, + "epoch": 0.5211619853789919, + "flos": 522623826432.0, + "grad_norm": 0.05348625186790992, + "language_loss": 0.85548282, + "learning_rate": 0.0004900313066245134, + "loss": 0.86639208, + "num_input_tokens_seen": 225586944, + "router_z_loss_mlp": 0.38818359, + "step": 2709, + "time_per_iteration": 2.662485122680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081893, + "balance_loss_mlp": 1.0432452, + "epoch": 0.5213543670642555, + "flos": 502534872576.0, + "grad_norm": 0.050688452880556414, + "language_loss": 0.80490649, + "learning_rate": 0.0004897198281949012, + "loss": 0.81572545, + "num_input_tokens_seen": 225657184, + "router_z_loss_mlp": 0.38647461, + "step": 2710, + "time_per_iteration": 2.6449263095855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085994, + "balance_loss_mlp": 1.04636908, + "epoch": 0.521546748749519, + "flos": 585682790400.0, + "grad_norm": 0.05860885905894002, + "language_loss": 0.77534401, + "learning_rate": 0.0004894083537564057, + "loss": 0.78620392, + "num_input_tokens_seen": 225729968, + "router_z_loss_mlp": 0.39599609, + "step": 2711, + "time_per_iteration": 2.7473373413085938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083493, + "balance_loss_mlp": 1.04458284, + "epoch": 0.5217391304347826, + "flos": 569833643520.0, + "grad_norm": 0.04954385524753536, + "language_loss": 0.80801934, + "learning_rate": 0.0004890968834299519, + "loss": 0.81885427, + "num_input_tokens_seen": 225801808, + "router_z_loss_mlp": 0.38867188, + "step": 2712, + "time_per_iteration": 2.7709779739379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084865, + "balance_loss_mlp": 1.04621696, + "epoch": 0.5219315121200462, + "flos": 542501784576.0, + "grad_norm": 0.06807472429400872, + "language_loss": 0.78801876, + "learning_rate": 0.0004887854173364633, + "loss": 0.7988674, + "num_input_tokens_seen": 225878576, + "router_z_loss_mlp": 0.38623047, + "step": 2713, + "time_per_iteration": 2.710489273071289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084971, + "balance_loss_mlp": 1.04713416, + "epoch": 0.5221238938053098, + "flos": 550006557696.0, + "grad_norm": 0.048000843690728094, + "language_loss": 0.81816071, + "learning_rate": 0.0004884739555968617, + "loss": 0.82901043, + "num_input_tokens_seen": 225960096, + "router_z_loss_mlp": 0.37866211, + "step": 2714, + "time_per_iteration": 2.8097493648529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01030446, + "balance_loss_mlp": 1.01785719, + "epoch": 0.5223162754905732, + "flos": 1354373028864.0, + "grad_norm": 0.016208306264550634, + "language_loss": 0.78977054, + "learning_rate": 0.0004881624983320676, + "loss": 0.80007499, + "num_input_tokens_seen": 226184960, + "router_z_loss_mlp": 0.12597656, + "step": 2715, + "time_per_iteration": 4.9789557456970215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083164, + "balance_loss_mlp": 1.04444456, + "epoch": 0.5225086571758368, + "flos": 567441621504.0, + "grad_norm": 0.04806245104826077, + "language_loss": 0.86670554, + "learning_rate": 0.0004878510456629992, + "loss": 0.87753725, + "num_input_tokens_seen": 226271328, + "router_z_loss_mlp": 0.38696289, + "step": 2716, + "time_per_iteration": 3.015443801879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084237, + "balance_loss_mlp": 1.0459466, + "epoch": 0.5227010388611004, + "flos": 499914478080.0, + "grad_norm": 0.051081355886524536, + "language_loss": 0.85046101, + "learning_rate": 0.00048753959771057314, + "loss": 0.86130333, + "num_input_tokens_seen": 226340080, + "router_z_loss_mlp": 0.3828125, + "step": 2717, + "time_per_iteration": 2.623352289199829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084285, + "balance_loss_mlp": 1.04539871, + "epoch": 0.522893420546364, + "flos": 597372115968.0, + "grad_norm": 0.0531417340924391, + "language_loss": 0.82181746, + "learning_rate": 0.0004872281545957044, + "loss": 0.83266038, + "num_input_tokens_seen": 226415120, + "router_z_loss_mlp": 0.38842773, + "step": 2718, + "time_per_iteration": 2.7300612926483154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080864, + "balance_loss_mlp": 1.04154897, + "epoch": 0.5230858022316276, + "flos": 664278008832.0, + "grad_norm": 0.05093940259468129, + "language_loss": 0.85964847, + "learning_rate": 0.0004869167164393055, + "loss": 0.87045711, + "num_input_tokens_seen": 226501200, + "router_z_loss_mlp": 0.39306641, + "step": 2719, + "time_per_iteration": 2.9219412803649902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108043, + "balance_loss_mlp": 1.04206884, + "epoch": 0.5232781839168911, + "flos": 603547047936.0, + "grad_norm": 0.04294663688852852, + "language_loss": 0.89195794, + "learning_rate": 0.00048660528336228793, + "loss": 0.90276217, + "num_input_tokens_seen": 226582064, + "router_z_loss_mlp": 0.38330078, + "step": 2720, + "time_per_iteration": 2.7792000770568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076128, + "balance_loss_mlp": 1.03781438, + "epoch": 0.5234705656021547, + "flos": 550438723584.0, + "grad_norm": 0.04780199229625597, + "language_loss": 0.90052795, + "learning_rate": 0.0004862938554855606, + "loss": 0.91128922, + "num_input_tokens_seen": 226656448, + "router_z_loss_mlp": 0.3828125, + "step": 2721, + "time_per_iteration": 2.781075954437256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083526, + "balance_loss_mlp": 1.04509294, + "epoch": 0.5236629472874182, + "flos": 504026247168.0, + "grad_norm": 0.06026541291367098, + "language_loss": 0.85920995, + "learning_rate": 0.0004859824329300304, + "loss": 0.87004519, + "num_input_tokens_seen": 226725568, + "router_z_loss_mlp": 0.3840332, + "step": 2722, + "time_per_iteration": 2.5523464679718018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078682, + "balance_loss_mlp": 1.04043949, + "epoch": 0.5238553289726818, + "flos": 547394927616.0, + "grad_norm": 0.04759572809953804, + "language_loss": 0.83678633, + "learning_rate": 0.00048567101581660244, + "loss": 0.84757316, + "num_input_tokens_seen": 226795728, + "router_z_loss_mlp": 0.38208008, + "step": 2723, + "time_per_iteration": 2.62168288230896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081139, + "balance_loss_mlp": 1.04208636, + "epoch": 0.5240477106579453, + "flos": 531702931968.0, + "grad_norm": 0.060086559712579084, + "language_loss": 0.87061596, + "learning_rate": 0.00048535960426617956, + "loss": 0.88142729, + "num_input_tokens_seen": 226865344, + "router_z_loss_mlp": 0.39038086, + "step": 2724, + "time_per_iteration": 2.5913078784942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081015, + "balance_loss_mlp": 1.04208124, + "epoch": 0.5242400923432089, + "flos": 617653126656.0, + "grad_norm": 0.05554996608046291, + "language_loss": 0.81582165, + "learning_rate": 0.0004850481983996621, + "loss": 0.82663178, + "num_input_tokens_seen": 226936800, + "router_z_loss_mlp": 0.3894043, + "step": 2725, + "time_per_iteration": 2.744001865386963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083025, + "balance_loss_mlp": 1.04366207, + "epoch": 0.5244324740284725, + "flos": 416461022208.0, + "grad_norm": 0.051041166575027594, + "language_loss": 0.87690443, + "learning_rate": 0.0004847367983379492, + "loss": 0.88773465, + "num_input_tokens_seen": 226998448, + "router_z_loss_mlp": 0.39331055, + "step": 2726, + "time_per_iteration": 2.452622652053833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081299, + "balance_loss_mlp": 1.04327154, + "epoch": 0.5246248557137361, + "flos": 626113801728.0, + "grad_norm": 0.0465947896589182, + "language_loss": 0.7866348, + "learning_rate": 0.00048442540420193643, + "loss": 0.7974478, + "num_input_tokens_seen": 227081872, + "router_z_loss_mlp": 0.38012695, + "step": 2727, + "time_per_iteration": 2.8958897590637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085515, + "balance_loss_mlp": 1.04524565, + "epoch": 0.5248172373989997, + "flos": 1247980746240.0, + "grad_norm": 0.0639927904505779, + "language_loss": 0.79006433, + "learning_rate": 0.0004841140161125182, + "loss": 0.80091947, + "num_input_tokens_seen": 227167744, + "router_z_loss_mlp": 0.40234375, + "step": 2728, + "time_per_iteration": 3.5769736766815186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109207, + "balance_loss_mlp": 1.05370796, + "epoch": 0.5250096190842631, + "flos": 506616118272.0, + "grad_norm": 0.05909227072060698, + "language_loss": 0.84801137, + "learning_rate": 0.0004838026341905857, + "loss": 0.85893214, + "num_input_tokens_seen": 227239136, + "router_z_loss_mlp": 0.38354492, + "step": 2729, + "time_per_iteration": 2.6979076862335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082745, + "balance_loss_mlp": 1.04476523, + "epoch": 0.5252020007695267, + "flos": 611021297664.0, + "grad_norm": 0.0531469423300266, + "language_loss": 0.85391581, + "learning_rate": 0.00048349125855702844, + "loss": 0.86474323, + "num_input_tokens_seen": 227311968, + "router_z_loss_mlp": 0.37915039, + "step": 2730, + "time_per_iteration": 2.7757534980773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084625, + "balance_loss_mlp": 1.04669309, + "epoch": 0.5253943824547903, + "flos": 538970568192.0, + "grad_norm": 0.04649712268604906, + "language_loss": 0.81255782, + "learning_rate": 0.00048317988933273287, + "loss": 0.82340407, + "num_input_tokens_seen": 227385248, + "router_z_loss_mlp": 0.37939453, + "step": 2731, + "time_per_iteration": 2.7401769161224365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094943, + "balance_loss_mlp": 1.05476904, + "epoch": 0.5255867641400539, + "flos": 697714988544.0, + "grad_norm": 0.05136039584795155, + "language_loss": 0.82178587, + "learning_rate": 0.00048286852663858367, + "loss": 0.8327353, + "num_input_tokens_seen": 227464640, + "router_z_loss_mlp": 0.40161133, + "step": 2732, + "time_per_iteration": 2.9572720527648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088204, + "balance_loss_mlp": 1.05084419, + "epoch": 0.5257791458253175, + "flos": 666975568896.0, + "grad_norm": 0.08443038207797475, + "language_loss": 0.83823925, + "learning_rate": 0.000482557170595462, + "loss": 0.84912133, + "num_input_tokens_seen": 227542192, + "router_z_loss_mlp": 0.37304688, + "step": 2733, + "time_per_iteration": 2.881659746170044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092705, + "balance_loss_mlp": 1.05443931, + "epoch": 0.525971527510581, + "flos": 483375679488.0, + "grad_norm": 0.04826672636793544, + "language_loss": 0.87744856, + "learning_rate": 0.0004822458213242475, + "loss": 0.88837564, + "num_input_tokens_seen": 227606096, + "router_z_loss_mlp": 0.38232422, + "step": 2734, + "time_per_iteration": 2.5599043369293213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090689, + "balance_loss_mlp": 1.05270863, + "epoch": 0.5261639091958445, + "flos": 829559715840.0, + "grad_norm": 0.055467035242162094, + "language_loss": 0.85945731, + "learning_rate": 0.00048193447894581627, + "loss": 0.87036419, + "num_input_tokens_seen": 227689552, + "router_z_loss_mlp": 0.37988281, + "step": 2735, + "time_per_iteration": 3.1253552436828613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102448, + "balance_loss_mlp": 1.06258464, + "epoch": 0.5263562908811081, + "flos": 520461739008.0, + "grad_norm": 0.05936611315903256, + "language_loss": 0.87591684, + "learning_rate": 0.00048162314358104243, + "loss": 0.88694137, + "num_input_tokens_seen": 227760784, + "router_z_loss_mlp": 0.39868164, + "step": 2736, + "time_per_iteration": 2.5996334552764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094957, + "balance_loss_mlp": 1.05704832, + "epoch": 0.5265486725663717, + "flos": 574722404352.0, + "grad_norm": 0.047689297469847035, + "language_loss": 0.82871807, + "learning_rate": 0.0004813118153507969, + "loss": 0.83966762, + "num_input_tokens_seen": 227834304, + "router_z_loss_mlp": 0.37890625, + "step": 2737, + "time_per_iteration": 2.7455976009368896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058466, + "balance_loss_mlp": 1.04540098, + "epoch": 0.5267410542516352, + "flos": 1546439537664.0, + "grad_norm": 0.021507379855054985, + "language_loss": 0.82447124, + "learning_rate": 0.0004810004943759482, + "loss": 0.83505595, + "num_input_tokens_seen": 228057232, + "router_z_loss_mlp": 0.13085938, + "step": 2738, + "time_per_iteration": 4.774937629699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110016, + "balance_loss_mlp": 1.06184578, + "epoch": 0.5269334359368988, + "flos": 929576701440.0, + "grad_norm": 0.045277698895202834, + "language_loss": 0.83199632, + "learning_rate": 0.00048068918077736163, + "loss": 0.84299791, + "num_input_tokens_seen": 228140816, + "router_z_loss_mlp": 0.38305664, + "step": 2739, + "time_per_iteration": 3.253458261489868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102121, + "balance_loss_mlp": 1.06256771, + "epoch": 0.5271258176221624, + "flos": 655079629824.0, + "grad_norm": 0.05720476143842487, + "language_loss": 0.81167477, + "learning_rate": 0.0004803778746759001, + "loss": 0.82269597, + "num_input_tokens_seen": 228216208, + "router_z_loss_mlp": 0.39526367, + "step": 2740, + "time_per_iteration": 2.890253782272339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095422, + "balance_loss_mlp": 1.05777621, + "epoch": 0.527318199307426, + "flos": 542781181440.0, + "grad_norm": 0.064499445698322, + "language_loss": 0.81573081, + "learning_rate": 0.00048006657619242317, + "loss": 0.82668501, + "num_input_tokens_seen": 228283184, + "router_z_loss_mlp": 0.37646484, + "step": 2741, + "time_per_iteration": 2.696274518966675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104347, + "balance_loss_mlp": 1.06419694, + "epoch": 0.5275105809926895, + "flos": 447629635584.0, + "grad_norm": 0.05845576302131632, + "language_loss": 0.78272831, + "learning_rate": 0.00047975528544778775, + "loss": 0.79377174, + "num_input_tokens_seen": 228351328, + "router_z_loss_mlp": 0.40112305, + "step": 2742, + "time_per_iteration": 2.6140294075012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094189, + "balance_loss_mlp": 1.05508804, + "epoch": 0.527702962677953, + "flos": 578656673280.0, + "grad_norm": 0.058395918180573554, + "language_loss": 0.88265073, + "learning_rate": 0.00047944400256284754, + "loss": 0.89359266, + "num_input_tokens_seen": 228423632, + "router_z_loss_mlp": 0.39086914, + "step": 2743, + "time_per_iteration": 2.7063393592834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097827, + "balance_loss_mlp": 1.0614922, + "epoch": 0.5278953443632166, + "flos": 652465027584.0, + "grad_norm": 0.07282412653967131, + "language_loss": 0.79796684, + "learning_rate": 0.0004791327276584532, + "loss": 0.80894512, + "num_input_tokens_seen": 228498736, + "router_z_loss_mlp": 0.36352539, + "step": 2744, + "time_per_iteration": 2.8260412216186523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109844, + "balance_loss_mlp": 1.06031692, + "epoch": 0.5280877260484802, + "flos": 513741159936.0, + "grad_norm": 0.04991281876590649, + "language_loss": 0.80703586, + "learning_rate": 0.00047882146085545264, + "loss": 0.81802028, + "num_input_tokens_seen": 228569056, + "router_z_loss_mlp": 0.38061523, + "step": 2745, + "time_per_iteration": 2.6051464080810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018989, + "balance_loss_mlp": 1.00611436, + "epoch": 0.5282801077337438, + "flos": 1444650370560.0, + "grad_norm": 0.010819489631099216, + "language_loss": 0.75402379, + "learning_rate": 0.00047851020227469, + "loss": 0.76421368, + "num_input_tokens_seen": 228800560, + "router_z_loss_mlp": 0.12890625, + "step": 2746, + "time_per_iteration": 4.9944517612457275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084488, + "balance_loss_mlp": 1.0470562, + "epoch": 0.5284724894190073, + "flos": 604580115456.0, + "grad_norm": 0.058273426421755106, + "language_loss": 0.79290295, + "learning_rate": 0.00047819895203700684, + "loss": 0.80374789, + "num_input_tokens_seen": 228869216, + "router_z_loss_mlp": 0.37451172, + "step": 2747, + "time_per_iteration": 2.728018045425415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016603, + "balance_loss_mlp": 1.00410998, + "epoch": 0.5286648711042709, + "flos": 1494172224000.0, + "grad_norm": 0.012264329558562137, + "language_loss": 0.75512433, + "learning_rate": 0.0004778877102632412, + "loss": 0.76529038, + "num_input_tokens_seen": 229085520, + "router_z_loss_mlp": 0.125, + "step": 2748, + "time_per_iteration": 4.659038782119751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077352, + "balance_loss_mlp": 1.03860867, + "epoch": 0.5288572527895344, + "flos": 597313889280.0, + "grad_norm": 0.056212558578819974, + "language_loss": 0.88259304, + "learning_rate": 0.0004775764770742277, + "loss": 0.89336658, + "num_input_tokens_seen": 229160912, + "router_z_loss_mlp": 0.38720703, + "step": 2749, + "time_per_iteration": 2.845102548599243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086383, + "balance_loss_mlp": 1.04699659, + "epoch": 0.529049634474798, + "flos": 557041439232.0, + "grad_norm": 0.05924821658857843, + "language_loss": 0.86565638, + "learning_rate": 0.00047726525259079777, + "loss": 0.87652022, + "num_input_tokens_seen": 229235792, + "router_z_loss_mlp": 0.39404297, + "step": 2750, + "time_per_iteration": 2.773296356201172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085746, + "balance_loss_mlp": 1.04793251, + "epoch": 0.5292420161600616, + "flos": 580986086400.0, + "grad_norm": 0.05670035904014211, + "language_loss": 0.885436, + "learning_rate": 0.0004769540369337798, + "loss": 0.89629346, + "num_input_tokens_seen": 229309984, + "router_z_loss_mlp": 0.37792969, + "step": 2751, + "time_per_iteration": 2.715921401977539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084379, + "balance_loss_mlp": 1.04563594, + "epoch": 0.5294343978453251, + "flos": 607989086208.0, + "grad_norm": 0.05448198338431079, + "language_loss": 0.86051679, + "learning_rate": 0.00047664283022399794, + "loss": 0.87136054, + "num_input_tokens_seen": 229394000, + "router_z_loss_mlp": 0.38720703, + "step": 2752, + "time_per_iteration": 2.8683502674102783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078933, + "balance_loss_mlp": 1.04078627, + "epoch": 0.5296267795305887, + "flos": 646226076672.0, + "grad_norm": 0.05827570747642561, + "language_loss": 0.81129229, + "learning_rate": 0.00047633163258227376, + "loss": 0.82208163, + "num_input_tokens_seen": 229474320, + "router_z_loss_mlp": 0.38110352, + "step": 2753, + "time_per_iteration": 2.8427987098693848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084727, + "balance_loss_mlp": 1.04595971, + "epoch": 0.5298191612158523, + "flos": 559482923520.0, + "grad_norm": 0.14342502720880523, + "language_loss": 0.85232151, + "learning_rate": 0.0004760204441294247, + "loss": 0.86316884, + "num_input_tokens_seen": 229543072, + "router_z_loss_mlp": 0.38745117, + "step": 2754, + "time_per_iteration": 2.644049882888794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088138, + "balance_loss_mlp": 1.05096865, + "epoch": 0.5300115429011159, + "flos": 513776065536.0, + "grad_norm": 0.052931776937271004, + "language_loss": 0.86139393, + "learning_rate": 0.00047570926498626486, + "loss": 0.87227535, + "num_input_tokens_seen": 229615296, + "router_z_loss_mlp": 0.37133789, + "step": 2755, + "time_per_iteration": 2.6872901916503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092723, + "balance_loss_mlp": 1.05402756, + "epoch": 0.5302039245863793, + "flos": 672475405824.0, + "grad_norm": 0.0470441247054563, + "language_loss": 0.81654894, + "learning_rate": 0.00047539809527360474, + "loss": 0.82747614, + "num_input_tokens_seen": 229693728, + "router_z_loss_mlp": 0.38696289, + "step": 2756, + "time_per_iteration": 2.865553379058838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093267, + "balance_loss_mlp": 1.05488133, + "epoch": 0.5303963062716429, + "flos": 730507396608.0, + "grad_norm": 0.04188022637432273, + "language_loss": 0.82037127, + "learning_rate": 0.0004750869351122511, + "loss": 0.83130395, + "num_input_tokens_seen": 229772144, + "router_z_loss_mlp": 0.38330078, + "step": 2757, + "time_per_iteration": 2.9792187213897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093906, + "balance_loss_mlp": 1.0563792, + "epoch": 0.5305886879569065, + "flos": 573156836352.0, + "grad_norm": 0.0631181134246054, + "language_loss": 0.81604397, + "learning_rate": 0.00047477578462300685, + "loss": 0.82698298, + "num_input_tokens_seen": 229847024, + "router_z_loss_mlp": 0.37524414, + "step": 2758, + "time_per_iteration": 2.6986684799194336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093632, + "balance_loss_mlp": 1.05553293, + "epoch": 0.5307810696421701, + "flos": 694988315136.0, + "grad_norm": 0.050985358767642326, + "language_loss": 0.79166949, + "learning_rate": 0.0004744646439266718, + "loss": 0.80260581, + "num_input_tokens_seen": 229932416, + "router_z_loss_mlp": 0.38085938, + "step": 2759, + "time_per_iteration": 2.978621006011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091884, + "balance_loss_mlp": 1.05342746, + "epoch": 0.5309734513274337, + "flos": 648629683200.0, + "grad_norm": 0.042424952199748935, + "language_loss": 0.92400765, + "learning_rate": 0.000474153513144041, + "loss": 0.93492657, + "num_input_tokens_seen": 230010976, + "router_z_loss_mlp": 0.38427734, + "step": 2760, + "time_per_iteration": 2.8996803760528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109538, + "balance_loss_mlp": 1.05534935, + "epoch": 0.5311658330126972, + "flos": 604517506560.0, + "grad_norm": 0.048779343359875056, + "language_loss": 0.86932075, + "learning_rate": 0.00047384239239590633, + "loss": 0.88027459, + "num_input_tokens_seen": 230093344, + "router_z_loss_mlp": 0.39990234, + "step": 2761, + "time_per_iteration": 2.8649730682373047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090925, + "balance_loss_mlp": 1.05342138, + "epoch": 0.5313582146979607, + "flos": 557995931136.0, + "grad_norm": 0.062125162710189655, + "language_loss": 0.88300002, + "learning_rate": 0.0004735312818030556, + "loss": 0.89390922, + "num_input_tokens_seen": 230165520, + "router_z_loss_mlp": 0.37475586, + "step": 2762, + "time_per_iteration": 2.664534091949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108596, + "balance_loss_mlp": 1.04776537, + "epoch": 0.5315505963832243, + "flos": 508152572928.0, + "grad_norm": 0.04725442501000759, + "language_loss": 0.82514352, + "learning_rate": 0.0004732201814862727, + "loss": 0.83600307, + "num_input_tokens_seen": 230237808, + "router_z_loss_mlp": 0.38183594, + "step": 2763, + "time_per_iteration": 2.7150583267211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100901, + "balance_loss_mlp": 1.06113279, + "epoch": 0.5317429780684879, + "flos": 626132740608.0, + "grad_norm": 0.050347986684343975, + "language_loss": 0.81810606, + "learning_rate": 0.0004729090915663373, + "loss": 0.82911509, + "num_input_tokens_seen": 230321568, + "router_z_loss_mlp": 0.39746094, + "step": 2764, + "time_per_iteration": 2.837186336517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093533, + "balance_loss_mlp": 1.05509973, + "epoch": 0.5319353597537514, + "flos": 476506713600.0, + "grad_norm": 0.06358705333883939, + "language_loss": 0.85396516, + "learning_rate": 0.00047259801216402534, + "loss": 0.86490047, + "num_input_tokens_seen": 230385376, + "router_z_loss_mlp": 0.38427734, + "step": 2765, + "time_per_iteration": 2.5005743503570557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094119, + "balance_loss_mlp": 1.05592442, + "epoch": 0.532127741439015, + "flos": 501386913792.0, + "grad_norm": 0.06543180937467778, + "language_loss": 0.8612839, + "learning_rate": 0.00047228694340010845, + "loss": 0.87222505, + "num_input_tokens_seen": 230449760, + "router_z_loss_mlp": 0.38183594, + "step": 2766, + "time_per_iteration": 2.549018144607544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096891, + "balance_loss_mlp": 1.0578146, + "epoch": 0.5323201231242786, + "flos": 1164114063360.0, + "grad_norm": 0.04837235133211893, + "language_loss": 0.85614288, + "learning_rate": 0.0004719758853953544, + "loss": 0.8671118, + "num_input_tokens_seen": 230536592, + "router_z_loss_mlp": 0.390625, + "step": 2767, + "time_per_iteration": 3.568779468536377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095476, + "balance_loss_mlp": 1.05709052, + "epoch": 0.5325125048095422, + "flos": 378493254144.0, + "grad_norm": 0.06740098585195309, + "language_loss": 0.84098738, + "learning_rate": 0.00047166483827052645, + "loss": 0.85194218, + "num_input_tokens_seen": 230596688, + "router_z_loss_mlp": 0.38354492, + "step": 2768, + "time_per_iteration": 2.4389522075653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01031004, + "balance_loss_mlp": 1.01784337, + "epoch": 0.5327048864948057, + "flos": 1540507534848.0, + "grad_norm": 0.01937833439113787, + "language_loss": 0.77078491, + "learning_rate": 0.00047135380214638413, + "loss": 0.78109497, + "num_input_tokens_seen": 230829408, + "router_z_loss_mlp": 0.13183594, + "step": 2769, + "time_per_iteration": 4.967049837112427 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093166, + "balance_loss_mlp": 1.05320704, + "epoch": 0.5328972681800692, + "flos": 910877225472.0, + "grad_norm": 0.052506511923680964, + "language_loss": 0.83564013, + "learning_rate": 0.000471042777143682, + "loss": 0.8465718, + "num_input_tokens_seen": 230912528, + "router_z_loss_mlp": 0.3996582, + "step": 2770, + "time_per_iteration": 3.2065277099609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083074, + "balance_loss_mlp": 1.04530883, + "epoch": 0.5330896498653328, + "flos": 473660766720.0, + "grad_norm": 0.0519747156636442, + "language_loss": 0.79680347, + "learning_rate": 0.0004707317633831707, + "loss": 0.80763417, + "num_input_tokens_seen": 230979424, + "router_z_loss_mlp": 0.37744141, + "step": 2771, + "time_per_iteration": 2.5498273372650146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091325, + "balance_loss_mlp": 1.05408382, + "epoch": 0.5332820315505964, + "flos": 501386913792.0, + "grad_norm": 0.05598064533442757, + "language_loss": 0.77608013, + "learning_rate": 0.00047042076098559673, + "loss": 0.78699338, + "num_input_tokens_seen": 231046416, + "router_z_loss_mlp": 0.37231445, + "step": 2772, + "time_per_iteration": 2.5759775638580322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091384, + "balance_loss_mlp": 1.05323732, + "epoch": 0.53347441323586, + "flos": 924043368960.0, + "grad_norm": 0.060675625301583505, + "language_loss": 0.73884845, + "learning_rate": 0.00047010977007170174, + "loss": 0.7497623, + "num_input_tokens_seen": 231136064, + "router_z_loss_mlp": 0.38110352, + "step": 2773, + "time_per_iteration": 3.257833957672119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089952, + "balance_loss_mlp": 1.05099463, + "epoch": 0.5336667949211235, + "flos": 574185521664.0, + "grad_norm": 0.06246333407972351, + "language_loss": 0.82451814, + "learning_rate": 0.00046979879076222334, + "loss": 0.83541769, + "num_input_tokens_seen": 231203616, + "router_z_loss_mlp": 0.38916016, + "step": 2774, + "time_per_iteration": 2.6394476890563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091431, + "balance_loss_mlp": 1.05306923, + "epoch": 0.533859176606387, + "flos": 1064233880064.0, + "grad_norm": 0.044878758318980805, + "language_loss": 0.85063684, + "learning_rate": 0.0004694878231778939, + "loss": 0.86155117, + "num_input_tokens_seen": 231287008, + "router_z_loss_mlp": 0.38330078, + "step": 2775, + "time_per_iteration": 3.3668456077575684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085244, + "balance_loss_mlp": 1.04695392, + "epoch": 0.5340515582916506, + "flos": 746277967872.0, + "grad_norm": 0.04760082973405309, + "language_loss": 0.84270054, + "learning_rate": 0.0004691768674394423, + "loss": 0.85355294, + "num_input_tokens_seen": 231365296, + "router_z_loss_mlp": 0.38305664, + "step": 2776, + "time_per_iteration": 2.9580860137939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038557, + "balance_loss_mlp": 1.02644587, + "epoch": 0.5342439399769142, + "flos": 1444905036288.0, + "grad_norm": 0.01780260433895519, + "language_loss": 0.84484011, + "learning_rate": 0.0004688659236675918, + "loss": 0.85522568, + "num_input_tokens_seen": 231579040, + "router_z_loss_mlp": 0.12109375, + "step": 2777, + "time_per_iteration": 4.798782825469971 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036301, + "balance_loss_mlp": 1.02423704, + "epoch": 0.5344363216621778, + "flos": 1426790495232.0, + "grad_norm": 0.016806659478265918, + "language_loss": 0.76653534, + "learning_rate": 0.00046855499198306187, + "loss": 0.77689832, + "num_input_tokens_seen": 231812736, + "router_z_loss_mlp": 0.12060547, + "step": 2778, + "time_per_iteration": 4.971946477890015 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083696, + "balance_loss_mlp": 1.04650259, + "epoch": 0.5346287033474413, + "flos": 527355436032.0, + "grad_norm": 0.27028176168378437, + "language_loss": 0.79060376, + "learning_rate": 0.00046824407250656676, + "loss": 0.80144072, + "num_input_tokens_seen": 231883840, + "router_z_loss_mlp": 0.37158203, + "step": 2779, + "time_per_iteration": 2.639554738998413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083286, + "balance_loss_mlp": 1.04528189, + "epoch": 0.5348210850327049, + "flos": 510515481600.0, + "grad_norm": 0.04912000707376091, + "language_loss": 0.83288354, + "learning_rate": 0.0004679331653588161, + "loss": 0.84371638, + "num_input_tokens_seen": 231955360, + "router_z_loss_mlp": 0.37988281, + "step": 2780, + "time_per_iteration": 2.590897560119629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082747, + "balance_loss_mlp": 1.04388487, + "epoch": 0.5350134667179685, + "flos": 462429748224.0, + "grad_norm": 0.07636572739089499, + "language_loss": 0.8547262, + "learning_rate": 0.0004676222706605147, + "loss": 0.86555368, + "num_input_tokens_seen": 232027088, + "router_z_loss_mlp": 0.38867188, + "step": 2781, + "time_per_iteration": 2.606795310974121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088005, + "balance_loss_mlp": 1.04647303, + "epoch": 0.535205848403232, + "flos": 708566275584.0, + "grad_norm": 0.05667741573580048, + "language_loss": 0.84751678, + "learning_rate": 0.0004673113885323626, + "loss": 0.85839683, + "num_input_tokens_seen": 232099472, + "router_z_loss_mlp": 0.4152832, + "step": 2782, + "time_per_iteration": 2.813957691192627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084509, + "balance_loss_mlp": 1.04507411, + "epoch": 0.5353982300884956, + "flos": 893855388672.0, + "grad_norm": 0.04933634097838137, + "language_loss": 0.78395712, + "learning_rate": 0.00046700051909505494, + "loss": 0.79480219, + "num_input_tokens_seen": 232182528, + "router_z_loss_mlp": 0.39404297, + "step": 2783, + "time_per_iteration": 3.151244878768921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089347, + "balance_loss_mlp": 1.0476948, + "epoch": 0.5355906117737591, + "flos": 535701219840.0, + "grad_norm": 0.06378381527079717, + "language_loss": 0.83984947, + "learning_rate": 0.000466689662469282, + "loss": 0.85074294, + "num_input_tokens_seen": 232253344, + "router_z_loss_mlp": 0.41650391, + "step": 2784, + "time_per_iteration": 2.6275248527526855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081581, + "balance_loss_mlp": 1.04159856, + "epoch": 0.5357829934590227, + "flos": 868477593600.0, + "grad_norm": 0.05202541270375375, + "language_loss": 0.83895493, + "learning_rate": 0.00046637881877572917, + "loss": 0.84977078, + "num_input_tokens_seen": 232337232, + "router_z_loss_mlp": 0.3996582, + "step": 2785, + "time_per_iteration": 3.069645404815674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085059, + "balance_loss_mlp": 1.04481411, + "epoch": 0.5359753751442863, + "flos": 552999481344.0, + "grad_norm": 0.08844651025983005, + "language_loss": 0.8452431, + "learning_rate": 0.0004660679881350764, + "loss": 0.85609365, + "num_input_tokens_seen": 232412864, + "router_z_loss_mlp": 0.40234375, + "step": 2786, + "time_per_iteration": 2.7307839393615723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059616, + "balance_loss_mlp": 1.04531133, + "epoch": 0.5361677568295499, + "flos": 1479687823872.0, + "grad_norm": 0.02226240505672553, + "language_loss": 0.75608146, + "learning_rate": 0.0004657571706679988, + "loss": 0.76667762, + "num_input_tokens_seen": 232639888, + "router_z_loss_mlp": 0.14257812, + "step": 2787, + "time_per_iteration": 5.010236740112305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083366, + "balance_loss_mlp": 1.04352605, + "epoch": 0.5363601385148133, + "flos": 805910432256.0, + "grad_norm": 0.0562451411020875, + "language_loss": 0.78052628, + "learning_rate": 0.0004654463664951667, + "loss": 0.79135996, + "num_input_tokens_seen": 232719248, + "router_z_loss_mlp": 0.3984375, + "step": 2788, + "time_per_iteration": 2.9822394847869873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090776, + "balance_loss_mlp": 1.05076993, + "epoch": 0.5365525202000769, + "flos": 507630246912.0, + "grad_norm": 0.05204597911301594, + "language_loss": 0.82849109, + "learning_rate": 0.0004651355757372447, + "loss": 0.83939886, + "num_input_tokens_seen": 232788464, + "router_z_loss_mlp": 0.39990234, + "step": 2789, + "time_per_iteration": 2.615691900253296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089332, + "balance_loss_mlp": 1.04937315, + "epoch": 0.5367449018853405, + "flos": 528660546048.0, + "grad_norm": 0.0871364316310779, + "language_loss": 0.854258, + "learning_rate": 0.00046482479851489274, + "loss": 0.86515129, + "num_input_tokens_seen": 232859792, + "router_z_loss_mlp": 0.39941406, + "step": 2790, + "time_per_iteration": 2.7088706493377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089329, + "balance_loss_mlp": 1.04853582, + "epoch": 0.5369372835706041, + "flos": 649614698496.0, + "grad_norm": 0.059769288934836705, + "language_loss": 0.78002077, + "learning_rate": 0.00046451403494876525, + "loss": 0.79091412, + "num_input_tokens_seen": 232941472, + "router_z_loss_mlp": 0.40795898, + "step": 2791, + "time_per_iteration": 2.8624680042266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082264, + "balance_loss_mlp": 1.04254341, + "epoch": 0.5371296652558677, + "flos": 584205972480.0, + "grad_norm": 0.05423678017273499, + "language_loss": 0.84187895, + "learning_rate": 0.0004642032851595111, + "loss": 0.8527016, + "num_input_tokens_seen": 233017120, + "router_z_loss_mlp": 0.3972168, + "step": 2792, + "time_per_iteration": 2.7222046852111816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090422, + "balance_loss_mlp": 1.04877055, + "epoch": 0.5373220469411312, + "flos": 595570821120.0, + "grad_norm": 0.05596231110481221, + "language_loss": 0.84764576, + "learning_rate": 0.00046389254926777404, + "loss": 0.85855001, + "num_input_tokens_seen": 233095408, + "router_z_loss_mlp": 0.41674805, + "step": 2793, + "time_per_iteration": 2.8049495220184326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083638, + "balance_loss_mlp": 1.04286838, + "epoch": 0.5375144286263948, + "flos": 1113965167104.0, + "grad_norm": 0.05603938595076487, + "language_loss": 0.78227508, + "learning_rate": 0.0004635818273941926, + "loss": 0.79311144, + "num_input_tokens_seen": 233191056, + "router_z_loss_mlp": 0.4074707, + "step": 2794, + "time_per_iteration": 3.506617307662964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085514, + "balance_loss_mlp": 1.04495919, + "epoch": 0.5377068103116583, + "flos": 595319127552.0, + "grad_norm": 0.07610950885477011, + "language_loss": 0.81443048, + "learning_rate": 0.0004632711196593997, + "loss": 0.82528561, + "num_input_tokens_seen": 233265536, + "router_z_loss_mlp": 0.40527344, + "step": 2795, + "time_per_iteration": 2.7142324447631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083447, + "balance_loss_mlp": 1.04377437, + "epoch": 0.5378991919969219, + "flos": 883839320064.0, + "grad_norm": 0.061986224183990205, + "language_loss": 0.85229117, + "learning_rate": 0.00046296042618402297, + "loss": 0.86312562, + "num_input_tokens_seen": 233348224, + "router_z_loss_mlp": 0.39697266, + "step": 2796, + "time_per_iteration": 3.0699656009674072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077763, + "balance_loss_mlp": 1.03801823, + "epoch": 0.5380915736821854, + "flos": 710344249344.0, + "grad_norm": 0.04828732184108336, + "language_loss": 0.792054, + "learning_rate": 0.0004626497470886839, + "loss": 0.80283165, + "num_input_tokens_seen": 233429344, + "router_z_loss_mlp": 0.39746094, + "step": 2797, + "time_per_iteration": 2.9337801933288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085503, + "balance_loss_mlp": 1.04444742, + "epoch": 0.538283955367449, + "flos": 556721344512.0, + "grad_norm": 0.04667541599746409, + "language_loss": 0.8208226, + "learning_rate": 0.00046233908249399897, + "loss": 0.83167768, + "num_input_tokens_seen": 233504944, + "router_z_loss_mlp": 0.41040039, + "step": 2798, + "time_per_iteration": 2.736253023147583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086245, + "balance_loss_mlp": 1.04585731, + "epoch": 0.5384763370527126, + "flos": 513218833920.0, + "grad_norm": 0.05904964511977083, + "language_loss": 0.78162259, + "learning_rate": 0.00046202843252057905, + "loss": 0.79248506, + "num_input_tokens_seen": 233573072, + "router_z_loss_mlp": 0.40380859, + "step": 2799, + "time_per_iteration": 2.5839316844940186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085466, + "balance_loss_mlp": 1.04503012, + "epoch": 0.5386687187379762, + "flos": 489490974720.0, + "grad_norm": 0.06428119470797507, + "language_loss": 0.83220208, + "learning_rate": 0.00046171779728902896, + "loss": 0.8430568, + "num_input_tokens_seen": 233640896, + "router_z_loss_mlp": 0.40405273, + "step": 2800, + "time_per_iteration": 2.6141908168792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087168, + "balance_loss_mlp": 1.04801977, + "epoch": 0.5388611004232398, + "flos": 482415395328.0, + "grad_norm": 0.12344174959648258, + "language_loss": 0.86207569, + "learning_rate": 0.000461407176919948, + "loss": 0.87294734, + "num_input_tokens_seen": 233703904, + "router_z_loss_mlp": 0.39111328, + "step": 2801, + "time_per_iteration": 2.503673791885376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108035, + "balance_loss_mlp": 1.04158366, + "epoch": 0.5390534821085032, + "flos": 560709457920.0, + "grad_norm": 0.05013064620145656, + "language_loss": 0.85174656, + "learning_rate": 0.00046109657153392997, + "loss": 0.86255008, + "num_input_tokens_seen": 233779248, + "router_z_loss_mlp": 0.38720703, + "step": 2802, + "time_per_iteration": 2.6549510955810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086897, + "balance_loss_mlp": 1.04624677, + "epoch": 0.5392458637937668, + "flos": 488132020224.0, + "grad_norm": 0.05351248634305854, + "language_loss": 0.82771289, + "learning_rate": 0.0004607859812515622, + "loss": 0.8385818, + "num_input_tokens_seen": 233847520, + "router_z_loss_mlp": 0.40649414, + "step": 2803, + "time_per_iteration": 2.592742681503296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085335, + "balance_loss_mlp": 1.0456624, + "epoch": 0.5394382454790304, + "flos": 511810417152.0, + "grad_norm": 0.06156300752407298, + "language_loss": 0.87926197, + "learning_rate": 0.00046047540619342667, + "loss": 0.89011538, + "num_input_tokens_seen": 233911328, + "router_z_loss_mlp": 0.39648438, + "step": 2804, + "time_per_iteration": 2.566542863845825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108144, + "balance_loss_mlp": 1.04343605, + "epoch": 0.539630627164294, + "flos": 567312173568.0, + "grad_norm": 0.04852529488921132, + "language_loss": 0.7995888, + "learning_rate": 0.00046016484648009933, + "loss": 0.81040317, + "num_input_tokens_seen": 233987104, + "router_z_loss_mlp": 0.38012695, + "step": 2805, + "time_per_iteration": 2.693988561630249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108233, + "balance_loss_mlp": 1.04415882, + "epoch": 0.5398230088495575, + "flos": 526203095040.0, + "grad_norm": 0.058780411040176145, + "language_loss": 0.8077246, + "learning_rate": 0.0004598543022321501, + "loss": 0.81854796, + "num_input_tokens_seen": 234057216, + "router_z_loss_mlp": 0.38134766, + "step": 2806, + "time_per_iteration": 2.635873317718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079563, + "balance_loss_mlp": 1.0410111, + "epoch": 0.5400153905348211, + "flos": 538493322240.0, + "grad_norm": 0.05389643439716648, + "language_loss": 0.7979452, + "learning_rate": 0.0004595437735701433, + "loss": 0.80874085, + "num_input_tokens_seen": 234129984, + "router_z_loss_mlp": 0.38500977, + "step": 2807, + "time_per_iteration": 2.671004056930542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082107, + "balance_loss_mlp": 1.04252934, + "epoch": 0.5402077722200846, + "flos": 513259531776.0, + "grad_norm": 0.056977099557855106, + "language_loss": 0.83333278, + "learning_rate": 0.00045923326061463623, + "loss": 0.84415388, + "num_input_tokens_seen": 234203920, + "router_z_loss_mlp": 0.39575195, + "step": 2808, + "time_per_iteration": 2.748844861984253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108444, + "balance_loss_mlp": 1.04519629, + "epoch": 0.5404001539053482, + "flos": 675932428800.0, + "grad_norm": 0.053531678156081904, + "language_loss": 0.81448805, + "learning_rate": 0.00045892276348618113, + "loss": 0.82533252, + "num_input_tokens_seen": 234285440, + "router_z_loss_mlp": 0.39208984, + "step": 2809, + "time_per_iteration": 2.9712717533111572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01034069, + "balance_loss_mlp": 1.02195704, + "epoch": 0.5405925355906118, + "flos": 1553998155264.0, + "grad_norm": 0.02221665300745606, + "language_loss": 0.78260827, + "learning_rate": 0.0004586122823053235, + "loss": 0.79294896, + "num_input_tokens_seen": 234521424, + "router_z_loss_mlp": 0.12109375, + "step": 2810, + "time_per_iteration": 4.987140893936157 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085482, + "balance_loss_mlp": 1.04697728, + "epoch": 0.5407849172758753, + "flos": 647004478464.0, + "grad_norm": 0.050822756134718025, + "language_loss": 0.80942833, + "learning_rate": 0.000458301817192603, + "loss": 0.82028317, + "num_input_tokens_seen": 234601632, + "router_z_loss_mlp": 0.38500977, + "step": 2811, + "time_per_iteration": 2.826511859893799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01028161, + "balance_loss_mlp": 1.01576281, + "epoch": 0.5409772989611389, + "flos": 1406641904640.0, + "grad_norm": 0.017319914930323605, + "language_loss": 0.8084178, + "learning_rate": 0.00045799136826855263, + "loss": 0.81869948, + "num_input_tokens_seen": 234825776, + "router_z_loss_mlp": 0.12353516, + "step": 2812, + "time_per_iteration": 4.797938346862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083094, + "balance_loss_mlp": 1.04525733, + "epoch": 0.5411696806464025, + "flos": 554102360064.0, + "grad_norm": 0.08517188397837483, + "language_loss": 0.87214613, + "learning_rate": 0.00045768093565369983, + "loss": 0.88297707, + "num_input_tokens_seen": 234901504, + "router_z_loss_mlp": 0.37817383, + "step": 2813, + "time_per_iteration": 2.716890811920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082803, + "balance_loss_mlp": 1.04441762, + "epoch": 0.5413620623316661, + "flos": 527853030912.0, + "grad_norm": 0.05234072905155942, + "language_loss": 0.81825578, + "learning_rate": 0.0004573705194685646, + "loss": 0.8290838, + "num_input_tokens_seen": 234970288, + "router_z_loss_mlp": 0.38330078, + "step": 2814, + "time_per_iteration": 2.6517584323883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082477, + "balance_loss_mlp": 1.04380536, + "epoch": 0.5415544440169295, + "flos": 598464820224.0, + "grad_norm": 0.054888895455983605, + "language_loss": 0.84797984, + "learning_rate": 0.00045706011983366157, + "loss": 0.85880458, + "num_input_tokens_seen": 235039984, + "router_z_loss_mlp": 0.38623047, + "step": 2815, + "time_per_iteration": 2.670135974884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088358, + "balance_loss_mlp": 1.050807, + "epoch": 0.5417468257021931, + "flos": 470519456256.0, + "grad_norm": 0.06349065912195655, + "language_loss": 0.82603323, + "learning_rate": 0.00045674973686949847, + "loss": 0.8369168, + "num_input_tokens_seen": 235105232, + "router_z_loss_mlp": 0.37524414, + "step": 2816, + "time_per_iteration": 2.51487398147583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085537, + "balance_loss_mlp": 1.04710388, + "epoch": 0.5419392073874567, + "flos": 680477773824.0, + "grad_norm": 0.04802331030108417, + "language_loss": 0.85519576, + "learning_rate": 0.0004564393706965766, + "loss": 0.86605108, + "num_input_tokens_seen": 235192560, + "router_z_loss_mlp": 0.3840332, + "step": 2817, + "time_per_iteration": 2.9650819301605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088505, + "balance_loss_mlp": 1.05031061, + "epoch": 0.5421315890727203, + "flos": 462134384640.0, + "grad_norm": 0.11431790588446349, + "language_loss": 0.81361973, + "learning_rate": 0.00045612902143539116, + "loss": 0.82450485, + "num_input_tokens_seen": 235258448, + "router_z_loss_mlp": 0.3815918, + "step": 2818, + "time_per_iteration": 2.5874366760253906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083753, + "balance_loss_mlp": 1.04620242, + "epoch": 0.5423239707579839, + "flos": 436727476224.0, + "grad_norm": 0.06287409893753121, + "language_loss": 0.81734043, + "learning_rate": 0.00045581868920642986, + "loss": 0.82817793, + "num_input_tokens_seen": 235322176, + "router_z_loss_mlp": 0.375, + "step": 2819, + "time_per_iteration": 2.4778597354888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085621, + "balance_loss_mlp": 1.04818964, + "epoch": 0.5425163524432474, + "flos": 458067695616.0, + "grad_norm": 0.0556653381868651, + "language_loss": 0.79541689, + "learning_rate": 0.00045550837413017457, + "loss": 0.8062731, + "num_input_tokens_seen": 235390960, + "router_z_loss_mlp": 0.37402344, + "step": 2820, + "time_per_iteration": 2.653878688812256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087265, + "balance_loss_mlp": 1.04873669, + "epoch": 0.542708734128511, + "flos": 419267681280.0, + "grad_norm": 0.047652791336190936, + "language_loss": 0.85203838, + "learning_rate": 0.0004551980763271005, + "loss": 0.86291105, + "num_input_tokens_seen": 235460976, + "router_z_loss_mlp": 0.38500977, + "step": 2821, + "time_per_iteration": 2.6410272121429443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088978, + "balance_loss_mlp": 1.04942417, + "epoch": 0.5429011158137745, + "flos": 678142568448.0, + "grad_norm": 0.047512644994480734, + "language_loss": 0.83545935, + "learning_rate": 0.0004548877959176756, + "loss": 0.84634912, + "num_input_tokens_seen": 235540912, + "router_z_loss_mlp": 0.39550781, + "step": 2822, + "time_per_iteration": 2.8824410438537598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083878, + "balance_loss_mlp": 1.04542077, + "epoch": 0.5430934974990381, + "flos": 540664174080.0, + "grad_norm": 0.05440283794038225, + "language_loss": 0.8588357, + "learning_rate": 0.00045457753302236166, + "loss": 0.86967444, + "num_input_tokens_seen": 235608736, + "router_z_loss_mlp": 0.3840332, + "step": 2823, + "time_per_iteration": 2.665828227996826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078196, + "balance_loss_mlp": 1.04069233, + "epoch": 0.5432858791843016, + "flos": 658175860224.0, + "grad_norm": 0.053164692369765, + "language_loss": 0.86939847, + "learning_rate": 0.00045426728776161353, + "loss": 0.88018048, + "num_input_tokens_seen": 235678720, + "router_z_loss_mlp": 0.37475586, + "step": 2824, + "time_per_iteration": 2.79662823677063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082032, + "balance_loss_mlp": 1.04367089, + "epoch": 0.5434782608695652, + "flos": 531678200832.0, + "grad_norm": 0.051257131946256196, + "language_loss": 0.81339788, + "learning_rate": 0.00045395706025587863, + "loss": 0.82421821, + "num_input_tokens_seen": 235748704, + "router_z_loss_mlp": 0.38330078, + "step": 2825, + "time_per_iteration": 2.612839698791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083107, + "balance_loss_mlp": 1.04298067, + "epoch": 0.5436706425548288, + "flos": 608219020800.0, + "grad_norm": 0.0654215378261843, + "language_loss": 0.8246271, + "learning_rate": 0.00045364685062559843, + "loss": 0.83545816, + "num_input_tokens_seen": 235828224, + "router_z_loss_mlp": 0.40112305, + "step": 2826, + "time_per_iteration": 2.8304717540740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077545, + "balance_loss_mlp": 1.03863502, + "epoch": 0.5438630242400924, + "flos": 705081549312.0, + "grad_norm": 0.05153461088450525, + "language_loss": 0.91323566, + "learning_rate": 0.0004533366589912067, + "loss": 0.92401117, + "num_input_tokens_seen": 235909392, + "router_z_loss_mlp": 0.38891602, + "step": 2827, + "time_per_iteration": 2.9909794330596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083973, + "balance_loss_mlp": 1.04399014, + "epoch": 0.544055405925356, + "flos": 856073885184.0, + "grad_norm": 0.06162926864421369, + "language_loss": 0.77631354, + "learning_rate": 0.0004530264854731306, + "loss": 0.78715324, + "num_input_tokens_seen": 235983888, + "router_z_loss_mlp": 0.3996582, + "step": 2828, + "time_per_iteration": 3.0477852821350098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079886, + "balance_loss_mlp": 1.0402137, + "epoch": 0.5442477876106194, + "flos": 571483579392.0, + "grad_norm": 0.04880017685382554, + "language_loss": 0.83835936, + "learning_rate": 0.00045271633019179034, + "loss": 0.84915829, + "num_input_tokens_seen": 236063056, + "router_z_loss_mlp": 0.39648438, + "step": 2829, + "time_per_iteration": 2.8048830032348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108649, + "balance_loss_mlp": 1.04684114, + "epoch": 0.544440169295883, + "flos": 625246649856.0, + "grad_norm": 0.05731672371216008, + "language_loss": 0.87693858, + "learning_rate": 0.0004524061932675986, + "loss": 0.88780355, + "num_input_tokens_seen": 236141104, + "router_z_loss_mlp": 0.39624023, + "step": 2830, + "time_per_iteration": 2.880328893661499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081175, + "balance_loss_mlp": 1.0420748, + "epoch": 0.5446325509811466, + "flos": 835896181248.0, + "grad_norm": 0.061736377466748704, + "language_loss": 0.8659271, + "learning_rate": 0.00045209607482096125, + "loss": 0.87673885, + "num_input_tokens_seen": 236220320, + "router_z_loss_mlp": 0.390625, + "step": 2831, + "time_per_iteration": 2.9996933937072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080099, + "balance_loss_mlp": 1.04016387, + "epoch": 0.5448249326664102, + "flos": 483129778176.0, + "grad_norm": 0.057163759026562816, + "language_loss": 0.8399148, + "learning_rate": 0.0004517859749722772, + "loss": 0.85071582, + "num_input_tokens_seen": 236288208, + "router_z_loss_mlp": 0.39892578, + "step": 2832, + "time_per_iteration": 2.6431195735931396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085662, + "balance_loss_mlp": 1.04606068, + "epoch": 0.5450173143516738, + "flos": 560799618048.0, + "grad_norm": 0.061436781325619555, + "language_loss": 0.78688192, + "learning_rate": 0.0004514758938419376, + "loss": 0.79773855, + "num_input_tokens_seen": 236366864, + "router_z_loss_mlp": 0.39575195, + "step": 2833, + "time_per_iteration": 2.811894655227661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058831, + "balance_loss_mlp": 1.04280972, + "epoch": 0.5452096960369373, + "flos": 1469632467456.0, + "grad_norm": 0.020133642361800857, + "language_loss": 0.76920587, + "learning_rate": 0.0004511658315503268, + "loss": 0.77979416, + "num_input_tokens_seen": 236597120, + "router_z_loss_mlp": 0.16015625, + "step": 2834, + "time_per_iteration": 4.920469760894775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077415, + "balance_loss_mlp": 1.03798103, + "epoch": 0.5454020777222008, + "flos": 464827562496.0, + "grad_norm": 0.051503170745990534, + "language_loss": 0.83848447, + "learning_rate": 0.00045085578821782175, + "loss": 0.84925866, + "num_input_tokens_seen": 236664192, + "router_z_loss_mlp": 0.39404297, + "step": 2835, + "time_per_iteration": 2.523089647293091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048396, + "balance_loss_mlp": 1.03246999, + "epoch": 0.5455944594074644, + "flos": 1468921056768.0, + "grad_norm": 0.01613355837810212, + "language_loss": 0.76134741, + "learning_rate": 0.0004505457639647917, + "loss": 0.77183139, + "num_input_tokens_seen": 236888784, + "router_z_loss_mlp": 0.15917969, + "step": 2836, + "time_per_iteration": 4.865030288696289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082059, + "balance_loss_mlp": 1.0422194, + "epoch": 0.545786841092728, + "flos": 532900353024.0, + "grad_norm": 0.04532447535161293, + "language_loss": 0.81224561, + "learning_rate": 0.00045023575891159866, + "loss": 0.82306617, + "num_input_tokens_seen": 236962528, + "router_z_loss_mlp": 0.3984375, + "step": 2837, + "time_per_iteration": 2.7024872303009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038306, + "balance_loss_mlp": 1.02285683, + "epoch": 0.5459792227779915, + "flos": 1351633360896.0, + "grad_norm": 0.01633471064412587, + "language_loss": 0.74763811, + "learning_rate": 0.00044992577317859764, + "loss": 0.75802112, + "num_input_tokens_seen": 237179360, + "router_z_loss_mlp": 0.15429688, + "step": 2838, + "time_per_iteration": 4.88713812828064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072439, + "balance_loss_mlp": 1.03436387, + "epoch": 0.5461716044632551, + "flos": 637584929280.0, + "grad_norm": 0.044187924464620755, + "language_loss": 0.77777064, + "learning_rate": 0.0004496158068861354, + "loss": 0.788495, + "num_input_tokens_seen": 237256240, + "router_z_loss_mlp": 0.38037109, + "step": 2839, + "time_per_iteration": 2.7734854221343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083666, + "balance_loss_mlp": 1.04451799, + "epoch": 0.5463639861485187, + "flos": 602458725888.0, + "grad_norm": 0.04916115853202861, + "language_loss": 0.80780178, + "learning_rate": 0.00044930586015455207, + "loss": 0.81863844, + "num_input_tokens_seen": 237334272, + "router_z_loss_mlp": 0.39111328, + "step": 2840, + "time_per_iteration": 2.776756525039673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079059, + "balance_loss_mlp": 1.04105484, + "epoch": 0.5465563678337823, + "flos": 642208849920.0, + "grad_norm": 0.047638532734035705, + "language_loss": 0.89027333, + "learning_rate": 0.000448995933104179, + "loss": 0.90106392, + "num_input_tokens_seen": 237415408, + "router_z_loss_mlp": 0.37939453, + "step": 2841, + "time_per_iteration": 2.835770606994629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084722, + "balance_loss_mlp": 1.04526389, + "epoch": 0.5467487495190458, + "flos": 613852687872.0, + "grad_norm": 0.05241434980763647, + "language_loss": 0.79585081, + "learning_rate": 0.00044868602585534077, + "loss": 0.80669802, + "num_input_tokens_seen": 237493232, + "router_z_loss_mlp": 0.39428711, + "step": 2842, + "time_per_iteration": 2.8165202140808105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081214, + "balance_loss_mlp": 1.04297209, + "epoch": 0.5469411312043093, + "flos": 460957312512.0, + "grad_norm": 0.05377375824052972, + "language_loss": 0.88703167, + "learning_rate": 0.0004483761385283541, + "loss": 0.89784384, + "num_input_tokens_seen": 237556624, + "router_z_loss_mlp": 0.38183594, + "step": 2843, + "time_per_iteration": 2.5191187858581543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085655, + "balance_loss_mlp": 1.04705536, + "epoch": 0.5471335128895729, + "flos": 560930628096.0, + "grad_norm": 0.05339183941738246, + "language_loss": 0.82029176, + "learning_rate": 0.0004480662712435281, + "loss": 0.83114827, + "num_input_tokens_seen": 237632048, + "router_z_loss_mlp": 0.38549805, + "step": 2844, + "time_per_iteration": 2.7347452640533447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084571, + "balance_loss_mlp": 1.046996, + "epoch": 0.5473258945748365, + "flos": 518437863936.0, + "grad_norm": 0.05481278216627967, + "language_loss": 0.88263971, + "learning_rate": 0.0004477564241211635, + "loss": 0.89348543, + "num_input_tokens_seen": 237699840, + "router_z_loss_mlp": 0.37548828, + "step": 2845, + "time_per_iteration": 2.566675901412964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085991, + "balance_loss_mlp": 1.0476774, + "epoch": 0.5475182762601001, + "flos": 433600722432.0, + "grad_norm": 0.05360762168993706, + "language_loss": 0.87165999, + "learning_rate": 0.0004474465972815541, + "loss": 0.88251984, + "num_input_tokens_seen": 237762560, + "router_z_loss_mlp": 0.38256836, + "step": 2846, + "time_per_iteration": 2.458261489868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085193, + "balance_loss_mlp": 1.04754686, + "epoch": 0.5477106579453636, + "flos": 511308440064.0, + "grad_norm": 0.04786363547278841, + "language_loss": 0.87439841, + "learning_rate": 0.000447136790844985, + "loss": 0.88525033, + "num_input_tokens_seen": 237837152, + "router_z_loss_mlp": 0.37646484, + "step": 2847, + "time_per_iteration": 2.667609214782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108262, + "balance_loss_mlp": 1.04547465, + "epoch": 0.5479030396306271, + "flos": 675606541824.0, + "grad_norm": 0.050829406458998395, + "language_loss": 0.80589354, + "learning_rate": 0.00044682700493173385, + "loss": 0.81671977, + "num_input_tokens_seen": 237909488, + "router_z_loss_mlp": 0.37133789, + "step": 2848, + "time_per_iteration": 2.83048677444458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088336, + "balance_loss_mlp": 1.04978406, + "epoch": 0.5480954213158907, + "flos": 875720498688.0, + "grad_norm": 0.057674115143319986, + "language_loss": 0.80473161, + "learning_rate": 0.00044651723966207004, + "loss": 0.81561506, + "num_input_tokens_seen": 237991056, + "router_z_loss_mlp": 0.38500977, + "step": 2849, + "time_per_iteration": 3.1320085525512695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084454, + "balance_loss_mlp": 1.04780865, + "epoch": 0.5482878030011543, + "flos": 621715433472.0, + "grad_norm": 0.04900831188074684, + "language_loss": 0.78059959, + "learning_rate": 0.00044620749515625536, + "loss": 0.79144412, + "num_input_tokens_seen": 238064576, + "router_z_loss_mlp": 0.36669922, + "step": 2850, + "time_per_iteration": 2.784318447113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091667, + "balance_loss_mlp": 1.05404472, + "epoch": 0.5484801846864179, + "flos": 496946285568.0, + "grad_norm": 0.05697086220906577, + "language_loss": 0.84891641, + "learning_rate": 0.00044589777153454334, + "loss": 0.85983306, + "num_input_tokens_seen": 238136464, + "router_z_loss_mlp": 0.37597656, + "step": 2851, + "time_per_iteration": 2.7432825565338135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087741, + "balance_loss_mlp": 1.04973722, + "epoch": 0.5486725663716814, + "flos": 442202582016.0, + "grad_norm": 0.05425914558119235, + "language_loss": 0.83565009, + "learning_rate": 0.00044558806891717895, + "loss": 0.84652746, + "num_input_tokens_seen": 238198912, + "router_z_loss_mlp": 0.37963867, + "step": 2852, + "time_per_iteration": 2.486581563949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093078, + "balance_loss_mlp": 1.05528831, + "epoch": 0.548864948056945, + "flos": 654867224064.0, + "grad_norm": 0.04695408394518552, + "language_loss": 0.79779923, + "learning_rate": 0.0004452783874243998, + "loss": 0.80873001, + "num_input_tokens_seen": 238275184, + "router_z_loss_mlp": 0.37817383, + "step": 2853, + "time_per_iteration": 2.823004722595215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088751, + "balance_loss_mlp": 1.05246305, + "epoch": 0.5490573297422086, + "flos": 545760958464.0, + "grad_norm": 0.06406980317061135, + "language_loss": 0.84579176, + "learning_rate": 0.00044496872717643475, + "loss": 0.85667926, + "num_input_tokens_seen": 238348496, + "router_z_loss_mlp": 0.36279297, + "step": 2854, + "time_per_iteration": 2.6582207679748535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104165, + "balance_loss_mlp": 1.02906144, + "epoch": 0.5492497114274721, + "flos": 1589450245632.0, + "grad_norm": 0.019738925867794382, + "language_loss": 0.77089292, + "learning_rate": 0.00044465908829350453, + "loss": 0.78130943, + "num_input_tokens_seen": 238578464, + "router_z_loss_mlp": 0.12597656, + "step": 2855, + "time_per_iteration": 4.917479991912842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086161, + "balance_loss_mlp": 1.0507319, + "epoch": 0.5494420931127356, + "flos": 750567237120.0, + "grad_norm": 0.05097157568088764, + "language_loss": 0.82032043, + "learning_rate": 0.0004443494708958217, + "loss": 0.83118206, + "num_input_tokens_seen": 238660256, + "router_z_loss_mlp": 0.35473633, + "step": 2856, + "time_per_iteration": 2.944794178009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087153, + "balance_loss_mlp": 1.04860103, + "epoch": 0.5496344747979992, + "flos": 625704956928.0, + "grad_norm": 0.05077616299787212, + "language_loss": 0.80950212, + "learning_rate": 0.0004440398751035906, + "loss": 0.82037365, + "num_input_tokens_seen": 238745856, + "router_z_loss_mlp": 0.38549805, + "step": 2857, + "time_per_iteration": 2.8557775020599365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083742, + "balance_loss_mlp": 1.04707289, + "epoch": 0.5498268564832628, + "flos": 522859553280.0, + "grad_norm": 0.07234504005195413, + "language_loss": 0.83526963, + "learning_rate": 0.00044373030103700645, + "loss": 0.84610707, + "num_input_tokens_seen": 238813888, + "router_z_loss_mlp": 0.3671875, + "step": 2858, + "time_per_iteration": 2.5718507766723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079618, + "balance_loss_mlp": 1.04337823, + "epoch": 0.5500192381685264, + "flos": 604290544128.0, + "grad_norm": 0.05047837894946753, + "language_loss": 0.79457223, + "learning_rate": 0.000443420748816257, + "loss": 0.80536836, + "num_input_tokens_seen": 238885440, + "router_z_loss_mlp": 0.36279297, + "step": 2859, + "time_per_iteration": 2.791083335876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084468, + "balance_loss_mlp": 1.0475843, + "epoch": 0.55021161985379, + "flos": 520246361088.0, + "grad_norm": 0.05245161408681963, + "language_loss": 0.78267741, + "learning_rate": 0.0004431112185615208, + "loss": 0.79352212, + "num_input_tokens_seen": 238960944, + "router_z_loss_mlp": 0.36914062, + "step": 2860, + "time_per_iteration": 2.755300760269165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084785, + "balance_loss_mlp": 1.04873633, + "epoch": 0.5504040015390534, + "flos": 489426955776.0, + "grad_norm": 0.05433061967205067, + "language_loss": 0.79769695, + "learning_rate": 0.00044280171039296845, + "loss": 0.80854475, + "num_input_tokens_seen": 239030592, + "router_z_loss_mlp": 0.3605957, + "step": 2861, + "time_per_iteration": 2.611142873764038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086738, + "balance_loss_mlp": 1.04925907, + "epoch": 0.550596383224317, + "flos": 575519745024.0, + "grad_norm": 0.06168485457456991, + "language_loss": 0.88482428, + "learning_rate": 0.0004424922244307616, + "loss": 0.89569169, + "num_input_tokens_seen": 239097440, + "router_z_loss_mlp": 0.375, + "step": 2862, + "time_per_iteration": 2.673872470855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085076, + "balance_loss_mlp": 1.04750168, + "epoch": 0.5507887649095806, + "flos": 642149213184.0, + "grad_norm": 0.06448144785997337, + "language_loss": 0.82166171, + "learning_rate": 0.00044218276079505315, + "loss": 0.83251244, + "num_input_tokens_seen": 239179872, + "router_z_loss_mlp": 0.37524414, + "step": 2863, + "time_per_iteration": 2.8468000888824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088216, + "balance_loss_mlp": 1.05126143, + "epoch": 0.5509811465948442, + "flos": 531589450752.0, + "grad_norm": 0.050966073807123834, + "language_loss": 0.7469635, + "learning_rate": 0.0004418733196059876, + "loss": 0.7578457, + "num_input_tokens_seen": 239251264, + "router_z_loss_mlp": 0.36938477, + "step": 2864, + "time_per_iteration": 2.662949323654175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088219, + "balance_loss_mlp": 1.05174112, + "epoch": 0.5511735282801077, + "flos": 654439440384.0, + "grad_norm": 0.054186590964919915, + "language_loss": 0.79709429, + "learning_rate": 0.0004415639009837008, + "loss": 0.80797648, + "num_input_tokens_seen": 239326688, + "router_z_loss_mlp": 0.36474609, + "step": 2865, + "time_per_iteration": 2.8164796829223633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080612, + "balance_loss_mlp": 1.04503989, + "epoch": 0.5513659099653713, + "flos": 529222159872.0, + "grad_norm": 0.05095499883513892, + "language_loss": 0.81590974, + "learning_rate": 0.00044125450504831955, + "loss": 0.82671583, + "num_input_tokens_seen": 239401248, + "router_z_loss_mlp": 0.35620117, + "step": 2866, + "time_per_iteration": 2.7417778968811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088604, + "balance_loss_mlp": 1.05162513, + "epoch": 0.5515582916506349, + "flos": 554594162688.0, + "grad_norm": 0.05682958193324047, + "language_loss": 0.82243145, + "learning_rate": 0.0004409451319199622, + "loss": 0.83331752, + "num_input_tokens_seen": 239471600, + "router_z_loss_mlp": 0.36987305, + "step": 2867, + "time_per_iteration": 2.6530325412750244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082608, + "balance_loss_mlp": 1.04608202, + "epoch": 0.5517506733358984, + "flos": 735067298304.0, + "grad_norm": 0.04759427919913488, + "language_loss": 0.84027618, + "learning_rate": 0.0004406357817187381, + "loss": 0.85110223, + "num_input_tokens_seen": 239548592, + "router_z_loss_mlp": 0.36572266, + "step": 2868, + "time_per_iteration": 2.9475574493408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083501, + "balance_loss_mlp": 1.04590225, + "epoch": 0.551943055021162, + "flos": 1114861432320.0, + "grad_norm": 0.043872910920917114, + "language_loss": 0.80878294, + "learning_rate": 0.0004403264545647474, + "loss": 0.81961799, + "num_input_tokens_seen": 239644432, + "router_z_loss_mlp": 0.37597656, + "step": 2869, + "time_per_iteration": 3.5124435424804688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080703, + "balance_loss_mlp": 1.04422534, + "epoch": 0.5521354367064255, + "flos": 544092083712.0, + "grad_norm": 0.0550168733336382, + "language_loss": 0.84926724, + "learning_rate": 0.00044001715057808154, + "loss": 0.86007428, + "num_input_tokens_seen": 239723392, + "router_z_loss_mlp": 0.36499023, + "step": 2870, + "time_per_iteration": 2.7501060962677 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085896, + "balance_loss_mlp": 1.04855943, + "epoch": 0.5523278183916891, + "flos": 935889845760.0, + "grad_norm": 0.05461062340152541, + "language_loss": 0.81539249, + "learning_rate": 0.0004397078698788232, + "loss": 0.82625151, + "num_input_tokens_seen": 239806896, + "router_z_loss_mlp": 0.3737793, + "step": 2871, + "time_per_iteration": 3.2084577083587646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026675, + "balance_loss_mlp": 1.01427722, + "epoch": 0.5525202000769527, + "flos": 1465117645824.0, + "grad_norm": 0.012296141252344654, + "language_loss": 0.80442369, + "learning_rate": 0.0004393986125870456, + "loss": 0.81469035, + "num_input_tokens_seen": 240037824, + "router_z_loss_mlp": 0.12353516, + "step": 2872, + "time_per_iteration": 4.909080266952515 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087659, + "balance_loss_mlp": 1.05115747, + "epoch": 0.5527125817622163, + "flos": 489554993664.0, + "grad_norm": 0.06201182150044637, + "language_loss": 0.78260124, + "learning_rate": 0.00043908937882281343, + "loss": 0.79347777, + "num_input_tokens_seen": 240107952, + "router_z_loss_mlp": 0.36523438, + "step": 2873, + "time_per_iteration": 2.5999958515167236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078657, + "balance_loss_mlp": 1.0410111, + "epoch": 0.5529049634474797, + "flos": 634606562304.0, + "grad_norm": 0.05626101072807578, + "language_loss": 0.82624078, + "learning_rate": 0.0004387801687061814, + "loss": 0.83702731, + "num_input_tokens_seen": 240183824, + "router_z_loss_mlp": 0.37573242, + "step": 2874, + "time_per_iteration": 2.816607713699341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082085, + "balance_loss_mlp": 1.04310322, + "epoch": 0.5530973451327433, + "flos": 580986086400.0, + "grad_norm": 0.04886656520386433, + "language_loss": 0.80143493, + "learning_rate": 0.0004384709823571958, + "loss": 0.8122558, + "num_input_tokens_seen": 240259296, + "router_z_loss_mlp": 0.38964844, + "step": 2875, + "time_per_iteration": 2.7270736694335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079953, + "balance_loss_mlp": 1.04113841, + "epoch": 0.5532897268180069, + "flos": 1122030144000.0, + "grad_norm": 0.06103557908182598, + "language_loss": 0.83129716, + "learning_rate": 0.0004381618198958932, + "loss": 0.84209669, + "num_input_tokens_seen": 240346768, + "router_z_loss_mlp": 0.38793945, + "step": 2876, + "time_per_iteration": 3.4826347827911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085381, + "balance_loss_mlp": 1.04721045, + "epoch": 0.5534821085032705, + "flos": 636965088768.0, + "grad_norm": 0.05070554688334561, + "language_loss": 0.83524168, + "learning_rate": 0.00043785268144230137, + "loss": 0.84609544, + "num_input_tokens_seen": 240429344, + "router_z_loss_mlp": 0.38183594, + "step": 2877, + "time_per_iteration": 2.8850836753845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080685, + "balance_loss_mlp": 1.04332519, + "epoch": 0.5536744901885341, + "flos": 570837597696.0, + "grad_norm": 0.056027333180870484, + "language_loss": 0.82300985, + "learning_rate": 0.00043754356711643837, + "loss": 0.83381677, + "num_input_tokens_seen": 240497008, + "router_z_loss_mlp": 0.37353516, + "step": 2878, + "time_per_iteration": 2.6629955768585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079329, + "balance_loss_mlp": 1.04180145, + "epoch": 0.5538668718737976, + "flos": 595418052096.0, + "grad_norm": 0.051053801448504514, + "language_loss": 0.84143484, + "learning_rate": 0.0004372344770383132, + "loss": 0.85222816, + "num_input_tokens_seen": 240578432, + "router_z_loss_mlp": 0.37475586, + "step": 2879, + "time_per_iteration": 2.809924364089966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080427, + "balance_loss_mlp": 1.04220867, + "epoch": 0.5540592535590612, + "flos": 532324182528.0, + "grad_norm": 0.054354704442993965, + "language_loss": 0.83048761, + "learning_rate": 0.00043692541132792507, + "loss": 0.8412919, + "num_input_tokens_seen": 240649136, + "router_z_loss_mlp": 0.38183594, + "step": 2880, + "time_per_iteration": 2.6826112270355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076517, + "balance_loss_mlp": 1.03915703, + "epoch": 0.5542516352443247, + "flos": 412398715392.0, + "grad_norm": 0.060842521075957015, + "language_loss": 0.83359361, + "learning_rate": 0.00043661637010526384, + "loss": 0.84435874, + "num_input_tokens_seen": 240714240, + "router_z_loss_mlp": 0.37329102, + "step": 2881, + "time_per_iteration": 2.5412843227386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077156, + "balance_loss_mlp": 1.03946209, + "epoch": 0.5544440169295883, + "flos": 547341083136.0, + "grad_norm": 0.06506612292228302, + "language_loss": 0.82828653, + "learning_rate": 0.00043630735349031025, + "loss": 0.83905804, + "num_input_tokens_seen": 240786928, + "router_z_loss_mlp": 0.37646484, + "step": 2882, + "time_per_iteration": 2.6428792476654053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079595, + "balance_loss_mlp": 1.04132843, + "epoch": 0.5546363986148518, + "flos": 621518994432.0, + "grad_norm": 0.04746548389090053, + "language_loss": 0.8146224, + "learning_rate": 0.00043599836160303495, + "loss": 0.82541835, + "num_input_tokens_seen": 240865328, + "router_z_loss_mlp": 0.38232422, + "step": 2883, + "time_per_iteration": 2.836928367614746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076033, + "balance_loss_mlp": 1.03833902, + "epoch": 0.5548287803001154, + "flos": 704972450304.0, + "grad_norm": 0.05191443424956408, + "language_loss": 0.77216405, + "learning_rate": 0.0004356893945633995, + "loss": 0.78292441, + "num_input_tokens_seen": 240945680, + "router_z_loss_mlp": 0.37719727, + "step": 2884, + "time_per_iteration": 2.959998846054077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077354, + "balance_loss_mlp": 1.03877735, + "epoch": 0.555021161985379, + "flos": 503952053760.0, + "grad_norm": 0.04795057861891694, + "language_loss": 0.8143183, + "learning_rate": 0.0004353804524913551, + "loss": 0.82509184, + "num_input_tokens_seen": 241010800, + "router_z_loss_mlp": 0.38549805, + "step": 2885, + "time_per_iteration": 2.587458848953247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076676, + "balance_loss_mlp": 1.03960204, + "epoch": 0.5552135436706426, + "flos": 615782020608.0, + "grad_norm": 0.060100634137020215, + "language_loss": 0.81801999, + "learning_rate": 0.0004350715355068441, + "loss": 0.82878673, + "num_input_tokens_seen": 241085328, + "router_z_loss_mlp": 0.37109375, + "step": 2886, + "time_per_iteration": 2.739311933517456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080638, + "balance_loss_mlp": 1.04227662, + "epoch": 0.5554059253559062, + "flos": 463635933696.0, + "grad_norm": 0.06732751663430354, + "language_loss": 0.79759407, + "learning_rate": 0.00043476264372979847, + "loss": 0.80840045, + "num_input_tokens_seen": 241149600, + "router_z_loss_mlp": 0.38305664, + "step": 2887, + "time_per_iteration": 2.5322625637054443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081996, + "balance_loss_mlp": 1.04425478, + "epoch": 0.5555983070411696, + "flos": 1561923813888.0, + "grad_norm": 0.05205208802168105, + "language_loss": 0.78767329, + "learning_rate": 0.0004344537772801408, + "loss": 0.79849327, + "num_input_tokens_seen": 241244832, + "router_z_loss_mlp": 0.37744141, + "step": 2888, + "time_per_iteration": 3.8099794387817383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01022363, + "balance_loss_mlp": 1.00986981, + "epoch": 0.5557906887264332, + "flos": 1467093468672.0, + "grad_norm": 0.012872465654446894, + "language_loss": 0.73422456, + "learning_rate": 0.0004341449362777836, + "loss": 0.74444818, + "num_input_tokens_seen": 241479728, + "router_z_loss_mlp": 0.12451172, + "step": 2889, + "time_per_iteration": 4.8980872631073 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080645, + "balance_loss_mlp": 1.04373789, + "epoch": 0.5559830704116968, + "flos": 529575750144.0, + "grad_norm": 0.056518477254008576, + "language_loss": 0.83232135, + "learning_rate": 0.0004338361208426298, + "loss": 0.84312785, + "num_input_tokens_seen": 241545616, + "router_z_loss_mlp": 0.36889648, + "step": 2890, + "time_per_iteration": 2.596644163131714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108101, + "balance_loss_mlp": 1.04312527, + "epoch": 0.5561754520969604, + "flos": 650895077376.0, + "grad_norm": 0.04719414959796351, + "language_loss": 0.81189138, + "learning_rate": 0.00043352733109457164, + "loss": 0.82270145, + "num_input_tokens_seen": 241629040, + "router_z_loss_mlp": 0.37841797, + "step": 2891, + "time_per_iteration": 2.8776957988739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079084, + "balance_loss_mlp": 1.04158103, + "epoch": 0.556367833782224, + "flos": 733968801792.0, + "grad_norm": 0.04510399892940866, + "language_loss": 0.84577823, + "learning_rate": 0.00043321856715349244, + "loss": 0.85656911, + "num_input_tokens_seen": 241706272, + "router_z_loss_mlp": 0.37451172, + "step": 2892, + "time_per_iteration": 2.9247210025787354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107932, + "balance_loss_mlp": 1.04243708, + "epoch": 0.5565602154674875, + "flos": 672120405504.0, + "grad_norm": 0.04457708587394983, + "language_loss": 0.80344868, + "learning_rate": 0.00043290982913926466, + "loss": 0.81424183, + "num_input_tokens_seen": 241782304, + "router_z_loss_mlp": 0.36889648, + "step": 2893, + "time_per_iteration": 2.791151285171509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087078, + "balance_loss_mlp": 1.04919362, + "epoch": 0.556752597152751, + "flos": 585911162880.0, + "grad_norm": 0.05091942660655845, + "language_loss": 0.84425044, + "learning_rate": 0.0004326011171717514, + "loss": 0.8551212, + "num_input_tokens_seen": 241868576, + "router_z_loss_mlp": 0.37866211, + "step": 2894, + "time_per_iteration": 2.8832085132598877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085909, + "balance_loss_mlp": 1.04788101, + "epoch": 0.5569449788380146, + "flos": 437549548032.0, + "grad_norm": 0.04808991967010034, + "language_loss": 0.81074953, + "learning_rate": 0.0004322924313708051, + "loss": 0.82160866, + "num_input_tokens_seen": 241933696, + "router_z_loss_mlp": 0.38012695, + "step": 2895, + "time_per_iteration": 2.5033986568450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079423, + "balance_loss_mlp": 1.04315972, + "epoch": 0.5571373605232782, + "flos": 502002372096.0, + "grad_norm": 0.057289668121921454, + "language_loss": 0.84257507, + "learning_rate": 0.0004319837718562681, + "loss": 0.85336924, + "num_input_tokens_seen": 242003056, + "router_z_loss_mlp": 0.36254883, + "step": 2896, + "time_per_iteration": 2.55461049079895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086447, + "balance_loss_mlp": 1.04856229, + "epoch": 0.5573297422085417, + "flos": 577126010880.0, + "grad_norm": 0.05427319641394577, + "language_loss": 0.83001935, + "learning_rate": 0.0004316751387479726, + "loss": 0.84088391, + "num_input_tokens_seen": 242076368, + "router_z_loss_mlp": 0.37841797, + "step": 2897, + "time_per_iteration": 2.726621150970459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010828, + "balance_loss_mlp": 1.04622626, + "epoch": 0.5575221238938053, + "flos": 1343536754688.0, + "grad_norm": 0.07147882998338702, + "language_loss": 0.82389295, + "learning_rate": 0.0004313665321657409, + "loss": 0.83472097, + "num_input_tokens_seen": 242161600, + "router_z_loss_mlp": 0.36572266, + "step": 2898, + "time_per_iteration": 3.705557107925415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084994, + "balance_loss_mlp": 1.04756212, + "epoch": 0.5577145055790689, + "flos": 601680324096.0, + "grad_norm": 0.06263472170874507, + "language_loss": 0.80018216, + "learning_rate": 0.00043105795222938436, + "loss": 0.81103212, + "num_input_tokens_seen": 242237904, + "router_z_loss_mlp": 0.37451172, + "step": 2899, + "time_per_iteration": 2.7069091796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082839, + "balance_loss_mlp": 1.04500163, + "epoch": 0.5579068872643325, + "flos": 562353601536.0, + "grad_norm": 0.0921941925102754, + "language_loss": 0.78331131, + "learning_rate": 0.00043074939905870467, + "loss": 0.79413968, + "num_input_tokens_seen": 242306736, + "router_z_loss_mlp": 0.37817383, + "step": 2900, + "time_per_iteration": 2.6597537994384766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108264, + "balance_loss_mlp": 1.04468393, + "epoch": 0.558099268949596, + "flos": 544292904960.0, + "grad_norm": 0.05487003421557055, + "language_loss": 0.80032802, + "learning_rate": 0.0004304408727734927, + "loss": 0.81115448, + "num_input_tokens_seen": 242376000, + "router_z_loss_mlp": 0.37939453, + "step": 2901, + "time_per_iteration": 2.61590838432312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077829, + "balance_loss_mlp": 1.04120803, + "epoch": 0.5582916506348595, + "flos": 552520825344.0, + "grad_norm": 0.05406538300276566, + "language_loss": 0.88821226, + "learning_rate": 0.0004301323734935288, + "loss": 0.89899063, + "num_input_tokens_seen": 242447056, + "router_z_loss_mlp": 0.36645508, + "step": 2902, + "time_per_iteration": 2.6357102394104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082783, + "balance_loss_mlp": 1.04573286, + "epoch": 0.5584840323201231, + "flos": 543126007296.0, + "grad_norm": 0.054631389421551546, + "language_loss": 0.87217975, + "learning_rate": 0.000429823901338583, + "loss": 0.88300759, + "num_input_tokens_seen": 242514400, + "router_z_loss_mlp": 0.37011719, + "step": 2903, + "time_per_iteration": 2.6050922870635986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073691, + "balance_loss_mlp": 1.03678417, + "epoch": 0.5586764140053867, + "flos": 815212118016.0, + "grad_norm": 0.05529085617610277, + "language_loss": 0.86446041, + "learning_rate": 0.00042951545642841513, + "loss": 0.87519729, + "num_input_tokens_seen": 242601616, + "router_z_loss_mlp": 0.36914062, + "step": 2904, + "time_per_iteration": 3.0609569549560547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076695, + "balance_loss_mlp": 1.03981209, + "epoch": 0.5588687956906503, + "flos": 486196895232.0, + "grad_norm": 0.04557850009306157, + "language_loss": 0.86361349, + "learning_rate": 0.0004292070388827737, + "loss": 0.87438047, + "num_input_tokens_seen": 242669648, + "router_z_loss_mlp": 0.3684082, + "step": 2905, + "time_per_iteration": 2.5549428462982178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107784, + "balance_loss_mlp": 1.04017019, + "epoch": 0.5590611773759138, + "flos": 451809805824.0, + "grad_norm": 0.04842795237529701, + "language_loss": 0.8078168, + "learning_rate": 0.00042889864882139753, + "loss": 0.81859523, + "num_input_tokens_seen": 242737456, + "router_z_loss_mlp": 0.37646484, + "step": 2906, + "time_per_iteration": 2.6019363403320312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072955, + "balance_loss_mlp": 1.03662026, + "epoch": 0.5592535590611774, + "flos": 520693083648.0, + "grad_norm": 0.04884179046821603, + "language_loss": 0.81762469, + "learning_rate": 0.0004285902863640139, + "loss": 0.8283543, + "num_input_tokens_seen": 242807008, + "router_z_loss_mlp": 0.36352539, + "step": 2907, + "time_per_iteration": 2.5899524688720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072848, + "balance_loss_mlp": 1.03622651, + "epoch": 0.5594459407464409, + "flos": 552250192896.0, + "grad_norm": 0.048074009249812255, + "language_loss": 0.8615104, + "learning_rate": 0.00042828195163033966, + "loss": 0.87223887, + "num_input_tokens_seen": 242877328, + "router_z_loss_mlp": 0.36645508, + "step": 2908, + "time_per_iteration": 2.676518440246582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072859, + "balance_loss_mlp": 1.03585625, + "epoch": 0.5596383224317045, + "flos": 484596421632.0, + "grad_norm": 0.0512741694464887, + "language_loss": 0.79307508, + "learning_rate": 0.0004279736447400812, + "loss": 0.80380368, + "num_input_tokens_seen": 242943152, + "router_z_loss_mlp": 0.36987305, + "step": 2909, + "time_per_iteration": 2.590859889984131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074509, + "balance_loss_mlp": 1.03676748, + "epoch": 0.5598307041169681, + "flos": 610976217600.0, + "grad_norm": 0.05469922136848912, + "language_loss": 0.78325337, + "learning_rate": 0.00042766536581293385, + "loss": 0.79399848, + "num_input_tokens_seen": 243014656, + "router_z_loss_mlp": 0.37695312, + "step": 2910, + "time_per_iteration": 2.7034008502960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074194, + "balance_loss_mlp": 1.03654802, + "epoch": 0.5600230858022316, + "flos": 488585945088.0, + "grad_norm": 0.05207227245540468, + "language_loss": 0.79564762, + "learning_rate": 0.0004273571149685819, + "loss": 0.80638957, + "num_input_tokens_seen": 243089040, + "router_z_loss_mlp": 0.37597656, + "step": 2911, + "time_per_iteration": 2.7075796127319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074918, + "balance_loss_mlp": 1.03650868, + "epoch": 0.5602154674874952, + "flos": 598592858112.0, + "grad_norm": 0.04994756976596268, + "language_loss": 0.84006047, + "learning_rate": 0.00042704889232669937, + "loss": 0.85080969, + "num_input_tokens_seen": 243162480, + "router_z_loss_mlp": 0.38354492, + "step": 2912, + "time_per_iteration": 2.6922175884246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071101, + "balance_loss_mlp": 1.03431344, + "epoch": 0.5604078491727588, + "flos": 585697347072.0, + "grad_norm": 0.05437848146357707, + "language_loss": 0.85302234, + "learning_rate": 0.0004267406980069484, + "loss": 0.86373341, + "num_input_tokens_seen": 243232880, + "router_z_loss_mlp": 0.36791992, + "step": 2913, + "time_per_iteration": 2.70796537399292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067512, + "balance_loss_mlp": 1.03077149, + "epoch": 0.5606002308580224, + "flos": 540926042112.0, + "grad_norm": 0.045341959008097614, + "language_loss": 0.79753983, + "learning_rate": 0.0004264325321289808, + "loss": 0.80821496, + "num_input_tokens_seen": 243309168, + "router_z_loss_mlp": 0.3671875, + "step": 2914, + "time_per_iteration": 2.761362314224243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107061, + "balance_loss_mlp": 1.03241491, + "epoch": 0.5607926125432858, + "flos": 583654533120.0, + "grad_norm": 0.0532534560102953, + "language_loss": 0.85864502, + "learning_rate": 0.00042612439481243736, + "loss": 0.86935115, + "num_input_tokens_seen": 243382064, + "router_z_loss_mlp": 0.38183594, + "step": 2915, + "time_per_iteration": 2.745008945465088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073585, + "balance_loss_mlp": 1.03655863, + "epoch": 0.5609849942285494, + "flos": 627205095936.0, + "grad_norm": 0.06454697115510677, + "language_loss": 0.90024638, + "learning_rate": 0.00042581628617694735, + "loss": 0.91098225, + "num_input_tokens_seen": 243452064, + "router_z_loss_mlp": 0.37036133, + "step": 2916, + "time_per_iteration": 2.7654495239257812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072441, + "balance_loss_mlp": 1.0346992, + "epoch": 0.561177375913813, + "flos": 588095161344.0, + "grad_norm": 0.05235254168005436, + "language_loss": 0.81651318, + "learning_rate": 0.0004255082063421296, + "loss": 0.82723755, + "num_input_tokens_seen": 243525600, + "router_z_loss_mlp": 0.37719727, + "step": 2917, + "time_per_iteration": 2.674204111099243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107424, + "balance_loss_mlp": 1.03726149, + "epoch": 0.5613697575990766, + "flos": 526774883328.0, + "grad_norm": 0.05687183599046208, + "language_loss": 0.8481921, + "learning_rate": 0.00042520015542759065, + "loss": 0.85893452, + "num_input_tokens_seen": 243605536, + "router_z_loss_mlp": 0.36987305, + "step": 2918, + "time_per_iteration": 2.8309459686279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079966, + "balance_loss_mlp": 1.04134226, + "epoch": 0.5615621392843402, + "flos": 642351444480.0, + "grad_norm": 0.05024796403090353, + "language_loss": 0.88020825, + "learning_rate": 0.00042489213355292687, + "loss": 0.89100802, + "num_input_tokens_seen": 243684208, + "router_z_loss_mlp": 0.38598633, + "step": 2919, + "time_per_iteration": 2.8605942726135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083327, + "balance_loss_mlp": 1.04444087, + "epoch": 0.5617545209696037, + "flos": 427524715008.0, + "grad_norm": 0.05130722807003229, + "language_loss": 0.8097831, + "learning_rate": 0.00042458414083772276, + "loss": 0.82061636, + "num_input_tokens_seen": 243749376, + "router_z_loss_mlp": 0.38842773, + "step": 2920, + "time_per_iteration": 2.5186893939971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076823, + "balance_loss_mlp": 1.03920078, + "epoch": 0.5619469026548672, + "flos": 568140037632.0, + "grad_norm": 0.04280127072200588, + "language_loss": 0.84787017, + "learning_rate": 0.000424276177401552, + "loss": 0.85863835, + "num_input_tokens_seen": 243828096, + "router_z_loss_mlp": 0.37597656, + "step": 2921, + "time_per_iteration": 2.773881435394287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079464, + "balance_loss_mlp": 1.04203272, + "epoch": 0.5621392843401308, + "flos": 504947243520.0, + "grad_norm": 0.056711430924252765, + "language_loss": 0.85714108, + "learning_rate": 0.0004239682433639763, + "loss": 0.86793578, + "num_input_tokens_seen": 243896752, + "router_z_loss_mlp": 0.37426758, + "step": 2922, + "time_per_iteration": 2.714646816253662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081153, + "balance_loss_mlp": 1.04477036, + "epoch": 0.5623316660253944, + "flos": 516744258048.0, + "grad_norm": 0.060505090734525195, + "language_loss": 0.85348099, + "learning_rate": 0.0004236603388445467, + "loss": 0.8642925, + "num_input_tokens_seen": 243964592, + "router_z_loss_mlp": 0.36425781, + "step": 2923, + "time_per_iteration": 2.6141107082366943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075818, + "balance_loss_mlp": 1.03905368, + "epoch": 0.5625240477106579, + "flos": 605732456448.0, + "grad_norm": 0.05369747698254185, + "language_loss": 0.81871819, + "learning_rate": 0.00042335246396280166, + "loss": 0.82947636, + "num_input_tokens_seen": 244036656, + "router_z_loss_mlp": 0.3671875, + "step": 2924, + "time_per_iteration": 2.7129671573638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081443, + "balance_loss_mlp": 1.0438447, + "epoch": 0.5627164293959215, + "flos": 450203539968.0, + "grad_norm": 0.06323509209264203, + "language_loss": 0.89955974, + "learning_rate": 0.0004230446188382693, + "loss": 0.9103741, + "num_input_tokens_seen": 244102704, + "router_z_loss_mlp": 0.3762207, + "step": 2925, + "time_per_iteration": 2.5567660331726074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077212, + "balance_loss_mlp": 1.04101968, + "epoch": 0.5629088110811851, + "flos": 741734032896.0, + "grad_norm": 0.055420573846539395, + "language_loss": 0.80082184, + "learning_rate": 0.0004227368035904654, + "loss": 0.81159395, + "num_input_tokens_seen": 244186640, + "router_z_loss_mlp": 0.36181641, + "step": 2926, + "time_per_iteration": 2.947251319885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108299, + "balance_loss_mlp": 1.04610705, + "epoch": 0.5631011927664487, + "flos": 496719323136.0, + "grad_norm": 0.04719463019166682, + "language_loss": 0.82913107, + "learning_rate": 0.00042242901833889474, + "loss": 0.83996093, + "num_input_tokens_seen": 244257680, + "router_z_loss_mlp": 0.36889648, + "step": 2927, + "time_per_iteration": 2.6429412364959717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085596, + "balance_loss_mlp": 1.0498333, + "epoch": 0.5632935744517122, + "flos": 885774445056.0, + "grad_norm": 0.055780235249339845, + "language_loss": 0.85862845, + "learning_rate": 0.0004221212632030501, + "loss": 0.86948442, + "num_input_tokens_seen": 244331248, + "router_z_loss_mlp": 0.35791016, + "step": 2928, + "time_per_iteration": 3.0935142040252686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085595, + "balance_loss_mlp": 1.04897451, + "epoch": 0.5634859561369757, + "flos": 604516096512.0, + "grad_norm": 0.08179321361553939, + "language_loss": 0.80431306, + "learning_rate": 0.0004218135383024124, + "loss": 0.81516898, + "num_input_tokens_seen": 244403920, + "router_z_loss_mlp": 0.3659668, + "step": 2929, + "time_per_iteration": 2.688404083251953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079905, + "balance_loss_mlp": 1.04359436, + "epoch": 0.5636783378222393, + "flos": 453670737408.0, + "grad_norm": 0.05341288147748167, + "language_loss": 0.85107243, + "learning_rate": 0.0004215058437564511, + "loss": 0.86187148, + "num_input_tokens_seen": 244470464, + "router_z_loss_mlp": 0.36352539, + "step": 2930, + "time_per_iteration": 2.5591979026794434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083948, + "balance_loss_mlp": 1.04725528, + "epoch": 0.5638707195075029, + "flos": 518206519296.0, + "grad_norm": 0.06241038231461263, + "language_loss": 0.82415265, + "learning_rate": 0.00042119817968462397, + "loss": 0.83499211, + "num_input_tokens_seen": 244536864, + "router_z_loss_mlp": 0.36694336, + "step": 2931, + "time_per_iteration": 2.5755324363708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075664, + "balance_loss_mlp": 1.03916192, + "epoch": 0.5640631011927665, + "flos": 564632142336.0, + "grad_norm": 0.06755883510394861, + "language_loss": 0.87004125, + "learning_rate": 0.0004208905462063766, + "loss": 0.88079786, + "num_input_tokens_seen": 244603344, + "router_z_loss_mlp": 0.36499023, + "step": 2932, + "time_per_iteration": 2.6330130100250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077211, + "balance_loss_mlp": 1.04097116, + "epoch": 0.56425548287803, + "flos": 516783545856.0, + "grad_norm": 0.04875434703648171, + "language_loss": 0.84473455, + "learning_rate": 0.00042058294344114315, + "loss": 0.85550666, + "num_input_tokens_seen": 244671984, + "router_z_loss_mlp": 0.36254883, + "step": 2933, + "time_per_iteration": 2.60188627243042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081111, + "balance_loss_mlp": 1.04477572, + "epoch": 0.5644478645632935, + "flos": 853907415552.0, + "grad_norm": 0.05278955631679875, + "language_loss": 0.77495515, + "learning_rate": 0.0004202753715083456, + "loss": 0.78576624, + "num_input_tokens_seen": 244754000, + "router_z_loss_mlp": 0.36352539, + "step": 2934, + "time_per_iteration": 3.0625100135803223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084889, + "balance_loss_mlp": 1.04860175, + "epoch": 0.5646402462485571, + "flos": 553175571456.0, + "grad_norm": 0.05717629686508025, + "language_loss": 0.81433523, + "learning_rate": 0.0004199678305273936, + "loss": 0.82518411, + "num_input_tokens_seen": 244820896, + "router_z_loss_mlp": 0.36279297, + "step": 2935, + "time_per_iteration": 2.6390254497528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082496, + "balance_loss_mlp": 1.04587531, + "epoch": 0.5648326279338207, + "flos": 685661898240.0, + "grad_norm": 0.05411523361189988, + "language_loss": 0.81180829, + "learning_rate": 0.0004196603206176854, + "loss": 0.82263327, + "num_input_tokens_seen": 244904464, + "router_z_loss_mlp": 0.36669922, + "step": 2936, + "time_per_iteration": 2.9184954166412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079161, + "balance_loss_mlp": 1.04354107, + "epoch": 0.5650250096190843, + "flos": 802990291968.0, + "grad_norm": 0.04902014595353554, + "language_loss": 0.83833814, + "learning_rate": 0.000419352841898607, + "loss": 0.84912974, + "num_input_tokens_seen": 244983760, + "router_z_loss_mlp": 0.35644531, + "step": 2937, + "time_per_iteration": 2.963693618774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078443, + "balance_loss_mlp": 1.04248953, + "epoch": 0.5652173913043478, + "flos": 581787809280.0, + "grad_norm": 0.05926519799053672, + "language_loss": 0.77107543, + "learning_rate": 0.000419045394489532, + "loss": 0.78185987, + "num_input_tokens_seen": 245053184, + "router_z_loss_mlp": 0.359375, + "step": 2938, + "time_per_iteration": 2.727398633956909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076353, + "balance_loss_mlp": 1.03975606, + "epoch": 0.5654097729896114, + "flos": 820269614592.0, + "grad_norm": 0.053889258634032246, + "language_loss": 0.76768535, + "learning_rate": 0.0004187379785098224, + "loss": 0.77844894, + "num_input_tokens_seen": 245137408, + "router_z_loss_mlp": 0.3659668, + "step": 2939, + "time_per_iteration": 3.1188313961029053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079563, + "balance_loss_mlp": 1.04339492, + "epoch": 0.565602154674875, + "flos": 783826716672.0, + "grad_norm": 0.05512056097545077, + "language_loss": 0.83633238, + "learning_rate": 0.00041843059407882744, + "loss": 0.84712803, + "num_input_tokens_seen": 245215504, + "router_z_loss_mlp": 0.36206055, + "step": 2940, + "time_per_iteration": 2.983302116394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076269, + "balance_loss_mlp": 1.04072082, + "epoch": 0.5657945363601385, + "flos": 549418802688.0, + "grad_norm": 0.05159052201649483, + "language_loss": 0.82491434, + "learning_rate": 0.0004181232413158842, + "loss": 0.83567703, + "num_input_tokens_seen": 245286032, + "router_z_loss_mlp": 0.35571289, + "step": 2941, + "time_per_iteration": 2.6737120151519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076838, + "balance_loss_mlp": 1.04028893, + "epoch": 0.5659869180454021, + "flos": 667826754048.0, + "grad_norm": 0.06466569325042074, + "language_loss": 0.82093412, + "learning_rate": 0.0004178159203403179, + "loss": 0.83170253, + "num_input_tokens_seen": 245359040, + "router_z_loss_mlp": 0.36547852, + "step": 2942, + "time_per_iteration": 2.8263752460479736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077242, + "balance_loss_mlp": 1.0423857, + "epoch": 0.5661792997306656, + "flos": 499707864576.0, + "grad_norm": 0.05486974364690197, + "language_loss": 0.81532693, + "learning_rate": 0.0004175086312714409, + "loss": 0.82609934, + "num_input_tokens_seen": 245426384, + "router_z_loss_mlp": 0.34912109, + "step": 2943, + "time_per_iteration": 2.5581164360046387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084461, + "balance_loss_mlp": 1.04848337, + "epoch": 0.5663716814159292, + "flos": 600922271232.0, + "grad_norm": 0.04881995286740945, + "language_loss": 0.83686805, + "learning_rate": 0.00041720137422855366, + "loss": 0.84771264, + "num_input_tokens_seen": 245501216, + "router_z_loss_mlp": 0.35961914, + "step": 2944, + "time_per_iteration": 2.7574734687805176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080125, + "balance_loss_mlp": 1.04390931, + "epoch": 0.5665640631011928, + "flos": 540728193024.0, + "grad_norm": 0.05214507443979086, + "language_loss": 0.79004753, + "learning_rate": 0.00041689414933094383, + "loss": 0.80084872, + "num_input_tokens_seen": 245571600, + "router_z_loss_mlp": 0.36230469, + "step": 2945, + "time_per_iteration": 2.6470541954040527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080942, + "balance_loss_mlp": 1.0463953, + "epoch": 0.5667564447864564, + "flos": 601655592960.0, + "grad_norm": 0.06146311821637782, + "language_loss": 0.80673099, + "learning_rate": 0.00041658695669788653, + "loss": 0.81754035, + "num_input_tokens_seen": 245645632, + "router_z_loss_mlp": 0.34594727, + "step": 2946, + "time_per_iteration": 2.721078872680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083586, + "balance_loss_mlp": 1.04791868, + "epoch": 0.5669488264717198, + "flos": 659224894464.0, + "grad_norm": 0.05891401598443517, + "language_loss": 0.80939281, + "learning_rate": 0.00041627979644864453, + "loss": 0.82022864, + "num_input_tokens_seen": 245715776, + "router_z_loss_mlp": 0.35717773, + "step": 2947, + "time_per_iteration": 2.877037286758423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085181, + "balance_loss_mlp": 1.04941845, + "epoch": 0.5671412081569834, + "flos": 485158035456.0, + "grad_norm": 0.042998309327625356, + "language_loss": 0.809735, + "learning_rate": 0.0004159726687024683, + "loss": 0.8205868, + "num_input_tokens_seen": 245785328, + "router_z_loss_mlp": 0.35791016, + "step": 2948, + "time_per_iteration": 2.617147207260132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084185, + "balance_loss_mlp": 1.04832673, + "epoch": 0.567333589842247, + "flos": 729487475712.0, + "grad_norm": 0.049875608566737006, + "language_loss": 0.79203111, + "learning_rate": 0.00041566557357859506, + "loss": 0.80287302, + "num_input_tokens_seen": 245858000, + "router_z_loss_mlp": 0.35888672, + "step": 2949, + "time_per_iteration": 2.859217882156372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080493, + "balance_loss_mlp": 1.04494464, + "epoch": 0.5675259715275106, + "flos": 968471258112.0, + "grad_norm": 0.06410563873068757, + "language_loss": 0.79063594, + "learning_rate": 0.0004153585111962502, + "loss": 0.80144083, + "num_input_tokens_seen": 245950640, + "router_z_loss_mlp": 0.35571289, + "step": 2950, + "time_per_iteration": 3.3080387115478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084603, + "balance_loss_mlp": 1.04767203, + "epoch": 0.5677183532127742, + "flos": 564879453696.0, + "grad_norm": 0.058242755990822084, + "language_loss": 0.84030402, + "learning_rate": 0.0004150514816746453, + "loss": 0.85115004, + "num_input_tokens_seen": 246019568, + "router_z_loss_mlp": 0.36938477, + "step": 2951, + "time_per_iteration": 2.66630220413208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080246, + "balance_loss_mlp": 1.04517412, + "epoch": 0.5679107348980377, + "flos": 551432503296.0, + "grad_norm": 0.05117838990465897, + "language_loss": 0.85669959, + "learning_rate": 0.0004147444851329802, + "loss": 0.86750209, + "num_input_tokens_seen": 246089520, + "router_z_loss_mlp": 0.35107422, + "step": 2952, + "time_per_iteration": 2.645735502243042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108119, + "balance_loss_mlp": 1.04585648, + "epoch": 0.5681031165833013, + "flos": 819115863552.0, + "grad_norm": 0.04931619960622222, + "language_loss": 0.85395974, + "learning_rate": 0.00041443752169044126, + "loss": 0.8647716, + "num_input_tokens_seen": 246165920, + "router_z_loss_mlp": 0.35351562, + "step": 2953, + "time_per_iteration": 3.025468349456787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087019, + "balance_loss_mlp": 1.05116129, + "epoch": 0.5682954982685648, + "flos": 617731702272.0, + "grad_norm": 0.05138113495872943, + "language_loss": 0.84811544, + "learning_rate": 0.0004141305914662025, + "loss": 0.85898566, + "num_input_tokens_seen": 246238672, + "router_z_loss_mlp": 0.35888672, + "step": 2954, + "time_per_iteration": 2.7767860889434814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108515, + "balance_loss_mlp": 1.04848099, + "epoch": 0.5684878799538284, + "flos": 647625729024.0, + "grad_norm": 0.04880277930525614, + "language_loss": 0.80257368, + "learning_rate": 0.0004138236945794246, + "loss": 0.81342518, + "num_input_tokens_seen": 246320208, + "router_z_loss_mlp": 0.36645508, + "step": 2955, + "time_per_iteration": 2.9492557048797607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079722, + "balance_loss_mlp": 1.04434061, + "epoch": 0.5686802616390919, + "flos": 805615068672.0, + "grad_norm": 0.060523381383535066, + "language_loss": 0.83239132, + "learning_rate": 0.00041351683114925576, + "loss": 0.84318852, + "num_input_tokens_seen": 246406464, + "router_z_loss_mlp": 0.35424805, + "step": 2956, + "time_per_iteration": 3.0558693408966064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080356, + "balance_loss_mlp": 1.0441637, + "epoch": 0.5688726433243555, + "flos": 546882776064.0, + "grad_norm": 0.06102379875806974, + "language_loss": 0.86688364, + "learning_rate": 0.0004132100012948308, + "loss": 0.87768722, + "num_input_tokens_seen": 246477456, + "router_z_loss_mlp": 0.36230469, + "step": 2957, + "time_per_iteration": 2.6131510734558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084018, + "balance_loss_mlp": 1.04689598, + "epoch": 0.5690650250096191, + "flos": 486324933120.0, + "grad_norm": 0.05856765821562534, + "language_loss": 0.84111595, + "learning_rate": 0.00041290320513527145, + "loss": 0.85195613, + "num_input_tokens_seen": 246541744, + "router_z_loss_mlp": 0.37133789, + "step": 2958, + "time_per_iteration": 2.584434986114502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077095, + "balance_loss_mlp": 1.04154706, + "epoch": 0.5692574066948827, + "flos": 577184237568.0, + "grad_norm": 0.04674501738886335, + "language_loss": 0.85154927, + "learning_rate": 0.0004125964427896867, + "loss": 0.86232018, + "num_input_tokens_seen": 246611440, + "router_z_loss_mlp": 0.35571289, + "step": 2959, + "time_per_iteration": 2.6582295894622803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071399, + "balance_loss_mlp": 1.03551733, + "epoch": 0.5694497883801463, + "flos": 454005388800.0, + "grad_norm": 0.055082869163009494, + "language_loss": 0.79042369, + "learning_rate": 0.0004122897143771723, + "loss": 0.80113769, + "num_input_tokens_seen": 246676496, + "router_z_loss_mlp": 0.35888672, + "step": 2960, + "time_per_iteration": 2.555941104888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075778, + "balance_loss_mlp": 1.0394429, + "epoch": 0.5696421700654097, + "flos": 559251578880.0, + "grad_norm": 0.0498118595632428, + "language_loss": 0.81253064, + "learning_rate": 0.0004119830200168109, + "loss": 0.82328844, + "num_input_tokens_seen": 246746464, + "router_z_loss_mlp": 0.36376953, + "step": 2961, + "time_per_iteration": 2.6521012783050537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073119, + "balance_loss_mlp": 1.03780937, + "epoch": 0.5698345517506733, + "flos": 465314982912.0, + "grad_norm": 0.05616905034177488, + "language_loss": 0.8830415, + "learning_rate": 0.0004116763598276714, + "loss": 0.89377272, + "num_input_tokens_seen": 246811808, + "router_z_loss_mlp": 0.35327148, + "step": 2962, + "time_per_iteration": 2.5006790161132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073408, + "balance_loss_mlp": 1.03702545, + "epoch": 0.5700269334359369, + "flos": 605645116416.0, + "grad_norm": 0.05368070912324084, + "language_loss": 0.8055867, + "learning_rate": 0.00041136973392881017, + "loss": 0.81632078, + "num_input_tokens_seen": 246890432, + "router_z_loss_mlp": 0.36376953, + "step": 2963, + "time_per_iteration": 2.8011715412139893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073783, + "balance_loss_mlp": 1.03852105, + "epoch": 0.5702193151212005, + "flos": 562423412736.0, + "grad_norm": 0.05977105557008513, + "language_loss": 0.81818962, + "learning_rate": 0.00041106314243926983, + "loss": 0.82892752, + "num_input_tokens_seen": 246959616, + "router_z_loss_mlp": 0.35302734, + "step": 2964, + "time_per_iteration": 2.7296242713928223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070445, + "balance_loss_mlp": 1.03558779, + "epoch": 0.570411696806464, + "flos": 522983208960.0, + "grad_norm": 0.05693204807949615, + "language_loss": 0.87045705, + "learning_rate": 0.0004107565854780798, + "loss": 0.88116145, + "num_input_tokens_seen": 247030656, + "router_z_loss_mlp": 0.34887695, + "step": 2965, + "time_per_iteration": 2.5964605808258057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075214, + "balance_loss_mlp": 1.04002357, + "epoch": 0.5706040784917276, + "flos": 717911631360.0, + "grad_norm": 0.05031367362382368, + "language_loss": 0.80980343, + "learning_rate": 0.000410450063164256, + "loss": 0.82055557, + "num_input_tokens_seen": 247105872, + "router_z_loss_mlp": 0.35229492, + "step": 2966, + "time_per_iteration": 2.8248300552368164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076636, + "balance_loss_mlp": 1.04127812, + "epoch": 0.5707964601769911, + "flos": 476467425792.0, + "grad_norm": 0.059966750204006415, + "language_loss": 0.8167066, + "learning_rate": 0.00041014357561680115, + "loss": 0.82747293, + "num_input_tokens_seen": 247170448, + "router_z_loss_mlp": 0.35351562, + "step": 2967, + "time_per_iteration": 2.4996910095214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077241, + "balance_loss_mlp": 1.04278946, + "epoch": 0.5709888418622547, + "flos": 579823570944.0, + "grad_norm": 0.05891056148222195, + "language_loss": 0.85875672, + "learning_rate": 0.0004098371229547039, + "loss": 0.86952913, + "num_input_tokens_seen": 247240400, + "router_z_loss_mlp": 0.3449707, + "step": 2968, + "time_per_iteration": 2.6908459663391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131838, + "balance_loss_mlp": 1.11677039, + "epoch": 0.5711812235475183, + "flos": 1579108290048.0, + "grad_norm": 0.050443633584492734, + "language_loss": 0.80010808, + "learning_rate": 0.0004095307052969399, + "loss": 0.81142646, + "num_input_tokens_seen": 247469136, + "router_z_loss_mlp": 0.15039062, + "step": 2969, + "time_per_iteration": 4.709675550460815 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107846, + "balance_loss_mlp": 1.04233932, + "epoch": 0.5713736052327818, + "flos": 468259854336.0, + "grad_norm": 0.04864564090032181, + "language_loss": 0.80513656, + "learning_rate": 0.00040922432276247107, + "loss": 0.81592119, + "num_input_tokens_seen": 247537712, + "router_z_loss_mlp": 0.36132812, + "step": 2970, + "time_per_iteration": 2.554276466369629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078647, + "balance_loss_mlp": 1.04412448, + "epoch": 0.5715659869180454, + "flos": 537390443520.0, + "grad_norm": 0.06858717783230618, + "language_loss": 0.84265316, + "learning_rate": 0.0004089179754702457, + "loss": 0.85343957, + "num_input_tokens_seen": 247613872, + "router_z_loss_mlp": 0.34570312, + "step": 2971, + "time_per_iteration": 2.7972512245178223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072054, + "balance_loss_mlp": 1.0365299, + "epoch": 0.571758368603309, + "flos": 655778045952.0, + "grad_norm": 0.0710461233457747, + "language_loss": 0.79649973, + "learning_rate": 0.00040861166353919843, + "loss": 0.80722028, + "num_input_tokens_seen": 247686064, + "router_z_loss_mlp": 0.35546875, + "step": 2972, + "time_per_iteration": 2.7805516719818115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076554, + "balance_loss_mlp": 1.04076695, + "epoch": 0.5719507502885726, + "flos": 667609966080.0, + "grad_norm": 0.05192257726698222, + "language_loss": 0.81693333, + "learning_rate": 0.00040830538708824983, + "loss": 0.82769883, + "num_input_tokens_seen": 247760384, + "router_z_loss_mlp": 0.35839844, + "step": 2973, + "time_per_iteration": 2.8635294437408447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070547, + "balance_loss_mlp": 1.03507066, + "epoch": 0.572143131973836, + "flos": 476083312128.0, + "grad_norm": 0.060626408017241236, + "language_loss": 0.81790257, + "learning_rate": 0.000407999146236307, + "loss": 0.82860804, + "num_input_tokens_seen": 247824768, + "router_z_loss_mlp": 0.35498047, + "step": 2974, + "time_per_iteration": 2.5645899772644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074889, + "balance_loss_mlp": 1.03943634, + "epoch": 0.5723355136590996, + "flos": 539255757312.0, + "grad_norm": 0.06009071322865027, + "language_loss": 0.83246768, + "learning_rate": 0.0004076929411022634, + "loss": 0.84321654, + "num_input_tokens_seen": 247894448, + "router_z_loss_mlp": 0.35449219, + "step": 2975, + "time_per_iteration": 2.655545234680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075121, + "balance_loss_mlp": 1.0383811, + "epoch": 0.5725278953443632, + "flos": 823784864256.0, + "grad_norm": 0.053970809123607175, + "language_loss": 0.79314309, + "learning_rate": 0.0004073867718049982, + "loss": 0.80389434, + "num_input_tokens_seen": 247976432, + "router_z_loss_mlp": 0.36743164, + "step": 2976, + "time_per_iteration": 3.0664896965026855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078369, + "balance_loss_mlp": 1.0429157, + "epoch": 0.5727202770296268, + "flos": 587155226112.0, + "grad_norm": 0.05912475797179562, + "language_loss": 0.82244706, + "learning_rate": 0.00040708063846337704, + "loss": 0.83323073, + "num_input_tokens_seen": 248048800, + "router_z_loss_mlp": 0.35522461, + "step": 2977, + "time_per_iteration": 2.7131478786468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083864, + "balance_loss_mlp": 1.04800642, + "epoch": 0.5729126587148904, + "flos": 446723195904.0, + "grad_norm": 0.048537452765021645, + "language_loss": 0.80846637, + "learning_rate": 0.00040677454119625143, + "loss": 0.81930506, + "num_input_tokens_seen": 248116496, + "router_z_loss_mlp": 0.35864258, + "step": 2978, + "time_per_iteration": 2.6209888458251953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078154, + "balance_loss_mlp": 1.0418427, + "epoch": 0.5731050404001539, + "flos": 519206091264.0, + "grad_norm": 0.05702144714813726, + "language_loss": 0.82471335, + "learning_rate": 0.0004064684801224587, + "loss": 0.83549494, + "num_input_tokens_seen": 248184960, + "router_z_loss_mlp": 0.36328125, + "step": 2979, + "time_per_iteration": 2.5915722846984863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077909, + "balance_loss_mlp": 1.04197955, + "epoch": 0.5732974220854175, + "flos": 504528224256.0, + "grad_norm": 0.05171310351774622, + "language_loss": 0.80115962, + "learning_rate": 0.00040616245536082224, + "loss": 0.8119387, + "num_input_tokens_seen": 248252208, + "router_z_loss_mlp": 0.35961914, + "step": 2980, + "time_per_iteration": 2.6032769680023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076097, + "balance_loss_mlp": 1.04057276, + "epoch": 0.573489803770681, + "flos": 592187991552.0, + "grad_norm": 0.049753074122949235, + "language_loss": 0.80894011, + "learning_rate": 0.00040585646703015165, + "loss": 0.81970108, + "num_input_tokens_seen": 248333312, + "router_z_loss_mlp": 0.35522461, + "step": 2981, + "time_per_iteration": 2.79546856880188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074296, + "balance_loss_mlp": 1.03891444, + "epoch": 0.5736821854559446, + "flos": 489672857088.0, + "grad_norm": 0.06088968225358262, + "language_loss": 0.78612393, + "learning_rate": 0.0004055505152492419, + "loss": 0.79686689, + "num_input_tokens_seen": 248403808, + "router_z_loss_mlp": 0.35449219, + "step": 2982, + "time_per_iteration": 2.6494040489196777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078742, + "balance_loss_mlp": 1.04283655, + "epoch": 0.5738745671412081, + "flos": 457895987712.0, + "grad_norm": 0.05054468303814383, + "language_loss": 0.74372864, + "learning_rate": 0.00040524460013687425, + "loss": 0.75451601, + "num_input_tokens_seen": 248477184, + "router_z_loss_mlp": 0.359375, + "step": 2983, + "time_per_iteration": 2.7171366214752197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078136, + "balance_loss_mlp": 1.04294515, + "epoch": 0.5740669488264717, + "flos": 580012655616.0, + "grad_norm": 0.044553783792680594, + "language_loss": 0.80828458, + "learning_rate": 0.0004049387218118155, + "loss": 0.81906593, + "num_input_tokens_seen": 248565552, + "router_z_loss_mlp": 0.35229492, + "step": 2984, + "time_per_iteration": 2.995347738265991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073612, + "balance_loss_mlp": 1.03725314, + "epoch": 0.5742593305117353, + "flos": 524155898880.0, + "grad_norm": 0.05730874981758524, + "language_loss": 0.8475495, + "learning_rate": 0.00040463288039281777, + "loss": 0.85828567, + "num_input_tokens_seen": 248635456, + "router_z_loss_mlp": 0.36328125, + "step": 2985, + "time_per_iteration": 2.715092182159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102117, + "balance_loss_mlp": 1.0106324, + "epoch": 0.5744517121969989, + "flos": 1553033488896.0, + "grad_norm": 0.021440825644231668, + "language_loss": 0.77876419, + "learning_rate": 0.0004043270759986194, + "loss": 0.78897589, + "num_input_tokens_seen": 248870160, + "router_z_loss_mlp": 0.10546875, + "step": 2986, + "time_per_iteration": 4.936111211776733 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071312, + "balance_loss_mlp": 1.03588247, + "epoch": 0.5746440938822625, + "flos": 751600304640.0, + "grad_norm": 0.05668637583843988, + "language_loss": 0.81840217, + "learning_rate": 0.0004040213087479444, + "loss": 0.82911527, + "num_input_tokens_seen": 248946960, + "router_z_loss_mlp": 0.35449219, + "step": 2987, + "time_per_iteration": 2.949164628982544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074412, + "balance_loss_mlp": 1.03955531, + "epoch": 0.5748364755675259, + "flos": 501618258432.0, + "grad_norm": 0.05762088821448085, + "language_loss": 0.84999508, + "learning_rate": 0.0004037155787595018, + "loss": 0.86073923, + "num_input_tokens_seen": 249014128, + "router_z_loss_mlp": 0.34887695, + "step": 2988, + "time_per_iteration": 2.6570816040039062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010738, + "balance_loss_mlp": 1.03863311, + "epoch": 0.5750288572527895, + "flos": 503757024768.0, + "grad_norm": 0.17757642281187902, + "language_loss": 0.80609345, + "learning_rate": 0.000403409886151987, + "loss": 0.81683147, + "num_input_tokens_seen": 249090016, + "router_z_loss_mlp": 0.35205078, + "step": 2989, + "time_per_iteration": 2.913994073867798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014651, + "balance_loss_mlp": 1.00430369, + "epoch": 0.5752212389380531, + "flos": 1540541030400.0, + "grad_norm": 0.007550989320398048, + "language_loss": 0.81999105, + "learning_rate": 0.0004031042310440799, + "loss": 0.83013755, + "num_input_tokens_seen": 249305552, + "router_z_loss_mlp": 0.10351562, + "step": 2990, + "time_per_iteration": 4.7991979122161865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020765, + "balance_loss_mlp": 1.01027453, + "epoch": 0.5754136206233167, + "flos": 1566499378176.0, + "grad_norm": 0.009415259483784648, + "language_loss": 0.781986, + "learning_rate": 0.00040279861355444656, + "loss": 0.79219365, + "num_input_tokens_seen": 249523408, + "router_z_loss_mlp": 0.10498047, + "step": 2991, + "time_per_iteration": 4.760354280471802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076847, + "balance_loss_mlp": 1.04282451, + "epoch": 0.5756060023085803, + "flos": 797806167552.0, + "grad_norm": 0.05030181344669937, + "language_loss": 0.76800382, + "learning_rate": 0.00040249303380173807, + "loss": 0.77877235, + "num_input_tokens_seen": 249616624, + "router_z_loss_mlp": 0.34057617, + "step": 2992, + "time_per_iteration": 3.083129644393921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080903, + "balance_loss_mlp": 1.04573631, + "epoch": 0.5757983839938438, + "flos": 587588802048.0, + "grad_norm": 0.05896593059815975, + "language_loss": 0.78794599, + "learning_rate": 0.00040218749190459126, + "loss": 0.79875505, + "num_input_tokens_seen": 249689936, + "router_z_loss_mlp": 0.35229492, + "step": 2993, + "time_per_iteration": 2.763256788253784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083869, + "balance_loss_mlp": 1.04884517, + "epoch": 0.5759907656791073, + "flos": 516576932352.0, + "grad_norm": 0.05409710441005256, + "language_loss": 0.82655573, + "learning_rate": 0.00040188198798162775, + "loss": 0.83739436, + "num_input_tokens_seen": 249759984, + "router_z_loss_mlp": 0.35058594, + "step": 2994, + "time_per_iteration": 2.6000871658325195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078452, + "balance_loss_mlp": 1.04333293, + "epoch": 0.5761831473643709, + "flos": 586845305856.0, + "grad_norm": 0.05831918093224265, + "language_loss": 0.85334295, + "learning_rate": 0.000401576522151455, + "loss": 0.8641274, + "num_input_tokens_seen": 249837888, + "router_z_loss_mlp": 0.3515625, + "step": 2995, + "time_per_iteration": 2.808647871017456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081176, + "balance_loss_mlp": 1.04672456, + "epoch": 0.5763755290496345, + "flos": 543619219968.0, + "grad_norm": 0.04257335582462403, + "language_loss": 0.82291412, + "learning_rate": 0.0004012710945326651, + "loss": 0.83372593, + "num_input_tokens_seen": 249913584, + "router_z_loss_mlp": 0.34472656, + "step": 2996, + "time_per_iteration": 2.7611968517303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082283, + "balance_loss_mlp": 1.04749799, + "epoch": 0.576567910734898, + "flos": 625930509312.0, + "grad_norm": 0.050767561493079726, + "language_loss": 0.80952752, + "learning_rate": 0.0004009657052438355, + "loss": 0.82035035, + "num_input_tokens_seen": 249992144, + "router_z_loss_mlp": 0.34814453, + "step": 2997, + "time_per_iteration": 2.788496971130371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107931, + "balance_loss_mlp": 1.04392815, + "epoch": 0.5767602924201616, + "flos": 537985552896.0, + "grad_norm": 0.053276481047857226, + "language_loss": 0.85359365, + "learning_rate": 0.00040066035440352904, + "loss": 0.86438668, + "num_input_tokens_seen": 250060736, + "router_z_loss_mlp": 0.35400391, + "step": 2998, + "time_per_iteration": 2.6187028884887695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010656, + "balance_loss_mlp": 1.05358338, + "epoch": 0.5769526741054252, + "flos": 1558969873920.0, + "grad_norm": 0.027624435835290975, + "language_loss": 0.79293132, + "learning_rate": 0.0004003550421302934, + "loss": 0.80358732, + "num_input_tokens_seen": 250296864, + "router_z_loss_mlp": 0.12011719, + "step": 2999, + "time_per_iteration": 4.880754470825195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085333, + "balance_loss_mlp": 1.05071473, + "epoch": 0.5771450557906888, + "flos": 467939759616.0, + "grad_norm": 0.056203987299685475, + "language_loss": 0.7605744, + "learning_rate": 0.00040004976854266145, + "loss": 0.77142775, + "num_input_tokens_seen": 250362528, + "router_z_loss_mlp": 0.34667969, + "step": 3000, + "time_per_iteration": 2.537555694580078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079139, + "balance_loss_mlp": 1.043329, + "epoch": 0.5773374374759523, + "flos": 574288828416.0, + "grad_norm": 0.059637526980377456, + "language_loss": 0.81006908, + "learning_rate": 0.0003997445337591505, + "loss": 0.82086051, + "num_input_tokens_seen": 250432768, + "router_z_loss_mlp": 0.35839844, + "step": 3001, + "time_per_iteration": 2.637199878692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072855, + "balance_loss_mlp": 1.03756905, + "epoch": 0.5775298191612158, + "flos": 528216795648.0, + "grad_norm": 0.054057225734739034, + "language_loss": 0.73747128, + "learning_rate": 0.0003994393378982635, + "loss": 0.74819982, + "num_input_tokens_seen": 250501504, + "router_z_loss_mlp": 0.35327148, + "step": 3002, + "time_per_iteration": 2.605628490447998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042054, + "balance_loss_mlp": 1.03013277, + "epoch": 0.5777222008464794, + "flos": 1303178070528.0, + "grad_norm": 0.01828159888171313, + "language_loss": 0.79538, + "learning_rate": 0.00039913418107848786, + "loss": 0.80580056, + "num_input_tokens_seen": 250733632, + "router_z_loss_mlp": 0.11914062, + "step": 3003, + "time_per_iteration": 4.791952848434448 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072518, + "balance_loss_mlp": 1.03708899, + "epoch": 0.577914582531743, + "flos": 603344816640.0, + "grad_norm": 0.05129820562397971, + "language_loss": 0.88025165, + "learning_rate": 0.0003988290634182961, + "loss": 0.89097679, + "num_input_tokens_seen": 250809152, + "router_z_loss_mlp": 0.35449219, + "step": 3004, + "time_per_iteration": 2.7482082843780518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107598, + "balance_loss_mlp": 1.04162431, + "epoch": 0.5781069642170066, + "flos": 486537338880.0, + "grad_norm": 0.060845290060135546, + "language_loss": 0.80967325, + "learning_rate": 0.0003985239850361453, + "loss": 0.82043308, + "num_input_tokens_seen": 250879152, + "router_z_loss_mlp": 0.34399414, + "step": 3005, + "time_per_iteration": 2.577929735183716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074673, + "balance_loss_mlp": 1.03933978, + "epoch": 0.5782993459022701, + "flos": 506016626688.0, + "grad_norm": 0.06787324566679709, + "language_loss": 0.84799004, + "learning_rate": 0.0003982189460504777, + "loss": 0.85873681, + "num_input_tokens_seen": 250949904, + "router_z_loss_mlp": 0.35375977, + "step": 3006, + "time_per_iteration": 2.6993815898895264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077227, + "balance_loss_mlp": 1.04179859, + "epoch": 0.5784917275875336, + "flos": 601872380928.0, + "grad_norm": 0.06968716045875477, + "language_loss": 0.79860866, + "learning_rate": 0.00039791394657971935, + "loss": 0.80938095, + "num_input_tokens_seen": 251020976, + "router_z_loss_mlp": 0.35449219, + "step": 3007, + "time_per_iteration": 2.6929664611816406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070454, + "balance_loss_mlp": 1.03616893, + "epoch": 0.5786841092727972, + "flos": 521279428608.0, + "grad_norm": 0.07090711844515878, + "language_loss": 0.84396511, + "learning_rate": 0.00039760898674228205, + "loss": 0.85466969, + "num_input_tokens_seen": 251093280, + "router_z_loss_mlp": 0.34301758, + "step": 3008, + "time_per_iteration": 2.674983501434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074007, + "balance_loss_mlp": 1.03941262, + "epoch": 0.5788764909580608, + "flos": 767047809024.0, + "grad_norm": 0.04405411396785794, + "language_loss": 0.80589879, + "learning_rate": 0.0003973040666565613, + "loss": 0.81663889, + "num_input_tokens_seen": 251181376, + "router_z_loss_mlp": 0.34619141, + "step": 3009, + "time_per_iteration": 3.0445330142974854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068256, + "balance_loss_mlp": 1.03347063, + "epoch": 0.5790688726433244, + "flos": 598786324992.0, + "grad_norm": 0.0464228238066257, + "language_loss": 0.81778955, + "learning_rate": 0.000396999186440938, + "loss": 0.82847214, + "num_input_tokens_seen": 251256176, + "router_z_loss_mlp": 0.34814453, + "step": 3010, + "time_per_iteration": 2.837510585784912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107199, + "balance_loss_mlp": 1.03594089, + "epoch": 0.5792612543285879, + "flos": 522805708800.0, + "grad_norm": 0.06076952990047212, + "language_loss": 0.8482464, + "learning_rate": 0.000396694346213777, + "loss": 0.85896629, + "num_input_tokens_seen": 251325344, + "router_z_loss_mlp": 0.36083984, + "step": 3011, + "time_per_iteration": 2.630096197128296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071847, + "balance_loss_mlp": 1.03498721, + "epoch": 0.5794536360138515, + "flos": 876178805760.0, + "grad_norm": 0.045866643068031475, + "language_loss": 0.83350897, + "learning_rate": 0.0003963895460934276, + "loss": 0.84422737, + "num_input_tokens_seen": 251406656, + "router_z_loss_mlp": 0.3684082, + "step": 3012, + "time_per_iteration": 3.144862174987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107149, + "balance_loss_mlp": 1.03555989, + "epoch": 0.5796460176991151, + "flos": 401221541376.0, + "grad_norm": 0.0681769397078292, + "language_loss": 0.84421676, + "learning_rate": 0.00039608478619822376, + "loss": 0.85493165, + "num_input_tokens_seen": 251467760, + "router_z_loss_mlp": 0.35961914, + "step": 3013, + "time_per_iteration": 2.459653854370117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071668, + "balance_loss_mlp": 1.03545213, + "epoch": 0.5798383993843786, + "flos": 618229297152.0, + "grad_norm": 0.04312849012034037, + "language_loss": 0.82395273, + "learning_rate": 0.00039578006664648394, + "loss": 0.83466941, + "num_input_tokens_seen": 251542272, + "router_z_loss_mlp": 0.36206055, + "step": 3014, + "time_per_iteration": 2.759540557861328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068998, + "balance_loss_mlp": 1.0336163, + "epoch": 0.5800307810696421, + "flos": 843966950400.0, + "grad_norm": 0.05059644865737796, + "language_loss": 0.80954117, + "learning_rate": 0.0003954753875565105, + "loss": 0.82023108, + "num_input_tokens_seen": 251625584, + "router_z_loss_mlp": 0.35424805, + "step": 3015, + "time_per_iteration": 3.102818727493286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065155, + "balance_loss_mlp": 1.02970195, + "epoch": 0.5802231627549057, + "flos": 569005779456.0, + "grad_norm": 0.049284538826036076, + "language_loss": 0.82072717, + "learning_rate": 0.00039517074904659057, + "loss": 0.83137876, + "num_input_tokens_seen": 251696704, + "router_z_loss_mlp": 0.35498047, + "step": 3016, + "time_per_iteration": 2.6733109951019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074461, + "balance_loss_mlp": 1.03884125, + "epoch": 0.5804155444401693, + "flos": 660160447488.0, + "grad_norm": 0.0506827974734746, + "language_loss": 0.84573597, + "learning_rate": 0.00039486615123499535, + "loss": 0.8564806, + "num_input_tokens_seen": 251774784, + "router_z_loss_mlp": 0.35668945, + "step": 3017, + "time_per_iteration": 2.8088088035583496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071818, + "balance_loss_mlp": 1.0354352, + "epoch": 0.5806079261254329, + "flos": 513726603264.0, + "grad_norm": 0.053399367847764105, + "language_loss": 0.84808505, + "learning_rate": 0.00039456159423997996, + "loss": 0.85880327, + "num_input_tokens_seen": 251844768, + "router_z_loss_mlp": 0.36401367, + "step": 3018, + "time_per_iteration": 2.6254379749298096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074265, + "balance_loss_mlp": 1.03747678, + "epoch": 0.5808003078106965, + "flos": 528379739136.0, + "grad_norm": 0.059071353461068586, + "language_loss": 0.89337808, + "learning_rate": 0.00039425707817978406, + "loss": 0.90412068, + "num_input_tokens_seen": 251912736, + "router_z_loss_mlp": 0.36767578, + "step": 3019, + "time_per_iteration": 2.65867280960083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071634, + "balance_loss_mlp": 1.0357995, + "epoch": 0.58099268949596, + "flos": 476787520512.0, + "grad_norm": 0.06353889490099716, + "language_loss": 0.83356857, + "learning_rate": 0.00039395260317263124, + "loss": 0.84428501, + "num_input_tokens_seen": 251979328, + "router_z_loss_mlp": 0.35839844, + "step": 3020, + "time_per_iteration": 2.554124116897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074167, + "balance_loss_mlp": 1.03666329, + "epoch": 0.5811850711812235, + "flos": 517340777472.0, + "grad_norm": 0.05166922362438639, + "language_loss": 0.84975517, + "learning_rate": 0.0003936481693367291, + "loss": 0.86049688, + "num_input_tokens_seen": 252050928, + "router_z_loss_mlp": 0.37475586, + "step": 3021, + "time_per_iteration": 2.6460227966308594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075928, + "balance_loss_mlp": 1.03976023, + "epoch": 0.5813774528664871, + "flos": 616122464256.0, + "grad_norm": 0.06649500378390247, + "language_loss": 0.876212, + "learning_rate": 0.0003933437767902697, + "loss": 0.88697129, + "num_input_tokens_seen": 252126496, + "router_z_loss_mlp": 0.36206055, + "step": 3022, + "time_per_iteration": 2.8114941120147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071818, + "balance_loss_mlp": 1.03588879, + "epoch": 0.5815698345517507, + "flos": 567194310144.0, + "grad_norm": 0.06503214921944889, + "language_loss": 0.78287327, + "learning_rate": 0.00039303942565142825, + "loss": 0.7935915, + "num_input_tokens_seen": 252203008, + "router_z_loss_mlp": 0.35961914, + "step": 3023, + "time_per_iteration": 2.7259762287139893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071639, + "balance_loss_mlp": 1.03563786, + "epoch": 0.5817622162370142, + "flos": 562886102016.0, + "grad_norm": 0.05350887168996553, + "language_loss": 0.76429439, + "learning_rate": 0.0003927351160383644, + "loss": 0.77501082, + "num_input_tokens_seen": 252283440, + "router_z_loss_mlp": 0.36035156, + "step": 3024, + "time_per_iteration": 2.8155934810638428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071996, + "balance_loss_mlp": 1.03730595, + "epoch": 0.5819545979222778, + "flos": 458982899712.0, + "grad_norm": 0.05396860990467202, + "language_loss": 0.77624023, + "learning_rate": 0.000392430848069222, + "loss": 0.78696012, + "num_input_tokens_seen": 252351760, + "router_z_loss_mlp": 0.34741211, + "step": 3025, + "time_per_iteration": 2.5123956203460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069758, + "balance_loss_mlp": 1.03387606, + "epoch": 0.5821469796075414, + "flos": 541215613440.0, + "grad_norm": 0.05894861582094883, + "language_loss": 0.82395303, + "learning_rate": 0.00039212662186212795, + "loss": 0.83465064, + "num_input_tokens_seen": 252418480, + "router_z_loss_mlp": 0.35913086, + "step": 3026, + "time_per_iteration": 2.6423861980438232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075924, + "balance_loss_mlp": 1.03930306, + "epoch": 0.582339361292805, + "flos": 551994117120.0, + "grad_norm": 0.060293393109458415, + "language_loss": 0.77264106, + "learning_rate": 0.0003918224375351934, + "loss": 0.7834003, + "num_input_tokens_seen": 252493712, + "router_z_loss_mlp": 0.36621094, + "step": 3027, + "time_per_iteration": 2.691378593444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075199, + "balance_loss_mlp": 1.04029393, + "epoch": 0.5825317429780685, + "flos": 496138770432.0, + "grad_norm": 0.05191318265313257, + "language_loss": 0.78248543, + "learning_rate": 0.0003915182952065135, + "loss": 0.79323745, + "num_input_tokens_seen": 252566096, + "router_z_loss_mlp": 0.34936523, + "step": 3028, + "time_per_iteration": 2.718275308609009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073019, + "balance_loss_mlp": 1.03732777, + "epoch": 0.582724124663332, + "flos": 563890056192.0, + "grad_norm": 0.0482119369127772, + "language_loss": 0.87499475, + "learning_rate": 0.0003912141949941664, + "loss": 0.8857249, + "num_input_tokens_seen": 252639424, + "router_z_loss_mlp": 0.35766602, + "step": 3029, + "time_per_iteration": 2.6762070655822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075023, + "balance_loss_mlp": 1.03852117, + "epoch": 0.5829165063485956, + "flos": 491888788992.0, + "grad_norm": 0.06336756881053687, + "language_loss": 0.82355005, + "learning_rate": 0.0003909101370162143, + "loss": 0.83430028, + "num_input_tokens_seen": 252706672, + "router_z_loss_mlp": 0.36499023, + "step": 3030, + "time_per_iteration": 2.6055908203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035193, + "balance_loss_mlp": 1.02432156, + "epoch": 0.5831088880338592, + "flos": 1528134501888.0, + "grad_norm": 0.025423566517204055, + "language_loss": 0.72433889, + "learning_rate": 0.00039060612139070326, + "loss": 0.7346909, + "num_input_tokens_seen": 252932464, + "router_z_loss_mlp": 0.10888672, + "step": 3031, + "time_per_iteration": 4.88014817237854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071232, + "balance_loss_mlp": 1.03558815, + "epoch": 0.5833012697191228, + "flos": 617712763392.0, + "grad_norm": 0.04799878735573131, + "language_loss": 0.82774729, + "learning_rate": 0.0003903021482356622, + "loss": 0.83845961, + "num_input_tokens_seen": 253011920, + "router_z_loss_mlp": 0.35693359, + "step": 3032, + "time_per_iteration": 2.7778074741363525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071187, + "balance_loss_mlp": 1.03542447, + "epoch": 0.5834936514043862, + "flos": 767578899456.0, + "grad_norm": 0.04830091888101656, + "language_loss": 0.82788891, + "learning_rate": 0.00038999821766910465, + "loss": 0.83860075, + "num_input_tokens_seen": 253091552, + "router_z_loss_mlp": 0.35791016, + "step": 3033, + "time_per_iteration": 2.9640953540802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070566, + "balance_loss_mlp": 1.03496981, + "epoch": 0.5836860330896498, + "flos": 458136096768.0, + "grad_norm": 0.045708981442043065, + "language_loss": 0.85570675, + "learning_rate": 0.00038969432980902606, + "loss": 0.8664124, + "num_input_tokens_seen": 253158608, + "router_z_loss_mlp": 0.35620117, + "step": 3034, + "time_per_iteration": 2.520869255065918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01029447, + "balance_loss_mlp": 1.01819336, + "epoch": 0.5838784147749134, + "flos": 1360485504000.0, + "grad_norm": 0.023110513117977256, + "language_loss": 0.79784501, + "learning_rate": 0.0003893904847734068, + "loss": 0.80813944, + "num_input_tokens_seen": 253381184, + "router_z_loss_mlp": 0.11230469, + "step": 3035, + "time_per_iteration": 4.791047811508179 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076539, + "balance_loss_mlp": 1.04125297, + "epoch": 0.584070796460177, + "flos": 566942616576.0, + "grad_norm": 0.048603623386797364, + "language_loss": 0.82340151, + "learning_rate": 0.00038908668268020953, + "loss": 0.83416688, + "num_input_tokens_seen": 253452880, + "router_z_loss_mlp": 0.35302734, + "step": 3036, + "time_per_iteration": 2.6480767726898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073606, + "balance_loss_mlp": 1.03781927, + "epoch": 0.5842631781454406, + "flos": 611188623360.0, + "grad_norm": 0.04937423588772942, + "language_loss": 0.84850454, + "learning_rate": 0.00038878292364738097, + "loss": 0.85924065, + "num_input_tokens_seen": 253530000, + "router_z_loss_mlp": 0.3581543, + "step": 3037, + "time_per_iteration": 2.7739527225494385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070708, + "balance_loss_mlp": 1.03418183, + "epoch": 0.5844555598307041, + "flos": 463148513280.0, + "grad_norm": 0.05602443207387838, + "language_loss": 0.86980963, + "learning_rate": 0.0003884792077928508, + "loss": 0.88051671, + "num_input_tokens_seen": 253593504, + "router_z_loss_mlp": 0.36523438, + "step": 3038, + "time_per_iteration": 2.488044500350952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076481, + "balance_loss_mlp": 1.04083705, + "epoch": 0.5846479415159677, + "flos": 410005283328.0, + "grad_norm": 0.06107663121836191, + "language_loss": 0.76691568, + "learning_rate": 0.0003881755352345322, + "loss": 0.77768052, + "num_input_tokens_seen": 253657904, + "router_z_loss_mlp": 0.35644531, + "step": 3039, + "time_per_iteration": 2.4996848106384277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076016, + "balance_loss_mlp": 1.03944278, + "epoch": 0.5848403232012312, + "flos": 491056542720.0, + "grad_norm": 0.04475599589029588, + "language_loss": 0.86940634, + "learning_rate": 0.0003878719060903207, + "loss": 0.88016653, + "num_input_tokens_seen": 253725280, + "router_z_loss_mlp": 0.36572266, + "step": 3040, + "time_per_iteration": 2.5631661415100098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107371, + "balance_loss_mlp": 1.03823376, + "epoch": 0.5850327048864948, + "flos": 584146335744.0, + "grad_norm": 0.06623374989281658, + "language_loss": 0.82883763, + "learning_rate": 0.0003875683204780961, + "loss": 0.83957475, + "num_input_tokens_seen": 253795040, + "router_z_loss_mlp": 0.35522461, + "step": 3041, + "time_per_iteration": 2.7194101810455322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074225, + "balance_loss_mlp": 1.03765166, + "epoch": 0.5852250865717584, + "flos": 651253049856.0, + "grad_norm": 0.05546398592496706, + "language_loss": 0.84983653, + "learning_rate": 0.00038726477851572043, + "loss": 0.86057878, + "num_input_tokens_seen": 253866384, + "router_z_loss_mlp": 0.36572266, + "step": 3042, + "time_per_iteration": 2.809687376022339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072999, + "balance_loss_mlp": 1.03659296, + "epoch": 0.5854174682570219, + "flos": 534332090880.0, + "grad_norm": 0.07237686853447298, + "language_loss": 0.80418718, + "learning_rate": 0.0003869612803210395, + "loss": 0.81491715, + "num_input_tokens_seen": 253935712, + "router_z_loss_mlp": 0.36401367, + "step": 3043, + "time_per_iteration": 2.6141133308410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074965, + "balance_loss_mlp": 1.03872585, + "epoch": 0.5856098499422855, + "flos": 509501352960.0, + "grad_norm": 0.08321780378599658, + "language_loss": 0.83029413, + "learning_rate": 0.0003866578260118817, + "loss": 0.84104383, + "num_input_tokens_seen": 254003152, + "router_z_loss_mlp": 0.36254883, + "step": 3044, + "time_per_iteration": 2.5739400386810303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070627, + "balance_loss_mlp": 1.03438699, + "epoch": 0.5858022316275491, + "flos": 593619729408.0, + "grad_norm": 0.061750802810204855, + "language_loss": 0.83199847, + "learning_rate": 0.0003863544157060581, + "loss": 0.84270471, + "num_input_tokens_seen": 254072816, + "router_z_loss_mlp": 0.36254883, + "step": 3045, + "time_per_iteration": 2.662442207336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077309, + "balance_loss_mlp": 1.04083109, + "epoch": 0.5859946133128127, + "flos": 558829587456.0, + "grad_norm": 0.0566139046566934, + "language_loss": 0.82210046, + "learning_rate": 0.0003860510495213634, + "loss": 0.83287358, + "num_input_tokens_seen": 254152800, + "router_z_loss_mlp": 0.36499023, + "step": 3046, + "time_per_iteration": 2.817676305770874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086311, + "balance_loss_mlp": 1.04885542, + "epoch": 0.5861869949980761, + "flos": 553431647232.0, + "grad_norm": 0.06969052760403557, + "language_loss": 0.77781415, + "learning_rate": 0.0003857477275755746, + "loss": 0.78867728, + "num_input_tokens_seen": 254224384, + "router_z_loss_mlp": 0.37451172, + "step": 3047, + "time_per_iteration": 2.645547389984131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076852, + "balance_loss_mlp": 1.03994477, + "epoch": 0.5863793766833397, + "flos": 718321886208.0, + "grad_norm": 0.060152245737565335, + "language_loss": 0.83672923, + "learning_rate": 0.00038544444998645167, + "loss": 0.84749776, + "num_input_tokens_seen": 254310960, + "router_z_loss_mlp": 0.36914062, + "step": 3048, + "time_per_iteration": 2.995572090148926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080654, + "balance_loss_mlp": 1.04410434, + "epoch": 0.5865717583686033, + "flos": 472041354240.0, + "grad_norm": 0.05877541838315078, + "language_loss": 0.81869525, + "learning_rate": 0.00038514121687173767, + "loss": 0.82950181, + "num_input_tokens_seen": 254378336, + "router_z_loss_mlp": 0.36572266, + "step": 3049, + "time_per_iteration": 2.5653092861175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085484, + "balance_loss_mlp": 1.04819572, + "epoch": 0.5867641400538669, + "flos": 813143162880.0, + "grad_norm": 0.060327128014073625, + "language_loss": 0.82117838, + "learning_rate": 0.00038483802834915807, + "loss": 0.83203322, + "num_input_tokens_seen": 254454352, + "router_z_loss_mlp": 0.37280273, + "step": 3050, + "time_per_iteration": 2.9661922454833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074132, + "balance_loss_mlp": 1.03755879, + "epoch": 0.5869565217391305, + "flos": 486285645312.0, + "grad_norm": 0.05442603126978945, + "language_loss": 0.78767669, + "learning_rate": 0.00038453488453642074, + "loss": 0.79841799, + "num_input_tokens_seen": 254526352, + "router_z_loss_mlp": 0.36547852, + "step": 3051, + "time_per_iteration": 2.6421701908111572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076937, + "balance_loss_mlp": 1.0401963, + "epoch": 0.587148903424394, + "flos": 569104704000.0, + "grad_norm": 0.050403805084847125, + "language_loss": 0.86714828, + "learning_rate": 0.00038423178555121697, + "loss": 0.87791765, + "num_input_tokens_seen": 254598720, + "router_z_loss_mlp": 0.36743164, + "step": 3052, + "time_per_iteration": 2.689039945602417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079498, + "balance_loss_mlp": 1.04239988, + "epoch": 0.5873412851096576, + "flos": 746948680704.0, + "grad_norm": 0.04537735372020953, + "language_loss": 0.85335124, + "learning_rate": 0.00038392873151121994, + "loss": 0.86414617, + "num_input_tokens_seen": 254683664, + "router_z_loss_mlp": 0.37084961, + "step": 3053, + "time_per_iteration": 3.0252749919891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071586, + "balance_loss_mlp": 1.03510821, + "epoch": 0.5875336667949211, + "flos": 527882144256.0, + "grad_norm": 0.0531573443466337, + "language_loss": 0.82837141, + "learning_rate": 0.0003836257225340859, + "loss": 0.83908725, + "num_input_tokens_seen": 254754688, + "router_z_loss_mlp": 0.36474609, + "step": 3054, + "time_per_iteration": 2.6028475761413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074191, + "balance_loss_mlp": 1.03728426, + "epoch": 0.5877260484801847, + "flos": 823799420928.0, + "grad_norm": 0.057535155706969474, + "language_loss": 0.81870168, + "learning_rate": 0.00038332275873745336, + "loss": 0.82944363, + "num_input_tokens_seen": 254838976, + "router_z_loss_mlp": 0.36889648, + "step": 3055, + "time_per_iteration": 3.1007511615753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074496, + "balance_loss_mlp": 1.03682637, + "epoch": 0.5879184301654482, + "flos": 591325221888.0, + "grad_norm": 0.0460079349498171, + "language_loss": 0.82943761, + "learning_rate": 0.0003830198402389431, + "loss": 0.84018254, + "num_input_tokens_seen": 254912912, + "router_z_loss_mlp": 0.37646484, + "step": 3056, + "time_per_iteration": 2.6919126510620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010412, + "balance_loss_mlp": 1.02975643, + "epoch": 0.5881108118507118, + "flos": 1544953955328.0, + "grad_norm": 0.021887470100806234, + "language_loss": 0.77348936, + "learning_rate": 0.0003827169671561585, + "loss": 0.78390133, + "num_input_tokens_seen": 255151488, + "router_z_loss_mlp": 0.11425781, + "step": 3057, + "time_per_iteration": 4.971444368362427 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072291, + "balance_loss_mlp": 1.03576517, + "epoch": 0.5883031935359754, + "flos": 489348380160.0, + "grad_norm": 0.055950804718103285, + "language_loss": 0.82692897, + "learning_rate": 0.0003824141396066855, + "loss": 0.83765185, + "num_input_tokens_seen": 255218896, + "router_z_loss_mlp": 0.36572266, + "step": 3058, + "time_per_iteration": 2.5848963260650635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074876, + "balance_loss_mlp": 1.03842139, + "epoch": 0.588495575221239, + "flos": 582551654400.0, + "grad_norm": 0.05305150563857962, + "language_loss": 0.82647693, + "learning_rate": 0.000382111357708092, + "loss": 0.83722568, + "num_input_tokens_seen": 255287408, + "router_z_loss_mlp": 0.36499023, + "step": 3059, + "time_per_iteration": 2.750030279159546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071916, + "balance_loss_mlp": 1.03558111, + "epoch": 0.5886879569065026, + "flos": 660751174656.0, + "grad_norm": 0.05165433097502605, + "language_loss": 0.83451211, + "learning_rate": 0.00038180862157792864, + "loss": 0.84523129, + "num_input_tokens_seen": 255358432, + "router_z_loss_mlp": 0.36303711, + "step": 3060, + "time_per_iteration": 2.7654812335968018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070431, + "balance_loss_mlp": 1.03414369, + "epoch": 0.588880338591766, + "flos": 562392889344.0, + "grad_norm": 0.05703427459216956, + "language_loss": 0.82004499, + "learning_rate": 0.0003815059313337279, + "loss": 0.83074933, + "num_input_tokens_seen": 255425744, + "router_z_loss_mlp": 0.36279297, + "step": 3061, + "time_per_iteration": 2.659722089767456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072147, + "balance_loss_mlp": 1.03585935, + "epoch": 0.5890727202770296, + "flos": 554451568128.0, + "grad_norm": 0.04901881896382658, + "language_loss": 0.77886307, + "learning_rate": 0.00038120328709300436, + "loss": 0.78958452, + "num_input_tokens_seen": 255505808, + "router_z_loss_mlp": 0.36279297, + "step": 3062, + "time_per_iteration": 2.8264663219451904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076904, + "balance_loss_mlp": 1.04114151, + "epoch": 0.5892651019622932, + "flos": 655226606592.0, + "grad_norm": 0.057794453116502664, + "language_loss": 0.83449113, + "learning_rate": 0.0003809006889732549, + "loss": 0.84526014, + "num_input_tokens_seen": 255580160, + "router_z_loss_mlp": 0.35766602, + "step": 3063, + "time_per_iteration": 2.780714511871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073572, + "balance_loss_mlp": 1.03680801, + "epoch": 0.5894574836475568, + "flos": 452970911232.0, + "grad_norm": 0.048397381644471126, + "language_loss": 0.87604314, + "learning_rate": 0.0003805981370919589, + "loss": 0.88677883, + "num_input_tokens_seen": 255644016, + "router_z_loss_mlp": 0.36743164, + "step": 3064, + "time_per_iteration": 2.497511386871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077136, + "balance_loss_mlp": 1.03965652, + "epoch": 0.5896498653328203, + "flos": 518763750912.0, + "grad_norm": 0.05535483461806511, + "language_loss": 0.83910584, + "learning_rate": 0.0003802956315665771, + "loss": 0.84987724, + "num_input_tokens_seen": 255718192, + "router_z_loss_mlp": 0.37475586, + "step": 3065, + "time_per_iteration": 2.6540539264678955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075706, + "balance_loss_mlp": 1.03965688, + "epoch": 0.5898422470180839, + "flos": 548793169920.0, + "grad_norm": 0.06978967624296899, + "language_loss": 0.81621277, + "learning_rate": 0.0003799931725145529, + "loss": 0.82696986, + "num_input_tokens_seen": 255787696, + "router_z_loss_mlp": 0.3605957, + "step": 3066, + "time_per_iteration": 2.5999929904937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075756, + "balance_loss_mlp": 1.04015982, + "epoch": 0.5900346287033474, + "flos": 524046799872.0, + "grad_norm": 0.06178961053063138, + "language_loss": 0.85556895, + "learning_rate": 0.00037969076005331083, + "loss": 0.86632651, + "num_input_tokens_seen": 255862992, + "router_z_loss_mlp": 0.35571289, + "step": 3067, + "time_per_iteration": 2.7505955696105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080728, + "balance_loss_mlp": 1.04372525, + "epoch": 0.590227010388611, + "flos": 566893154304.0, + "grad_norm": 0.059517883137225745, + "language_loss": 0.88041914, + "learning_rate": 0.00037938839430025817, + "loss": 0.89122641, + "num_input_tokens_seen": 255931872, + "router_z_loss_mlp": 0.36962891, + "step": 3068, + "time_per_iteration": 2.6254634857177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072667, + "balance_loss_mlp": 1.03714228, + "epoch": 0.5904193920738746, + "flos": 583053631488.0, + "grad_norm": 0.05094647187222568, + "language_loss": 0.85285151, + "learning_rate": 0.0003790860753727835, + "loss": 0.8635782, + "num_input_tokens_seen": 256004656, + "router_z_loss_mlp": 0.35546875, + "step": 3069, + "time_per_iteration": 2.790996551513672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076132, + "balance_loss_mlp": 1.04056025, + "epoch": 0.5906117737591381, + "flos": 529428773376.0, + "grad_norm": 0.06487433034023032, + "language_loss": 0.82915914, + "learning_rate": 0.00037878380338825766, + "loss": 0.83992046, + "num_input_tokens_seen": 256076944, + "router_z_loss_mlp": 0.35644531, + "step": 3070, + "time_per_iteration": 2.6697611808776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078223, + "balance_loss_mlp": 1.04276967, + "epoch": 0.5908041554444017, + "flos": 683908655616.0, + "grad_norm": 0.053205750192721994, + "language_loss": 0.81560326, + "learning_rate": 0.00037848157846403287, + "loss": 0.8263855, + "num_input_tokens_seen": 256154768, + "router_z_loss_mlp": 0.35473633, + "step": 3071, + "time_per_iteration": 2.92523193359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077534, + "balance_loss_mlp": 1.04246306, + "epoch": 0.5909965371296653, + "flos": 549719958528.0, + "grad_norm": 0.04683417834560967, + "language_loss": 0.83405554, + "learning_rate": 0.0003781794007174435, + "loss": 0.84483093, + "num_input_tokens_seen": 256230896, + "router_z_loss_mlp": 0.35107422, + "step": 3072, + "time_per_iteration": 2.7881455421447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01022638, + "balance_loss_mlp": 1.01200461, + "epoch": 0.5911889188149289, + "flos": 1491544475136.0, + "grad_norm": 0.008695883247199268, + "language_loss": 0.74074531, + "learning_rate": 0.0003778772702658051, + "loss": 0.75097167, + "num_input_tokens_seen": 256462336, + "router_z_loss_mlp": 0.10644531, + "step": 3073, + "time_per_iteration": 4.864701509475708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078671, + "balance_loss_mlp": 1.04293227, + "epoch": 0.5913813005001923, + "flos": 487630043136.0, + "grad_norm": 0.053099165858615995, + "language_loss": 0.80592149, + "learning_rate": 0.0003775751872264152, + "loss": 0.81670815, + "num_input_tokens_seen": 256539376, + "router_z_loss_mlp": 0.35766602, + "step": 3074, + "time_per_iteration": 2.7932956218719482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079498, + "balance_loss_mlp": 1.04409289, + "epoch": 0.5915736821854559, + "flos": 573034590720.0, + "grad_norm": 0.04575078918426429, + "language_loss": 0.86981148, + "learning_rate": 0.0003772731517165527, + "loss": 0.88060653, + "num_input_tokens_seen": 256617728, + "router_z_loss_mlp": 0.35449219, + "step": 3075, + "time_per_iteration": 2.7613656520843506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075792, + "balance_loss_mlp": 1.04060149, + "epoch": 0.5917660638707195, + "flos": 789183959040.0, + "grad_norm": 0.06797753963070947, + "language_loss": 0.84194851, + "learning_rate": 0.0003769711638534784, + "loss": 0.85270643, + "num_input_tokens_seen": 256696032, + "router_z_loss_mlp": 0.35205078, + "step": 3076, + "time_per_iteration": 2.991854190826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076527, + "balance_loss_mlp": 1.04181361, + "epoch": 0.5919584455559831, + "flos": 528487428096.0, + "grad_norm": 0.06227325112589354, + "language_loss": 0.78677326, + "learning_rate": 0.00037666922375443446, + "loss": 0.79753852, + "num_input_tokens_seen": 256767360, + "router_z_loss_mlp": 0.34765625, + "step": 3077, + "time_per_iteration": 2.591597557067871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072268, + "balance_loss_mlp": 1.03757811, + "epoch": 0.5921508272412467, + "flos": 560320962048.0, + "grad_norm": 0.056716138151229355, + "language_loss": 0.81505013, + "learning_rate": 0.00037636733153664396, + "loss": 0.82577276, + "num_input_tokens_seen": 256844848, + "router_z_loss_mlp": 0.34716797, + "step": 3078, + "time_per_iteration": 2.854278802871704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075513, + "balance_loss_mlp": 1.04144311, + "epoch": 0.5923432089265102, + "flos": 563008347648.0, + "grad_norm": 0.061835614307010005, + "language_loss": 0.79824865, + "learning_rate": 0.0003760654873173124, + "loss": 0.80900383, + "num_input_tokens_seen": 256916688, + "router_z_loss_mlp": 0.34082031, + "step": 3079, + "time_per_iteration": 2.66091251373291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079327, + "balance_loss_mlp": 1.04387426, + "epoch": 0.5925355906117737, + "flos": 495488406528.0, + "grad_norm": 0.052514491856325576, + "language_loss": 0.81763887, + "learning_rate": 0.00037576369121362566, + "loss": 0.8284322, + "num_input_tokens_seen": 256985520, + "router_z_loss_mlp": 0.35498047, + "step": 3080, + "time_per_iteration": 2.5847787857055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079309, + "balance_loss_mlp": 1.04473865, + "epoch": 0.5927279722970373, + "flos": 565940072448.0, + "grad_norm": 0.05276703199883553, + "language_loss": 0.81885982, + "learning_rate": 0.0003754619433427516, + "loss": 0.82965291, + "num_input_tokens_seen": 257067552, + "router_z_loss_mlp": 0.34570312, + "step": 3081, + "time_per_iteration": 2.898594856262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108187, + "balance_loss_mlp": 1.04682267, + "epoch": 0.5929203539823009, + "flos": 666674413056.0, + "grad_norm": 0.06717854488830324, + "language_loss": 0.77682364, + "learning_rate": 0.0003751602438218392, + "loss": 0.78764236, + "num_input_tokens_seen": 257138896, + "router_z_loss_mlp": 0.35083008, + "step": 3082, + "time_per_iteration": 2.7553367614746094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083555, + "balance_loss_mlp": 1.0486505, + "epoch": 0.5931127356675644, + "flos": 555484635648.0, + "grad_norm": 0.05625551140275949, + "language_loss": 0.83254004, + "learning_rate": 0.0003748585927680186, + "loss": 0.84337556, + "num_input_tokens_seen": 257210592, + "router_z_loss_mlp": 0.34912109, + "step": 3083, + "time_per_iteration": 2.6493966579437256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089532, + "balance_loss_mlp": 1.0530777, + "epoch": 0.593305117352828, + "flos": 534932992512.0, + "grad_norm": 0.07512877248395429, + "language_loss": 0.82828176, + "learning_rate": 0.00037455699029840086, + "loss": 0.83917707, + "num_input_tokens_seen": 257276208, + "router_z_loss_mlp": 0.36450195, + "step": 3084, + "time_per_iteration": 2.674532890319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079674, + "balance_loss_mlp": 1.04488921, + "epoch": 0.5934974990380916, + "flos": 593683748352.0, + "grad_norm": 0.05984158390569505, + "language_loss": 0.84177965, + "learning_rate": 0.0003742554365300787, + "loss": 0.85257638, + "num_input_tokens_seen": 257351920, + "router_z_loss_mlp": 0.34838867, + "step": 3085, + "time_per_iteration": 2.712371587753296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085917, + "balance_loss_mlp": 1.05044067, + "epoch": 0.5936898807233552, + "flos": 712339011072.0, + "grad_norm": 0.05068184961629974, + "language_loss": 0.78978491, + "learning_rate": 0.0003739539315801255, + "loss": 0.80064404, + "num_input_tokens_seen": 257430016, + "router_z_loss_mlp": 0.35473633, + "step": 3086, + "time_per_iteration": 2.916006565093994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088952, + "balance_loss_mlp": 1.05345142, + "epoch": 0.5938822624086187, + "flos": 391684128768.0, + "grad_norm": 0.05263578767135529, + "language_loss": 0.9165324, + "learning_rate": 0.000373652475565596, + "loss": 0.92742193, + "num_input_tokens_seen": 257492224, + "router_z_loss_mlp": 0.35522461, + "step": 3087, + "time_per_iteration": 2.470960855484009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094024, + "balance_loss_mlp": 1.05900025, + "epoch": 0.5940746440938822, + "flos": 480023373312.0, + "grad_norm": 0.060850763929597464, + "language_loss": 0.81550741, + "learning_rate": 0.00037335106860352587, + "loss": 0.82644761, + "num_input_tokens_seen": 257567824, + "router_z_loss_mlp": 0.35083008, + "step": 3088, + "time_per_iteration": 2.666472911834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097243, + "balance_loss_mlp": 1.06100357, + "epoch": 0.5942670257791458, + "flos": 483094872576.0, + "grad_norm": 0.049324641114684424, + "language_loss": 0.83196813, + "learning_rate": 0.00037304971081093146, + "loss": 0.84294057, + "num_input_tokens_seen": 257635488, + "router_z_loss_mlp": 0.36230469, + "step": 3089, + "time_per_iteration": 2.521000862121582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093812, + "balance_loss_mlp": 1.05967069, + "epoch": 0.5944594074644094, + "flos": 547656795648.0, + "grad_norm": 0.0533670066305608, + "language_loss": 0.81061506, + "learning_rate": 0.00037274840230481024, + "loss": 0.82155317, + "num_input_tokens_seen": 257709552, + "router_z_loss_mlp": 0.34179688, + "step": 3090, + "time_per_iteration": 2.7134556770324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092625, + "balance_loss_mlp": 1.05700517, + "epoch": 0.594651789149673, + "flos": 448943510016.0, + "grad_norm": 0.055393993008082114, + "language_loss": 0.78753984, + "learning_rate": 0.00037244714320214077, + "loss": 0.79846609, + "num_input_tokens_seen": 257775520, + "router_z_loss_mlp": 0.35620117, + "step": 3091, + "time_per_iteration": 2.5576789379119873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092048, + "balance_loss_mlp": 1.05640459, + "epoch": 0.5948441708349365, + "flos": 595969491456.0, + "grad_norm": 0.050698130573270175, + "language_loss": 0.83444929, + "learning_rate": 0.000372145933619882, + "loss": 0.84536982, + "num_input_tokens_seen": 257858560, + "router_z_loss_mlp": 0.35668945, + "step": 3092, + "time_per_iteration": 2.8742141723632812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091606, + "balance_loss_mlp": 1.05636811, + "epoch": 0.5950365525202, + "flos": 548251905024.0, + "grad_norm": 0.05419961551348069, + "language_loss": 0.82168603, + "learning_rate": 0.000371844773674974, + "loss": 0.83260214, + "num_input_tokens_seen": 257928048, + "router_z_loss_mlp": 0.3527832, + "step": 3093, + "time_per_iteration": 2.6228530406951904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094358, + "balance_loss_mlp": 1.05890489, + "epoch": 0.5952289342054636, + "flos": 654385595904.0, + "grad_norm": 0.05844341434318606, + "language_loss": 0.81673229, + "learning_rate": 0.0003715436634843375, + "loss": 0.82767594, + "num_input_tokens_seen": 258003088, + "router_z_loss_mlp": 0.35498047, + "step": 3094, + "time_per_iteration": 2.8496577739715576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084873, + "balance_loss_mlp": 1.04951525, + "epoch": 0.5954213158907272, + "flos": 603055245312.0, + "grad_norm": 0.0455107572696148, + "language_loss": 0.80728281, + "learning_rate": 0.00037124260316487355, + "loss": 0.81813157, + "num_input_tokens_seen": 258084880, + "router_z_loss_mlp": 0.35375977, + "step": 3095, + "time_per_iteration": 2.83181095123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084995, + "balance_loss_mlp": 1.05044806, + "epoch": 0.5956136975759908, + "flos": 486097970688.0, + "grad_norm": 0.0493360128544523, + "language_loss": 0.89028478, + "learning_rate": 0.0003709415928334643, + "loss": 0.90113473, + "num_input_tokens_seen": 258152032, + "router_z_loss_mlp": 0.34570312, + "step": 3096, + "time_per_iteration": 2.5334527492523193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081969, + "balance_loss_mlp": 1.0465641, + "epoch": 0.5958060792612543, + "flos": 658462459392.0, + "grad_norm": 0.05334894182240255, + "language_loss": 0.80644953, + "learning_rate": 0.00037064063260697233, + "loss": 0.81726921, + "num_input_tokens_seen": 258228896, + "router_z_loss_mlp": 0.35424805, + "step": 3097, + "time_per_iteration": 2.868948221206665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085643, + "balance_loss_mlp": 1.05004668, + "epoch": 0.5959984609465179, + "flos": 723201882624.0, + "grad_norm": 0.05441892470065276, + "language_loss": 0.78413296, + "learning_rate": 0.0003703397226022407, + "loss": 0.79498935, + "num_input_tokens_seen": 258311152, + "router_z_loss_mlp": 0.35595703, + "step": 3098, + "time_per_iteration": 3.0486435890197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054277, + "balance_loss_mlp": 1.04254675, + "epoch": 0.5961908426317815, + "flos": 1519010164224.0, + "grad_norm": 0.031936086773479797, + "language_loss": 0.75499874, + "learning_rate": 0.00037003886293609335, + "loss": 0.76554149, + "num_input_tokens_seen": 258540656, + "router_z_loss_mlp": 0.1171875, + "step": 3099, + "time_per_iteration": 4.9141762256622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082532, + "balance_loss_mlp": 1.04822397, + "epoch": 0.596383224317045, + "flos": 532357678080.0, + "grad_norm": 0.04537931846822051, + "language_loss": 0.83096731, + "learning_rate": 0.0003697380537253339, + "loss": 0.84179258, + "num_input_tokens_seen": 258608960, + "router_z_loss_mlp": 0.34350586, + "step": 3100, + "time_per_iteration": 2.6156232357025146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082884, + "balance_loss_mlp": 1.04766929, + "epoch": 0.5965756060023086, + "flos": 590922169344.0, + "grad_norm": 0.060003355935897486, + "language_loss": 0.81679451, + "learning_rate": 0.0003694372950867471, + "loss": 0.82762337, + "num_input_tokens_seen": 258684304, + "router_z_loss_mlp": 0.3527832, + "step": 3101, + "time_per_iteration": 2.746100902557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084672, + "balance_loss_mlp": 1.04967189, + "epoch": 0.5967679876875721, + "flos": 861701760000.0, + "grad_norm": 0.05796500812003716, + "language_loss": 0.77373374, + "learning_rate": 0.0003691365871370976, + "loss": 0.78458047, + "num_input_tokens_seen": 258769472, + "router_z_loss_mlp": 0.3503418, + "step": 3102, + "time_per_iteration": 3.0448250770568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082291, + "balance_loss_mlp": 1.04710054, + "epoch": 0.5969603693728357, + "flos": 553574241792.0, + "grad_norm": 0.05791620467430745, + "language_loss": 0.854276, + "learning_rate": 0.00036883592999313093, + "loss": 0.86509889, + "num_input_tokens_seen": 258841696, + "router_z_loss_mlp": 0.35229492, + "step": 3103, + "time_per_iteration": 2.650810718536377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082187, + "balance_loss_mlp": 1.04666269, + "epoch": 0.5971527510580993, + "flos": 718345207296.0, + "grad_norm": 0.05277795957282848, + "language_loss": 0.79037023, + "learning_rate": 0.0003685353237715722, + "loss": 0.80119205, + "num_input_tokens_seen": 258915616, + "router_z_loss_mlp": 0.35546875, + "step": 3104, + "time_per_iteration": 2.87162184715271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082645, + "balance_loss_mlp": 1.04812241, + "epoch": 0.5973451327433629, + "flos": 647324573184.0, + "grad_norm": 0.05039525348103138, + "language_loss": 0.81437027, + "learning_rate": 0.0003682347685891274, + "loss": 0.82519674, + "num_input_tokens_seen": 258994080, + "router_z_loss_mlp": 0.34570312, + "step": 3105, + "time_per_iteration": 2.844632863998413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078322, + "balance_loss_mlp": 1.04284513, + "epoch": 0.5975375144286263, + "flos": 721374446592.0, + "grad_norm": 0.053848168408106474, + "language_loss": 0.80436707, + "learning_rate": 0.0003679342645624822, + "loss": 0.81515038, + "num_input_tokens_seen": 259075968, + "router_z_loss_mlp": 0.35498047, + "step": 3106, + "time_per_iteration": 2.961121082305908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079962, + "balance_loss_mlp": 1.04374671, + "epoch": 0.5977298961138899, + "flos": 750616699392.0, + "grad_norm": 0.04889819009677852, + "language_loss": 0.8164891, + "learning_rate": 0.0003676338118083025, + "loss": 0.82728875, + "num_input_tokens_seen": 259162512, + "router_z_loss_mlp": 0.36230469, + "step": 3107, + "time_per_iteration": 2.997671127319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076541, + "balance_loss_mlp": 1.04161251, + "epoch": 0.5979222777991535, + "flos": 530703360000.0, + "grad_norm": 0.05034919609110883, + "language_loss": 0.79592144, + "learning_rate": 0.0003673334104432347, + "loss": 0.80668688, + "num_input_tokens_seen": 259228752, + "router_z_loss_mlp": 0.34960938, + "step": 3108, + "time_per_iteration": 2.5946898460388184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079709, + "balance_loss_mlp": 1.04461432, + "epoch": 0.5981146594844171, + "flos": 621459357696.0, + "grad_norm": 0.04952863942356172, + "language_loss": 0.83337331, + "learning_rate": 0.0003670330605839048, + "loss": 0.84417045, + "num_input_tokens_seen": 259303440, + "router_z_loss_mlp": 0.35131836, + "step": 3109, + "time_per_iteration": 2.7955031394958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080275, + "balance_loss_mlp": 1.04470301, + "epoch": 0.5983070411696807, + "flos": 603309911040.0, + "grad_norm": 0.05233505638894281, + "language_loss": 0.76384044, + "learning_rate": 0.0003667327623469191, + "loss": 0.77464318, + "num_input_tokens_seen": 259378752, + "router_z_loss_mlp": 0.35571289, + "step": 3110, + "time_per_iteration": 2.7939095497131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080046, + "balance_loss_mlp": 1.04516506, + "epoch": 0.5984994228549442, + "flos": 633187971072.0, + "grad_norm": 0.05191698416970628, + "language_loss": 0.7765972, + "learning_rate": 0.00036643251584886333, + "loss": 0.78739762, + "num_input_tokens_seen": 259454336, + "router_z_loss_mlp": 0.34912109, + "step": 3111, + "time_per_iteration": 2.821956157684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076682, + "balance_loss_mlp": 1.0426122, + "epoch": 0.5986918045402078, + "flos": 525026022912.0, + "grad_norm": 0.05255438672232182, + "language_loss": 0.81679058, + "learning_rate": 0.00036613232120630393, + "loss": 0.82755744, + "num_input_tokens_seen": 259518960, + "router_z_loss_mlp": 0.34106445, + "step": 3112, + "time_per_iteration": 2.61639142036438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072562, + "balance_loss_mlp": 1.03751469, + "epoch": 0.5988841862254713, + "flos": 482942103552.0, + "grad_norm": 0.06309856820969045, + "language_loss": 0.8010537, + "learning_rate": 0.00036583217853578643, + "loss": 0.81177926, + "num_input_tokens_seen": 259584352, + "router_z_loss_mlp": 0.35083008, + "step": 3113, + "time_per_iteration": 2.544152021408081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076935, + "balance_loss_mlp": 1.04241252, + "epoch": 0.5990765679107349, + "flos": 1139658674688.0, + "grad_norm": 0.05746596179478014, + "language_loss": 0.7739538, + "learning_rate": 0.000365532087953837, + "loss": 0.78472316, + "num_input_tokens_seen": 259693152, + "router_z_loss_mlp": 0.34545898, + "step": 3114, + "time_per_iteration": 3.6210074424743652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074738, + "balance_loss_mlp": 1.04104948, + "epoch": 0.5992689495959984, + "flos": 516729701376.0, + "grad_norm": 0.0590793434639382, + "language_loss": 0.89283043, + "learning_rate": 0.00036523204957696065, + "loss": 0.9035778, + "num_input_tokens_seen": 259762048, + "router_z_loss_mlp": 0.3371582, + "step": 3115, + "time_per_iteration": 2.5835559368133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079472, + "balance_loss_mlp": 1.0447346, + "epoch": 0.599461331281262, + "flos": 744288998400.0, + "grad_norm": 0.05148674480480004, + "language_loss": 0.80590332, + "learning_rate": 0.00036493206352164324, + "loss": 0.81669807, + "num_input_tokens_seen": 259843184, + "router_z_loss_mlp": 0.34790039, + "step": 3116, + "time_per_iteration": 2.9135849475860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073646, + "balance_loss_mlp": 1.03960013, + "epoch": 0.5996537129665256, + "flos": 592078892544.0, + "grad_norm": 0.05828379622393402, + "language_loss": 0.85252976, + "learning_rate": 0.000364632129904349, + "loss": 0.86326623, + "num_input_tokens_seen": 259912720, + "router_z_loss_mlp": 0.34082031, + "step": 3117, + "time_per_iteration": 2.7019104957580566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107531, + "balance_loss_mlp": 1.03997648, + "epoch": 0.5998460946517892, + "flos": 558735045120.0, + "grad_norm": 0.05080253376139345, + "language_loss": 0.77507442, + "learning_rate": 0.00036433224884152283, + "loss": 0.78582752, + "num_input_tokens_seen": 259985472, + "router_z_loss_mlp": 0.35375977, + "step": 3118, + "time_per_iteration": 2.698032855987549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082073, + "balance_loss_mlp": 1.04814649, + "epoch": 0.6000384763370528, + "flos": 484325789184.0, + "grad_norm": 0.058104830427354655, + "language_loss": 0.77595496, + "learning_rate": 0.00036403242044958875, + "loss": 0.78677565, + "num_input_tokens_seen": 260050336, + "router_z_loss_mlp": 0.33959961, + "step": 3119, + "time_per_iteration": 2.5694661140441895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082708, + "balance_loss_mlp": 1.04763699, + "epoch": 0.6002308580223162, + "flos": 596490407424.0, + "grad_norm": 0.05350136271967441, + "language_loss": 0.91317761, + "learning_rate": 0.0003637326448449507, + "loss": 0.92400473, + "num_input_tokens_seen": 260120304, + "router_z_loss_mlp": 0.35083008, + "step": 3120, + "time_per_iteration": 2.7095799446105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083063, + "balance_loss_mlp": 1.04808724, + "epoch": 0.6004232397075798, + "flos": 544879249920.0, + "grad_norm": 0.044412764387293725, + "language_loss": 0.86037177, + "learning_rate": 0.00036343292214399177, + "loss": 0.87120235, + "num_input_tokens_seen": 260198304, + "router_z_loss_mlp": 0.34985352, + "step": 3121, + "time_per_iteration": 2.760568618774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074765, + "balance_loss_mlp": 1.04112399, + "epoch": 0.6006156213928434, + "flos": 629647990272.0, + "grad_norm": 0.05788035172914192, + "language_loss": 0.770136, + "learning_rate": 0.00036313325246307456, + "loss": 0.78088361, + "num_input_tokens_seen": 260277664, + "router_z_loss_mlp": 0.33666992, + "step": 3122, + "time_per_iteration": 2.7645843029022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081915, + "balance_loss_mlp": 1.0479641, + "epoch": 0.600808003078107, + "flos": 582043885056.0, + "grad_norm": 0.05339440368403648, + "language_loss": 0.8713336, + "learning_rate": 0.0003628336359185411, + "loss": 0.8821528, + "num_input_tokens_seen": 260350096, + "router_z_loss_mlp": 0.33984375, + "step": 3123, + "time_per_iteration": 2.704559803009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084761, + "balance_loss_mlp": 1.04961848, + "epoch": 0.6010003847633705, + "flos": 634984883712.0, + "grad_norm": 0.051464767664237604, + "language_loss": 0.7543686, + "learning_rate": 0.000362534072626713, + "loss": 0.76521623, + "num_input_tokens_seen": 260421888, + "router_z_loss_mlp": 0.35180664, + "step": 3124, + "time_per_iteration": 2.767263174057007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082659, + "balance_loss_mlp": 1.04837453, + "epoch": 0.6011927664486341, + "flos": 718448514048.0, + "grad_norm": 0.05118450522862765, + "language_loss": 0.80810112, + "learning_rate": 0.00036223456270389093, + "loss": 0.81892776, + "num_input_tokens_seen": 260499616, + "router_z_loss_mlp": 0.34326172, + "step": 3125, + "time_per_iteration": 2.972226858139038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077124, + "balance_loss_mlp": 1.04272032, + "epoch": 0.6013851481338977, + "flos": 498782486016.0, + "grad_norm": 0.0486392008074567, + "language_loss": 0.81048089, + "learning_rate": 0.00036193510626635517, + "loss": 0.82125211, + "num_input_tokens_seen": 260572048, + "router_z_loss_mlp": 0.34423828, + "step": 3126, + "time_per_iteration": 2.6381988525390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080205, + "balance_loss_mlp": 1.04620612, + "epoch": 0.6015775298191612, + "flos": 749266509312.0, + "grad_norm": 0.057928922724073975, + "language_loss": 0.81419915, + "learning_rate": 0.0003616357034303649, + "loss": 0.82500118, + "num_input_tokens_seen": 260644720, + "router_z_loss_mlp": 0.34033203, + "step": 3127, + "time_per_iteration": 2.910590887069702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077974, + "balance_loss_mlp": 1.04380846, + "epoch": 0.6017699115044248, + "flos": 592764162048.0, + "grad_norm": 0.06444067726606947, + "language_loss": 0.7886622, + "learning_rate": 0.0003613363543121584, + "loss": 0.79944193, + "num_input_tokens_seen": 260724864, + "router_z_loss_mlp": 0.34204102, + "step": 3128, + "time_per_iteration": 2.8243367671966553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081251, + "balance_loss_mlp": 1.04627466, + "epoch": 0.6019622931896883, + "flos": 514839656448.0, + "grad_norm": 0.05655060163799935, + "language_loss": 0.8488009, + "learning_rate": 0.00036103705902795357, + "loss": 0.85961336, + "num_input_tokens_seen": 260800896, + "router_z_loss_mlp": 0.35009766, + "step": 3129, + "time_per_iteration": 2.691652297973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078555, + "balance_loss_mlp": 1.0440799, + "epoch": 0.6021546748749519, + "flos": 490219914240.0, + "grad_norm": 0.11187816626328603, + "language_loss": 0.79397345, + "learning_rate": 0.0003607378176939471, + "loss": 0.80475903, + "num_input_tokens_seen": 260872736, + "router_z_loss_mlp": 0.3449707, + "step": 3130, + "time_per_iteration": 2.59126353263855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080272, + "balance_loss_mlp": 1.0459156, + "epoch": 0.6023470565602155, + "flos": 540763098624.0, + "grad_norm": 0.584663234761047, + "language_loss": 0.81865788, + "learning_rate": 0.00036043863042631465, + "loss": 0.82946062, + "num_input_tokens_seen": 260943264, + "router_z_loss_mlp": 0.34399414, + "step": 3131, + "time_per_iteration": 2.631758689880371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082979, + "balance_loss_mlp": 1.04716837, + "epoch": 0.6025394382454791, + "flos": 844660984320.0, + "grad_norm": 0.054894708667503185, + "language_loss": 0.76558393, + "learning_rate": 0.00036013949734121133, + "loss": 0.77641368, + "num_input_tokens_seen": 261030064, + "router_z_loss_mlp": 0.3581543, + "step": 3132, + "time_per_iteration": 3.091432809829712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077376, + "balance_loss_mlp": 1.04249549, + "epoch": 0.6027318199307425, + "flos": 576903430656.0, + "grad_norm": 0.05648602970445555, + "language_loss": 0.82430494, + "learning_rate": 0.00035984041855477043, + "loss": 0.83507866, + "num_input_tokens_seen": 261106496, + "router_z_loss_mlp": 0.34912109, + "step": 3133, + "time_per_iteration": 2.707841396331787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045948, + "balance_loss_mlp": 1.03345478, + "epoch": 0.6029242016160061, + "flos": 1470160585728.0, + "grad_norm": 0.017118275971869903, + "language_loss": 0.78709894, + "learning_rate": 0.00035954139418310495, + "loss": 0.79755843, + "num_input_tokens_seen": 261343248, + "router_z_loss_mlp": 0.125, + "step": 3134, + "time_per_iteration": 4.929067373275757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077811, + "balance_loss_mlp": 1.0416429, + "epoch": 0.6031165833012697, + "flos": 480486062592.0, + "grad_norm": 0.057341971523643794, + "language_loss": 0.79656577, + "learning_rate": 0.00035924242434230637, + "loss": 0.80734384, + "num_input_tokens_seen": 261416704, + "router_z_loss_mlp": 0.36181641, + "step": 3135, + "time_per_iteration": 2.6362884044647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078775, + "balance_loss_mlp": 1.04294014, + "epoch": 0.6033089649865333, + "flos": 499220444160.0, + "grad_norm": 0.48805573037273664, + "language_loss": 0.78477532, + "learning_rate": 0.00035894350914844516, + "loss": 0.79556304, + "num_input_tokens_seen": 261486688, + "router_z_loss_mlp": 0.35864258, + "step": 3136, + "time_per_iteration": 2.5889768600463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095737, + "balance_loss_mlp": 1.05961668, + "epoch": 0.6035013466717969, + "flos": 556337230848.0, + "grad_norm": 0.06198645185938339, + "language_loss": 0.828888, + "learning_rate": 0.0003586446487175703, + "loss": 0.83984536, + "num_input_tokens_seen": 261557344, + "router_z_loss_mlp": 0.36132812, + "step": 3137, + "time_per_iteration": 2.6805853843688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105357, + "balance_loss_mlp": 1.06690025, + "epoch": 0.6036937283570604, + "flos": 594536343552.0, + "grad_norm": 0.04857529981882101, + "language_loss": 0.85242814, + "learning_rate": 0.0003583458431657099, + "loss": 0.86348164, + "num_input_tokens_seen": 261626240, + "router_z_loss_mlp": 0.3840332, + "step": 3138, + "time_per_iteration": 2.8694372177124023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107329, + "balance_loss_mlp": 1.0691824, + "epoch": 0.603886110042324, + "flos": 540684523008.0, + "grad_norm": 0.0686265379907432, + "language_loss": 0.82493383, + "learning_rate": 0.00035804709260887056, + "loss": 0.83600712, + "num_input_tokens_seen": 261696368, + "router_z_loss_mlp": 0.38110352, + "step": 3139, + "time_per_iteration": 2.6613197326660156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111658, + "balance_loss_mlp": 1.07664514, + "epoch": 0.6040784917275875, + "flos": 518315618304.0, + "grad_norm": 0.04727969625034485, + "language_loss": 0.89413351, + "learning_rate": 0.0003577483971630373, + "loss": 0.90529931, + "num_input_tokens_seen": 261769104, + "router_z_loss_mlp": 0.39916992, + "step": 3140, + "time_per_iteration": 2.6468544006347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112858, + "balance_loss_mlp": 1.08752418, + "epoch": 0.6042708734128511, + "flos": 660436872192.0, + "grad_norm": 0.0491739702694389, + "language_loss": 0.84699506, + "learning_rate": 0.00035744975694417414, + "loss": 0.8582809, + "num_input_tokens_seen": 261844880, + "router_z_loss_mlp": 0.41064453, + "step": 3141, + "time_per_iteration": 2.8567256927490234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128353, + "balance_loss_mlp": 1.0867728, + "epoch": 0.6044632550981146, + "flos": 572035018752.0, + "grad_norm": 0.05704066286420323, + "language_loss": 0.82333231, + "learning_rate": 0.00035715117206822344, + "loss": 0.83461583, + "num_input_tokens_seen": 261923280, + "router_z_loss_mlp": 0.41577148, + "step": 3142, + "time_per_iteration": 2.7504515647888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141414, + "balance_loss_mlp": 1.09892821, + "epoch": 0.6046556367833782, + "flos": 546420086784.0, + "grad_norm": 0.06612582666460322, + "language_loss": 0.80943495, + "learning_rate": 0.0003568526426511065, + "loss": 0.82084912, + "num_input_tokens_seen": 261990832, + "router_z_loss_mlp": 0.42456055, + "step": 3143, + "time_per_iteration": 2.6085774898529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140491, + "balance_loss_mlp": 1.09817219, + "epoch": 0.6048480184686418, + "flos": 776505235968.0, + "grad_norm": 0.064383973380027, + "language_loss": 0.82750165, + "learning_rate": 0.000356554168808722, + "loss": 0.83890665, + "num_input_tokens_seen": 262063760, + "router_z_loss_mlp": 0.42358398, + "step": 3144, + "time_per_iteration": 2.9655168056488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140449, + "balance_loss_mlp": 1.09834385, + "epoch": 0.6050404001539054, + "flos": 656837254656.0, + "grad_norm": 0.05900200764303625, + "language_loss": 0.85025299, + "learning_rate": 0.00035625575065694837, + "loss": 0.8616575, + "num_input_tokens_seen": 262137968, + "router_z_loss_mlp": 0.42114258, + "step": 3145, + "time_per_iteration": 2.826193332672119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134203, + "balance_loss_mlp": 1.09159803, + "epoch": 0.605232781839169, + "flos": 548710212096.0, + "grad_norm": 0.05530707742448767, + "language_loss": 0.77449524, + "learning_rate": 0.0003559573883116415, + "loss": 0.78583729, + "num_input_tokens_seen": 262211264, + "router_z_loss_mlp": 0.42626953, + "step": 3146, + "time_per_iteration": 2.6936702728271484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114118, + "balance_loss_mlp": 1.0976212, + "epoch": 0.6054251635244324, + "flos": 605093677056.0, + "grad_norm": 0.08058095897808437, + "language_loss": 0.85587645, + "learning_rate": 0.00035565908188863604, + "loss": 0.86728823, + "num_input_tokens_seen": 262289648, + "router_z_loss_mlp": 0.43579102, + "step": 3147, + "time_per_iteration": 2.8229072093963623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113968, + "balance_loss_mlp": 1.09488153, + "epoch": 0.605617545209696, + "flos": 613398763008.0, + "grad_norm": 0.05127524075744011, + "language_loss": 0.79730809, + "learning_rate": 0.00035536083150374464, + "loss": 0.80870491, + "num_input_tokens_seen": 262362704, + "router_z_loss_mlp": 0.44799805, + "step": 3148, + "time_per_iteration": 2.782287836074829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139029, + "balance_loss_mlp": 1.12310266, + "epoch": 0.6058099268949596, + "flos": 1497477888000.0, + "grad_norm": 0.03498965475006418, + "language_loss": 0.74747956, + "learning_rate": 0.00035506263727275893, + "loss": 0.75886977, + "num_input_tokens_seen": 262596864, + "router_z_loss_mlp": 0.15917969, + "step": 3149, + "time_per_iteration": 4.813022613525391 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128616, + "balance_loss_mlp": 1.08696485, + "epoch": 0.6060023085802232, + "flos": 670170723840.0, + "grad_norm": 0.053702261720826414, + "language_loss": 0.85731369, + "learning_rate": 0.0003547644993114475, + "loss": 0.86859989, + "num_input_tokens_seen": 262671088, + "router_z_loss_mlp": 0.41650391, + "step": 3150, + "time_per_iteration": 2.7940874099731445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118224, + "balance_loss_mlp": 1.07688236, + "epoch": 0.6061946902654868, + "flos": 605885225472.0, + "grad_norm": 0.05286284770127293, + "language_loss": 0.79495448, + "learning_rate": 0.00035446641773555806, + "loss": 0.80613673, + "num_input_tokens_seen": 262743888, + "router_z_loss_mlp": 0.41357422, + "step": 3151, + "time_per_iteration": 2.7147328853607178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116917, + "balance_loss_mlp": 1.07567072, + "epoch": 0.6063870719507503, + "flos": 557568147456.0, + "grad_norm": 0.052762165498596546, + "language_loss": 0.86798322, + "learning_rate": 0.000354168392660816, + "loss": 0.87915242, + "num_input_tokens_seen": 262819616, + "router_z_loss_mlp": 0.41235352, + "step": 3152, + "time_per_iteration": 2.7346954345703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115838, + "balance_loss_mlp": 1.07583165, + "epoch": 0.6065794536360138, + "flos": 556874113536.0, + "grad_norm": 0.05405599690586098, + "language_loss": 0.82799989, + "learning_rate": 0.0003538704242029252, + "loss": 0.8391583, + "num_input_tokens_seen": 262893984, + "router_z_loss_mlp": 0.39990234, + "step": 3153, + "time_per_iteration": 2.705004930496216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112343, + "balance_loss_mlp": 1.07169282, + "epoch": 0.6067718353212774, + "flos": 689836276224.0, + "grad_norm": 0.05919499383434511, + "language_loss": 0.77963281, + "learning_rate": 0.0003535725124775672, + "loss": 0.79075623, + "num_input_tokens_seen": 262969648, + "router_z_loss_mlp": 0.40649414, + "step": 3154, + "time_per_iteration": 2.8201727867126465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110719, + "balance_loss_mlp": 1.07147574, + "epoch": 0.606964217006541, + "flos": 521531122176.0, + "grad_norm": 0.06643297661580516, + "language_loss": 0.86598241, + "learning_rate": 0.00035327465760040126, + "loss": 0.87708956, + "num_input_tokens_seen": 263042048, + "router_z_loss_mlp": 0.39233398, + "step": 3155, + "time_per_iteration": 2.6584889888763428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100039, + "balance_loss_mlp": 1.06201148, + "epoch": 0.6071565986918045, + "flos": 641267504640.0, + "grad_norm": 0.0597836437175205, + "language_loss": 0.84776556, + "learning_rate": 0.00035297685968706526, + "loss": 0.85876596, + "num_input_tokens_seen": 263108032, + "router_z_loss_mlp": 0.37988281, + "step": 3156, + "time_per_iteration": 2.752196788787842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109676, + "balance_loss_mlp": 1.05708754, + "epoch": 0.6073489803770681, + "flos": 560315169792.0, + "grad_norm": 0.05609890059594196, + "language_loss": 0.8300876, + "learning_rate": 0.00035267911885317454, + "loss": 0.84105527, + "num_input_tokens_seen": 263175184, + "router_z_loss_mlp": 0.39672852, + "step": 3157, + "time_per_iteration": 2.629136562347412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109981, + "balance_loss_mlp": 1.06121039, + "epoch": 0.6075413620623317, + "flos": 585810828288.0, + "grad_norm": 0.05476186910904592, + "language_loss": 0.81797791, + "learning_rate": 0.0003523814352143222, + "loss": 0.82897604, + "num_input_tokens_seen": 263252768, + "router_z_loss_mlp": 0.38598633, + "step": 3158, + "time_per_iteration": 2.8239855766296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087443, + "balance_loss_mlp": 1.04953456, + "epoch": 0.6077337437475953, + "flos": 630523906560.0, + "grad_norm": 0.060962114442721135, + "language_loss": 0.90981984, + "learning_rate": 0.00035208380888607937, + "loss": 0.92069423, + "num_input_tokens_seen": 263328720, + "router_z_loss_mlp": 0.37866211, + "step": 3159, + "time_per_iteration": 2.754648208618164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068868, + "balance_loss_mlp": 1.05542111, + "epoch": 0.6079261254328588, + "flos": 1467726455808.0, + "grad_norm": 0.024644792756990472, + "language_loss": 0.79461986, + "learning_rate": 0.000351786239983995, + "loss": 0.80530852, + "num_input_tokens_seen": 263554656, + "router_z_loss_mlp": 0.13476562, + "step": 3160, + "time_per_iteration": 4.849771022796631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066394, + "balance_loss_mlp": 1.05323327, + "epoch": 0.6081185071181223, + "flos": 1522233022464.0, + "grad_norm": 0.022600356712689354, + "language_loss": 0.7569223, + "learning_rate": 0.00035148872862359517, + "loss": 0.76758623, + "num_input_tokens_seen": 263791600, + "router_z_loss_mlp": 0.13183594, + "step": 3161, + "time_per_iteration": 5.017123699188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081969, + "balance_loss_mlp": 1.04530025, + "epoch": 0.6083108888033859, + "flos": 556041867264.0, + "grad_norm": 0.07058889288065262, + "language_loss": 0.81635529, + "learning_rate": 0.00035119127492038446, + "loss": 0.82717502, + "num_input_tokens_seen": 263869744, + "router_z_loss_mlp": 0.3671875, + "step": 3162, + "time_per_iteration": 2.7839951515197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083576, + "balance_loss_mlp": 1.0463115, + "epoch": 0.6085032704886495, + "flos": 840819847680.0, + "grad_norm": 0.052086088834636966, + "language_loss": 0.82480276, + "learning_rate": 0.00035089387898984436, + "loss": 0.83563852, + "num_input_tokens_seen": 263946624, + "router_z_loss_mlp": 0.37207031, + "step": 3163, + "time_per_iteration": 3.0475828647613525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079298, + "balance_loss_mlp": 1.04267716, + "epoch": 0.6086956521739131, + "flos": 684493590528.0, + "grad_norm": 0.05636679470966986, + "language_loss": 0.81840444, + "learning_rate": 0.0003505965409474343, + "loss": 0.82919747, + "num_input_tokens_seen": 264022064, + "router_z_loss_mlp": 0.36621094, + "step": 3164, + "time_per_iteration": 2.8719167709350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081512, + "balance_loss_mlp": 1.04453373, + "epoch": 0.6088880338591766, + "flos": 535533894144.0, + "grad_norm": 0.05767475367988954, + "language_loss": 0.86591709, + "learning_rate": 0.0003502992609085913, + "loss": 0.87673223, + "num_input_tokens_seen": 264089520, + "router_z_loss_mlp": 0.36962891, + "step": 3165, + "time_per_iteration": 2.6596477031707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076933, + "balance_loss_mlp": 1.04007339, + "epoch": 0.6090804155444401, + "flos": 731197048320.0, + "grad_norm": 0.05479022562545965, + "language_loss": 0.82799208, + "learning_rate": 0.00035000203898872954, + "loss": 0.83876145, + "num_input_tokens_seen": 264173056, + "router_z_loss_mlp": 0.3684082, + "step": 3166, + "time_per_iteration": 2.985320568084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076201, + "balance_loss_mlp": 1.03845954, + "epoch": 0.6092727972297037, + "flos": 698708768256.0, + "grad_norm": 0.05187712745412687, + "language_loss": 0.84401566, + "learning_rate": 0.0003497048753032406, + "loss": 0.85477769, + "num_input_tokens_seen": 264250912, + "router_z_loss_mlp": 0.37695312, + "step": 3167, + "time_per_iteration": 2.876997470855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079403, + "balance_loss_mlp": 1.04213786, + "epoch": 0.6094651789149673, + "flos": 1051515869184.0, + "grad_norm": 0.16368682108793797, + "language_loss": 0.81000876, + "learning_rate": 0.000349407769967494, + "loss": 0.82080269, + "num_input_tokens_seen": 264342800, + "router_z_loss_mlp": 0.37255859, + "step": 3168, + "time_per_iteration": 3.376215696334839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074675, + "balance_loss_mlp": 1.03721976, + "epoch": 0.6096575606002309, + "flos": 502834618368.0, + "grad_norm": 0.047663268241493265, + "language_loss": 0.84680313, + "learning_rate": 0.0003491107230968361, + "loss": 0.85754991, + "num_input_tokens_seen": 264413664, + "router_z_loss_mlp": 0.37475586, + "step": 3169, + "time_per_iteration": 2.660513401031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076232, + "balance_loss_mlp": 1.03872895, + "epoch": 0.6098499422854944, + "flos": 585339374592.0, + "grad_norm": 0.13699074886281146, + "language_loss": 0.81564283, + "learning_rate": 0.00034881373480659085, + "loss": 0.82640517, + "num_input_tokens_seen": 264494944, + "router_z_loss_mlp": 0.37475586, + "step": 3170, + "time_per_iteration": 2.831681728363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081961, + "balance_loss_mlp": 1.04364741, + "epoch": 0.610042323970758, + "flos": 468968444928.0, + "grad_norm": 0.06190459758057804, + "language_loss": 0.77871358, + "learning_rate": 0.0003485168052120594, + "loss": 0.78953326, + "num_input_tokens_seen": 264561664, + "router_z_loss_mlp": 0.3828125, + "step": 3171, + "time_per_iteration": 2.5600767135620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081834, + "balance_loss_mlp": 1.04387796, + "epoch": 0.6102347056560216, + "flos": 513923042304.0, + "grad_norm": 0.0838552496522472, + "language_loss": 0.80047345, + "learning_rate": 0.00034821993442851973, + "loss": 0.81129181, + "num_input_tokens_seen": 264626256, + "router_z_loss_mlp": 0.37890625, + "step": 3172, + "time_per_iteration": 2.564009666442871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082985, + "balance_loss_mlp": 1.0452435, + "epoch": 0.6104270873412851, + "flos": 468776388096.0, + "grad_norm": 0.05938555160639068, + "language_loss": 0.82216555, + "learning_rate": 0.00034792312257122735, + "loss": 0.83299541, + "num_input_tokens_seen": 264692768, + "router_z_loss_mlp": 0.37719727, + "step": 3173, + "time_per_iteration": 2.6151862144470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078889, + "balance_loss_mlp": 1.04012203, + "epoch": 0.6106194690265486, + "flos": 549610859520.0, + "grad_norm": 0.05423157525738513, + "language_loss": 0.80451965, + "learning_rate": 0.00034762636975541506, + "loss": 0.81530857, + "num_input_tokens_seen": 264764816, + "router_z_loss_mlp": 0.38720703, + "step": 3174, + "time_per_iteration": 2.627699375152588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107833, + "balance_loss_mlp": 1.03965902, + "epoch": 0.6108118507118122, + "flos": 472602968064.0, + "grad_norm": 0.06986619017952604, + "language_loss": 0.80950004, + "learning_rate": 0.0003473296760962923, + "loss": 0.82028335, + "num_input_tokens_seen": 264837968, + "router_z_loss_mlp": 0.38647461, + "step": 3175, + "time_per_iteration": 2.6790359020233154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073231, + "balance_loss_mlp": 1.06111896, + "epoch": 0.6110042323970758, + "flos": 1444416205824.0, + "grad_norm": 0.03162499472670903, + "language_loss": 0.78533739, + "learning_rate": 0.00034703304170904617, + "loss": 0.79606968, + "num_input_tokens_seen": 265058336, + "router_z_loss_mlp": 0.12109375, + "step": 3176, + "time_per_iteration": 4.660337924957275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078339, + "balance_loss_mlp": 1.03966713, + "epoch": 0.6111966140823394, + "flos": 793807879680.0, + "grad_norm": 0.05300706067189078, + "language_loss": 0.8120122, + "learning_rate": 0.00034673646670883976, + "loss": 0.82279563, + "num_input_tokens_seen": 265135920, + "router_z_loss_mlp": 0.38623047, + "step": 3177, + "time_per_iteration": 2.9990971088409424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046334, + "balance_loss_mlp": 1.03431749, + "epoch": 0.611388995767603, + "flos": 1556800432128.0, + "grad_norm": 0.020411675518342276, + "language_loss": 0.75715023, + "learning_rate": 0.0003464399512108141, + "loss": 0.76761359, + "num_input_tokens_seen": 265374464, + "router_z_loss_mlp": 0.12011719, + "step": 3178, + "time_per_iteration": 5.060986280441284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078249, + "balance_loss_mlp": 1.03948236, + "epoch": 0.6115813774528664, + "flos": 711841416192.0, + "grad_norm": 0.052313854365800355, + "language_loss": 0.81582487, + "learning_rate": 0.0003461434953300865, + "loss": 0.82660735, + "num_input_tokens_seen": 265450112, + "router_z_loss_mlp": 0.38745117, + "step": 3179, + "time_per_iteration": 2.8902480602264404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073999, + "balance_loss_mlp": 1.03535175, + "epoch": 0.61177375913813, + "flos": 683963910144.0, + "grad_norm": 0.0432149263415984, + "language_loss": 0.81232655, + "learning_rate": 0.0003458470991817515, + "loss": 0.82306653, + "num_input_tokens_seen": 265534336, + "router_z_loss_mlp": 0.38598633, + "step": 3180, + "time_per_iteration": 2.9921305179595947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078708, + "balance_loss_mlp": 1.04068065, + "epoch": 0.6119661408233936, + "flos": 511411746816.0, + "grad_norm": 0.056171077714967085, + "language_loss": 0.84767073, + "learning_rate": 0.0003455507628808802, + "loss": 0.8584578, + "num_input_tokens_seen": 265604480, + "router_z_loss_mlp": 0.38012695, + "step": 3181, + "time_per_iteration": 2.5818896293640137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073399, + "balance_loss_mlp": 1.03527629, + "epoch": 0.6121585225086572, + "flos": 556548226560.0, + "grad_norm": 0.057403680596608046, + "language_loss": 0.8451159, + "learning_rate": 0.00034525448654252076, + "loss": 0.85584986, + "num_input_tokens_seen": 265670848, + "router_z_loss_mlp": 0.38085938, + "step": 3182, + "time_per_iteration": 2.6865382194519043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072137, + "balance_loss_mlp": 1.03384721, + "epoch": 0.6123509041939207, + "flos": 561585374208.0, + "grad_norm": 0.07466059986871497, + "language_loss": 0.82914555, + "learning_rate": 0.0003449582702816976, + "loss": 0.83986694, + "num_input_tokens_seen": 265739584, + "router_z_loss_mlp": 0.3828125, + "step": 3183, + "time_per_iteration": 2.6590259075164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079784, + "balance_loss_mlp": 1.0416131, + "epoch": 0.6125432858791843, + "flos": 557789317632.0, + "grad_norm": 0.05504997733679025, + "language_loss": 0.82930607, + "learning_rate": 0.0003446621142134122, + "loss": 0.84010386, + "num_input_tokens_seen": 265810368, + "router_z_loss_mlp": 0.3815918, + "step": 3184, + "time_per_iteration": 2.7104709148406982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075053, + "balance_loss_mlp": 1.03776431, + "epoch": 0.6127356675644479, + "flos": 414796529664.0, + "grad_norm": 0.05785245107541848, + "language_loss": 0.84189403, + "learning_rate": 0.0003443660184526424, + "loss": 0.85264462, + "num_input_tokens_seen": 265871616, + "router_z_loss_mlp": 0.37255859, + "step": 3185, + "time_per_iteration": 2.441305160522461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078818, + "balance_loss_mlp": 1.04048026, + "epoch": 0.6129280492497114, + "flos": 603547047936.0, + "grad_norm": 0.04628969176382701, + "language_loss": 0.86441582, + "learning_rate": 0.0003440699831143429, + "loss": 0.87520397, + "num_input_tokens_seen": 265946672, + "router_z_loss_mlp": 0.38305664, + "step": 3186, + "time_per_iteration": 2.81016206741333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081465, + "balance_loss_mlp": 1.04474831, + "epoch": 0.613120430934975, + "flos": 519492690432.0, + "grad_norm": 0.05115957600907009, + "language_loss": 0.82288289, + "learning_rate": 0.0003437740083134449, + "loss": 0.83369744, + "num_input_tokens_seen": 266020640, + "router_z_loss_mlp": 0.3671875, + "step": 3187, + "time_per_iteration": 2.695181369781494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075008, + "balance_loss_mlp": 1.03798163, + "epoch": 0.6133128126202385, + "flos": 510835576320.0, + "grad_norm": 0.06733229983475184, + "language_loss": 0.83452654, + "learning_rate": 0.00034347809416485574, + "loss": 0.84527659, + "num_input_tokens_seen": 266085776, + "router_z_loss_mlp": 0.37011719, + "step": 3188, + "time_per_iteration": 2.5900075435638428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081504, + "balance_loss_mlp": 1.04402518, + "epoch": 0.6135051943055021, + "flos": 607264528896.0, + "grad_norm": 0.053668382142468496, + "language_loss": 0.81688702, + "learning_rate": 0.0003431822407834597, + "loss": 0.82770205, + "num_input_tokens_seen": 266157104, + "router_z_loss_mlp": 0.37475586, + "step": 3189, + "time_per_iteration": 2.8129723072052 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107806, + "balance_loss_mlp": 1.04062855, + "epoch": 0.6136975759907657, + "flos": 1159750600704.0, + "grad_norm": 0.0555928311696248, + "language_loss": 0.84534049, + "learning_rate": 0.00034288644828411706, + "loss": 0.85612106, + "num_input_tokens_seen": 266244144, + "router_z_loss_mlp": 0.37426758, + "step": 3190, + "time_per_iteration": 3.4628307819366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076796, + "balance_loss_mlp": 1.03931642, + "epoch": 0.6138899576760293, + "flos": 706631150592.0, + "grad_norm": 0.05334960591036923, + "language_loss": 0.75148171, + "learning_rate": 0.0003425907167816649, + "loss": 0.76224971, + "num_input_tokens_seen": 266319040, + "router_z_loss_mlp": 0.37475586, + "step": 3191, + "time_per_iteration": 2.867506265640259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072461, + "balance_loss_mlp": 1.03510118, + "epoch": 0.6140823393612928, + "flos": 586151271936.0, + "grad_norm": 0.05066562210294406, + "language_loss": 0.84692401, + "learning_rate": 0.00034229504639091623, + "loss": 0.85764861, + "num_input_tokens_seen": 266390784, + "router_z_loss_mlp": 0.37329102, + "step": 3192, + "time_per_iteration": 2.757969617843628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075038, + "balance_loss_mlp": 1.03722489, + "epoch": 0.6142747210465563, + "flos": 803759929344.0, + "grad_norm": 0.052233657686543596, + "language_loss": 0.79899156, + "learning_rate": 0.0003419994372266606, + "loss": 0.80974191, + "num_input_tokens_seen": 266483216, + "router_z_loss_mlp": 0.37792969, + "step": 3193, + "time_per_iteration": 3.113477945327759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074279, + "balance_loss_mlp": 1.03651392, + "epoch": 0.6144671027318199, + "flos": 529158140928.0, + "grad_norm": 0.04106506245407052, + "language_loss": 0.81734288, + "learning_rate": 0.00034170388940366335, + "loss": 0.82808566, + "num_input_tokens_seen": 266557344, + "router_z_loss_mlp": 0.37744141, + "step": 3194, + "time_per_iteration": 2.6896331310272217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078888, + "balance_loss_mlp": 1.04190898, + "epoch": 0.6146594844170835, + "flos": 805054864896.0, + "grad_norm": 0.05108636633203802, + "language_loss": 0.80077958, + "learning_rate": 0.0003414084030366667, + "loss": 0.8115685, + "num_input_tokens_seen": 266639488, + "router_z_loss_mlp": 0.36987305, + "step": 3195, + "time_per_iteration": 3.083922863006592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078724, + "balance_loss_mlp": 1.04134059, + "epoch": 0.6148518661023471, + "flos": 501431993856.0, + "grad_norm": 0.05057450968707768, + "language_loss": 0.82827139, + "learning_rate": 0.0003411129782403883, + "loss": 0.83905864, + "num_input_tokens_seen": 266711168, + "router_z_loss_mlp": 0.3737793, + "step": 3196, + "time_per_iteration": 2.641129970550537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107445, + "balance_loss_mlp": 1.03720951, + "epoch": 0.6150442477876106, + "flos": 510436905984.0, + "grad_norm": 0.062166834979967195, + "language_loss": 0.84822834, + "learning_rate": 0.0003408176151295225, + "loss": 0.85897291, + "num_input_tokens_seen": 266777632, + "router_z_loss_mlp": 0.37207031, + "step": 3197, + "time_per_iteration": 2.5532026290893555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071235, + "balance_loss_mlp": 1.03425658, + "epoch": 0.6152366294728742, + "flos": 526758916608.0, + "grad_norm": 0.06002763695561428, + "language_loss": 0.770096, + "learning_rate": 0.00034052231381873944, + "loss": 0.78080833, + "num_input_tokens_seen": 266842880, + "router_z_loss_mlp": 0.36962891, + "step": 3198, + "time_per_iteration": 2.6175601482391357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107449, + "balance_loss_mlp": 1.03746367, + "epoch": 0.6154290111581378, + "flos": 473055482880.0, + "grad_norm": 0.053906213257321506, + "language_loss": 0.85027397, + "learning_rate": 0.00034022707442268494, + "loss": 0.86101884, + "num_input_tokens_seen": 266909504, + "router_z_loss_mlp": 0.37060547, + "step": 3199, + "time_per_iteration": 2.5418269634246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075667, + "balance_loss_mlp": 1.03985643, + "epoch": 0.6156213928434013, + "flos": 550542030336.0, + "grad_norm": 0.04138117039405258, + "language_loss": 0.81766355, + "learning_rate": 0.0003399318970559813, + "loss": 0.82842016, + "num_input_tokens_seen": 266988880, + "router_z_loss_mlp": 0.35864258, + "step": 3200, + "time_per_iteration": 2.8180348873138428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074649, + "balance_loss_mlp": 1.03795648, + "epoch": 0.6158137745286649, + "flos": 750587586048.0, + "grad_norm": 0.04925803113162635, + "language_loss": 0.84793299, + "learning_rate": 0.00033963678183322656, + "loss": 0.85867941, + "num_input_tokens_seen": 267074512, + "router_z_loss_mlp": 0.36694336, + "step": 3201, + "time_per_iteration": 3.032935857772827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107717, + "balance_loss_mlp": 1.04035842, + "epoch": 0.6160061562139284, + "flos": 555544272384.0, + "grad_norm": 0.0447157472200271, + "language_loss": 0.82589877, + "learning_rate": 0.0003393417288689945, + "loss": 0.8366704, + "num_input_tokens_seen": 267147952, + "router_z_loss_mlp": 0.36816406, + "step": 3202, + "time_per_iteration": 2.675895929336548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076788, + "balance_loss_mlp": 1.03976154, + "epoch": 0.616198537899192, + "flos": 741856278528.0, + "grad_norm": 0.0597641092397592, + "language_loss": 0.75911278, + "learning_rate": 0.00033904673827783504, + "loss": 0.76988065, + "num_input_tokens_seen": 267224368, + "router_z_loss_mlp": 0.37060547, + "step": 3203, + "time_per_iteration": 2.930006265640259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078121, + "balance_loss_mlp": 1.04111826, + "epoch": 0.6163909195844556, + "flos": 478569876480.0, + "grad_norm": 0.09425885378712065, + "language_loss": 0.8152014, + "learning_rate": 0.00033875181017427357, + "loss": 0.82598263, + "num_input_tokens_seen": 267292688, + "router_z_loss_mlp": 0.36962891, + "step": 3204, + "time_per_iteration": 2.624331474304199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071675, + "balance_loss_mlp": 1.03524435, + "epoch": 0.6165833012697192, + "flos": 531231478272.0, + "grad_norm": 0.05217722063945812, + "language_loss": 0.80865437, + "learning_rate": 0.00033845694467281133, + "loss": 0.8193711, + "num_input_tokens_seen": 267371888, + "router_z_loss_mlp": 0.36450195, + "step": 3205, + "time_per_iteration": 2.8210368156433105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107918, + "balance_loss_mlp": 1.0422256, + "epoch": 0.6167756829549826, + "flos": 807384278016.0, + "grad_norm": 0.04964273497495854, + "language_loss": 0.83231258, + "learning_rate": 0.00033816214188792516, + "loss": 0.84310448, + "num_input_tokens_seen": 267458784, + "router_z_loss_mlp": 0.36938477, + "step": 3206, + "time_per_iteration": 3.148005485534668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074637, + "balance_loss_mlp": 1.03782535, + "epoch": 0.6169680646402462, + "flos": 488683459584.0, + "grad_norm": 0.053298610353503126, + "language_loss": 0.85231054, + "learning_rate": 0.00033786740193406784, + "loss": 0.8630569, + "num_input_tokens_seen": 267528528, + "router_z_loss_mlp": 0.36791992, + "step": 3207, + "time_per_iteration": 2.576956272125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083512, + "balance_loss_mlp": 1.04693818, + "epoch": 0.6171604463255098, + "flos": 618643934208.0, + "grad_norm": 0.05970709396928862, + "language_loss": 0.81620336, + "learning_rate": 0.00033757272492566736, + "loss": 0.82703847, + "num_input_tokens_seen": 267611152, + "router_z_loss_mlp": 0.3659668, + "step": 3208, + "time_per_iteration": 2.8902554512023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077083, + "balance_loss_mlp": 1.04070079, + "epoch": 0.6173528280107734, + "flos": 528600909312.0, + "grad_norm": 0.043205070358092235, + "language_loss": 0.87206829, + "learning_rate": 0.0003372781109771278, + "loss": 0.88283914, + "num_input_tokens_seen": 267681520, + "router_z_loss_mlp": 0.36401367, + "step": 3209, + "time_per_iteration": 2.688534736633301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077515, + "balance_loss_mlp": 1.04036927, + "epoch": 0.617545209696037, + "flos": 596293968384.0, + "grad_norm": 0.05036658648833462, + "language_loss": 0.76489538, + "learning_rate": 0.0003369835602028281, + "loss": 0.77567053, + "num_input_tokens_seen": 267758768, + "router_z_loss_mlp": 0.37158203, + "step": 3210, + "time_per_iteration": 2.7890372276306152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073752, + "balance_loss_mlp": 1.03763127, + "epoch": 0.6177375913813005, + "flos": 474848013312.0, + "grad_norm": 0.06457582449248328, + "language_loss": 0.7954967, + "learning_rate": 0.0003366890727171232, + "loss": 0.80623418, + "num_input_tokens_seen": 267831056, + "router_z_loss_mlp": 0.36132812, + "step": 3211, + "time_per_iteration": 2.6358649730682373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076437, + "balance_loss_mlp": 1.03983986, + "epoch": 0.617929973066564, + "flos": 529546636800.0, + "grad_norm": 0.051638543668130914, + "language_loss": 0.78236932, + "learning_rate": 0.00033639464863434313, + "loss": 0.79313374, + "num_input_tokens_seen": 267898416, + "router_z_loss_mlp": 0.36621094, + "step": 3212, + "time_per_iteration": 2.6086573600769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104581, + "balance_loss_mlp": 1.03403246, + "epoch": 0.6181223547518276, + "flos": 1419361477632.0, + "grad_norm": 0.031029800070293646, + "language_loss": 0.78442466, + "learning_rate": 0.00033610028806879363, + "loss": 0.79488277, + "num_input_tokens_seen": 268112864, + "router_z_loss_mlp": 0.11767578, + "step": 3213, + "time_per_iteration": 4.67001748085022 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082687, + "balance_loss_mlp": 1.04608989, + "epoch": 0.6183147364370912, + "flos": 739976408064.0, + "grad_norm": 0.057199257803381136, + "language_loss": 0.79338527, + "learning_rate": 0.00033580599113475543, + "loss": 0.80421209, + "num_input_tokens_seen": 268198368, + "router_z_loss_mlp": 0.36572266, + "step": 3214, + "time_per_iteration": 2.9583098888397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084038, + "balance_loss_mlp": 1.04791784, + "epoch": 0.6185071181223547, + "flos": 381442507776.0, + "grad_norm": 0.04917291397631135, + "language_loss": 0.85787857, + "learning_rate": 0.00033551175794648507, + "loss": 0.86871898, + "num_input_tokens_seen": 268260704, + "router_z_loss_mlp": 0.36108398, + "step": 3215, + "time_per_iteration": 2.450173854827881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079107, + "balance_loss_mlp": 1.04191399, + "epoch": 0.6186994998076183, + "flos": 463109225472.0, + "grad_norm": 0.05232146419695497, + "language_loss": 0.8178426, + "learning_rate": 0.00033521758861821365, + "loss": 0.82863367, + "num_input_tokens_seen": 268328256, + "router_z_loss_mlp": 0.37158203, + "step": 3216, + "time_per_iteration": 2.566434144973755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107802, + "balance_loss_mlp": 1.04132736, + "epoch": 0.6188918814928819, + "flos": 485029997568.0, + "grad_norm": 0.044556879100730015, + "language_loss": 0.88947988, + "learning_rate": 0.0003349234832641479, + "loss": 0.90026009, + "num_input_tokens_seen": 268394016, + "router_z_loss_mlp": 0.36669922, + "step": 3217, + "time_per_iteration": 2.5626957416534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087401, + "balance_loss_mlp": 1.05027926, + "epoch": 0.6190842631781455, + "flos": 656985641472.0, + "grad_norm": 0.056610001609600974, + "language_loss": 0.81178546, + "learning_rate": 0.00033462944199846975, + "loss": 0.82265949, + "num_input_tokens_seen": 268478512, + "router_z_loss_mlp": 0.37109375, + "step": 3218, + "time_per_iteration": 3.038856267929077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091964, + "balance_loss_mlp": 1.054842, + "epoch": 0.619276644863409, + "flos": 403388011008.0, + "grad_norm": 0.051099399179052, + "language_loss": 0.86047733, + "learning_rate": 0.00033433546493533606, + "loss": 0.87139696, + "num_input_tokens_seen": 268540304, + "router_z_loss_mlp": 0.37109375, + "step": 3219, + "time_per_iteration": 2.4660589694976807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086562, + "balance_loss_mlp": 1.04913092, + "epoch": 0.6194690265486725, + "flos": 582807730176.0, + "grad_norm": 0.07929462737326079, + "language_loss": 0.84635407, + "learning_rate": 0.00033404155218887897, + "loss": 0.8572197, + "num_input_tokens_seen": 268611136, + "router_z_loss_mlp": 0.37402344, + "step": 3220, + "time_per_iteration": 2.7270491123199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087776, + "balance_loss_mlp": 1.05127466, + "epoch": 0.6196614082339361, + "flos": 503963638272.0, + "grad_norm": 0.04746710197063832, + "language_loss": 0.87405616, + "learning_rate": 0.00033374770387320534, + "loss": 0.88493389, + "num_input_tokens_seen": 268684992, + "router_z_loss_mlp": 0.36499023, + "step": 3221, + "time_per_iteration": 2.7464041709899902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086025, + "balance_loss_mlp": 1.04957032, + "epoch": 0.6198537899191997, + "flos": 575131249152.0, + "grad_norm": 0.04828799044899351, + "language_loss": 0.84905434, + "learning_rate": 0.00033345392010239737, + "loss": 0.85991454, + "num_input_tokens_seen": 268758096, + "router_z_loss_mlp": 0.36425781, + "step": 3222, + "time_per_iteration": 2.7124643325805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090712, + "balance_loss_mlp": 1.05432916, + "epoch": 0.6200461716044633, + "flos": 592871851008.0, + "grad_norm": 0.05455186914626242, + "language_loss": 0.8191222, + "learning_rate": 0.0003331602009905118, + "loss": 0.83002931, + "num_input_tokens_seen": 268834432, + "router_z_loss_mlp": 0.36376953, + "step": 3223, + "time_per_iteration": 2.7330005168914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084321, + "balance_loss_mlp": 1.04696107, + "epoch": 0.6202385532897268, + "flos": 665765001216.0, + "grad_norm": 0.046947333266423794, + "language_loss": 0.83694303, + "learning_rate": 0.00033286654665158085, + "loss": 0.84778625, + "num_input_tokens_seen": 268921168, + "router_z_loss_mlp": 0.37329102, + "step": 3224, + "time_per_iteration": 2.937727689743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087568, + "balance_loss_mlp": 1.0515908, + "epoch": 0.6204309349749904, + "flos": 484709902848.0, + "grad_norm": 0.0575064293586871, + "language_loss": 0.87672997, + "learning_rate": 0.0003325729571996109, + "loss": 0.88760567, + "num_input_tokens_seen": 268991440, + "router_z_loss_mlp": 0.36010742, + "step": 3225, + "time_per_iteration": 2.6319355964660645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085695, + "balance_loss_mlp": 1.04919314, + "epoch": 0.6206233166602539, + "flos": 583768014336.0, + "grad_norm": 0.048737024704114895, + "language_loss": 0.83402115, + "learning_rate": 0.000332279432748584, + "loss": 0.84487808, + "num_input_tokens_seen": 269061024, + "router_z_loss_mlp": 0.36523438, + "step": 3226, + "time_per_iteration": 2.733870029449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010819, + "balance_loss_mlp": 1.04656696, + "epoch": 0.6208156983455175, + "flos": 476669657088.0, + "grad_norm": 0.0460557240454385, + "language_loss": 0.87514353, + "learning_rate": 0.00033198597341245576, + "loss": 0.88596255, + "num_input_tokens_seen": 269130560, + "router_z_loss_mlp": 0.35375977, + "step": 3227, + "time_per_iteration": 2.567084789276123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081012, + "balance_loss_mlp": 1.04420066, + "epoch": 0.6210080800307811, + "flos": 788716887552.0, + "grad_norm": 0.07539791999679457, + "language_loss": 0.81657004, + "learning_rate": 0.00033169257930515763, + "loss": 0.82738018, + "num_input_tokens_seen": 269213280, + "router_z_loss_mlp": 0.36816406, + "step": 3228, + "time_per_iteration": 3.074739694595337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084245, + "balance_loss_mlp": 1.04655147, + "epoch": 0.6212004617160446, + "flos": 607514812416.0, + "grad_norm": 0.05269169473042375, + "language_loss": 0.82430172, + "learning_rate": 0.0003313992505405951, + "loss": 0.83514416, + "num_input_tokens_seen": 269286384, + "router_z_loss_mlp": 0.37695312, + "step": 3229, + "time_per_iteration": 2.711282730102539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079533, + "balance_loss_mlp": 1.04305458, + "epoch": 0.6213928434013082, + "flos": 586248786432.0, + "grad_norm": 0.05753494770574613, + "language_loss": 0.8075214, + "learning_rate": 0.0003311059872326487, + "loss": 0.81831676, + "num_input_tokens_seen": 269353296, + "router_z_loss_mlp": 0.36474609, + "step": 3230, + "time_per_iteration": 2.6755940914154053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082159, + "balance_loss_mlp": 1.04467952, + "epoch": 0.6215852250865718, + "flos": 535819083264.0, + "grad_norm": 0.04907016045640681, + "language_loss": 0.79111725, + "learning_rate": 0.0003308127894951734, + "loss": 0.80193883, + "num_input_tokens_seen": 269422304, + "router_z_loss_mlp": 0.37426758, + "step": 3231, + "time_per_iteration": 2.612122058868408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086128, + "balance_loss_mlp": 1.04893494, + "epoch": 0.6217776067718354, + "flos": 617884471296.0, + "grad_norm": 0.0640423801123885, + "language_loss": 0.86435384, + "learning_rate": 0.00033051965744199834, + "loss": 0.87521511, + "num_input_tokens_seen": 269498784, + "router_z_loss_mlp": 0.37133789, + "step": 3232, + "time_per_iteration": 2.734384059906006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107468, + "balance_loss_mlp": 1.03913224, + "epoch": 0.6219699884570988, + "flos": 545570311680.0, + "grad_norm": 0.045255868700115984, + "language_loss": 0.90312266, + "learning_rate": 0.0003302265911869276, + "loss": 0.91386944, + "num_input_tokens_seen": 269581264, + "router_z_loss_mlp": 0.35571289, + "step": 3233, + "time_per_iteration": 2.9088501930236816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079236, + "balance_loss_mlp": 1.04216146, + "epoch": 0.6221623701423624, + "flos": 480899289600.0, + "grad_norm": 0.054924545254622516, + "language_loss": 0.83717418, + "learning_rate": 0.0003299335908437397, + "loss": 0.84796649, + "num_input_tokens_seen": 269649408, + "router_z_loss_mlp": 0.37060547, + "step": 3234, + "time_per_iteration": 2.5804450511932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077942, + "balance_loss_mlp": 1.04062915, + "epoch": 0.622354751827626, + "flos": 379812920832.0, + "grad_norm": 0.0810547632839198, + "language_loss": 0.80174738, + "learning_rate": 0.0003296406565261873, + "loss": 0.81252682, + "num_input_tokens_seen": 269711648, + "router_z_loss_mlp": 0.37304688, + "step": 3235, + "time_per_iteration": 2.480074405670166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072254, + "balance_loss_mlp": 1.03610981, + "epoch": 0.6225471335128896, + "flos": 667570678272.0, + "grad_norm": 0.04590561718028109, + "language_loss": 0.84757555, + "learning_rate": 0.0003293477883479978, + "loss": 0.85829806, + "num_input_tokens_seen": 269787376, + "router_z_loss_mlp": 0.36181641, + "step": 3236, + "time_per_iteration": 2.8077552318573 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107691, + "balance_loss_mlp": 1.03909636, + "epoch": 0.6227395151981532, + "flos": 770995224576.0, + "grad_norm": 0.06134325459280444, + "language_loss": 0.79419619, + "learning_rate": 0.0003290549864228727, + "loss": 0.80496532, + "num_input_tokens_seen": 269863008, + "router_z_loss_mlp": 0.37768555, + "step": 3237, + "time_per_iteration": 2.9485511779785156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078391, + "balance_loss_mlp": 1.04084027, + "epoch": 0.6229318968834167, + "flos": 484104619008.0, + "grad_norm": 0.04787340801425507, + "language_loss": 0.86647016, + "learning_rate": 0.0003287622508644875, + "loss": 0.87725413, + "num_input_tokens_seen": 269939552, + "router_z_loss_mlp": 0.37548828, + "step": 3238, + "time_per_iteration": 2.723003387451172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072928, + "balance_loss_mlp": 1.0360688, + "epoch": 0.6231242785686802, + "flos": 462700380672.0, + "grad_norm": 0.08533340323107003, + "language_loss": 0.86471462, + "learning_rate": 0.0003284695817864923, + "loss": 0.87544394, + "num_input_tokens_seen": 270002752, + "router_z_loss_mlp": 0.36865234, + "step": 3239, + "time_per_iteration": 2.4788854122161865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079076, + "balance_loss_mlp": 1.04231155, + "epoch": 0.6233166602539438, + "flos": 608809747968.0, + "grad_norm": 0.06356990340371446, + "language_loss": 0.83732104, + "learning_rate": 0.0003281769793025116, + "loss": 0.84811181, + "num_input_tokens_seen": 270075696, + "router_z_loss_mlp": 0.36791992, + "step": 3240, + "time_per_iteration": 2.68833065032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071809, + "balance_loss_mlp": 1.03542674, + "epoch": 0.6235090419392074, + "flos": 438972521472.0, + "grad_norm": 0.05773237210342904, + "language_loss": 0.89384484, + "learning_rate": 0.00032788444352614346, + "loss": 0.90456295, + "num_input_tokens_seen": 270139872, + "router_z_loss_mlp": 0.36425781, + "step": 3241, + "time_per_iteration": 2.485630512237549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073948, + "balance_loss_mlp": 1.03706515, + "epoch": 0.6237014236244709, + "flos": 504656262144.0, + "grad_norm": 0.05916154923857777, + "language_loss": 0.80431205, + "learning_rate": 0.0003275919745709606, + "loss": 0.81505156, + "num_input_tokens_seen": 270206752, + "router_z_loss_mlp": 0.36889648, + "step": 3242, + "time_per_iteration": 2.557732582092285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073489, + "balance_loss_mlp": 1.03710628, + "epoch": 0.6238938053097345, + "flos": 512648455680.0, + "grad_norm": 0.047494752086082274, + "language_loss": 0.82139623, + "learning_rate": 0.00032729957255050936, + "loss": 0.83213103, + "num_input_tokens_seen": 270275472, + "router_z_loss_mlp": 0.36376953, + "step": 3243, + "time_per_iteration": 2.653381586074829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075548, + "balance_loss_mlp": 1.03799748, + "epoch": 0.6240861869949981, + "flos": 736435017216.0, + "grad_norm": 0.07878714918390893, + "language_loss": 0.81488502, + "learning_rate": 0.0003270072375783102, + "loss": 0.8256405, + "num_input_tokens_seen": 270348336, + "router_z_loss_mlp": 0.37524414, + "step": 3244, + "time_per_iteration": 2.893857717514038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068875, + "balance_loss_mlp": 1.03244424, + "epoch": 0.6242785686802617, + "flos": 494464103424.0, + "grad_norm": 0.05659954005953207, + "language_loss": 0.79646188, + "learning_rate": 0.00032671496976785774, + "loss": 0.8071506, + "num_input_tokens_seen": 270416496, + "router_z_loss_mlp": 0.36425781, + "step": 3245, + "time_per_iteration": 2.5836338996887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072633, + "balance_loss_mlp": 1.03536868, + "epoch": 0.6244709503655252, + "flos": 745500976128.0, + "grad_norm": 0.04683044918703509, + "language_loss": 0.75894988, + "learning_rate": 0.0003264227692326205, + "loss": 0.76967621, + "num_input_tokens_seen": 270501680, + "router_z_loss_mlp": 0.37231445, + "step": 3246, + "time_per_iteration": 3.0129404067993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071862, + "balance_loss_mlp": 1.03524101, + "epoch": 0.6246633320507887, + "flos": 492366034944.0, + "grad_norm": 0.053825278075075034, + "language_loss": 0.85644072, + "learning_rate": 0.00032613063608604055, + "loss": 0.86715937, + "num_input_tokens_seen": 270568656, + "router_z_loss_mlp": 0.36645508, + "step": 3247, + "time_per_iteration": 2.5503756999969482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078462, + "balance_loss_mlp": 1.0416261, + "epoch": 0.6248557137360523, + "flos": 517142928384.0, + "grad_norm": 0.04781520773103446, + "language_loss": 0.8331461, + "learning_rate": 0.0003258385704415343, + "loss": 0.84393072, + "num_input_tokens_seen": 270636160, + "router_z_loss_mlp": 0.36816406, + "step": 3248, + "time_per_iteration": 2.560483455657959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079309, + "balance_loss_mlp": 1.04161501, + "epoch": 0.6250480954213159, + "flos": 519098402304.0, + "grad_norm": 0.04627181605828338, + "language_loss": 0.83052945, + "learning_rate": 0.0003255465724124915, + "loss": 0.84132254, + "num_input_tokens_seen": 270708816, + "router_z_loss_mlp": 0.37670898, + "step": 3249, + "time_per_iteration": 2.7024102210998535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075461, + "balance_loss_mlp": 1.03776741, + "epoch": 0.6252404771065795, + "flos": 515808705024.0, + "grad_norm": 0.04699281003283387, + "language_loss": 0.82845968, + "learning_rate": 0.00032525464211227587, + "loss": 0.83921427, + "num_input_tokens_seen": 270778016, + "router_z_loss_mlp": 0.37646484, + "step": 3250, + "time_per_iteration": 2.5934925079345703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073962, + "balance_loss_mlp": 1.03712666, + "epoch": 0.6254328587918431, + "flos": 576647354880.0, + "grad_norm": 0.05335085924079445, + "language_loss": 0.85498369, + "learning_rate": 0.0003249627796542249, + "loss": 0.86572331, + "num_input_tokens_seen": 270847072, + "router_z_loss_mlp": 0.36816406, + "step": 3251, + "time_per_iteration": 2.6473746299743652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107209, + "balance_loss_mlp": 1.03472972, + "epoch": 0.6256252404771065, + "flos": 597638366208.0, + "grad_norm": 0.05949551618026705, + "language_loss": 0.83974731, + "learning_rate": 0.00032467098515164943, + "loss": 0.85046822, + "num_input_tokens_seen": 270926320, + "router_z_loss_mlp": 0.37353516, + "step": 3252, + "time_per_iteration": 2.8618545532226562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074504, + "balance_loss_mlp": 1.03776419, + "epoch": 0.6258176221623701, + "flos": 508034709504.0, + "grad_norm": 0.05339688957223288, + "language_loss": 0.83978283, + "learning_rate": 0.00032437925871783456, + "loss": 0.85052788, + "num_input_tokens_seen": 270997904, + "router_z_loss_mlp": 0.36767578, + "step": 3253, + "time_per_iteration": 2.6301941871643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074357, + "balance_loss_mlp": 1.03680658, + "epoch": 0.6260100038476337, + "flos": 639357110784.0, + "grad_norm": 0.06013661875979651, + "language_loss": 0.84100354, + "learning_rate": 0.00032408760046603803, + "loss": 0.85174716, + "num_input_tokens_seen": 271074256, + "router_z_loss_mlp": 0.37548828, + "step": 3254, + "time_per_iteration": 2.798520565032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076909, + "balance_loss_mlp": 1.03923869, + "epoch": 0.6262023855328973, + "flos": 840648139776.0, + "grad_norm": 0.05406777705406554, + "language_loss": 0.77436024, + "learning_rate": 0.00032379601050949193, + "loss": 0.78512931, + "num_input_tokens_seen": 271155152, + "router_z_loss_mlp": 0.3762207, + "step": 3255, + "time_per_iteration": 3.0876083374023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075377, + "balance_loss_mlp": 1.03746879, + "epoch": 0.6263947672181608, + "flos": 521884712448.0, + "grad_norm": 0.05001529336146337, + "language_loss": 0.8825866, + "learning_rate": 0.0003235044889614013, + "loss": 0.89334035, + "num_input_tokens_seen": 271224784, + "router_z_loss_mlp": 0.37866211, + "step": 3256, + "time_per_iteration": 2.616588592529297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079768, + "balance_loss_mlp": 1.04221702, + "epoch": 0.6265871489034244, + "flos": 606747995136.0, + "grad_norm": 0.049239400336598835, + "language_loss": 0.83356363, + "learning_rate": 0.0003232130359349451, + "loss": 0.84436131, + "num_input_tokens_seen": 271303584, + "router_z_loss_mlp": 0.37524414, + "step": 3257, + "time_per_iteration": 2.8224074840545654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083632, + "balance_loss_mlp": 1.04474616, + "epoch": 0.626779530588688, + "flos": 588208642560.0, + "grad_norm": 0.04846319258982293, + "language_loss": 0.81674659, + "learning_rate": 0.0003229216515432751, + "loss": 0.8275829, + "num_input_tokens_seen": 271379632, + "router_z_loss_mlp": 0.38842773, + "step": 3258, + "time_per_iteration": 2.78884220123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081861, + "balance_loss_mlp": 1.0438329, + "epoch": 0.6269719122739515, + "flos": 438381794304.0, + "grad_norm": 0.061777321686694836, + "language_loss": 0.79815853, + "learning_rate": 0.0003226303358995174, + "loss": 0.80897713, + "num_input_tokens_seen": 271447808, + "router_z_loss_mlp": 0.37988281, + "step": 3259, + "time_per_iteration": 2.625014305114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108494, + "balance_loss_mlp": 1.0462687, + "epoch": 0.6271642939592151, + "flos": 562590738432.0, + "grad_norm": 0.04793696937542698, + "language_loss": 0.8911407, + "learning_rate": 0.00032233908911677, + "loss": 0.90199006, + "num_input_tokens_seen": 271526768, + "router_z_loss_mlp": 0.38623047, + "step": 3260, + "time_per_iteration": 2.8619987964630127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081144, + "balance_loss_mlp": 1.04194832, + "epoch": 0.6273566756444786, + "flos": 514288217088.0, + "grad_norm": 0.06578723917558563, + "language_loss": 0.80680311, + "learning_rate": 0.0003220479113081053, + "loss": 0.81761456, + "num_input_tokens_seen": 271597840, + "router_z_loss_mlp": 0.3918457, + "step": 3261, + "time_per_iteration": 2.7102510929107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080352, + "balance_loss_mlp": 1.04270554, + "epoch": 0.6275490573297422, + "flos": 585195369984.0, + "grad_norm": 0.0548628003226281, + "language_loss": 0.78727174, + "learning_rate": 0.00032175680258656836, + "loss": 0.79807532, + "num_input_tokens_seen": 271668352, + "router_z_loss_mlp": 0.3762207, + "step": 3262, + "time_per_iteration": 2.696701765060425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083558, + "balance_loss_mlp": 1.04600739, + "epoch": 0.6277414390150058, + "flos": 559143889920.0, + "grad_norm": 0.044574681461427054, + "language_loss": 0.80117631, + "learning_rate": 0.00032146576306517794, + "loss": 0.81201196, + "num_input_tokens_seen": 271743936, + "router_z_loss_mlp": 0.37524414, + "step": 3263, + "time_per_iteration": 2.764273166656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077314, + "balance_loss_mlp": 1.03873789, + "epoch": 0.6279338207002694, + "flos": 612423922176.0, + "grad_norm": 0.04659103791946159, + "language_loss": 0.80601645, + "learning_rate": 0.0003211747928569255, + "loss": 0.81678957, + "num_input_tokens_seen": 271817008, + "router_z_loss_mlp": 0.38525391, + "step": 3264, + "time_per_iteration": 2.741144895553589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077333, + "balance_loss_mlp": 1.03906703, + "epoch": 0.6281262023855329, + "flos": 625374687744.0, + "grad_norm": 0.044995138284684974, + "language_loss": 0.81407869, + "learning_rate": 0.0003208838920747754, + "loss": 0.82485199, + "num_input_tokens_seen": 271896960, + "router_z_loss_mlp": 0.38208008, + "step": 3265, + "time_per_iteration": 2.8458306789398193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075806, + "balance_loss_mlp": 1.03753948, + "epoch": 0.6283185840707964, + "flos": 1123147579392.0, + "grad_norm": 0.051347706918532285, + "language_loss": 0.76555598, + "learning_rate": 0.0003205930608316656, + "loss": 0.77631402, + "num_input_tokens_seen": 271985008, + "router_z_loss_mlp": 0.38232422, + "step": 3266, + "time_per_iteration": 3.5019400119781494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074561, + "balance_loss_mlp": 1.03631854, + "epoch": 0.62851096575606, + "flos": 514967694336.0, + "grad_norm": 0.055036634994397565, + "language_loss": 0.84812629, + "learning_rate": 0.00032030229924050673, + "loss": 0.85887194, + "num_input_tokens_seen": 272056368, + "router_z_loss_mlp": 0.38183594, + "step": 3267, + "time_per_iteration": 2.6514573097229004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107346, + "balance_loss_mlp": 1.03495502, + "epoch": 0.6287033474413236, + "flos": 403949624832.0, + "grad_norm": 0.06092252438961513, + "language_loss": 0.79938138, + "learning_rate": 0.00032001160741418247, + "loss": 0.81011593, + "num_input_tokens_seen": 272123424, + "router_z_loss_mlp": 0.38452148, + "step": 3268, + "time_per_iteration": 2.6364564895629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076302, + "balance_loss_mlp": 1.03765488, + "epoch": 0.6288957291265872, + "flos": 525459598848.0, + "grad_norm": 0.06432688235517753, + "language_loss": 0.81921297, + "learning_rate": 0.0003197209854655494, + "loss": 0.82997596, + "num_input_tokens_seen": 272193008, + "router_z_loss_mlp": 0.38623047, + "step": 3269, + "time_per_iteration": 2.6190736293792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072941, + "balance_loss_mlp": 1.03531849, + "epoch": 0.6290881108118507, + "flos": 603414627840.0, + "grad_norm": 0.059396512475175293, + "language_loss": 0.74762654, + "learning_rate": 0.0003194304335074371, + "loss": 0.75835598, + "num_input_tokens_seen": 272275328, + "router_z_loss_mlp": 0.3762207, + "step": 3270, + "time_per_iteration": 2.829658031463623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069412, + "balance_loss_mlp": 1.03190899, + "epoch": 0.6292804924971143, + "flos": 437446241280.0, + "grad_norm": 0.057734053913976915, + "language_loss": 0.8848114, + "learning_rate": 0.0003191399516526475, + "loss": 0.89550555, + "num_input_tokens_seen": 272339328, + "router_z_loss_mlp": 0.37451172, + "step": 3271, + "time_per_iteration": 2.520371675491333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107074, + "balance_loss_mlp": 1.03369021, + "epoch": 0.6294728741823779, + "flos": 606368263680.0, + "grad_norm": 0.05065852355738081, + "language_loss": 0.79438859, + "learning_rate": 0.0003188495400139559, + "loss": 0.80509603, + "num_input_tokens_seen": 272416336, + "router_z_loss_mlp": 0.37060547, + "step": 3272, + "time_per_iteration": 2.771045207977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070525, + "balance_loss_mlp": 1.03354681, + "epoch": 0.6296652558676414, + "flos": 701220063744.0, + "grad_norm": 0.05978567707870047, + "language_loss": 0.84609801, + "learning_rate": 0.00031855919870411013, + "loss": 0.8568033, + "num_input_tokens_seen": 272490368, + "router_z_loss_mlp": 0.36987305, + "step": 3273, + "time_per_iteration": 2.8209264278411865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072859, + "balance_loss_mlp": 1.03516483, + "epoch": 0.6298576375529049, + "flos": 523652511744.0, + "grad_norm": 0.05543489609660157, + "language_loss": 0.85005689, + "learning_rate": 0.0003182689278358305, + "loss": 0.86078548, + "num_input_tokens_seen": 272562992, + "router_z_loss_mlp": 0.37646484, + "step": 3274, + "time_per_iteration": 2.6735117435455322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069939, + "balance_loss_mlp": 1.03360391, + "epoch": 0.6300500192381685, + "flos": 475723929600.0, + "grad_norm": 0.06241690898076668, + "language_loss": 0.79779917, + "learning_rate": 0.0003179787275218105, + "loss": 0.80849856, + "num_input_tokens_seen": 272629456, + "router_z_loss_mlp": 0.36352539, + "step": 3275, + "time_per_iteration": 2.5281076431274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071447, + "balance_loss_mlp": 1.03394365, + "epoch": 0.6302424009234321, + "flos": 520629064704.0, + "grad_norm": 0.04860664523501564, + "language_loss": 0.83985364, + "learning_rate": 0.0003176885978747155, + "loss": 0.85056806, + "num_input_tokens_seen": 272697440, + "router_z_loss_mlp": 0.375, + "step": 3276, + "time_per_iteration": 2.590137243270874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073206, + "balance_loss_mlp": 1.03594065, + "epoch": 0.6304347826086957, + "flos": 694282696704.0, + "grad_norm": 0.06745994429641342, + "language_loss": 0.82557893, + "learning_rate": 0.0003173985390071839, + "loss": 0.83631098, + "num_input_tokens_seen": 272774080, + "router_z_loss_mlp": 0.37207031, + "step": 3277, + "time_per_iteration": 2.835454225540161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026014, + "balance_loss_mlp": 1.01476038, + "epoch": 0.6306271642939593, + "flos": 1466067755520.0, + "grad_norm": 0.018393176098041853, + "language_loss": 0.77900457, + "learning_rate": 0.00031710855103182675, + "loss": 0.78926468, + "num_input_tokens_seen": 272998512, + "router_z_loss_mlp": 0.11230469, + "step": 3278, + "time_per_iteration": 4.83237099647522 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071213, + "balance_loss_mlp": 1.03440166, + "epoch": 0.6308195459792227, + "flos": 601444597248.0, + "grad_norm": 0.05391474190589518, + "language_loss": 0.8122592, + "learning_rate": 0.00031681863406122704, + "loss": 0.82297128, + "num_input_tokens_seen": 273074672, + "router_z_loss_mlp": 0.36816406, + "step": 3279, + "time_per_iteration": 2.7689826488494873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071032, + "balance_loss_mlp": 1.03381503, + "epoch": 0.6310119276644863, + "flos": 726514900992.0, + "grad_norm": 0.04523972239140451, + "language_loss": 0.85147464, + "learning_rate": 0.00031652878820794087, + "loss": 0.86218488, + "num_input_tokens_seen": 273157904, + "router_z_loss_mlp": 0.37207031, + "step": 3280, + "time_per_iteration": 2.973525047302246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074703, + "balance_loss_mlp": 1.03762913, + "epoch": 0.6312043093497499, + "flos": 519482515968.0, + "grad_norm": 0.0661931076661352, + "language_loss": 0.85199058, + "learning_rate": 0.00031623901358449627, + "loss": 0.86273754, + "num_input_tokens_seen": 273228160, + "router_z_loss_mlp": 0.37060547, + "step": 3281, + "time_per_iteration": 2.6226651668548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074897, + "balance_loss_mlp": 1.03860974, + "epoch": 0.6313966910350135, + "flos": 530934704640.0, + "grad_norm": 0.050825479700673346, + "language_loss": 0.88810539, + "learning_rate": 0.0003159493103033936, + "loss": 0.89885437, + "num_input_tokens_seen": 273295872, + "router_z_loss_mlp": 0.36303711, + "step": 3282, + "time_per_iteration": 2.601001262664795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01022599, + "balance_loss_mlp": 1.01163197, + "epoch": 0.631589072720277, + "flos": 1379113606656.0, + "grad_norm": 0.015722809882928884, + "language_loss": 0.79919052, + "learning_rate": 0.00031565967847710564, + "loss": 0.80941653, + "num_input_tokens_seen": 273524320, + "router_z_loss_mlp": 0.10986328, + "step": 3283, + "time_per_iteration": 4.848982334136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075752, + "balance_loss_mlp": 1.03774858, + "epoch": 0.6317814544055406, + "flos": 624379497984.0, + "grad_norm": 0.05495466978446473, + "language_loss": 0.82262814, + "learning_rate": 0.0003153701182180776, + "loss": 0.83338571, + "num_input_tokens_seen": 273598544, + "router_z_loss_mlp": 0.37939453, + "step": 3284, + "time_per_iteration": 2.767197608947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074338, + "balance_loss_mlp": 1.03759754, + "epoch": 0.6319738360908042, + "flos": 497876046336.0, + "grad_norm": 0.052075562898617506, + "language_loss": 0.81654066, + "learning_rate": 0.00031508062963872655, + "loss": 0.82728398, + "num_input_tokens_seen": 273666000, + "router_z_loss_mlp": 0.36743164, + "step": 3285, + "time_per_iteration": 2.6035704612731934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076725, + "balance_loss_mlp": 1.03836393, + "epoch": 0.6321662177760677, + "flos": 579474362880.0, + "grad_norm": 0.07288308638623867, + "language_loss": 0.79200375, + "learning_rate": 0.0003147912128514423, + "loss": 0.80277097, + "num_input_tokens_seen": 273742672, + "router_z_loss_mlp": 0.38330078, + "step": 3286, + "time_per_iteration": 2.716641426086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076397, + "balance_loss_mlp": 1.04046774, + "epoch": 0.6323585994613313, + "flos": 601207460352.0, + "grad_norm": 0.06971940923573844, + "language_loss": 0.8695125, + "learning_rate": 0.0003145018679685859, + "loss": 0.88027644, + "num_input_tokens_seen": 273813984, + "router_z_loss_mlp": 0.35913086, + "step": 3287, + "time_per_iteration": 2.7455978393554688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107146, + "balance_loss_mlp": 1.03579235, + "epoch": 0.6325509811465948, + "flos": 528261875712.0, + "grad_norm": 0.04384193895060619, + "language_loss": 0.8763777, + "learning_rate": 0.00031421259510249134, + "loss": 0.88709229, + "num_input_tokens_seen": 273892848, + "router_z_loss_mlp": 0.35717773, + "step": 3288, + "time_per_iteration": 2.760524034500122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070765, + "balance_loss_mlp": 1.03397667, + "epoch": 0.6327433628318584, + "flos": 573993464832.0, + "grad_norm": 0.05235334627417233, + "language_loss": 0.81302404, + "learning_rate": 0.00031392339436546414, + "loss": 0.82373166, + "num_input_tokens_seen": 273971696, + "router_z_loss_mlp": 0.36791992, + "step": 3289, + "time_per_iteration": 2.8397610187530518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075193, + "balance_loss_mlp": 1.03876281, + "epoch": 0.632935744517122, + "flos": 516833008128.0, + "grad_norm": 0.06389388194591325, + "language_loss": 0.83106172, + "learning_rate": 0.00031363426586978205, + "loss": 0.84181368, + "num_input_tokens_seen": 274048096, + "router_z_loss_mlp": 0.36450195, + "step": 3290, + "time_per_iteration": 2.7519772052764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071007, + "balance_loss_mlp": 1.03438592, + "epoch": 0.6331281262023856, + "flos": 617180262912.0, + "grad_norm": 0.051172787966305235, + "language_loss": 0.84358442, + "learning_rate": 0.0003133452097276947, + "loss": 0.85429454, + "num_input_tokens_seen": 274122848, + "router_z_loss_mlp": 0.3659668, + "step": 3291, + "time_per_iteration": 2.7666964530944824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066726, + "balance_loss_mlp": 1.03060579, + "epoch": 0.633320507887649, + "flos": 592665237504.0, + "grad_norm": 0.04649406007551123, + "language_loss": 0.84316128, + "learning_rate": 0.0003130562260514238, + "loss": 0.85382849, + "num_input_tokens_seen": 274198320, + "router_z_loss_mlp": 0.36132812, + "step": 3292, + "time_per_iteration": 2.7349252700805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107055, + "balance_loss_mlp": 1.03373802, + "epoch": 0.6335128895729126, + "flos": 582064233984.0, + "grad_norm": 0.04554083300278307, + "language_loss": 0.81461787, + "learning_rate": 0.0003127673149531626, + "loss": 0.82532346, + "num_input_tokens_seen": 274274944, + "router_z_loss_mlp": 0.36791992, + "step": 3293, + "time_per_iteration": 2.777203321456909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068218, + "balance_loss_mlp": 1.03150177, + "epoch": 0.6337052712581762, + "flos": 452803585536.0, + "grad_norm": 0.06587876286418191, + "language_loss": 0.83099329, + "learning_rate": 0.0003124784765450762, + "loss": 0.84167558, + "num_input_tokens_seen": 274342384, + "router_z_loss_mlp": 0.3671875, + "step": 3294, + "time_per_iteration": 2.5272936820983887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076823, + "balance_loss_mlp": 1.0392009, + "epoch": 0.6338976529434398, + "flos": 573132105216.0, + "grad_norm": 0.07565645338931325, + "language_loss": 0.80265319, + "learning_rate": 0.0003121897109393017, + "loss": 0.81342143, + "num_input_tokens_seen": 274417568, + "router_z_loss_mlp": 0.37597656, + "step": 3295, + "time_per_iteration": 2.7182729244232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069809, + "balance_loss_mlp": 1.03318739, + "epoch": 0.6340900346287034, + "flos": 508497398784.0, + "grad_norm": 0.45372890194936744, + "language_loss": 0.89147079, + "learning_rate": 0.0003119010182479481, + "loss": 0.90216893, + "num_input_tokens_seen": 274488960, + "router_z_loss_mlp": 0.36621094, + "step": 3296, + "time_per_iteration": 2.613863706588745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076319, + "balance_loss_mlp": 1.0396266, + "epoch": 0.6342824163139669, + "flos": 479505429504.0, + "grad_norm": 0.05534198375005729, + "language_loss": 0.82468164, + "learning_rate": 0.00031161239858309563, + "loss": 0.83544481, + "num_input_tokens_seen": 274556880, + "router_z_loss_mlp": 0.36669922, + "step": 3297, + "time_per_iteration": 2.581540822982788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107642, + "balance_loss_mlp": 1.03917897, + "epoch": 0.6344747979992305, + "flos": 571762976256.0, + "grad_norm": 0.05796983524113203, + "language_loss": 0.8309406, + "learning_rate": 0.0003113238520567964, + "loss": 0.84170485, + "num_input_tokens_seen": 274624944, + "router_z_loss_mlp": 0.37182617, + "step": 3298, + "time_per_iteration": 2.666191816329956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077442, + "balance_loss_mlp": 1.04082084, + "epoch": 0.634667179684494, + "flos": 605629149696.0, + "grad_norm": 0.056114886928888365, + "language_loss": 0.81702375, + "learning_rate": 0.00031103537878107403, + "loss": 0.82779819, + "num_input_tokens_seen": 274695152, + "router_z_loss_mlp": 0.36621094, + "step": 3299, + "time_per_iteration": 2.7362561225891113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080646, + "balance_loss_mlp": 1.04311848, + "epoch": 0.6348595613697576, + "flos": 646649478144.0, + "grad_norm": 0.06007496440704036, + "language_loss": 0.80261421, + "learning_rate": 0.0003107469788679238, + "loss": 0.81342065, + "num_input_tokens_seen": 274767840, + "router_z_loss_mlp": 0.37475586, + "step": 3300, + "time_per_iteration": 2.756533622741699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073194, + "balance_loss_mlp": 1.03597736, + "epoch": 0.6350519430550212, + "flos": 638776558080.0, + "grad_norm": 0.05358633946635087, + "language_loss": 0.86808562, + "learning_rate": 0.00031045865242931267, + "loss": 0.87881756, + "num_input_tokens_seen": 274839312, + "router_z_loss_mlp": 0.37207031, + "step": 3301, + "time_per_iteration": 2.829094171524048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080134, + "balance_loss_mlp": 1.043203, + "epoch": 0.6352443247402847, + "flos": 686091091968.0, + "grad_norm": 0.0476034793432377, + "language_loss": 0.83036846, + "learning_rate": 0.00031017039957717877, + "loss": 0.84116983, + "num_input_tokens_seen": 274922704, + "router_z_loss_mlp": 0.36938477, + "step": 3302, + "time_per_iteration": 2.9974441528320312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073106, + "balance_loss_mlp": 1.03588891, + "epoch": 0.6354367064255483, + "flos": 559173003264.0, + "grad_norm": 0.056110934582374906, + "language_loss": 0.88712031, + "learning_rate": 0.0003098822204234318, + "loss": 0.89785135, + "num_input_tokens_seen": 274992848, + "router_z_loss_mlp": 0.37207031, + "step": 3303, + "time_per_iteration": 2.6585702896118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076324, + "balance_loss_mlp": 1.03984571, + "epoch": 0.6356290881108119, + "flos": 979095582720.0, + "grad_norm": 0.062320507603927815, + "language_loss": 0.8736068, + "learning_rate": 0.00030959411507995273, + "loss": 0.88437009, + "num_input_tokens_seen": 275071456, + "router_z_loss_mlp": 0.36499023, + "step": 3304, + "time_per_iteration": 3.2019383907318115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079953, + "balance_loss_mlp": 1.04299855, + "epoch": 0.6358214697960755, + "flos": 528005799936.0, + "grad_norm": 0.05770730921560322, + "language_loss": 0.80951726, + "learning_rate": 0.00030930608365859407, + "loss": 0.82031679, + "num_input_tokens_seen": 275140512, + "router_z_loss_mlp": 0.36938477, + "step": 3305, + "time_per_iteration": 2.6649279594421387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073678, + "balance_loss_mlp": 1.03793883, + "epoch": 0.6360138514813389, + "flos": 516547819008.0, + "grad_norm": 0.050398763649548706, + "language_loss": 0.87612951, + "learning_rate": 0.00030901812627117943, + "loss": 0.88686621, + "num_input_tokens_seen": 275210896, + "router_z_loss_mlp": 0.35791016, + "step": 3306, + "time_per_iteration": 2.6524715423583984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072331, + "balance_loss_mlp": 1.0352571, + "epoch": 0.6362062331666025, + "flos": 466289823744.0, + "grad_norm": 0.06392175432949986, + "language_loss": 0.84607399, + "learning_rate": 0.000308730243029504, + "loss": 0.85679734, + "num_input_tokens_seen": 275279888, + "router_z_loss_mlp": 0.37084961, + "step": 3307, + "time_per_iteration": 2.619936943054199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080745, + "balance_loss_mlp": 1.04407644, + "epoch": 0.6363986148518661, + "flos": 549458090496.0, + "grad_norm": 0.0791847929194259, + "language_loss": 0.79674953, + "learning_rate": 0.0003084424340453339, + "loss": 0.80755699, + "num_input_tokens_seen": 275357056, + "router_z_loss_mlp": 0.36669922, + "step": 3308, + "time_per_iteration": 2.847384214401245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074674, + "balance_loss_mlp": 1.03688467, + "epoch": 0.6365909965371297, + "flos": 582772824576.0, + "grad_norm": 0.10517797210671455, + "language_loss": 0.82179588, + "learning_rate": 0.0003081546994304064, + "loss": 0.8325426, + "num_input_tokens_seen": 275428240, + "router_z_loss_mlp": 0.37744141, + "step": 3309, + "time_per_iteration": 2.745880365371704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073786, + "balance_loss_mlp": 1.03644967, + "epoch": 0.6367833782223933, + "flos": 530998723584.0, + "grad_norm": 0.05446787183102227, + "language_loss": 0.8192482, + "learning_rate": 0.0003078670392964298, + "loss": 0.8299861, + "num_input_tokens_seen": 275497568, + "router_z_loss_mlp": 0.37304688, + "step": 3310, + "time_per_iteration": 2.6298861503601074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075946, + "balance_loss_mlp": 1.03896689, + "epoch": 0.6369757599076568, + "flos": 569237124096.0, + "grad_norm": 0.05047878610686386, + "language_loss": 0.82755494, + "learning_rate": 0.00030757945375508406, + "loss": 0.83831441, + "num_input_tokens_seen": 275569616, + "router_z_loss_mlp": 0.36938477, + "step": 3311, + "time_per_iteration": 2.6519951820373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074481, + "balance_loss_mlp": 1.03652477, + "epoch": 0.6371681415929203, + "flos": 539684951040.0, + "grad_norm": 0.05551115328113397, + "language_loss": 0.81331229, + "learning_rate": 0.00030729194291801944, + "loss": 0.8240571, + "num_input_tokens_seen": 275641408, + "router_z_loss_mlp": 0.37915039, + "step": 3312, + "time_per_iteration": 2.6647114753723145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078542, + "balance_loss_mlp": 1.04089594, + "epoch": 0.6373605232781839, + "flos": 483326217216.0, + "grad_norm": 0.05317823086949404, + "language_loss": 0.76999873, + "learning_rate": 0.00030700450689685787, + "loss": 0.78078413, + "num_input_tokens_seen": 275706608, + "router_z_loss_mlp": 0.37646484, + "step": 3313, + "time_per_iteration": 2.517679452896118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072791, + "balance_loss_mlp": 1.03700447, + "epoch": 0.6375529049634475, + "flos": 578273969664.0, + "grad_norm": 0.05477509929208262, + "language_loss": 0.85436654, + "learning_rate": 0.00030671714580319186, + "loss": 0.86509454, + "num_input_tokens_seen": 275785952, + "router_z_loss_mlp": 0.35839844, + "step": 3314, + "time_per_iteration": 2.83425235748291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078643, + "balance_loss_mlp": 1.04118717, + "epoch": 0.637745286648711, + "flos": 681953181696.0, + "grad_norm": 0.05493703572973109, + "language_loss": 0.83096623, + "learning_rate": 0.0003064298597485846, + "loss": 0.84175265, + "num_input_tokens_seen": 275866240, + "router_z_loss_mlp": 0.37426758, + "step": 3315, + "time_per_iteration": 2.8374462127685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107089, + "balance_loss_mlp": 1.03472173, + "epoch": 0.6379376683339746, + "flos": 504385629696.0, + "grad_norm": 0.05328451247600945, + "language_loss": 0.83983094, + "learning_rate": 0.00030614264884457054, + "loss": 0.8505398, + "num_input_tokens_seen": 275936176, + "router_z_loss_mlp": 0.36181641, + "step": 3316, + "time_per_iteration": 2.6181318759918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076564, + "balance_loss_mlp": 1.03896546, + "epoch": 0.6381300500192382, + "flos": 501771027456.0, + "grad_norm": 0.05692902887495298, + "language_loss": 0.77128184, + "learning_rate": 0.000305855513202655, + "loss": 0.78204751, + "num_input_tokens_seen": 276004608, + "router_z_loss_mlp": 0.37573242, + "step": 3317, + "time_per_iteration": 2.570751190185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072223, + "balance_loss_mlp": 1.03574491, + "epoch": 0.6383224317045018, + "flos": 400271431680.0, + "grad_norm": 0.0603897585499684, + "language_loss": 0.77435303, + "learning_rate": 0.0003055684529343138, + "loss": 0.78507531, + "num_input_tokens_seen": 276066688, + "router_z_loss_mlp": 0.36474609, + "step": 3318, + "time_per_iteration": 2.4171056747436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068524, + "balance_loss_mlp": 1.03249943, + "epoch": 0.6385148133897653, + "flos": 499131694080.0, + "grad_norm": 0.06663651312006989, + "language_loss": 0.78354919, + "learning_rate": 0.00030528146815099374, + "loss": 0.79423445, + "num_input_tokens_seen": 276140000, + "router_z_loss_mlp": 0.36010742, + "step": 3319, + "time_per_iteration": 2.5991523265838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072343, + "balance_loss_mlp": 1.03603208, + "epoch": 0.6387071950750288, + "flos": 527409280512.0, + "grad_norm": 0.04641062645518834, + "language_loss": 0.71934807, + "learning_rate": 0.00030499455896411203, + "loss": 0.73007143, + "num_input_tokens_seen": 276209840, + "router_z_loss_mlp": 0.36376953, + "step": 3320, + "time_per_iteration": 2.601541519165039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047736, + "balance_loss_mlp": 1.03734136, + "epoch": 0.6388995767602924, + "flos": 1455200501760.0, + "grad_norm": 0.026504664974818824, + "language_loss": 0.76300812, + "learning_rate": 0.0003047077254850568, + "loss": 0.77348548, + "num_input_tokens_seen": 276444784, + "router_z_loss_mlp": 0.10400391, + "step": 3321, + "time_per_iteration": 4.919625997543335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070963, + "balance_loss_mlp": 1.03417492, + "epoch": 0.639091958445556, + "flos": 603577571328.0, + "grad_norm": 0.051172481389266875, + "language_loss": 0.76476693, + "learning_rate": 0.0003044209678251865, + "loss": 0.77547657, + "num_input_tokens_seen": 276522768, + "router_z_loss_mlp": 0.36791992, + "step": 3322, + "time_per_iteration": 2.9173965454101562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070731, + "balance_loss_mlp": 1.03406262, + "epoch": 0.6392843401308196, + "flos": 584230703616.0, + "grad_norm": 0.062017563043543965, + "language_loss": 0.84732592, + "learning_rate": 0.0003041342860958306, + "loss": 0.85803324, + "num_input_tokens_seen": 276597104, + "router_z_loss_mlp": 0.36694336, + "step": 3323, + "time_per_iteration": 2.751882791519165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069123, + "balance_loss_mlp": 1.03269315, + "epoch": 0.6394767218160831, + "flos": 514420637184.0, + "grad_norm": 0.054747759386293726, + "language_loss": 0.91800594, + "learning_rate": 0.00030384768040828857, + "loss": 0.92869711, + "num_input_tokens_seen": 276670256, + "router_z_loss_mlp": 0.36425781, + "step": 3324, + "time_per_iteration": 2.6570470333099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070533, + "balance_loss_mlp": 1.03314865, + "epoch": 0.6396691035013466, + "flos": 541471689216.0, + "grad_norm": 0.049915114464213116, + "language_loss": 0.85503262, + "learning_rate": 0.00030356115087383094, + "loss": 0.86573792, + "num_input_tokens_seen": 276737680, + "router_z_loss_mlp": 0.3737793, + "step": 3325, + "time_per_iteration": 2.620593309402466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068087, + "balance_loss_mlp": 1.03115582, + "epoch": 0.6398614851866102, + "flos": 525282098688.0, + "grad_norm": 0.05721597206599544, + "language_loss": 0.84746885, + "learning_rate": 0.00030327469760369803, + "loss": 0.85814971, + "num_input_tokens_seen": 276803808, + "router_z_loss_mlp": 0.36938477, + "step": 3326, + "time_per_iteration": 2.600210428237915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070931, + "balance_loss_mlp": 1.03342783, + "epoch": 0.6400538668718738, + "flos": 622704830976.0, + "grad_norm": 0.3477735947082266, + "language_loss": 0.85250199, + "learning_rate": 0.0003029883207091009, + "loss": 0.86321133, + "num_input_tokens_seen": 276874752, + "router_z_loss_mlp": 0.375, + "step": 3327, + "time_per_iteration": 2.7323827743530273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065435, + "balance_loss_mlp": 1.02910042, + "epoch": 0.6402462485571374, + "flos": 503096486400.0, + "grad_norm": 0.053886182744941745, + "language_loss": 0.78170431, + "learning_rate": 0.00030270202030122095, + "loss": 0.79235864, + "num_input_tokens_seen": 276947200, + "router_z_loss_mlp": 0.36328125, + "step": 3328, + "time_per_iteration": 2.6563096046447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107159, + "balance_loss_mlp": 1.03310895, + "epoch": 0.6404386302424009, + "flos": 818894693376.0, + "grad_norm": 0.06347117361698136, + "language_loss": 0.85806334, + "learning_rate": 0.00030241579649121, + "loss": 0.86877924, + "num_input_tokens_seen": 277025712, + "router_z_loss_mlp": 0.38476562, + "step": 3329, + "time_per_iteration": 2.9936435222625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066015, + "balance_loss_mlp": 1.02901256, + "epoch": 0.6406310119276645, + "flos": 471568490496.0, + "grad_norm": 0.05226197441387588, + "language_loss": 0.79091239, + "learning_rate": 0.00030212964939018994, + "loss": 0.8015725, + "num_input_tokens_seen": 277091264, + "router_z_loss_mlp": 0.37011719, + "step": 3330, + "time_per_iteration": 2.5639078617095947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107031, + "balance_loss_mlp": 1.0323776, + "epoch": 0.6408233936129281, + "flos": 425358245376.0, + "grad_norm": 0.06341229452326952, + "language_loss": 0.85196972, + "learning_rate": 0.0003018435791092527, + "loss": 0.86267287, + "num_input_tokens_seen": 277154608, + "router_z_loss_mlp": 0.37890625, + "step": 3331, + "time_per_iteration": 2.4909286499023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067837, + "balance_loss_mlp": 1.0288794, + "epoch": 0.6410157752981916, + "flos": 549522109440.0, + "grad_norm": 0.052178008313766185, + "language_loss": 0.81084096, + "learning_rate": 0.00030155758575946083, + "loss": 0.82151937, + "num_input_tokens_seen": 277222176, + "router_z_loss_mlp": 0.3894043, + "step": 3332, + "time_per_iteration": 2.64400315284729 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069681, + "balance_loss_mlp": 1.03246343, + "epoch": 0.6412081569834551, + "flos": 475659910656.0, + "grad_norm": 0.056966090936169146, + "language_loss": 0.83717507, + "learning_rate": 0.0003012716694518467, + "loss": 0.84787184, + "num_input_tokens_seen": 277289600, + "router_z_loss_mlp": 0.37231445, + "step": 3333, + "time_per_iteration": 2.5760622024536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068182, + "balance_loss_mlp": 1.02998757, + "epoch": 0.6414005386687187, + "flos": 540645235200.0, + "grad_norm": 0.0733128954911655, + "language_loss": 0.85120058, + "learning_rate": 0.000300985830297413, + "loss": 0.86188245, + "num_input_tokens_seen": 277362784, + "router_z_loss_mlp": 0.3815918, + "step": 3334, + "time_per_iteration": 2.6769511699676514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068689, + "balance_loss_mlp": 1.03187692, + "epoch": 0.6415929203539823, + "flos": 1040909073408.0, + "grad_norm": 0.0544756341035146, + "language_loss": 0.87377876, + "learning_rate": 0.00030070006840713205, + "loss": 0.88446569, + "num_input_tokens_seen": 277449728, + "router_z_loss_mlp": 0.36865234, + "step": 3335, + "time_per_iteration": 3.3541831970214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070768, + "balance_loss_mlp": 1.03398037, + "epoch": 0.6417853020392459, + "flos": 648028781568.0, + "grad_norm": 0.051565037343947635, + "language_loss": 0.73971063, + "learning_rate": 0.000300414383891947, + "loss": 0.75041831, + "num_input_tokens_seen": 277527552, + "router_z_loss_mlp": 0.36791992, + "step": 3336, + "time_per_iteration": 2.802199602127075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072657, + "balance_loss_mlp": 1.03536844, + "epoch": 0.6419776837245095, + "flos": 500639035392.0, + "grad_norm": 0.04995187191956455, + "language_loss": 0.88918942, + "learning_rate": 0.00030012877686276973, + "loss": 0.89991605, + "num_input_tokens_seen": 277603568, + "router_z_loss_mlp": 0.37280273, + "step": 3337, + "time_per_iteration": 2.69291090965271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107373, + "balance_loss_mlp": 1.03677511, + "epoch": 0.642170065409773, + "flos": 620331747840.0, + "grad_norm": 0.054761035667788324, + "language_loss": 0.86300218, + "learning_rate": 0.0002998432474304832, + "loss": 0.87373948, + "num_input_tokens_seen": 277679696, + "router_z_loss_mlp": 0.36914062, + "step": 3338, + "time_per_iteration": 2.773374319076538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015993, + "balance_loss_mlp": 1.00283277, + "epoch": 0.6423624470950365, + "flos": 1422767476224.0, + "grad_norm": 0.016749722719595034, + "language_loss": 0.79237342, + "learning_rate": 0.0002995577957059395, + "loss": 0.80253339, + "num_input_tokens_seen": 277913056, + "router_z_loss_mlp": 0.13183594, + "step": 3339, + "time_per_iteration": 4.874187231063843 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073859, + "balance_loss_mlp": 1.03788161, + "epoch": 0.6425548287803001, + "flos": 562082969088.0, + "grad_norm": 0.04482420298263986, + "language_loss": 0.88213849, + "learning_rate": 0.00029927242179996107, + "loss": 0.8928771, + "num_input_tokens_seen": 277983168, + "router_z_loss_mlp": 0.36010742, + "step": 3340, + "time_per_iteration": 2.665893077850342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068858, + "balance_loss_mlp": 1.03240371, + "epoch": 0.6427472104655637, + "flos": 585151699968.0, + "grad_norm": 0.04629279799595454, + "language_loss": 0.83241612, + "learning_rate": 0.0002989871258233398, + "loss": 0.84310472, + "num_input_tokens_seen": 278057600, + "router_z_loss_mlp": 0.36474609, + "step": 3341, + "time_per_iteration": 2.7554104328155518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076157, + "balance_loss_mlp": 1.03927386, + "epoch": 0.6429395921508272, + "flos": 404067488256.0, + "grad_norm": 0.0587599408215441, + "language_loss": 0.82722974, + "learning_rate": 0.0002987019078868373, + "loss": 0.8379913, + "num_input_tokens_seen": 278119232, + "router_z_loss_mlp": 0.36865234, + "step": 3342, + "time_per_iteration": 2.4214284420013428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074694, + "balance_loss_mlp": 1.03742945, + "epoch": 0.6431319738360908, + "flos": 548522537472.0, + "grad_norm": 0.05743775119998274, + "language_loss": 0.8159622, + "learning_rate": 0.00029841676810118484, + "loss": 0.82670915, + "num_input_tokens_seen": 278187456, + "router_z_loss_mlp": 0.37231445, + "step": 3343, + "time_per_iteration": 2.6899335384368896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070744, + "balance_loss_mlp": 1.03390789, + "epoch": 0.6433243555213544, + "flos": 793044034560.0, + "grad_norm": 0.05135608784833761, + "language_loss": 0.87229836, + "learning_rate": 0.0002981317065770839, + "loss": 0.8830058, + "num_input_tokens_seen": 278262176, + "router_z_loss_mlp": 0.36816406, + "step": 3344, + "time_per_iteration": 3.038647413253784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075006, + "balance_loss_mlp": 1.03771782, + "epoch": 0.643516737206618, + "flos": 582762650112.0, + "grad_norm": 0.05966061417455641, + "language_loss": 0.80907631, + "learning_rate": 0.00029784672342520493, + "loss": 0.81982636, + "num_input_tokens_seen": 278328816, + "router_z_loss_mlp": 0.37231445, + "step": 3345, + "time_per_iteration": 2.6487960815429688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106957, + "balance_loss_mlp": 1.03244793, + "epoch": 0.6437091188918815, + "flos": 518501882880.0, + "grad_norm": 0.05291983306106443, + "language_loss": 0.83733785, + "learning_rate": 0.00029756181875618834, + "loss": 0.84803355, + "num_input_tokens_seen": 278395824, + "router_z_loss_mlp": 0.37133789, + "step": 3346, + "time_per_iteration": 2.5655863285064697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073171, + "balance_loss_mlp": 1.03671718, + "epoch": 0.643901500577145, + "flos": 384736587264.0, + "grad_norm": 0.05666313029634666, + "language_loss": 0.83381206, + "learning_rate": 0.0002972769926806439, + "loss": 0.84454376, + "num_input_tokens_seen": 278457696, + "router_z_loss_mlp": 0.36474609, + "step": 3347, + "time_per_iteration": 2.456300735473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078252, + "balance_loss_mlp": 1.04122531, + "epoch": 0.6440938822624086, + "flos": 483478986240.0, + "grad_norm": 0.05181671155605703, + "language_loss": 0.88556045, + "learning_rate": 0.0002969922453091508, + "loss": 0.89634299, + "num_input_tokens_seen": 278526992, + "router_z_loss_mlp": 0.37036133, + "step": 3348, + "time_per_iteration": 2.5434532165527344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078332, + "balance_loss_mlp": 1.04104328, + "epoch": 0.6442862639476722, + "flos": 540178163712.0, + "grad_norm": 0.04671333484936929, + "language_loss": 0.85028982, + "learning_rate": 0.00029670757675225777, + "loss": 0.86107314, + "num_input_tokens_seen": 278601120, + "router_z_loss_mlp": 0.37255859, + "step": 3349, + "time_per_iteration": 2.7254116535186768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073632, + "balance_loss_mlp": 1.03715396, + "epoch": 0.6444786456329358, + "flos": 526651227648.0, + "grad_norm": 0.06388805390045102, + "language_loss": 0.7939328, + "learning_rate": 0.0002964229871204831, + "loss": 0.80466914, + "num_input_tokens_seen": 278668208, + "router_z_loss_mlp": 0.36474609, + "step": 3350, + "time_per_iteration": 2.623533248901367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107868, + "balance_loss_mlp": 1.04274988, + "epoch": 0.6446710273181993, + "flos": 697576776192.0, + "grad_norm": 0.05363118847235426, + "language_loss": 0.83167213, + "learning_rate": 0.00029613847652431403, + "loss": 0.84245896, + "num_input_tokens_seen": 278742832, + "router_z_loss_mlp": 0.35961914, + "step": 3351, + "time_per_iteration": 2.835373640060425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081252, + "balance_loss_mlp": 1.04536986, + "epoch": 0.6448634090034628, + "flos": 624705384960.0, + "grad_norm": 0.04827389624860956, + "language_loss": 0.79376614, + "learning_rate": 0.0002958540450742078, + "loss": 0.8045786, + "num_input_tokens_seen": 278829744, + "router_z_loss_mlp": 0.35864258, + "step": 3352, + "time_per_iteration": 2.905045986175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078994, + "balance_loss_mlp": 1.04175305, + "epoch": 0.6450557906887264, + "flos": 600647256576.0, + "grad_norm": 0.04612026708575604, + "language_loss": 0.77379197, + "learning_rate": 0.0002955696928805901, + "loss": 0.7845819, + "num_input_tokens_seen": 278908592, + "router_z_loss_mlp": 0.37231445, + "step": 3353, + "time_per_iteration": 2.899186372756958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081177, + "balance_loss_mlp": 1.04536617, + "epoch": 0.64524817237399, + "flos": 645905981952.0, + "grad_norm": 0.050963182313219675, + "language_loss": 0.86320436, + "learning_rate": 0.0002952854200538563, + "loss": 0.87401617, + "num_input_tokens_seen": 278986960, + "router_z_loss_mlp": 0.35839844, + "step": 3354, + "time_per_iteration": 2.7646782398223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107986, + "balance_loss_mlp": 1.04366803, + "epoch": 0.6454405540592536, + "flos": 473173346304.0, + "grad_norm": 0.05160537421710046, + "language_loss": 0.82000065, + "learning_rate": 0.000295001226704371, + "loss": 0.83079934, + "num_input_tokens_seen": 279054896, + "router_z_loss_mlp": 0.36206055, + "step": 3355, + "time_per_iteration": 2.5571465492248535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080502, + "balance_loss_mlp": 1.04357088, + "epoch": 0.6456329357445171, + "flos": 611548005888.0, + "grad_norm": 0.052373080936441004, + "language_loss": 0.8272965, + "learning_rate": 0.00029471711294246783, + "loss": 0.83810151, + "num_input_tokens_seen": 279126816, + "router_z_loss_mlp": 0.36914062, + "step": 3356, + "time_per_iteration": 2.829554796218872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075402, + "balance_loss_mlp": 1.03890061, + "epoch": 0.6458253174297807, + "flos": 731373138432.0, + "grad_norm": 0.05569683801855411, + "language_loss": 0.82248133, + "learning_rate": 0.0002944330788784494, + "loss": 0.83323538, + "num_input_tokens_seen": 279197552, + "router_z_loss_mlp": 0.36499023, + "step": 3357, + "time_per_iteration": 2.93203067779541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079005, + "balance_loss_mlp": 1.04276562, + "epoch": 0.6460176991150443, + "flos": 570129007104.0, + "grad_norm": 0.050517424210504216, + "language_loss": 0.84506869, + "learning_rate": 0.00029414912462258786, + "loss": 0.8558588, + "num_input_tokens_seen": 279275440, + "router_z_loss_mlp": 0.36254883, + "step": 3358, + "time_per_iteration": 2.819854259490967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077222, + "balance_loss_mlp": 1.0391469, + "epoch": 0.6462100808003078, + "flos": 582890688000.0, + "grad_norm": 0.05841825537720819, + "language_loss": 0.81327105, + "learning_rate": 0.00029386525028512366, + "loss": 0.82404327, + "num_input_tokens_seen": 279349168, + "router_z_loss_mlp": 0.38037109, + "step": 3359, + "time_per_iteration": 2.698640823364258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081177, + "balance_loss_mlp": 1.04388809, + "epoch": 0.6464024624855714, + "flos": 483647721984.0, + "grad_norm": 0.05190666328104424, + "language_loss": 0.87126404, + "learning_rate": 0.0002935814559762666, + "loss": 0.88207585, + "num_input_tokens_seen": 279427600, + "router_z_loss_mlp": 0.37329102, + "step": 3360, + "time_per_iteration": 2.768366575241089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081829, + "balance_loss_mlp": 1.0439682, + "epoch": 0.6465948441708349, + "flos": 527508205056.0, + "grad_norm": 0.050745239197886684, + "language_loss": 0.79334629, + "learning_rate": 0.0002932977418061957, + "loss": 0.80416453, + "num_input_tokens_seen": 279496608, + "router_z_loss_mlp": 0.37841797, + "step": 3361, + "time_per_iteration": 2.632948637008667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082687, + "balance_loss_mlp": 1.04582703, + "epoch": 0.6467872258560985, + "flos": 669121689600.0, + "grad_norm": 0.06228301103005666, + "language_loss": 0.80853021, + "learning_rate": 0.00029301410788505833, + "loss": 0.81935704, + "num_input_tokens_seen": 279568448, + "router_z_loss_mlp": 0.3684082, + "step": 3362, + "time_per_iteration": 2.7769224643707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084618, + "balance_loss_mlp": 1.04833102, + "epoch": 0.6469796075413621, + "flos": 431867828736.0, + "grad_norm": 0.06087250960931665, + "language_loss": 0.8065362, + "learning_rate": 0.00029273055432297126, + "loss": 0.81738234, + "num_input_tokens_seen": 279631952, + "router_z_loss_mlp": 0.36328125, + "step": 3363, + "time_per_iteration": 2.484450101852417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084285, + "balance_loss_mlp": 1.04611397, + "epoch": 0.6471719892266257, + "flos": 803413693440.0, + "grad_norm": 0.05541447784561029, + "language_loss": 0.80514741, + "learning_rate": 0.00029244708123001917, + "loss": 0.81599021, + "num_input_tokens_seen": 279706880, + "router_z_loss_mlp": 0.3815918, + "step": 3364, + "time_per_iteration": 2.9762370586395264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081353, + "balance_loss_mlp": 1.04387355, + "epoch": 0.6473643709118891, + "flos": 576923779584.0, + "grad_norm": 0.051117290397423236, + "language_loss": 0.84345543, + "learning_rate": 0.0002921636887162565, + "loss": 0.85426897, + "num_input_tokens_seen": 279778864, + "router_z_loss_mlp": 0.37451172, + "step": 3365, + "time_per_iteration": 2.72733736038208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085471, + "balance_loss_mlp": 1.04930282, + "epoch": 0.6475567525971527, + "flos": 761079490560.0, + "grad_norm": 0.06137767127044858, + "language_loss": 0.83554536, + "learning_rate": 0.00029188037689170595, + "loss": 0.84640002, + "num_input_tokens_seen": 279853328, + "router_z_loss_mlp": 0.36181641, + "step": 3366, + "time_per_iteration": 2.962611675262451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081175, + "balance_loss_mlp": 1.04474497, + "epoch": 0.6477491342824163, + "flos": 842754972672.0, + "grad_norm": 0.05371519731752011, + "language_loss": 0.83465898, + "learning_rate": 0.0002915971458663586, + "loss": 0.84547073, + "num_input_tokens_seen": 279928464, + "router_z_loss_mlp": 0.36450195, + "step": 3367, + "time_per_iteration": 3.043851137161255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082146, + "balance_loss_mlp": 1.04545331, + "epoch": 0.6479415159676799, + "flos": 884431457280.0, + "grad_norm": 0.05567471086198027, + "language_loss": 0.81976676, + "learning_rate": 0.00029131399575017494, + "loss": 0.83058822, + "num_input_tokens_seen": 280015680, + "router_z_loss_mlp": 0.36669922, + "step": 3368, + "time_per_iteration": 3.16506290435791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072939, + "balance_loss_mlp": 1.0362463, + "epoch": 0.6481338976529435, + "flos": 615211642368.0, + "grad_norm": 0.04146272732833695, + "language_loss": 0.85776877, + "learning_rate": 0.0002910309266530836, + "loss": 0.86849815, + "num_input_tokens_seen": 280093904, + "router_z_loss_mlp": 0.36694336, + "step": 3369, + "time_per_iteration": 2.810415267944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082428, + "balance_loss_mlp": 1.04485345, + "epoch": 0.648326279338207, + "flos": 509757428736.0, + "grad_norm": 0.047563398394336556, + "language_loss": 0.85364866, + "learning_rate": 0.0002907479386849814, + "loss": 0.86447287, + "num_input_tokens_seen": 280161584, + "router_z_loss_mlp": 0.37573242, + "step": 3370, + "time_per_iteration": 2.6234049797058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079087, + "balance_loss_mlp": 1.04258549, + "epoch": 0.6485186610234706, + "flos": 702157026816.0, + "grad_norm": 0.05547979254265798, + "language_loss": 0.79903388, + "learning_rate": 0.0002904650319557339, + "loss": 0.80982471, + "num_input_tokens_seen": 280248016, + "router_z_loss_mlp": 0.36523438, + "step": 3371, + "time_per_iteration": 3.052445411682129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077959, + "balance_loss_mlp": 1.04148114, + "epoch": 0.6487110427087341, + "flos": 560418476544.0, + "grad_norm": 0.10081589784895977, + "language_loss": 0.80853498, + "learning_rate": 0.0002901822065751758, + "loss": 0.81931454, + "num_input_tokens_seen": 280319024, + "router_z_loss_mlp": 0.36499023, + "step": 3372, + "time_per_iteration": 2.679738759994507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072935, + "balance_loss_mlp": 1.03614688, + "epoch": 0.6489034243939977, + "flos": 679801268736.0, + "grad_norm": 0.07558571237012199, + "language_loss": 0.85327506, + "learning_rate": 0.0002898994626531093, + "loss": 0.86400437, + "num_input_tokens_seen": 280393200, + "router_z_loss_mlp": 0.36767578, + "step": 3373, + "time_per_iteration": 2.8318021297454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078194, + "balance_loss_mlp": 1.04131091, + "epoch": 0.6490958060792612, + "flos": 474172918272.0, + "grad_norm": 0.04995126846613369, + "language_loss": 0.87709844, + "learning_rate": 0.00028961680029930526, + "loss": 0.88788044, + "num_input_tokens_seen": 280456944, + "router_z_loss_mlp": 0.36865234, + "step": 3374, + "time_per_iteration": 2.550858736038208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107378, + "balance_loss_mlp": 1.03751612, + "epoch": 0.6492881877645248, + "flos": 588563642880.0, + "grad_norm": 0.053073331698041674, + "language_loss": 0.76720631, + "learning_rate": 0.00028933421962350317, + "loss": 0.77794409, + "num_input_tokens_seen": 280534352, + "router_z_loss_mlp": 0.36279297, + "step": 3375, + "time_per_iteration": 2.7313249111175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073863, + "balance_loss_mlp": 1.0367415, + "epoch": 0.6494805694497884, + "flos": 642139038720.0, + "grad_norm": 0.0646432947435949, + "language_loss": 0.84017503, + "learning_rate": 0.0002890517207354104, + "loss": 0.8509137, + "num_input_tokens_seen": 280608912, + "router_z_loss_mlp": 0.37109375, + "step": 3376, + "time_per_iteration": 2.8168907165527344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071643, + "balance_loss_mlp": 1.0345453, + "epoch": 0.649672951135052, + "flos": 531550162944.0, + "grad_norm": 0.054117289013755926, + "language_loss": 0.81647491, + "learning_rate": 0.0002887693037447029, + "loss": 0.82719135, + "num_input_tokens_seen": 280678848, + "router_z_loss_mlp": 0.37084961, + "step": 3377, + "time_per_iteration": 2.59980845451355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068068, + "balance_loss_mlp": 1.03170967, + "epoch": 0.6498653328203156, + "flos": 547124295168.0, + "grad_norm": 0.05861346811628937, + "language_loss": 0.82201707, + "learning_rate": 0.00028848696876102443, + "loss": 0.83269775, + "num_input_tokens_seen": 280750224, + "router_z_loss_mlp": 0.36352539, + "step": 3378, + "time_per_iteration": 2.6153130531311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071004, + "balance_loss_mlp": 1.03333366, + "epoch": 0.650057714505579, + "flos": 461996172288.0, + "grad_norm": 0.0689678336471058, + "language_loss": 0.83211708, + "learning_rate": 0.00028820471589398723, + "loss": 0.84282708, + "num_input_tokens_seen": 280817488, + "router_z_loss_mlp": 0.37646484, + "step": 3379, + "time_per_iteration": 2.553159236907959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068457, + "balance_loss_mlp": 1.03100109, + "epoch": 0.6502500961908426, + "flos": 509905815552.0, + "grad_norm": 0.06047763604232794, + "language_loss": 0.77722514, + "learning_rate": 0.00028792254525317196, + "loss": 0.78790975, + "num_input_tokens_seen": 280887440, + "router_z_loss_mlp": 0.37451172, + "step": 3380, + "time_per_iteration": 2.680063009262085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072989, + "balance_loss_mlp": 1.03519976, + "epoch": 0.6504424778761062, + "flos": 579557320704.0, + "grad_norm": 0.05541331386031739, + "language_loss": 0.81432557, + "learning_rate": 0.00028764045694812645, + "loss": 0.82505548, + "num_input_tokens_seen": 280959072, + "router_z_loss_mlp": 0.37768555, + "step": 3381, + "time_per_iteration": 2.7398667335510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069777, + "balance_loss_mlp": 1.03186822, + "epoch": 0.6506348595613698, + "flos": 519206091264.0, + "grad_norm": 0.0812129580253802, + "language_loss": 0.76837122, + "learning_rate": 0.0002873584510883671, + "loss": 0.77906895, + "num_input_tokens_seen": 281025376, + "router_z_loss_mlp": 0.37915039, + "step": 3382, + "time_per_iteration": 2.565248727798462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070048, + "balance_loss_mlp": 1.03302145, + "epoch": 0.6508272412466333, + "flos": 510048410112.0, + "grad_norm": 0.048965932841550305, + "language_loss": 0.85894716, + "learning_rate": 0.0002870765277833788, + "loss": 0.86964768, + "num_input_tokens_seen": 281097616, + "router_z_loss_mlp": 0.37011719, + "step": 3383, + "time_per_iteration": 2.7287330627441406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070639, + "balance_loss_mlp": 1.03366053, + "epoch": 0.6510196229318969, + "flos": 625329607680.0, + "grad_norm": 0.07719936634316926, + "language_loss": 0.80431008, + "learning_rate": 0.00028679468714261347, + "loss": 0.81501651, + "num_input_tokens_seen": 281170192, + "router_z_loss_mlp": 0.36938477, + "step": 3384, + "time_per_iteration": 2.73777437210083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068083, + "balance_loss_mlp": 1.03141391, + "epoch": 0.6512120046171604, + "flos": 474453725184.0, + "grad_norm": 0.05390133741953619, + "language_loss": 0.77104408, + "learning_rate": 0.0002865129292754918, + "loss": 0.78172493, + "num_input_tokens_seen": 281238832, + "router_z_loss_mlp": 0.36645508, + "step": 3385, + "time_per_iteration": 2.570709228515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107061, + "balance_loss_mlp": 1.03396475, + "epoch": 0.651404386302424, + "flos": 551561951232.0, + "grad_norm": 0.04665998226112413, + "language_loss": 0.81778049, + "learning_rate": 0.00028623125429140105, + "loss": 0.82848656, + "num_input_tokens_seen": 281319472, + "router_z_loss_mlp": 0.36621094, + "step": 3386, + "time_per_iteration": 2.8083431720733643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067177, + "balance_loss_mlp": 1.02964997, + "epoch": 0.6515967679876876, + "flos": 523047227904.0, + "grad_norm": 0.06778513311562764, + "language_loss": 0.86781728, + "learning_rate": 0.00028594966229969785, + "loss": 0.87848902, + "num_input_tokens_seen": 281391168, + "router_z_loss_mlp": 0.37524414, + "step": 3387, + "time_per_iteration": 2.652562379837036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068807, + "balance_loss_mlp": 1.03237641, + "epoch": 0.6517891496729511, + "flos": 573590412288.0, + "grad_norm": 0.04915205130547935, + "language_loss": 0.81361043, + "learning_rate": 0.00028566815340970577, + "loss": 0.82429844, + "num_input_tokens_seen": 281465664, + "router_z_loss_mlp": 0.36450195, + "step": 3388, + "time_per_iteration": 2.7212326526641846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069055, + "balance_loss_mlp": 1.0323149, + "epoch": 0.6519815313582147, + "flos": 555662135808.0, + "grad_norm": 0.05372700409854334, + "language_loss": 0.80874032, + "learning_rate": 0.0002853867277307162, + "loss": 0.81943083, + "num_input_tokens_seen": 281532928, + "router_z_loss_mlp": 0.36743164, + "step": 3389, + "time_per_iteration": 2.645580291748047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072292, + "balance_loss_mlp": 1.03564715, + "epoch": 0.6521739130434783, + "flos": 480229986816.0, + "grad_norm": 0.04994212123605962, + "language_loss": 0.82347226, + "learning_rate": 0.00028510538537198824, + "loss": 0.8341952, + "num_input_tokens_seen": 281601680, + "router_z_loss_mlp": 0.36669922, + "step": 3390, + "time_per_iteration": 2.6053972244262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071186, + "balance_loss_mlp": 1.03456497, + "epoch": 0.6523662947287419, + "flos": 665380887552.0, + "grad_norm": 0.052060213121620263, + "language_loss": 0.86389101, + "learning_rate": 0.00028482412644274867, + "loss": 0.87460279, + "num_input_tokens_seen": 281679488, + "router_z_loss_mlp": 0.36621094, + "step": 3391, + "time_per_iteration": 2.9146382808685303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071108, + "balance_loss_mlp": 1.03408146, + "epoch": 0.6525586764140053, + "flos": 548394499584.0, + "grad_norm": 0.05233101091155523, + "language_loss": 0.74427474, + "learning_rate": 0.00028454295105219207, + "loss": 0.75498581, + "num_input_tokens_seen": 281751056, + "router_z_loss_mlp": 0.37011719, + "step": 3392, + "time_per_iteration": 2.653144598007202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072074, + "balance_loss_mlp": 1.03457081, + "epoch": 0.6527510580992689, + "flos": 802529012736.0, + "grad_norm": 0.044337250552145664, + "language_loss": 0.7951991, + "learning_rate": 0.0002842618593094802, + "loss": 0.80591983, + "num_input_tokens_seen": 281841008, + "router_z_loss_mlp": 0.37475586, + "step": 3393, + "time_per_iteration": 3.1016182899475098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075529, + "balance_loss_mlp": 1.0390985, + "epoch": 0.6529434397845325, + "flos": 670864757760.0, + "grad_norm": 0.06313497545988733, + "language_loss": 0.80366606, + "learning_rate": 0.00028398085132374243, + "loss": 0.81442136, + "num_input_tokens_seen": 281908016, + "router_z_loss_mlp": 0.36425781, + "step": 3394, + "time_per_iteration": 2.81162691116333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070268, + "balance_loss_mlp": 1.03338432, + "epoch": 0.6531358214697961, + "flos": 828043610112.0, + "grad_norm": 0.05205360505405607, + "language_loss": 0.84108675, + "learning_rate": 0.0002836999272040761, + "loss": 0.85178936, + "num_input_tokens_seen": 281989072, + "router_z_loss_mlp": 0.36865234, + "step": 3395, + "time_per_iteration": 3.086585283279419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073433, + "balance_loss_mlp": 1.03607285, + "epoch": 0.6533282031550597, + "flos": 487157179392.0, + "grad_norm": 0.06347573427267852, + "language_loss": 0.8364076, + "learning_rate": 0.00028341908705954575, + "loss": 0.84714192, + "num_input_tokens_seen": 282053152, + "router_z_loss_mlp": 0.37353516, + "step": 3396, + "time_per_iteration": 2.63339900970459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101777, + "balance_loss_mlp": 1.00317848, + "epoch": 0.6535205848403232, + "flos": 1556908121088.0, + "grad_norm": 0.01725431962534194, + "language_loss": 0.81761813, + "learning_rate": 0.00028313833099918265, + "loss": 0.82779574, + "num_input_tokens_seen": 282283984, + "router_z_loss_mlp": 0.14550781, + "step": 3397, + "time_per_iteration": 4.886535167694092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107024, + "balance_loss_mlp": 1.03342795, + "epoch": 0.6537129665255867, + "flos": 493464531456.0, + "grad_norm": 0.0583640657945681, + "language_loss": 0.78047717, + "learning_rate": 0.00028285765913198604, + "loss": 0.79117954, + "num_input_tokens_seen": 282353008, + "router_z_loss_mlp": 0.36816406, + "step": 3398, + "time_per_iteration": 2.5336763858795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075265, + "balance_loss_mlp": 1.03771448, + "epoch": 0.6539053482108503, + "flos": 604718327808.0, + "grad_norm": 0.10018787672366053, + "language_loss": 0.81953001, + "learning_rate": 0.0002825770715669227, + "loss": 0.83028269, + "num_input_tokens_seen": 282427648, + "router_z_loss_mlp": 0.37548828, + "step": 3399, + "time_per_iteration": 2.7225871086120605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073476, + "balance_loss_mlp": 1.03656852, + "epoch": 0.6540977298961139, + "flos": 577504332288.0, + "grad_norm": 0.054796705255158284, + "language_loss": 0.81529284, + "learning_rate": 0.00028229656841292634, + "loss": 0.82602763, + "num_input_tokens_seen": 282502128, + "router_z_loss_mlp": 0.36938477, + "step": 3400, + "time_per_iteration": 2.7136409282684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074116, + "balance_loss_mlp": 1.03675604, + "epoch": 0.6542901115813774, + "flos": 511500496896.0, + "grad_norm": 0.09810959054820141, + "language_loss": 0.76415372, + "learning_rate": 0.0002820161497788979, + "loss": 0.77489489, + "num_input_tokens_seen": 282569360, + "router_z_loss_mlp": 0.37304688, + "step": 3401, + "time_per_iteration": 2.561142921447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107247, + "balance_loss_mlp": 1.03656387, + "epoch": 0.654482493266641, + "flos": 625201569792.0, + "grad_norm": 0.05065630966567836, + "language_loss": 0.86865586, + "learning_rate": 0.00028173581577370545, + "loss": 0.87938058, + "num_input_tokens_seen": 282645472, + "router_z_loss_mlp": 0.35913086, + "step": 3402, + "time_per_iteration": 2.771660327911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074844, + "balance_loss_mlp": 1.0377934, + "epoch": 0.6546748749519046, + "flos": 523712148480.0, + "grad_norm": 0.04769798618105731, + "language_loss": 0.78826487, + "learning_rate": 0.0002814555665061844, + "loss": 0.79901326, + "num_input_tokens_seen": 282717568, + "router_z_loss_mlp": 0.37011719, + "step": 3403, + "time_per_iteration": 2.6541905403137207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070804, + "balance_loss_mlp": 1.03351498, + "epoch": 0.6548672566371682, + "flos": 478945225728.0, + "grad_norm": 0.05625408135925951, + "language_loss": 0.77440852, + "learning_rate": 0.00028117540208513715, + "loss": 0.78511655, + "num_input_tokens_seen": 282791408, + "router_z_loss_mlp": 0.37280273, + "step": 3404, + "time_per_iteration": 2.7175214290618896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070835, + "balance_loss_mlp": 1.03428507, + "epoch": 0.6550596383224317, + "flos": 615732558336.0, + "grad_norm": 0.05404961750978507, + "language_loss": 0.84969914, + "learning_rate": 0.00028089532261933313, + "loss": 0.86040747, + "num_input_tokens_seen": 282862992, + "router_z_loss_mlp": 0.36523438, + "step": 3405, + "time_per_iteration": 2.6872446537017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079163, + "balance_loss_mlp": 1.04197001, + "epoch": 0.6552520200076952, + "flos": 488594709504.0, + "grad_norm": 0.0680253030817501, + "language_loss": 0.85329425, + "learning_rate": 0.0002806153282175087, + "loss": 0.86408579, + "num_input_tokens_seen": 282930448, + "router_z_loss_mlp": 0.37182617, + "step": 3406, + "time_per_iteration": 2.573843479156494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069625, + "balance_loss_mlp": 1.0329802, + "epoch": 0.6554444016929588, + "flos": 687310424064.0, + "grad_norm": 0.0894093410202252, + "language_loss": 0.82802272, + "learning_rate": 0.0002803354189883679, + "loss": 0.83871901, + "num_input_tokens_seen": 283010864, + "router_z_loss_mlp": 0.36669922, + "step": 3407, + "time_per_iteration": 2.824995279312134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076204, + "balance_loss_mlp": 1.04017901, + "epoch": 0.6556367833782224, + "flos": 542772417024.0, + "grad_norm": 0.05173629873734528, + "language_loss": 0.85629022, + "learning_rate": 0.00028005559504058053, + "loss": 0.86705232, + "num_input_tokens_seen": 283082240, + "router_z_loss_mlp": 0.3605957, + "step": 3408, + "time_per_iteration": 2.709195852279663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074603, + "balance_loss_mlp": 1.03860188, + "epoch": 0.655829165063486, + "flos": 673237840896.0, + "grad_norm": 0.05391320536337509, + "language_loss": 0.76764786, + "learning_rate": 0.0002797758564827838, + "loss": 0.77839386, + "num_input_tokens_seen": 283156656, + "router_z_loss_mlp": 0.36010742, + "step": 3409, + "time_per_iteration": 2.7769269943237305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073177, + "balance_loss_mlp": 1.03624606, + "epoch": 0.6560215467487496, + "flos": 531550162944.0, + "grad_norm": 0.059937965776424594, + "language_loss": 0.8368215, + "learning_rate": 0.0002794962034235824, + "loss": 0.84755325, + "num_input_tokens_seen": 283223584, + "router_z_loss_mlp": 0.36889648, + "step": 3410, + "time_per_iteration": 2.599886417388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072982, + "balance_loss_mlp": 1.03588414, + "epoch": 0.656213928434013, + "flos": 591025476096.0, + "grad_norm": 0.13531884717327836, + "language_loss": 0.74423587, + "learning_rate": 0.00027921663597154695, + "loss": 0.75496566, + "num_input_tokens_seen": 283297680, + "router_z_loss_mlp": 0.37084961, + "step": 3411, + "time_per_iteration": 2.7206108570098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107222, + "balance_loss_mlp": 1.03686285, + "epoch": 0.6564063101192766, + "flos": 415564756992.0, + "grad_norm": 0.08609193384147822, + "language_loss": 0.80696797, + "learning_rate": 0.00027893715423521525, + "loss": 0.81769013, + "num_input_tokens_seen": 283359744, + "router_z_loss_mlp": 0.35375977, + "step": 3412, + "time_per_iteration": 2.4493868350982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067552, + "balance_loss_mlp": 1.03183699, + "epoch": 0.6565986918045402, + "flos": 453084392448.0, + "grad_norm": 0.05044036578156056, + "language_loss": 0.8354848, + "learning_rate": 0.00027865775832309163, + "loss": 0.84616029, + "num_input_tokens_seen": 283430688, + "router_z_loss_mlp": 0.35742188, + "step": 3413, + "time_per_iteration": 2.665999174118042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074779, + "balance_loss_mlp": 1.03899264, + "epoch": 0.6567910734898038, + "flos": 547483677696.0, + "grad_norm": 0.060493690389786, + "language_loss": 0.85984117, + "learning_rate": 0.00027837844834364733, + "loss": 0.87058896, + "num_input_tokens_seen": 283498048, + "router_z_loss_mlp": 0.35839844, + "step": 3414, + "time_per_iteration": 2.6195499897003174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072987, + "balance_loss_mlp": 1.03677094, + "epoch": 0.6569834551750673, + "flos": 655207667712.0, + "grad_norm": 0.11318049634335087, + "language_loss": 0.86511016, + "learning_rate": 0.00027809922440532, + "loss": 0.87583995, + "num_input_tokens_seen": 283573040, + "router_z_loss_mlp": 0.36254883, + "step": 3415, + "time_per_iteration": 2.823486566543579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072977, + "balance_loss_mlp": 1.03664172, + "epoch": 0.6571758368603309, + "flos": 539399761920.0, + "grad_norm": 0.08390902906870049, + "language_loss": 0.80793774, + "learning_rate": 0.00027782008661651406, + "loss": 0.81866741, + "num_input_tokens_seen": 283651696, + "router_z_loss_mlp": 0.36352539, + "step": 3416, + "time_per_iteration": 2.762639045715332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071461, + "balance_loss_mlp": 1.03588891, + "epoch": 0.6573682185455945, + "flos": 497088880128.0, + "grad_norm": 0.049698407396127284, + "language_loss": 0.87283665, + "learning_rate": 0.00027754103508560013, + "loss": 0.8835513, + "num_input_tokens_seen": 283721824, + "router_z_loss_mlp": 0.35620117, + "step": 3417, + "time_per_iteration": 2.5768332481384277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070469, + "balance_loss_mlp": 1.03389549, + "epoch": 0.657560600230858, + "flos": 447244111872.0, + "grad_norm": 0.06621650904732551, + "language_loss": 0.8256399, + "learning_rate": 0.0002772620699209163, + "loss": 0.83634454, + "num_input_tokens_seen": 283786960, + "router_z_loss_mlp": 0.36572266, + "step": 3418, + "time_per_iteration": 2.5885636806488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072181, + "balance_loss_mlp": 1.03606033, + "epoch": 0.6577529819161216, + "flos": 481696630272.0, + "grad_norm": 0.053979947748841836, + "language_loss": 0.80128914, + "learning_rate": 0.0002769831912307658, + "loss": 0.81201094, + "num_input_tokens_seen": 283853808, + "router_z_loss_mlp": 0.36157227, + "step": 3419, + "time_per_iteration": 2.51863169670105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010709, + "balance_loss_mlp": 1.0346607, + "epoch": 0.6579453636013851, + "flos": 530589878784.0, + "grad_norm": 0.061422994023147534, + "language_loss": 0.80013275, + "learning_rate": 0.00027670439912341917, + "loss": 0.81084168, + "num_input_tokens_seen": 283920960, + "router_z_loss_mlp": 0.36254883, + "step": 3420, + "time_per_iteration": 2.595789670944214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067388, + "balance_loss_mlp": 1.03117275, + "epoch": 0.6581377452866487, + "flos": 627737596416.0, + "grad_norm": 0.0471415503067176, + "language_loss": 0.8344667, + "learning_rate": 0.0002764256937071129, + "loss": 0.84514058, + "num_input_tokens_seen": 283992416, + "router_z_loss_mlp": 0.36230469, + "step": 3421, + "time_per_iteration": 2.7812321186065674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075792, + "balance_loss_mlp": 1.03886116, + "epoch": 0.6583301269719123, + "flos": 548355211776.0, + "grad_norm": 0.05116368726028845, + "language_loss": 0.86894339, + "learning_rate": 0.00027614707509005036, + "loss": 0.87970132, + "num_input_tokens_seen": 284061760, + "router_z_loss_mlp": 0.36889648, + "step": 3422, + "time_per_iteration": 2.6573753356933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069799, + "balance_loss_mlp": 1.03401232, + "epoch": 0.6585225086571759, + "flos": 427268639232.0, + "grad_norm": 0.053946906539649876, + "language_loss": 0.7900126, + "learning_rate": 0.0002758685433804008, + "loss": 0.80071056, + "num_input_tokens_seen": 284124848, + "router_z_loss_mlp": 0.35839844, + "step": 3423, + "time_per_iteration": 2.4556972980499268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075371, + "balance_loss_mlp": 1.03758192, + "epoch": 0.6587148903424394, + "flos": 859264657920.0, + "grad_norm": 0.05746906751203771, + "language_loss": 0.79022425, + "learning_rate": 0.00027559009868630005, + "loss": 0.80097795, + "num_input_tokens_seen": 284206272, + "router_z_loss_mlp": 0.37768555, + "step": 3424, + "time_per_iteration": 3.0918102264404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068275, + "balance_loss_mlp": 1.03067625, + "epoch": 0.6589072720277029, + "flos": 805280417280.0, + "grad_norm": 0.05909134726698472, + "language_loss": 0.7990104, + "learning_rate": 0.0002753117411158491, + "loss": 0.8096931, + "num_input_tokens_seen": 284293696, + "router_z_loss_mlp": 0.37573242, + "step": 3425, + "time_per_iteration": 3.0557546615600586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074885, + "balance_loss_mlp": 1.03769183, + "epoch": 0.6590996537129665, + "flos": 548355211776.0, + "grad_norm": 0.0487398796366246, + "language_loss": 0.89624393, + "learning_rate": 0.0002750334707771168, + "loss": 0.90699285, + "num_input_tokens_seen": 284360192, + "router_z_loss_mlp": 0.37158203, + "step": 3426, + "time_per_iteration": 2.6186933517456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107782, + "balance_loss_mlp": 1.03991175, + "epoch": 0.6592920353982301, + "flos": 453931195392.0, + "grad_norm": 0.09520851451243123, + "language_loss": 0.81130987, + "learning_rate": 0.0002747552877781369, + "loss": 0.82208812, + "num_input_tokens_seen": 284423680, + "router_z_loss_mlp": 0.37866211, + "step": 3427, + "time_per_iteration": 2.4979238510131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068038, + "balance_loss_mlp": 1.03086865, + "epoch": 0.6594844170834937, + "flos": 566903328768.0, + "grad_norm": 0.04689884727267459, + "language_loss": 0.81804323, + "learning_rate": 0.0002744771922269097, + "loss": 0.82872361, + "num_input_tokens_seen": 284495712, + "router_z_loss_mlp": 0.37158203, + "step": 3428, + "time_per_iteration": 2.740729808807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075113, + "balance_loss_mlp": 1.03768158, + "epoch": 0.6596767987687572, + "flos": 1187452016640.0, + "grad_norm": 0.05881296297664234, + "language_loss": 0.81886125, + "learning_rate": 0.0002741991842314015, + "loss": 0.82961237, + "num_input_tokens_seen": 284583440, + "router_z_loss_mlp": 0.37426758, + "step": 3429, + "time_per_iteration": 3.4745006561279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071953, + "balance_loss_mlp": 1.03506947, + "epoch": 0.6598691804540208, + "flos": 503247845376.0, + "grad_norm": 0.05507751278667406, + "language_loss": 0.85868287, + "learning_rate": 0.0002739212638995445, + "loss": 0.86940235, + "num_input_tokens_seen": 284649168, + "router_z_loss_mlp": 0.3684082, + "step": 3430, + "time_per_iteration": 2.532402515411377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070704, + "balance_loss_mlp": 1.033463, + "epoch": 0.6600615621392844, + "flos": 531072916992.0, + "grad_norm": 0.05565442756862113, + "language_loss": 0.83027416, + "learning_rate": 0.00027364343133923696, + "loss": 0.84098119, + "num_input_tokens_seen": 284723136, + "router_z_loss_mlp": 0.37231445, + "step": 3431, + "time_per_iteration": 2.630985736846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077517, + "balance_loss_mlp": 1.0396086, + "epoch": 0.6602539438245479, + "flos": 565170435072.0, + "grad_norm": 0.06720345334853779, + "language_loss": 0.82615936, + "learning_rate": 0.0002733656866583431, + "loss": 0.83693457, + "num_input_tokens_seen": 284792752, + "router_z_loss_mlp": 0.37890625, + "step": 3432, + "time_per_iteration": 2.6693778038024902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072775, + "balance_loss_mlp": 1.0354147, + "epoch": 0.6604463255098114, + "flos": 856802824704.0, + "grad_norm": 0.05437523875977016, + "language_loss": 0.82810867, + "learning_rate": 0.0002730880299646927, + "loss": 0.83883643, + "num_input_tokens_seen": 284871008, + "router_z_loss_mlp": 0.37329102, + "step": 3433, + "time_per_iteration": 3.047272205352783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072216, + "balance_loss_mlp": 1.03540444, + "epoch": 0.660638707195075, + "flos": 674158837248.0, + "grad_norm": 0.05169361023924996, + "language_loss": 0.85458863, + "learning_rate": 0.0002728104613660821, + "loss": 0.86531085, + "num_input_tokens_seen": 284945184, + "router_z_loss_mlp": 0.36791992, + "step": 3434, + "time_per_iteration": 2.8202831745147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010658, + "balance_loss_mlp": 1.02879786, + "epoch": 0.6608310888803386, + "flos": 888572339712.0, + "grad_norm": 0.05115304739976813, + "language_loss": 0.83194226, + "learning_rate": 0.0002725329809702729, + "loss": 0.84260029, + "num_input_tokens_seen": 285029296, + "router_z_loss_mlp": 0.36962891, + "step": 3435, + "time_per_iteration": 3.228891134262085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071505, + "balance_loss_mlp": 1.03376281, + "epoch": 0.6610234705656022, + "flos": 1135909260288.0, + "grad_norm": 0.06628416389045559, + "language_loss": 0.75631964, + "learning_rate": 0.0002722555888849921, + "loss": 0.76703465, + "num_input_tokens_seen": 285124720, + "router_z_loss_mlp": 0.37695312, + "step": 3436, + "time_per_iteration": 3.422288179397583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068961, + "balance_loss_mlp": 1.03212583, + "epoch": 0.6612158522508658, + "flos": 467776816128.0, + "grad_norm": 0.05048111401896507, + "language_loss": 0.80400562, + "learning_rate": 0.00027197828521793334, + "loss": 0.81469518, + "num_input_tokens_seen": 285191360, + "router_z_loss_mlp": 0.36816406, + "step": 3437, + "time_per_iteration": 2.4787607192993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073991, + "balance_loss_mlp": 1.03686941, + "epoch": 0.6614082339361292, + "flos": 571374480384.0, + "grad_norm": 0.05876416837727376, + "language_loss": 0.84865153, + "learning_rate": 0.0002717010700767552, + "loss": 0.85939145, + "num_input_tokens_seen": 285262624, + "router_z_loss_mlp": 0.37109375, + "step": 3438, + "time_per_iteration": 2.740835189819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071354, + "balance_loss_mlp": 1.03444707, + "epoch": 0.6616006156213928, + "flos": 498220872192.0, + "grad_norm": 0.06865546708014894, + "language_loss": 0.75838953, + "learning_rate": 0.00027142394356908226, + "loss": 0.76910305, + "num_input_tokens_seen": 285328512, + "router_z_loss_mlp": 0.36889648, + "step": 3439, + "time_per_iteration": 2.5476725101470947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067365, + "balance_loss_mlp": 1.03021967, + "epoch": 0.6617929973066564, + "flos": 602124074496.0, + "grad_norm": 0.05819778232686783, + "language_loss": 0.85115051, + "learning_rate": 0.00027114690580250456, + "loss": 0.86182415, + "num_input_tokens_seen": 285406128, + "router_z_loss_mlp": 0.37133789, + "step": 3440, + "time_per_iteration": 2.746610403060913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072245, + "balance_loss_mlp": 1.03562403, + "epoch": 0.66198537899192, + "flos": 522731515392.0, + "grad_norm": 0.053821887104205664, + "language_loss": 0.86748421, + "learning_rate": 0.0002708699568845776, + "loss": 0.87820661, + "num_input_tokens_seen": 285474704, + "router_z_loss_mlp": 0.36621094, + "step": 3441, + "time_per_iteration": 2.6001980304718018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046259, + "balance_loss_mlp": 1.0328126, + "epoch": 0.6621777606771835, + "flos": 1565421230592.0, + "grad_norm": 0.030021604030083596, + "language_loss": 0.79287779, + "learning_rate": 0.00027059309692282265, + "loss": 0.80334044, + "num_input_tokens_seen": 285698704, + "router_z_loss_mlp": 0.13476562, + "step": 3442, + "time_per_iteration": 4.909358263015747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075884, + "balance_loss_mlp": 1.03933442, + "epoch": 0.6623701423624471, + "flos": 526409708544.0, + "grad_norm": 0.050122845180299073, + "language_loss": 0.83157456, + "learning_rate": 0.0002703163260247261, + "loss": 0.84233344, + "num_input_tokens_seen": 285767936, + "router_z_loss_mlp": 0.36547852, + "step": 3443, + "time_per_iteration": 2.600733757019043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074894, + "balance_loss_mlp": 1.03853548, + "epoch": 0.6625625240477107, + "flos": 527921432064.0, + "grad_norm": 0.07644437952185021, + "language_loss": 0.81613672, + "learning_rate": 0.0002700396442977399, + "loss": 0.8268857, + "num_input_tokens_seen": 285839456, + "router_z_loss_mlp": 0.36376953, + "step": 3444, + "time_per_iteration": 2.598722457885742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080077, + "balance_loss_mlp": 1.04312193, + "epoch": 0.6627549057329742, + "flos": 472854661632.0, + "grad_norm": 0.05132438186678615, + "language_loss": 0.84284377, + "learning_rate": 0.0002697630518492817, + "loss": 0.85364461, + "num_input_tokens_seen": 285905904, + "router_z_loss_mlp": 0.36938477, + "step": 3445, + "time_per_iteration": 2.6794075965881348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078368, + "balance_loss_mlp": 1.04253387, + "epoch": 0.6629472874182378, + "flos": 527743931904.0, + "grad_norm": 0.05491144350541831, + "language_loss": 0.8564226, + "learning_rate": 0.0002694865487867343, + "loss": 0.86720634, + "num_input_tokens_seen": 285975520, + "router_z_loss_mlp": 0.35888672, + "step": 3446, + "time_per_iteration": 2.643427848815918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081911, + "balance_loss_mlp": 1.04540932, + "epoch": 0.6631396691035013, + "flos": 612906960384.0, + "grad_norm": 0.04980385474467639, + "language_loss": 0.84496373, + "learning_rate": 0.0002692101352174453, + "loss": 0.85578281, + "num_input_tokens_seen": 286050320, + "router_z_loss_mlp": 0.36499023, + "step": 3447, + "time_per_iteration": 2.750990629196167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077753, + "balance_loss_mlp": 1.04106009, + "epoch": 0.6633320507887649, + "flos": 609041092608.0, + "grad_norm": 0.05216047224803115, + "language_loss": 0.8459692, + "learning_rate": 0.00026893381124872787, + "loss": 0.85674667, + "num_input_tokens_seen": 286120672, + "router_z_loss_mlp": 0.3671875, + "step": 3448, + "time_per_iteration": 2.7701821327209473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074873, + "balance_loss_mlp": 1.03839493, + "epoch": 0.6635244324740285, + "flos": 749342112768.0, + "grad_norm": 0.05521376247242365, + "language_loss": 0.80839992, + "learning_rate": 0.00026865757698786097, + "loss": 0.81914866, + "num_input_tokens_seen": 286201152, + "router_z_loss_mlp": 0.36499023, + "step": 3449, + "time_per_iteration": 3.046751022338867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079305, + "balance_loss_mlp": 1.04382825, + "epoch": 0.6637168141592921, + "flos": 664222754304.0, + "grad_norm": 0.05057031991468663, + "language_loss": 0.8206256, + "learning_rate": 0.000268381432542088, + "loss": 0.83141863, + "num_input_tokens_seen": 286274512, + "router_z_loss_mlp": 0.35546875, + "step": 3450, + "time_per_iteration": 2.7903122901916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078156, + "balance_loss_mlp": 1.04117751, + "epoch": 0.6639091958445555, + "flos": 606500683776.0, + "grad_norm": 0.05221239612866202, + "language_loss": 0.7978282, + "learning_rate": 0.00026810537801861807, + "loss": 0.80860978, + "num_input_tokens_seen": 286349808, + "router_z_loss_mlp": 0.36938477, + "step": 3451, + "time_per_iteration": 2.7744555473327637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078626, + "balance_loss_mlp": 1.04200482, + "epoch": 0.6641015775298191, + "flos": 476452869120.0, + "grad_norm": 0.04982593193554921, + "language_loss": 0.81320304, + "learning_rate": 0.0002678294135246243, + "loss": 0.82398927, + "num_input_tokens_seen": 286422912, + "router_z_loss_mlp": 0.36621094, + "step": 3452, + "time_per_iteration": 2.748623847961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107526, + "balance_loss_mlp": 1.03902042, + "epoch": 0.6642939592150827, + "flos": 903746391552.0, + "grad_norm": 0.05075048748752087, + "language_loss": 0.86122698, + "learning_rate": 0.0002675535391672463, + "loss": 0.87197959, + "num_input_tokens_seen": 286501072, + "router_z_loss_mlp": 0.36230469, + "step": 3453, + "time_per_iteration": 3.0941269397735596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075595, + "balance_loss_mlp": 1.03995168, + "epoch": 0.6644863409003463, + "flos": 581527351296.0, + "grad_norm": 0.04705931875685086, + "language_loss": 0.85942483, + "learning_rate": 0.0002672777550535877, + "loss": 0.87018085, + "num_input_tokens_seen": 286580480, + "router_z_loss_mlp": 0.35668945, + "step": 3454, + "time_per_iteration": 2.782492160797119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077961, + "balance_loss_mlp": 1.04222202, + "epoch": 0.6646787225856099, + "flos": 478761933312.0, + "grad_norm": 0.05883776733050642, + "language_loss": 0.84943002, + "learning_rate": 0.00026700206129071747, + "loss": 0.86020958, + "num_input_tokens_seen": 286646208, + "router_z_loss_mlp": 0.35791016, + "step": 3455, + "time_per_iteration": 2.524601697921753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074329, + "balance_loss_mlp": 1.0389235, + "epoch": 0.6648711042708734, + "flos": 449676831744.0, + "grad_norm": 0.058012568255648024, + "language_loss": 0.88879943, + "learning_rate": 0.00026672645798566925, + "loss": 0.89954275, + "num_input_tokens_seen": 286710624, + "router_z_loss_mlp": 0.35449219, + "step": 3456, + "time_per_iteration": 2.532412528991699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072913, + "balance_loss_mlp": 1.03745985, + "epoch": 0.665063485956137, + "flos": 858553095168.0, + "grad_norm": 0.053261627047558845, + "language_loss": 0.79371452, + "learning_rate": 0.00026645094524544225, + "loss": 0.8044436, + "num_input_tokens_seen": 286799472, + "router_z_loss_mlp": 0.35473633, + "step": 3457, + "time_per_iteration": 3.2936151027679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068323, + "balance_loss_mlp": 1.03229845, + "epoch": 0.6652558676414005, + "flos": 604024293888.0, + "grad_norm": 0.04836928796010222, + "language_loss": 0.75254017, + "learning_rate": 0.00026617552317699945, + "loss": 0.76322341, + "num_input_tokens_seen": 286874752, + "router_z_loss_mlp": 0.36035156, + "step": 3458, + "time_per_iteration": 2.781972646713257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072016, + "balance_loss_mlp": 1.03651559, + "epoch": 0.6654482493266641, + "flos": 510141542400.0, + "grad_norm": 0.05402195072483101, + "language_loss": 0.87006921, + "learning_rate": 0.0002659001918872693, + "loss": 0.88078934, + "num_input_tokens_seen": 286943312, + "router_z_loss_mlp": 0.35546875, + "step": 3459, + "time_per_iteration": 2.586364507675171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073402, + "balance_loss_mlp": 1.03790104, + "epoch": 0.6656406310119277, + "flos": 565342142976.0, + "grad_norm": 0.06009221273725258, + "language_loss": 0.80872095, + "learning_rate": 0.0002656249514831449, + "loss": 0.81945497, + "num_input_tokens_seen": 287010000, + "router_z_loss_mlp": 0.35522461, + "step": 3460, + "time_per_iteration": 2.6385302543640137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072951, + "balance_loss_mlp": 1.03652048, + "epoch": 0.6658330126971912, + "flos": 1023859533312.0, + "grad_norm": 0.05794846268474579, + "language_loss": 0.86832029, + "learning_rate": 0.00026534980207148416, + "loss": 0.87904978, + "num_input_tokens_seen": 287101456, + "router_z_loss_mlp": 0.36425781, + "step": 3461, + "time_per_iteration": 3.388073205947876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074265, + "balance_loss_mlp": 1.03869295, + "epoch": 0.6660253943824548, + "flos": 816472147968.0, + "grad_norm": 0.06339025189442228, + "language_loss": 0.7302506, + "learning_rate": 0.0002650747437591097, + "loss": 0.74099326, + "num_input_tokens_seen": 287182848, + "router_z_loss_mlp": 0.35595703, + "step": 3462, + "time_per_iteration": 2.980158567428589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021261, + "balance_loss_mlp": 1.00810075, + "epoch": 0.6662177760677184, + "flos": 1495331767296.0, + "grad_norm": 0.02097535909927297, + "language_loss": 0.8187958, + "learning_rate": 0.00026479977665280806, + "loss": 0.82900834, + "num_input_tokens_seen": 287417920, + "router_z_loss_mlp": 0.13183594, + "step": 3463, + "time_per_iteration": 5.0071799755096436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070948, + "balance_loss_mlp": 1.0354948, + "epoch": 0.666410157752982, + "flos": 499875190272.0, + "grad_norm": 0.04521050671951116, + "language_loss": 0.86503369, + "learning_rate": 0.00026452490085933155, + "loss": 0.87574315, + "num_input_tokens_seen": 287483776, + "router_z_loss_mlp": 0.35473633, + "step": 3464, + "time_per_iteration": 2.5450592041015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067553, + "balance_loss_mlp": 1.03212357, + "epoch": 0.6666025394382454, + "flos": 480928402944.0, + "grad_norm": 0.05339724932754041, + "language_loss": 0.89435887, + "learning_rate": 0.00026425011648539614, + "loss": 0.90503436, + "num_input_tokens_seen": 287548176, + "router_z_loss_mlp": 0.35424805, + "step": 3465, + "time_per_iteration": 2.5414719581604004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070322, + "balance_loss_mlp": 1.03377271, + "epoch": 0.666794921123509, + "flos": 546395355648.0, + "grad_norm": 0.05247467659401075, + "language_loss": 0.82117605, + "learning_rate": 0.00026397542363768267, + "loss": 0.83187926, + "num_input_tokens_seen": 287618496, + "router_z_loss_mlp": 0.36547852, + "step": 3466, + "time_per_iteration": 2.659952402114868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071457, + "balance_loss_mlp": 1.03533673, + "epoch": 0.6669873028087726, + "flos": 471750372864.0, + "grad_norm": 0.052441453711620734, + "language_loss": 0.81731021, + "learning_rate": 0.0002637008224228362, + "loss": 0.82802474, + "num_input_tokens_seen": 287684032, + "router_z_loss_mlp": 0.36132812, + "step": 3467, + "time_per_iteration": 2.5569608211517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073874, + "balance_loss_mlp": 1.03875458, + "epoch": 0.6671796844940362, + "flos": 547119912960.0, + "grad_norm": 0.04638174393206939, + "language_loss": 0.84333348, + "learning_rate": 0.00026342631294746653, + "loss": 0.85407221, + "num_input_tokens_seen": 287757680, + "router_z_loss_mlp": 0.3515625, + "step": 3468, + "time_per_iteration": 2.7492995262145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068836, + "balance_loss_mlp": 1.03300142, + "epoch": 0.6673720661792998, + "flos": 1069867547136.0, + "grad_norm": 0.06886465160601114, + "language_loss": 0.80601752, + "learning_rate": 0.0002631518953181476, + "loss": 0.81670582, + "num_input_tokens_seen": 287848992, + "router_z_loss_mlp": 0.35839844, + "step": 3469, + "time_per_iteration": 3.4849367141723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017385, + "balance_loss_mlp": 1.0047015, + "epoch": 0.6675644478645633, + "flos": 1522963372032.0, + "grad_norm": 0.011284556376000376, + "language_loss": 0.76325285, + "learning_rate": 0.000262877569641418, + "loss": 0.77342671, + "num_input_tokens_seen": 288085680, + "router_z_loss_mlp": 0.12695312, + "step": 3470, + "time_per_iteration": 4.8896119594573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073777, + "balance_loss_mlp": 1.03775215, + "epoch": 0.6677568295498268, + "flos": 579410343936.0, + "grad_norm": 0.05100561036949307, + "language_loss": 0.8019954, + "learning_rate": 0.00026260333602377985, + "loss": 0.81273311, + "num_input_tokens_seen": 288161568, + "router_z_loss_mlp": 0.36035156, + "step": 3471, + "time_per_iteration": 2.7527613639831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069612, + "balance_loss_mlp": 1.03370583, + "epoch": 0.6679492112350904, + "flos": 383722458624.0, + "grad_norm": 0.06457573009444674, + "language_loss": 0.86992371, + "learning_rate": 0.0002623291945717007, + "loss": 0.88061988, + "num_input_tokens_seen": 288224032, + "router_z_loss_mlp": 0.35913086, + "step": 3472, + "time_per_iteration": 2.4496309757232666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067849, + "balance_loss_mlp": 1.03158569, + "epoch": 0.668141592920354, + "flos": 1150297555968.0, + "grad_norm": 0.0483341926082761, + "language_loss": 0.83728033, + "learning_rate": 0.00026205514539161175, + "loss": 0.84795886, + "num_input_tokens_seen": 288312912, + "router_z_loss_mlp": 0.36254883, + "step": 3473, + "time_per_iteration": 3.518329620361328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072278, + "balance_loss_mlp": 1.03682494, + "epoch": 0.6683339746056175, + "flos": 560804000256.0, + "grad_norm": 0.054398972389199884, + "language_loss": 0.84145987, + "learning_rate": 0.00026178118858990773, + "loss": 0.85218263, + "num_input_tokens_seen": 288394224, + "router_z_loss_mlp": 0.35449219, + "step": 3474, + "time_per_iteration": 2.848719596862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068236, + "balance_loss_mlp": 1.0318768, + "epoch": 0.6685263562908811, + "flos": 514051080192.0, + "grad_norm": 0.060039795644517814, + "language_loss": 0.84093618, + "learning_rate": 0.0002615073242729483, + "loss": 0.85161853, + "num_input_tokens_seen": 288462976, + "router_z_loss_mlp": 0.36352539, + "step": 3475, + "time_per_iteration": 2.648353099822998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070134, + "balance_loss_mlp": 1.03382277, + "epoch": 0.6687187379761447, + "flos": 629466107904.0, + "grad_norm": 0.05046564119076302, + "language_loss": 0.84281248, + "learning_rate": 0.0002612335525470573, + "loss": 0.85351384, + "num_input_tokens_seen": 288542032, + "router_z_loss_mlp": 0.36352539, + "step": 3476, + "time_per_iteration": 2.792809247970581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065913, + "balance_loss_mlp": 1.03096104, + "epoch": 0.6689111196614083, + "flos": 535312723968.0, + "grad_norm": 0.05473638804270082, + "language_loss": 0.78341687, + "learning_rate": 0.0002609598735185221, + "loss": 0.79407597, + "num_input_tokens_seen": 288610704, + "router_z_loss_mlp": 0.35009766, + "step": 3477, + "time_per_iteration": 2.64404559135437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070177, + "balance_loss_mlp": 1.03489089, + "epoch": 0.6691035013466718, + "flos": 602758471680.0, + "grad_norm": 0.0937067542198485, + "language_loss": 0.82979453, + "learning_rate": 0.00026068628729359445, + "loss": 0.84049624, + "num_input_tokens_seen": 288686080, + "router_z_loss_mlp": 0.35327148, + "step": 3478, + "time_per_iteration": 2.749631404876709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071211, + "balance_loss_mlp": 1.03640211, + "epoch": 0.6692958830319353, + "flos": 632539017216.0, + "grad_norm": 0.04937335272714273, + "language_loss": 0.7616291, + "learning_rate": 0.00026041279397848996, + "loss": 0.77234125, + "num_input_tokens_seen": 288764944, + "router_z_loss_mlp": 0.34838867, + "step": 3479, + "time_per_iteration": 2.839651584625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072082, + "balance_loss_mlp": 1.03693914, + "epoch": 0.6694882647171989, + "flos": 645153721344.0, + "grad_norm": 0.04802288968176994, + "language_loss": 0.8253727, + "learning_rate": 0.00026013939367938797, + "loss": 0.83609354, + "num_input_tokens_seen": 288847856, + "router_z_loss_mlp": 0.35180664, + "step": 3480, + "time_per_iteration": 2.8756163120269775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072592, + "balance_loss_mlp": 1.03861761, + "epoch": 0.6696806464024625, + "flos": 569292378624.0, + "grad_norm": 0.05111387659739007, + "language_loss": 0.81035048, + "learning_rate": 0.00025986608650243204, + "loss": 0.82107639, + "num_input_tokens_seen": 288929360, + "router_z_loss_mlp": 0.34008789, + "step": 3481, + "time_per_iteration": 2.780930757522583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107265, + "balance_loss_mlp": 1.03762627, + "epoch": 0.6698730280877261, + "flos": 622386146304.0, + "grad_norm": 0.11620710974574953, + "language_loss": 0.79299992, + "learning_rate": 0.0002595928725537293, + "loss": 0.80372643, + "num_input_tokens_seen": 289010160, + "router_z_loss_mlp": 0.35058594, + "step": 3482, + "time_per_iteration": 2.8551175594329834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071879, + "balance_loss_mlp": 1.03642654, + "epoch": 0.6700654097729896, + "flos": 502258447872.0, + "grad_norm": 0.05059450730585095, + "language_loss": 0.88189447, + "learning_rate": 0.0002593197519393509, + "loss": 0.89261329, + "num_input_tokens_seen": 289077392, + "router_z_loss_mlp": 0.35449219, + "step": 3483, + "time_per_iteration": 2.556617021560669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070709, + "balance_loss_mlp": 1.03637671, + "epoch": 0.6702577914582531, + "flos": 623567600640.0, + "grad_norm": 0.05152577773762556, + "language_loss": 0.79466176, + "learning_rate": 0.00025904672476533165, + "loss": 0.8053689, + "num_input_tokens_seen": 289157248, + "router_z_loss_mlp": 0.34375, + "step": 3484, + "time_per_iteration": 2.8806934356689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072584, + "balance_loss_mlp": 1.03794122, + "epoch": 0.6704501731435167, + "flos": 456033646080.0, + "grad_norm": 0.06330154522458538, + "language_loss": 0.82820839, + "learning_rate": 0.0002587737911376704, + "loss": 0.83893424, + "num_input_tokens_seen": 289224864, + "router_z_loss_mlp": 0.34643555, + "step": 3485, + "time_per_iteration": 2.6385717391967773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073373, + "balance_loss_mlp": 1.03789639, + "epoch": 0.6706425548287803, + "flos": 542973238272.0, + "grad_norm": 0.04882372942075566, + "language_loss": 0.83671743, + "learning_rate": 0.00025850095116232885, + "loss": 0.84745121, + "num_input_tokens_seen": 289293488, + "router_z_loss_mlp": 0.35498047, + "step": 3486, + "time_per_iteration": 2.6404170989990234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073473, + "balance_loss_mlp": 1.03873491, + "epoch": 0.6708349365140439, + "flos": 633631721472.0, + "grad_norm": 0.0500263981223685, + "language_loss": 0.77869016, + "learning_rate": 0.000258228204945233, + "loss": 0.7894249, + "num_input_tokens_seen": 289370560, + "router_z_loss_mlp": 0.34765625, + "step": 3487, + "time_per_iteration": 2.934980630874634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107374, + "balance_loss_mlp": 1.03964591, + "epoch": 0.6710273181993074, + "flos": 640459989504.0, + "grad_norm": 0.05519065712818486, + "language_loss": 0.84700072, + "learning_rate": 0.00025795555259227254, + "loss": 0.85773814, + "num_input_tokens_seen": 289440096, + "router_z_loss_mlp": 0.34130859, + "step": 3488, + "time_per_iteration": 2.7644948959350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071185, + "balance_loss_mlp": 1.03720999, + "epoch": 0.671219699884571, + "flos": 553673166336.0, + "grad_norm": 0.13608492094864486, + "language_loss": 0.8373906, + "learning_rate": 0.00025768299420930046, + "loss": 0.84810245, + "num_input_tokens_seen": 289515808, + "router_z_loss_mlp": 0.33984375, + "step": 3489, + "time_per_iteration": 2.718442916870117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072555, + "balance_loss_mlp": 1.03700686, + "epoch": 0.6714120815698346, + "flos": 731191256064.0, + "grad_norm": 0.05259417787616518, + "language_loss": 0.83743513, + "learning_rate": 0.0002574105299021332, + "loss": 0.84816062, + "num_input_tokens_seen": 289591344, + "router_z_loss_mlp": 0.35571289, + "step": 3490, + "time_per_iteration": 2.8551361560821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069241, + "balance_loss_mlp": 1.03440833, + "epoch": 0.6716044632550981, + "flos": 688344901632.0, + "grad_norm": 0.0512424310438266, + "language_loss": 0.84138238, + "learning_rate": 0.00025713815977655084, + "loss": 0.85207486, + "num_input_tokens_seen": 289672032, + "router_z_loss_mlp": 0.34863281, + "step": 3491, + "time_per_iteration": 2.8758041858673096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107046, + "balance_loss_mlp": 1.03700948, + "epoch": 0.6717968449403616, + "flos": 460391316480.0, + "grad_norm": 0.05311776823475344, + "language_loss": 0.84021199, + "learning_rate": 0.0002568658839382969, + "loss": 0.85091662, + "num_input_tokens_seen": 289738304, + "router_z_loss_mlp": 0.3347168, + "step": 3492, + "time_per_iteration": 2.5461535453796387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066732, + "balance_loss_mlp": 1.03259087, + "epoch": 0.6719892266256252, + "flos": 501362182656.0, + "grad_norm": 0.0636144820373753, + "language_loss": 0.84432656, + "learning_rate": 0.00025659370249307814, + "loss": 0.85499388, + "num_input_tokens_seen": 289804304, + "router_z_loss_mlp": 0.34179688, + "step": 3493, + "time_per_iteration": 2.5833051204681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066615, + "balance_loss_mlp": 1.03094745, + "epoch": 0.6721816083108888, + "flos": 683223386112.0, + "grad_norm": 0.056507935755291845, + "language_loss": 0.84795702, + "learning_rate": 0.00025632161554656473, + "loss": 0.85862321, + "num_input_tokens_seen": 289877696, + "router_z_loss_mlp": 0.35717773, + "step": 3494, + "time_per_iteration": 2.852865219116211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065933, + "balance_loss_mlp": 1.03067088, + "epoch": 0.6723739899961524, + "flos": 585544578048.0, + "grad_norm": 0.05119219920681276, + "language_loss": 0.82001173, + "learning_rate": 0.00025604962320439017, + "loss": 0.83067107, + "num_input_tokens_seen": 289947296, + "router_z_loss_mlp": 0.35327148, + "step": 3495, + "time_per_iteration": 2.6681125164031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068608, + "balance_loss_mlp": 1.03334618, + "epoch": 0.672566371681416, + "flos": 506336721408.0, + "grad_norm": 0.06376768707456672, + "language_loss": 0.82132721, + "learning_rate": 0.0002557777255721516, + "loss": 0.83201331, + "num_input_tokens_seen": 290020080, + "router_z_loss_mlp": 0.35302734, + "step": 3496, + "time_per_iteration": 2.688211441040039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066305, + "balance_loss_mlp": 1.03142464, + "epoch": 0.6727587533666795, + "flos": 535405856256.0, + "grad_norm": 0.061511790914054676, + "language_loss": 0.80550486, + "learning_rate": 0.0002555059227554087, + "loss": 0.81616795, + "num_input_tokens_seen": 290094544, + "router_z_loss_mlp": 0.34912109, + "step": 3497, + "time_per_iteration": 2.6755480766296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107069, + "balance_loss_mlp": 1.03588057, + "epoch": 0.672951135051943, + "flos": 602532919296.0, + "grad_norm": 0.08077616236025223, + "language_loss": 0.77663779, + "learning_rate": 0.00025523421485968453, + "loss": 0.78734469, + "num_input_tokens_seen": 290173520, + "router_z_loss_mlp": 0.34838867, + "step": 3498, + "time_per_iteration": 2.782900333404541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067117, + "balance_loss_mlp": 1.0330708, + "epoch": 0.6731435167372066, + "flos": 810976693248.0, + "grad_norm": 0.05548957560218429, + "language_loss": 0.85524929, + "learning_rate": 0.00025496260199046585, + "loss": 0.86592042, + "num_input_tokens_seen": 290248240, + "router_z_loss_mlp": 0.34082031, + "step": 3499, + "time_per_iteration": 2.9468865394592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070869, + "balance_loss_mlp": 1.0354166, + "epoch": 0.6733358984224702, + "flos": 611306486784.0, + "grad_norm": 0.05533117407316435, + "language_loss": 0.84011221, + "learning_rate": 0.000254691084253202, + "loss": 0.8508209, + "num_input_tokens_seen": 290326288, + "router_z_loss_mlp": 0.35473633, + "step": 3500, + "time_per_iteration": 2.7936129570007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107048, + "balance_loss_mlp": 1.03607607, + "epoch": 0.6735282801077337, + "flos": 558636120576.0, + "grad_norm": 0.06619060652022955, + "language_loss": 0.77001846, + "learning_rate": 0.00025441966175330567, + "loss": 0.78072333, + "num_input_tokens_seen": 290395984, + "router_z_loss_mlp": 0.34423828, + "step": 3501, + "time_per_iteration": 2.7096900939941406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073276, + "balance_loss_mlp": 1.03737032, + "epoch": 0.6737206617929973, + "flos": 672134962176.0, + "grad_norm": 0.04835122337119983, + "language_loss": 0.79766667, + "learning_rate": 0.00025414833459615183, + "loss": 0.80839938, + "num_input_tokens_seen": 290470224, + "router_z_loss_mlp": 0.35913086, + "step": 3502, + "time_per_iteration": 2.787539482116699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075889, + "balance_loss_mlp": 1.03933966, + "epoch": 0.6739130434782609, + "flos": 633148683264.0, + "grad_norm": 0.05358836017753152, + "language_loss": 0.80260807, + "learning_rate": 0.0002538771028870796, + "loss": 0.81336701, + "num_input_tokens_seen": 290542864, + "router_z_loss_mlp": 0.36547852, + "step": 3503, + "time_per_iteration": 2.7826414108276367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077652, + "balance_loss_mlp": 1.04224694, + "epoch": 0.6741054251635245, + "flos": 531171841536.0, + "grad_norm": 0.07580622934543835, + "language_loss": 0.81591624, + "learning_rate": 0.0002536059667313903, + "loss": 0.82669276, + "num_input_tokens_seen": 290617248, + "router_z_loss_mlp": 0.35424805, + "step": 3504, + "time_per_iteration": 2.7296247482299805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107121, + "balance_loss_mlp": 1.03551888, + "epoch": 0.674297806848788, + "flos": 542343223296.0, + "grad_norm": 0.056073772887399426, + "language_loss": 0.8900978, + "learning_rate": 0.0002533349262343483, + "loss": 0.90080988, + "num_input_tokens_seen": 290690112, + "router_z_loss_mlp": 0.35742188, + "step": 3505, + "time_per_iteration": 2.674409866333008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107348, + "balance_loss_mlp": 1.03828955, + "epoch": 0.6744901885340515, + "flos": 463291107840.0, + "grad_norm": 0.05947075073095298, + "language_loss": 0.81730378, + "learning_rate": 0.0002530639815011807, + "loss": 0.82803857, + "num_input_tokens_seen": 290756352, + "router_z_loss_mlp": 0.35229492, + "step": 3506, + "time_per_iteration": 2.497544765472412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068375, + "balance_loss_mlp": 1.0326128, + "epoch": 0.6746825702193151, + "flos": 631533652992.0, + "grad_norm": 0.07086052765097473, + "language_loss": 0.84639049, + "learning_rate": 0.0002527931326370781, + "loss": 0.85707426, + "num_input_tokens_seen": 290829776, + "router_z_loss_mlp": 0.35791016, + "step": 3507, + "time_per_iteration": 2.7526142597198486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069527, + "balance_loss_mlp": 1.03395462, + "epoch": 0.6748749519045787, + "flos": 670835644416.0, + "grad_norm": 0.05093445347334381, + "language_loss": 0.82660782, + "learning_rate": 0.00025252237974719276, + "loss": 0.83730316, + "num_input_tokens_seen": 290900736, + "router_z_loss_mlp": 0.35595703, + "step": 3508, + "time_per_iteration": 2.8549742698669434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107359, + "balance_loss_mlp": 1.03782725, + "epoch": 0.6750673335898423, + "flos": 766756827648.0, + "grad_norm": 0.05329285448866526, + "language_loss": 0.80265921, + "learning_rate": 0.00025225172293664056, + "loss": 0.81339508, + "num_input_tokens_seen": 290981696, + "router_z_loss_mlp": 0.3581543, + "step": 3509, + "time_per_iteration": 2.974613904953003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026572, + "balance_loss_mlp": 1.01465082, + "epoch": 0.6752597152751059, + "flos": 1511786198016.0, + "grad_norm": 0.015514835233315651, + "language_loss": 0.76933134, + "learning_rate": 0.00025198116231049954, + "loss": 0.77959704, + "num_input_tokens_seen": 291217888, + "router_z_loss_mlp": 0.11914062, + "step": 3510, + "time_per_iteration": 4.91582179069519 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072206, + "balance_loss_mlp": 1.03637218, + "epoch": 0.6754520969603693, + "flos": 686990329344.0, + "grad_norm": 0.06350153745428545, + "language_loss": 0.84804261, + "learning_rate": 0.00025171069797381106, + "loss": 0.85876471, + "num_input_tokens_seen": 291287856, + "router_z_loss_mlp": 0.35864258, + "step": 3511, + "time_per_iteration": 2.7993617057800293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066581, + "balance_loss_mlp": 1.0310328, + "epoch": 0.6756444786456329, + "flos": 500318940672.0, + "grad_norm": 0.06118900000736982, + "language_loss": 0.81987178, + "learning_rate": 0.00025144033003157864, + "loss": 0.83053756, + "num_input_tokens_seen": 291354912, + "router_z_loss_mlp": 0.35620117, + "step": 3512, + "time_per_iteration": 2.5873219966888428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069288, + "balance_loss_mlp": 1.03450298, + "epoch": 0.6758368603308965, + "flos": 492357270528.0, + "grad_norm": 0.060009957038895716, + "language_loss": 0.78680366, + "learning_rate": 0.00025117005858876806, + "loss": 0.7974965, + "num_input_tokens_seen": 291426816, + "router_z_loss_mlp": 0.34838867, + "step": 3513, + "time_per_iteration": 2.6835427284240723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069658, + "balance_loss_mlp": 1.03427649, + "epoch": 0.6760292420161601, + "flos": 555657753600.0, + "grad_norm": 0.15540830916665044, + "language_loss": 0.8478874, + "learning_rate": 0.000250899883750308, + "loss": 0.85858399, + "num_input_tokens_seen": 291497648, + "router_z_loss_mlp": 0.35400391, + "step": 3514, + "time_per_iteration": 2.650256395339966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070046, + "balance_loss_mlp": 1.03478396, + "epoch": 0.6762216237014236, + "flos": 607322755584.0, + "grad_norm": 0.06069446103583955, + "language_loss": 0.8186444, + "learning_rate": 0.00025062980562109006, + "loss": 0.82934481, + "num_input_tokens_seen": 291568080, + "router_z_loss_mlp": 0.35302734, + "step": 3515, + "time_per_iteration": 2.7015137672424316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066431, + "balance_loss_mlp": 1.0309782, + "epoch": 0.6764140053866872, + "flos": 533501254656.0, + "grad_norm": 0.06011919218972519, + "language_loss": 0.82936066, + "learning_rate": 0.0002503598243059677, + "loss": 0.84002495, + "num_input_tokens_seen": 291644896, + "router_z_loss_mlp": 0.35473633, + "step": 3516, + "time_per_iteration": 2.7936599254608154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066759, + "balance_loss_mlp": 1.03221166, + "epoch": 0.6766063870719508, + "flos": 504548573184.0, + "grad_norm": 0.0538086785967606, + "language_loss": 0.79831243, + "learning_rate": 0.0002500899399097568, + "loss": 0.80897999, + "num_input_tokens_seen": 291716864, + "router_z_loss_mlp": 0.34594727, + "step": 3517, + "time_per_iteration": 2.647766351699829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068577, + "balance_loss_mlp": 1.03340983, + "epoch": 0.6767987687572143, + "flos": 512923470336.0, + "grad_norm": 0.05682834446853688, + "language_loss": 0.85193241, + "learning_rate": 0.0002498201525372359, + "loss": 0.86261815, + "num_input_tokens_seen": 291786000, + "router_z_loss_mlp": 0.35205078, + "step": 3518, + "time_per_iteration": 2.5557949542999268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064417, + "balance_loss_mlp": 1.03029943, + "epoch": 0.6769911504424779, + "flos": 524780121600.0, + "grad_norm": 0.05092560749530118, + "language_loss": 0.83158201, + "learning_rate": 0.00024955046229314584, + "loss": 0.84222615, + "num_input_tokens_seen": 291854768, + "router_z_loss_mlp": 0.34130859, + "step": 3519, + "time_per_iteration": 2.578089475631714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069846, + "balance_loss_mlp": 1.03422618, + "epoch": 0.6771835321277414, + "flos": 449662275072.0, + "grad_norm": 0.05617502004048809, + "language_loss": 0.87603748, + "learning_rate": 0.00024928086928218947, + "loss": 0.88673592, + "num_input_tokens_seen": 291918096, + "router_z_loss_mlp": 0.35644531, + "step": 3520, + "time_per_iteration": 2.490943193435669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068484, + "balance_loss_mlp": 1.03322208, + "epoch": 0.677375913813005, + "flos": 709020200448.0, + "grad_norm": 0.051602142671676454, + "language_loss": 0.75993657, + "learning_rate": 0.00024901137360903216, + "loss": 0.77062142, + "num_input_tokens_seen": 291998752, + "router_z_loss_mlp": 0.35302734, + "step": 3521, + "time_per_iteration": 2.9075634479522705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073222, + "balance_loss_mlp": 1.03817451, + "epoch": 0.6775682954982686, + "flos": 428189635584.0, + "grad_norm": 0.10231641973637204, + "language_loss": 0.81175685, + "learning_rate": 0.00024874197537830115, + "loss": 0.82248902, + "num_input_tokens_seen": 292065056, + "router_z_loss_mlp": 0.35083008, + "step": 3522, + "time_per_iteration": 2.5057058334350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069237, + "balance_loss_mlp": 1.03478503, + "epoch": 0.6777606771835322, + "flos": 437677585920.0, + "grad_norm": 0.060253133761597404, + "language_loss": 0.83087361, + "learning_rate": 0.00024847267469458684, + "loss": 0.84156603, + "num_input_tokens_seen": 292129248, + "router_z_loss_mlp": 0.3449707, + "step": 3523, + "time_per_iteration": 2.5406739711761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067339, + "balance_loss_mlp": 1.03210068, + "epoch": 0.6779530588687956, + "flos": 775106993664.0, + "grad_norm": 0.0551254373136415, + "language_loss": 0.78231275, + "learning_rate": 0.00024820347166244034, + "loss": 0.79298615, + "num_input_tokens_seen": 292206080, + "router_z_loss_mlp": 0.35302734, + "step": 3524, + "time_per_iteration": 3.021663188934326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064562, + "balance_loss_mlp": 1.03013432, + "epoch": 0.6781454405540592, + "flos": 571502518272.0, + "grad_norm": 0.04412805225967261, + "language_loss": 0.84577274, + "learning_rate": 0.0002479343663863755, + "loss": 0.85641837, + "num_input_tokens_seen": 292280192, + "router_z_loss_mlp": 0.34448242, + "step": 3525, + "time_per_iteration": 2.760934352874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070433, + "balance_loss_mlp": 1.03395486, + "epoch": 0.6783378222393228, + "flos": 484788478464.0, + "grad_norm": 0.051123449842866715, + "language_loss": 0.76749617, + "learning_rate": 0.00024766535897086876, + "loss": 0.77820051, + "num_input_tokens_seen": 292347792, + "router_z_loss_mlp": 0.36474609, + "step": 3526, + "time_per_iteration": 2.5466532707214355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071181, + "balance_loss_mlp": 1.03584695, + "epoch": 0.6785302039245864, + "flos": 482592895488.0, + "grad_norm": 0.04922293189317912, + "language_loss": 0.78913069, + "learning_rate": 0.0002473964495203578, + "loss": 0.79984254, + "num_input_tokens_seen": 292420032, + "router_z_loss_mlp": 0.35351562, + "step": 3527, + "time_per_iteration": 2.65765118598938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072886, + "balance_loss_mlp": 1.03609788, + "epoch": 0.67872258560985, + "flos": 524451262464.0, + "grad_norm": 0.04942804135010068, + "language_loss": 0.85464156, + "learning_rate": 0.0002471276381392425, + "loss": 0.86537039, + "num_input_tokens_seen": 292497792, + "router_z_loss_mlp": 0.36791992, + "step": 3528, + "time_per_iteration": 2.75915265083313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038837, + "balance_loss_mlp": 1.02634406, + "epoch": 0.6789149672951135, + "flos": 1551786605568.0, + "grad_norm": 0.02259283228752806, + "language_loss": 0.78188634, + "learning_rate": 0.0002468589249318848, + "loss": 0.79227471, + "num_input_tokens_seen": 292726704, + "router_z_loss_mlp": 0.125, + "step": 3529, + "time_per_iteration": 4.964378356933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069996, + "balance_loss_mlp": 1.0344243, + "epoch": 0.6791073489803771, + "flos": 741088051200.0, + "grad_norm": 0.05189094051618866, + "language_loss": 0.84224343, + "learning_rate": 0.00024659031000260826, + "loss": 0.85294336, + "num_input_tokens_seen": 292802320, + "router_z_loss_mlp": 0.35595703, + "step": 3530, + "time_per_iteration": 2.8634302616119385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072509, + "balance_loss_mlp": 1.03638899, + "epoch": 0.6792997306656406, + "flos": 576095915520.0, + "grad_norm": 0.055023533803773034, + "language_loss": 0.80543637, + "learning_rate": 0.0002463217934556985, + "loss": 0.81616145, + "num_input_tokens_seen": 292870480, + "router_z_loss_mlp": 0.36132812, + "step": 3531, + "time_per_iteration": 2.632070541381836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01031238, + "balance_loss_mlp": 1.01884079, + "epoch": 0.6794921123509042, + "flos": 1502538356736.0, + "grad_norm": 0.018779116568333653, + "language_loss": 0.7653209, + "learning_rate": 0.000246053375395403, + "loss": 0.77563328, + "num_input_tokens_seen": 293100752, + "router_z_loss_mlp": 0.12402344, + "step": 3532, + "time_per_iteration": 4.7274627685546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073531, + "balance_loss_mlp": 1.03836441, + "epoch": 0.6796844940361677, + "flos": 698620018176.0, + "grad_norm": 0.05756666047667581, + "language_loss": 0.8354668, + "learning_rate": 0.0002457850559259306, + "loss": 0.84620214, + "num_input_tokens_seen": 293178192, + "router_z_loss_mlp": 0.35205078, + "step": 3533, + "time_per_iteration": 2.8860280513763428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074469, + "balance_loss_mlp": 1.03901649, + "epoch": 0.6798768757214313, + "flos": 552496094208.0, + "grad_norm": 0.05133054826538493, + "language_loss": 0.81485093, + "learning_rate": 0.00024551683515145275, + "loss": 0.82559562, + "num_input_tokens_seen": 293246368, + "router_z_loss_mlp": 0.35498047, + "step": 3534, + "time_per_iteration": 2.620476722717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072102, + "balance_loss_mlp": 1.03610086, + "epoch": 0.6800692574066949, + "flos": 522677670912.0, + "grad_norm": 0.04887500327812814, + "language_loss": 0.86479199, + "learning_rate": 0.0002452487131761014, + "loss": 0.87551308, + "num_input_tokens_seen": 293320656, + "router_z_loss_mlp": 0.35986328, + "step": 3535, + "time_per_iteration": 2.7402584552764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069949, + "balance_loss_mlp": 1.03523564, + "epoch": 0.6802616390919585, + "flos": 573747563520.0, + "grad_norm": 0.05056319210769973, + "language_loss": 0.79672563, + "learning_rate": 0.00024498069010397093, + "loss": 0.80742508, + "num_input_tokens_seen": 293388592, + "router_z_loss_mlp": 0.34741211, + "step": 3536, + "time_per_iteration": 2.6493327617645264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076551, + "balance_loss_mlp": 1.04109788, + "epoch": 0.6804540207772221, + "flos": 487915232256.0, + "grad_norm": 0.08967027587321133, + "language_loss": 0.85052317, + "learning_rate": 0.00024471276603911697, + "loss": 0.86128873, + "num_input_tokens_seen": 293453936, + "router_z_loss_mlp": 0.35449219, + "step": 3537, + "time_per_iteration": 2.5946011543273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074683, + "balance_loss_mlp": 1.03946912, + "epoch": 0.6806464024624855, + "flos": 578307465216.0, + "grad_norm": 0.050744450088680546, + "language_loss": 0.78934067, + "learning_rate": 0.0002444449410855572, + "loss": 0.80008757, + "num_input_tokens_seen": 293527664, + "router_z_loss_mlp": 0.35229492, + "step": 3538, + "time_per_iteration": 2.7160799503326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073592, + "balance_loss_mlp": 1.03778172, + "epoch": 0.6808387841477491, + "flos": 553456378368.0, + "grad_norm": 0.0415443850681439, + "language_loss": 0.84257662, + "learning_rate": 0.00024417721534727033, + "loss": 0.85331261, + "num_input_tokens_seen": 293599344, + "router_z_loss_mlp": 0.35864258, + "step": 3539, + "time_per_iteration": 2.6316590309143066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067702, + "balance_loss_mlp": 1.03220177, + "epoch": 0.6810311658330127, + "flos": 426613893120.0, + "grad_norm": 0.06268112342212401, + "language_loss": 0.82995272, + "learning_rate": 0.00024390958892819687, + "loss": 0.8406297, + "num_input_tokens_seen": 293663088, + "router_z_loss_mlp": 0.35546875, + "step": 3540, + "time_per_iteration": 2.4619975090026855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071831, + "balance_loss_mlp": 1.03518569, + "epoch": 0.6812235475182763, + "flos": 571956443136.0, + "grad_norm": 0.047330457395290515, + "language_loss": 0.80951297, + "learning_rate": 0.0002436420619322381, + "loss": 0.82023126, + "num_input_tokens_seen": 293741296, + "router_z_loss_mlp": 0.36645508, + "step": 3541, + "time_per_iteration": 2.814427614212036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070446, + "balance_loss_mlp": 1.03515983, + "epoch": 0.6814159292035398, + "flos": 501648781824.0, + "grad_norm": 0.0608425293250951, + "language_loss": 0.82551098, + "learning_rate": 0.0002433746344632577, + "loss": 0.83621544, + "num_input_tokens_seen": 293815840, + "router_z_loss_mlp": 0.35327148, + "step": 3542, + "time_per_iteration": 2.6463205814361572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069714, + "balance_loss_mlp": 1.03340268, + "epoch": 0.6816083108888034, + "flos": 765176702976.0, + "grad_norm": 0.05597669105837374, + "language_loss": 0.7998035, + "learning_rate": 0.00024310730662508006, + "loss": 0.81050068, + "num_input_tokens_seen": 293896368, + "router_z_loss_mlp": 0.36303711, + "step": 3543, + "time_per_iteration": 3.0262770652770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107287, + "balance_loss_mlp": 1.03787053, + "epoch": 0.681800692574067, + "flos": 479205683712.0, + "grad_norm": 0.05246394950285061, + "language_loss": 0.87412894, + "learning_rate": 0.0002428400785214911, + "loss": 0.88485765, + "num_input_tokens_seen": 293963344, + "router_z_loss_mlp": 0.3503418, + "step": 3544, + "time_per_iteration": 2.6026573181152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072118, + "balance_loss_mlp": 1.03547359, + "epoch": 0.6819930742593305, + "flos": 691298537472.0, + "grad_norm": 0.057535239065408805, + "language_loss": 0.8261283, + "learning_rate": 0.00024257295025623794, + "loss": 0.83684945, + "num_input_tokens_seen": 294035440, + "router_z_loss_mlp": 0.36645508, + "step": 3545, + "time_per_iteration": 2.813525915145874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066625, + "balance_loss_mlp": 1.03059971, + "epoch": 0.6821854559445941, + "flos": 677783185920.0, + "grad_norm": 0.051890775320829655, + "language_loss": 0.80731034, + "learning_rate": 0.00024230592193302892, + "loss": 0.81797659, + "num_input_tokens_seen": 294116944, + "router_z_loss_mlp": 0.3605957, + "step": 3546, + "time_per_iteration": 2.852640151977539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069168, + "balance_loss_mlp": 1.03378654, + "epoch": 0.6823778376298576, + "flos": 461956884480.0, + "grad_norm": 0.04826922291722955, + "language_loss": 0.84192979, + "learning_rate": 0.00024203899365553372, + "loss": 0.85262144, + "num_input_tokens_seen": 294178976, + "router_z_loss_mlp": 0.35424805, + "step": 3547, + "time_per_iteration": 2.51088285446167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018198, + "balance_loss_mlp": 1.00651574, + "epoch": 0.6825702193151212, + "flos": 1474582427136.0, + "grad_norm": 0.01234117563256537, + "language_loss": 0.76734358, + "learning_rate": 0.00024177216552738302, + "loss": 0.77752554, + "num_input_tokens_seen": 294384960, + "router_z_loss_mlp": 0.11669922, + "step": 3548, + "time_per_iteration": 4.512159824371338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069713, + "balance_loss_mlp": 1.03397429, + "epoch": 0.6827626010003848, + "flos": 722791627776.0, + "grad_norm": 0.05201405662428197, + "language_loss": 0.83068311, + "learning_rate": 0.00024150543765216848, + "loss": 0.84138024, + "num_input_tokens_seen": 294461408, + "router_z_loss_mlp": 0.35766602, + "step": 3549, + "time_per_iteration": 2.9022421836853027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066879, + "balance_loss_mlp": 1.03066325, + "epoch": 0.6829549826856484, + "flos": 558596832768.0, + "grad_norm": 0.050492877395882395, + "language_loss": 0.83153272, + "learning_rate": 0.00024123881013344352, + "loss": 0.84220147, + "num_input_tokens_seen": 294530624, + "router_z_loss_mlp": 0.36230469, + "step": 3550, + "time_per_iteration": 2.663245677947998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070068, + "balance_loss_mlp": 1.03509164, + "epoch": 0.6831473643709118, + "flos": 624635573760.0, + "grad_norm": 0.06049149203697264, + "language_loss": 0.79663515, + "learning_rate": 0.00024097228307472202, + "loss": 0.80733585, + "num_input_tokens_seen": 294606784, + "router_z_loss_mlp": 0.35009766, + "step": 3551, + "time_per_iteration": 2.7762739658355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070299, + "balance_loss_mlp": 1.03458428, + "epoch": 0.6833397460561754, + "flos": 713553960960.0, + "grad_norm": 0.05841581019215986, + "language_loss": 0.81410074, + "learning_rate": 0.00024070585657947846, + "loss": 0.82480371, + "num_input_tokens_seen": 294686960, + "router_z_loss_mlp": 0.35717773, + "step": 3552, + "time_per_iteration": 2.8573665618896484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070846, + "balance_loss_mlp": 1.03470206, + "epoch": 0.683532127741439, + "flos": 464449241088.0, + "grad_norm": 0.042320338748993415, + "language_loss": 0.85217428, + "learning_rate": 0.00024043953075114934, + "loss": 0.86288273, + "num_input_tokens_seen": 294759712, + "router_z_loss_mlp": 0.36157227, + "step": 3553, + "time_per_iteration": 2.6308178901672363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106716, + "balance_loss_mlp": 1.03230345, + "epoch": 0.6837245094267026, + "flos": 581979866112.0, + "grad_norm": 0.06353851780596993, + "language_loss": 0.88855463, + "learning_rate": 0.00024017330569313128, + "loss": 0.89922619, + "num_input_tokens_seen": 294830592, + "router_z_loss_mlp": 0.34912109, + "step": 3554, + "time_per_iteration": 2.691176176071167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070226, + "balance_loss_mlp": 1.03415298, + "epoch": 0.6839168911119662, + "flos": 793836993024.0, + "grad_norm": 0.05307417263054524, + "language_loss": 0.74880016, + "learning_rate": 0.0002399071815087821, + "loss": 0.75950241, + "num_input_tokens_seen": 294907504, + "router_z_loss_mlp": 0.36108398, + "step": 3555, + "time_per_iteration": 2.990910530090332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073324, + "balance_loss_mlp": 1.03803802, + "epoch": 0.6841092727972297, + "flos": 579734820864.0, + "grad_norm": 0.05505515245095852, + "language_loss": 0.83355868, + "learning_rate": 0.00023964115830142025, + "loss": 0.84429193, + "num_input_tokens_seen": 294977600, + "router_z_loss_mlp": 0.35327148, + "step": 3556, + "time_per_iteration": 2.6737208366394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070632, + "balance_loss_mlp": 1.03522646, + "epoch": 0.6843016544824932, + "flos": 383530401792.0, + "grad_norm": 0.06254442302238046, + "language_loss": 0.8747263, + "learning_rate": 0.00023937523617432522, + "loss": 0.8854326, + "num_input_tokens_seen": 295039408, + "router_z_loss_mlp": 0.35449219, + "step": 3557, + "time_per_iteration": 2.4377589225769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066745, + "balance_loss_mlp": 1.03176904, + "epoch": 0.6844940361677568, + "flos": 1438474332672.0, + "grad_norm": 0.05391810386575329, + "language_loss": 0.86953497, + "learning_rate": 0.00023910941523073705, + "loss": 0.88020241, + "num_input_tokens_seen": 295142928, + "router_z_loss_mlp": 0.34985352, + "step": 3558, + "time_per_iteration": 3.854933738708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072327, + "balance_loss_mlp": 1.03739882, + "epoch": 0.6846864178530204, + "flos": 520614508032.0, + "grad_norm": 0.05572945475530707, + "language_loss": 0.86660743, + "learning_rate": 0.0002388436955738566, + "loss": 0.87733072, + "num_input_tokens_seen": 295215504, + "router_z_loss_mlp": 0.34960938, + "step": 3559, + "time_per_iteration": 2.6673743724823 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072956, + "balance_loss_mlp": 1.03874326, + "epoch": 0.6848787995382839, + "flos": 717626442240.0, + "grad_norm": 0.051092768918582485, + "language_loss": 0.81714153, + "learning_rate": 0.00023857807730684523, + "loss": 0.82787108, + "num_input_tokens_seen": 295291024, + "router_z_loss_mlp": 0.3425293, + "step": 3560, + "time_per_iteration": 2.8930888175964355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074328, + "balance_loss_mlp": 1.03956604, + "epoch": 0.6850711812235475, + "flos": 510787524096.0, + "grad_norm": 0.06174671890156068, + "language_loss": 0.82387376, + "learning_rate": 0.00023831256053282547, + "loss": 0.83461708, + "num_input_tokens_seen": 295363248, + "router_z_loss_mlp": 0.34790039, + "step": 3561, + "time_per_iteration": 2.6872005462646484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073119, + "balance_loss_mlp": 1.03923941, + "epoch": 0.6852635629088111, + "flos": 667832546304.0, + "grad_norm": 0.051363024529254335, + "language_loss": 0.78085375, + "learning_rate": 0.00023804714535488003, + "loss": 0.79158491, + "num_input_tokens_seen": 295442032, + "router_z_loss_mlp": 0.33911133, + "step": 3562, + "time_per_iteration": 4.3489556312561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008548, + "balance_loss_mlp": 0.9979142, + "epoch": 0.6854559445940747, + "flos": 1522136918016.0, + "grad_norm": 0.005165223405227486, + "language_loss": 0.7980963, + "learning_rate": 0.0002377818318760519, + "loss": 0.80818176, + "num_input_tokens_seen": 295680560, + "router_z_loss_mlp": 0.10644531, + "step": 3563, + "time_per_iteration": 4.906137704849243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072342, + "balance_loss_mlp": 1.03812885, + "epoch": 0.6856483262793382, + "flos": 453970483200.0, + "grad_norm": 0.05119141259642537, + "language_loss": 0.80591673, + "learning_rate": 0.00023751662019934488, + "loss": 0.81664014, + "num_input_tokens_seen": 295745712, + "router_z_loss_mlp": 0.3425293, + "step": 3564, + "time_per_iteration": 2.4906551837921143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071978, + "balance_loss_mlp": 1.03745532, + "epoch": 0.6858407079646017, + "flos": 615269869056.0, + "grad_norm": 0.08282945217506828, + "language_loss": 0.79188418, + "learning_rate": 0.00023725151042772364, + "loss": 0.80260396, + "num_input_tokens_seen": 295815104, + "router_z_loss_mlp": 0.34545898, + "step": 3565, + "time_per_iteration": 2.7048499584198 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075078, + "balance_loss_mlp": 1.04065084, + "epoch": 0.6860330896498653, + "flos": 465793638912.0, + "grad_norm": 0.05470196692680893, + "language_loss": 0.82981157, + "learning_rate": 0.00023698650266411276, + "loss": 0.8405624, + "num_input_tokens_seen": 295882928, + "router_z_loss_mlp": 0.34472656, + "step": 3566, + "time_per_iteration": 2.6011905670166016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072281, + "balance_loss_mlp": 1.03909349, + "epoch": 0.6862254713351289, + "flos": 863879814144.0, + "grad_norm": 0.05579586531854514, + "language_loss": 0.82876581, + "learning_rate": 0.00023672159701139755, + "loss": 0.83948863, + "num_input_tokens_seen": 295970960, + "router_z_loss_mlp": 0.33203125, + "step": 3567, + "time_per_iteration": 3.1918952465057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078201, + "balance_loss_mlp": 1.0438447, + "epoch": 0.6864178530203925, + "flos": 446905078272.0, + "grad_norm": 0.06805670760386738, + "language_loss": 0.85873824, + "learning_rate": 0.00023645679357242296, + "loss": 0.86952031, + "num_input_tokens_seen": 296036128, + "router_z_loss_mlp": 0.34399414, + "step": 3568, + "time_per_iteration": 2.4888172149658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074879, + "balance_loss_mlp": 1.04128623, + "epoch": 0.6866102347056561, + "flos": 424034196480.0, + "grad_norm": 0.05006770232648597, + "language_loss": 0.83895862, + "learning_rate": 0.00023619209244999534, + "loss": 0.84970748, + "num_input_tokens_seen": 296101440, + "router_z_loss_mlp": 0.33618164, + "step": 3569, + "time_per_iteration": 2.502540111541748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107562, + "balance_loss_mlp": 1.04150224, + "epoch": 0.6868026163909196, + "flos": 472134486528.0, + "grad_norm": 0.060913037985659245, + "language_loss": 0.85054779, + "learning_rate": 0.0002359274937468806, + "loss": 0.86130404, + "num_input_tokens_seen": 296165504, + "router_z_loss_mlp": 0.34155273, + "step": 3570, + "time_per_iteration": 2.5016539096832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076781, + "balance_loss_mlp": 1.04263973, + "epoch": 0.6869949980761831, + "flos": 463937089536.0, + "grad_norm": 0.04774464497453654, + "language_loss": 0.778054, + "learning_rate": 0.00023566299756580512, + "loss": 0.78882182, + "num_input_tokens_seen": 296236880, + "router_z_loss_mlp": 0.34179688, + "step": 3571, + "time_per_iteration": 2.6037425994873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076706, + "balance_loss_mlp": 1.04194498, + "epoch": 0.6871873797614467, + "flos": 426012991488.0, + "grad_norm": 0.056784915958369084, + "language_loss": 0.7818104, + "learning_rate": 0.0002353986040094551, + "loss": 0.79257739, + "num_input_tokens_seen": 296299776, + "router_z_loss_mlp": 0.34765625, + "step": 3572, + "time_per_iteration": 2.4650750160217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077273, + "balance_loss_mlp": 1.04286885, + "epoch": 0.6873797614467103, + "flos": 443394210816.0, + "grad_norm": 0.05696789275443238, + "language_loss": 0.7911824, + "learning_rate": 0.00023513431318047796, + "loss": 0.8019551, + "num_input_tokens_seen": 296365408, + "router_z_loss_mlp": 0.34448242, + "step": 3573, + "time_per_iteration": 2.5429108142852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072767, + "balance_loss_mlp": 1.03912568, + "epoch": 0.6875721431319738, + "flos": 991927074816.0, + "grad_norm": 0.06588497554546605, + "language_loss": 0.76656246, + "learning_rate": 0.00023487012518147977, + "loss": 0.77729011, + "num_input_tokens_seen": 296445488, + "router_z_loss_mlp": 0.33666992, + "step": 3574, + "time_per_iteration": 3.2478342056274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074542, + "balance_loss_mlp": 1.03985214, + "epoch": 0.6877645248172374, + "flos": 1285031900160.0, + "grad_norm": 0.05648016172081939, + "language_loss": 0.84123796, + "learning_rate": 0.00023460604011502772, + "loss": 0.85198337, + "num_input_tokens_seen": 296529936, + "router_z_loss_mlp": 0.34692383, + "step": 3575, + "time_per_iteration": 3.6104493141174316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073345, + "balance_loss_mlp": 1.03946543, + "epoch": 0.687956906502501, + "flos": 876360688128.0, + "grad_norm": 0.05234067730424214, + "language_loss": 0.8542276, + "learning_rate": 0.00023434205808364845, + "loss": 0.86496103, + "num_input_tokens_seen": 296607488, + "router_z_loss_mlp": 0.33911133, + "step": 3576, + "time_per_iteration": 3.1311981678009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107559, + "balance_loss_mlp": 1.04142499, + "epoch": 0.6881492881877646, + "flos": 563038871040.0, + "grad_norm": 0.05805523475293479, + "language_loss": 0.8543247, + "learning_rate": 0.00023407817918982932, + "loss": 0.86508065, + "num_input_tokens_seen": 296678672, + "router_z_loss_mlp": 0.34204102, + "step": 3577, + "time_per_iteration": 2.76940655708313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075416, + "balance_loss_mlp": 1.04101276, + "epoch": 0.6883416698730281, + "flos": 794782720512.0, + "grad_norm": 0.05454368675547281, + "language_loss": 0.7852968, + "learning_rate": 0.00023381440353601718, + "loss": 0.79605091, + "num_input_tokens_seen": 296758896, + "router_z_loss_mlp": 0.34448242, + "step": 3578, + "time_per_iteration": 2.987713098526001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078404, + "balance_loss_mlp": 1.04295087, + "epoch": 0.6885340515582916, + "flos": 723308161536.0, + "grad_norm": 0.1550034716178633, + "language_loss": 0.8585633, + "learning_rate": 0.00023355073122461822, + "loss": 0.86934739, + "num_input_tokens_seen": 296830736, + "router_z_loss_mlp": 0.35449219, + "step": 3579, + "time_per_iteration": 2.8689723014831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073162, + "balance_loss_mlp": 1.03866315, + "epoch": 0.6887264332435552, + "flos": 1010529036288.0, + "grad_norm": 0.05073405937769219, + "language_loss": 0.82913256, + "learning_rate": 0.00023328716235799973, + "loss": 0.83986419, + "num_input_tokens_seen": 296911504, + "router_z_loss_mlp": 0.34545898, + "step": 3580, + "time_per_iteration": 3.2760398387908936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077211, + "balance_loss_mlp": 1.04292655, + "epoch": 0.6889188149288188, + "flos": 584993138688.0, + "grad_norm": 0.0642868391556551, + "language_loss": 0.83958888, + "learning_rate": 0.00023302369703848803, + "loss": 0.85036099, + "num_input_tokens_seen": 296981488, + "router_z_loss_mlp": 0.34326172, + "step": 3581, + "time_per_iteration": 2.6795780658721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075736, + "balance_loss_mlp": 1.04121315, + "epoch": 0.6891111966140824, + "flos": 635831686656.0, + "grad_norm": 0.05830003162798764, + "language_loss": 0.79951459, + "learning_rate": 0.00023276033536836937, + "loss": 0.81027198, + "num_input_tokens_seen": 297054896, + "router_z_loss_mlp": 0.34570312, + "step": 3582, + "time_per_iteration": 2.7684953212738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074456, + "balance_loss_mlp": 1.03964663, + "epoch": 0.6893035782993459, + "flos": 495011160576.0, + "grad_norm": 0.04509310145442872, + "language_loss": 0.84428883, + "learning_rate": 0.00023249707744988984, + "loss": 0.8550334, + "num_input_tokens_seen": 297128224, + "router_z_loss_mlp": 0.34838867, + "step": 3583, + "time_per_iteration": 2.6324620246887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074677, + "balance_loss_mlp": 1.04041624, + "epoch": 0.6894959599846094, + "flos": 457983327744.0, + "grad_norm": 0.06541043788965, + "language_loss": 0.81646812, + "learning_rate": 0.00023223392338525529, + "loss": 0.8272149, + "num_input_tokens_seen": 297191312, + "router_z_loss_mlp": 0.34301758, + "step": 3584, + "time_per_iteration": 2.496835231781006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070744, + "balance_loss_mlp": 1.03614986, + "epoch": 0.689688341669873, + "flos": 504740630016.0, + "grad_norm": 0.0500959825049001, + "language_loss": 0.78515136, + "learning_rate": 0.00023197087327663107, + "loss": 0.7958588, + "num_input_tokens_seen": 297261904, + "router_z_loss_mlp": 0.34643555, + "step": 3585, + "time_per_iteration": 2.6497855186462402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107737, + "balance_loss_mlp": 1.04349089, + "epoch": 0.6898807233551366, + "flos": 763584993792.0, + "grad_norm": 0.05545986059450925, + "language_loss": 0.81721687, + "learning_rate": 0.00023170792722614243, + "loss": 0.82799053, + "num_input_tokens_seen": 297338352, + "router_z_loss_mlp": 0.33911133, + "step": 3586, + "time_per_iteration": 2.8789288997650146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071576, + "balance_loss_mlp": 1.0367434, + "epoch": 0.6900731050404002, + "flos": 583030310400.0, + "grad_norm": 0.05029766249236532, + "language_loss": 0.83530807, + "learning_rate": 0.00023144508533587377, + "loss": 0.84602392, + "num_input_tokens_seen": 297416688, + "router_z_loss_mlp": 0.34863281, + "step": 3587, + "time_per_iteration": 2.8913052082061768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074935, + "balance_loss_mlp": 1.03998244, + "epoch": 0.6902654867256637, + "flos": 711531495936.0, + "grad_norm": 0.0709422421698616, + "language_loss": 0.7865144, + "learning_rate": 0.0002311823477078698, + "loss": 0.79726374, + "num_input_tokens_seen": 297499968, + "router_z_loss_mlp": 0.35009766, + "step": 3588, + "time_per_iteration": 2.923501491546631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068421, + "balance_loss_mlp": 1.03446984, + "epoch": 0.6904578684109273, + "flos": 596816294400.0, + "grad_norm": 0.26453664714217867, + "language_loss": 0.8501482, + "learning_rate": 0.00023091971444413428, + "loss": 0.86083239, + "num_input_tokens_seen": 297574480, + "router_z_loss_mlp": 0.33984375, + "step": 3589, + "time_per_iteration": 2.779235363006592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076229, + "balance_loss_mlp": 1.04056144, + "epoch": 0.6906502500961909, + "flos": 584757411840.0, + "grad_norm": 0.051361873763105706, + "language_loss": 0.82785845, + "learning_rate": 0.00023065718564663012, + "loss": 0.83862066, + "num_input_tokens_seen": 297645360, + "router_z_loss_mlp": 0.35668945, + "step": 3590, + "time_per_iteration": 2.7035253047943115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020688, + "balance_loss_mlp": 1.00957787, + "epoch": 0.6908426317814544, + "flos": 1587001559040.0, + "grad_norm": 0.009423557970014077, + "language_loss": 0.73911589, + "learning_rate": 0.00023039476141728011, + "loss": 0.74932277, + "num_input_tokens_seen": 297879472, + "router_z_loss_mlp": 0.11132812, + "step": 3591, + "time_per_iteration": 4.9744603633880615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073352, + "balance_loss_mlp": 1.03901935, + "epoch": 0.6910350134667179, + "flos": 500525554176.0, + "grad_norm": 0.048031169148873155, + "language_loss": 0.80940306, + "learning_rate": 0.0002301324418579666, + "loss": 0.82013655, + "num_input_tokens_seen": 297950672, + "router_z_loss_mlp": 0.34350586, + "step": 3592, + "time_per_iteration": 2.673436403274536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016267, + "balance_loss_mlp": 1.00534713, + "epoch": 0.6912273951519815, + "flos": 1408462138368.0, + "grad_norm": 0.006132313228220279, + "language_loss": 0.78688473, + "learning_rate": 0.00022987022707053107, + "loss": 0.79704738, + "num_input_tokens_seen": 298171728, + "router_z_loss_mlp": 0.109375, + "step": 3593, + "time_per_iteration": 4.7109363079071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074657, + "balance_loss_mlp": 1.04053962, + "epoch": 0.6914197768372451, + "flos": 634961562624.0, + "grad_norm": 0.056049498625347735, + "language_loss": 0.80705756, + "learning_rate": 0.00022960811715677415, + "loss": 0.8178041, + "num_input_tokens_seen": 298250304, + "router_z_loss_mlp": 0.34155273, + "step": 3594, + "time_per_iteration": 2.830838918685913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107036, + "balance_loss_mlp": 1.03686213, + "epoch": 0.6916121585225087, + "flos": 557755822080.0, + "grad_norm": 0.05478776736586074, + "language_loss": 0.81540507, + "learning_rate": 0.00022934611221845608, + "loss": 0.82610869, + "num_input_tokens_seen": 298328000, + "router_z_loss_mlp": 0.33520508, + "step": 3595, + "time_per_iteration": 2.800851583480835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074607, + "balance_loss_mlp": 1.04127622, + "epoch": 0.6918045402077723, + "flos": 528887508480.0, + "grad_norm": 0.051880347807473304, + "language_loss": 0.77869982, + "learning_rate": 0.00022908421235729609, + "loss": 0.78944588, + "num_input_tokens_seen": 298406832, + "router_z_loss_mlp": 0.33349609, + "step": 3596, + "time_per_iteration": 2.7151432037353516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071648, + "balance_loss_mlp": 1.03645778, + "epoch": 0.6919969218930357, + "flos": 570083927040.0, + "grad_norm": 0.044849912113491465, + "language_loss": 0.85305548, + "learning_rate": 0.0002288224176749728, + "loss": 0.86377192, + "num_input_tokens_seen": 298477584, + "router_z_loss_mlp": 0.35205078, + "step": 3597, + "time_per_iteration": 2.634561061859131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075011, + "balance_loss_mlp": 1.04005897, + "epoch": 0.6921893035782993, + "flos": 683006598144.0, + "grad_norm": 0.0536844380747242, + "language_loss": 0.78127837, + "learning_rate": 0.00022856072827312385, + "loss": 0.79202843, + "num_input_tokens_seen": 298551872, + "router_z_loss_mlp": 0.34936523, + "step": 3598, + "time_per_iteration": 2.8242592811584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072467, + "balance_loss_mlp": 1.03830183, + "epoch": 0.6923816852635629, + "flos": 546484105728.0, + "grad_norm": 0.13391006913463419, + "language_loss": 0.76835263, + "learning_rate": 0.00022829914425334598, + "loss": 0.77907735, + "num_input_tokens_seen": 298619680, + "router_z_loss_mlp": 0.34204102, + "step": 3599, + "time_per_iteration": 2.634923219680786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074851, + "balance_loss_mlp": 1.04051888, + "epoch": 0.6925740669488265, + "flos": 509782159872.0, + "grad_norm": 0.0539133277986469, + "language_loss": 0.80556238, + "learning_rate": 0.0002280376657171956, + "loss": 0.81631094, + "num_input_tokens_seen": 298690080, + "router_z_loss_mlp": 0.34350586, + "step": 3600, + "time_per_iteration": 2.6054348945617676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071386, + "balance_loss_mlp": 1.03662419, + "epoch": 0.69276644863409, + "flos": 869053764096.0, + "grad_norm": 0.05194865310511828, + "language_loss": 0.76575196, + "learning_rate": 0.00022777629276618706, + "loss": 0.77646577, + "num_input_tokens_seen": 298777712, + "router_z_loss_mlp": 0.34765625, + "step": 3601, + "time_per_iteration": 3.1115190982818604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077383, + "balance_loss_mlp": 1.04219222, + "epoch": 0.6929588303193536, + "flos": 625486758912.0, + "grad_norm": 0.05453934109077095, + "language_loss": 0.77726191, + "learning_rate": 0.0002275150255017947, + "loss": 0.78803569, + "num_input_tokens_seen": 298854368, + "router_z_loss_mlp": 0.35205078, + "step": 3602, + "time_per_iteration": 2.7954330444335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013352, + "balance_loss_mlp": 1.00333869, + "epoch": 0.6931512120046172, + "flos": 1544530553856.0, + "grad_norm": 0.00865021754788789, + "language_loss": 0.75732672, + "learning_rate": 0.0002272538640254511, + "loss": 0.76746023, + "num_input_tokens_seen": 299091664, + "router_z_loss_mlp": 0.10009766, + "step": 3603, + "time_per_iteration": 4.98169469833374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016608, + "balance_loss_mlp": 1.00664246, + "epoch": 0.6933435936898807, + "flos": 1447460001792.0, + "grad_norm": 0.007581021196043067, + "language_loss": 0.75127101, + "learning_rate": 0.0002269928084385487, + "loss": 0.76143718, + "num_input_tokens_seen": 299312656, + "router_z_loss_mlp": 0.09960938, + "step": 3604, + "time_per_iteration": 4.666281223297119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071493, + "balance_loss_mlp": 1.03739882, + "epoch": 0.6935359753751443, + "flos": 540639442944.0, + "grad_norm": 0.05365572329513203, + "language_loss": 0.84348619, + "learning_rate": 0.0002267318588424379, + "loss": 0.85420108, + "num_input_tokens_seen": 299381136, + "router_z_loss_mlp": 0.34130859, + "step": 3605, + "time_per_iteration": 2.5876171588897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071367, + "balance_loss_mlp": 1.03755951, + "epoch": 0.6937283570604078, + "flos": 719074146816.0, + "grad_norm": 0.0635324341399035, + "language_loss": 0.87573755, + "learning_rate": 0.00022647101533842845, + "loss": 0.8864513, + "num_input_tokens_seen": 299455216, + "router_z_loss_mlp": 0.33837891, + "step": 3606, + "time_per_iteration": 2.873445510864258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072563, + "balance_loss_mlp": 1.03825426, + "epoch": 0.6939207387456714, + "flos": 521909443584.0, + "grad_norm": 0.05554055490203988, + "language_loss": 0.76844239, + "learning_rate": 0.00022621027802778872, + "loss": 0.77916795, + "num_input_tokens_seen": 299524352, + "router_z_loss_mlp": 0.34350586, + "step": 3607, + "time_per_iteration": 2.607332706451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074645, + "balance_loss_mlp": 1.04086149, + "epoch": 0.694113120430935, + "flos": 535100318208.0, + "grad_norm": 0.058788257779223134, + "language_loss": 0.78766942, + "learning_rate": 0.00022594964701174586, + "loss": 0.79841584, + "num_input_tokens_seen": 299594960, + "router_z_loss_mlp": 0.33813477, + "step": 3608, + "time_per_iteration": 2.6019680500030518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074213, + "balance_loss_mlp": 1.03985715, + "epoch": 0.6943055021161986, + "flos": 523101072384.0, + "grad_norm": 0.052336959457674984, + "language_loss": 0.84605336, + "learning_rate": 0.00022568912239148586, + "loss": 0.85679555, + "num_input_tokens_seen": 299662560, + "router_z_loss_mlp": 0.34399414, + "step": 3609, + "time_per_iteration": 2.6037116050720215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073862, + "balance_loss_mlp": 1.03943467, + "epoch": 0.694497883801462, + "flos": 484637119488.0, + "grad_norm": 0.05428318108102923, + "language_loss": 0.81688815, + "learning_rate": 0.00022542870426815344, + "loss": 0.82762676, + "num_input_tokens_seen": 299734896, + "router_z_loss_mlp": 0.34472656, + "step": 3610, + "time_per_iteration": 2.723229169845581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080613, + "balance_loss_mlp": 1.04518366, + "epoch": 0.6946902654867256, + "flos": 461238119424.0, + "grad_norm": 0.06119674491487997, + "language_loss": 0.86244833, + "learning_rate": 0.00022516839274285173, + "loss": 0.87325442, + "num_input_tokens_seen": 299799424, + "router_z_loss_mlp": 0.35449219, + "step": 3611, + "time_per_iteration": 2.540647268295288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073799, + "balance_loss_mlp": 1.03832269, + "epoch": 0.6948826471719892, + "flos": 512603375616.0, + "grad_norm": 0.054515273937313154, + "language_loss": 0.74971861, + "learning_rate": 0.00022490818791664265, + "loss": 0.76045656, + "num_input_tokens_seen": 299868272, + "router_z_loss_mlp": 0.35522461, + "step": 3612, + "time_per_iteration": 2.577448844909668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074613, + "balance_loss_mlp": 1.03989887, + "epoch": 0.6950750288572528, + "flos": 556917783552.0, + "grad_norm": 0.04771365069249161, + "language_loss": 0.85378981, + "learning_rate": 0.00022464808989054676, + "loss": 0.86453593, + "num_input_tokens_seen": 299939136, + "router_z_loss_mlp": 0.34741211, + "step": 3613, + "time_per_iteration": 2.6405351161956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071033, + "balance_loss_mlp": 1.03646183, + "epoch": 0.6952674105425164, + "flos": 542215185408.0, + "grad_norm": 0.06079183455352582, + "language_loss": 0.75739813, + "learning_rate": 0.00022438809876554284, + "loss": 0.76810849, + "num_input_tokens_seen": 300009472, + "router_z_loss_mlp": 0.34594727, + "step": 3614, + "time_per_iteration": 2.613945484161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075685, + "balance_loss_mlp": 1.04128122, + "epoch": 0.6954597922277799, + "flos": 546465166848.0, + "grad_norm": 0.05561683748761922, + "language_loss": 0.80328143, + "learning_rate": 0.00022412821464256873, + "loss": 0.81403828, + "num_input_tokens_seen": 300081008, + "router_z_loss_mlp": 0.34448242, + "step": 3615, + "time_per_iteration": 2.7260682582855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073152, + "balance_loss_mlp": 1.03922486, + "epoch": 0.6956521739130435, + "flos": 519255553536.0, + "grad_norm": 0.0593468724066596, + "language_loss": 0.82113886, + "learning_rate": 0.00022386843762252023, + "loss": 0.83187044, + "num_input_tokens_seen": 300149856, + "router_z_loss_mlp": 0.33959961, + "step": 3616, + "time_per_iteration": 2.6294190883636475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070918, + "balance_loss_mlp": 1.03622794, + "epoch": 0.695844555598307, + "flos": 466029365760.0, + "grad_norm": 0.055313153128714786, + "language_loss": 0.79384601, + "learning_rate": 0.00022360876780625193, + "loss": 0.80455518, + "num_input_tokens_seen": 300217344, + "router_z_loss_mlp": 0.34741211, + "step": 3617, + "time_per_iteration": 2.590061664581299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071648, + "balance_loss_mlp": 1.03741097, + "epoch": 0.6960369372835706, + "flos": 600347510784.0, + "grad_norm": 0.044171001480645455, + "language_loss": 0.79755616, + "learning_rate": 0.00022334920529457604, + "loss": 0.8082726, + "num_input_tokens_seen": 300305584, + "router_z_loss_mlp": 0.34277344, + "step": 3618, + "time_per_iteration": 2.9306209087371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071307, + "balance_loss_mlp": 1.0369513, + "epoch": 0.6962293189688342, + "flos": 643927186944.0, + "grad_norm": 0.0535379410757751, + "language_loss": 0.87326622, + "learning_rate": 0.00022308975018826423, + "loss": 0.88397926, + "num_input_tokens_seen": 300386480, + "router_z_loss_mlp": 0.34399414, + "step": 3619, + "time_per_iteration": 2.888936758041382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074623, + "balance_loss_mlp": 1.03967083, + "epoch": 0.6964217006540977, + "flos": 638524864512.0, + "grad_norm": 0.061080983554533244, + "language_loss": 0.84665489, + "learning_rate": 0.00022283040258804564, + "loss": 0.85740113, + "num_input_tokens_seen": 300461840, + "router_z_loss_mlp": 0.34985352, + "step": 3620, + "time_per_iteration": 2.777407169342041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073748, + "balance_loss_mlp": 1.04005957, + "epoch": 0.6966140823393613, + "flos": 651864125952.0, + "grad_norm": 0.05227227103704651, + "language_loss": 0.83467555, + "learning_rate": 0.00022257116259460802, + "loss": 0.84541297, + "num_input_tokens_seen": 300540400, + "router_z_loss_mlp": 0.3371582, + "step": 3621, + "time_per_iteration": 2.8371803760528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107145, + "balance_loss_mlp": 1.03802419, + "epoch": 0.6968064640246249, + "flos": 704160552960.0, + "grad_norm": 0.054247578312955166, + "language_loss": 0.8137657, + "learning_rate": 0.00022231203030859725, + "loss": 0.82448018, + "num_input_tokens_seen": 300624240, + "router_z_loss_mlp": 0.33447266, + "step": 3622, + "time_per_iteration": 2.9509494304656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077752, + "balance_loss_mlp": 1.04361081, + "epoch": 0.6969988457098885, + "flos": 492312190464.0, + "grad_norm": 0.06806535076017864, + "language_loss": 0.83473521, + "learning_rate": 0.00022205300583061737, + "loss": 0.84551275, + "num_input_tokens_seen": 300689728, + "router_z_loss_mlp": 0.34179688, + "step": 3623, + "time_per_iteration": 2.564910888671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006317, + "balance_loss_mlp": 0.99630374, + "epoch": 0.6971912273951519, + "flos": 1351839974400.0, + "grad_norm": 0.005946878920226346, + "language_loss": 0.82838202, + "learning_rate": 0.00022179408926123063, + "loss": 0.83844519, + "num_input_tokens_seen": 300913152, + "router_z_loss_mlp": 0.10009766, + "step": 3624, + "time_per_iteration": 4.894897937774658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107407, + "balance_loss_mlp": 1.04030991, + "epoch": 0.6973836090804155, + "flos": 602182301184.0, + "grad_norm": 0.052322011442081255, + "language_loss": 0.77296048, + "learning_rate": 0.00022153528070095735, + "loss": 0.78370118, + "num_input_tokens_seen": 300985824, + "router_z_loss_mlp": 0.33789062, + "step": 3625, + "time_per_iteration": 2.6873764991760254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074827, + "balance_loss_mlp": 1.04056633, + "epoch": 0.6975759907656791, + "flos": 523805280768.0, + "grad_norm": 0.05344661809943597, + "language_loss": 0.88087487, + "learning_rate": 0.00022127658025027568, + "loss": 0.89162308, + "num_input_tokens_seen": 301058048, + "router_z_loss_mlp": 0.34301758, + "step": 3626, + "time_per_iteration": 2.6872076988220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077609, + "balance_loss_mlp": 1.04291928, + "epoch": 0.6977683724509427, + "flos": 480672327168.0, + "grad_norm": 0.05134929974551719, + "language_loss": 0.84773469, + "learning_rate": 0.00022101798800962258, + "loss": 0.85851079, + "num_input_tokens_seen": 301127472, + "router_z_loss_mlp": 0.34741211, + "step": 3627, + "time_per_iteration": 2.592256546020508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074479, + "balance_loss_mlp": 1.03933573, + "epoch": 0.6979607541362063, + "flos": 522372132864.0, + "grad_norm": 0.06417164030840651, + "language_loss": 0.78953862, + "learning_rate": 0.00022075950407939227, + "loss": 0.80028337, + "num_input_tokens_seen": 301193920, + "router_z_loss_mlp": 0.35180664, + "step": 3628, + "time_per_iteration": 2.616570234298706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107407, + "balance_loss_mlp": 1.04023814, + "epoch": 0.6981531358214698, + "flos": 547818329088.0, + "grad_norm": 0.05532420233787888, + "language_loss": 0.82282603, + "learning_rate": 0.0002205011285599367, + "loss": 0.83356667, + "num_input_tokens_seen": 301264256, + "router_z_loss_mlp": 0.33862305, + "step": 3629, + "time_per_iteration": 2.612488269805908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073299, + "balance_loss_mlp": 1.03925288, + "epoch": 0.6983455175067333, + "flos": 699747628032.0, + "grad_norm": 0.05532386422624981, + "language_loss": 0.80727249, + "learning_rate": 0.00022024286155156658, + "loss": 0.8180055, + "num_input_tokens_seen": 301337696, + "router_z_loss_mlp": 0.34082031, + "step": 3630, + "time_per_iteration": 2.8387677669525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070371, + "balance_loss_mlp": 1.03632545, + "epoch": 0.6985378991919969, + "flos": 484819001856.0, + "grad_norm": 0.047952910030837306, + "language_loss": 0.85720146, + "learning_rate": 0.00021998470315454994, + "loss": 0.8679052, + "num_input_tokens_seen": 301407776, + "router_z_loss_mlp": 0.34057617, + "step": 3631, + "time_per_iteration": 2.635730743408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071863, + "balance_loss_mlp": 1.03843713, + "epoch": 0.6987302808772605, + "flos": 558503700480.0, + "grad_norm": 0.05280665579931524, + "language_loss": 0.86521721, + "learning_rate": 0.00021972665346911275, + "loss": 0.87593591, + "num_input_tokens_seen": 301475120, + "router_z_loss_mlp": 0.33447266, + "step": 3632, + "time_per_iteration": 2.668616771697998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071923, + "balance_loss_mlp": 1.03763855, + "epoch": 0.698922662562524, + "flos": 483350948352.0, + "grad_norm": 0.05402222352143004, + "language_loss": 0.79431093, + "learning_rate": 0.00021946871259543877, + "loss": 0.80503017, + "num_input_tokens_seen": 301542416, + "router_z_loss_mlp": 0.34326172, + "step": 3633, + "time_per_iteration": 2.580191135406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068198, + "balance_loss_mlp": 1.03486705, + "epoch": 0.6991150442477876, + "flos": 718586726400.0, + "grad_norm": 0.05023014316790998, + "language_loss": 0.8304534, + "learning_rate": 0.00021921088063366957, + "loss": 0.84113538, + "num_input_tokens_seen": 301620672, + "router_z_loss_mlp": 0.33349609, + "step": 3634, + "time_per_iteration": 2.9607045650482178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067291, + "balance_loss_mlp": 1.03384113, + "epoch": 0.6993074259330512, + "flos": 488871134208.0, + "grad_norm": 0.05127346508888132, + "language_loss": 0.8176077, + "learning_rate": 0.00021895315768390435, + "loss": 0.82828063, + "num_input_tokens_seen": 301688016, + "router_z_loss_mlp": 0.3347168, + "step": 3635, + "time_per_iteration": 2.585498332977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107047, + "balance_loss_mlp": 1.03651941, + "epoch": 0.6994998076183148, + "flos": 717745715712.0, + "grad_norm": 0.04635500593717234, + "language_loss": 0.87909687, + "learning_rate": 0.00021869554384619999, + "loss": 0.88980162, + "num_input_tokens_seen": 301771184, + "router_z_loss_mlp": 0.33959961, + "step": 3636, + "time_per_iteration": 2.968268394470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074967, + "balance_loss_mlp": 1.0413022, + "epoch": 0.6996921893035783, + "flos": 578730866688.0, + "grad_norm": 0.05835542586274351, + "language_loss": 0.80754793, + "learning_rate": 0.00021843803922057115, + "loss": 0.81829762, + "num_input_tokens_seen": 301844528, + "router_z_loss_mlp": 0.33691406, + "step": 3637, + "time_per_iteration": 2.7109100818634033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068155, + "balance_loss_mlp": 1.0351578, + "epoch": 0.6998845709888418, + "flos": 518369462784.0, + "grad_norm": 0.06833550802909422, + "language_loss": 0.81533343, + "learning_rate": 0.00021818064390698977, + "loss": 0.826015, + "num_input_tokens_seen": 301914960, + "router_z_loss_mlp": 0.33007812, + "step": 3638, + "time_per_iteration": 2.5944924354553223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071173, + "balance_loss_mlp": 1.03726995, + "epoch": 0.7000769526741054, + "flos": 620666399232.0, + "grad_norm": 0.05517026065702434, + "language_loss": 0.86890268, + "learning_rate": 0.0002179233580053861, + "loss": 0.87961447, + "num_input_tokens_seen": 301986352, + "router_z_loss_mlp": 0.33935547, + "step": 3639, + "time_per_iteration": 2.7613229751586914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070142, + "balance_loss_mlp": 1.03652453, + "epoch": 0.700269334359369, + "flos": 559670598144.0, + "grad_norm": 0.13465593059658462, + "language_loss": 0.85617924, + "learning_rate": 0.00021766618161564688, + "loss": 0.86688066, + "num_input_tokens_seen": 302060544, + "router_z_loss_mlp": 0.33642578, + "step": 3640, + "time_per_iteration": 2.7400569915771484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071036, + "balance_loss_mlp": 1.0372045, + "epoch": 0.7004617160446326, + "flos": 483090490368.0, + "grad_norm": 0.051527698047250534, + "language_loss": 0.87097609, + "learning_rate": 0.00021740911483761677, + "loss": 0.88168645, + "num_input_tokens_seen": 302127232, + "router_z_loss_mlp": 0.33862305, + "step": 3641, + "time_per_iteration": 2.5464553833007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107241, + "balance_loss_mlp": 1.0389359, + "epoch": 0.7006540977298961, + "flos": 696647015424.0, + "grad_norm": 0.04496743490694548, + "language_loss": 0.91822404, + "learning_rate": 0.00021715215777109837, + "loss": 0.92894816, + "num_input_tokens_seen": 302207056, + "router_z_loss_mlp": 0.33496094, + "step": 3642, + "time_per_iteration": 2.9422945976257324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068413, + "balance_loss_mlp": 1.03477192, + "epoch": 0.7008464794151597, + "flos": 504528224256.0, + "grad_norm": 0.053490842325032185, + "language_loss": 0.84272158, + "learning_rate": 0.00021689531051585103, + "loss": 0.85340571, + "num_input_tokens_seen": 302275632, + "router_z_loss_mlp": 0.33642578, + "step": 3643, + "time_per_iteration": 2.609464406967163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069383, + "balance_loss_mlp": 1.03421593, + "epoch": 0.7010388611004232, + "flos": 536985980928.0, + "grad_norm": 0.06575198455651811, + "language_loss": 0.79940069, + "learning_rate": 0.00021663857317159196, + "loss": 0.81009454, + "num_input_tokens_seen": 302343600, + "router_z_loss_mlp": 0.35229492, + "step": 3644, + "time_per_iteration": 2.652776002883911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074301, + "balance_loss_mlp": 1.04006386, + "epoch": 0.7012312427856868, + "flos": 546996257280.0, + "grad_norm": 0.05180675245879084, + "language_loss": 0.8175106, + "learning_rate": 0.00021638194583799487, + "loss": 0.82825363, + "num_input_tokens_seen": 302414656, + "router_z_loss_mlp": 0.3425293, + "step": 3645, + "time_per_iteration": 2.647700071334839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072403, + "balance_loss_mlp": 1.03785658, + "epoch": 0.7014236244709504, + "flos": 941020125696.0, + "grad_norm": 0.0581240827613666, + "language_loss": 0.82057631, + "learning_rate": 0.00021612542861469176, + "loss": 0.83130032, + "num_input_tokens_seen": 302495120, + "router_z_loss_mlp": 0.34594727, + "step": 3646, + "time_per_iteration": 3.1926403045654297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070686, + "balance_loss_mlp": 1.03644955, + "epoch": 0.7016160061562139, + "flos": 524908159488.0, + "grad_norm": 0.05426451368259885, + "language_loss": 0.82171357, + "learning_rate": 0.00021586902160127135, + "loss": 0.83242047, + "num_input_tokens_seen": 302563024, + "router_z_loss_mlp": 0.34277344, + "step": 3647, + "time_per_iteration": 2.5836267471313477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074201, + "balance_loss_mlp": 1.03967857, + "epoch": 0.7018083878414775, + "flos": 373170917376.0, + "grad_norm": 0.07691887625237197, + "language_loss": 0.73860252, + "learning_rate": 0.00021561272489727974, + "loss": 0.74934447, + "num_input_tokens_seen": 302624544, + "router_z_loss_mlp": 0.34570312, + "step": 3648, + "time_per_iteration": 2.426370143890381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068543, + "balance_loss_mlp": 1.03518772, + "epoch": 0.7020007695267411, + "flos": 527522761728.0, + "grad_norm": 0.07653563490177187, + "language_loss": 0.80320156, + "learning_rate": 0.0002153565386022199, + "loss": 0.813887, + "num_input_tokens_seen": 302697856, + "router_z_loss_mlp": 0.33374023, + "step": 3649, + "time_per_iteration": 2.6524124145507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073058, + "balance_loss_mlp": 1.03822541, + "epoch": 0.7021931512120047, + "flos": 689850832896.0, + "grad_norm": 0.0770521311839047, + "language_loss": 0.82439005, + "learning_rate": 0.00021510046281555262, + "loss": 0.83512068, + "num_input_tokens_seen": 302771984, + "router_z_loss_mlp": 0.34887695, + "step": 3650, + "time_per_iteration": 2.796095609664917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069064, + "balance_loss_mlp": 1.03497052, + "epoch": 0.7023855328972681, + "flos": 639499705344.0, + "grad_norm": 0.07628366219259466, + "language_loss": 0.81408215, + "learning_rate": 0.0002148444976366949, + "loss": 0.82477278, + "num_input_tokens_seen": 302838832, + "router_z_loss_mlp": 0.34130859, + "step": 3651, + "time_per_iteration": 2.7908504009246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071482, + "balance_loss_mlp": 1.03760242, + "epoch": 0.7025779145825317, + "flos": 560674552320.0, + "grad_norm": 0.06297036166850548, + "language_loss": 0.82553816, + "learning_rate": 0.00021458864316502136, + "loss": 0.83625293, + "num_input_tokens_seen": 302909952, + "router_z_loss_mlp": 0.33911133, + "step": 3652, + "time_per_iteration": 2.7136270999908447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073226, + "balance_loss_mlp": 1.03927469, + "epoch": 0.7027702962677953, + "flos": 447214998528.0, + "grad_norm": 0.0549303916698645, + "language_loss": 0.87089896, + "learning_rate": 0.0002143328994998634, + "loss": 0.88163126, + "num_input_tokens_seen": 302973056, + "router_z_loss_mlp": 0.33959961, + "step": 3653, + "time_per_iteration": 2.4819934368133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071057, + "balance_loss_mlp": 1.03603339, + "epoch": 0.7029626779530589, + "flos": 622198471680.0, + "grad_norm": 0.05753095633291236, + "language_loss": 0.78409469, + "learning_rate": 0.00021407726674050982, + "loss": 0.79480523, + "num_input_tokens_seen": 303054656, + "router_z_loss_mlp": 0.35058594, + "step": 3654, + "time_per_iteration": 2.839901924133301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077723, + "balance_loss_mlp": 1.04312825, + "epoch": 0.7031550596383225, + "flos": 629307546624.0, + "grad_norm": 0.04660069709874721, + "language_loss": 0.87104034, + "learning_rate": 0.0002138217449862061, + "loss": 0.88181752, + "num_input_tokens_seen": 303124256, + "router_z_loss_mlp": 0.34619141, + "step": 3655, + "time_per_iteration": 2.729714870452881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074113, + "balance_loss_mlp": 1.04001868, + "epoch": 0.703347441323586, + "flos": 530589878784.0, + "grad_norm": 0.04994580933868796, + "language_loss": 0.78216398, + "learning_rate": 0.00021356633433615403, + "loss": 0.79290509, + "num_input_tokens_seen": 303192720, + "router_z_loss_mlp": 0.34130859, + "step": 3656, + "time_per_iteration": 2.578078031539917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074299, + "balance_loss_mlp": 1.04044342, + "epoch": 0.7035398230088495, + "flos": 693264185856.0, + "grad_norm": 0.0479106829759696, + "language_loss": 0.83245599, + "learning_rate": 0.0002133110348895133, + "loss": 0.84319901, + "num_input_tokens_seen": 303275968, + "router_z_loss_mlp": 0.33862305, + "step": 3657, + "time_per_iteration": 2.9648847579956055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068496, + "balance_loss_mlp": 1.03537953, + "epoch": 0.7037322046941131, + "flos": 967628837376.0, + "grad_norm": 0.048159657931533775, + "language_loss": 0.84623647, + "learning_rate": 0.0002130558467453999, + "loss": 0.85692137, + "num_input_tokens_seen": 303367296, + "router_z_loss_mlp": 0.33129883, + "step": 3658, + "time_per_iteration": 3.3155901432037354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069074, + "balance_loss_mlp": 1.03514767, + "epoch": 0.7039245863793767, + "flos": 502598891520.0, + "grad_norm": 0.045313539316245835, + "language_loss": 0.84409332, + "learning_rate": 0.0002128007700028865, + "loss": 0.85478401, + "num_input_tokens_seen": 303442768, + "router_z_loss_mlp": 0.33959961, + "step": 3659, + "time_per_iteration": 2.7024378776550293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072456, + "balance_loss_mlp": 1.03926849, + "epoch": 0.7041169680646402, + "flos": 465709271040.0, + "grad_norm": 0.056824645226565565, + "language_loss": 0.84162152, + "learning_rate": 0.00021254580476100276, + "loss": 0.85234612, + "num_input_tokens_seen": 303508304, + "router_z_loss_mlp": 0.33203125, + "step": 3660, + "time_per_iteration": 2.5560450553894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074809, + "balance_loss_mlp": 1.04097748, + "epoch": 0.7043093497499038, + "flos": 631897417728.0, + "grad_norm": 0.07471330414673147, + "language_loss": 0.78714609, + "learning_rate": 0.00021229095111873497, + "loss": 0.79789412, + "num_input_tokens_seen": 303579312, + "router_z_loss_mlp": 0.33862305, + "step": 3661, + "time_per_iteration": 2.7691423892974854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070997, + "balance_loss_mlp": 1.03704596, + "epoch": 0.7045017314351674, + "flos": 542639996928.0, + "grad_norm": 0.04471074658603975, + "language_loss": 0.86054224, + "learning_rate": 0.0002120362091750261, + "loss": 0.87125218, + "num_input_tokens_seen": 303658384, + "router_z_loss_mlp": 0.33984375, + "step": 3662, + "time_per_iteration": 2.7782440185546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073091, + "balance_loss_mlp": 1.03883076, + "epoch": 0.704694113120431, + "flos": 428012135424.0, + "grad_norm": 0.05523093470828303, + "language_loss": 0.86868262, + "learning_rate": 0.00021178157902877566, + "loss": 0.8794136, + "num_input_tokens_seen": 303721136, + "router_z_loss_mlp": 0.34301758, + "step": 3663, + "time_per_iteration": 2.440488815307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070446, + "balance_loss_mlp": 1.03682911, + "epoch": 0.7048864948056945, + "flos": 650253477888.0, + "grad_norm": 0.07482453920379879, + "language_loss": 0.87160063, + "learning_rate": 0.0002115270607788397, + "loss": 0.88230515, + "num_input_tokens_seen": 303792368, + "router_z_loss_mlp": 0.33642578, + "step": 3664, + "time_per_iteration": 2.760225772857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073203, + "balance_loss_mlp": 1.04015791, + "epoch": 0.705078876490958, + "flos": 412330314240.0, + "grad_norm": 0.05762286530441703, + "language_loss": 0.85702121, + "learning_rate": 0.00021127265452403133, + "loss": 0.86775321, + "num_input_tokens_seen": 303856336, + "router_z_loss_mlp": 0.33032227, + "step": 3665, + "time_per_iteration": 2.561060905456543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007528, + "balance_loss_mlp": 0.99813432, + "epoch": 0.7052712581762216, + "flos": 1419266783232.0, + "grad_norm": 0.0045947469063837235, + "language_loss": 0.84091628, + "learning_rate": 0.0002110183603631199, + "loss": 0.85099161, + "num_input_tokens_seen": 304089856, + "router_z_loss_mlp": 0.09375, + "step": 3666, + "time_per_iteration": 4.89429235458374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069693, + "balance_loss_mlp": 1.03714871, + "epoch": 0.7054636398614852, + "flos": 492795228672.0, + "grad_norm": 0.08921720435757349, + "language_loss": 0.82764697, + "learning_rate": 0.00021076417839483065, + "loss": 0.83834386, + "num_input_tokens_seen": 304164752, + "router_z_loss_mlp": 0.32543945, + "step": 3667, + "time_per_iteration": 2.768646240234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073838, + "balance_loss_mlp": 1.04010153, + "epoch": 0.7056560215467488, + "flos": 450228271104.0, + "grad_norm": 0.04427607909576538, + "language_loss": 0.85058916, + "learning_rate": 0.00021051010871784589, + "loss": 0.86132753, + "num_input_tokens_seen": 304229568, + "router_z_loss_mlp": 0.33764648, + "step": 3668, + "time_per_iteration": 2.567970037460327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068482, + "balance_loss_mlp": 1.03462708, + "epoch": 0.7058484032320124, + "flos": 565426510848.0, + "grad_norm": 0.048767729933519285, + "language_loss": 0.78747618, + "learning_rate": 0.0002102561514308045, + "loss": 0.79816097, + "num_input_tokens_seen": 304299408, + "router_z_loss_mlp": 0.33886719, + "step": 3669, + "time_per_iteration": 2.7534899711608887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069827, + "balance_loss_mlp": 1.03635263, + "epoch": 0.7060407849172758, + "flos": 566736003072.0, + "grad_norm": 0.04982032344187492, + "language_loss": 0.82456899, + "learning_rate": 0.00021000230663230135, + "loss": 0.83526719, + "num_input_tokens_seen": 304367936, + "router_z_loss_mlp": 0.33496094, + "step": 3670, + "time_per_iteration": 2.6715986728668213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070311, + "balance_loss_mlp": 1.03655052, + "epoch": 0.7062331666025394, + "flos": 468505755648.0, + "grad_norm": 0.07243344373146629, + "language_loss": 0.82818425, + "learning_rate": 0.00020974857442088762, + "loss": 0.83888733, + "num_input_tokens_seen": 304438368, + "router_z_loss_mlp": 0.33789062, + "step": 3671, + "time_per_iteration": 2.5750696659088135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072721, + "balance_loss_mlp": 1.03896141, + "epoch": 0.706425548287803, + "flos": 595042702848.0, + "grad_norm": 0.061680604914147966, + "language_loss": 0.88855779, + "learning_rate": 0.00020949495489507104, + "loss": 0.89928508, + "num_input_tokens_seen": 304508720, + "router_z_loss_mlp": 0.33789062, + "step": 3672, + "time_per_iteration": 2.6669857501983643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070135, + "balance_loss_mlp": 1.03680396, + "epoch": 0.7066179299730666, + "flos": 475566778368.0, + "grad_norm": 0.055232709126585705, + "language_loss": 0.8461234, + "learning_rate": 0.00020924144815331525, + "loss": 0.85682476, + "num_input_tokens_seen": 304576128, + "router_z_loss_mlp": 0.33349609, + "step": 3673, + "time_per_iteration": 2.5462799072265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067992, + "balance_loss_mlp": 1.03451765, + "epoch": 0.7068103116583301, + "flos": 506153428992.0, + "grad_norm": 0.061788729653189316, + "language_loss": 0.82846355, + "learning_rate": 0.00020898805429404044, + "loss": 0.83914346, + "num_input_tokens_seen": 304642416, + "router_z_loss_mlp": 0.33496094, + "step": 3674, + "time_per_iteration": 2.5948987007141113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073379, + "balance_loss_mlp": 1.03880787, + "epoch": 0.7070026933435937, + "flos": 679028659200.0, + "grad_norm": 0.053331350399745237, + "language_loss": 0.78217506, + "learning_rate": 0.0002087347734156228, + "loss": 0.79290879, + "num_input_tokens_seen": 304719312, + "router_z_loss_mlp": 0.34619141, + "step": 3675, + "time_per_iteration": 2.8384974002838135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069694, + "balance_loss_mlp": 1.0364821, + "epoch": 0.7071950750288573, + "flos": 471981717504.0, + "grad_norm": 0.04797263488188438, + "language_loss": 0.79430759, + "learning_rate": 0.00020848160561639452, + "loss": 0.8050046, + "num_input_tokens_seen": 304789296, + "router_z_loss_mlp": 0.33227539, + "step": 3676, + "time_per_iteration": 2.6169028282165527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067947, + "balance_loss_mlp": 1.03406775, + "epoch": 0.7073874567141208, + "flos": 473507997696.0, + "grad_norm": 0.04772517856798178, + "language_loss": 0.85496527, + "learning_rate": 0.0002082285509946445, + "loss": 0.86564475, + "num_input_tokens_seen": 304854320, + "router_z_loss_mlp": 0.33911133, + "step": 3677, + "time_per_iteration": 2.536482334136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070461, + "balance_loss_mlp": 1.03562784, + "epoch": 0.7075798383993844, + "flos": 545589250560.0, + "grad_norm": 0.05597865502328579, + "language_loss": 0.83377022, + "learning_rate": 0.00020797560964861683, + "loss": 0.84447479, + "num_input_tokens_seen": 304932784, + "router_z_loss_mlp": 0.34887695, + "step": 3678, + "time_per_iteration": 2.7888569831848145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070957, + "balance_loss_mlp": 1.03765035, + "epoch": 0.7077722200846479, + "flos": 661766713344.0, + "grad_norm": 0.05495651688887883, + "language_loss": 0.80313671, + "learning_rate": 0.0002077227816765122, + "loss": 0.81384623, + "num_input_tokens_seen": 305018080, + "router_z_loss_mlp": 0.33325195, + "step": 3679, + "time_per_iteration": 3.0229249000549316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009856, + "balance_loss_mlp": 1.00065279, + "epoch": 0.7079646017699115, + "flos": 1529128129536.0, + "grad_norm": 0.00795907908422284, + "language_loss": 0.76447725, + "learning_rate": 0.0002074700671764869, + "loss": 0.77457583, + "num_input_tokens_seen": 305241216, + "router_z_loss_mlp": 0.09179688, + "step": 3680, + "time_per_iteration": 4.766546249389648 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066391, + "balance_loss_mlp": 1.03317952, + "epoch": 0.7081569834551751, + "flos": 621217838592.0, + "grad_norm": 0.05324470770264926, + "language_loss": 0.78516078, + "learning_rate": 0.00020721746624665383, + "loss": 0.79582465, + "num_input_tokens_seen": 305311376, + "router_z_loss_mlp": 0.33203125, + "step": 3681, + "time_per_iteration": 2.7075722217559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065664, + "balance_loss_mlp": 1.03199935, + "epoch": 0.7083493651404387, + "flos": 794280743424.0, + "grad_norm": 0.05089131854365718, + "language_loss": 0.79764175, + "learning_rate": 0.00020696497898508114, + "loss": 0.80829841, + "num_input_tokens_seen": 305392736, + "router_z_loss_mlp": 0.33691406, + "step": 3682, + "time_per_iteration": 2.9950366020202637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066323, + "balance_loss_mlp": 1.03165746, + "epoch": 0.7085417468257021, + "flos": 813394856448.0, + "grad_norm": 0.05983793282747749, + "language_loss": 0.7766552, + "learning_rate": 0.00020671260548979316, + "loss": 0.78731841, + "num_input_tokens_seen": 305470896, + "router_z_loss_mlp": 0.34716797, + "step": 3683, + "time_per_iteration": 2.986528158187866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069107, + "balance_loss_mlp": 1.03503704, + "epoch": 0.7087341285109657, + "flos": 700259779584.0, + "grad_norm": 0.07395200120023371, + "language_loss": 0.84964406, + "learning_rate": 0.00020646034585876982, + "loss": 0.86033517, + "num_input_tokens_seen": 305547072, + "router_z_loss_mlp": 0.34106445, + "step": 3684, + "time_per_iteration": 2.801340341567993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068964, + "balance_loss_mlp": 1.03467929, + "epoch": 0.7089265101962293, + "flos": 596211010560.0, + "grad_norm": 0.047359686788279315, + "language_loss": 0.84225708, + "learning_rate": 0.00020620820018994718, + "loss": 0.85294676, + "num_input_tokens_seen": 305624512, + "router_z_loss_mlp": 0.34301758, + "step": 3685, + "time_per_iteration": 2.8521230220794678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069096, + "balance_loss_mlp": 1.03438258, + "epoch": 0.7091188918814929, + "flos": 486842876928.0, + "grad_norm": 0.05746562851929707, + "language_loss": 0.82886755, + "learning_rate": 0.00020595616858121675, + "loss": 0.8395586, + "num_input_tokens_seen": 305695088, + "router_z_loss_mlp": 0.34765625, + "step": 3686, + "time_per_iteration": 2.7113983631134033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064604, + "balance_loss_mlp": 1.03034306, + "epoch": 0.7093112735667565, + "flos": 599833949184.0, + "grad_norm": 0.05104944796705689, + "language_loss": 0.80622023, + "learning_rate": 0.00020570425113042586, + "loss": 0.81686622, + "num_input_tokens_seen": 305763680, + "router_z_loss_mlp": 0.34277344, + "step": 3687, + "time_per_iteration": 2.712451457977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066788, + "balance_loss_mlp": 1.03293276, + "epoch": 0.70950365525202, + "flos": 505577258496.0, + "grad_norm": 0.05729403369858188, + "language_loss": 0.85692352, + "learning_rate": 0.0002054524479353776, + "loss": 0.86759138, + "num_input_tokens_seen": 305835008, + "router_z_loss_mlp": 0.33886719, + "step": 3688, + "time_per_iteration": 2.6377811431884766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068463, + "balance_loss_mlp": 1.03446496, + "epoch": 0.7096960369372836, + "flos": 731846002176.0, + "grad_norm": 0.05774020478713443, + "language_loss": 0.81201112, + "learning_rate": 0.00020520075909383063, + "loss": 0.82269579, + "num_input_tokens_seen": 305909072, + "router_z_loss_mlp": 0.34033203, + "step": 3689, + "time_per_iteration": 2.8854405879974365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068474, + "balance_loss_mlp": 1.03409433, + "epoch": 0.7098884186225471, + "flos": 971685351936.0, + "grad_norm": 0.048806563033970844, + "language_loss": 0.8087877, + "learning_rate": 0.00020494918470349916, + "loss": 0.81947243, + "num_input_tokens_seen": 305994752, + "router_z_loss_mlp": 0.34399414, + "step": 3690, + "time_per_iteration": 3.2719247341156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069117, + "balance_loss_mlp": 1.03518987, + "epoch": 0.7100808003078107, + "flos": 504001516032.0, + "grad_norm": 0.0562848132432342, + "language_loss": 0.85595727, + "learning_rate": 0.00020469772486205297, + "loss": 0.86664844, + "num_input_tokens_seen": 306062960, + "router_z_loss_mlp": 0.33959961, + "step": 3691, + "time_per_iteration": 2.599254608154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064349, + "balance_loss_mlp": 1.03018332, + "epoch": 0.7102731819930742, + "flos": 540073446912.0, + "grad_norm": 0.052398389551748005, + "language_loss": 0.81299037, + "learning_rate": 0.0002044463796671177, + "loss": 0.82363379, + "num_input_tokens_seen": 306134224, + "router_z_loss_mlp": 0.34204102, + "step": 3692, + "time_per_iteration": 2.6676712036132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068474, + "balance_loss_mlp": 1.03502345, + "epoch": 0.7104655636783378, + "flos": 620066907648.0, + "grad_norm": 0.05724464606399067, + "language_loss": 0.80306011, + "learning_rate": 0.00020419514921627408, + "loss": 0.8137449, + "num_input_tokens_seen": 306214512, + "router_z_loss_mlp": 0.3347168, + "step": 3693, + "time_per_iteration": 2.906092643737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071382, + "balance_loss_mlp": 1.03707361, + "epoch": 0.7106579453636014, + "flos": 557060378112.0, + "grad_norm": 0.04981428600794461, + "language_loss": 0.77017659, + "learning_rate": 0.00020394403360705855, + "loss": 0.78089035, + "num_input_tokens_seen": 306283232, + "router_z_loss_mlp": 0.34350586, + "step": 3694, + "time_per_iteration": 2.69543719291687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107094, + "balance_loss_mlp": 1.03634608, + "epoch": 0.710850327048865, + "flos": 512795432448.0, + "grad_norm": 0.05615701524037797, + "language_loss": 0.8807683, + "learning_rate": 0.00020369303293696228, + "loss": 0.8914777, + "num_input_tokens_seen": 306351536, + "router_z_loss_mlp": 0.34619141, + "step": 3695, + "time_per_iteration": 2.613211154937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072906, + "balance_loss_mlp": 1.03850234, + "epoch": 0.7110427087341286, + "flos": 423398389248.0, + "grad_norm": 0.05344233224786611, + "language_loss": 0.78265321, + "learning_rate": 0.00020344214730343304, + "loss": 0.79338229, + "num_input_tokens_seen": 306419040, + "router_z_loss_mlp": 0.34448242, + "step": 3696, + "time_per_iteration": 2.60355544090271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070854, + "balance_loss_mlp": 1.03687966, + "epoch": 0.711235090419392, + "flos": 577107072000.0, + "grad_norm": 0.05731164613368461, + "language_loss": 0.79340208, + "learning_rate": 0.00020319137680387296, + "loss": 0.80411065, + "num_input_tokens_seen": 306503248, + "router_z_loss_mlp": 0.34008789, + "step": 3697, + "time_per_iteration": 2.9248886108398438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071143, + "balance_loss_mlp": 1.03712082, + "epoch": 0.7114274721046556, + "flos": 447830456832.0, + "grad_norm": 0.06826664171711681, + "language_loss": 0.80587053, + "learning_rate": 0.0002029407215356398, + "loss": 0.81658196, + "num_input_tokens_seen": 306566288, + "router_z_loss_mlp": 0.34057617, + "step": 3698, + "time_per_iteration": 2.5251829624176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066516, + "balance_loss_mlp": 1.03304207, + "epoch": 0.7116198537899192, + "flos": 621680527872.0, + "grad_norm": 0.05434937939483776, + "language_loss": 0.83318967, + "learning_rate": 0.00020269018159604663, + "loss": 0.84385484, + "num_input_tokens_seen": 306633344, + "router_z_loss_mlp": 0.33496094, + "step": 3699, + "time_per_iteration": 2.6997692584991455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062784, + "balance_loss_mlp": 1.02921486, + "epoch": 0.7118122354751828, + "flos": 498476947968.0, + "grad_norm": 0.04823068648652618, + "language_loss": 0.81931448, + "learning_rate": 0.00020243975708236162, + "loss": 0.82994235, + "num_input_tokens_seen": 306701328, + "router_z_loss_mlp": 0.3359375, + "step": 3700, + "time_per_iteration": 2.5654993057250977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071384, + "balance_loss_mlp": 1.03717113, + "epoch": 0.7120046171604463, + "flos": 572438071296.0, + "grad_norm": 0.09878181502627377, + "language_loss": 0.85897946, + "learning_rate": 0.00020218944809180818, + "loss": 0.86969334, + "num_input_tokens_seen": 306773168, + "router_z_loss_mlp": 0.3425293, + "step": 3701, + "time_per_iteration": 2.7016773223876953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070567, + "balance_loss_mlp": 1.03661633, + "epoch": 0.7121969988457099, + "flos": 572388609024.0, + "grad_norm": 0.07221648962243508, + "language_loss": 0.8452931, + "learning_rate": 0.00020193925472156493, + "loss": 0.85599875, + "num_input_tokens_seen": 306845312, + "router_z_loss_mlp": 0.33984375, + "step": 3702, + "time_per_iteration": 2.6914734840393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01034328, + "balance_loss_mlp": 1.02545857, + "epoch": 0.7123893805309734, + "flos": 1522585050624.0, + "grad_norm": 0.022091327023181177, + "language_loss": 0.74289167, + "learning_rate": 0.00020168917706876537, + "loss": 0.75323498, + "num_input_tokens_seen": 307079216, + "router_z_loss_mlp": 0.08886719, + "step": 3703, + "time_per_iteration": 4.884379148483276 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066468, + "balance_loss_mlp": 1.03304124, + "epoch": 0.712581762216237, + "flos": 614779476480.0, + "grad_norm": 0.06545400953207585, + "language_loss": 0.83676839, + "learning_rate": 0.00020143921523049863, + "loss": 0.84743309, + "num_input_tokens_seen": 307163568, + "router_z_loss_mlp": 0.33447266, + "step": 3704, + "time_per_iteration": 2.9219436645507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106542, + "balance_loss_mlp": 1.03185105, + "epoch": 0.7127741439015006, + "flos": 597504536064.0, + "grad_norm": 0.06577771502635076, + "language_loss": 0.835908, + "learning_rate": 0.00020118936930380837, + "loss": 0.84656215, + "num_input_tokens_seen": 307232800, + "router_z_loss_mlp": 0.3359375, + "step": 3705, + "time_per_iteration": 2.6833901405334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070499, + "balance_loss_mlp": 1.03635776, + "epoch": 0.7129665255867641, + "flos": 537138749952.0, + "grad_norm": 0.05242920734791126, + "language_loss": 0.80929446, + "learning_rate": 0.0002009396393856932, + "loss": 0.81999946, + "num_input_tokens_seen": 307307216, + "router_z_loss_mlp": 0.34179688, + "step": 3706, + "time_per_iteration": 2.6226556301116943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107105, + "balance_loss_mlp": 1.03707516, + "epoch": 0.7131589072720277, + "flos": 526173981696.0, + "grad_norm": 0.05578991259827158, + "language_loss": 0.82312477, + "learning_rate": 0.00020069002557310673, + "loss": 0.8338353, + "num_input_tokens_seen": 307377472, + "router_z_loss_mlp": 0.34008789, + "step": 3707, + "time_per_iteration": 2.6535470485687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064608, + "balance_loss_mlp": 1.0319922, + "epoch": 0.7133512889572913, + "flos": 530626194432.0, + "grad_norm": 0.0741438657284304, + "language_loss": 0.77105689, + "learning_rate": 0.00020044052796295807, + "loss": 0.78170288, + "num_input_tokens_seen": 307456880, + "router_z_loss_mlp": 0.32617188, + "step": 3708, + "time_per_iteration": 2.787355899810791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066524, + "balance_loss_mlp": 1.03226364, + "epoch": 0.7135436706425549, + "flos": 503282750976.0, + "grad_norm": 0.05095203093874289, + "language_loss": 0.82020175, + "learning_rate": 0.00020019114665211063, + "loss": 0.83086699, + "num_input_tokens_seen": 307524784, + "router_z_loss_mlp": 0.34301758, + "step": 3709, + "time_per_iteration": 2.5732407569885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070401, + "balance_loss_mlp": 1.03645074, + "epoch": 0.7137360523278183, + "flos": 515719954944.0, + "grad_norm": 0.04941715658479687, + "language_loss": 0.81220102, + "learning_rate": 0.00019994188173738276, + "loss": 0.82290506, + "num_input_tokens_seen": 307591408, + "router_z_loss_mlp": 0.33984375, + "step": 3710, + "time_per_iteration": 2.5564064979553223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068107, + "balance_loss_mlp": 1.03398967, + "epoch": 0.7139284340130819, + "flos": 510103664640.0, + "grad_norm": 0.05502854520341245, + "language_loss": 0.80873179, + "learning_rate": 0.0001996927333155477, + "loss": 0.81941289, + "num_input_tokens_seen": 307662912, + "router_z_loss_mlp": 0.34155273, + "step": 3711, + "time_per_iteration": 2.732224225997925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071876, + "balance_loss_mlp": 1.03825879, + "epoch": 0.7141208156983455, + "flos": 889896388608.0, + "grad_norm": 0.05033741502761429, + "language_loss": 0.85233271, + "learning_rate": 0.00019944370148333346, + "loss": 0.86305141, + "num_input_tokens_seen": 307752256, + "router_z_loss_mlp": 0.33642578, + "step": 3712, + "time_per_iteration": 3.213644504547119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107149, + "balance_loss_mlp": 1.03827798, + "epoch": 0.7143131973836091, + "flos": 535504780800.0, + "grad_norm": 0.05173411094558013, + "language_loss": 0.79739279, + "learning_rate": 0.00019919478633742278, + "loss": 0.80810767, + "num_input_tokens_seen": 307821504, + "router_z_loss_mlp": 0.33227539, + "step": 3713, + "time_per_iteration": 2.7310914993286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072393, + "balance_loss_mlp": 1.03884721, + "epoch": 0.7145055790688727, + "flos": 473429422080.0, + "grad_norm": 0.04797356179200618, + "language_loss": 0.85098791, + "learning_rate": 0.00019894598797445302, + "loss": 0.86171186, + "num_input_tokens_seen": 307886464, + "router_z_loss_mlp": 0.33569336, + "step": 3714, + "time_per_iteration": 2.5128626823425293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071822, + "balance_loss_mlp": 1.03796673, + "epoch": 0.7146979607541362, + "flos": 570227931648.0, + "grad_norm": 0.05105012604374378, + "language_loss": 0.81882799, + "learning_rate": 0.00019869730649101615, + "loss": 0.82954621, + "num_input_tokens_seen": 307962736, + "router_z_loss_mlp": 0.33886719, + "step": 3715, + "time_per_iteration": 2.7468035221099854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074372, + "balance_loss_mlp": 1.03965807, + "epoch": 0.7148903424393998, + "flos": 839299359744.0, + "grad_norm": 0.0561955521045174, + "language_loss": 0.72303152, + "learning_rate": 0.00019844874198365943, + "loss": 0.73377526, + "num_input_tokens_seen": 308046592, + "router_z_loss_mlp": 0.34765625, + "step": 3716, + "time_per_iteration": 3.0928800106048584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072381, + "balance_loss_mlp": 1.03807223, + "epoch": 0.7150827241246633, + "flos": 541560439296.0, + "grad_norm": 0.05538627322116671, + "language_loss": 0.83775991, + "learning_rate": 0.00019820029454888362, + "loss": 0.84848368, + "num_input_tokens_seen": 308119920, + "router_z_loss_mlp": 0.34326172, + "step": 3717, + "time_per_iteration": 2.6984283924102783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101976, + "balance_loss_mlp": 1.00993717, + "epoch": 0.7152751058099269, + "flos": 1582803859968.0, + "grad_norm": 0.008798476496045995, + "language_loss": 0.74521267, + "learning_rate": 0.00019795196428314455, + "loss": 0.75541025, + "num_input_tokens_seen": 308361024, + "router_z_loss_mlp": 0.09814453, + "step": 3718, + "time_per_iteration": 5.056431531906128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072324, + "balance_loss_mlp": 1.03775322, + "epoch": 0.7154674874951905, + "flos": 517167659520.0, + "grad_norm": 0.0523553620911167, + "language_loss": 0.80075788, + "learning_rate": 0.0001977037512828529, + "loss": 0.81148112, + "num_input_tokens_seen": 308429808, + "router_z_loss_mlp": 0.34594727, + "step": 3719, + "time_per_iteration": 2.57888126373291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068149, + "balance_loss_mlp": 1.03391242, + "epoch": 0.715659869180454, + "flos": 602246320128.0, + "grad_norm": 0.048902324655222526, + "language_loss": 0.86289543, + "learning_rate": 0.0001974556556443734, + "loss": 0.873577, + "num_input_tokens_seen": 308501888, + "router_z_loss_mlp": 0.3425293, + "step": 3720, + "time_per_iteration": 2.6931040287017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065993, + "balance_loss_mlp": 1.03206623, + "epoch": 0.7158522508657176, + "flos": 531403186176.0, + "grad_norm": 0.0436888691485468, + "language_loss": 0.88365716, + "learning_rate": 0.00019720767746402547, + "loss": 0.89431709, + "num_input_tokens_seen": 308576368, + "router_z_loss_mlp": 0.33959961, + "step": 3721, + "time_per_iteration": 2.7067127227783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072793, + "balance_loss_mlp": 1.03867531, + "epoch": 0.7160446325509812, + "flos": 557301897216.0, + "grad_norm": 0.0582274730279212, + "language_loss": 0.80045772, + "learning_rate": 0.00019695981683808222, + "loss": 0.8111856, + "num_input_tokens_seen": 308651936, + "router_z_loss_mlp": 0.34155273, + "step": 3722, + "time_per_iteration": 2.708950996398926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067549, + "balance_loss_mlp": 1.03405118, + "epoch": 0.7162370142362448, + "flos": 690664140288.0, + "grad_norm": 0.04509643904161843, + "language_loss": 0.84632957, + "learning_rate": 0.00019671207386277225, + "loss": 0.85700506, + "num_input_tokens_seen": 308737264, + "router_z_loss_mlp": 0.33520508, + "step": 3723, + "time_per_iteration": 2.9580013751983643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068909, + "balance_loss_mlp": 1.03462386, + "epoch": 0.7164293959215082, + "flos": 793772974080.0, + "grad_norm": 0.06707821988874196, + "language_loss": 0.77988201, + "learning_rate": 0.0001964644486342777, + "loss": 0.79057109, + "num_input_tokens_seen": 308811776, + "router_z_loss_mlp": 0.34326172, + "step": 3724, + "time_per_iteration": 2.937603712081909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067817, + "balance_loss_mlp": 1.03403354, + "epoch": 0.7166217776067718, + "flos": 493922838528.0, + "grad_norm": 0.05338190287132838, + "language_loss": 0.86470282, + "learning_rate": 0.00019621694124873524, + "loss": 0.87538099, + "num_input_tokens_seen": 308886704, + "router_z_loss_mlp": 0.33813477, + "step": 3725, + "time_per_iteration": 2.708923816680908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012685, + "balance_loss_mlp": 1.00305271, + "epoch": 0.7168141592920354, + "flos": 1400337524736.0, + "grad_norm": 0.004329548481597118, + "language_loss": 0.76540077, + "learning_rate": 0.00019596955180223557, + "loss": 0.7755276, + "num_input_tokens_seen": 309113456, + "router_z_loss_mlp": 0.09619141, + "step": 3726, + "time_per_iteration": 4.868973970413208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067858, + "balance_loss_mlp": 1.03323972, + "epoch": 0.717006540977299, + "flos": 792789368832.0, + "grad_norm": 0.04993242383663973, + "language_loss": 0.77399421, + "learning_rate": 0.00019572228039082428, + "loss": 0.78467286, + "num_input_tokens_seen": 309198768, + "router_z_loss_mlp": 0.34643555, + "step": 3727, + "time_per_iteration": 3.0444281101226807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063691, + "balance_loss_mlp": 1.02971661, + "epoch": 0.7171989226625626, + "flos": 554525761536.0, + "grad_norm": 0.045554501799563094, + "language_loss": 0.83411372, + "learning_rate": 0.0001954751271105002, + "loss": 0.84475064, + "num_input_tokens_seen": 309279680, + "router_z_loss_mlp": 0.34008789, + "step": 3728, + "time_per_iteration": 2.809967041015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065462, + "balance_loss_mlp": 1.03186858, + "epoch": 0.717391304347826, + "flos": 555628640256.0, + "grad_norm": 0.05755567657425633, + "language_loss": 0.80672932, + "learning_rate": 0.00019522809205721687, + "loss": 0.81738389, + "num_input_tokens_seen": 309359152, + "router_z_loss_mlp": 0.33618164, + "step": 3729, + "time_per_iteration": 2.7862703800201416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067947, + "balance_loss_mlp": 1.03459263, + "epoch": 0.7175836860330896, + "flos": 538582072320.0, + "grad_norm": 0.05354925450450462, + "language_loss": 0.82769603, + "learning_rate": 0.0001949811753268816, + "loss": 0.83837551, + "num_input_tokens_seen": 309432800, + "router_z_loss_mlp": 0.33374023, + "step": 3730, + "time_per_iteration": 2.6676440238952637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106439, + "balance_loss_mlp": 1.03046322, + "epoch": 0.7177760677183532, + "flos": 515385303552.0, + "grad_norm": 0.057530592847955, + "language_loss": 0.82664466, + "learning_rate": 0.00019473437701535634, + "loss": 0.8372885, + "num_input_tokens_seen": 309499456, + "router_z_loss_mlp": 0.33959961, + "step": 3731, + "time_per_iteration": 2.5901401042938232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061132, + "balance_loss_mlp": 1.02772939, + "epoch": 0.7179684494036168, + "flos": 674414913024.0, + "grad_norm": 0.05555536497682914, + "language_loss": 0.89367867, + "learning_rate": 0.00019448769721845677, + "loss": 0.90428996, + "num_input_tokens_seen": 309571056, + "router_z_loss_mlp": 0.33422852, + "step": 3732, + "time_per_iteration": 2.784381628036499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106645, + "balance_loss_mlp": 1.03192735, + "epoch": 0.7181608310888803, + "flos": 469672653312.0, + "grad_norm": 0.05444278495505657, + "language_loss": 0.85605729, + "learning_rate": 0.00019424113603195203, + "loss": 0.86672175, + "num_input_tokens_seen": 309635040, + "router_z_loss_mlp": 0.34570312, + "step": 3733, + "time_per_iteration": 2.5088841915130615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106796, + "balance_loss_mlp": 1.03343654, + "epoch": 0.7183532127741439, + "flos": 593645870592.0, + "grad_norm": 0.06008894294367452, + "language_loss": 0.79899514, + "learning_rate": 0.0001939946935515657, + "loss": 0.80967468, + "num_input_tokens_seen": 309713696, + "router_z_loss_mlp": 0.34570312, + "step": 3734, + "time_per_iteration": 2.8258321285247803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064654, + "balance_loss_mlp": 1.03065538, + "epoch": 0.7185455944594075, + "flos": 498669004800.0, + "grad_norm": 0.05732279387699742, + "language_loss": 0.80418706, + "learning_rate": 0.0001937483698729755, + "loss": 0.81483358, + "num_input_tokens_seen": 309782864, + "router_z_loss_mlp": 0.34008789, + "step": 3735, + "time_per_iteration": 2.5968332290649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065941, + "balance_loss_mlp": 1.03182328, + "epoch": 0.718737976144671, + "flos": 814590867456.0, + "grad_norm": 0.053801017075388924, + "language_loss": 0.82329178, + "learning_rate": 0.0001935021650918128, + "loss": 0.83395112, + "num_input_tokens_seen": 309867056, + "router_z_loss_mlp": 0.34155273, + "step": 3736, + "time_per_iteration": 2.982541084289551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063158, + "balance_loss_mlp": 1.02894521, + "epoch": 0.7189303578299346, + "flos": 438100987392.0, + "grad_norm": 0.06976823938990344, + "language_loss": 0.86880851, + "learning_rate": 0.0001932560793036625, + "loss": 0.87944007, + "num_input_tokens_seen": 309929744, + "router_z_loss_mlp": 0.3425293, + "step": 3737, + "time_per_iteration": 2.5138731002807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064404, + "balance_loss_mlp": 1.0309298, + "epoch": 0.7191227395151981, + "flos": 549137995776.0, + "grad_norm": 0.0607946285508029, + "language_loss": 0.8638792, + "learning_rate": 0.00019301011260406382, + "loss": 0.87452322, + "num_input_tokens_seen": 309998128, + "router_z_loss_mlp": 0.33496094, + "step": 3738, + "time_per_iteration": 2.628265619277954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065412, + "balance_loss_mlp": 1.03224778, + "epoch": 0.7193151212004617, + "flos": 626653656576.0, + "grad_norm": 0.05146382358147088, + "language_loss": 0.79296547, + "learning_rate": 0.00019276426508850936, + "loss": 0.80361962, + "num_input_tokens_seen": 310065472, + "router_z_loss_mlp": 0.33178711, + "step": 3739, + "time_per_iteration": 2.7006874084472656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065434, + "balance_loss_mlp": 1.03179288, + "epoch": 0.7195075028857253, + "flos": 740719904256.0, + "grad_norm": 0.046550971907091544, + "language_loss": 0.80166346, + "learning_rate": 0.00019251853685244564, + "loss": 0.81231779, + "num_input_tokens_seen": 310152960, + "router_z_loss_mlp": 0.33666992, + "step": 3740, + "time_per_iteration": 3.0175721645355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066355, + "balance_loss_mlp": 1.0327853, + "epoch": 0.7196998845709889, + "flos": 802523220480.0, + "grad_norm": 0.05930173376482813, + "language_loss": 0.80639338, + "learning_rate": 0.00019227292799127283, + "loss": 0.81705689, + "num_input_tokens_seen": 310234080, + "router_z_loss_mlp": 0.3359375, + "step": 3741, + "time_per_iteration": 3.074167251586914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069786, + "balance_loss_mlp": 1.03640747, + "epoch": 0.7198922662562524, + "flos": 924786865152.0, + "grad_norm": 0.05002690922956246, + "language_loss": 0.79003727, + "learning_rate": 0.00019202743860034454, + "loss": 0.80073518, + "num_input_tokens_seen": 310330208, + "router_z_loss_mlp": 0.33398438, + "step": 3742, + "time_per_iteration": 3.205714702606201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067652, + "balance_loss_mlp": 1.03448844, + "epoch": 0.7200846479415159, + "flos": 579838127616.0, + "grad_norm": 0.05345251644076864, + "language_loss": 0.83706784, + "learning_rate": 0.00019178206877496873, + "loss": 0.84774435, + "num_input_tokens_seen": 310402960, + "router_z_loss_mlp": 0.33178711, + "step": 3743, + "time_per_iteration": 2.6547601222991943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106189, + "balance_loss_mlp": 1.02834439, + "epoch": 0.7202770296267795, + "flos": 557410996224.0, + "grad_norm": 0.043135096200134324, + "language_loss": 0.85002279, + "learning_rate": 0.0001915368186104059, + "loss": 0.86064172, + "num_input_tokens_seen": 310479776, + "router_z_loss_mlp": 0.33569336, + "step": 3744, + "time_per_iteration": 2.740265130996704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066689, + "balance_loss_mlp": 1.03385842, + "epoch": 0.7204694113120431, + "flos": 672248443392.0, + "grad_norm": 0.0510098873102972, + "language_loss": 0.81037152, + "learning_rate": 0.0001912916882018706, + "loss": 0.82103842, + "num_input_tokens_seen": 310555952, + "router_z_loss_mlp": 0.32836914, + "step": 3745, + "time_per_iteration": 2.8475067615509033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068353, + "balance_loss_mlp": 1.03511715, + "epoch": 0.7206617929973067, + "flos": 798845027328.0, + "grad_norm": 0.058473767349389985, + "language_loss": 0.78699112, + "learning_rate": 0.00019104667764453125, + "loss": 0.79767466, + "num_input_tokens_seen": 310634784, + "router_z_loss_mlp": 0.33251953, + "step": 3746, + "time_per_iteration": 3.016134738922119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064862, + "balance_loss_mlp": 1.031793, + "epoch": 0.7208541746825702, + "flos": 531638913024.0, + "grad_norm": 0.04570203365425481, + "language_loss": 0.80496103, + "learning_rate": 0.00019080178703350926, + "loss": 0.81560969, + "num_input_tokens_seen": 310703216, + "router_z_loss_mlp": 0.33081055, + "step": 3747, + "time_per_iteration": 2.6047801971435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060751, + "balance_loss_mlp": 1.02682364, + "epoch": 0.7210465563678338, + "flos": 534883530240.0, + "grad_norm": 0.04791251301755464, + "language_loss": 0.82855403, + "learning_rate": 0.00019055701646387952, + "loss": 0.83916157, + "num_input_tokens_seen": 310776816, + "router_z_loss_mlp": 0.33959961, + "step": 3748, + "time_per_iteration": 2.6366617679595947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015113, + "balance_loss_mlp": 1.00548053, + "epoch": 0.7212389380530974, + "flos": 1533076955136.0, + "grad_norm": 0.0050303066243172915, + "language_loss": 0.80472684, + "learning_rate": 0.00019031236603067042, + "loss": 0.81487799, + "num_input_tokens_seen": 310987056, + "router_z_loss_mlp": 0.09619141, + "step": 3749, + "time_per_iteration": 4.800697326660156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067212, + "balance_loss_mlp": 1.03361845, + "epoch": 0.7214313197383609, + "flos": 461277407232.0, + "grad_norm": 0.05889548383130951, + "language_loss": 0.86542219, + "learning_rate": 0.00019006783582886368, + "loss": 0.87609434, + "num_input_tokens_seen": 311051648, + "router_z_loss_mlp": 0.33618164, + "step": 3750, + "time_per_iteration": 2.52746844291687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066015, + "balance_loss_mlp": 1.0318023, + "epoch": 0.7216237014236244, + "flos": 1036691025408.0, + "grad_norm": 0.046476584677382714, + "language_loss": 0.82800925, + "learning_rate": 0.00018982342595339437, + "loss": 0.83866942, + "num_input_tokens_seen": 311146272, + "router_z_loss_mlp": 0.3425293, + "step": 3751, + "time_per_iteration": 3.5170929431915283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067071, + "balance_loss_mlp": 1.03416932, + "epoch": 0.721816083108888, + "flos": 895578107904.0, + "grad_norm": 0.05167132755024372, + "language_loss": 0.81707644, + "learning_rate": 0.00018957913649915076, + "loss": 0.82774711, + "num_input_tokens_seen": 311223760, + "router_z_loss_mlp": 0.32910156, + "step": 3752, + "time_per_iteration": 3.1112849712371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010644, + "balance_loss_mlp": 1.03178465, + "epoch": 0.7220084647941516, + "flos": 523066166784.0, + "grad_norm": 0.05533376577602326, + "language_loss": 0.79672492, + "learning_rate": 0.00018933496756097428, + "loss": 0.80736887, + "num_input_tokens_seen": 311290336, + "router_z_loss_mlp": 0.32617188, + "step": 3753, + "time_per_iteration": 2.5987796783447266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064595, + "balance_loss_mlp": 1.03102577, + "epoch": 0.7222008464794152, + "flos": 815757765120.0, + "grad_norm": 0.05288107423325553, + "language_loss": 0.81242466, + "learning_rate": 0.0001890909192336603, + "loss": 0.82307053, + "num_input_tokens_seen": 311366240, + "router_z_loss_mlp": 0.3359375, + "step": 3754, + "time_per_iteration": 3.0019736289978027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065173, + "balance_loss_mlp": 1.03172278, + "epoch": 0.7223932281646788, + "flos": 748725244416.0, + "grad_norm": 0.049565047551570436, + "language_loss": 0.70085669, + "learning_rate": 0.00018884699161195623, + "loss": 0.71150839, + "num_input_tokens_seen": 311445184, + "router_z_loss_mlp": 0.3347168, + "step": 3755, + "time_per_iteration": 2.921433448791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106462, + "balance_loss_mlp": 1.03047848, + "epoch": 0.7225856098499422, + "flos": 745132829184.0, + "grad_norm": 0.05110029255023059, + "language_loss": 0.77537811, + "learning_rate": 0.00018860318479056327, + "loss": 0.78602433, + "num_input_tokens_seen": 311527280, + "router_z_loss_mlp": 0.34179688, + "step": 3756, + "time_per_iteration": 4.5331456661224365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064579, + "balance_loss_mlp": 1.03155816, + "epoch": 0.7227779915352058, + "flos": 547055894016.0, + "grad_norm": 0.047457603213344, + "language_loss": 0.835307, + "learning_rate": 0.00018835949886413555, + "loss": 0.84595281, + "num_input_tokens_seen": 311601552, + "router_z_loss_mlp": 0.33032227, + "step": 3757, + "time_per_iteration": 2.721592903137207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106988, + "balance_loss_mlp": 1.0362395, + "epoch": 0.7229703732204694, + "flos": 530230496256.0, + "grad_norm": 0.05570980366468543, + "language_loss": 0.78520513, + "learning_rate": 0.0001881159339272806, + "loss": 0.79590392, + "num_input_tokens_seen": 311670736, + "router_z_loss_mlp": 0.33666992, + "step": 3758, + "time_per_iteration": 2.6724090576171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066008, + "balance_loss_mlp": 1.03289187, + "epoch": 0.723162754905733, + "flos": 528103314432.0, + "grad_norm": 0.05510744793319723, + "language_loss": 0.7836262, + "learning_rate": 0.00018787249007455858, + "loss": 0.79428625, + "num_input_tokens_seen": 311736800, + "router_z_loss_mlp": 0.33129883, + "step": 3759, + "time_per_iteration": 2.608786106109619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065133, + "balance_loss_mlp": 1.03292298, + "epoch": 0.7233551365909965, + "flos": 654571860480.0, + "grad_norm": 0.051481631649939415, + "language_loss": 0.71461964, + "learning_rate": 0.00018762916740048302, + "loss": 0.72527099, + "num_input_tokens_seen": 311806064, + "router_z_loss_mlp": 0.32202148, + "step": 3760, + "time_per_iteration": 2.768165111541748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064991, + "balance_loss_mlp": 1.03201807, + "epoch": 0.7235475182762601, + "flos": 522097118208.0, + "grad_norm": 0.045655130957968595, + "language_loss": 0.85612011, + "learning_rate": 0.0001873859659995195, + "loss": 0.86677003, + "num_input_tokens_seen": 311881280, + "router_z_loss_mlp": 0.32983398, + "step": 3761, + "time_per_iteration": 2.749396800994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106691, + "balance_loss_mlp": 1.03357887, + "epoch": 0.7237398999615237, + "flos": 608883941376.0, + "grad_norm": 0.05437044634391734, + "language_loss": 0.83492088, + "learning_rate": 0.0001871428859660878, + "loss": 0.84559, + "num_input_tokens_seen": 311953696, + "router_z_loss_mlp": 0.33349609, + "step": 3762, + "time_per_iteration": 2.767180919647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107084, + "balance_loss_mlp": 1.03820074, + "epoch": 0.7239322816467872, + "flos": 658664690688.0, + "grad_norm": 0.04804139363705488, + "language_loss": 0.82056308, + "learning_rate": 0.00018689992739455975, + "loss": 0.83127153, + "num_input_tokens_seen": 312032752, + "router_z_loss_mlp": 0.32641602, + "step": 3763, + "time_per_iteration": 2.8873496055603027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071938, + "balance_loss_mlp": 1.03803444, + "epoch": 0.7241246633320508, + "flos": 968869928448.0, + "grad_norm": 0.04487268066979416, + "language_loss": 0.85964411, + "learning_rate": 0.00018665709037926027, + "loss": 0.87036347, + "num_input_tokens_seen": 312120800, + "router_z_loss_mlp": 0.33935547, + "step": 3764, + "time_per_iteration": 3.2812607288360596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067601, + "balance_loss_mlp": 1.03429401, + "epoch": 0.7243170450173143, + "flos": 514745114112.0, + "grad_norm": 0.06636395802329886, + "language_loss": 0.84182644, + "learning_rate": 0.00018641437501446694, + "loss": 0.85250252, + "num_input_tokens_seen": 312188416, + "router_z_loss_mlp": 0.33325195, + "step": 3765, + "time_per_iteration": 2.573697328567505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069298, + "balance_loss_mlp": 1.03668237, + "epoch": 0.7245094267025779, + "flos": 559482923520.0, + "grad_norm": 0.05849002982454381, + "language_loss": 0.82240844, + "learning_rate": 0.0001861717813944104, + "loss": 0.83310151, + "num_input_tokens_seen": 312257792, + "router_z_loss_mlp": 0.32617188, + "step": 3766, + "time_per_iteration": 2.630692481994629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070145, + "balance_loss_mlp": 1.03674293, + "epoch": 0.7247018083878415, + "flos": 612359903232.0, + "grad_norm": 0.059142078563837144, + "language_loss": 0.7934258, + "learning_rate": 0.00018592930961327365, + "loss": 0.80412722, + "num_input_tokens_seen": 312328544, + "router_z_loss_mlp": 0.33422852, + "step": 3767, + "time_per_iteration": 2.714850902557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069775, + "balance_loss_mlp": 1.03694439, + "epoch": 0.7248941900731051, + "flos": 634379599872.0, + "grad_norm": 0.04667094016302488, + "language_loss": 0.8795737, + "learning_rate": 0.00018568695976519273, + "loss": 0.89027148, + "num_input_tokens_seen": 312405888, + "router_z_loss_mlp": 0.32836914, + "step": 3768, + "time_per_iteration": 2.78951358795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067478, + "balance_loss_mlp": 1.03433776, + "epoch": 0.7250865717583687, + "flos": 424718055936.0, + "grad_norm": 0.05715863238838566, + "language_loss": 0.80076563, + "learning_rate": 0.00018544473194425593, + "loss": 0.81144047, + "num_input_tokens_seen": 312469552, + "router_z_loss_mlp": 0.33154297, + "step": 3769, + "time_per_iteration": 2.5101308822631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068926, + "balance_loss_mlp": 1.03542805, + "epoch": 0.7252789534436321, + "flos": 634794236928.0, + "grad_norm": 0.05221621035796038, + "language_loss": 0.78552115, + "learning_rate": 0.00018520262624450485, + "loss": 0.79621041, + "num_input_tokens_seen": 312548848, + "router_z_loss_mlp": 0.33520508, + "step": 3770, + "time_per_iteration": 2.851344347000122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065232, + "balance_loss_mlp": 1.03247309, + "epoch": 0.7254713351288957, + "flos": 616895073792.0, + "grad_norm": 0.05281322327607285, + "language_loss": 0.86844021, + "learning_rate": 0.00018496064275993324, + "loss": 0.87909257, + "num_input_tokens_seen": 312622016, + "router_z_loss_mlp": 0.32763672, + "step": 3771, + "time_per_iteration": 2.740528106689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065588, + "balance_loss_mlp": 1.03132713, + "epoch": 0.7256637168141593, + "flos": 766662285312.0, + "grad_norm": 0.053619752531576234, + "language_loss": 0.81698912, + "learning_rate": 0.00018471878158448686, + "loss": 0.82764494, + "num_input_tokens_seen": 312696960, + "router_z_loss_mlp": 0.34301758, + "step": 3772, + "time_per_iteration": 2.940927028656006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068836, + "balance_loss_mlp": 1.03538561, + "epoch": 0.7258560984994229, + "flos": 495268646400.0, + "grad_norm": 0.044202669157845896, + "language_loss": 0.8410005, + "learning_rate": 0.00018447704281206512, + "loss": 0.85168886, + "num_input_tokens_seen": 312774352, + "router_z_loss_mlp": 0.3347168, + "step": 3773, + "time_per_iteration": 2.9211905002593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010641, + "balance_loss_mlp": 1.03050709, + "epoch": 0.7260484801846864, + "flos": 529802712576.0, + "grad_norm": 0.0599389288946333, + "language_loss": 0.82910264, + "learning_rate": 0.0001842354265365191, + "loss": 0.83974361, + "num_input_tokens_seen": 312849600, + "router_z_loss_mlp": 0.33618164, + "step": 3774, + "time_per_iteration": 2.672297477722168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067714, + "balance_loss_mlp": 1.03478813, + "epoch": 0.72624086186995, + "flos": 624679243776.0, + "grad_norm": 0.055766679807351886, + "language_loss": 0.80738944, + "learning_rate": 0.0001839939328516526, + "loss": 0.81806654, + "num_input_tokens_seen": 312922688, + "router_z_loss_mlp": 0.32910156, + "step": 3775, + "time_per_iteration": 2.715765953063965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067661, + "balance_loss_mlp": 1.03475976, + "epoch": 0.7264332435552135, + "flos": 716203468800.0, + "grad_norm": 0.054689232806286694, + "language_loss": 0.80927253, + "learning_rate": 0.0001837525618512218, + "loss": 0.81994909, + "num_input_tokens_seen": 312997728, + "router_z_loss_mlp": 0.32910156, + "step": 3776, + "time_per_iteration": 2.9182283878326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067478, + "balance_loss_mlp": 1.03467178, + "epoch": 0.7266256252404771, + "flos": 680736821760.0, + "grad_norm": 0.056616455322331526, + "language_loss": 0.83123744, + "learning_rate": 0.00018351131362893519, + "loss": 0.84191227, + "num_input_tokens_seen": 313067168, + "router_z_loss_mlp": 0.328125, + "step": 3777, + "time_per_iteration": 2.8280246257781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065965, + "balance_loss_mlp": 1.03227687, + "epoch": 0.7268180069257407, + "flos": 518654651904.0, + "grad_norm": 0.0757528299469481, + "language_loss": 0.80649394, + "learning_rate": 0.00018327018827845364, + "loss": 0.81715357, + "num_input_tokens_seen": 313134688, + "router_z_loss_mlp": 0.3371582, + "step": 3778, + "time_per_iteration": 2.6342718601226807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065833, + "balance_loss_mlp": 1.03221643, + "epoch": 0.7270103886110042, + "flos": 512411318784.0, + "grad_norm": 0.05462394949163198, + "language_loss": 0.87201697, + "learning_rate": 0.00018302918589339036, + "loss": 0.88267529, + "num_input_tokens_seen": 313204816, + "router_z_loss_mlp": 0.33642578, + "step": 3779, + "time_per_iteration": 2.6401546001434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065353, + "balance_loss_mlp": 1.03166389, + "epoch": 0.7272027702962678, + "flos": 546395355648.0, + "grad_norm": 0.050485328839168696, + "language_loss": 0.90140432, + "learning_rate": 0.00018278830656731054, + "loss": 0.91205782, + "num_input_tokens_seen": 313274288, + "router_z_loss_mlp": 0.3371582, + "step": 3780, + "time_per_iteration": 2.6837782859802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060977, + "balance_loss_mlp": 1.02883863, + "epoch": 0.7273951519815314, + "flos": 592772926464.0, + "grad_norm": 0.04496338740790305, + "language_loss": 0.86495197, + "learning_rate": 0.00018254755039373222, + "loss": 0.87556171, + "num_input_tokens_seen": 313344800, + "router_z_loss_mlp": 0.32128906, + "step": 3781, + "time_per_iteration": 2.7322683334350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063791, + "balance_loss_mlp": 1.03084135, + "epoch": 0.727587533666795, + "flos": 605732456448.0, + "grad_norm": 0.056903164121683655, + "language_loss": 0.83278424, + "learning_rate": 0.0001823069174661252, + "loss": 0.84342206, + "num_input_tokens_seen": 313417840, + "router_z_loss_mlp": 0.32958984, + "step": 3782, + "time_per_iteration": 2.75710129737854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067494, + "balance_loss_mlp": 1.03380585, + "epoch": 0.7277799153520584, + "flos": 512770701312.0, + "grad_norm": 0.05370507093110541, + "language_loss": 0.78568602, + "learning_rate": 0.00018206640787791112, + "loss": 0.79636097, + "num_input_tokens_seen": 313485936, + "router_z_loss_mlp": 0.3371582, + "step": 3783, + "time_per_iteration": 2.61852765083313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062618, + "balance_loss_mlp": 1.02923894, + "epoch": 0.727972297037322, + "flos": 537498132480.0, + "grad_norm": 0.05379721469366117, + "language_loss": 0.85843956, + "learning_rate": 0.00018182602172246416, + "loss": 0.8690657, + "num_input_tokens_seen": 313553136, + "router_z_loss_mlp": 0.33398438, + "step": 3784, + "time_per_iteration": 2.593327522277832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061524, + "balance_loss_mlp": 1.02819335, + "epoch": 0.7281646787225856, + "flos": 534780223488.0, + "grad_norm": 0.06658957148496236, + "language_loss": 0.76393896, + "learning_rate": 0.00018158575909311075, + "loss": 0.77455419, + "num_input_tokens_seen": 313620128, + "router_z_loss_mlp": 0.33349609, + "step": 3785, + "time_per_iteration": 2.600620985031128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106257, + "balance_loss_mlp": 1.02935863, + "epoch": 0.7283570604078492, + "flos": 624767993856.0, + "grad_norm": 0.053054924881327924, + "language_loss": 0.79626518, + "learning_rate": 0.000181345620083129, + "loss": 0.80689085, + "num_input_tokens_seen": 313696432, + "router_z_loss_mlp": 0.33227539, + "step": 3786, + "time_per_iteration": 2.746778726577759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065243, + "balance_loss_mlp": 1.03255534, + "epoch": 0.7285494420931128, + "flos": 533904307200.0, + "grad_norm": 0.097300641099862, + "language_loss": 0.86717927, + "learning_rate": 0.00018110560478574927, + "loss": 0.8778317, + "num_input_tokens_seen": 313768416, + "router_z_loss_mlp": 0.3269043, + "step": 3787, + "time_per_iteration": 2.6793131828308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065255, + "balance_loss_mlp": 1.03147149, + "epoch": 0.7287418237783763, + "flos": 666251011584.0, + "grad_norm": 0.05707772132850956, + "language_loss": 0.80307966, + "learning_rate": 0.0001808657132941533, + "loss": 0.81373221, + "num_input_tokens_seen": 313839888, + "router_z_loss_mlp": 0.33813477, + "step": 3788, + "time_per_iteration": 2.7490005493164062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065453, + "balance_loss_mlp": 1.03147793, + "epoch": 0.7289342054636399, + "flos": 550344181248.0, + "grad_norm": 0.05691575768916977, + "language_loss": 0.82927215, + "learning_rate": 0.00018062594570147572, + "loss": 0.83992666, + "num_input_tokens_seen": 313908832, + "router_z_loss_mlp": 0.33984375, + "step": 3789, + "time_per_iteration": 2.584277391433716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063831, + "balance_loss_mlp": 1.03109622, + "epoch": 0.7291265871489034, + "flos": 687620344320.0, + "grad_norm": 0.05865206546440876, + "language_loss": 0.85141826, + "learning_rate": 0.00018038630210080243, + "loss": 0.86205661, + "num_input_tokens_seen": 313982672, + "router_z_loss_mlp": 0.32739258, + "step": 3790, + "time_per_iteration": 2.7913711071014404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010669, + "balance_loss_mlp": 1.03421283, + "epoch": 0.729318968834167, + "flos": 572388609024.0, + "grad_norm": 0.08871994753922169, + "language_loss": 0.8494693, + "learning_rate": 0.0001801467825851712, + "loss": 0.8601383, + "num_input_tokens_seen": 314057184, + "router_z_loss_mlp": 0.3269043, + "step": 3791, + "time_per_iteration": 2.7232275009155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064513, + "balance_loss_mlp": 1.03013325, + "epoch": 0.7295113505194305, + "flos": 585786097152.0, + "grad_norm": 0.05597763782774928, + "language_loss": 0.78437781, + "learning_rate": 0.00017990738724757172, + "loss": 0.79502296, + "num_input_tokens_seen": 314137344, + "router_z_loss_mlp": 0.34423828, + "step": 3792, + "time_per_iteration": 2.8646349906921387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070211, + "balance_loss_mlp": 1.03664136, + "epoch": 0.7297037322046941, + "flos": 706872669696.0, + "grad_norm": 0.0454122102846594, + "language_loss": 0.82281637, + "learning_rate": 0.00017966811618094598, + "loss": 0.83351851, + "num_input_tokens_seen": 314214464, + "router_z_loss_mlp": 0.3359375, + "step": 3793, + "time_per_iteration": 2.9363014698028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065082, + "balance_loss_mlp": 1.03148866, + "epoch": 0.7298961138899577, + "flos": 487039315968.0, + "grad_norm": 0.060918230322826325, + "language_loss": 0.84644252, + "learning_rate": 0.00017942896947818664, + "loss": 0.85709333, + "num_input_tokens_seen": 314280432, + "router_z_loss_mlp": 0.33618164, + "step": 3794, + "time_per_iteration": 2.634622097015381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014366, + "balance_loss_mlp": 1.00473428, + "epoch": 0.7300884955752213, + "flos": 1365102222336.0, + "grad_norm": 0.006306847562880891, + "language_loss": 0.74825054, + "learning_rate": 0.000179189947232139, + "loss": 0.75839418, + "num_input_tokens_seen": 314497152, + "router_z_loss_mlp": 0.09619141, + "step": 3795, + "time_per_iteration": 4.8498523235321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067916, + "balance_loss_mlp": 1.03434658, + "epoch": 0.7302808772604849, + "flos": 531550162944.0, + "grad_norm": 0.07784703337464734, + "language_loss": 0.85064995, + "learning_rate": 0.00017895104953559947, + "loss": 0.86132914, + "num_input_tokens_seen": 314565488, + "router_z_loss_mlp": 0.33569336, + "step": 3796, + "time_per_iteration": 2.578749418258667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069311, + "balance_loss_mlp": 1.03533602, + "epoch": 0.7304732589457483, + "flos": 435949074432.0, + "grad_norm": 0.06903187092561903, + "language_loss": 0.8945868, + "learning_rate": 0.00017871227648131672, + "loss": 0.90527987, + "num_input_tokens_seen": 314627392, + "router_z_loss_mlp": 0.34008789, + "step": 3797, + "time_per_iteration": 2.498368740081787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064089, + "balance_loss_mlp": 1.03071082, + "epoch": 0.7306656406310119, + "flos": 451376229888.0, + "grad_norm": 0.049186518116542115, + "language_loss": 0.82359099, + "learning_rate": 0.0001784736281619907, + "loss": 0.83423185, + "num_input_tokens_seen": 314695440, + "router_z_loss_mlp": 0.33398438, + "step": 3798, + "time_per_iteration": 2.5968668460845947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063054, + "balance_loss_mlp": 1.02898395, + "epoch": 0.7308580223162755, + "flos": 511756572672.0, + "grad_norm": 0.049616480799322744, + "language_loss": 0.74341989, + "learning_rate": 0.00017823510467027232, + "loss": 0.75405043, + "num_input_tokens_seen": 314772592, + "router_z_loss_mlp": 0.34106445, + "step": 3799, + "time_per_iteration": 2.733454465866089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063516, + "balance_loss_mlp": 1.02930331, + "epoch": 0.7310504040015391, + "flos": 375209349120.0, + "grad_norm": 0.0582146456406939, + "language_loss": 0.78020084, + "learning_rate": 0.00017799670609876516, + "loss": 0.79083604, + "num_input_tokens_seen": 314836192, + "router_z_loss_mlp": 0.3425293, + "step": 3800, + "time_per_iteration": 4.01823616027832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065254, + "balance_loss_mlp": 1.03135109, + "epoch": 0.7312427856868026, + "flos": 549073976832.0, + "grad_norm": 0.04960878758692363, + "language_loss": 0.8857708, + "learning_rate": 0.00017775843254002366, + "loss": 0.89642334, + "num_input_tokens_seen": 314908400, + "router_z_loss_mlp": 0.33935547, + "step": 3801, + "time_per_iteration": 2.6998913288116455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063333, + "balance_loss_mlp": 1.03014541, + "epoch": 0.7314351673720662, + "flos": 766880483328.0, + "grad_norm": 0.0540974976561695, + "language_loss": 0.84199798, + "learning_rate": 0.00017752028408655367, + "loss": 0.85263133, + "num_input_tokens_seen": 314995280, + "router_z_loss_mlp": 0.33203125, + "step": 3802, + "time_per_iteration": 3.058145523071289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064093, + "balance_loss_mlp": 1.03102422, + "epoch": 0.7316275490573297, + "flos": 486492258816.0, + "grad_norm": 0.051110561372661595, + "language_loss": 0.85141397, + "learning_rate": 0.00017728226083081272, + "loss": 0.86205482, + "num_input_tokens_seen": 315063056, + "router_z_loss_mlp": 0.33081055, + "step": 3803, + "time_per_iteration": 2.5310099124908447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065165, + "balance_loss_mlp": 1.03166723, + "epoch": 0.7318199307425933, + "flos": 473183520768.0, + "grad_norm": 0.05616081836254539, + "language_loss": 0.81485891, + "learning_rate": 0.00017704436286520965, + "loss": 0.8255105, + "num_input_tokens_seen": 315128896, + "router_z_loss_mlp": 0.33520508, + "step": 3804, + "time_per_iteration": 2.568283796310425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063426, + "balance_loss_mlp": 1.02952337, + "epoch": 0.7320123124278569, + "flos": 549202014720.0, + "grad_norm": 0.05320670127317765, + "language_loss": 0.84491169, + "learning_rate": 0.0001768065902821046, + "loss": 0.855546, + "num_input_tokens_seen": 315198464, + "router_z_loss_mlp": 0.33935547, + "step": 3805, + "time_per_iteration": 2.605682134628296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061947, + "balance_loss_mlp": 1.02751899, + "epoch": 0.7322046941131204, + "flos": 570502946304.0, + "grad_norm": 0.06611321477092025, + "language_loss": 0.8209759, + "learning_rate": 0.00017656894317380907, + "loss": 0.83159536, + "num_input_tokens_seen": 315270240, + "router_z_loss_mlp": 0.34472656, + "step": 3806, + "time_per_iteration": 2.7116403579711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010208, + "balance_loss_mlp": 1.00062358, + "epoch": 0.732397075798384, + "flos": 1468334559744.0, + "grad_norm": 0.00621008772312024, + "language_loss": 0.76031268, + "learning_rate": 0.00017633142163258565, + "loss": 0.77041477, + "num_input_tokens_seen": 315502448, + "router_z_loss_mlp": 0.09570312, + "step": 3807, + "time_per_iteration": 4.968751668930054 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061209, + "balance_loss_mlp": 1.0275209, + "epoch": 0.7325894574836476, + "flos": 464620948992.0, + "grad_norm": 0.05827651043720701, + "language_loss": 0.83991838, + "learning_rate": 0.00017609402575064875, + "loss": 0.85053051, + "num_input_tokens_seen": 315569472, + "router_z_loss_mlp": 0.3371582, + "step": 3808, + "time_per_iteration": 2.5385282039642334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063699, + "balance_loss_mlp": 1.03003407, + "epoch": 0.7327818391689112, + "flos": 495246887424.0, + "grad_norm": 0.05735407240941104, + "language_loss": 0.80858552, + "learning_rate": 0.00017585675562016367, + "loss": 0.81922251, + "num_input_tokens_seen": 315637632, + "router_z_loss_mlp": 0.33691406, + "step": 3809, + "time_per_iteration": 2.555299997329712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007774, + "balance_loss_mlp": 0.99823719, + "epoch": 0.7329742208541746, + "flos": 1432694794752.0, + "grad_norm": 0.0030976704675862504, + "language_loss": 0.77212846, + "learning_rate": 0.0001756196113332465, + "loss": 0.78220618, + "num_input_tokens_seen": 315863648, + "router_z_loss_mlp": 0.09521484, + "step": 3810, + "time_per_iteration": 4.790294647216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062704, + "balance_loss_mlp": 1.02894437, + "epoch": 0.7331666025394382, + "flos": 496645129728.0, + "grad_norm": 0.057652785058487796, + "language_loss": 0.84699941, + "learning_rate": 0.00017538259298196474, + "loss": 0.85762644, + "num_input_tokens_seen": 315930752, + "router_z_loss_mlp": 0.33789062, + "step": 3811, + "time_per_iteration": 2.5608150959014893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066548, + "balance_loss_mlp": 1.03271604, + "epoch": 0.7333589842247018, + "flos": 538247420928.0, + "grad_norm": 0.07102765773461414, + "language_loss": 0.81726062, + "learning_rate": 0.00017514570065833745, + "loss": 0.82792604, + "num_input_tokens_seen": 316006400, + "router_z_loss_mlp": 0.33862305, + "step": 3812, + "time_per_iteration": 2.733987808227539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063175, + "balance_loss_mlp": 1.03024936, + "epoch": 0.7335513659099654, + "flos": 490825198080.0, + "grad_norm": 0.0783727795203613, + "language_loss": 0.80580723, + "learning_rate": 0.00017490893445433426, + "loss": 0.81643891, + "num_input_tokens_seen": 316075824, + "router_z_loss_mlp": 0.3293457, + "step": 3813, + "time_per_iteration": 2.5801103115081787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062414, + "balance_loss_mlp": 1.02953637, + "epoch": 0.733743747595229, + "flos": 561876355584.0, + "grad_norm": 0.048847975772381425, + "language_loss": 0.81069362, + "learning_rate": 0.00017467229446187587, + "loss": 0.82131779, + "num_input_tokens_seen": 316148336, + "router_z_loss_mlp": 0.32885742, + "step": 3814, + "time_per_iteration": 2.683293104171753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060801, + "balance_loss_mlp": 1.02684999, + "epoch": 0.7339361292804925, + "flos": 538315822080.0, + "grad_norm": 0.047730041635456175, + "language_loss": 0.81664294, + "learning_rate": 0.00017443578077283424, + "loss": 0.82725096, + "num_input_tokens_seen": 316220960, + "router_z_loss_mlp": 0.33984375, + "step": 3815, + "time_per_iteration": 2.641364574432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064457, + "balance_loss_mlp": 1.03043437, + "epoch": 0.734128510965756, + "flos": 548198060544.0, + "grad_norm": 0.05243488536705766, + "language_loss": 0.85093778, + "learning_rate": 0.0001741993934790319, + "loss": 0.86158234, + "num_input_tokens_seen": 316295824, + "router_z_loss_mlp": 0.34057617, + "step": 3816, + "time_per_iteration": 2.7296290397644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060693, + "balance_loss_mlp": 1.027004, + "epoch": 0.7343208926510196, + "flos": 539783875584.0, + "grad_norm": 0.059294435662015, + "language_loss": 0.84253871, + "learning_rate": 0.00017396313267224273, + "loss": 0.85314572, + "num_input_tokens_seen": 316368064, + "router_z_loss_mlp": 0.3371582, + "step": 3817, + "time_per_iteration": 2.702885866165161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064529, + "balance_loss_mlp": 1.03141296, + "epoch": 0.7345132743362832, + "flos": 570827423232.0, + "grad_norm": 0.058276166249488254, + "language_loss": 0.88087535, + "learning_rate": 0.0001737269984441912, + "loss": 0.89152062, + "num_input_tokens_seen": 316437440, + "router_z_loss_mlp": 0.33129883, + "step": 3818, + "time_per_iteration": 2.6317105293273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064679, + "balance_loss_mlp": 1.03089499, + "epoch": 0.7347056560215467, + "flos": 545135325696.0, + "grad_norm": 0.04588849649553848, + "language_loss": 0.84933245, + "learning_rate": 0.00017349099088655263, + "loss": 0.85997921, + "num_input_tokens_seen": 316511936, + "router_z_loss_mlp": 0.33813477, + "step": 3819, + "time_per_iteration": 2.6894302368164062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063138, + "balance_loss_mlp": 1.03023624, + "epoch": 0.7348980377068103, + "flos": 595668335616.0, + "grad_norm": 0.04507487661925427, + "language_loss": 0.80804777, + "learning_rate": 0.00017325511009095375, + "loss": 0.81867915, + "num_input_tokens_seen": 316584304, + "router_z_loss_mlp": 0.32910156, + "step": 3820, + "time_per_iteration": 2.7293684482574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106202, + "balance_loss_mlp": 1.02833104, + "epoch": 0.7350904193920739, + "flos": 538291090944.0, + "grad_norm": 0.05281271554601035, + "language_loss": 0.83436865, + "learning_rate": 0.00017301935614897113, + "loss": 0.84498882, + "num_input_tokens_seen": 316659024, + "router_z_loss_mlp": 0.3371582, + "step": 3821, + "time_per_iteration": 2.727043390274048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065798, + "balance_loss_mlp": 1.03199053, + "epoch": 0.7352828010773375, + "flos": 512712474624.0, + "grad_norm": 0.049847760142976955, + "language_loss": 0.81776285, + "learning_rate": 0.00017278372915213274, + "loss": 0.82842088, + "num_input_tokens_seen": 316732544, + "router_z_loss_mlp": 0.33837891, + "step": 3822, + "time_per_iteration": 2.650468587875366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016098, + "balance_loss_mlp": 1.00732386, + "epoch": 0.735475182762601, + "flos": 1552965087744.0, + "grad_norm": 0.006919711828678118, + "language_loss": 0.79893845, + "learning_rate": 0.00017254822919191693, + "loss": 0.80909944, + "num_input_tokens_seen": 316967104, + "router_z_loss_mlp": 0.08789062, + "step": 3823, + "time_per_iteration": 4.953552007675171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064811, + "balance_loss_mlp": 1.03152812, + "epoch": 0.7356675644478645, + "flos": 680984133120.0, + "grad_norm": 0.05477130008948058, + "language_loss": 0.80415845, + "learning_rate": 0.00017231285635975314, + "loss": 0.81480658, + "num_input_tokens_seen": 317048304, + "router_z_loss_mlp": 0.33300781, + "step": 3824, + "time_per_iteration": 2.889289140701294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067221, + "balance_loss_mlp": 1.03334153, + "epoch": 0.7358599461331281, + "flos": 514961902080.0, + "grad_norm": 0.05024116025531215, + "language_loss": 0.83180618, + "learning_rate": 0.00017207761074702115, + "loss": 0.84247839, + "num_input_tokens_seen": 317115968, + "router_z_loss_mlp": 0.33911133, + "step": 3825, + "time_per_iteration": 2.5944931507110596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068957, + "balance_loss_mlp": 1.03491116, + "epoch": 0.7360523278183917, + "flos": 443739036672.0, + "grad_norm": 0.05416022756752086, + "language_loss": 0.83636504, + "learning_rate": 0.0001718424924450514, + "loss": 0.8470546, + "num_input_tokens_seen": 317185680, + "router_z_loss_mlp": 0.34082031, + "step": 3826, + "time_per_iteration": 2.6031198501586914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067291, + "balance_loss_mlp": 1.03441358, + "epoch": 0.7362447095036553, + "flos": 603142585344.0, + "grad_norm": 0.04455430936789472, + "language_loss": 0.85882723, + "learning_rate": 0.00017160750154512482, + "loss": 0.86950016, + "num_input_tokens_seen": 317258800, + "router_z_loss_mlp": 0.32885742, + "step": 3827, + "time_per_iteration": 2.702148914337158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067373, + "balance_loss_mlp": 1.03470922, + "epoch": 0.7364370911889189, + "flos": 552807424512.0, + "grad_norm": 0.06654318382518472, + "language_loss": 0.83394545, + "learning_rate": 0.0001713726381384731, + "loss": 0.84461915, + "num_input_tokens_seen": 317334608, + "router_z_loss_mlp": 0.32666016, + "step": 3828, + "time_per_iteration": 2.7451815605163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069365, + "balance_loss_mlp": 1.03622484, + "epoch": 0.7366294728741823, + "flos": 448830028800.0, + "grad_norm": 0.05260282371151395, + "language_loss": 0.81186259, + "learning_rate": 0.00017113790231627812, + "loss": 0.82255614, + "num_input_tokens_seen": 317397504, + "router_z_loss_mlp": 0.33154297, + "step": 3829, + "time_per_iteration": 2.537193775177002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017376, + "balance_loss_mlp": 1.00879276, + "epoch": 0.7368218545594459, + "flos": 1534705132032.0, + "grad_norm": 0.0074062815552694275, + "language_loss": 0.79258227, + "learning_rate": 0.0001709032941696726, + "loss": 0.80275595, + "num_input_tokens_seen": 317611472, + "router_z_loss_mlp": 0.0859375, + "step": 3830, + "time_per_iteration": 4.833421945571899 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069316, + "balance_loss_mlp": 1.03584218, + "epoch": 0.7370142362447095, + "flos": 515164133376.0, + "grad_norm": 0.05241835365741791, + "language_loss": 0.81748456, + "learning_rate": 0.00017066881378973936, + "loss": 0.82817769, + "num_input_tokens_seen": 317681328, + "router_z_loss_mlp": 0.33496094, + "step": 3831, + "time_per_iteration": 2.619849443435669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071106, + "balance_loss_mlp": 1.03808546, + "epoch": 0.7372066179299731, + "flos": 500531346432.0, + "grad_norm": 0.056102661804596575, + "language_loss": 0.82564443, + "learning_rate": 0.00017043446126751189, + "loss": 0.83635545, + "num_input_tokens_seen": 317752336, + "router_z_loss_mlp": 0.33032227, + "step": 3832, + "time_per_iteration": 2.689955711364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069455, + "balance_loss_mlp": 1.03605282, + "epoch": 0.7373989996152366, + "flos": 557814048768.0, + "grad_norm": 0.062254186962725604, + "language_loss": 0.76771331, + "learning_rate": 0.00017020023669397376, + "loss": 0.77840781, + "num_input_tokens_seen": 317824112, + "router_z_loss_mlp": 0.33422852, + "step": 3833, + "time_per_iteration": 2.7102112770080566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071175, + "balance_loss_mlp": 1.03722405, + "epoch": 0.7375913813005002, + "flos": 506527368192.0, + "grad_norm": 0.05138473189519923, + "language_loss": 0.81401753, + "learning_rate": 0.0001699661401600589, + "loss": 0.82472932, + "num_input_tokens_seen": 317889120, + "router_z_loss_mlp": 0.33984375, + "step": 3834, + "time_per_iteration": 2.5580482482910156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066317, + "balance_loss_mlp": 1.03386855, + "epoch": 0.7377837629857638, + "flos": 485940819456.0, + "grad_norm": 0.04817361691999996, + "language_loss": 0.78101605, + "learning_rate": 0.00016973217175665205, + "loss": 0.7916792, + "num_input_tokens_seen": 317953792, + "router_z_loss_mlp": 0.32446289, + "step": 3835, + "time_per_iteration": 2.5466511249542236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014272, + "balance_loss_mlp": 1.00540292, + "epoch": 0.7379761446710273, + "flos": 1413900776448.0, + "grad_norm": 0.004962525889406641, + "language_loss": 0.8116616, + "learning_rate": 0.00016949833157458755, + "loss": 0.8218044, + "num_input_tokens_seen": 318184848, + "router_z_loss_mlp": 0.08886719, + "step": 3836, + "time_per_iteration": 4.947209358215332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065038, + "balance_loss_mlp": 1.03173065, + "epoch": 0.7381685263562909, + "flos": 629445758976.0, + "grad_norm": 0.04309096718082386, + "language_loss": 0.83880627, + "learning_rate": 0.00016926461970465047, + "loss": 0.84945667, + "num_input_tokens_seen": 318259296, + "router_z_loss_mlp": 0.33325195, + "step": 3837, + "time_per_iteration": 2.7604105472564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064561, + "balance_loss_mlp": 1.03175426, + "epoch": 0.7383609080415544, + "flos": 738869147136.0, + "grad_norm": 0.046495404641084814, + "language_loss": 0.84092653, + "learning_rate": 0.00016903103623757516, + "loss": 0.8515721, + "num_input_tokens_seen": 318344704, + "router_z_loss_mlp": 0.328125, + "step": 3838, + "time_per_iteration": 3.0393178462982178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064697, + "balance_loss_mlp": 1.03165209, + "epoch": 0.738553289726818, + "flos": 549945510912.0, + "grad_norm": 0.05807903751309768, + "language_loss": 0.80044198, + "learning_rate": 0.00016879758126404738, + "loss": 0.81108892, + "num_input_tokens_seen": 318416128, + "router_z_loss_mlp": 0.33056641, + "step": 3839, + "time_per_iteration": 2.7287819385528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066157, + "balance_loss_mlp": 1.03296924, + "epoch": 0.7387456714120816, + "flos": 909925705728.0, + "grad_norm": 0.06505190297085839, + "language_loss": 0.7982837, + "learning_rate": 0.00016856425487470216, + "loss": 0.8089453, + "num_input_tokens_seen": 318498128, + "router_z_loss_mlp": 0.33203125, + "step": 3840, + "time_per_iteration": 3.088334321975708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070195, + "balance_loss_mlp": 1.03724539, + "epoch": 0.7389380530973452, + "flos": 852308352000.0, + "grad_norm": 0.054902923406453155, + "language_loss": 0.78921622, + "learning_rate": 0.00016833105716012486, + "loss": 0.79991817, + "num_input_tokens_seen": 318578048, + "router_z_loss_mlp": 0.32958984, + "step": 3841, + "time_per_iteration": 3.1420795917510986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067762, + "balance_loss_mlp": 1.03433585, + "epoch": 0.7391304347826086, + "flos": 816678761472.0, + "grad_norm": 0.0538484990097731, + "language_loss": 0.85046756, + "learning_rate": 0.00016809798821085088, + "loss": 0.86114514, + "num_input_tokens_seen": 318654784, + "router_z_loss_mlp": 0.33447266, + "step": 3842, + "time_per_iteration": 2.9748454093933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067117, + "balance_loss_mlp": 1.03321409, + "epoch": 0.7393228164678722, + "flos": 572541378048.0, + "grad_norm": 0.07853013477986996, + "language_loss": 0.88786352, + "learning_rate": 0.00016786504811736565, + "loss": 0.89853466, + "num_input_tokens_seen": 318727680, + "router_z_loss_mlp": 0.33935547, + "step": 3843, + "time_per_iteration": 2.697993516921997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107057, + "balance_loss_mlp": 1.0370723, + "epoch": 0.7395151981531358, + "flos": 684903845376.0, + "grad_norm": 0.054879027639850184, + "language_loss": 0.82676303, + "learning_rate": 0.00016763223697010442, + "loss": 0.83746874, + "num_input_tokens_seen": 318807568, + "router_z_loss_mlp": 0.33520508, + "step": 3844, + "time_per_iteration": 2.941396951675415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069204, + "balance_loss_mlp": 1.03680301, + "epoch": 0.7397075798383994, + "flos": 556095711744.0, + "grad_norm": 0.044630458439445526, + "language_loss": 0.84558266, + "learning_rate": 0.00016739955485945256, + "loss": 0.85627472, + "num_input_tokens_seen": 318881792, + "router_z_loss_mlp": 0.32397461, + "step": 3845, + "time_per_iteration": 2.6704368591308594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070852, + "balance_loss_mlp": 1.03692532, + "epoch": 0.739899961523663, + "flos": 546523393536.0, + "grad_norm": 0.16146348926095225, + "language_loss": 0.8579582, + "learning_rate": 0.00016716700187574513, + "loss": 0.86866671, + "num_input_tokens_seen": 318951552, + "router_z_loss_mlp": 0.33959961, + "step": 3846, + "time_per_iteration": 2.689548969268799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066553, + "balance_loss_mlp": 1.03400922, + "epoch": 0.7400923432089265, + "flos": 608913054720.0, + "grad_norm": 0.062089054691193496, + "language_loss": 0.83502501, + "learning_rate": 0.0001669345781092675, + "loss": 0.84569055, + "num_input_tokens_seen": 319022304, + "router_z_loss_mlp": 0.32543945, + "step": 3847, + "time_per_iteration": 2.7922914028167725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106553, + "balance_loss_mlp": 1.03286684, + "epoch": 0.7402847248941901, + "flos": 590715555840.0, + "grad_norm": 0.053588507044290926, + "language_loss": 0.86693704, + "learning_rate": 0.0001667022836502546, + "loss": 0.87759233, + "num_input_tokens_seen": 319093200, + "router_z_loss_mlp": 0.32666016, + "step": 3848, + "time_per_iteration": 2.7810423374176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106826, + "balance_loss_mlp": 1.0351913, + "epoch": 0.7404771065794536, + "flos": 477136728576.0, + "grad_norm": 0.05607520940274661, + "language_loss": 0.82591665, + "learning_rate": 0.00016647011858889077, + "loss": 0.83659923, + "num_input_tokens_seen": 319159712, + "router_z_loss_mlp": 0.33081055, + "step": 3849, + "time_per_iteration": 2.5447256565093994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068875, + "balance_loss_mlp": 1.03552043, + "epoch": 0.7406694882647172, + "flos": 496192614912.0, + "grad_norm": 0.05524374859668954, + "language_loss": 0.85861689, + "learning_rate": 0.00016623808301531056, + "loss": 0.86930567, + "num_input_tokens_seen": 319230544, + "router_z_loss_mlp": 0.33374023, + "step": 3850, + "time_per_iteration": 2.647326707839966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067961, + "balance_loss_mlp": 1.03455853, + "epoch": 0.7408618699499807, + "flos": 561925817856.0, + "grad_norm": 0.0770294501397313, + "language_loss": 0.79239726, + "learning_rate": 0.00016600617701959842, + "loss": 0.80307692, + "num_input_tokens_seen": 319305440, + "router_z_loss_mlp": 0.33422852, + "step": 3851, + "time_per_iteration": 2.724172830581665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011471, + "balance_loss_mlp": 1.00212514, + "epoch": 0.7410542516352443, + "flos": 1387421512704.0, + "grad_norm": 0.004619624955394922, + "language_loss": 0.78843814, + "learning_rate": 0.00016577440069178811, + "loss": 0.79855287, + "num_input_tokens_seen": 319534384, + "router_z_loss_mlp": 0.09326172, + "step": 3852, + "time_per_iteration": 4.94897198677063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069894, + "balance_loss_mlp": 1.03620529, + "epoch": 0.7412466333205079, + "flos": 669697860096.0, + "grad_norm": 0.05139846534823347, + "language_loss": 0.80732995, + "learning_rate": 0.00016554275412186315, + "loss": 0.81802887, + "num_input_tokens_seen": 319610960, + "router_z_loss_mlp": 0.3371582, + "step": 3853, + "time_per_iteration": 2.798964262008667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069502, + "balance_loss_mlp": 1.0356704, + "epoch": 0.7414390150057715, + "flos": 489038459904.0, + "grad_norm": 0.059331107298497686, + "language_loss": 0.80721259, + "learning_rate": 0.0001653112373997568, + "loss": 0.81790757, + "num_input_tokens_seen": 319683872, + "router_z_loss_mlp": 0.33862305, + "step": 3854, + "time_per_iteration": 2.6622824668884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071808, + "balance_loss_mlp": 1.03878713, + "epoch": 0.7416313966910351, + "flos": 599119566336.0, + "grad_norm": 0.060794627478568314, + "language_loss": 0.74696434, + "learning_rate": 0.0001650798506153517, + "loss": 0.7576825, + "num_input_tokens_seen": 319750032, + "router_z_loss_mlp": 0.33032227, + "step": 3855, + "time_per_iteration": 2.6897103786468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068866, + "balance_loss_mlp": 1.03558254, + "epoch": 0.7418237783762985, + "flos": 542279204352.0, + "grad_norm": 0.06401290816121721, + "language_loss": 0.83928871, + "learning_rate": 0.00016484859385848023, + "loss": 0.84997737, + "num_input_tokens_seen": 319818864, + "router_z_loss_mlp": 0.33276367, + "step": 3856, + "time_per_iteration": 2.6182141304016113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065034, + "balance_loss_mlp": 1.0325613, + "epoch": 0.7420161600615621, + "flos": 543865121280.0, + "grad_norm": 0.060824827203723085, + "language_loss": 0.77091217, + "learning_rate": 0.0001646174672189243, + "loss": 0.78156251, + "num_input_tokens_seen": 319888816, + "router_z_loss_mlp": 0.32470703, + "step": 3857, + "time_per_iteration": 2.639897584915161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072771, + "balance_loss_mlp": 1.039464, + "epoch": 0.7422085417468257, + "flos": 526921860096.0, + "grad_norm": 0.05508256135397888, + "language_loss": 0.80038357, + "learning_rate": 0.00016438647078641488, + "loss": 0.81111133, + "num_input_tokens_seen": 319956176, + "router_z_loss_mlp": 0.33325195, + "step": 3858, + "time_per_iteration": 2.583303213119507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071831, + "balance_loss_mlp": 1.0385952, + "epoch": 0.7424009234320893, + "flos": 508404266496.0, + "grad_norm": 0.05219884306446566, + "language_loss": 0.83017123, + "learning_rate": 0.00016415560465063344, + "loss": 0.84088957, + "num_input_tokens_seen": 320028560, + "router_z_loss_mlp": 0.33251953, + "step": 3859, + "time_per_iteration": 2.7442150115966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069864, + "balance_loss_mlp": 1.03670025, + "epoch": 0.7425933051173528, + "flos": 512347299840.0, + "grad_norm": 0.07638052287216905, + "language_loss": 0.78861916, + "learning_rate": 0.0001639248689012095, + "loss": 0.79931784, + "num_input_tokens_seen": 320096112, + "router_z_loss_mlp": 0.33154297, + "step": 3860, + "time_per_iteration": 2.5846545696258545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067753, + "balance_loss_mlp": 1.03487468, + "epoch": 0.7427856868026164, + "flos": 458034200064.0, + "grad_norm": 0.05020095318806213, + "language_loss": 0.87714618, + "learning_rate": 0.00016369426362772271, + "loss": 0.8878237, + "num_input_tokens_seen": 320168992, + "router_z_loss_mlp": 0.32885742, + "step": 3861, + "time_per_iteration": 2.7977116107940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106857, + "balance_loss_mlp": 1.03576398, + "epoch": 0.74297806848788, + "flos": 604728502272.0, + "grad_norm": 0.04367608298357755, + "language_loss": 0.80370325, + "learning_rate": 0.00016346378891970233, + "loss": 0.81438893, + "num_input_tokens_seen": 320247264, + "router_z_loss_mlp": 0.328125, + "step": 3862, + "time_per_iteration": 2.8144397735595703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067373, + "balance_loss_mlp": 1.03416157, + "epoch": 0.7431704501731435, + "flos": 890971564032.0, + "grad_norm": 0.052584770309724485, + "language_loss": 0.81109643, + "learning_rate": 0.00016323344486662633, + "loss": 0.82177019, + "num_input_tokens_seen": 320338992, + "router_z_loss_mlp": 0.33227539, + "step": 3863, + "time_per_iteration": 3.306062936782837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069805, + "balance_loss_mlp": 1.03566337, + "epoch": 0.7433628318584071, + "flos": 591867896832.0, + "grad_norm": 0.05409036708303953, + "language_loss": 0.78479373, + "learning_rate": 0.00016300323155792247, + "loss": 0.79549176, + "num_input_tokens_seen": 320422096, + "router_z_loss_mlp": 0.34179688, + "step": 3864, + "time_per_iteration": 2.881361961364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070231, + "balance_loss_mlp": 1.03756773, + "epoch": 0.7435552135436706, + "flos": 476896619520.0, + "grad_norm": 0.06261465074360906, + "language_loss": 0.88414448, + "learning_rate": 0.00016277314908296687, + "loss": 0.8948468, + "num_input_tokens_seen": 320492640, + "router_z_loss_mlp": 0.32666016, + "step": 3865, + "time_per_iteration": 2.6607327461242676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068826, + "balance_loss_mlp": 1.03497088, + "epoch": 0.7437475952289342, + "flos": 672874076160.0, + "grad_norm": 0.05871216754162407, + "language_loss": 0.75963724, + "learning_rate": 0.00016254319753108604, + "loss": 0.77032548, + "num_input_tokens_seen": 320565264, + "router_z_loss_mlp": 0.33862305, + "step": 3866, + "time_per_iteration": 2.8663392066955566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072261, + "balance_loss_mlp": 1.03881145, + "epoch": 0.7439399769141978, + "flos": 770094577152.0, + "grad_norm": 0.0657107928380086, + "language_loss": 0.76937765, + "learning_rate": 0.00016231337699155492, + "loss": 0.78010023, + "num_input_tokens_seen": 320647584, + "router_z_loss_mlp": 0.3347168, + "step": 3867, + "time_per_iteration": 3.0015652179718018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069051, + "balance_loss_mlp": 1.03579164, + "epoch": 0.7441323585994614, + "flos": 647462785536.0, + "grad_norm": 0.05480167763007067, + "language_loss": 0.781057, + "learning_rate": 0.0001620836875535977, + "loss": 0.79174751, + "num_input_tokens_seen": 320722752, + "router_z_loss_mlp": 0.33276367, + "step": 3868, + "time_per_iteration": 2.842230796813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067908, + "balance_loss_mlp": 1.03438592, + "epoch": 0.7443247402847248, + "flos": 565091859456.0, + "grad_norm": 0.08182292750671373, + "language_loss": 0.80810648, + "learning_rate": 0.00016185412930638766, + "loss": 0.81878555, + "num_input_tokens_seen": 320802496, + "router_z_loss_mlp": 0.33544922, + "step": 3869, + "time_per_iteration": 2.7977213859558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071043, + "balance_loss_mlp": 1.03761721, + "epoch": 0.7445171219699884, + "flos": 578243446272.0, + "grad_norm": 0.07110615471626963, + "language_loss": 0.82752168, + "learning_rate": 0.00016162470233904765, + "loss": 0.8382321, + "num_input_tokens_seen": 320872496, + "router_z_loss_mlp": 0.33447266, + "step": 3870, + "time_per_iteration": 2.707329273223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106823, + "balance_loss_mlp": 1.03456485, + "epoch": 0.744709503655252, + "flos": 618588679680.0, + "grad_norm": 0.08201563915437336, + "language_loss": 0.81978703, + "learning_rate": 0.00016139540674064856, + "loss": 0.83046937, + "num_input_tokens_seen": 320944992, + "router_z_loss_mlp": 0.33666992, + "step": 3871, + "time_per_iteration": 2.779015302658081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066328, + "balance_loss_mlp": 1.03349781, + "epoch": 0.7449018853405156, + "flos": 528355008000.0, + "grad_norm": 0.053737872907142804, + "language_loss": 0.77632427, + "learning_rate": 0.00016116624260021113, + "loss": 0.78698754, + "num_input_tokens_seen": 321020208, + "router_z_loss_mlp": 0.32836914, + "step": 3872, + "time_per_iteration": 2.748868942260742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068405, + "balance_loss_mlp": 1.03509796, + "epoch": 0.7450942670257792, + "flos": 433088570880.0, + "grad_norm": 0.050066249617561176, + "language_loss": 0.83786619, + "learning_rate": 0.0001609372100067046, + "loss": 0.84855032, + "num_input_tokens_seen": 321085984, + "router_z_loss_mlp": 0.33325195, + "step": 3873, + "time_per_iteration": 2.5261478424072266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068874, + "balance_loss_mlp": 1.03504205, + "epoch": 0.7452866487110427, + "flos": 696562647552.0, + "grad_norm": 0.062485843646331765, + "language_loss": 0.84858561, + "learning_rate": 0.0001607083090490475, + "loss": 0.85927439, + "num_input_tokens_seen": 321163200, + "router_z_loss_mlp": 0.33862305, + "step": 3874, + "time_per_iteration": 2.912550210952759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070703, + "balance_loss_mlp": 1.03718174, + "epoch": 0.7454790303963063, + "flos": 511944247296.0, + "grad_norm": 0.05620990191133866, + "language_loss": 0.80024898, + "learning_rate": 0.00016047953981610714, + "loss": 0.810956, + "num_input_tokens_seen": 321237328, + "router_z_loss_mlp": 0.33544922, + "step": 3875, + "time_per_iteration": 2.7009074687957764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024051, + "balance_loss_mlp": 1.01460981, + "epoch": 0.7456714120815698, + "flos": 1325221088256.0, + "grad_norm": 0.008467942690165917, + "language_loss": 0.7972964, + "learning_rate": 0.00016025090239669916, + "loss": 0.8075369, + "num_input_tokens_seen": 321456192, + "router_z_loss_mlp": 0.09423828, + "step": 3876, + "time_per_iteration": 4.952231168746948 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065537, + "balance_loss_mlp": 1.0320152, + "epoch": 0.7458637937668334, + "flos": 721397767680.0, + "grad_norm": 0.05688245720911951, + "language_loss": 0.8058607, + "learning_rate": 0.0001600223968795889, + "loss": 0.8165161, + "num_input_tokens_seen": 321530560, + "router_z_loss_mlp": 0.33544922, + "step": 3877, + "time_per_iteration": 2.87972092628479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024325, + "balance_loss_mlp": 1.014979, + "epoch": 0.746056175452097, + "flos": 1500761793024.0, + "grad_norm": 0.00806071633609759, + "language_loss": 0.75696075, + "learning_rate": 0.00015979402335349004, + "loss": 0.76720393, + "num_input_tokens_seen": 321760928, + "router_z_loss_mlp": 0.09326172, + "step": 3878, + "time_per_iteration": 4.914839029312134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065825, + "balance_loss_mlp": 1.03335285, + "epoch": 0.7462485571373605, + "flos": 519984493056.0, + "grad_norm": 0.05864389965433195, + "language_loss": 0.81840986, + "learning_rate": 0.00015956578190706483, + "loss": 0.82906812, + "num_input_tokens_seen": 321833248, + "router_z_loss_mlp": 0.32470703, + "step": 3879, + "time_per_iteration": 2.690336227416992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067732, + "balance_loss_mlp": 1.03492546, + "epoch": 0.7464409388226241, + "flos": 480967690752.0, + "grad_norm": 0.05296793730256709, + "language_loss": 0.75717044, + "learning_rate": 0.00015933767262892468, + "loss": 0.76784778, + "num_input_tokens_seen": 321905904, + "router_z_loss_mlp": 0.328125, + "step": 3880, + "time_per_iteration": 2.702094078063965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106723, + "balance_loss_mlp": 1.03418517, + "epoch": 0.7466333205078877, + "flos": 486516989952.0, + "grad_norm": 0.06088844142287201, + "language_loss": 0.81730115, + "learning_rate": 0.00015910969560762927, + "loss": 0.82797348, + "num_input_tokens_seen": 321971920, + "router_z_loss_mlp": 0.33056641, + "step": 3881, + "time_per_iteration": 2.5547542572021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066924, + "balance_loss_mlp": 1.03464174, + "epoch": 0.7468257021931513, + "flos": 611015505408.0, + "grad_norm": 0.05773306272323557, + "language_loss": 0.83265662, + "learning_rate": 0.00015888185093168727, + "loss": 0.84332585, + "num_input_tokens_seen": 322041904, + "router_z_loss_mlp": 0.32275391, + "step": 3882, + "time_per_iteration": 2.7600655555725098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069044, + "balance_loss_mlp": 1.03502131, + "epoch": 0.7470180838784147, + "flos": 533204481024.0, + "grad_norm": 0.06850625099692723, + "language_loss": 0.8104043, + "learning_rate": 0.00015865413868955581, + "loss": 0.82109475, + "num_input_tokens_seen": 322110816, + "router_z_loss_mlp": 0.34057617, + "step": 3883, + "time_per_iteration": 2.6018030643463135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066378, + "balance_loss_mlp": 1.03378606, + "epoch": 0.7472104655636783, + "flos": 739005949440.0, + "grad_norm": 0.05384081039558067, + "language_loss": 0.82672417, + "learning_rate": 0.00015842655896964054, + "loss": 0.83738798, + "num_input_tokens_seen": 322192704, + "router_z_loss_mlp": 0.32592773, + "step": 3884, + "time_per_iteration": 3.021933078765869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065038, + "balance_loss_mlp": 1.03223145, + "epoch": 0.7474028472489419, + "flos": 640007474688.0, + "grad_norm": 0.052763664912519236, + "language_loss": 0.73725951, + "learning_rate": 0.00015819911186029567, + "loss": 0.7479099, + "num_input_tokens_seen": 322263888, + "router_z_loss_mlp": 0.328125, + "step": 3885, + "time_per_iteration": 2.8068392276763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068676, + "balance_loss_mlp": 1.03577399, + "epoch": 0.7475952289342055, + "flos": 589980824064.0, + "grad_norm": 0.05740266756526494, + "language_loss": 0.8658216, + "learning_rate": 0.00015797179744982443, + "loss": 0.87650836, + "num_input_tokens_seen": 322331936, + "router_z_loss_mlp": 0.32910156, + "step": 3886, + "time_per_iteration": 2.7342216968536377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067874, + "balance_loss_mlp": 1.03492451, + "epoch": 0.7477876106194691, + "flos": 487935581184.0, + "grad_norm": 0.05063564499597122, + "language_loss": 0.79109228, + "learning_rate": 0.00015774461582647765, + "loss": 0.80177104, + "num_input_tokens_seen": 322402032, + "router_z_loss_mlp": 0.32958984, + "step": 3887, + "time_per_iteration": 2.617705821990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066709, + "balance_loss_mlp": 1.03390241, + "epoch": 0.7479799923047326, + "flos": 554470507008.0, + "grad_norm": 0.04778068214414316, + "language_loss": 0.81002998, + "learning_rate": 0.00015751756707845505, + "loss": 0.82069701, + "num_input_tokens_seen": 322472512, + "router_z_loss_mlp": 0.328125, + "step": 3888, + "time_per_iteration": 2.611276626586914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067147, + "balance_loss_mlp": 1.03505635, + "epoch": 0.7481723739899961, + "flos": 767037634560.0, + "grad_norm": 0.054687563688018546, + "language_loss": 0.88108873, + "learning_rate": 0.00015729065129390502, + "loss": 0.89176023, + "num_input_tokens_seen": 322555104, + "router_z_loss_mlp": 0.32080078, + "step": 3889, + "time_per_iteration": 3.022294759750366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069902, + "balance_loss_mlp": 1.03614235, + "epoch": 0.7483647556752597, + "flos": 495926364672.0, + "grad_norm": 0.07150557993865005, + "language_loss": 0.81957299, + "learning_rate": 0.0001570638685609241, + "loss": 0.83027202, + "num_input_tokens_seen": 322621904, + "router_z_loss_mlp": 0.33789062, + "step": 3890, + "time_per_iteration": 2.540038585662842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068666, + "balance_loss_mlp": 1.03588343, + "epoch": 0.7485571373605233, + "flos": 472607350272.0, + "grad_norm": 0.055161335390356114, + "language_loss": 0.8031671, + "learning_rate": 0.00015683721896755693, + "loss": 0.81385386, + "num_input_tokens_seen": 322688928, + "router_z_loss_mlp": 0.32788086, + "step": 3891, + "time_per_iteration": 2.5199973583221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015851, + "balance_loss_mlp": 1.00683892, + "epoch": 0.7487495190457868, + "flos": 1553619833856.0, + "grad_norm": 0.004937901566549453, + "language_loss": 0.82210493, + "learning_rate": 0.00015661070260179682, + "loss": 0.83226347, + "num_input_tokens_seen": 322928464, + "router_z_loss_mlp": 0.09033203, + "step": 3892, + "time_per_iteration": 4.912605047225952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068919, + "balance_loss_mlp": 1.03632677, + "epoch": 0.7489419007310504, + "flos": 581566639104.0, + "grad_norm": 0.04880798479848443, + "language_loss": 0.84992248, + "learning_rate": 0.00015638431955158528, + "loss": 0.86061168, + "num_input_tokens_seen": 323002672, + "router_z_loss_mlp": 0.32592773, + "step": 3893, + "time_per_iteration": 2.6795592308044434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066956, + "balance_loss_mlp": 1.03398299, + "epoch": 0.749134282416314, + "flos": 567297616896.0, + "grad_norm": 0.04606226658973748, + "language_loss": 0.80857748, + "learning_rate": 0.00015615806990481186, + "loss": 0.81924701, + "num_input_tokens_seen": 323076480, + "router_z_loss_mlp": 0.32983398, + "step": 3894, + "time_per_iteration": 2.7299861907958984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066927, + "balance_loss_mlp": 1.03433573, + "epoch": 0.7493266641015776, + "flos": 532786871808.0, + "grad_norm": 0.044395679249862555, + "language_loss": 0.8442167, + "learning_rate": 0.00015593195374931452, + "loss": 0.854886, + "num_input_tokens_seen": 323151840, + "router_z_loss_mlp": 0.32592773, + "step": 3895, + "time_per_iteration": 2.725260019302368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066101, + "balance_loss_mlp": 1.03346133, + "epoch": 0.7495190457868411, + "flos": 523338209280.0, + "grad_norm": 0.05913067332521358, + "language_loss": 0.79859447, + "learning_rate": 0.00015570597117287922, + "loss": 0.80925548, + "num_input_tokens_seen": 323223376, + "router_z_loss_mlp": 0.32641602, + "step": 3896, + "time_per_iteration": 2.6577799320220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070214, + "balance_loss_mlp": 1.03762269, + "epoch": 0.7497114274721046, + "flos": 513937598976.0, + "grad_norm": 0.0999283842203671, + "language_loss": 0.77427346, + "learning_rate": 0.0001554801222632406, + "loss": 0.78497565, + "num_input_tokens_seen": 323290288, + "router_z_loss_mlp": 0.32592773, + "step": 3897, + "time_per_iteration": 2.6006200313568115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065155, + "balance_loss_mlp": 1.03239596, + "epoch": 0.7499038091573682, + "flos": 494759467008.0, + "grad_norm": 0.050843654610054065, + "language_loss": 0.85019195, + "learning_rate": 0.00015525440710808052, + "loss": 0.86084348, + "num_input_tokens_seen": 323359568, + "router_z_loss_mlp": 0.32763672, + "step": 3898, + "time_per_iteration": 2.661421775817871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068075, + "balance_loss_mlp": 1.03586483, + "epoch": 0.7500961908426318, + "flos": 737326900224.0, + "grad_norm": 0.05107930467548482, + "language_loss": 0.77678949, + "learning_rate": 0.00015502882579502953, + "loss": 0.78747022, + "num_input_tokens_seen": 323436688, + "router_z_loss_mlp": 0.32202148, + "step": 3899, + "time_per_iteration": 2.9202702045440674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062076, + "balance_loss_mlp": 1.02931714, + "epoch": 0.7502885725278954, + "flos": 533117140992.0, + "grad_norm": 0.046214312949338116, + "language_loss": 0.84483492, + "learning_rate": 0.00015480337841166592, + "loss": 0.85545564, + "num_input_tokens_seen": 323510032, + "router_z_loss_mlp": 0.32763672, + "step": 3900, + "time_per_iteration": 2.704392194747925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070609, + "balance_loss_mlp": 1.03761196, + "epoch": 0.7504809542131589, + "flos": 589017567744.0, + "grad_norm": 0.05276594694020605, + "language_loss": 0.82456982, + "learning_rate": 0.00015457806504551647, + "loss": 0.83527595, + "num_input_tokens_seen": 323588896, + "router_z_loss_mlp": 0.33007812, + "step": 3901, + "time_per_iteration": 2.8369719982147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066454, + "balance_loss_mlp": 1.03376722, + "epoch": 0.7506733358984224, + "flos": 511293883392.0, + "grad_norm": 0.05412460278066938, + "language_loss": 0.78305542, + "learning_rate": 0.0001543528857840554, + "loss": 0.79372001, + "num_input_tokens_seen": 323661280, + "router_z_loss_mlp": 0.3269043, + "step": 3902, + "time_per_iteration": 2.679732084274292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064427, + "balance_loss_mlp": 1.03204942, + "epoch": 0.750865717583686, + "flos": 538990917120.0, + "grad_norm": 0.0614099012114921, + "language_loss": 0.80124992, + "learning_rate": 0.000154127840714705, + "loss": 0.81189418, + "num_input_tokens_seen": 323739200, + "router_z_loss_mlp": 0.32373047, + "step": 3903, + "time_per_iteration": 2.7384650707244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065338, + "balance_loss_mlp": 1.03265119, + "epoch": 0.7510580992689496, + "flos": 476339387904.0, + "grad_norm": 0.0665672194872541, + "language_loss": 0.81678092, + "learning_rate": 0.00015390292992483557, + "loss": 0.82743436, + "num_input_tokens_seen": 323802816, + "router_z_loss_mlp": 0.3269043, + "step": 3904, + "time_per_iteration": 2.489619731903076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106175, + "balance_loss_mlp": 1.02894402, + "epoch": 0.7512504809542132, + "flos": 578755597824.0, + "grad_norm": 0.06071277834491827, + "language_loss": 0.83697867, + "learning_rate": 0.00015367815350176523, + "loss": 0.84759617, + "num_input_tokens_seen": 323879488, + "router_z_loss_mlp": 0.328125, + "step": 3905, + "time_per_iteration": 2.716557025909424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062505, + "balance_loss_mlp": 1.02943611, + "epoch": 0.7514428626394767, + "flos": 418435435008.0, + "grad_norm": 0.05426428820628694, + "language_loss": 0.82564658, + "learning_rate": 0.00015345351153275987, + "loss": 0.83627158, + "num_input_tokens_seen": 323944512, + "router_z_loss_mlp": 0.33081055, + "step": 3906, + "time_per_iteration": 2.522923707962036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065336, + "balance_loss_mlp": 1.03262544, + "epoch": 0.7516352443247403, + "flos": 640736414208.0, + "grad_norm": 0.05433907321222643, + "language_loss": 0.80729043, + "learning_rate": 0.00015322900410503332, + "loss": 0.81794381, + "num_input_tokens_seen": 324020688, + "router_z_loss_mlp": 0.32714844, + "step": 3907, + "time_per_iteration": 2.793536424636841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064989, + "balance_loss_mlp": 1.03189635, + "epoch": 0.7518276260100039, + "flos": 580700897280.0, + "grad_norm": 0.05951130469098692, + "language_loss": 0.76875365, + "learning_rate": 0.00015300463130574703, + "loss": 0.77940357, + "num_input_tokens_seen": 324098080, + "router_z_loss_mlp": 0.33105469, + "step": 3908, + "time_per_iteration": 2.8399226665496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063261, + "balance_loss_mlp": 1.03045464, + "epoch": 0.7520200076952674, + "flos": 687025234944.0, + "grad_norm": 0.0651669879699934, + "language_loss": 0.81970477, + "learning_rate": 0.00015278039322201033, + "loss": 0.83033741, + "num_input_tokens_seen": 324183968, + "router_z_loss_mlp": 0.328125, + "step": 3909, + "time_per_iteration": 2.9373419284820557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063542, + "balance_loss_mlp": 1.02985382, + "epoch": 0.7522123893805309, + "flos": 486196895232.0, + "grad_norm": 0.06049213601321292, + "language_loss": 0.79440963, + "learning_rate": 0.00015255628994088004, + "loss": 0.80504501, + "num_input_tokens_seen": 324249568, + "router_z_loss_mlp": 0.3371582, + "step": 3910, + "time_per_iteration": 2.528364419937134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065253, + "balance_loss_mlp": 1.03175521, + "epoch": 0.7524047710657945, + "flos": 818581800960.0, + "grad_norm": 0.05892068173673864, + "language_loss": 0.75070155, + "learning_rate": 0.00015233232154936082, + "loss": 0.76135409, + "num_input_tokens_seen": 324345312, + "router_z_loss_mlp": 0.33520508, + "step": 3911, + "time_per_iteration": 3.230201244354248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062613, + "balance_loss_mlp": 1.02916312, + "epoch": 0.7525971527510581, + "flos": 699191806464.0, + "grad_norm": 0.055756434069827554, + "language_loss": 0.76463896, + "learning_rate": 0.0001521084881344048, + "loss": 0.7752651, + "num_input_tokens_seen": 324419056, + "router_z_loss_mlp": 0.3347168, + "step": 3912, + "time_per_iteration": 2.8348512649536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065733, + "balance_loss_mlp": 1.03216362, + "epoch": 0.7527895344363217, + "flos": 633497891328.0, + "grad_norm": 0.050850444756768094, + "language_loss": 0.86350536, + "learning_rate": 0.00015188478978291208, + "loss": 0.87416273, + "num_input_tokens_seen": 324490848, + "router_z_loss_mlp": 0.3359375, + "step": 3913, + "time_per_iteration": 2.744290828704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067061, + "balance_loss_mlp": 1.03404021, + "epoch": 0.7529819161215853, + "flos": 562555832832.0, + "grad_norm": 0.05433821617011464, + "language_loss": 0.8621949, + "learning_rate": 0.00015166122658173014, + "loss": 0.8728655, + "num_input_tokens_seen": 324565648, + "router_z_loss_mlp": 0.33032227, + "step": 3914, + "time_per_iteration": 2.8117570877075195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066949, + "balance_loss_mlp": 1.03368926, + "epoch": 0.7531742978068487, + "flos": 690344045568.0, + "grad_norm": 0.048975254587736855, + "language_loss": 0.88076222, + "learning_rate": 0.00015143779861765332, + "loss": 0.89143169, + "num_input_tokens_seen": 324642832, + "router_z_loss_mlp": 0.33251953, + "step": 3915, + "time_per_iteration": 2.8815720081329346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063708, + "balance_loss_mlp": 1.03140223, + "epoch": 0.7533666794921123, + "flos": 680800840704.0, + "grad_norm": 0.04986662461838111, + "language_loss": 0.81009239, + "learning_rate": 0.00015121450597742458, + "loss": 0.82072949, + "num_input_tokens_seen": 324718336, + "router_z_loss_mlp": 0.32299805, + "step": 3916, + "time_per_iteration": 2.80761456489563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010608, + "balance_loss_mlp": 1.02830386, + "epoch": 0.7535590611773759, + "flos": 623384308224.0, + "grad_norm": 0.05782496092002166, + "language_loss": 0.78096646, + "learning_rate": 0.00015099134874773369, + "loss": 0.79157448, + "num_input_tokens_seen": 324787744, + "router_z_loss_mlp": 0.32495117, + "step": 3917, + "time_per_iteration": 2.7233426570892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065187, + "balance_loss_mlp": 1.03149819, + "epoch": 0.7537514428626395, + "flos": 519162421248.0, + "grad_norm": 0.0518571632225719, + "language_loss": 0.80421233, + "learning_rate": 0.00015076832701521793, + "loss": 0.81486416, + "num_input_tokens_seen": 324863280, + "router_z_loss_mlp": 0.3371582, + "step": 3918, + "time_per_iteration": 2.6993284225463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062901, + "balance_loss_mlp": 1.02971327, + "epoch": 0.753943824547903, + "flos": 723309571584.0, + "grad_norm": 0.06554029395428207, + "language_loss": 0.82133907, + "learning_rate": 0.000150545440866462, + "loss": 0.83196807, + "num_input_tokens_seen": 324949600, + "router_z_loss_mlp": 0.33203125, + "step": 3919, + "time_per_iteration": 2.9902353286743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063028, + "balance_loss_mlp": 1.03050804, + "epoch": 0.7541362062331666, + "flos": 437318203392.0, + "grad_norm": 0.051833460096662155, + "language_loss": 0.78462708, + "learning_rate": 0.000150322690387998, + "loss": 0.79525733, + "num_input_tokens_seen": 325013808, + "router_z_loss_mlp": 0.32519531, + "step": 3920, + "time_per_iteration": 2.496290922164917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106303, + "balance_loss_mlp": 1.02941298, + "epoch": 0.7543285879184302, + "flos": 565007491584.0, + "grad_norm": 0.05213671641073607, + "language_loss": 0.75242233, + "learning_rate": 0.00015010007566630535, + "loss": 0.76305258, + "num_input_tokens_seen": 325084832, + "router_z_loss_mlp": 0.33642578, + "step": 3921, + "time_per_iteration": 2.7238450050354004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065892, + "balance_loss_mlp": 1.03210807, + "epoch": 0.7545209696036937, + "flos": 520781833728.0, + "grad_norm": 0.060725267986870404, + "language_loss": 0.8104378, + "learning_rate": 0.00014987759678781077, + "loss": 0.82109678, + "num_input_tokens_seen": 325155120, + "router_z_loss_mlp": 0.33813477, + "step": 3922, + "time_per_iteration": 2.596788167953491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066538, + "balance_loss_mlp": 1.03208637, + "epoch": 0.7547133512889573, + "flos": 615782020608.0, + "grad_norm": 0.05117423221869946, + "language_loss": 0.82205606, + "learning_rate": 0.00014965525383888795, + "loss": 0.83272147, + "num_input_tokens_seen": 325235632, + "router_z_loss_mlp": 0.3449707, + "step": 3923, + "time_per_iteration": 2.7719502449035645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106299, + "balance_loss_mlp": 1.0298022, + "epoch": 0.7549057329742208, + "flos": 750522157056.0, + "grad_norm": 0.05672347636966434, + "language_loss": 0.72166836, + "learning_rate": 0.00014943304690585851, + "loss": 0.73229825, + "num_input_tokens_seen": 325309696, + "router_z_loss_mlp": 0.33203125, + "step": 3924, + "time_per_iteration": 2.90588116645813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063807, + "balance_loss_mlp": 1.03069079, + "epoch": 0.7550981146594844, + "flos": 514193674752.0, + "grad_norm": 0.06004038441284508, + "language_loss": 0.79123962, + "learning_rate": 0.0001492109760749908, + "loss": 0.80187768, + "num_input_tokens_seen": 325375744, + "router_z_loss_mlp": 0.33129883, + "step": 3925, + "time_per_iteration": 2.573479652404785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062523, + "balance_loss_mlp": 1.02900124, + "epoch": 0.755290496344748, + "flos": 521756674560.0, + "grad_norm": 0.04754610420203459, + "language_loss": 0.79945302, + "learning_rate": 0.00014898904143250002, + "loss": 0.81007826, + "num_input_tokens_seen": 325448384, + "router_z_loss_mlp": 0.33544922, + "step": 3926, + "time_per_iteration": 2.6605517864227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011981, + "balance_loss_mlp": 1.00320745, + "epoch": 0.7554828780300116, + "flos": 1413845521920.0, + "grad_norm": 0.009243318676460378, + "language_loss": 0.75755203, + "learning_rate": 0.00014876724306454886, + "loss": 0.76767182, + "num_input_tokens_seen": 325678672, + "router_z_loss_mlp": 0.08789062, + "step": 3927, + "time_per_iteration": 4.911595106124878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066882, + "balance_loss_mlp": 1.03343201, + "epoch": 0.7556752597152752, + "flos": 556676264448.0, + "grad_norm": 0.06225847362151781, + "language_loss": 0.80114925, + "learning_rate": 0.0001485455810572474, + "loss": 0.81181806, + "num_input_tokens_seen": 325746656, + "router_z_loss_mlp": 0.3347168, + "step": 3928, + "time_per_iteration": 2.6221096515655518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061736, + "balance_loss_mlp": 1.02864373, + "epoch": 0.7558676414005386, + "flos": 563363347968.0, + "grad_norm": 0.0430287394272786, + "language_loss": 0.83688951, + "learning_rate": 0.00014832405549665236, + "loss": 0.84750688, + "num_input_tokens_seen": 325820304, + "router_z_loss_mlp": 0.33105469, + "step": 3929, + "time_per_iteration": 2.687077760696411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062194, + "balance_loss_mlp": 1.02898264, + "epoch": 0.7560600230858022, + "flos": 561089189376.0, + "grad_norm": 0.072300166117579, + "language_loss": 0.78684491, + "learning_rate": 0.00014810266646876746, + "loss": 0.79746687, + "num_input_tokens_seen": 325895584, + "router_z_loss_mlp": 0.33227539, + "step": 3930, + "time_per_iteration": 2.784480571746826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060871, + "balance_loss_mlp": 1.02703977, + "epoch": 0.7562524047710658, + "flos": 719232708096.0, + "grad_norm": 0.05835242926257929, + "language_loss": 0.7758401, + "learning_rate": 0.00014788141405954364, + "loss": 0.78644884, + "num_input_tokens_seen": 325976752, + "router_z_loss_mlp": 0.33862305, + "step": 3931, + "time_per_iteration": 2.9784233570098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066404, + "balance_loss_mlp": 1.03345442, + "epoch": 0.7564447864563294, + "flos": 543086719488.0, + "grad_norm": 0.059110171964688825, + "language_loss": 0.84827656, + "learning_rate": 0.00014766029835487865, + "loss": 0.85894054, + "num_input_tokens_seen": 326047152, + "router_z_loss_mlp": 0.32958984, + "step": 3932, + "time_per_iteration": 2.6907904148101807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062333, + "balance_loss_mlp": 1.02945542, + "epoch": 0.7566371681415929, + "flos": 725484805632.0, + "grad_norm": 0.06258669653948258, + "language_loss": 0.79361248, + "learning_rate": 0.0001474393194406173, + "loss": 0.80423582, + "num_input_tokens_seen": 326119056, + "router_z_loss_mlp": 0.32885742, + "step": 3933, + "time_per_iteration": 2.8968892097473145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062749, + "balance_loss_mlp": 1.02991855, + "epoch": 0.7568295498268565, + "flos": 576274825728.0, + "grad_norm": 0.05981896319872157, + "language_loss": 0.79737186, + "learning_rate": 0.00014721847740255112, + "loss": 0.80799937, + "num_input_tokens_seen": 326196736, + "router_z_loss_mlp": 0.32836914, + "step": 3934, + "time_per_iteration": 2.7890961170196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011573, + "balance_loss_mlp": 1.00279939, + "epoch": 0.75702193151212, + "flos": 1519273594368.0, + "grad_norm": 0.004234862497934677, + "language_loss": 0.73911923, + "learning_rate": 0.00014699777232641853, + "loss": 0.74923497, + "num_input_tokens_seen": 326404752, + "router_z_loss_mlp": 0.08789062, + "step": 3935, + "time_per_iteration": 4.601314544677734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061211, + "balance_loss_mlp": 1.02866662, + "epoch": 0.7572143131973836, + "flos": 525218079744.0, + "grad_norm": 0.08729501831475094, + "language_loss": 0.78364342, + "learning_rate": 0.00014677720429790526, + "loss": 0.7942555, + "num_input_tokens_seen": 326472832, + "router_z_loss_mlp": 0.32543945, + "step": 3936, + "time_per_iteration": 2.5926949977874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061936, + "balance_loss_mlp": 1.0290581, + "epoch": 0.7574066948826472, + "flos": 550467836928.0, + "grad_norm": 0.04449678335712254, + "language_loss": 0.84388995, + "learning_rate": 0.0001465567734026429, + "loss": 0.85450935, + "num_input_tokens_seen": 326546976, + "router_z_loss_mlp": 0.32885742, + "step": 3937, + "time_per_iteration": 2.673203706741333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064975, + "balance_loss_mlp": 1.03183448, + "epoch": 0.7575990765679107, + "flos": 395682416640.0, + "grad_norm": 0.06471305080336787, + "language_loss": 0.82730478, + "learning_rate": 0.00014633647972621034, + "loss": 0.83795452, + "num_input_tokens_seen": 326609296, + "router_z_loss_mlp": 0.33154297, + "step": 3938, + "time_per_iteration": 2.4455604553222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067639, + "balance_loss_mlp": 1.03399837, + "epoch": 0.7577914582531743, + "flos": 584742855168.0, + "grad_norm": 0.04609831927497642, + "language_loss": 0.86192119, + "learning_rate": 0.00014611632335413354, + "loss": 0.87259758, + "num_input_tokens_seen": 326687168, + "router_z_loss_mlp": 0.33666992, + "step": 3939, + "time_per_iteration": 2.7661402225494385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068002, + "balance_loss_mlp": 1.03526759, + "epoch": 0.7579838399384379, + "flos": 820604265984.0, + "grad_norm": 0.05221570879511052, + "language_loss": 0.82420516, + "learning_rate": 0.00014589630437188456, + "loss": 0.83488512, + "num_input_tokens_seen": 326777760, + "router_z_loss_mlp": 0.32739258, + "step": 3940, + "time_per_iteration": 3.1596429347991943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010684, + "balance_loss_mlp": 1.03578401, + "epoch": 0.7581762216237015, + "flos": 443664843264.0, + "grad_norm": 0.0650937472679739, + "language_loss": 0.78844047, + "learning_rate": 0.00014567642286488253, + "loss": 0.79912448, + "num_input_tokens_seen": 326843952, + "router_z_loss_mlp": 0.32617188, + "step": 3941, + "time_per_iteration": 2.515453577041626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067786, + "balance_loss_mlp": 1.03505135, + "epoch": 0.7583686033089649, + "flos": 540624886272.0, + "grad_norm": 0.060324478977950624, + "language_loss": 0.7890631, + "learning_rate": 0.00014545667891849258, + "loss": 0.79974091, + "num_input_tokens_seen": 326911296, + "router_z_loss_mlp": 0.32739258, + "step": 3942, + "time_per_iteration": 2.632852554321289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068794, + "balance_loss_mlp": 1.03648806, + "epoch": 0.7585609849942285, + "flos": 522332845056.0, + "grad_norm": 0.05155975595459647, + "language_loss": 0.8239159, + "learning_rate": 0.00014523707261802733, + "loss": 0.83460391, + "num_input_tokens_seen": 326977776, + "router_z_loss_mlp": 0.32299805, + "step": 3943, + "time_per_iteration": 2.6377763748168945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074539, + "balance_loss_mlp": 1.04170835, + "epoch": 0.7587533666794921, + "flos": 541599727104.0, + "grad_norm": 0.05698795626816005, + "language_loss": 0.81395125, + "learning_rate": 0.00014501760404874527, + "loss": 0.82469666, + "num_input_tokens_seen": 327050240, + "router_z_loss_mlp": 0.32836914, + "step": 3944, + "time_per_iteration": 2.690519332885742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073349, + "balance_loss_mlp": 1.04116213, + "epoch": 0.7589457483647557, + "flos": 606131126784.0, + "grad_norm": 0.06183156174415775, + "language_loss": 0.85775477, + "learning_rate": 0.00014479827329585176, + "loss": 0.86848831, + "num_input_tokens_seen": 327119952, + "router_z_loss_mlp": 0.32177734, + "step": 3945, + "time_per_iteration": 2.7058537006378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066895, + "balance_loss_mlp": 1.03449392, + "epoch": 0.7591381300500193, + "flos": 554821125120.0, + "grad_norm": 0.04920928189565755, + "language_loss": 0.84866571, + "learning_rate": 0.00014457908044449846, + "loss": 0.85933459, + "num_input_tokens_seen": 327192640, + "router_z_loss_mlp": 0.32397461, + "step": 3946, + "time_per_iteration": 2.785212516784668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071888, + "balance_loss_mlp": 1.03963017, + "epoch": 0.7593305117352828, + "flos": 529399660032.0, + "grad_norm": 0.05182175118482316, + "language_loss": 0.82816386, + "learning_rate": 0.00014436002557978371, + "loss": 0.8388828, + "num_input_tokens_seen": 327271008, + "router_z_loss_mlp": 0.32250977, + "step": 3947, + "time_per_iteration": 2.784555196762085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01029365, + "balance_loss_mlp": 1.02059126, + "epoch": 0.7595228934205464, + "flos": 1502020412928.0, + "grad_norm": 0.01048294354444643, + "language_loss": 0.76643145, + "learning_rate": 0.00014414110878675201, + "loss": 0.77672517, + "num_input_tokens_seen": 327505392, + "router_z_loss_mlp": 0.08789062, + "step": 3948, + "time_per_iteration": 4.8788769245147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072982, + "balance_loss_mlp": 1.0407002, + "epoch": 0.7597152751058099, + "flos": 455290149888.0, + "grad_norm": 0.0492093123378979, + "language_loss": 0.79732686, + "learning_rate": 0.0001439223301503945, + "loss": 0.80805671, + "num_input_tokens_seen": 327569392, + "router_z_loss_mlp": 0.32275391, + "step": 3949, + "time_per_iteration": 2.548963785171509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071385, + "balance_loss_mlp": 1.0404619, + "epoch": 0.7599076567910735, + "flos": 685135190016.0, + "grad_norm": 0.05900471318664728, + "language_loss": 0.76152921, + "learning_rate": 0.00014370368975564834, + "loss": 0.77224308, + "num_input_tokens_seen": 327648304, + "router_z_loss_mlp": 0.30883789, + "step": 3950, + "time_per_iteration": 2.913701295852661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107191, + "balance_loss_mlp": 1.03915179, + "epoch": 0.760100038476337, + "flos": 532092837888.0, + "grad_norm": 0.059009621355687734, + "language_loss": 0.83279252, + "learning_rate": 0.00014348518768739766, + "loss": 0.84351158, + "num_input_tokens_seen": 327725600, + "router_z_loss_mlp": 0.32763672, + "step": 3951, + "time_per_iteration": 2.7261831760406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01022819, + "balance_loss_mlp": 1.01409268, + "epoch": 0.7602924201616006, + "flos": 1470952134144.0, + "grad_norm": 0.0078103610005334605, + "language_loss": 0.7672804, + "learning_rate": 0.00014326682403047243, + "loss": 0.77750862, + "num_input_tokens_seen": 327954048, + "router_z_loss_mlp": 0.08740234, + "step": 3952, + "time_per_iteration": 4.8437769412994385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072298, + "balance_loss_mlp": 1.04094601, + "epoch": 0.7604848018468642, + "flos": 774280539648.0, + "grad_norm": 0.04997444218606865, + "language_loss": 0.86468828, + "learning_rate": 0.00014304859886964867, + "loss": 0.87541121, + "num_input_tokens_seen": 328034656, + "router_z_loss_mlp": 0.31323242, + "step": 3953, + "time_per_iteration": 3.0284688472747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074246, + "balance_loss_mlp": 1.04148698, + "epoch": 0.7606771835321278, + "flos": 557917355520.0, + "grad_norm": 0.06472890254950428, + "language_loss": 0.83519757, + "learning_rate": 0.00014283051228964878, + "loss": 0.84594011, + "num_input_tokens_seen": 328107264, + "router_z_loss_mlp": 0.32763672, + "step": 3954, + "time_per_iteration": 2.783090591430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067108, + "balance_loss_mlp": 1.03527939, + "epoch": 0.7608695652173914, + "flos": 525139504128.0, + "grad_norm": 0.05417243250507387, + "language_loss": 0.82754749, + "learning_rate": 0.00014261256437514197, + "loss": 0.83821857, + "num_input_tokens_seen": 328177168, + "router_z_loss_mlp": 0.31811523, + "step": 3955, + "time_per_iteration": 2.644597291946411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067658, + "balance_loss_mlp": 1.03497052, + "epoch": 0.7610619469026548, + "flos": 614757717504.0, + "grad_norm": 0.055555468337999576, + "language_loss": 0.82313621, + "learning_rate": 0.0001423947552107428, + "loss": 0.83381271, + "num_input_tokens_seen": 328245360, + "router_z_loss_mlp": 0.3269043, + "step": 3956, + "time_per_iteration": 2.7361013889312744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069431, + "balance_loss_mlp": 1.03648186, + "epoch": 0.7612543285879184, + "flos": 862992313344.0, + "grad_norm": 0.0569357592258459, + "language_loss": 0.77433807, + "learning_rate": 0.00014217708488101243, + "loss": 0.78503239, + "num_input_tokens_seen": 328326560, + "router_z_loss_mlp": 0.32958984, + "step": 3957, + "time_per_iteration": 3.050961494445801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074949, + "balance_loss_mlp": 1.04271495, + "epoch": 0.761446710273182, + "flos": 553392359424.0, + "grad_norm": 0.06767693941608623, + "language_loss": 0.77007008, + "learning_rate": 0.0001419595534704579, + "loss": 0.78081954, + "num_input_tokens_seen": 328395760, + "router_z_loss_mlp": 0.32226562, + "step": 3958, + "time_per_iteration": 2.660353899002075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062214, + "balance_loss_mlp": 1.03105259, + "epoch": 0.7616390919584456, + "flos": 467107513344.0, + "grad_norm": 0.049028323039667754, + "language_loss": 0.80953354, + "learning_rate": 0.00014174216106353237, + "loss": 0.82015562, + "num_input_tokens_seen": 328464560, + "router_z_loss_mlp": 0.3112793, + "step": 3959, + "time_per_iteration": 2.5838327407836914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068236, + "balance_loss_mlp": 1.03542924, + "epoch": 0.7618314736437091, + "flos": 498181584384.0, + "grad_norm": 0.05923666711399137, + "language_loss": 0.75957918, + "learning_rate": 0.00014152490774463512, + "loss": 0.77026153, + "num_input_tokens_seen": 328532640, + "router_z_loss_mlp": 0.328125, + "step": 3960, + "time_per_iteration": 2.629302978515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068987, + "balance_loss_mlp": 1.03639507, + "epoch": 0.7620238553289727, + "flos": 434319487488.0, + "grad_norm": 0.07059591088547341, + "language_loss": 0.8700611, + "learning_rate": 0.00014130779359811135, + "loss": 0.88075095, + "num_input_tokens_seen": 328595392, + "router_z_loss_mlp": 0.32592773, + "step": 3961, + "time_per_iteration": 2.485924243927002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067259, + "balance_loss_mlp": 1.03528666, + "epoch": 0.7622162370142362, + "flos": 663962296320.0, + "grad_norm": 0.05047068415952909, + "language_loss": 0.85704315, + "learning_rate": 0.0001410908187082521, + "loss": 0.86771578, + "num_input_tokens_seen": 328676368, + "router_z_loss_mlp": 0.31958008, + "step": 3962, + "time_per_iteration": 2.8265414237976074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065802, + "balance_loss_mlp": 1.03313875, + "epoch": 0.7624086186994998, + "flos": 557700567552.0, + "grad_norm": 0.05430861505422096, + "language_loss": 0.82810938, + "learning_rate": 0.0001408739831592949, + "loss": 0.83876741, + "num_input_tokens_seen": 328745136, + "router_z_loss_mlp": 0.32666016, + "step": 3963, + "time_per_iteration": 2.661726236343384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067961, + "balance_loss_mlp": 1.03529739, + "epoch": 0.7626010003847634, + "flos": 628844857344.0, + "grad_norm": 0.06042473159086171, + "language_loss": 0.77454793, + "learning_rate": 0.0001406572870354224, + "loss": 0.78522754, + "num_input_tokens_seen": 328820384, + "router_z_loss_mlp": 0.32666016, + "step": 3964, + "time_per_iteration": 2.7862119674682617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068961, + "balance_loss_mlp": 1.03706062, + "epoch": 0.7627933820700269, + "flos": 437716873728.0, + "grad_norm": 0.04673534263309711, + "language_loss": 0.86767244, + "learning_rate": 0.00014044073042076337, + "loss": 0.87836206, + "num_input_tokens_seen": 328884976, + "router_z_loss_mlp": 0.31884766, + "step": 3965, + "time_per_iteration": 2.4798128604888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069459, + "balance_loss_mlp": 1.03765345, + "epoch": 0.7629857637552905, + "flos": 532456602624.0, + "grad_norm": 0.04658863025626681, + "language_loss": 0.88987994, + "learning_rate": 0.00014022431339939302, + "loss": 0.90057456, + "num_input_tokens_seen": 328957792, + "router_z_loss_mlp": 0.31787109, + "step": 3966, + "time_per_iteration": 2.636894702911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067525, + "balance_loss_mlp": 1.03479052, + "epoch": 0.7631781454405541, + "flos": 679737249792.0, + "grad_norm": 0.08316975322842361, + "language_loss": 0.77961999, + "learning_rate": 0.00014000803605533163, + "loss": 0.79029524, + "num_input_tokens_seen": 329034960, + "router_z_loss_mlp": 0.32739258, + "step": 3967, + "time_per_iteration": 2.8040103912353516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065892, + "balance_loss_mlp": 1.03344274, + "epoch": 0.7633705271258177, + "flos": 507246133248.0, + "grad_norm": 0.05895392031680787, + "language_loss": 0.83634377, + "learning_rate": 0.00013979189847254553, + "loss": 0.84700263, + "num_input_tokens_seen": 329100848, + "router_z_loss_mlp": 0.32446289, + "step": 3968, + "time_per_iteration": 2.5431933403015137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067532, + "balance_loss_mlp": 1.03501129, + "epoch": 0.7635629088110811, + "flos": 618574123008.0, + "grad_norm": 0.055607531947043785, + "language_loss": 0.80514443, + "learning_rate": 0.00013957590073494674, + "loss": 0.81581974, + "num_input_tokens_seen": 329181120, + "router_z_loss_mlp": 0.32519531, + "step": 3969, + "time_per_iteration": 2.8017959594726562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064277, + "balance_loss_mlp": 1.03232884, + "epoch": 0.7637552904963447, + "flos": 638140750848.0, + "grad_norm": 0.26403384502939975, + "language_loss": 0.78649521, + "learning_rate": 0.0001393600429263931, + "loss": 0.79713798, + "num_input_tokens_seen": 329249888, + "router_z_loss_mlp": 0.31933594, + "step": 3970, + "time_per_iteration": 4.2505412101745605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0100666, + "balance_loss_mlp": 0.99793345, + "epoch": 0.7639476721816083, + "flos": 1562359905792.0, + "grad_norm": 0.004510519200430985, + "language_loss": 0.74744886, + "learning_rate": 0.00013914432513068792, + "loss": 0.75751543, + "num_input_tokens_seen": 329483824, + "router_z_loss_mlp": 0.08740234, + "step": 3971, + "time_per_iteration": 4.917391777038574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063864, + "balance_loss_mlp": 1.03112936, + "epoch": 0.7641400538668719, + "flos": 495729925632.0, + "grad_norm": 0.05348736526149064, + "language_loss": 0.81438577, + "learning_rate": 0.0001389287474315804, + "loss": 0.82502437, + "num_input_tokens_seen": 329553536, + "router_z_loss_mlp": 0.32739258, + "step": 3972, + "time_per_iteration": 2.611975908279419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063623, + "balance_loss_mlp": 1.03153205, + "epoch": 0.7643324355521355, + "flos": 578173635072.0, + "grad_norm": 0.05070273758495156, + "language_loss": 0.7976076, + "learning_rate": 0.00013871330991276505, + "loss": 0.80824381, + "num_input_tokens_seen": 329621856, + "router_z_loss_mlp": 0.32080078, + "step": 3973, + "time_per_iteration": 2.6702983379364014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106582, + "balance_loss_mlp": 1.03318095, + "epoch": 0.764524817237399, + "flos": 784472698368.0, + "grad_norm": 0.053475096213737486, + "language_loss": 0.80356216, + "learning_rate": 0.00013849801265788247, + "loss": 0.81422037, + "num_input_tokens_seen": 329708192, + "router_z_loss_mlp": 0.32641602, + "step": 3974, + "time_per_iteration": 3.00087571144104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066526, + "balance_loss_mlp": 1.03357661, + "epoch": 0.7647171989226625, + "flos": 526025594880.0, + "grad_norm": 0.054787050143816365, + "language_loss": 0.82488281, + "learning_rate": 0.00013828285575051818, + "loss": 0.83554804, + "num_input_tokens_seen": 329774704, + "router_z_loss_mlp": 0.32958984, + "step": 3975, + "time_per_iteration": 2.6055147647857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061125, + "balance_loss_mlp": 1.0279367, + "epoch": 0.7649095806079261, + "flos": 554589780480.0, + "grad_norm": 0.05436611510263978, + "language_loss": 0.84129888, + "learning_rate": 0.0001380678392742035, + "loss": 0.85191011, + "num_input_tokens_seen": 329846432, + "router_z_loss_mlp": 0.33203125, + "step": 3976, + "time_per_iteration": 2.6914188861846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106537, + "balance_loss_mlp": 1.03232551, + "epoch": 0.7651019622931897, + "flos": 648836296704.0, + "grad_norm": 0.051149264081770666, + "language_loss": 0.84838861, + "learning_rate": 0.00013785296331241526, + "loss": 0.85904235, + "num_input_tokens_seen": 329926336, + "router_z_loss_mlp": 0.33056641, + "step": 3977, + "time_per_iteration": 2.866154670715332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064487, + "balance_loss_mlp": 1.03089428, + "epoch": 0.7652943439784533, + "flos": 1046034971136.0, + "grad_norm": 0.05614197674370758, + "language_loss": 0.87043619, + "learning_rate": 0.00013763822794857583, + "loss": 0.8810811, + "num_input_tokens_seen": 330009536, + "router_z_loss_mlp": 0.33618164, + "step": 3978, + "time_per_iteration": 3.309242010116577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062044, + "balance_loss_mlp": 1.02947557, + "epoch": 0.7654867256637168, + "flos": 504085883904.0, + "grad_norm": 0.05878573704619195, + "language_loss": 0.89744586, + "learning_rate": 0.00013742363326605278, + "loss": 0.90806627, + "num_input_tokens_seen": 330083264, + "router_z_loss_mlp": 0.32568359, + "step": 3979, + "time_per_iteration": 2.687633991241455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060607, + "balance_loss_mlp": 1.02789593, + "epoch": 0.7656791073489804, + "flos": 574422658560.0, + "grad_norm": 0.055229141283006315, + "language_loss": 0.78390539, + "learning_rate": 0.00013720917934815935, + "loss": 0.79451144, + "num_input_tokens_seen": 330157120, + "router_z_loss_mlp": 0.32714844, + "step": 3980, + "time_per_iteration": 2.7192299365997314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106176, + "balance_loss_mlp": 1.02876329, + "epoch": 0.765871489034244, + "flos": 492568266240.0, + "grad_norm": 0.11784191582460708, + "language_loss": 0.82716662, + "learning_rate": 0.00013699486627815344, + "loss": 0.83778423, + "num_input_tokens_seen": 330224560, + "router_z_loss_mlp": 0.33007812, + "step": 3981, + "time_per_iteration": 2.5523879528045654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066983, + "balance_loss_mlp": 1.03386712, + "epoch": 0.7660638707195075, + "flos": 485769111552.0, + "grad_norm": 0.048709081947545384, + "language_loss": 0.82393169, + "learning_rate": 0.00013678069413923928, + "loss": 0.83460152, + "num_input_tokens_seen": 330292000, + "router_z_loss_mlp": 0.33129883, + "step": 3982, + "time_per_iteration": 2.5948498249053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062511, + "balance_loss_mlp": 1.03034854, + "epoch": 0.766256252404771, + "flos": 444059131392.0, + "grad_norm": 0.05195057178385164, + "language_loss": 0.81826979, + "learning_rate": 0.00013656666301456555, + "loss": 0.82889485, + "num_input_tokens_seen": 330357472, + "router_z_loss_mlp": 0.3215332, + "step": 3983, + "time_per_iteration": 2.5596601963043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063593, + "balance_loss_mlp": 1.02980876, + "epoch": 0.7664486340900346, + "flos": 484922308608.0, + "grad_norm": 0.08343651185872063, + "language_loss": 0.84138393, + "learning_rate": 0.0001363527729872267, + "loss": 0.85201979, + "num_input_tokens_seen": 330427792, + "router_z_loss_mlp": 0.33813477, + "step": 3984, + "time_per_iteration": 2.6182045936584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065831, + "balance_loss_mlp": 1.03354931, + "epoch": 0.7666410157752982, + "flos": 645905981952.0, + "grad_norm": 0.1262618740109736, + "language_loss": 0.76256335, + "learning_rate": 0.00013613902414026207, + "loss": 0.77322161, + "num_input_tokens_seen": 330500320, + "router_z_loss_mlp": 0.32275391, + "step": 3985, + "time_per_iteration": 2.7776031494140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106424, + "balance_loss_mlp": 1.03079021, + "epoch": 0.7668333974605618, + "flos": 773964827136.0, + "grad_norm": 0.050561982196081254, + "language_loss": 0.8239125, + "learning_rate": 0.00013592541655665642, + "loss": 0.83455491, + "num_input_tokens_seen": 330581696, + "router_z_loss_mlp": 0.3347168, + "step": 3986, + "time_per_iteration": 2.952242374420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064817, + "balance_loss_mlp": 1.03136706, + "epoch": 0.7670257791458254, + "flos": 613200913920.0, + "grad_norm": 0.052879642645961566, + "language_loss": 0.85094202, + "learning_rate": 0.00013571195031933947, + "loss": 0.86159021, + "num_input_tokens_seen": 330648000, + "router_z_loss_mlp": 0.33447266, + "step": 3987, + "time_per_iteration": 2.7266581058502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01005099, + "balance_loss_mlp": 0.9958964, + "epoch": 0.7672181608310888, + "flos": 1484608670208.0, + "grad_norm": 0.011043844961489012, + "language_loss": 0.80481339, + "learning_rate": 0.00013549862551118626, + "loss": 0.8148644, + "num_input_tokens_seen": 330873872, + "router_z_loss_mlp": 0.09179688, + "step": 3988, + "time_per_iteration": 4.669104814529419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063461, + "balance_loss_mlp": 1.03079784, + "epoch": 0.7674105425163524, + "flos": 610449509376.0, + "grad_norm": 0.05355294055383006, + "language_loss": 0.85597003, + "learning_rate": 0.00013528544221501655, + "loss": 0.86660457, + "num_input_tokens_seen": 330945760, + "router_z_loss_mlp": 0.32666016, + "step": 3989, + "time_per_iteration": 2.7729666233062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063434, + "balance_loss_mlp": 1.02960289, + "epoch": 0.767602924201616, + "flos": 844857423360.0, + "grad_norm": 0.05868617722535175, + "language_loss": 0.81521833, + "learning_rate": 0.00013507240051359586, + "loss": 0.82585269, + "num_input_tokens_seen": 331025584, + "router_z_loss_mlp": 0.33837891, + "step": 3990, + "time_per_iteration": 3.0997486114501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065906, + "balance_loss_mlp": 1.0340054, + "epoch": 0.7677953058868796, + "flos": 526857841152.0, + "grad_norm": 0.07003191043706981, + "language_loss": 0.8601203, + "learning_rate": 0.00013485950048963425, + "loss": 0.8707794, + "num_input_tokens_seen": 331093008, + "router_z_loss_mlp": 0.31884766, + "step": 3991, + "time_per_iteration": 2.5849506855010986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063228, + "balance_loss_mlp": 1.03039789, + "epoch": 0.7679876875721431, + "flos": 923161660416.0, + "grad_norm": 0.07243254290057845, + "language_loss": 0.82772785, + "learning_rate": 0.00013464674222578643, + "loss": 0.83836013, + "num_input_tokens_seen": 331177120, + "router_z_loss_mlp": 0.32836914, + "step": 3992, + "time_per_iteration": 3.2332818508148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106772, + "balance_loss_mlp": 1.03410292, + "epoch": 0.7681800692574067, + "flos": 457855289856.0, + "grad_norm": 0.05271812769462788, + "language_loss": 0.83249938, + "learning_rate": 0.00013443412580465292, + "loss": 0.8431766, + "num_input_tokens_seen": 331245424, + "router_z_loss_mlp": 0.33642578, + "step": 3993, + "time_per_iteration": 2.5794618129730225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062971, + "balance_loss_mlp": 1.03033197, + "epoch": 0.7683724509426703, + "flos": 658113251328.0, + "grad_norm": 0.050288127283744266, + "language_loss": 0.83906549, + "learning_rate": 0.00013422165130877857, + "loss": 0.84969521, + "num_input_tokens_seen": 331327504, + "router_z_loss_mlp": 0.32641602, + "step": 3994, + "time_per_iteration": 2.8854472637176514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060909, + "balance_loss_mlp": 1.028723, + "epoch": 0.7685648326279338, + "flos": 555021946368.0, + "grad_norm": 0.05841740887579896, + "language_loss": 0.80092537, + "learning_rate": 0.00013400931882065327, + "loss": 0.81153446, + "num_input_tokens_seen": 331398464, + "router_z_loss_mlp": 0.32177734, + "step": 3995, + "time_per_iteration": 2.6247458457946777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066631, + "balance_loss_mlp": 1.03337145, + "epoch": 0.7687572143131974, + "flos": 687070315008.0, + "grad_norm": 0.0471892049079333, + "language_loss": 0.8085227, + "learning_rate": 0.0001337971284227118, + "loss": 0.81918901, + "num_input_tokens_seen": 331484592, + "router_z_loss_mlp": 0.33276367, + "step": 3996, + "time_per_iteration": 3.0075807571411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003827, + "balance_loss_mlp": 0.99471956, + "epoch": 0.7689495959984609, + "flos": 1488653448192.0, + "grad_norm": 0.013752910811902266, + "language_loss": 0.76118422, + "learning_rate": 0.00013358508019733388, + "loss": 0.77122247, + "num_input_tokens_seen": 331721360, + "router_z_loss_mlp": 0.09130859, + "step": 3997, + "time_per_iteration": 4.915713787078857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060268, + "balance_loss_mlp": 1.02698493, + "epoch": 0.7691419776837245, + "flos": 570133389312.0, + "grad_norm": 0.05931733235007729, + "language_loss": 0.79872787, + "learning_rate": 0.0001333731742268438, + "loss": 0.80933058, + "num_input_tokens_seen": 331794240, + "router_z_loss_mlp": 0.33276367, + "step": 3998, + "time_per_iteration": 2.7005136013031006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063521, + "balance_loss_mlp": 1.03033328, + "epoch": 0.7693343593689881, + "flos": 519812785152.0, + "grad_norm": 0.05123464057208785, + "language_loss": 0.8547945, + "learning_rate": 0.0001331614105935109, + "loss": 0.8654297, + "num_input_tokens_seen": 331866496, + "router_z_loss_mlp": 0.33203125, + "step": 3999, + "time_per_iteration": 2.6618032455444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062851, + "balance_loss_mlp": 1.0290674, + "epoch": 0.7695267410542517, + "flos": 660086254080.0, + "grad_norm": 0.04349114240195965, + "language_loss": 0.84291816, + "learning_rate": 0.00013294978937954883, + "loss": 0.85354662, + "num_input_tokens_seen": 331936592, + "router_z_loss_mlp": 0.33813477, + "step": 4000, + "time_per_iteration": 2.787548780441284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106608, + "balance_loss_mlp": 1.03336918, + "epoch": 0.7697191227395151, + "flos": 546548124672.0, + "grad_norm": 0.06371806812200402, + "language_loss": 0.85203207, + "learning_rate": 0.00013273831066711655, + "loss": 0.86269283, + "num_input_tokens_seen": 332003536, + "router_z_loss_mlp": 0.32714844, + "step": 4001, + "time_per_iteration": 2.603930950164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066325, + "balance_loss_mlp": 1.03387642, + "epoch": 0.7699115044247787, + "flos": 540339697152.0, + "grad_norm": 0.04713288479352539, + "language_loss": 0.80269563, + "learning_rate": 0.00013252697453831747, + "loss": 0.8133589, + "num_input_tokens_seen": 332075248, + "router_z_loss_mlp": 0.32446289, + "step": 4002, + "time_per_iteration": 2.681474447250366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065185, + "balance_loss_mlp": 1.03230727, + "epoch": 0.7701038861100423, + "flos": 562635818496.0, + "grad_norm": 0.05017266789361132, + "language_loss": 0.82595527, + "learning_rate": 0.00013231578107519916, + "loss": 0.8366071, + "num_input_tokens_seen": 332158944, + "router_z_loss_mlp": 0.32885742, + "step": 4003, + "time_per_iteration": 2.910759210586548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106487, + "balance_loss_mlp": 1.03289843, + "epoch": 0.7702962677953059, + "flos": 481490016768.0, + "grad_norm": 0.05443168691462721, + "language_loss": 0.82779682, + "learning_rate": 0.00013210473035975422, + "loss": 0.83844554, + "num_input_tokens_seen": 332226368, + "router_z_loss_mlp": 0.31958008, + "step": 4004, + "time_per_iteration": 2.574204444885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106935, + "balance_loss_mlp": 1.03656733, + "epoch": 0.7704886494805695, + "flos": 770036350464.0, + "grad_norm": 0.05675172766442488, + "language_loss": 0.85354382, + "learning_rate": 0.0001318938224739201, + "loss": 0.86423731, + "num_input_tokens_seen": 332314784, + "router_z_loss_mlp": 0.32788086, + "step": 4005, + "time_per_iteration": 3.032860279083252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067294, + "balance_loss_mlp": 1.03417802, + "epoch": 0.770681031165833, + "flos": 600912096768.0, + "grad_norm": 0.04532626069780256, + "language_loss": 0.83667225, + "learning_rate": 0.00013168305749957843, + "loss": 0.84734517, + "num_input_tokens_seen": 332387952, + "router_z_loss_mlp": 0.33129883, + "step": 4006, + "time_per_iteration": 2.7624073028564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066913, + "balance_loss_mlp": 1.03379726, + "epoch": 0.7708734128510966, + "flos": 495862345728.0, + "grad_norm": 0.05222212765251844, + "language_loss": 0.82636768, + "learning_rate": 0.00013147243551855532, + "loss": 0.83703679, + "num_input_tokens_seen": 332456352, + "router_z_loss_mlp": 0.33129883, + "step": 4007, + "time_per_iteration": 2.5816714763641357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063599, + "balance_loss_mlp": 1.03115058, + "epoch": 0.7710657945363601, + "flos": 567012427776.0, + "grad_norm": 0.057481422314481036, + "language_loss": 0.80578291, + "learning_rate": 0.00013126195661262148, + "loss": 0.81641883, + "num_input_tokens_seen": 332534288, + "router_z_loss_mlp": 0.32446289, + "step": 4008, + "time_per_iteration": 2.7452778816223145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064326, + "balance_loss_mlp": 1.03190088, + "epoch": 0.7712581762216237, + "flos": 604251256320.0, + "grad_norm": 0.05872708876253251, + "language_loss": 0.86326575, + "learning_rate": 0.00013105162086349216, + "loss": 0.873909, + "num_input_tokens_seen": 332615440, + "router_z_loss_mlp": 0.32421875, + "step": 4009, + "time_per_iteration": 2.8586156368255615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066916, + "balance_loss_mlp": 1.03530204, + "epoch": 0.7714505579068872, + "flos": 530620402176.0, + "grad_norm": 0.047861775046014535, + "language_loss": 0.86009622, + "learning_rate": 0.00013084142835282687, + "loss": 0.87076533, + "num_input_tokens_seen": 332687360, + "router_z_loss_mlp": 0.31591797, + "step": 4010, + "time_per_iteration": 2.704119920730591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01005244, + "balance_loss_mlp": 0.99647039, + "epoch": 0.7716429395921508, + "flos": 1421414313984.0, + "grad_norm": 0.012063998338178145, + "language_loss": 0.79884362, + "learning_rate": 0.00013063137916222956, + "loss": 0.80889606, + "num_input_tokens_seen": 332919936, + "router_z_loss_mlp": 0.08789062, + "step": 4011, + "time_per_iteration": 4.7817652225494385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065556, + "balance_loss_mlp": 1.03301144, + "epoch": 0.7718353212774144, + "flos": 578140139520.0, + "grad_norm": 0.051053206649878655, + "language_loss": 0.89366746, + "learning_rate": 0.0001304214733732485, + "loss": 0.90432304, + "num_input_tokens_seen": 332990096, + "router_z_loss_mlp": 0.32543945, + "step": 4012, + "time_per_iteration": 2.7189698219299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067191, + "balance_loss_mlp": 1.0337882, + "epoch": 0.772027702962678, + "flos": 510486368256.0, + "grad_norm": 0.053964234671719305, + "language_loss": 0.82622194, + "learning_rate": 0.00013021171106737672, + "loss": 0.8368938, + "num_input_tokens_seen": 333063616, + "router_z_loss_mlp": 0.33422852, + "step": 4013, + "time_per_iteration": 2.695345401763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062166, + "balance_loss_mlp": 1.03031349, + "epoch": 0.7722200846479416, + "flos": 525391197696.0, + "grad_norm": 0.05051004242016687, + "language_loss": 0.79927659, + "learning_rate": 0.00013000209232605071, + "loss": 0.80989826, + "num_input_tokens_seen": 333136368, + "router_z_loss_mlp": 0.31835938, + "step": 4014, + "time_per_iteration": 2.6742262840270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062585, + "balance_loss_mlp": 1.03049421, + "epoch": 0.772412466333205, + "flos": 479348278272.0, + "grad_norm": 0.06883144067650042, + "language_loss": 0.79881573, + "learning_rate": 0.0001297926172306519, + "loss": 0.80944163, + "num_input_tokens_seen": 333207136, + "router_z_loss_mlp": 0.32080078, + "step": 4015, + "time_per_iteration": 2.5998587608337402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106658, + "balance_loss_mlp": 1.03420317, + "epoch": 0.7726048480184686, + "flos": 905284256256.0, + "grad_norm": 0.049021978478966305, + "language_loss": 0.7864179, + "learning_rate": 0.0001295832858625055, + "loss": 0.79708374, + "num_input_tokens_seen": 333291920, + "router_z_loss_mlp": 0.32373047, + "step": 4016, + "time_per_iteration": 3.241476535797119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064551, + "balance_loss_mlp": 1.03195906, + "epoch": 0.7727972297037322, + "flos": 631085520384.0, + "grad_norm": 0.050738578814051916, + "language_loss": 0.69703871, + "learning_rate": 0.00012937409830288154, + "loss": 0.70768428, + "num_input_tokens_seen": 333369824, + "router_z_loss_mlp": 0.32592773, + "step": 4017, + "time_per_iteration": 2.7928261756896973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060438, + "balance_loss_mlp": 1.02868032, + "epoch": 0.7729896113889958, + "flos": 414565185024.0, + "grad_norm": 0.11993476807725541, + "language_loss": 0.84959614, + "learning_rate": 0.00012916505463299362, + "loss": 0.86020052, + "num_input_tokens_seen": 333434192, + "router_z_loss_mlp": 0.31738281, + "step": 4018, + "time_per_iteration": 2.4724020957946777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061884, + "balance_loss_mlp": 1.03012657, + "epoch": 0.7731819930742593, + "flos": 668609538048.0, + "grad_norm": 0.07815379187745079, + "language_loss": 0.78152752, + "learning_rate": 0.00012895615493399972, + "loss": 0.79214638, + "num_input_tokens_seen": 333509696, + "router_z_loss_mlp": 0.31738281, + "step": 4019, + "time_per_iteration": 2.7819771766662598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105935, + "balance_loss_mlp": 1.02704406, + "epoch": 0.7733743747595229, + "flos": 489604455936.0, + "grad_norm": 0.06361322846277707, + "language_loss": 0.82174695, + "learning_rate": 0.00012874739928700192, + "loss": 0.83234048, + "num_input_tokens_seen": 333575184, + "router_z_loss_mlp": 0.32299805, + "step": 4020, + "time_per_iteration": 2.577558755874634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063745, + "balance_loss_mlp": 1.03046131, + "epoch": 0.7735667564447865, + "flos": 659294705664.0, + "grad_norm": 0.0626070053016161, + "language_loss": 0.79737717, + "learning_rate": 0.00012853878777304624, + "loss": 0.80801463, + "num_input_tokens_seen": 333651568, + "router_z_loss_mlp": 0.33300781, + "step": 4021, + "time_per_iteration": 2.868053674697876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064138, + "balance_loss_mlp": 1.03140283, + "epoch": 0.77375913813005, + "flos": 533106966528.0, + "grad_norm": 0.04737550155927703, + "language_loss": 0.84463626, + "learning_rate": 0.000128330320473123, + "loss": 0.85527766, + "num_input_tokens_seen": 333726400, + "router_z_loss_mlp": 0.32739258, + "step": 4022, + "time_per_iteration": 2.668313503265381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008425, + "balance_loss_mlp": 0.99988997, + "epoch": 0.7739515198153136, + "flos": 1519260447744.0, + "grad_norm": 0.005844569838786065, + "language_loss": 0.783319, + "learning_rate": 0.00012812199746816628, + "loss": 0.79340327, + "num_input_tokens_seen": 333960224, + "router_z_loss_mlp": 0.08544922, + "step": 4023, + "time_per_iteration": 4.965493202209473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063459, + "balance_loss_mlp": 1.03053296, + "epoch": 0.7741439015005771, + "flos": 639819800064.0, + "grad_norm": 0.08130494829641424, + "language_loss": 0.81473714, + "learning_rate": 0.0001279138188390543, + "loss": 0.82537174, + "num_input_tokens_seen": 334033904, + "router_z_loss_mlp": 0.3293457, + "step": 4024, + "time_per_iteration": 2.7925288677215576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063718, + "balance_loss_mlp": 1.03122211, + "epoch": 0.7743362831858407, + "flos": 665546803200.0, + "grad_norm": 0.05426924538048376, + "language_loss": 0.86122662, + "learning_rate": 0.00012770578466660915, + "loss": 0.87186384, + "num_input_tokens_seen": 334107904, + "router_z_loss_mlp": 0.32495117, + "step": 4025, + "time_per_iteration": 2.8743951320648193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067837, + "balance_loss_mlp": 1.0342437, + "epoch": 0.7745286648711043, + "flos": 562453936128.0, + "grad_norm": 0.050549186901469166, + "language_loss": 0.81480557, + "learning_rate": 0.0001274978950315968, + "loss": 0.82548392, + "num_input_tokens_seen": 334184048, + "router_z_loss_mlp": 0.33618164, + "step": 4026, + "time_per_iteration": 2.7961745262145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061898, + "balance_loss_mlp": 1.02923501, + "epoch": 0.7747210465563679, + "flos": 516651125760.0, + "grad_norm": 0.06240008099647138, + "language_loss": 0.82893825, + "learning_rate": 0.00012729015001472716, + "loss": 0.83955729, + "num_input_tokens_seen": 334257152, + "router_z_loss_mlp": 0.32666016, + "step": 4027, + "time_per_iteration": 2.63754940032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065965, + "balance_loss_mlp": 1.03227663, + "epoch": 0.7749134282416313, + "flos": 633921292800.0, + "grad_norm": 0.052874284120550924, + "language_loss": 0.81483364, + "learning_rate": 0.00012708254969665418, + "loss": 0.82549322, + "num_input_tokens_seen": 334331312, + "router_z_loss_mlp": 0.3371582, + "step": 4028, + "time_per_iteration": 2.7484118938446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064367, + "balance_loss_mlp": 1.03070259, + "epoch": 0.7751058099268949, + "flos": 495118849536.0, + "grad_norm": 0.06123905199819526, + "language_loss": 0.83476496, + "learning_rate": 0.00012687509415797526, + "loss": 0.84540868, + "num_input_tokens_seen": 334397344, + "router_z_loss_mlp": 0.33691406, + "step": 4029, + "time_per_iteration": 2.5675880908966064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063972, + "balance_loss_mlp": 1.03183281, + "epoch": 0.7752981916121585, + "flos": 510048410112.0, + "grad_norm": 0.09107931997699928, + "language_loss": 0.81183356, + "learning_rate": 0.00012666778347923208, + "loss": 0.82247323, + "num_input_tokens_seen": 334467872, + "router_z_loss_mlp": 0.32128906, + "step": 4030, + "time_per_iteration": 2.632314443588257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060699, + "balance_loss_mlp": 1.02813113, + "epoch": 0.7754905732974221, + "flos": 497295493632.0, + "grad_norm": 0.04486214088641844, + "language_loss": 0.83638769, + "learning_rate": 0.0001264606177409092, + "loss": 0.84699464, + "num_input_tokens_seen": 334539088, + "router_z_loss_mlp": 0.32568359, + "step": 4031, + "time_per_iteration": 2.6301512718200684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063527, + "balance_loss_mlp": 1.03081632, + "epoch": 0.7756829549826857, + "flos": 480486062592.0, + "grad_norm": 0.0481221818679906, + "language_loss": 0.86095941, + "learning_rate": 0.00012625359702343609, + "loss": 0.87159473, + "num_input_tokens_seen": 334612576, + "router_z_loss_mlp": 0.32714844, + "step": 4032, + "time_per_iteration": 2.708512544631958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063655, + "balance_loss_mlp": 1.03125429, + "epoch": 0.7758753366679492, + "flos": 552368056320.0, + "grad_norm": 0.0642979185043706, + "language_loss": 0.84532368, + "learning_rate": 0.00012604672140718504, + "loss": 0.85596019, + "num_input_tokens_seen": 334677824, + "router_z_loss_mlp": 0.32397461, + "step": 4033, + "time_per_iteration": 2.632307529449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062755, + "balance_loss_mlp": 1.03006816, + "epoch": 0.7760677183532128, + "flos": 703529127936.0, + "grad_norm": 0.05215032719242253, + "language_loss": 0.77701473, + "learning_rate": 0.00012583999097247233, + "loss": 0.78764236, + "num_input_tokens_seen": 334751456, + "router_z_loss_mlp": 0.3269043, + "step": 4034, + "time_per_iteration": 2.8174097537994385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064658, + "balance_loss_mlp": 1.03266239, + "epoch": 0.7762601000384763, + "flos": 523218935808.0, + "grad_norm": 0.06260246603028506, + "language_loss": 0.79696673, + "learning_rate": 0.0001256334057995578, + "loss": 0.80761331, + "num_input_tokens_seen": 334823008, + "router_z_loss_mlp": 0.31982422, + "step": 4035, + "time_per_iteration": 2.69726300239563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063141, + "balance_loss_mlp": 1.03159809, + "epoch": 0.7764524817237399, + "flos": 557262609408.0, + "grad_norm": 0.048886632926304276, + "language_loss": 0.84979451, + "learning_rate": 0.000125426965968645, + "loss": 0.86042595, + "num_input_tokens_seen": 334896048, + "router_z_loss_mlp": 0.31518555, + "step": 4036, + "time_per_iteration": 2.72336483001709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066748, + "balance_loss_mlp": 1.03508615, + "epoch": 0.7766448634090035, + "flos": 579454013952.0, + "grad_norm": 0.07567948550064775, + "language_loss": 0.81946111, + "learning_rate": 0.00012522067155988092, + "loss": 0.83012855, + "num_input_tokens_seen": 334964416, + "router_z_loss_mlp": 0.31640625, + "step": 4037, + "time_per_iteration": 2.6716489791870117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063745, + "balance_loss_mlp": 1.03153515, + "epoch": 0.776837245094267, + "flos": 635300596224.0, + "grad_norm": 0.05548749189645599, + "language_loss": 0.75042689, + "learning_rate": 0.00012501452265335617, + "loss": 0.76106441, + "num_input_tokens_seen": 335043360, + "router_z_loss_mlp": 0.32202148, + "step": 4038, + "time_per_iteration": 2.798152446746826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063102, + "balance_loss_mlp": 1.03115439, + "epoch": 0.7770296267795306, + "flos": 614398334976.0, + "grad_norm": 0.04733898192839437, + "language_loss": 0.83099091, + "learning_rate": 0.0001248085193291047, + "loss": 0.84162188, + "num_input_tokens_seen": 335113216, + "router_z_loss_mlp": 0.31933594, + "step": 4039, + "time_per_iteration": 2.713104009628296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064564, + "balance_loss_mlp": 1.03287828, + "epoch": 0.7772220084647942, + "flos": 878438407680.0, + "grad_norm": 0.06729067040173044, + "language_loss": 0.8247925, + "learning_rate": 0.00012460266166710443, + "loss": 0.83543813, + "num_input_tokens_seen": 335195824, + "router_z_loss_mlp": 0.31665039, + "step": 4040, + "time_per_iteration": 3.142155408859253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061758, + "balance_loss_mlp": 1.02988183, + "epoch": 0.7774143901500578, + "flos": 839293567488.0, + "grad_norm": 0.08233225163586903, + "language_loss": 0.77612185, + "learning_rate": 0.00012439694974727633, + "loss": 0.78673941, + "num_input_tokens_seen": 335269712, + "router_z_loss_mlp": 0.31860352, + "step": 4041, + "time_per_iteration": 2.9853243827819824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065759, + "balance_loss_mlp": 1.03338194, + "epoch": 0.7776067718353212, + "flos": 567878169600.0, + "grad_norm": 0.054149054361607385, + "language_loss": 0.79806697, + "learning_rate": 0.00012419138364948458, + "loss": 0.80872452, + "num_input_tokens_seen": 335343408, + "router_z_loss_mlp": 0.32373047, + "step": 4042, + "time_per_iteration": 2.7431745529174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064082, + "balance_loss_mlp": 1.03191924, + "epoch": 0.7777991535205848, + "flos": 745627603968.0, + "grad_norm": 0.05348286137005146, + "language_loss": 0.8234185, + "learning_rate": 0.00012398596345353702, + "loss": 0.83405924, + "num_input_tokens_seen": 335415360, + "router_z_loss_mlp": 0.3215332, + "step": 4043, + "time_per_iteration": 2.896669864654541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069609, + "balance_loss_mlp": 1.03785181, + "epoch": 0.7779915352058484, + "flos": 537799288320.0, + "grad_norm": 0.048601854183842386, + "language_loss": 0.83191538, + "learning_rate": 0.0001237806892391851, + "loss": 0.84261149, + "num_input_tokens_seen": 335491568, + "router_z_loss_mlp": 0.31738281, + "step": 4044, + "time_per_iteration": 2.6875576972961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067998, + "balance_loss_mlp": 1.03523958, + "epoch": 0.778183916891112, + "flos": 634497463296.0, + "grad_norm": 0.05218142456455376, + "language_loss": 0.807693, + "learning_rate": 0.0001235755610861233, + "loss": 0.81837296, + "num_input_tokens_seen": 335567200, + "router_z_loss_mlp": 0.32763672, + "step": 4045, + "time_per_iteration": 2.7440977096557617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063341, + "balance_loss_mlp": 1.03125, + "epoch": 0.7783762985763756, + "flos": 588400699392.0, + "grad_norm": 0.06119934823569683, + "language_loss": 0.85257781, + "learning_rate": 0.0001233705790739893, + "loss": 0.86321127, + "num_input_tokens_seen": 335640512, + "router_z_loss_mlp": 0.32080078, + "step": 4046, + "time_per_iteration": 2.771397829055786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066435, + "balance_loss_mlp": 1.03398585, + "epoch": 0.7785686802616391, + "flos": 930261970944.0, + "grad_norm": 0.05518335637199763, + "language_loss": 0.74865597, + "learning_rate": 0.0001231657432823643, + "loss": 0.75932032, + "num_input_tokens_seen": 335726016, + "router_z_loss_mlp": 0.32446289, + "step": 4047, + "time_per_iteration": 3.2299704551696777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068089, + "balance_loss_mlp": 1.03618836, + "epoch": 0.7787610619469026, + "flos": 497679607296.0, + "grad_norm": 0.061331476050258626, + "language_loss": 0.78644454, + "learning_rate": 0.0001229610537907725, + "loss": 0.7971254, + "num_input_tokens_seen": 335794864, + "router_z_loss_mlp": 0.31884766, + "step": 4048, + "time_per_iteration": 2.581489324569702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062949, + "balance_loss_mlp": 1.03040469, + "epoch": 0.7789534436321662, + "flos": 515385303552.0, + "grad_norm": 0.060582734060361326, + "language_loss": 0.90193808, + "learning_rate": 0.00012275651067868143, + "loss": 0.9125675, + "num_input_tokens_seen": 335860928, + "router_z_loss_mlp": 0.32543945, + "step": 4049, + "time_per_iteration": 2.5799412727355957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066, + "balance_loss_mlp": 1.03350401, + "epoch": 0.7791458253174298, + "flos": 988081555968.0, + "grad_norm": 0.06086378000483131, + "language_loss": 0.80482578, + "learning_rate": 0.00012255211402550182, + "loss": 0.81548578, + "num_input_tokens_seen": 335945728, + "router_z_loss_mlp": 0.32495117, + "step": 4050, + "time_per_iteration": 3.228003740310669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065165, + "balance_loss_mlp": 1.03283536, + "epoch": 0.7793382070026933, + "flos": 628756107264.0, + "grad_norm": 0.1203274251701162, + "language_loss": 0.76654673, + "learning_rate": 0.00012234786391058727, + "loss": 0.77719831, + "num_input_tokens_seen": 336014848, + "router_z_loss_mlp": 0.32324219, + "step": 4051, + "time_per_iteration": 2.7767224311828613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106691, + "balance_loss_mlp": 1.03405643, + "epoch": 0.7795305886879569, + "flos": 531500700672.0, + "grad_norm": 0.06608083549317771, + "language_loss": 0.85191727, + "learning_rate": 0.0001221437604132352, + "loss": 0.86258644, + "num_input_tokens_seen": 336080096, + "router_z_loss_mlp": 0.32861328, + "step": 4052, + "time_per_iteration": 2.6072323322296143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069028, + "balance_loss_mlp": 1.03703237, + "epoch": 0.7797229703732205, + "flos": 611690600448.0, + "grad_norm": 0.06701840569046753, + "language_loss": 0.80875957, + "learning_rate": 0.0001219398036126852, + "loss": 0.8194499, + "num_input_tokens_seen": 336154640, + "router_z_loss_mlp": 0.31982422, + "step": 4053, + "time_per_iteration": 2.789151668548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069667, + "balance_loss_mlp": 1.03738546, + "epoch": 0.7799153520584841, + "flos": 871758526464.0, + "grad_norm": 0.05089113411890528, + "language_loss": 0.78444964, + "learning_rate": 0.00012173599358812027, + "loss": 0.79514629, + "num_input_tokens_seen": 336244160, + "router_z_loss_mlp": 0.32275391, + "step": 4054, + "time_per_iteration": 3.282203197479248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065975, + "balance_loss_mlp": 1.03359818, + "epoch": 0.7801077337437476, + "flos": 583348995072.0, + "grad_norm": 0.06359619445711458, + "language_loss": 0.82295758, + "learning_rate": 0.0001215323304186668, + "loss": 0.83361733, + "num_input_tokens_seen": 336317936, + "router_z_loss_mlp": 0.32373047, + "step": 4055, + "time_per_iteration": 2.751826763153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062514, + "balance_loss_mlp": 1.03073275, + "epoch": 0.7803001154290111, + "flos": 600887365632.0, + "grad_norm": 0.04750930955711312, + "language_loss": 0.8780787, + "learning_rate": 0.00012132881418339364, + "loss": 0.88870382, + "num_input_tokens_seen": 336389504, + "router_z_loss_mlp": 0.31762695, + "step": 4056, + "time_per_iteration": 2.7023940086364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016356, + "balance_loss_mlp": 1.00820196, + "epoch": 0.7804924971142747, + "flos": 1478743506432.0, + "grad_norm": 0.010148524200822068, + "language_loss": 0.77517563, + "learning_rate": 0.00012112544496131306, + "loss": 0.78533918, + "num_input_tokens_seen": 336615536, + "router_z_loss_mlp": 0.08154297, + "step": 4057, + "time_per_iteration": 4.826777458190918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066059, + "balance_loss_mlp": 1.03430223, + "epoch": 0.7806848787995383, + "flos": 630075773952.0, + "grad_norm": 0.04851285793009641, + "language_loss": 0.76570946, + "learning_rate": 0.00012092222283137944, + "loss": 0.77637005, + "num_input_tokens_seen": 336686400, + "router_z_loss_mlp": 0.31738281, + "step": 4058, + "time_per_iteration": 2.7130894660949707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014581, + "balance_loss_mlp": 1.00647449, + "epoch": 0.7808772604848019, + "flos": 1416800567808.0, + "grad_norm": 0.006919063816033351, + "language_loss": 0.7890631, + "learning_rate": 0.00012071914787249111, + "loss": 0.79920888, + "num_input_tokens_seen": 336912704, + "router_z_loss_mlp": 0.08105469, + "step": 4059, + "time_per_iteration": 4.767851114273071 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068468, + "balance_loss_mlp": 1.03575706, + "epoch": 0.7810696421700654, + "flos": 731345435136.0, + "grad_norm": 0.0468820010320679, + "language_loss": 0.83492804, + "learning_rate": 0.00012051622016348856, + "loss": 0.8456127, + "num_input_tokens_seen": 336997040, + "router_z_loss_mlp": 0.32714844, + "step": 4060, + "time_per_iteration": 3.0499465465545654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065592, + "balance_loss_mlp": 1.0336442, + "epoch": 0.781262023855329, + "flos": 424718055936.0, + "grad_norm": 0.05864420891572784, + "language_loss": 0.8411994, + "learning_rate": 0.00012031343978315539, + "loss": 0.85185528, + "num_input_tokens_seen": 337059760, + "router_z_loss_mlp": 0.31933594, + "step": 4061, + "time_per_iteration": 2.448692560195923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063026, + "balance_loss_mlp": 1.0311023, + "epoch": 0.7814544055405925, + "flos": 500767073280.0, + "grad_norm": 0.10364470659774863, + "language_loss": 0.82632732, + "learning_rate": 0.00012011080681021774, + "loss": 0.83695757, + "num_input_tokens_seen": 337128528, + "router_z_loss_mlp": 0.3190918, + "step": 4062, + "time_per_iteration": 2.611121892929077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066061, + "balance_loss_mlp": 1.03373194, + "epoch": 0.7816467872258561, + "flos": 462212960256.0, + "grad_norm": 0.09614941126191437, + "language_loss": 0.86035311, + "learning_rate": 0.00011990832132334512, + "loss": 0.87101376, + "num_input_tokens_seen": 337194112, + "router_z_loss_mlp": 0.32324219, + "step": 4063, + "time_per_iteration": 2.5123276710510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066235, + "balance_loss_mlp": 1.03354836, + "epoch": 0.7818391689111197, + "flos": 740497324032.0, + "grad_norm": 0.05603872830064661, + "language_loss": 0.8259666, + "learning_rate": 0.00011970598340114897, + "loss": 0.83662897, + "num_input_tokens_seen": 337270416, + "router_z_loss_mlp": 0.3269043, + "step": 4064, + "time_per_iteration": 2.992100238800049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062887, + "balance_loss_mlp": 1.03101015, + "epoch": 0.7820315505963832, + "flos": 547386163200.0, + "grad_norm": 0.05629095926792252, + "language_loss": 0.8402884, + "learning_rate": 0.00011950379312218396, + "loss": 0.85091722, + "num_input_tokens_seen": 337343024, + "router_z_loss_mlp": 0.31860352, + "step": 4065, + "time_per_iteration": 2.7270681858062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061717, + "balance_loss_mlp": 1.02950692, + "epoch": 0.7822239322816468, + "flos": 728665403904.0, + "grad_norm": 0.045794357656988534, + "language_loss": 0.8601073, + "learning_rate": 0.00011930175056494719, + "loss": 0.87072444, + "num_input_tokens_seen": 337417232, + "router_z_loss_mlp": 0.32202148, + "step": 4066, + "time_per_iteration": 2.8730247020721436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066726, + "balance_loss_mlp": 1.03408647, + "epoch": 0.7824163139669104, + "flos": 451774900224.0, + "grad_norm": 0.04781338865883617, + "language_loss": 0.76222277, + "learning_rate": 0.00011909985580787885, + "loss": 0.77288997, + "num_input_tokens_seen": 337488224, + "router_z_loss_mlp": 0.32641602, + "step": 4067, + "time_per_iteration": 2.6421656608581543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063309, + "balance_loss_mlp": 1.03138483, + "epoch": 0.782608695652174, + "flos": 540207277056.0, + "grad_norm": 0.05261646090903281, + "language_loss": 0.81026649, + "learning_rate": 0.00011889810892936137, + "loss": 0.82089961, + "num_input_tokens_seen": 337564928, + "router_z_loss_mlp": 0.3190918, + "step": 4068, + "time_per_iteration": 2.70185923576355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071177, + "balance_loss_mlp": 1.03813219, + "epoch": 0.7828010773374374, + "flos": 500029369344.0, + "grad_norm": 0.05419048158551631, + "language_loss": 0.7722286, + "learning_rate": 0.00011869651000771959, + "loss": 0.78294039, + "num_input_tokens_seen": 337641632, + "router_z_loss_mlp": 0.33056641, + "step": 4069, + "time_per_iteration": 2.822190523147583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060879, + "balance_loss_mlp": 1.02890754, + "epoch": 0.782993459022701, + "flos": 600542539776.0, + "grad_norm": 0.05379601018960074, + "language_loss": 0.82404703, + "learning_rate": 0.00011849505912122117, + "loss": 0.83465582, + "num_input_tokens_seen": 337711968, + "router_z_loss_mlp": 0.31958008, + "step": 4070, + "time_per_iteration": 2.7197659015655518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061129, + "balance_loss_mlp": 1.02827537, + "epoch": 0.7831858407079646, + "flos": 809702106624.0, + "grad_norm": 0.06431726516643936, + "language_loss": 0.77697992, + "learning_rate": 0.00011829375634807654, + "loss": 0.78759122, + "num_input_tokens_seen": 337795792, + "router_z_loss_mlp": 0.32861328, + "step": 4071, + "time_per_iteration": 3.0201632976531982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060522, + "balance_loss_mlp": 1.027596, + "epoch": 0.7833782223932282, + "flos": 806240701440.0, + "grad_norm": 0.09019117286711203, + "language_loss": 0.80854774, + "learning_rate": 0.00011809260176643821, + "loss": 0.81915295, + "num_input_tokens_seen": 337875584, + "router_z_loss_mlp": 0.3293457, + "step": 4072, + "time_per_iteration": 3.059041738510132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062272, + "balance_loss_mlp": 1.0295614, + "epoch": 0.7835706040784918, + "flos": 520614508032.0, + "grad_norm": 0.05845304127163334, + "language_loss": 0.83590925, + "learning_rate": 0.00011789159545440131, + "loss": 0.84653199, + "num_input_tokens_seen": 337942304, + "router_z_loss_mlp": 0.32714844, + "step": 4073, + "time_per_iteration": 2.5912578105926514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064777, + "balance_loss_mlp": 1.03199446, + "epoch": 0.7837629857637552, + "flos": 505322592768.0, + "grad_norm": 0.0488968990026523, + "language_loss": 0.82248485, + "learning_rate": 0.00011769073749000348, + "loss": 0.83313262, + "num_input_tokens_seen": 338020864, + "router_z_loss_mlp": 0.32788086, + "step": 4074, + "time_per_iteration": 2.7853548526763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067546, + "balance_loss_mlp": 1.03533578, + "epoch": 0.7839553674490188, + "flos": 515872723968.0, + "grad_norm": 0.0606411027248537, + "language_loss": 0.75941336, + "learning_rate": 0.0001174900279512246, + "loss": 0.77008879, + "num_input_tokens_seen": 338089584, + "router_z_loss_mlp": 0.32202148, + "step": 4075, + "time_per_iteration": 2.5954041481018066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065014, + "balance_loss_mlp": 1.03266096, + "epoch": 0.7841477491342824, + "flos": 506399330304.0, + "grad_norm": 0.05056809711727469, + "language_loss": 0.81398273, + "learning_rate": 0.00011728946691598707, + "loss": 0.82463288, + "num_input_tokens_seen": 338159568, + "router_z_loss_mlp": 0.32348633, + "step": 4076, + "time_per_iteration": 2.618093252182007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059278, + "balance_loss_mlp": 1.02680504, + "epoch": 0.784340130819546, + "flos": 719320048128.0, + "grad_norm": 0.06832591600294699, + "language_loss": 0.76352495, + "learning_rate": 0.00011708905446215561, + "loss": 0.77411771, + "num_input_tokens_seen": 338233952, + "router_z_loss_mlp": 0.32470703, + "step": 4077, + "time_per_iteration": 2.8518495559692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064476, + "balance_loss_mlp": 1.03228974, + "epoch": 0.7845325125048095, + "flos": 514174735872.0, + "grad_norm": 0.05162512360480059, + "language_loss": 0.79919541, + "learning_rate": 0.00011688879066753711, + "loss": 0.8098402, + "num_input_tokens_seen": 338309568, + "router_z_loss_mlp": 0.32177734, + "step": 4078, + "time_per_iteration": 2.693814516067505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107088, + "balance_loss_mlp": 1.03919387, + "epoch": 0.7847248941900731, + "flos": 465866422272.0, + "grad_norm": 0.057791720647150095, + "language_loss": 0.87164676, + "learning_rate": 0.00011668867560988122, + "loss": 0.88235557, + "num_input_tokens_seen": 338375920, + "router_z_loss_mlp": 0.31665039, + "step": 4079, + "time_per_iteration": 2.544497489929199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106489, + "balance_loss_mlp": 1.03217876, + "epoch": 0.7849172758753367, + "flos": 502766217216.0, + "grad_norm": 0.06577906092431222, + "language_loss": 0.84248155, + "learning_rate": 0.00011648870936687916, + "loss": 0.85313052, + "num_input_tokens_seen": 338452208, + "router_z_loss_mlp": 0.32714844, + "step": 4080, + "time_per_iteration": 2.73219895362854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067029, + "balance_loss_mlp": 1.03465128, + "epoch": 0.7851096575606002, + "flos": 531742219776.0, + "grad_norm": 0.07071087412215145, + "language_loss": 0.77993482, + "learning_rate": 0.00011628889201616461, + "loss": 0.79060507, + "num_input_tokens_seen": 338522864, + "router_z_loss_mlp": 0.32373047, + "step": 4081, + "time_per_iteration": 2.6256251335144043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064024, + "balance_loss_mlp": 1.03145564, + "epoch": 0.7853020392458638, + "flos": 569685256704.0, + "grad_norm": 0.054581090755724565, + "language_loss": 0.81991017, + "learning_rate": 0.00011608922363531393, + "loss": 0.83055043, + "num_input_tokens_seen": 338591024, + "router_z_loss_mlp": 0.32568359, + "step": 4082, + "time_per_iteration": 2.68129825592041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066291, + "balance_loss_mlp": 1.03522539, + "epoch": 0.7854944209311273, + "flos": 832228162560.0, + "grad_norm": 0.0528540930480431, + "language_loss": 0.83166963, + "learning_rate": 0.00011588970430184504, + "loss": 0.84233254, + "num_input_tokens_seen": 338669616, + "router_z_loss_mlp": 0.31030273, + "step": 4083, + "time_per_iteration": 3.01277494430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068509, + "balance_loss_mlp": 1.03608418, + "epoch": 0.7856868026163909, + "flos": 559660423680.0, + "grad_norm": 0.04365607087588255, + "language_loss": 0.81863219, + "learning_rate": 0.00011569033409321822, + "loss": 0.82931721, + "num_input_tokens_seen": 338740416, + "router_z_loss_mlp": 0.32421875, + "step": 4084, + "time_per_iteration": 2.6665027141571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106523, + "balance_loss_mlp": 1.03290033, + "epoch": 0.7858791843016545, + "flos": 544972382208.0, + "grad_norm": 0.05673133805325975, + "language_loss": 0.72893167, + "learning_rate": 0.00011549111308683591, + "loss": 0.73958397, + "num_input_tokens_seen": 338807664, + "router_z_loss_mlp": 0.32324219, + "step": 4085, + "time_per_iteration": 2.652221918106079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062463, + "balance_loss_mlp": 1.03111076, + "epoch": 0.7860715659869181, + "flos": 380787761664.0, + "grad_norm": 0.058608703259898844, + "language_loss": 0.80785263, + "learning_rate": 0.00011529204136004251, + "loss": 0.81847727, + "num_input_tokens_seen": 338869472, + "router_z_loss_mlp": 0.31323242, + "step": 4086, + "time_per_iteration": 2.4127490520477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069783, + "balance_loss_mlp": 1.03762007, + "epoch": 0.7862639476721817, + "flos": 567173961216.0, + "grad_norm": 0.058008459467675216, + "language_loss": 0.84520507, + "learning_rate": 0.00011509311899012459, + "loss": 0.85590291, + "num_input_tokens_seen": 338941312, + "router_z_loss_mlp": 0.3215332, + "step": 4087, + "time_per_iteration": 2.6412453651428223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067278, + "balance_loss_mlp": 1.03544927, + "epoch": 0.7864563293574451, + "flos": 544968000000.0, + "grad_norm": 0.06454830776496215, + "language_loss": 0.78072417, + "learning_rate": 0.00011489434605431053, + "loss": 0.79139692, + "num_input_tokens_seen": 339010208, + "router_z_loss_mlp": 0.31811523, + "step": 4088, + "time_per_iteration": 2.637660026550293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106782, + "balance_loss_mlp": 1.03563344, + "epoch": 0.7866487110427087, + "flos": 563260041216.0, + "grad_norm": 0.058240331432363256, + "language_loss": 0.81125653, + "learning_rate": 0.0001146957226297708, + "loss": 0.82193476, + "num_input_tokens_seen": 339081232, + "router_z_loss_mlp": 0.32177734, + "step": 4089, + "time_per_iteration": 2.6684415340423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065799, + "balance_loss_mlp": 1.03323102, + "epoch": 0.7868410927279723, + "flos": 727849124352.0, + "grad_norm": 0.04414589533004489, + "language_loss": 0.76471299, + "learning_rate": 0.00011449724879361827, + "loss": 0.77537096, + "num_input_tokens_seen": 339161040, + "router_z_loss_mlp": 0.32568359, + "step": 4090, + "time_per_iteration": 2.951436758041382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106424, + "balance_loss_mlp": 1.03236377, + "epoch": 0.7870334744132359, + "flos": 521082989568.0, + "grad_norm": 0.060886300721865946, + "language_loss": 0.73346722, + "learning_rate": 0.00011429892462290687, + "loss": 0.74410957, + "num_input_tokens_seen": 339233984, + "router_z_loss_mlp": 0.31860352, + "step": 4091, + "time_per_iteration": 2.681136131286621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063333, + "balance_loss_mlp": 1.03143215, + "epoch": 0.7872258560984994, + "flos": 451173998592.0, + "grad_norm": 0.05425416710162835, + "language_loss": 0.83261812, + "learning_rate": 0.00011410075019463295, + "loss": 0.84325141, + "num_input_tokens_seen": 339303168, + "router_z_loss_mlp": 0.31884766, + "step": 4092, + "time_per_iteration": 2.596997022628784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106721, + "balance_loss_mlp": 1.03559613, + "epoch": 0.787418237783763, + "flos": 514932788736.0, + "grad_norm": 0.06041624723999286, + "language_loss": 0.80031419, + "learning_rate": 0.00011390272558573461, + "loss": 0.81098628, + "num_input_tokens_seen": 339374512, + "router_z_loss_mlp": 0.31591797, + "step": 4093, + "time_per_iteration": 2.724531412124634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066529, + "balance_loss_mlp": 1.03422308, + "epoch": 0.7876106194690266, + "flos": 484837940736.0, + "grad_norm": 0.057479971789758694, + "language_loss": 0.79717124, + "learning_rate": 0.00011370485087309202, + "loss": 0.80783653, + "num_input_tokens_seen": 339442720, + "router_z_loss_mlp": 0.32299805, + "step": 4094, + "time_per_iteration": 2.6680920124053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066683, + "balance_loss_mlp": 1.03401947, + "epoch": 0.7878030011542901, + "flos": 542570185728.0, + "grad_norm": 0.07064536799183499, + "language_loss": 0.79107904, + "learning_rate": 0.00011350712613352688, + "loss": 0.80174589, + "num_input_tokens_seen": 339508800, + "router_z_loss_mlp": 0.32666016, + "step": 4095, + "time_per_iteration": 2.6333553791046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066243, + "balance_loss_mlp": 1.03415227, + "epoch": 0.7879953828395537, + "flos": 516488182272.0, + "grad_norm": 0.06900072412934964, + "language_loss": 0.79095006, + "learning_rate": 0.00011330955144380283, + "loss": 0.8016125, + "num_input_tokens_seen": 339578048, + "router_z_loss_mlp": 0.32080078, + "step": 4096, + "time_per_iteration": 2.5925889015197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064584, + "balance_loss_mlp": 1.03246856, + "epoch": 0.7881877645248172, + "flos": 582004597248.0, + "grad_norm": 0.054709813023541755, + "language_loss": 0.8620733, + "learning_rate": 0.00011311212688062483, + "loss": 0.87271917, + "num_input_tokens_seen": 339650176, + "router_z_loss_mlp": 0.32104492, + "step": 4097, + "time_per_iteration": 2.774585485458374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065245, + "balance_loss_mlp": 1.03279638, + "epoch": 0.7883801462100808, + "flos": 588883737600.0, + "grad_norm": 0.05950523871883677, + "language_loss": 0.77641714, + "learning_rate": 0.0001129148525206402, + "loss": 0.78706962, + "num_input_tokens_seen": 339727312, + "router_z_loss_mlp": 0.32446289, + "step": 4098, + "time_per_iteration": 2.8262319564819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066849, + "balance_loss_mlp": 1.03535402, + "epoch": 0.7885725278953444, + "flos": 481475460096.0, + "grad_norm": 0.05859958093341329, + "language_loss": 0.86361545, + "learning_rate": 0.00011271772844043759, + "loss": 0.87428391, + "num_input_tokens_seen": 339801344, + "router_z_loss_mlp": 0.31469727, + "step": 4099, + "time_per_iteration": 2.6731910705566406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064893, + "balance_loss_mlp": 1.03306413, + "epoch": 0.788764909580608, + "flos": 756470126592.0, + "grad_norm": 0.05966502266655521, + "language_loss": 0.75518525, + "learning_rate": 0.00011252075471654727, + "loss": 0.76583415, + "num_input_tokens_seen": 339877840, + "router_z_loss_mlp": 0.31811523, + "step": 4100, + "time_per_iteration": 2.919638156890869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065613, + "balance_loss_mlp": 1.03294969, + "epoch": 0.7889572912658714, + "flos": 702225427968.0, + "grad_norm": 0.050441368463949324, + "language_loss": 0.77960974, + "learning_rate": 0.00011232393142544133, + "loss": 0.79026586, + "num_input_tokens_seen": 339959568, + "router_z_loss_mlp": 0.32666016, + "step": 4101, + "time_per_iteration": 2.9371449947357178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064145, + "balance_loss_mlp": 1.03188694, + "epoch": 0.789149672951135, + "flos": 736047931392.0, + "grad_norm": 0.05824722379420924, + "language_loss": 0.83012629, + "learning_rate": 0.00011212725864353323, + "loss": 0.8407678, + "num_input_tokens_seen": 340043600, + "router_z_loss_mlp": 0.32250977, + "step": 4102, + "time_per_iteration": 3.070425033569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01019214, + "balance_loss_mlp": 1.01106, + "epoch": 0.7893420546363986, + "flos": 1480626349056.0, + "grad_norm": 0.00964834437524815, + "language_loss": 0.76335925, + "learning_rate": 0.00011193073644717822, + "loss": 0.7735514, + "num_input_tokens_seen": 340270608, + "router_z_loss_mlp": 0.08154297, + "step": 4103, + "time_per_iteration": 4.87341046333313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069024, + "balance_loss_mlp": 1.03688502, + "epoch": 0.7895344363216622, + "flos": 508821875712.0, + "grad_norm": 0.06647723888078448, + "language_loss": 0.76089919, + "learning_rate": 0.00011173436491267291, + "loss": 0.77158946, + "num_input_tokens_seen": 340338784, + "router_z_loss_mlp": 0.32128906, + "step": 4104, + "time_per_iteration": 2.579040050506592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069021, + "balance_loss_mlp": 1.036906, + "epoch": 0.7897268180069258, + "flos": 541727764992.0, + "grad_norm": 0.05890584899946244, + "language_loss": 0.81946945, + "learning_rate": 0.0001115381441162554, + "loss": 0.83015972, + "num_input_tokens_seen": 340407744, + "router_z_loss_mlp": 0.32104492, + "step": 4105, + "time_per_iteration": 2.6771240234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01019188, + "balance_loss_mlp": 1.01103461, + "epoch": 0.7899191996921893, + "flos": 1411924953600.0, + "grad_norm": 0.009593800245269755, + "language_loss": 0.73583722, + "learning_rate": 0.00011134207413410557, + "loss": 0.74602914, + "num_input_tokens_seen": 340635824, + "router_z_loss_mlp": 0.08154297, + "step": 4106, + "time_per_iteration": 4.9348978996276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067573, + "balance_loss_mlp": 1.03593516, + "epoch": 0.7901115813774529, + "flos": 622547679744.0, + "grad_norm": 0.05203428042978299, + "language_loss": 0.84845543, + "learning_rate": 0.00011114615504234465, + "loss": 0.85913116, + "num_input_tokens_seen": 340710928, + "router_z_loss_mlp": 0.31616211, + "step": 4107, + "time_per_iteration": 2.78153657913208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067357, + "balance_loss_mlp": 1.03581429, + "epoch": 0.7903039630627164, + "flos": 645232296960.0, + "grad_norm": 0.05460483755610551, + "language_loss": 0.80740857, + "learning_rate": 0.00011095038691703468, + "loss": 0.81808215, + "num_input_tokens_seen": 340786128, + "router_z_loss_mlp": 0.31518555, + "step": 4108, + "time_per_iteration": 2.83954119682312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069484, + "balance_loss_mlp": 1.03829885, + "epoch": 0.79049634474798, + "flos": 594054715392.0, + "grad_norm": 0.05143854855735133, + "language_loss": 0.82689941, + "learning_rate": 0.00011075476983417998, + "loss": 0.83759421, + "num_input_tokens_seen": 340861616, + "router_z_loss_mlp": 0.31152344, + "step": 4109, + "time_per_iteration": 2.8581154346466064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069019, + "balance_loss_mlp": 1.03792906, + "epoch": 0.7906887264332435, + "flos": 715784449536.0, + "grad_norm": 0.056450839629860305, + "language_loss": 0.77744591, + "learning_rate": 0.00011055930386972579, + "loss": 0.78813612, + "num_input_tokens_seen": 340934480, + "router_z_loss_mlp": 0.31054688, + "step": 4110, + "time_per_iteration": 2.8273229598999023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071548, + "balance_loss_mlp": 1.03855133, + "epoch": 0.7908811081185071, + "flos": 789553516032.0, + "grad_norm": 0.04891253400272343, + "language_loss": 0.78669703, + "learning_rate": 0.00011036398909955863, + "loss": 0.79741246, + "num_input_tokens_seen": 341014912, + "router_z_loss_mlp": 0.33007812, + "step": 4111, + "time_per_iteration": 2.961766004562378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069451, + "balance_loss_mlp": 1.03747857, + "epoch": 0.7910734898037707, + "flos": 641612330496.0, + "grad_norm": 0.048663438809518546, + "language_loss": 0.81452119, + "learning_rate": 0.00011016882559950648, + "loss": 0.82521558, + "num_input_tokens_seen": 341090608, + "router_z_loss_mlp": 0.31958008, + "step": 4112, + "time_per_iteration": 2.8214406967163086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068413, + "balance_loss_mlp": 1.03660822, + "epoch": 0.7912658714890343, + "flos": 669057670656.0, + "grad_norm": 0.05392137662685343, + "language_loss": 0.80067742, + "learning_rate": 0.00010997381344533853, + "loss": 0.81136161, + "num_input_tokens_seen": 341160992, + "router_z_loss_mlp": 0.31787109, + "step": 4113, + "time_per_iteration": 2.811772346496582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073852, + "balance_loss_mlp": 1.04152238, + "epoch": 0.7914582531742979, + "flos": 557504128512.0, + "grad_norm": 0.0581863083981893, + "language_loss": 0.80220509, + "learning_rate": 0.00010977895271276517, + "loss": 0.81294358, + "num_input_tokens_seen": 341232032, + "router_z_loss_mlp": 0.32324219, + "step": 4114, + "time_per_iteration": 2.719431161880493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107287, + "balance_loss_mlp": 1.0409224, + "epoch": 0.7916506348595613, + "flos": 569784181248.0, + "grad_norm": 0.05018332028611806, + "language_loss": 0.7987901, + "learning_rate": 0.00010958424347743807, + "loss": 0.80951875, + "num_input_tokens_seen": 341303888, + "router_z_loss_mlp": 0.31933594, + "step": 4115, + "time_per_iteration": 2.6972670555114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106862, + "balance_loss_mlp": 1.03724396, + "epoch": 0.7918430165448249, + "flos": 717966885888.0, + "grad_norm": 0.06933669285907723, + "language_loss": 0.80126512, + "learning_rate": 0.00010938968581494991, + "loss": 0.81195128, + "num_input_tokens_seen": 341385616, + "router_z_loss_mlp": 0.31347656, + "step": 4116, + "time_per_iteration": 2.9974632263183594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069688, + "balance_loss_mlp": 1.03750205, + "epoch": 0.7920353982300885, + "flos": 553377802752.0, + "grad_norm": 0.05941447289744039, + "language_loss": 0.78879136, + "learning_rate": 0.000109195279800835, + "loss": 0.79948825, + "num_input_tokens_seen": 341460976, + "router_z_loss_mlp": 0.32177734, + "step": 4117, + "time_per_iteration": 2.710513114929199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071029, + "balance_loss_mlp": 1.03896213, + "epoch": 0.7922277799153521, + "flos": 809766125568.0, + "grad_norm": 0.05531983375516572, + "language_loss": 0.76555854, + "learning_rate": 0.00010900102551056834, + "loss": 0.77626884, + "num_input_tokens_seen": 341537328, + "router_z_loss_mlp": 0.32055664, + "step": 4118, + "time_per_iteration": 3.0103225708007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069951, + "balance_loss_mlp": 1.03766966, + "epoch": 0.7924201616006156, + "flos": 421128612864.0, + "grad_norm": 0.05482547351078549, + "language_loss": 0.84337735, + "learning_rate": 0.00010880692301956601, + "loss": 0.85407686, + "num_input_tokens_seen": 341600272, + "router_z_loss_mlp": 0.32275391, + "step": 4119, + "time_per_iteration": 2.445122003555298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069691, + "balance_loss_mlp": 1.03707528, + "epoch": 0.7926125432858792, + "flos": 617541055488.0, + "grad_norm": 0.04369868110465695, + "language_loss": 0.86072242, + "learning_rate": 0.00010861297240318518, + "loss": 0.87141925, + "num_input_tokens_seen": 341682096, + "router_z_loss_mlp": 0.32617188, + "step": 4120, + "time_per_iteration": 2.85048508644104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067992, + "balance_loss_mlp": 1.03656876, + "epoch": 0.7928049249711427, + "flos": 602207032320.0, + "grad_norm": 0.05006458241333452, + "language_loss": 0.86780667, + "learning_rate": 0.00010841917373672444, + "loss": 0.87848663, + "num_input_tokens_seen": 341754912, + "router_z_loss_mlp": 0.31396484, + "step": 4121, + "time_per_iteration": 2.704904794692993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067696, + "balance_loss_mlp": 1.03570032, + "epoch": 0.7929973066564063, + "flos": 655724201472.0, + "grad_norm": 0.05319226556655214, + "language_loss": 0.78318095, + "learning_rate": 0.00010822552709542293, + "loss": 0.79385787, + "num_input_tokens_seen": 341831152, + "router_z_loss_mlp": 0.31982422, + "step": 4122, + "time_per_iteration": 2.8160955905914307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069953, + "balance_loss_mlp": 1.03814769, + "epoch": 0.7931896883416699, + "flos": 536139177984.0, + "grad_norm": 0.04444307991995564, + "language_loss": 0.85812402, + "learning_rate": 0.0001080320325544612, + "loss": 0.86882365, + "num_input_tokens_seen": 341903552, + "router_z_loss_mlp": 0.31787109, + "step": 4123, + "time_per_iteration": 2.6734302043914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067197, + "balance_loss_mlp": 1.03594005, + "epoch": 0.7933820700269334, + "flos": 497836758528.0, + "grad_norm": 0.04986309312867086, + "language_loss": 0.82817209, + "learning_rate": 0.00010783869018895997, + "loss": 0.838844, + "num_input_tokens_seen": 341972256, + "router_z_loss_mlp": 0.31225586, + "step": 4124, + "time_per_iteration": 2.578643321990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067226, + "balance_loss_mlp": 1.03506327, + "epoch": 0.793574451712197, + "flos": 537217325568.0, + "grad_norm": 0.05142590484857824, + "language_loss": 0.84177709, + "learning_rate": 0.00010764550007398189, + "loss": 0.8524493, + "num_input_tokens_seen": 342040496, + "router_z_loss_mlp": 0.3215332, + "step": 4125, + "time_per_iteration": 2.668468475341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065806, + "balance_loss_mlp": 1.03419125, + "epoch": 0.7937668333974606, + "flos": 488043270144.0, + "grad_norm": 0.048489850781485225, + "language_loss": 0.81036043, + "learning_rate": 0.00010745246228452982, + "loss": 0.82101846, + "num_input_tokens_seen": 342108512, + "router_z_loss_mlp": 0.31591797, + "step": 4126, + "time_per_iteration": 2.5388453006744385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106876, + "balance_loss_mlp": 1.0364542, + "epoch": 0.7939592150827242, + "flos": 527163379200.0, + "grad_norm": 0.05117583653255347, + "language_loss": 0.81550407, + "learning_rate": 0.00010725957689554771, + "loss": 0.82619166, + "num_input_tokens_seen": 342183568, + "router_z_loss_mlp": 0.32299805, + "step": 4127, + "time_per_iteration": 2.7774598598480225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065815, + "balance_loss_mlp": 1.03353345, + "epoch": 0.7941515967679876, + "flos": 541428019200.0, + "grad_norm": 0.13198996647770603, + "language_loss": 0.84346122, + "learning_rate": 0.00010706684398192013, + "loss": 0.85411942, + "num_input_tokens_seen": 342259920, + "router_z_loss_mlp": 0.32275391, + "step": 4128, + "time_per_iteration": 2.6948909759521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068056, + "balance_loss_mlp": 1.03555918, + "epoch": 0.7943439784532512, + "flos": 518104622592.0, + "grad_norm": 0.05568877803614168, + "language_loss": 0.81997395, + "learning_rate": 0.00010687426361847313, + "loss": 0.8306545, + "num_input_tokens_seen": 342330192, + "router_z_loss_mlp": 0.32495117, + "step": 4129, + "time_per_iteration": 2.693753957748413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069811, + "balance_loss_mlp": 1.0384829, + "epoch": 0.7945363601385148, + "flos": 508768031232.0, + "grad_norm": 0.052703179932938445, + "language_loss": 0.85951877, + "learning_rate": 0.00010668183587997254, + "loss": 0.87021685, + "num_input_tokens_seen": 342398944, + "router_z_loss_mlp": 0.31298828, + "step": 4130, + "time_per_iteration": 2.5763041973114014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069665, + "balance_loss_mlp": 1.03731203, + "epoch": 0.7947287418237784, + "flos": 650918398464.0, + "grad_norm": 0.061493260737887565, + "language_loss": 0.77379823, + "learning_rate": 0.0001064895608411256, + "loss": 0.78449482, + "num_input_tokens_seen": 342474000, + "router_z_loss_mlp": 0.32348633, + "step": 4131, + "time_per_iteration": 2.763904333114624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068483, + "balance_loss_mlp": 1.03620124, + "epoch": 0.794921123509042, + "flos": 695726019072.0, + "grad_norm": 0.07934957130099038, + "language_loss": 0.80297732, + "learning_rate": 0.00010629743857657998, + "loss": 0.81366217, + "num_input_tokens_seen": 342549184, + "router_z_loss_mlp": 0.32275391, + "step": 4132, + "time_per_iteration": 2.933009386062622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01019333, + "balance_loss_mlp": 1.01117909, + "epoch": 0.7951135051943055, + "flos": 1402161988608.0, + "grad_norm": 0.006928845772435826, + "language_loss": 0.70598668, + "learning_rate": 0.0001061054691609244, + "loss": 0.71618003, + "num_input_tokens_seen": 342767376, + "router_z_loss_mlp": 0.08154297, + "step": 4133, + "time_per_iteration": 4.611080884933472 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067103, + "balance_loss_mlp": 1.03560841, + "epoch": 0.795305886879569, + "flos": 809745776640.0, + "grad_norm": 0.059789926396459823, + "language_loss": 0.81835663, + "learning_rate": 0.00010591365266868802, + "loss": 0.82902765, + "num_input_tokens_seen": 342845024, + "router_z_loss_mlp": 0.31469727, + "step": 4134, + "time_per_iteration": 2.9697659015655518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016843, + "balance_loss_mlp": 1.00873721, + "epoch": 0.7954982685648326, + "flos": 1425205988352.0, + "grad_norm": 0.006305006479863361, + "language_loss": 0.75511783, + "learning_rate": 0.00010572198917434018, + "loss": 0.76528627, + "num_input_tokens_seen": 343072496, + "router_z_loss_mlp": 0.08105469, + "step": 4135, + "time_per_iteration": 4.8860838413238525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068161, + "balance_loss_mlp": 1.03547359, + "epoch": 0.7956906502500962, + "flos": 389670428160.0, + "grad_norm": 0.055642824897664006, + "language_loss": 0.79057562, + "learning_rate": 0.00010553047875229166, + "loss": 0.80125725, + "num_input_tokens_seen": 343136928, + "router_z_loss_mlp": 0.3269043, + "step": 4136, + "time_per_iteration": 2.5156140327453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065548, + "balance_loss_mlp": 1.03359985, + "epoch": 0.7958830319353598, + "flos": 515321284608.0, + "grad_norm": 0.05406078670363032, + "language_loss": 0.83169937, + "learning_rate": 0.00010533912147689328, + "loss": 0.84235483, + "num_input_tokens_seen": 343207440, + "router_z_loss_mlp": 0.31933594, + "step": 4137, + "time_per_iteration": 2.613961696624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064795, + "balance_loss_mlp": 1.03296661, + "epoch": 0.7960754136206233, + "flos": 493695876096.0, + "grad_norm": 0.050232390896865514, + "language_loss": 0.82344103, + "learning_rate": 0.00010514791742243656, + "loss": 0.83408904, + "num_input_tokens_seen": 343273744, + "router_z_loss_mlp": 0.31811523, + "step": 4138, + "time_per_iteration": 2.5978379249572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106016, + "balance_loss_mlp": 1.02813983, + "epoch": 0.7962677953058869, + "flos": 655409899008.0, + "grad_norm": 0.05370274741433686, + "language_loss": 0.82677209, + "learning_rate": 0.00010495686666315341, + "loss": 0.83737361, + "num_input_tokens_seen": 343357648, + "router_z_loss_mlp": 0.32006836, + "step": 4139, + "time_per_iteration": 2.872997283935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062257, + "balance_loss_mlp": 1.03088117, + "epoch": 0.7964601769911505, + "flos": 542126435328.0, + "grad_norm": 0.05348146063522791, + "language_loss": 0.77502406, + "learning_rate": 0.00010476596927321635, + "loss": 0.78564668, + "num_input_tokens_seen": 343425344, + "router_z_loss_mlp": 0.31347656, + "step": 4140, + "time_per_iteration": 2.620577812194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064605, + "balance_loss_mlp": 1.0327282, + "epoch": 0.796652558676414, + "flos": 537356947968.0, + "grad_norm": 0.042260612329337484, + "language_loss": 0.80177677, + "learning_rate": 0.00010457522532673835, + "loss": 0.81242287, + "num_input_tokens_seen": 343504960, + "router_z_loss_mlp": 0.31860352, + "step": 4141, + "time_per_iteration": 2.7778780460357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066312, + "balance_loss_mlp": 1.03419721, + "epoch": 0.7968449403616775, + "flos": 474852395520.0, + "grad_norm": 0.061301631429393516, + "language_loss": 0.82973599, + "learning_rate": 0.00010438463489777272, + "loss": 0.84039915, + "num_input_tokens_seen": 343570832, + "router_z_loss_mlp": 0.32104492, + "step": 4142, + "time_per_iteration": 2.579953908920288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064314, + "balance_loss_mlp": 1.03157902, + "epoch": 0.7970373220469411, + "flos": 567336904704.0, + "grad_norm": 0.06081943760353449, + "language_loss": 0.77709621, + "learning_rate": 0.00010419419806031316, + "loss": 0.7877394, + "num_input_tokens_seen": 343639808, + "router_z_loss_mlp": 0.32739258, + "step": 4143, + "time_per_iteration": 2.6624910831451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066036, + "balance_loss_mlp": 1.03477979, + "epoch": 0.7972297037322047, + "flos": 555924003840.0, + "grad_norm": 0.05994376344115418, + "language_loss": 0.83806866, + "learning_rate": 0.00010400391488829403, + "loss": 0.84872901, + "num_input_tokens_seen": 343715232, + "router_z_loss_mlp": 0.31225586, + "step": 4144, + "time_per_iteration": 2.774700880050659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063404, + "balance_loss_mlp": 1.03157544, + "epoch": 0.7974220854174683, + "flos": 575899476480.0, + "grad_norm": 0.04407421907789105, + "language_loss": 0.86373734, + "learning_rate": 0.00010381378545558984, + "loss": 0.87437141, + "num_input_tokens_seen": 343787168, + "router_z_loss_mlp": 0.31811523, + "step": 4145, + "time_per_iteration": 2.686239004135132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065318, + "balance_loss_mlp": 1.03301203, + "epoch": 0.7976144671027319, + "flos": 482824240128.0, + "grad_norm": 0.047216774900369206, + "language_loss": 0.8480643, + "learning_rate": 0.00010362380983601505, + "loss": 0.85871744, + "num_input_tokens_seen": 343853600, + "router_z_loss_mlp": 0.32299805, + "step": 4146, + "time_per_iteration": 2.533198833465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062521, + "balance_loss_mlp": 1.03102612, + "epoch": 0.7978068487879953, + "flos": 1077420372480.0, + "grad_norm": 0.04375196843804429, + "language_loss": 0.78552485, + "learning_rate": 0.00010343398810332477, + "loss": 0.79615009, + "num_input_tokens_seen": 343942816, + "router_z_loss_mlp": 0.31469727, + "step": 4147, + "time_per_iteration": 3.451004981994629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106262, + "balance_loss_mlp": 1.03007627, + "epoch": 0.7979992304732589, + "flos": 733421744640.0, + "grad_norm": 0.06305718879587498, + "language_loss": 0.84127843, + "learning_rate": 0.00010324432033121467, + "loss": 0.85190463, + "num_input_tokens_seen": 344021232, + "router_z_loss_mlp": 0.32543945, + "step": 4148, + "time_per_iteration": 2.890085220336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065874, + "balance_loss_mlp": 1.03349686, + "epoch": 0.7981916121585225, + "flos": 415531261440.0, + "grad_norm": 0.050318448147633754, + "language_loss": 0.83318138, + "learning_rate": 0.00010305480659332005, + "loss": 0.84384012, + "num_input_tokens_seen": 344089616, + "router_z_loss_mlp": 0.32373047, + "step": 4149, + "time_per_iteration": 2.588676929473877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063088, + "balance_loss_mlp": 1.03133059, + "epoch": 0.7983839938437861, + "flos": 465019619328.0, + "grad_norm": 0.06596514407169883, + "language_loss": 0.83595121, + "learning_rate": 0.00010286544696321682, + "loss": 0.84658206, + "num_input_tokens_seen": 344154992, + "router_z_loss_mlp": 0.31738281, + "step": 4150, + "time_per_iteration": 2.546215772628784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064473, + "balance_loss_mlp": 1.03304911, + "epoch": 0.7985763755290496, + "flos": 510304485888.0, + "grad_norm": 0.05480976519736011, + "language_loss": 0.79303128, + "learning_rate": 0.00010267624151442073, + "loss": 0.80367601, + "num_input_tokens_seen": 344225232, + "router_z_loss_mlp": 0.31396484, + "step": 4151, + "time_per_iteration": 2.620140790939331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062507, + "balance_loss_mlp": 1.03077376, + "epoch": 0.7987687572143132, + "flos": 1010243847168.0, + "grad_norm": 0.05583504275555366, + "language_loss": 0.81259573, + "learning_rate": 0.000102487190320388, + "loss": 0.82322085, + "num_input_tokens_seen": 344309120, + "router_z_loss_mlp": 0.31713867, + "step": 4152, + "time_per_iteration": 3.3063504695892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064782, + "balance_loss_mlp": 1.03247619, + "epoch": 0.7989611388995768, + "flos": 1020662968320.0, + "grad_norm": 0.05403781232268857, + "language_loss": 0.79678059, + "learning_rate": 0.00010229829345451475, + "loss": 0.80742842, + "num_input_tokens_seen": 344394112, + "router_z_loss_mlp": 0.32299805, + "step": 4153, + "time_per_iteration": 3.301403522491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064656, + "balance_loss_mlp": 1.03237379, + "epoch": 0.7991535205848403, + "flos": 1100915476992.0, + "grad_norm": 0.05303368831267737, + "language_loss": 0.79783893, + "learning_rate": 0.00010210955099013724, + "loss": 0.80848551, + "num_input_tokens_seen": 344476512, + "router_z_loss_mlp": 0.32275391, + "step": 4154, + "time_per_iteration": 3.383039712905884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065536, + "balance_loss_mlp": 1.03301597, + "epoch": 0.7993459022701039, + "flos": 834454268928.0, + "grad_norm": 0.06456924160427363, + "language_loss": 0.76284033, + "learning_rate": 0.00010192096300053167, + "loss": 0.77349567, + "num_input_tokens_seen": 344561088, + "router_z_loss_mlp": 0.32519531, + "step": 4155, + "time_per_iteration": 3.0697450637817383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061021, + "balance_loss_mlp": 1.02928793, + "epoch": 0.7995382839553674, + "flos": 522417212928.0, + "grad_norm": 0.04699781712080769, + "language_loss": 0.851726, + "learning_rate": 0.00010173252955891477, + "loss": 0.86233628, + "num_input_tokens_seen": 344639424, + "router_z_loss_mlp": 0.31713867, + "step": 4156, + "time_per_iteration": 2.7266414165496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106594, + "balance_loss_mlp": 1.03389633, + "epoch": 0.799730665640631, + "flos": 537562151424.0, + "grad_norm": 0.059253037565978675, + "language_loss": 0.73188376, + "learning_rate": 0.00010154425073844253, + "loss": 0.7425431, + "num_input_tokens_seen": 344710048, + "router_z_loss_mlp": 0.3203125, + "step": 4157, + "time_per_iteration": 2.6836955547332764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067435, + "balance_loss_mlp": 1.0347718, + "epoch": 0.7999230473258946, + "flos": 504809031168.0, + "grad_norm": 0.050604408560098985, + "language_loss": 0.82231861, + "learning_rate": 0.00010135612661221138, + "loss": 0.83299297, + "num_input_tokens_seen": 344776832, + "router_z_loss_mlp": 0.32666016, + "step": 4158, + "time_per_iteration": 2.5858490467071533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061337, + "balance_loss_mlp": 1.02903104, + "epoch": 0.8001154290111582, + "flos": 1026935414784.0, + "grad_norm": 0.07154666191877361, + "language_loss": 0.81335956, + "learning_rate": 0.00010116815725325751, + "loss": 0.82397294, + "num_input_tokens_seen": 344864928, + "router_z_loss_mlp": 0.32299805, + "step": 4159, + "time_per_iteration": 3.30757474899292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063379, + "balance_loss_mlp": 1.03073967, + "epoch": 0.8003078106964217, + "flos": 750567237120.0, + "grad_norm": 0.05734149006242142, + "language_loss": 0.80527955, + "learning_rate": 0.00010098034273455725, + "loss": 0.81591332, + "num_input_tokens_seen": 344944048, + "router_z_loss_mlp": 0.32641602, + "step": 4160, + "time_per_iteration": 2.9547767639160156 + } + ], + "logging_steps": 1.0, + "max_steps": 5198, + "num_input_tokens_seen": 344944048, + "num_train_epochs": 1, + "save_steps": 1040, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9383636514111488.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/training_args.bin b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..459663e238ea62a90da439e633388cc1e16cedb6 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63f07a99639c8908760dc7ac65f4d34d749c1861fc4b5a1f91cbdcc73581ce9e +size 7992 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/zero_to_fp32.py b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-4160/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/added_tokens.json b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/config.json b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/config.json new file mode 100644 index 0000000000000000000000000000000000000000..7929d4cdbe9bb7ee3537b93d161990a8caa422ec --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/config.json @@ -0,0 +1,200 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bal_comp_loss_coef": 0.01, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": false, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": false, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "smoe_sharev3", + "norm_softmax": false, + "normalization": false, + "num_attention_heads": 32, + "num_experts": 8, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 4, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": null, + "rate_compete": 0.2, + "rate_flip": 0.05, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": false, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": false, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/generation_config.json b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/global_step5198/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/global_step5198/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6a4089f7825d081faa2f50e84e016de6c5b6e1d1 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/global_step5198/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1ee1b302c4f721ae6f8352b4e1fb3bdcc4b379ec52d1263cad847c987d6cb3b +size 396575120 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/global_step5198/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/global_step5198/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..44e461ca9f6d47d978904f91ef5ca834960d4c0b --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/global_step5198/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc0359600b10b71411216be99a437d708510ef0ccb2fc77e4f50b2f9f7172283 +size 396575120 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/global_step5198/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/global_step5198/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e553ba57bfebe52355e4b6c058b8bc76ec4b7ec --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/global_step5198/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ef120bdd9cb4d6e6dbd6db6e255b6777d8648945245e808f62c0f214a72f317 +size 396575120 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/global_step5198/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/global_step5198/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0f69f01c4324101b010275bad0b0e9e7a51f3738 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/global_step5198/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee3996c8a64188470bca5f2e5224db1e290174d11e09a282d9b7a3d9230491b8 +size 396575120 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/global_step5198/zero_pp_rank_0_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/global_step5198/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..531a098a73a1eb7b46455bfe464742b844ecb4aa --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/global_step5198/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e716b9aa1a1fe5aca85db5eb21719ce63e7e2aa0e4383c36a51dbfc2b8c9aaa1 +size 2117321480 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/global_step5198/zero_pp_rank_1_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/global_step5198/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8c7cda31bd4ca6548918b66d58a78cc1fbef3e84 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/global_step5198/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68366233591d186b4d870e9f205effe05bbc7c9b37a280a7bbe29953c4277d2a +size 2117321480 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/global_step5198/zero_pp_rank_2_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/global_step5198/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..34fe8da4d9ae8549b17a11db6440dff464831c11 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/global_step5198/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17aea391fd94f2cdd4986e71ee8b00ca91f6c09242bcfa921d74df60e519853b +size 2117321480 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/global_step5198/zero_pp_rank_3_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/global_step5198/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d007a90149b6cb32aef7371685feee0f2115c0d0 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/global_step5198/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a32b1c25e35577f36d60f8aeeb60e6e5d1f0aaa68f010ea8902a160b729995ae +size 2117321480 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/latest b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/latest new file mode 100644 index 0000000000000000000000000000000000000000..c0e63763d1d13a0ca7a3b62ff8f5cd1d69cc4978 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/latest @@ -0,0 +1 @@ +global_step5198 \ No newline at end of file diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/model-00001-of-00002.safetensors b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29d76f5d80605301aab2bba59b53a5e2582094c4 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe6c4f6ef38e8993629091331e0bbf23484cc88bdfd038f0dd17b6ec2800d855 +size 4972489328 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/model-00002-of-00002.safetensors b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a5850e0e33fb7be370bb086085a9d3bf29450a73 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7df115f383ed31a5a9b7c49a81df7f755bfb40161e10c12560c7bc1dd60f2330 +size 3759020544 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/model.safetensors.index.json b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..90a56cf0153ed9062ff35ee2b372f7ab9e796c6a --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/model.safetensors.index.json @@ -0,0 +1,672 @@ +{ + "metadata": { + "total_size": 8731420128 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.mm_projector.layer_norm.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.layer_norm.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00002-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/rng_state_0.pth b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ef4849062bcdc8ffd2246c07673ba196a8d61a6d --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fae2114fffe9b1eea30e28bbdb4ce59046b0079ea5b8dc4682079f609d49d787 +size 14960 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/rng_state_1.pth b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..2fcb2b640bc236c26aa841680d34a91240247970 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4ff5f3a53530ac868291e2667c8f824bfa1f4fa1ce880df8223a7165ef38e11 +size 14960 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/rng_state_2.pth b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..00c3f989de00e6d58ca7345ae6f65fee0afcbdcd --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91f80a7779b0034e70106ba6cb0e3e686052334c20ce54453ee3977cc0219d15 +size 14960 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/rng_state_3.pth b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..f289913854ee3fa52a86e282421da07d85b8a4c4 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ece3bc0d0e16c43ef245cc787cbd0d63d08d460f489c4cd52adf6501b9281a18 +size 14960 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/special_tokens_map.json b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/tokenizer.model b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/tokenizer_config.json b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/trainer_state.json b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c1fe4d602f57c451a711c2520e350c7c8aa8a4b6 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/trainer_state.json @@ -0,0 +1,78003 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 5198, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03964023, + "balance_loss_mlp": 3.01339984, + "epoch": 0.00019238168526356292, + "flos": 470464353792.0, + "grad_norm": 27.10233905437441, + "language_loss": 3.72295761, + "learning_rate": 0.0, + "loss": 2.48840261, + "num_input_tokens_seen": 67104, + "router_z_loss_mlp": 9.5, + "step": 1, + "time_per_iteration": 29.606513023376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01906453, + "balance_loss_mlp": 1.25642872, + "epoch": 0.00038476337052712584, + "flos": 504311436288.0, + "grad_norm": 2.874173750989579, + "language_loss": 1.79264998, + "learning_rate": 0.00013726078121135892, + "loss": 1.81171465, + "num_input_tokens_seen": 134080, + "router_z_loss_mlp": 6.5, + "step": 2, + "time_per_iteration": 2.7078208923339844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01915611, + "balance_loss_mlp": 1.2667315, + "epoch": 0.0005771450557906887, + "flos": 598869282816.0, + "grad_norm": 2.1141296462778643, + "language_loss": 1.61429811, + "learning_rate": 0.00021755319103969496, + "loss": 1.63345432, + "num_input_tokens_seen": 205152, + "router_z_loss_mlp": 6.48828125, + "step": 3, + "time_per_iteration": 3.010409116744995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01917319, + "balance_loss_mlp": 1.26309848, + "epoch": 0.0007695267410542517, + "flos": 580133491200.0, + "grad_norm": 1.255159247360545, + "language_loss": 1.49202251, + "learning_rate": 0.00027452156242271784, + "loss": 1.51119578, + "num_input_tokens_seen": 269664, + "router_z_loss_mlp": 6.54296875, + "step": 4, + "time_per_iteration": 2.7161622047424316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0185163, + "balance_loss_mlp": 1.22144234, + "epoch": 0.0009619084263178145, + "flos": 485857861632.0, + "grad_norm": 4.267520959606063, + "language_loss": 1.57359505, + "learning_rate": 0.0003187096642208417, + "loss": 1.59211147, + "num_input_tokens_seen": 338560, + "router_z_loss_mlp": 6.296875, + "step": 5, + "time_per_iteration": 2.718417167663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01828185, + "balance_loss_mlp": 1.21211123, + "epoch": 0.0011542901115813775, + "flos": 559744791552.0, + "grad_norm": 1.225349312557607, + "language_loss": 1.4752574, + "learning_rate": 0.0003548139722510539, + "loss": 1.49353933, + "num_input_tokens_seen": 410112, + "router_z_loss_mlp": 6.15234375, + "step": 6, + "time_per_iteration": 2.6827874183654785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01821666, + "balance_loss_mlp": 1.22428453, + "epoch": 0.0013466717968449403, + "flos": 533721014784.0, + "grad_norm": 0.5025899606895544, + "language_loss": 1.33846116, + "learning_rate": 0.00038533972973918044, + "loss": 1.35667801, + "num_input_tokens_seen": 477552, + "router_z_loss_mlp": 5.96875, + "step": 7, + "time_per_iteration": 2.6889517307281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01776667, + "balance_loss_mlp": 1.2090404, + "epoch": 0.0015390534821085034, + "flos": 492037175808.0, + "grad_norm": 0.1719820928967348, + "language_loss": 1.2814672, + "learning_rate": 0.0004117823436340768, + "loss": 1.29923391, + "num_input_tokens_seen": 549184, + "router_z_loss_mlp": 5.6875, + "step": 8, + "time_per_iteration": 2.7207248210906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0177577, + "balance_loss_mlp": 1.23217535, + "epoch": 0.0017314351673720662, + "flos": 564402207744.0, + "grad_norm": 0.6128716609675008, + "language_loss": 1.39861906, + "learning_rate": 0.00043510638207938993, + "loss": 1.41637683, + "num_input_tokens_seen": 622880, + "router_z_loss_mlp": 5.44140625, + "step": 9, + "time_per_iteration": 2.887538194656372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01823371, + "balance_loss_mlp": 1.31334615, + "epoch": 0.001923816852635629, + "flos": 593132308992.0, + "grad_norm": 0.480897383035181, + "language_loss": 1.25963569, + "learning_rate": 0.00045597044543220066, + "loss": 1.27786922, + "num_input_tokens_seen": 693584, + "router_z_loss_mlp": 5.09765625, + "step": 10, + "time_per_iteration": 2.7672832012176514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01930298, + "balance_loss_mlp": 1.44621277, + "epoch": 0.002116198537899192, + "flos": 609308752896.0, + "grad_norm": 0.21803247425844502, + "language_loss": 1.22959518, + "learning_rate": 0.00047484428652143135, + "loss": 1.24889803, + "num_input_tokens_seen": 774432, + "router_z_loss_mlp": 4.83203125, + "step": 11, + "time_per_iteration": 2.9771082401275635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02130152, + "balance_loss_mlp": 1.67772901, + "epoch": 0.002308580223162755, + "flos": 544869075456.0, + "grad_norm": 0.19847359144835577, + "language_loss": 1.28057694, + "learning_rate": 0.0004920747534624128, + "loss": 1.30187845, + "num_input_tokens_seen": 844304, + "router_z_loss_mlp": 4.52734375, + "step": 12, + "time_per_iteration": 2.6094090938568115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02177014, + "balance_loss_mlp": 1.7512939, + "epoch": 0.002500961908426318, + "flos": 644458277376.0, + "grad_norm": 0.3126355826019607, + "language_loss": 1.29235363, + "learning_rate": 0.0005079252465375872, + "loss": 1.31412375, + "num_input_tokens_seen": 915104, + "router_z_loss_mlp": 4.265625, + "step": 13, + "time_per_iteration": 2.841792345046997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02141221, + "balance_loss_mlp": 1.74411082, + "epoch": 0.0026933435936898806, + "flos": 487605312000.0, + "grad_norm": 0.282411779716686, + "language_loss": 1.17459798, + "learning_rate": 0.0005226005109505393, + "loss": 1.19601011, + "num_input_tokens_seen": 982720, + "router_z_loss_mlp": 3.96875, + "step": 14, + "time_per_iteration": 2.597313165664673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02024541, + "balance_loss_mlp": 1.65890288, + "epoch": 0.0028857252789534437, + "flos": 434368949760.0, + "grad_norm": 0.2583476739022616, + "language_loss": 1.22957516, + "learning_rate": 0.0005362628552605367, + "loss": 1.24982059, + "num_input_tokens_seen": 1050528, + "router_z_loss_mlp": 3.65234375, + "step": 15, + "time_per_iteration": 2.6388704776763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01790575, + "balance_loss_mlp": 1.44687057, + "epoch": 0.0030781069642170067, + "flos": 596465676288.0, + "grad_norm": 0.18613747071639053, + "language_loss": 1.27631426, + "learning_rate": 0.0005490431248454357, + "loss": 1.29421997, + "num_input_tokens_seen": 1116512, + "router_z_loss_mlp": 3.44140625, + "step": 16, + "time_per_iteration": 2.708346128463745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01779165, + "balance_loss_mlp": 1.46941185, + "epoch": 0.0032704886494805694, + "flos": 1537360432128.0, + "grad_norm": 0.2733785965407311, + "language_loss": 0.75705111, + "learning_rate": 0.0005610483427624225, + "loss": 0.77484274, + "num_input_tokens_seen": 1351216, + "router_z_loss_mlp": 3.09375, + "step": 17, + "time_per_iteration": 6.916250705718994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01553778, + "balance_loss_mlp": 1.24955583, + "epoch": 0.0034628703347441324, + "flos": 473720403456.0, + "grad_norm": 0.11658431553946913, + "language_loss": 1.14468098, + "learning_rate": 0.0005723671632907488, + "loss": 1.16021872, + "num_input_tokens_seen": 1420512, + "router_z_loss_mlp": 3.03710938, + "step": 18, + "time_per_iteration": 2.7716212272644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01490625, + "balance_loss_mlp": 1.21005416, + "epoch": 0.0036552520200076955, + "flos": 448303320576.0, + "grad_norm": 0.11552730485963776, + "language_loss": 1.19723654, + "learning_rate": 0.0005830738490244919, + "loss": 1.21214283, + "num_input_tokens_seen": 1484976, + "router_z_loss_mlp": 2.80859375, + "step": 19, + "time_per_iteration": 2.6067557334899902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0141948, + "balance_loss_mlp": 1.16103387, + "epoch": 0.003847633705271258, + "flos": 635881148928.0, + "grad_norm": 0.11977740619668105, + "language_loss": 1.21676993, + "learning_rate": 0.0005932312266435596, + "loss": 1.23096466, + "num_input_tokens_seen": 1557392, + "router_z_loss_mlp": 2.58398438, + "step": 20, + "time_per_iteration": 2.8545703887939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01364308, + "balance_loss_mlp": 1.13084817, + "epoch": 0.004040015390534821, + "flos": 589222771200.0, + "grad_norm": 0.09935322828728523, + "language_loss": 1.16681409, + "learning_rate": 0.0006028929207788754, + "loss": 1.18045723, + "num_input_tokens_seen": 1626064, + "router_z_loss_mlp": 2.33203125, + "step": 21, + "time_per_iteration": 2.7119524478912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319718, + "balance_loss_mlp": 1.11038613, + "epoch": 0.004232397075798384, + "flos": 756253338624.0, + "grad_norm": 0.09023283304690737, + "language_loss": 1.20250762, + "learning_rate": 0.0006121050677327902, + "loss": 1.21570492, + "num_input_tokens_seen": 1696528, + "router_z_loss_mlp": 2.09667969, + "step": 22, + "time_per_iteration": 2.884739398956299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01304467, + "balance_loss_mlp": 1.1184051, + "epoch": 0.004424778761061947, + "flos": 526434439680.0, + "grad_norm": 0.08559602389751407, + "language_loss": 1.10067201, + "learning_rate": 0.0006209076479463684, + "loss": 1.1137166, + "num_input_tokens_seen": 1765936, + "router_z_loss_mlp": 1.85839844, + "step": 23, + "time_per_iteration": 2.6616718769073486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01275434, + "balance_loss_mlp": 1.10787356, + "epoch": 0.00461716044632551, + "flos": 547907079168.0, + "grad_norm": 0.07141137445072718, + "language_loss": 1.2012924, + "learning_rate": 0.0006293355346737718, + "loss": 1.21404672, + "num_input_tokens_seen": 1841632, + "router_z_loss_mlp": 1.67675781, + "step": 24, + "time_per_iteration": 2.7025952339172363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252583, + "balance_loss_mlp": 1.10476315, + "epoch": 0.004809542131589073, + "flos": 567293234688.0, + "grad_norm": 0.08524381015789384, + "language_loss": 1.16738653, + "learning_rate": 0.0006374193284416834, + "loss": 1.17991233, + "num_input_tokens_seen": 1920256, + "router_z_loss_mlp": 1.47753906, + "step": 25, + "time_per_iteration": 2.827439069747925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223638, + "balance_loss_mlp": 1.0984205, + "epoch": 0.005001923816852636, + "flos": 470391418368.0, + "grad_norm": 0.08512374611478205, + "language_loss": 1.15399337, + "learning_rate": 0.0006451860277489461, + "loss": 1.16622972, + "num_input_tokens_seen": 1986528, + "router_z_loss_mlp": 1.25097656, + "step": 26, + "time_per_iteration": 2.6214864253997803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206141, + "balance_loss_mlp": 1.10009253, + "epoch": 0.005194305502116198, + "flos": 415283950080.0, + "grad_norm": 0.07774032731783902, + "language_loss": 1.23061514, + "learning_rate": 0.0006526595731190848, + "loss": 1.2426765, + "num_input_tokens_seen": 2048016, + "router_z_loss_mlp": 1.0625, + "step": 27, + "time_per_iteration": 2.5637125968933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117904, + "balance_loss_mlp": 1.09192181, + "epoch": 0.005386687187379761, + "flos": 628466535936.0, + "grad_norm": 0.05524077436438855, + "language_loss": 1.1626848, + "learning_rate": 0.0006598612921618983, + "loss": 1.17447519, + "num_input_tokens_seen": 2127664, + "router_z_loss_mlp": 0.87158203, + "step": 28, + "time_per_iteration": 2.8202784061431885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159441, + "balance_loss_mlp": 1.08772469, + "epoch": 0.005579068872643324, + "flos": 886100332032.0, + "grad_norm": 0.07386109802626846, + "language_loss": 1.08505416, + "learning_rate": 0.0006668102665011454, + "loss": 1.09664845, + "num_input_tokens_seen": 2213952, + "router_z_loss_mlp": 0.71728516, + "step": 29, + "time_per_iteration": 3.2254040241241455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154142, + "balance_loss_mlp": 1.09520459, + "epoch": 0.005771450557906887, + "flos": 547287238656.0, + "grad_norm": 0.0797557646441396, + "language_loss": 1.18077409, + "learning_rate": 0.0006735236364718957, + "loss": 1.19231534, + "num_input_tokens_seen": 2284736, + "router_z_loss_mlp": 0.58886719, + "step": 30, + "time_per_iteration": 2.6730945110321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140737, + "balance_loss_mlp": 1.09384, + "epoch": 0.00596383224317045, + "flos": 531766950912.0, + "grad_norm": 0.060827451674393726, + "language_loss": 1.1687839, + "learning_rate": 0.0006800168558381346, + "loss": 1.18019128, + "num_input_tokens_seen": 2354384, + "router_z_loss_mlp": 0.46875, + "step": 31, + "time_per_iteration": 2.649216651916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148736, + "balance_loss_mlp": 1.11166239, + "epoch": 0.0061562139284340135, + "flos": 588813926400.0, + "grad_norm": 0.10592463777190406, + "language_loss": 1.19211543, + "learning_rate": 0.0006863039060567947, + "loss": 1.20360279, + "num_input_tokens_seen": 2419440, + "router_z_loss_mlp": 0.37084961, + "step": 32, + "time_per_iteration": 2.6697018146514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132499, + "balance_loss_mlp": 1.10136151, + "epoch": 0.006348595613697576, + "flos": 617929551360.0, + "grad_norm": 0.09812744917576391, + "language_loss": 1.1217525, + "learning_rate": 0.0006923974775611263, + "loss": 1.13307738, + "num_input_tokens_seen": 2496368, + "router_z_loss_mlp": 0.3112793, + "step": 33, + "time_per_iteration": 2.770225763320923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137532, + "balance_loss_mlp": 1.11146092, + "epoch": 0.006540977298961139, + "flos": 777564444672.0, + "grad_norm": 0.06513543096417564, + "language_loss": 1.08375585, + "learning_rate": 0.0006983091239737814, + "loss": 1.09513116, + "num_input_tokens_seen": 2573280, + "router_z_loss_mlp": 0.26086426, + "step": 34, + "time_per_iteration": 2.99418306350708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128276, + "balance_loss_mlp": 1.10578084, + "epoch": 0.006733358984224702, + "flos": 666837356544.0, + "grad_norm": 0.06344935516817307, + "language_loss": 1.07062221, + "learning_rate": 0.0007040493939600222, + "loss": 1.08190489, + "num_input_tokens_seen": 2647248, + "router_z_loss_mlp": 0.22497559, + "step": 35, + "time_per_iteration": 2.9126057624816895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119708, + "balance_loss_mlp": 1.09892988, + "epoch": 0.006925740669488265, + "flos": 564092287488.0, + "grad_norm": 0.06579143759664555, + "language_loss": 1.07960629, + "learning_rate": 0.0007096279445021078, + "loss": 1.09080338, + "num_input_tokens_seen": 2720736, + "router_z_loss_mlp": 0.20788574, + "step": 36, + "time_per_iteration": 2.7079102993011475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114855, + "balance_loss_mlp": 1.09574544, + "epoch": 0.007118122354751828, + "flos": 549583156224.0, + "grad_norm": 0.14799474820221378, + "language_loss": 1.14634764, + "learning_rate": 0.0007150536386503726, + "loss": 1.15749621, + "num_input_tokens_seen": 2800336, + "router_z_loss_mlp": 0.19104004, + "step": 37, + "time_per_iteration": 2.8290467262268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104011, + "balance_loss_mlp": 1.08533084, + "epoch": 0.007310504040015391, + "flos": 702161409024.0, + "grad_norm": 0.2513092385422617, + "language_loss": 1.08396375, + "learning_rate": 0.0007203346302358509, + "loss": 1.09500384, + "num_input_tokens_seen": 2883184, + "router_z_loss_mlp": 0.18688965, + "step": 38, + "time_per_iteration": 2.961430311203003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121274, + "balance_loss_mlp": 1.10231924, + "epoch": 0.007502885725278953, + "flos": 599022051840.0, + "grad_norm": 0.0999674626629785, + "language_loss": 1.11391926, + "learning_rate": 0.000725478437577282, + "loss": 1.12513208, + "num_input_tokens_seen": 2960736, + "router_z_loss_mlp": 0.18945312, + "step": 39, + "time_per_iteration": 2.742088556289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146989, + "balance_loss_mlp": 1.12810588, + "epoch": 0.007695267410542516, + "flos": 560000867328.0, + "grad_norm": 0.3323772184023467, + "language_loss": 1.08355689, + "learning_rate": 0.0007304920078549186, + "loss": 1.09502685, + "num_input_tokens_seen": 3033472, + "router_z_loss_mlp": 0.18884277, + "step": 40, + "time_per_iteration": 2.66943621635437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116486, + "balance_loss_mlp": 1.1452378, + "epoch": 0.007887649095806078, + "flos": 507906671616.0, + "grad_norm": 0.11539272036457353, + "language_loss": 1.09356606, + "learning_rate": 0.0007353817735343603, + "loss": 1.1052146, + "num_input_tokens_seen": 3107824, + "router_z_loss_mlp": 0.19604492, + "step": 41, + "time_per_iteration": 2.7052595615386963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132998, + "balance_loss_mlp": 1.11293542, + "epoch": 0.008080030781069641, + "flos": 503642133504.0, + "grad_norm": 0.12251683576194117, + "language_loss": 1.04851842, + "learning_rate": 0.0007401537019902344, + "loss": 1.05984843, + "num_input_tokens_seen": 3176528, + "router_z_loss_mlp": 0.20056152, + "step": 42, + "time_per_iteration": 2.590432643890381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124507, + "balance_loss_mlp": 1.10198867, + "epoch": 0.008272412466333205, + "flos": 517764178944.0, + "grad_norm": 0.09393858903586973, + "language_loss": 1.08539796, + "learning_rate": 0.0007448133392900729, + "loss": 1.09664297, + "num_input_tokens_seen": 3254256, + "router_z_loss_mlp": 0.22521973, + "step": 43, + "time_per_iteration": 2.6619081497192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112544, + "balance_loss_mlp": 1.10156202, + "epoch": 0.008464794151596768, + "flos": 607673373696.0, + "grad_norm": 0.06822323064374927, + "language_loss": 1.03845203, + "learning_rate": 0.0007493658489441491, + "loss": 1.04970646, + "num_input_tokens_seen": 3340224, + "router_z_loss_mlp": 0.23864746, + "step": 44, + "time_per_iteration": 2.861008644104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128905, + "balance_loss_mlp": 1.10477662, + "epoch": 0.00865717583686033, + "flos": 537661075968.0, + "grad_norm": 0.1413166066405165, + "language_loss": 1.08820629, + "learning_rate": 0.0007538160463002316, + "loss": 1.09949529, + "num_input_tokens_seen": 3409216, + "router_z_loss_mlp": 0.2409668, + "step": 45, + "time_per_iteration": 2.643458604812622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115676, + "balance_loss_mlp": 1.13258433, + "epoch": 0.008849557522123894, + "flos": 507758284800.0, + "grad_norm": 0.08570115972640321, + "language_loss": 1.10720444, + "learning_rate": 0.0007581684291577274, + "loss": 1.11877203, + "num_input_tokens_seen": 3478352, + "router_z_loss_mlp": 0.24157715, + "step": 46, + "time_per_iteration": 2.5904788970947266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145761, + "balance_loss_mlp": 1.12085772, + "epoch": 0.009041939207387457, + "flos": 625048800768.0, + "grad_norm": 0.06636849455276843, + "language_loss": 1.14156199, + "learning_rate": 0.0007624272050891776, + "loss": 1.15301955, + "num_input_tokens_seen": 3555616, + "router_z_loss_mlp": 0.24902344, + "step": 47, + "time_per_iteration": 2.782179594039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154374, + "balance_loss_mlp": 1.12759995, + "epoch": 0.00923432089265102, + "flos": 549124849152.0, + "grad_norm": 0.09356522507451794, + "language_loss": 1.04615343, + "learning_rate": 0.0007665963158851307, + "loss": 1.05769718, + "num_input_tokens_seen": 3634512, + "router_z_loss_mlp": 0.26806641, + "step": 48, + "time_per_iteration": 2.824540138244629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174661, + "balance_loss_mlp": 1.14738548, + "epoch": 0.009426702577914583, + "flos": 562202242560.0, + "grad_norm": 0.059100241584136314, + "language_loss": 1.12381458, + "learning_rate": 0.0007706794594783609, + "loss": 1.13556111, + "num_input_tokens_seen": 3708480, + "router_z_loss_mlp": 0.27270508, + "step": 49, + "time_per_iteration": 2.790757894515991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192673, + "balance_loss_mlp": 1.16604137, + "epoch": 0.009619084263178146, + "flos": 616486228992.0, + "grad_norm": 0.08074806779925832, + "language_loss": 1.11280799, + "learning_rate": 0.0007746801096530423, + "loss": 1.12473488, + "num_input_tokens_seen": 3783472, + "router_z_loss_mlp": 0.2668457, + "step": 50, + "time_per_iteration": 2.7235305309295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116178, + "balance_loss_mlp": 1.135149, + "epoch": 0.009811465948441709, + "flos": 541176325632.0, + "grad_norm": 0.06558886342971224, + "language_loss": 1.16576111, + "learning_rate": 0.0007786015338021173, + "loss": 1.17737889, + "num_input_tokens_seen": 3851360, + "router_z_loss_mlp": 0.26672363, + "step": 51, + "time_per_iteration": 2.6817519664764404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134628, + "balance_loss_mlp": 1.1085453, + "epoch": 0.010003847633705272, + "flos": 535608087552.0, + "grad_norm": 0.06210449580458492, + "language_loss": 1.08870959, + "learning_rate": 0.0007824468089603051, + "loss": 1.10005593, + "num_input_tokens_seen": 3923056, + "router_z_loss_mlp": 0.26098633, + "step": 52, + "time_per_iteration": 2.644577980041504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125522, + "balance_loss_mlp": 1.09910512, + "epoch": 0.010196229318968833, + "flos": 908867907072.0, + "grad_norm": 0.05864822926220488, + "language_loss": 1.07807887, + "learning_rate": 0.0007862188363098669, + "loss": 1.08933413, + "num_input_tokens_seen": 4004528, + "router_z_loss_mlp": 0.26428223, + "step": 53, + "time_per_iteration": 3.1450047492980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126237, + "balance_loss_mlp": 1.10084558, + "epoch": 0.010388611004232396, + "flos": 585594040320.0, + "grad_norm": 0.07974065634267835, + "language_loss": 1.08295977, + "learning_rate": 0.0007899203543304438, + "loss": 1.09422219, + "num_input_tokens_seen": 4078704, + "router_z_loss_mlp": 0.25390625, + "step": 54, + "time_per_iteration": 2.6822280883789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155972, + "balance_loss_mlp": 1.13315582, + "epoch": 0.01058099268949596, + "flos": 502233716736.0, + "grad_norm": 0.07014139109577967, + "language_loss": 1.22212756, + "learning_rate": 0.0007935539507422731, + "loss": 1.23368728, + "num_input_tokens_seen": 4143600, + "router_z_loss_mlp": 0.22814941, + "step": 55, + "time_per_iteration": 2.5841405391693115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117516, + "balance_loss_mlp": 1.153512, + "epoch": 0.010773374374759523, + "flos": 544170659328.0, + "grad_norm": 0.07006342440897594, + "language_loss": 1.13914931, + "learning_rate": 0.0007971220733732573, + "loss": 1.15090084, + "num_input_tokens_seen": 4217904, + "router_z_loss_mlp": 0.21643066, + "step": 56, + "time_per_iteration": 2.697427988052368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193099, + "balance_loss_mlp": 1.17267895, + "epoch": 0.010965756060023086, + "flos": 525874235904.0, + "grad_norm": 0.08125896119424647, + "language_loss": 1.0764755, + "learning_rate": 0.0008006270400641869, + "loss": 1.08840656, + "num_input_tokens_seen": 4293920, + "router_z_loss_mlp": 0.2043457, + "step": 57, + "time_per_iteration": 2.723154306411743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174019, + "balance_loss_mlp": 1.15412247, + "epoch": 0.011158137745286649, + "flos": 576653147136.0, + "grad_norm": 0.07485866075688756, + "language_loss": 1.09104013, + "learning_rate": 0.0008040710477125043, + "loss": 1.10278034, + "num_input_tokens_seen": 4370080, + "router_z_loss_mlp": 0.19897461, + "step": 58, + "time_per_iteration": 2.703186273574829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153983, + "balance_loss_mlp": 1.13440859, + "epoch": 0.011350519430550212, + "flos": 529024310784.0, + "grad_norm": 0.06764829366941465, + "language_loss": 1.09780312, + "learning_rate": 0.0008074561805429771, + "loss": 1.10934305, + "num_input_tokens_seen": 4439792, + "router_z_loss_mlp": 0.19567871, + "step": 59, + "time_per_iteration": 2.6111674308776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136624, + "balance_loss_mlp": 1.11676335, + "epoch": 0.011542901115813775, + "flos": 555608291328.0, + "grad_norm": 0.06986870516034673, + "language_loss": 1.08079648, + "learning_rate": 0.0008107844176832545, + "loss": 1.09216261, + "num_input_tokens_seen": 4510800, + "router_z_loss_mlp": 0.19848633, + "step": 60, + "time_per_iteration": 2.682687997817993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125651, + "balance_loss_mlp": 1.1056236, + "epoch": 0.011735282801077338, + "flos": 571826995200.0, + "grad_norm": 0.061548073586970495, + "language_loss": 1.09071934, + "learning_rate": 0.0008140576401132568, + "loss": 1.10197592, + "num_input_tokens_seen": 4581136, + "router_z_loss_mlp": 0.20019531, + "step": 61, + "time_per_iteration": 2.639394760131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111743, + "balance_loss_mlp": 1.09838021, + "epoch": 0.0119276644863409, + "flos": 615309156864.0, + "grad_norm": 0.06273761556608791, + "language_loss": 1.10558033, + "learning_rate": 0.0008172776370494935, + "loss": 1.11675453, + "num_input_tokens_seen": 4650352, + "router_z_loss_mlp": 0.19030762, + "step": 62, + "time_per_iteration": 2.7110230922698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134294, + "balance_loss_mlp": 1.11483955, + "epoch": 0.012120046171604464, + "flos": 500835474432.0, + "grad_norm": 0.07391589684249159, + "language_loss": 1.17346644, + "learning_rate": 0.0008204461118185703, + "loss": 1.18480933, + "num_input_tokens_seen": 4716336, + "router_z_loss_mlp": 0.19445801, + "step": 63, + "time_per_iteration": 2.5490689277648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142708, + "balance_loss_mlp": 1.12420678, + "epoch": 0.012312427856868027, + "flos": 473109327360.0, + "grad_norm": 0.05825974543220343, + "language_loss": 1.06081367, + "learning_rate": 0.0008235646872681536, + "loss": 1.07224083, + "num_input_tokens_seen": 4781648, + "router_z_loss_mlp": 0.18505859, + "step": 64, + "time_per_iteration": 2.5874247550964355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139504, + "balance_loss_mlp": 1.12069249, + "epoch": 0.012504809542131588, + "flos": 538094651904.0, + "grad_norm": 0.1040066778051144, + "language_loss": 1.06503749, + "learning_rate": 0.0008266349107584288, + "loss": 1.07643247, + "num_input_tokens_seen": 4852320, + "router_z_loss_mlp": 0.18823242, + "step": 65, + "time_per_iteration": 2.678736925125122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123492, + "balance_loss_mlp": 1.10500288, + "epoch": 0.012697191227395151, + "flos": 608450365440.0, + "grad_norm": 0.09066354406474254, + "language_loss": 1.09410381, + "learning_rate": 0.0008296582587724851, + "loss": 1.10533869, + "num_input_tokens_seen": 4922016, + "router_z_loss_mlp": 0.18481445, + "step": 66, + "time_per_iteration": 2.6937255859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121105, + "balance_loss_mlp": 1.10255599, + "epoch": 0.012889572912658714, + "flos": 767750607360.0, + "grad_norm": 0.11790618145169461, + "language_loss": 1.07982886, + "learning_rate": 0.0008326361411800136, + "loss": 1.0910399, + "num_input_tokens_seen": 5000128, + "router_z_loss_mlp": 0.1854248, + "step": 67, + "time_per_iteration": 2.9377663135528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096346, + "balance_loss_mlp": 1.07871521, + "epoch": 0.013081954597922277, + "flos": 533604561408.0, + "grad_norm": 0.09153807632987658, + "language_loss": 1.08335972, + "learning_rate": 0.0008355699051851403, + "loss": 1.09432316, + "num_input_tokens_seen": 5074512, + "router_z_loss_mlp": 0.17651367, + "step": 68, + "time_per_iteration": 2.7278473377227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104023, + "balance_loss_mlp": 1.0865227, + "epoch": 0.01327433628318584, + "flos": 572826567168.0, + "grad_norm": 0.08317322449907456, + "language_loss": 1.14837921, + "learning_rate": 0.0008384608389860635, + "loss": 1.15941942, + "num_input_tokens_seen": 5141856, + "router_z_loss_mlp": 0.1751709, + "step": 69, + "time_per_iteration": 2.7238211631774902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111418, + "balance_loss_mlp": 1.09424019, + "epoch": 0.013466717968449404, + "flos": 497029243392.0, + "grad_norm": 0.08213812906773327, + "language_loss": 1.04970825, + "learning_rate": 0.000841310175171381, + "loss": 1.06082237, + "num_input_tokens_seen": 5209280, + "router_z_loss_mlp": 0.17199707, + "step": 70, + "time_per_iteration": 2.578726291656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101987, + "balance_loss_mlp": 1.08526158, + "epoch": 0.013659099653712967, + "flos": 565234454016.0, + "grad_norm": 0.06358988870017376, + "language_loss": 1.03380442, + "learning_rate": 0.000844119093875517, + "loss": 1.04482436, + "num_input_tokens_seen": 5285424, + "router_z_loss_mlp": 0.16723633, + "step": 71, + "time_per_iteration": 2.692791223526001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103533, + "balance_loss_mlp": 1.08689094, + "epoch": 0.01385148133897653, + "flos": 573540950016.0, + "grad_norm": 0.07461407963015444, + "language_loss": 1.08098376, + "learning_rate": 0.0008468887257134666, + "loss": 1.09201908, + "num_input_tokens_seen": 5358624, + "router_z_loss_mlp": 0.16650391, + "step": 72, + "time_per_iteration": 2.6599459648132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122889, + "balance_loss_mlp": 1.10587776, + "epoch": 0.014043863024240093, + "flos": 576539665920.0, + "grad_norm": 0.05931650266846123, + "language_loss": 1.10316896, + "learning_rate": 0.0008496201545131264, + "loss": 1.11439776, + "num_input_tokens_seen": 5429792, + "router_z_loss_mlp": 0.17028809, + "step": 73, + "time_per_iteration": 2.7093684673309326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126213, + "balance_loss_mlp": 1.10950017, + "epoch": 0.014236244709503656, + "flos": 938287660032.0, + "grad_norm": 0.060718352480344094, + "language_loss": 1.08902812, + "learning_rate": 0.0008523144198617317, + "loss": 1.1002903, + "num_input_tokens_seen": 5518608, + "router_z_loss_mlp": 0.16711426, + "step": 74, + "time_per_iteration": 3.1743276119232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125614, + "balance_loss_mlp": 1.10876918, + "epoch": 0.014428626394767219, + "flos": 528231352320.0, + "grad_norm": 0.07198154728214846, + "language_loss": 1.08249164, + "learning_rate": 0.0008549725194813783, + "loss": 1.09374774, + "num_input_tokens_seen": 5590576, + "router_z_loss_mlp": 0.1685791, + "step": 75, + "time_per_iteration": 2.630387783050537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106727, + "balance_loss_mlp": 1.09047866, + "epoch": 0.014621008080030782, + "flos": 803371433472.0, + "grad_norm": 0.07553700512989577, + "language_loss": 1.06998253, + "learning_rate": 0.0008575954114472099, + "loss": 1.0810498, + "num_input_tokens_seen": 5674224, + "router_z_loss_mlp": 0.16247559, + "step": 76, + "time_per_iteration": 3.134385347366333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109532, + "balance_loss_mlp": 1.0933075, + "epoch": 0.014813389765294343, + "flos": 696588788736.0, + "grad_norm": 0.053440596513601155, + "language_loss": 1.05069363, + "learning_rate": 0.0008601840162606118, + "loss": 1.06178904, + "num_input_tokens_seen": 5757648, + "router_z_loss_mlp": 0.16223145, + "step": 77, + "time_per_iteration": 3.039991855621338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123171, + "balance_loss_mlp": 1.10660076, + "epoch": 0.015005771450557906, + "flos": 596702813184.0, + "grad_norm": 0.07894951514499118, + "language_loss": 1.1143651, + "learning_rate": 0.000862739218788641, + "loss": 1.12559676, + "num_input_tokens_seen": 5837600, + "router_z_loss_mlp": 0.16577148, + "step": 78, + "time_per_iteration": 2.867741346359253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113228, + "balance_loss_mlp": 1.11553121, + "epoch": 0.01519815313582147, + "flos": 549148170240.0, + "grad_norm": 0.0893413961860561, + "language_loss": 1.07743871, + "learning_rate": 0.0008652618700799138, + "loss": 1.08876157, + "num_input_tokens_seen": 5907248, + "router_z_loss_mlp": 0.16760254, + "step": 79, + "time_per_iteration": 2.675795555114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160152, + "balance_loss_mlp": 1.14348662, + "epoch": 0.015390534821085032, + "flos": 430306642944.0, + "grad_norm": 0.06679936706529424, + "language_loss": 1.07125092, + "learning_rate": 0.0008677527890662774, + "loss": 1.08285248, + "num_input_tokens_seen": 5970864, + "router_z_loss_mlp": 0.16662598, + "step": 80, + "time_per_iteration": 2.4765963554382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196819, + "balance_loss_mlp": 1.17889023, + "epoch": 0.015582916506348595, + "flos": 523854743040.0, + "grad_norm": 0.12362960542988827, + "language_loss": 1.09903598, + "learning_rate": 0.0008702127641587799, + "loss": 1.11100423, + "num_input_tokens_seen": 6040800, + "router_z_loss_mlp": 0.17932129, + "step": 81, + "time_per_iteration": 2.636688470840454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180455, + "balance_loss_mlp": 1.16288388, + "epoch": 0.015775298191612157, + "flos": 575151598080.0, + "grad_norm": 0.08274533442322421, + "language_loss": 1.04032063, + "learning_rate": 0.0008726425547457192, + "loss": 1.05212522, + "num_input_tokens_seen": 6111840, + "router_z_loss_mlp": 0.17565918, + "step": 82, + "time_per_iteration": 2.765179395675659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157804, + "balance_loss_mlp": 1.14051914, + "epoch": 0.01596767987687572, + "flos": 610040664576.0, + "grad_norm": 0.07618339381967684, + "language_loss": 1.03921247, + "learning_rate": 0.0008750428925998964, + "loss": 1.05079055, + "num_input_tokens_seen": 6183872, + "router_z_loss_mlp": 0.1730957, + "step": 83, + "time_per_iteration": 2.7615418434143066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159673, + "balance_loss_mlp": 1.14280462, + "epoch": 0.016160061562139283, + "flos": 566864040960.0, + "grad_norm": 0.0706757922791228, + "language_loss": 1.09743476, + "learning_rate": 0.0008774144832015932, + "loss": 1.10903156, + "num_input_tokens_seen": 6255760, + "router_z_loss_mlp": 0.16882324, + "step": 84, + "time_per_iteration": 2.694364070892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01699218, + "balance_loss_mlp": 1.68252861, + "epoch": 0.016352443247402846, + "flos": 1410557234688.0, + "grad_norm": 0.23342967148410274, + "language_loss": 0.74774313, + "learning_rate": 0.0008797580069832641, + "loss": 0.76473522, + "num_input_tokens_seen": 6472960, + "router_z_loss_mlp": 0.16699219, + "step": 85, + "time_per_iteration": 4.599137306213379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212855, + "balance_loss_mlp": 1.19580793, + "epoch": 0.01654482493266641, + "flos": 730177127424.0, + "grad_norm": 0.09253845479208671, + "language_loss": 1.04518116, + "learning_rate": 0.0008820741205014318, + "loss": 1.05730963, + "num_input_tokens_seen": 6548912, + "router_z_loss_mlp": 0.1706543, + "step": 86, + "time_per_iteration": 2.8595266342163086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01246652, + "balance_loss_mlp": 1.22939014, + "epoch": 0.016737206617929972, + "flos": 536016932352.0, + "grad_norm": 0.10044068584300966, + "language_loss": 1.06437612, + "learning_rate": 0.0008843634575408404, + "loss": 1.07684278, + "num_input_tokens_seen": 6621520, + "router_z_loss_mlp": 0.17248535, + "step": 87, + "time_per_iteration": 2.690492630004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215448, + "balance_loss_mlp": 1.19887805, + "epoch": 0.016929588303193535, + "flos": 536706584064.0, + "grad_norm": 0.0661610487366718, + "language_loss": 1.07674646, + "learning_rate": 0.0008866266301555082, + "loss": 1.08890104, + "num_input_tokens_seen": 6698432, + "router_z_loss_mlp": 0.16577148, + "step": 88, + "time_per_iteration": 2.737339496612549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203027, + "balance_loss_mlp": 1.18706512, + "epoch": 0.017121969988457098, + "flos": 526498458624.0, + "grad_norm": 0.07897226836222233, + "language_loss": 1.08543992, + "learning_rate": 0.0008888642296509615, + "loss": 1.09747016, + "num_input_tokens_seen": 6764336, + "router_z_loss_mlp": 0.1595459, + "step": 89, + "time_per_iteration": 2.576819658279419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187655, + "balance_loss_mlp": 1.17131162, + "epoch": 0.01731435167372066, + "flos": 625304876544.0, + "grad_norm": 0.0740353605135553, + "language_loss": 1.13367987, + "learning_rate": 0.0008910768275115906, + "loss": 1.14555645, + "num_input_tokens_seen": 6839392, + "router_z_loss_mlp": 0.16345215, + "step": 90, + "time_per_iteration": 2.778571128845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173459, + "balance_loss_mlp": 1.15692425, + "epoch": 0.017506733358984224, + "flos": 496157709312.0, + "grad_norm": 0.07518713147028631, + "language_loss": 1.08794332, + "learning_rate": 0.0008932649762767675, + "loss": 1.0996778, + "num_input_tokens_seen": 6907344, + "router_z_loss_mlp": 0.16540527, + "step": 91, + "time_per_iteration": 2.5931665897369385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185544, + "balance_loss_mlp": 1.16881919, + "epoch": 0.017699115044247787, + "flos": 745613047296.0, + "grad_norm": 0.07711429280558382, + "language_loss": 1.11576343, + "learning_rate": 0.0008954292103690864, + "loss": 1.12761879, + "num_input_tokens_seen": 6982464, + "router_z_loss_mlp": 0.1673584, + "step": 92, + "time_per_iteration": 2.9129488468170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194769, + "balance_loss_mlp": 1.17854476, + "epoch": 0.01789149672951135, + "flos": 515257265664.0, + "grad_norm": 0.0669718610224715, + "language_loss": 1.1343056, + "learning_rate": 0.0008975700468778296, + "loss": 1.14625335, + "num_input_tokens_seen": 7049712, + "router_z_loss_mlp": 0.16223145, + "step": 93, + "time_per_iteration": 2.576620101928711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216953, + "balance_loss_mlp": 1.20076382, + "epoch": 0.018083878414774913, + "flos": 585850116096.0, + "grad_norm": 0.11698648494194364, + "language_loss": 1.0652318, + "learning_rate": 0.0008996879863005366, + "loss": 1.07740128, + "num_input_tokens_seen": 7120288, + "router_z_loss_mlp": 0.16186523, + "step": 94, + "time_per_iteration": 2.6751108169555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217025, + "balance_loss_mlp": 1.2013253, + "epoch": 0.018276260100038477, + "flos": 497103436800.0, + "grad_norm": 0.08327491501556071, + "language_loss": 1.06208014, + "learning_rate": 0.0009017835132453337, + "loss": 1.07425046, + "num_input_tokens_seen": 7188896, + "router_z_loss_mlp": 0.15686035, + "step": 95, + "time_per_iteration": 2.5971803665161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196463, + "balance_loss_mlp": 1.1804409, + "epoch": 0.01846864178530204, + "flos": 639765955584.0, + "grad_norm": 0.09756000368948786, + "language_loss": 1.06920743, + "learning_rate": 0.0009038570970964896, + "loss": 1.08117199, + "num_input_tokens_seen": 7259536, + "router_z_loss_mlp": 0.16027832, + "step": 96, + "time_per_iteration": 2.7428832054138184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173361, + "balance_loss_mlp": 1.15723228, + "epoch": 0.018661023470565603, + "flos": 511411746816.0, + "grad_norm": 0.07053433913024812, + "language_loss": 1.04343212, + "learning_rate": 0.0009059091926454854, + "loss": 1.05516577, + "num_input_tokens_seen": 7326752, + "router_z_loss_mlp": 0.16125488, + "step": 97, + "time_per_iteration": 2.570509433746338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178324, + "balance_loss_mlp": 1.16246903, + "epoch": 0.018853405155829166, + "flos": 930710103552.0, + "grad_norm": 0.08767892767743933, + "language_loss": 1.03389072, + "learning_rate": 0.0009079402406897198, + "loss": 1.04567385, + "num_input_tokens_seen": 7417488, + "router_z_loss_mlp": 0.15844727, + "step": 98, + "time_per_iteration": 3.202298164367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179075, + "balance_loss_mlp": 1.16296983, + "epoch": 0.01904578684109273, + "flos": 576209396736.0, + "grad_norm": 0.2639136557883628, + "language_loss": 1.0596242, + "learning_rate": 0.0009099506686008212, + "loss": 1.07141495, + "num_input_tokens_seen": 7493136, + "router_z_loss_mlp": 0.16101074, + "step": 99, + "time_per_iteration": 2.812368869781494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139923, + "balance_loss_mlp": 1.12423468, + "epoch": 0.019238168526356292, + "flos": 558173431296.0, + "grad_norm": 0.12311670746354397, + "language_loss": 1.08180976, + "learning_rate": 0.0009119408908644013, + "loss": 1.09320903, + "num_input_tokens_seen": 7560896, + "router_z_loss_mlp": 0.15673828, + "step": 100, + "time_per_iteration": 2.7063775062561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150815, + "balance_loss_mlp": 1.13574743, + "epoch": 0.019430550211619855, + "flos": 723539506176.0, + "grad_norm": 0.12127606313133317, + "language_loss": 1.14121008, + "learning_rate": 0.0009139113095929519, + "loss": 1.15271831, + "num_input_tokens_seen": 7629040, + "router_z_loss_mlp": 0.15039062, + "step": 101, + "time_per_iteration": 2.840913772583008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218173, + "balance_loss_mlp": 1.20243776, + "epoch": 0.019622931896883418, + "flos": 499235000832.0, + "grad_norm": 0.1104247345061639, + "language_loss": 1.0836457, + "learning_rate": 0.0009158623150134762, + "loss": 1.09582746, + "num_input_tokens_seen": 7694256, + "router_z_loss_mlp": 0.15722656, + "step": 102, + "time_per_iteration": 2.560464859008789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01357908, + "balance_loss_mlp": 1.34204173, + "epoch": 0.01981531358214698, + "flos": 508916418048.0, + "grad_norm": 0.15164768975642337, + "language_loss": 1.07661259, + "learning_rate": 0.000917794285931332, + "loss": 1.0901916, + "num_input_tokens_seen": 7762256, + "router_z_loss_mlp": 0.15856934, + "step": 103, + "time_per_iteration": 2.6684353351593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01381196, + "balance_loss_mlp": 1.36572242, + "epoch": 0.020007695267410544, + "flos": 521087371776.0, + "grad_norm": 0.10342928287682196, + "language_loss": 0.9971087, + "learning_rate": 0.0009197075901716639, + "loss": 1.01092052, + "num_input_tokens_seen": 7834400, + "router_z_loss_mlp": 0.15454102, + "step": 104, + "time_per_iteration": 2.7250871658325195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01356986, + "balance_loss_mlp": 1.34017777, + "epoch": 0.020200076952674107, + "flos": 533013834240.0, + "grad_norm": 0.1824265866479698, + "language_loss": 1.09647703, + "learning_rate": 0.0009216025849997171, + "loss": 1.11004686, + "num_input_tokens_seen": 7911184, + "router_z_loss_mlp": 0.16809082, + "step": 105, + "time_per_iteration": 2.776764154434204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01261961, + "balance_loss_mlp": 1.24583197, + "epoch": 0.020392458637937667, + "flos": 684430981632.0, + "grad_norm": 0.06376163654280764, + "language_loss": 1.0425086, + "learning_rate": 0.0009234796175212258, + "loss": 1.05512834, + "num_input_tokens_seen": 7985280, + "router_z_loss_mlp": 0.16125488, + "step": 106, + "time_per_iteration": 2.9174978733062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01269614, + "balance_loss_mlp": 1.25201869, + "epoch": 0.02058484032320123, + "flos": 701791852032.0, + "grad_norm": 0.060044663360548714, + "language_loss": 1.08808422, + "learning_rate": 0.000925339025064007, + "loss": 1.10078037, + "num_input_tokens_seen": 8068320, + "router_z_loss_mlp": 0.17590332, + "step": 107, + "time_per_iteration": 2.975735902786255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01324579, + "balance_loss_mlp": 1.30547023, + "epoch": 0.020777222008464793, + "flos": 638772175872.0, + "grad_norm": 0.12680512225677842, + "language_loss": 1.01262307, + "learning_rate": 0.0009271811355418027, + "loss": 1.02586877, + "num_input_tokens_seen": 8148144, + "router_z_loss_mlp": 0.19128418, + "step": 108, + "time_per_iteration": 2.8408150672912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01306621, + "balance_loss_mlp": 1.28755951, + "epoch": 0.020969603693728356, + "flos": 681785856000.0, + "grad_norm": 0.06997483982989385, + "language_loss": 1.08551693, + "learning_rate": 0.0009290062678013548, + "loss": 1.09858322, + "num_input_tokens_seen": 8222256, + "router_z_loss_mlp": 0.19055176, + "step": 109, + "time_per_iteration": 2.869980812072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01309468, + "balance_loss_mlp": 1.29159832, + "epoch": 0.02116198537899192, + "flos": 533140462080.0, + "grad_norm": 0.13190855435004306, + "language_loss": 1.06647623, + "learning_rate": 0.0009308147319536321, + "loss": 1.07957077, + "num_input_tokens_seen": 8292432, + "router_z_loss_mlp": 0.17895508, + "step": 110, + "time_per_iteration": 2.6270735263824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130688, + "balance_loss_mlp": 1.29067969, + "epoch": 0.021354367064255482, + "flos": 717168135168.0, + "grad_norm": 0.10963649287068344, + "language_loss": 1.1282903, + "learning_rate": 0.0009326068296900676, + "loss": 1.14135909, + "num_input_tokens_seen": 8365024, + "router_z_loss_mlp": 0.1619873, + "step": 111, + "time_per_iteration": 2.8845341205596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01388527, + "balance_loss_mlp": 1.37200487, + "epoch": 0.021546748749519045, + "flos": 519290459136.0, + "grad_norm": 0.12406482447985402, + "language_loss": 1.03902006, + "learning_rate": 0.0009343828545846161, + "loss": 1.05290532, + "num_input_tokens_seen": 8442448, + "router_z_loss_mlp": 0.16516113, + "step": 112, + "time_per_iteration": 2.8167102336883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01548404, + "balance_loss_mlp": 1.53109479, + "epoch": 0.021739130434782608, + "flos": 504912337920.0, + "grad_norm": 0.2528517188051562, + "language_loss": 1.0722419, + "learning_rate": 0.0009361430923823841, + "loss": 1.08772588, + "num_input_tokens_seen": 8508992, + "router_z_loss_mlp": 0.1730957, + "step": 113, + "time_per_iteration": 2.664581060409546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01441472, + "balance_loss_mlp": 1.42576015, + "epoch": 0.02193151212004617, + "flos": 463251820032.0, + "grad_norm": 0.1910881492312462, + "language_loss": 1.11420846, + "learning_rate": 0.0009378878212755459, + "loss": 1.12862325, + "num_input_tokens_seen": 8574048, + "router_z_loss_mlp": 0.15710449, + "step": 114, + "time_per_iteration": 2.4851133823394775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262203, + "balance_loss_mlp": 1.24767148, + "epoch": 0.022123893805309734, + "flos": 552008673792.0, + "grad_norm": 0.09004287588953173, + "language_loss": 1.0099957, + "learning_rate": 0.0009396173121672103, + "loss": 1.0226177, + "num_input_tokens_seen": 8647808, + "router_z_loss_mlp": 0.14538574, + "step": 115, + "time_per_iteration": 2.6535162925720215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215709, + "balance_loss_mlp": 1.20165396, + "epoch": 0.022316275490573297, + "flos": 635920436736.0, + "grad_norm": 0.07849561533847389, + "language_loss": 1.07122314, + "learning_rate": 0.0009413318289238633, + "loss": 1.08338022, + "num_input_tokens_seen": 8719760, + "router_z_loss_mlp": 0.14050293, + "step": 116, + "time_per_iteration": 2.7836899757385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203544, + "balance_loss_mlp": 1.18965602, + "epoch": 0.02250865717583686, + "flos": 798535107072.0, + "grad_norm": 0.07099947506123377, + "language_loss": 0.98912275, + "learning_rate": 0.0009430316286169771, + "loss": 1.00115824, + "num_input_tokens_seen": 8798752, + "router_z_loss_mlp": 0.13891602, + "step": 117, + "time_per_iteration": 3.049468517303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01263206, + "balance_loss_mlp": 1.24786401, + "epoch": 0.022701038861100423, + "flos": 455851763712.0, + "grad_norm": 0.18808502465815918, + "language_loss": 1.04843259, + "learning_rate": 0.0009447169617543361, + "loss": 1.06106472, + "num_input_tokens_seen": 8866848, + "router_z_loss_mlp": 0.15319824, + "step": 118, + "time_per_iteration": 2.5886504650115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121023, + "balance_loss_mlp": 1.19557953, + "epoch": 0.022893420546363986, + "flos": 582812112384.0, + "grad_norm": 0.09179634719817005, + "language_loss": 1.11139297, + "learning_rate": 0.0009463880725016029, + "loss": 1.12349522, + "num_input_tokens_seen": 8935488, + "router_z_loss_mlp": 0.14648438, + "step": 119, + "time_per_iteration": 2.6861259937286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169264, + "balance_loss_mlp": 1.15572226, + "epoch": 0.02308580223162755, + "flos": 561010613760.0, + "grad_norm": 0.09164108943144146, + "language_loss": 1.05675769, + "learning_rate": 0.0009480451988946134, + "loss": 1.06845045, + "num_input_tokens_seen": 9015344, + "router_z_loss_mlp": 0.13549805, + "step": 120, + "time_per_iteration": 2.8075129985809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217336, + "balance_loss_mlp": 1.2034359, + "epoch": 0.023278183916891113, + "flos": 770966111232.0, + "grad_norm": 0.1019945076921087, + "language_loss": 1.07486713, + "learning_rate": 0.0009496885730428627, + "loss": 1.08704054, + "num_input_tokens_seen": 9094672, + "router_z_loss_mlp": 0.13903809, + "step": 121, + "time_per_iteration": 3.0081264972686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291544, + "balance_loss_mlp": 1.27698815, + "epoch": 0.023470565602154676, + "flos": 553111552512.0, + "grad_norm": 0.08478902086488087, + "language_loss": 1.05369067, + "learning_rate": 0.0009513184213246156, + "loss": 1.06660616, + "num_input_tokens_seen": 9160608, + "router_z_loss_mlp": 0.14550781, + "step": 122, + "time_per_iteration": 2.654902696609497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01406128, + "balance_loss_mlp": 1.39054775, + "epoch": 0.02366294728741824, + "flos": 559744791552.0, + "grad_norm": 0.09837859270685317, + "language_loss": 1.09463692, + "learning_rate": 0.0009529349645740552, + "loss": 1.10869825, + "num_input_tokens_seen": 9228704, + "router_z_loss_mlp": 0.15563965, + "step": 123, + "time_per_iteration": 2.6837081909179688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01484693, + "balance_loss_mlp": 1.46961284, + "epoch": 0.0238553289726818, + "flos": 468313698816.0, + "grad_norm": 0.11388616458843728, + "language_loss": 1.07573724, + "learning_rate": 0.0009545384182608524, + "loss": 1.09058416, + "num_input_tokens_seen": 9294288, + "router_z_loss_mlp": 0.1505127, + "step": 124, + "time_per_iteration": 2.5069937705993652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01411359, + "balance_loss_mlp": 1.39688659, + "epoch": 0.024047710657945365, + "flos": 559763730432.0, + "grad_norm": 0.3429043048666504, + "language_loss": 1.05057025, + "learning_rate": 0.0009561289926625252, + "loss": 1.06468379, + "num_input_tokens_seen": 9368048, + "router_z_loss_mlp": 0.14465332, + "step": 125, + "time_per_iteration": 2.6802117824554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011688, + "balance_loss_mlp": 1.15507352, + "epoch": 0.024240092343208928, + "flos": 504528224256.0, + "grad_norm": 0.18048320350440872, + "language_loss": 1.09737623, + "learning_rate": 0.0009577068930299292, + "loss": 1.10906434, + "num_input_tokens_seen": 9434848, + "router_z_loss_mlp": 0.13739014, + "step": 126, + "time_per_iteration": 2.6096670627593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163735, + "balance_loss_mlp": 1.15040147, + "epoch": 0.02443247402847249, + "flos": 435516908544.0, + "grad_norm": 0.07278748671530755, + "language_loss": 1.05931616, + "learning_rate": 0.0009592723197462087, + "loss": 1.07095349, + "num_input_tokens_seen": 9504112, + "router_z_loss_mlp": 0.13360596, + "step": 127, + "time_per_iteration": 2.6409482955932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01248107, + "balance_loss_mlp": 1.23239577, + "epoch": 0.024624855713736054, + "flos": 683445966336.0, + "grad_norm": 0.0813490266373729, + "language_loss": 1.02871299, + "learning_rate": 0.0009608254684795125, + "loss": 1.04119396, + "num_input_tokens_seen": 9590032, + "router_z_loss_mlp": 0.15710449, + "step": 128, + "time_per_iteration": 2.940600872039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01265693, + "balance_loss_mlp": 1.24772859, + "epoch": 0.024817237398999614, + "flos": 524721894912.0, + "grad_norm": 0.0804185451989367, + "language_loss": 1.06161952, + "learning_rate": 0.0009623665303297678, + "loss": 1.07427645, + "num_input_tokens_seen": 9663040, + "router_z_loss_mlp": 0.1796875, + "step": 129, + "time_per_iteration": 2.7088472843170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01256284, + "balance_loss_mlp": 1.23668599, + "epoch": 0.025009619084263177, + "flos": 655350262272.0, + "grad_norm": 0.12369480901617341, + "language_loss": 1.10218048, + "learning_rate": 0.0009638956919697878, + "loss": 1.11474347, + "num_input_tokens_seen": 9736544, + "router_z_loss_mlp": 0.19592285, + "step": 130, + "time_per_iteration": 2.857571840286255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266475, + "balance_loss_mlp": 1.24420691, + "epoch": 0.02520200076952674, + "flos": 454187271168.0, + "grad_norm": 0.08293639348197612, + "language_loss": 1.02638018, + "learning_rate": 0.0009654131357809714, + "loss": 1.03904486, + "num_input_tokens_seen": 9804656, + "router_z_loss_mlp": 0.22253418, + "step": 131, + "time_per_iteration": 2.641470432281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0128644, + "balance_loss_mlp": 1.26142943, + "epoch": 0.025394382454790303, + "flos": 839427397632.0, + "grad_norm": 0.05741461740254168, + "language_loss": 1.11002767, + "learning_rate": 0.0009669190399838441, + "loss": 1.12289214, + "num_input_tokens_seen": 9888864, + "router_z_loss_mlp": 0.25036621, + "step": 132, + "time_per_iteration": 3.133596420288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01302533, + "balance_loss_mlp": 1.27633083, + "epoch": 0.025586764140053866, + "flos": 580725628416.0, + "grad_norm": 0.06987664196058198, + "language_loss": 1.0413487, + "learning_rate": 0.0009684135787636724, + "loss": 1.05437398, + "num_input_tokens_seen": 9968208, + "router_z_loss_mlp": 0.26208496, + "step": 133, + "time_per_iteration": 2.7968075275421143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01325396, + "balance_loss_mlp": 1.29710746, + "epoch": 0.02577914582531743, + "flos": 789893959680.0, + "grad_norm": 0.07551411578012862, + "language_loss": 1.07757604, + "learning_rate": 0.0009698969223913726, + "loss": 1.09083009, + "num_input_tokens_seen": 10049664, + "router_z_loss_mlp": 0.28283691, + "step": 134, + "time_per_iteration": 3.0058987140655518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0131126, + "balance_loss_mlp": 1.28212547, + "epoch": 0.025971527510580992, + "flos": 594683320320.0, + "grad_norm": 0.0731546450398535, + "language_loss": 1.10457921, + "learning_rate": 0.0009713692373399265, + "loss": 1.11769176, + "num_input_tokens_seen": 10120096, + "router_z_loss_mlp": 0.29125977, + "step": 135, + "time_per_iteration": 2.6654229164123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02152319, + "balance_loss_mlp": 1.95700705, + "epoch": 0.026163909195844555, + "flos": 1576771522560.0, + "grad_norm": 0.26755932757436196, + "language_loss": 0.79456228, + "learning_rate": 0.0009728306863964993, + "loss": 0.81608546, + "num_input_tokens_seen": 10348976, + "router_z_loss_mlp": 1.953125, + "step": 136, + "time_per_iteration": 6.531313896179199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01724331, + "balance_loss_mlp": 1.55266988, + "epoch": 0.026356290881108118, + "flos": 1501306030080.0, + "grad_norm": 0.1444935793983717, + "language_loss": 0.77811038, + "learning_rate": 0.0009742814287704512, + "loss": 0.79535371, + "num_input_tokens_seen": 10576512, + "router_z_loss_mlp": 1.71875, + "step": 137, + "time_per_iteration": 4.966995716094971 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01371776, + "balance_loss_mlp": 1.34284425, + "epoch": 0.02654867256637168, + "flos": 596841025536.0, + "grad_norm": 0.06823267419395149, + "language_loss": 1.03539467, + "learning_rate": 0.0009757216201974225, + "loss": 1.04911256, + "num_input_tokens_seen": 10659168, + "router_z_loss_mlp": 0.28918457, + "step": 138, + "time_per_iteration": 2.7901663780212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01396345, + "balance_loss_mlp": 1.36752045, + "epoch": 0.026741054251635244, + "flos": 544761386496.0, + "grad_norm": 0.08904352821745645, + "language_loss": 1.08793342, + "learning_rate": 0.0009771514130396581, + "loss": 1.10189688, + "num_input_tokens_seen": 10731584, + "router_z_loss_mlp": 0.28833008, + "step": 139, + "time_per_iteration": 2.664384603500366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01410566, + "balance_loss_mlp": 1.38171697, + "epoch": 0.026933435936898807, + "flos": 506591387136.0, + "grad_norm": 0.09467843708761726, + "language_loss": 1.08393478, + "learning_rate": 0.00097857095638274, + "loss": 1.09804034, + "num_input_tokens_seen": 10799456, + "router_z_loss_mlp": 0.28833008, + "step": 140, + "time_per_iteration": 2.5600626468658447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01399161, + "balance_loss_mlp": 1.37263703, + "epoch": 0.02712581762216237, + "flos": 740513290752.0, + "grad_norm": 0.06303030428856128, + "language_loss": 0.99670362, + "learning_rate": 0.0009799803961288726, + "loss": 1.01069522, + "num_input_tokens_seen": 10886416, + "router_z_loss_mlp": 0.26538086, + "step": 141, + "time_per_iteration": 2.984253168106079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01354082, + "balance_loss_mlp": 1.33143175, + "epoch": 0.027318199307425933, + "flos": 848023464960.0, + "grad_norm": 0.06264638149228761, + "language_loss": 1.05898559, + "learning_rate": 0.000981379875086876, + "loss": 1.07252645, + "num_input_tokens_seen": 10966064, + "router_z_loss_mlp": 0.22644043, + "step": 142, + "time_per_iteration": 3.032597064971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01323808, + "balance_loss_mlp": 1.30553341, + "epoch": 0.027510580992689496, + "flos": 575288400384.0, + "grad_norm": 0.07028220907739285, + "language_loss": 1.01752293, + "learning_rate": 0.0009827695330590185, + "loss": 1.030761, + "num_input_tokens_seen": 11039712, + "router_z_loss_mlp": 0.18273926, + "step": 143, + "time_per_iteration": 2.626483678817749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01303402, + "balance_loss_mlp": 1.28557992, + "epoch": 0.02770296267795306, + "flos": 772079164416.0, + "grad_norm": 0.05744811954937285, + "language_loss": 1.00619161, + "learning_rate": 0.0009841495069248256, + "loss": 1.0192256, + "num_input_tokens_seen": 11123984, + "router_z_loss_mlp": 0.17822266, + "step": 144, + "time_per_iteration": 2.9495198726654053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01316023, + "balance_loss_mlp": 1.29916632, + "epoch": 0.027895344363216622, + "flos": 569123642880.0, + "grad_norm": 0.04968902291069247, + "language_loss": 0.9920603, + "learning_rate": 0.0009855199307219871, + "loss": 1.00522041, + "num_input_tokens_seen": 11192864, + "router_z_loss_mlp": 0.1685791, + "step": 145, + "time_per_iteration": 2.6407721042633057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130391, + "balance_loss_mlp": 1.28731608, + "epoch": 0.028087726048480186, + "flos": 547099564032.0, + "grad_norm": 0.10723696528856613, + "language_loss": 1.01566505, + "learning_rate": 0.0009868809357244854, + "loss": 1.02870417, + "num_input_tokens_seen": 11261760, + "router_z_loss_mlp": 0.16589355, + "step": 146, + "time_per_iteration": 2.6262452602386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01287507, + "balance_loss_mlp": 1.27153277, + "epoch": 0.02828010773374375, + "flos": 524519663616.0, + "grad_norm": 0.06991830692152445, + "language_loss": 1.05632663, + "learning_rate": 0.0009882326505180556, + "loss": 1.06920183, + "num_input_tokens_seen": 11334736, + "router_z_loss_mlp": 0.15966797, + "step": 147, + "time_per_iteration": 2.6469435691833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01270213, + "balance_loss_mlp": 1.2534517, + "epoch": 0.02847248941900731, + "flos": 772108277760.0, + "grad_norm": 0.07309095407736986, + "language_loss": 1.04486537, + "learning_rate": 0.0009895752010730906, + "loss": 1.0575676, + "num_input_tokens_seen": 11409872, + "router_z_loss_mlp": 0.16748047, + "step": 148, + "time_per_iteration": 2.9457786083221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012724, + "balance_loss_mlp": 1.25667655, + "epoch": 0.028664871104270875, + "flos": 534150208512.0, + "grad_norm": 0.048334696317449924, + "language_loss": 1.10088921, + "learning_rate": 0.0009909087108150867, + "loss": 1.11361325, + "num_input_tokens_seen": 11481024, + "router_z_loss_mlp": 0.15710449, + "step": 149, + "time_per_iteration": 2.712559700012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01309133, + "balance_loss_mlp": 1.29286051, + "epoch": 0.028857252789534438, + "flos": 367557599232.0, + "grad_norm": 0.13115053493636905, + "language_loss": 1.11238122, + "learning_rate": 0.0009922333006927371, + "loss": 1.12547255, + "num_input_tokens_seen": 11544240, + "router_z_loss_mlp": 0.16247559, + "step": 150, + "time_per_iteration": 2.4607067108154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01329212, + "balance_loss_mlp": 1.31257081, + "epoch": 0.029049634474798, + "flos": 515232534528.0, + "grad_norm": 0.06948512606819708, + "language_loss": 1.04613614, + "learning_rate": 0.0009935490892437632, + "loss": 1.05942833, + "num_input_tokens_seen": 11610416, + "router_z_loss_mlp": 0.16650391, + "step": 151, + "time_per_iteration": 2.5460238456726074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01309109, + "balance_loss_mlp": 1.29317045, + "epoch": 0.029242016160061564, + "flos": 587840495616.0, + "grad_norm": 0.11257287432569656, + "language_loss": 1.03097093, + "learning_rate": 0.0009948561926585687, + "loss": 1.04406202, + "num_input_tokens_seen": 11687488, + "router_z_loss_mlp": 0.15930176, + "step": 152, + "time_per_iteration": 2.753009557723999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01300362, + "balance_loss_mlp": 1.28555596, + "epoch": 0.029434397845325123, + "flos": 551816616960.0, + "grad_norm": 0.062223246716750634, + "language_loss": 1.06524086, + "learning_rate": 0.0009961547248418122, + "loss": 1.07824445, + "num_input_tokens_seen": 11754576, + "router_z_loss_mlp": 0.14807129, + "step": 153, + "time_per_iteration": 2.630092144012451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01308008, + "balance_loss_mlp": 1.29357219, + "epoch": 0.029626779530588686, + "flos": 603221160960.0, + "grad_norm": 0.09420536563091944, + "language_loss": 1.03062868, + "learning_rate": 0.0009974447974719707, + "loss": 1.04370856, + "num_input_tokens_seen": 11831360, + "router_z_loss_mlp": 0.14440918, + "step": 154, + "time_per_iteration": 2.6962759494781494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01312448, + "balance_loss_mlp": 1.29745138, + "epoch": 0.02981916121585225, + "flos": 620808993792.0, + "grad_norm": 0.08558703297148447, + "language_loss": 1.04985213, + "learning_rate": 0.0009987265200589763, + "loss": 1.0629766, + "num_input_tokens_seen": 11902192, + "router_z_loss_mlp": 0.15002441, + "step": 155, + "time_per_iteration": 2.7059414386749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295882, + "balance_loss_mlp": 1.28057528, + "epoch": 0.030011542901115813, + "flos": 661322962944.0, + "grad_norm": 0.09731995783752632, + "language_loss": 1.04436159, + "learning_rate": 0.001, + "loss": 1.05732036, + "num_input_tokens_seen": 11979088, + "router_z_loss_mlp": 0.1529541, + "step": 156, + "time_per_iteration": 2.856968641281128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262708, + "balance_loss_mlp": 1.24682927, + "epoch": 0.030203924586379376, + "flos": 651258842112.0, + "grad_norm": 0.05966927829613408, + "language_loss": 1.02520585, + "learning_rate": 0.0009999999029413921, + "loss": 1.03783274, + "num_input_tokens_seen": 12059200, + "router_z_loss_mlp": 0.15856934, + "step": 157, + "time_per_iteration": 2.851480722427368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01268181, + "balance_loss_mlp": 1.25150406, + "epoch": 0.03039630627164294, + "flos": 531083091456.0, + "grad_norm": 0.1034311415514979, + "language_loss": 1.04085183, + "learning_rate": 0.0009999996117656068, + "loss": 1.05353379, + "num_input_tokens_seen": 12134944, + "router_z_loss_mlp": 0.16674805, + "step": 158, + "time_per_iteration": 2.707646369934082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262524, + "balance_loss_mlp": 1.24747968, + "epoch": 0.030588687956906502, + "flos": 585914135040.0, + "grad_norm": 0.12050944658187299, + "language_loss": 0.97824669, + "learning_rate": 0.0009999991264727564, + "loss": 0.99087203, + "num_input_tokens_seen": 12207936, + "router_z_loss_mlp": 0.15039062, + "step": 159, + "time_per_iteration": 2.7575390338897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272116, + "balance_loss_mlp": 1.25716722, + "epoch": 0.030781069642170065, + "flos": 513026777088.0, + "grad_norm": 0.07020206521781955, + "language_loss": 1.08316755, + "learning_rate": 0.0009999984470630296, + "loss": 1.09588861, + "num_input_tokens_seen": 12273200, + "router_z_loss_mlp": 0.14929199, + "step": 160, + "time_per_iteration": 2.62310528755188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0128559, + "balance_loss_mlp": 1.27058172, + "epoch": 0.030973451327433628, + "flos": 717766064640.0, + "grad_norm": 0.06068839125924313, + "language_loss": 0.96528012, + "learning_rate": 0.0009999975735366902, + "loss": 0.978136, + "num_input_tokens_seen": 12359600, + "router_z_loss_mlp": 0.15002441, + "step": 161, + "time_per_iteration": 3.0823376178741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01305752, + "balance_loss_mlp": 1.29055238, + "epoch": 0.03116583301269719, + "flos": 1109312133120.0, + "grad_norm": 0.09428930343360856, + "language_loss": 0.98546314, + "learning_rate": 0.0009999965058940775, + "loss": 0.99852067, + "num_input_tokens_seen": 12443936, + "router_z_loss_mlp": 0.1517334, + "step": 162, + "time_per_iteration": 3.486618995666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01315996, + "balance_loss_mlp": 1.3010118, + "epoch": 0.031358214697960754, + "flos": 450676403712.0, + "grad_norm": 0.09976775191278689, + "language_loss": 1.04580116, + "learning_rate": 0.0009999952441356057, + "loss": 1.05896115, + "num_input_tokens_seen": 12507488, + "router_z_loss_mlp": 0.1496582, + "step": 163, + "time_per_iteration": 2.537173271179199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01300744, + "balance_loss_mlp": 1.28654623, + "epoch": 0.031550596383224314, + "flos": 1254701325312.0, + "grad_norm": 0.0838197011845512, + "language_loss": 1.05903006, + "learning_rate": 0.000999993788261765, + "loss": 1.07203746, + "num_input_tokens_seen": 12594096, + "router_z_loss_mlp": 0.14196777, + "step": 164, + "time_per_iteration": 3.5638957023620605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01270584, + "balance_loss_mlp": 1.25625503, + "epoch": 0.03174297806848788, + "flos": 667841310720.0, + "grad_norm": 0.068717417443618, + "language_loss": 1.0642612, + "learning_rate": 0.00099999213827312, + "loss": 1.076967, + "num_input_tokens_seen": 12669424, + "router_z_loss_mlp": 0.14343262, + "step": 165, + "time_per_iteration": 2.8084213733673096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01255587, + "balance_loss_mlp": 1.24152076, + "epoch": 0.03193535975375144, + "flos": 551033832960.0, + "grad_norm": 0.06892139424853191, + "language_loss": 1.0208962, + "learning_rate": 0.000999990294170312, + "loss": 1.03345203, + "num_input_tokens_seen": 12740080, + "router_z_loss_mlp": 0.14074707, + "step": 166, + "time_per_iteration": 2.6247787475585938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01259954, + "balance_loss_mlp": 1.24549401, + "epoch": 0.032127741439015006, + "flos": 543377700864.0, + "grad_norm": 0.08292396830811857, + "language_loss": 1.05774951, + "learning_rate": 0.0009999882559540566, + "loss": 1.07034898, + "num_input_tokens_seen": 12810576, + "router_z_loss_mlp": 0.14465332, + "step": 167, + "time_per_iteration": 2.654036283493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291491, + "balance_loss_mlp": 1.27790117, + "epoch": 0.032320123124278566, + "flos": 548104928256.0, + "grad_norm": 0.07217909902530589, + "language_loss": 1.02104354, + "learning_rate": 0.000999986023625145, + "loss": 1.03395844, + "num_input_tokens_seen": 12887904, + "router_z_loss_mlp": 0.13598633, + "step": 168, + "time_per_iteration": 2.696866750717163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03738194, + "balance_loss_mlp": 3.61993837, + "epoch": 0.03251250480954213, + "flos": 1305156865536.0, + "grad_norm": 0.563981464368737, + "language_loss": 0.78924417, + "learning_rate": 0.0009999835971844441, + "loss": 0.82662606, + "num_input_tokens_seen": 13107344, + "router_z_loss_mlp": 1.1796875, + "step": 169, + "time_per_iteration": 4.971506834030151 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0134723, + "balance_loss_mlp": 1.33386648, + "epoch": 0.03270488649480569, + "flos": 560866609152.0, + "grad_norm": 0.12141219581883538, + "language_loss": 1.02540469, + "learning_rate": 0.0009999809766328958, + "loss": 1.03887701, + "num_input_tokens_seen": 13175552, + "router_z_loss_mlp": 0.13391113, + "step": 170, + "time_per_iteration": 2.646425724029541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01355192, + "balance_loss_mlp": 1.34039843, + "epoch": 0.03289726818006926, + "flos": 482120031744.0, + "grad_norm": 0.08046017426621577, + "language_loss": 1.05186188, + "learning_rate": 0.0009999781619715177, + "loss": 1.06541371, + "num_input_tokens_seen": 13242384, + "router_z_loss_mlp": 0.14770508, + "step": 171, + "time_per_iteration": 2.535360336303711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01381569, + "balance_loss_mlp": 1.36640596, + "epoch": 0.03308964986533282, + "flos": 674355276288.0, + "grad_norm": 0.08789680193074563, + "language_loss": 1.04250002, + "learning_rate": 0.000999975153201402, + "loss": 1.05631578, + "num_input_tokens_seen": 13316160, + "router_z_loss_mlp": 0.15161133, + "step": 172, + "time_per_iteration": 2.8205513954162598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01433883, + "balance_loss_mlp": 1.41711044, + "epoch": 0.033282031550596385, + "flos": 608937785856.0, + "grad_norm": 0.07610360898370483, + "language_loss": 1.02505267, + "learning_rate": 0.0009999719503237174, + "loss": 1.03939152, + "num_input_tokens_seen": 13387664, + "router_z_loss_mlp": 0.16760254, + "step": 173, + "time_per_iteration": 2.738676071166992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01451195, + "balance_loss_mlp": 1.43315864, + "epoch": 0.033474413235859944, + "flos": 467801547264.0, + "grad_norm": 0.07270846083900323, + "language_loss": 1.111094, + "learning_rate": 0.0009999685533397073, + "loss": 1.12560594, + "num_input_tokens_seen": 13454528, + "router_z_loss_mlp": 0.18029785, + "step": 174, + "time_per_iteration": 2.5556905269622803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01429898, + "balance_loss_mlp": 1.41368508, + "epoch": 0.03366679492112351, + "flos": 579365263872.0, + "grad_norm": 0.09196642879711979, + "language_loss": 1.03199494, + "learning_rate": 0.00099996496225069, + "loss": 1.04629397, + "num_input_tokens_seen": 13522528, + "router_z_loss_mlp": 0.16210938, + "step": 175, + "time_per_iteration": 2.6806485652923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01432234, + "balance_loss_mlp": 1.41513896, + "epoch": 0.03385917660638707, + "flos": 637378315776.0, + "grad_norm": 0.08705990667808558, + "language_loss": 1.05897307, + "learning_rate": 0.0009999611770580604, + "loss": 1.07329535, + "num_input_tokens_seen": 13601120, + "router_z_loss_mlp": 0.17102051, + "step": 176, + "time_per_iteration": 2.830826759338379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01415158, + "balance_loss_mlp": 1.39910054, + "epoch": 0.03405155829165064, + "flos": 441587123712.0, + "grad_norm": 0.08054669051038237, + "language_loss": 1.03868258, + "learning_rate": 0.0009999571977632876, + "loss": 1.05283427, + "num_input_tokens_seen": 13666384, + "router_z_loss_mlp": 0.16052246, + "step": 177, + "time_per_iteration": 2.623309850692749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01463141, + "balance_loss_mlp": 1.44573641, + "epoch": 0.034243939976914196, + "flos": 466097766912.0, + "grad_norm": 0.08089290506220445, + "language_loss": 1.06928194, + "learning_rate": 0.0009999530243679166, + "loss": 1.08391333, + "num_input_tokens_seen": 13733968, + "router_z_loss_mlp": 0.17407227, + "step": 178, + "time_per_iteration": 2.545133113861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01451423, + "balance_loss_mlp": 1.43560433, + "epoch": 0.03443632166217776, + "flos": 778919016960.0, + "grad_norm": 0.08468734735068614, + "language_loss": 1.01505899, + "learning_rate": 0.0009999486568735675, + "loss": 1.0295732, + "num_input_tokens_seen": 13818960, + "router_z_loss_mlp": 0.15808105, + "step": 179, + "time_per_iteration": 3.0384457111358643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01433641, + "balance_loss_mlp": 1.41778612, + "epoch": 0.03462870334744132, + "flos": 1263284246016.0, + "grad_norm": 0.06997324880309466, + "language_loss": 1.01388979, + "learning_rate": 0.0009999440952819362, + "loss": 1.02822614, + "num_input_tokens_seen": 13912448, + "router_z_loss_mlp": 0.15856934, + "step": 180, + "time_per_iteration": 3.6892786026000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01401308, + "balance_loss_mlp": 1.38610911, + "epoch": 0.03482108503270489, + "flos": 606899354112.0, + "grad_norm": 0.057831512038439566, + "language_loss": 1.02027512, + "learning_rate": 0.0009999393395947935, + "loss": 1.03428817, + "num_input_tokens_seen": 13990752, + "router_z_loss_mlp": 0.15185547, + "step": 181, + "time_per_iteration": 2.826353073120117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01381551, + "balance_loss_mlp": 1.36612535, + "epoch": 0.03501346671796845, + "flos": 538010284032.0, + "grad_norm": 0.05913415109875365, + "language_loss": 1.05361927, + "learning_rate": 0.0009999343898139858, + "loss": 1.06743479, + "num_input_tokens_seen": 14058608, + "router_z_loss_mlp": 0.1541748, + "step": 182, + "time_per_iteration": 2.593250036239624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01359754, + "balance_loss_mlp": 1.33988214, + "epoch": 0.035205848403232015, + "flos": 518231250432.0, + "grad_norm": 0.05898920665253376, + "language_loss": 1.04308426, + "learning_rate": 0.0009999292459414348, + "loss": 1.05668187, + "num_input_tokens_seen": 14126656, + "router_z_loss_mlp": 0.19909668, + "step": 183, + "time_per_iteration": 2.565936326980591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01311064, + "balance_loss_mlp": 1.296103, + "epoch": 0.035398230088495575, + "flos": 472134486528.0, + "grad_norm": 0.06373248491183749, + "language_loss": 1.08499169, + "learning_rate": 0.0009999239079791374, + "loss": 1.09810233, + "num_input_tokens_seen": 14195840, + "router_z_loss_mlp": 0.14953613, + "step": 184, + "time_per_iteration": 2.552949905395508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130912, + "balance_loss_mlp": 1.29237127, + "epoch": 0.03559061177375914, + "flos": 511820591616.0, + "grad_norm": 0.056329932736213485, + "language_loss": 1.01337337, + "learning_rate": 0.0009999183759291659, + "loss": 1.02646446, + "num_input_tokens_seen": 14269936, + "router_z_loss_mlp": 0.16748047, + "step": 185, + "time_per_iteration": 2.741727113723755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291511, + "balance_loss_mlp": 1.27575147, + "epoch": 0.0357829934590227, + "flos": 477146903040.0, + "grad_norm": 0.11224085577532149, + "language_loss": 1.03738213, + "learning_rate": 0.0009999126497936682, + "loss": 1.05029726, + "num_input_tokens_seen": 14334848, + "router_z_loss_mlp": 0.1574707, + "step": 186, + "time_per_iteration": 2.4901957511901855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291515, + "balance_loss_mlp": 1.27446783, + "epoch": 0.03597537514428627, + "flos": 644350588416.0, + "grad_norm": 0.06537030709871235, + "language_loss": 1.06735992, + "learning_rate": 0.0009999067295748676, + "loss": 1.08027506, + "num_input_tokens_seen": 14407888, + "router_z_loss_mlp": 0.1706543, + "step": 187, + "time_per_iteration": 2.7923052310943604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01327575, + "balance_loss_mlp": 1.30966997, + "epoch": 0.03616775682954983, + "flos": 580916275200.0, + "grad_norm": 0.06523062893181024, + "language_loss": 1.04418302, + "learning_rate": 0.000999900615275062, + "loss": 1.05745876, + "num_input_tokens_seen": 14479072, + "router_z_loss_mlp": 0.17919922, + "step": 188, + "time_per_iteration": 2.7248637676239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295421, + "balance_loss_mlp": 1.27722955, + "epoch": 0.03636013851481339, + "flos": 382210735104.0, + "grad_norm": 0.08035209765807474, + "language_loss": 1.10347509, + "learning_rate": 0.0009998943068966256, + "loss": 1.11642933, + "num_input_tokens_seen": 14540944, + "router_z_loss_mlp": 0.18188477, + "step": 189, + "time_per_iteration": 2.429497480392456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01279097, + "balance_loss_mlp": 1.26120377, + "epoch": 0.03655252020007695, + "flos": 582954706944.0, + "grad_norm": 0.07380481555246936, + "language_loss": 1.0506779, + "learning_rate": 0.0009998878044420072, + "loss": 1.06346881, + "num_input_tokens_seen": 14611392, + "router_z_loss_mlp": 0.17907715, + "step": 190, + "time_per_iteration": 2.6878626346588135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012863, + "balance_loss_mlp": 1.26773953, + "epoch": 0.03674490188534051, + "flos": 471376433664.0, + "grad_norm": 0.10484442400689244, + "language_loss": 1.01223493, + "learning_rate": 0.0009998811079137318, + "loss": 1.02509785, + "num_input_tokens_seen": 14679776, + "router_z_loss_mlp": 0.18566895, + "step": 191, + "time_per_iteration": 2.561494827270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281775, + "balance_loss_mlp": 1.26645625, + "epoch": 0.03693728357060408, + "flos": 528113488896.0, + "grad_norm": 0.0609431296621874, + "language_loss": 1.01984763, + "learning_rate": 0.0009998742173143987, + "loss": 1.03266537, + "num_input_tokens_seen": 14749712, + "router_z_loss_mlp": 0.1529541, + "step": 192, + "time_per_iteration": 2.59798264503479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01336751, + "balance_loss_mlp": 1.32157528, + "epoch": 0.03712966525586764, + "flos": 798657352704.0, + "grad_norm": 0.10186248006293357, + "language_loss": 1.02005363, + "learning_rate": 0.0009998671326466833, + "loss": 1.03342128, + "num_input_tokens_seen": 14827136, + "router_z_loss_mlp": 0.15185547, + "step": 193, + "time_per_iteration": 2.9510865211486816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01331057, + "balance_loss_mlp": 1.3157624, + "epoch": 0.037322046941131205, + "flos": 829628116992.0, + "grad_norm": 0.06375125184008373, + "language_loss": 1.02914846, + "learning_rate": 0.0009998598539133362, + "loss": 1.04245901, + "num_input_tokens_seen": 14902880, + "router_z_loss_mlp": 0.1529541, + "step": 194, + "time_per_iteration": 2.9981300830841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01337882, + "balance_loss_mlp": 1.3235296, + "epoch": 0.037514428626394765, + "flos": 437460797952.0, + "grad_norm": 0.10181133305516413, + "language_loss": 1.03936744, + "learning_rate": 0.0009998523811171828, + "loss": 1.0527463, + "num_input_tokens_seen": 14967264, + "router_z_loss_mlp": 0.14379883, + "step": 195, + "time_per_iteration": 2.501542568206787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01296215, + "balance_loss_mlp": 1.28125429, + "epoch": 0.03770681031165833, + "flos": 511372459008.0, + "grad_norm": 0.09414845611868274, + "language_loss": 1.04584992, + "learning_rate": 0.0009998447142611248, + "loss": 1.05881214, + "num_input_tokens_seen": 15039104, + "router_z_loss_mlp": 0.14941406, + "step": 196, + "time_per_iteration": 2.6247317790985107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0128144, + "balance_loss_mlp": 1.26702762, + "epoch": 0.03789919199692189, + "flos": 807102061056.0, + "grad_norm": 0.05831249070889761, + "language_loss": 0.97701526, + "learning_rate": 0.0009998368533481387, + "loss": 0.9898296, + "num_input_tokens_seen": 15124864, + "router_z_loss_mlp": 0.14422607, + "step": 197, + "time_per_iteration": 3.01912784576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01294999, + "balance_loss_mlp": 1.27945375, + "epoch": 0.03809157368218546, + "flos": 690274234368.0, + "grad_norm": 0.06656848410147823, + "language_loss": 1.00630498, + "learning_rate": 0.0009998287983812762, + "loss": 1.01925504, + "num_input_tokens_seen": 15199680, + "router_z_loss_mlp": 0.15551758, + "step": 198, + "time_per_iteration": 2.8252804279327393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0135387, + "balance_loss_mlp": 1.33592904, + "epoch": 0.03828395536744902, + "flos": 517675428864.0, + "grad_norm": 0.06988401379713739, + "language_loss": 1.06386423, + "learning_rate": 0.0009998205493636646, + "loss": 1.07740283, + "num_input_tokens_seen": 15270176, + "router_z_loss_mlp": 0.17944336, + "step": 199, + "time_per_iteration": 2.649765729904175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01339461, + "balance_loss_mlp": 1.32242572, + "epoch": 0.038476337052712584, + "flos": 581389138944.0, + "grad_norm": 0.07184113921580974, + "language_loss": 0.9925406, + "learning_rate": 0.0009998121062985063, + "loss": 1.00593519, + "num_input_tokens_seen": 15343168, + "router_z_loss_mlp": 0.17053223, + "step": 200, + "time_per_iteration": 2.6788320541381836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01328142, + "balance_loss_mlp": 1.31167912, + "epoch": 0.03866871873797614, + "flos": 576791359488.0, + "grad_norm": 0.059667024197710104, + "language_loss": 1.01260698, + "learning_rate": 0.0009998034691890794, + "loss": 1.02588844, + "num_input_tokens_seen": 15417328, + "router_z_loss_mlp": 0.16455078, + "step": 201, + "time_per_iteration": 2.753265380859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01297644, + "balance_loss_mlp": 1.28249288, + "epoch": 0.03886110042323971, + "flos": 540472117248.0, + "grad_norm": 0.07302515973387386, + "language_loss": 1.05948424, + "learning_rate": 0.0009997946380387369, + "loss": 1.07246065, + "num_input_tokens_seen": 15489488, + "router_z_loss_mlp": 0.15136719, + "step": 202, + "time_per_iteration": 2.618546485900879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262023, + "balance_loss_mlp": 1.24746776, + "epoch": 0.03905348210850327, + "flos": 717694843392.0, + "grad_norm": 0.0775452329378228, + "language_loss": 1.08266401, + "learning_rate": 0.0009997856128509076, + "loss": 1.09528422, + "num_input_tokens_seen": 15558944, + "router_z_loss_mlp": 0.14550781, + "step": 203, + "time_per_iteration": 2.8284859657287598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267878, + "balance_loss_mlp": 1.25321579, + "epoch": 0.039245863793766836, + "flos": 427268639232.0, + "grad_norm": 0.06664318613050589, + "language_loss": 1.02886617, + "learning_rate": 0.0009997763936290952, + "loss": 1.04154491, + "num_input_tokens_seen": 15625024, + "router_z_loss_mlp": 0.14660645, + "step": 204, + "time_per_iteration": 2.516263246536255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0129892, + "balance_loss_mlp": 1.28264785, + "epoch": 0.039438245479030395, + "flos": 662804163072.0, + "grad_norm": 0.07463685050771204, + "language_loss": 1.0815413, + "learning_rate": 0.0009997669803768789, + "loss": 1.09453046, + "num_input_tokens_seen": 15697120, + "router_z_loss_mlp": 0.16271973, + "step": 205, + "time_per_iteration": 2.7576606273651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291456, + "balance_loss_mlp": 1.27812803, + "epoch": 0.03963062716429396, + "flos": 635063459328.0, + "grad_norm": 0.055878982250893716, + "language_loss": 1.03253651, + "learning_rate": 0.0009997573730979134, + "loss": 1.04545116, + "num_input_tokens_seen": 15768752, + "router_z_loss_mlp": 0.13342285, + "step": 206, + "time_per_iteration": 2.716325521469116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.07279634, + "balance_loss_mlp": 4.65512276, + "epoch": 0.03982300884955752, + "flos": 1417813286400.0, + "grad_norm": 0.533603848118922, + "language_loss": 0.79193199, + "learning_rate": 0.0009997475717959284, + "loss": 0.86472833, + "num_input_tokens_seen": 15980624, + "router_z_loss_mlp": 26.25, + "step": 207, + "time_per_iteration": 4.635821342468262 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0137482, + "balance_loss_mlp": 1.35964513, + "epoch": 0.04001539053482109, + "flos": 688769713152.0, + "grad_norm": 0.1040721574676452, + "language_loss": 1.02094078, + "learning_rate": 0.0009997375764747294, + "loss": 1.03468895, + "num_input_tokens_seen": 16067232, + "router_z_loss_mlp": 0.1517334, + "step": 208, + "time_per_iteration": 2.974442481994629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01415285, + "balance_loss_mlp": 1.40052748, + "epoch": 0.04020777222008465, + "flos": 533363042304.0, + "grad_norm": 0.08111266742266361, + "language_loss": 0.99458027, + "learning_rate": 0.0009997273871381967, + "loss": 1.00873303, + "num_input_tokens_seen": 16139808, + "router_z_loss_mlp": 0.14758301, + "step": 209, + "time_per_iteration": 2.6802144050598145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01466201, + "balance_loss_mlp": 1.44989347, + "epoch": 0.040400153905348214, + "flos": 567661381632.0, + "grad_norm": 0.06875741115436663, + "language_loss": 1.05031717, + "learning_rate": 0.0009997170037902862, + "loss": 1.0649792, + "num_input_tokens_seen": 16210848, + "router_z_loss_mlp": 0.16308594, + "step": 210, + "time_per_iteration": 2.6975836753845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01531768, + "balance_loss_mlp": 1.51399446, + "epoch": 0.040592535590611774, + "flos": 713130559488.0, + "grad_norm": 0.07197690318934227, + "language_loss": 1.07202697, + "learning_rate": 0.0009997064264350292, + "loss": 1.08734465, + "num_input_tokens_seen": 16283984, + "router_z_loss_mlp": 0.17785645, + "step": 211, + "time_per_iteration": 2.836771011352539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01531925, + "balance_loss_mlp": 1.5154984, + "epoch": 0.04078491727587533, + "flos": 577824427008.0, + "grad_norm": 0.09120436996840299, + "language_loss": 1.0146966, + "learning_rate": 0.0009996956550765317, + "loss": 1.03001595, + "num_input_tokens_seen": 16353904, + "router_z_loss_mlp": 0.16430664, + "step": 212, + "time_per_iteration": 2.671292781829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01499293, + "balance_loss_mlp": 1.4817214, + "epoch": 0.0409772989611389, + "flos": 552033404928.0, + "grad_norm": 0.11449485477945152, + "language_loss": 0.96278083, + "learning_rate": 0.0009996846897189762, + "loss": 0.97777379, + "num_input_tokens_seen": 16425488, + "router_z_loss_mlp": 0.17565918, + "step": 213, + "time_per_iteration": 2.6231424808502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.014653, + "balance_loss_mlp": 1.44753814, + "epoch": 0.04116968064640246, + "flos": 555347833344.0, + "grad_norm": 0.09512793115916172, + "language_loss": 1.02356708, + "learning_rate": 0.0009996735303666193, + "loss": 1.03822017, + "num_input_tokens_seen": 16498016, + "router_z_loss_mlp": 0.1776123, + "step": 214, + "time_per_iteration": 2.6930177211761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01477298, + "balance_loss_mlp": 1.46134758, + "epoch": 0.041362062331666026, + "flos": 578204158464.0, + "grad_norm": 0.09141123477091552, + "language_loss": 1.04750729, + "learning_rate": 0.0009996621770237937, + "loss": 1.06228042, + "num_input_tokens_seen": 16573744, + "router_z_loss_mlp": 0.15942383, + "step": 215, + "time_per_iteration": 2.7448923587799072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01578462, + "balance_loss_mlp": 1.56013966, + "epoch": 0.041554444016929586, + "flos": 611130396672.0, + "grad_norm": 0.10233552268903827, + "language_loss": 0.99822551, + "learning_rate": 0.0009996506296949073, + "loss": 1.01401007, + "num_input_tokens_seen": 16655344, + "router_z_loss_mlp": 0.18334961, + "step": 216, + "time_per_iteration": 2.8548526763916016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01609008, + "balance_loss_mlp": 1.59156775, + "epoch": 0.04174682570219315, + "flos": 527857413120.0, + "grad_norm": 0.10522858499680945, + "language_loss": 0.99888742, + "learning_rate": 0.0009996388883844428, + "loss": 1.01497757, + "num_input_tokens_seen": 16726480, + "router_z_loss_mlp": 0.17456055, + "step": 217, + "time_per_iteration": 2.618546724319458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01557164, + "balance_loss_mlp": 1.54124999, + "epoch": 0.04193920738745671, + "flos": 511258977792.0, + "grad_norm": 0.09341741551851517, + "language_loss": 1.03841758, + "learning_rate": 0.0009996269530969588, + "loss": 1.05398929, + "num_input_tokens_seen": 16792112, + "router_z_loss_mlp": 0.15905762, + "step": 218, + "time_per_iteration": 2.6204636096954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01525903, + "balance_loss_mlp": 1.50927377, + "epoch": 0.04213158907272028, + "flos": 571226093568.0, + "grad_norm": 0.09609660813155754, + "language_loss": 1.02944803, + "learning_rate": 0.0009996148238370888, + "loss": 1.04470706, + "num_input_tokens_seen": 16862960, + "router_z_loss_mlp": 0.16625977, + "step": 219, + "time_per_iteration": 2.7071943283081055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0150071, + "balance_loss_mlp": 1.48340106, + "epoch": 0.04232397075798384, + "flos": 963803667456.0, + "grad_norm": 0.05454565212769997, + "language_loss": 0.9941752, + "learning_rate": 0.0009996025006095421, + "loss": 1.00918233, + "num_input_tokens_seen": 16950416, + "router_z_loss_mlp": 0.1730957, + "step": 220, + "time_per_iteration": 3.3006374835968018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.10944285, + "balance_loss_mlp": 6.84272289, + "epoch": 0.042516352443247404, + "flos": 1468814777856.0, + "grad_norm": 0.48497418398004566, + "language_loss": 0.77783144, + "learning_rate": 0.0009995899834191028, + "loss": 0.88727427, + "num_input_tokens_seen": 17180944, + "router_z_loss_mlp": 41.0, + "step": 221, + "time_per_iteration": 5.7136383056640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01601015, + "balance_loss_mlp": 1.58291924, + "epoch": 0.042708734128510964, + "flos": 654419091456.0, + "grad_norm": 0.10763646442297387, + "language_loss": 0.99765503, + "learning_rate": 0.0009995772722706307, + "loss": 1.0136652, + "num_input_tokens_seen": 17257792, + "router_z_loss_mlp": 0.1809082, + "step": 222, + "time_per_iteration": 2.8322792053222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01658843, + "balance_loss_mlp": 1.63811278, + "epoch": 0.04290111581377453, + "flos": 431601578496.0, + "grad_norm": 0.16393394652444138, + "language_loss": 1.13557565, + "learning_rate": 0.0009995643671690604, + "loss": 1.1521641, + "num_input_tokens_seen": 17320288, + "router_z_loss_mlp": 0.20739746, + "step": 223, + "time_per_iteration": 2.470729351043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0163871, + "balance_loss_mlp": 1.61504686, + "epoch": 0.04309349749903809, + "flos": 644379701760.0, + "grad_norm": 0.08733094203359489, + "language_loss": 1.00837708, + "learning_rate": 0.0009995512681194023, + "loss": 1.02476418, + "num_input_tokens_seen": 17396672, + "router_z_loss_mlp": 0.23632812, + "step": 224, + "time_per_iteration": 2.8274452686309814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01615568, + "balance_loss_mlp": 1.58755326, + "epoch": 0.04328587918430166, + "flos": 830861853696.0, + "grad_norm": 0.12001676841435771, + "language_loss": 0.98664522, + "learning_rate": 0.0009995379751267417, + "loss": 1.00280082, + "num_input_tokens_seen": 17488096, + "router_z_loss_mlp": 0.28027344, + "step": 225, + "time_per_iteration": 3.275660991668701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01617416, + "balance_loss_mlp": 1.58639741, + "epoch": 0.043478260869565216, + "flos": 524804852736.0, + "grad_norm": 0.1467276253632499, + "language_loss": 1.0007726, + "learning_rate": 0.0009995244881962398, + "loss": 1.01694679, + "num_input_tokens_seen": 17557632, + "router_z_loss_mlp": 0.30981445, + "step": 226, + "time_per_iteration": 2.6300203800201416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01601732, + "balance_loss_mlp": 1.56787658, + "epoch": 0.04367064255482878, + "flos": 439253328384.0, + "grad_norm": 0.095918638324787, + "language_loss": 1.01389623, + "learning_rate": 0.0009995108073331323, + "loss": 1.02991343, + "num_input_tokens_seen": 17626672, + "router_z_loss_mlp": 0.33862305, + "step": 227, + "time_per_iteration": 2.667628765106201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0158134, + "balance_loss_mlp": 1.5462923, + "epoch": 0.04386302424009234, + "flos": 507109330944.0, + "grad_norm": 0.08564981186298011, + "language_loss": 1.04024279, + "learning_rate": 0.0009994969325427309, + "loss": 1.05605614, + "num_input_tokens_seen": 17698624, + "router_z_loss_mlp": 0.35058594, + "step": 228, + "time_per_iteration": 2.6454501152038574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01558795, + "balance_loss_mlp": 1.52224541, + "epoch": 0.04405540592535591, + "flos": 540432829440.0, + "grad_norm": 0.07744391701114339, + "language_loss": 1.00052619, + "learning_rate": 0.0009994828638304218, + "loss": 1.016114, + "num_input_tokens_seen": 17767760, + "router_z_loss_mlp": 0.36547852, + "step": 229, + "time_per_iteration": 2.6468071937561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01516137, + "balance_loss_mlp": 1.47794271, + "epoch": 0.04424778761061947, + "flos": 446136850944.0, + "grad_norm": 0.08052263902742013, + "language_loss": 1.06763554, + "learning_rate": 0.0009994686012016675, + "loss": 1.08279693, + "num_input_tokens_seen": 17833664, + "router_z_loss_mlp": 0.3815918, + "step": 230, + "time_per_iteration": 2.5467634201049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01483037, + "balance_loss_mlp": 1.44515228, + "epoch": 0.044440169295883035, + "flos": 700383435264.0, + "grad_norm": 0.05918307184238542, + "language_loss": 1.0518043, + "learning_rate": 0.000999454144662005, + "loss": 1.06663465, + "num_input_tokens_seen": 17908880, + "router_z_loss_mlp": 0.37866211, + "step": 231, + "time_per_iteration": 2.8704099655151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01473358, + "balance_loss_mlp": 1.43549716, + "epoch": 0.044632550981146595, + "flos": 588055873536.0, + "grad_norm": 0.08626514264815018, + "language_loss": 0.99676436, + "learning_rate": 0.0009994394942170468, + "loss": 1.01149797, + "num_input_tokens_seen": 17978208, + "router_z_loss_mlp": 0.37866211, + "step": 232, + "time_per_iteration": 2.6578898429870605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01461415, + "balance_loss_mlp": 1.4258194, + "epoch": 0.04482493266641016, + "flos": 554534525952.0, + "grad_norm": 0.07124765242066121, + "language_loss": 0.96965969, + "learning_rate": 0.0009994246498724808, + "loss": 0.98427379, + "num_input_tokens_seen": 18049296, + "router_z_loss_mlp": 0.35620117, + "step": 233, + "time_per_iteration": 2.7764015197753906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01463645, + "balance_loss_mlp": 1.42790616, + "epoch": 0.04501731435167372, + "flos": 722500646400.0, + "grad_norm": 0.07759597622956232, + "language_loss": 0.99069166, + "learning_rate": 0.00099940961163407, + "loss": 1.00532806, + "num_input_tokens_seen": 18123296, + "router_z_loss_mlp": 0.35766602, + "step": 234, + "time_per_iteration": 2.8431143760681152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01454599, + "balance_loss_mlp": 1.42098188, + "epoch": 0.04520969603693728, + "flos": 511539784704.0, + "grad_norm": 0.05931413709293958, + "language_loss": 1.02564597, + "learning_rate": 0.0009993943795076528, + "loss": 1.04019189, + "num_input_tokens_seen": 18192784, + "router_z_loss_mlp": 0.33642578, + "step": 235, + "time_per_iteration": 2.645988702774048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01444244, + "balance_loss_mlp": 1.40936303, + "epoch": 0.04540207772220085, + "flos": 364854246912.0, + "grad_norm": 0.07280953320994132, + "language_loss": 1.04776168, + "learning_rate": 0.0009993789534991427, + "loss": 1.062204, + "num_input_tokens_seen": 18254064, + "router_z_loss_mlp": 0.34912109, + "step": 236, + "time_per_iteration": 2.4084837436676025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01418385, + "balance_loss_mlp": 1.38390946, + "epoch": 0.045594459407464406, + "flos": 522407038464.0, + "grad_norm": 0.060943880380569936, + "language_loss": 0.99500269, + "learning_rate": 0.0009993633336145287, + "loss": 1.00918651, + "num_input_tokens_seen": 18325728, + "router_z_loss_mlp": 0.34472656, + "step": 237, + "time_per_iteration": 2.6044533252716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01406135, + "balance_loss_mlp": 1.3730185, + "epoch": 0.04578684109272797, + "flos": 671442338304.0, + "grad_norm": 0.06747057459653658, + "language_loss": 1.03573179, + "learning_rate": 0.0009993475198598752, + "loss": 1.04979324, + "num_input_tokens_seen": 18408608, + "router_z_loss_mlp": 0.33129883, + "step": 238, + "time_per_iteration": 2.9948084354400635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01383668, + "balance_loss_mlp": 1.35164809, + "epoch": 0.04597922277799153, + "flos": 541387321344.0, + "grad_norm": 0.07135856148897902, + "language_loss": 0.99909985, + "learning_rate": 0.0009993315122413212, + "loss": 1.01293659, + "num_input_tokens_seen": 18471920, + "router_z_loss_mlp": 0.32006836, + "step": 239, + "time_per_iteration": 2.5848827362060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01369111, + "balance_loss_mlp": 1.33773541, + "epoch": 0.0461716044632551, + "flos": 458732616192.0, + "grad_norm": 0.056000088810755834, + "language_loss": 1.0008105, + "learning_rate": 0.0009993153107650818, + "loss": 1.01450157, + "num_input_tokens_seen": 18540496, + "router_z_loss_mlp": 0.31347656, + "step": 240, + "time_per_iteration": 2.5492687225341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01338815, + "balance_loss_mlp": 1.31015706, + "epoch": 0.04636398614851866, + "flos": 455009342976.0, + "grad_norm": 0.06491754001609312, + "language_loss": 0.99534512, + "learning_rate": 0.0009992989154374468, + "loss": 1.00873327, + "num_input_tokens_seen": 18606944, + "router_z_loss_mlp": 0.28662109, + "step": 241, + "time_per_iteration": 2.511237621307373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01294622, + "balance_loss_mlp": 1.26833653, + "epoch": 0.046556367833782225, + "flos": 556558401024.0, + "grad_norm": 0.07592069792168304, + "language_loss": 1.06626534, + "learning_rate": 0.0009992823262647817, + "loss": 1.07921147, + "num_input_tokens_seen": 18679520, + "router_z_loss_mlp": 0.26293945, + "step": 242, + "time_per_iteration": 2.7618701457977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249282, + "balance_loss_mlp": 1.22240043, + "epoch": 0.046748749519045785, + "flos": 592625949696.0, + "grad_norm": 0.0687662987323222, + "language_loss": 1.00893593, + "learning_rate": 0.0009992655432535264, + "loss": 1.0214287, + "num_input_tokens_seen": 18756656, + "router_z_loss_mlp": 0.26879883, + "step": 243, + "time_per_iteration": 2.7471935749053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01255015, + "balance_loss_mlp": 1.23083937, + "epoch": 0.04694113120430935, + "flos": 569596506624.0, + "grad_norm": 0.07373455055845594, + "language_loss": 1.0054853, + "learning_rate": 0.0009992485664101973, + "loss": 1.01803541, + "num_input_tokens_seen": 18829792, + "router_z_loss_mlp": 0.24169922, + "step": 244, + "time_per_iteration": 2.635344982147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295554, + "balance_loss_mlp": 1.27291572, + "epoch": 0.04713351288957291, + "flos": 863401158144.0, + "grad_norm": 0.10584905626659928, + "language_loss": 1.03312445, + "learning_rate": 0.000999231395741385, + "loss": 1.04607987, + "num_input_tokens_seen": 18906864, + "router_z_loss_mlp": 0.22631836, + "step": 245, + "time_per_iteration": 3.093386173248291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0128706, + "balance_loss_mlp": 1.26464868, + "epoch": 0.04732589457483648, + "flos": 536961249792.0, + "grad_norm": 0.08844420521863233, + "language_loss": 1.01371169, + "learning_rate": 0.0009992140312537557, + "loss": 1.02658224, + "num_input_tokens_seen": 18973632, + "router_z_loss_mlp": 0.22412109, + "step": 246, + "time_per_iteration": 2.667579412460327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01256359, + "balance_loss_mlp": 1.23515141, + "epoch": 0.04751827626010004, + "flos": 761566910976.0, + "grad_norm": 0.052835972446563725, + "language_loss": 0.9609164, + "learning_rate": 0.000999196472954051, + "loss": 0.97347999, + "num_input_tokens_seen": 19052944, + "router_z_loss_mlp": 0.2121582, + "step": 247, + "time_per_iteration": 2.9537084102630615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02369813, + "balance_loss_mlp": 2.16687083, + "epoch": 0.0477106579453636, + "flos": 1578961313280.0, + "grad_norm": 0.2151482568863758, + "language_loss": 0.79424852, + "learning_rate": 0.0009991787208490878, + "loss": 0.81794667, + "num_input_tokens_seen": 19286288, + "router_z_loss_mlp": 2.03125, + "step": 248, + "time_per_iteration": 5.758621454238892 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01289622, + "balance_loss_mlp": 1.27137113, + "epoch": 0.04790303963062716, + "flos": 457535195136.0, + "grad_norm": 0.10849969336884063, + "language_loss": 1.03316629, + "learning_rate": 0.0009991607749457578, + "loss": 1.04606247, + "num_input_tokens_seen": 19349296, + "router_z_loss_mlp": 0.18261719, + "step": 249, + "time_per_iteration": 2.5432913303375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01334566, + "balance_loss_mlp": 1.31724536, + "epoch": 0.04809542131589073, + "flos": 782079266304.0, + "grad_norm": 0.08264534697846654, + "language_loss": 1.01180637, + "learning_rate": 0.0009991426352510286, + "loss": 1.02515209, + "num_input_tokens_seen": 19428416, + "router_z_loss_mlp": 0.17321777, + "step": 250, + "time_per_iteration": 3.1542766094207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01351096, + "balance_loss_mlp": 1.33215368, + "epoch": 0.04828780300115429, + "flos": 558995503104.0, + "grad_norm": 0.06435857362074206, + "language_loss": 1.03307557, + "learning_rate": 0.0009991243017719422, + "loss": 1.04658651, + "num_input_tokens_seen": 19498688, + "router_z_loss_mlp": 0.18933105, + "step": 251, + "time_per_iteration": 2.693882942199707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01333217, + "balance_loss_mlp": 1.31485844, + "epoch": 0.048480184686417856, + "flos": 501682277376.0, + "grad_norm": 0.09276508096019526, + "language_loss": 0.97794825, + "learning_rate": 0.0009991057745156165, + "loss": 0.99128038, + "num_input_tokens_seen": 19567568, + "router_z_loss_mlp": 0.18347168, + "step": 252, + "time_per_iteration": 2.628873109817505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01867297, + "balance_loss_mlp": 1.75514495, + "epoch": 0.048672566371681415, + "flos": 1535585430528.0, + "grad_norm": 0.16359674361847032, + "language_loss": 0.81910986, + "learning_rate": 0.0009990870534892446, + "loss": 0.8377828, + "num_input_tokens_seen": 19796368, + "router_z_loss_mlp": 1.125, + "step": 253, + "time_per_iteration": 5.060615062713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01285031, + "balance_loss_mlp": 1.26567185, + "epoch": 0.04886494805694498, + "flos": 537665458176.0, + "grad_norm": 0.07164286827098729, + "language_loss": 1.06546187, + "learning_rate": 0.0009990681387000943, + "loss": 1.07831216, + "num_input_tokens_seen": 19870480, + "router_z_loss_mlp": 0.19384766, + "step": 254, + "time_per_iteration": 2.783367395401001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01275754, + "balance_loss_mlp": 1.25606036, + "epoch": 0.04905732974220854, + "flos": 679841966592.0, + "grad_norm": 0.06618046133348403, + "language_loss": 1.01404011, + "learning_rate": 0.0009990490301555093, + "loss": 1.02679765, + "num_input_tokens_seen": 19956288, + "router_z_loss_mlp": 0.19689941, + "step": 255, + "time_per_iteration": 2.9520761966705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01936632, + "balance_loss_mlp": 1.86796737, + "epoch": 0.04924971142747211, + "flos": 1420408949760.0, + "grad_norm": 0.31562964738653715, + "language_loss": 0.79215157, + "learning_rate": 0.0009990297278629078, + "loss": 0.81151783, + "num_input_tokens_seen": 20180080, + "router_z_loss_mlp": 0.6875, + "step": 256, + "time_per_iteration": 4.825209856033325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01615246, + "balance_loss_mlp": 1.55344784, + "epoch": 0.04944209311273567, + "flos": 1557202074624.0, + "grad_norm": 0.16937574338078817, + "language_loss": 0.79242742, + "learning_rate": 0.000999010231829784, + "loss": 0.80857986, + "num_input_tokens_seen": 20413456, + "router_z_loss_mlp": 0.6171875, + "step": 257, + "time_per_iteration": 4.995501518249512 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0163115, + "balance_loss_mlp": 1.58422887, + "epoch": 0.04963447479799923, + "flos": 1569985514496.0, + "grad_norm": 0.13524925240989144, + "language_loss": 0.69975883, + "learning_rate": 0.0009989905420637066, + "loss": 0.71607035, + "num_input_tokens_seen": 20644736, + "router_z_loss_mlp": 0.46875, + "step": 258, + "time_per_iteration": 4.841471910476685 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272668, + "balance_loss_mlp": 1.24463022, + "epoch": 0.049826856483262794, + "flos": 625063357440.0, + "grad_norm": 0.06365504505971183, + "language_loss": 0.95603192, + "learning_rate": 0.0009989706585723202, + "loss": 0.96875864, + "num_input_tokens_seen": 20719040, + "router_z_loss_mlp": 0.28076172, + "step": 259, + "time_per_iteration": 2.7635786533355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130022, + "balance_loss_mlp": 1.27020288, + "epoch": 0.05001923816852635, + "flos": 503912765952.0, + "grad_norm": 0.062257698278494894, + "language_loss": 1.01846027, + "learning_rate": 0.0009989505813633442, + "loss": 1.03146255, + "num_input_tokens_seen": 20789376, + "router_z_loss_mlp": 0.29980469, + "step": 260, + "time_per_iteration": 2.6451833248138428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0131611, + "balance_loss_mlp": 1.28101516, + "epoch": 0.05021161985378992, + "flos": 587066476032.0, + "grad_norm": 0.06290514068599455, + "language_loss": 1.01911807, + "learning_rate": 0.000998930310444573, + "loss": 1.03227913, + "num_input_tokens_seen": 20857856, + "router_z_loss_mlp": 0.35083008, + "step": 261, + "time_per_iteration": 2.6989662647247314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01324978, + "balance_loss_mlp": 1.2880708, + "epoch": 0.05040400153905348, + "flos": 633029409792.0, + "grad_norm": 0.0625839964239575, + "language_loss": 1.00387836, + "learning_rate": 0.0009989098458238765, + "loss": 1.01712811, + "num_input_tokens_seen": 20931232, + "router_z_loss_mlp": 0.36914062, + "step": 262, + "time_per_iteration": 2.7581043243408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319841, + "balance_loss_mlp": 1.28395867, + "epoch": 0.050596383224317046, + "flos": 553344307200.0, + "grad_norm": 0.06067150197267865, + "language_loss": 0.99905968, + "learning_rate": 0.0009988891875091998, + "loss": 1.01225805, + "num_input_tokens_seen": 21012672, + "router_z_loss_mlp": 0.35913086, + "step": 263, + "time_per_iteration": 2.7601842880249023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0131413, + "balance_loss_mlp": 1.27793837, + "epoch": 0.050788764909580605, + "flos": 549389689344.0, + "grad_norm": 0.07440292928735547, + "language_loss": 0.94277728, + "learning_rate": 0.0009988683355085636, + "loss": 0.95591855, + "num_input_tokens_seen": 21088592, + "router_z_loss_mlp": 0.36206055, + "step": 264, + "time_per_iteration": 2.7262909412384033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01277315, + "balance_loss_mlp": 1.24248254, + "epoch": 0.05098114659484417, + "flos": 604812870144.0, + "grad_norm": 0.06984595792035174, + "language_loss": 1.02861905, + "learning_rate": 0.000998847289830063, + "loss": 1.04139221, + "num_input_tokens_seen": 21169840, + "router_z_loss_mlp": 0.34838867, + "step": 265, + "time_per_iteration": 2.8318397998809814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01256298, + "balance_loss_mlp": 1.22272849, + "epoch": 0.05117352828010773, + "flos": 438317775360.0, + "grad_norm": 0.08677906198544101, + "language_loss": 0.95779377, + "learning_rate": 0.0009988260504818682, + "loss": 0.9703567, + "num_input_tokens_seen": 21236144, + "router_z_loss_mlp": 0.3359375, + "step": 266, + "time_per_iteration": 2.5212388038635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220367, + "balance_loss_mlp": 1.19046903, + "epoch": 0.0513659099653713, + "flos": 504784300032.0, + "grad_norm": 0.09456939977029206, + "language_loss": 1.01958096, + "learning_rate": 0.000998804617472226, + "loss": 1.03178465, + "num_input_tokens_seen": 21304864, + "router_z_loss_mlp": 0.29858398, + "step": 267, + "time_per_iteration": 2.649739980697632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169139, + "balance_loss_mlp": 1.14131606, + "epoch": 0.05155829165063486, + "flos": 695183344128.0, + "grad_norm": 0.07125411147685125, + "language_loss": 0.97574937, + "learning_rate": 0.0009987829908094568, + "loss": 0.98744082, + "num_input_tokens_seen": 21377504, + "router_z_loss_mlp": 0.27856445, + "step": 268, + "time_per_iteration": 2.816098690032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119703, + "balance_loss_mlp": 1.09379935, + "epoch": 0.051750673335898424, + "flos": 1347751830528.0, + "grad_norm": 0.06583247177587333, + "language_loss": 1.04151332, + "learning_rate": 0.0009987611705019569, + "loss": 1.05271029, + "num_input_tokens_seen": 21463840, + "router_z_loss_mlp": 0.25927734, + "step": 269, + "time_per_iteration": 4.478148460388184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103433, + "balance_loss_mlp": 1.08008027, + "epoch": 0.051943055021161984, + "flos": 489362936832.0, + "grad_norm": 0.06787757239342199, + "language_loss": 1.02481639, + "learning_rate": 0.0009987391565581978, + "loss": 1.03585076, + "num_input_tokens_seen": 21531184, + "router_z_loss_mlp": 0.23364258, + "step": 270, + "time_per_iteration": 2.5662009716033936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111004, + "balance_loss_mlp": 1.08859241, + "epoch": 0.05213543670642555, + "flos": 545504882688.0, + "grad_norm": 0.08198896814085149, + "language_loss": 0.9504528, + "learning_rate": 0.000998716948986726, + "loss": 0.96156287, + "num_input_tokens_seen": 21612224, + "router_z_loss_mlp": 0.22424316, + "step": 271, + "time_per_iteration": 2.7815349102020264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158552, + "balance_loss_mlp": 1.13697529, + "epoch": 0.05232781839168911, + "flos": 603285179904.0, + "grad_norm": 0.07646156534985457, + "language_loss": 0.97641921, + "learning_rate": 0.0009986945477961633, + "loss": 0.9880048, + "num_input_tokens_seen": 21681024, + "router_z_loss_mlp": 0.21569824, + "step": 272, + "time_per_iteration": 2.694547414779663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188724, + "balance_loss_mlp": 1.16735017, + "epoch": 0.052520200076952676, + "flos": 538218307584.0, + "grad_norm": 0.07381807258867126, + "language_loss": 1.02498066, + "learning_rate": 0.0009986719529952066, + "loss": 1.03686786, + "num_input_tokens_seen": 21761616, + "router_z_loss_mlp": 0.21386719, + "step": 273, + "time_per_iteration": 2.8339192867279053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121752, + "balance_loss_mlp": 1.19785035, + "epoch": 0.052712581762216236, + "flos": 463148513280.0, + "grad_norm": 0.0738352941440963, + "language_loss": 1.01808548, + "learning_rate": 0.000998649164592628, + "loss": 1.03026068, + "num_input_tokens_seen": 21828416, + "router_z_loss_mlp": 0.19677734, + "step": 274, + "time_per_iteration": 2.60577130317688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236713, + "balance_loss_mlp": 1.21763909, + "epoch": 0.0529049634474798, + "flos": 547749927936.0, + "grad_norm": 0.08134169766286939, + "language_loss": 0.99272913, + "learning_rate": 0.0009986261825972748, + "loss": 1.00509632, + "num_input_tokens_seen": 21901600, + "router_z_loss_mlp": 0.19055176, + "step": 275, + "time_per_iteration": 2.652561664581299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196689, + "balance_loss_mlp": 1.17834246, + "epoch": 0.05309734513274336, + "flos": 617727320064.0, + "grad_norm": 0.09111845604121613, + "language_loss": 1.01860571, + "learning_rate": 0.000998603007018069, + "loss": 1.03057253, + "num_input_tokens_seen": 21979312, + "router_z_loss_mlp": 0.18334961, + "step": 276, + "time_per_iteration": 2.8293774127960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011443, + "balance_loss_mlp": 1.1273365, + "epoch": 0.05328972681800693, + "flos": 605220304896.0, + "grad_norm": 0.07377841396756965, + "language_loss": 0.99345076, + "learning_rate": 0.0009985796378640089, + "loss": 1.00489378, + "num_input_tokens_seen": 22053776, + "router_z_loss_mlp": 0.16955566, + "step": 277, + "time_per_iteration": 2.694716215133667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098185, + "balance_loss_mlp": 1.08067346, + "epoch": 0.05348210850327049, + "flos": 604197411840.0, + "grad_norm": 0.07074934963985437, + "language_loss": 0.99532163, + "learning_rate": 0.0009985560751441665, + "loss": 1.00630355, + "num_input_tokens_seen": 22134304, + "router_z_loss_mlp": 0.1751709, + "step": 278, + "time_per_iteration": 2.798563241958618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095446, + "balance_loss_mlp": 1.07736206, + "epoch": 0.053674490188534055, + "flos": 630480236544.0, + "grad_norm": 0.054749659326078955, + "language_loss": 1.01733184, + "learning_rate": 0.00099853231886769, + "loss": 1.02828622, + "num_input_tokens_seen": 22212896, + "router_z_loss_mlp": 0.1809082, + "step": 279, + "time_per_iteration": 2.780940532684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134885, + "balance_loss_mlp": 1.11744475, + "epoch": 0.053866871873797614, + "flos": 478939433472.0, + "grad_norm": 0.06375435082524677, + "language_loss": 1.01461124, + "learning_rate": 0.0009985083690438024, + "loss": 1.02595997, + "num_input_tokens_seen": 22287216, + "router_z_loss_mlp": 0.17443848, + "step": 280, + "time_per_iteration": 2.68762469291687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145965, + "balance_loss_mlp": 1.12913251, + "epoch": 0.054059253559061174, + "flos": 787673645568.0, + "grad_norm": 0.07384801764192533, + "language_loss": 0.92380941, + "learning_rate": 0.0009984842256818016, + "loss": 0.93526906, + "num_input_tokens_seen": 22370864, + "router_z_loss_mlp": 0.16845703, + "step": 281, + "time_per_iteration": 3.054032325744629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114791, + "balance_loss_mlp": 1.13080359, + "epoch": 0.05425163524432474, + "flos": 628076630016.0, + "grad_norm": 0.082175996598207, + "language_loss": 1.0314945, + "learning_rate": 0.0009984598887910613, + "loss": 1.04297376, + "num_input_tokens_seen": 22440080, + "router_z_loss_mlp": 0.17114258, + "step": 282, + "time_per_iteration": 2.7095611095428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144379, + "balance_loss_mlp": 1.12627149, + "epoch": 0.0544440169295883, + "flos": 615453161472.0, + "grad_norm": 0.06813866095032944, + "language_loss": 0.9902432, + "learning_rate": 0.0009984353583810297, + "loss": 1.00168693, + "num_input_tokens_seen": 22517936, + "router_z_loss_mlp": 0.18103027, + "step": 283, + "time_per_iteration": 2.804438829421997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124262, + "balance_loss_mlp": 1.10624945, + "epoch": 0.05463639861485187, + "flos": 647471549952.0, + "grad_norm": 0.10003204141391345, + "language_loss": 1.01340103, + "learning_rate": 0.0009984106344612302, + "loss": 1.02464366, + "num_input_tokens_seen": 22590480, + "router_z_loss_mlp": 0.18017578, + "step": 284, + "time_per_iteration": 2.7521376609802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109552, + "balance_loss_mlp": 1.07819879, + "epoch": 0.054828780300115426, + "flos": 796845883392.0, + "grad_norm": 0.07143310654982075, + "language_loss": 0.96421391, + "learning_rate": 0.0009983857170412615, + "loss": 0.97516906, + "num_input_tokens_seen": 22668144, + "router_z_loss_mlp": 0.17321777, + "step": 285, + "time_per_iteration": 2.9796621799468994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089942, + "balance_loss_mlp": 1.07363439, + "epoch": 0.05502116198537899, + "flos": 549414420480.0, + "grad_norm": 0.05224422052371224, + "language_loss": 0.95713383, + "learning_rate": 0.000998360606130798, + "loss": 0.96803325, + "num_input_tokens_seen": 22749648, + "router_z_loss_mlp": 0.16308594, + "step": 286, + "time_per_iteration": 2.7950801849365234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02908189, + "balance_loss_mlp": 2.83799911, + "epoch": 0.05521354367064255, + "flos": 1406967791616.0, + "grad_norm": 0.233188183772104, + "language_loss": 0.69073117, + "learning_rate": 0.0009983353017395877, + "loss": 0.71981305, + "num_input_tokens_seen": 22982752, + "router_z_loss_mlp": 0.703125, + "step": 287, + "time_per_iteration": 4.876653432846069 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179574, + "balance_loss_mlp": 1.1627655, + "epoch": 0.05540592535590612, + "flos": 645123197952.0, + "grad_norm": 0.17417830683261867, + "language_loss": 1.0204829, + "learning_rate": 0.0009983098038774552, + "loss": 1.03227878, + "num_input_tokens_seen": 23053584, + "router_z_loss_mlp": 0.16821289, + "step": 288, + "time_per_iteration": 2.7781550884246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02101154, + "balance_loss_mlp": 2.07540464, + "epoch": 0.05559830704116968, + "flos": 1510293413376.0, + "grad_norm": 0.1730100464590254, + "language_loss": 0.78170228, + "learning_rate": 0.0009982841125542993, + "loss": 0.80271375, + "num_input_tokens_seen": 23280256, + "router_z_loss_mlp": 0.2578125, + "step": 289, + "time_per_iteration": 4.801970481872559 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01338926, + "balance_loss_mlp": 1.32332134, + "epoch": 0.055790688726433245, + "flos": 508078379520.0, + "grad_norm": 0.11288123874753296, + "language_loss": 0.99586821, + "learning_rate": 0.0009982582277800948, + "loss": 1.00925756, + "num_input_tokens_seen": 23345760, + "router_z_loss_mlp": 0.15588379, + "step": 290, + "time_per_iteration": 2.6019012928009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01376714, + "balance_loss_mlp": 1.36076403, + "epoch": 0.055983070411696804, + "flos": 657570576384.0, + "grad_norm": 0.11158393407579077, + "language_loss": 1.06464982, + "learning_rate": 0.0009982321495648908, + "loss": 1.07841706, + "num_input_tokens_seen": 23420720, + "router_z_loss_mlp": 0.15942383, + "step": 291, + "time_per_iteration": 2.7833075523376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281101, + "balance_loss_mlp": 1.26441216, + "epoch": 0.05617545209696037, + "flos": 587051919360.0, + "grad_norm": 0.091490024999748, + "language_loss": 0.97375935, + "learning_rate": 0.0009982058779188115, + "loss": 0.98657036, + "num_input_tokens_seen": 23492576, + "router_z_loss_mlp": 0.16699219, + "step": 292, + "time_per_iteration": 2.700998067855835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223751, + "balance_loss_mlp": 1.20634639, + "epoch": 0.05636783378222393, + "flos": 611331217920.0, + "grad_norm": 0.09093545163733599, + "language_loss": 1.05090272, + "learning_rate": 0.0009981794128520567, + "loss": 1.06314015, + "num_input_tokens_seen": 23569824, + "router_z_loss_mlp": 0.17431641, + "step": 293, + "time_per_iteration": 2.769562244415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172918, + "balance_loss_mlp": 1.15501237, + "epoch": 0.0565602154674875, + "flos": 667847102976.0, + "grad_norm": 0.08200667246549262, + "language_loss": 1.02219713, + "learning_rate": 0.000998152754374901, + "loss": 1.03392649, + "num_input_tokens_seen": 23649984, + "router_z_loss_mlp": 0.17919922, + "step": 294, + "time_per_iteration": 2.8421483039855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140496, + "balance_loss_mlp": 1.12121987, + "epoch": 0.05675259715275106, + "flos": 616963474944.0, + "grad_norm": 0.06298459153201627, + "language_loss": 0.97706711, + "learning_rate": 0.0009981259024976943, + "loss": 0.98847204, + "num_input_tokens_seen": 23722032, + "router_z_loss_mlp": 0.19250488, + "step": 295, + "time_per_iteration": 2.709536552429199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131247, + "balance_loss_mlp": 1.11139894, + "epoch": 0.05694497883801462, + "flos": 751424214528.0, + "grad_norm": 0.13011693222478776, + "language_loss": 0.96307456, + "learning_rate": 0.0009980988572308612, + "loss": 0.97438705, + "num_input_tokens_seen": 23797376, + "router_z_loss_mlp": 0.19848633, + "step": 296, + "time_per_iteration": 2.9606993198394775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125313, + "balance_loss_mlp": 1.10492802, + "epoch": 0.05713736052327818, + "flos": 711669708288.0, + "grad_norm": 0.06808560063607492, + "language_loss": 0.9959082, + "learning_rate": 0.0009980716185849015, + "loss": 1.00716126, + "num_input_tokens_seen": 23880496, + "router_z_loss_mlp": 0.20385742, + "step": 297, + "time_per_iteration": 2.952467203140259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133548, + "balance_loss_mlp": 1.11424804, + "epoch": 0.05732974220854175, + "flos": 468737100288.0, + "grad_norm": 0.05570922928007862, + "language_loss": 0.95103967, + "learning_rate": 0.0009980441865703904, + "loss": 0.9623751, + "num_input_tokens_seen": 23950016, + "router_z_loss_mlp": 0.19299316, + "step": 298, + "time_per_iteration": 2.6629996299743652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125947, + "balance_loss_mlp": 1.10630131, + "epoch": 0.05752212389380531, + "flos": 601143441408.0, + "grad_norm": 0.06175770353433084, + "language_loss": 1.038656, + "learning_rate": 0.000998016561197978, + "loss": 1.04991555, + "num_input_tokens_seen": 24020064, + "router_z_loss_mlp": 0.19628906, + "step": 299, + "time_per_iteration": 2.7027034759521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122899, + "balance_loss_mlp": 1.10499382, + "epoch": 0.057714505579068875, + "flos": 678344799744.0, + "grad_norm": 0.07709513760197055, + "language_loss": 0.95715761, + "learning_rate": 0.0009979887424783895, + "loss": 0.96838653, + "num_input_tokens_seen": 24095360, + "router_z_loss_mlp": 0.17907715, + "step": 300, + "time_per_iteration": 2.8467562198638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122592, + "balance_loss_mlp": 1.10369694, + "epoch": 0.057906887264332435, + "flos": 595604316672.0, + "grad_norm": 0.05754387138467597, + "language_loss": 0.94804943, + "learning_rate": 0.0009979607304224248, + "loss": 0.95927536, + "num_input_tokens_seen": 24164608, + "router_z_loss_mlp": 0.18908691, + "step": 301, + "time_per_iteration": 2.7457566261291504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135958, + "balance_loss_mlp": 1.11577594, + "epoch": 0.058099268949596, + "flos": 551855904768.0, + "grad_norm": 0.06951393564289957, + "language_loss": 1.02452385, + "learning_rate": 0.000997932525040959, + "loss": 1.03588343, + "num_input_tokens_seen": 24233840, + "router_z_loss_mlp": 0.20166016, + "step": 302, + "time_per_iteration": 2.670464038848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123814, + "balance_loss_mlp": 1.10513425, + "epoch": 0.05829165063485956, + "flos": 507906671616.0, + "grad_norm": 0.06408930588753382, + "language_loss": 1.04041958, + "learning_rate": 0.000997904126344943, + "loss": 1.05165768, + "num_input_tokens_seen": 24302928, + "router_z_loss_mlp": 0.18676758, + "step": 303, + "time_per_iteration": 2.654275417327881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122557, + "balance_loss_mlp": 1.10432982, + "epoch": 0.05848403232012313, + "flos": 614949774336.0, + "grad_norm": 0.10902949066110783, + "language_loss": 1.00108004, + "learning_rate": 0.0009978755343454018, + "loss": 1.0123055, + "num_input_tokens_seen": 24377024, + "router_z_loss_mlp": 0.18212891, + "step": 304, + "time_per_iteration": 2.7061922550201416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118016, + "balance_loss_mlp": 1.10034943, + "epoch": 0.05867641400538669, + "flos": 499835902464.0, + "grad_norm": 0.07196511907519268, + "language_loss": 1.01183403, + "learning_rate": 0.0009978467490534355, + "loss": 1.02301419, + "num_input_tokens_seen": 24442736, + "router_z_loss_mlp": 0.17663574, + "step": 305, + "time_per_iteration": 2.5658843517303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118418, + "balance_loss_mlp": 1.09971452, + "epoch": 0.05886879569065025, + "flos": 531019072512.0, + "grad_norm": 0.05577021807863236, + "language_loss": 0.98775607, + "learning_rate": 0.00099781777048022, + "loss": 0.99894023, + "num_input_tokens_seen": 24514800, + "router_z_loss_mlp": 0.18713379, + "step": 306, + "time_per_iteration": 2.688661813735962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112614, + "balance_loss_mlp": 1.10866416, + "epoch": 0.05906117737591381, + "flos": 488811497472.0, + "grad_norm": 0.06489613907432343, + "language_loss": 0.99682212, + "learning_rate": 0.0009977885986370057, + "loss": 1.00808358, + "num_input_tokens_seen": 24581648, + "router_z_loss_mlp": 0.17480469, + "step": 307, + "time_per_iteration": 2.527008056640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129188, + "balance_loss_mlp": 1.11242771, + "epoch": 0.05925355906117737, + "flos": 591213150720.0, + "grad_norm": 0.060579194597163814, + "language_loss": 0.94911426, + "learning_rate": 0.000997759233535118, + "loss": 0.96040612, + "num_input_tokens_seen": 24658864, + "router_z_loss_mlp": 0.16772461, + "step": 308, + "time_per_iteration": 2.768683433532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108276, + "balance_loss_mlp": 1.09052539, + "epoch": 0.05944594074644094, + "flos": 563373522432.0, + "grad_norm": 0.074144120767366, + "language_loss": 1.01706028, + "learning_rate": 0.0009977296751859576, + "loss": 1.02814317, + "num_input_tokens_seen": 24735808, + "router_z_loss_mlp": 0.17749023, + "step": 309, + "time_per_iteration": 2.710550308227539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109964, + "balance_loss_mlp": 1.0817585, + "epoch": 0.0596383224317045, + "flos": 538483147776.0, + "grad_norm": 0.1012520362466171, + "language_loss": 1.03562367, + "learning_rate": 0.0009976999236009998, + "loss": 1.04662001, + "num_input_tokens_seen": 24807744, + "router_z_loss_mlp": 0.17895508, + "step": 310, + "time_per_iteration": 2.7346065044403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095396, + "balance_loss_mlp": 1.07697809, + "epoch": 0.059830704116968066, + "flos": 560684726784.0, + "grad_norm": 0.05903807060939984, + "language_loss": 1.05193245, + "learning_rate": 0.0009976699787917955, + "loss": 1.06288636, + "num_input_tokens_seen": 24876640, + "router_z_loss_mlp": 0.18408203, + "step": 311, + "time_per_iteration": 2.737165689468384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04018029, + "balance_loss_mlp": 3.94440532, + "epoch": 0.060023085802231625, + "flos": 1569759962112.0, + "grad_norm": 0.34396821433057967, + "language_loss": 0.73442996, + "learning_rate": 0.00099763984076997, + "loss": 0.77461016, + "num_input_tokens_seen": 25110864, + "router_z_loss_mlp": 0.734375, + "step": 312, + "time_per_iteration": 4.990010976791382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130575, + "balance_loss_mlp": 1.11010623, + "epoch": 0.06021546748749519, + "flos": 482415395328.0, + "grad_norm": 0.18656347991450223, + "language_loss": 0.97164261, + "learning_rate": 0.0009976095095472243, + "loss": 0.98294836, + "num_input_tokens_seen": 25179328, + "router_z_loss_mlp": 0.20458984, + "step": 313, + "time_per_iteration": 2.5596373081207275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143867, + "balance_loss_mlp": 1.12198031, + "epoch": 0.06040784917275875, + "flos": 619889407488.0, + "grad_norm": 0.10017394493353984, + "language_loss": 0.98154747, + "learning_rate": 0.0009975789851353334, + "loss": 0.9929862, + "num_input_tokens_seen": 25254128, + "router_z_loss_mlp": 0.21911621, + "step": 314, + "time_per_iteration": 2.783092498779297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113993, + "balance_loss_mlp": 1.11832976, + "epoch": 0.06060023085802232, + "flos": 483292721664.0, + "grad_norm": 0.12837029886330253, + "language_loss": 1.00706339, + "learning_rate": 0.0009975482675461487, + "loss": 1.01846266, + "num_input_tokens_seen": 25324624, + "router_z_loss_mlp": 0.21594238, + "step": 315, + "time_per_iteration": 2.6375765800476074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128184, + "balance_loss_mlp": 1.10697675, + "epoch": 0.06079261254328588, + "flos": 581620483584.0, + "grad_norm": 0.07139597701291463, + "language_loss": 0.9800331, + "learning_rate": 0.0009975173567915952, + "loss": 0.99131489, + "num_input_tokens_seen": 25393648, + "router_z_loss_mlp": 0.21228027, + "step": 316, + "time_per_iteration": 2.680223226547241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116284, + "balance_loss_mlp": 1.09438515, + "epoch": 0.060984994228549444, + "flos": 687492306432.0, + "grad_norm": 0.12898022133672052, + "language_loss": 0.92624593, + "learning_rate": 0.000997486252883674, + "loss": 0.93740869, + "num_input_tokens_seen": 25469152, + "router_z_loss_mlp": 0.21887207, + "step": 317, + "time_per_iteration": 2.835162878036499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104615, + "balance_loss_mlp": 1.08325243, + "epoch": 0.061177375913813004, + "flos": 1314284327424.0, + "grad_norm": 0.06442728945451602, + "language_loss": 0.97186124, + "learning_rate": 0.0009974549558344602, + "loss": 0.98290741, + "num_input_tokens_seen": 25560944, + "router_z_loss_mlp": 0.21350098, + "step": 318, + "time_per_iteration": 3.6293551921844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105129, + "balance_loss_mlp": 1.08439815, + "epoch": 0.06136975759907657, + "flos": 574072040448.0, + "grad_norm": 0.08131052095693254, + "language_loss": 1.07145, + "learning_rate": 0.000997423465656105, + "loss": 1.08250129, + "num_input_tokens_seen": 25631424, + "router_z_loss_mlp": 0.20715332, + "step": 319, + "time_per_iteration": 2.7070071697235107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101786, + "balance_loss_mlp": 1.08168781, + "epoch": 0.06156213928434013, + "flos": 527281242624.0, + "grad_norm": 0.059301156484267634, + "language_loss": 1.04424822, + "learning_rate": 0.0009973917823608335, + "loss": 1.0552659, + "num_input_tokens_seen": 25698176, + "router_z_loss_mlp": 0.20092773, + "step": 320, + "time_per_iteration": 2.6225128173828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110531, + "balance_loss_mlp": 1.0897882, + "epoch": 0.061754520969603696, + "flos": 495238123008.0, + "grad_norm": 0.05387649814829365, + "language_loss": 0.98383266, + "learning_rate": 0.0009973599059609462, + "loss": 0.9949379, + "num_input_tokens_seen": 25773472, + "router_z_loss_mlp": 0.20739746, + "step": 321, + "time_per_iteration": 2.692152261734009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107693, + "balance_loss_mlp": 1.08798778, + "epoch": 0.061946902654867256, + "flos": 439839673344.0, + "grad_norm": 0.06112812680296507, + "language_loss": 0.9711749, + "learning_rate": 0.000997327836468819, + "loss": 0.98225188, + "num_input_tokens_seen": 25841088, + "router_z_loss_mlp": 0.19702148, + "step": 322, + "time_per_iteration": 2.5772383213043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110285, + "balance_loss_mlp": 1.0900557, + "epoch": 0.06213928434013082, + "flos": 598490961408.0, + "grad_norm": 0.0645434874295678, + "language_loss": 0.9942351, + "learning_rate": 0.000997295573896902, + "loss": 1.00533807, + "num_input_tokens_seen": 25919424, + "router_z_loss_mlp": 0.20239258, + "step": 323, + "time_per_iteration": 2.839282274246216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02259253, + "balance_loss_mlp": 2.20088792, + "epoch": 0.06233166602539438, + "flos": 1449393716736.0, + "grad_norm": 0.19547826226404627, + "language_loss": 0.8119604, + "learning_rate": 0.000997263118257721, + "loss": 0.83455294, + "num_input_tokens_seen": 26135504, + "router_z_loss_mlp": 0.58203125, + "step": 324, + "time_per_iteration": 4.67440938949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01888161, + "balance_loss_mlp": 1.83246601, + "epoch": 0.06252404771065795, + "flos": 1462504453632.0, + "grad_norm": 0.11962022052509429, + "language_loss": 0.78571939, + "learning_rate": 0.0009972304695638763, + "loss": 0.80460101, + "num_input_tokens_seen": 26358880, + "router_z_loss_mlp": 0.55859375, + "step": 325, + "time_per_iteration": 4.860283136367798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177486, + "balance_loss_mlp": 1.15595722, + "epoch": 0.06271642939592151, + "flos": 464059335168.0, + "grad_norm": 0.06272096910143152, + "language_loss": 0.93621421, + "learning_rate": 0.000997197627828043, + "loss": 0.94798911, + "num_input_tokens_seen": 26425888, + "router_z_loss_mlp": 0.2154541, + "step": 326, + "time_per_iteration": 2.5594961643218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205877, + "balance_loss_mlp": 1.18165386, + "epoch": 0.06290881108118507, + "flos": 532111776768.0, + "grad_norm": 0.08849931028565244, + "language_loss": 0.89414704, + "learning_rate": 0.0009971645930629716, + "loss": 0.90620589, + "num_input_tokens_seen": 26500656, + "router_z_loss_mlp": 0.2421875, + "step": 327, + "time_per_iteration": 2.7163310050964355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223238, + "balance_loss_mlp": 1.19748878, + "epoch": 0.06310119276644863, + "flos": 673262572032.0, + "grad_norm": 0.09892100413683627, + "language_loss": 1.02883804, + "learning_rate": 0.0009971313652814872, + "loss": 1.04107046, + "num_input_tokens_seen": 26577408, + "router_z_loss_mlp": 0.25769043, + "step": 328, + "time_per_iteration": 2.7786266803741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228803, + "balance_loss_mlp": 1.20175433, + "epoch": 0.0632935744517122, + "flos": 770404497408.0, + "grad_norm": 0.06852265531332852, + "language_loss": 0.99799907, + "learning_rate": 0.0009970979444964903, + "loss": 1.01028717, + "num_input_tokens_seen": 26652048, + "router_z_loss_mlp": 0.27050781, + "step": 329, + "time_per_iteration": 2.952498197555542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235649, + "balance_loss_mlp": 1.2062993, + "epoch": 0.06348595613697576, + "flos": 561649393152.0, + "grad_norm": 0.09680127661829774, + "language_loss": 1.0121367, + "learning_rate": 0.0009970643307209556, + "loss": 1.02449322, + "num_input_tokens_seen": 26728192, + "router_z_loss_mlp": 0.29296875, + "step": 330, + "time_per_iteration": 2.78190541267395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240935, + "balance_loss_mlp": 1.20970178, + "epoch": 0.06367833782223932, + "flos": 675891730944.0, + "grad_norm": 0.08786526055569537, + "language_loss": 0.9788332, + "learning_rate": 0.0009970305239679334, + "loss": 0.99124253, + "num_input_tokens_seen": 26798016, + "router_z_loss_mlp": 0.31201172, + "step": 331, + "time_per_iteration": 2.805845022201538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228576, + "balance_loss_mlp": 1.19891691, + "epoch": 0.06387071950750288, + "flos": 495035891712.0, + "grad_norm": 0.10390832636325384, + "language_loss": 1.03124022, + "learning_rate": 0.0009969965242505483, + "loss": 1.04352593, + "num_input_tokens_seen": 26867536, + "router_z_loss_mlp": 0.29614258, + "step": 332, + "time_per_iteration": 2.676711082458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207199, + "balance_loss_mlp": 1.1777302, + "epoch": 0.06406310119276645, + "flos": 533170985472.0, + "grad_norm": 0.07105898063788767, + "language_loss": 0.98331362, + "learning_rate": 0.0009969623315820007, + "loss": 0.99538565, + "num_input_tokens_seen": 26941216, + "router_z_loss_mlp": 0.29418945, + "step": 333, + "time_per_iteration": 2.6556739807128906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118815, + "balance_loss_mlp": 1.16106582, + "epoch": 0.06425548287803001, + "flos": 455940513792.0, + "grad_norm": 0.08067516684621483, + "language_loss": 0.99160993, + "learning_rate": 0.000996927945975565, + "loss": 1.0034914, + "num_input_tokens_seen": 27006560, + "router_z_loss_mlp": 0.27124023, + "step": 334, + "time_per_iteration": 2.5398526191711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147495, + "balance_loss_mlp": 1.1214596, + "epoch": 0.06444786456329357, + "flos": 559817574912.0, + "grad_norm": 0.08169715789363684, + "language_loss": 0.96174645, + "learning_rate": 0.0009968933674445906, + "loss": 0.97322142, + "num_input_tokens_seen": 27076400, + "router_z_loss_mlp": 0.26062012, + "step": 335, + "time_per_iteration": 2.6592860221862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112932, + "balance_loss_mlp": 1.0879097, + "epoch": 0.06464024624855713, + "flos": 665769383424.0, + "grad_norm": 0.07104021966044574, + "language_loss": 0.97756392, + "learning_rate": 0.0009968585960025028, + "loss": 0.98869324, + "num_input_tokens_seen": 27158672, + "router_z_loss_mlp": 0.25036621, + "step": 336, + "time_per_iteration": 2.9279658794403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01860024, + "balance_loss_mlp": 1.84323907, + "epoch": 0.0648326279338207, + "flos": 1520578704384.0, + "grad_norm": 0.14426901756633248, + "language_loss": 0.77653188, + "learning_rate": 0.0009968236316628006, + "loss": 0.7951321, + "num_input_tokens_seen": 27380592, + "router_z_loss_mlp": 0.16796875, + "step": 337, + "time_per_iteration": 4.810914993286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101739, + "balance_loss_mlp": 1.07948256, + "epoch": 0.06502500961908426, + "flos": 1142872768512.0, + "grad_norm": 0.058812216055980165, + "language_loss": 0.95864177, + "learning_rate": 0.0009967884744390583, + "loss": 0.96965921, + "num_input_tokens_seen": 27469984, + "router_z_loss_mlp": 0.22265625, + "step": 338, + "time_per_iteration": 3.512282371520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146504, + "balance_loss_mlp": 1.12267399, + "epoch": 0.06521739130434782, + "flos": 582339248640.0, + "grad_norm": 0.10793578588091769, + "language_loss": 0.97449529, + "learning_rate": 0.0009967531243449256, + "loss": 0.98596036, + "num_input_tokens_seen": 27543904, + "router_z_loss_mlp": 0.23828125, + "step": 339, + "time_per_iteration": 2.712907075881958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154087, + "balance_loss_mlp": 1.12950587, + "epoch": 0.06540977298961138, + "flos": 497398800384.0, + "grad_norm": 0.06396927661276222, + "language_loss": 1.04641414, + "learning_rate": 0.000996717581394126, + "loss": 1.05795503, + "num_input_tokens_seen": 27609888, + "router_z_loss_mlp": 0.24584961, + "step": 340, + "time_per_iteration": 2.5783133506774902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168509, + "balance_loss_mlp": 1.14584756, + "epoch": 0.06560215467487496, + "flos": 542613855744.0, + "grad_norm": 0.07568553531769329, + "language_loss": 1.05092287, + "learning_rate": 0.000996681845600459, + "loss": 1.062608, + "num_input_tokens_seen": 27683936, + "router_z_loss_mlp": 0.2265625, + "step": 341, + "time_per_iteration": 2.6543757915496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115357, + "balance_loss_mlp": 1.13118291, + "epoch": 0.06579453636013852, + "flos": 413230961664.0, + "grad_norm": 0.06593832485574395, + "language_loss": 0.97027373, + "learning_rate": 0.0009966459169777982, + "loss": 0.9818095, + "num_input_tokens_seen": 27747840, + "router_z_loss_mlp": 0.22387695, + "step": 342, + "time_per_iteration": 2.5120761394500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141132, + "balance_loss_mlp": 1.11848283, + "epoch": 0.06598691804540208, + "flos": 560354457600.0, + "grad_norm": 0.055115078659976495, + "language_loss": 1.05281377, + "learning_rate": 0.0009966097955400924, + "loss": 1.0642252, + "num_input_tokens_seen": 27819728, + "router_z_loss_mlp": 0.22644043, + "step": 343, + "time_per_iteration": 2.6954751014709473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111133, + "balance_loss_mlp": 1.08904982, + "epoch": 0.06617929973066564, + "flos": 571789117440.0, + "grad_norm": 0.06176008438986438, + "language_loss": 0.99064481, + "learning_rate": 0.0009965734813013652, + "loss": 1.00175822, + "num_input_tokens_seen": 27893536, + "router_z_loss_mlp": 0.22277832, + "step": 344, + "time_per_iteration": 2.8235929012298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090293, + "balance_loss_mlp": 1.06726193, + "epoch": 0.06637168141592921, + "flos": 490234470912.0, + "grad_norm": 0.05365164831273283, + "language_loss": 1.01308548, + "learning_rate": 0.0009965369742757151, + "loss": 1.02398837, + "num_input_tokens_seen": 27960976, + "router_z_loss_mlp": 0.23022461, + "step": 345, + "time_per_iteration": 2.5708556175231934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078086, + "balance_loss_mlp": 1.05727243, + "epoch": 0.06656406310119277, + "flos": 1078735656960.0, + "grad_norm": 0.04968829319439664, + "language_loss": 0.97902787, + "learning_rate": 0.0009965002744773152, + "loss": 0.98980874, + "num_input_tokens_seen": 28050864, + "router_z_loss_mlp": 0.20812988, + "step": 346, + "time_per_iteration": 3.4984121322631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086945, + "balance_loss_mlp": 1.06450987, + "epoch": 0.06675644478645633, + "flos": 513421065216.0, + "grad_norm": 0.06258978415695335, + "language_loss": 0.95138037, + "learning_rate": 0.0009964633819204139, + "loss": 0.96224982, + "num_input_tokens_seen": 28122448, + "router_z_loss_mlp": 0.22436523, + "step": 347, + "time_per_iteration": 2.6866109371185303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01752926, + "balance_loss_mlp": 1.73108697, + "epoch": 0.06694882647171989, + "flos": 1446359943168.0, + "grad_norm": 0.11694655230354783, + "language_loss": 0.81801116, + "learning_rate": 0.0009964262966193338, + "loss": 0.83554041, + "num_input_tokens_seen": 28350352, + "router_z_loss_mlp": 0.21875, + "step": 348, + "time_per_iteration": 4.935550928115845 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01583198, + "balance_loss_mlp": 1.56116796, + "epoch": 0.06714120815698346, + "flos": 1551230784000.0, + "grad_norm": 0.0989027294649474, + "language_loss": 0.75153887, + "learning_rate": 0.000996389018588473, + "loss": 0.76737082, + "num_input_tokens_seen": 28585584, + "router_z_loss_mlp": 0.22070312, + "step": 349, + "time_per_iteration": 4.891008615493774 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149858, + "balance_loss_mlp": 1.12826955, + "epoch": 0.06733358984224702, + "flos": 879689673216.0, + "grad_norm": 0.07075764146586616, + "language_loss": 0.94920838, + "learning_rate": 0.000996351547842304, + "loss": 0.96070701, + "num_input_tokens_seen": 28672512, + "router_z_loss_mlp": 0.21582031, + "step": 350, + "time_per_iteration": 3.156322717666626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192552, + "balance_loss_mlp": 1.17055774, + "epoch": 0.06752597152751058, + "flos": 518654651904.0, + "grad_norm": 0.09040238598346795, + "language_loss": 0.93423587, + "learning_rate": 0.0009963138843953744, + "loss": 0.94616139, + "num_input_tokens_seen": 28741520, + "router_z_loss_mlp": 0.2199707, + "step": 351, + "time_per_iteration": 2.610987663269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206077, + "balance_loss_mlp": 1.18405879, + "epoch": 0.06771835321277414, + "flos": 539366266368.0, + "grad_norm": 0.08658544591035036, + "language_loss": 0.97852194, + "learning_rate": 0.000996276028262306, + "loss": 0.9905827, + "num_input_tokens_seen": 28814912, + "router_z_loss_mlp": 0.22021484, + "step": 352, + "time_per_iteration": 2.8413686752319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166048, + "balance_loss_mlp": 1.14382768, + "epoch": 0.0679107348980377, + "flos": 460430604288.0, + "grad_norm": 0.09117082479319542, + "language_loss": 1.04269946, + "learning_rate": 0.0009962379794577964, + "loss": 1.05435991, + "num_input_tokens_seen": 28882192, + "router_z_loss_mlp": 0.22216797, + "step": 353, + "time_per_iteration": 2.591372489929199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114388, + "balance_loss_mlp": 1.12227976, + "epoch": 0.06810311658330127, + "flos": 635601752064.0, + "grad_norm": 0.05781909345233015, + "language_loss": 0.94169199, + "learning_rate": 0.000996199737996617, + "loss": 0.95313084, + "num_input_tokens_seen": 28968576, + "router_z_loss_mlp": 0.21630859, + "step": 354, + "time_per_iteration": 2.9088492393493652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125411, + "balance_loss_mlp": 1.10420346, + "epoch": 0.06829549826856483, + "flos": 464443448832.0, + "grad_norm": 0.06770201052263504, + "language_loss": 1.03043509, + "learning_rate": 0.0009961613038936149, + "loss": 1.04168916, + "num_input_tokens_seen": 29036160, + "router_z_loss_mlp": 0.2121582, + "step": 355, + "time_per_iteration": 2.571904420852661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110613, + "balance_loss_mlp": 1.08917904, + "epoch": 0.06848787995382839, + "flos": 634335929856.0, + "grad_norm": 0.06097004840688574, + "language_loss": 0.95565176, + "learning_rate": 0.000996122677163711, + "loss": 0.96675789, + "num_input_tokens_seen": 29112048, + "router_z_loss_mlp": 0.21435547, + "step": 356, + "time_per_iteration": 2.794982671737671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107296, + "balance_loss_mlp": 1.08667266, + "epoch": 0.06868026163909195, + "flos": 806023913472.0, + "grad_norm": 0.08020973782133771, + "language_loss": 1.01095176, + "learning_rate": 0.000996083857821902, + "loss": 1.02202487, + "num_input_tokens_seen": 29190960, + "router_z_loss_mlp": 0.20629883, + "step": 357, + "time_per_iteration": 3.007086753845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101637, + "balance_loss_mlp": 1.08076346, + "epoch": 0.06887264332435553, + "flos": 438997252608.0, + "grad_norm": 0.08125476198078858, + "language_loss": 0.99797714, + "learning_rate": 0.0009960448458832588, + "loss": 1.00899351, + "num_input_tokens_seen": 29262832, + "router_z_loss_mlp": 0.2088623, + "step": 358, + "time_per_iteration": 2.699530601501465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098146, + "balance_loss_mlp": 1.07872701, + "epoch": 0.06906502500961909, + "flos": 484513463808.0, + "grad_norm": 0.06827746260367892, + "language_loss": 0.99188638, + "learning_rate": 0.000996005641362927, + "loss": 1.00286782, + "num_input_tokens_seen": 29329552, + "router_z_loss_mlp": 0.1940918, + "step": 359, + "time_per_iteration": 2.5541014671325684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103345, + "balance_loss_mlp": 1.0841639, + "epoch": 0.06925740669488265, + "flos": 733293706752.0, + "grad_norm": 0.08731085845928575, + "language_loss": 1.02303529, + "learning_rate": 0.0009959662442761274, + "loss": 1.0340687, + "num_input_tokens_seen": 29410784, + "router_z_loss_mlp": 0.19189453, + "step": 360, + "time_per_iteration": 2.906623363494873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093844, + "balance_loss_mlp": 1.07268476, + "epoch": 0.0694497883801462, + "flos": 552127947264.0, + "grad_norm": 0.06697663210144707, + "language_loss": 0.9595629, + "learning_rate": 0.000995926654638155, + "loss": 0.97050136, + "num_input_tokens_seen": 29486992, + "router_z_loss_mlp": 0.21179199, + "step": 361, + "time_per_iteration": 2.793663501739502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082773, + "balance_loss_mlp": 1.06236482, + "epoch": 0.06964217006540978, + "flos": 677708992512.0, + "grad_norm": 0.06860924301964295, + "language_loss": 0.98198265, + "learning_rate": 0.00099588687246438, + "loss": 0.99281037, + "num_input_tokens_seen": 29557232, + "router_z_loss_mlp": 0.20410156, + "step": 362, + "time_per_iteration": 2.828139305114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085274, + "balance_loss_mlp": 1.06330371, + "epoch": 0.06983455175067334, + "flos": 523987163136.0, + "grad_norm": 0.08747541291209461, + "language_loss": 1.04803789, + "learning_rate": 0.0009958468977702471, + "loss": 1.0588907, + "num_input_tokens_seen": 29625344, + "router_z_loss_mlp": 0.21972656, + "step": 363, + "time_per_iteration": 2.5759966373443604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02224374, + "balance_loss_mlp": 2.20682669, + "epoch": 0.0700269334359369, + "flos": 1575943658496.0, + "grad_norm": 0.2746548069890379, + "language_loss": 0.79734707, + "learning_rate": 0.0009958067305712761, + "loss": 0.81959081, + "num_input_tokens_seen": 29843664, + "router_z_loss_mlp": 0.17578125, + "step": 364, + "time_per_iteration": 4.782835245132446 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134514, + "balance_loss_mlp": 1.11340213, + "epoch": 0.07021931512120046, + "flos": 1012848274944.0, + "grad_norm": 0.08586169827549085, + "language_loss": 0.93286598, + "learning_rate": 0.0009957663708830612, + "loss": 0.94421113, + "num_input_tokens_seen": 29927152, + "router_z_loss_mlp": 0.21105957, + "step": 365, + "time_per_iteration": 3.2484283447265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116189, + "balance_loss_mlp": 1.13884652, + "epoch": 0.07041169680646403, + "flos": 822622348800.0, + "grad_norm": 0.09941073368395695, + "language_loss": 0.97043455, + "learning_rate": 0.0009957258187212714, + "loss": 0.98205346, + "num_input_tokens_seen": 30004928, + "router_z_loss_mlp": 0.23034668, + "step": 366, + "time_per_iteration": 3.009479522705078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01756688, + "balance_loss_mlp": 1.7255981, + "epoch": 0.07060407849172759, + "flos": 1413670993920.0, + "grad_norm": 0.12374795181042475, + "language_loss": 0.79194862, + "learning_rate": 0.0009956850741016502, + "loss": 0.80951542, + "num_input_tokens_seen": 30230256, + "router_z_loss_mlp": 0.31054688, + "step": 367, + "time_per_iteration": 4.82874608039856 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152073, + "balance_loss_mlp": 1.13087749, + "epoch": 0.07079646017699115, + "flos": 512652837888.0, + "grad_norm": 0.06786716904588838, + "language_loss": 0.93450886, + "learning_rate": 0.0009956441370400167, + "loss": 0.94602954, + "num_input_tokens_seen": 30301200, + "router_z_loss_mlp": 0.21191406, + "step": 368, + "time_per_iteration": 2.6226603984832764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153965, + "balance_loss_mlp": 1.13158989, + "epoch": 0.07098884186225471, + "flos": 540240772608.0, + "grad_norm": 0.08343626294497461, + "language_loss": 0.99467343, + "learning_rate": 0.0009956030075522636, + "loss": 1.00621307, + "num_input_tokens_seen": 30377024, + "router_z_loss_mlp": 0.22375488, + "step": 369, + "time_per_iteration": 2.7128794193267822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142856, + "balance_loss_mlp": 1.12137485, + "epoch": 0.07118122354751828, + "flos": 548419230720.0, + "grad_norm": 0.07464528715750075, + "language_loss": 0.98955953, + "learning_rate": 0.0009955616856543587, + "loss": 1.00098813, + "num_input_tokens_seen": 30448896, + "router_z_loss_mlp": 0.21472168, + "step": 370, + "time_per_iteration": 2.613138198852539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118555, + "balance_loss_mlp": 1.0958215, + "epoch": 0.07137360523278184, + "flos": 620612554752.0, + "grad_norm": 0.056434914921328155, + "language_loss": 0.91880834, + "learning_rate": 0.0009955201713623448, + "loss": 0.92999387, + "num_input_tokens_seen": 30523584, + "router_z_loss_mlp": 0.22717285, + "step": 371, + "time_per_iteration": 2.747133255004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01746336, + "balance_loss_mlp": 1.72154021, + "epoch": 0.0715659869180454, + "flos": 1501850115072.0, + "grad_norm": 0.08669176596007007, + "language_loss": 0.76672721, + "learning_rate": 0.000995478464692339, + "loss": 0.78419054, + "num_input_tokens_seen": 30757920, + "router_z_loss_mlp": 0.24707031, + "step": 372, + "time_per_iteration": 4.931428670883179 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102855, + "balance_loss_mlp": 1.08040774, + "epoch": 0.07175836860330896, + "flos": 495246887424.0, + "grad_norm": 0.07044890130803105, + "language_loss": 1.05121827, + "learning_rate": 0.0009954365656605333, + "loss": 1.06224692, + "num_input_tokens_seen": 30824960, + "router_z_loss_mlp": 0.22436523, + "step": 373, + "time_per_iteration": 2.550243616104126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118244, + "balance_loss_mlp": 1.09438992, + "epoch": 0.07195075028857253, + "flos": 785387902464.0, + "grad_norm": 0.05415547127036835, + "language_loss": 0.98150015, + "learning_rate": 0.0009953944742831947, + "loss": 0.99268264, + "num_input_tokens_seen": 30902224, + "router_z_loss_mlp": 0.23864746, + "step": 374, + "time_per_iteration": 2.9659459590911865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125507, + "balance_loss_mlp": 1.10202336, + "epoch": 0.0721431319738361, + "flos": 592799067648.0, + "grad_norm": 0.07003669353380264, + "language_loss": 1.01441097, + "learning_rate": 0.0009953521905766642, + "loss": 1.02566612, + "num_input_tokens_seen": 30984784, + "router_z_loss_mlp": 0.23486328, + "step": 375, + "time_per_iteration": 2.942763566970825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117119, + "balance_loss_mlp": 1.09393334, + "epoch": 0.07233551365909965, + "flos": 547981272576.0, + "grad_norm": 0.06343477824222313, + "language_loss": 0.99901861, + "learning_rate": 0.0009953097145573577, + "loss": 1.01018989, + "num_input_tokens_seen": 31055376, + "router_z_loss_mlp": 0.23193359, + "step": 376, + "time_per_iteration": 2.6275272369384766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113711, + "balance_loss_mlp": 1.09023869, + "epoch": 0.07252789534436321, + "flos": 957170428416.0, + "grad_norm": 0.0678891965164594, + "language_loss": 0.97798675, + "learning_rate": 0.000995267046241766, + "loss": 0.98912394, + "num_input_tokens_seen": 31144944, + "router_z_loss_mlp": 0.23474121, + "step": 377, + "time_per_iteration": 3.2014975547790527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096997, + "balance_loss_mlp": 1.07496762, + "epoch": 0.07272027702962677, + "flos": 507398902272.0, + "grad_norm": 0.0806519998399971, + "language_loss": 0.97275257, + "learning_rate": 0.0009952241856464547, + "loss": 0.98372257, + "num_input_tokens_seen": 31213392, + "router_z_loss_mlp": 0.22045898, + "step": 378, + "time_per_iteration": 2.6189732551574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109641, + "balance_loss_mlp": 1.0746069, + "epoch": 0.07291265871489035, + "flos": 612128558592.0, + "grad_norm": 0.0691049335661606, + "language_loss": 1.04592681, + "learning_rate": 0.0009951811327880632, + "loss": 1.05689096, + "num_input_tokens_seen": 31289840, + "router_z_loss_mlp": 0.21826172, + "step": 379, + "time_per_iteration": 2.7411558628082275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092071, + "balance_loss_mlp": 1.07025611, + "epoch": 0.0731050404001539, + "flos": 495502963200.0, + "grad_norm": 0.05765504670581196, + "language_loss": 0.97682816, + "learning_rate": 0.0009951378876833063, + "loss": 0.98774892, + "num_input_tokens_seen": 31357600, + "router_z_loss_mlp": 0.21813965, + "step": 380, + "time_per_iteration": 2.6211278438568115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081575, + "balance_loss_mlp": 1.06068945, + "epoch": 0.07329742208541747, + "flos": 639677205504.0, + "grad_norm": 0.06809750593205881, + "language_loss": 1.04190159, + "learning_rate": 0.0009950944503489736, + "loss": 1.05271733, + "num_input_tokens_seen": 31428896, + "router_z_loss_mlp": 0.20898438, + "step": 381, + "time_per_iteration": 2.7533762454986572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081401, + "balance_loss_mlp": 1.0607307, + "epoch": 0.07348980377068103, + "flos": 815999284224.0, + "grad_norm": 0.06607035824886899, + "language_loss": 0.98459697, + "learning_rate": 0.0009950508208019285, + "loss": 0.99541104, + "num_input_tokens_seen": 31507424, + "router_z_loss_mlp": 0.20678711, + "step": 382, + "time_per_iteration": 2.9885637760162354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073667, + "balance_loss_mlp": 1.05369973, + "epoch": 0.0736821854559446, + "flos": 508383917568.0, + "grad_norm": 0.05970909775769663, + "language_loss": 1.02745128, + "learning_rate": 0.0009950069990591096, + "loss": 1.03818798, + "num_input_tokens_seen": 31576768, + "router_z_loss_mlp": 0.19958496, + "step": 383, + "time_per_iteration": 2.6111788749694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01835936, + "balance_loss_mlp": 1.8101871, + "epoch": 0.07387456714120816, + "flos": 1553801716224.0, + "grad_norm": 0.167122487372618, + "language_loss": 0.76401371, + "learning_rate": 0.0009949629851375302, + "loss": 0.78237301, + "num_input_tokens_seen": 31797312, + "router_z_loss_mlp": 0.2578125, + "step": 384, + "time_per_iteration": 4.859915494918823 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116619, + "balance_loss_mlp": 1.09575748, + "epoch": 0.07406694882647172, + "flos": 525219489792.0, + "grad_norm": 0.0799084124695288, + "language_loss": 0.96017051, + "learning_rate": 0.0009949187790542777, + "loss": 0.97133672, + "num_input_tokens_seen": 31869568, + "router_z_loss_mlp": 0.20861816, + "step": 385, + "time_per_iteration": 2.6976191997528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124239, + "balance_loss_mlp": 1.10322285, + "epoch": 0.07425933051173528, + "flos": 497468611584.0, + "grad_norm": 0.08753491640442414, + "language_loss": 0.91745877, + "learning_rate": 0.0009948743808265148, + "loss": 0.92870116, + "num_input_tokens_seen": 31941712, + "router_z_loss_mlp": 0.21020508, + "step": 386, + "time_per_iteration": 2.6870572566986084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113476, + "balance_loss_mlp": 1.09249496, + "epoch": 0.07445171219699885, + "flos": 504740630016.0, + "grad_norm": 0.05063210924529089, + "language_loss": 1.0156467, + "learning_rate": 0.0009948297904714782, + "loss": 1.02678132, + "num_input_tokens_seen": 32015232, + "router_z_loss_mlp": 0.20996094, + "step": 387, + "time_per_iteration": 2.668027639389038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097529, + "balance_loss_mlp": 1.07642913, + "epoch": 0.07464409388226241, + "flos": 553693515264.0, + "grad_norm": 0.06830922509793466, + "language_loss": 0.93493366, + "learning_rate": 0.0009947850080064796, + "loss": 0.9459089, + "num_input_tokens_seen": 32094640, + "router_z_loss_mlp": 0.21105957, + "step": 388, + "time_per_iteration": 2.79836106300354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098078, + "balance_loss_mlp": 1.07695365, + "epoch": 0.07483647556752597, + "flos": 776511028224.0, + "grad_norm": 0.06471398355705121, + "language_loss": 0.98276728, + "learning_rate": 0.0009947400334489047, + "loss": 0.99374807, + "num_input_tokens_seen": 32176640, + "router_z_loss_mlp": 0.21130371, + "step": 389, + "time_per_iteration": 3.0046355724334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095267, + "balance_loss_mlp": 1.07513261, + "epoch": 0.07502885725278953, + "flos": 612256596480.0, + "grad_norm": 0.0754939105077014, + "language_loss": 0.90272582, + "learning_rate": 0.0009946948668162145, + "loss": 0.91367853, + "num_input_tokens_seen": 32246704, + "router_z_loss_mlp": 0.20141602, + "step": 390, + "time_per_iteration": 2.724792003631592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091157, + "balance_loss_mlp": 1.06946135, + "epoch": 0.0752212389380531, + "flos": 688324552704.0, + "grad_norm": 0.05626120625508035, + "language_loss": 0.9463594, + "learning_rate": 0.0009946495081259441, + "loss": 0.95727098, + "num_input_tokens_seen": 32320032, + "router_z_loss_mlp": 0.21704102, + "step": 391, + "time_per_iteration": 2.8221397399902344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101684, + "balance_loss_mlp": 1.08008361, + "epoch": 0.07541362062331666, + "flos": 765362967552.0, + "grad_norm": 0.09729902751759628, + "language_loss": 0.97468722, + "learning_rate": 0.0009946039573957035, + "loss": 0.98570406, + "num_input_tokens_seen": 32398144, + "router_z_loss_mlp": 0.21606445, + "step": 392, + "time_per_iteration": 2.958655595779419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095785, + "balance_loss_mlp": 1.07572174, + "epoch": 0.07560600230858022, + "flos": 588460336128.0, + "grad_norm": 0.06468718689622391, + "language_loss": 0.94257009, + "learning_rate": 0.000994558214643177, + "loss": 0.95352793, + "num_input_tokens_seen": 32471984, + "router_z_loss_mlp": 0.20056152, + "step": 393, + "time_per_iteration": 2.752979040145874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101382, + "balance_loss_mlp": 1.08086586, + "epoch": 0.07579838399384378, + "flos": 749508028416.0, + "grad_norm": 0.06635223139616171, + "language_loss": 0.961483, + "learning_rate": 0.000994512279886123, + "loss": 0.97249681, + "num_input_tokens_seen": 32550176, + "router_z_loss_mlp": 0.20532227, + "step": 394, + "time_per_iteration": 3.055225133895874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104661, + "balance_loss_mlp": 1.08346581, + "epoch": 0.07599076567910736, + "flos": 523185440256.0, + "grad_norm": 0.06901630142642712, + "language_loss": 0.96749192, + "learning_rate": 0.0009944661531423758, + "loss": 0.97853857, + "num_input_tokens_seen": 32620768, + "router_z_loss_mlp": 0.2121582, + "step": 395, + "time_per_iteration": 2.6922085285186768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093271, + "balance_loss_mlp": 1.07248056, + "epoch": 0.07618314736437092, + "flos": 550812662784.0, + "grad_norm": 0.07064334209039194, + "language_loss": 0.95375401, + "learning_rate": 0.000994419834429843, + "loss": 0.96468663, + "num_input_tokens_seen": 32693472, + "router_z_loss_mlp": 0.20788574, + "step": 396, + "time_per_iteration": 2.6657333374023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092352, + "balance_loss_mlp": 1.0716933, + "epoch": 0.07637552904963447, + "flos": 697901253120.0, + "grad_norm": 0.07324881108467876, + "language_loss": 0.99580455, + "learning_rate": 0.0009943733237665069, + "loss": 1.00672793, + "num_input_tokens_seen": 32764976, + "router_z_loss_mlp": 0.20654297, + "step": 397, + "time_per_iteration": 2.8662500381469727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085112, + "balance_loss_mlp": 1.06454849, + "epoch": 0.07656791073489803, + "flos": 579066928128.0, + "grad_norm": 0.04790317238997088, + "language_loss": 0.98118353, + "learning_rate": 0.0009943266211704248, + "loss": 0.99203461, + "num_input_tokens_seen": 32853104, + "router_z_loss_mlp": 0.20568848, + "step": 398, + "time_per_iteration": 2.930741786956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094784, + "balance_loss_mlp": 1.07348132, + "epoch": 0.0767602924201616, + "flos": 416923711488.0, + "grad_norm": 0.09980331544781734, + "language_loss": 1.00422275, + "learning_rate": 0.000994279726659728, + "loss": 1.01517057, + "num_input_tokens_seen": 32919376, + "router_z_loss_mlp": 0.21325684, + "step": 399, + "time_per_iteration": 2.533738851547241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109036, + "balance_loss_mlp": 1.06970143, + "epoch": 0.07695267410542517, + "flos": 482671471104.0, + "grad_norm": 0.06967700921129397, + "language_loss": 0.97985041, + "learning_rate": 0.0009942326402526231, + "loss": 0.99075395, + "num_input_tokens_seen": 32988064, + "router_z_loss_mlp": 0.20666504, + "step": 400, + "time_per_iteration": 2.51460337638855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096542, + "balance_loss_mlp": 1.07526302, + "epoch": 0.07714505579068873, + "flos": 530742647808.0, + "grad_norm": 0.052652305799428985, + "language_loss": 0.96639109, + "learning_rate": 0.0009941853619673902, + "loss": 0.97735649, + "num_input_tokens_seen": 33059024, + "router_z_loss_mlp": 0.2130127, + "step": 401, + "time_per_iteration": 2.620939016342163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101036, + "balance_loss_mlp": 1.08012676, + "epoch": 0.07733743747595229, + "flos": 804635845632.0, + "grad_norm": 0.07273299487754427, + "language_loss": 0.99959278, + "learning_rate": 0.0009941378918223844, + "loss": 1.01060319, + "num_input_tokens_seen": 33137712, + "router_z_loss_mlp": 0.20910645, + "step": 402, + "time_per_iteration": 3.036839008331299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110477, + "balance_loss_mlp": 1.08423018, + "epoch": 0.07752981916121585, + "flos": 622192679424.0, + "grad_norm": 0.05767312217272775, + "language_loss": 0.93044209, + "learning_rate": 0.0009940902298360354, + "loss": 0.94148982, + "num_input_tokens_seen": 33211296, + "router_z_loss_mlp": 0.20544434, + "step": 403, + "time_per_iteration": 2.7703943252563477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097477, + "balance_loss_mlp": 1.07694876, + "epoch": 0.07772220084647942, + "flos": 727961195520.0, + "grad_norm": 0.0686344305115436, + "language_loss": 1.02037048, + "learning_rate": 0.0009940423760268473, + "loss": 1.03134525, + "num_input_tokens_seen": 33283632, + "router_z_loss_mlp": 0.2052002, + "step": 404, + "time_per_iteration": 2.8823602199554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085621, + "balance_loss_mlp": 1.06497431, + "epoch": 0.07791458253174298, + "flos": 555149984256.0, + "grad_norm": 0.10727031409308073, + "language_loss": 0.96142864, + "learning_rate": 0.0009939943304133982, + "loss": 0.97228479, + "num_input_tokens_seen": 33350704, + "router_z_loss_mlp": 0.20654297, + "step": 405, + "time_per_iteration": 2.63908314704895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078944, + "balance_loss_mlp": 1.05944133, + "epoch": 0.07810696421700654, + "flos": 552919495680.0, + "grad_norm": 0.08981509362846728, + "language_loss": 1.0302707, + "learning_rate": 0.0009939460930143416, + "loss": 1.04106021, + "num_input_tokens_seen": 33416272, + "router_z_loss_mlp": 0.19482422, + "step": 406, + "time_per_iteration": 2.63259220123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079269, + "balance_loss_mlp": 1.05927801, + "epoch": 0.0782993459022701, + "flos": 650323289088.0, + "grad_norm": 0.07212254231156982, + "language_loss": 0.96910775, + "learning_rate": 0.0009938976638484043, + "loss": 0.97990054, + "num_input_tokens_seen": 33501824, + "router_z_loss_mlp": 0.1998291, + "step": 407, + "time_per_iteration": 2.9489452838897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073065, + "balance_loss_mlp": 1.05239439, + "epoch": 0.07849172758753367, + "flos": 495926364672.0, + "grad_norm": 0.07302041560946317, + "language_loss": 0.9619081, + "learning_rate": 0.0009938490429343887, + "loss": 0.97263873, + "num_input_tokens_seen": 33571456, + "router_z_loss_mlp": 0.20666504, + "step": 408, + "time_per_iteration": 2.541293144226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078297, + "balance_loss_mlp": 1.05823374, + "epoch": 0.07868410927279723, + "flos": 577696389120.0, + "grad_norm": 0.06961121210328268, + "language_loss": 0.96404505, + "learning_rate": 0.0009938002302911709, + "loss": 0.974828, + "num_input_tokens_seen": 33646320, + "router_z_loss_mlp": 0.20056152, + "step": 409, + "time_per_iteration": 2.7890634536743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078628, + "balance_loss_mlp": 1.05869615, + "epoch": 0.07887649095806079, + "flos": 522698019840.0, + "grad_norm": 0.10283598941623227, + "language_loss": 0.99080813, + "learning_rate": 0.0009937512259377015, + "loss": 1.00159442, + "num_input_tokens_seen": 33717664, + "router_z_loss_mlp": 0.19921875, + "step": 410, + "time_per_iteration": 2.6631360054016113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076374, + "balance_loss_mlp": 1.05739617, + "epoch": 0.07906887264332435, + "flos": 556958481408.0, + "grad_norm": 0.07518465865945036, + "language_loss": 0.97744381, + "learning_rate": 0.000993702029893006, + "loss": 0.98820746, + "num_input_tokens_seen": 33794720, + "router_z_loss_mlp": 0.18981934, + "step": 411, + "time_per_iteration": 2.762937068939209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070708, + "balance_loss_mlp": 1.0512886, + "epoch": 0.07926125432858792, + "flos": 821641715712.0, + "grad_norm": 0.06547583340109177, + "language_loss": 0.97466588, + "learning_rate": 0.0009936526421761838, + "loss": 0.98537302, + "num_input_tokens_seen": 33868304, + "router_z_loss_mlp": 0.1940918, + "step": 412, + "time_per_iteration": 3.019529342651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070741, + "balance_loss_mlp": 1.05210841, + "epoch": 0.07945363601385148, + "flos": 562072794624.0, + "grad_norm": 0.06412617323579047, + "language_loss": 0.9993977, + "learning_rate": 0.000993603062806409, + "loss": 1.01010513, + "num_input_tokens_seen": 33937424, + "router_z_loss_mlp": 0.18615723, + "step": 413, + "time_per_iteration": 2.667893409729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078833, + "balance_loss_mlp": 1.05879402, + "epoch": 0.07964601769911504, + "flos": 517615792128.0, + "grad_norm": 0.0777298152120257, + "language_loss": 1.03187037, + "learning_rate": 0.0009935532918029298, + "loss": 1.04265857, + "num_input_tokens_seen": 34003984, + "router_z_loss_mlp": 0.20031738, + "step": 414, + "time_per_iteration": 2.628847122192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079604, + "balance_loss_mlp": 1.06020916, + "epoch": 0.0798383993843786, + "flos": 538956011520.0, + "grad_norm": 0.0762846382616791, + "language_loss": 0.96381676, + "learning_rate": 0.0009935033291850694, + "loss": 0.97461283, + "num_input_tokens_seen": 34072400, + "router_z_loss_mlp": 0.19384766, + "step": 415, + "time_per_iteration": 2.6874804496765137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078311, + "balance_loss_mlp": 1.05915451, + "epoch": 0.08003078106964218, + "flos": 484901959680.0, + "grad_norm": 0.07548152614126195, + "language_loss": 0.9874112, + "learning_rate": 0.0009934531749722247, + "loss": 0.9981944, + "num_input_tokens_seen": 34142448, + "router_z_loss_mlp": 0.19177246, + "step": 416, + "time_per_iteration": 2.5752930641174316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077721, + "balance_loss_mlp": 1.0581702, + "epoch": 0.08022316275490574, + "flos": 517999905792.0, + "grad_norm": 0.07373378819853486, + "language_loss": 0.97326815, + "learning_rate": 0.0009934028291838672, + "loss": 0.98404539, + "num_input_tokens_seen": 34214080, + "router_z_loss_mlp": 0.1953125, + "step": 417, + "time_per_iteration": 2.715142011642456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078885, + "balance_loss_mlp": 1.0593344, + "epoch": 0.0804155444401693, + "flos": 493755512832.0, + "grad_norm": 0.06878732968267398, + "language_loss": 0.9290086, + "learning_rate": 0.0009933522918395433, + "loss": 0.93979746, + "num_input_tokens_seen": 34288448, + "router_z_loss_mlp": 0.19555664, + "step": 418, + "time_per_iteration": 2.7008063793182373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01673141, + "balance_loss_mlp": 1.6505394, + "epoch": 0.08060792612543285, + "flos": 1580567579136.0, + "grad_norm": 0.10865535097535944, + "language_loss": 0.782511, + "learning_rate": 0.0009933015629588731, + "loss": 0.7992425, + "num_input_tokens_seen": 34521632, + "router_z_loss_mlp": 0.22558594, + "step": 419, + "time_per_iteration": 4.854820728302002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092516, + "balance_loss_mlp": 1.07238102, + "epoch": 0.08080030781069643, + "flos": 525090041856.0, + "grad_norm": 0.07888672823303539, + "language_loss": 1.11010027, + "learning_rate": 0.000993250642561551, + "loss": 1.12102532, + "num_input_tokens_seen": 34590080, + "router_z_loss_mlp": 0.20129395, + "step": 420, + "time_per_iteration": 2.6152822971343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102725, + "balance_loss_mlp": 1.08251905, + "epoch": 0.08099268949595999, + "flos": 546459374592.0, + "grad_norm": 0.06927423279576624, + "language_loss": 0.96781242, + "learning_rate": 0.0009931995306673466, + "loss": 0.97883964, + "num_input_tokens_seen": 34660512, + "router_z_loss_mlp": 0.20202637, + "step": 421, + "time_per_iteration": 2.8378820419311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107938, + "balance_loss_mlp": 1.08725524, + "epoch": 0.08118507118122355, + "flos": 510116811264.0, + "grad_norm": 0.07245841989657228, + "language_loss": 1.01691484, + "learning_rate": 0.000993148227296103, + "loss": 1.02799416, + "num_input_tokens_seen": 34732016, + "router_z_loss_mlp": 0.20678711, + "step": 422, + "time_per_iteration": 2.6234657764434814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109153, + "balance_loss_mlp": 1.08827925, + "epoch": 0.08137745286648711, + "flos": 720339969024.0, + "grad_norm": 0.06440268991377437, + "language_loss": 0.90059143, + "learning_rate": 0.000993096732467738, + "loss": 0.91168296, + "num_input_tokens_seen": 34810416, + "router_z_loss_mlp": 0.2088623, + "step": 423, + "time_per_iteration": 2.9789979457855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107042, + "balance_loss_mlp": 1.08620405, + "epoch": 0.08156983455175067, + "flos": 679313848320.0, + "grad_norm": 0.09430690436493987, + "language_loss": 0.97591221, + "learning_rate": 0.0009930450462022435, + "loss": 0.9869827, + "num_input_tokens_seen": 34879504, + "router_z_loss_mlp": 0.20837402, + "step": 424, + "time_per_iteration": 2.7870407104492188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01731933, + "balance_loss_mlp": 1.70847309, + "epoch": 0.08176221623701424, + "flos": 1452577135104.0, + "grad_norm": 0.13164555017172178, + "language_loss": 0.79189807, + "learning_rate": 0.0009929931685196862, + "loss": 0.80921739, + "num_input_tokens_seen": 35111584, + "router_z_loss_mlp": 0.234375, + "step": 425, + "time_per_iteration": 4.870323181152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095108, + "balance_loss_mlp": 1.07456827, + "epoch": 0.0819545979222778, + "flos": 1556034071040.0, + "grad_norm": 0.10298759083167684, + "language_loss": 0.95328236, + "learning_rate": 0.0009929410994402065, + "loss": 0.9642334, + "num_input_tokens_seen": 35205664, + "router_z_loss_mlp": 0.20544434, + "step": 426, + "time_per_iteration": 3.7942585945129395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093366, + "balance_loss_mlp": 1.07214665, + "epoch": 0.08214697960754136, + "flos": 512456398848.0, + "grad_norm": 0.069672302328133, + "language_loss": 0.99507213, + "learning_rate": 0.0009928888389840196, + "loss": 1.00600576, + "num_input_tokens_seen": 35280144, + "router_z_loss_mlp": 0.21240234, + "step": 427, + "time_per_iteration": 2.684760093688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073876, + "balance_loss_mlp": 1.05376494, + "epoch": 0.08233936129280492, + "flos": 594850646016.0, + "grad_norm": 0.07796900075206671, + "language_loss": 1.01749206, + "learning_rate": 0.0009928363871714147, + "loss": 1.02823079, + "num_input_tokens_seen": 35344768, + "router_z_loss_mlp": 0.20092773, + "step": 428, + "time_per_iteration": 2.6608195304870605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078126, + "balance_loss_mlp": 1.05796742, + "epoch": 0.08253174297806849, + "flos": 571758594048.0, + "grad_norm": 0.07341701057973313, + "language_loss": 0.95524251, + "learning_rate": 0.0009927837440227556, + "loss": 0.96602374, + "num_input_tokens_seen": 35425536, + "router_z_loss_mlp": 0.20153809, + "step": 429, + "time_per_iteration": 2.824958324432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083273, + "balance_loss_mlp": 1.06413972, + "epoch": 0.08272412466333205, + "flos": 623065623552.0, + "grad_norm": 0.06194570532237157, + "language_loss": 0.90308964, + "learning_rate": 0.0009927309095584798, + "loss": 0.91392243, + "num_input_tokens_seen": 35515440, + "router_z_loss_mlp": 0.19128418, + "step": 430, + "time_per_iteration": 2.9565205574035645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105878, + "balance_loss_mlp": 1.08643484, + "epoch": 0.08291650634859561, + "flos": 513745542144.0, + "grad_norm": 0.09375416706629437, + "language_loss": 1.0225904, + "learning_rate": 0.0009926778837991, + "loss": 1.03364921, + "num_input_tokens_seen": 35580192, + "router_z_loss_mlp": 0.19433594, + "step": 431, + "time_per_iteration": 2.5606777667999268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104802, + "balance_loss_mlp": 1.08521628, + "epoch": 0.08310888803385917, + "flos": 667073083392.0, + "grad_norm": 0.09022222071598751, + "language_loss": 1.00445497, + "learning_rate": 0.000992624666765202, + "loss": 1.01550293, + "num_input_tokens_seen": 35649472, + "router_z_loss_mlp": 0.19580078, + "step": 432, + "time_per_iteration": 2.763514995574951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112312, + "balance_loss_mlp": 1.09166527, + "epoch": 0.08330126971912274, + "flos": 582995404800.0, + "grad_norm": 0.07142121215748316, + "language_loss": 0.98131895, + "learning_rate": 0.000992571258477447, + "loss": 0.99244213, + "num_input_tokens_seen": 35722848, + "router_z_loss_mlp": 0.20654297, + "step": 433, + "time_per_iteration": 2.7823588848114014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086622, + "balance_loss_mlp": 1.06731021, + "epoch": 0.0834936514043863, + "flos": 561064458240.0, + "grad_norm": 0.06618743000622296, + "language_loss": 0.92206728, + "learning_rate": 0.0009925176589565695, + "loss": 0.93293345, + "num_input_tokens_seen": 35800944, + "router_z_loss_mlp": 0.1932373, + "step": 434, + "time_per_iteration": 2.7774362564086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109069, + "balance_loss_mlp": 1.07043648, + "epoch": 0.08368603308964986, + "flos": 494272046592.0, + "grad_norm": 0.07800081613857189, + "language_loss": 1.01949787, + "learning_rate": 0.0009924638682233791, + "loss": 1.03040481, + "num_input_tokens_seen": 35866288, + "router_z_loss_mlp": 0.20251465, + "step": 435, + "time_per_iteration": 2.574716091156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236801, + "balance_loss_mlp": 1.21505737, + "epoch": 0.08387841477491342, + "flos": 1388322312192.0, + "grad_norm": 0.08820287098199171, + "language_loss": 0.79564589, + "learning_rate": 0.0009924098862987589, + "loss": 0.80801398, + "num_input_tokens_seen": 36083040, + "router_z_loss_mlp": 0.21777344, + "step": 436, + "time_per_iteration": 4.521069049835205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087939, + "balance_loss_mlp": 1.06750691, + "epoch": 0.084070796460177, + "flos": 798642796032.0, + "grad_norm": 0.09737991847895365, + "language_loss": 0.92070073, + "learning_rate": 0.0009923557132036668, + "loss": 0.93158013, + "num_input_tokens_seen": 36158816, + "router_z_loss_mlp": 0.2043457, + "step": 437, + "time_per_iteration": 3.0401971340179443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106896, + "balance_loss_mlp": 1.08635592, + "epoch": 0.08426317814544056, + "flos": 558681200640.0, + "grad_norm": 0.07082709395687636, + "language_loss": 0.96077365, + "learning_rate": 0.0009923013489591345, + "loss": 0.97184265, + "num_input_tokens_seen": 36236432, + "router_z_loss_mlp": 0.20532227, + "step": 438, + "time_per_iteration": 2.7388038635253906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138911, + "balance_loss_mlp": 1.11965871, + "epoch": 0.08445555983070412, + "flos": 810057106944.0, + "grad_norm": 0.09946092642967543, + "language_loss": 0.94659293, + "learning_rate": 0.0009922467935862681, + "loss": 0.95798206, + "num_input_tokens_seen": 36327952, + "router_z_loss_mlp": 0.19250488, + "step": 439, + "time_per_iteration": 3.0827929973602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153103, + "balance_loss_mlp": 1.13278937, + "epoch": 0.08464794151596768, + "flos": 509939311104.0, + "grad_norm": 0.08658230076015333, + "language_loss": 0.97196984, + "learning_rate": 0.0009921920471062478, + "loss": 0.9835009, + "num_input_tokens_seen": 36394896, + "router_z_loss_mlp": 0.203125, + "step": 440, + "time_per_iteration": 2.5667247772216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109293, + "balance_loss_mlp": 1.08952785, + "epoch": 0.08484032320123125, + "flos": 556149556224.0, + "grad_norm": 0.0779492699350581, + "language_loss": 0.95526892, + "learning_rate": 0.0009921371095403281, + "loss": 0.96636182, + "num_input_tokens_seen": 36464656, + "router_z_loss_mlp": 0.19763184, + "step": 441, + "time_per_iteration": 2.6504476070404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081558, + "balance_loss_mlp": 1.06137586, + "epoch": 0.08503270488649481, + "flos": 527103742464.0, + "grad_norm": 0.0823758421396894, + "language_loss": 0.98291612, + "learning_rate": 0.0009920819809098379, + "loss": 0.99373174, + "num_input_tokens_seen": 36532208, + "router_z_loss_mlp": 0.20166016, + "step": 442, + "time_per_iteration": 2.5884947776794434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076633, + "balance_loss_mlp": 1.05612862, + "epoch": 0.08522508657175837, + "flos": 613989490176.0, + "grad_norm": 0.07828377396362728, + "language_loss": 0.94043314, + "learning_rate": 0.0009920266612361798, + "loss": 0.95119947, + "num_input_tokens_seen": 36607360, + "router_z_loss_mlp": 0.20507812, + "step": 443, + "time_per_iteration": 2.7464845180511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077144, + "balance_loss_mlp": 1.05650926, + "epoch": 0.08541746825702193, + "flos": 619495119360.0, + "grad_norm": 0.07442656272719532, + "language_loss": 0.94335687, + "learning_rate": 0.0009919711505408308, + "loss": 0.95412827, + "num_input_tokens_seen": 36680688, + "router_z_loss_mlp": 0.2064209, + "step": 444, + "time_per_iteration": 2.7615623474121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092391, + "balance_loss_mlp": 1.07126665, + "epoch": 0.08560984994228549, + "flos": 482671471104.0, + "grad_norm": 0.08601843511227286, + "language_loss": 0.92049706, + "learning_rate": 0.000991915448845342, + "loss": 0.93142092, + "num_input_tokens_seen": 36746288, + "router_z_loss_mlp": 0.21130371, + "step": 445, + "time_per_iteration": 2.519644260406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103035, + "balance_loss_mlp": 1.08145857, + "epoch": 0.08580223162754906, + "flos": 516897027072.0, + "grad_norm": 0.07781715705073443, + "language_loss": 1.01207459, + "learning_rate": 0.000991859556171339, + "loss": 1.02310491, + "num_input_tokens_seen": 36812528, + "router_z_loss_mlp": 0.21569824, + "step": 446, + "time_per_iteration": 2.5678694248199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116922, + "balance_loss_mlp": 1.09462976, + "epoch": 0.08599461331281262, + "flos": 531215511552.0, + "grad_norm": 0.11213971543052093, + "language_loss": 1.02931881, + "learning_rate": 0.000991803472540521, + "loss": 1.040488, + "num_input_tokens_seen": 36879248, + "router_z_loss_mlp": 0.22302246, + "step": 447, + "time_per_iteration": 2.6309196949005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124555, + "balance_loss_mlp": 1.10302639, + "epoch": 0.08618699499807618, + "flos": 789966743040.0, + "grad_norm": 0.07287006723198586, + "language_loss": 0.97443926, + "learning_rate": 0.0009917471979746615, + "loss": 0.98568487, + "num_input_tokens_seen": 36951376, + "router_z_loss_mlp": 0.21533203, + "step": 448, + "time_per_iteration": 2.9742491245269775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134564, + "balance_loss_mlp": 1.11266506, + "epoch": 0.08637937668333974, + "flos": 565707317760.0, + "grad_norm": 0.08202115093309782, + "language_loss": 0.97199845, + "learning_rate": 0.0009916907324956086, + "loss": 0.98334408, + "num_input_tokens_seen": 37025936, + "router_z_loss_mlp": 0.21923828, + "step": 449, + "time_per_iteration": 2.704089641571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151497, + "balance_loss_mlp": 1.12693954, + "epoch": 0.08657175836860331, + "flos": 444930665472.0, + "grad_norm": 0.09325215593581063, + "language_loss": 0.93441564, + "learning_rate": 0.0009916340761252837, + "loss": 0.9459306, + "num_input_tokens_seen": 37095872, + "router_z_loss_mlp": 0.24536133, + "step": 450, + "time_per_iteration": 2.5866575241088867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158359, + "balance_loss_mlp": 1.13567328, + "epoch": 0.08676414005386687, + "flos": 843789450240.0, + "grad_norm": 0.23711660967347972, + "language_loss": 0.90976942, + "learning_rate": 0.0009915772288856832, + "loss": 0.92135304, + "num_input_tokens_seen": 37179072, + "router_z_loss_mlp": 0.22668457, + "step": 451, + "time_per_iteration": 3.109010696411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118071, + "balance_loss_mlp": 1.15827537, + "epoch": 0.08695652173913043, + "flos": 602995608576.0, + "grad_norm": 0.08699490701012727, + "language_loss": 0.92036849, + "learning_rate": 0.000991520190798877, + "loss": 0.93217564, + "num_input_tokens_seen": 37260288, + "router_z_loss_mlp": 0.22424316, + "step": 452, + "time_per_iteration": 2.8523812294006348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181191, + "balance_loss_mlp": 1.15807629, + "epoch": 0.08714890342439399, + "flos": 730423028736.0, + "grad_norm": 0.09293440668835976, + "language_loss": 1.01637089, + "learning_rate": 0.0009914629618870089, + "loss": 1.02818286, + "num_input_tokens_seen": 37331136, + "router_z_loss_mlp": 0.23095703, + "step": 453, + "time_per_iteration": 2.882887125015259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142362, + "balance_loss_mlp": 1.12891519, + "epoch": 0.08734128510965757, + "flos": 1481518232064.0, + "grad_norm": 0.0645312523276542, + "language_loss": 0.78675872, + "learning_rate": 0.0009914055421722976, + "loss": 0.79818237, + "num_input_tokens_seen": 37559040, + "router_z_loss_mlp": 0.13476562, + "step": 454, + "time_per_iteration": 4.717878103256226 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103219, + "balance_loss_mlp": 1.09034455, + "epoch": 0.08753366679492113, + "flos": 1522214083584.0, + "grad_norm": 0.04274098512475534, + "language_loss": 0.81427962, + "learning_rate": 0.0009913479316770353, + "loss": 0.82531178, + "num_input_tokens_seen": 37785136, + "router_z_loss_mlp": 0.12890625, + "step": 455, + "time_per_iteration": 4.838243246078491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171645, + "balance_loss_mlp": 1.14951944, + "epoch": 0.08772604848018468, + "flos": 720935078400.0, + "grad_norm": 0.10543082910841049, + "language_loss": 0.94423014, + "learning_rate": 0.0009912901304235883, + "loss": 0.95594656, + "num_input_tokens_seen": 37858832, + "router_z_loss_mlp": 0.22131348, + "step": 456, + "time_per_iteration": 2.9432015419006348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150762, + "balance_loss_mlp": 1.12861252, + "epoch": 0.08791843016544824, + "flos": 707926086144.0, + "grad_norm": 0.10980567381029156, + "language_loss": 0.91300154, + "learning_rate": 0.000991232138434397, + "loss": 0.92450917, + "num_input_tokens_seen": 37931856, + "router_z_loss_mlp": 0.22143555, + "step": 457, + "time_per_iteration": 2.832761526107788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113929, + "balance_loss_mlp": 1.09195828, + "epoch": 0.08811081185071182, + "flos": 472799407104.0, + "grad_norm": 0.1324680836731367, + "language_loss": 0.97845554, + "learning_rate": 0.000991173955731976, + "loss": 0.98959482, + "num_input_tokens_seen": 38002432, + "router_z_loss_mlp": 0.21960449, + "step": 458, + "time_per_iteration": 2.660696506500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100778, + "balance_loss_mlp": 1.07958269, + "epoch": 0.08830319353597538, + "flos": 684647769600.0, + "grad_norm": 0.07138233575581546, + "language_loss": 1.0178268, + "learning_rate": 0.0009911155823389137, + "loss": 1.02883458, + "num_input_tokens_seen": 38081648, + "router_z_loss_mlp": 0.21203613, + "step": 459, + "time_per_iteration": 2.9878122806549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105128, + "balance_loss_mlp": 1.08344412, + "epoch": 0.08849557522123894, + "flos": 573235411968.0, + "grad_norm": 0.0735053314112025, + "language_loss": 0.9764787, + "learning_rate": 0.000991057018277873, + "loss": 0.98752999, + "num_input_tokens_seen": 38153424, + "router_z_loss_mlp": 0.21679688, + "step": 460, + "time_per_iteration": 2.707247018814087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116963, + "balance_loss_mlp": 1.09422946, + "epoch": 0.0886879569065025, + "flos": 564303283200.0, + "grad_norm": 0.10552034142073316, + "language_loss": 0.9759655, + "learning_rate": 0.0009909982635715898, + "loss": 0.98713505, + "num_input_tokens_seen": 38223008, + "router_z_loss_mlp": 0.22729492, + "step": 461, + "time_per_iteration": 2.609016180038452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120097, + "balance_loss_mlp": 1.09760189, + "epoch": 0.08888033859176607, + "flos": 563609249280.0, + "grad_norm": 0.09185893532484944, + "language_loss": 0.96625364, + "learning_rate": 0.0009909393182428751, + "loss": 0.97745454, + "num_input_tokens_seen": 38294592, + "router_z_loss_mlp": 0.22497559, + "step": 462, + "time_per_iteration": 2.682616949081421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116703, + "balance_loss_mlp": 1.09437466, + "epoch": 0.08907272027702963, + "flos": 465517214208.0, + "grad_norm": 0.08888403374641002, + "language_loss": 0.91300213, + "learning_rate": 0.000990880182314614, + "loss": 0.92416912, + "num_input_tokens_seen": 38365792, + "router_z_loss_mlp": 0.22314453, + "step": 463, + "time_per_iteration": 2.732579469680786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122985, + "balance_loss_mlp": 1.10014486, + "epoch": 0.08926510196229319, + "flos": 681200921088.0, + "grad_norm": 0.07408309604525525, + "language_loss": 0.92294347, + "learning_rate": 0.0009908208558097643, + "loss": 0.93417335, + "num_input_tokens_seen": 38447776, + "router_z_loss_mlp": 0.22839355, + "step": 464, + "time_per_iteration": 2.910313606262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137827, + "balance_loss_mlp": 1.115273, + "epoch": 0.08945748364755675, + "flos": 596411831808.0, + "grad_norm": 0.08673846989427919, + "language_loss": 0.93827909, + "learning_rate": 0.000990761338751359, + "loss": 0.94965738, + "num_input_tokens_seen": 38521632, + "router_z_loss_mlp": 0.22546387, + "step": 465, + "time_per_iteration": 2.827570676803589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133815, + "balance_loss_mlp": 1.12222791, + "epoch": 0.08964986533282032, + "flos": 1585082400768.0, + "grad_norm": 0.06082202694548154, + "language_loss": 0.73659623, + "learning_rate": 0.0009907016311625045, + "loss": 0.74793446, + "num_input_tokens_seen": 38760528, + "router_z_loss_mlp": 0.11572266, + "step": 466, + "time_per_iteration": 4.960917234420776 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177765, + "balance_loss_mlp": 1.15419745, + "epoch": 0.08984224701808388, + "flos": 533268499968.0, + "grad_norm": 0.4900596090566038, + "language_loss": 0.96587038, + "learning_rate": 0.0009906417330663815, + "loss": 0.97764802, + "num_input_tokens_seen": 38827200, + "router_z_loss_mlp": 0.23571777, + "step": 467, + "time_per_iteration": 2.5937299728393555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157084, + "balance_loss_mlp": 1.13383865, + "epoch": 0.09003462870334744, + "flos": 478702296576.0, + "grad_norm": 0.08613132202477504, + "language_loss": 0.92798859, + "learning_rate": 0.0009905816444862442, + "loss": 0.93955946, + "num_input_tokens_seen": 38891984, + "router_z_loss_mlp": 0.23217773, + "step": 468, + "time_per_iteration": 2.6012237071990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164868, + "balance_loss_mlp": 1.14150274, + "epoch": 0.090227010388611, + "flos": 653307448320.0, + "grad_norm": 0.08218040805372613, + "language_loss": 0.90769458, + "learning_rate": 0.0009905213654454216, + "loss": 0.91934329, + "num_input_tokens_seen": 38977136, + "router_z_loss_mlp": 0.23364258, + "step": 469, + "time_per_iteration": 2.8727760314941406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176439, + "balance_loss_mlp": 1.15152478, + "epoch": 0.09041939207387456, + "flos": 617894645760.0, + "grad_norm": 0.09256259391525869, + "language_loss": 0.97864139, + "learning_rate": 0.0009904608959673158, + "loss": 0.9904058, + "num_input_tokens_seen": 39052224, + "router_z_loss_mlp": 0.24938965, + "step": 470, + "time_per_iteration": 2.7991952896118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151805, + "balance_loss_mlp": 1.12671185, + "epoch": 0.09061177375913813, + "flos": 454137808896.0, + "grad_norm": 0.09693984756275055, + "language_loss": 0.97988749, + "learning_rate": 0.000990400236075403, + "loss": 0.99140555, + "num_input_tokens_seen": 39116832, + "router_z_loss_mlp": 0.25109863, + "step": 471, + "time_per_iteration": 2.523508310317993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125394, + "balance_loss_mlp": 1.10119498, + "epoch": 0.0908041554444017, + "flos": 543982984704.0, + "grad_norm": 0.09250187628709369, + "language_loss": 0.9490509, + "learning_rate": 0.0009903393857932338, + "loss": 0.96030486, + "num_input_tokens_seen": 39190528, + "router_z_loss_mlp": 0.24194336, + "step": 472, + "time_per_iteration": 2.7065584659576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124084, + "balance_loss_mlp": 1.09912193, + "epoch": 0.09099653712966525, + "flos": 564052999680.0, + "grad_norm": 0.10897832311722938, + "language_loss": 0.93660218, + "learning_rate": 0.0009902783451444317, + "loss": 0.94784307, + "num_input_tokens_seen": 39263168, + "router_z_loss_mlp": 0.24963379, + "step": 473, + "time_per_iteration": 2.7067277431488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108649, + "balance_loss_mlp": 1.08496177, + "epoch": 0.09118891881492881, + "flos": 474300956160.0, + "grad_norm": 0.09402902414949979, + "language_loss": 0.97273493, + "learning_rate": 0.0009902171141526956, + "loss": 0.98382139, + "num_input_tokens_seen": 39330784, + "router_z_loss_mlp": 0.23693848, + "step": 474, + "time_per_iteration": 2.5281569957733154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087186, + "balance_loss_mlp": 1.06240201, + "epoch": 0.09138130050019239, + "flos": 545579076096.0, + "grad_norm": 0.06728788346792411, + "language_loss": 0.85273343, + "learning_rate": 0.000990155692841797, + "loss": 0.86360526, + "num_input_tokens_seen": 39417472, + "router_z_loss_mlp": 0.2479248, + "step": 475, + "time_per_iteration": 2.970107316970825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010835, + "balance_loss_mlp": 1.0587163, + "epoch": 0.09157368218545595, + "flos": 732397441536.0, + "grad_norm": 0.07226189405033341, + "language_loss": 0.97062063, + "learning_rate": 0.0009900940812355818, + "loss": 0.98145562, + "num_input_tokens_seen": 39488656, + "router_z_loss_mlp": 0.24768066, + "step": 476, + "time_per_iteration": 2.959184169769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096233, + "balance_loss_mlp": 1.07208097, + "epoch": 0.0917660638707195, + "flos": 610709967360.0, + "grad_norm": 0.09034653129128065, + "language_loss": 0.92824447, + "learning_rate": 0.00099003227935797, + "loss": 0.93920678, + "num_input_tokens_seen": 39558224, + "router_z_loss_mlp": 0.24157715, + "step": 477, + "time_per_iteration": 2.7553765773773193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113263, + "balance_loss_mlp": 1.08839583, + "epoch": 0.09195844555598306, + "flos": 655561257984.0, + "grad_norm": 0.09830094540804109, + "language_loss": 0.95358098, + "learning_rate": 0.000989970287232955, + "loss": 0.96471357, + "num_input_tokens_seen": 39629856, + "router_z_loss_mlp": 0.2487793, + "step": 478, + "time_per_iteration": 2.7916457653045654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112064, + "balance_loss_mlp": 1.09633327, + "epoch": 0.09215082724124664, + "flos": 476339387904.0, + "grad_norm": 0.08054303064285366, + "language_loss": 0.93560576, + "learning_rate": 0.0009899081048846043, + "loss": 0.94681215, + "num_input_tokens_seen": 39695984, + "router_z_loss_mlp": 0.24267578, + "step": 479, + "time_per_iteration": 2.554161787033081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114732, + "balance_loss_mlp": 1.12177348, + "epoch": 0.0923432089265102, + "flos": 524051182080.0, + "grad_norm": 0.1186512856896222, + "language_loss": 0.97593725, + "learning_rate": 0.0009898457323370593, + "loss": 0.98741049, + "num_input_tokens_seen": 39760256, + "router_z_loss_mlp": 0.25549316, + "step": 480, + "time_per_iteration": 2.5794191360473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131558, + "balance_loss_mlp": 1.10608315, + "epoch": 0.09253559061177376, + "flos": 545302651392.0, + "grad_norm": 0.10688941209840569, + "language_loss": 0.96892118, + "learning_rate": 0.000989783169614535, + "loss": 0.98023689, + "num_input_tokens_seen": 39827984, + "router_z_loss_mlp": 0.25512695, + "step": 481, + "time_per_iteration": 2.6676101684570312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01336494, + "balance_loss_mlp": 1.32304764, + "epoch": 0.09272797229703732, + "flos": 1537222219776.0, + "grad_norm": 0.112558059824644, + "language_loss": 0.78752756, + "learning_rate": 0.0009897204167413206, + "loss": 0.80089253, + "num_input_tokens_seen": 40056688, + "router_z_loss_mlp": 0.13476562, + "step": 482, + "time_per_iteration": 4.910710096359253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121205, + "balance_loss_mlp": 1.09537172, + "epoch": 0.09292035398230089, + "flos": 689501624832.0, + "grad_norm": 0.08905484371867754, + "language_loss": 0.93989253, + "learning_rate": 0.000989657473741779, + "loss": 0.95110452, + "num_input_tokens_seen": 40133120, + "router_z_loss_mlp": 0.25866699, + "step": 483, + "time_per_iteration": 2.8736467361450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120092, + "balance_loss_mlp": 1.09219658, + "epoch": 0.09311273566756445, + "flos": 509482414080.0, + "grad_norm": 0.10011855628381364, + "language_loss": 0.94861096, + "learning_rate": 0.0009895943406403465, + "loss": 0.95981193, + "num_input_tokens_seen": 40206464, + "router_z_loss_mlp": 0.27905273, + "step": 484, + "time_per_iteration": 2.7233312129974365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114409, + "balance_loss_mlp": 1.08641887, + "epoch": 0.09330511735282801, + "flos": 659111413248.0, + "grad_norm": 0.10884122740481975, + "language_loss": 0.87602448, + "learning_rate": 0.0009895310174615338, + "loss": 0.88716859, + "num_input_tokens_seen": 40277744, + "router_z_loss_mlp": 0.2800293, + "step": 485, + "time_per_iteration": 2.7538061141967773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098211, + "balance_loss_mlp": 1.08533621, + "epoch": 0.09349749903809157, + "flos": 1452054809088.0, + "grad_norm": 0.04867374252302138, + "language_loss": 0.75718516, + "learning_rate": 0.0009894675042299251, + "loss": 0.76816726, + "num_input_tokens_seen": 40503664, + "router_z_loss_mlp": 0.12890625, + "step": 486, + "time_per_iteration": 4.681119441986084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121456, + "balance_loss_mlp": 1.09291732, + "epoch": 0.09368988072335514, + "flos": 520614508032.0, + "grad_norm": 0.07858969791005947, + "language_loss": 0.92458618, + "learning_rate": 0.0009894038009701782, + "loss": 0.93580067, + "num_input_tokens_seen": 40571376, + "router_z_loss_mlp": 0.28515625, + "step": 487, + "time_per_iteration": 2.6114649772644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153128, + "balance_loss_mlp": 1.12148952, + "epoch": 0.0938822624086187, + "flos": 497502107136.0, + "grad_norm": 0.11959755259003642, + "language_loss": 0.91595036, + "learning_rate": 0.0009893399077070253, + "loss": 0.92748165, + "num_input_tokens_seen": 40638096, + "router_z_loss_mlp": 0.31616211, + "step": 488, + "time_per_iteration": 2.5603692531585693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127952, + "balance_loss_mlp": 1.09845996, + "epoch": 0.09407464409388226, + "flos": 532948405248.0, + "grad_norm": 0.09098963794592498, + "language_loss": 0.89760649, + "learning_rate": 0.0009892758244652718, + "loss": 0.90888608, + "num_input_tokens_seen": 40710992, + "router_z_loss_mlp": 0.29516602, + "step": 489, + "time_per_iteration": 2.65938401222229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127724, + "balance_loss_mlp": 1.09568012, + "epoch": 0.09426702577914582, + "flos": 585736634880.0, + "grad_norm": 0.09102778373185845, + "language_loss": 0.94519842, + "learning_rate": 0.0009892115512697968, + "loss": 0.95647562, + "num_input_tokens_seen": 40778896, + "router_z_loss_mlp": 0.3203125, + "step": 490, + "time_per_iteration": 2.6538186073303223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120065, + "balance_loss_mlp": 1.08926105, + "epoch": 0.0944594074644094, + "flos": 503081929728.0, + "grad_norm": 0.07724049493821064, + "language_loss": 0.96624851, + "learning_rate": 0.0009891470881455537, + "loss": 0.97744912, + "num_input_tokens_seen": 40853376, + "router_z_loss_mlp": 0.30810547, + "step": 491, + "time_per_iteration": 2.699535608291626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122711, + "balance_loss_mlp": 1.09145451, + "epoch": 0.09465178914967295, + "flos": 570748847616.0, + "grad_norm": 0.0816499633869022, + "language_loss": 0.94510269, + "learning_rate": 0.0009890824351175692, + "loss": 0.95632982, + "num_input_tokens_seen": 40923776, + "router_z_loss_mlp": 0.31225586, + "step": 492, + "time_per_iteration": 2.678191661834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125893, + "balance_loss_mlp": 1.09418344, + "epoch": 0.09484417083493651, + "flos": 549098707968.0, + "grad_norm": 0.07977284094064935, + "language_loss": 0.98609412, + "learning_rate": 0.0009890175922109435, + "loss": 0.99735302, + "num_input_tokens_seen": 40996848, + "router_z_loss_mlp": 0.31689453, + "step": 493, + "time_per_iteration": 2.6466987133026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138627, + "balance_loss_mlp": 1.10534418, + "epoch": 0.09503655252020007, + "flos": 823552109568.0, + "grad_norm": 0.09331424233507904, + "language_loss": 0.96939492, + "learning_rate": 0.0009889525594508513, + "loss": 0.9807812, + "num_input_tokens_seen": 41071280, + "router_z_loss_mlp": 0.33300781, + "step": 494, + "time_per_iteration": 3.009894371032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153225, + "balance_loss_mlp": 1.12218332, + "epoch": 0.09522893420546363, + "flos": 404397757440.0, + "grad_norm": 0.08141129996203125, + "language_loss": 0.91043431, + "learning_rate": 0.0009888873368625404, + "loss": 0.92196655, + "num_input_tokens_seen": 41136304, + "router_z_loss_mlp": 0.31030273, + "step": 495, + "time_per_iteration": 2.4904890060424805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171726, + "balance_loss_mlp": 1.14025438, + "epoch": 0.0954213158907272, + "flos": 690707810304.0, + "grad_norm": 0.08256479818708104, + "language_loss": 0.94339681, + "learning_rate": 0.0009888219244713326, + "loss": 0.95511413, + "num_input_tokens_seen": 41212384, + "router_z_loss_mlp": 0.31445312, + "step": 496, + "time_per_iteration": 2.8060483932495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181664, + "balance_loss_mlp": 1.15033531, + "epoch": 0.09561369757599077, + "flos": 518739019776.0, + "grad_norm": 0.10472312979641793, + "language_loss": 0.94370055, + "learning_rate": 0.0009887563223026229, + "loss": 0.95551717, + "num_input_tokens_seen": 41282528, + "router_z_loss_mlp": 0.31323242, + "step": 497, + "time_per_iteration": 2.6536803245544434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228939, + "balance_loss_mlp": 1.21549225, + "epoch": 0.09580607926125433, + "flos": 1384825849344.0, + "grad_norm": 0.04877985805939708, + "language_loss": 0.7906816, + "learning_rate": 0.0009886905303818805, + "loss": 0.80297101, + "num_input_tokens_seen": 41512256, + "router_z_loss_mlp": 0.13476562, + "step": 498, + "time_per_iteration": 4.874605178833008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197245, + "balance_loss_mlp": 1.16455829, + "epoch": 0.09599846094651789, + "flos": 717090969600.0, + "grad_norm": 0.08863465655244346, + "language_loss": 0.93284124, + "learning_rate": 0.0009886245487346482, + "loss": 0.94481373, + "num_input_tokens_seen": 41596816, + "router_z_loss_mlp": 0.3269043, + "step": 499, + "time_per_iteration": 3.047938108444214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011865, + "balance_loss_mlp": 1.15474319, + "epoch": 0.09619084263178146, + "flos": 385824909312.0, + "grad_norm": 0.09673466805801513, + "language_loss": 0.96238041, + "learning_rate": 0.0009885583773865422, + "loss": 0.97424543, + "num_input_tokens_seen": 41658544, + "router_z_loss_mlp": 0.31762695, + "step": 500, + "time_per_iteration": 2.402763843536377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140705, + "balance_loss_mlp": 1.1099968, + "epoch": 0.09638322431704502, + "flos": 533869401600.0, + "grad_norm": 0.08556524095898377, + "language_loss": 0.93457472, + "learning_rate": 0.0009884920163632524, + "loss": 0.94598186, + "num_input_tokens_seen": 41730736, + "router_z_loss_mlp": 0.30688477, + "step": 501, + "time_per_iteration": 2.7420296669006348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155853, + "balance_loss_mlp": 1.12373805, + "epoch": 0.09657560600230858, + "flos": 500426629632.0, + "grad_norm": 0.08462195742795481, + "language_loss": 0.95688182, + "learning_rate": 0.000988425465690543, + "loss": 0.96844035, + "num_input_tokens_seen": 41797824, + "router_z_loss_mlp": 0.32104492, + "step": 502, + "time_per_iteration": 2.5425736904144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163304, + "balance_loss_mlp": 1.13099861, + "epoch": 0.09676798768757214, + "flos": 528995197440.0, + "grad_norm": 0.07192036847451248, + "language_loss": 0.92721838, + "learning_rate": 0.0009883587253942505, + "loss": 0.93885148, + "num_input_tokens_seen": 41875520, + "router_z_loss_mlp": 0.32324219, + "step": 503, + "time_per_iteration": 2.8340742588043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188959, + "balance_loss_mlp": 1.15598607, + "epoch": 0.09696036937283571, + "flos": 463379857920.0, + "grad_norm": 0.0888689340699796, + "language_loss": 0.99166393, + "learning_rate": 0.0009882917955002862, + "loss": 1.00355351, + "num_input_tokens_seen": 41942224, + "router_z_loss_mlp": 0.32983398, + "step": 504, + "time_per_iteration": 2.560448169708252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147535, + "balance_loss_mlp": 1.11606395, + "epoch": 0.09715275105809927, + "flos": 534716204544.0, + "grad_norm": 0.07251663236407552, + "language_loss": 0.9150176, + "learning_rate": 0.0009882246760346343, + "loss": 0.92649293, + "num_input_tokens_seen": 42007552, + "router_z_loss_mlp": 0.31420898, + "step": 505, + "time_per_iteration": 2.6460299491882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114081, + "balance_loss_mlp": 1.10714495, + "epoch": 0.09734513274336283, + "flos": 454713979392.0, + "grad_norm": 0.10061537251918176, + "language_loss": 0.96100289, + "learning_rate": 0.0009881573670233533, + "loss": 0.97241098, + "num_input_tokens_seen": 42071760, + "router_z_loss_mlp": 0.33666992, + "step": 506, + "time_per_iteration": 2.5137040615081787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109977, + "balance_loss_mlp": 1.08029366, + "epoch": 0.09753751442862639, + "flos": 508551243264.0, + "grad_norm": 0.0762964042901656, + "language_loss": 0.91185808, + "learning_rate": 0.0009880898684925747, + "loss": 0.92295784, + "num_input_tokens_seen": 42140688, + "router_z_loss_mlp": 0.29663086, + "step": 507, + "time_per_iteration": 2.6571738719940186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110119, + "balance_loss_mlp": 1.07133985, + "epoch": 0.09772989611388996, + "flos": 484030425600.0, + "grad_norm": 0.07531505250568626, + "language_loss": 0.89554358, + "learning_rate": 0.0009880221804685037, + "loss": 0.90655547, + "num_input_tokens_seen": 42208544, + "router_z_loss_mlp": 0.29882812, + "step": 508, + "time_per_iteration": 2.596289873123169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01404721, + "balance_loss_mlp": 1.39136958, + "epoch": 0.09792227779915352, + "flos": 1565306339328.0, + "grad_norm": 0.10151454340945995, + "language_loss": 0.79344422, + "learning_rate": 0.000987954302977419, + "loss": 0.80749142, + "num_input_tokens_seen": 42426624, + "router_z_loss_mlp": 0.13378906, + "step": 509, + "time_per_iteration": 4.724441051483154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116621, + "balance_loss_mlp": 1.08655643, + "epoch": 0.09811465948441708, + "flos": 587529165312.0, + "grad_norm": 0.08257009801201759, + "language_loss": 0.94708043, + "learning_rate": 0.0009878862360456733, + "loss": 0.95824659, + "num_input_tokens_seen": 42494592, + "router_z_loss_mlp": 0.30029297, + "step": 510, + "time_per_iteration": 2.703011989593506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122701, + "balance_loss_mlp": 1.09406662, + "epoch": 0.09830704116968064, + "flos": 612719285760.0, + "grad_norm": 0.06191460590209878, + "language_loss": 0.88457662, + "learning_rate": 0.0009878179796996922, + "loss": 0.89580369, + "num_input_tokens_seen": 42564944, + "router_z_loss_mlp": 0.28637695, + "step": 511, + "time_per_iteration": 2.7212226390838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128587, + "balance_loss_mlp": 1.09885597, + "epoch": 0.09849942285494422, + "flos": 538528227840.0, + "grad_norm": 0.06874751685339883, + "language_loss": 0.9199326, + "learning_rate": 0.0009877495339659754, + "loss": 0.9312185, + "num_input_tokens_seen": 42645616, + "router_z_loss_mlp": 0.29724121, + "step": 512, + "time_per_iteration": 2.7520575523376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111609, + "balance_loss_mlp": 1.08826661, + "epoch": 0.09869180454020778, + "flos": 620193535488.0, + "grad_norm": 0.06953003964378547, + "language_loss": 0.87301105, + "learning_rate": 0.000987680898871096, + "loss": 0.88417196, + "num_input_tokens_seen": 42713632, + "router_z_loss_mlp": 0.27832031, + "step": 513, + "time_per_iteration": 2.7121992111206055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134292, + "balance_loss_mlp": 1.10401261, + "epoch": 0.09888418622547133, + "flos": 811375363584.0, + "grad_norm": 0.1024184057853134, + "language_loss": 0.87763435, + "learning_rate": 0.0009876120744417, + "loss": 0.88897729, + "num_input_tokens_seen": 42789088, + "router_z_loss_mlp": 0.30273438, + "step": 514, + "time_per_iteration": 2.971573829650879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123143, + "balance_loss_mlp": 1.09267306, + "epoch": 0.0990765679107349, + "flos": 535548450816.0, + "grad_norm": 0.06764912074049458, + "language_loss": 0.95588082, + "learning_rate": 0.0009875430607045078, + "loss": 0.9671123, + "num_input_tokens_seen": 42861168, + "router_z_loss_mlp": 0.3046875, + "step": 515, + "time_per_iteration": 2.6630361080169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108813, + "balance_loss_mlp": 1.08072746, + "epoch": 0.09926894959599845, + "flos": 587607740928.0, + "grad_norm": 0.06593749006245919, + "language_loss": 0.92788792, + "learning_rate": 0.000987473857686313, + "loss": 0.93897605, + "num_input_tokens_seen": 42934112, + "router_z_loss_mlp": 0.28076172, + "step": 516, + "time_per_iteration": 2.710068702697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121556, + "balance_loss_mlp": 1.09039485, + "epoch": 0.09946133128126203, + "flos": 640947409920.0, + "grad_norm": 0.08862761474564218, + "language_loss": 0.9451825, + "learning_rate": 0.0009874044654139824, + "loss": 0.95639801, + "num_input_tokens_seen": 43005248, + "router_z_loss_mlp": 0.3112793, + "step": 517, + "time_per_iteration": 2.729975461959839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117034, + "balance_loss_mlp": 1.08520555, + "epoch": 0.09965371296652559, + "flos": 465546327552.0, + "grad_norm": 0.09157938746936445, + "language_loss": 0.9250825, + "learning_rate": 0.0009873348839144563, + "loss": 0.93625283, + "num_input_tokens_seen": 43070576, + "router_z_loss_mlp": 0.31811523, + "step": 518, + "time_per_iteration": 2.5117127895355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112516, + "balance_loss_mlp": 1.09540534, + "epoch": 0.09984609465178915, + "flos": 483365505024.0, + "grad_norm": 0.07736257304557469, + "language_loss": 0.9674046, + "learning_rate": 0.000987265113214749, + "loss": 0.97865617, + "num_input_tokens_seen": 43138048, + "router_z_loss_mlp": 0.29711914, + "step": 519, + "time_per_iteration": 2.5816774368286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147544, + "balance_loss_mlp": 1.11421299, + "epoch": 0.1000384763370527, + "flos": 568764260352.0, + "grad_norm": 0.08763817133734854, + "language_loss": 0.96583092, + "learning_rate": 0.0009871951533419476, + "loss": 0.97730637, + "num_input_tokens_seen": 43207600, + "router_z_loss_mlp": 0.33325195, + "step": 520, + "time_per_iteration": 2.638664484024048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140597, + "balance_loss_mlp": 1.108482, + "epoch": 0.10023085802231628, + "flos": 545515057152.0, + "grad_norm": 0.10925869968591369, + "language_loss": 0.88377398, + "learning_rate": 0.0009871250043232132, + "loss": 0.89517999, + "num_input_tokens_seen": 43285104, + "router_z_loss_mlp": 0.32104492, + "step": 521, + "time_per_iteration": 2.70491886138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136934, + "balance_loss_mlp": 1.10555792, + "epoch": 0.10042323970757984, + "flos": 503208557568.0, + "grad_norm": 0.07694864026409119, + "language_loss": 0.87725985, + "learning_rate": 0.0009870546661857797, + "loss": 0.8886292, + "num_input_tokens_seen": 43353312, + "router_z_loss_mlp": 0.31347656, + "step": 522, + "time_per_iteration": 2.653456211090088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126678, + "balance_loss_mlp": 1.09380031, + "epoch": 0.1006156213928434, + "flos": 770084402688.0, + "grad_norm": 0.08414569380370593, + "language_loss": 0.95787346, + "learning_rate": 0.0009869841389569553, + "loss": 0.96914017, + "num_input_tokens_seen": 43427680, + "router_z_loss_mlp": 0.32885742, + "step": 523, + "time_per_iteration": 2.9442663192749023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116557, + "balance_loss_mlp": 1.08625388, + "epoch": 0.10080800307810696, + "flos": 489786338304.0, + "grad_norm": 0.06587351152736676, + "language_loss": 0.88897854, + "learning_rate": 0.0009869134226641206, + "loss": 0.90014416, + "num_input_tokens_seen": 43495200, + "router_z_loss_mlp": 0.30297852, + "step": 524, + "time_per_iteration": 2.5559868812561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110225, + "balance_loss_mlp": 1.07746601, + "epoch": 0.10100038476337053, + "flos": 454478252544.0, + "grad_norm": 0.09167866019985617, + "language_loss": 0.88383424, + "learning_rate": 0.0009868425173347303, + "loss": 0.89493656, + "num_input_tokens_seen": 43566256, + "router_z_loss_mlp": 0.32788086, + "step": 525, + "time_per_iteration": 2.645116090774536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111349, + "balance_loss_mlp": 1.08216143, + "epoch": 0.10119276644863409, + "flos": 556155348480.0, + "grad_norm": 0.07288604326691553, + "language_loss": 0.96749896, + "learning_rate": 0.0009867714229963125, + "loss": 0.97863394, + "num_input_tokens_seen": 43639696, + "router_z_loss_mlp": 0.31323242, + "step": 526, + "time_per_iteration": 2.730703592300415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106354, + "balance_loss_mlp": 1.07540703, + "epoch": 0.10138514813389765, + "flos": 515990587392.0, + "grad_norm": 0.07095113284061857, + "language_loss": 0.93916923, + "learning_rate": 0.000986700139676468, + "loss": 0.95023274, + "num_input_tokens_seen": 43703872, + "router_z_loss_mlp": 0.30932617, + "step": 527, + "time_per_iteration": 2.5836338996887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110446, + "balance_loss_mlp": 1.07833052, + "epoch": 0.10157752981916121, + "flos": 500323322880.0, + "grad_norm": 0.06933811905919615, + "language_loss": 0.91673893, + "learning_rate": 0.0009866286674028717, + "loss": 0.92784333, + "num_input_tokens_seen": 43774416, + "router_z_loss_mlp": 0.32104492, + "step": 528, + "time_per_iteration": 2.7084739208221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101831, + "balance_loss_mlp": 1.07100391, + "epoch": 0.10176991150442478, + "flos": 656444376576.0, + "grad_norm": 0.07189407365130172, + "language_loss": 0.88586026, + "learning_rate": 0.0009865570062032717, + "loss": 0.8968786, + "num_input_tokens_seen": 43853376, + "router_z_loss_mlp": 0.30810547, + "step": 529, + "time_per_iteration": 2.9141628742218018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103952, + "balance_loss_mlp": 1.07443571, + "epoch": 0.10196229318968834, + "flos": 572974953984.0, + "grad_norm": 0.06841647032337263, + "language_loss": 0.93659967, + "learning_rate": 0.0009864851561054893, + "loss": 0.94763923, + "num_input_tokens_seen": 43929632, + "router_z_loss_mlp": 0.29516602, + "step": 530, + "time_per_iteration": 2.7539894580841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090977, + "balance_loss_mlp": 1.06110358, + "epoch": 0.1021546748749519, + "flos": 517946061312.0, + "grad_norm": 0.07340246055426732, + "language_loss": 0.91722125, + "learning_rate": 0.0009864131171374191, + "loss": 0.92813098, + "num_input_tokens_seen": 44002144, + "router_z_loss_mlp": 0.29882812, + "step": 531, + "time_per_iteration": 2.6921956539154053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109944, + "balance_loss_mlp": 1.06749225, + "epoch": 0.10234705656021546, + "flos": 609470286336.0, + "grad_norm": 0.07867637119915549, + "language_loss": 0.91107762, + "learning_rate": 0.0009863408893270292, + "loss": 0.92207205, + "num_input_tokens_seen": 44078272, + "router_z_loss_mlp": 0.31933594, + "step": 532, + "time_per_iteration": 2.7911570072174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106913, + "balance_loss_mlp": 1.07396317, + "epoch": 0.10253943824547904, + "flos": 601473710592.0, + "grad_norm": 0.08191923529880715, + "language_loss": 0.86522454, + "learning_rate": 0.0009862684727023605, + "loss": 0.87629366, + "num_input_tokens_seen": 44152304, + "router_z_loss_mlp": 0.3293457, + "step": 533, + "time_per_iteration": 2.7452800273895264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105875, + "balance_loss_mlp": 1.07466602, + "epoch": 0.1027318199307426, + "flos": 662647011840.0, + "grad_norm": 0.07282647554851075, + "language_loss": 0.90315968, + "learning_rate": 0.0009861958672915283, + "loss": 0.91421843, + "num_input_tokens_seen": 44226720, + "router_z_loss_mlp": 0.31201172, + "step": 534, + "time_per_iteration": 2.8041269779205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096602, + "balance_loss_mlp": 1.0673244, + "epoch": 0.10292420161600616, + "flos": 682962928128.0, + "grad_norm": 0.058349855756870184, + "language_loss": 0.90126884, + "learning_rate": 0.0009861230731227201, + "loss": 0.9122349, + "num_input_tokens_seen": 44303600, + "router_z_loss_mlp": 0.29248047, + "step": 535, + "time_per_iteration": 2.8627805709838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108033, + "balance_loss_mlp": 1.07615674, + "epoch": 0.10311658330126972, + "flos": 490042414080.0, + "grad_norm": 0.091555564896082, + "language_loss": 0.91954774, + "learning_rate": 0.0009860500902241973, + "loss": 0.93062806, + "num_input_tokens_seen": 44370960, + "router_z_loss_mlp": 0.31884766, + "step": 536, + "time_per_iteration": 2.6052157878875732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120335, + "balance_loss_mlp": 1.08800602, + "epoch": 0.10330896498653329, + "flos": 431508446208.0, + "grad_norm": 0.0585767653270487, + "language_loss": 0.96574026, + "learning_rate": 0.0009859769186242942, + "loss": 0.97694361, + "num_input_tokens_seen": 44435584, + "router_z_loss_mlp": 0.32324219, + "step": 537, + "time_per_iteration": 2.51180362701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116517, + "balance_loss_mlp": 1.08571362, + "epoch": 0.10350134667179685, + "flos": 549330052608.0, + "grad_norm": 0.0744119924563098, + "language_loss": 0.8926785, + "learning_rate": 0.0009859035583514187, + "loss": 0.90384364, + "num_input_tokens_seen": 44505456, + "router_z_loss_mlp": 0.30834961, + "step": 538, + "time_per_iteration": 2.6369993686676025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146613, + "balance_loss_mlp": 1.11380613, + "epoch": 0.10369372835706041, + "flos": 640327569408.0, + "grad_norm": 0.09976070350989504, + "language_loss": 0.90389431, + "learning_rate": 0.0009858300094340517, + "loss": 0.91536051, + "num_input_tokens_seen": 44580208, + "router_z_loss_mlp": 0.328125, + "step": 539, + "time_per_iteration": 2.7695086002349854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150737, + "balance_loss_mlp": 1.11838388, + "epoch": 0.10388611004232397, + "flos": 521500598784.0, + "grad_norm": 0.08771902350159133, + "language_loss": 0.85304511, + "learning_rate": 0.0009857562719007473, + "loss": 0.8645525, + "num_input_tokens_seen": 44646576, + "router_z_loss_mlp": 0.32324219, + "step": 540, + "time_per_iteration": 2.59881329536438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144681, + "balance_loss_mlp": 1.11320961, + "epoch": 0.10407849172758753, + "flos": 702111946752.0, + "grad_norm": 0.07496368213999542, + "language_loss": 0.88249481, + "learning_rate": 0.0009856823457801331, + "loss": 0.89394164, + "num_input_tokens_seen": 44726752, + "router_z_loss_mlp": 0.31494141, + "step": 541, + "time_per_iteration": 2.873481035232544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119735, + "balance_loss_mlp": 1.08738184, + "epoch": 0.1042708734128511, + "flos": 502652736000.0, + "grad_norm": 0.06973546911765124, + "language_loss": 0.94998306, + "learning_rate": 0.00098560823110091, + "loss": 0.96118045, + "num_input_tokens_seen": 44795824, + "router_z_loss_mlp": 0.32373047, + "step": 542, + "time_per_iteration": 2.661374807357788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118044, + "balance_loss_mlp": 1.08757377, + "epoch": 0.10446325509811466, + "flos": 485331153408.0, + "grad_norm": 0.0792045331206184, + "language_loss": 0.95517921, + "learning_rate": 0.000985533927891851, + "loss": 0.96635967, + "num_input_tokens_seen": 44868496, + "router_z_loss_mlp": 0.30419922, + "step": 543, + "time_per_iteration": 2.7264697551727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096256, + "balance_loss_mlp": 1.06502366, + "epoch": 0.10465563678337822, + "flos": 568365590016.0, + "grad_norm": 0.0919664039836503, + "language_loss": 0.93718112, + "learning_rate": 0.0009854594361818044, + "loss": 0.94814372, + "num_input_tokens_seen": 44939888, + "router_z_loss_mlp": 0.31201172, + "step": 544, + "time_per_iteration": 2.6869821548461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099422, + "balance_loss_mlp": 1.0683322, + "epoch": 0.10484801846864178, + "flos": 625806853632.0, + "grad_norm": 0.1054615502202609, + "language_loss": 0.927598, + "learning_rate": 0.0009853847559996897, + "loss": 0.9385922, + "num_input_tokens_seen": 45012720, + "router_z_loss_mlp": 0.31103516, + "step": 545, + "time_per_iteration": 2.7953526973724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100313, + "balance_loss_mlp": 1.06772113, + "epoch": 0.10504040015390535, + "flos": 743063874048.0, + "grad_norm": 0.0768702593450629, + "language_loss": 0.92008656, + "learning_rate": 0.0009853098873745, + "loss": 0.93108964, + "num_input_tokens_seen": 45093744, + "router_z_loss_mlp": 0.32592773, + "step": 546, + "time_per_iteration": 3.0344293117523193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106321, + "balance_loss_mlp": 1.07430172, + "epoch": 0.10523278183916891, + "flos": 586382616576.0, + "grad_norm": 0.072035501246702, + "language_loss": 0.90983582, + "learning_rate": 0.0009852348303353027, + "loss": 0.92089903, + "num_input_tokens_seen": 45172784, + "router_z_loss_mlp": 0.32006836, + "step": 547, + "time_per_iteration": 2.7647972106933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110403, + "balance_loss_mlp": 1.07100892, + "epoch": 0.10542516352443247, + "flos": 869270552064.0, + "grad_norm": 0.07817580313906373, + "language_loss": 0.84611928, + "learning_rate": 0.000985159584911237, + "loss": 0.85715961, + "num_input_tokens_seen": 45255600, + "router_z_loss_mlp": 0.33007812, + "step": 548, + "time_per_iteration": 3.143122434616089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104478, + "balance_loss_mlp": 1.07212472, + "epoch": 0.10561754520969603, + "flos": 505182970368.0, + "grad_norm": 0.08898596974063745, + "language_loss": 0.91126573, + "learning_rate": 0.0009850841511315162, + "loss": 0.92231047, + "num_input_tokens_seen": 45325072, + "router_z_loss_mlp": 0.32348633, + "step": 549, + "time_per_iteration": 2.6164846420288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112982, + "balance_loss_mlp": 1.07946038, + "epoch": 0.1058099268949596, + "flos": 559690947072.0, + "grad_norm": 0.06224197989448247, + "language_loss": 0.92054999, + "learning_rate": 0.0009850085290254256, + "loss": 0.93167984, + "num_input_tokens_seen": 45401440, + "router_z_loss_mlp": 0.33520508, + "step": 550, + "time_per_iteration": 2.7473480701446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110676, + "balance_loss_mlp": 1.07431078, + "epoch": 0.10600230858022316, + "flos": 561773048832.0, + "grad_norm": 0.05678957528127819, + "language_loss": 0.88957977, + "learning_rate": 0.0009849327186223246, + "loss": 0.90064728, + "num_input_tokens_seen": 45479264, + "router_z_loss_mlp": 0.32446289, + "step": 551, + "time_per_iteration": 2.805126905441284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094878, + "balance_loss_mlp": 1.06297779, + "epoch": 0.10619469026548672, + "flos": 494079989760.0, + "grad_norm": 0.07906939671673464, + "language_loss": 0.95596325, + "learning_rate": 0.000984856719951646, + "loss": 0.96691203, + "num_input_tokens_seen": 45547328, + "router_z_loss_mlp": 0.31860352, + "step": 552, + "time_per_iteration": 2.5688273906707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105536, + "balance_loss_mlp": 1.07370734, + "epoch": 0.10638707195075028, + "flos": 675843678720.0, + "grad_norm": 0.06469368191660979, + "language_loss": 0.93170857, + "learning_rate": 0.0009847805330428943, + "loss": 0.94276392, + "num_input_tokens_seen": 45631152, + "router_z_loss_mlp": 0.31811523, + "step": 553, + "time_per_iteration": 2.8858227729797363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105116, + "balance_loss_mlp": 1.07080746, + "epoch": 0.10657945363601386, + "flos": 487811925504.0, + "grad_norm": 0.07365688544553677, + "language_loss": 0.94454086, + "learning_rate": 0.0009847041579256481, + "loss": 0.95559192, + "num_input_tokens_seen": 45698208, + "router_z_loss_mlp": 0.34326172, + "step": 554, + "time_per_iteration": 2.5912039279937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114154, + "balance_loss_mlp": 1.08158636, + "epoch": 0.10677183532127742, + "flos": 482706376704.0, + "grad_norm": 0.06731486395760358, + "language_loss": 0.95310724, + "learning_rate": 0.0009846275946295592, + "loss": 0.96424878, + "num_input_tokens_seen": 45766640, + "router_z_loss_mlp": 0.32568359, + "step": 555, + "time_per_iteration": 2.6071619987487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120557, + "balance_loss_mlp": 1.08755958, + "epoch": 0.10696421700654098, + "flos": 655917668352.0, + "grad_norm": 0.06239681935918944, + "language_loss": 0.88169777, + "learning_rate": 0.0009845508431843518, + "loss": 0.89290333, + "num_input_tokens_seen": 45851408, + "router_z_loss_mlp": 0.32983398, + "step": 556, + "time_per_iteration": 2.9906973838806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122479, + "balance_loss_mlp": 1.08986306, + "epoch": 0.10715659869180454, + "flos": 567483881472.0, + "grad_norm": 0.06803394611182671, + "language_loss": 0.89010829, + "learning_rate": 0.0009844739036198233, + "loss": 0.90133309, + "num_input_tokens_seen": 45919824, + "router_z_loss_mlp": 0.32592773, + "step": 557, + "time_per_iteration": 2.6462793350219727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113246, + "balance_loss_mlp": 1.09927225, + "epoch": 0.10734898037706811, + "flos": 540432829440.0, + "grad_norm": 0.0683091886411484, + "language_loss": 0.96000761, + "learning_rate": 0.0009843967759658448, + "loss": 0.97133219, + "num_input_tokens_seen": 45991024, + "router_z_loss_mlp": 0.33203125, + "step": 558, + "time_per_iteration": 2.664320707321167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01369087, + "balance_loss_mlp": 1.3546865, + "epoch": 0.10754136206233167, + "flos": 1475870008320.0, + "grad_norm": 0.12144998025248735, + "language_loss": 0.72767758, + "learning_rate": 0.0009843194602523592, + "loss": 0.74136841, + "num_input_tokens_seen": 46212736, + "router_z_loss_mlp": 0.14355469, + "step": 559, + "time_per_iteration": 4.836310148239136 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124853, + "balance_loss_mlp": 1.0925231, + "epoch": 0.10773374374759523, + "flos": 512155243008.0, + "grad_norm": 0.06725764235558847, + "language_loss": 0.96045369, + "learning_rate": 0.000984241956509384, + "loss": 0.97170222, + "num_input_tokens_seen": 46283920, + "router_z_loss_mlp": 0.32324219, + "step": 560, + "time_per_iteration": 2.7409372329711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134795, + "balance_loss_mlp": 1.10005689, + "epoch": 0.10792612543285879, + "flos": 496261016064.0, + "grad_norm": 0.08502468521942065, + "language_loss": 0.91520619, + "learning_rate": 0.0009841642647670078, + "loss": 0.92655414, + "num_input_tokens_seen": 46349664, + "router_z_loss_mlp": 0.34741211, + "step": 561, + "time_per_iteration": 2.5360167026519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134435, + "balance_loss_mlp": 1.10050821, + "epoch": 0.10811850711812235, + "flos": 735131317248.0, + "grad_norm": 0.08550854990342285, + "language_loss": 0.86122006, + "learning_rate": 0.0009840863850553944, + "loss": 0.87256444, + "num_input_tokens_seen": 46432688, + "router_z_loss_mlp": 0.33911133, + "step": 562, + "time_per_iteration": 3.0013930797576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118751, + "balance_loss_mlp": 1.08604038, + "epoch": 0.10831088880338592, + "flos": 611257024512.0, + "grad_norm": 0.07414056330929218, + "language_loss": 0.92513216, + "learning_rate": 0.0009840083174047782, + "loss": 0.93631971, + "num_input_tokens_seen": 46507216, + "router_z_loss_mlp": 0.3269043, + "step": 563, + "time_per_iteration": 2.761746883392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125103, + "balance_loss_mlp": 1.09353685, + "epoch": 0.10850327048864948, + "flos": 556022928384.0, + "grad_norm": 0.06849160846851732, + "language_loss": 0.86520386, + "learning_rate": 0.0009839300618454685, + "loss": 0.87645483, + "num_input_tokens_seen": 46590464, + "router_z_loss_mlp": 0.31518555, + "step": 564, + "time_per_iteration": 2.833545684814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124691, + "balance_loss_mlp": 1.09291005, + "epoch": 0.10869565217391304, + "flos": 602902476288.0, + "grad_norm": 0.06688991061359367, + "language_loss": 0.92471159, + "learning_rate": 0.0009838516184078466, + "loss": 0.9359585, + "num_input_tokens_seen": 46666240, + "router_z_loss_mlp": 0.31762695, + "step": 565, + "time_per_iteration": 2.838482618331909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112559, + "balance_loss_mlp": 1.09345102, + "epoch": 0.1088880338591766, + "flos": 525922288128.0, + "grad_norm": 0.08266802783800845, + "language_loss": 0.89073956, + "learning_rate": 0.0009837729871223669, + "loss": 0.90199542, + "num_input_tokens_seen": 46734288, + "router_z_loss_mlp": 0.3215332, + "step": 566, + "time_per_iteration": 2.6670589447021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134729, + "balance_loss_mlp": 1.10073042, + "epoch": 0.10908041554444017, + "flos": 619986921984.0, + "grad_norm": 0.06816497946354988, + "language_loss": 0.89503658, + "learning_rate": 0.0009836941680195568, + "loss": 0.90638387, + "num_input_tokens_seen": 46809920, + "router_z_loss_mlp": 0.34033203, + "step": 567, + "time_per_iteration": 2.7894582748413086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131677, + "balance_loss_mlp": 1.09691525, + "epoch": 0.10927279722970373, + "flos": 897740195328.0, + "grad_norm": 0.07371226629870802, + "language_loss": 0.8534497, + "learning_rate": 0.0009836151611300166, + "loss": 0.86476642, + "num_input_tokens_seen": 46889984, + "router_z_loss_mlp": 0.34765625, + "step": 568, + "time_per_iteration": 3.204500913619995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116051, + "balance_loss_mlp": 1.08467555, + "epoch": 0.10946517891496729, + "flos": 528408852480.0, + "grad_norm": 0.061952855977424344, + "language_loss": 0.96103537, + "learning_rate": 0.0009835359664844194, + "loss": 0.97219586, + "num_input_tokens_seen": 46959536, + "router_z_loss_mlp": 0.3137207, + "step": 569, + "time_per_iteration": 2.6154720783233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124163, + "balance_loss_mlp": 1.11014414, + "epoch": 0.10965756060023085, + "flos": 1559944714752.0, + "grad_norm": 0.03358522647050957, + "language_loss": 0.81036806, + "learning_rate": 0.0009834565841135114, + "loss": 0.82160974, + "num_input_tokens_seen": 47196960, + "router_z_loss_mlp": 0.140625, + "step": 570, + "time_per_iteration": 4.907090187072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112487, + "balance_loss_mlp": 1.09406638, + "epoch": 0.10984994228549443, + "flos": 512820163584.0, + "grad_norm": 0.08674533322611513, + "language_loss": 0.9339065, + "learning_rate": 0.0009833770140481118, + "loss": 0.9451552, + "num_input_tokens_seen": 47266560, + "router_z_loss_mlp": 0.30786133, + "step": 571, + "time_per_iteration": 2.694821357727051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121358, + "balance_loss_mlp": 1.09072113, + "epoch": 0.11004232397075799, + "flos": 954314307072.0, + "grad_norm": 0.07582699316256973, + "language_loss": 0.84126109, + "learning_rate": 0.000983297256319112, + "loss": 0.85247469, + "num_input_tokens_seen": 47348512, + "router_z_loss_mlp": 0.30664062, + "step": 572, + "time_per_iteration": 3.208728313446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144326, + "balance_loss_mlp": 1.11097169, + "epoch": 0.11023470565602154, + "flos": 487921024512.0, + "grad_norm": 0.07530566153242002, + "language_loss": 0.8789041, + "learning_rate": 0.000983217310957477, + "loss": 0.89034736, + "num_input_tokens_seen": 47425392, + "router_z_loss_mlp": 0.33349609, + "step": 573, + "time_per_iteration": 2.7521331310272217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113474, + "balance_loss_mlp": 1.1014812, + "epoch": 0.1104270873412851, + "flos": 655521970176.0, + "grad_norm": 0.08427122985019045, + "language_loss": 0.91161472, + "learning_rate": 0.000983137177994244, + "loss": 0.92296207, + "num_input_tokens_seen": 47502336, + "router_z_loss_mlp": 0.33300781, + "step": 574, + "time_per_iteration": 2.869795083999634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105984, + "balance_loss_mlp": 1.0752039, + "epoch": 0.11061946902654868, + "flos": 723097165824.0, + "grad_norm": 0.0803000190442887, + "language_loss": 0.87202144, + "learning_rate": 0.0009830568574605235, + "loss": 0.88308132, + "num_input_tokens_seen": 47583552, + "router_z_loss_mlp": 0.30737305, + "step": 575, + "time_per_iteration": 2.952505111694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111674, + "balance_loss_mlp": 1.07963109, + "epoch": 0.11081185071181224, + "flos": 835113397248.0, + "grad_norm": 0.07764025760375837, + "language_loss": 0.89234924, + "learning_rate": 0.0009829763493874992, + "loss": 0.90346599, + "num_input_tokens_seen": 47663440, + "router_z_loss_mlp": 0.3203125, + "step": 576, + "time_per_iteration": 3.0367727279663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110641, + "balance_loss_mlp": 1.07508206, + "epoch": 0.1110042323970758, + "flos": 608776252416.0, + "grad_norm": 0.06795308301133055, + "language_loss": 0.94366598, + "learning_rate": 0.0009828956538064264, + "loss": 0.95473009, + "num_input_tokens_seen": 47741920, + "router_z_loss_mlp": 0.31347656, + "step": 577, + "time_per_iteration": 2.783268928527832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091394, + "balance_loss_mlp": 1.0610671, + "epoch": 0.11119661408233936, + "flos": 595643604480.0, + "grad_norm": 0.0662915232098912, + "language_loss": 0.9183138, + "learning_rate": 0.0009828147707486344, + "loss": 0.92922771, + "num_input_tokens_seen": 47815136, + "router_z_loss_mlp": 0.30297852, + "step": 578, + "time_per_iteration": 2.6628670692443848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092993, + "balance_loss_mlp": 1.06109214, + "epoch": 0.11138899576760293, + "flos": 555573385728.0, + "grad_norm": 0.07355059798421615, + "language_loss": 0.87444091, + "learning_rate": 0.0009827337002455245, + "loss": 0.88537085, + "num_input_tokens_seen": 47881360, + "router_z_loss_mlp": 0.31884766, + "step": 579, + "time_per_iteration": 2.616842031478882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087398, + "balance_loss_mlp": 1.05857313, + "epoch": 0.11158137745286649, + "flos": 689418667008.0, + "grad_norm": 0.05531737995895799, + "language_loss": 0.89474124, + "learning_rate": 0.0009826524423285712, + "loss": 0.90561521, + "num_input_tokens_seen": 47962720, + "router_z_loss_mlp": 0.28808594, + "step": 580, + "time_per_iteration": 2.896409749984741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093471, + "balance_loss_mlp": 1.06393051, + "epoch": 0.11177375913813005, + "flos": 762688728576.0, + "grad_norm": 0.06807232662928764, + "language_loss": 0.9046967, + "learning_rate": 0.0009825709970293218, + "loss": 0.91563141, + "num_input_tokens_seen": 48035472, + "router_z_loss_mlp": 0.2956543, + "step": 581, + "time_per_iteration": 2.8843319416046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096208, + "balance_loss_mlp": 1.0669775, + "epoch": 0.11196614082339361, + "flos": 806211588096.0, + "grad_norm": 0.07053725402235117, + "language_loss": 0.96166003, + "learning_rate": 0.0009824893643793956, + "loss": 0.9726221, + "num_input_tokens_seen": 48116944, + "router_z_loss_mlp": 0.29248047, + "step": 582, + "time_per_iteration": 3.04577898979187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104715, + "balance_loss_mlp": 1.07305288, + "epoch": 0.11215852250865718, + "flos": 558350931456.0, + "grad_norm": 0.10752491555358674, + "language_loss": 0.89033759, + "learning_rate": 0.0009824075444104857, + "loss": 0.90138471, + "num_input_tokens_seen": 48187808, + "router_z_loss_mlp": 0.31689453, + "step": 583, + "time_per_iteration": 2.682020902633667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125233, + "balance_loss_mlp": 1.09497714, + "epoch": 0.11235090419392074, + "flos": 513322140672.0, + "grad_norm": 0.06606619546840543, + "language_loss": 0.94941097, + "learning_rate": 0.000982325537154357, + "loss": 0.9606632, + "num_input_tokens_seen": 48254464, + "router_z_loss_mlp": 0.30224609, + "step": 584, + "time_per_iteration": 2.577632427215576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122311, + "balance_loss_mlp": 1.09045827, + "epoch": 0.1125432858791843, + "flos": 491209311744.0, + "grad_norm": 0.07452844115700766, + "language_loss": 0.95190644, + "learning_rate": 0.0009822433426428484, + "loss": 0.96312958, + "num_input_tokens_seen": 48318784, + "router_z_loss_mlp": 0.31860352, + "step": 585, + "time_per_iteration": 2.560591220855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126565, + "balance_loss_mlp": 1.09280539, + "epoch": 0.11273566756444786, + "flos": 510476193792.0, + "grad_norm": 0.11434848401200806, + "language_loss": 0.87964213, + "learning_rate": 0.0009821609609078697, + "loss": 0.89090776, + "num_input_tokens_seen": 48389248, + "router_z_loss_mlp": 0.3371582, + "step": 586, + "time_per_iteration": 2.633925437927246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118464, + "balance_loss_mlp": 1.08785152, + "epoch": 0.11292804924971142, + "flos": 622149009408.0, + "grad_norm": 0.08000190427267627, + "language_loss": 0.905334, + "learning_rate": 0.0009820783919814045, + "loss": 0.91651857, + "num_input_tokens_seen": 48463312, + "router_z_loss_mlp": 0.3059082, + "step": 587, + "time_per_iteration": 2.806704044342041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111289, + "balance_loss_mlp": 1.07857847, + "epoch": 0.113120430934975, + "flos": 477811823616.0, + "grad_norm": 0.09357252991594707, + "language_loss": 0.83955467, + "learning_rate": 0.0009819956358955095, + "loss": 0.8506676, + "num_input_tokens_seen": 48531856, + "router_z_loss_mlp": 0.32714844, + "step": 588, + "time_per_iteration": 2.5903711318969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109097, + "balance_loss_mlp": 1.07455039, + "epoch": 0.11331281262023855, + "flos": 466801975296.0, + "grad_norm": 0.06610764616840299, + "language_loss": 0.85348701, + "learning_rate": 0.0009819126926823127, + "loss": 0.86457801, + "num_input_tokens_seen": 48596640, + "router_z_loss_mlp": 0.34570312, + "step": 589, + "time_per_iteration": 2.5726494789123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108183, + "balance_loss_mlp": 1.07535291, + "epoch": 0.11350519430550211, + "flos": 650164727808.0, + "grad_norm": 0.06035980490561805, + "language_loss": 0.87806922, + "learning_rate": 0.000981829562374016, + "loss": 0.8891511, + "num_input_tokens_seen": 48669648, + "router_z_loss_mlp": 0.328125, + "step": 590, + "time_per_iteration": 2.7960643768310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112987, + "balance_loss_mlp": 1.08041859, + "epoch": 0.11369757599076567, + "flos": 557547798528.0, + "grad_norm": 0.08830164474684658, + "language_loss": 0.98550045, + "learning_rate": 0.0009817462450028933, + "loss": 0.99663031, + "num_input_tokens_seen": 48737392, + "router_z_loss_mlp": 0.32568359, + "step": 591, + "time_per_iteration": 2.654860734939575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107596, + "balance_loss_mlp": 1.07526684, + "epoch": 0.11388995767602925, + "flos": 570774988800.0, + "grad_norm": 0.06245390963608315, + "language_loss": 0.86587834, + "learning_rate": 0.0009816627406012916, + "loss": 0.87695432, + "num_input_tokens_seen": 48817136, + "router_z_loss_mlp": 0.32348633, + "step": 592, + "time_per_iteration": 2.8017733097076416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101808, + "balance_loss_mlp": 1.07074225, + "epoch": 0.1140823393612928, + "flos": 740069540352.0, + "grad_norm": 0.06581053360364857, + "language_loss": 0.8595314, + "learning_rate": 0.0009815790492016295, + "loss": 0.87054944, + "num_input_tokens_seen": 48895808, + "router_z_loss_mlp": 0.31030273, + "step": 593, + "time_per_iteration": 2.9602174758911133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097875, + "balance_loss_mlp": 1.06666636, + "epoch": 0.11427472104655637, + "flos": 698694211584.0, + "grad_norm": 0.07124053574400792, + "language_loss": 0.87982339, + "learning_rate": 0.0009814951708363993, + "loss": 0.89080215, + "num_input_tokens_seen": 48967456, + "router_z_loss_mlp": 0.31201172, + "step": 594, + "time_per_iteration": 2.818460702896118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167391, + "balance_loss_mlp": 1.15413451, + "epoch": 0.11446710273181993, + "flos": 1476387952128.0, + "grad_norm": 0.04038129773095179, + "language_loss": 0.77990985, + "learning_rate": 0.0009814111055381654, + "loss": 0.79158378, + "num_input_tokens_seen": 49193152, + "router_z_loss_mlp": 0.1328125, + "step": 595, + "time_per_iteration": 4.776912450790405 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110386, + "balance_loss_mlp": 1.07250798, + "epoch": 0.1146594844170835, + "flos": 494641603584.0, + "grad_norm": 0.1404346857169784, + "language_loss": 0.89489102, + "learning_rate": 0.0009813268533395648, + "loss": 0.90592968, + "num_input_tokens_seen": 49260960, + "router_z_loss_mlp": 0.3137207, + "step": 596, + "time_per_iteration": 2.562816858291626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115324, + "balance_loss_mlp": 1.08344746, + "epoch": 0.11485186610234706, + "flos": 474596319744.0, + "grad_norm": 0.07456374098915484, + "language_loss": 0.89145029, + "learning_rate": 0.0009812424142733073, + "loss": 0.90260351, + "num_input_tokens_seen": 49327616, + "router_z_loss_mlp": 0.31884766, + "step": 597, + "time_per_iteration": 2.5198655128479004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123971, + "balance_loss_mlp": 1.0946219, + "epoch": 0.11504424778761062, + "flos": 730858014720.0, + "grad_norm": 0.05033183127205697, + "language_loss": 0.86898923, + "learning_rate": 0.000981157788372175, + "loss": 0.88022888, + "num_input_tokens_seen": 49412864, + "router_z_loss_mlp": 0.29345703, + "step": 598, + "time_per_iteration": 3.004558563232422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155063, + "balance_loss_mlp": 1.12290049, + "epoch": 0.11523662947287418, + "flos": 545539788288.0, + "grad_norm": 0.07554757352201513, + "language_loss": 0.90216064, + "learning_rate": 0.0009810729756690223, + "loss": 0.91371131, + "num_input_tokens_seen": 49483584, + "router_z_loss_mlp": 0.3215332, + "step": 599, + "time_per_iteration": 2.7165520191192627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149643, + "balance_loss_mlp": 1.11790919, + "epoch": 0.11542901115813775, + "flos": 774737436672.0, + "grad_norm": 0.08801397326806587, + "language_loss": 0.92855275, + "learning_rate": 0.0009809879761967766, + "loss": 0.94004917, + "num_input_tokens_seen": 49563568, + "router_z_loss_mlp": 0.31738281, + "step": 600, + "time_per_iteration": 2.9548492431640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115619, + "balance_loss_mlp": 1.12004542, + "epoch": 0.11562139284340131, + "flos": 730585972224.0, + "grad_norm": 0.08285308963026158, + "language_loss": 0.87716347, + "learning_rate": 0.0009809027899884378, + "loss": 0.8887254, + "num_input_tokens_seen": 49640800, + "router_z_loss_mlp": 0.36157227, + "step": 601, + "time_per_iteration": 2.9107346534729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131924, + "balance_loss_mlp": 1.10085821, + "epoch": 0.11581377452866487, + "flos": 535589148672.0, + "grad_norm": 0.07059046613839054, + "language_loss": 0.89834028, + "learning_rate": 0.0009808174170770779, + "loss": 0.90965956, + "num_input_tokens_seen": 49721872, + "router_z_loss_mlp": 0.31079102, + "step": 602, + "time_per_iteration": 2.79127836227417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217718, + "balance_loss_mlp": 1.20541608, + "epoch": 0.11600615621392843, + "flos": 1554968613888.0, + "grad_norm": 0.07528653751738872, + "language_loss": 0.84898245, + "learning_rate": 0.0009807318574958418, + "loss": 0.86115962, + "num_input_tokens_seen": 49951472, + "router_z_loss_mlp": 0.12304688, + "step": 603, + "time_per_iteration": 4.862261772155762 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103115, + "balance_loss_mlp": 1.07238269, + "epoch": 0.116198537899192, + "flos": 537178037760.0, + "grad_norm": 0.08106568577848162, + "language_loss": 0.94434869, + "learning_rate": 0.0009806461112779462, + "loss": 0.95537978, + "num_input_tokens_seen": 50021136, + "router_z_loss_mlp": 0.30737305, + "step": 604, + "time_per_iteration": 2.600008249282837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097342, + "balance_loss_mlp": 1.06427336, + "epoch": 0.11639091958445556, + "flos": 453970483200.0, + "grad_norm": 0.09761910402267754, + "language_loss": 0.89590895, + "learning_rate": 0.0009805601784566814, + "loss": 0.90688241, + "num_input_tokens_seen": 50083888, + "router_z_loss_mlp": 0.33056641, + "step": 605, + "time_per_iteration": 2.4687013626098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097807, + "balance_loss_mlp": 1.06635928, + "epoch": 0.11658330126971912, + "flos": 554815332864.0, + "grad_norm": 0.0628453025897625, + "language_loss": 0.96235836, + "learning_rate": 0.0009804740590654089, + "loss": 0.97333646, + "num_input_tokens_seen": 50151744, + "router_z_loss_mlp": 0.31469727, + "step": 606, + "time_per_iteration": 2.654134750366211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109435, + "balance_loss_mlp": 1.0789417, + "epoch": 0.11677568295498268, + "flos": 716025968640.0, + "grad_norm": 0.07837472156111998, + "language_loss": 0.90884066, + "learning_rate": 0.0009803877531375635, + "loss": 0.91993499, + "num_input_tokens_seen": 50221248, + "router_z_loss_mlp": 0.30493164, + "step": 607, + "time_per_iteration": 2.825778007507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112529, + "balance_loss_mlp": 1.08074808, + "epoch": 0.11696806464024626, + "flos": 609474668544.0, + "grad_norm": 0.07263848878870109, + "language_loss": 0.91923869, + "learning_rate": 0.0009803012607066523, + "loss": 0.93036401, + "num_input_tokens_seen": 50293792, + "router_z_loss_mlp": 0.31787109, + "step": 608, + "time_per_iteration": 2.721005916595459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101062, + "balance_loss_mlp": 1.06980491, + "epoch": 0.11716044632550981, + "flos": 520127087616.0, + "grad_norm": 0.06980646294906427, + "language_loss": 0.9077962, + "learning_rate": 0.0009802145818062543, + "loss": 0.91880679, + "num_input_tokens_seen": 50367760, + "router_z_loss_mlp": 0.31225586, + "step": 609, + "time_per_iteration": 2.707643985748291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102401, + "balance_loss_mlp": 1.07035792, + "epoch": 0.11735282801077337, + "flos": 507246133248.0, + "grad_norm": 0.07162886221417876, + "language_loss": 0.9293434, + "learning_rate": 0.0009801277164700212, + "loss": 0.9403674, + "num_input_tokens_seen": 50435664, + "router_z_loss_mlp": 0.3203125, + "step": 610, + "time_per_iteration": 2.6389639377593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094537, + "balance_loss_mlp": 1.06323278, + "epoch": 0.11754520969603693, + "flos": 686339965440.0, + "grad_norm": 0.07220465483683103, + "language_loss": 0.90727574, + "learning_rate": 0.0009800406647316776, + "loss": 0.91822106, + "num_input_tokens_seen": 50514144, + "router_z_loss_mlp": 0.31274414, + "step": 611, + "time_per_iteration": 2.8033382892608643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066854, + "balance_loss_mlp": 1.05369329, + "epoch": 0.1177375913813005, + "flos": 1541673022464.0, + "grad_norm": 0.030783707978337852, + "language_loss": 0.76914459, + "learning_rate": 0.0009799534266250196, + "loss": 0.77981311, + "num_input_tokens_seen": 50738448, + "router_z_loss_mlp": 0.13183594, + "step": 612, + "time_per_iteration": 4.777275562286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116404, + "balance_loss_mlp": 1.08307314, + "epoch": 0.11792997306656407, + "flos": 520269682176.0, + "grad_norm": 0.07589987368124408, + "language_loss": 0.8961159, + "learning_rate": 0.000979866002183916, + "loss": 0.90727997, + "num_input_tokens_seen": 50809328, + "router_z_loss_mlp": 0.33325195, + "step": 613, + "time_per_iteration": 2.6848883628845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109453, + "balance_loss_mlp": 1.07719529, + "epoch": 0.11812235475182763, + "flos": 665980379136.0, + "grad_norm": 0.08667718058784188, + "language_loss": 0.91197205, + "learning_rate": 0.0009797783914423082, + "loss": 0.92306662, + "num_input_tokens_seen": 50887728, + "router_z_loss_mlp": 0.32250977, + "step": 614, + "time_per_iteration": 2.832414388656616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105189, + "balance_loss_mlp": 1.07140493, + "epoch": 0.11831473643709119, + "flos": 621021399552.0, + "grad_norm": 0.06050640051516142, + "language_loss": 0.85425436, + "learning_rate": 0.0009796905944342094, + "loss": 0.86530626, + "num_input_tokens_seen": 50966160, + "router_z_loss_mlp": 0.33813477, + "step": 615, + "time_per_iteration": 2.8220455646514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112849, + "balance_loss_mlp": 1.07913685, + "epoch": 0.11850711812235475, + "flos": 456438108672.0, + "grad_norm": 0.0714748534502384, + "language_loss": 0.893188, + "learning_rate": 0.0009796026111937057, + "loss": 0.90431643, + "num_input_tokens_seen": 51035712, + "router_z_loss_mlp": 0.3371582, + "step": 616, + "time_per_iteration": 2.590566873550415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102524, + "balance_loss_mlp": 1.07005119, + "epoch": 0.11869949980761832, + "flos": 513598565376.0, + "grad_norm": 0.06492309219220607, + "language_loss": 0.89778733, + "learning_rate": 0.0009795144417549552, + "loss": 0.90881252, + "num_input_tokens_seen": 51108656, + "router_z_loss_mlp": 0.32470703, + "step": 617, + "time_per_iteration": 2.672914505004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109626, + "balance_loss_mlp": 1.0773685, + "epoch": 0.11889188149288188, + "flos": 534732171264.0, + "grad_norm": 0.057544425945024125, + "language_loss": 0.90660846, + "learning_rate": 0.0009794260861521883, + "loss": 0.9177047, + "num_input_tokens_seen": 51185552, + "router_z_loss_mlp": 0.32250977, + "step": 618, + "time_per_iteration": 2.817354202270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102516, + "balance_loss_mlp": 1.07009149, + "epoch": 0.11908426317814544, + "flos": 498344527872.0, + "grad_norm": 0.0773697745436404, + "language_loss": 0.87738883, + "learning_rate": 0.0009793375444197075, + "loss": 0.88841403, + "num_input_tokens_seen": 51255808, + "router_z_loss_mlp": 0.32397461, + "step": 619, + "time_per_iteration": 2.607475996017456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011109, + "balance_loss_mlp": 1.07697332, + "epoch": 0.119276644863409, + "flos": 659598833664.0, + "grad_norm": 0.06767977381214116, + "language_loss": 0.86337721, + "learning_rate": 0.000979248816591888, + "loss": 0.87448615, + "num_input_tokens_seen": 51329408, + "router_z_loss_mlp": 0.33935547, + "step": 620, + "time_per_iteration": 2.758866548538208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098794, + "balance_loss_mlp": 1.06667948, + "epoch": 0.11946902654867257, + "flos": 758396487168.0, + "grad_norm": 0.06819106164994826, + "language_loss": 0.87032986, + "learning_rate": 0.0009791599027031766, + "loss": 0.88131785, + "num_input_tokens_seen": 51408784, + "router_z_loss_mlp": 0.32128906, + "step": 621, + "time_per_iteration": 3.029431104660034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088156, + "balance_loss_mlp": 1.05611241, + "epoch": 0.11966140823393613, + "flos": 680697533952.0, + "grad_norm": 0.0732554324646167, + "language_loss": 0.87112588, + "learning_rate": 0.0009790708027880932, + "loss": 0.88200748, + "num_input_tokens_seen": 51482592, + "router_z_loss_mlp": 0.32055664, + "step": 622, + "time_per_iteration": 2.855576992034912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056461, + "balance_loss_mlp": 1.04444504, + "epoch": 0.11985378991919969, + "flos": 1450268070912.0, + "grad_norm": 0.03732324883573809, + "language_loss": 0.77427292, + "learning_rate": 0.0009789815168812293, + "loss": 0.78483754, + "num_input_tokens_seen": 51712240, + "router_z_loss_mlp": 0.12011719, + "step": 623, + "time_per_iteration": 4.840993165969849 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108671, + "balance_loss_mlp": 1.0551914, + "epoch": 0.12004617160446325, + "flos": 527586780672.0, + "grad_norm": 0.07309096746678648, + "language_loss": 0.94236648, + "learning_rate": 0.0009788920450172487, + "loss": 0.9532336, + "num_input_tokens_seen": 51781440, + "router_z_loss_mlp": 0.31518555, + "step": 624, + "time_per_iteration": 2.6301677227020264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102663, + "balance_loss_mlp": 1.07023823, + "epoch": 0.12023855328972682, + "flos": 473980861440.0, + "grad_norm": 0.15739190650861204, + "language_loss": 0.91515559, + "learning_rate": 0.0009788023872308875, + "loss": 0.92618221, + "num_input_tokens_seen": 51845424, + "router_z_loss_mlp": 0.32421875, + "step": 625, + "time_per_iteration": 2.506446361541748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014454, + "balance_loss_mlp": 1.0033915, + "epoch": 0.12043093497499038, + "flos": 1530954155520.0, + "grad_norm": 0.02216054665264375, + "language_loss": 0.75428998, + "learning_rate": 0.0009787125435569539, + "loss": 0.76443458, + "num_input_tokens_seen": 52076496, + "router_z_loss_mlp": 0.11083984, + "step": 626, + "time_per_iteration": 4.713289260864258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114644, + "balance_loss_mlp": 1.08391225, + "epoch": 0.12062331666025394, + "flos": 539571469824.0, + "grad_norm": 0.0672242080300053, + "language_loss": 0.94766486, + "learning_rate": 0.0009786225140303285, + "loss": 0.95881128, + "num_input_tokens_seen": 52143072, + "router_z_loss_mlp": 0.30761719, + "step": 627, + "time_per_iteration": 2.61875057220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011267, + "balance_loss_mlp": 1.09503818, + "epoch": 0.1208156983455175, + "flos": 511634327040.0, + "grad_norm": 0.06510849521455, + "language_loss": 0.925771, + "learning_rate": 0.0009785322986859634, + "loss": 0.93703806, + "num_input_tokens_seen": 52211888, + "router_z_loss_mlp": 0.31640625, + "step": 628, + "time_per_iteration": 2.6567625999450684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141777, + "balance_loss_mlp": 1.11059177, + "epoch": 0.12100808003078108, + "flos": 596195043840.0, + "grad_norm": 0.06735600063735754, + "language_loss": 0.93719506, + "learning_rate": 0.0009784418975588838, + "loss": 0.94861281, + "num_input_tokens_seen": 52283696, + "router_z_loss_mlp": 0.31152344, + "step": 629, + "time_per_iteration": 2.697376012802124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122983, + "balance_loss_mlp": 1.09222674, + "epoch": 0.12120046171604464, + "flos": 522698019840.0, + "grad_norm": 0.47103484407124013, + "language_loss": 0.93927598, + "learning_rate": 0.0009783513106841862, + "loss": 0.95050573, + "num_input_tokens_seen": 52358624, + "router_z_loss_mlp": 0.30761719, + "step": 630, + "time_per_iteration": 2.7226808071136475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143332, + "balance_loss_mlp": 1.13179243, + "epoch": 0.1213928434013082, + "flos": 1553605277184.0, + "grad_norm": 0.056788624646596834, + "language_loss": 0.76732707, + "learning_rate": 0.00097826053809704, + "loss": 0.77876031, + "num_input_tokens_seen": 52591248, + "router_z_loss_mlp": 0.11523438, + "step": 631, + "time_per_iteration": 4.948111295700073 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228128, + "balance_loss_mlp": 1.19219875, + "epoch": 0.12158522508657175, + "flos": 495143580672.0, + "grad_norm": 0.06834333100250278, + "language_loss": 0.88515621, + "learning_rate": 0.0009781695798326854, + "loss": 0.89743745, + "num_input_tokens_seen": 52659920, + "router_z_loss_mlp": 0.35961914, + "step": 632, + "time_per_iteration": 2.5616555213928223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267845, + "balance_loss_mlp": 1.23050833, + "epoch": 0.12177760677183531, + "flos": 475335433728.0, + "grad_norm": 0.1009303482431908, + "language_loss": 0.88543177, + "learning_rate": 0.0009780784359264365, + "loss": 0.89811015, + "num_input_tokens_seen": 52728832, + "router_z_loss_mlp": 0.37329102, + "step": 633, + "time_per_iteration": 2.597935438156128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01265484, + "balance_loss_mlp": 1.25370574, + "epoch": 0.12196998845709889, + "flos": 1467630351360.0, + "grad_norm": 0.08843071113371018, + "language_loss": 0.74188697, + "learning_rate": 0.0009779871064136778, + "loss": 0.75454181, + "num_input_tokens_seen": 52949776, + "router_z_loss_mlp": 0.11767578, + "step": 634, + "time_per_iteration": 4.768415451049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235432, + "balance_loss_mlp": 1.19976473, + "epoch": 0.12216237014236245, + "flos": 586279309824.0, + "grad_norm": 0.0829698455775257, + "language_loss": 0.88074899, + "learning_rate": 0.000977895591329867, + "loss": 0.89310336, + "num_input_tokens_seen": 53027184, + "router_z_loss_mlp": 0.35668945, + "step": 635, + "time_per_iteration": 2.7918457984924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214994, + "balance_loss_mlp": 1.17720437, + "epoch": 0.12235475182762601, + "flos": 597721324032.0, + "grad_norm": 0.0916527997361875, + "language_loss": 0.87791145, + "learning_rate": 0.000977803890710533, + "loss": 0.89006138, + "num_input_tokens_seen": 53101072, + "router_z_loss_mlp": 0.37792969, + "step": 636, + "time_per_iteration": 2.7248313426971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186705, + "balance_loss_mlp": 1.1509428, + "epoch": 0.12254713351288957, + "flos": 497487550464.0, + "grad_norm": 0.0702522126388857, + "language_loss": 0.93856937, + "learning_rate": 0.0009777120045912774, + "loss": 0.95043641, + "num_input_tokens_seen": 53172992, + "router_z_loss_mlp": 0.35766602, + "step": 637, + "time_per_iteration": 2.6079726219177246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180704, + "balance_loss_mlp": 1.14236617, + "epoch": 0.12273951519815314, + "flos": 605565130752.0, + "grad_norm": 0.06645311005239844, + "language_loss": 0.90599251, + "learning_rate": 0.0009776199330077736, + "loss": 0.91779959, + "num_input_tokens_seen": 53248256, + "router_z_loss_mlp": 0.38330078, + "step": 638, + "time_per_iteration": 2.7671282291412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196025, + "balance_loss_mlp": 1.15940344, + "epoch": 0.1229318968834167, + "flos": 597578729472.0, + "grad_norm": 0.09015200479441979, + "language_loss": 0.93140519, + "learning_rate": 0.0009775276759957667, + "loss": 0.94336545, + "num_input_tokens_seen": 53318960, + "router_z_loss_mlp": 0.36621094, + "step": 639, + "time_per_iteration": 2.6990442276000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179898, + "balance_loss_mlp": 1.14265716, + "epoch": 0.12312427856868026, + "flos": 678082931712.0, + "grad_norm": 0.08188642922116089, + "language_loss": 0.90714514, + "learning_rate": 0.0009774352335910745, + "loss": 0.91894412, + "num_input_tokens_seen": 53389120, + "router_z_loss_mlp": 0.37280273, + "step": 640, + "time_per_iteration": 2.7950265407562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115004, + "balance_loss_mlp": 1.11658967, + "epoch": 0.12331666025394382, + "flos": 608656978944.0, + "grad_norm": 0.07361380744806716, + "language_loss": 0.95549798, + "learning_rate": 0.000977342605829586, + "loss": 0.96699834, + "num_input_tokens_seen": 53459056, + "router_z_loss_mlp": 0.3347168, + "step": 641, + "time_per_iteration": 2.6966538429260254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140018, + "balance_loss_mlp": 1.10497069, + "epoch": 0.12350904193920739, + "flos": 762172194816.0, + "grad_norm": 0.08211004604029591, + "language_loss": 0.86708105, + "learning_rate": 0.0009772497927472623, + "loss": 0.87848121, + "num_input_tokens_seen": 53541552, + "router_z_loss_mlp": 0.35083008, + "step": 642, + "time_per_iteration": 3.050595998764038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121507, + "balance_loss_mlp": 1.0852437, + "epoch": 0.12370142362447095, + "flos": 540699079680.0, + "grad_norm": 0.0716743258864478, + "language_loss": 0.85363436, + "learning_rate": 0.0009771567943801368, + "loss": 0.86484945, + "num_input_tokens_seen": 53611520, + "router_z_loss_mlp": 0.36254883, + "step": 643, + "time_per_iteration": 2.627019166946411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112067, + "balance_loss_mlp": 1.07744884, + "epoch": 0.12389380530973451, + "flos": 547848852480.0, + "grad_norm": 0.06992166814052157, + "language_loss": 0.89936745, + "learning_rate": 0.0009770636107643152, + "loss": 0.91048813, + "num_input_tokens_seen": 53683888, + "router_z_loss_mlp": 0.34643555, + "step": 644, + "time_per_iteration": 2.696233034133911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102963, + "balance_loss_mlp": 1.06846356, + "epoch": 0.12408618699499807, + "flos": 540048715776.0, + "grad_norm": 0.06268128655507912, + "language_loss": 0.88181639, + "learning_rate": 0.0009769702419359738, + "loss": 0.89284605, + "num_input_tokens_seen": 53751888, + "router_z_loss_mlp": 0.3449707, + "step": 645, + "time_per_iteration": 2.61401104927063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116268, + "balance_loss_mlp": 1.0810535, + "epoch": 0.12427856868026164, + "flos": 745451513856.0, + "grad_norm": 0.07610574883038115, + "language_loss": 0.89730537, + "learning_rate": 0.000976876687931362, + "loss": 0.90846807, + "num_input_tokens_seen": 53827648, + "router_z_loss_mlp": 0.35229492, + "step": 646, + "time_per_iteration": 2.999408721923828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131315, + "balance_loss_mlp": 1.09622002, + "epoch": 0.1244709503655252, + "flos": 533460556800.0, + "grad_norm": 0.19449531308307466, + "language_loss": 0.85410094, + "learning_rate": 0.0009767829487868005, + "loss": 0.86541414, + "num_input_tokens_seen": 53896400, + "router_z_loss_mlp": 0.35107422, + "step": 647, + "time_per_iteration": 2.617666721343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138117, + "balance_loss_mlp": 1.10159075, + "epoch": 0.12466333205078876, + "flos": 507847034880.0, + "grad_norm": 0.07509451505155453, + "language_loss": 0.89358151, + "learning_rate": 0.000976689024538682, + "loss": 0.90496266, + "num_input_tokens_seen": 53965904, + "router_z_loss_mlp": 0.36499023, + "step": 648, + "time_per_iteration": 2.5929009914398193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138062, + "balance_loss_mlp": 1.10110736, + "epoch": 0.12485571373605232, + "flos": 681023420928.0, + "grad_norm": 0.07057439208121223, + "language_loss": 0.87662494, + "learning_rate": 0.0009765949152234716, + "loss": 0.8880055, + "num_input_tokens_seen": 54049792, + "router_z_loss_mlp": 0.36962891, + "step": 649, + "time_per_iteration": 2.874701976776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147728, + "balance_loss_mlp": 1.13504386, + "epoch": 0.1250480954213159, + "flos": 1329402668544.0, + "grad_norm": 0.04527818124304351, + "language_loss": 0.78686082, + "learning_rate": 0.0009765006208777055, + "loss": 0.79833812, + "num_input_tokens_seen": 54262432, + "router_z_loss_mlp": 0.12695312, + "step": 650, + "time_per_iteration": 4.680933713912964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138039, + "balance_loss_mlp": 1.10287213, + "epoch": 0.12524047710657946, + "flos": 938140683264.0, + "grad_norm": 0.08375968037938068, + "language_loss": 0.82443976, + "learning_rate": 0.0009764061415379919, + "loss": 0.83582014, + "num_input_tokens_seen": 54351568, + "router_z_loss_mlp": 0.35205078, + "step": 651, + "time_per_iteration": 3.2550604343414307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135369, + "balance_loss_mlp": 1.09774697, + "epoch": 0.12543285879184302, + "flos": 513642235392.0, + "grad_norm": 0.07146085627000143, + "language_loss": 0.89363486, + "learning_rate": 0.0009763114772410109, + "loss": 0.90498853, + "num_input_tokens_seen": 54418944, + "router_z_loss_mlp": 0.3762207, + "step": 652, + "time_per_iteration": 2.5937142372131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139745, + "balance_loss_mlp": 1.10419679, + "epoch": 0.12562524047710658, + "flos": 717991617024.0, + "grad_norm": 0.07913079577836896, + "language_loss": 0.87230957, + "learning_rate": 0.0009762166280235146, + "loss": 0.88370705, + "num_input_tokens_seen": 54495312, + "router_z_loss_mlp": 0.35571289, + "step": 653, + "time_per_iteration": 2.96162748336792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147653, + "balance_loss_mlp": 1.10974443, + "epoch": 0.12581762216237014, + "flos": 563441923584.0, + "grad_norm": 0.06492259826928527, + "language_loss": 0.87890899, + "learning_rate": 0.0009761215939223267, + "loss": 0.89038551, + "num_input_tokens_seen": 54566832, + "router_z_loss_mlp": 0.37890625, + "step": 654, + "time_per_iteration": 2.714641809463501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145182, + "balance_loss_mlp": 1.1077261, + "epoch": 0.1260100038476337, + "flos": 481642785792.0, + "grad_norm": 0.07920721431290144, + "language_loss": 0.86875665, + "learning_rate": 0.0009760263749743428, + "loss": 0.88020849, + "num_input_tokens_seen": 54632128, + "router_z_loss_mlp": 0.37426758, + "step": 655, + "time_per_iteration": 2.547499179840088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145343, + "balance_loss_mlp": 1.11074805, + "epoch": 0.12620238553289725, + "flos": 575269461504.0, + "grad_norm": 0.06357383816141966, + "language_loss": 0.90176344, + "learning_rate": 0.0009759309712165299, + "loss": 0.91321695, + "num_input_tokens_seen": 54707600, + "router_z_loss_mlp": 0.34570312, + "step": 656, + "time_per_iteration": 2.693922996520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137509, + "balance_loss_mlp": 1.103248, + "epoch": 0.12639476721816084, + "flos": 530909973504.0, + "grad_norm": 0.07169490366111804, + "language_loss": 0.93258119, + "learning_rate": 0.0009758353826859272, + "loss": 0.94395626, + "num_input_tokens_seen": 54776704, + "router_z_loss_mlp": 0.34277344, + "step": 657, + "time_per_iteration": 2.5744612216949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139269, + "balance_loss_mlp": 1.10314822, + "epoch": 0.1265871489034244, + "flos": 689654393856.0, + "grad_norm": 0.06860158128637554, + "language_loss": 0.89679217, + "learning_rate": 0.0009757396094196456, + "loss": 0.90818477, + "num_input_tokens_seen": 54851744, + "router_z_loss_mlp": 0.36132812, + "step": 658, + "time_per_iteration": 2.851700782775879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143308, + "balance_loss_mlp": 1.10675859, + "epoch": 0.12677953058868796, + "flos": 536863735296.0, + "grad_norm": 0.0696485175834739, + "language_loss": 0.84555894, + "learning_rate": 0.0009756436514548673, + "loss": 0.85699201, + "num_input_tokens_seen": 54932576, + "router_z_loss_mlp": 0.36523438, + "step": 659, + "time_per_iteration": 2.7971351146698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122244, + "balance_loss_mlp": 1.08800757, + "epoch": 0.12697191227395152, + "flos": 518749194240.0, + "grad_norm": 0.05327633329409036, + "language_loss": 0.88343394, + "learning_rate": 0.0009755475088288466, + "loss": 0.89465636, + "num_input_tokens_seen": 55007296, + "router_z_loss_mlp": 0.34228516, + "step": 660, + "time_per_iteration": 2.670555353164673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103127, + "balance_loss_mlp": 1.06903291, + "epoch": 0.12716429395921508, + "flos": 566341714944.0, + "grad_norm": 0.06801254087798507, + "language_loss": 0.90210187, + "learning_rate": 0.0009754511815789095, + "loss": 0.91313314, + "num_input_tokens_seen": 55079312, + "router_z_loss_mlp": 0.34106445, + "step": 661, + "time_per_iteration": 2.748224973678589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102552, + "balance_loss_mlp": 1.06798172, + "epoch": 0.12735667564447864, + "flos": 513844466688.0, + "grad_norm": 0.06975204014846512, + "language_loss": 0.86245489, + "learning_rate": 0.0009753546697424533, + "loss": 0.87348044, + "num_input_tokens_seen": 55151824, + "router_z_loss_mlp": 0.34594727, + "step": 662, + "time_per_iteration": 2.664799213409424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092108, + "balance_loss_mlp": 1.05863369, + "epoch": 0.1275490573297422, + "flos": 541023556608.0, + "grad_norm": 0.05485824904298714, + "language_loss": 0.90572149, + "learning_rate": 0.0009752579733569475, + "loss": 0.91664255, + "num_input_tokens_seen": 55224368, + "router_z_loss_mlp": 0.3347168, + "step": 663, + "time_per_iteration": 2.679975748062134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267369, + "balance_loss_mlp": 1.2515384, + "epoch": 0.12774143901500576, + "flos": 1557872787456.0, + "grad_norm": 0.0685532780556388, + "language_loss": 0.74881387, + "learning_rate": 0.0009751610924599328, + "loss": 0.7614876, + "num_input_tokens_seen": 55453584, + "router_z_loss_mlp": 0.15820312, + "step": 664, + "time_per_iteration": 4.938101053237915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096151, + "balance_loss_mlp": 1.06177139, + "epoch": 0.12793382070026935, + "flos": 613462781952.0, + "grad_norm": 0.06920677464457729, + "language_loss": 0.90523887, + "learning_rate": 0.0009750640270890217, + "loss": 0.9162004, + "num_input_tokens_seen": 55528000, + "router_z_loss_mlp": 0.34375, + "step": 665, + "time_per_iteration": 2.6939845085144043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099083, + "balance_loss_mlp": 1.06563258, + "epoch": 0.1281262023855329, + "flos": 707386231296.0, + "grad_norm": 0.06773970450457005, + "language_loss": 0.96531481, + "learning_rate": 0.0009749667772818983, + "loss": 0.9763056, + "num_input_tokens_seen": 55612416, + "router_z_loss_mlp": 0.33447266, + "step": 666, + "time_per_iteration": 2.967853307723999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164762, + "balance_loss_mlp": 1.15131497, + "epoch": 0.12831858407079647, + "flos": 1424250086400.0, + "grad_norm": 0.045177828452490555, + "language_loss": 0.76935941, + "learning_rate": 0.0009748693430763185, + "loss": 0.78100705, + "num_input_tokens_seen": 55843664, + "router_z_loss_mlp": 0.13476562, + "step": 667, + "time_per_iteration": 4.85069465637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093582, + "balance_loss_mlp": 1.05958366, + "epoch": 0.12851096575606002, + "flos": 448869316608.0, + "grad_norm": 0.07778909975942494, + "language_loss": 0.95426726, + "learning_rate": 0.0009747717245101093, + "loss": 0.96520311, + "num_input_tokens_seen": 55909072, + "router_z_loss_mlp": 0.34008789, + "step": 668, + "time_per_iteration": 2.5234692096710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098998, + "balance_loss_mlp": 1.06519032, + "epoch": 0.12870334744132358, + "flos": 479697486336.0, + "grad_norm": 0.05465485885236262, + "language_loss": 0.84969366, + "learning_rate": 0.00097467392162117, + "loss": 0.86068368, + "num_input_tokens_seen": 55978544, + "router_z_loss_mlp": 0.33789062, + "step": 669, + "time_per_iteration": 2.601684808731079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096385, + "balance_loss_mlp": 1.06341171, + "epoch": 0.12889572912658714, + "flos": 638633963520.0, + "grad_norm": 0.05954757179165737, + "language_loss": 0.91292465, + "learning_rate": 0.0009745759344474708, + "loss": 0.92388856, + "num_input_tokens_seen": 56054144, + "router_z_loss_mlp": 0.32983398, + "step": 670, + "time_per_iteration": 2.8225347995758057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098806, + "balance_loss_mlp": 1.06411648, + "epoch": 0.1290881108118507, + "flos": 509693409792.0, + "grad_norm": 0.06976130099981656, + "language_loss": 0.89229816, + "learning_rate": 0.0009744777630270536, + "loss": 0.90328622, + "num_input_tokens_seen": 56120960, + "router_z_loss_mlp": 0.34692383, + "step": 671, + "time_per_iteration": 2.633571147918701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109875, + "balance_loss_mlp": 1.07435024, + "epoch": 0.12928049249711426, + "flos": 670746894336.0, + "grad_norm": 0.08011077975608555, + "language_loss": 0.93749923, + "learning_rate": 0.000974379407398032, + "loss": 0.94859791, + "num_input_tokens_seen": 56202560, + "router_z_loss_mlp": 0.35546875, + "step": 672, + "time_per_iteration": 2.875609874725342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093721, + "balance_loss_mlp": 1.06065273, + "epoch": 0.12947287418237785, + "flos": 793158925824.0, + "grad_norm": 0.05850057523774312, + "language_loss": 0.82016242, + "learning_rate": 0.0009742808675985913, + "loss": 0.83109969, + "num_input_tokens_seen": 56289456, + "router_z_loss_mlp": 0.33056641, + "step": 673, + "time_per_iteration": 3.087738275527954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101029, + "balance_loss_mlp": 1.0646224, + "epoch": 0.1296652558676414, + "flos": 485222054400.0, + "grad_norm": 0.08954381825883409, + "language_loss": 0.9153564, + "learning_rate": 0.0009741821436669876, + "loss": 0.92636657, + "num_input_tokens_seen": 56354480, + "router_z_loss_mlp": 0.36450195, + "step": 674, + "time_per_iteration": 2.539849281311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101326, + "balance_loss_mlp": 1.06673169, + "epoch": 0.12985763755290497, + "flos": 453226987008.0, + "grad_norm": 0.0648114016490977, + "language_loss": 0.9288274, + "learning_rate": 0.0009740832356415492, + "loss": 0.93984067, + "num_input_tokens_seen": 56418944, + "router_z_loss_mlp": 0.34619141, + "step": 675, + "time_per_iteration": 2.467801094055176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097673, + "balance_loss_mlp": 1.06315041, + "epoch": 0.13005001923816853, + "flos": 824719007232.0, + "grad_norm": 0.0735546441878898, + "language_loss": 0.8857609, + "learning_rate": 0.0009739841435606756, + "loss": 0.89673769, + "num_input_tokens_seen": 56492368, + "router_z_loss_mlp": 0.34545898, + "step": 676, + "time_per_iteration": 3.008781909942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109741, + "balance_loss_mlp": 1.06457949, + "epoch": 0.1302424009234321, + "flos": 531107822592.0, + "grad_norm": 0.07312926894822828, + "language_loss": 0.90675485, + "learning_rate": 0.0009738848674628377, + "loss": 0.9177289, + "num_input_tokens_seen": 56568128, + "router_z_loss_mlp": 0.328125, + "step": 677, + "time_per_iteration": 2.695338010787964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104607, + "balance_loss_mlp": 1.06955981, + "epoch": 0.13043478260869565, + "flos": 525626924544.0, + "grad_norm": 0.06033597827839572, + "language_loss": 0.89643902, + "learning_rate": 0.000973785407386578, + "loss": 0.90748513, + "num_input_tokens_seen": 56646448, + "router_z_loss_mlp": 0.35058594, + "step": 678, + "time_per_iteration": 2.7727599143981934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101976, + "balance_loss_mlp": 1.06714272, + "epoch": 0.1306271642939592, + "flos": 625862108160.0, + "grad_norm": 0.05570081952525763, + "language_loss": 0.87361526, + "learning_rate": 0.0009736857633705103, + "loss": 0.88463503, + "num_input_tokens_seen": 56732080, + "router_z_loss_mlp": 0.34814453, + "step": 679, + "time_per_iteration": 2.843129873275757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110176, + "balance_loss_mlp": 1.06630766, + "epoch": 0.13081954597922277, + "flos": 550438723584.0, + "grad_norm": 0.06405817655948583, + "language_loss": 0.93204647, + "learning_rate": 0.0009735859354533196, + "loss": 0.94306409, + "num_input_tokens_seen": 56804432, + "router_z_loss_mlp": 0.35473633, + "step": 680, + "time_per_iteration": 2.7122464179992676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093118, + "balance_loss_mlp": 1.05914354, + "epoch": 0.13101192766448633, + "flos": 536651329536.0, + "grad_norm": 0.06779912020183775, + "language_loss": 0.91948998, + "learning_rate": 0.0009734859236737628, + "loss": 0.93042123, + "num_input_tokens_seen": 56872512, + "router_z_loss_mlp": 0.33984375, + "step": 681, + "time_per_iteration": 2.594881296157837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093326, + "balance_loss_mlp": 1.0593034, + "epoch": 0.13120430934974991, + "flos": 503258019840.0, + "grad_norm": 0.06413082246497326, + "language_loss": 0.93904501, + "learning_rate": 0.0009733857280706678, + "loss": 0.94997829, + "num_input_tokens_seen": 56940928, + "router_z_loss_mlp": 0.34033203, + "step": 682, + "time_per_iteration": 2.5831425189971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010929, + "balance_loss_mlp": 1.05992687, + "epoch": 0.13139669103501347, + "flos": 614014221312.0, + "grad_norm": 0.06246118190021366, + "language_loss": 0.85051745, + "learning_rate": 0.000973285348682934, + "loss": 0.86144638, + "num_input_tokens_seen": 57012736, + "router_z_loss_mlp": 0.33007812, + "step": 683, + "time_per_iteration": 2.7236225605010986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226892, + "balance_loss_mlp": 1.21096563, + "epoch": 0.13158907272027703, + "flos": 1484163357696.0, + "grad_norm": 0.08359566880013784, + "language_loss": 0.77898371, + "learning_rate": 0.0009731847855495323, + "loss": 0.79125261, + "num_input_tokens_seen": 57243136, + "router_z_loss_mlp": 0.15917969, + "step": 684, + "time_per_iteration": 4.87854790687561 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087932, + "balance_loss_mlp": 1.05488706, + "epoch": 0.1317814544055406, + "flos": 985049344512.0, + "grad_norm": 0.07039095593234826, + "language_loss": 0.85449159, + "learning_rate": 0.0009730840387095046, + "loss": 0.86537099, + "num_input_tokens_seen": 57336160, + "router_z_loss_mlp": 0.33056641, + "step": 685, + "time_per_iteration": 3.30759596824646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096943, + "balance_loss_mlp": 1.06156158, + "epoch": 0.13197383609080415, + "flos": 611163892224.0, + "grad_norm": 0.05759402546544749, + "language_loss": 0.912597, + "learning_rate": 0.0009729831082019642, + "loss": 0.92356646, + "num_input_tokens_seen": 57418976, + "router_z_loss_mlp": 0.35351562, + "step": 686, + "time_per_iteration": 2.7965087890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093388, + "balance_loss_mlp": 1.0589608, + "epoch": 0.1321662177760677, + "flos": 494116305408.0, + "grad_norm": 0.058033147986452156, + "language_loss": 0.89668858, + "learning_rate": 0.0009728819940660958, + "loss": 0.90762246, + "num_input_tokens_seen": 57490288, + "router_z_loss_mlp": 0.34399414, + "step": 687, + "time_per_iteration": 2.7347469329833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110102, + "balance_loss_mlp": 1.0653528, + "epoch": 0.13235859946133127, + "flos": 495591713280.0, + "grad_norm": 0.07548862234195632, + "language_loss": 0.86088693, + "learning_rate": 0.0009727806963411557, + "loss": 0.87189722, + "num_input_tokens_seen": 57556064, + "router_z_loss_mlp": 0.35668945, + "step": 688, + "time_per_iteration": 2.621638774871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098222, + "balance_loss_mlp": 1.06279302, + "epoch": 0.13255098114659483, + "flos": 511417539072.0, + "grad_norm": 0.08656773393569435, + "language_loss": 0.88000298, + "learning_rate": 0.000972679215066471, + "loss": 0.89098513, + "num_input_tokens_seen": 57627248, + "router_z_loss_mlp": 0.35449219, + "step": 689, + "time_per_iteration": 2.6806418895721436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103388, + "balance_loss_mlp": 1.06900764, + "epoch": 0.13274336283185842, + "flos": 547114120704.0, + "grad_norm": 0.07064056682134613, + "language_loss": 0.99675226, + "learning_rate": 0.0009725775502814401, + "loss": 1.00778604, + "num_input_tokens_seen": 57694832, + "router_z_loss_mlp": 0.34350586, + "step": 690, + "time_per_iteration": 2.607179641723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121046, + "balance_loss_mlp": 1.08397222, + "epoch": 0.13293574451712198, + "flos": 640465781760.0, + "grad_norm": 0.08777481913975324, + "language_loss": 0.85673726, + "learning_rate": 0.0009724757020255327, + "loss": 0.86794776, + "num_input_tokens_seen": 57771776, + "router_z_loss_mlp": 0.37084961, + "step": 691, + "time_per_iteration": 2.81113338470459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111244, + "balance_loss_mlp": 1.07726967, + "epoch": 0.13312812620238554, + "flos": 491234042880.0, + "grad_norm": 0.09165524457583717, + "language_loss": 0.87811983, + "learning_rate": 0.0009723736703382902, + "loss": 0.88923222, + "num_input_tokens_seen": 57836272, + "router_z_loss_mlp": 0.33984375, + "step": 692, + "time_per_iteration": 2.548689603805542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110186, + "balance_loss_mlp": 1.0692203, + "epoch": 0.1333205078876491, + "flos": 508693837824.0, + "grad_norm": 0.061462060991887495, + "language_loss": 0.83746743, + "learning_rate": 0.0009722714552593244, + "loss": 0.84848601, + "num_input_tokens_seen": 57907232, + "router_z_loss_mlp": 0.32641602, + "step": 693, + "time_per_iteration": 2.6584513187408447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099112, + "balance_loss_mlp": 1.06358743, + "epoch": 0.13351288957291266, + "flos": 418474722816.0, + "grad_norm": 0.07144638741394425, + "language_loss": 0.94810003, + "learning_rate": 0.000972169056828319, + "loss": 0.95909119, + "num_input_tokens_seen": 57969808, + "router_z_loss_mlp": 0.35522461, + "step": 694, + "time_per_iteration": 2.461437702178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100772, + "balance_loss_mlp": 1.06751275, + "epoch": 0.13370527125817622, + "flos": 615614694912.0, + "grad_norm": 0.05672506947017021, + "language_loss": 0.87834966, + "learning_rate": 0.0009720664750850283, + "loss": 0.88935745, + "num_input_tokens_seen": 58042944, + "router_z_loss_mlp": 0.33251953, + "step": 695, + "time_per_iteration": 2.7716193199157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103519, + "balance_loss_mlp": 1.07085609, + "epoch": 0.13389765294343978, + "flos": 625757391360.0, + "grad_norm": 0.07304651625724701, + "language_loss": 0.93482703, + "learning_rate": 0.0009719637100692784, + "loss": 0.94586229, + "num_input_tokens_seen": 58116080, + "router_z_loss_mlp": 0.32666016, + "step": 696, + "time_per_iteration": 2.7741310596466064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111476, + "balance_loss_mlp": 1.08090401, + "epoch": 0.13409003462870334, + "flos": 609391710720.0, + "grad_norm": 0.06235589965882817, + "language_loss": 0.83759153, + "learning_rate": 0.0009718607618209661, + "loss": 0.84873915, + "num_input_tokens_seen": 58197616, + "router_z_loss_mlp": 0.33862305, + "step": 697, + "time_per_iteration": 2.869180202484131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128671, + "balance_loss_mlp": 1.09488726, + "epoch": 0.13428241631396692, + "flos": 683499810816.0, + "grad_norm": 0.0709058406100417, + "language_loss": 0.88053036, + "learning_rate": 0.0009717576303800595, + "loss": 0.89181709, + "num_input_tokens_seen": 58280480, + "router_z_loss_mlp": 0.33789062, + "step": 698, + "time_per_iteration": 3.007253408432007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122193, + "balance_loss_mlp": 1.08716917, + "epoch": 0.13447479799923048, + "flos": 508565799936.0, + "grad_norm": 0.07060238478807088, + "language_loss": 0.86057615, + "learning_rate": 0.0009716543157865975, + "loss": 0.87179804, + "num_input_tokens_seen": 58352464, + "router_z_loss_mlp": 0.35083008, + "step": 699, + "time_per_iteration": 2.6622114181518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112811, + "balance_loss_mlp": 1.07812154, + "epoch": 0.13466717968449404, + "flos": 897124737024.0, + "grad_norm": 0.06896685381510245, + "language_loss": 0.84149206, + "learning_rate": 0.0009715508180806907, + "loss": 0.85262012, + "num_input_tokens_seen": 58437216, + "router_z_loss_mlp": 0.34716797, + "step": 700, + "time_per_iteration": 3.175494909286499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106112, + "balance_loss_mlp": 1.07054055, + "epoch": 0.1348595613697576, + "flos": 989501557248.0, + "grad_norm": 0.07388845252403331, + "language_loss": 0.90260321, + "learning_rate": 0.0009714471373025202, + "loss": 0.91366434, + "num_input_tokens_seen": 58533152, + "router_z_loss_mlp": 0.35546875, + "step": 701, + "time_per_iteration": 3.3912835121154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090254, + "balance_loss_mlp": 1.05499172, + "epoch": 0.13505194305502116, + "flos": 487580580864.0, + "grad_norm": 0.07959074518459132, + "language_loss": 0.89355272, + "learning_rate": 0.0009713432734923386, + "loss": 0.9044553, + "num_input_tokens_seen": 58601376, + "router_z_loss_mlp": 0.35253906, + "step": 702, + "time_per_iteration": 2.6718733310699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090728, + "balance_loss_mlp": 1.05572796, + "epoch": 0.13524432474028472, + "flos": 613103399424.0, + "grad_norm": 0.06387437846302528, + "language_loss": 0.875036, + "learning_rate": 0.0009712392266904696, + "loss": 0.88594317, + "num_input_tokens_seen": 58676608, + "router_z_loss_mlp": 0.34985352, + "step": 703, + "time_per_iteration": 2.6985831260681152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010863, + "balance_loss_mlp": 1.0524683, + "epoch": 0.13543670642554828, + "flos": 904425868800.0, + "grad_norm": 0.06666466963859687, + "language_loss": 0.86250496, + "learning_rate": 0.0009711349969373076, + "loss": 0.87336791, + "num_input_tokens_seen": 58759264, + "router_z_loss_mlp": 0.33862305, + "step": 704, + "time_per_iteration": 3.1328465938568115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095762, + "balance_loss_mlp": 1.0610956, + "epoch": 0.13562908811081184, + "flos": 550335416832.0, + "grad_norm": 0.0628446006314887, + "language_loss": 0.80944061, + "learning_rate": 0.0009710305842733178, + "loss": 0.82039821, + "num_input_tokens_seen": 58834800, + "router_z_loss_mlp": 0.34667969, + "step": 705, + "time_per_iteration": 2.7668187618255615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093976, + "balance_loss_mlp": 1.06147909, + "epoch": 0.1358214697960754, + "flos": 507797572608.0, + "grad_norm": 0.06635154625105166, + "language_loss": 0.90133065, + "learning_rate": 0.0009709259887390373, + "loss": 0.91227043, + "num_input_tokens_seen": 58901712, + "router_z_loss_mlp": 0.32519531, + "step": 706, + "time_per_iteration": 2.656233072280884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096924, + "balance_loss_mlp": 1.06390333, + "epoch": 0.136013851481339, + "flos": 528640197120.0, + "grad_norm": 0.09290535615143355, + "language_loss": 0.91425377, + "learning_rate": 0.0009708212103750737, + "loss": 0.92522299, + "num_input_tokens_seen": 58967824, + "router_z_loss_mlp": 0.33007812, + "step": 707, + "time_per_iteration": 2.569655656814575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101147, + "balance_loss_mlp": 1.06812644, + "epoch": 0.13620623316660255, + "flos": 658772379648.0, + "grad_norm": 0.06731423560591156, + "language_loss": 0.87756282, + "learning_rate": 0.0009707162492221051, + "loss": 0.88857424, + "num_input_tokens_seen": 59045040, + "router_z_loss_mlp": 0.33007812, + "step": 708, + "time_per_iteration": 2.880669593811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103707, + "balance_loss_mlp": 1.07009029, + "epoch": 0.1363986148518661, + "flos": 671583522816.0, + "grad_norm": 0.07312175328849302, + "language_loss": 0.88322687, + "learning_rate": 0.0009706111053208815, + "loss": 0.89426386, + "num_input_tokens_seen": 59117216, + "router_z_loss_mlp": 0.33642578, + "step": 709, + "time_per_iteration": 2.7878787517547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097257, + "balance_loss_mlp": 1.06342554, + "epoch": 0.13659099653712967, + "flos": 472828520448.0, + "grad_norm": 0.06741688104713542, + "language_loss": 0.86067665, + "learning_rate": 0.0009705057787122232, + "loss": 0.87164921, + "num_input_tokens_seen": 59183056, + "router_z_loss_mlp": 0.33862305, + "step": 710, + "time_per_iteration": 2.528298854827881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105446, + "balance_loss_mlp": 1.07190061, + "epoch": 0.13678337822239323, + "flos": 452483490816.0, + "grad_norm": 0.05706590332145298, + "language_loss": 0.91653168, + "learning_rate": 0.0009704002694370216, + "loss": 0.92758614, + "num_input_tokens_seen": 59247312, + "router_z_loss_mlp": 0.33569336, + "step": 711, + "time_per_iteration": 2.5201761722564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114394, + "balance_loss_mlp": 1.0794661, + "epoch": 0.13697575990765679, + "flos": 519373416960.0, + "grad_norm": 0.06387130477766731, + "language_loss": 0.86892813, + "learning_rate": 0.0009702945775362388, + "loss": 0.88007212, + "num_input_tokens_seen": 59317968, + "router_z_loss_mlp": 0.34960938, + "step": 712, + "time_per_iteration": 2.661848783493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130562, + "balance_loss_mlp": 1.0947994, + "epoch": 0.13716814159292035, + "flos": 480145618944.0, + "grad_norm": 0.06038249383316015, + "language_loss": 0.87339497, + "learning_rate": 0.0009701887030509086, + "loss": 0.8847006, + "num_input_tokens_seen": 59387936, + "router_z_loss_mlp": 0.35766602, + "step": 713, + "time_per_iteration": 2.6068434715270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125148, + "balance_loss_mlp": 1.0908401, + "epoch": 0.1373605232781839, + "flos": 545376844800.0, + "grad_norm": 0.06924339631343991, + "language_loss": 0.92127877, + "learning_rate": 0.0009700826460221346, + "loss": 0.93253028, + "num_input_tokens_seen": 59460624, + "router_z_loss_mlp": 0.34301758, + "step": 714, + "time_per_iteration": 2.653224468231201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145818, + "balance_loss_mlp": 1.11050797, + "epoch": 0.1375529049634475, + "flos": 708473143296.0, + "grad_norm": 0.0682346884445605, + "language_loss": 0.93435562, + "learning_rate": 0.0009699764064910921, + "loss": 0.94581378, + "num_input_tokens_seen": 59536752, + "router_z_loss_mlp": 0.35302734, + "step": 715, + "time_per_iteration": 2.878445625305176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130305, + "balance_loss_mlp": 1.09542441, + "epoch": 0.13774528664871105, + "flos": 486452971008.0, + "grad_norm": 0.07091873756636237, + "language_loss": 0.87931371, + "learning_rate": 0.0009698699844990268, + "loss": 0.89061677, + "num_input_tokens_seen": 59608128, + "router_z_loss_mlp": 0.34863281, + "step": 716, + "time_per_iteration": 2.6278092861175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124133, + "balance_loss_mlp": 1.09070659, + "epoch": 0.1379376683339746, + "flos": 679885636608.0, + "grad_norm": 0.0686032560828043, + "language_loss": 0.88731855, + "learning_rate": 0.0009697633800872555, + "loss": 0.89855987, + "num_input_tokens_seen": 59685120, + "router_z_loss_mlp": 0.33422852, + "step": 717, + "time_per_iteration": 2.888576030731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112997, + "balance_loss_mlp": 1.07825947, + "epoch": 0.13813005001923817, + "flos": 610628419584.0, + "grad_norm": 0.07907714555147631, + "language_loss": 0.9128629, + "learning_rate": 0.0009696565932971655, + "loss": 0.92399287, + "num_input_tokens_seen": 59763376, + "router_z_loss_mlp": 0.34741211, + "step": 718, + "time_per_iteration": 2.8937225341796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110249, + "balance_loss_mlp": 1.06837237, + "epoch": 0.13832243170450173, + "flos": 588431222784.0, + "grad_norm": 0.05947825646897862, + "language_loss": 0.9001984, + "learning_rate": 0.0009695496241702153, + "loss": 0.91122329, + "num_input_tokens_seen": 59836800, + "router_z_loss_mlp": 0.34155273, + "step": 719, + "time_per_iteration": 2.791111469268799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094313, + "balance_loss_mlp": 1.06093454, + "epoch": 0.1385148133897653, + "flos": 699674844672.0, + "grad_norm": 0.07440757355955382, + "language_loss": 0.86308432, + "learning_rate": 0.0009694424727479339, + "loss": 0.87402749, + "num_input_tokens_seen": 59914720, + "router_z_loss_mlp": 0.33398438, + "step": 720, + "time_per_iteration": 2.8781325817108154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088445, + "balance_loss_mlp": 1.05475688, + "epoch": 0.13870719507502885, + "flos": 597977399808.0, + "grad_norm": 0.059872525751604476, + "language_loss": 0.90073895, + "learning_rate": 0.0009693351390719213, + "loss": 0.91162348, + "num_input_tokens_seen": 59984544, + "router_z_loss_mlp": 0.3371582, + "step": 721, + "time_per_iteration": 2.691493272781372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095999, + "balance_loss_mlp": 1.06242967, + "epoch": 0.1388995767602924, + "flos": 586279309824.0, + "grad_norm": 0.07792099406652078, + "language_loss": 0.91640067, + "learning_rate": 0.000969227623183848, + "loss": 0.92736065, + "num_input_tokens_seen": 60057056, + "router_z_loss_mlp": 0.33569336, + "step": 722, + "time_per_iteration": 2.768209218978882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086676, + "balance_loss_mlp": 1.05475235, + "epoch": 0.139091958445556, + "flos": 650810709504.0, + "grad_norm": 0.07717859695455091, + "language_loss": 0.91485119, + "learning_rate": 0.0009691199251254554, + "loss": 0.92571795, + "num_input_tokens_seen": 60133232, + "router_z_loss_mlp": 0.3190918, + "step": 723, + "time_per_iteration": 2.813594102859497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093708, + "balance_loss_mlp": 1.06159282, + "epoch": 0.13928434013081956, + "flos": 575446961664.0, + "grad_norm": 0.06414169604653322, + "language_loss": 0.8718468, + "learning_rate": 0.0009690120449385555, + "loss": 0.88278389, + "num_input_tokens_seen": 60207104, + "router_z_loss_mlp": 0.32104492, + "step": 724, + "time_per_iteration": 2.732372999191284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110008, + "balance_loss_mlp": 1.06574821, + "epoch": 0.13947672181608312, + "flos": 562954503168.0, + "grad_norm": 0.07538454681544235, + "language_loss": 0.93399024, + "learning_rate": 0.0009689039826650312, + "loss": 0.94499099, + "num_input_tokens_seen": 60277920, + "router_z_loss_mlp": 0.34375, + "step": 725, + "time_per_iteration": 2.769481658935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111743, + "balance_loss_mlp": 1.09967864, + "epoch": 0.13966910350134668, + "flos": 1520699387904.0, + "grad_norm": 0.042030956775344956, + "language_loss": 0.76523066, + "learning_rate": 0.000968795738346836, + "loss": 0.77634799, + "num_input_tokens_seen": 60494224, + "router_z_loss_mlp": 0.12060547, + "step": 726, + "time_per_iteration": 4.903716802597046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101674, + "balance_loss_mlp": 1.06619751, + "epoch": 0.13986148518661023, + "flos": 499604557824.0, + "grad_norm": 0.07361028590256702, + "language_loss": 0.88265646, + "learning_rate": 0.0009686873120259941, + "loss": 0.89367324, + "num_input_tokens_seen": 60562176, + "router_z_loss_mlp": 0.35522461, + "step": 727, + "time_per_iteration": 2.639673948287964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099007, + "balance_loss_mlp": 1.06612897, + "epoch": 0.1400538668718738, + "flos": 598381862400.0, + "grad_norm": 0.053177263225715844, + "language_loss": 0.87612498, + "learning_rate": 0.0009685787037446004, + "loss": 0.88711506, + "num_input_tokens_seen": 60631472, + "router_z_loss_mlp": 0.32885742, + "step": 728, + "time_per_iteration": 2.7457332611083984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095755, + "balance_loss_mlp": 1.06135106, + "epoch": 0.14024624855713735, + "flos": 593757941760.0, + "grad_norm": 0.0730266030670127, + "language_loss": 0.88032103, + "learning_rate": 0.0009684699135448201, + "loss": 0.89127851, + "num_input_tokens_seen": 60703488, + "router_z_loss_mlp": 0.34423828, + "step": 729, + "time_per_iteration": 2.6995558738708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091636, + "balance_loss_mlp": 1.05940139, + "epoch": 0.1404386302424009, + "flos": 506335311360.0, + "grad_norm": 0.06378774069808751, + "language_loss": 0.93033969, + "learning_rate": 0.0009683609414688895, + "loss": 0.94125605, + "num_input_tokens_seen": 60773936, + "router_z_loss_mlp": 0.32226562, + "step": 730, + "time_per_iteration": 2.6648926734924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097348, + "balance_loss_mlp": 1.06175184, + "epoch": 0.14063101192766447, + "flos": 573132105216.0, + "grad_norm": 0.05452232030629634, + "language_loss": 0.86945236, + "learning_rate": 0.0009682517875591154, + "loss": 0.88042581, + "num_input_tokens_seen": 60851120, + "router_z_loss_mlp": 0.35620117, + "step": 731, + "time_per_iteration": 2.7333967685699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099629, + "balance_loss_mlp": 1.06656027, + "epoch": 0.14082339361292806, + "flos": 564333806592.0, + "grad_norm": 0.06482276791137384, + "language_loss": 0.87207299, + "learning_rate": 0.0009681424518578749, + "loss": 0.88306928, + "num_input_tokens_seen": 60924896, + "router_z_loss_mlp": 0.33081055, + "step": 732, + "time_per_iteration": 2.706704616546631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100389, + "balance_loss_mlp": 1.06734443, + "epoch": 0.14101577529819162, + "flos": 463336187904.0, + "grad_norm": 0.05411989278901109, + "language_loss": 0.88122302, + "learning_rate": 0.000968032934407616, + "loss": 0.89222693, + "num_input_tokens_seen": 60996016, + "router_z_loss_mlp": 0.33056641, + "step": 733, + "time_per_iteration": 2.5904436111450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100997, + "balance_loss_mlp": 1.06766593, + "epoch": 0.14120815698345518, + "flos": 595791991296.0, + "grad_norm": 0.06321555834593343, + "language_loss": 0.82077157, + "learning_rate": 0.0009679232352508571, + "loss": 0.83178151, + "num_input_tokens_seen": 61072016, + "router_z_loss_mlp": 0.33349609, + "step": 734, + "time_per_iteration": 2.758493423461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102299, + "balance_loss_mlp": 1.06992185, + "epoch": 0.14140053866871874, + "flos": 534864591360.0, + "grad_norm": 0.05697576898708014, + "language_loss": 0.81442666, + "learning_rate": 0.0009678133544301871, + "loss": 0.82544965, + "num_input_tokens_seen": 61144528, + "router_z_loss_mlp": 0.32373047, + "step": 735, + "time_per_iteration": 2.6508195400238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092208, + "balance_loss_mlp": 1.06006956, + "epoch": 0.1415929203539823, + "flos": 520013606400.0, + "grad_norm": 0.0400187761209974, + "language_loss": 0.91843486, + "learning_rate": 0.0009677032919882658, + "loss": 0.92935699, + "num_input_tokens_seen": 61216960, + "router_z_loss_mlp": 0.32128906, + "step": 736, + "time_per_iteration": 2.705019474029541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100492, + "balance_loss_mlp": 1.06975937, + "epoch": 0.14178530203924586, + "flos": 482095300608.0, + "grad_norm": 0.07179339183341249, + "language_loss": 0.92199683, + "learning_rate": 0.000967593047967823, + "loss": 0.93300164, + "num_input_tokens_seen": 61281312, + "router_z_loss_mlp": 0.30712891, + "step": 737, + "time_per_iteration": 2.55415415763855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109678, + "balance_loss_mlp": 1.07577443, + "epoch": 0.14197768372450942, + "flos": 676339863552.0, + "grad_norm": 0.08640894081958116, + "language_loss": 0.87084705, + "learning_rate": 0.0009674826224116593, + "loss": 0.88194382, + "num_input_tokens_seen": 61355888, + "router_z_loss_mlp": 0.33911133, + "step": 738, + "time_per_iteration": 2.819878101348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097633, + "balance_loss_mlp": 1.06544614, + "epoch": 0.14217006540977298, + "flos": 445802199552.0, + "grad_norm": 0.06953952980021996, + "language_loss": 0.8713401, + "learning_rate": 0.0009673720153626455, + "loss": 0.88231641, + "num_input_tokens_seen": 61424288, + "router_z_loss_mlp": 0.32177734, + "step": 739, + "time_per_iteration": 2.5987422466278076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096281, + "balance_loss_mlp": 1.06385565, + "epoch": 0.14236244709503657, + "flos": 496261016064.0, + "grad_norm": 0.08400230511878481, + "language_loss": 0.87465405, + "learning_rate": 0.0009672612268637235, + "loss": 0.88561684, + "num_input_tokens_seen": 61493344, + "router_z_loss_mlp": 0.32421875, + "step": 740, + "time_per_iteration": 2.6148736476898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098472, + "balance_loss_mlp": 1.06669128, + "epoch": 0.14255482878030012, + "flos": 648022989312.0, + "grad_norm": 0.0806935070673247, + "language_loss": 0.846753, + "learning_rate": 0.0009671502569579048, + "loss": 0.85773772, + "num_input_tokens_seen": 61565216, + "router_z_loss_mlp": 0.31762695, + "step": 741, + "time_per_iteration": 2.7533769607543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089599, + "balance_loss_mlp": 1.05774641, + "epoch": 0.14274721046556368, + "flos": 535888894464.0, + "grad_norm": 0.06572551706098649, + "language_loss": 0.90748239, + "learning_rate": 0.0009670391056882719, + "loss": 0.91837835, + "num_input_tokens_seen": 61640928, + "router_z_loss_mlp": 0.31835938, + "step": 742, + "time_per_iteration": 2.698690176010132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088511, + "balance_loss_mlp": 1.0565629, + "epoch": 0.14293959215082724, + "flos": 956677215744.0, + "grad_norm": 0.07291469749344824, + "language_loss": 0.89417249, + "learning_rate": 0.0009669277730979776, + "loss": 0.90505755, + "num_input_tokens_seen": 61717552, + "router_z_loss_mlp": 0.31958008, + "step": 743, + "time_per_iteration": 3.1728732585906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108649, + "balance_loss_mlp": 1.05408931, + "epoch": 0.1431319738360908, + "flos": 692766590976.0, + "grad_norm": 0.06693583917292938, + "language_loss": 0.85588205, + "learning_rate": 0.0009668162592302449, + "loss": 0.86674696, + "num_input_tokens_seen": 61800016, + "router_z_loss_mlp": 0.32397461, + "step": 744, + "time_per_iteration": 2.896467685699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099426, + "balance_loss_mlp": 1.06673896, + "epoch": 0.14332435552135436, + "flos": 565174817280.0, + "grad_norm": 0.0717564206721674, + "language_loss": 0.86683381, + "learning_rate": 0.0009667045641283676, + "loss": 0.877828, + "num_input_tokens_seen": 61865904, + "router_z_loss_mlp": 0.3269043, + "step": 745, + "time_per_iteration": 2.6326427459716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095955, + "balance_loss_mlp": 1.06336319, + "epoch": 0.14351673720661792, + "flos": 738045665280.0, + "grad_norm": 0.07083856064802352, + "language_loss": 0.95545924, + "learning_rate": 0.0009665926878357092, + "loss": 0.96641874, + "num_input_tokens_seen": 61945728, + "router_z_loss_mlp": 0.32592773, + "step": 746, + "time_per_iteration": 2.902628183364868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108393, + "balance_loss_mlp": 1.07565856, + "epoch": 0.14370911889188148, + "flos": 548951731200.0, + "grad_norm": 0.08672542857876225, + "language_loss": 0.91510898, + "learning_rate": 0.0009664806303957043, + "loss": 0.92619288, + "num_input_tokens_seen": 62016288, + "router_z_loss_mlp": 0.32714844, + "step": 747, + "time_per_iteration": 2.678656578063965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107271, + "balance_loss_mlp": 1.07448816, + "epoch": 0.14390150057714507, + "flos": 589973469696.0, + "grad_norm": 0.06575006445724518, + "language_loss": 0.87633115, + "learning_rate": 0.0009663683918518571, + "loss": 0.88740385, + "num_input_tokens_seen": 62097904, + "router_z_loss_mlp": 0.32788086, + "step": 748, + "time_per_iteration": 2.894339084625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116744, + "balance_loss_mlp": 1.08226848, + "epoch": 0.14409388226240863, + "flos": 590773782528.0, + "grad_norm": 0.06412555003569581, + "language_loss": 0.86334193, + "learning_rate": 0.0009662559722477428, + "loss": 0.87450933, + "num_input_tokens_seen": 62166736, + "router_z_loss_mlp": 0.3449707, + "step": 749, + "time_per_iteration": 2.6673357486724854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116866, + "balance_loss_mlp": 1.15397346, + "epoch": 0.1442862639476722, + "flos": 1510418479104.0, + "grad_norm": 0.05654081816866197, + "language_loss": 0.7616297, + "learning_rate": 0.0009661433716270062, + "loss": 0.77331638, + "num_input_tokens_seen": 62402512, + "router_z_loss_mlp": 0.14648438, + "step": 750, + "time_per_iteration": 4.97744607925415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111461, + "balance_loss_mlp": 1.0782733, + "epoch": 0.14447864563293575, + "flos": 496493770752.0, + "grad_norm": 0.05840496998451829, + "language_loss": 0.89989787, + "learning_rate": 0.0009660305900333632, + "loss": 0.91101241, + "num_input_tokens_seen": 62473408, + "router_z_loss_mlp": 0.33203125, + "step": 751, + "time_per_iteration": 2.6919631958007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108129, + "balance_loss_mlp": 1.07513142, + "epoch": 0.1446710273181993, + "flos": 589400271360.0, + "grad_norm": 0.0663289310880325, + "language_loss": 0.83084202, + "learning_rate": 0.0009659176275105992, + "loss": 0.8419233, + "num_input_tokens_seen": 62547440, + "router_z_loss_mlp": 0.33007812, + "step": 752, + "time_per_iteration": 2.702003240585327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097403, + "balance_loss_mlp": 1.0634284, + "epoch": 0.14486340900346287, + "flos": 585521256960.0, + "grad_norm": 0.05748666507804042, + "language_loss": 0.86628646, + "learning_rate": 0.0009658044841025701, + "loss": 0.87726045, + "num_input_tokens_seen": 62620224, + "router_z_loss_mlp": 0.34008789, + "step": 753, + "time_per_iteration": 2.7666702270507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106626, + "balance_loss_mlp": 1.07114923, + "epoch": 0.14505579068872643, + "flos": 504405978624.0, + "grad_norm": 0.07320865998852653, + "language_loss": 0.81996346, + "learning_rate": 0.0009656911598532021, + "loss": 0.83102977, + "num_input_tokens_seen": 62690464, + "router_z_loss_mlp": 0.35498047, + "step": 754, + "time_per_iteration": 2.6273839473724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094053, + "balance_loss_mlp": 1.05936301, + "epoch": 0.14524817237399, + "flos": 486566452224.0, + "grad_norm": 0.05776902712696923, + "language_loss": 0.90229332, + "learning_rate": 0.0009655776548064917, + "loss": 0.91323388, + "num_input_tokens_seen": 62762240, + "router_z_loss_mlp": 0.34667969, + "step": 755, + "time_per_iteration": 2.6639461517333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092368, + "balance_loss_mlp": 1.05867922, + "epoch": 0.14544055405925355, + "flos": 727857888768.0, + "grad_norm": 0.059694446461720084, + "language_loss": 0.88762641, + "learning_rate": 0.0009654639690065054, + "loss": 0.89855003, + "num_input_tokens_seen": 62839760, + "router_z_loss_mlp": 0.33691406, + "step": 756, + "time_per_iteration": 2.881164789199829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092737, + "balance_loss_mlp": 1.05981112, + "epoch": 0.14563293574451713, + "flos": 593359271424.0, + "grad_norm": 0.0719411984245977, + "language_loss": 0.88362074, + "learning_rate": 0.00096535010249738, + "loss": 0.89454818, + "num_input_tokens_seen": 62910336, + "router_z_loss_mlp": 0.3293457, + "step": 757, + "time_per_iteration": 2.703355312347412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092325, + "balance_loss_mlp": 1.05925632, + "epoch": 0.1458253174297807, + "flos": 560192924160.0, + "grad_norm": 0.09095988428785044, + "language_loss": 0.8300786, + "learning_rate": 0.0009652360553233224, + "loss": 0.84100187, + "num_input_tokens_seen": 62988160, + "router_z_loss_mlp": 0.33081055, + "step": 758, + "time_per_iteration": 2.7321062088012695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062711, + "balance_loss_mlp": 1.04821551, + "epoch": 0.14601769911504425, + "flos": 1557025984512.0, + "grad_norm": 0.03493248396843453, + "language_loss": 0.73773748, + "learning_rate": 0.0009651218275286093, + "loss": 0.74836457, + "num_input_tokens_seen": 63224704, + "router_z_loss_mlp": 0.14453125, + "step": 759, + "time_per_iteration": 4.917184591293335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099151, + "balance_loss_mlp": 1.06605887, + "epoch": 0.1462100808003078, + "flos": 865922628096.0, + "grad_norm": 0.05465610046720203, + "language_loss": 0.8166393, + "learning_rate": 0.0009650074191575883, + "loss": 0.82763088, + "num_input_tokens_seen": 63312400, + "router_z_loss_mlp": 0.33105469, + "step": 760, + "time_per_iteration": 3.2009472846984863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097005, + "balance_loss_mlp": 1.06341171, + "epoch": 0.14640246248557137, + "flos": 522673288704.0, + "grad_norm": 0.07890258703475667, + "language_loss": 0.86329532, + "learning_rate": 0.0009648928302546766, + "loss": 0.87426543, + "num_input_tokens_seen": 63387792, + "router_z_loss_mlp": 0.3359375, + "step": 761, + "time_per_iteration": 2.6858482360839844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087335, + "balance_loss_mlp": 1.05340791, + "epoch": 0.14659484417083493, + "flos": 1030121805312.0, + "grad_norm": 0.05505233607608704, + "language_loss": 0.8584463, + "learning_rate": 0.0009647780608643613, + "loss": 0.86931968, + "num_input_tokens_seen": 63475632, + "router_z_loss_mlp": 0.33935547, + "step": 762, + "time_per_iteration": 3.3784618377685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087006, + "balance_loss_mlp": 1.05365133, + "epoch": 0.1467872258560985, + "flos": 500426629632.0, + "grad_norm": 0.083565321416964, + "language_loss": 0.88299912, + "learning_rate": 0.0009646631110312001, + "loss": 0.89386916, + "num_input_tokens_seen": 63546080, + "router_z_loss_mlp": 0.33349609, + "step": 763, + "time_per_iteration": 2.642038345336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096574, + "balance_loss_mlp": 1.06465006, + "epoch": 0.14697960754136205, + "flos": 547514201088.0, + "grad_norm": 0.05646167170610495, + "language_loss": 0.88908124, + "learning_rate": 0.0009645479807998203, + "loss": 0.900047, + "num_input_tokens_seen": 63622464, + "router_z_loss_mlp": 0.3190918, + "step": 764, + "time_per_iteration": 2.7709102630615234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093321, + "balance_loss_mlp": 1.0614922, + "epoch": 0.14717198922662564, + "flos": 517586678784.0, + "grad_norm": 0.06731397985108602, + "language_loss": 0.93233657, + "learning_rate": 0.0009644326702149196, + "loss": 0.94326979, + "num_input_tokens_seen": 63694736, + "router_z_loss_mlp": 0.31811523, + "step": 765, + "time_per_iteration": 2.691761016845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098194, + "balance_loss_mlp": 1.06472015, + "epoch": 0.1473643709118892, + "flos": 731661147648.0, + "grad_norm": 0.08664060064789567, + "language_loss": 0.85604531, + "learning_rate": 0.0009643171793212653, + "loss": 0.86702728, + "num_input_tokens_seen": 63779072, + "router_z_loss_mlp": 0.33496094, + "step": 766, + "time_per_iteration": 3.0578510761260986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095374, + "balance_loss_mlp": 1.06190002, + "epoch": 0.14755675259715276, + "flos": 620257554432.0, + "grad_norm": 0.06875066800131625, + "language_loss": 0.90379435, + "learning_rate": 0.0009642015081636952, + "loss": 0.91474807, + "num_input_tokens_seen": 63847472, + "router_z_loss_mlp": 0.33496094, + "step": 767, + "time_per_iteration": 2.690892219543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091039, + "balance_loss_mlp": 1.05830407, + "epoch": 0.14774913428241632, + "flos": 451981513728.0, + "grad_norm": 0.06617868208271054, + "language_loss": 0.88812423, + "learning_rate": 0.0009640856567871166, + "loss": 0.89903462, + "num_input_tokens_seen": 63912496, + "router_z_loss_mlp": 0.32714844, + "step": 768, + "time_per_iteration": 2.5108768939971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086849, + "balance_loss_mlp": 1.05316067, + "epoch": 0.14794151596767988, + "flos": 836881196544.0, + "grad_norm": 0.06813910901976611, + "language_loss": 0.89643073, + "learning_rate": 0.0009639696252365072, + "loss": 0.90729922, + "num_input_tokens_seen": 63990832, + "router_z_loss_mlp": 0.33691406, + "step": 769, + "time_per_iteration": 3.036872386932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087914, + "balance_loss_mlp": 1.05546558, + "epoch": 0.14813389765294344, + "flos": 685765204992.0, + "grad_norm": 0.06952898718112278, + "language_loss": 0.82433641, + "learning_rate": 0.0009638534135569144, + "loss": 0.83521557, + "num_input_tokens_seen": 64067552, + "router_z_loss_mlp": 0.32446289, + "step": 770, + "time_per_iteration": 2.920228958129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096521, + "balance_loss_mlp": 1.06395316, + "epoch": 0.148326279338207, + "flos": 509625008640.0, + "grad_norm": 0.05850145176667806, + "language_loss": 0.90417981, + "learning_rate": 0.0009637370217934554, + "loss": 0.91514498, + "num_input_tokens_seen": 64140336, + "router_z_loss_mlp": 0.32568359, + "step": 771, + "time_per_iteration": 2.6692943572998047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094476, + "balance_loss_mlp": 1.0624088, + "epoch": 0.14851866102347056, + "flos": 587869608960.0, + "grad_norm": 0.06374792966079154, + "language_loss": 0.83362675, + "learning_rate": 0.0009636204499913175, + "loss": 0.84457153, + "num_input_tokens_seen": 64223472, + "router_z_loss_mlp": 0.32055664, + "step": 772, + "time_per_iteration": 2.9103784561157227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101595, + "balance_loss_mlp": 1.07129157, + "epoch": 0.14871104270873411, + "flos": 690722366976.0, + "grad_norm": 0.05784692032564958, + "language_loss": 0.8891257, + "learning_rate": 0.0009635036981957581, + "loss": 0.90014172, + "num_input_tokens_seen": 64299872, + "router_z_loss_mlp": 0.30273438, + "step": 773, + "time_per_iteration": 2.840233087539673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109331, + "balance_loss_mlp": 1.06112361, + "epoch": 0.1489034243939977, + "flos": 654803205120.0, + "grad_norm": 0.06091674471201955, + "language_loss": 0.9126395, + "learning_rate": 0.0009633867664521043, + "loss": 0.9235726, + "num_input_tokens_seen": 64377152, + "router_z_loss_mlp": 0.32202148, + "step": 774, + "time_per_iteration": 2.8467912673950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098219, + "balance_loss_mlp": 1.0643878, + "epoch": 0.14909580607926126, + "flos": 475595891712.0, + "grad_norm": 0.06395321005815084, + "language_loss": 0.87366414, + "learning_rate": 0.0009632696548057527, + "loss": 0.8846463, + "num_input_tokens_seen": 64443008, + "router_z_loss_mlp": 0.33862305, + "step": 775, + "time_per_iteration": 2.55267596244812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090534, + "balance_loss_mlp": 1.05729866, + "epoch": 0.14928818776452482, + "flos": 610789953024.0, + "grad_norm": 0.07257335679926562, + "language_loss": 0.85489643, + "learning_rate": 0.0009631523633021704, + "loss": 0.86580181, + "num_input_tokens_seen": 64519776, + "router_z_loss_mlp": 0.33251953, + "step": 776, + "time_per_iteration": 2.800656795501709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090063, + "balance_loss_mlp": 1.05694628, + "epoch": 0.14948056944978838, + "flos": 561487859712.0, + "grad_norm": 0.058446141184189525, + "language_loss": 0.88943005, + "learning_rate": 0.0009630348919868936, + "loss": 0.90033066, + "num_input_tokens_seen": 64593712, + "router_z_loss_mlp": 0.33129883, + "step": 777, + "time_per_iteration": 2.7306644916534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088502, + "balance_loss_mlp": 1.05397916, + "epoch": 0.14967295113505194, + "flos": 448972623360.0, + "grad_norm": 0.08136314957760014, + "language_loss": 0.81536144, + "learning_rate": 0.0009629172409055293, + "loss": 0.82624644, + "num_input_tokens_seen": 64658448, + "router_z_loss_mlp": 0.34545898, + "step": 778, + "time_per_iteration": 2.532480239868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091534, + "balance_loss_mlp": 1.05937171, + "epoch": 0.1498653328203155, + "flos": 571000541184.0, + "grad_norm": 0.06865521140792329, + "language_loss": 0.88039231, + "learning_rate": 0.0009627994101037531, + "loss": 0.89130771, + "num_input_tokens_seen": 64734144, + "router_z_loss_mlp": 0.3215332, + "step": 779, + "time_per_iteration": 2.7336056232452393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091586, + "balance_loss_mlp": 1.05811191, + "epoch": 0.15005771450557906, + "flos": 630918194688.0, + "grad_norm": 0.06277485509918372, + "language_loss": 0.8981787, + "learning_rate": 0.0009626813996273114, + "loss": 0.90909451, + "num_input_tokens_seen": 64813456, + "router_z_loss_mlp": 0.3347168, + "step": 780, + "time_per_iteration": 2.8651859760284424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092654, + "balance_loss_mlp": 1.06018162, + "epoch": 0.15025009619084262, + "flos": 577633780224.0, + "grad_norm": 0.06737111741199381, + "language_loss": 0.89359641, + "learning_rate": 0.0009625632095220198, + "loss": 0.90452296, + "num_input_tokens_seen": 64896816, + "router_z_loss_mlp": 0.32470703, + "step": 781, + "time_per_iteration": 2.910163640975952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093984, + "balance_loss_mlp": 1.06041455, + "epoch": 0.1504424778761062, + "flos": 483646311936.0, + "grad_norm": 0.06188715182302237, + "language_loss": 0.87568116, + "learning_rate": 0.0009624448398337637, + "loss": 0.88662094, + "num_input_tokens_seen": 64964176, + "router_z_loss_mlp": 0.3359375, + "step": 782, + "time_per_iteration": 2.532055616378784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100397, + "balance_loss_mlp": 1.06751907, + "epoch": 0.15063485956136977, + "flos": 762167812608.0, + "grad_norm": 0.06229794960735175, + "language_loss": 0.89905757, + "learning_rate": 0.0009623262906084984, + "loss": 0.91006154, + "num_input_tokens_seen": 65042592, + "router_z_loss_mlp": 0.32861328, + "step": 783, + "time_per_iteration": 2.9851605892181396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104855, + "balance_loss_mlp": 1.0712378, + "epoch": 0.15082724124663333, + "flos": 497369687040.0, + "grad_norm": 0.060596744514248076, + "language_loss": 0.90796679, + "learning_rate": 0.0009622075618922486, + "loss": 0.91901541, + "num_input_tokens_seen": 65114576, + "router_z_loss_mlp": 0.33642578, + "step": 784, + "time_per_iteration": 2.6796786785125732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102349, + "balance_loss_mlp": 1.06928015, + "epoch": 0.15101962293189689, + "flos": 509476621824.0, + "grad_norm": 0.06389342174673626, + "language_loss": 0.87423813, + "learning_rate": 0.0009620886537311091, + "loss": 0.8852616, + "num_input_tokens_seen": 65186640, + "router_z_loss_mlp": 0.33081055, + "step": 785, + "time_per_iteration": 2.6153056621551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113163, + "balance_loss_mlp": 1.07685184, + "epoch": 0.15121200461716044, + "flos": 457520638464.0, + "grad_norm": 0.06793935281312648, + "language_loss": 0.85492945, + "learning_rate": 0.000961969566171244, + "loss": 0.86606109, + "num_input_tokens_seen": 65252112, + "router_z_loss_mlp": 0.36303711, + "step": 786, + "time_per_iteration": 2.506267786026001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122272, + "balance_loss_mlp": 1.08703363, + "epoch": 0.151404386302424, + "flos": 537729477120.0, + "grad_norm": 0.0670602351843582, + "language_loss": 0.90370345, + "learning_rate": 0.0009618502992588873, + "loss": 0.91492617, + "num_input_tokens_seen": 65318912, + "router_z_loss_mlp": 0.35253906, + "step": 787, + "time_per_iteration": 2.623457670211792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141844, + "balance_loss_mlp": 1.10658193, + "epoch": 0.15159676798768756, + "flos": 687858891264.0, + "grad_norm": 0.06543467559167064, + "language_loss": 0.88581872, + "learning_rate": 0.0009617308530403424, + "loss": 0.89723718, + "num_input_tokens_seen": 65395424, + "router_z_loss_mlp": 0.35302734, + "step": 788, + "time_per_iteration": 2.975861072540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149381, + "balance_loss_mlp": 1.11371326, + "epoch": 0.15178914967295112, + "flos": 545042193408.0, + "grad_norm": 0.059566397417978756, + "language_loss": 0.87806541, + "learning_rate": 0.0009616112275619825, + "loss": 0.88955921, + "num_input_tokens_seen": 65470480, + "router_z_loss_mlp": 0.35668945, + "step": 789, + "time_per_iteration": 2.683262348175049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152452, + "balance_loss_mlp": 1.1169517, + "epoch": 0.1519815313582147, + "flos": 511510671360.0, + "grad_norm": 0.05728483560240697, + "language_loss": 0.84466863, + "learning_rate": 0.0009614914228702503, + "loss": 0.85619313, + "num_input_tokens_seen": 65544720, + "router_z_loss_mlp": 0.35498047, + "step": 790, + "time_per_iteration": 2.6616339683532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142719, + "balance_loss_mlp": 1.10850596, + "epoch": 0.15217391304347827, + "flos": 683747122176.0, + "grad_norm": 0.057799273493116435, + "language_loss": 0.89279461, + "learning_rate": 0.0009613714390116581, + "loss": 0.90422177, + "num_input_tokens_seen": 65627872, + "router_z_loss_mlp": 0.34204102, + "step": 791, + "time_per_iteration": 2.947608470916748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133841, + "balance_loss_mlp": 1.0997231, + "epoch": 0.15236629472874183, + "flos": 643873342464.0, + "grad_norm": 0.06413295296627212, + "language_loss": 0.86589968, + "learning_rate": 0.0009612512760327879, + "loss": 0.87723809, + "num_input_tokens_seen": 65705264, + "router_z_loss_mlp": 0.34155273, + "step": 792, + "time_per_iteration": 2.8261189460754395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124449, + "balance_loss_mlp": 1.08727932, + "epoch": 0.1525586764140054, + "flos": 412654791168.0, + "grad_norm": 0.06095846853214657, + "language_loss": 0.85749042, + "learning_rate": 0.0009611309339802909, + "loss": 0.86873484, + "num_input_tokens_seen": 65768592, + "router_z_loss_mlp": 0.37182617, + "step": 793, + "time_per_iteration": 2.438474178314209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113041, + "balance_loss_mlp": 1.07811236, + "epoch": 0.15275105809926895, + "flos": 802444644864.0, + "grad_norm": 0.04691390558901254, + "language_loss": 0.84620011, + "learning_rate": 0.0009610104129008881, + "loss": 0.85733056, + "num_input_tokens_seen": 65852432, + "router_z_loss_mlp": 0.34985352, + "step": 794, + "time_per_iteration": 3.1149892807006836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092099, + "balance_loss_mlp": 1.05786228, + "epoch": 0.1529434397845325, + "flos": 612143115264.0, + "grad_norm": 0.06446455819394356, + "language_loss": 0.88995111, + "learning_rate": 0.0009608897128413701, + "loss": 0.90087205, + "num_input_tokens_seen": 65927904, + "router_z_loss_mlp": 0.3425293, + "step": 795, + "time_per_iteration": 2.7310965061187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085841, + "balance_loss_mlp": 1.05160367, + "epoch": 0.15313582146979607, + "flos": 614941009920.0, + "grad_norm": 0.04580320827636504, + "language_loss": 0.8595438, + "learning_rate": 0.0009607688338485965, + "loss": 0.87040222, + "num_input_tokens_seen": 66006800, + "router_z_loss_mlp": 0.3425293, + "step": 796, + "time_per_iteration": 2.8534584045410156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088916, + "balance_loss_mlp": 1.05427384, + "epoch": 0.15332820315505963, + "flos": 793256440320.0, + "grad_norm": 0.053101967265095064, + "language_loss": 0.91128695, + "learning_rate": 0.0009606477759694969, + "loss": 0.92217612, + "num_input_tokens_seen": 66088608, + "router_z_loss_mlp": 0.34643555, + "step": 797, + "time_per_iteration": 3.0544466972351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098724, + "balance_loss_mlp": 1.06441545, + "epoch": 0.1535205848403232, + "flos": 549945510912.0, + "grad_norm": 0.0662794157411924, + "language_loss": 0.87591946, + "learning_rate": 0.0009605265392510703, + "loss": 0.88690674, + "num_input_tokens_seen": 66153616, + "router_z_loss_mlp": 0.34350586, + "step": 798, + "time_per_iteration": 2.6120660305023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011091, + "balance_loss_mlp": 1.07417202, + "epoch": 0.15371296652558677, + "flos": 535691045376.0, + "grad_norm": 0.07220239734969772, + "language_loss": 0.92342889, + "learning_rate": 0.0009604051237403846, + "loss": 0.93451989, + "num_input_tokens_seen": 66219472, + "router_z_loss_mlp": 0.34960938, + "step": 799, + "time_per_iteration": 2.640749216079712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111728, + "balance_loss_mlp": 1.07808757, + "epoch": 0.15390534821085033, + "flos": 395002939392.0, + "grad_norm": 0.06314402273456009, + "language_loss": 0.86126584, + "learning_rate": 0.0009602835294845776, + "loss": 0.87238312, + "num_input_tokens_seen": 66281456, + "router_z_loss_mlp": 0.33666992, + "step": 800, + "time_per_iteration": 2.44914174079895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117351, + "balance_loss_mlp": 1.08254242, + "epoch": 0.1540977298961139, + "flos": 535587738624.0, + "grad_norm": 0.057636094576239, + "language_loss": 0.91100746, + "learning_rate": 0.0009601617565308565, + "loss": 0.92218101, + "num_input_tokens_seen": 66348160, + "router_z_loss_mlp": 0.34790039, + "step": 801, + "time_per_iteration": 2.599679470062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119062, + "balance_loss_mlp": 1.08511138, + "epoch": 0.15429011158137745, + "flos": 723388147200.0, + "grad_norm": 0.05961266019354579, + "language_loss": 0.86783326, + "learning_rate": 0.0009600398049264977, + "loss": 0.87902391, + "num_input_tokens_seen": 66430576, + "router_z_loss_mlp": 0.33935547, + "step": 802, + "time_per_iteration": 2.9514007568359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121787, + "balance_loss_mlp": 1.08735943, + "epoch": 0.154482493266641, + "flos": 620209502208.0, + "grad_norm": 0.06366105456569557, + "language_loss": 0.92098475, + "learning_rate": 0.0009599176747188469, + "loss": 0.9322027, + "num_input_tokens_seen": 66506480, + "router_z_loss_mlp": 0.34448242, + "step": 803, + "time_per_iteration": 2.8068411350250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114024, + "balance_loss_mlp": 1.08012128, + "epoch": 0.15467487495190457, + "flos": 525351909888.0, + "grad_norm": 0.08101366702111423, + "language_loss": 0.83651662, + "learning_rate": 0.0009597953659553196, + "loss": 0.84765685, + "num_input_tokens_seen": 66577680, + "router_z_loss_mlp": 0.33911133, + "step": 804, + "time_per_iteration": 2.7075448036193848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098377, + "balance_loss_mlp": 1.06616712, + "epoch": 0.15486725663716813, + "flos": 527473299456.0, + "grad_norm": 0.07377431927286832, + "language_loss": 0.89624304, + "learning_rate": 0.0009596728786833997, + "loss": 0.90722686, + "num_input_tokens_seen": 66648496, + "router_z_loss_mlp": 0.32202148, + "step": 805, + "time_per_iteration": 2.6376051902770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088085, + "balance_loss_mlp": 1.05420554, + "epoch": 0.1550596383224317, + "flos": 1048118482944.0, + "grad_norm": 0.06708822771662253, + "language_loss": 0.90018022, + "learning_rate": 0.0009595502129506415, + "loss": 0.91106105, + "num_input_tokens_seen": 66735216, + "router_z_loss_mlp": 0.33911133, + "step": 806, + "time_per_iteration": 3.3391284942626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092582, + "balance_loss_mlp": 1.05903625, + "epoch": 0.15525202000769528, + "flos": 613438050816.0, + "grad_norm": 0.06052700763637142, + "language_loss": 0.83084035, + "learning_rate": 0.0009594273688046678, + "loss": 0.84176612, + "num_input_tokens_seen": 66810672, + "router_z_loss_mlp": 0.33544922, + "step": 807, + "time_per_iteration": 2.7136006355285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088184, + "balance_loss_mlp": 1.05273128, + "epoch": 0.15544440169295884, + "flos": 532805810688.0, + "grad_norm": 0.07048562468234597, + "language_loss": 0.86048424, + "learning_rate": 0.000959304346293171, + "loss": 0.87136608, + "num_input_tokens_seen": 66879824, + "router_z_loss_mlp": 0.35473633, + "step": 808, + "time_per_iteration": 2.6744906902313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097573, + "balance_loss_mlp": 1.06254935, + "epoch": 0.1556367833782224, + "flos": 644433546240.0, + "grad_norm": 0.06803397985071584, + "language_loss": 0.88331544, + "learning_rate": 0.0009591811454639125, + "loss": 0.89429116, + "num_input_tokens_seen": 66949424, + "router_z_loss_mlp": 0.3503418, + "step": 809, + "time_per_iteration": 2.730431079864502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094476, + "balance_loss_mlp": 1.0610261, + "epoch": 0.15582916506348596, + "flos": 543540644352.0, + "grad_norm": 0.06204685505428811, + "language_loss": 0.88227659, + "learning_rate": 0.0009590577663647234, + "loss": 0.89322132, + "num_input_tokens_seen": 67024000, + "router_z_loss_mlp": 0.3347168, + "step": 810, + "time_per_iteration": 2.71469783782959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104684, + "balance_loss_mlp": 1.07078123, + "epoch": 0.15602154674874952, + "flos": 579740613120.0, + "grad_norm": 0.05672341894910533, + "language_loss": 0.86610442, + "learning_rate": 0.0009589342090435036, + "loss": 0.87715125, + "num_input_tokens_seen": 67100672, + "router_z_loss_mlp": 0.33935547, + "step": 811, + "time_per_iteration": 2.799246072769165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110387, + "balance_loss_mlp": 1.06918025, + "epoch": 0.15621392843401308, + "flos": 534982454784.0, + "grad_norm": 0.0647852675732537, + "language_loss": 0.87778354, + "learning_rate": 0.0009588104735482223, + "loss": 0.8888222, + "num_input_tokens_seen": 67171584, + "router_z_loss_mlp": 0.34692383, + "step": 812, + "time_per_iteration": 2.6684510707855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126921, + "balance_loss_mlp": 1.09106326, + "epoch": 0.15640631011927664, + "flos": 550635162624.0, + "grad_norm": 0.08222618986335321, + "language_loss": 0.84280443, + "learning_rate": 0.0009586865599269177, + "loss": 0.85407358, + "num_input_tokens_seen": 67240640, + "router_z_loss_mlp": 0.35864258, + "step": 813, + "time_per_iteration": 2.6293816566467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131277, + "balance_loss_mlp": 1.09651566, + "epoch": 0.1565986918045402, + "flos": 637190641152.0, + "grad_norm": 0.05945515562529824, + "language_loss": 0.88725412, + "learning_rate": 0.0009585624682276977, + "loss": 0.89856684, + "num_input_tokens_seen": 67312976, + "router_z_loss_mlp": 0.34814453, + "step": 814, + "time_per_iteration": 2.744253158569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137563, + "balance_loss_mlp": 1.10113239, + "epoch": 0.15679107348980378, + "flos": 490569122304.0, + "grad_norm": 0.09591637295165127, + "language_loss": 0.87945771, + "learning_rate": 0.0009584381984987386, + "loss": 0.89083332, + "num_input_tokens_seen": 67378528, + "router_z_loss_mlp": 0.36474609, + "step": 815, + "time_per_iteration": 2.5264036655426025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124613, + "balance_loss_mlp": 1.0911386, + "epoch": 0.15698345517506734, + "flos": 529689231360.0, + "grad_norm": 0.05838460881618622, + "language_loss": 0.90277314, + "learning_rate": 0.0009583137507882864, + "loss": 0.91401929, + "num_input_tokens_seen": 67449728, + "router_z_loss_mlp": 0.3347168, + "step": 816, + "time_per_iteration": 2.6488330364227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109898, + "balance_loss_mlp": 1.07418323, + "epoch": 0.1571758368603309, + "flos": 545779897344.0, + "grad_norm": 0.07313796537718548, + "language_loss": 0.81262791, + "learning_rate": 0.000958189125144656, + "loss": 0.82372689, + "num_input_tokens_seen": 67520512, + "router_z_loss_mlp": 0.35766602, + "step": 817, + "time_per_iteration": 2.7040657997131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101746, + "balance_loss_mlp": 1.06672239, + "epoch": 0.15736821854559446, + "flos": 565377048576.0, + "grad_norm": 0.067694528538076, + "language_loss": 0.88558215, + "learning_rate": 0.0009580643216162313, + "loss": 0.89659959, + "num_input_tokens_seen": 67592464, + "router_z_loss_mlp": 0.3503418, + "step": 818, + "time_per_iteration": 2.6538634300231934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096156, + "balance_loss_mlp": 1.06110835, + "epoch": 0.15756060023085802, + "flos": 500707436544.0, + "grad_norm": 0.05957146674366314, + "language_loss": 0.79884583, + "learning_rate": 0.0009579393402514652, + "loss": 0.80980736, + "num_input_tokens_seen": 67658928, + "router_z_loss_mlp": 0.35107422, + "step": 819, + "time_per_iteration": 2.5606625080108643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082975, + "balance_loss_mlp": 1.048738, + "epoch": 0.15775298191612158, + "flos": 519014034432.0, + "grad_norm": 0.06194437160070725, + "language_loss": 0.91126758, + "learning_rate": 0.0009578141810988801, + "loss": 0.92209733, + "num_input_tokens_seen": 67727936, + "router_z_loss_mlp": 0.34228516, + "step": 820, + "time_per_iteration": 2.55538010597229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082194, + "balance_loss_mlp": 1.04712272, + "epoch": 0.15794536360138514, + "flos": 465891153408.0, + "grad_norm": 0.060184436438788555, + "language_loss": 0.91010749, + "learning_rate": 0.0009576888442070668, + "loss": 0.92092943, + "num_input_tokens_seen": 67795488, + "router_z_loss_mlp": 0.35083008, + "step": 821, + "time_per_iteration": 2.6139276027679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094225, + "balance_loss_mlp": 1.05982161, + "epoch": 0.1581377452866487, + "flos": 516911583744.0, + "grad_norm": 0.06832586535724347, + "language_loss": 0.92820144, + "learning_rate": 0.0009575633296246854, + "loss": 0.93914366, + "num_input_tokens_seen": 67858896, + "router_z_loss_mlp": 0.34423828, + "step": 822, + "time_per_iteration": 2.557404041290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096972, + "balance_loss_mlp": 1.06242526, + "epoch": 0.15833012697191226, + "flos": 549522109440.0, + "grad_norm": 0.06257557491721027, + "language_loss": 0.83520567, + "learning_rate": 0.0009574376374004652, + "loss": 0.84617537, + "num_input_tokens_seen": 67924864, + "router_z_loss_mlp": 0.34570312, + "step": 823, + "time_per_iteration": 2.673220157623291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100077, + "balance_loss_mlp": 1.06395626, + "epoch": 0.15852250865717585, + "flos": 487206641664.0, + "grad_norm": 0.07116075590187526, + "language_loss": 0.81073487, + "learning_rate": 0.000957311767583204, + "loss": 0.82173562, + "num_input_tokens_seen": 67992912, + "router_z_loss_mlp": 0.36132812, + "step": 824, + "time_per_iteration": 2.605074882507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159484, + "balance_loss_mlp": 1.14126849, + "epoch": 0.1587148903424394, + "flos": 1309041672192.0, + "grad_norm": 0.051809649393169656, + "language_loss": 0.8207159, + "learning_rate": 0.0009571857202217691, + "loss": 0.83231074, + "num_input_tokens_seen": 68207408, + "router_z_loss_mlp": 0.18261719, + "step": 825, + "time_per_iteration": 4.726073265075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111019, + "balance_loss_mlp": 1.07349157, + "epoch": 0.15890727202770297, + "flos": 466634649600.0, + "grad_norm": 0.07947222616221912, + "language_loss": 0.92132723, + "learning_rate": 0.0009570594953650961, + "loss": 0.93243748, + "num_input_tokens_seen": 68270864, + "router_z_loss_mlp": 0.37524414, + "step": 826, + "time_per_iteration": 2.5146830081939697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109067, + "balance_loss_mlp": 1.07208848, + "epoch": 0.15909965371296653, + "flos": 776733608448.0, + "grad_norm": 0.06013225990958685, + "language_loss": 0.80852252, + "learning_rate": 0.00095693309306219, + "loss": 0.81961316, + "num_input_tokens_seen": 68355408, + "router_z_loss_mlp": 0.36962891, + "step": 827, + "time_per_iteration": 3.095632553100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117102, + "balance_loss_mlp": 1.07945621, + "epoch": 0.1592920353982301, + "flos": 1077852538368.0, + "grad_norm": 0.05984978885312211, + "language_loss": 0.88600951, + "learning_rate": 0.0009568065133621244, + "loss": 0.89718056, + "num_input_tokens_seen": 68437072, + "router_z_loss_mlp": 0.37646484, + "step": 828, + "time_per_iteration": 3.3153574466705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111269, + "balance_loss_mlp": 1.07584, + "epoch": 0.15948441708349365, + "flos": 725307305472.0, + "grad_norm": 0.0632864692280333, + "language_loss": 0.85493571, + "learning_rate": 0.0009566797563140422, + "loss": 0.86604846, + "num_input_tokens_seen": 68511696, + "router_z_loss_mlp": 0.35449219, + "step": 829, + "time_per_iteration": 2.8705785274505615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121543, + "balance_loss_mlp": 1.08470702, + "epoch": 0.1596767987687572, + "flos": 578447087616.0, + "grad_norm": 0.06433687205870958, + "language_loss": 0.88630873, + "learning_rate": 0.0009565528219671547, + "loss": 0.89752412, + "num_input_tokens_seen": 68587488, + "router_z_loss_mlp": 0.36816406, + "step": 830, + "time_per_iteration": 2.8890771865844727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137333, + "balance_loss_mlp": 1.10049748, + "epoch": 0.15986918045402077, + "flos": 528728947200.0, + "grad_norm": 0.04994246668943954, + "language_loss": 0.85232639, + "learning_rate": 0.0009564257103707418, + "loss": 0.86369967, + "num_input_tokens_seen": 68655760, + "router_z_loss_mlp": 0.36816406, + "step": 831, + "time_per_iteration": 2.5870308876037598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133852, + "balance_loss_mlp": 1.09632492, + "epoch": 0.16006156213928435, + "flos": 574313559552.0, + "grad_norm": 0.0648316290803925, + "language_loss": 0.91675746, + "learning_rate": 0.0009562984215741533, + "loss": 0.92809594, + "num_input_tokens_seen": 68724560, + "router_z_loss_mlp": 0.37524414, + "step": 832, + "time_per_iteration": 2.655066967010498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117496, + "balance_loss_mlp": 1.08170903, + "epoch": 0.1602539438245479, + "flos": 515258675712.0, + "grad_norm": 0.14271195523272245, + "language_loss": 0.82911491, + "learning_rate": 0.0009561709556268065, + "loss": 0.84028995, + "num_input_tokens_seen": 68795440, + "router_z_loss_mlp": 0.35839844, + "step": 833, + "time_per_iteration": 2.69999098777771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119914, + "balance_loss_mlp": 1.08419931, + "epoch": 0.16044632550981147, + "flos": 620730418176.0, + "grad_norm": 0.05962773238435596, + "language_loss": 0.95060706, + "learning_rate": 0.0009560433125781884, + "loss": 0.96180618, + "num_input_tokens_seen": 68868176, + "router_z_loss_mlp": 0.35693359, + "step": 834, + "time_per_iteration": 2.711109161376953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126421, + "balance_loss_mlp": 1.08977628, + "epoch": 0.16063870719507503, + "flos": 560817146880.0, + "grad_norm": 0.06388697234939344, + "language_loss": 0.92829657, + "learning_rate": 0.0009559154924778544, + "loss": 0.93956077, + "num_input_tokens_seen": 68939616, + "router_z_loss_mlp": 0.36621094, + "step": 835, + "time_per_iteration": 2.695260763168335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121916, + "balance_loss_mlp": 1.08789361, + "epoch": 0.1608310888803386, + "flos": 804778440192.0, + "grad_norm": 0.05750453212643973, + "language_loss": 0.85217482, + "learning_rate": 0.0009557874953754284, + "loss": 0.86339402, + "num_input_tokens_seen": 69016192, + "router_z_loss_mlp": 0.34057617, + "step": 836, + "time_per_iteration": 3.002013921737671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126166, + "balance_loss_mlp": 1.09204817, + "epoch": 0.16102347056560215, + "flos": 600311195136.0, + "grad_norm": 0.06332628409766573, + "language_loss": 0.84060842, + "learning_rate": 0.0009556593213206038, + "loss": 0.85187006, + "num_input_tokens_seen": 69089360, + "router_z_loss_mlp": 0.34130859, + "step": 837, + "time_per_iteration": 2.698716163635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125003, + "balance_loss_mlp": 1.09102869, + "epoch": 0.1612158522508657, + "flos": 553235208192.0, + "grad_norm": 0.07524747482874264, + "language_loss": 0.87844718, + "learning_rate": 0.0009555309703631414, + "loss": 0.88969719, + "num_input_tokens_seen": 69161952, + "router_z_loss_mlp": 0.33984375, + "step": 838, + "time_per_iteration": 2.669588327407837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133813, + "balance_loss_mlp": 1.09752607, + "epoch": 0.16140823393612927, + "flos": 555701423616.0, + "grad_norm": 0.07144746672945328, + "language_loss": 0.87685311, + "learning_rate": 0.0009554024425528722, + "loss": 0.88819122, + "num_input_tokens_seen": 69232432, + "router_z_loss_mlp": 0.36279297, + "step": 839, + "time_per_iteration": 2.6809709072113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112028, + "balance_loss_mlp": 1.08737814, + "epoch": 0.16160061562139286, + "flos": 543613427712.0, + "grad_norm": 0.06970106087394082, + "language_loss": 0.8929134, + "learning_rate": 0.0009552737379396948, + "loss": 0.90411627, + "num_input_tokens_seen": 69297696, + "router_z_loss_mlp": 0.32885742, + "step": 840, + "time_per_iteration": 2.6100995540618896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102587, + "balance_loss_mlp": 1.06920815, + "epoch": 0.16179299730665642, + "flos": 603590717952.0, + "grad_norm": 0.06131687325166246, + "language_loss": 0.87945604, + "learning_rate": 0.0009551448565735767, + "loss": 0.89048195, + "num_input_tokens_seen": 69373888, + "router_z_loss_mlp": 0.33398438, + "step": 841, + "time_per_iteration": 2.7360360622406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095736, + "balance_loss_mlp": 1.06168985, + "epoch": 0.16198537899191998, + "flos": 786821050368.0, + "grad_norm": 0.07162496841720159, + "language_loss": 0.8519845, + "learning_rate": 0.0009550157985045543, + "loss": 0.86294186, + "num_input_tokens_seen": 69449984, + "router_z_loss_mlp": 0.34082031, + "step": 842, + "time_per_iteration": 3.0436456203460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087632, + "balance_loss_mlp": 1.05413389, + "epoch": 0.16217776067718354, + "flos": 519550917120.0, + "grad_norm": 0.060562390499230526, + "language_loss": 0.89622426, + "learning_rate": 0.0009548865637827321, + "loss": 0.90710062, + "num_input_tokens_seen": 69522736, + "router_z_loss_mlp": 0.33496094, + "step": 843, + "time_per_iteration": 2.6422221660614014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086448, + "balance_loss_mlp": 1.05342698, + "epoch": 0.1623701423624471, + "flos": 505015644672.0, + "grad_norm": 0.07097995853224412, + "language_loss": 0.90216166, + "learning_rate": 0.0009547571524582838, + "loss": 0.91302609, + "num_input_tokens_seen": 69587184, + "router_z_loss_mlp": 0.33032227, + "step": 844, + "time_per_iteration": 2.6082894802093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095057, + "balance_loss_mlp": 1.06031966, + "epoch": 0.16256252404771065, + "flos": 496940493312.0, + "grad_norm": 0.06932052947515681, + "language_loss": 0.92511153, + "learning_rate": 0.0009546275645814512, + "loss": 0.9360621, + "num_input_tokens_seen": 69656560, + "router_z_loss_mlp": 0.34765625, + "step": 845, + "time_per_iteration": 2.5985872745513916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100013, + "balance_loss_mlp": 1.065418, + "epoch": 0.16275490573297421, + "flos": 502110061056.0, + "grad_norm": 0.07540183512891604, + "language_loss": 0.90294898, + "learning_rate": 0.0009544978002025446, + "loss": 0.91394913, + "num_input_tokens_seen": 69723872, + "router_z_loss_mlp": 0.34619141, + "step": 846, + "time_per_iteration": 2.5778391361236572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096289, + "balance_loss_mlp": 1.06174231, + "epoch": 0.16294728741823777, + "flos": 506952179712.0, + "grad_norm": 0.06018935314915502, + "language_loss": 0.87532055, + "learning_rate": 0.0009543678593719434, + "loss": 0.8862834, + "num_input_tokens_seen": 69795504, + "router_z_loss_mlp": 0.34570312, + "step": 847, + "time_per_iteration": 2.697566270828247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102624, + "balance_loss_mlp": 1.06824434, + "epoch": 0.16313966910350133, + "flos": 509418395136.0, + "grad_norm": 0.054217985504269955, + "language_loss": 0.8754853, + "learning_rate": 0.0009542377421400945, + "loss": 0.88651162, + "num_input_tokens_seen": 69873408, + "router_z_loss_mlp": 0.34375, + "step": 848, + "time_per_iteration": 2.786766290664673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104457, + "balance_loss_mlp": 1.06847942, + "epoch": 0.16333205078876492, + "flos": 543712352256.0, + "grad_norm": 0.06122856356214084, + "language_loss": 0.83524954, + "learning_rate": 0.0009541074485575145, + "loss": 0.84629411, + "num_input_tokens_seen": 69944112, + "router_z_loss_mlp": 0.35986328, + "step": 849, + "time_per_iteration": 2.713759183883667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098701, + "balance_loss_mlp": 1.06346297, + "epoch": 0.16352443247402848, + "flos": 507477477888.0, + "grad_norm": 0.06331477383231503, + "language_loss": 0.92240757, + "learning_rate": 0.0009539769786747874, + "loss": 0.93339461, + "num_input_tokens_seen": 70012288, + "router_z_loss_mlp": 0.35253906, + "step": 850, + "time_per_iteration": 2.589945077896118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100894, + "balance_loss_mlp": 1.06584692, + "epoch": 0.16371681415929204, + "flos": 541851420672.0, + "grad_norm": 0.06704648725492578, + "language_loss": 0.81567919, + "learning_rate": 0.0009538463325425665, + "loss": 0.82668811, + "num_input_tokens_seen": 70086560, + "router_z_loss_mlp": 0.35083008, + "step": 851, + "time_per_iteration": 2.6779844760894775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105544, + "balance_loss_mlp": 1.07042515, + "epoch": 0.1639091958445556, + "flos": 520501026816.0, + "grad_norm": 0.058426853420895056, + "language_loss": 0.8673842, + "learning_rate": 0.0009537155102115728, + "loss": 0.87843966, + "num_input_tokens_seen": 70153968, + "router_z_loss_mlp": 0.35131836, + "step": 852, + "time_per_iteration": 2.5614206790924072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106136, + "balance_loss_mlp": 1.07175565, + "epoch": 0.16410157752981916, + "flos": 547149026304.0, + "grad_norm": 0.06460558975646845, + "language_loss": 0.83482397, + "learning_rate": 0.0009535845117325961, + "loss": 0.84588534, + "num_input_tokens_seen": 70222496, + "router_z_loss_mlp": 0.34423828, + "step": 853, + "time_per_iteration": 2.6453073024749756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098632, + "balance_loss_mlp": 1.06470513, + "epoch": 0.16429395921508272, + "flos": 582561828864.0, + "grad_norm": 0.052152281018199936, + "language_loss": 0.93584174, + "learning_rate": 0.0009534533371564946, + "loss": 0.94682807, + "num_input_tokens_seen": 70301680, + "router_z_loss_mlp": 0.33959961, + "step": 854, + "time_per_iteration": 2.75186824798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111206, + "balance_loss_mlp": 1.07670665, + "epoch": 0.16448634090034628, + "flos": 530678628864.0, + "grad_norm": 0.06475772966833339, + "language_loss": 0.8907218, + "learning_rate": 0.0009533219865341949, + "loss": 0.90183383, + "num_input_tokens_seen": 70371152, + "router_z_loss_mlp": 0.3449707, + "step": 855, + "time_per_iteration": 2.581479787826538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108852, + "balance_loss_mlp": 1.07285094, + "epoch": 0.16467872258560984, + "flos": 491623948800.0, + "grad_norm": 0.06378602693040462, + "language_loss": 0.87287533, + "learning_rate": 0.0009531904599166916, + "loss": 0.88396388, + "num_input_tokens_seen": 70440832, + "router_z_loss_mlp": 0.36035156, + "step": 856, + "time_per_iteration": 2.6429831981658936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107008, + "balance_loss_mlp": 1.07141232, + "epoch": 0.16487110427087343, + "flos": 506015216640.0, + "grad_norm": 0.07162133431974482, + "language_loss": 0.85139728, + "learning_rate": 0.0009530587573550478, + "loss": 0.86246729, + "num_input_tokens_seen": 70507424, + "router_z_loss_mlp": 0.35620117, + "step": 857, + "time_per_iteration": 2.5667338371276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071319, + "balance_loss_mlp": 1.05434394, + "epoch": 0.16506348595613698, + "flos": 1432006553088.0, + "grad_norm": 0.02717136097410494, + "language_loss": 0.74319386, + "learning_rate": 0.0009529268789003953, + "loss": 0.75390708, + "num_input_tokens_seen": 70742320, + "router_z_loss_mlp": 0.16992188, + "step": 858, + "time_per_iteration": 5.02930474281311 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107487, + "balance_loss_mlp": 1.0740366, + "epoch": 0.16525586764140054, + "flos": 476890827264.0, + "grad_norm": 0.06438670275364486, + "language_loss": 0.90481895, + "learning_rate": 0.0009527948246039337, + "loss": 0.91589379, + "num_input_tokens_seen": 70808400, + "router_z_loss_mlp": 0.33447266, + "step": 859, + "time_per_iteration": 2.5222055912017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109782, + "balance_loss_mlp": 1.07618856, + "epoch": 0.1654482493266641, + "flos": 880737297408.0, + "grad_norm": 0.058857893791213665, + "language_loss": 0.88361865, + "learning_rate": 0.000952662594516931, + "loss": 0.8947165, + "num_input_tokens_seen": 70886192, + "router_z_loss_mlp": 0.33618164, + "step": 860, + "time_per_iteration": 3.065053701400757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109979, + "balance_loss_mlp": 1.07497942, + "epoch": 0.16564063101192766, + "flos": 626527028736.0, + "grad_norm": 0.058557043780191484, + "language_loss": 0.86803752, + "learning_rate": 0.0009525301886907234, + "loss": 0.87913728, + "num_input_tokens_seen": 70964816, + "router_z_loss_mlp": 0.34985352, + "step": 861, + "time_per_iteration": 2.873415470123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121945, + "balance_loss_mlp": 1.08537149, + "epoch": 0.16583301269719122, + "flos": 561250722816.0, + "grad_norm": 0.0761086770239273, + "language_loss": 0.8825953, + "learning_rate": 0.0009523976071767155, + "loss": 0.8938148, + "num_input_tokens_seen": 71037456, + "router_z_loss_mlp": 0.36572266, + "step": 862, + "time_per_iteration": 2.71508526802063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115561, + "balance_loss_mlp": 1.07994115, + "epoch": 0.16602539438245478, + "flos": 567510022656.0, + "grad_norm": 0.05388299317844869, + "language_loss": 0.88433009, + "learning_rate": 0.00095226485002638, + "loss": 0.8954857, + "num_input_tokens_seen": 71111872, + "router_z_loss_mlp": 0.35620117, + "step": 863, + "time_per_iteration": 2.7524497509002686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111173, + "balance_loss_mlp": 1.07617354, + "epoch": 0.16621777606771834, + "flos": 574589984256.0, + "grad_norm": 0.05833582522103205, + "language_loss": 0.89311475, + "learning_rate": 0.0009521319172912576, + "loss": 0.90422642, + "num_input_tokens_seen": 71187808, + "router_z_loss_mlp": 0.35009766, + "step": 864, + "time_per_iteration": 2.717493772506714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134798, + "balance_loss_mlp": 1.09846306, + "epoch": 0.16641015775298193, + "flos": 514292599296.0, + "grad_norm": 0.05644176285984134, + "language_loss": 0.94990546, + "learning_rate": 0.0009519988090229579, + "loss": 0.96125346, + "num_input_tokens_seen": 71261728, + "router_z_loss_mlp": 0.36352539, + "step": 865, + "time_per_iteration": 2.6850624084472656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133116, + "balance_loss_mlp": 1.09668565, + "epoch": 0.1666025394382455, + "flos": 621395338752.0, + "grad_norm": 0.05816643645022503, + "language_loss": 0.88684535, + "learning_rate": 0.0009518655252731576, + "loss": 0.89817655, + "num_input_tokens_seen": 71338352, + "router_z_loss_mlp": 0.36450195, + "step": 866, + "time_per_iteration": 2.7240021228790283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124082, + "balance_loss_mlp": 1.0882715, + "epoch": 0.16679492112350905, + "flos": 548528329728.0, + "grad_norm": 0.06128727898968579, + "language_loss": 0.9070124, + "learning_rate": 0.0009517320660936022, + "loss": 0.91825324, + "num_input_tokens_seen": 71416544, + "router_z_loss_mlp": 0.35839844, + "step": 867, + "time_per_iteration": 2.6959731578826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118134, + "balance_loss_mlp": 1.08260965, + "epoch": 0.1669873028087726, + "flos": 665379477504.0, + "grad_norm": 0.05857722537468161, + "language_loss": 0.83557463, + "learning_rate": 0.0009515984315361051, + "loss": 0.84675598, + "num_input_tokens_seen": 71494080, + "router_z_loss_mlp": 0.35546875, + "step": 868, + "time_per_iteration": 2.813674211502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122458, + "balance_loss_mlp": 1.08638549, + "epoch": 0.16717968449403617, + "flos": 538305647616.0, + "grad_norm": 0.06553445455365839, + "language_loss": 0.87103701, + "learning_rate": 0.000951464621652548, + "loss": 0.88226151, + "num_input_tokens_seen": 71562672, + "router_z_loss_mlp": 0.36083984, + "step": 869, + "time_per_iteration": 2.674333333969116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111253, + "balance_loss_mlp": 1.07757819, + "epoch": 0.16737206617929973, + "flos": 529833235968.0, + "grad_norm": 0.059309523866322815, + "language_loss": 0.78951609, + "learning_rate": 0.0009513306364948804, + "loss": 0.80064136, + "num_input_tokens_seen": 71641904, + "router_z_loss_mlp": 0.34985352, + "step": 870, + "time_per_iteration": 2.7519431114196777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011143, + "balance_loss_mlp": 1.07953846, + "epoch": 0.1675644478645633, + "flos": 480529732608.0, + "grad_norm": 0.06711563999134491, + "language_loss": 0.89559376, + "learning_rate": 0.0009511964761151197, + "loss": 0.90673673, + "num_input_tokens_seen": 71709616, + "router_z_loss_mlp": 0.34814453, + "step": 871, + "time_per_iteration": 2.544520854949951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113298, + "balance_loss_mlp": 1.07820272, + "epoch": 0.16775682954982685, + "flos": 494311334400.0, + "grad_norm": 0.06484202096701225, + "language_loss": 0.9050945, + "learning_rate": 0.0009510621405653521, + "loss": 0.91622752, + "num_input_tokens_seen": 71776592, + "router_z_loss_mlp": 0.35131836, + "step": 872, + "time_per_iteration": 2.594224452972412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106918, + "balance_loss_mlp": 1.07265687, + "epoch": 0.1679492112350904, + "flos": 751694846976.0, + "grad_norm": 0.060317450015561574, + "language_loss": 0.847211, + "learning_rate": 0.0009509276298977309, + "loss": 0.85828018, + "num_input_tokens_seen": 71856352, + "router_z_loss_mlp": 0.34277344, + "step": 873, + "time_per_iteration": 2.9428915977478027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110568, + "balance_loss_mlp": 1.07261181, + "epoch": 0.168141592920354, + "flos": 1135413075456.0, + "grad_norm": 0.05441785661992682, + "language_loss": 0.81867516, + "learning_rate": 0.0009507929441644778, + "loss": 0.82978088, + "num_input_tokens_seen": 71948480, + "router_z_loss_mlp": 0.37939453, + "step": 874, + "time_per_iteration": 3.52008318901062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101336, + "balance_loss_mlp": 1.06640816, + "epoch": 0.16833397460561755, + "flos": 632114205696.0, + "grad_norm": 0.06557720885733571, + "language_loss": 0.86201179, + "learning_rate": 0.0009506580834178826, + "loss": 0.87302518, + "num_input_tokens_seen": 72019200, + "router_z_loss_mlp": 0.34936523, + "step": 875, + "time_per_iteration": 2.7744014263153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110472, + "balance_loss_mlp": 1.06817079, + "epoch": 0.1685263562908811, + "flos": 541171943424.0, + "grad_norm": 0.06007828909359903, + "language_loss": 0.91612709, + "learning_rate": 0.0009505230477103028, + "loss": 0.92717427, + "num_input_tokens_seen": 72088672, + "router_z_loss_mlp": 0.36547852, + "step": 876, + "time_per_iteration": 2.6593635082244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103279, + "balance_loss_mlp": 1.06703997, + "epoch": 0.16871873797614467, + "flos": 619036812288.0, + "grad_norm": 0.08702038824672748, + "language_loss": 0.81312418, + "learning_rate": 0.0009503878370941641, + "loss": 0.824157, + "num_input_tokens_seen": 72159952, + "router_z_loss_mlp": 0.36206055, + "step": 877, + "time_per_iteration": 2.7511024475097656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094199, + "balance_loss_mlp": 1.05986643, + "epoch": 0.16891111966140823, + "flos": 606067107840.0, + "grad_norm": 0.06953183101172467, + "language_loss": 0.88841844, + "learning_rate": 0.0009502524516219595, + "loss": 0.89936042, + "num_input_tokens_seen": 72231648, + "router_z_loss_mlp": 0.34375, + "step": 878, + "time_per_iteration": 2.697455406188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091575, + "balance_loss_mlp": 1.05757689, + "epoch": 0.1691035013466718, + "flos": 552058136064.0, + "grad_norm": 0.0721678347454753, + "language_loss": 0.89980447, + "learning_rate": 0.0009501168913462506, + "loss": 0.91072023, + "num_input_tokens_seen": 72298608, + "router_z_loss_mlp": 0.34008789, + "step": 879, + "time_per_iteration": 2.6825287342071533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080465, + "balance_loss_mlp": 1.06263125, + "epoch": 0.16929588303193535, + "flos": 1475544121344.0, + "grad_norm": 0.044515803528062385, + "language_loss": 0.79121923, + "learning_rate": 0.0009499811563196665, + "loss": 0.80202389, + "num_input_tokens_seen": 72525312, + "router_z_loss_mlp": 0.17871094, + "step": 880, + "time_per_iteration": 4.825777769088745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081319, + "balance_loss_mlp": 1.0464623, + "epoch": 0.1694882647171989, + "flos": 925850456064.0, + "grad_norm": 0.06491790696384477, + "language_loss": 0.85360616, + "learning_rate": 0.0009498452465949042, + "loss": 0.86441934, + "num_input_tokens_seen": 72612976, + "router_z_loss_mlp": 0.34887695, + "step": 881, + "time_per_iteration": 3.2700376510620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086319, + "balance_loss_mlp": 1.05227244, + "epoch": 0.1696806464024625, + "flos": 545829359616.0, + "grad_norm": 0.057533624801199786, + "language_loss": 0.916857, + "learning_rate": 0.0009497091622247285, + "loss": 0.92772019, + "num_input_tokens_seen": 72686800, + "router_z_loss_mlp": 0.34082031, + "step": 882, + "time_per_iteration": 2.711721181869507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085559, + "balance_loss_mlp": 1.05184698, + "epoch": 0.16987302808772606, + "flos": 528970466304.0, + "grad_norm": 0.08384615451013337, + "language_loss": 0.93744707, + "learning_rate": 0.0009495729032619723, + "loss": 0.94830269, + "num_input_tokens_seen": 72759360, + "router_z_loss_mlp": 0.33740234, + "step": 883, + "time_per_iteration": 2.688525438308716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084621, + "balance_loss_mlp": 1.05062199, + "epoch": 0.17006540977298962, + "flos": 754855096320.0, + "grad_norm": 0.06073677328113264, + "language_loss": 0.84419179, + "learning_rate": 0.0009494364697595354, + "loss": 0.85503805, + "num_input_tokens_seen": 72831424, + "router_z_loss_mlp": 0.34033203, + "step": 884, + "time_per_iteration": 2.9112000465393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092434, + "balance_loss_mlp": 1.05750597, + "epoch": 0.17025779145825318, + "flos": 558532813824.0, + "grad_norm": 0.06728326387015754, + "language_loss": 0.89818925, + "learning_rate": 0.0009492998617703867, + "loss": 0.90911365, + "num_input_tokens_seen": 72901536, + "router_z_loss_mlp": 0.34936523, + "step": 885, + "time_per_iteration": 2.6760926246643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093981, + "balance_loss_mlp": 1.06045985, + "epoch": 0.17045017314351674, + "flos": 511963186176.0, + "grad_norm": 0.0687386743468794, + "language_loss": 0.87971282, + "learning_rate": 0.0009491630793475619, + "loss": 0.89065266, + "num_input_tokens_seen": 72970480, + "router_z_loss_mlp": 0.33520508, + "step": 886, + "time_per_iteration": 2.59726619720459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094476, + "balance_loss_mlp": 1.06011951, + "epoch": 0.1706425548287803, + "flos": 508674898944.0, + "grad_norm": 0.058204707286146434, + "language_loss": 0.85722017, + "learning_rate": 0.0009490261225441643, + "loss": 0.8681649, + "num_input_tokens_seen": 73053376, + "router_z_loss_mlp": 0.34350586, + "step": 887, + "time_per_iteration": 2.900501012802124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087689, + "balance_loss_mlp": 1.0545013, + "epoch": 0.17083493651404386, + "flos": 717016776192.0, + "grad_norm": 0.05310353290702558, + "language_loss": 0.90775931, + "learning_rate": 0.0009488889914133656, + "loss": 0.9186362, + "num_input_tokens_seen": 73136032, + "router_z_loss_mlp": 0.33203125, + "step": 888, + "time_per_iteration": 2.992532968521118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089684, + "balance_loss_mlp": 1.05520868, + "epoch": 0.17102731819930742, + "flos": 558852908544.0, + "grad_norm": 0.047287767355612194, + "language_loss": 0.88680297, + "learning_rate": 0.0009487516860084047, + "loss": 0.89769983, + "num_input_tokens_seen": 73208544, + "router_z_loss_mlp": 0.34472656, + "step": 889, + "time_per_iteration": 2.7029643058776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082798, + "balance_loss_mlp": 1.04858518, + "epoch": 0.17121969988457098, + "flos": 494542679040.0, + "grad_norm": 0.0765590367769256, + "language_loss": 0.88680983, + "learning_rate": 0.0009486142063825884, + "loss": 0.89763772, + "num_input_tokens_seen": 73274336, + "router_z_loss_mlp": 0.34228516, + "step": 890, + "time_per_iteration": 2.5640931129455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038716, + "balance_loss_mlp": 1.02402985, + "epoch": 0.17141208156983456, + "flos": 1548088063488.0, + "grad_norm": 0.02832238153451814, + "language_loss": 0.72426212, + "learning_rate": 0.0009484765525892909, + "loss": 0.7346493, + "num_input_tokens_seen": 73506320, + "router_z_loss_mlp": 0.14648438, + "step": 891, + "time_per_iteration": 4.979609251022339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108867, + "balance_loss_mlp": 1.0540278, + "epoch": 0.17160446325509812, + "flos": 619282713600.0, + "grad_norm": 0.06449268303648867, + "language_loss": 0.90758598, + "learning_rate": 0.0009483387246819542, + "loss": 0.91847265, + "num_input_tokens_seen": 73578048, + "router_z_loss_mlp": 0.34667969, + "step": 892, + "time_per_iteration": 2.731332540512085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032556, + "balance_loss_mlp": 1.01767898, + "epoch": 0.17179684494036168, + "flos": 1381026972672.0, + "grad_norm": 0.016720826063608682, + "language_loss": 0.82285583, + "learning_rate": 0.0009482007227140877, + "loss": 0.83318138, + "num_input_tokens_seen": 73798640, + "router_z_loss_mlp": 0.1484375, + "step": 893, + "time_per_iteration": 4.641844987869263 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097484, + "balance_loss_mlp": 1.06386662, + "epoch": 0.17198922662562524, + "flos": 492386383872.0, + "grad_norm": 0.05711411270468228, + "language_loss": 0.89587665, + "learning_rate": 0.0009480625467392688, + "loss": 0.90685147, + "num_input_tokens_seen": 73867328, + "router_z_loss_mlp": 0.33642578, + "step": 894, + "time_per_iteration": 2.6310250759124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033123, + "balance_loss_mlp": 1.01795936, + "epoch": 0.1721816083108888, + "flos": 1457529914880.0, + "grad_norm": 0.013728573618451478, + "language_loss": 0.77994668, + "learning_rate": 0.0009479241968111421, + "loss": 0.79027796, + "num_input_tokens_seen": 74093376, + "router_z_loss_mlp": 0.15136719, + "step": 895, + "time_per_iteration": 4.7525646686553955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132534, + "balance_loss_mlp": 1.09834456, + "epoch": 0.17237398999615236, + "flos": 527853030912.0, + "grad_norm": 0.05821127752563967, + "language_loss": 0.87793648, + "learning_rate": 0.0009477856729834196, + "loss": 0.88926184, + "num_input_tokens_seen": 74169136, + "router_z_loss_mlp": 0.34228516, + "step": 896, + "time_per_iteration": 2.7015438079833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132108, + "balance_loss_mlp": 1.09901524, + "epoch": 0.17256637168141592, + "flos": 603644562432.0, + "grad_norm": 0.08337200045302615, + "language_loss": 0.9056648, + "learning_rate": 0.0009476469753098809, + "loss": 0.91698587, + "num_input_tokens_seen": 74236912, + "router_z_loss_mlp": 0.33105469, + "step": 897, + "time_per_iteration": 2.7035813331604004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125108, + "balance_loss_mlp": 1.08922589, + "epoch": 0.17275875336667948, + "flos": 509437334016.0, + "grad_norm": 0.05742024530278536, + "language_loss": 0.874506, + "learning_rate": 0.0009475081038443738, + "loss": 0.88575709, + "num_input_tokens_seen": 74305968, + "router_z_loss_mlp": 0.35913086, + "step": 898, + "time_per_iteration": 2.584437370300293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115669, + "balance_loss_mlp": 1.07971573, + "epoch": 0.17295113505194307, + "flos": 664951693824.0, + "grad_norm": 0.06535241228499304, + "language_loss": 0.85809892, + "learning_rate": 0.0009473690586408124, + "loss": 0.86925566, + "num_input_tokens_seen": 74384144, + "router_z_loss_mlp": 0.35986328, + "step": 899, + "time_per_iteration": 2.83156418800354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116393, + "balance_loss_mlp": 1.08084452, + "epoch": 0.17314351673720663, + "flos": 555125253120.0, + "grad_norm": 0.0683413775827569, + "language_loss": 0.86639923, + "learning_rate": 0.0009472298397531792, + "loss": 0.87756318, + "num_input_tokens_seen": 74455040, + "router_z_loss_mlp": 0.35546875, + "step": 900, + "time_per_iteration": 2.6944193840026855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123337, + "balance_loss_mlp": 1.08635855, + "epoch": 0.17333589842247019, + "flos": 503361326592.0, + "grad_norm": 0.09670394547256775, + "language_loss": 0.87118709, + "learning_rate": 0.0009470904472355235, + "loss": 0.88242042, + "num_input_tokens_seen": 74525248, + "router_z_loss_mlp": 0.36987305, + "step": 901, + "time_per_iteration": 2.637882709503174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114178, + "balance_loss_mlp": 1.07982159, + "epoch": 0.17352828010773375, + "flos": 555924003840.0, + "grad_norm": 0.06358596699153923, + "language_loss": 0.79912066, + "learning_rate": 0.0009469508811419626, + "loss": 0.81026244, + "num_input_tokens_seen": 74597328, + "router_z_loss_mlp": 0.34399414, + "step": 902, + "time_per_iteration": 2.726072311401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077408, + "balance_loss_mlp": 1.06453359, + "epoch": 0.1737206617929973, + "flos": 1553711556096.0, + "grad_norm": 0.030950293127884103, + "language_loss": 0.7161383, + "learning_rate": 0.0009468111415266806, + "loss": 0.72691238, + "num_input_tokens_seen": 74819664, + "router_z_loss_mlp": 0.12890625, + "step": 903, + "time_per_iteration": 4.791790723800659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109546, + "balance_loss_mlp": 1.07445073, + "epoch": 0.17391304347826086, + "flos": 516390667776.0, + "grad_norm": 0.06883251868009001, + "language_loss": 0.84220147, + "learning_rate": 0.0009466712284439292, + "loss": 0.85329694, + "num_input_tokens_seen": 74896224, + "router_z_loss_mlp": 0.35131836, + "step": 904, + "time_per_iteration": 2.7648017406463623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104504, + "balance_loss_mlp": 1.06995738, + "epoch": 0.17410542516352442, + "flos": 540773273088.0, + "grad_norm": 0.06988851938988141, + "language_loss": 0.8903957, + "learning_rate": 0.0009465311419480276, + "loss": 0.90144074, + "num_input_tokens_seen": 74966560, + "router_z_loss_mlp": 0.34545898, + "step": 905, + "time_per_iteration": 2.725829601287842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098399, + "balance_loss_mlp": 1.0637325, + "epoch": 0.17429780684878798, + "flos": 623542869504.0, + "grad_norm": 0.06312030659776342, + "language_loss": 0.88624233, + "learning_rate": 0.0009463908820933622, + "loss": 0.89722633, + "num_input_tokens_seen": 75045248, + "router_z_loss_mlp": 0.34692383, + "step": 906, + "time_per_iteration": 2.8389482498168945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093859, + "balance_loss_mlp": 1.05900264, + "epoch": 0.17449018853405157, + "flos": 575368386048.0, + "grad_norm": 0.056066721215551084, + "language_loss": 0.83138871, + "learning_rate": 0.0009462504489343868, + "loss": 0.84232736, + "num_input_tokens_seen": 75123952, + "router_z_loss_mlp": 0.34863281, + "step": 907, + "time_per_iteration": 2.863349437713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086469, + "balance_loss_mlp": 1.05199337, + "epoch": 0.17468257021931513, + "flos": 533499844608.0, + "grad_norm": 0.07604190500703253, + "language_loss": 0.894853, + "learning_rate": 0.0009461098425256222, + "loss": 0.90571761, + "num_input_tokens_seen": 75191728, + "router_z_loss_mlp": 0.3449707, + "step": 908, + "time_per_iteration": 2.5941011905670166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108834, + "balance_loss_mlp": 1.05345941, + "epoch": 0.1748749519045787, + "flos": 540496848384.0, + "grad_norm": 0.050694136543679796, + "language_loss": 0.85873353, + "learning_rate": 0.0009459690629216567, + "loss": 0.86961693, + "num_input_tokens_seen": 75262224, + "router_z_loss_mlp": 0.34887695, + "step": 909, + "time_per_iteration": 2.6097571849823 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109032, + "balance_loss_mlp": 1.0570612, + "epoch": 0.17506733358984225, + "flos": 498373641216.0, + "grad_norm": 0.0569262349138849, + "language_loss": 0.88138729, + "learning_rate": 0.0009458281101771457, + "loss": 0.89229047, + "num_input_tokens_seen": 75329760, + "router_z_loss_mlp": 0.33276367, + "step": 910, + "time_per_iteration": 2.5904784202575684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091588, + "balance_loss_mlp": 1.05744696, + "epoch": 0.1752597152751058, + "flos": 622621873152.0, + "grad_norm": 0.06350455217589325, + "language_loss": 0.8266046, + "learning_rate": 0.0009456869843468122, + "loss": 0.83752048, + "num_input_tokens_seen": 75407920, + "router_z_loss_mlp": 0.34179688, + "step": 911, + "time_per_iteration": 2.8930556774139404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090023, + "balance_loss_mlp": 1.05476046, + "epoch": 0.17545209696036937, + "flos": 520717814784.0, + "grad_norm": 0.07844481886152296, + "language_loss": 0.79097009, + "learning_rate": 0.0009455456854854459, + "loss": 0.80187035, + "num_input_tokens_seen": 75476752, + "router_z_loss_mlp": 0.35302734, + "step": 912, + "time_per_iteration": 2.5984511375427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096028, + "balance_loss_mlp": 1.0631026, + "epoch": 0.17564447864563293, + "flos": 461750270976.0, + "grad_norm": 0.05516798292623818, + "language_loss": 0.84505737, + "learning_rate": 0.0009454042136479039, + "loss": 0.85601771, + "num_input_tokens_seen": 75542944, + "router_z_loss_mlp": 0.3293457, + "step": 913, + "time_per_iteration": 2.586195945739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085981, + "balance_loss_mlp": 1.05286503, + "epoch": 0.1758368603308965, + "flos": 480416251392.0, + "grad_norm": 0.05301404729603274, + "language_loss": 0.83308446, + "learning_rate": 0.0009452625688891103, + "loss": 0.84394431, + "num_input_tokens_seen": 75609840, + "router_z_loss_mlp": 0.33129883, + "step": 914, + "time_per_iteration": 2.5374929904937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052517, + "balance_loss_mlp": 1.038975, + "epoch": 0.17602924201616005, + "flos": 1478160133632.0, + "grad_norm": 0.03507986977902886, + "language_loss": 0.78734738, + "learning_rate": 0.0009451207512640567, + "loss": 0.79787254, + "num_input_tokens_seen": 75819312, + "router_z_loss_mlp": 0.13574219, + "step": 915, + "time_per_iteration": 4.5561583042144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097389, + "balance_loss_mlp": 1.06226993, + "epoch": 0.17622162370142364, + "flos": 602010593280.0, + "grad_norm": 0.06815502965849334, + "language_loss": 0.93451321, + "learning_rate": 0.0009449787608278015, + "loss": 0.94548714, + "num_input_tokens_seen": 75893984, + "router_z_loss_mlp": 0.35131836, + "step": 916, + "time_per_iteration": 2.7807908058166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092918, + "balance_loss_mlp": 1.0588007, + "epoch": 0.1764140053866872, + "flos": 442473214464.0, + "grad_norm": 0.0637680644109211, + "language_loss": 0.92700857, + "learning_rate": 0.0009448365976354704, + "loss": 0.93793774, + "num_input_tokens_seen": 75958944, + "router_z_loss_mlp": 0.34130859, + "step": 917, + "time_per_iteration": 2.4800124168395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105643, + "balance_loss_mlp": 1.06909323, + "epoch": 0.17660638707195075, + "flos": 500362610688.0, + "grad_norm": 0.07080486598597346, + "language_loss": 0.90158784, + "learning_rate": 0.0009446942617422558, + "loss": 0.91264427, + "num_input_tokens_seen": 76024240, + "router_z_loss_mlp": 0.36547852, + "step": 918, + "time_per_iteration": 2.5415430068969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101469, + "balance_loss_mlp": 1.06766129, + "epoch": 0.17679876875721431, + "flos": 538621360128.0, + "grad_norm": 0.060000223973742446, + "language_loss": 0.86201262, + "learning_rate": 0.0009445517532034176, + "loss": 0.87302732, + "num_input_tokens_seen": 76095264, + "router_z_loss_mlp": 0.33789062, + "step": 919, + "time_per_iteration": 2.6849868297576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121669, + "balance_loss_mlp": 1.08569145, + "epoch": 0.17699115044247787, + "flos": 497477376000.0, + "grad_norm": 0.08221632690768264, + "language_loss": 0.89522099, + "learning_rate": 0.0009444090720742824, + "loss": 0.9064377, + "num_input_tokens_seen": 76163520, + "router_z_loss_mlp": 0.35986328, + "step": 920, + "time_per_iteration": 2.6034600734710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113572, + "balance_loss_mlp": 1.07883418, + "epoch": 0.17718353212774143, + "flos": 662444780544.0, + "grad_norm": 0.08029288241638204, + "language_loss": 0.88040781, + "learning_rate": 0.0009442662184102439, + "loss": 0.89154357, + "num_input_tokens_seen": 76233760, + "router_z_loss_mlp": 0.34741211, + "step": 921, + "time_per_iteration": 2.767352342605591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105947, + "balance_loss_mlp": 1.07309294, + "epoch": 0.177375913813005, + "flos": 582340658688.0, + "grad_norm": 0.0705507668945597, + "language_loss": 0.87951338, + "learning_rate": 0.000944123192266763, + "loss": 0.89057279, + "num_input_tokens_seen": 76310704, + "router_z_loss_mlp": 0.32836914, + "step": 922, + "time_per_iteration": 2.789315700531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108289, + "balance_loss_mlp": 1.0727644, + "epoch": 0.17756829549826855, + "flos": 552285098496.0, + "grad_norm": 0.06115562628552814, + "language_loss": 0.83835006, + "learning_rate": 0.0009439799936993671, + "loss": 0.84943295, + "num_input_tokens_seen": 76386992, + "router_z_loss_mlp": 0.35546875, + "step": 923, + "time_per_iteration": 2.7160987854003906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090669, + "balance_loss_mlp": 1.05733824, + "epoch": 0.17776067718353214, + "flos": 556060806144.0, + "grad_norm": 0.07059184324253498, + "language_loss": 0.88508618, + "learning_rate": 0.0009438366227636511, + "loss": 0.89599288, + "num_input_tokens_seen": 76453328, + "router_z_loss_mlp": 0.33349609, + "step": 924, + "time_per_iteration": 2.6319191455841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084897, + "balance_loss_mlp": 1.05163789, + "epoch": 0.1779530588687957, + "flos": 658161303552.0, + "grad_norm": 0.06263940487075517, + "language_loss": 0.86677843, + "learning_rate": 0.0009436930795152763, + "loss": 0.87762737, + "num_input_tokens_seen": 76529040, + "router_z_loss_mlp": 0.33276367, + "step": 925, + "time_per_iteration": 2.8063783645629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084326, + "balance_loss_mlp": 1.05159163, + "epoch": 0.17814544055405926, + "flos": 644187644928.0, + "grad_norm": 0.06448697412821461, + "language_loss": 0.8710525, + "learning_rate": 0.0009435493640099713, + "loss": 0.88189578, + "num_input_tokens_seen": 76604080, + "router_z_loss_mlp": 0.32739258, + "step": 926, + "time_per_iteration": 2.7599081993103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080787, + "balance_loss_mlp": 1.04664516, + "epoch": 0.17833782223932282, + "flos": 460672123392.0, + "grad_norm": 0.06497730431564504, + "language_loss": 0.84328961, + "learning_rate": 0.0009434054763035314, + "loss": 0.85409749, + "num_input_tokens_seen": 76674096, + "router_z_loss_mlp": 0.34155273, + "step": 927, + "time_per_iteration": 2.612910032272339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081381, + "balance_loss_mlp": 1.04740596, + "epoch": 0.17853020392458638, + "flos": 759212766720.0, + "grad_norm": 0.04594292129212818, + "language_loss": 0.85898727, + "learning_rate": 0.0009432614164518185, + "loss": 0.8698011, + "num_input_tokens_seen": 76752144, + "router_z_loss_mlp": 0.33984375, + "step": 928, + "time_per_iteration": 2.926981210708618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086846, + "balance_loss_mlp": 1.05153632, + "epoch": 0.17872258560984994, + "flos": 782320785408.0, + "grad_norm": 0.055185850673896385, + "language_loss": 0.84792197, + "learning_rate": 0.000943117184510762, + "loss": 0.85879046, + "num_input_tokens_seen": 76830240, + "router_z_loss_mlp": 0.35327148, + "step": 929, + "time_per_iteration": 2.995514154434204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039861, + "balance_loss_mlp": 1.02660513, + "epoch": 0.1789149672951135, + "flos": 1459095482880.0, + "grad_norm": 0.021362691821678215, + "language_loss": 0.78789961, + "learning_rate": 0.0009429727805363575, + "loss": 0.79829824, + "num_input_tokens_seen": 77062464, + "router_z_loss_mlp": 0.1328125, + "step": 930, + "time_per_iteration": 4.99839448928833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091678, + "balance_loss_mlp": 1.05739331, + "epoch": 0.17910734898037706, + "flos": 503598463488.0, + "grad_norm": 0.05761618473313655, + "language_loss": 0.88773429, + "learning_rate": 0.0009428282045846674, + "loss": 0.89865112, + "num_input_tokens_seen": 77136672, + "router_z_loss_mlp": 0.34301758, + "step": 931, + "time_per_iteration": 2.6966652870178223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087828, + "balance_loss_mlp": 1.05452061, + "epoch": 0.17929973066564064, + "flos": 745895264256.0, + "grad_norm": 0.05798282919409206, + "language_loss": 0.89983928, + "learning_rate": 0.0009426834567118214, + "loss": 0.91071755, + "num_input_tokens_seen": 77227040, + "router_z_loss_mlp": 0.33300781, + "step": 932, + "time_per_iteration": 3.072160482406616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092765, + "balance_loss_mlp": 1.05907631, + "epoch": 0.1794921123509042, + "flos": 712875893760.0, + "grad_norm": 0.055390897890994044, + "language_loss": 0.80879378, + "learning_rate": 0.0009425385369740155, + "loss": 0.81972146, + "num_input_tokens_seen": 77319392, + "router_z_loss_mlp": 0.3371582, + "step": 933, + "time_per_iteration": 3.0337042808532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092731, + "balance_loss_mlp": 1.05825567, + "epoch": 0.17968449403616776, + "flos": 632838763008.0, + "grad_norm": 0.0687685702394307, + "language_loss": 0.87443584, + "learning_rate": 0.0009423934454275125, + "loss": 0.8853631, + "num_input_tokens_seen": 77394688, + "router_z_loss_mlp": 0.3449707, + "step": 934, + "time_per_iteration": 2.7970879077911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089195, + "balance_loss_mlp": 1.05526757, + "epoch": 0.17987687572143132, + "flos": 536060602368.0, + "grad_norm": 0.08214865293258214, + "language_loss": 0.92215371, + "learning_rate": 0.0009422481821286418, + "loss": 0.93304563, + "num_input_tokens_seen": 77468288, + "router_z_loss_mlp": 0.33959961, + "step": 935, + "time_per_iteration": 2.7134642601013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091708, + "balance_loss_mlp": 1.05914021, + "epoch": 0.18006925740669488, + "flos": 537818227200.0, + "grad_norm": 0.0718764173736199, + "language_loss": 0.87967253, + "learning_rate": 0.0009421027471337998, + "loss": 0.89058959, + "num_input_tokens_seen": 77535840, + "router_z_loss_mlp": 0.32568359, + "step": 936, + "time_per_iteration": 2.608764171600342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098474, + "balance_loss_mlp": 1.06333113, + "epoch": 0.18026163909195844, + "flos": 539255757312.0, + "grad_norm": 0.06697051800305152, + "language_loss": 0.82882118, + "learning_rate": 0.0009419571404994493, + "loss": 0.83980596, + "num_input_tokens_seen": 77604000, + "router_z_loss_mlp": 0.3515625, + "step": 937, + "time_per_iteration": 2.620296001434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096306, + "balance_loss_mlp": 1.06240284, + "epoch": 0.180454020777222, + "flos": 500382959616.0, + "grad_norm": 0.08555714620461663, + "language_loss": 0.90948844, + "learning_rate": 0.00094181136228212, + "loss": 0.92045152, + "num_input_tokens_seen": 77671488, + "router_z_loss_mlp": 0.33935547, + "step": 938, + "time_per_iteration": 2.62837290763855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109732, + "balance_loss_mlp": 1.06415629, + "epoch": 0.18064640246248556, + "flos": 498689353728.0, + "grad_norm": 0.06983123921060745, + "language_loss": 0.86323059, + "learning_rate": 0.0009416654125384077, + "loss": 0.8742038, + "num_input_tokens_seen": 77746240, + "router_z_loss_mlp": 0.33154297, + "step": 939, + "time_per_iteration": 2.715686321258545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054723, + "balance_loss_mlp": 1.04242051, + "epoch": 0.18083878414774912, + "flos": 1518572358144.0, + "grad_norm": 0.027679884047562747, + "language_loss": 0.79772377, + "learning_rate": 0.0009415192913249752, + "loss": 0.80827093, + "num_input_tokens_seen": 77966080, + "router_z_loss_mlp": 0.12304688, + "step": 940, + "time_per_iteration": 4.941875219345093 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090642, + "balance_loss_mlp": 1.05728722, + "epoch": 0.1810311658330127, + "flos": 727006703616.0, + "grad_norm": 0.07011009980003599, + "language_loss": 0.84326053, + "learning_rate": 0.000941372998698552, + "loss": 0.85416698, + "num_input_tokens_seen": 78049200, + "router_z_loss_mlp": 0.33374023, + "step": 941, + "time_per_iteration": 2.931520938873291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094264, + "balance_loss_mlp": 1.0597409, + "epoch": 0.18122354751827627, + "flos": 564643726848.0, + "grad_norm": 0.08254502738164117, + "language_loss": 0.8207435, + "learning_rate": 0.0009412265347159336, + "loss": 0.8316862, + "num_input_tokens_seen": 78122752, + "router_z_loss_mlp": 0.34570312, + "step": 942, + "time_per_iteration": 2.696354627609253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091238, + "balance_loss_mlp": 1.05869377, + "epoch": 0.18141592920353983, + "flos": 519024208896.0, + "grad_norm": 0.05729066672306875, + "language_loss": 0.85217965, + "learning_rate": 0.0009410798994339829, + "loss": 0.86309201, + "num_input_tokens_seen": 78194064, + "router_z_loss_mlp": 0.32543945, + "step": 943, + "time_per_iteration": 2.6009600162506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088013, + "balance_loss_mlp": 1.0545156, + "epoch": 0.1816083108888034, + "flos": 512219261952.0, + "grad_norm": 0.05342615519744699, + "language_loss": 0.88234782, + "learning_rate": 0.000940933092909628, + "loss": 0.89322793, + "num_input_tokens_seen": 78262048, + "router_z_loss_mlp": 0.33520508, + "step": 944, + "time_per_iteration": 2.618419647216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095526, + "balance_loss_mlp": 1.06286263, + "epoch": 0.18180069257406695, + "flos": 492144864768.0, + "grad_norm": 0.053227732023653135, + "language_loss": 0.8393383, + "learning_rate": 0.0009407861151998649, + "loss": 0.85029352, + "num_input_tokens_seen": 78330624, + "router_z_loss_mlp": 0.32666016, + "step": 945, + "time_per_iteration": 2.5718705654144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097774, + "balance_loss_mlp": 1.06406188, + "epoch": 0.1819930742593305, + "flos": 569891870208.0, + "grad_norm": 0.05775782434029923, + "language_loss": 0.86156505, + "learning_rate": 0.0009406389663617552, + "loss": 0.87254274, + "num_input_tokens_seen": 78400672, + "router_z_loss_mlp": 0.33740234, + "step": 946, + "time_per_iteration": 2.66513729095459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097522, + "balance_loss_mlp": 1.06433463, + "epoch": 0.18218545594459407, + "flos": 605693168640.0, + "grad_norm": 0.06350431386522506, + "language_loss": 0.85736459, + "learning_rate": 0.000940491646452427, + "loss": 0.86833978, + "num_input_tokens_seen": 78467952, + "router_z_loss_mlp": 0.33203125, + "step": 947, + "time_per_iteration": 2.715071201324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103916, + "balance_loss_mlp": 1.07010818, + "epoch": 0.18237783762985763, + "flos": 548419230720.0, + "grad_norm": 0.06277969821047595, + "language_loss": 0.91195452, + "learning_rate": 0.000940344155529075, + "loss": 0.92299366, + "num_input_tokens_seen": 78538928, + "router_z_loss_mlp": 0.33837891, + "step": 948, + "time_per_iteration": 2.6502938270568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099574, + "balance_loss_mlp": 1.06550407, + "epoch": 0.1825702193151212, + "flos": 450509078016.0, + "grad_norm": 0.06933176029299125, + "language_loss": 0.87683523, + "learning_rate": 0.0009401964936489605, + "loss": 0.88783091, + "num_input_tokens_seen": 78602144, + "router_z_loss_mlp": 0.34106445, + "step": 949, + "time_per_iteration": 2.5181798934936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084135, + "balance_loss_mlp": 1.05247355, + "epoch": 0.18276260100038477, + "flos": 588962313216.0, + "grad_norm": 0.07980064544074586, + "language_loss": 0.85422635, + "learning_rate": 0.0009400486608694108, + "loss": 0.86506772, + "num_input_tokens_seen": 78673152, + "router_z_loss_mlp": 0.31640625, + "step": 950, + "time_per_iteration": 2.7189955711364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087871, + "balance_loss_mlp": 1.05384839, + "epoch": 0.18295498268564833, + "flos": 786988376064.0, + "grad_norm": 0.05265351460276348, + "language_loss": 0.87225658, + "learning_rate": 0.0009399006572478195, + "loss": 0.88313532, + "num_input_tokens_seen": 78753872, + "router_z_loss_mlp": 0.34033203, + "step": 951, + "time_per_iteration": 3.0805773735046387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086089, + "balance_loss_mlp": 1.05218577, + "epoch": 0.1831473643709119, + "flos": 577878271488.0, + "grad_norm": 0.059447924131550096, + "language_loss": 0.91242015, + "learning_rate": 0.0009397524828416468, + "loss": 0.92328107, + "num_input_tokens_seen": 78822640, + "router_z_loss_mlp": 0.33935547, + "step": 952, + "time_per_iteration": 2.6567108631134033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082155, + "balance_loss_mlp": 1.04801321, + "epoch": 0.18333974605617545, + "flos": 566622521856.0, + "grad_norm": 0.05513512337372911, + "language_loss": 0.96184212, + "learning_rate": 0.0009396041377084192, + "loss": 0.97266364, + "num_input_tokens_seen": 78893792, + "router_z_loss_mlp": 0.34179688, + "step": 953, + "time_per_iteration": 2.6937921047210693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079403, + "balance_loss_mlp": 1.04478431, + "epoch": 0.183532127741439, + "flos": 526725421056.0, + "grad_norm": 0.07204875194033089, + "language_loss": 0.87840325, + "learning_rate": 0.0009394556219057295, + "loss": 0.88919723, + "num_input_tokens_seen": 78964752, + "router_z_loss_mlp": 0.34667969, + "step": 954, + "time_per_iteration": 2.6962215900421143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107777, + "balance_loss_mlp": 1.04272258, + "epoch": 0.18372450942670257, + "flos": 594259918848.0, + "grad_norm": 0.07227161235955501, + "language_loss": 0.83883446, + "learning_rate": 0.0009393069354912362, + "loss": 0.84961212, + "num_input_tokens_seen": 79034400, + "router_z_loss_mlp": 0.35058594, + "step": 955, + "time_per_iteration": 2.718308925628662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081248, + "balance_loss_mlp": 1.04677236, + "epoch": 0.18391689111196613, + "flos": 644720145408.0, + "grad_norm": 0.07091738302891186, + "language_loss": 0.82511717, + "learning_rate": 0.0009391580785226649, + "loss": 0.83592963, + "num_input_tokens_seen": 79109488, + "router_z_loss_mlp": 0.34521484, + "step": 956, + "time_per_iteration": 2.907367467880249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077991, + "balance_loss_mlp": 1.06216049, + "epoch": 0.18410927279722972, + "flos": 1456246563840.0, + "grad_norm": 0.048423099914415325, + "language_loss": 0.79340446, + "learning_rate": 0.0009390090510578067, + "loss": 0.80418444, + "num_input_tokens_seen": 79327712, + "router_z_loss_mlp": 0.15820312, + "step": 957, + "time_per_iteration": 4.78663969039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091525, + "balance_loss_mlp": 1.05702567, + "epoch": 0.18430165448249328, + "flos": 658437728256.0, + "grad_norm": 0.09319397884021513, + "language_loss": 0.86484683, + "learning_rate": 0.0009388598531545196, + "loss": 0.8757621, + "num_input_tokens_seen": 79401504, + "router_z_loss_mlp": 0.34545898, + "step": 958, + "time_per_iteration": 2.8470118045806885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087326, + "balance_loss_mlp": 1.05285025, + "epoch": 0.18449403616775684, + "flos": 517679811072.0, + "grad_norm": 0.07377492103556435, + "language_loss": 0.86076611, + "learning_rate": 0.000938710484870727, + "loss": 0.87163937, + "num_input_tokens_seen": 79466688, + "router_z_loss_mlp": 0.3449707, + "step": 959, + "time_per_iteration": 2.5930731296539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090986, + "balance_loss_mlp": 1.05672574, + "epoch": 0.1846864178530204, + "flos": 552481537536.0, + "grad_norm": 0.06589557505977534, + "language_loss": 0.86379164, + "learning_rate": 0.0009385609462644189, + "loss": 0.8747015, + "num_input_tokens_seen": 79540288, + "router_z_loss_mlp": 0.34277344, + "step": 960, + "time_per_iteration": 2.688706636428833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096456, + "balance_loss_mlp": 1.06212378, + "epoch": 0.18487879953828396, + "flos": 465930441216.0, + "grad_norm": 0.0643439417949763, + "language_loss": 0.86035949, + "learning_rate": 0.0009384112373936514, + "loss": 0.871324, + "num_input_tokens_seen": 79611872, + "router_z_loss_mlp": 0.34326172, + "step": 961, + "time_per_iteration": 4.0801496505737305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095895, + "balance_loss_mlp": 1.06132412, + "epoch": 0.18507118122354752, + "flos": 648200489472.0, + "grad_norm": 0.0614591664996872, + "language_loss": 0.91820455, + "learning_rate": 0.0009382613583165467, + "loss": 0.92916346, + "num_input_tokens_seen": 79689504, + "router_z_loss_mlp": 0.34594727, + "step": 962, + "time_per_iteration": 2.790069341659546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113098, + "balance_loss_mlp": 1.07921863, + "epoch": 0.18526356290881107, + "flos": 626486330880.0, + "grad_norm": 0.06374556186760763, + "language_loss": 0.89594233, + "learning_rate": 0.0009381113090912928, + "loss": 0.90707326, + "num_input_tokens_seen": 79759264, + "router_z_loss_mlp": 0.33886719, + "step": 963, + "time_per_iteration": 2.6891098022460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117951, + "balance_loss_mlp": 1.08559799, + "epoch": 0.18545594459407463, + "flos": 432497843712.0, + "grad_norm": 0.06491910119233056, + "language_loss": 0.90103394, + "learning_rate": 0.000937961089776144, + "loss": 0.91221344, + "num_input_tokens_seen": 79824464, + "router_z_loss_mlp": 0.32348633, + "step": 964, + "time_per_iteration": 2.5821962356567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124554, + "balance_loss_mlp": 1.08926833, + "epoch": 0.1856483262793382, + "flos": 748720862208.0, + "grad_norm": 0.06849062336391444, + "language_loss": 0.829036, + "learning_rate": 0.0009378107004294208, + "loss": 0.84028149, + "num_input_tokens_seen": 79907152, + "router_z_loss_mlp": 0.35302734, + "step": 965, + "time_per_iteration": 2.9898061752319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115255, + "balance_loss_mlp": 1.081972, + "epoch": 0.18584070796460178, + "flos": 530058788352.0, + "grad_norm": 0.08647217477609576, + "language_loss": 0.91352308, + "learning_rate": 0.0009376601411095096, + "loss": 0.92467564, + "num_input_tokens_seen": 79976944, + "router_z_loss_mlp": 0.33300781, + "step": 966, + "time_per_iteration": 2.6415059566497803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093436, + "balance_loss_mlp": 1.06196475, + "epoch": 0.18603308964986534, + "flos": 482863527936.0, + "grad_norm": 0.05783783242438048, + "language_loss": 0.8708145, + "learning_rate": 0.0009375094118748622, + "loss": 0.88174886, + "num_input_tokens_seen": 80042112, + "router_z_loss_mlp": 0.31445312, + "step": 967, + "time_per_iteration": 2.5149550437927246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089094, + "balance_loss_mlp": 1.05650234, + "epoch": 0.1862254713351289, + "flos": 800976591360.0, + "grad_norm": 0.0756042683078202, + "language_loss": 0.9083451, + "learning_rate": 0.0009373585127839976, + "loss": 0.91923606, + "num_input_tokens_seen": 80118896, + "router_z_loss_mlp": 0.32592773, + "step": 968, + "time_per_iteration": 2.9569485187530518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084434, + "balance_loss_mlp": 1.05250978, + "epoch": 0.18641785302039246, + "flos": 478082456064.0, + "grad_norm": 0.06160067145414361, + "language_loss": 0.91074634, + "learning_rate": 0.0009372074438954994, + "loss": 0.92159069, + "num_input_tokens_seen": 80183360, + "router_z_loss_mlp": 0.3190918, + "step": 969, + "time_per_iteration": 2.508530378341675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083542, + "balance_loss_mlp": 1.05040169, + "epoch": 0.18661023470565602, + "flos": 388695587328.0, + "grad_norm": 0.07517959095695621, + "language_loss": 0.91676056, + "learning_rate": 0.0009370562052680181, + "loss": 0.92759597, + "num_input_tokens_seen": 80247024, + "router_z_loss_mlp": 0.33154297, + "step": 970, + "time_per_iteration": 2.4572672843933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087332, + "balance_loss_mlp": 1.05400109, + "epoch": 0.18680261639091958, + "flos": 564402207744.0, + "grad_norm": 0.052448577146131624, + "language_loss": 0.89610398, + "learning_rate": 0.0009369047969602695, + "loss": 0.90697736, + "num_input_tokens_seen": 80318256, + "router_z_loss_mlp": 0.33349609, + "step": 971, + "time_per_iteration": 2.714704751968384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101064, + "balance_loss_mlp": 1.06556404, + "epoch": 0.18699499807618314, + "flos": 479018009088.0, + "grad_norm": 0.06595213007116614, + "language_loss": 0.8674072, + "learning_rate": 0.0009367532190310357, + "loss": 0.87841785, + "num_input_tokens_seen": 80384848, + "router_z_loss_mlp": 0.35498047, + "step": 972, + "time_per_iteration": 2.589667558670044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111914, + "balance_loss_mlp": 1.07660413, + "epoch": 0.1871873797614467, + "flos": 553022802432.0, + "grad_norm": 0.0720295199384638, + "language_loss": 0.88701892, + "learning_rate": 0.0009366014715391644, + "loss": 0.89813805, + "num_input_tokens_seen": 80453088, + "router_z_loss_mlp": 0.35327148, + "step": 973, + "time_per_iteration": 2.634023904800415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107724, + "balance_loss_mlp": 1.07389259, + "epoch": 0.18737976144671029, + "flos": 552526617600.0, + "grad_norm": 0.05153911900793568, + "language_loss": 0.8432554, + "learning_rate": 0.0009364495545435693, + "loss": 0.85433269, + "num_input_tokens_seen": 80528608, + "router_z_loss_mlp": 0.33837891, + "step": 974, + "time_per_iteration": 2.7729458808898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107099, + "balance_loss_mlp": 1.07281494, + "epoch": 0.18757214313197385, + "flos": 502002372096.0, + "grad_norm": 0.05815108638233015, + "language_loss": 0.88620323, + "learning_rate": 0.0009362974681032297, + "loss": 0.8972742, + "num_input_tokens_seen": 80599600, + "router_z_loss_mlp": 0.34326172, + "step": 975, + "time_per_iteration": 2.631744623184204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105498, + "balance_loss_mlp": 1.07130909, + "epoch": 0.1877645248172374, + "flos": 674691337728.0, + "grad_norm": 0.06841603134690444, + "language_loss": 0.88265896, + "learning_rate": 0.0009361452122771907, + "loss": 0.89371395, + "num_input_tokens_seen": 80677264, + "router_z_loss_mlp": 0.34204102, + "step": 976, + "time_per_iteration": 2.8427281379699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094167, + "balance_loss_mlp": 1.06012082, + "epoch": 0.18795690650250096, + "flos": 404771696640.0, + "grad_norm": 0.07319435948671522, + "language_loss": 0.8377496, + "learning_rate": 0.0009359927871245635, + "loss": 0.84869128, + "num_input_tokens_seen": 80739776, + "router_z_loss_mlp": 0.34057617, + "step": 977, + "time_per_iteration": 2.4665186405181885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090091, + "balance_loss_mlp": 1.0565697, + "epoch": 0.18814928818776452, + "flos": 637599485952.0, + "grad_norm": 0.05986452276683665, + "language_loss": 0.86337954, + "learning_rate": 0.0009358401927045246, + "loss": 0.87428045, + "num_input_tokens_seen": 80815200, + "router_z_loss_mlp": 0.33520508, + "step": 978, + "time_per_iteration": 2.8037781715393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090672, + "balance_loss_mlp": 1.05707908, + "epoch": 0.18834166987302808, + "flos": 1137825446400.0, + "grad_norm": 0.054509582003230646, + "language_loss": 0.88314402, + "learning_rate": 0.0009356874290763166, + "loss": 0.89405078, + "num_input_tokens_seen": 80905024, + "router_z_loss_mlp": 0.33618164, + "step": 979, + "time_per_iteration": 3.456723213195801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097981, + "balance_loss_mlp": 1.06481671, + "epoch": 0.18853405155829164, + "flos": 504538398720.0, + "grad_norm": 0.06366920756378494, + "language_loss": 0.8866874, + "learning_rate": 0.0009355344962992474, + "loss": 0.89766723, + "num_input_tokens_seen": 80976704, + "router_z_loss_mlp": 0.33154297, + "step": 980, + "time_per_iteration": 2.6105339527130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109825, + "balance_loss_mlp": 1.06494308, + "epoch": 0.1887264332435552, + "flos": 607879987200.0, + "grad_norm": 0.05130215804193928, + "language_loss": 0.88147485, + "learning_rate": 0.0009353813944326908, + "loss": 0.89245737, + "num_input_tokens_seen": 81057152, + "router_z_loss_mlp": 0.33325195, + "step": 981, + "time_per_iteration": 2.882836103439331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109809, + "balance_loss_mlp": 1.0758822, + "epoch": 0.1889188149288188, + "flos": 552264749568.0, + "grad_norm": 0.07032712681879846, + "language_loss": 0.83146608, + "learning_rate": 0.0009352281235360863, + "loss": 0.84256417, + "num_input_tokens_seen": 81131520, + "router_z_loss_mlp": 0.33959961, + "step": 982, + "time_per_iteration": 2.695748805999756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120418, + "balance_loss_mlp": 1.08775461, + "epoch": 0.18911119661408235, + "flos": 418332128256.0, + "grad_norm": 0.06033753714629359, + "language_loss": 0.84987485, + "learning_rate": 0.0009350746836689389, + "loss": 0.86107904, + "num_input_tokens_seen": 81195952, + "router_z_loss_mlp": 0.32666016, + "step": 983, + "time_per_iteration": 2.5073440074920654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260435, + "balance_loss_mlp": 1.23916793, + "epoch": 0.1893035782993459, + "flos": 1481141320704.0, + "grad_norm": 0.0731593378732656, + "language_loss": 0.81439221, + "learning_rate": 0.0009349210748908193, + "loss": 0.82699656, + "num_input_tokens_seen": 81427312, + "router_z_loss_mlp": 0.21289062, + "step": 984, + "time_per_iteration": 5.065609931945801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133244, + "balance_loss_mlp": 1.09831583, + "epoch": 0.18949595998460947, + "flos": 508220974080.0, + "grad_norm": 0.09166419018528392, + "language_loss": 0.83211792, + "learning_rate": 0.0009347672972613634, + "loss": 0.84345031, + "num_input_tokens_seen": 81494256, + "router_z_loss_mlp": 0.34936523, + "step": 985, + "time_per_iteration": 2.580009937286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115864, + "balance_loss_mlp": 1.08270001, + "epoch": 0.18968834166987303, + "flos": 530812459008.0, + "grad_norm": 0.0668772854373454, + "language_loss": 0.85875785, + "learning_rate": 0.0009346133508402735, + "loss": 0.8699165, + "num_input_tokens_seen": 81569312, + "router_z_loss_mlp": 0.33178711, + "step": 986, + "time_per_iteration": 2.6872711181640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111031, + "balance_loss_mlp": 1.07724667, + "epoch": 0.1898807233551366, + "flos": 499515807744.0, + "grad_norm": 0.11088649382938841, + "language_loss": 0.8420769, + "learning_rate": 0.0009344592356873166, + "loss": 0.8531872, + "num_input_tokens_seen": 81637024, + "router_z_loss_mlp": 0.33813477, + "step": 987, + "time_per_iteration": 2.6347994804382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098007, + "balance_loss_mlp": 1.06462848, + "epoch": 0.19007310504040015, + "flos": 601936399872.0, + "grad_norm": 0.05765681888892058, + "language_loss": 0.78527796, + "learning_rate": 0.0009343049518623255, + "loss": 0.79625803, + "num_input_tokens_seen": 81709488, + "router_z_loss_mlp": 0.33398438, + "step": 988, + "time_per_iteration": 2.696929693222046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082914, + "balance_loss_mlp": 1.05029869, + "epoch": 0.1902654867256637, + "flos": 601374786048.0, + "grad_norm": 0.05732720380572914, + "language_loss": 0.83250153, + "learning_rate": 0.0009341504994251985, + "loss": 0.84333068, + "num_input_tokens_seen": 81787152, + "router_z_loss_mlp": 0.32617188, + "step": 989, + "time_per_iteration": 2.8399016857147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095582, + "balance_loss_mlp": 1.07841623, + "epoch": 0.19045786841092727, + "flos": 1574925147648.0, + "grad_norm": 0.03888561388969961, + "language_loss": 0.73520499, + "learning_rate": 0.0009339958784358994, + "loss": 0.74616081, + "num_input_tokens_seen": 82030608, + "router_z_loss_mlp": 0.171875, + "step": 990, + "time_per_iteration": 5.072636842727661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109747, + "balance_loss_mlp": 1.06394839, + "epoch": 0.19065025009619085, + "flos": 681280906752.0, + "grad_norm": 0.135211113906906, + "language_loss": 0.818295, + "learning_rate": 0.0009338410889544574, + "loss": 0.82926977, + "num_input_tokens_seen": 82119872, + "router_z_loss_mlp": 0.33544922, + "step": 991, + "time_per_iteration": 3.050665855407715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101867, + "balance_loss_mlp": 1.06786811, + "epoch": 0.1908426317814544, + "flos": 601971305472.0, + "grad_norm": 0.06286082016671143, + "language_loss": 0.87738663, + "learning_rate": 0.000933686131040967, + "loss": 0.88840532, + "num_input_tokens_seen": 82195552, + "router_z_loss_mlp": 0.34033203, + "step": 992, + "time_per_iteration": 2.7589659690856934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089273, + "balance_loss_mlp": 1.05672884, + "epoch": 0.19103501346671797, + "flos": 586027616256.0, + "grad_norm": 0.0561482479745879, + "language_loss": 0.90427077, + "learning_rate": 0.0009335310047555883, + "loss": 0.91516346, + "num_input_tokens_seen": 82267040, + "router_z_loss_mlp": 0.32543945, + "step": 993, + "time_per_iteration": 2.7133467197418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108709, + "balance_loss_mlp": 1.0532825, + "epoch": 0.19122739515198153, + "flos": 545494708224.0, + "grad_norm": 0.06221036652136981, + "language_loss": 0.88114065, + "learning_rate": 0.0009333757101585467, + "loss": 0.89201152, + "num_input_tokens_seen": 82337680, + "router_z_loss_mlp": 0.33837891, + "step": 994, + "time_per_iteration": 2.6733241081237793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083527, + "balance_loss_mlp": 1.05105424, + "epoch": 0.1914197768372451, + "flos": 521171739648.0, + "grad_norm": 0.05606370206634765, + "language_loss": 0.93617988, + "learning_rate": 0.0009332202473101329, + "loss": 0.94701517, + "num_input_tokens_seen": 82409600, + "router_z_loss_mlp": 0.32470703, + "step": 995, + "time_per_iteration": 2.6689558029174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079312, + "balance_loss_mlp": 1.04536152, + "epoch": 0.19161215852250865, + "flos": 610961660928.0, + "grad_norm": 0.05986652691328414, + "language_loss": 0.83121806, + "learning_rate": 0.0009330646162707028, + "loss": 0.84201121, + "num_input_tokens_seen": 82480288, + "router_z_loss_mlp": 0.33984375, + "step": 996, + "time_per_iteration": 2.7264511585235596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081823, + "balance_loss_mlp": 1.04849207, + "epoch": 0.1918045402077722, + "flos": 846281806848.0, + "grad_norm": 0.05485586532204223, + "language_loss": 0.84800065, + "learning_rate": 0.0009329088171006779, + "loss": 0.85881883, + "num_input_tokens_seen": 82568960, + "router_z_loss_mlp": 0.33349609, + "step": 997, + "time_per_iteration": 3.1486315727233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097705, + "balance_loss_mlp": 1.06220424, + "epoch": 0.19199692189303577, + "flos": 465699096576.0, + "grad_norm": 0.06540772430376247, + "language_loss": 0.84963006, + "learning_rate": 0.0009327528498605446, + "loss": 0.86060709, + "num_input_tokens_seen": 82634128, + "router_z_loss_mlp": 0.35522461, + "step": 998, + "time_per_iteration": 2.532460927963257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087828, + "balance_loss_mlp": 1.0542109, + "epoch": 0.19218930357829936, + "flos": 531318818304.0, + "grad_norm": 0.06065225266474605, + "language_loss": 0.89716202, + "learning_rate": 0.0009325967146108548, + "loss": 0.90804029, + "num_input_tokens_seen": 82707472, + "router_z_loss_mlp": 0.33642578, + "step": 999, + "time_per_iteration": 2.6381072998046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108792, + "balance_loss_mlp": 1.05334902, + "epoch": 0.19238168526356292, + "flos": 601350054912.0, + "grad_norm": 0.06318510310852068, + "language_loss": 0.87984866, + "learning_rate": 0.0009324404114122258, + "loss": 0.89072788, + "num_input_tokens_seen": 82775232, + "router_z_loss_mlp": 0.34594727, + "step": 1000, + "time_per_iteration": 2.7017252445220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088105, + "balance_loss_mlp": 1.0544883, + "epoch": 0.19257406694882648, + "flos": 571690192896.0, + "grad_norm": 0.05361295189234855, + "language_loss": 0.87132722, + "learning_rate": 0.0009322839403253397, + "loss": 0.88220823, + "num_input_tokens_seen": 82850032, + "router_z_loss_mlp": 0.33642578, + "step": 1001, + "time_per_iteration": 2.7725350856781006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091645, + "balance_loss_mlp": 1.05759907, + "epoch": 0.19276644863409004, + "flos": 801478568448.0, + "grad_norm": 0.0661765462165054, + "language_loss": 0.84038174, + "learning_rate": 0.0009321273014109439, + "loss": 0.85129815, + "num_input_tokens_seen": 82926080, + "router_z_loss_mlp": 0.34082031, + "step": 1002, + "time_per_iteration": 2.9275383949279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089997, + "balance_loss_mlp": 1.05676103, + "epoch": 0.1929588303193536, + "flos": 563024314368.0, + "grad_norm": 0.05133430998282463, + "language_loss": 0.85232604, + "learning_rate": 0.0009319704947298513, + "loss": 0.863226, + "num_input_tokens_seen": 83005200, + "router_z_loss_mlp": 0.33251953, + "step": 1003, + "time_per_iteration": 2.9198272228240967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083204, + "balance_loss_mlp": 1.05120838, + "epoch": 0.19315121200461716, + "flos": 626550349824.0, + "grad_norm": 0.04652496586479965, + "language_loss": 0.88737059, + "learning_rate": 0.0009318135203429393, + "loss": 0.8982026, + "num_input_tokens_seen": 83077280, + "router_z_loss_mlp": 0.31982422, + "step": 1004, + "time_per_iteration": 2.7145965099334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094807, + "balance_loss_mlp": 1.06116605, + "epoch": 0.19334359368988072, + "flos": 517169069568.0, + "grad_norm": 0.06711221272981459, + "language_loss": 0.88228458, + "learning_rate": 0.0009316563783111511, + "loss": 0.8932327, + "num_input_tokens_seen": 83145456, + "router_z_loss_mlp": 0.33642578, + "step": 1005, + "time_per_iteration": 2.68135404586792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095243, + "balance_loss_mlp": 1.06050563, + "epoch": 0.19353597537514428, + "flos": 693751606272.0, + "grad_norm": 0.04947727679523619, + "language_loss": 0.82323831, + "learning_rate": 0.0009314990686954943, + "loss": 0.83419079, + "num_input_tokens_seen": 83225392, + "router_z_loss_mlp": 0.34765625, + "step": 1006, + "time_per_iteration": 2.9068872928619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098932, + "balance_loss_mlp": 1.06495738, + "epoch": 0.19372835706040784, + "flos": 1209665180160.0, + "grad_norm": 0.05336104081377929, + "language_loss": 0.80917025, + "learning_rate": 0.000931341591557042, + "loss": 0.82015955, + "num_input_tokens_seen": 83331296, + "router_z_loss_mlp": 0.34008789, + "step": 1007, + "time_per_iteration": 3.759119749069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098415, + "balance_loss_mlp": 1.06291509, + "epoch": 0.19392073874567142, + "flos": 520368606720.0, + "grad_norm": 0.06549831272650784, + "language_loss": 0.87757689, + "learning_rate": 0.0009311839469569325, + "loss": 0.88856107, + "num_input_tokens_seen": 83399952, + "router_z_loss_mlp": 0.35522461, + "step": 1008, + "time_per_iteration": 2.6298930644989014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100893, + "balance_loss_mlp": 1.06620264, + "epoch": 0.19411312043093498, + "flos": 588543293952.0, + "grad_norm": 0.06763315162421418, + "language_loss": 0.8732397, + "learning_rate": 0.0009310261349563687, + "loss": 0.88424855, + "num_input_tokens_seen": 83468384, + "router_z_loss_mlp": 0.34692383, + "step": 1009, + "time_per_iteration": 2.6843061447143555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110066, + "balance_loss_mlp": 1.06718588, + "epoch": 0.19430550211619854, + "flos": 579085867008.0, + "grad_norm": 0.05371296475785438, + "language_loss": 0.8534441, + "learning_rate": 0.0009308681556166186, + "loss": 0.86445075, + "num_input_tokens_seen": 83547952, + "router_z_loss_mlp": 0.33496094, + "step": 1010, + "time_per_iteration": 2.8197336196899414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107606, + "balance_loss_mlp": 1.07291579, + "epoch": 0.1944978838014621, + "flos": 620848281600.0, + "grad_norm": 0.08312668477716535, + "language_loss": 0.87206143, + "learning_rate": 0.0009307100089990152, + "loss": 0.88313752, + "num_input_tokens_seen": 83615712, + "router_z_loss_mlp": 0.34716797, + "step": 1011, + "time_per_iteration": 2.7118990421295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101924, + "balance_loss_mlp": 1.0672822, + "epoch": 0.19469026548672566, + "flos": 598440089088.0, + "grad_norm": 0.061832865854500894, + "language_loss": 0.83946323, + "learning_rate": 0.0009305516951649568, + "loss": 0.85048252, + "num_input_tokens_seen": 83687296, + "router_z_loss_mlp": 0.34667969, + "step": 1012, + "time_per_iteration": 2.667672872543335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096764, + "balance_loss_mlp": 1.06314659, + "epoch": 0.19488264717198922, + "flos": 551890810368.0, + "grad_norm": 0.04827143175142062, + "language_loss": 0.87187612, + "learning_rate": 0.0009303932141759057, + "loss": 0.88284373, + "num_input_tokens_seen": 83763168, + "router_z_loss_mlp": 0.33642578, + "step": 1013, + "time_per_iteration": 2.7321088314056396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101502, + "balance_loss_mlp": 1.06705046, + "epoch": 0.19507502885725278, + "flos": 665842166784.0, + "grad_norm": 0.05715794205563071, + "language_loss": 0.84201366, + "learning_rate": 0.0009302345660933902, + "loss": 0.85302866, + "num_input_tokens_seen": 83837312, + "router_z_loss_mlp": 0.3449707, + "step": 1014, + "time_per_iteration": 2.7699263095855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109904, + "balance_loss_mlp": 1.07616735, + "epoch": 0.19526741054251634, + "flos": 670771625472.0, + "grad_norm": 0.05949834877265084, + "language_loss": 0.84866655, + "learning_rate": 0.0009300757509790026, + "loss": 0.85976553, + "num_input_tokens_seen": 83917120, + "router_z_loss_mlp": 0.33764648, + "step": 1015, + "time_per_iteration": 2.8250515460968018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110813, + "balance_loss_mlp": 1.0766474, + "epoch": 0.19545979222777993, + "flos": 446983653888.0, + "grad_norm": 0.0671511226198219, + "language_loss": 0.90974069, + "learning_rate": 0.0009299167688944005, + "loss": 0.92084885, + "num_input_tokens_seen": 83982992, + "router_z_loss_mlp": 0.34204102, + "step": 1016, + "time_per_iteration": 2.545133590698242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111157, + "balance_loss_mlp": 1.07778645, + "epoch": 0.1956521739130435, + "flos": 568813722624.0, + "grad_norm": 0.06338586690579641, + "language_loss": 0.85958129, + "learning_rate": 0.0009297576199013063, + "loss": 0.87069696, + "num_input_tokens_seen": 84057296, + "router_z_loss_mlp": 0.33813477, + "step": 1017, + "time_per_iteration": 2.668503761291504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148218, + "balance_loss_mlp": 1.13295972, + "epoch": 0.19584455559830705, + "flos": 1454969157120.0, + "grad_norm": 0.047651466398381144, + "language_loss": 0.73002136, + "learning_rate": 0.0009295983040615071, + "loss": 0.74150348, + "num_input_tokens_seen": 84292640, + "router_z_loss_mlp": 0.15234375, + "step": 1018, + "time_per_iteration": 4.920944929122925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104842, + "balance_loss_mlp": 1.09015501, + "epoch": 0.1960369372835706, + "flos": 1590320369664.0, + "grad_norm": 0.036993279908541045, + "language_loss": 0.79426301, + "learning_rate": 0.0009294388214368547, + "loss": 0.80531144, + "num_input_tokens_seen": 84524448, + "router_z_loss_mlp": 0.14648438, + "step": 1019, + "time_per_iteration": 6.0059425830841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118502, + "balance_loss_mlp": 1.08505166, + "epoch": 0.19622931896883417, + "flos": 615709237248.0, + "grad_norm": 0.05240041234704895, + "language_loss": 0.86600977, + "learning_rate": 0.0009292791720892659, + "loss": 0.87719476, + "num_input_tokens_seen": 84600208, + "router_z_loss_mlp": 0.3347168, + "step": 1020, + "time_per_iteration": 2.995192527770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113873, + "balance_loss_mlp": 1.07930255, + "epoch": 0.19642170065409773, + "flos": 465950790144.0, + "grad_norm": 0.0657036282835547, + "language_loss": 0.88724279, + "learning_rate": 0.0009291193560807218, + "loss": 0.89838147, + "num_input_tokens_seen": 84668032, + "router_z_loss_mlp": 0.34594727, + "step": 1021, + "time_per_iteration": 2.633256196975708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114293, + "balance_loss_mlp": 1.07962656, + "epoch": 0.19661408233936128, + "flos": 515040477696.0, + "grad_norm": 0.054836200403870924, + "language_loss": 0.87439638, + "learning_rate": 0.0009289593734732688, + "loss": 0.88553929, + "num_input_tokens_seen": 84738176, + "router_z_loss_mlp": 0.34716797, + "step": 1022, + "time_per_iteration": 2.622284173965454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107262, + "balance_loss_mlp": 1.0736922, + "epoch": 0.19680646402462484, + "flos": 392427624960.0, + "grad_norm": 0.053036961045345866, + "language_loss": 0.94139373, + "learning_rate": 0.0009287992243290175, + "loss": 0.95246631, + "num_input_tokens_seen": 84799936, + "router_z_loss_mlp": 0.3359375, + "step": 1023, + "time_per_iteration": 2.4402668476104736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108975, + "balance_loss_mlp": 1.07247353, + "epoch": 0.19699884570988843, + "flos": 626122566144.0, + "grad_norm": 0.056904835680118435, + "language_loss": 0.90850759, + "learning_rate": 0.0009286389087101435, + "loss": 0.91959733, + "num_input_tokens_seen": 84877216, + "router_z_loss_mlp": 0.36523438, + "step": 1024, + "time_per_iteration": 2.762068271636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110519, + "balance_loss_mlp": 1.06957078, + "epoch": 0.197191227395152, + "flos": 557710742016.0, + "grad_norm": 0.05298833269370499, + "language_loss": 0.88575542, + "learning_rate": 0.0009284784266788864, + "loss": 0.89680731, + "num_input_tokens_seen": 84952464, + "router_z_loss_mlp": 0.35668945, + "step": 1025, + "time_per_iteration": 4.087035417556763 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109606, + "balance_loss_mlp": 1.07565546, + "epoch": 0.19738360908041555, + "flos": 664681061376.0, + "grad_norm": 0.0565537913278748, + "language_loss": 0.92494339, + "learning_rate": 0.0009283177782975512, + "loss": 0.93603945, + "num_input_tokens_seen": 85031488, + "router_z_loss_mlp": 0.33984375, + "step": 1026, + "time_per_iteration": 2.948167562484741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095626, + "balance_loss_mlp": 1.06117415, + "epoch": 0.1975759907656791, + "flos": 522244094976.0, + "grad_norm": 0.06218898027866582, + "language_loss": 0.88052273, + "learning_rate": 0.000928156963628507, + "loss": 0.89147896, + "num_input_tokens_seen": 85098384, + "router_z_loss_mlp": 0.3449707, + "step": 1027, + "time_per_iteration": 2.564019203186035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091019, + "balance_loss_mlp": 1.05694866, + "epoch": 0.19776837245094267, + "flos": 462233309184.0, + "grad_norm": 0.056114928823487176, + "language_loss": 0.8826099, + "learning_rate": 0.0009279959827341877, + "loss": 0.89352006, + "num_input_tokens_seen": 85172944, + "router_z_loss_mlp": 0.34082031, + "step": 1028, + "time_per_iteration": 2.7226340770721436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090946, + "balance_loss_mlp": 1.05699515, + "epoch": 0.19796075413620623, + "flos": 502809887232.0, + "grad_norm": 0.05507551359640612, + "language_loss": 0.88204837, + "learning_rate": 0.0009278348356770915, + "loss": 0.89295781, + "num_input_tokens_seen": 85241632, + "router_z_loss_mlp": 0.33984375, + "step": 1029, + "time_per_iteration": 2.592756748199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085455, + "balance_loss_mlp": 1.05093157, + "epoch": 0.1981531358214698, + "flos": 507281038848.0, + "grad_norm": 0.061172366255401664, + "language_loss": 0.85939109, + "learning_rate": 0.0009276735225197814, + "loss": 0.87024558, + "num_input_tokens_seen": 85308992, + "router_z_loss_mlp": 0.34570312, + "step": 1030, + "time_per_iteration": 2.598607063293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088832, + "balance_loss_mlp": 1.05495238, + "epoch": 0.19834551750673335, + "flos": 531275148288.0, + "grad_norm": 0.0802549423316463, + "language_loss": 0.86293721, + "learning_rate": 0.0009275120433248847, + "loss": 0.87382561, + "num_input_tokens_seen": 85381936, + "router_z_loss_mlp": 0.33886719, + "step": 1031, + "time_per_iteration": 2.7143311500549316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090216, + "balance_loss_mlp": 1.05726683, + "epoch": 0.1985378991919969, + "flos": 775147691520.0, + "grad_norm": 0.05308511447166053, + "language_loss": 0.86272347, + "learning_rate": 0.0009273503981550931, + "loss": 0.87362564, + "num_input_tokens_seen": 85474352, + "router_z_loss_mlp": 0.32958984, + "step": 1032, + "time_per_iteration": 3.0616648197174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087082, + "balance_loss_mlp": 1.05351269, + "epoch": 0.1987302808772605, + "flos": 434063411712.0, + "grad_norm": 0.059916166081832097, + "language_loss": 0.8703599, + "learning_rate": 0.0009271885870731626, + "loss": 0.88123071, + "num_input_tokens_seen": 85538416, + "router_z_loss_mlp": 0.3359375, + "step": 1033, + "time_per_iteration": 2.487316131591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092715, + "balance_loss_mlp": 1.05921745, + "epoch": 0.19892266256252406, + "flos": 553342897152.0, + "grad_norm": 0.06168947094446192, + "language_loss": 0.88599998, + "learning_rate": 0.0009270266101419143, + "loss": 0.89692712, + "num_input_tokens_seen": 85604416, + "router_z_loss_mlp": 0.33520508, + "step": 1034, + "time_per_iteration": 2.5978119373321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085912, + "balance_loss_mlp": 1.05403578, + "epoch": 0.19911504424778761, + "flos": 549596302848.0, + "grad_norm": 0.06019117447906982, + "language_loss": 0.85564321, + "learning_rate": 0.0009268644674242328, + "loss": 0.86650234, + "num_input_tokens_seen": 85677008, + "router_z_loss_mlp": 0.31860352, + "step": 1035, + "time_per_iteration": 2.7259163856506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097892, + "balance_loss_mlp": 1.0645138, + "epoch": 0.19930742593305117, + "flos": 518024636928.0, + "grad_norm": 0.05869793462101787, + "language_loss": 0.81141233, + "learning_rate": 0.0009267021589830678, + "loss": 0.82239127, + "num_input_tokens_seen": 85745200, + "router_z_loss_mlp": 0.33398438, + "step": 1036, + "time_per_iteration": 2.597724199295044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161292, + "balance_loss_mlp": 1.14507985, + "epoch": 0.19949980761831473, + "flos": 1508516849664.0, + "grad_norm": 0.04621309141147155, + "language_loss": 0.77627081, + "learning_rate": 0.0009265396848814328, + "loss": 0.78788376, + "num_input_tokens_seen": 85980608, + "router_z_loss_mlp": 0.16210938, + "step": 1037, + "time_per_iteration": 4.918612241744995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093993, + "balance_loss_mlp": 1.06044722, + "epoch": 0.1996921893035783, + "flos": 697803738624.0, + "grad_norm": 0.061892224045152405, + "language_loss": 0.93283784, + "learning_rate": 0.000926377045182406, + "loss": 0.94377768, + "num_input_tokens_seen": 86055952, + "router_z_loss_mlp": 0.33569336, + "step": 1038, + "time_per_iteration": 2.8800160884857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096412, + "balance_loss_mlp": 1.06334293, + "epoch": 0.19988457098884185, + "flos": 726682226688.0, + "grad_norm": 0.0613562398808313, + "language_loss": 0.87972045, + "learning_rate": 0.0009262142399491296, + "loss": 0.89068449, + "num_input_tokens_seen": 86145536, + "router_z_loss_mlp": 0.33081055, + "step": 1039, + "time_per_iteration": 3.0561435222625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097284, + "balance_loss_mlp": 1.06345224, + "epoch": 0.2000769526741054, + "flos": 560275881984.0, + "grad_norm": 0.06364175085873486, + "language_loss": 0.87837642, + "learning_rate": 0.0009260512692448105, + "loss": 0.88934934, + "num_input_tokens_seen": 86214480, + "router_z_loss_mlp": 0.33862305, + "step": 1040, + "time_per_iteration": 2.7037088871002197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090351, + "balance_loss_mlp": 1.05697203, + "epoch": 0.200269334359369, + "flos": 571758594048.0, + "grad_norm": 0.05851279903795688, + "language_loss": 0.84325236, + "learning_rate": 0.000925888133132719, + "loss": 0.85415584, + "num_input_tokens_seen": 86289824, + "router_z_loss_mlp": 0.33398438, + "step": 1041, + "time_per_iteration": 2.836177110671997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082089, + "balance_loss_mlp": 1.06730711, + "epoch": 0.20046171604463256, + "flos": 1485362340864.0, + "grad_norm": 0.029405325300647274, + "language_loss": 0.79610431, + "learning_rate": 0.0009257248316761906, + "loss": 0.80692518, + "num_input_tokens_seen": 86516384, + "router_z_loss_mlp": 0.14746094, + "step": 1042, + "time_per_iteration": 4.901337146759033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094947, + "balance_loss_mlp": 1.06140149, + "epoch": 0.20065409772989612, + "flos": 496266808320.0, + "grad_norm": 0.07205728359427886, + "language_loss": 0.81256473, + "learning_rate": 0.0009255613649386244, + "loss": 0.82351422, + "num_input_tokens_seen": 86587296, + "router_z_loss_mlp": 0.33544922, + "step": 1043, + "time_per_iteration": 2.6198885440826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089381, + "balance_loss_mlp": 1.05686069, + "epoch": 0.20084647941515968, + "flos": 579094631424.0, + "grad_norm": 0.06625931968059934, + "language_loss": 0.79017001, + "learning_rate": 0.0009253977329834838, + "loss": 0.80106384, + "num_input_tokens_seen": 86662656, + "router_z_loss_mlp": 0.32519531, + "step": 1044, + "time_per_iteration": 2.6872498989105225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111234, + "balance_loss_mlp": 1.07745981, + "epoch": 0.20103886110042324, + "flos": 641775273984.0, + "grad_norm": 0.06628294367657735, + "language_loss": 0.86666185, + "learning_rate": 0.0009252339358742965, + "loss": 0.87778521, + "num_input_tokens_seen": 86734704, + "router_z_loss_mlp": 0.34912109, + "step": 1045, + "time_per_iteration": 2.7749996185302734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116952, + "balance_loss_mlp": 1.08219087, + "epoch": 0.2012312427856868, + "flos": 441720953856.0, + "grad_norm": 0.05401214919341486, + "language_loss": 0.83449644, + "learning_rate": 0.000925069973674654, + "loss": 0.84566593, + "num_input_tokens_seen": 86806512, + "router_z_loss_mlp": 0.34814453, + "step": 1046, + "time_per_iteration": 2.662992477416992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114258, + "balance_loss_mlp": 1.08116508, + "epoch": 0.20142362447095036, + "flos": 554135855616.0, + "grad_norm": 0.049297877184233195, + "language_loss": 0.88960069, + "learning_rate": 0.000924905846448212, + "loss": 0.90074325, + "num_input_tokens_seen": 86883440, + "router_z_loss_mlp": 0.33105469, + "step": 1047, + "time_per_iteration": 2.8164925575256348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127137, + "balance_loss_mlp": 1.09306693, + "epoch": 0.20161600615621392, + "flos": 669988841472.0, + "grad_norm": 0.07365230282100185, + "language_loss": 0.85615861, + "learning_rate": 0.0009247415542586906, + "loss": 0.86742997, + "num_input_tokens_seen": 86960208, + "router_z_loss_mlp": 0.34106445, + "step": 1048, + "time_per_iteration": 2.858611822128296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115399, + "balance_loss_mlp": 1.08130527, + "epoch": 0.2018083878414775, + "flos": 572788689408.0, + "grad_norm": 0.05223287600750505, + "language_loss": 0.83514655, + "learning_rate": 0.0009245770971698735, + "loss": 0.84630048, + "num_input_tokens_seen": 87044144, + "router_z_loss_mlp": 0.34106445, + "step": 1049, + "time_per_iteration": 2.8758163452148438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103626, + "balance_loss_mlp": 1.07036686, + "epoch": 0.20200076952674106, + "flos": 425624495616.0, + "grad_norm": 0.061140118103518055, + "language_loss": 0.88792473, + "learning_rate": 0.0009244124752456087, + "loss": 0.89896095, + "num_input_tokens_seen": 87109136, + "router_z_loss_mlp": 0.33276367, + "step": 1050, + "time_per_iteration": 2.501565456390381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097765, + "balance_loss_mlp": 1.06457758, + "epoch": 0.20219315121200462, + "flos": 536326852608.0, + "grad_norm": 0.049507299183714965, + "language_loss": 0.85344577, + "learning_rate": 0.0009242476885498081, + "loss": 0.86442339, + "num_input_tokens_seen": 87184320, + "router_z_loss_mlp": 0.33203125, + "step": 1051, + "time_per_iteration": 2.698791027069092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095962, + "balance_loss_mlp": 1.06222594, + "epoch": 0.20238553289726818, + "flos": 477634323456.0, + "grad_norm": 0.07140169421024865, + "language_loss": 0.8134433, + "learning_rate": 0.0009240827371464474, + "loss": 0.82440293, + "num_input_tokens_seen": 87248224, + "router_z_loss_mlp": 0.33764648, + "step": 1052, + "time_per_iteration": 2.5603079795837402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082157, + "balance_loss_mlp": 1.04958868, + "epoch": 0.20257791458253174, + "flos": 1151611430400.0, + "grad_norm": 0.06069279327125781, + "language_loss": 0.84372044, + "learning_rate": 0.0009239176210995666, + "loss": 0.85454196, + "num_input_tokens_seen": 87333088, + "router_z_loss_mlp": 0.32568359, + "step": 1053, + "time_per_iteration": 3.4549684524536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088056, + "balance_loss_mlp": 1.05393791, + "epoch": 0.2027702962677953, + "flos": 666606011904.0, + "grad_norm": 0.06066867592012189, + "language_loss": 0.93657684, + "learning_rate": 0.0009237523404732695, + "loss": 0.94745743, + "num_input_tokens_seen": 87413840, + "router_z_loss_mlp": 0.34130859, + "step": 1054, + "time_per_iteration": 4.344247817993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078384, + "balance_loss_mlp": 1.04567289, + "epoch": 0.20296267795305886, + "flos": 641011428864.0, + "grad_norm": 0.0678922331878557, + "language_loss": 0.84289086, + "learning_rate": 0.0009235868953317235, + "loss": 0.85367465, + "num_input_tokens_seen": 87487168, + "router_z_loss_mlp": 0.32714844, + "step": 1055, + "time_per_iteration": 2.7755184173583984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086857, + "balance_loss_mlp": 1.05321646, + "epoch": 0.20315505963832242, + "flos": 930187777536.0, + "grad_norm": 0.06816541670806936, + "language_loss": 0.85603452, + "learning_rate": 0.0009234212857391602, + "loss": 0.86690307, + "num_input_tokens_seen": 87573184, + "router_z_loss_mlp": 0.33642578, + "step": 1056, + "time_per_iteration": 3.1736087799072266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089034, + "balance_loss_mlp": 1.05477369, + "epoch": 0.20334744132358598, + "flos": 561818128896.0, + "grad_norm": 0.05209348313890264, + "language_loss": 0.88978589, + "learning_rate": 0.000923255511759875, + "loss": 0.90067613, + "num_input_tokens_seen": 87651968, + "router_z_loss_mlp": 0.34301758, + "step": 1057, + "time_per_iteration": 2.7823617458343506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093097, + "balance_loss_mlp": 1.05945587, + "epoch": 0.20353982300884957, + "flos": 643902455808.0, + "grad_norm": 0.061337083912670884, + "language_loss": 0.85219932, + "learning_rate": 0.000923089573458227, + "loss": 0.86313027, + "num_input_tokens_seen": 87727792, + "router_z_loss_mlp": 0.33666992, + "step": 1058, + "time_per_iteration": 2.8398988246917725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092738, + "balance_loss_mlp": 1.05952644, + "epoch": 0.20373220469411313, + "flos": 651101690880.0, + "grad_norm": 0.0713114334987562, + "language_loss": 0.84425724, + "learning_rate": 0.0009229234708986392, + "loss": 0.85518456, + "num_input_tokens_seen": 87806048, + "router_z_loss_mlp": 0.33203125, + "step": 1059, + "time_per_iteration": 2.891934394836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069957, + "balance_loss_mlp": 1.05603302, + "epoch": 0.2039245863793767, + "flos": 1436939136000.0, + "grad_norm": 0.037855568460977755, + "language_loss": 0.81666899, + "learning_rate": 0.0009227572041455982, + "loss": 0.8273685, + "num_input_tokens_seen": 88018160, + "router_z_loss_mlp": 0.13964844, + "step": 1060, + "time_per_iteration": 4.673142194747925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092929, + "balance_loss_mlp": 1.05985999, + "epoch": 0.20411696806464025, + "flos": 596678082048.0, + "grad_norm": 0.07190006801568614, + "language_loss": 0.85404283, + "learning_rate": 0.0009225907732636548, + "loss": 0.86497211, + "num_input_tokens_seen": 88090864, + "router_z_loss_mlp": 0.33081055, + "step": 1061, + "time_per_iteration": 2.74110746383667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091714, + "balance_loss_mlp": 1.0585742, + "epoch": 0.2043093497499038, + "flos": 573530775552.0, + "grad_norm": 0.06271161302412134, + "language_loss": 0.86991799, + "learning_rate": 0.0009224241783174227, + "loss": 0.88083506, + "num_input_tokens_seen": 88161360, + "router_z_loss_mlp": 0.33154297, + "step": 1062, + "time_per_iteration": 2.6885697841644287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084619, + "balance_loss_mlp": 1.05233693, + "epoch": 0.20450173143516737, + "flos": 630061217280.0, + "grad_norm": 0.055816021094363524, + "language_loss": 0.85842204, + "learning_rate": 0.0009222574193715802, + "loss": 0.86926818, + "num_input_tokens_seen": 88234960, + "router_z_loss_mlp": 0.32275391, + "step": 1063, + "time_per_iteration": 2.7569899559020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087796, + "balance_loss_mlp": 1.05522823, + "epoch": 0.20469411312043093, + "flos": 573718450176.0, + "grad_norm": 0.051897822989382614, + "language_loss": 0.8621105, + "learning_rate": 0.000922090496490869, + "loss": 0.87298846, + "num_input_tokens_seen": 88308176, + "router_z_loss_mlp": 0.32568359, + "step": 1064, + "time_per_iteration": 2.7099597454071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082485, + "balance_loss_mlp": 1.05025065, + "epoch": 0.20488649480569449, + "flos": 636748300800.0, + "grad_norm": 0.04962250787968228, + "language_loss": 0.90165728, + "learning_rate": 0.0009219234097400937, + "loss": 0.91248214, + "num_input_tokens_seen": 88386768, + "router_z_loss_mlp": 0.32226562, + "step": 1065, + "time_per_iteration": 2.8492319583892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108042, + "balance_loss_mlp": 1.04806709, + "epoch": 0.20507887649095807, + "flos": 975383894016.0, + "grad_norm": 0.051536979552593745, + "language_loss": 0.83029723, + "learning_rate": 0.0009217561591841237, + "loss": 0.84110147, + "num_input_tokens_seen": 88476576, + "router_z_loss_mlp": 0.32348633, + "step": 1066, + "time_per_iteration": 3.267207145690918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082085, + "balance_loss_mlp": 1.04951739, + "epoch": 0.20527125817622163, + "flos": 485940819456.0, + "grad_norm": 0.09661652793466288, + "language_loss": 0.81334901, + "learning_rate": 0.0009215887448878913, + "loss": 0.82416987, + "num_input_tokens_seen": 88541968, + "router_z_loss_mlp": 0.32568359, + "step": 1067, + "time_per_iteration": 2.5429391860961914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088998, + "balance_loss_mlp": 1.05552411, + "epoch": 0.2054636398614852, + "flos": 526921860096.0, + "grad_norm": 0.09953641970782799, + "language_loss": 0.85144234, + "learning_rate": 0.0009214211669163922, + "loss": 0.86233234, + "num_input_tokens_seen": 88615296, + "router_z_loss_mlp": 0.33496094, + "step": 1068, + "time_per_iteration": 2.7006540298461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085875, + "balance_loss_mlp": 1.05428481, + "epoch": 0.20565602154674875, + "flos": 557898416640.0, + "grad_norm": 0.048729379907622286, + "language_loss": 0.93896133, + "learning_rate": 0.0009212534253346862, + "loss": 0.94982004, + "num_input_tokens_seen": 88691584, + "router_z_loss_mlp": 0.31567383, + "step": 1069, + "time_per_iteration": 2.7544496059417725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098993, + "balance_loss_mlp": 1.06713986, + "epoch": 0.2058484032320123, + "flos": 503976784896.0, + "grad_norm": 0.06649355865978995, + "language_loss": 0.8497259, + "learning_rate": 0.0009210855202078964, + "loss": 0.86071587, + "num_input_tokens_seen": 88756592, + "router_z_loss_mlp": 0.31835938, + "step": 1070, + "time_per_iteration": 2.59660005569458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113348, + "balance_loss_mlp": 1.07975471, + "epoch": 0.20604078491727587, + "flos": 432950358528.0, + "grad_norm": 0.06315152856471482, + "language_loss": 0.87476587, + "learning_rate": 0.0009209174516012091, + "loss": 0.88589936, + "num_input_tokens_seen": 88820928, + "router_z_loss_mlp": 0.3359375, + "step": 1071, + "time_per_iteration": 2.498087167739868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110052, + "balance_loss_mlp": 1.07624412, + "epoch": 0.20623316660253943, + "flos": 608421252096.0, + "grad_norm": 0.06211591839104366, + "language_loss": 0.89244497, + "learning_rate": 0.0009207492195798747, + "loss": 0.9035455, + "num_input_tokens_seen": 88895440, + "router_z_loss_mlp": 0.33837891, + "step": 1072, + "time_per_iteration": 2.760019063949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116237, + "balance_loss_mlp": 1.08142757, + "epoch": 0.206425548287803, + "flos": 480184906752.0, + "grad_norm": 0.07379229384440758, + "language_loss": 0.84887302, + "learning_rate": 0.0009205808242092061, + "loss": 0.86003542, + "num_input_tokens_seen": 88964400, + "router_z_loss_mlp": 0.34838867, + "step": 1073, + "time_per_iteration": 2.6034529209136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118683, + "balance_loss_mlp": 1.08423102, + "epoch": 0.20661792997306658, + "flos": 949007937024.0, + "grad_norm": 0.0763588165275792, + "language_loss": 0.82845032, + "learning_rate": 0.0009204122655545808, + "loss": 0.83963716, + "num_input_tokens_seen": 89049600, + "router_z_loss_mlp": 0.34472656, + "step": 1074, + "time_per_iteration": 3.3222029209136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111187, + "balance_loss_mlp": 1.07759392, + "epoch": 0.20681031165833014, + "flos": 603206604288.0, + "grad_norm": 0.05592396046249817, + "language_loss": 0.80705297, + "learning_rate": 0.0009202435436814388, + "loss": 0.81816483, + "num_input_tokens_seen": 89119024, + "router_z_loss_mlp": 0.33618164, + "step": 1075, + "time_per_iteration": 2.721888780593872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114903, + "balance_loss_mlp": 1.08121455, + "epoch": 0.2070026933435937, + "flos": 708665200128.0, + "grad_norm": 0.07630450069092473, + "language_loss": 0.89700603, + "learning_rate": 0.0009200746586552836, + "loss": 0.90815508, + "num_input_tokens_seen": 89197344, + "router_z_loss_mlp": 0.3371582, + "step": 1076, + "time_per_iteration": 2.8797900676727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107662, + "balance_loss_mlp": 1.07416463, + "epoch": 0.20719507502885726, + "flos": 829456409088.0, + "grad_norm": 0.06176881640488279, + "language_loss": 0.84210765, + "learning_rate": 0.0009199056105416825, + "loss": 0.85318428, + "num_input_tokens_seen": 89280464, + "router_z_loss_mlp": 0.33520508, + "step": 1077, + "time_per_iteration": 3.120950698852539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107166, + "balance_loss_mlp": 1.07312012, + "epoch": 0.20738745671412082, + "flos": 637993774080.0, + "grad_norm": 0.055893649084458805, + "language_loss": 0.86594802, + "learning_rate": 0.0009197363994062654, + "loss": 0.87701964, + "num_input_tokens_seen": 89353344, + "router_z_loss_mlp": 0.34057617, + "step": 1078, + "time_per_iteration": 2.8197755813598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086727, + "balance_loss_mlp": 1.05480289, + "epoch": 0.20757983839938438, + "flos": 685258845696.0, + "grad_norm": 0.054433441748304986, + "language_loss": 0.84861732, + "learning_rate": 0.0009195670253147262, + "loss": 0.85948461, + "num_input_tokens_seen": 89439328, + "router_z_loss_mlp": 0.3190918, + "step": 1079, + "time_per_iteration": 2.966987133026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096354, + "balance_loss_mlp": 1.06340468, + "epoch": 0.20777222008464794, + "flos": 519024208896.0, + "grad_norm": 0.07868801214896702, + "language_loss": 0.82301188, + "learning_rate": 0.0009193974883328216, + "loss": 0.83397532, + "num_input_tokens_seen": 89510160, + "router_z_loss_mlp": 0.32958984, + "step": 1080, + "time_per_iteration": 2.620704174041748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091209, + "balance_loss_mlp": 1.05725837, + "epoch": 0.2079646017699115, + "flos": 511136732160.0, + "grad_norm": 0.09961486538628272, + "language_loss": 0.87482947, + "learning_rate": 0.0009192277885263718, + "loss": 0.88574153, + "num_input_tokens_seen": 89582960, + "router_z_loss_mlp": 0.33984375, + "step": 1081, + "time_per_iteration": 2.6247479915618896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087114, + "balance_loss_mlp": 1.05352044, + "epoch": 0.20815698345517505, + "flos": 931409929728.0, + "grad_norm": 0.05448561879445608, + "language_loss": 0.86255312, + "learning_rate": 0.0009190579259612602, + "loss": 0.87342417, + "num_input_tokens_seen": 89675488, + "router_z_loss_mlp": 0.33618164, + "step": 1082, + "time_per_iteration": 3.2661428451538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085916, + "balance_loss_mlp": 1.05187023, + "epoch": 0.20834936514043864, + "flos": 632114205696.0, + "grad_norm": 0.059638645169798186, + "language_loss": 0.86669636, + "learning_rate": 0.000918887900703433, + "loss": 0.87755549, + "num_input_tokens_seen": 89747872, + "router_z_loss_mlp": 0.34082031, + "step": 1083, + "time_per_iteration": 2.7930080890655518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083976, + "balance_loss_mlp": 1.05038285, + "epoch": 0.2085417468257022, + "flos": 394170693120.0, + "grad_norm": 0.06326041775418027, + "language_loss": 0.90427065, + "learning_rate": 0.0009187177128188999, + "loss": 0.91511047, + "num_input_tokens_seen": 89810176, + "router_z_loss_mlp": 0.33618164, + "step": 1084, + "time_per_iteration": 2.431358814239502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054848, + "balance_loss_mlp": 1.04159176, + "epoch": 0.20873412851096576, + "flos": 1401387969024.0, + "grad_norm": 0.04127554175786628, + "language_loss": 0.77156538, + "learning_rate": 0.0009185473623737339, + "loss": 0.78211385, + "num_input_tokens_seen": 90038432, + "router_z_loss_mlp": 0.1328125, + "step": 1085, + "time_per_iteration": 6.352816343307495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080772, + "balance_loss_mlp": 1.04686832, + "epoch": 0.20892651019622932, + "flos": 447599112192.0, + "grad_norm": 0.06234370040412467, + "language_loss": 0.8612783, + "learning_rate": 0.000918376849434071, + "loss": 0.87208605, + "num_input_tokens_seen": 90101568, + "router_z_loss_mlp": 0.33935547, + "step": 1086, + "time_per_iteration": 2.5168843269348145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084999, + "balance_loss_mlp": 1.05040443, + "epoch": 0.20911889188149288, + "flos": 492863629824.0, + "grad_norm": 0.07820142019527274, + "language_loss": 0.90828383, + "learning_rate": 0.0009182061740661098, + "loss": 0.91913384, + "num_input_tokens_seen": 90169344, + "router_z_loss_mlp": 0.34643555, + "step": 1087, + "time_per_iteration": 2.5461525917053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083648, + "balance_loss_mlp": 1.0494113, + "epoch": 0.20931127356675644, + "flos": 840928946688.0, + "grad_norm": 0.05821627614551514, + "language_loss": 0.85034752, + "learning_rate": 0.0009180353363361127, + "loss": 0.86118406, + "num_input_tokens_seen": 90252416, + "router_z_loss_mlp": 0.3425293, + "step": 1088, + "time_per_iteration": 3.11942982673645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084605, + "balance_loss_mlp": 1.05036855, + "epoch": 0.20950365525202, + "flos": 756796013568.0, + "grad_norm": 0.06471498550944753, + "language_loss": 0.82101512, + "learning_rate": 0.0009178643363104044, + "loss": 0.83186114, + "num_input_tokens_seen": 90337952, + "router_z_loss_mlp": 0.34277344, + "step": 1089, + "time_per_iteration": 3.0986390113830566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107905, + "balance_loss_mlp": 1.04543328, + "epoch": 0.20969603693728356, + "flos": 472301812224.0, + "grad_norm": 0.07091461504319575, + "language_loss": 0.91050649, + "learning_rate": 0.0009176931740553735, + "loss": 0.92129695, + "num_input_tokens_seen": 90401488, + "router_z_loss_mlp": 0.33642578, + "step": 1090, + "time_per_iteration": 2.4965460300445557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108219, + "balance_loss_mlp": 1.04845381, + "epoch": 0.20988841862254715, + "flos": 976507121664.0, + "grad_norm": 0.05967441428812083, + "language_loss": 0.82829833, + "learning_rate": 0.0009175218496374708, + "loss": 0.83912027, + "num_input_tokens_seen": 90486144, + "router_z_loss_mlp": 0.33740234, + "step": 1091, + "time_per_iteration": 3.325467348098755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082454, + "balance_loss_mlp": 1.04917121, + "epoch": 0.2100808003078107, + "flos": 1092697731072.0, + "grad_norm": 0.06552872916111846, + "language_loss": 0.85816884, + "learning_rate": 0.0009173503631232103, + "loss": 0.86899334, + "num_input_tokens_seen": 90571504, + "router_z_loss_mlp": 0.33300781, + "step": 1092, + "time_per_iteration": 3.3492543697357178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080972, + "balance_loss_mlp": 1.04761767, + "epoch": 0.21027318199307427, + "flos": 1012567468032.0, + "grad_norm": 0.06864870254184631, + "language_loss": 0.8205356, + "learning_rate": 0.0009171787145791691, + "loss": 0.83134532, + "num_input_tokens_seen": 90646016, + "router_z_loss_mlp": 0.33374023, + "step": 1093, + "time_per_iteration": 3.229302167892456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083111, + "balance_loss_mlp": 1.04975629, + "epoch": 0.21046556367833782, + "flos": 521141216256.0, + "grad_norm": 0.08362122797221107, + "language_loss": 0.80208671, + "learning_rate": 0.000917006904071987, + "loss": 0.81291783, + "num_input_tokens_seen": 90713440, + "router_z_loss_mlp": 0.33374023, + "step": 1094, + "time_per_iteration": 2.5901217460632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091484, + "balance_loss_mlp": 1.05843902, + "epoch": 0.21065794536360138, + "flos": 603437948928.0, + "grad_norm": 0.05523679641811596, + "language_loss": 0.87209588, + "learning_rate": 0.0009168349316683669, + "loss": 0.88301063, + "num_input_tokens_seen": 90788208, + "router_z_loss_mlp": 0.33056641, + "step": 1095, + "time_per_iteration": 2.67250919342041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093141, + "balance_loss_mlp": 1.06081104, + "epoch": 0.21085032704886494, + "flos": 603045070848.0, + "grad_norm": 0.05347318487829757, + "language_loss": 0.82685143, + "learning_rate": 0.0009166627974350741, + "loss": 0.83778286, + "num_input_tokens_seen": 90873776, + "router_z_loss_mlp": 0.32324219, + "step": 1096, + "time_per_iteration": 2.9291882514953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097603, + "balance_loss_mlp": 1.06362867, + "epoch": 0.2110427087341285, + "flos": 637382697984.0, + "grad_norm": 0.059512513015867, + "language_loss": 0.89716321, + "learning_rate": 0.0009164905014389373, + "loss": 0.90813923, + "num_input_tokens_seen": 90945872, + "router_z_loss_mlp": 0.34008789, + "step": 1097, + "time_per_iteration": 2.7384700775146484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100621, + "balance_loss_mlp": 1.06798196, + "epoch": 0.21123509041939206, + "flos": 522667496448.0, + "grad_norm": 0.08051519151754843, + "language_loss": 0.87020361, + "learning_rate": 0.0009163180437468476, + "loss": 0.88120985, + "num_input_tokens_seen": 91016224, + "router_z_loss_mlp": 0.32641602, + "step": 1098, + "time_per_iteration": 2.584890365600586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109632, + "balance_loss_mlp": 1.0635848, + "epoch": 0.21142747210465565, + "flos": 450938271744.0, + "grad_norm": 0.05811835985780437, + "language_loss": 0.86184567, + "learning_rate": 0.000916145424425759, + "loss": 0.87280893, + "num_input_tokens_seen": 91086752, + "router_z_loss_mlp": 0.32739258, + "step": 1099, + "time_per_iteration": 2.6362791061401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104023, + "balance_loss_mlp": 1.07059634, + "epoch": 0.2116198537899192, + "flos": 875813630976.0, + "grad_norm": 0.07623729144092387, + "language_loss": 0.9082064, + "learning_rate": 0.0009159726435426885, + "loss": 0.91924655, + "num_input_tokens_seen": 91162960, + "router_z_loss_mlp": 0.33447266, + "step": 1100, + "time_per_iteration": 3.0668158531188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108924, + "balance_loss_mlp": 1.0554564, + "epoch": 0.21181223547518277, + "flos": 523410992640.0, + "grad_norm": 0.06029059005678133, + "language_loss": 0.90809137, + "learning_rate": 0.0009157997011647154, + "loss": 0.91898382, + "num_input_tokens_seen": 91229840, + "router_z_loss_mlp": 0.33813477, + "step": 1101, + "time_per_iteration": 2.5932393074035645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110663, + "balance_loss_mlp": 1.07327497, + "epoch": 0.21200461716044633, + "flos": 572014669824.0, + "grad_norm": 0.05812758027328986, + "language_loss": 0.86378956, + "learning_rate": 0.0009156265973589817, + "loss": 0.87485588, + "num_input_tokens_seen": 91307936, + "router_z_loss_mlp": 0.33374023, + "step": 1102, + "time_per_iteration": 2.79496431350708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110266, + "balance_loss_mlp": 1.07672012, + "epoch": 0.2121969988457099, + "flos": 544869075456.0, + "grad_norm": 0.0704183859149776, + "language_loss": 0.89789248, + "learning_rate": 0.0009154533321926926, + "loss": 0.90899515, + "num_input_tokens_seen": 91372848, + "router_z_loss_mlp": 0.33569336, + "step": 1103, + "time_per_iteration": 2.5982048511505127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107149, + "balance_loss_mlp": 1.07393694, + "epoch": 0.21238938053097345, + "flos": 843489704448.0, + "grad_norm": 0.06399101868010165, + "language_loss": 0.87705767, + "learning_rate": 0.0009152799057331156, + "loss": 0.88812917, + "num_input_tokens_seen": 91452768, + "router_z_loss_mlp": 0.33227539, + "step": 1104, + "time_per_iteration": 3.088672637939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100871, + "balance_loss_mlp": 1.06768262, + "epoch": 0.212581762216237, + "flos": 445984081920.0, + "grad_norm": 0.064105004549741, + "language_loss": 0.91047186, + "learning_rate": 0.0009151063180475805, + "loss": 0.9214806, + "num_input_tokens_seen": 91519888, + "router_z_loss_mlp": 0.33203125, + "step": 1105, + "time_per_iteration": 2.569998025894165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090698, + "balance_loss_mlp": 1.05798697, + "epoch": 0.21277414390150057, + "flos": 514129655808.0, + "grad_norm": 0.05967324045126681, + "language_loss": 0.84732658, + "learning_rate": 0.0009149325692034803, + "loss": 0.85823357, + "num_input_tokens_seen": 91585744, + "router_z_loss_mlp": 0.32714844, + "step": 1106, + "time_per_iteration": 2.6009016036987305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149918, + "balance_loss_mlp": 1.13380063, + "epoch": 0.21296652558676413, + "flos": 1484790552576.0, + "grad_norm": 0.04210654195905191, + "language_loss": 0.79203427, + "learning_rate": 0.0009147586592682702, + "loss": 0.80353343, + "num_input_tokens_seen": 91805840, + "router_z_loss_mlp": 0.16113281, + "step": 1107, + "time_per_iteration": 4.820629596710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082056, + "balance_loss_mlp": 1.04994082, + "epoch": 0.21315890727202771, + "flos": 845689669632.0, + "grad_norm": 0.07454945953507684, + "language_loss": 0.87513995, + "learning_rate": 0.0009145845883094678, + "loss": 0.88596046, + "num_input_tokens_seen": 91885936, + "router_z_loss_mlp": 0.32080078, + "step": 1108, + "time_per_iteration": 3.0311591625213623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083334, + "balance_loss_mlp": 1.04971695, + "epoch": 0.21335128895729127, + "flos": 629086376448.0, + "grad_norm": 0.07212897946446892, + "language_loss": 0.85387337, + "learning_rate": 0.000914410356394654, + "loss": 0.86470675, + "num_input_tokens_seen": 91959888, + "router_z_loss_mlp": 0.33642578, + "step": 1109, + "time_per_iteration": 2.7746968269348145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085071, + "balance_loss_mlp": 1.05102468, + "epoch": 0.21354367064255483, + "flos": 710649787392.0, + "grad_norm": 0.053148069764317206, + "language_loss": 0.85104829, + "learning_rate": 0.0009142359635914709, + "loss": 0.86189902, + "num_input_tokens_seen": 92043728, + "router_z_loss_mlp": 0.34057617, + "step": 1110, + "time_per_iteration": 3.109018564224243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083884, + "balance_loss_mlp": 1.05067194, + "epoch": 0.2137360523278184, + "flos": 455950688256.0, + "grad_norm": 0.07113647076789116, + "language_loss": 0.84692943, + "learning_rate": 0.0009140614099676245, + "loss": 0.8577683, + "num_input_tokens_seen": 92114096, + "router_z_loss_mlp": 0.33203125, + "step": 1111, + "time_per_iteration": 2.5607409477233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083673, + "balance_loss_mlp": 1.05072355, + "epoch": 0.21392843401308195, + "flos": 665749034496.0, + "grad_norm": 0.059219994241997045, + "language_loss": 0.82997137, + "learning_rate": 0.0009138866955908821, + "loss": 0.84080815, + "num_input_tokens_seen": 92193552, + "router_z_loss_mlp": 0.32958984, + "step": 1112, + "time_per_iteration": 2.901376724243164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086583, + "balance_loss_mlp": 1.05327559, + "epoch": 0.2141208156983455, + "flos": 748656843264.0, + "grad_norm": 0.06302145936378449, + "language_loss": 0.80617785, + "learning_rate": 0.0009137118205290738, + "loss": 0.81704366, + "num_input_tokens_seen": 92279248, + "router_z_loss_mlp": 0.33325195, + "step": 1113, + "time_per_iteration": 2.9629132747650146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085777, + "balance_loss_mlp": 1.05142069, + "epoch": 0.21431319738360907, + "flos": 418898124288.0, + "grad_norm": 0.06913372638273338, + "language_loss": 0.90778732, + "learning_rate": 0.0009135367848500924, + "loss": 0.91864502, + "num_input_tokens_seen": 92344064, + "router_z_loss_mlp": 0.34399414, + "step": 1114, + "time_per_iteration": 2.5860419273376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087087, + "balance_loss_mlp": 1.05406582, + "epoch": 0.21450557906887263, + "flos": 608849035776.0, + "grad_norm": 0.07370492115341919, + "language_loss": 0.86567605, + "learning_rate": 0.0009133615886218927, + "loss": 0.87654686, + "num_input_tokens_seen": 92410544, + "router_z_loss_mlp": 0.33032227, + "step": 1115, + "time_per_iteration": 2.6986265182495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095845, + "balance_loss_mlp": 1.06082106, + "epoch": 0.21469796075413622, + "flos": 561649393152.0, + "grad_norm": 0.0682239504380638, + "language_loss": 0.88444531, + "learning_rate": 0.0009131862319124917, + "loss": 0.89540386, + "num_input_tokens_seen": 92480272, + "router_z_loss_mlp": 0.3503418, + "step": 1116, + "time_per_iteration": 2.644977569580078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086331, + "balance_loss_mlp": 1.0540725, + "epoch": 0.21489034243939978, + "flos": 594363225600.0, + "grad_norm": 0.06937847326766512, + "language_loss": 0.8429122, + "learning_rate": 0.0009130107147899691, + "loss": 0.85377544, + "num_input_tokens_seen": 92555584, + "router_z_loss_mlp": 0.32250977, + "step": 1117, + "time_per_iteration": 2.768064498901367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084886, + "balance_loss_mlp": 1.05148315, + "epoch": 0.21508272412466334, + "flos": 441661317120.0, + "grad_norm": 0.09911577685113587, + "language_loss": 0.85504615, + "learning_rate": 0.0009128350373224665, + "loss": 0.86589503, + "num_input_tokens_seen": 92623136, + "router_z_loss_mlp": 0.33422852, + "step": 1118, + "time_per_iteration": 2.5369865894317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033841, + "balance_loss_mlp": 1.02206326, + "epoch": 0.2152751058099269, + "flos": 1495397348352.0, + "grad_norm": 0.028624916140058014, + "language_loss": 0.81456429, + "learning_rate": 0.0009126591995781883, + "loss": 0.82490271, + "num_input_tokens_seen": 92842608, + "router_z_loss_mlp": 0.11767578, + "step": 1119, + "time_per_iteration": 4.634536266326904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109091, + "balance_loss_mlp": 1.05741262, + "epoch": 0.21546748749519046, + "flos": 493759895040.0, + "grad_norm": 0.057336284262766976, + "language_loss": 0.85470641, + "learning_rate": 0.0009124832016254005, + "loss": 0.86561549, + "num_input_tokens_seen": 92912960, + "router_z_loss_mlp": 0.33520508, + "step": 1120, + "time_per_iteration": 2.57099986076355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097048, + "balance_loss_mlp": 1.06245303, + "epoch": 0.21565986918045402, + "flos": 634241387520.0, + "grad_norm": 0.0556622286599547, + "language_loss": 0.8842063, + "learning_rate": 0.0009123070435324316, + "loss": 0.89517677, + "num_input_tokens_seen": 92982272, + "router_z_loss_mlp": 0.34619141, + "step": 1121, + "time_per_iteration": 2.73698091506958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010392, + "balance_loss_mlp": 1.02780366, + "epoch": 0.21585225086571758, + "flos": 1582502704128.0, + "grad_norm": 0.024824935431588098, + "language_loss": 0.77875781, + "learning_rate": 0.0009121307253676722, + "loss": 0.78914982, + "num_input_tokens_seen": 93218752, + "router_z_loss_mlp": 0.11376953, + "step": 1122, + "time_per_iteration": 4.960963010787964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093163, + "balance_loss_mlp": 1.05897415, + "epoch": 0.21604463255098114, + "flos": 683799556608.0, + "grad_norm": 0.06637115500638362, + "language_loss": 0.86772728, + "learning_rate": 0.0009119542471995752, + "loss": 0.87865889, + "num_input_tokens_seen": 93293968, + "router_z_loss_mlp": 0.34204102, + "step": 1123, + "time_per_iteration": 2.819042205810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109659, + "balance_loss_mlp": 1.06228209, + "epoch": 0.2162370142362447, + "flos": 780660675072.0, + "grad_norm": 0.06221084946299637, + "language_loss": 0.81623554, + "learning_rate": 0.0009117776090966554, + "loss": 0.82720149, + "num_input_tokens_seen": 93367088, + "router_z_loss_mlp": 0.34326172, + "step": 1124, + "time_per_iteration": 2.9435975551605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090546, + "balance_loss_mlp": 1.05473578, + "epoch": 0.21642939592150828, + "flos": 1001745294336.0, + "grad_norm": 0.06219513600405685, + "language_loss": 0.86821365, + "learning_rate": 0.0009116008111274899, + "loss": 0.8791191, + "num_input_tokens_seen": 93452944, + "router_z_loss_mlp": 0.35839844, + "step": 1125, + "time_per_iteration": 3.250828504562378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01023544, + "balance_loss_mlp": 1.01271951, + "epoch": 0.21662177760677184, + "flos": 1481867440128.0, + "grad_norm": 0.013492425774453086, + "language_loss": 0.79106927, + "learning_rate": 0.0009114238533607176, + "loss": 0.8013047, + "num_input_tokens_seen": 93677328, + "router_z_loss_mlp": 0.10839844, + "step": 1126, + "time_per_iteration": 4.836662530899048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078043, + "balance_loss_mlp": 1.0431627, + "epoch": 0.2168141592920354, + "flos": 887030092800.0, + "grad_norm": 0.06408405180788145, + "language_loss": 0.84878719, + "learning_rate": 0.0009112467358650396, + "loss": 0.85956764, + "num_input_tokens_seen": 93756848, + "router_z_loss_mlp": 0.34912109, + "step": 1127, + "time_per_iteration": 3.118460178375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087154, + "balance_loss_mlp": 1.05205846, + "epoch": 0.21700654097729896, + "flos": 545682382848.0, + "grad_norm": 0.06014422622436645, + "language_loss": 0.86521864, + "learning_rate": 0.0009110694587092192, + "loss": 0.87609017, + "num_input_tokens_seen": 93834704, + "router_z_loss_mlp": 0.35131836, + "step": 1128, + "time_per_iteration": 2.736814022064209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080165, + "balance_loss_mlp": 1.0446167, + "epoch": 0.21719892266256252, + "flos": 509270008320.0, + "grad_norm": 0.06606219668196793, + "language_loss": 0.81429344, + "learning_rate": 0.0009108920219620815, + "loss": 0.82509506, + "num_input_tokens_seen": 93904448, + "router_z_loss_mlp": 0.35571289, + "step": 1129, + "time_per_iteration": 2.6214489936828613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083138, + "balance_loss_mlp": 1.04782772, + "epoch": 0.21739130434782608, + "flos": 543150738432.0, + "grad_norm": 0.060577581075914995, + "language_loss": 0.89903116, + "learning_rate": 0.0009107144256925133, + "loss": 0.90986252, + "num_input_tokens_seen": 93979312, + "router_z_loss_mlp": 0.35302734, + "step": 1130, + "time_per_iteration": 2.6337971687316895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079633, + "balance_loss_mlp": 1.04489541, + "epoch": 0.21758368603308964, + "flos": 616564804608.0, + "grad_norm": 0.0674499307688184, + "language_loss": 0.82610142, + "learning_rate": 0.0009105366699694638, + "loss": 0.83689773, + "num_input_tokens_seen": 94052032, + "router_z_loss_mlp": 0.34790039, + "step": 1131, + "time_per_iteration": 2.6984267234802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085371, + "balance_loss_mlp": 1.04979873, + "epoch": 0.2177760677183532, + "flos": 634813175808.0, + "grad_norm": 0.051829013054278075, + "language_loss": 0.8159321, + "learning_rate": 0.0009103587548619439, + "loss": 0.8267858, + "num_input_tokens_seen": 94124944, + "router_z_loss_mlp": 0.35571289, + "step": 1132, + "time_per_iteration": 2.8308732509613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083319, + "balance_loss_mlp": 1.04850996, + "epoch": 0.2179684494036168, + "flos": 532181587968.0, + "grad_norm": 0.06772780520844247, + "language_loss": 0.86115086, + "learning_rate": 0.0009101806804390261, + "loss": 0.87198412, + "num_input_tokens_seen": 94200384, + "router_z_loss_mlp": 0.34863281, + "step": 1133, + "time_per_iteration": 2.7745282649993896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081903, + "balance_loss_mlp": 1.04671264, + "epoch": 0.21816083108888035, + "flos": 474980433408.0, + "grad_norm": 0.05911376481567057, + "language_loss": 0.90451765, + "learning_rate": 0.0009100024467698453, + "loss": 0.91533667, + "num_input_tokens_seen": 94266992, + "router_z_loss_mlp": 0.35205078, + "step": 1134, + "time_per_iteration": 2.551278829574585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086865, + "balance_loss_mlp": 1.051054, + "epoch": 0.2183532127741439, + "flos": 577198794240.0, + "grad_norm": 0.07962415284192025, + "language_loss": 0.83050048, + "learning_rate": 0.0009098240539235981, + "loss": 0.84136909, + "num_input_tokens_seen": 94334304, + "router_z_loss_mlp": 0.3581543, + "step": 1135, + "time_per_iteration": 2.660019636154175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086508, + "balance_loss_mlp": 1.05172312, + "epoch": 0.21854559445940747, + "flos": 593832135168.0, + "grad_norm": 0.05867668726509775, + "language_loss": 0.87679315, + "learning_rate": 0.0009096455019695423, + "loss": 0.88765824, + "num_input_tokens_seen": 94413296, + "router_z_loss_mlp": 0.34838867, + "step": 1136, + "time_per_iteration": 2.756463050842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090029, + "balance_loss_mlp": 1.05426645, + "epoch": 0.21873797614467103, + "flos": 408464446464.0, + "grad_norm": 0.06290439978907646, + "language_loss": 0.90092266, + "learning_rate": 0.000909466790976998, + "loss": 0.91182297, + "num_input_tokens_seen": 94475840, + "router_z_loss_mlp": 0.35791016, + "step": 1137, + "time_per_iteration": 2.5046186447143555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089401, + "balance_loss_mlp": 1.05373359, + "epoch": 0.21893035782993459, + "flos": 893824865280.0, + "grad_norm": 0.05253297698454947, + "language_loss": 0.83030021, + "learning_rate": 0.0009092879210153473, + "loss": 0.84119421, + "num_input_tokens_seen": 94555184, + "router_z_loss_mlp": 0.35668945, + "step": 1138, + "time_per_iteration": 3.1294023990631104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092735, + "balance_loss_mlp": 1.05835557, + "epoch": 0.21912273951519814, + "flos": 467392702464.0, + "grad_norm": 0.05516730570048504, + "language_loss": 0.88930631, + "learning_rate": 0.0009091088921540333, + "loss": 0.90023363, + "num_input_tokens_seen": 94622656, + "router_z_loss_mlp": 0.34399414, + "step": 1139, + "time_per_iteration": 2.5161380767822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081896, + "balance_loss_mlp": 1.06921172, + "epoch": 0.2193151212004617, + "flos": 1531262665728.0, + "grad_norm": 0.036356034107047845, + "language_loss": 0.75508678, + "learning_rate": 0.0009089297044625615, + "loss": 0.76590574, + "num_input_tokens_seen": 94856496, + "router_z_loss_mlp": 0.12695312, + "step": 1140, + "time_per_iteration": 4.929131984710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087847, + "balance_loss_mlp": 1.05306172, + "epoch": 0.2195075028857253, + "flos": 590901820416.0, + "grad_norm": 0.07364984820319191, + "language_loss": 0.8488574, + "learning_rate": 0.0009087503580104985, + "loss": 0.85973585, + "num_input_tokens_seen": 94926880, + "router_z_loss_mlp": 0.34814453, + "step": 1141, + "time_per_iteration": 2.676321029663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087752, + "balance_loss_mlp": 1.05287158, + "epoch": 0.21969988457098885, + "flos": 636033917952.0, + "grad_norm": 0.0662048159418312, + "language_loss": 0.79610777, + "learning_rate": 0.0009085708528674728, + "loss": 0.80698538, + "num_input_tokens_seen": 95000528, + "router_z_loss_mlp": 0.34912109, + "step": 1142, + "time_per_iteration": 2.7667393684387207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088746, + "balance_loss_mlp": 1.05202913, + "epoch": 0.2198922662562524, + "flos": 911974311936.0, + "grad_norm": 0.07907290305355467, + "language_loss": 0.86086833, + "learning_rate": 0.0009083911891031745, + "loss": 0.87175578, + "num_input_tokens_seen": 95081040, + "router_z_loss_mlp": 0.36743164, + "step": 1143, + "time_per_iteration": 3.1026079654693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093334, + "balance_loss_mlp": 1.05809617, + "epoch": 0.22008464794151597, + "flos": 822603409920.0, + "grad_norm": 0.06284406217527433, + "language_loss": 0.91362917, + "learning_rate": 0.0009082113667873553, + "loss": 0.92456251, + "num_input_tokens_seen": 95167328, + "router_z_loss_mlp": 0.3527832, + "step": 1144, + "time_per_iteration": 3.098741292953491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107482, + "balance_loss_mlp": 1.07188582, + "epoch": 0.22027702962677953, + "flos": 459416475648.0, + "grad_norm": 0.06625631151069579, + "language_loss": 0.90562177, + "learning_rate": 0.0009080313859898283, + "loss": 0.91669661, + "num_input_tokens_seen": 95230304, + "router_z_loss_mlp": 0.35620117, + "step": 1145, + "time_per_iteration": 2.4998207092285156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111259, + "balance_loss_mlp": 1.07606888, + "epoch": 0.2204694113120431, + "flos": 530998723584.0, + "grad_norm": 0.05051092763003013, + "language_loss": 0.91815794, + "learning_rate": 0.0009078512467804684, + "loss": 0.92927051, + "num_input_tokens_seen": 95299520, + "router_z_loss_mlp": 0.35180664, + "step": 1146, + "time_per_iteration": 2.569073438644409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117194, + "balance_loss_mlp": 1.08162224, + "epoch": 0.22066179299730665, + "flos": 522382307328.0, + "grad_norm": 0.06837547739928014, + "language_loss": 0.90610331, + "learning_rate": 0.0009076709492292119, + "loss": 0.91727525, + "num_input_tokens_seen": 95368912, + "router_z_loss_mlp": 0.35571289, + "step": 1147, + "time_per_iteration": 2.614039659500122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114459, + "balance_loss_mlp": 1.07969809, + "epoch": 0.2208541746825702, + "flos": 546188742144.0, + "grad_norm": 0.06837160959472317, + "language_loss": 0.89193797, + "learning_rate": 0.0009074904934060562, + "loss": 0.90308249, + "num_input_tokens_seen": 95440800, + "router_z_loss_mlp": 0.34790039, + "step": 1148, + "time_per_iteration": 2.6419012546539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112121, + "balance_loss_mlp": 1.08578134, + "epoch": 0.22104655636783377, + "flos": 708404742144.0, + "grad_norm": 0.07108081727062696, + "language_loss": 0.84988266, + "learning_rate": 0.0009073098793810607, + "loss": 0.86109483, + "num_input_tokens_seen": 95519904, + "router_z_loss_mlp": 0.35473633, + "step": 1149, + "time_per_iteration": 2.909515142440796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116065, + "balance_loss_mlp": 1.08061242, + "epoch": 0.22123893805309736, + "flos": 584594468352.0, + "grad_norm": 0.07695680382665727, + "language_loss": 0.88374794, + "learning_rate": 0.000907129107224346, + "loss": 0.89490861, + "num_input_tokens_seen": 95591568, + "router_z_loss_mlp": 0.35522461, + "step": 1150, + "time_per_iteration": 2.7029008865356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099143, + "balance_loss_mlp": 1.06369042, + "epoch": 0.22143131973836092, + "flos": 492002270208.0, + "grad_norm": 0.049049579749502144, + "language_loss": 0.88305712, + "learning_rate": 0.0009069481770060939, + "loss": 0.89404863, + "num_input_tokens_seen": 95664480, + "router_z_loss_mlp": 0.35498047, + "step": 1151, + "time_per_iteration": 2.65167236328125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097606, + "balance_loss_mlp": 1.06248736, + "epoch": 0.22162370142362448, + "flos": 1079227459584.0, + "grad_norm": 0.054063738490033035, + "language_loss": 0.84240663, + "learning_rate": 0.000906767088796548, + "loss": 0.85338271, + "num_input_tokens_seen": 95754400, + "router_z_loss_mlp": 0.35180664, + "step": 1152, + "time_per_iteration": 3.423985004425049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110736, + "balance_loss_mlp": 1.07300401, + "epoch": 0.22181608310888803, + "flos": 492258345984.0, + "grad_norm": 0.057939830998464815, + "language_loss": 0.87012136, + "learning_rate": 0.0009065858426660127, + "loss": 0.88119501, + "num_input_tokens_seen": 95826944, + "router_z_loss_mlp": 0.34399414, + "step": 1153, + "time_per_iteration": 2.5987319946289062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108178, + "balance_loss_mlp": 1.07355952, + "epoch": 0.2220084647941516, + "flos": 723687892992.0, + "grad_norm": 0.0653796708212952, + "language_loss": 0.84926325, + "learning_rate": 0.0009064044386848543, + "loss": 0.86034507, + "num_input_tokens_seen": 95902688, + "router_z_loss_mlp": 0.34667969, + "step": 1154, + "time_per_iteration": 2.9024224281311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112372, + "balance_loss_mlp": 1.0753932, + "epoch": 0.22220084647941515, + "flos": 488988997632.0, + "grad_norm": 0.06606878403176955, + "language_loss": 0.88905716, + "learning_rate": 0.0009062228769234997, + "loss": 0.90018088, + "num_input_tokens_seen": 95969952, + "router_z_loss_mlp": 0.36987305, + "step": 1155, + "time_per_iteration": 2.5483920574188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104305, + "balance_loss_mlp": 1.06868565, + "epoch": 0.2223932281646787, + "flos": 536025696768.0, + "grad_norm": 0.06185680569649912, + "language_loss": 0.81360811, + "learning_rate": 0.0009060411574524376, + "loss": 0.82465118, + "num_input_tokens_seen": 96037344, + "router_z_loss_mlp": 0.35644531, + "step": 1156, + "time_per_iteration": 2.629166841506958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114341, + "balance_loss_mlp": 1.0794363, + "epoch": 0.22258560984994227, + "flos": 931034580480.0, + "grad_norm": 0.06288530021121215, + "language_loss": 0.88191485, + "learning_rate": 0.0009058592803422178, + "loss": 0.8930583, + "num_input_tokens_seen": 96115616, + "router_z_loss_mlp": 0.34936523, + "step": 1157, + "time_per_iteration": 3.133453845977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219449, + "balance_loss_mlp": 1.20495331, + "epoch": 0.22277799153520586, + "flos": 1198998443520.0, + "grad_norm": 0.06392494715081258, + "language_loss": 0.78710288, + "learning_rate": 0.0009056772456634512, + "loss": 0.79929739, + "num_input_tokens_seen": 96333600, + "router_z_loss_mlp": 0.14453125, + "step": 1158, + "time_per_iteration": 4.838433027267456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095231, + "balance_loss_mlp": 1.0620904, + "epoch": 0.22297037322046942, + "flos": 501052262400.0, + "grad_norm": 0.059439708082357066, + "language_loss": 0.90095651, + "learning_rate": 0.00090549505348681, + "loss": 0.91190875, + "num_input_tokens_seen": 96402544, + "router_z_loss_mlp": 0.33154297, + "step": 1159, + "time_per_iteration": 2.561887264251709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083603, + "balance_loss_mlp": 1.04977143, + "epoch": 0.22316275490573298, + "flos": 752413612032.0, + "grad_norm": 0.05610915875378834, + "language_loss": 0.84254742, + "learning_rate": 0.0009053127038830275, + "loss": 0.85338354, + "num_input_tokens_seen": 96487600, + "router_z_loss_mlp": 0.33862305, + "step": 1160, + "time_per_iteration": 2.9465925693511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082896, + "balance_loss_mlp": 1.04844403, + "epoch": 0.22335513659099654, + "flos": 514553057280.0, + "grad_norm": 0.06657410760601727, + "language_loss": 0.87182009, + "learning_rate": 0.000905130196922898, + "loss": 0.88264906, + "num_input_tokens_seen": 96554912, + "router_z_loss_mlp": 0.3449707, + "step": 1161, + "time_per_iteration": 2.597325325012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076013, + "balance_loss_mlp": 1.04213357, + "epoch": 0.2235475182762601, + "flos": 484286501376.0, + "grad_norm": 0.057467913173926514, + "language_loss": 0.87228084, + "learning_rate": 0.0009049475326772769, + "loss": 0.88304096, + "num_input_tokens_seen": 96624192, + "router_z_loss_mlp": 0.33911133, + "step": 1162, + "time_per_iteration": 2.591592788696289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107232, + "balance_loss_mlp": 1.0379163, + "epoch": 0.22373989996152366, + "flos": 469698794496.0, + "grad_norm": 0.05481831884816676, + "language_loss": 0.83362567, + "learning_rate": 0.0009047647112170811, + "loss": 0.84434885, + "num_input_tokens_seen": 96701040, + "router_z_loss_mlp": 0.34448242, + "step": 1163, + "time_per_iteration": 2.7466936111450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080512, + "balance_loss_mlp": 1.04503536, + "epoch": 0.22393228164678722, + "flos": 1270512594432.0, + "grad_norm": 0.0775991801606853, + "language_loss": 0.87402856, + "learning_rate": 0.0009045817326132876, + "loss": 0.88483369, + "num_input_tokens_seen": 96791200, + "router_z_loss_mlp": 0.35498047, + "step": 1164, + "time_per_iteration": 3.6615524291992188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082563, + "balance_loss_mlp": 1.04665732, + "epoch": 0.22412466333205078, + "flos": 596052449280.0, + "grad_norm": 0.05603114612800397, + "language_loss": 0.83484542, + "learning_rate": 0.0009043985969369357, + "loss": 0.84567106, + "num_input_tokens_seen": 96869360, + "router_z_loss_mlp": 0.35913086, + "step": 1165, + "time_per_iteration": 2.800389528274536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084793, + "balance_loss_mlp": 1.047647, + "epoch": 0.22431704501731436, + "flos": 608136062976.0, + "grad_norm": 0.052919924442321326, + "language_loss": 0.84423298, + "learning_rate": 0.0009042153042591245, + "loss": 0.85508084, + "num_input_tokens_seen": 96945840, + "router_z_loss_mlp": 0.37158203, + "step": 1166, + "time_per_iteration": 2.7848384380340576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080872, + "balance_loss_mlp": 1.04622972, + "epoch": 0.22450942670257792, + "flos": 906203842560.0, + "grad_norm": 0.054053491984114646, + "language_loss": 0.85318398, + "learning_rate": 0.0009040318546510146, + "loss": 0.86399269, + "num_input_tokens_seen": 97029296, + "router_z_loss_mlp": 0.34667969, + "step": 1167, + "time_per_iteration": 3.1406538486480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080871, + "balance_loss_mlp": 1.04529881, + "epoch": 0.22470180838784148, + "flos": 565032222720.0, + "grad_norm": 0.06590224184570584, + "language_loss": 0.85490131, + "learning_rate": 0.0009038482481838275, + "loss": 0.86571002, + "num_input_tokens_seen": 97097776, + "router_z_loss_mlp": 0.35620117, + "step": 1168, + "time_per_iteration": 2.6623363494873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107832, + "balance_loss_mlp": 1.04265296, + "epoch": 0.22489419007310504, + "flos": 834109443072.0, + "grad_norm": 0.05295244004415107, + "language_loss": 0.87364161, + "learning_rate": 0.0009036644849288455, + "loss": 0.88442481, + "num_input_tokens_seen": 97181424, + "router_z_loss_mlp": 0.35668945, + "step": 1169, + "time_per_iteration": 3.096397638320923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082872, + "balance_loss_mlp": 1.04567838, + "epoch": 0.2250865717583686, + "flos": 580788237312.0, + "grad_norm": 0.06189616009675637, + "language_loss": 0.85257494, + "learning_rate": 0.0009034805649574118, + "loss": 0.86340362, + "num_input_tokens_seen": 97252128, + "router_z_loss_mlp": 0.37207031, + "step": 1170, + "time_per_iteration": 2.655629873275757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081931, + "balance_loss_mlp": 1.04669285, + "epoch": 0.22527895344363216, + "flos": 600091435008.0, + "grad_norm": 0.0574349936504533, + "language_loss": 0.85081124, + "learning_rate": 0.0009032964883409308, + "loss": 0.86163056, + "num_input_tokens_seen": 97326640, + "router_z_loss_mlp": 0.35253906, + "step": 1171, + "time_per_iteration": 2.9228479862213135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096453, + "balance_loss_mlp": 1.08109915, + "epoch": 0.22547133512889572, + "flos": 1440009073152.0, + "grad_norm": 0.03435009764288223, + "language_loss": 0.73050535, + "learning_rate": 0.000903112255150867, + "loss": 0.74146986, + "num_input_tokens_seen": 97553952, + "router_z_loss_mlp": 0.15332031, + "step": 1172, + "time_per_iteration": 4.9870195388793945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099751, + "balance_loss_mlp": 1.06365418, + "epoch": 0.22566371681415928, + "flos": 490377065472.0, + "grad_norm": 0.06750207251725504, + "language_loss": 0.87597418, + "learning_rate": 0.0009029278654587462, + "loss": 0.88697171, + "num_input_tokens_seen": 97623584, + "router_z_loss_mlp": 0.36108398, + "step": 1173, + "time_per_iteration": 2.545078754425049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098309, + "balance_loss_mlp": 1.06156898, + "epoch": 0.22585609849942284, + "flos": 604334214144.0, + "grad_norm": 0.06244795934891309, + "language_loss": 0.82517409, + "learning_rate": 0.0009027433193361548, + "loss": 0.8361572, + "num_input_tokens_seen": 97695952, + "router_z_loss_mlp": 0.36767578, + "step": 1174, + "time_per_iteration": 2.69753098487854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100346, + "balance_loss_mlp": 1.06305695, + "epoch": 0.22604848018468643, + "flos": 635280247296.0, + "grad_norm": 0.06854123529633785, + "language_loss": 0.87138826, + "learning_rate": 0.00090255861685474, + "loss": 0.88239175, + "num_input_tokens_seen": 97764544, + "router_z_loss_mlp": 0.37280273, + "step": 1175, + "time_per_iteration": 2.7199149131774902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094451, + "balance_loss_mlp": 1.05744886, + "epoch": 0.22624086186995, + "flos": 479633467392.0, + "grad_norm": 0.06836538183258173, + "language_loss": 0.91474092, + "learning_rate": 0.0009023737580862095, + "loss": 0.92568541, + "num_input_tokens_seen": 97830976, + "router_z_loss_mlp": 0.36962891, + "step": 1176, + "time_per_iteration": 2.51255464553833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092025, + "balance_loss_mlp": 1.0570724, + "epoch": 0.22643324355521355, + "flos": 495566982144.0, + "grad_norm": 0.05906016973995859, + "language_loss": 0.83066601, + "learning_rate": 0.0009021887431023321, + "loss": 0.84158623, + "num_input_tokens_seen": 97898800, + "router_z_loss_mlp": 0.34985352, + "step": 1177, + "time_per_iteration": 2.5783960819244385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084144, + "balance_loss_mlp": 1.04842877, + "epoch": 0.2266256252404771, + "flos": 561271071744.0, + "grad_norm": 0.05542928649781209, + "language_loss": 0.87720597, + "learning_rate": 0.0009020035719749369, + "loss": 0.8880474, + "num_input_tokens_seen": 97974112, + "router_z_loss_mlp": 0.35742188, + "step": 1178, + "time_per_iteration": 2.7076900005340576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088743, + "balance_loss_mlp": 1.05259871, + "epoch": 0.22681800692574067, + "flos": 579353527296.0, + "grad_norm": 0.05892405405909356, + "language_loss": 0.77506709, + "learning_rate": 0.0009018182447759136, + "loss": 0.78595448, + "num_input_tokens_seen": 98056640, + "router_z_loss_mlp": 0.36157227, + "step": 1179, + "time_per_iteration": 2.974362373352051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083275, + "balance_loss_mlp": 1.04798961, + "epoch": 0.22701038861100423, + "flos": 739842577920.0, + "grad_norm": 0.0555118465290956, + "language_loss": 0.80168724, + "learning_rate": 0.0009016327615772126, + "loss": 0.81252003, + "num_input_tokens_seen": 98135952, + "router_z_loss_mlp": 0.35327148, + "step": 1180, + "time_per_iteration": 2.9207658767700195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082522, + "balance_loss_mlp": 1.04776096, + "epoch": 0.2272027702962678, + "flos": 576996562944.0, + "grad_norm": 0.06857059729731818, + "language_loss": 0.88146389, + "learning_rate": 0.0009014471224508451, + "loss": 0.8922891, + "num_input_tokens_seen": 98204288, + "router_z_loss_mlp": 0.34790039, + "step": 1181, + "time_per_iteration": 2.6884429454803467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080826, + "balance_loss_mlp": 1.0466603, + "epoch": 0.22739515198153135, + "flos": 544012098048.0, + "grad_norm": 0.07386093909180869, + "language_loss": 0.83020878, + "learning_rate": 0.0009012613274688823, + "loss": 0.84101701, + "num_input_tokens_seen": 98269856, + "router_z_loss_mlp": 0.34204102, + "step": 1182, + "time_per_iteration": 2.625973701477051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082925, + "balance_loss_mlp": 1.04735291, + "epoch": 0.22758753366679493, + "flos": 439932805632.0, + "grad_norm": 0.06621157637783351, + "language_loss": 0.87839937, + "learning_rate": 0.0009010753767034565, + "loss": 0.88922858, + "num_input_tokens_seen": 98335632, + "router_z_loss_mlp": 0.35571289, + "step": 1183, + "time_per_iteration": 2.545569658279419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086712, + "balance_loss_mlp": 1.05030489, + "epoch": 0.2277799153520585, + "flos": 729104772096.0, + "grad_norm": 0.07242159959797279, + "language_loss": 0.79501748, + "learning_rate": 0.0009008892702267599, + "loss": 0.80588454, + "num_input_tokens_seen": 98420592, + "router_z_loss_mlp": 0.36425781, + "step": 1184, + "time_per_iteration": 2.9862120151519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089174, + "balance_loss_mlp": 1.05322075, + "epoch": 0.22797229703732205, + "flos": 526641053184.0, + "grad_norm": 0.0740207336504876, + "language_loss": 0.89059424, + "learning_rate": 0.0009007030081110457, + "loss": 0.90148592, + "num_input_tokens_seen": 98488096, + "router_z_loss_mlp": 0.35961914, + "step": 1185, + "time_per_iteration": 2.6184284687042236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083236, + "balance_loss_mlp": 1.04783034, + "epoch": 0.2281646787225856, + "flos": 535159954944.0, + "grad_norm": 0.06479663876551665, + "language_loss": 0.84969211, + "learning_rate": 0.000900516590428627, + "loss": 0.86052454, + "num_input_tokens_seen": 98561664, + "router_z_loss_mlp": 0.35449219, + "step": 1186, + "time_per_iteration": 2.724161386489868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083897, + "balance_loss_mlp": 1.049088, + "epoch": 0.22835706040784917, + "flos": 541107924480.0, + "grad_norm": 0.052728830082858405, + "language_loss": 0.89177948, + "learning_rate": 0.0009003300172518778, + "loss": 0.90261841, + "num_input_tokens_seen": 98634336, + "router_z_loss_mlp": 0.34790039, + "step": 1187, + "time_per_iteration": 2.6810121536254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108576, + "balance_loss_mlp": 1.05018783, + "epoch": 0.22854944209311273, + "flos": 790297012224.0, + "grad_norm": 0.05376177869775473, + "language_loss": 0.84676045, + "learning_rate": 0.0009001432886532321, + "loss": 0.85761803, + "num_input_tokens_seen": 98709600, + "router_z_loss_mlp": 0.35571289, + "step": 1188, + "time_per_iteration": 2.977433919906616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109735, + "balance_loss_mlp": 1.0614686, + "epoch": 0.2287418237783763, + "flos": 469047020544.0, + "grad_norm": 0.06589135500726684, + "language_loss": 0.86752445, + "learning_rate": 0.0008999564047051843, + "loss": 0.87849802, + "num_input_tokens_seen": 98775024, + "router_z_loss_mlp": 0.35913086, + "step": 1189, + "time_per_iteration": 2.5126237869262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094774, + "balance_loss_mlp": 1.06017935, + "epoch": 0.22893420546363985, + "flos": 467786990592.0, + "grad_norm": 0.061551577223012334, + "language_loss": 0.84713042, + "learning_rate": 0.0008997693654802894, + "loss": 0.85807812, + "num_input_tokens_seen": 98845248, + "router_z_loss_mlp": 0.34643555, + "step": 1190, + "time_per_iteration": 2.6570322513580322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088276, + "balance_loss_mlp": 1.05318046, + "epoch": 0.22912658714890344, + "flos": 625974179328.0, + "grad_norm": 0.05326512300588333, + "language_loss": 0.86549705, + "learning_rate": 0.0008995821710511625, + "loss": 0.87637979, + "num_input_tokens_seen": 98913584, + "router_z_loss_mlp": 0.35107422, + "step": 1191, + "time_per_iteration": 2.8115806579589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108036, + "balance_loss_mlp": 1.04731488, + "epoch": 0.229318968834167, + "flos": 502785156096.0, + "grad_norm": 0.06330680163661413, + "language_loss": 0.8511278, + "learning_rate": 0.0008993948214904786, + "loss": 0.86193144, + "num_input_tokens_seen": 98978608, + "router_z_loss_mlp": 0.33056641, + "step": 1192, + "time_per_iteration": 2.546410083770752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125589, + "balance_loss_mlp": 1.11023474, + "epoch": 0.22951135051943056, + "flos": 1374108544512.0, + "grad_norm": 0.06153086464986019, + "language_loss": 0.78422213, + "learning_rate": 0.0008992073168709733, + "loss": 0.79547799, + "num_input_tokens_seen": 99207424, + "router_z_loss_mlp": 0.15332031, + "step": 1193, + "time_per_iteration": 4.891269207000732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099997, + "balance_loss_mlp": 1.06306624, + "epoch": 0.22970373220469412, + "flos": 644045050368.0, + "grad_norm": 0.06658536787009234, + "language_loss": 0.79028845, + "learning_rate": 0.0008990196572654427, + "loss": 0.80128849, + "num_input_tokens_seen": 99290592, + "router_z_loss_mlp": 0.36914062, + "step": 1194, + "time_per_iteration": 2.8504366874694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100467, + "balance_loss_mlp": 1.06582475, + "epoch": 0.22989611388995768, + "flos": 499945001472.0, + "grad_norm": 0.048025217626556156, + "language_loss": 0.87748766, + "learning_rate": 0.0008988318427467426, + "loss": 0.88849235, + "num_input_tokens_seen": 99366096, + "router_z_loss_mlp": 0.34667969, + "step": 1195, + "time_per_iteration": 2.735084056854248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099859, + "balance_loss_mlp": 1.06524014, + "epoch": 0.23008849557522124, + "flos": 1096071796224.0, + "grad_norm": 0.06731751876810108, + "language_loss": 0.86263168, + "learning_rate": 0.0008986438733877887, + "loss": 0.87363023, + "num_input_tokens_seen": 99456768, + "router_z_loss_mlp": 0.34667969, + "step": 1196, + "time_per_iteration": 3.435035228729248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100924, + "balance_loss_mlp": 1.06606746, + "epoch": 0.2302808772604848, + "flos": 683313546240.0, + "grad_norm": 0.04733445604251135, + "language_loss": 0.84099567, + "learning_rate": 0.0008984557492615576, + "loss": 0.85200489, + "num_input_tokens_seen": 99539616, + "router_z_loss_mlp": 0.34887695, + "step": 1197, + "time_per_iteration": 2.927668809890747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107534, + "balance_loss_mlp": 1.07317793, + "epoch": 0.23047325894574835, + "flos": 528664928256.0, + "grad_norm": 0.0630370564804667, + "language_loss": 0.89949608, + "learning_rate": 0.0008982674704410854, + "loss": 0.91057146, + "num_input_tokens_seen": 99612064, + "router_z_loss_mlp": 0.34399414, + "step": 1198, + "time_per_iteration": 2.691016435623169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107479, + "balance_loss_mlp": 1.07228875, + "epoch": 0.23066564063101191, + "flos": 682427455488.0, + "grad_norm": 0.06209648084563375, + "language_loss": 0.77829844, + "learning_rate": 0.0008980790369994682, + "loss": 0.78937328, + "num_input_tokens_seen": 99691040, + "router_z_loss_mlp": 0.35205078, + "step": 1199, + "time_per_iteration": 2.9320883750915527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100492, + "balance_loss_mlp": 1.06525421, + "epoch": 0.2308580223162755, + "flos": 558247624704.0, + "grad_norm": 0.09180159748966574, + "language_loss": 0.87396461, + "learning_rate": 0.000897890449009863, + "loss": 0.88496947, + "num_input_tokens_seen": 99762016, + "router_z_loss_mlp": 0.3527832, + "step": 1200, + "time_per_iteration": 2.6804869174957275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096306, + "balance_loss_mlp": 1.06183052, + "epoch": 0.23105040400153906, + "flos": 555406060032.0, + "grad_norm": 0.05982856494430897, + "language_loss": 0.90313268, + "learning_rate": 0.0008977017065454853, + "loss": 0.9140957, + "num_input_tokens_seen": 99835552, + "router_z_loss_mlp": 0.3449707, + "step": 1201, + "time_per_iteration": 2.639636754989624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090282, + "balance_loss_mlp": 1.05556786, + "epoch": 0.23124278568680262, + "flos": 704474855424.0, + "grad_norm": 0.06077351963181601, + "language_loss": 0.80804175, + "learning_rate": 0.0008975128096796121, + "loss": 0.81894457, + "num_input_tokens_seen": 99910784, + "router_z_loss_mlp": 0.34765625, + "step": 1202, + "time_per_iteration": 2.8410260677337646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078695, + "balance_loss_mlp": 1.04431474, + "epoch": 0.23143516737206618, + "flos": 612469002240.0, + "grad_norm": 0.07413481943536562, + "language_loss": 0.85940087, + "learning_rate": 0.0008973237584855794, + "loss": 0.87018776, + "num_input_tokens_seen": 99991120, + "router_z_loss_mlp": 0.34423828, + "step": 1203, + "time_per_iteration": 2.898423671722412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077022, + "balance_loss_mlp": 1.04233205, + "epoch": 0.23162754905732974, + "flos": 389030238720.0, + "grad_norm": 0.06038618944932519, + "language_loss": 0.8201915, + "learning_rate": 0.0008971345530367832, + "loss": 0.83096182, + "num_input_tokens_seen": 100053888, + "router_z_loss_mlp": 0.34716797, + "step": 1204, + "time_per_iteration": 2.486668586730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082339, + "balance_loss_mlp": 1.04738712, + "epoch": 0.2318199307425933, + "flos": 667481928192.0, + "grad_norm": 0.05427081728260985, + "language_loss": 0.85029405, + "learning_rate": 0.0008969451934066799, + "loss": 0.86111748, + "num_input_tokens_seen": 100124176, + "router_z_loss_mlp": 0.34960938, + "step": 1205, + "time_per_iteration": 2.771306276321411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091653, + "balance_loss_mlp": 1.05662966, + "epoch": 0.23201231242785686, + "flos": 666093860352.0, + "grad_norm": 0.0707913589572404, + "language_loss": 0.80143172, + "learning_rate": 0.0008967556796687854, + "loss": 0.81234825, + "num_input_tokens_seen": 100205296, + "router_z_loss_mlp": 0.35058594, + "step": 1206, + "time_per_iteration": 2.8904309272766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087698, + "balance_loss_mlp": 1.05353224, + "epoch": 0.23220469411312042, + "flos": 748498281984.0, + "grad_norm": 0.05559113870944949, + "language_loss": 0.83954245, + "learning_rate": 0.0008965660118966752, + "loss": 0.8504194, + "num_input_tokens_seen": 100279440, + "router_z_loss_mlp": 0.34204102, + "step": 1207, + "time_per_iteration": 2.9140615463256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108846, + "balance_loss_mlp": 1.05529559, + "epoch": 0.232397075798384, + "flos": 666763163136.0, + "grad_norm": 0.04975334384076733, + "language_loss": 0.90441763, + "learning_rate": 0.0008963761901639851, + "loss": 0.91530222, + "num_input_tokens_seen": 100354512, + "router_z_loss_mlp": 0.33154297, + "step": 1208, + "time_per_iteration": 2.8032286167144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094486, + "balance_loss_mlp": 1.06008244, + "epoch": 0.23258945748364757, + "flos": 609937357824.0, + "grad_norm": 0.05840728669351643, + "language_loss": 0.83201033, + "learning_rate": 0.0008961862145444103, + "loss": 0.84295517, + "num_input_tokens_seen": 100426848, + "router_z_loss_mlp": 0.34399414, + "step": 1209, + "time_per_iteration": 2.7161943912506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094305, + "balance_loss_mlp": 1.05954397, + "epoch": 0.23278183916891113, + "flos": 489397842432.0, + "grad_norm": 0.06743904317466738, + "language_loss": 0.85216832, + "learning_rate": 0.0008959960851117059, + "loss": 0.86311138, + "num_input_tokens_seen": 100496176, + "router_z_loss_mlp": 0.34790039, + "step": 1210, + "time_per_iteration": 2.5817031860351562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095047, + "balance_loss_mlp": 1.06014228, + "epoch": 0.23297422085417469, + "flos": 511314232320.0, + "grad_norm": 0.057575168534165826, + "language_loss": 0.84338427, + "learning_rate": 0.0008958058019396868, + "loss": 0.85433477, + "num_input_tokens_seen": 100575072, + "router_z_loss_mlp": 0.34936523, + "step": 1211, + "time_per_iteration": 2.7788164615631104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091213, + "balance_loss_mlp": 1.05730987, + "epoch": 0.23316660253943824, + "flos": 546145072128.0, + "grad_norm": 0.057082370400879795, + "language_loss": 0.86897939, + "learning_rate": 0.0008956153651022274, + "loss": 0.87989151, + "num_input_tokens_seen": 100648304, + "router_z_loss_mlp": 0.33935547, + "step": 1212, + "time_per_iteration": 2.7309062480926514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090204, + "balance_loss_mlp": 1.05608642, + "epoch": 0.2333589842247018, + "flos": 509998947840.0, + "grad_norm": 0.06317396982696966, + "language_loss": 0.84641176, + "learning_rate": 0.0008954247746732618, + "loss": 0.85731381, + "num_input_tokens_seen": 100717616, + "router_z_loss_mlp": 0.34155273, + "step": 1213, + "time_per_iteration": 2.619058609008789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084199, + "balance_loss_mlp": 1.05072522, + "epoch": 0.23355136590996536, + "flos": 662834686464.0, + "grad_norm": 0.09780220222501788, + "language_loss": 0.90954423, + "learning_rate": 0.0008952340307267837, + "loss": 0.9203862, + "num_input_tokens_seen": 100797056, + "router_z_loss_mlp": 0.33496094, + "step": 1214, + "time_per_iteration": 2.869351387023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108593, + "balance_loss_mlp": 1.05128753, + "epoch": 0.23374374759522892, + "flos": 508206417408.0, + "grad_norm": 0.061496426320555984, + "language_loss": 0.83373952, + "learning_rate": 0.0008950431333368468, + "loss": 0.84459883, + "num_input_tokens_seen": 100863632, + "router_z_loss_mlp": 0.34667969, + "step": 1215, + "time_per_iteration": 2.5557806491851807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093645, + "balance_loss_mlp": 1.05928898, + "epoch": 0.2339361292804925, + "flos": 1293964028928.0, + "grad_norm": 0.062331860667319446, + "language_loss": 0.84730738, + "learning_rate": 0.0008948520825775634, + "loss": 0.85824382, + "num_input_tokens_seen": 100950272, + "router_z_loss_mlp": 0.34399414, + "step": 1216, + "time_per_iteration": 3.6050164699554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098293, + "balance_loss_mlp": 1.06343639, + "epoch": 0.23412851096575607, + "flos": 705617021952.0, + "grad_norm": 0.06500023378725601, + "language_loss": 0.84162283, + "learning_rate": 0.0008946608785231067, + "loss": 0.8526057, + "num_input_tokens_seen": 101031008, + "router_z_loss_mlp": 0.34863281, + "step": 1217, + "time_per_iteration": 2.858696699142456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109995, + "balance_loss_mlp": 1.065045, + "epoch": 0.23432089265101963, + "flos": 438036968448.0, + "grad_norm": 0.06356573317347913, + "language_loss": 0.84325957, + "learning_rate": 0.0008944695212477084, + "loss": 0.85425907, + "num_input_tokens_seen": 101094688, + "router_z_loss_mlp": 0.34912109, + "step": 1218, + "time_per_iteration": 2.4787168502807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107192, + "balance_loss_mlp": 1.07190585, + "epoch": 0.2345132743362832, + "flos": 480697058304.0, + "grad_norm": 0.05460931090439532, + "language_loss": 0.86098325, + "learning_rate": 0.0008942780108256599, + "loss": 0.87205517, + "num_input_tokens_seen": 101163744, + "router_z_loss_mlp": 0.35327148, + "step": 1219, + "time_per_iteration": 2.574692726135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105521, + "balance_loss_mlp": 1.06966305, + "epoch": 0.23470565602154675, + "flos": 411231817728.0, + "grad_norm": 0.05853057396081394, + "language_loss": 0.86360055, + "learning_rate": 0.0008940863473313121, + "loss": 0.87465572, + "num_input_tokens_seen": 101226480, + "router_z_loss_mlp": 0.35839844, + "step": 1220, + "time_per_iteration": 2.4849462509155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115361, + "balance_loss_mlp": 1.08024168, + "epoch": 0.2348980377068103, + "flos": 545189170176.0, + "grad_norm": 0.0745659618807548, + "language_loss": 0.87691534, + "learning_rate": 0.0008938945308390756, + "loss": 0.88806891, + "num_input_tokens_seen": 101291824, + "router_z_loss_mlp": 0.3515625, + "step": 1221, + "time_per_iteration": 2.6100285053253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107677, + "balance_loss_mlp": 1.07248664, + "epoch": 0.23509041939207387, + "flos": 575465900544.0, + "grad_norm": 0.055913245264753976, + "language_loss": 0.87316763, + "learning_rate": 0.00089370256142342, + "loss": 0.88424438, + "num_input_tokens_seen": 101367216, + "router_z_loss_mlp": 0.35205078, + "step": 1222, + "time_per_iteration": 2.726897716522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109541, + "balance_loss_mlp": 1.06007659, + "epoch": 0.23528280107733743, + "flos": 588568025088.0, + "grad_norm": 0.04976165943815558, + "language_loss": 0.85095507, + "learning_rate": 0.0008935104391588746, + "loss": 0.86190915, + "num_input_tokens_seen": 101438992, + "router_z_loss_mlp": 0.35351562, + "step": 1223, + "time_per_iteration": 2.7249879837036133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108966, + "balance_loss_mlp": 1.07313156, + "epoch": 0.235475182762601, + "flos": 822948235776.0, + "grad_norm": 0.05651852634602403, + "language_loss": 0.82749176, + "learning_rate": 0.0008933181641200276, + "loss": 0.83858138, + "num_input_tokens_seen": 101534464, + "router_z_loss_mlp": 0.35839844, + "step": 1224, + "time_per_iteration": 3.1651737689971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094197, + "balance_loss_mlp": 1.06017447, + "epoch": 0.23566756444786457, + "flos": 679865287680.0, + "grad_norm": 0.06356150049585653, + "language_loss": 0.8609674, + "learning_rate": 0.0008931257363815271, + "loss": 0.87190938, + "num_input_tokens_seen": 101616496, + "router_z_loss_mlp": 0.34033203, + "step": 1225, + "time_per_iteration": 2.891789674758911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091395, + "balance_loss_mlp": 1.05711007, + "epoch": 0.23585994613312813, + "flos": 701481931776.0, + "grad_norm": 0.04853721262867189, + "language_loss": 0.89892405, + "learning_rate": 0.0008929331560180798, + "loss": 0.90983796, + "num_input_tokens_seen": 101694496, + "router_z_loss_mlp": 0.34277344, + "step": 1226, + "time_per_iteration": 2.934101104736328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093973, + "balance_loss_mlp": 1.06002271, + "epoch": 0.2360523278183917, + "flos": 523923144192.0, + "grad_norm": 0.06491881814379113, + "language_loss": 0.91129786, + "learning_rate": 0.0008927404231044525, + "loss": 0.92223763, + "num_input_tokens_seen": 101766160, + "router_z_loss_mlp": 0.33984375, + "step": 1227, + "time_per_iteration": 2.682377815246582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083665, + "balance_loss_mlp": 1.0493325, + "epoch": 0.23624470950365525, + "flos": 524027860992.0, + "grad_norm": 0.053423388326064705, + "language_loss": 0.81944436, + "learning_rate": 0.0008925475377154703, + "loss": 0.83028102, + "num_input_tokens_seen": 101844160, + "router_z_loss_mlp": 0.34375, + "step": 1228, + "time_per_iteration": 2.7511117458343506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077291, + "balance_loss_mlp": 1.04169512, + "epoch": 0.2364370911889188, + "flos": 596525313024.0, + "grad_norm": 0.05836717970983508, + "language_loss": 0.8241868, + "learning_rate": 0.0008923544999260183, + "loss": 0.83495975, + "num_input_tokens_seen": 101917968, + "router_z_loss_mlp": 0.35644531, + "step": 1229, + "time_per_iteration": 2.7915079593658447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087923, + "balance_loss_mlp": 1.05194569, + "epoch": 0.23662947287418237, + "flos": 756519588864.0, + "grad_norm": 0.08156392485297027, + "language_loss": 0.91757852, + "learning_rate": 0.00089216130981104, + "loss": 0.92845774, + "num_input_tokens_seen": 101996880, + "router_z_loss_mlp": 0.35986328, + "step": 1230, + "time_per_iteration": 3.037900924682617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089884, + "balance_loss_mlp": 1.0531199, + "epoch": 0.23682185455944593, + "flos": 545907935232.0, + "grad_norm": 0.05473268619072285, + "language_loss": 0.82659578, + "learning_rate": 0.000891967967445539, + "loss": 0.83749461, + "num_input_tokens_seen": 102067936, + "router_z_loss_mlp": 0.36743164, + "step": 1231, + "time_per_iteration": 2.6595497131347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088497, + "balance_loss_mlp": 1.05201924, + "epoch": 0.2370142362447095, + "flos": 661977709056.0, + "grad_norm": 0.04604146434030928, + "language_loss": 0.88502473, + "learning_rate": 0.0008917744729045772, + "loss": 0.89590967, + "num_input_tokens_seen": 102147552, + "router_z_loss_mlp": 0.36499023, + "step": 1232, + "time_per_iteration": 2.851651668548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095833, + "balance_loss_mlp": 1.05868709, + "epoch": 0.23720661792997308, + "flos": 683361598464.0, + "grad_norm": 0.06835104069372223, + "language_loss": 0.84165156, + "learning_rate": 0.0008915808262632757, + "loss": 0.85260987, + "num_input_tokens_seen": 102224480, + "router_z_loss_mlp": 0.37133789, + "step": 1233, + "time_per_iteration": 2.8114235401153564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095976, + "balance_loss_mlp": 1.05892599, + "epoch": 0.23739899961523664, + "flos": 558631738368.0, + "grad_norm": 0.055258261409357204, + "language_loss": 0.92769438, + "learning_rate": 0.0008913870275968148, + "loss": 0.93865418, + "num_input_tokens_seen": 102297392, + "router_z_loss_mlp": 0.37036133, + "step": 1234, + "time_per_iteration": 2.705349922180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092916, + "balance_loss_mlp": 1.05629516, + "epoch": 0.2375913813005002, + "flos": 889144128000.0, + "grad_norm": 0.12876854850300654, + "language_loss": 0.87540263, + "learning_rate": 0.0008911930769804342, + "loss": 0.8863318, + "num_input_tokens_seen": 102386032, + "router_z_loss_mlp": 0.36621094, + "step": 1235, + "time_per_iteration": 3.2342941761016846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091192, + "balance_loss_mlp": 1.05492854, + "epoch": 0.23778376298576376, + "flos": 640810607616.0, + "grad_norm": 0.044375832072417805, + "language_loss": 0.91481459, + "learning_rate": 0.0008909989744894318, + "loss": 0.92572653, + "num_input_tokens_seen": 102463504, + "router_z_loss_mlp": 0.36303711, + "step": 1236, + "time_per_iteration": 2.8858232498168945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089597, + "balance_loss_mlp": 1.05333364, + "epoch": 0.23797614467102732, + "flos": 616540073472.0, + "grad_norm": 0.05892762197337364, + "language_loss": 0.81707233, + "learning_rate": 0.0008908047201991649, + "loss": 0.82796836, + "num_input_tokens_seen": 102529632, + "router_z_loss_mlp": 0.36279297, + "step": 1237, + "time_per_iteration": 2.785226583480835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085624, + "balance_loss_mlp": 1.05071974, + "epoch": 0.23816852635629088, + "flos": 623941539840.0, + "grad_norm": 0.051487502947417364, + "language_loss": 0.86561942, + "learning_rate": 0.0008906103141850502, + "loss": 0.87647569, + "num_input_tokens_seen": 102610192, + "router_z_loss_mlp": 0.34960938, + "step": 1238, + "time_per_iteration": 2.868241310119629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095513, + "balance_loss_mlp": 1.05901158, + "epoch": 0.23836090804155444, + "flos": 521180504064.0, + "grad_norm": 0.07170300234131513, + "language_loss": 0.88119614, + "learning_rate": 0.0008904157565225621, + "loss": 0.89215136, + "num_input_tokens_seen": 102681216, + "router_z_loss_mlp": 0.36499023, + "step": 1239, + "time_per_iteration": 2.610048294067383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092952, + "balance_loss_mlp": 1.05716562, + "epoch": 0.238553289726818, + "flos": 1153527616512.0, + "grad_norm": 0.07764557472008667, + "language_loss": 0.82042629, + "learning_rate": 0.000890221047287235, + "loss": 0.83135581, + "num_input_tokens_seen": 102777184, + "router_z_loss_mlp": 0.3581543, + "step": 1240, + "time_per_iteration": 3.547147274017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102634, + "balance_loss_mlp": 1.06772995, + "epoch": 0.23874567141208156, + "flos": 499600175616.0, + "grad_norm": 0.07123563443936186, + "language_loss": 0.91052604, + "learning_rate": 0.0008900261865546615, + "loss": 0.92155242, + "num_input_tokens_seen": 102845744, + "router_z_loss_mlp": 0.34936523, + "step": 1241, + "time_per_iteration": 2.6277406215667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110411, + "balance_loss_mlp": 1.06768012, + "epoch": 0.23893805309734514, + "flos": 556657325568.0, + "grad_norm": 0.08027126565183675, + "language_loss": 0.84991688, + "learning_rate": 0.0008898311744004936, + "loss": 0.86095798, + "num_input_tokens_seen": 102918064, + "router_z_loss_mlp": 0.36425781, + "step": 1242, + "time_per_iteration": 2.687009811401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112655, + "balance_loss_mlp": 1.07708287, + "epoch": 0.2391304347826087, + "flos": 549009957888.0, + "grad_norm": 0.05686617926086787, + "language_loss": 0.86918116, + "learning_rate": 0.0008896360109004414, + "loss": 0.88030773, + "num_input_tokens_seen": 102983920, + "router_z_loss_mlp": 0.35595703, + "step": 1243, + "time_per_iteration": 2.6292564868927 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111871, + "balance_loss_mlp": 1.07629931, + "epoch": 0.23932281646787226, + "flos": 515794148352.0, + "grad_norm": 0.05075175877282041, + "language_loss": 0.84481502, + "learning_rate": 0.0008894406961302742, + "loss": 0.85593379, + "num_input_tokens_seen": 103053328, + "router_z_loss_mlp": 0.35595703, + "step": 1244, + "time_per_iteration": 2.5960640907287598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122594, + "balance_loss_mlp": 1.08737969, + "epoch": 0.23951519815313582, + "flos": 743353445376.0, + "grad_norm": 0.06488001286924965, + "language_loss": 0.84053004, + "learning_rate": 0.0008892452301658201, + "loss": 0.85175598, + "num_input_tokens_seen": 103128208, + "router_z_loss_mlp": 0.35253906, + "step": 1245, + "time_per_iteration": 2.9320998191833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123528, + "balance_loss_mlp": 1.08728814, + "epoch": 0.23970757983839938, + "flos": 553855048704.0, + "grad_norm": 0.05553543969160018, + "language_loss": 0.83631629, + "learning_rate": 0.0008890496130829653, + "loss": 0.84755158, + "num_input_tokens_seen": 103197392, + "router_z_loss_mlp": 0.36230469, + "step": 1246, + "time_per_iteration": 2.6420071125030518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123934, + "balance_loss_mlp": 1.08802795, + "epoch": 0.23989996152366294, + "flos": 480416251392.0, + "grad_norm": 0.0595921721906752, + "language_loss": 0.85551775, + "learning_rate": 0.0008888538449576555, + "loss": 0.86675715, + "num_input_tokens_seen": 103265328, + "router_z_loss_mlp": 0.359375, + "step": 1247, + "time_per_iteration": 2.544706344604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123543, + "balance_loss_mlp": 1.08687472, + "epoch": 0.2400923432089265, + "flos": 485069285376.0, + "grad_norm": 0.06973867138143126, + "language_loss": 0.82958472, + "learning_rate": 0.0008886579258658944, + "loss": 0.84082007, + "num_input_tokens_seen": 103331632, + "router_z_loss_mlp": 0.36669922, + "step": 1248, + "time_per_iteration": 2.5460424423217773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108744, + "balance_loss_mlp": 1.0724808, + "epoch": 0.24028472489419006, + "flos": 623247505920.0, + "grad_norm": 0.04817062293972818, + "language_loss": 0.85353303, + "learning_rate": 0.0008884618558837446, + "loss": 0.86462045, + "num_input_tokens_seen": 103405408, + "router_z_loss_mlp": 0.36279297, + "step": 1249, + "time_per_iteration": 2.80222487449646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125107, + "balance_loss_mlp": 1.08765173, + "epoch": 0.24047710657945365, + "flos": 601302002688.0, + "grad_norm": 0.052194699096834656, + "language_loss": 0.86387813, + "learning_rate": 0.0008882656350873273, + "loss": 0.87512922, + "num_input_tokens_seen": 103487216, + "router_z_loss_mlp": 0.37426758, + "step": 1250, + "time_per_iteration": 2.830887794494629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136265, + "balance_loss_mlp": 1.09911942, + "epoch": 0.2406694882647172, + "flos": 841199579136.0, + "grad_norm": 0.07156482775024936, + "language_loss": 0.86951184, + "learning_rate": 0.0008880692635528219, + "loss": 0.88087451, + "num_input_tokens_seen": 103568640, + "router_z_loss_mlp": 0.37109375, + "step": 1251, + "time_per_iteration": 3.0439021587371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140454, + "balance_loss_mlp": 1.10450029, + "epoch": 0.24086186994998077, + "flos": 526789440000.0, + "grad_norm": 0.062254670736574515, + "language_loss": 0.89187038, + "learning_rate": 0.0008878727413564669, + "loss": 0.90327489, + "num_input_tokens_seen": 103640784, + "router_z_loss_mlp": 0.36010742, + "step": 1252, + "time_per_iteration": 2.7363240718841553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094248, + "balance_loss_mlp": 1.07717752, + "epoch": 0.24105425163524433, + "flos": 1337464673280.0, + "grad_norm": 0.032126183170312766, + "language_loss": 0.80135596, + "learning_rate": 0.0008876760685745588, + "loss": 0.81229842, + "num_input_tokens_seen": 103865824, + "router_z_loss_mlp": 0.17089844, + "step": 1253, + "time_per_iteration": 4.8370680809021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175937, + "balance_loss_mlp": 1.13755202, + "epoch": 0.24124663332050789, + "flos": 613822164480.0, + "grad_norm": 0.06436318886622608, + "language_loss": 0.78452635, + "learning_rate": 0.0008874792452834528, + "loss": 0.79628575, + "num_input_tokens_seen": 103939872, + "router_z_loss_mlp": 0.38354492, + "step": 1254, + "time_per_iteration": 2.724947452545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151033, + "balance_loss_mlp": 1.11381602, + "epoch": 0.24143901500577145, + "flos": 575278225920.0, + "grad_norm": 0.08996846201845816, + "language_loss": 0.87516546, + "learning_rate": 0.0008872822715595626, + "loss": 0.88667583, + "num_input_tokens_seen": 104011120, + "router_z_loss_mlp": 0.37207031, + "step": 1255, + "time_per_iteration": 2.676539659500122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118396, + "balance_loss_mlp": 1.08275259, + "epoch": 0.241631396691035, + "flos": 494941349376.0, + "grad_norm": 0.06475486920780314, + "language_loss": 0.87080252, + "learning_rate": 0.0008870851474793598, + "loss": 0.88198644, + "num_input_tokens_seen": 104077040, + "router_z_loss_mlp": 0.35668945, + "step": 1256, + "time_per_iteration": 2.5523862838745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108714, + "balance_loss_mlp": 1.07287991, + "epoch": 0.24182377837629856, + "flos": 635891323392.0, + "grad_norm": 0.0627898868455093, + "language_loss": 0.89724898, + "learning_rate": 0.0008868878731193752, + "loss": 0.90833616, + "num_input_tokens_seen": 104150880, + "router_z_loss_mlp": 0.35888672, + "step": 1257, + "time_per_iteration": 2.8152451515197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094242, + "balance_loss_mlp": 1.05933762, + "epoch": 0.24201616006156215, + "flos": 514938580992.0, + "grad_norm": 0.06450256361572139, + "language_loss": 0.89708877, + "learning_rate": 0.0008866904485561973, + "loss": 0.90803117, + "num_input_tokens_seen": 104223696, + "router_z_loss_mlp": 0.34936523, + "step": 1258, + "time_per_iteration": 2.7304298877716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095529, + "balance_loss_mlp": 1.05945659, + "epoch": 0.2422085417468257, + "flos": 614837703168.0, + "grad_norm": 0.05809143078881904, + "language_loss": 0.83024096, + "learning_rate": 0.000886492873866473, + "loss": 0.8411963, + "num_input_tokens_seen": 104301728, + "router_z_loss_mlp": 0.36108398, + "step": 1259, + "time_per_iteration": 2.828904628753662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090865, + "balance_loss_mlp": 1.05576968, + "epoch": 0.24240092343208927, + "flos": 585515464704.0, + "grad_norm": 0.07568124142212555, + "language_loss": 0.84760439, + "learning_rate": 0.000886295149126908, + "loss": 0.85851306, + "num_input_tokens_seen": 104374480, + "router_z_loss_mlp": 0.35131836, + "step": 1260, + "time_per_iteration": 2.7011313438415527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109068, + "balance_loss_mlp": 1.05537009, + "epoch": 0.24259330511735283, + "flos": 761930675712.0, + "grad_norm": 0.05459059834864095, + "language_loss": 0.85652769, + "learning_rate": 0.0008860972744142655, + "loss": 0.8674345, + "num_input_tokens_seen": 104452384, + "router_z_loss_mlp": 0.35327148, + "step": 1261, + "time_per_iteration": 2.9039082527160645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084953, + "balance_loss_mlp": 1.05028725, + "epoch": 0.2427856868026164, + "flos": 626566316544.0, + "grad_norm": 0.06267274795834049, + "language_loss": 0.8183161, + "learning_rate": 0.0008858992498053671, + "loss": 0.82916564, + "num_input_tokens_seen": 104532576, + "router_z_loss_mlp": 0.34692383, + "step": 1262, + "time_per_iteration": 2.8293697834014893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080455, + "balance_loss_mlp": 1.06586385, + "epoch": 0.24297806848787995, + "flos": 1510840470528.0, + "grad_norm": 0.02756761643082338, + "language_loss": 0.7658875, + "learning_rate": 0.0008857010753770934, + "loss": 0.77669203, + "num_input_tokens_seen": 104765216, + "router_z_loss_mlp": 0.14550781, + "step": 1263, + "time_per_iteration": 4.8116748332977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087756, + "balance_loss_mlp": 1.05328119, + "epoch": 0.2431704501731435, + "flos": 541669538304.0, + "grad_norm": 0.05501719814044903, + "language_loss": 0.83684969, + "learning_rate": 0.0008855027512063817, + "loss": 0.8477273, + "num_input_tokens_seen": 104836912, + "router_z_loss_mlp": 0.3449707, + "step": 1264, + "time_per_iteration": 2.6995394229888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084053, + "balance_loss_mlp": 1.0493865, + "epoch": 0.24336283185840707, + "flos": 523588492800.0, + "grad_norm": 0.06804757776515359, + "language_loss": 0.85974693, + "learning_rate": 0.0008853042773702292, + "loss": 0.87058747, + "num_input_tokens_seen": 104909280, + "router_z_loss_mlp": 0.34692383, + "step": 1265, + "time_per_iteration": 2.683969497680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086337, + "balance_loss_mlp": 1.05002618, + "epoch": 0.24355521354367063, + "flos": 536839004160.0, + "grad_norm": 0.05444938358074035, + "language_loss": 0.87678754, + "learning_rate": 0.0008851056539456896, + "loss": 0.88765097, + "num_input_tokens_seen": 104982560, + "router_z_loss_mlp": 0.36303711, + "step": 1266, + "time_per_iteration": 2.674891471862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010835, + "balance_loss_mlp": 1.04830909, + "epoch": 0.24374759522893422, + "flos": 930050975232.0, + "grad_norm": 0.04940136823280911, + "language_loss": 0.82195789, + "learning_rate": 0.0008849068810098755, + "loss": 0.83279288, + "num_input_tokens_seen": 105075056, + "router_z_loss_mlp": 0.35229492, + "step": 1267, + "time_per_iteration": 3.27172589302063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083691, + "balance_loss_mlp": 1.04859626, + "epoch": 0.24393997691419778, + "flos": 427564002816.0, + "grad_norm": 0.07591960175092535, + "language_loss": 0.83287823, + "learning_rate": 0.0008847079586399575, + "loss": 0.84371519, + "num_input_tokens_seen": 105137536, + "router_z_loss_mlp": 0.35131836, + "step": 1268, + "time_per_iteration": 2.4539763927459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010876, + "balance_loss_mlp": 1.05281472, + "epoch": 0.24413235859946134, + "flos": 578582479872.0, + "grad_norm": 0.059755639557228325, + "language_loss": 0.86095846, + "learning_rate": 0.0008845088869131641, + "loss": 0.87183452, + "num_input_tokens_seen": 105204848, + "router_z_loss_mlp": 0.34790039, + "step": 1269, + "time_per_iteration": 2.651010274887085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094661, + "balance_loss_mlp": 1.058851, + "epoch": 0.2443247402847249, + "flos": 529600481280.0, + "grad_norm": 0.07776240560166553, + "language_loss": 0.89366186, + "learning_rate": 0.0008843096659067818, + "loss": 0.90460849, + "num_input_tokens_seen": 105273456, + "router_z_loss_mlp": 0.35839844, + "step": 1270, + "time_per_iteration": 2.61082124710083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108773, + "balance_loss_mlp": 1.05292046, + "epoch": 0.24451712196998845, + "flos": 695996651520.0, + "grad_norm": 0.05083497592617014, + "language_loss": 0.86395383, + "learning_rate": 0.000884110295698155, + "loss": 0.87483108, + "num_input_tokens_seen": 105355488, + "router_z_loss_mlp": 0.34863281, + "step": 1271, + "time_per_iteration": 2.930372476577759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085133, + "balance_loss_mlp": 1.05089653, + "epoch": 0.24470950365525201, + "flos": 529575750144.0, + "grad_norm": 0.05520811698213447, + "language_loss": 0.86009014, + "learning_rate": 0.0008839107763646861, + "loss": 0.87094152, + "num_input_tokens_seen": 105421568, + "router_z_loss_mlp": 0.34277344, + "step": 1272, + "time_per_iteration": 2.576322078704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088964, + "balance_loss_mlp": 1.05324888, + "epoch": 0.24490188534051557, + "flos": 491091448320.0, + "grad_norm": 0.0616556586287024, + "language_loss": 0.9024111, + "learning_rate": 0.0008837111079838353, + "loss": 0.91330075, + "num_input_tokens_seen": 105493072, + "router_z_loss_mlp": 0.35742188, + "step": 1273, + "time_per_iteration": 2.6859118938446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109117, + "balance_loss_mlp": 1.05631351, + "epoch": 0.24509426702577913, + "flos": 473916842496.0, + "grad_norm": 0.05704478566457949, + "language_loss": 0.89869869, + "learning_rate": 0.000883511290633121, + "loss": 0.90961039, + "num_input_tokens_seen": 105559840, + "router_z_loss_mlp": 0.34887695, + "step": 1274, + "time_per_iteration": 2.5262861251831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096408, + "balance_loss_mlp": 1.06152773, + "epoch": 0.24528664871104272, + "flos": 550329624576.0, + "grad_norm": 0.04914382449005864, + "language_loss": 0.92288065, + "learning_rate": 0.000883311324390119, + "loss": 0.93384475, + "num_input_tokens_seen": 105634448, + "router_z_loss_mlp": 0.34887695, + "step": 1275, + "time_per_iteration": 2.6791441440582275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100279, + "balance_loss_mlp": 1.0631578, + "epoch": 0.24547903039630628, + "flos": 825546871296.0, + "grad_norm": 0.0705624444694786, + "language_loss": 0.81542301, + "learning_rate": 0.0008831112093324629, + "loss": 0.82642579, + "num_input_tokens_seen": 105711936, + "router_z_loss_mlp": 0.37060547, + "step": 1276, + "time_per_iteration": 3.0612823963165283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100002, + "balance_loss_mlp": 1.06419206, + "epoch": 0.24567141208156984, + "flos": 591325221888.0, + "grad_norm": 0.0822852621967946, + "language_loss": 0.89184481, + "learning_rate": 0.0008829109455378444, + "loss": 0.90284485, + "num_input_tokens_seen": 105780240, + "router_z_loss_mlp": 0.35839844, + "step": 1277, + "time_per_iteration": 2.6601858139038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091196, + "balance_loss_mlp": 1.05567181, + "epoch": 0.2458637937668334, + "flos": 547611715584.0, + "grad_norm": 0.05101212903881184, + "language_loss": 0.86474031, + "learning_rate": 0.000882710533084013, + "loss": 0.87565225, + "num_input_tokens_seen": 105849840, + "router_z_loss_mlp": 0.35546875, + "step": 1278, + "time_per_iteration": 2.6333553791046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087707, + "balance_loss_mlp": 1.05158627, + "epoch": 0.24605617545209696, + "flos": 515641379328.0, + "grad_norm": 0.04855931692812416, + "language_loss": 0.89387107, + "learning_rate": 0.0008825099720487755, + "loss": 0.9047482, + "num_input_tokens_seen": 105921488, + "router_z_loss_mlp": 0.36108398, + "step": 1279, + "time_per_iteration": 2.6388816833496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071735, + "balance_loss_mlp": 1.05943298, + "epoch": 0.24624855713736052, + "flos": 1510953951744.0, + "grad_norm": 0.03612446278815301, + "language_loss": 0.7526114, + "learning_rate": 0.0008823092625099967, + "loss": 0.76332873, + "num_input_tokens_seen": 106146816, + "router_z_loss_mlp": 0.12304688, + "step": 1280, + "time_per_iteration": 4.837193727493286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044143, + "balance_loss_mlp": 1.03145874, + "epoch": 0.24644093882262408, + "flos": 1526826419712.0, + "grad_norm": 0.020354868078157083, + "language_loss": 0.77944112, + "learning_rate": 0.0008821084045455987, + "loss": 0.78988254, + "num_input_tokens_seen": 106361568, + "router_z_loss_mlp": 0.12695312, + "step": 1281, + "time_per_iteration": 4.7473485469818115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078971, + "balance_loss_mlp": 1.04418564, + "epoch": 0.24663332050788764, + "flos": 658811667456.0, + "grad_norm": 0.060866999123497585, + "language_loss": 0.89327228, + "learning_rate": 0.0008819073982335619, + "loss": 0.90406203, + "num_input_tokens_seen": 106435296, + "router_z_loss_mlp": 0.34838867, + "step": 1282, + "time_per_iteration": 2.839691162109375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107951, + "balance_loss_mlp": 1.0446527, + "epoch": 0.24682570219315123, + "flos": 541510977024.0, + "grad_norm": 0.05752783194209404, + "language_loss": 0.84339237, + "learning_rate": 0.0008817062436519235, + "loss": 0.85418749, + "num_input_tokens_seen": 106507184, + "router_z_loss_mlp": 0.34887695, + "step": 1283, + "time_per_iteration": 2.6106019020080566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080546, + "balance_loss_mlp": 1.04459274, + "epoch": 0.24701808387841478, + "flos": 440455131648.0, + "grad_norm": 0.05999718389674832, + "language_loss": 0.89926815, + "learning_rate": 0.0008815049408787788, + "loss": 0.91007358, + "num_input_tokens_seen": 106571472, + "router_z_loss_mlp": 0.36010742, + "step": 1284, + "time_per_iteration": 2.5186686515808105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079282, + "balance_loss_mlp": 1.04518795, + "epoch": 0.24721046556367834, + "flos": 467826278400.0, + "grad_norm": 0.054777388157378364, + "language_loss": 0.8565737, + "learning_rate": 0.0008813034899922805, + "loss": 0.86736655, + "num_input_tokens_seen": 106638368, + "router_z_loss_mlp": 0.34106445, + "step": 1285, + "time_per_iteration": 2.5217878818511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082636, + "balance_loss_mlp": 1.04730225, + "epoch": 0.2474028472489419, + "flos": 504183398400.0, + "grad_norm": 0.06351521025868076, + "language_loss": 0.90182853, + "learning_rate": 0.0008811018910706387, + "loss": 0.91265488, + "num_input_tokens_seen": 106705312, + "router_z_loss_mlp": 0.35375977, + "step": 1286, + "time_per_iteration": 2.549523115158081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010823, + "balance_loss_mlp": 1.04787278, + "epoch": 0.24759522893420546, + "flos": 479707660800.0, + "grad_norm": 0.06857789842871208, + "language_loss": 0.81978023, + "learning_rate": 0.0008809001441921211, + "loss": 0.83060318, + "num_input_tokens_seen": 106778624, + "router_z_loss_mlp": 0.34448242, + "step": 1287, + "time_per_iteration": 2.7147598266601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083728, + "balance_loss_mlp": 1.04984844, + "epoch": 0.24778761061946902, + "flos": 533446000128.0, + "grad_norm": 0.05733880184845353, + "language_loss": 0.85523212, + "learning_rate": 0.0008806982494350528, + "loss": 0.86606944, + "num_input_tokens_seen": 106847744, + "router_z_loss_mlp": 0.33911133, + "step": 1288, + "time_per_iteration": 2.6304967403411865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086884, + "balance_loss_mlp": 1.05197978, + "epoch": 0.24797999230473258, + "flos": 559513446912.0, + "grad_norm": 0.04849910181782432, + "language_loss": 0.90370154, + "learning_rate": 0.0008804962068778161, + "loss": 0.91457039, + "num_input_tokens_seen": 106927584, + "router_z_loss_mlp": 0.34936523, + "step": 1289, + "time_per_iteration": 2.8194985389709473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087315, + "balance_loss_mlp": 1.05291104, + "epoch": 0.24817237398999614, + "flos": 623912426496.0, + "grad_norm": 0.05410640942937228, + "language_loss": 0.80728722, + "learning_rate": 0.0008802940165988511, + "loss": 0.81816041, + "num_input_tokens_seen": 107006656, + "router_z_loss_mlp": 0.34423828, + "step": 1290, + "time_per_iteration": 2.8703298568725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096846, + "balance_loss_mlp": 1.06225193, + "epoch": 0.2483647556752597, + "flos": 611981581824.0, + "grad_norm": 0.06277561181530684, + "language_loss": 0.88376027, + "learning_rate": 0.000880091678676655, + "loss": 0.89472872, + "num_input_tokens_seen": 107084352, + "router_z_loss_mlp": 0.34619141, + "step": 1291, + "time_per_iteration": 2.7943451404571533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088363, + "balance_loss_mlp": 1.05496097, + "epoch": 0.2485571373605233, + "flos": 583270419456.0, + "grad_norm": 0.061640996967182685, + "language_loss": 0.89207399, + "learning_rate": 0.0008798891931897821, + "loss": 0.90295762, + "num_input_tokens_seen": 107158368, + "router_z_loss_mlp": 0.33422852, + "step": 1292, + "time_per_iteration": 2.7013609409332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088304, + "balance_loss_mlp": 1.05463946, + "epoch": 0.24874951904578685, + "flos": 494503391232.0, + "grad_norm": 0.0568342609101268, + "language_loss": 0.84605837, + "learning_rate": 0.0008796865602168447, + "loss": 0.8569414, + "num_input_tokens_seen": 107224256, + "router_z_loss_mlp": 0.33691406, + "step": 1293, + "time_per_iteration": 2.517571210861206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108751, + "balance_loss_mlp": 1.05448937, + "epoch": 0.2489419007310504, + "flos": 455925957120.0, + "grad_norm": 0.05011975537228715, + "language_loss": 0.88745099, + "learning_rate": 0.0008794837798365115, + "loss": 0.8983261, + "num_input_tokens_seen": 107292720, + "router_z_loss_mlp": 0.33032227, + "step": 1294, + "time_per_iteration": 2.6243135929107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093031, + "balance_loss_mlp": 1.05958056, + "epoch": 0.24913428241631397, + "flos": 485198733312.0, + "grad_norm": 0.05031013210073455, + "language_loss": 0.88537574, + "learning_rate": 0.0008792808521275089, + "loss": 0.89630604, + "num_input_tokens_seen": 107368576, + "router_z_loss_mlp": 0.3347168, + "step": 1295, + "time_per_iteration": 2.743821144104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090644, + "balance_loss_mlp": 1.05664551, + "epoch": 0.24932666410157753, + "flos": 518654651904.0, + "grad_norm": 0.0628198177294759, + "language_loss": 0.87554896, + "learning_rate": 0.0008790777771686206, + "loss": 0.8864553, + "num_input_tokens_seen": 107433856, + "router_z_loss_mlp": 0.34033203, + "step": 1296, + "time_per_iteration": 2.55996036529541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091195, + "balance_loss_mlp": 1.05819798, + "epoch": 0.2495190457868411, + "flos": 472365831168.0, + "grad_norm": 0.05367084005526609, + "language_loss": 0.85479438, + "learning_rate": 0.0008788745550386872, + "loss": 0.86570632, + "num_input_tokens_seen": 107500944, + "router_z_loss_mlp": 0.33007812, + "step": 1297, + "time_per_iteration": 2.555238723754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099543, + "balance_loss_mlp": 1.06494844, + "epoch": 0.24971142747210465, + "flos": 745559202816.0, + "grad_norm": 0.05557204977607519, + "language_loss": 0.80045742, + "learning_rate": 0.0008786711858166063, + "loss": 0.81145287, + "num_input_tokens_seen": 107580000, + "router_z_loss_mlp": 0.34643555, + "step": 1298, + "time_per_iteration": 2.940908670425415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090601, + "balance_loss_mlp": 1.05757999, + "epoch": 0.2499038091573682, + "flos": 749222839296.0, + "grad_norm": 0.08262860681241094, + "language_loss": 0.83490336, + "learning_rate": 0.0008784676695813332, + "loss": 0.84580934, + "num_input_tokens_seen": 107660384, + "router_z_loss_mlp": 0.33032227, + "step": 1299, + "time_per_iteration": 2.966646432876587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092772, + "balance_loss_mlp": 1.05870187, + "epoch": 0.2500961908426318, + "flos": 744741513216.0, + "grad_norm": 0.04756275395178792, + "language_loss": 0.84761405, + "learning_rate": 0.0008782640064118796, + "loss": 0.85854173, + "num_input_tokens_seen": 107736320, + "router_z_loss_mlp": 0.34082031, + "step": 1300, + "time_per_iteration": 2.8889827728271484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078115, + "balance_loss_mlp": 1.06447709, + "epoch": 0.2502885725278953, + "flos": 1416652180992.0, + "grad_norm": 0.036683670441934005, + "language_loss": 0.7618475, + "learning_rate": 0.0008780601963873149, + "loss": 0.77262866, + "num_input_tokens_seen": 107972608, + "router_z_loss_mlp": 0.13671875, + "step": 1301, + "time_per_iteration": 4.988169431686401 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094633, + "balance_loss_mlp": 1.06196928, + "epoch": 0.2504809542131589, + "flos": 514961902080.0, + "grad_norm": 0.05923567857946263, + "language_loss": 0.86476314, + "learning_rate": 0.0008778562395867648, + "loss": 0.87570941, + "num_input_tokens_seen": 108043312, + "router_z_loss_mlp": 0.32666016, + "step": 1302, + "time_per_iteration": 2.5900919437408447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087269, + "balance_loss_mlp": 1.05436766, + "epoch": 0.25067333589842244, + "flos": 525562905600.0, + "grad_norm": 0.06049595368492962, + "language_loss": 0.83774143, + "learning_rate": 0.0008776521360894127, + "loss": 0.8486141, + "num_input_tokens_seen": 108114144, + "router_z_loss_mlp": 0.32910156, + "step": 1303, + "time_per_iteration": 2.6029298305511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035824, + "balance_loss_mlp": 1.02275884, + "epoch": 0.25086571758368603, + "flos": 1473085108224.0, + "grad_norm": 0.024331867442101186, + "language_loss": 0.78962064, + "learning_rate": 0.0008774478859744984, + "loss": 0.79997885, + "num_input_tokens_seen": 108338720, + "router_z_loss_mlp": 0.13085938, + "step": 1304, + "time_per_iteration": 4.800757646560669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096353, + "balance_loss_mlp": 1.063833, + "epoch": 0.2510580992689496, + "flos": 528128045568.0, + "grad_norm": 0.053799887970574674, + "language_loss": 0.90341735, + "learning_rate": 0.0008772434893213186, + "loss": 0.91438091, + "num_input_tokens_seen": 108405456, + "router_z_loss_mlp": 0.32519531, + "step": 1305, + "time_per_iteration": 2.5816421508789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104465, + "balance_loss_mlp": 1.07123005, + "epoch": 0.25125048095421315, + "flos": 517192390656.0, + "grad_norm": 0.058690449713219205, + "language_loss": 0.84433925, + "learning_rate": 0.0008770389462092276, + "loss": 0.85538393, + "num_input_tokens_seen": 108474368, + "router_z_loss_mlp": 0.33251953, + "step": 1306, + "time_per_iteration": 2.6747090816497803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106931, + "balance_loss_mlp": 1.07309926, + "epoch": 0.25144286263947674, + "flos": 620160039936.0, + "grad_norm": 0.16660488736040688, + "language_loss": 0.86719346, + "learning_rate": 0.0008768342567176357, + "loss": 0.87826276, + "num_input_tokens_seen": 108548864, + "router_z_loss_mlp": 0.33862305, + "step": 1307, + "time_per_iteration": 2.7788002490997314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098824, + "balance_loss_mlp": 1.06425333, + "epoch": 0.25163524432474027, + "flos": 503534444544.0, + "grad_norm": 0.04824933548887647, + "language_loss": 0.90589297, + "learning_rate": 0.0008766294209260107, + "loss": 0.91688126, + "num_input_tokens_seen": 108623072, + "router_z_loss_mlp": 0.34619141, + "step": 1308, + "time_per_iteration": 2.6300241947174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010929, + "balance_loss_mlp": 1.05852032, + "epoch": 0.25182762601000386, + "flos": 508821875712.0, + "grad_norm": 0.0633327884934456, + "language_loss": 0.91549027, + "learning_rate": 0.0008764244389138767, + "loss": 0.92641926, + "num_input_tokens_seen": 108690128, + "router_z_loss_mlp": 0.34399414, + "step": 1309, + "time_per_iteration": 2.574056625366211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088516, + "balance_loss_mlp": 1.05306351, + "epoch": 0.2520200076952674, + "flos": 633596815872.0, + "grad_norm": 0.05898934519456769, + "language_loss": 0.8269434, + "learning_rate": 0.000876219310760815, + "loss": 0.83782852, + "num_input_tokens_seen": 108770272, + "router_z_loss_mlp": 0.35449219, + "step": 1310, + "time_per_iteration": 2.87404465675354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010989, + "balance_loss_mlp": 1.06299448, + "epoch": 0.252212389380531, + "flos": 494385527808.0, + "grad_norm": 0.05968729718727878, + "language_loss": 0.8144334, + "learning_rate": 0.0008760140365464631, + "loss": 0.82542241, + "num_input_tokens_seen": 108840592, + "router_z_loss_mlp": 0.35913086, + "step": 1311, + "time_per_iteration": 2.599480390548706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105508, + "balance_loss_mlp": 1.07062793, + "epoch": 0.2524047710657945, + "flos": 490298489856.0, + "grad_norm": 0.06557576312810307, + "language_loss": 0.87226975, + "learning_rate": 0.0008758086163505156, + "loss": 0.88332486, + "num_input_tokens_seen": 108910064, + "router_z_loss_mlp": 0.34912109, + "step": 1312, + "time_per_iteration": 2.6165666580200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112941, + "balance_loss_mlp": 1.07698762, + "epoch": 0.2525971527510581, + "flos": 647136898560.0, + "grad_norm": 0.06425852435188892, + "language_loss": 0.89039612, + "learning_rate": 0.0008756030502527239, + "loss": 0.90152562, + "num_input_tokens_seen": 108986336, + "router_z_loss_mlp": 0.36010742, + "step": 1313, + "time_per_iteration": 2.794595956802368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112418, + "balance_loss_mlp": 1.07636952, + "epoch": 0.2527895344363217, + "flos": 568991222784.0, + "grad_norm": 0.05792474282671988, + "language_loss": 0.90396988, + "learning_rate": 0.0008753973383328954, + "loss": 0.91509414, + "num_input_tokens_seen": 109059712, + "router_z_loss_mlp": 0.36108398, + "step": 1314, + "time_per_iteration": 2.66343355178833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110344, + "balance_loss_mlp": 1.07491553, + "epoch": 0.2529819161215852, + "flos": 513795004416.0, + "grad_norm": 0.10488361484557306, + "language_loss": 0.84231019, + "learning_rate": 0.0008751914806708952, + "loss": 0.8534137, + "num_input_tokens_seen": 109127504, + "router_z_loss_mlp": 0.35449219, + "step": 1315, + "time_per_iteration": 2.5714006423950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099121, + "balance_loss_mlp": 1.06357241, + "epoch": 0.2531742978068488, + "flos": 530979784704.0, + "grad_norm": 0.0646255116041034, + "language_loss": 0.81763697, + "learning_rate": 0.0008749854773466439, + "loss": 0.82862812, + "num_input_tokens_seen": 109198080, + "router_z_loss_mlp": 0.35571289, + "step": 1316, + "time_per_iteration": 2.6507568359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093995, + "balance_loss_mlp": 1.05892396, + "epoch": 0.25336667949211233, + "flos": 596362369536.0, + "grad_norm": 0.11519177634747009, + "language_loss": 0.84297431, + "learning_rate": 0.0008747793284401192, + "loss": 0.8539142, + "num_input_tokens_seen": 109268368, + "router_z_loss_mlp": 0.35107422, + "step": 1317, + "time_per_iteration": 2.6708261966705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109538, + "balance_loss_mlp": 1.05966473, + "epoch": 0.2535590611773759, + "flos": 601764691968.0, + "grad_norm": 0.05376009268762157, + "language_loss": 0.86145389, + "learning_rate": 0.0008745730340313551, + "loss": 0.87240773, + "num_input_tokens_seen": 109344112, + "router_z_loss_mlp": 0.35742188, + "step": 1318, + "time_per_iteration": 2.7465810775756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100043, + "balance_loss_mlp": 1.06504369, + "epoch": 0.25375144286263945, + "flos": 495079561728.0, + "grad_norm": 0.053440140598651036, + "language_loss": 0.8468703, + "learning_rate": 0.0008743665942004422, + "loss": 0.85787076, + "num_input_tokens_seen": 109414112, + "router_z_loss_mlp": 0.35009766, + "step": 1319, + "time_per_iteration": 2.632645606994629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109509, + "balance_loss_mlp": 1.05908918, + "epoch": 0.25394382454790304, + "flos": 512219261952.0, + "grad_norm": 0.050076364746318706, + "language_loss": 0.92529714, + "learning_rate": 0.0008741600090275277, + "loss": 0.93624806, + "num_input_tokens_seen": 109484336, + "router_z_loss_mlp": 0.35986328, + "step": 1320, + "time_per_iteration": 2.564730405807495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097426, + "balance_loss_mlp": 1.06092453, + "epoch": 0.25413620623316663, + "flos": 958586047488.0, + "grad_norm": 0.058049172943507095, + "language_loss": 0.83939385, + "learning_rate": 0.0008739532785928151, + "loss": 0.85036814, + "num_input_tokens_seen": 109590128, + "router_z_loss_mlp": 0.36474609, + "step": 1321, + "time_per_iteration": 3.4496617317199707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056798, + "balance_loss_mlp": 1.04439986, + "epoch": 0.25432858791843016, + "flos": 1576445635584.0, + "grad_norm": 0.03297734471592195, + "language_loss": 0.74893582, + "learning_rate": 0.0008737464029765639, + "loss": 0.75950378, + "num_input_tokens_seen": 109816592, + "router_z_loss_mlp": 0.12353516, + "step": 1322, + "time_per_iteration": 4.803644418716431 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102711, + "balance_loss_mlp": 1.06706691, + "epoch": 0.25452096960369375, + "flos": 583530877440.0, + "grad_norm": 0.056711392027496164, + "language_loss": 0.83213425, + "learning_rate": 0.0008735393822590908, + "loss": 0.84316134, + "num_input_tokens_seen": 109890464, + "router_z_loss_mlp": 0.35668945, + "step": 1323, + "time_per_iteration": 2.6643528938293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099364, + "balance_loss_mlp": 1.06434083, + "epoch": 0.2547133512889573, + "flos": 508344629760.0, + "grad_norm": 0.06006943476027706, + "language_loss": 0.87018919, + "learning_rate": 0.0008733322165207681, + "loss": 0.88118285, + "num_input_tokens_seen": 109963408, + "router_z_loss_mlp": 0.35083008, + "step": 1324, + "time_per_iteration": 2.627495765686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110409, + "balance_loss_mlp": 1.06863689, + "epoch": 0.25490573297422087, + "flos": 782266940928.0, + "grad_norm": 0.05604709920606865, + "language_loss": 0.83055937, + "learning_rate": 0.0008731249058420247, + "loss": 0.8416003, + "num_input_tokens_seen": 110048800, + "router_z_loss_mlp": 0.35498047, + "step": 1325, + "time_per_iteration": 3.0361831188201904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100095, + "balance_loss_mlp": 1.06468964, + "epoch": 0.2550981146594844, + "flos": 509610451968.0, + "grad_norm": 0.06314633870869373, + "language_loss": 0.90780556, + "learning_rate": 0.0008729174503033459, + "loss": 0.91880649, + "num_input_tokens_seen": 110118096, + "router_z_loss_mlp": 0.35424805, + "step": 1326, + "time_per_iteration": 2.639625072479248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109522, + "balance_loss_mlp": 1.06007695, + "epoch": 0.255290496344748, + "flos": 676360212480.0, + "grad_norm": 0.06489195741671011, + "language_loss": 0.82650065, + "learning_rate": 0.0008727098499852728, + "loss": 0.83745289, + "num_input_tokens_seen": 110190160, + "router_z_loss_mlp": 0.35180664, + "step": 1327, + "time_per_iteration": 2.830500602722168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109753, + "balance_loss_mlp": 1.06231546, + "epoch": 0.2554828780300115, + "flos": 537524273664.0, + "grad_norm": 0.06666455638552511, + "language_loss": 0.89945138, + "learning_rate": 0.0008725021049684034, + "loss": 0.91042662, + "num_input_tokens_seen": 110268000, + "router_z_loss_mlp": 0.35253906, + "step": 1328, + "time_per_iteration": 2.747800350189209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097741, + "balance_loss_mlp": 1.06240726, + "epoch": 0.2556752597152751, + "flos": 823828534272.0, + "grad_norm": 0.052131047599379726, + "language_loss": 0.82919741, + "learning_rate": 0.000872294215333391, + "loss": 0.84017479, + "num_input_tokens_seen": 110354816, + "router_z_loss_mlp": 0.35400391, + "step": 1329, + "time_per_iteration": 3.1658926010131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096694, + "balance_loss_mlp": 1.06176591, + "epoch": 0.2558676414005387, + "flos": 570517502976.0, + "grad_norm": 0.05425014800623288, + "language_loss": 0.82993001, + "learning_rate": 0.0008720861811609457, + "loss": 0.84089696, + "num_input_tokens_seen": 110427968, + "router_z_loss_mlp": 0.34985352, + "step": 1330, + "time_per_iteration": 2.709085702896118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101009, + "balance_loss_mlp": 1.06448317, + "epoch": 0.2560600230858022, + "flos": 486419475456.0, + "grad_norm": 0.05425594622111712, + "language_loss": 0.83756936, + "learning_rate": 0.0008718780025318338, + "loss": 0.84857947, + "num_input_tokens_seen": 110501184, + "router_z_loss_mlp": 0.36523438, + "step": 1331, + "time_per_iteration": 2.7126388549804688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097427, + "balance_loss_mlp": 1.06280875, + "epoch": 0.2562524047710658, + "flos": 512874008064.0, + "grad_norm": 0.06594145834934585, + "language_loss": 0.8406449, + "learning_rate": 0.0008716696795268771, + "loss": 0.85161918, + "num_input_tokens_seen": 110573008, + "router_z_loss_mlp": 0.34667969, + "step": 1332, + "time_per_iteration": 2.6650350093841553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089329, + "balance_loss_mlp": 1.05308938, + "epoch": 0.25644478645632934, + "flos": 634498873344.0, + "grad_norm": 0.051413439896644035, + "language_loss": 0.85076845, + "learning_rate": 0.0008714612122269538, + "loss": 0.86166173, + "num_input_tokens_seen": 110646704, + "router_z_loss_mlp": 0.36279297, + "step": 1333, + "time_per_iteration": 2.8611392974853516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109443, + "balance_loss_mlp": 1.05754697, + "epoch": 0.25663716814159293, + "flos": 436353537024.0, + "grad_norm": 0.0705935369031189, + "language_loss": 0.89120972, + "learning_rate": 0.0008712526007129982, + "loss": 0.90215403, + "num_input_tokens_seen": 110712208, + "router_z_loss_mlp": 0.36889648, + "step": 1334, + "time_per_iteration": 2.5217065811157227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109442, + "balance_loss_mlp": 1.05813241, + "epoch": 0.25682954982685646, + "flos": 497892013056.0, + "grad_norm": 0.06578019441075163, + "language_loss": 0.90784955, + "learning_rate": 0.0008710438450660003, + "loss": 0.91879368, + "num_input_tokens_seen": 110783936, + "router_z_loss_mlp": 0.36303711, + "step": 1335, + "time_per_iteration": 2.651367425918579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093579, + "balance_loss_mlp": 1.05643392, + "epoch": 0.25702193151212005, + "flos": 457471176192.0, + "grad_norm": 0.07087944464696884, + "language_loss": 0.8744905, + "learning_rate": 0.0008708349453670064, + "loss": 0.88542628, + "num_input_tokens_seen": 110848560, + "router_z_loss_mlp": 0.37133789, + "step": 1336, + "time_per_iteration": 2.51411771774292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090754, + "balance_loss_mlp": 1.0543952, + "epoch": 0.2572143131973836, + "flos": 598002130944.0, + "grad_norm": 0.06329524505646734, + "language_loss": 0.91480416, + "learning_rate": 0.0008706259016971185, + "loss": 0.92571175, + "num_input_tokens_seen": 110922672, + "router_z_loss_mlp": 0.36401367, + "step": 1337, + "time_per_iteration": 2.754173517227173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096487, + "balance_loss_mlp": 1.0589596, + "epoch": 0.25740669488264717, + "flos": 698004559872.0, + "grad_norm": 0.06697174190166053, + "language_loss": 0.83331275, + "learning_rate": 0.0008704167141374944, + "loss": 0.84427762, + "num_input_tokens_seen": 110995456, + "router_z_loss_mlp": 0.375, + "step": 1338, + "time_per_iteration": 2.795552968978882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011015, + "balance_loss_mlp": 1.06385398, + "epoch": 0.25759907656791076, + "flos": 502130409984.0, + "grad_norm": 0.06639008708045263, + "language_loss": 0.88657552, + "learning_rate": 0.0008702073827693482, + "loss": 0.89759052, + "num_input_tokens_seen": 111069568, + "router_z_loss_mlp": 0.3762207, + "step": 1339, + "time_per_iteration": 2.6935572624206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103743, + "balance_loss_mlp": 1.065763, + "epoch": 0.2577914582531743, + "flos": 773541425664.0, + "grad_norm": 0.06917089880544881, + "language_loss": 0.88938046, + "learning_rate": 0.0008699979076739494, + "loss": 0.90041792, + "num_input_tokens_seen": 111142608, + "router_z_loss_mlp": 0.37963867, + "step": 1340, + "time_per_iteration": 2.951148509979248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102218, + "balance_loss_mlp": 1.06552505, + "epoch": 0.2579838399384379, + "flos": 459431032320.0, + "grad_norm": 0.07085954822691051, + "language_loss": 0.88831556, + "learning_rate": 0.0008697882889326234, + "loss": 0.89933777, + "num_input_tokens_seen": 111206336, + "router_z_loss_mlp": 0.36669922, + "step": 1341, + "time_per_iteration": 2.492182731628418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105931, + "balance_loss_mlp": 1.06916654, + "epoch": 0.2581762216237014, + "flos": 568917029376.0, + "grad_norm": 0.060702491086151805, + "language_loss": 0.86630756, + "learning_rate": 0.0008695785266267515, + "loss": 0.8773669, + "num_input_tokens_seen": 111276736, + "router_z_loss_mlp": 0.36816406, + "step": 1342, + "time_per_iteration": 2.6635031700134277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111038, + "balance_loss_mlp": 1.07448828, + "epoch": 0.258368603308965, + "flos": 603906430464.0, + "grad_norm": 0.06467765584173796, + "language_loss": 0.83112109, + "learning_rate": 0.0008693686208377704, + "loss": 0.84223145, + "num_input_tokens_seen": 111353856, + "router_z_loss_mlp": 0.36547852, + "step": 1343, + "time_per_iteration": 2.7769596576690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105659, + "balance_loss_mlp": 1.06975329, + "epoch": 0.2585609849942285, + "flos": 491204929536.0, + "grad_norm": 0.06376456739082713, + "language_loss": 0.88889539, + "learning_rate": 0.0008691585716471733, + "loss": 0.89995199, + "num_input_tokens_seen": 111424960, + "router_z_loss_mlp": 0.35913086, + "step": 1344, + "time_per_iteration": 2.6467716693878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104044, + "balance_loss_mlp": 1.06809044, + "epoch": 0.2587533666794921, + "flos": 640455607296.0, + "grad_norm": 0.057733681270749564, + "language_loss": 0.85255873, + "learning_rate": 0.0008689483791365079, + "loss": 0.86359918, + "num_input_tokens_seen": 111505248, + "router_z_loss_mlp": 0.35961914, + "step": 1345, + "time_per_iteration": 2.8041999340057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099237, + "balance_loss_mlp": 1.06380773, + "epoch": 0.2589457483647557, + "flos": 576564397056.0, + "grad_norm": 0.05015471530609978, + "language_loss": 0.89365089, + "learning_rate": 0.0008687380433873786, + "loss": 0.9046433, + "num_input_tokens_seen": 111581936, + "router_z_loss_mlp": 0.35473633, + "step": 1346, + "time_per_iteration": 2.7955591678619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101447, + "balance_loss_mlp": 1.06630445, + "epoch": 0.25913813005001923, + "flos": 535164337152.0, + "grad_norm": 0.06074647569776127, + "language_loss": 0.82164252, + "learning_rate": 0.0008685275644814448, + "loss": 0.83265698, + "num_input_tokens_seen": 111651456, + "router_z_loss_mlp": 0.3515625, + "step": 1347, + "time_per_iteration": 2.6922154426574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100414, + "balance_loss_mlp": 1.06419861, + "epoch": 0.2593305117352828, + "flos": 720713908224.0, + "grad_norm": 0.05981927153656866, + "language_loss": 0.8445859, + "learning_rate": 0.0008683169425004216, + "loss": 0.85558999, + "num_input_tokens_seen": 111731712, + "router_z_loss_mlp": 0.36230469, + "step": 1348, + "time_per_iteration": 2.8701395988464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108751, + "balance_loss_mlp": 1.05186677, + "epoch": 0.25952289342054635, + "flos": 709782635520.0, + "grad_norm": 0.06994851779161643, + "language_loss": 0.83445579, + "learning_rate": 0.0008681061775260799, + "loss": 0.84533083, + "num_input_tokens_seen": 111800752, + "router_z_loss_mlp": 0.35644531, + "step": 1349, + "time_per_iteration": 2.8206968307495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096473, + "balance_loss_mlp": 1.06032848, + "epoch": 0.25971527510580994, + "flos": 455688820224.0, + "grad_norm": 0.06118298275127208, + "language_loss": 0.91987318, + "learning_rate": 0.0008678952696402458, + "loss": 0.93083793, + "num_input_tokens_seen": 111866752, + "router_z_loss_mlp": 0.36132812, + "step": 1350, + "time_per_iteration": 2.5547540187835693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108511, + "balance_loss_mlp": 1.04932308, + "epoch": 0.25990765679107347, + "flos": 612223100928.0, + "grad_norm": 0.04808004024566397, + "language_loss": 0.86496055, + "learning_rate": 0.000867684218924801, + "loss": 0.8758117, + "num_input_tokens_seen": 111951328, + "router_z_loss_mlp": 0.35791016, + "step": 1351, + "time_per_iteration": 2.8406949043273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110301, + "balance_loss_mlp": 1.09857082, + "epoch": 0.26010003847633706, + "flos": 1537105766400.0, + "grad_norm": 0.059206679514604114, + "language_loss": 0.78947091, + "learning_rate": 0.0008674730254616827, + "loss": 0.80057395, + "num_input_tokens_seen": 112182272, + "router_z_loss_mlp": 0.1171875, + "step": 1352, + "time_per_iteration": 4.8775153160095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082749, + "balance_loss_mlp": 1.04753494, + "epoch": 0.2602924201616006, + "flos": 715947393024.0, + "grad_norm": 0.046134849134736367, + "language_loss": 0.85103661, + "learning_rate": 0.0008672616893328834, + "loss": 0.86186409, + "num_input_tokens_seen": 112261760, + "router_z_loss_mlp": 0.35253906, + "step": 1353, + "time_per_iteration": 2.98063588142395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108322, + "balance_loss_mlp": 1.04764819, + "epoch": 0.2604848018468642, + "flos": 643241917440.0, + "grad_norm": 0.060512322591449175, + "language_loss": 0.9000203, + "learning_rate": 0.0008670502106204512, + "loss": 0.91085243, + "num_input_tokens_seen": 112339136, + "router_z_loss_mlp": 0.35595703, + "step": 1354, + "time_per_iteration": 2.832679271697998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087805, + "balance_loss_mlp": 1.05073047, + "epoch": 0.26067718353212777, + "flos": 516783545856.0, + "grad_norm": 0.05860289542603218, + "language_loss": 0.8165139, + "learning_rate": 0.0008668385894064892, + "loss": 0.82739192, + "num_input_tokens_seen": 112409872, + "router_z_loss_mlp": 0.37084961, + "step": 1355, + "time_per_iteration": 2.6204822063446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086317, + "balance_loss_mlp": 1.05143666, + "epoch": 0.2608695652173913, + "flos": 822361890816.0, + "grad_norm": 0.0623840657908754, + "language_loss": 0.88803548, + "learning_rate": 0.0008666268257731562, + "loss": 0.8988986, + "num_input_tokens_seen": 112495616, + "router_z_loss_mlp": 0.34912109, + "step": 1356, + "time_per_iteration": 3.113147735595703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109295, + "balance_loss_mlp": 1.0566628, + "epoch": 0.2610619469026549, + "flos": 1007451744768.0, + "grad_norm": 0.056693012024963345, + "language_loss": 0.85794425, + "learning_rate": 0.0008664149198026662, + "loss": 0.86887372, + "num_input_tokens_seen": 112575168, + "router_z_loss_mlp": 0.36279297, + "step": 1357, + "time_per_iteration": 3.2569541931152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095714, + "balance_loss_mlp": 1.06040418, + "epoch": 0.2612543285879184, + "flos": 536523291648.0, + "grad_norm": 0.061594313952015485, + "language_loss": 0.88599586, + "learning_rate": 0.0008662028715772883, + "loss": 0.89695299, + "num_input_tokens_seen": 112648480, + "router_z_loss_mlp": 0.35351562, + "step": 1358, + "time_per_iteration": 2.6102256774902344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100258, + "balance_loss_mlp": 1.06475711, + "epoch": 0.261446710273182, + "flos": 519166803456.0, + "grad_norm": 0.04975036534081278, + "language_loss": 0.85662109, + "learning_rate": 0.0008659906811793467, + "loss": 0.86762363, + "num_input_tokens_seen": 112719856, + "router_z_loss_mlp": 0.35546875, + "step": 1359, + "time_per_iteration": 2.6921935081481934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101082, + "balance_loss_mlp": 1.06543839, + "epoch": 0.26163909195844554, + "flos": 582975055872.0, + "grad_norm": 0.06646109128582675, + "language_loss": 0.89397144, + "learning_rate": 0.0008657783486912215, + "loss": 0.90498233, + "num_input_tokens_seen": 112795088, + "router_z_loss_mlp": 0.35693359, + "step": 1360, + "time_per_iteration": 2.7003283500671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110494, + "balance_loss_mlp": 1.06920147, + "epoch": 0.2618314736437091, + "flos": 958362057216.0, + "grad_norm": 0.06344844215605515, + "language_loss": 0.89840877, + "learning_rate": 0.0008655658741953472, + "loss": 0.90945816, + "num_input_tokens_seen": 112879888, + "router_z_loss_mlp": 0.35742188, + "step": 1361, + "time_per_iteration": 3.207960844039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101357, + "balance_loss_mlp": 1.0664053, + "epoch": 0.26202385532897265, + "flos": 574530347520.0, + "grad_norm": 0.04606923720206454, + "language_loss": 0.88105857, + "learning_rate": 0.0008653532577742136, + "loss": 0.89207214, + "num_input_tokens_seen": 112952208, + "router_z_loss_mlp": 0.34960938, + "step": 1362, + "time_per_iteration": 2.69209885597229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091565, + "balance_loss_mlp": 1.05744767, + "epoch": 0.26221623701423624, + "flos": 445240585728.0, + "grad_norm": 0.05480512848555835, + "language_loss": 0.87200153, + "learning_rate": 0.0008651404995103659, + "loss": 0.88291717, + "num_input_tokens_seen": 113017472, + "router_z_loss_mlp": 0.34155273, + "step": 1363, + "time_per_iteration": 2.5325255393981934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095927, + "balance_loss_mlp": 1.06164205, + "epoch": 0.26240861869949983, + "flos": 535459700736.0, + "grad_norm": 0.04992660146640532, + "language_loss": 0.870713, + "learning_rate": 0.0008649275994864041, + "loss": 0.8816722, + "num_input_tokens_seen": 113090000, + "router_z_loss_mlp": 0.34301758, + "step": 1364, + "time_per_iteration": 2.682365894317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098846, + "balance_loss_mlp": 1.06267846, + "epoch": 0.26260100038476336, + "flos": 564940500480.0, + "grad_norm": 0.05369640644722127, + "language_loss": 0.83917898, + "learning_rate": 0.0008647145577849834, + "loss": 0.85016745, + "num_input_tokens_seen": 113169424, + "router_z_loss_mlp": 0.36157227, + "step": 1365, + "time_per_iteration": 2.8129918575286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102304, + "balance_loss_mlp": 1.06701851, + "epoch": 0.26279338207002695, + "flos": 612745426944.0, + "grad_norm": 0.045782565775991005, + "language_loss": 0.82809973, + "learning_rate": 0.0008645013744888139, + "loss": 0.83912277, + "num_input_tokens_seen": 113256752, + "router_z_loss_mlp": 0.35327148, + "step": 1366, + "time_per_iteration": 2.8523411750793457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101615, + "balance_loss_mlp": 1.06730664, + "epoch": 0.2629857637552905, + "flos": 522555425280.0, + "grad_norm": 0.0597350257589219, + "language_loss": 0.87579656, + "learning_rate": 0.0008642880496806607, + "loss": 0.88681269, + "num_input_tokens_seen": 113330512, + "router_z_loss_mlp": 0.34350586, + "step": 1367, + "time_per_iteration": 2.766350507736206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105811, + "balance_loss_mlp": 1.0706687, + "epoch": 0.26317814544055407, + "flos": 534273864192.0, + "grad_norm": 0.05812227598952832, + "language_loss": 0.84219468, + "learning_rate": 0.0008640745834433437, + "loss": 0.85325277, + "num_input_tokens_seen": 113409088, + "router_z_loss_mlp": 0.35205078, + "step": 1368, + "time_per_iteration": 2.7220964431762695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100459, + "balance_loss_mlp": 1.06553102, + "epoch": 0.2633705271258176, + "flos": 555235762176.0, + "grad_norm": 0.06954601812969684, + "language_loss": 0.86862296, + "learning_rate": 0.000863860975859738, + "loss": 0.87962759, + "num_input_tokens_seen": 113486624, + "router_z_loss_mlp": 0.34960938, + "step": 1369, + "time_per_iteration": 2.8985280990600586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094199, + "balance_loss_mlp": 1.05931866, + "epoch": 0.2635629088110812, + "flos": 552136711680.0, + "grad_norm": 0.06493737783890446, + "language_loss": 0.88711715, + "learning_rate": 0.0008636472270127733, + "loss": 0.89805913, + "num_input_tokens_seen": 113555776, + "router_z_loss_mlp": 0.34936523, + "step": 1370, + "time_per_iteration": 2.634615182876587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090389, + "balance_loss_mlp": 1.05498338, + "epoch": 0.2637552904963448, + "flos": 455752839168.0, + "grad_norm": 0.062476231294863314, + "language_loss": 0.89913595, + "learning_rate": 0.0008634333369854345, + "loss": 0.91003978, + "num_input_tokens_seen": 113624208, + "router_z_loss_mlp": 0.35424805, + "step": 1371, + "time_per_iteration": 2.5908331871032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082759, + "balance_loss_mlp": 1.04818797, + "epoch": 0.2639476721816083, + "flos": 612847323648.0, + "grad_norm": 0.05509554660574217, + "language_loss": 0.87495965, + "learning_rate": 0.0008632193058607608, + "loss": 0.88578725, + "num_input_tokens_seen": 113698544, + "router_z_loss_mlp": 0.34594727, + "step": 1372, + "time_per_iteration": 2.6963188648223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108436, + "balance_loss_mlp": 1.04878759, + "epoch": 0.2641400538668719, + "flos": 571645112832.0, + "grad_norm": 0.05982264925210271, + "language_loss": 0.81028771, + "learning_rate": 0.0008630051337218466, + "loss": 0.82113135, + "num_input_tokens_seen": 113769024, + "router_z_loss_mlp": 0.35595703, + "step": 1373, + "time_per_iteration": 2.644540786743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079985, + "balance_loss_mlp": 1.04582, + "epoch": 0.2643324355521354, + "flos": 581979866112.0, + "grad_norm": 0.08561984623812412, + "language_loss": 0.82428128, + "learning_rate": 0.0008627908206518409, + "loss": 0.8350811, + "num_input_tokens_seen": 113836320, + "router_z_loss_mlp": 0.34179688, + "step": 1374, + "time_per_iteration": 2.660578966140747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110354, + "balance_loss_mlp": 1.08904421, + "epoch": 0.264524817237399, + "flos": 1543845284352.0, + "grad_norm": 0.03725698642258328, + "language_loss": 0.75151253, + "learning_rate": 0.0008625763667339472, + "loss": 0.76254791, + "num_input_tokens_seen": 114065040, + "router_z_loss_mlp": 0.14453125, + "step": 1375, + "time_per_iteration": 4.9595324993133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079461, + "balance_loss_mlp": 1.04474711, + "epoch": 0.26471719892266254, + "flos": 517783117824.0, + "grad_norm": 0.05493851972821551, + "language_loss": 0.91330564, + "learning_rate": 0.0008623617720514241, + "loss": 0.92410028, + "num_input_tokens_seen": 114133488, + "router_z_loss_mlp": 0.34741211, + "step": 1376, + "time_per_iteration": 2.5929205417633057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090244, + "balance_loss_mlp": 1.05498242, + "epoch": 0.26490958060792613, + "flos": 516936314880.0, + "grad_norm": 0.08106601153347975, + "language_loss": 0.84946424, + "learning_rate": 0.0008621470366875848, + "loss": 0.8603667, + "num_input_tokens_seen": 114200704, + "router_z_loss_mlp": 0.3527832, + "step": 1377, + "time_per_iteration": 2.5729684829711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084439, + "balance_loss_mlp": 1.0497725, + "epoch": 0.26510196229318966, + "flos": 596298350592.0, + "grad_norm": 0.05588669268878349, + "language_loss": 0.87771004, + "learning_rate": 0.0008619321607257966, + "loss": 0.88855445, + "num_input_tokens_seen": 114272160, + "router_z_loss_mlp": 0.34692383, + "step": 1378, + "time_per_iteration": 2.6708004474639893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082921, + "balance_loss_mlp": 1.0483501, + "epoch": 0.26529434397845325, + "flos": 685488780288.0, + "grad_norm": 0.051774701706919043, + "language_loss": 0.82311988, + "learning_rate": 0.000861717144249482, + "loss": 0.83394915, + "num_input_tokens_seen": 114347904, + "router_z_loss_mlp": 0.34594727, + "step": 1379, + "time_per_iteration": 2.8249831199645996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081979, + "balance_loss_mlp": 1.0468595, + "epoch": 0.26548672566371684, + "flos": 424127328768.0, + "grad_norm": 0.06288210815556809, + "language_loss": 0.90205348, + "learning_rate": 0.0008615019873421175, + "loss": 0.91287327, + "num_input_tokens_seen": 114409952, + "router_z_loss_mlp": 0.35131836, + "step": 1380, + "time_per_iteration": 2.455320358276367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108606, + "balance_loss_mlp": 1.05108428, + "epoch": 0.26567910734898037, + "flos": 489619012608.0, + "grad_norm": 0.05393715583789803, + "language_loss": 0.8609767, + "learning_rate": 0.0008612866900872349, + "loss": 0.87183726, + "num_input_tokens_seen": 114474832, + "router_z_loss_mlp": 0.35009766, + "step": 1381, + "time_per_iteration": 2.54070782661438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083002, + "balance_loss_mlp": 1.04895568, + "epoch": 0.26587148903424396, + "flos": 533947977216.0, + "grad_norm": 0.05290962754614328, + "language_loss": 0.88052452, + "learning_rate": 0.0008610712525684197, + "loss": 0.8913545, + "num_input_tokens_seen": 114545152, + "router_z_loss_mlp": 0.34082031, + "step": 1382, + "time_per_iteration": 2.6350595951080322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084878, + "balance_loss_mlp": 1.05049801, + "epoch": 0.2660638707195075, + "flos": 1017067732992.0, + "grad_norm": 0.06267977315545337, + "language_loss": 0.84534729, + "learning_rate": 0.0008608556748693121, + "loss": 0.85619605, + "num_input_tokens_seen": 114626512, + "router_z_loss_mlp": 0.34423828, + "step": 1383, + "time_per_iteration": 3.231172561645508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086499, + "balance_loss_mlp": 1.05216646, + "epoch": 0.2662562524047711, + "flos": 523712148480.0, + "grad_norm": 0.0585640776606728, + "language_loss": 0.86247015, + "learning_rate": 0.000860639957073607, + "loss": 0.87333512, + "num_input_tokens_seen": 114701008, + "router_z_loss_mlp": 0.34375, + "step": 1384, + "time_per_iteration": 2.72265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108718, + "balance_loss_mlp": 1.05280018, + "epoch": 0.2664486340900346, + "flos": 552107598336.0, + "grad_norm": 0.07312693577598182, + "language_loss": 0.87888551, + "learning_rate": 0.0008604240992650534, + "loss": 0.88975734, + "num_input_tokens_seen": 114771984, + "router_z_loss_mlp": 0.34423828, + "step": 1385, + "time_per_iteration": 2.6524593830108643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079729, + "balance_loss_mlp": 1.0455637, + "epoch": 0.2666410157752982, + "flos": 469895233536.0, + "grad_norm": 0.058731941016447735, + "language_loss": 0.89070451, + "learning_rate": 0.0008602081015274545, + "loss": 0.90150183, + "num_input_tokens_seen": 114844800, + "router_z_loss_mlp": 0.34179688, + "step": 1386, + "time_per_iteration": 2.7026963233947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083917, + "balance_loss_mlp": 1.04953694, + "epoch": 0.2668333974605617, + "flos": 569645968896.0, + "grad_norm": 0.04572049987167494, + "language_loss": 0.83031899, + "learning_rate": 0.0008599919639446684, + "loss": 0.84115815, + "num_input_tokens_seen": 114918544, + "router_z_loss_mlp": 0.34423828, + "step": 1387, + "time_per_iteration": 2.6891515254974365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083945, + "balance_loss_mlp": 1.04920745, + "epoch": 0.2670257791458253, + "flos": 398755325952.0, + "grad_norm": 0.06113709644372323, + "language_loss": 0.80263156, + "learning_rate": 0.000859775686600607, + "loss": 0.81347102, + "num_input_tokens_seen": 114984272, + "router_z_loss_mlp": 0.34765625, + "step": 1388, + "time_per_iteration": 2.5367043018341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088195, + "balance_loss_mlp": 1.05400586, + "epoch": 0.2672181608310889, + "flos": 515587534848.0, + "grad_norm": 0.07715457421599592, + "language_loss": 0.85045016, + "learning_rate": 0.0008595592695792367, + "loss": 0.86133218, + "num_input_tokens_seen": 115054800, + "router_z_loss_mlp": 0.34228516, + "step": 1389, + "time_per_iteration": 2.653684139251709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097788, + "balance_loss_mlp": 1.06348014, + "epoch": 0.26741054251635243, + "flos": 507270864384.0, + "grad_norm": 0.05276083290405683, + "language_loss": 0.9085412, + "learning_rate": 0.0008593427129645778, + "loss": 0.91951907, + "num_input_tokens_seen": 115120928, + "router_z_loss_mlp": 0.34350586, + "step": 1390, + "time_per_iteration": 2.5497426986694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100169, + "balance_loss_mlp": 1.06512117, + "epoch": 0.267602924201616, + "flos": 576357783552.0, + "grad_norm": 0.0907689109524766, + "language_loss": 0.85371816, + "learning_rate": 0.0008591260168407052, + "loss": 0.86471987, + "num_input_tokens_seen": 115196688, + "router_z_loss_mlp": 0.35083008, + "step": 1391, + "time_per_iteration": 2.752777576446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099505, + "balance_loss_mlp": 1.06598353, + "epoch": 0.26779530588687955, + "flos": 523731087360.0, + "grad_norm": 0.05201595058269412, + "language_loss": 0.83216429, + "learning_rate": 0.0008589091812917479, + "loss": 0.84315932, + "num_input_tokens_seen": 115264912, + "router_z_loss_mlp": 0.33544922, + "step": 1392, + "time_per_iteration": 2.602858781814575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092875, + "balance_loss_mlp": 1.05897164, + "epoch": 0.26798768757214314, + "flos": 556508938752.0, + "grad_norm": 0.054199587407170555, + "language_loss": 0.85476619, + "learning_rate": 0.0008586922064018887, + "loss": 0.86569488, + "num_input_tokens_seen": 115334672, + "router_z_loss_mlp": 0.33911133, + "step": 1393, + "time_per_iteration": 2.663135528564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110003, + "balance_loss_mlp": 1.0641005, + "epoch": 0.2681800692574067, + "flos": 930246004224.0, + "grad_norm": 0.05606615550920643, + "language_loss": 0.89228028, + "learning_rate": 0.0008584750922553651, + "loss": 0.90328062, + "num_input_tokens_seen": 115420032, + "router_z_loss_mlp": 0.35961914, + "step": 1394, + "time_per_iteration": 3.126030206680298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094414, + "balance_loss_mlp": 1.06020141, + "epoch": 0.26837245094267026, + "flos": 700771931136.0, + "grad_norm": 0.055333821001128054, + "language_loss": 0.83724457, + "learning_rate": 0.0008582578389364677, + "loss": 0.84818876, + "num_input_tokens_seen": 115492576, + "router_z_loss_mlp": 0.3425293, + "step": 1395, + "time_per_iteration": 2.858774423599243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096924, + "balance_loss_mlp": 1.06187642, + "epoch": 0.26856483262793385, + "flos": 592892199936.0, + "grad_norm": 0.04773968262798697, + "language_loss": 0.9195987, + "learning_rate": 0.0008580404465295422, + "loss": 0.93056792, + "num_input_tokens_seen": 115568368, + "router_z_loss_mlp": 0.35058594, + "step": 1396, + "time_per_iteration": 2.7737646102905273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096258, + "balance_loss_mlp": 1.06190252, + "epoch": 0.2687572143131974, + "flos": 713943866880.0, + "grad_norm": 0.07288281155022573, + "language_loss": 0.88208908, + "learning_rate": 0.0008578229151189876, + "loss": 0.89305162, + "num_input_tokens_seen": 115651536, + "router_z_loss_mlp": 0.34375, + "step": 1397, + "time_per_iteration": 2.9974029064178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087674, + "balance_loss_mlp": 1.05441451, + "epoch": 0.26894959599846097, + "flos": 467481452544.0, + "grad_norm": 0.0581153622766974, + "language_loss": 0.81433654, + "learning_rate": 0.0008576052447892573, + "loss": 0.82521319, + "num_input_tokens_seen": 115715696, + "router_z_loss_mlp": 0.33276367, + "step": 1398, + "time_per_iteration": 2.586427688598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090085, + "balance_loss_mlp": 1.05551457, + "epoch": 0.2691419776837245, + "flos": 468470850048.0, + "grad_norm": 0.08083264737918114, + "language_loss": 0.86589479, + "learning_rate": 0.000857387435624858, + "loss": 0.87679559, + "num_input_tokens_seen": 115780928, + "router_z_loss_mlp": 0.34619141, + "step": 1399, + "time_per_iteration": 2.5227789878845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096966, + "balance_loss_mlp": 1.06239533, + "epoch": 0.2693343593689881, + "flos": 937244418048.0, + "grad_norm": 0.0443934808912178, + "language_loss": 0.88525635, + "learning_rate": 0.0008571694877103513, + "loss": 0.89622605, + "num_input_tokens_seen": 115874432, + "router_z_loss_mlp": 0.34594727, + "step": 1400, + "time_per_iteration": 3.252573251724243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095335, + "balance_loss_mlp": 1.06064546, + "epoch": 0.2695267410542516, + "flos": 577303511040.0, + "grad_norm": 0.05297583192015558, + "language_loss": 0.87603962, + "learning_rate": 0.0008569514011303515, + "loss": 0.88699305, + "num_input_tokens_seen": 115956608, + "router_z_loss_mlp": 0.34716797, + "step": 1401, + "time_per_iteration": 2.7824223041534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092716, + "balance_loss_mlp": 1.0577879, + "epoch": 0.2697191227395152, + "flos": 556539462144.0, + "grad_norm": 0.06414718170709632, + "language_loss": 0.87859815, + "learning_rate": 0.0008567331759695277, + "loss": 0.88952529, + "num_input_tokens_seen": 116031728, + "router_z_loss_mlp": 0.34985352, + "step": 1402, + "time_per_iteration": 2.7109498977661133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098142, + "balance_loss_mlp": 1.06183052, + "epoch": 0.26991150442477874, + "flos": 529024310784.0, + "grad_norm": 0.05462837975359106, + "language_loss": 0.86148876, + "learning_rate": 0.0008565148123126023, + "loss": 0.87247014, + "num_input_tokens_seen": 116104288, + "router_z_loss_mlp": 0.36328125, + "step": 1403, + "time_per_iteration": 2.6526310443878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088301, + "balance_loss_mlp": 1.05289555, + "epoch": 0.2701038861100423, + "flos": 531737837568.0, + "grad_norm": 0.12276519374226595, + "language_loss": 0.86177158, + "learning_rate": 0.0008562963102443516, + "loss": 0.87265456, + "num_input_tokens_seen": 116177920, + "router_z_loss_mlp": 0.35400391, + "step": 1404, + "time_per_iteration": 2.6809849739074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092814, + "balance_loss_mlp": 1.05757618, + "epoch": 0.2702962677953059, + "flos": 734908737024.0, + "grad_norm": 0.05743337882617235, + "language_loss": 0.85265231, + "learning_rate": 0.0008560776698496056, + "loss": 0.86358047, + "num_input_tokens_seen": 116251680, + "router_z_loss_mlp": 0.3527832, + "step": 1405, + "time_per_iteration": 2.9008774757385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099947, + "balance_loss_mlp": 1.06420779, + "epoch": 0.27048864948056944, + "flos": 574453181952.0, + "grad_norm": 0.06281004106283315, + "language_loss": 0.85864103, + "learning_rate": 0.0008558588912132481, + "loss": 0.86964047, + "num_input_tokens_seen": 116327664, + "router_z_loss_mlp": 0.35742188, + "step": 1406, + "time_per_iteration": 2.8967840671539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057025, + "balance_loss_mlp": 1.04519963, + "epoch": 0.27068103116583303, + "flos": 1423091953152.0, + "grad_norm": 0.03126478371356873, + "language_loss": 0.76458991, + "learning_rate": 0.0008556399744202163, + "loss": 0.77516007, + "num_input_tokens_seen": 116555152, + "router_z_loss_mlp": 0.11816406, + "step": 1407, + "time_per_iteration": 4.933698892593384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109874, + "balance_loss_mlp": 1.06400251, + "epoch": 0.27087341285109656, + "flos": 531742219776.0, + "grad_norm": 0.050597666424933845, + "language_loss": 0.82942683, + "learning_rate": 0.0008554209195555016, + "loss": 0.84041423, + "num_input_tokens_seen": 116626016, + "router_z_loss_mlp": 0.34765625, + "step": 1408, + "time_per_iteration": 2.6599888801574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093614, + "balance_loss_mlp": 1.0582329, + "epoch": 0.27106579453636015, + "flos": 581108332032.0, + "grad_norm": 0.058412744436649705, + "language_loss": 0.88199335, + "learning_rate": 0.0008552017267041483, + "loss": 0.89292949, + "num_input_tokens_seen": 116699152, + "router_z_loss_mlp": 0.35375977, + "step": 1409, + "time_per_iteration": 2.673828363418579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091257, + "balance_loss_mlp": 1.05563736, + "epoch": 0.2712581762216237, + "flos": 506533160448.0, + "grad_norm": 0.05246666988179206, + "language_loss": 0.83264577, + "learning_rate": 0.0008549823959512549, + "loss": 0.84355831, + "num_input_tokens_seen": 116770912, + "router_z_loss_mlp": 0.35644531, + "step": 1410, + "time_per_iteration": 2.634523868560791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091517, + "balance_loss_mlp": 1.05451441, + "epoch": 0.27145055790688727, + "flos": 997019476992.0, + "grad_norm": 0.050905982394943275, + "language_loss": 0.86668658, + "learning_rate": 0.0008547629273819728, + "loss": 0.87760168, + "num_input_tokens_seen": 116863088, + "router_z_loss_mlp": 0.36987305, + "step": 1411, + "time_per_iteration": 3.3559322357177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094566, + "balance_loss_mlp": 1.05875564, + "epoch": 0.2716429395921508, + "flos": 546420086784.0, + "grad_norm": 0.06363965087638479, + "language_loss": 0.83773881, + "learning_rate": 0.0008545433210815074, + "loss": 0.84868449, + "num_input_tokens_seen": 116929504, + "router_z_loss_mlp": 0.35839844, + "step": 1412, + "time_per_iteration": 2.607379913330078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109226, + "balance_loss_mlp": 1.05606771, + "epoch": 0.2718353212774144, + "flos": 572954605056.0, + "grad_norm": 0.05881941163427475, + "language_loss": 0.87753916, + "learning_rate": 0.0008543235771351176, + "loss": 0.88846171, + "num_input_tokens_seen": 117004064, + "router_z_loss_mlp": 0.36230469, + "step": 1413, + "time_per_iteration": 2.722318649291992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096323, + "balance_loss_mlp": 1.06118035, + "epoch": 0.272027702962678, + "flos": 643986823680.0, + "grad_norm": 0.044269909609048815, + "language_loss": 0.84649068, + "learning_rate": 0.0008541036956281154, + "loss": 0.85745388, + "num_input_tokens_seen": 117081328, + "router_z_loss_mlp": 0.3515625, + "step": 1414, + "time_per_iteration": 2.8785104751586914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100326, + "balance_loss_mlp": 1.0645628, + "epoch": 0.2722200846479415, + "flos": 653410755072.0, + "grad_norm": 0.0658433573318433, + "language_loss": 0.82281864, + "learning_rate": 0.0008538836766458665, + "loss": 0.83382189, + "num_input_tokens_seen": 117156544, + "router_z_loss_mlp": 0.35791016, + "step": 1415, + "time_per_iteration": 2.834148645401001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098566, + "balance_loss_mlp": 1.06311321, + "epoch": 0.2724124663332051, + "flos": 579346324992.0, + "grad_norm": 0.07330345680392343, + "language_loss": 0.85275221, + "learning_rate": 0.0008536635202737897, + "loss": 0.86373788, + "num_input_tokens_seen": 117230208, + "router_z_loss_mlp": 0.35473633, + "step": 1416, + "time_per_iteration": 2.7886626720428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099993, + "balance_loss_mlp": 1.06513667, + "epoch": 0.2726048480184686, + "flos": 537178037760.0, + "grad_norm": 0.06667202152625065, + "language_loss": 0.82212454, + "learning_rate": 0.0008534432265973573, + "loss": 0.8331244, + "num_input_tokens_seen": 117298080, + "router_z_loss_mlp": 0.34912109, + "step": 1417, + "time_per_iteration": 2.604626417160034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095149, + "balance_loss_mlp": 1.05931497, + "epoch": 0.2727972297037322, + "flos": 995360776704.0, + "grad_norm": 0.08172035912068172, + "language_loss": 0.88052338, + "learning_rate": 0.000853222795702095, + "loss": 0.8914749, + "num_input_tokens_seen": 117396256, + "router_z_loss_mlp": 0.35839844, + "step": 1418, + "time_per_iteration": 3.391664505004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095275, + "balance_loss_mlp": 1.06125307, + "epoch": 0.27298961138899575, + "flos": 605924513280.0, + "grad_norm": 0.05480231780963067, + "language_loss": 0.83608377, + "learning_rate": 0.0008530022276735813, + "loss": 0.84703648, + "num_input_tokens_seen": 117467936, + "router_z_loss_mlp": 0.34057617, + "step": 1419, + "time_per_iteration": 2.705235004425049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107985, + "balance_loss_mlp": 1.04666257, + "epoch": 0.27318199307425933, + "flos": 529059216384.0, + "grad_norm": 0.054542785174291425, + "language_loss": 0.85957551, + "learning_rate": 0.0008527815225974489, + "loss": 0.87037402, + "num_input_tokens_seen": 117538256, + "router_z_loss_mlp": 0.33203125, + "step": 1420, + "time_per_iteration": 2.654003620147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087931, + "balance_loss_mlp": 1.05395651, + "epoch": 0.2733743747595229, + "flos": 408809272320.0, + "grad_norm": 0.06460584893454492, + "language_loss": 0.88538897, + "learning_rate": 0.0008525606805593829, + "loss": 0.89626825, + "num_input_tokens_seen": 117599488, + "router_z_loss_mlp": 0.33984375, + "step": 1421, + "time_per_iteration": 2.4287912845611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087097, + "balance_loss_mlp": 1.05233574, + "epoch": 0.27356675644478645, + "flos": 515976030720.0, + "grad_norm": 0.055753761808712644, + "language_loss": 0.82379127, + "learning_rate": 0.0008523397016451213, + "loss": 0.8346622, + "num_input_tokens_seen": 117664240, + "router_z_loss_mlp": 0.34814453, + "step": 1422, + "time_per_iteration": 2.5620808601379395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096714, + "balance_loss_mlp": 1.0617379, + "epoch": 0.27375913813005004, + "flos": 1051914539520.0, + "grad_norm": 0.0481984630724129, + "language_loss": 0.87272507, + "learning_rate": 0.0008521185859404564, + "loss": 0.88369215, + "num_input_tokens_seen": 117754768, + "router_z_loss_mlp": 0.34985352, + "step": 1423, + "time_per_iteration": 3.361171245574951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085067, + "balance_loss_mlp": 1.05037737, + "epoch": 0.27395151981531357, + "flos": 624507535872.0, + "grad_norm": 0.05502068897485729, + "language_loss": 0.89717311, + "learning_rate": 0.0008518973335312326, + "loss": 0.90802383, + "num_input_tokens_seen": 117832816, + "router_z_loss_mlp": 0.34716797, + "step": 1424, + "time_per_iteration": 2.7964961528778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088932, + "balance_loss_mlp": 1.05390823, + "epoch": 0.27414390150057716, + "flos": 550112836608.0, + "grad_norm": 0.056708357312241935, + "language_loss": 0.83878243, + "learning_rate": 0.0008516759445033477, + "loss": 0.84967172, + "num_input_tokens_seen": 117899168, + "router_z_loss_mlp": 0.35058594, + "step": 1425, + "time_per_iteration": 2.6100170612335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090578, + "balance_loss_mlp": 1.05603087, + "epoch": 0.2743362831858407, + "flos": 539596200960.0, + "grad_norm": 0.061048707375716146, + "language_loss": 0.84983361, + "learning_rate": 0.0008514544189427526, + "loss": 0.86073935, + "num_input_tokens_seen": 117972384, + "router_z_loss_mlp": 0.34594727, + "step": 1426, + "time_per_iteration": 2.6465015411376953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088002, + "balance_loss_mlp": 1.05347919, + "epoch": 0.2745286648711043, + "flos": 468352986624.0, + "grad_norm": 0.061383055639382046, + "language_loss": 0.8704657, + "learning_rate": 0.0008512327569354511, + "loss": 0.88134569, + "num_input_tokens_seen": 118039584, + "router_z_loss_mlp": 0.34570312, + "step": 1427, + "time_per_iteration": 2.5696229934692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087297, + "balance_loss_mlp": 1.05212998, + "epoch": 0.2747210465563678, + "flos": 472617524736.0, + "grad_norm": 0.05941983459852813, + "language_loss": 0.8349936, + "learning_rate": 0.0008510109585675001, + "loss": 0.84586656, + "num_input_tokens_seen": 118108352, + "router_z_loss_mlp": 0.35180664, + "step": 1428, + "time_per_iteration": 2.6195123195648193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086947, + "balance_loss_mlp": 1.07245111, + "epoch": 0.2749134282416314, + "flos": 1314345070080.0, + "grad_norm": 0.037284634304165044, + "language_loss": 0.81153345, + "learning_rate": 0.0008507890239250093, + "loss": 0.82240289, + "num_input_tokens_seen": 118331120, + "router_z_loss_mlp": 0.14453125, + "step": 1429, + "time_per_iteration": 4.714681625366211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083808, + "balance_loss_mlp": 1.04938078, + "epoch": 0.275105809926895, + "flos": 970445670912.0, + "grad_norm": 0.07686972857934649, + "language_loss": 0.80942416, + "learning_rate": 0.0008505669530941415, + "loss": 0.82026225, + "num_input_tokens_seen": 118415872, + "router_z_loss_mlp": 0.34472656, + "step": 1430, + "time_per_iteration": 3.2975006103515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080869, + "balance_loss_mlp": 1.04715657, + "epoch": 0.2752981916121585, + "flos": 527089185792.0, + "grad_norm": 0.061626933195079385, + "language_loss": 0.84357536, + "learning_rate": 0.000850344746161112, + "loss": 0.85438406, + "num_input_tokens_seen": 118483008, + "router_z_loss_mlp": 0.33740234, + "step": 1431, + "time_per_iteration": 2.596623182296753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079512, + "balance_loss_mlp": 1.04487014, + "epoch": 0.2754905732974221, + "flos": 453487444992.0, + "grad_norm": 0.05883177646218185, + "language_loss": 0.87880194, + "learning_rate": 0.0008501224032121894, + "loss": 0.88959706, + "num_input_tokens_seen": 118545840, + "router_z_loss_mlp": 0.34692383, + "step": 1432, + "time_per_iteration": 2.5134201049804688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082699, + "balance_loss_mlp": 1.04817569, + "epoch": 0.27568295498268564, + "flos": 497216918016.0, + "grad_norm": 0.05235854639463291, + "language_loss": 0.82002538, + "learning_rate": 0.0008498999243336946, + "loss": 0.83085239, + "num_input_tokens_seen": 118615168, + "router_z_loss_mlp": 0.34570312, + "step": 1433, + "time_per_iteration": 2.601771593093872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095225, + "balance_loss_mlp": 1.06060696, + "epoch": 0.2758753366679492, + "flos": 607890161664.0, + "grad_norm": 0.05891633941102979, + "language_loss": 0.87444806, + "learning_rate": 0.0008496773096120021, + "loss": 0.8854003, + "num_input_tokens_seen": 118690384, + "router_z_loss_mlp": 0.34643555, + "step": 1434, + "time_per_iteration": 2.788516044616699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096763, + "balance_loss_mlp": 1.06212115, + "epoch": 0.27606771835321275, + "flos": 739803290112.0, + "grad_norm": 0.06770297286276174, + "language_loss": 0.84185004, + "learning_rate": 0.0008494545591335381, + "loss": 0.85281765, + "num_input_tokens_seen": 118763024, + "router_z_loss_mlp": 0.34667969, + "step": 1435, + "time_per_iteration": 2.8759751319885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110127, + "balance_loss_mlp": 1.06736696, + "epoch": 0.27626010003847634, + "flos": 554279860224.0, + "grad_norm": 0.04450223786838935, + "language_loss": 0.87244952, + "learning_rate": 0.0008492316729847823, + "loss": 0.88346225, + "num_input_tokens_seen": 118845536, + "router_z_loss_mlp": 0.33935547, + "step": 1436, + "time_per_iteration": 2.781244993209839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094118, + "balance_loss_mlp": 1.06023908, + "epoch": 0.2764524817237399, + "flos": 542270439936.0, + "grad_norm": 0.055325808882979444, + "language_loss": 0.79874223, + "learning_rate": 0.0008490086512522664, + "loss": 0.80968338, + "num_input_tokens_seen": 118919008, + "router_z_loss_mlp": 0.33911133, + "step": 1437, + "time_per_iteration": 2.7197165489196777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093138, + "balance_loss_mlp": 1.05913925, + "epoch": 0.27664486340900346, + "flos": 406027344384.0, + "grad_norm": 0.0539948754920925, + "language_loss": 0.90713239, + "learning_rate": 0.0008487854940225755, + "loss": 0.91806382, + "num_input_tokens_seen": 118981376, + "router_z_loss_mlp": 0.34008789, + "step": 1438, + "time_per_iteration": 2.438218593597412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094602, + "balance_loss_mlp": 1.06017423, + "epoch": 0.27683724509426705, + "flos": 521884712448.0, + "grad_norm": 0.06140365718889793, + "language_loss": 0.90140885, + "learning_rate": 0.0008485622013823466, + "loss": 0.91235483, + "num_input_tokens_seen": 119050560, + "router_z_loss_mlp": 0.34448242, + "step": 1439, + "time_per_iteration": 2.653393268585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102393, + "balance_loss_mlp": 1.06770289, + "epoch": 0.2770296267795306, + "flos": 535085761536.0, + "grad_norm": 0.07554461134761571, + "language_loss": 0.8283006, + "learning_rate": 0.00084833877341827, + "loss": 0.83932453, + "num_input_tokens_seen": 119121104, + "router_z_loss_mlp": 0.34692383, + "step": 1440, + "time_per_iteration": 2.6312928199768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109947, + "balance_loss_mlp": 1.06587648, + "epoch": 0.27722200846479417, + "flos": 487747906560.0, + "grad_norm": 0.12145939933268801, + "language_loss": 0.8064183, + "learning_rate": 0.000848115210217088, + "loss": 0.81741297, + "num_input_tokens_seen": 119187712, + "router_z_loss_mlp": 0.33618164, + "step": 1441, + "time_per_iteration": 2.5490710735321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086774, + "balance_loss_mlp": 1.05167925, + "epoch": 0.2774143901500577, + "flos": 618012509184.0, + "grad_norm": 0.05766268366580332, + "language_loss": 0.82057106, + "learning_rate": 0.0008478915118655952, + "loss": 0.83143878, + "num_input_tokens_seen": 119259264, + "router_z_loss_mlp": 0.35131836, + "step": 1442, + "time_per_iteration": 2.710240602493286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086605, + "balance_loss_mlp": 1.05160558, + "epoch": 0.2776067718353213, + "flos": 513563659776.0, + "grad_norm": 0.05774564569051742, + "language_loss": 0.86505657, + "learning_rate": 0.0008476676784506393, + "loss": 0.87592262, + "num_input_tokens_seen": 119328304, + "router_z_loss_mlp": 0.35009766, + "step": 1443, + "time_per_iteration": 2.636622667312622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108686, + "balance_loss_mlp": 1.0526228, + "epoch": 0.2777991535205848, + "flos": 1003985957376.0, + "grad_norm": 0.10311001825576924, + "language_loss": 0.82024419, + "learning_rate": 0.0008474437100591201, + "loss": 0.8311128, + "num_input_tokens_seen": 119412352, + "router_z_loss_mlp": 0.34277344, + "step": 1444, + "time_per_iteration": 3.282383918762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091274, + "balance_loss_mlp": 1.05517697, + "epoch": 0.2779915352058484, + "flos": 550005147648.0, + "grad_norm": 0.05151300271624721, + "language_loss": 0.85496646, + "learning_rate": 0.0008472196067779898, + "loss": 0.86587918, + "num_input_tokens_seen": 119484464, + "router_z_loss_mlp": 0.36108398, + "step": 1445, + "time_per_iteration": 2.703263998031616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092087, + "balance_loss_mlp": 1.05591917, + "epoch": 0.278183916891112, + "flos": 873444930048.0, + "grad_norm": 0.06736388569990436, + "language_loss": 0.85432607, + "learning_rate": 0.0008469953686942531, + "loss": 0.86524689, + "num_input_tokens_seen": 119557280, + "router_z_loss_mlp": 0.36181641, + "step": 1446, + "time_per_iteration": 3.0834743976593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087224, + "balance_loss_mlp": 1.05305839, + "epoch": 0.2783762985763755, + "flos": 623782978560.0, + "grad_norm": 0.06536474240751361, + "language_loss": 0.83167183, + "learning_rate": 0.0008467709958949668, + "loss": 0.84254414, + "num_input_tokens_seen": 119631232, + "router_z_loss_mlp": 0.34204102, + "step": 1447, + "time_per_iteration": 2.737135887145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087199, + "balance_loss_mlp": 1.05362952, + "epoch": 0.2785686802616391, + "flos": 581571021312.0, + "grad_norm": 0.057056565872365954, + "language_loss": 0.85917461, + "learning_rate": 0.0008465464884672403, + "loss": 0.87004662, + "num_input_tokens_seen": 119700224, + "router_z_loss_mlp": 0.33569336, + "step": 1448, + "time_per_iteration": 2.6771810054779053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087224, + "balance_loss_mlp": 1.05346394, + "epoch": 0.27876106194690264, + "flos": 587032980480.0, + "grad_norm": 0.06237565084734976, + "language_loss": 0.85356677, + "learning_rate": 0.0008463218464982348, + "loss": 0.86443901, + "num_input_tokens_seen": 119781376, + "router_z_loss_mlp": 0.33789062, + "step": 1449, + "time_per_iteration": 2.799407720565796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086336, + "balance_loss_mlp": 1.05228984, + "epoch": 0.27895344363216623, + "flos": 875621574144.0, + "grad_norm": 0.06450477685794259, + "language_loss": 0.87800258, + "learning_rate": 0.0008460970700751645, + "loss": 0.88886595, + "num_input_tokens_seen": 119856672, + "router_z_loss_mlp": 0.34057617, + "step": 1450, + "time_per_iteration": 3.0517759323120117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086942, + "balance_loss_mlp": 1.05322921, + "epoch": 0.27914582531742976, + "flos": 603630005760.0, + "grad_norm": 0.06893143963997089, + "language_loss": 0.87761652, + "learning_rate": 0.000845872159285295, + "loss": 0.88848597, + "num_input_tokens_seen": 119929008, + "router_z_loss_mlp": 0.33740234, + "step": 1451, + "time_per_iteration": 2.6964316368103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042481, + "balance_loss_mlp": 1.0276041, + "epoch": 0.27933820700269335, + "flos": 1496892953088.0, + "grad_norm": 0.0242162718076618, + "language_loss": 0.77766848, + "learning_rate": 0.0008456471142159447, + "loss": 0.78809333, + "num_input_tokens_seen": 120164032, + "router_z_loss_mlp": 0.1484375, + "step": 1452, + "time_per_iteration": 4.936378717422485 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085289, + "balance_loss_mlp": 1.05162406, + "epoch": 0.2795305886879569, + "flos": 1031445854208.0, + "grad_norm": 0.05721806240601363, + "language_loss": 0.86067116, + "learning_rate": 0.0008454219349544836, + "loss": 0.87152404, + "num_input_tokens_seen": 120246784, + "router_z_loss_mlp": 0.33691406, + "step": 1453, + "time_per_iteration": 3.336336135864258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086797, + "balance_loss_mlp": 1.053442, + "epoch": 0.27972297037322047, + "flos": 606766934016.0, + "grad_norm": 0.056433536115445035, + "language_loss": 0.81829166, + "learning_rate": 0.000845196621588334, + "loss": 0.82915968, + "num_input_tokens_seen": 120318208, + "router_z_loss_mlp": 0.33374023, + "step": 1454, + "time_per_iteration": 2.7415192127227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088727, + "balance_loss_mlp": 1.05475271, + "epoch": 0.27991535205848406, + "flos": 630085948416.0, + "grad_norm": 0.05700257056170363, + "language_loss": 0.76605666, + "learning_rate": 0.0008449711742049706, + "loss": 0.77694392, + "num_input_tokens_seen": 120393248, + "router_z_loss_mlp": 0.33984375, + "step": 1455, + "time_per_iteration": 2.755082130432129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093567, + "balance_loss_mlp": 1.06035542, + "epoch": 0.2801077337437476, + "flos": 549034689024.0, + "grad_norm": 0.056412270826162, + "language_loss": 0.83750427, + "learning_rate": 0.0008447455928919196, + "loss": 0.84843993, + "num_input_tokens_seen": 120461040, + "router_z_loss_mlp": 0.33227539, + "step": 1456, + "time_per_iteration": 2.601306438446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097544, + "balance_loss_mlp": 1.06423688, + "epoch": 0.2803001154290112, + "flos": 486516989952.0, + "grad_norm": 0.08664389404831466, + "language_loss": 0.86875856, + "learning_rate": 0.0008445198777367595, + "loss": 0.87973404, + "num_input_tokens_seen": 120530400, + "router_z_loss_mlp": 0.33325195, + "step": 1457, + "time_per_iteration": 2.5534915924072266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106361, + "balance_loss_mlp": 1.07267249, + "epoch": 0.2804924971142747, + "flos": 521820693504.0, + "grad_norm": 0.060155581105879694, + "language_loss": 0.8096568, + "learning_rate": 0.0008442940288271208, + "loss": 0.82072043, + "num_input_tokens_seen": 120598304, + "router_z_loss_mlp": 0.33691406, + "step": 1458, + "time_per_iteration": 2.6646370887756348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108311, + "balance_loss_mlp": 1.07459903, + "epoch": 0.2806848787995383, + "flos": 527410690560.0, + "grad_norm": 0.05492641307724046, + "language_loss": 0.86995763, + "learning_rate": 0.0008440680462506856, + "loss": 0.88104069, + "num_input_tokens_seen": 120675712, + "router_z_loss_mlp": 0.33740234, + "step": 1459, + "time_per_iteration": 2.793306589126587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103222, + "balance_loss_mlp": 1.06927133, + "epoch": 0.2808772604848018, + "flos": 485246785536.0, + "grad_norm": 0.053370474172872176, + "language_loss": 0.86799729, + "learning_rate": 0.0008438419300951883, + "loss": 0.87902945, + "num_input_tokens_seen": 120746544, + "router_z_loss_mlp": 0.33984375, + "step": 1460, + "time_per_iteration": 2.6732945442199707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098403, + "balance_loss_mlp": 1.06488156, + "epoch": 0.2810696421700654, + "flos": 617840801280.0, + "grad_norm": 0.06081455520295947, + "language_loss": 0.86599934, + "learning_rate": 0.0008436156804484148, + "loss": 0.87698334, + "num_input_tokens_seen": 120823520, + "router_z_loss_mlp": 0.33544922, + "step": 1461, + "time_per_iteration": 2.768385410308838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087899, + "balance_loss_mlp": 1.05397177, + "epoch": 0.28126202385532895, + "flos": 454521922560.0, + "grad_norm": 0.061036272851527865, + "language_loss": 0.88221931, + "learning_rate": 0.0008433892973982031, + "loss": 0.89309829, + "num_input_tokens_seen": 120889568, + "router_z_loss_mlp": 0.33959961, + "step": 1462, + "time_per_iteration": 2.502269983291626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108859, + "balance_loss_mlp": 1.0546149, + "epoch": 0.28145440554059253, + "flos": 530447284224.0, + "grad_norm": 0.06533100110399645, + "language_loss": 0.85006338, + "learning_rate": 0.0008431627810324431, + "loss": 0.86094928, + "num_input_tokens_seen": 120958480, + "router_z_loss_mlp": 0.34008789, + "step": 1463, + "time_per_iteration": 2.6481637954711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097854, + "balance_loss_mlp": 1.06344974, + "epoch": 0.2816467872258561, + "flos": 451996070400.0, + "grad_norm": 0.053948569536927254, + "language_loss": 0.81259125, + "learning_rate": 0.000842936131439076, + "loss": 0.82356977, + "num_input_tokens_seen": 121028032, + "router_z_loss_mlp": 0.34423828, + "step": 1464, + "time_per_iteration": 2.598619222640991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100723, + "balance_loss_mlp": 1.06665313, + "epoch": 0.28183916891111965, + "flos": 472464755712.0, + "grad_norm": 0.06117554261618067, + "language_loss": 0.88043475, + "learning_rate": 0.0008427093487060951, + "loss": 0.89144206, + "num_input_tokens_seen": 121099280, + "router_z_loss_mlp": 0.34106445, + "step": 1465, + "time_per_iteration": 2.611067533493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092883, + "balance_loss_mlp": 1.05917072, + "epoch": 0.28203155059638324, + "flos": 556770806784.0, + "grad_norm": 0.05001896034653533, + "language_loss": 0.84742111, + "learning_rate": 0.000842482432921545, + "loss": 0.85834992, + "num_input_tokens_seen": 121180240, + "router_z_loss_mlp": 0.33740234, + "step": 1466, + "time_per_iteration": 2.8155059814453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109283, + "balance_loss_mlp": 1.05990458, + "epoch": 0.28222393228164677, + "flos": 416756385792.0, + "grad_norm": 0.06017010781955974, + "language_loss": 0.87132335, + "learning_rate": 0.0008422553841735225, + "loss": 0.88225162, + "num_input_tokens_seen": 121242736, + "router_z_loss_mlp": 0.3293457, + "step": 1467, + "time_per_iteration": 2.459348201751709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091932, + "balance_loss_mlp": 1.05731392, + "epoch": 0.28241631396691036, + "flos": 604629577728.0, + "grad_norm": 0.060074700521020694, + "language_loss": 0.84810078, + "learning_rate": 0.0008420282025501757, + "loss": 0.85902011, + "num_input_tokens_seen": 121319248, + "router_z_loss_mlp": 0.34643555, + "step": 1468, + "time_per_iteration": 2.7499678134918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086806, + "balance_loss_mlp": 1.05275989, + "epoch": 0.2826086956521739, + "flos": 572698529280.0, + "grad_norm": 0.05717030328031113, + "language_loss": 0.854882, + "learning_rate": 0.0008418008881397043, + "loss": 0.86575013, + "num_input_tokens_seen": 121392064, + "router_z_loss_mlp": 0.34082031, + "step": 1469, + "time_per_iteration": 2.6512861251831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080943, + "balance_loss_mlp": 1.04677796, + "epoch": 0.2828010773374375, + "flos": 842367886848.0, + "grad_norm": 0.05184982716140645, + "language_loss": 0.82590878, + "learning_rate": 0.0008415734410303595, + "loss": 0.8367182, + "num_input_tokens_seen": 121475984, + "router_z_loss_mlp": 0.34204102, + "step": 1470, + "time_per_iteration": 3.1787467002868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085547, + "balance_loss_mlp": 1.05214453, + "epoch": 0.28299345902270107, + "flos": 542402860032.0, + "grad_norm": 0.04590644458835405, + "language_loss": 0.90709066, + "learning_rate": 0.0008413458613104444, + "loss": 0.9179461, + "num_input_tokens_seen": 121551024, + "router_z_loss_mlp": 0.33447266, + "step": 1471, + "time_per_iteration": 2.6650707721710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092285, + "balance_loss_mlp": 1.05780995, + "epoch": 0.2831858407079646, + "flos": 571320635904.0, + "grad_norm": 0.05367648266979066, + "language_loss": 0.82737631, + "learning_rate": 0.0008411181490683129, + "loss": 0.83829916, + "num_input_tokens_seen": 121624528, + "router_z_loss_mlp": 0.34472656, + "step": 1472, + "time_per_iteration": 2.7423322200775146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104233, + "balance_loss_mlp": 1.06909025, + "epoch": 0.2833782223932282, + "flos": 763491861504.0, + "grad_norm": 0.05498194123694656, + "language_loss": 0.82467097, + "learning_rate": 0.0008408903043923707, + "loss": 0.83571333, + "num_input_tokens_seen": 121706736, + "router_z_loss_mlp": 0.35180664, + "step": 1473, + "time_per_iteration": 2.991787910461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114011, + "balance_loss_mlp": 1.07810485, + "epoch": 0.2835706040784917, + "flos": 538793068032.0, + "grad_norm": 0.05681509946110844, + "language_loss": 0.81401253, + "learning_rate": 0.0008406623273710754, + "loss": 0.82515264, + "num_input_tokens_seen": 121773008, + "router_z_loss_mlp": 0.35913086, + "step": 1474, + "time_per_iteration": 2.58866286277771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112011, + "balance_loss_mlp": 1.07732141, + "epoch": 0.2837629857637553, + "flos": 530329420800.0, + "grad_norm": 0.06008709036576614, + "language_loss": 0.82883334, + "learning_rate": 0.0008404342180929351, + "loss": 0.83995342, + "num_input_tokens_seen": 121840016, + "router_z_loss_mlp": 0.34716797, + "step": 1475, + "time_per_iteration": 2.636383295059204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112842, + "balance_loss_mlp": 1.07834268, + "epoch": 0.28395536744901884, + "flos": 539763526656.0, + "grad_norm": 0.06514959519071023, + "language_loss": 0.81725156, + "learning_rate": 0.00084020597664651, + "loss": 0.82837999, + "num_input_tokens_seen": 121915008, + "router_z_loss_mlp": 0.34521484, + "step": 1476, + "time_per_iteration": 2.7587718963623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106069, + "balance_loss_mlp": 1.0697813, + "epoch": 0.2841477491342824, + "flos": 573344510976.0, + "grad_norm": 0.0608139165355994, + "language_loss": 0.84204602, + "learning_rate": 0.0008399776031204111, + "loss": 0.85310674, + "num_input_tokens_seen": 121987456, + "router_z_loss_mlp": 0.36303711, + "step": 1477, + "time_per_iteration": 2.7376203536987305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097711, + "balance_loss_mlp": 1.06263912, + "epoch": 0.28434013081954596, + "flos": 571802264064.0, + "grad_norm": 0.06275845169868324, + "language_loss": 0.8026123, + "learning_rate": 0.0008397490976033009, + "loss": 0.81358939, + "num_input_tokens_seen": 122058720, + "router_z_loss_mlp": 0.35083008, + "step": 1478, + "time_per_iteration": 2.6391618251800537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0103776, + "balance_loss_mlp": 1.02412283, + "epoch": 0.28453251250480954, + "flos": 1552554832896.0, + "grad_norm": 0.016614249421738093, + "language_loss": 0.77879643, + "learning_rate": 0.000839520460183893, + "loss": 0.78917408, + "num_input_tokens_seen": 122285792, + "router_z_loss_mlp": 0.13671875, + "step": 1479, + "time_per_iteration": 4.730362176895142 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079858, + "balance_loss_mlp": 1.04550207, + "epoch": 0.28472489419007313, + "flos": 748720862208.0, + "grad_norm": 0.04873312803653651, + "language_loss": 0.85529596, + "learning_rate": 0.0008392916909509525, + "loss": 0.86609453, + "num_input_tokens_seen": 122366608, + "router_z_loss_mlp": 0.34399414, + "step": 1480, + "time_per_iteration": 3.0429892539978027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083924, + "balance_loss_mlp": 1.0495913, + "epoch": 0.28491727587533666, + "flos": 489914376192.0, + "grad_norm": 0.056617404906403615, + "language_loss": 0.85149431, + "learning_rate": 0.0008390627899932954, + "loss": 0.86233348, + "num_input_tokens_seen": 122435536, + "router_z_loss_mlp": 0.34375, + "step": 1481, + "time_per_iteration": 2.6355843544006348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083301, + "balance_loss_mlp": 1.04818201, + "epoch": 0.28510965756060025, + "flos": 728671196160.0, + "grad_norm": 0.06013951169928809, + "language_loss": 0.88358414, + "learning_rate": 0.000838833757399789, + "loss": 0.89441717, + "num_input_tokens_seen": 122515584, + "router_z_loss_mlp": 0.3515625, + "step": 1482, + "time_per_iteration": 2.9198856353759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088409, + "balance_loss_mlp": 1.05357623, + "epoch": 0.2853020392458638, + "flos": 551300083200.0, + "grad_norm": 0.06378715850004578, + "language_loss": 0.80512154, + "learning_rate": 0.0008386045932593515, + "loss": 0.81600565, + "num_input_tokens_seen": 122585552, + "router_z_loss_mlp": 0.34887695, + "step": 1483, + "time_per_iteration": 2.665919065475464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087283, + "balance_loss_mlp": 1.05233121, + "epoch": 0.28549442093112737, + "flos": 754456425984.0, + "grad_norm": 0.06049898751226662, + "language_loss": 0.86304945, + "learning_rate": 0.0008383752976609525, + "loss": 0.87392229, + "num_input_tokens_seen": 122658928, + "router_z_loss_mlp": 0.34960938, + "step": 1484, + "time_per_iteration": 2.9113876819610596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081674, + "balance_loss_mlp": 1.04586315, + "epoch": 0.2856868026163909, + "flos": 538311439872.0, + "grad_norm": 0.05363431349597561, + "language_loss": 0.79897112, + "learning_rate": 0.0008381458706936123, + "loss": 0.80978787, + "num_input_tokens_seen": 122729056, + "router_z_loss_mlp": 0.3581543, + "step": 1485, + "time_per_iteration": 2.715182065963745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082792, + "balance_loss_mlp": 1.0470289, + "epoch": 0.2858791843016545, + "flos": 583487207424.0, + "grad_norm": 0.055785658857036256, + "language_loss": 0.8776381, + "learning_rate": 0.0008379163124464025, + "loss": 0.888466, + "num_input_tokens_seen": 122802832, + "router_z_loss_mlp": 0.35766602, + "step": 1486, + "time_per_iteration": 2.713412284851074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083174, + "balance_loss_mlp": 1.04881775, + "epoch": 0.286071565986918, + "flos": 644503357440.0, + "grad_norm": 0.05967072286491994, + "language_loss": 0.76593089, + "learning_rate": 0.0008376866230084452, + "loss": 0.7767626, + "num_input_tokens_seen": 122881328, + "router_z_loss_mlp": 0.34399414, + "step": 1487, + "time_per_iteration": 2.812953472137451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091632, + "balance_loss_mlp": 1.05522537, + "epoch": 0.2862639476721816, + "flos": 491120561664.0, + "grad_norm": 0.06413337589788286, + "language_loss": 0.85965335, + "learning_rate": 0.000837456802468914, + "loss": 0.87056965, + "num_input_tokens_seen": 122949680, + "router_z_loss_mlp": 0.36401367, + "step": 1488, + "time_per_iteration": 2.5974318981170654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096408, + "balance_loss_mlp": 1.06050217, + "epoch": 0.2864563293574452, + "flos": 521363796480.0, + "grad_norm": 0.06049840128310572, + "language_loss": 0.85439187, + "learning_rate": 0.0008372268509170331, + "loss": 0.86535597, + "num_input_tokens_seen": 123024736, + "router_z_loss_mlp": 0.35888672, + "step": 1489, + "time_per_iteration": 2.6646056175231934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100143, + "balance_loss_mlp": 1.06478548, + "epoch": 0.2866487110427087, + "flos": 546834723840.0, + "grad_norm": 0.05582745965585505, + "language_loss": 0.84845203, + "learning_rate": 0.0008369967684420779, + "loss": 0.85945344, + "num_input_tokens_seen": 123097344, + "router_z_loss_mlp": 0.35424805, + "step": 1490, + "time_per_iteration": 2.737180471420288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094742, + "balance_loss_mlp": 1.05902719, + "epoch": 0.2868410927279723, + "flos": 481977437184.0, + "grad_norm": 0.0702351654670911, + "language_loss": 0.84684229, + "learning_rate": 0.0008367665551333736, + "loss": 0.85778964, + "num_input_tokens_seen": 123166240, + "router_z_loss_mlp": 0.35717773, + "step": 1491, + "time_per_iteration": 2.591179847717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095437, + "balance_loss_mlp": 1.05807662, + "epoch": 0.28703347441323585, + "flos": 724578365952.0, + "grad_norm": 0.0690733570245185, + "language_loss": 0.85732669, + "learning_rate": 0.0008365362110802977, + "loss": 0.86828107, + "num_input_tokens_seen": 123238160, + "router_z_loss_mlp": 0.37329102, + "step": 1492, + "time_per_iteration": 2.8586251735687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083578, + "balance_loss_mlp": 1.04733849, + "epoch": 0.28722585609849943, + "flos": 634670581248.0, + "grad_norm": 0.059898504183233336, + "language_loss": 0.82604659, + "learning_rate": 0.0008363057363722773, + "loss": 0.83688229, + "num_input_tokens_seen": 123319504, + "router_z_loss_mlp": 0.36254883, + "step": 1493, + "time_per_iteration": 2.8491427898406982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076927, + "balance_loss_mlp": 1.04264212, + "epoch": 0.28741823778376296, + "flos": 509974216704.0, + "grad_norm": 0.05796804627405179, + "language_loss": 0.84095198, + "learning_rate": 0.0008360751310987906, + "loss": 0.85172129, + "num_input_tokens_seen": 123387008, + "router_z_loss_mlp": 0.34301758, + "step": 1494, + "time_per_iteration": 2.5735158920288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077369, + "balance_loss_mlp": 1.04294157, + "epoch": 0.28761061946902655, + "flos": 603458297856.0, + "grad_norm": 0.07368534281083228, + "language_loss": 0.85552645, + "learning_rate": 0.0008358443953493666, + "loss": 0.86630011, + "num_input_tokens_seen": 123471056, + "router_z_loss_mlp": 0.34472656, + "step": 1495, + "time_per_iteration": 2.8492400646209717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078466, + "balance_loss_mlp": 1.04260767, + "epoch": 0.28780300115429014, + "flos": 406977454080.0, + "grad_norm": 0.061458136593000166, + "language_loss": 0.88553399, + "learning_rate": 0.0008356135292135851, + "loss": 0.89631861, + "num_input_tokens_seen": 123535024, + "router_z_loss_mlp": 0.35864258, + "step": 1496, + "time_per_iteration": 2.499234676361084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109055, + "balance_loss_mlp": 1.05428672, + "epoch": 0.28799538283955367, + "flos": 374726310912.0, + "grad_norm": 0.06023187099093886, + "language_loss": 0.92244387, + "learning_rate": 0.0008353825327810758, + "loss": 0.93334937, + "num_input_tokens_seen": 123596224, + "router_z_loss_mlp": 0.36230469, + "step": 1497, + "time_per_iteration": 2.4068801403045654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083025, + "balance_loss_mlp": 1.04761958, + "epoch": 0.28818776452481726, + "flos": 591645316608.0, + "grad_norm": 0.050935971597675156, + "language_loss": 0.81914794, + "learning_rate": 0.00083515140614152, + "loss": 0.82997811, + "num_input_tokens_seen": 123668640, + "router_z_loss_mlp": 0.35473633, + "step": 1498, + "time_per_iteration": 2.7172293663024902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085079, + "balance_loss_mlp": 1.05012727, + "epoch": 0.2883801462100808, + "flos": 534819511296.0, + "grad_norm": 0.055380500747477406, + "language_loss": 0.86671853, + "learning_rate": 0.0008349201493846485, + "loss": 0.87756932, + "num_input_tokens_seen": 123740816, + "router_z_loss_mlp": 0.34985352, + "step": 1499, + "time_per_iteration": 2.666877508163452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088163, + "balance_loss_mlp": 1.05268669, + "epoch": 0.2885725278953444, + "flos": 479850255360.0, + "grad_norm": 0.06392675802491345, + "language_loss": 0.89344347, + "learning_rate": 0.0008346887626002432, + "loss": 0.90432513, + "num_input_tokens_seen": 123805968, + "router_z_loss_mlp": 0.35473633, + "step": 1500, + "time_per_iteration": 2.547353744506836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081111, + "balance_loss_mlp": 1.04613519, + "epoch": 0.2887649095806079, + "flos": 463798877184.0, + "grad_norm": 0.050375470508879826, + "language_loss": 0.86195928, + "learning_rate": 0.000834457245878137, + "loss": 0.87277037, + "num_input_tokens_seen": 123876576, + "router_z_loss_mlp": 0.35009766, + "step": 1501, + "time_per_iteration": 2.6108102798461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076981, + "balance_loss_mlp": 1.04317355, + "epoch": 0.2889572912658715, + "flos": 930631527936.0, + "grad_norm": 0.05668037017333152, + "language_loss": 0.81365681, + "learning_rate": 0.000834225599308212, + "loss": 0.82442665, + "num_input_tokens_seen": 123967664, + "router_z_loss_mlp": 0.33837891, + "step": 1502, + "time_per_iteration": 3.2222447395324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085232, + "balance_loss_mlp": 1.04994583, + "epoch": 0.28914967295113503, + "flos": 569848200192.0, + "grad_norm": 0.05132223508893719, + "language_loss": 0.85018057, + "learning_rate": 0.0008339938229804016, + "loss": 0.8610329, + "num_input_tokens_seen": 124039680, + "router_z_loss_mlp": 0.35327148, + "step": 1503, + "time_per_iteration": 2.698528289794922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032992, + "balance_loss_mlp": 1.01945031, + "epoch": 0.2893420546363986, + "flos": 1485803119104.0, + "grad_norm": 0.02573157997511775, + "language_loss": 0.75434822, + "learning_rate": 0.0008337619169846895, + "loss": 0.76467812, + "num_input_tokens_seen": 124278848, + "router_z_loss_mlp": 0.13574219, + "step": 1504, + "time_per_iteration": 4.950274467468262 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083767, + "balance_loss_mlp": 1.04793239, + "epoch": 0.2895344363216622, + "flos": 469938903552.0, + "grad_norm": 0.06568085425348943, + "language_loss": 0.84119928, + "learning_rate": 0.0008335298814111094, + "loss": 0.85203701, + "num_input_tokens_seen": 124346736, + "router_z_loss_mlp": 0.35864258, + "step": 1505, + "time_per_iteration": 2.542043924331665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085418, + "balance_loss_mlp": 1.05089498, + "epoch": 0.28972681800692573, + "flos": 647909508096.0, + "grad_norm": 0.06591449016003405, + "language_loss": 0.87860626, + "learning_rate": 0.0008332977163497455, + "loss": 0.88946044, + "num_input_tokens_seen": 124420816, + "router_z_loss_mlp": 0.34570312, + "step": 1506, + "time_per_iteration": 2.810399293899536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087351, + "balance_loss_mlp": 1.05163622, + "epoch": 0.2899191996921893, + "flos": 571955033088.0, + "grad_norm": 0.054529888005095714, + "language_loss": 0.83185649, + "learning_rate": 0.0008330654218907325, + "loss": 0.84272999, + "num_input_tokens_seen": 124490480, + "router_z_loss_mlp": 0.35742188, + "step": 1507, + "time_per_iteration": 2.6968414783477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084817, + "balance_loss_mlp": 1.04981744, + "epoch": 0.29011158137745285, + "flos": 661037773824.0, + "grad_norm": 0.1280653735040032, + "language_loss": 0.81777966, + "learning_rate": 0.0008328329981242548, + "loss": 0.82862782, + "num_input_tokens_seen": 124564960, + "router_z_loss_mlp": 0.3503418, + "step": 1508, + "time_per_iteration": 2.886396884918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089679, + "balance_loss_mlp": 1.05441689, + "epoch": 0.29030396306271644, + "flos": 535933974528.0, + "grad_norm": 0.060234790533374126, + "language_loss": 0.87937206, + "learning_rate": 0.0008326004451405475, + "loss": 0.89026886, + "num_input_tokens_seen": 124637424, + "router_z_loss_mlp": 0.3527832, + "step": 1509, + "time_per_iteration": 2.7797772884368896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096033, + "balance_loss_mlp": 1.06148589, + "epoch": 0.29049634474798, + "flos": 511707110400.0, + "grad_norm": 0.05385470227261208, + "language_loss": 0.82548428, + "learning_rate": 0.0008323677630298957, + "loss": 0.83644462, + "num_input_tokens_seen": 124704832, + "router_z_loss_mlp": 0.34594727, + "step": 1510, + "time_per_iteration": 2.5542855262756348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093546, + "balance_loss_mlp": 1.05892766, + "epoch": 0.29068872643324356, + "flos": 613454017536.0, + "grad_norm": 0.05556182475666109, + "language_loss": 0.85001689, + "learning_rate": 0.0008321349518826345, + "loss": 0.86095232, + "num_input_tokens_seen": 124779600, + "router_z_loss_mlp": 0.34643555, + "step": 1511, + "time_per_iteration": 2.7849388122558594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093876, + "balance_loss_mlp": 1.05878115, + "epoch": 0.2908811081185071, + "flos": 546164011008.0, + "grad_norm": 0.07046084545113683, + "language_loss": 0.94823933, + "learning_rate": 0.0008319020117891491, + "loss": 0.95917809, + "num_input_tokens_seen": 124844128, + "router_z_loss_mlp": 0.35131836, + "step": 1512, + "time_per_iteration": 2.5936031341552734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090975, + "balance_loss_mlp": 1.05485463, + "epoch": 0.2910734898037707, + "flos": 604516096512.0, + "grad_norm": 0.06307487928884016, + "language_loss": 0.87063539, + "learning_rate": 0.0008316689428398751, + "loss": 0.88154513, + "num_input_tokens_seen": 124915376, + "router_z_loss_mlp": 0.36108398, + "step": 1513, + "time_per_iteration": 2.6774067878723145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082761, + "balance_loss_mlp": 1.04792798, + "epoch": 0.29126587148903427, + "flos": 574383370752.0, + "grad_norm": 0.043578668947666564, + "language_loss": 0.88254529, + "learning_rate": 0.0008314357451252979, + "loss": 0.89337289, + "num_input_tokens_seen": 124995504, + "router_z_loss_mlp": 0.34887695, + "step": 1514, + "time_per_iteration": 2.7561941146850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086548, + "balance_loss_mlp": 1.05092812, + "epoch": 0.2914582531742978, + "flos": 570802692096.0, + "grad_norm": 0.06240160889449628, + "language_loss": 0.87564558, + "learning_rate": 0.0008312024187359527, + "loss": 0.88651109, + "num_input_tokens_seen": 125064192, + "router_z_loss_mlp": 0.35644531, + "step": 1515, + "time_per_iteration": 2.636056900024414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084088, + "balance_loss_mlp": 1.04942179, + "epoch": 0.2916506348595614, + "flos": 730523363328.0, + "grad_norm": 0.06185972429295104, + "language_loss": 0.87361014, + "learning_rate": 0.000830968963762425, + "loss": 0.88445103, + "num_input_tokens_seen": 125150560, + "router_z_loss_mlp": 0.34716797, + "step": 1516, + "time_per_iteration": 3.021732807159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091474, + "balance_loss_mlp": 1.05528235, + "epoch": 0.2918430165448249, + "flos": 510220118016.0, + "grad_norm": 0.05583453201751925, + "language_loss": 0.83947027, + "learning_rate": 0.0008307353802953497, + "loss": 0.85038507, + "num_input_tokens_seen": 125219264, + "router_z_loss_mlp": 0.36206055, + "step": 1517, + "time_per_iteration": 2.6659955978393555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100836, + "balance_loss_mlp": 1.06318951, + "epoch": 0.2920353982300885, + "flos": 630096122880.0, + "grad_norm": 0.04472517729516854, + "language_loss": 0.86110896, + "learning_rate": 0.0008305016684254125, + "loss": 0.87211728, + "num_input_tokens_seen": 125301904, + "router_z_loss_mlp": 0.37646484, + "step": 1518, + "time_per_iteration": 2.7896409034729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098986, + "balance_loss_mlp": 1.06153059, + "epoch": 0.29222777991535204, + "flos": 501411644928.0, + "grad_norm": 0.055409034097420505, + "language_loss": 0.86932153, + "learning_rate": 0.0008302678282433479, + "loss": 0.88031137, + "num_input_tokens_seen": 125367712, + "router_z_loss_mlp": 0.37451172, + "step": 1519, + "time_per_iteration": 2.585256814956665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095676, + "balance_loss_mlp": 1.05891216, + "epoch": 0.2924201616006156, + "flos": 486522782208.0, + "grad_norm": 0.057505891705300044, + "language_loss": 0.85011005, + "learning_rate": 0.0008300338598399411, + "loss": 0.86106682, + "num_input_tokens_seen": 125437648, + "router_z_loss_mlp": 0.36791992, + "step": 1520, + "time_per_iteration": 2.6471352577209473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109825, + "balance_loss_mlp": 1.07210708, + "epoch": 0.2926125432858792, + "flos": 476211350016.0, + "grad_norm": 0.05302442020038178, + "language_loss": 0.9456166, + "learning_rate": 0.0008297997633060263, + "loss": 0.95671487, + "num_input_tokens_seen": 125502432, + "router_z_loss_mlp": 0.37719727, + "step": 1521, + "time_per_iteration": 2.547457695007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106499, + "balance_loss_mlp": 1.06987774, + "epoch": 0.29280492497114274, + "flos": 676379151360.0, + "grad_norm": 0.054888704647412474, + "language_loss": 0.85310549, + "learning_rate": 0.0008295655387324883, + "loss": 0.86417043, + "num_input_tokens_seen": 125575424, + "router_z_loss_mlp": 0.36645508, + "step": 1522, + "time_per_iteration": 2.822557210922241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105716, + "balance_loss_mlp": 1.0686183, + "epoch": 0.29299730665640633, + "flos": 458175384576.0, + "grad_norm": 0.055715580232585875, + "language_loss": 0.85025144, + "learning_rate": 0.0008293311862102609, + "loss": 0.86130863, + "num_input_tokens_seen": 125639040, + "router_z_loss_mlp": 0.37084961, + "step": 1523, + "time_per_iteration": 2.5033791065216064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103076, + "balance_loss_mlp": 1.06795669, + "epoch": 0.29318968834166986, + "flos": 446343464448.0, + "grad_norm": 0.0584953499596263, + "language_loss": 0.88722956, + "learning_rate": 0.0008290967058303275, + "loss": 0.89826035, + "num_input_tokens_seen": 125701712, + "router_z_loss_mlp": 0.3515625, + "step": 1524, + "time_per_iteration": 2.5981156826019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102477, + "balance_loss_mlp": 1.06630898, + "epoch": 0.29338207002693345, + "flos": 450085676544.0, + "grad_norm": 0.05072610752657829, + "language_loss": 0.86932707, + "learning_rate": 0.0008288620976837219, + "loss": 0.88035178, + "num_input_tokens_seen": 125765088, + "router_z_loss_mlp": 0.36181641, + "step": 1525, + "time_per_iteration": 2.522019863128662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092716, + "balance_loss_mlp": 1.05623853, + "epoch": 0.293574451712197, + "flos": 502027103232.0, + "grad_norm": 0.05230718040210392, + "language_loss": 0.83001733, + "learning_rate": 0.000828627361861527, + "loss": 0.84094453, + "num_input_tokens_seen": 125831328, + "router_z_loss_mlp": 0.36474609, + "step": 1526, + "time_per_iteration": 2.559201955795288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085779, + "balance_loss_mlp": 1.05051661, + "epoch": 0.29376683339746057, + "flos": 696158184960.0, + "grad_norm": 0.071892180297548, + "language_loss": 0.8465147, + "learning_rate": 0.0008283924984548752, + "loss": 0.85737246, + "num_input_tokens_seen": 125903664, + "router_z_loss_mlp": 0.3527832, + "step": 1527, + "time_per_iteration": 2.8108816146850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079347, + "balance_loss_mlp": 1.04494321, + "epoch": 0.2939592150827241, + "flos": 478353088512.0, + "grad_norm": 0.05128355551395112, + "language_loss": 0.85087478, + "learning_rate": 0.0008281575075549485, + "loss": 0.86166823, + "num_input_tokens_seen": 125971856, + "router_z_loss_mlp": 0.34399414, + "step": 1528, + "time_per_iteration": 2.576814889907837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051452, + "balance_loss_mlp": 1.04057992, + "epoch": 0.2941515967679877, + "flos": 1484482042368.0, + "grad_norm": 0.031732851839211505, + "language_loss": 0.77352691, + "learning_rate": 0.000827922389252979, + "loss": 0.7840414, + "num_input_tokens_seen": 126183968, + "router_z_loss_mlp": 0.10888672, + "step": 1529, + "time_per_iteration": 4.662023067474365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089321, + "balance_loss_mlp": 1.0547502, + "epoch": 0.2943439784532513, + "flos": 673848916992.0, + "grad_norm": 0.06453398347295829, + "language_loss": 0.90086716, + "learning_rate": 0.0008276871436402469, + "loss": 0.91176039, + "num_input_tokens_seen": 126254448, + "router_z_loss_mlp": 0.34619141, + "step": 1530, + "time_per_iteration": 2.795783758163452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097742, + "balance_loss_mlp": 1.06460166, + "epoch": 0.2945363601385148, + "flos": 576031896576.0, + "grad_norm": 0.05195467848041957, + "language_loss": 0.87790835, + "learning_rate": 0.000827451770808083, + "loss": 0.88888574, + "num_input_tokens_seen": 126328208, + "router_z_loss_mlp": 0.33154297, + "step": 1531, + "time_per_iteration": 2.6522414684295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100953, + "balance_loss_mlp": 1.06628692, + "epoch": 0.2947287418237784, + "flos": 480416251392.0, + "grad_norm": 0.05572078055736918, + "language_loss": 0.83276248, + "learning_rate": 0.0008272162708478674, + "loss": 0.84377199, + "num_input_tokens_seen": 126396464, + "router_z_loss_mlp": 0.34692383, + "step": 1532, + "time_per_iteration": 2.5960874557495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098518, + "balance_loss_mlp": 1.06459141, + "epoch": 0.2949211235090419, + "flos": 557917355520.0, + "grad_norm": 0.05232404820193651, + "language_loss": 0.86136103, + "learning_rate": 0.000826980643851029, + "loss": 0.87234622, + "num_input_tokens_seen": 126468960, + "router_z_loss_mlp": 0.33959961, + "step": 1533, + "time_per_iteration": 2.671867609024048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106353, + "balance_loss_mlp": 1.07147205, + "epoch": 0.2951135051943055, + "flos": 483646311936.0, + "grad_norm": 0.06650262295584625, + "language_loss": 0.84864676, + "learning_rate": 0.0008267448899090464, + "loss": 0.85971034, + "num_input_tokens_seen": 126536496, + "router_z_loss_mlp": 0.34887695, + "step": 1534, + "time_per_iteration": 2.5133543014526367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095921, + "balance_loss_mlp": 1.0604682, + "epoch": 0.29530588687956905, + "flos": 550015322112.0, + "grad_norm": 0.05711998360561463, + "language_loss": 0.80980158, + "learning_rate": 0.0008265090091134473, + "loss": 0.82076073, + "num_input_tokens_seen": 126614048, + "router_z_loss_mlp": 0.35473633, + "step": 1535, + "time_per_iteration": 2.8528778553009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085228, + "balance_loss_mlp": 1.05072904, + "epoch": 0.29549826856483263, + "flos": 672731481600.0, + "grad_norm": 0.047870597747086484, + "language_loss": 0.80150926, + "learning_rate": 0.0008262730015558088, + "loss": 0.8123616, + "num_input_tokens_seen": 126697248, + "router_z_loss_mlp": 0.34521484, + "step": 1536, + "time_per_iteration": 2.8849382400512695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076376, + "balance_loss_mlp": 1.04192495, + "epoch": 0.29569065025009617, + "flos": 764300786688.0, + "grad_norm": 0.06331525049863725, + "language_loss": 0.82269859, + "learning_rate": 0.0008260368673277574, + "loss": 0.8334623, + "num_input_tokens_seen": 126782496, + "router_z_loss_mlp": 0.34472656, + "step": 1537, + "time_per_iteration": 3.12172269821167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078873, + "balance_loss_mlp": 1.04480314, + "epoch": 0.29588303193535975, + "flos": 543398049792.0, + "grad_norm": 0.05107262607685598, + "language_loss": 0.84019077, + "learning_rate": 0.0008258006065209682, + "loss": 0.85097957, + "num_input_tokens_seen": 126857328, + "router_z_loss_mlp": 0.34106445, + "step": 1538, + "time_per_iteration": 2.7388381958007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082306, + "balance_loss_mlp": 1.04744971, + "epoch": 0.29607541362062334, + "flos": 596648968704.0, + "grad_norm": 0.06469434822608365, + "language_loss": 0.80634302, + "learning_rate": 0.0008255642192271657, + "loss": 0.81716609, + "num_input_tokens_seen": 126932608, + "router_z_loss_mlp": 0.34863281, + "step": 1539, + "time_per_iteration": 2.7957324981689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082434, + "balance_loss_mlp": 1.04774427, + "epoch": 0.29626779530588687, + "flos": 609588149760.0, + "grad_norm": 0.06097977692176942, + "language_loss": 0.83830953, + "learning_rate": 0.0008253277055381241, + "loss": 0.84913385, + "num_input_tokens_seen": 127008928, + "router_z_loss_mlp": 0.34741211, + "step": 1540, + "time_per_iteration": 2.8428521156311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085866, + "balance_loss_mlp": 1.05146217, + "epoch": 0.29646017699115046, + "flos": 867050237952.0, + "grad_norm": 0.06407432486539091, + "language_loss": 0.8580029, + "learning_rate": 0.0008250910655456658, + "loss": 0.8688615, + "num_input_tokens_seen": 127097104, + "router_z_loss_mlp": 0.34448242, + "step": 1541, + "time_per_iteration": 3.1741185188293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081587, + "balance_loss_mlp": 1.04696846, + "epoch": 0.296652558676414, + "flos": 495616444416.0, + "grad_norm": 0.06683547404256097, + "language_loss": 0.83703458, + "learning_rate": 0.0008248542993416625, + "loss": 0.84785044, + "num_input_tokens_seen": 127165264, + "router_z_loss_mlp": 0.34643555, + "step": 1542, + "time_per_iteration": 2.5634429454803467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083482, + "balance_loss_mlp": 1.04914963, + "epoch": 0.2968449403616776, + "flos": 571275555840.0, + "grad_norm": 0.054805025189504364, + "language_loss": 0.83645159, + "learning_rate": 0.0008246174070180352, + "loss": 0.84728634, + "num_input_tokens_seen": 127238992, + "router_z_loss_mlp": 0.34375, + "step": 1543, + "time_per_iteration": 2.664029121398926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108698, + "balance_loss_mlp": 1.05226684, + "epoch": 0.2970373220469411, + "flos": 793799115264.0, + "grad_norm": 0.06369286414713611, + "language_loss": 0.84087443, + "learning_rate": 0.0008243803886667537, + "loss": 0.85174423, + "num_input_tokens_seen": 127328160, + "router_z_loss_mlp": 0.34765625, + "step": 1544, + "time_per_iteration": 3.129185199737549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082817, + "balance_loss_mlp": 1.04793644, + "epoch": 0.2972297037322047, + "flos": 660736617984.0, + "grad_norm": 0.0569938777400986, + "language_loss": 0.79051471, + "learning_rate": 0.0008241432443798364, + "loss": 0.80134284, + "num_input_tokens_seen": 127407328, + "router_z_loss_mlp": 0.34936523, + "step": 1545, + "time_per_iteration": 2.7968478202819824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076425, + "balance_loss_mlp": 1.04225969, + "epoch": 0.29742208541746823, + "flos": 596849789952.0, + "grad_norm": 0.05185676674228935, + "language_loss": 0.85634965, + "learning_rate": 0.0008239059742493512, + "loss": 0.86711389, + "num_input_tokens_seen": 127477136, + "router_z_loss_mlp": 0.34204102, + "step": 1546, + "time_per_iteration": 2.730803966522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079044, + "balance_loss_mlp": 1.0448308, + "epoch": 0.2976144671027318, + "flos": 769519816704.0, + "grad_norm": 0.049935350225070424, + "language_loss": 0.87424839, + "learning_rate": 0.0008236685783674142, + "loss": 0.88503873, + "num_input_tokens_seen": 127565680, + "router_z_loss_mlp": 0.34228516, + "step": 1547, + "time_per_iteration": 3.0735998153686523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060573, + "balance_loss_mlp": 1.04903388, + "epoch": 0.2978068487879954, + "flos": 1483980065280.0, + "grad_norm": 0.022808758650826922, + "language_loss": 0.76221192, + "learning_rate": 0.0008234310568261911, + "loss": 0.77281767, + "num_input_tokens_seen": 127791584, + "router_z_loss_mlp": 0.11523438, + "step": 1548, + "time_per_iteration": 4.902673959732056 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088892, + "balance_loss_mlp": 1.05460715, + "epoch": 0.29799923047325894, + "flos": 475079357952.0, + "grad_norm": 0.07696298762455249, + "language_loss": 0.82568306, + "learning_rate": 0.0008231934097178955, + "loss": 0.83657193, + "num_input_tokens_seen": 127860112, + "router_z_loss_mlp": 0.34326172, + "step": 1549, + "time_per_iteration": 2.59600567817688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087503, + "balance_loss_mlp": 1.05173981, + "epoch": 0.2981916121585225, + "flos": 759464460288.0, + "grad_norm": 0.05308820200633048, + "language_loss": 0.8525809, + "learning_rate": 0.0008229556371347903, + "loss": 0.86345589, + "num_input_tokens_seen": 127938752, + "router_z_loss_mlp": 0.35791016, + "step": 1550, + "time_per_iteration": 2.955467939376831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080247, + "balance_loss_mlp": 1.04593909, + "epoch": 0.29838399384378606, + "flos": 874642351104.0, + "grad_norm": 0.058723621398699785, + "language_loss": 0.79088616, + "learning_rate": 0.0008227177391691874, + "loss": 0.80168855, + "num_input_tokens_seen": 128022192, + "router_z_loss_mlp": 0.34350586, + "step": 1551, + "time_per_iteration": 3.1204521656036377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084967, + "balance_loss_mlp": 1.05001473, + "epoch": 0.29857637552904964, + "flos": 579389995008.0, + "grad_norm": 0.060980844602782615, + "language_loss": 0.89576113, + "learning_rate": 0.0008224797159134463, + "loss": 0.90661073, + "num_input_tokens_seen": 128097776, + "router_z_loss_mlp": 0.34985352, + "step": 1552, + "time_per_iteration": 2.7535347938537598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075057, + "balance_loss_mlp": 1.04132128, + "epoch": 0.2987687572143132, + "flos": 836048950272.0, + "grad_norm": 0.05791718796165568, + "language_loss": 0.83571118, + "learning_rate": 0.0008222415674599765, + "loss": 0.84646177, + "num_input_tokens_seen": 128179888, + "router_z_loss_mlp": 0.33764648, + "step": 1553, + "time_per_iteration": 3.0609707832336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077849, + "balance_loss_mlp": 1.0417521, + "epoch": 0.29896113889957676, + "flos": 566800022016.0, + "grad_norm": 0.05477323920870417, + "language_loss": 0.83255476, + "learning_rate": 0.0008220032939012349, + "loss": 0.84333324, + "num_input_tokens_seen": 128251152, + "router_z_loss_mlp": 0.36108398, + "step": 1554, + "time_per_iteration": 2.6683521270751953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077555, + "balance_loss_mlp": 1.04310393, + "epoch": 0.29915352058484035, + "flos": 498370669056.0, + "grad_norm": 0.049159177960894807, + "language_loss": 0.87956095, + "learning_rate": 0.0008217648953297277, + "loss": 0.89033645, + "num_input_tokens_seen": 128327600, + "router_z_loss_mlp": 0.34472656, + "step": 1555, + "time_per_iteration": 2.82114315032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109245, + "balance_loss_mlp": 1.05711639, + "epoch": 0.2993459022701039, + "flos": 591837373440.0, + "grad_norm": 0.06210935096260163, + "language_loss": 0.7799179, + "learning_rate": 0.0008215263718380095, + "loss": 0.79084241, + "num_input_tokens_seen": 128398432, + "router_z_loss_mlp": 0.35327148, + "step": 1556, + "time_per_iteration": 2.6806485652923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104132, + "balance_loss_mlp": 1.06815481, + "epoch": 0.29953828395536747, + "flos": 572107802112.0, + "grad_norm": 0.051501670996139066, + "language_loss": 0.8437115, + "learning_rate": 0.0008212877235186833, + "loss": 0.8547529, + "num_input_tokens_seen": 128469696, + "router_z_loss_mlp": 0.36010742, + "step": 1557, + "time_per_iteration": 2.706531286239624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075275, + "balance_loss_mlp": 1.06321061, + "epoch": 0.299730665640631, + "flos": 1503855051264.0, + "grad_norm": 0.03618665962020262, + "language_loss": 0.77737558, + "learning_rate": 0.0008210489504644005, + "loss": 0.78812838, + "num_input_tokens_seen": 128698560, + "router_z_loss_mlp": 0.12060547, + "step": 1558, + "time_per_iteration": 4.914030313491821 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102225, + "balance_loss_mlp": 1.06641483, + "epoch": 0.2999230473258946, + "flos": 513538928640.0, + "grad_norm": 0.06717328469529676, + "language_loss": 0.80777293, + "learning_rate": 0.0008208100527678611, + "loss": 0.8187952, + "num_input_tokens_seen": 128765952, + "router_z_loss_mlp": 0.3581543, + "step": 1559, + "time_per_iteration": 2.5862650871276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097511, + "balance_loss_mlp": 1.06294012, + "epoch": 0.3001154290111581, + "flos": 834128381952.0, + "grad_norm": 0.05731533213860712, + "language_loss": 0.78337204, + "learning_rate": 0.0008205710305218135, + "loss": 0.79434717, + "num_input_tokens_seen": 128840048, + "router_z_loss_mlp": 0.34594727, + "step": 1560, + "time_per_iteration": 3.00581693649292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094947, + "balance_loss_mlp": 1.06149733, + "epoch": 0.3003078106964217, + "flos": 556485617664.0, + "grad_norm": 0.051151635719759364, + "language_loss": 0.89917201, + "learning_rate": 0.0008203318838190541, + "loss": 0.9101215, + "num_input_tokens_seen": 128912496, + "router_z_loss_mlp": 0.3347168, + "step": 1561, + "time_per_iteration": 2.730187177658081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087932, + "balance_loss_mlp": 1.05345702, + "epoch": 0.30050019238168524, + "flos": 525897556992.0, + "grad_norm": 0.07455466191279551, + "language_loss": 0.85053575, + "learning_rate": 0.0008200926127524281, + "loss": 0.86141509, + "num_input_tokens_seen": 128980624, + "router_z_loss_mlp": 0.34521484, + "step": 1562, + "time_per_iteration": 2.6252634525299072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082824, + "balance_loss_mlp": 1.04837239, + "epoch": 0.3006925740669488, + "flos": 577582907904.0, + "grad_norm": 0.08578868432126639, + "language_loss": 0.83193934, + "learning_rate": 0.0008198532174148289, + "loss": 0.84276754, + "num_input_tokens_seen": 129050576, + "router_z_loss_mlp": 0.3449707, + "step": 1563, + "time_per_iteration": 2.784132957458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010668, + "balance_loss_mlp": 0.99912882, + "epoch": 0.3008849557522124, + "flos": 1489408528896.0, + "grad_norm": 0.006418694176289122, + "language_loss": 0.8068617, + "learning_rate": 0.0008196136978991977, + "loss": 0.81696838, + "num_input_tokens_seen": 129278880, + "router_z_loss_mlp": 0.11523438, + "step": 1564, + "time_per_iteration": 4.826026678085327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079569, + "balance_loss_mlp": 1.04607105, + "epoch": 0.30107733743747594, + "flos": 509565371904.0, + "grad_norm": 0.057361266050022765, + "language_loss": 0.88701093, + "learning_rate": 0.0008193740542985244, + "loss": 0.89780664, + "num_input_tokens_seen": 129346560, + "router_z_loss_mlp": 0.33520508, + "step": 1565, + "time_per_iteration": 2.5685722827911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082334, + "balance_loss_mlp": 1.04881263, + "epoch": 0.30126971912273953, + "flos": 587425858560.0, + "grad_norm": 0.055549771382925904, + "language_loss": 0.86598676, + "learning_rate": 0.0008191342867058467, + "loss": 0.87681007, + "num_input_tokens_seen": 129420448, + "router_z_loss_mlp": 0.33520508, + "step": 1566, + "time_per_iteration": 2.7413527965545654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088116, + "balance_loss_mlp": 1.05423677, + "epoch": 0.30146210080800306, + "flos": 601822918656.0, + "grad_norm": 0.054174391750340056, + "language_loss": 0.83411789, + "learning_rate": 0.0008188943952142509, + "loss": 0.84499902, + "num_input_tokens_seen": 129494032, + "router_z_loss_mlp": 0.33911133, + "step": 1567, + "time_per_iteration": 2.816777229309082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098385, + "balance_loss_mlp": 1.06376624, + "epoch": 0.30165448249326665, + "flos": 917424686592.0, + "grad_norm": 0.057308899380469513, + "language_loss": 0.81973398, + "learning_rate": 0.0008186543799168711, + "loss": 0.8307178, + "num_input_tokens_seen": 129569088, + "router_z_loss_mlp": 0.34643555, + "step": 1568, + "time_per_iteration": 3.138439655303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094575, + "balance_loss_mlp": 1.060076, + "epoch": 0.3018468641785302, + "flos": 776953368576.0, + "grad_norm": 0.06314470525088989, + "language_loss": 0.88671768, + "learning_rate": 0.0008184142409068892, + "loss": 0.89766341, + "num_input_tokens_seen": 129647968, + "router_z_loss_mlp": 0.34545898, + "step": 1569, + "time_per_iteration": 3.0061678886413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104914, + "balance_loss_mlp": 1.07134473, + "epoch": 0.30203924586379377, + "flos": 522101500416.0, + "grad_norm": 0.05000282823150535, + "language_loss": 0.86630476, + "learning_rate": 0.000818173978277536, + "loss": 0.87735385, + "num_input_tokens_seen": 129718928, + "router_z_loss_mlp": 0.3359375, + "step": 1570, + "time_per_iteration": 2.7171432971954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101367, + "balance_loss_mlp": 1.06779718, + "epoch": 0.3022316275490573, + "flos": 524288318976.0, + "grad_norm": 0.052630401262377564, + "language_loss": 0.83781934, + "learning_rate": 0.000817933592122089, + "loss": 0.84883296, + "num_input_tokens_seen": 129790128, + "router_z_loss_mlp": 0.3359375, + "step": 1571, + "time_per_iteration": 2.7346580028533936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105149, + "balance_loss_mlp": 1.0710789, + "epoch": 0.3024240092343209, + "flos": 479672755200.0, + "grad_norm": 0.05357670269103591, + "language_loss": 0.83451247, + "learning_rate": 0.0008176930825338749, + "loss": 0.84556395, + "num_input_tokens_seen": 129857536, + "router_z_loss_mlp": 0.34106445, + "step": 1572, + "time_per_iteration": 2.5449459552764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095582, + "balance_loss_mlp": 1.06110692, + "epoch": 0.3026163909195845, + "flos": 686901579264.0, + "grad_norm": 0.06283664606524127, + "language_loss": 0.8873198, + "learning_rate": 0.0008174524496062679, + "loss": 0.89827561, + "num_input_tokens_seen": 129931440, + "router_z_loss_mlp": 0.3449707, + "step": 1573, + "time_per_iteration": 2.8826043605804443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108869, + "balance_loss_mlp": 1.05380964, + "epoch": 0.302808772604848, + "flos": 542654553600.0, + "grad_norm": 0.05929060654319276, + "language_loss": 0.85444844, + "learning_rate": 0.0008172116934326894, + "loss": 0.86533535, + "num_input_tokens_seen": 130005200, + "router_z_loss_mlp": 0.34912109, + "step": 1574, + "time_per_iteration": 2.7539572715759277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088334, + "balance_loss_mlp": 1.05435979, + "epoch": 0.3030011542901116, + "flos": 474852395520.0, + "grad_norm": 0.051325587648683973, + "language_loss": 0.87683225, + "learning_rate": 0.0008169708141066097, + "loss": 0.88771558, + "num_input_tokens_seen": 130069136, + "router_z_loss_mlp": 0.34008789, + "step": 1575, + "time_per_iteration": 2.5635225772857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086472, + "balance_loss_mlp": 1.05199683, + "epoch": 0.30319353597537513, + "flos": 481233940992.0, + "grad_norm": 0.06106098638193731, + "language_loss": 0.90820259, + "learning_rate": 0.0008167298117215465, + "loss": 0.91906732, + "num_input_tokens_seen": 130135456, + "router_z_loss_mlp": 0.3449707, + "step": 1576, + "time_per_iteration": 2.5388035774230957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087257, + "balance_loss_mlp": 1.05242443, + "epoch": 0.3033859176606387, + "flos": 704455916544.0, + "grad_norm": 0.06728579874610481, + "language_loss": 0.88300574, + "learning_rate": 0.0008164886863710649, + "loss": 0.89387834, + "num_input_tokens_seen": 130213712, + "router_z_loss_mlp": 0.34887695, + "step": 1577, + "time_per_iteration": 2.8935675621032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088541, + "balance_loss_mlp": 1.05554342, + "epoch": 0.30357829934590225, + "flos": 764344456704.0, + "grad_norm": 0.04642698643554312, + "language_loss": 0.86113924, + "learning_rate": 0.0008162474381487783, + "loss": 0.87202466, + "num_input_tokens_seen": 130290928, + "router_z_loss_mlp": 0.33007812, + "step": 1578, + "time_per_iteration": 3.0151257514953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088208, + "balance_loss_mlp": 1.05401909, + "epoch": 0.30377068103116583, + "flos": 532082663424.0, + "grad_norm": 0.05691489249418783, + "language_loss": 0.84894794, + "learning_rate": 0.0008160060671483475, + "loss": 0.85983002, + "num_input_tokens_seen": 130362672, + "router_z_loss_mlp": 0.34228516, + "step": 1579, + "time_per_iteration": 2.6575984954833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097191, + "balance_loss_mlp": 1.06338358, + "epoch": 0.3039630627164294, + "flos": 509934928896.0, + "grad_norm": 0.07240450604386858, + "language_loss": 0.83520651, + "learning_rate": 0.0008157645734634809, + "loss": 0.84617841, + "num_input_tokens_seen": 130428848, + "router_z_loss_mlp": 0.33837891, + "step": 1580, + "time_per_iteration": 2.5869438648223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061274, + "balance_loss_mlp": 1.04992568, + "epoch": 0.30415544440169295, + "flos": 1505206803456.0, + "grad_norm": 0.030998653754179664, + "language_loss": 0.76896489, + "learning_rate": 0.000815522957187935, + "loss": 0.77957761, + "num_input_tokens_seen": 130665440, + "router_z_loss_mlp": 0.11328125, + "step": 1581, + "time_per_iteration": 4.907174348831177 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01027749, + "balance_loss_mlp": 1.01606703, + "epoch": 0.30434782608695654, + "flos": 1457976637440.0, + "grad_norm": 0.012214555664928241, + "language_loss": 0.73214495, + "learning_rate": 0.0008152812184155132, + "loss": 0.74242246, + "num_input_tokens_seen": 130895248, + "router_z_loss_mlp": 0.11669922, + "step": 1582, + "time_per_iteration": 4.8717710971832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102415, + "balance_loss_mlp": 1.0685122, + "epoch": 0.3045402077722201, + "flos": 482312088576.0, + "grad_norm": 0.05813255519406619, + "language_loss": 0.83851862, + "learning_rate": 0.000815039357240067, + "loss": 0.84954274, + "num_input_tokens_seen": 130964544, + "router_z_loss_mlp": 0.33935547, + "step": 1583, + "time_per_iteration": 2.640148401260376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102551, + "balance_loss_mlp": 1.06879056, + "epoch": 0.30473258945748366, + "flos": 543220549632.0, + "grad_norm": 0.06312099992380371, + "language_loss": 0.85312426, + "learning_rate": 0.0008147973737554952, + "loss": 0.86414981, + "num_input_tokens_seen": 131041744, + "router_z_loss_mlp": 0.33789062, + "step": 1584, + "time_per_iteration": 2.772367000579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098103, + "balance_loss_mlp": 1.06443787, + "epoch": 0.3049249711427472, + "flos": 566789847552.0, + "grad_norm": 0.054268296030043885, + "language_loss": 0.85613728, + "learning_rate": 0.000814555268055744, + "loss": 0.86711836, + "num_input_tokens_seen": 131108864, + "router_z_loss_mlp": 0.33691406, + "step": 1585, + "time_per_iteration": 2.616687536239624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109794, + "balance_loss_mlp": 1.06441879, + "epoch": 0.3051173528280108, + "flos": 527970894336.0, + "grad_norm": 0.05527556644311566, + "language_loss": 0.87648201, + "learning_rate": 0.0008143130402348073, + "loss": 0.88746148, + "num_input_tokens_seen": 131181104, + "router_z_loss_mlp": 0.33544922, + "step": 1586, + "time_per_iteration": 2.635103940963745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094771, + "balance_loss_mlp": 1.06141627, + "epoch": 0.3053097345132743, + "flos": 586097427456.0, + "grad_norm": 0.052385807505719764, + "language_loss": 0.79520649, + "learning_rate": 0.0008140706903867265, + "loss": 0.80615419, + "num_input_tokens_seen": 131258704, + "router_z_loss_mlp": 0.33349609, + "step": 1587, + "time_per_iteration": 2.7922940254211426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087687, + "balance_loss_mlp": 1.05263984, + "epoch": 0.3055021161985379, + "flos": 606810604032.0, + "grad_norm": 0.054380951058352583, + "language_loss": 0.90043247, + "learning_rate": 0.0008138282186055897, + "loss": 0.9113093, + "num_input_tokens_seen": 131325712, + "router_z_loss_mlp": 0.35058594, + "step": 1588, + "time_per_iteration": 2.683783769607544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085545, + "balance_loss_mlp": 1.05290556, + "epoch": 0.3056944978838015, + "flos": 573594794496.0, + "grad_norm": 0.05235756550943364, + "language_loss": 0.8193745, + "learning_rate": 0.0008135856249855331, + "loss": 0.83023, + "num_input_tokens_seen": 131397568, + "router_z_loss_mlp": 0.32641602, + "step": 1589, + "time_per_iteration": 2.6717309951782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081523, + "balance_loss_mlp": 1.04900289, + "epoch": 0.305886879569065, + "flos": 633640485888.0, + "grad_norm": 0.06284243799371535, + "language_loss": 0.89691997, + "learning_rate": 0.0008133429096207398, + "loss": 0.90773523, + "num_input_tokens_seen": 131467632, + "router_z_loss_mlp": 0.32519531, + "step": 1590, + "time_per_iteration": 2.757962465286255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01037916, + "balance_loss_mlp": 1.02561319, + "epoch": 0.3060792612543286, + "flos": 1368227414016.0, + "grad_norm": 0.023218914608202516, + "language_loss": 0.75312257, + "learning_rate": 0.0008131000726054403, + "loss": 0.76350176, + "num_input_tokens_seen": 131702224, + "router_z_loss_mlp": 0.12304688, + "step": 1591, + "time_per_iteration": 4.927033185958862 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078048, + "balance_loss_mlp": 1.0450511, + "epoch": 0.30627164293959214, + "flos": 518290887168.0, + "grad_norm": 0.05132667013942606, + "language_loss": 0.86601979, + "learning_rate": 0.0008128571140339123, + "loss": 0.87680024, + "num_input_tokens_seen": 131774608, + "router_z_loss_mlp": 0.33007812, + "step": 1592, + "time_per_iteration": 2.627272367477417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075862, + "balance_loss_mlp": 1.0423162, + "epoch": 0.3064640246248557, + "flos": 455354168832.0, + "grad_norm": 0.054345541641725725, + "language_loss": 0.87405455, + "learning_rate": 0.0008126140340004805, + "loss": 0.88481319, + "num_input_tokens_seen": 131841216, + "router_z_loss_mlp": 0.33569336, + "step": 1593, + "time_per_iteration": 2.5047686100006104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076578, + "balance_loss_mlp": 1.04355717, + "epoch": 0.30665640631011926, + "flos": 849718480896.0, + "grad_norm": 0.04925367115496714, + "language_loss": 0.82262254, + "learning_rate": 0.0008123708325995172, + "loss": 0.83338827, + "num_input_tokens_seen": 131937584, + "router_z_loss_mlp": 0.33032227, + "step": 1594, + "time_per_iteration": 3.1693196296691895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107923, + "balance_loss_mlp": 1.04582715, + "epoch": 0.30684878799538284, + "flos": 757996406784.0, + "grad_norm": 0.04977841797679214, + "language_loss": 0.79901791, + "learning_rate": 0.0008121275099254414, + "loss": 0.80981016, + "num_input_tokens_seen": 132012656, + "router_z_loss_mlp": 0.33422852, + "step": 1595, + "time_per_iteration": 2.9197185039520264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108089, + "balance_loss_mlp": 1.04758275, + "epoch": 0.3070411696806464, + "flos": 517320428544.0, + "grad_norm": 0.05488318824662342, + "language_loss": 0.88300943, + "learning_rate": 0.0008118840660727194, + "loss": 0.89381832, + "num_input_tokens_seen": 132083728, + "router_z_loss_mlp": 0.33325195, + "step": 1596, + "time_per_iteration": 2.6452150344848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079954, + "balance_loss_mlp": 1.04788685, + "epoch": 0.30723355136590996, + "flos": 843883992576.0, + "grad_norm": 0.05612425557740203, + "language_loss": 0.87403214, + "learning_rate": 0.0008116405011358644, + "loss": 0.88483167, + "num_input_tokens_seen": 132170896, + "router_z_loss_mlp": 0.32055664, + "step": 1597, + "time_per_iteration": 3.135666608810425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084185, + "balance_loss_mlp": 1.05092525, + "epoch": 0.30742593305117355, + "flos": 465905710080.0, + "grad_norm": 0.05391343675647517, + "language_loss": 0.80005342, + "learning_rate": 0.0008113968152094369, + "loss": 0.81089526, + "num_input_tokens_seen": 132234592, + "router_z_loss_mlp": 0.33276367, + "step": 1598, + "time_per_iteration": 2.5122313499450684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076054, + "balance_loss_mlp": 1.04331923, + "epoch": 0.3076183147364371, + "flos": 686286120960.0, + "grad_norm": 0.04979397496165333, + "language_loss": 0.82305032, + "learning_rate": 0.0008111530083880438, + "loss": 0.83381081, + "num_input_tokens_seen": 132314720, + "router_z_loss_mlp": 0.32739258, + "step": 1599, + "time_per_iteration": 2.8883755207061768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072089, + "balance_loss_mlp": 1.03949702, + "epoch": 0.30781069642170067, + "flos": 613729032192.0, + "grad_norm": 0.059000164712882774, + "language_loss": 0.86657357, + "learning_rate": 0.0008109090807663399, + "loss": 0.87729448, + "num_input_tokens_seen": 132388768, + "router_z_loss_mlp": 0.32592773, + "step": 1600, + "time_per_iteration": 2.7799928188323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075122, + "balance_loss_mlp": 1.04260147, + "epoch": 0.3080030781069642, + "flos": 590021521920.0, + "grad_norm": 0.046450828420206536, + "language_loss": 0.88735926, + "learning_rate": 0.0008106650324390257, + "loss": 0.89811045, + "num_input_tokens_seen": 132472544, + "router_z_loss_mlp": 0.32519531, + "step": 1601, + "time_per_iteration": 2.7887444496154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071489, + "balance_loss_mlp": 1.03913534, + "epoch": 0.3081954597922278, + "flos": 562353601536.0, + "grad_norm": 0.06865077604181559, + "language_loss": 0.81335884, + "learning_rate": 0.0008104208635008493, + "loss": 0.82407373, + "num_input_tokens_seen": 132541968, + "router_z_loss_mlp": 0.32348633, + "step": 1602, + "time_per_iteration": 2.6526358127593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077173, + "balance_loss_mlp": 1.04448628, + "epoch": 0.3083878414774913, + "flos": 447599112192.0, + "grad_norm": 0.053973671264543166, + "language_loss": 0.81925714, + "learning_rate": 0.0008101765740466058, + "loss": 0.83002889, + "num_input_tokens_seen": 132606976, + "router_z_loss_mlp": 0.3269043, + "step": 1603, + "time_per_iteration": 2.5337142944335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073512, + "balance_loss_mlp": 1.04020488, + "epoch": 0.3085802231627549, + "flos": 493297205760.0, + "grad_norm": 0.05670542728571842, + "language_loss": 0.84135199, + "learning_rate": 0.0008099321641711364, + "loss": 0.85208714, + "num_input_tokens_seen": 132677984, + "router_z_loss_mlp": 0.33325195, + "step": 1604, + "time_per_iteration": 2.6616737842559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071402, + "balance_loss_mlp": 1.03804755, + "epoch": 0.3087726048480185, + "flos": 487437986304.0, + "grad_norm": 0.0517354770696361, + "language_loss": 0.8343811, + "learning_rate": 0.0008096876339693295, + "loss": 0.8450951, + "num_input_tokens_seen": 132749136, + "router_z_loss_mlp": 0.33374023, + "step": 1605, + "time_per_iteration": 2.6034042835235596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078765, + "balance_loss_mlp": 1.04412317, + "epoch": 0.308964986533282, + "flos": 730265877504.0, + "grad_norm": 0.0630488444124333, + "language_loss": 0.8123467, + "learning_rate": 0.0008094429835361206, + "loss": 0.8231343, + "num_input_tokens_seen": 132823824, + "router_z_loss_mlp": 0.34667969, + "step": 1606, + "time_per_iteration": 2.9442811012268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107155, + "balance_loss_mlp": 1.03788495, + "epoch": 0.3091573682185456, + "flos": 605131554816.0, + "grad_norm": 0.0490228515497239, + "language_loss": 0.85833865, + "learning_rate": 0.0008091982129664908, + "loss": 0.8690542, + "num_input_tokens_seen": 132895936, + "router_z_loss_mlp": 0.33691406, + "step": 1607, + "time_per_iteration": 2.734976053237915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077455, + "balance_loss_mlp": 1.04290783, + "epoch": 0.30934974990380915, + "flos": 460081396224.0, + "grad_norm": 0.04772079658934369, + "language_loss": 0.82646394, + "learning_rate": 0.0008089533223554687, + "loss": 0.83723843, + "num_input_tokens_seen": 132968960, + "router_z_loss_mlp": 0.34594727, + "step": 1608, + "time_per_iteration": 2.756741762161255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080172, + "balance_loss_mlp": 1.04669785, + "epoch": 0.30954213158907273, + "flos": 553142075904.0, + "grad_norm": 0.05499274022240881, + "language_loss": 0.8525604, + "learning_rate": 0.0008087083117981294, + "loss": 0.86336207, + "num_input_tokens_seen": 133048448, + "router_z_loss_mlp": 0.33496094, + "step": 1609, + "time_per_iteration": 2.9062788486480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081142, + "balance_loss_mlp": 1.04676199, + "epoch": 0.30973451327433627, + "flos": 552776901120.0, + "grad_norm": 0.0512798930400947, + "language_loss": 0.87996054, + "learning_rate": 0.0008084631813895943, + "loss": 0.89077199, + "num_input_tokens_seen": 133121680, + "router_z_loss_mlp": 0.34375, + "step": 1610, + "time_per_iteration": 2.789893627166748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079299, + "balance_loss_mlp": 1.04575384, + "epoch": 0.30992689495959985, + "flos": 565430893056.0, + "grad_norm": 0.07403274033744815, + "language_loss": 0.83632123, + "learning_rate": 0.0008082179312250315, + "loss": 0.84711421, + "num_input_tokens_seen": 133190176, + "router_z_loss_mlp": 0.33544922, + "step": 1611, + "time_per_iteration": 2.713533878326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040085, + "balance_loss_mlp": 1.02864099, + "epoch": 0.3101192766448634, + "flos": 1441621131264.0, + "grad_norm": 0.023040649208851512, + "language_loss": 0.79855847, + "learning_rate": 0.0008079725613996555, + "loss": 0.80895925, + "num_input_tokens_seen": 133420512, + "router_z_loss_mlp": 0.11425781, + "step": 1612, + "time_per_iteration": 4.866560459136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035595, + "balance_loss_mlp": 1.02448523, + "epoch": 0.31031165833012697, + "flos": 1531086575616.0, + "grad_norm": 0.021256554447441355, + "language_loss": 0.76629329, + "learning_rate": 0.0008077270720087273, + "loss": 0.77664924, + "num_input_tokens_seen": 133651984, + "router_z_loss_mlp": 0.11132812, + "step": 1613, + "time_per_iteration": 5.01593279838562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010886, + "balance_loss_mlp": 1.05483997, + "epoch": 0.31050404001539056, + "flos": 991534196736.0, + "grad_norm": 0.06253960188626659, + "language_loss": 0.81937206, + "learning_rate": 0.0008074814631475545, + "loss": 0.83025801, + "num_input_tokens_seen": 133741648, + "router_z_loss_mlp": 0.33789062, + "step": 1614, + "time_per_iteration": 3.3154871463775635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092644, + "balance_loss_mlp": 1.05940843, + "epoch": 0.3106964217006541, + "flos": 445748355072.0, + "grad_norm": 0.0719929788966035, + "language_loss": 0.78655052, + "learning_rate": 0.0008072357349114907, + "loss": 0.79747701, + "num_input_tokens_seen": 133813344, + "router_z_loss_mlp": 0.33251953, + "step": 1615, + "time_per_iteration": 2.6783502101898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084302, + "balance_loss_mlp": 1.05063736, + "epoch": 0.3108888033859177, + "flos": 510259405824.0, + "grad_norm": 0.06269338504314155, + "language_loss": 0.88523185, + "learning_rate": 0.0008069898873959363, + "loss": 0.89607489, + "num_input_tokens_seen": 133884192, + "router_z_loss_mlp": 0.33691406, + "step": 1616, + "time_per_iteration": 2.6805779933929443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092165, + "balance_loss_mlp": 1.05952573, + "epoch": 0.3110811850711812, + "flos": 520471913472.0, + "grad_norm": 0.06669389997650658, + "language_loss": 0.85964763, + "learning_rate": 0.0008067439206963375, + "loss": 0.87056935, + "num_input_tokens_seen": 133954848, + "router_z_loss_mlp": 0.32641602, + "step": 1617, + "time_per_iteration": 2.6084542274475098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091143, + "balance_loss_mlp": 1.05797851, + "epoch": 0.3112735667564448, + "flos": 686085299712.0, + "grad_norm": 0.06020913179087489, + "language_loss": 0.86049557, + "learning_rate": 0.0008064978349081873, + "loss": 0.87140703, + "num_input_tokens_seen": 134031824, + "router_z_loss_mlp": 0.33178711, + "step": 1618, + "time_per_iteration": 2.9622116088867188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089777, + "balance_loss_mlp": 1.05554032, + "epoch": 0.31146594844170833, + "flos": 532786871808.0, + "grad_norm": 0.04562356821057988, + "language_loss": 0.86218596, + "learning_rate": 0.0008062516301270245, + "loss": 0.87308377, + "num_input_tokens_seen": 134104480, + "router_z_loss_mlp": 0.3425293, + "step": 1619, + "time_per_iteration": 2.691589593887329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091127, + "balance_loss_mlp": 1.0575099, + "epoch": 0.3116583301269719, + "flos": 679187220480.0, + "grad_norm": 0.05429224886242875, + "language_loss": 0.88343138, + "learning_rate": 0.0008060053064484343, + "loss": 0.89434266, + "num_input_tokens_seen": 134185632, + "router_z_loss_mlp": 0.33642578, + "step": 1620, + "time_per_iteration": 2.936244487762451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096127, + "balance_loss_mlp": 1.06277251, + "epoch": 0.31185071181223545, + "flos": 585855908352.0, + "grad_norm": 0.05040245512912965, + "language_loss": 0.85009742, + "learning_rate": 0.0008057588639680482, + "loss": 0.86105865, + "num_input_tokens_seen": 134261600, + "router_z_loss_mlp": 0.33374023, + "step": 1621, + "time_per_iteration": 2.7633163928985596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107807, + "balance_loss_mlp": 1.07309282, + "epoch": 0.31204309349749904, + "flos": 725090517504.0, + "grad_norm": 0.06801147163116106, + "language_loss": 0.82624507, + "learning_rate": 0.0008055123027815434, + "loss": 0.83732307, + "num_input_tokens_seen": 134334368, + "router_z_loss_mlp": 0.34741211, + "step": 1622, + "time_per_iteration": 2.946943521499634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105084, + "balance_loss_mlp": 1.07156253, + "epoch": 0.3122354751827626, + "flos": 576558604800.0, + "grad_norm": 0.0611005921730787, + "language_loss": 0.85109818, + "learning_rate": 0.0008052656229846436, + "loss": 0.862149, + "num_input_tokens_seen": 134403824, + "router_z_loss_mlp": 0.33544922, + "step": 1623, + "time_per_iteration": 2.6431145668029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106208, + "balance_loss_mlp": 1.07106483, + "epoch": 0.31242785686802615, + "flos": 575672514048.0, + "grad_norm": 0.055122717603047884, + "language_loss": 0.90674621, + "learning_rate": 0.0008050188246731182, + "loss": 0.91780829, + "num_input_tokens_seen": 134471296, + "router_z_loss_mlp": 0.35180664, + "step": 1624, + "time_per_iteration": 2.6666738986968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101645, + "balance_loss_mlp": 1.06745625, + "epoch": 0.31262023855328974, + "flos": 736490271744.0, + "grad_norm": 0.05430344032768667, + "language_loss": 0.81962687, + "learning_rate": 0.0008047719079427834, + "loss": 0.8306433, + "num_input_tokens_seen": 134551360, + "router_z_loss_mlp": 0.34204102, + "step": 1625, + "time_per_iteration": 2.978775978088379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095181, + "balance_loss_mlp": 1.07791936, + "epoch": 0.3128126202385533, + "flos": 1558395113472.0, + "grad_norm": 0.034550759669135796, + "language_loss": 0.74351704, + "learning_rate": 0.0008045248728895, + "loss": 0.75446886, + "num_input_tokens_seen": 134761328, + "router_z_loss_mlp": 0.17285156, + "step": 1626, + "time_per_iteration": 4.800370931625366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109181, + "balance_loss_mlp": 1.05695319, + "epoch": 0.31300500192381686, + "flos": 514666538496.0, + "grad_norm": 0.04752817769408696, + "language_loss": 0.86259782, + "learning_rate": 0.0008042777196091757, + "loss": 0.87351596, + "num_input_tokens_seen": 134833136, + "router_z_loss_mlp": 0.34863281, + "step": 1627, + "time_per_iteration": 2.695350408554077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088728, + "balance_loss_mlp": 1.05301261, + "epoch": 0.3131973836090804, + "flos": 526370420736.0, + "grad_norm": 0.06407391520506579, + "language_loss": 0.8214981, + "learning_rate": 0.0008040304481977643, + "loss": 0.83238542, + "num_input_tokens_seen": 134904352, + "router_z_loss_mlp": 0.35742188, + "step": 1628, + "time_per_iteration": 2.6652393341064453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108342, + "balance_loss_mlp": 1.05030346, + "epoch": 0.313389765294344, + "flos": 822473961984.0, + "grad_norm": 0.08346342139557943, + "language_loss": 0.86950874, + "learning_rate": 0.0008037830587512649, + "loss": 0.88034296, + "num_input_tokens_seen": 134984880, + "router_z_loss_mlp": 0.33129883, + "step": 1629, + "time_per_iteration": 3.0668327808380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090413, + "balance_loss_mlp": 1.05651021, + "epoch": 0.31358214697960757, + "flos": 393604697088.0, + "grad_norm": 0.061409761762948115, + "language_loss": 0.78720629, + "learning_rate": 0.0008035355513657224, + "loss": 0.79811049, + "num_input_tokens_seen": 135047456, + "router_z_loss_mlp": 0.33935547, + "step": 1630, + "time_per_iteration": 2.5013740062713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087918, + "balance_loss_mlp": 1.05449188, + "epoch": 0.3137745286648711, + "flos": 571611617280.0, + "grad_norm": 0.049199842100191564, + "language_loss": 0.93020999, + "learning_rate": 0.0008032879261372279, + "loss": 0.94108921, + "num_input_tokens_seen": 135124256, + "router_z_loss_mlp": 0.33447266, + "step": 1631, + "time_per_iteration": 2.8559622764587402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088241, + "balance_loss_mlp": 1.07612944, + "epoch": 0.3139669103501347, + "flos": 1497614690304.0, + "grad_norm": 0.04267228885339989, + "language_loss": 0.79635841, + "learning_rate": 0.0008030401831619178, + "loss": 0.80724084, + "num_input_tokens_seen": 135353024, + "router_z_loss_mlp": 0.12109375, + "step": 1632, + "time_per_iteration": 5.726024627685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081932, + "balance_loss_mlp": 1.04886341, + "epoch": 0.3141592920353982, + "flos": 525090041856.0, + "grad_norm": 0.04986838794009694, + "language_loss": 0.87459773, + "learning_rate": 0.0008027923225359748, + "loss": 0.88541704, + "num_input_tokens_seen": 135422464, + "router_z_loss_mlp": 0.33081055, + "step": 1633, + "time_per_iteration": 2.599775791168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078913, + "balance_loss_mlp": 1.04465246, + "epoch": 0.3143516737206618, + "flos": 592989714432.0, + "grad_norm": 0.05680374588643473, + "language_loss": 0.8835839, + "learning_rate": 0.0008025443443556267, + "loss": 0.89437306, + "num_input_tokens_seen": 135490928, + "router_z_loss_mlp": 0.34301758, + "step": 1634, + "time_per_iteration": 2.7439024448394775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010804, + "balance_loss_mlp": 1.04776073, + "epoch": 0.31454405540592534, + "flos": 648034573824.0, + "grad_norm": 0.04764849369773053, + "language_loss": 0.88161099, + "learning_rate": 0.000802296248717147, + "loss": 0.89241499, + "num_input_tokens_seen": 135576288, + "router_z_loss_mlp": 0.32641602, + "step": 1635, + "time_per_iteration": 2.902290105819702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082457, + "balance_loss_mlp": 1.04850602, + "epoch": 0.3147364370911889, + "flos": 642543501312.0, + "grad_norm": 0.05380775409858787, + "language_loss": 0.79150212, + "learning_rate": 0.0008020480357168554, + "loss": 0.80232668, + "num_input_tokens_seen": 135652320, + "router_z_loss_mlp": 0.33984375, + "step": 1636, + "time_per_iteration": 2.7940564155578613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107486, + "balance_loss_mlp": 1.04176795, + "epoch": 0.31492881877645246, + "flos": 471607778304.0, + "grad_norm": 0.05509564816324918, + "language_loss": 0.88341308, + "learning_rate": 0.0008017997054511165, + "loss": 0.89416164, + "num_input_tokens_seen": 135719632, + "router_z_loss_mlp": 0.33105469, + "step": 1637, + "time_per_iteration": 2.596212148666382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075188, + "balance_loss_mlp": 1.04157114, + "epoch": 0.31512120046171604, + "flos": 629135838720.0, + "grad_norm": 0.0536589952194777, + "language_loss": 0.8549943, + "learning_rate": 0.0008015512580163407, + "loss": 0.86574614, + "num_input_tokens_seen": 135796544, + "router_z_loss_mlp": 0.33642578, + "step": 1638, + "time_per_iteration": 2.763343334197998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107569, + "balance_loss_mlp": 1.0416913, + "epoch": 0.31531358214697963, + "flos": 703460726784.0, + "grad_norm": 0.0636877441873346, + "language_loss": 0.80888116, + "learning_rate": 0.0008013026935089838, + "loss": 0.81963813, + "num_input_tokens_seen": 135871344, + "router_z_loss_mlp": 0.34033203, + "step": 1639, + "time_per_iteration": 2.9786083698272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070933, + "balance_loss_mlp": 1.03798366, + "epoch": 0.31550596383224316, + "flos": 572275127808.0, + "grad_norm": 0.055086353977466425, + "language_loss": 0.83909047, + "learning_rate": 0.0008010540120255472, + "loss": 0.84979975, + "num_input_tokens_seen": 135944320, + "router_z_loss_mlp": 0.32958984, + "step": 1640, + "time_per_iteration": 2.666520357131958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075633, + "balance_loss_mlp": 1.04196858, + "epoch": 0.31569834551750675, + "flos": 658047822336.0, + "grad_norm": 0.06483249406864507, + "language_loss": 0.86052895, + "learning_rate": 0.0008008052136625774, + "loss": 0.8712852, + "num_input_tokens_seen": 136019456, + "router_z_loss_mlp": 0.33691406, + "step": 1641, + "time_per_iteration": 2.8062589168548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078165, + "balance_loss_mlp": 1.04407096, + "epoch": 0.3158907272027703, + "flos": 566002681344.0, + "grad_norm": 0.05792040128516231, + "language_loss": 0.86837387, + "learning_rate": 0.0008005562985166666, + "loss": 0.87915552, + "num_input_tokens_seen": 136091232, + "router_z_loss_mlp": 0.34130859, + "step": 1642, + "time_per_iteration": 2.6996512413024902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081029, + "balance_loss_mlp": 1.04576707, + "epoch": 0.31608310888803387, + "flos": 536622216192.0, + "grad_norm": 0.04642534602938139, + "language_loss": 0.84936821, + "learning_rate": 0.0008003072666844524, + "loss": 0.86017853, + "num_input_tokens_seen": 136165088, + "router_z_loss_mlp": 0.35302734, + "step": 1643, + "time_per_iteration": 2.6999776363372803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078239, + "balance_loss_mlp": 1.04292917, + "epoch": 0.3162754905732974, + "flos": 486428239872.0, + "grad_norm": 0.08271259063406261, + "language_loss": 0.82613683, + "learning_rate": 0.0008000581182626173, + "loss": 0.83691919, + "num_input_tokens_seen": 136230368, + "router_z_loss_mlp": 0.35302734, + "step": 1644, + "time_per_iteration": 2.541093111038208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082389, + "balance_loss_mlp": 1.04893875, + "epoch": 0.316467872258561, + "flos": 529792538112.0, + "grad_norm": 0.058359985905672214, + "language_loss": 0.86275887, + "learning_rate": 0.0007998088533478894, + "loss": 0.87358278, + "num_input_tokens_seen": 136302512, + "router_z_loss_mlp": 0.33447266, + "step": 1645, + "time_per_iteration": 2.641402006149292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077931, + "balance_loss_mlp": 1.04309845, + "epoch": 0.3166602539438245, + "flos": 443197771776.0, + "grad_norm": 0.07387441321187599, + "language_loss": 0.84062803, + "learning_rate": 0.000799559472037042, + "loss": 0.85140741, + "num_input_tokens_seen": 136368064, + "router_z_loss_mlp": 0.34887695, + "step": 1646, + "time_per_iteration": 2.5274438858032227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076892, + "balance_loss_mlp": 1.04222584, + "epoch": 0.3168526356290881, + "flos": 645513103872.0, + "grad_norm": 0.053861363144643716, + "language_loss": 0.87875295, + "learning_rate": 0.0007993099744268932, + "loss": 0.8895219, + "num_input_tokens_seen": 136451520, + "router_z_loss_mlp": 0.34716797, + "step": 1647, + "time_per_iteration": 2.8893649578094482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070585, + "balance_loss_mlp": 1.03646684, + "epoch": 0.3170450173143517, + "flos": 585889403904.0, + "grad_norm": 0.05841982976759713, + "language_loss": 0.87792766, + "learning_rate": 0.000799060360614307, + "loss": 0.88863349, + "num_input_tokens_seen": 136521184, + "router_z_loss_mlp": 0.34155273, + "step": 1648, + "time_per_iteration": 2.6867480278015137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076826, + "balance_loss_mlp": 1.04273248, + "epoch": 0.3172373989996152, + "flos": 826763231232.0, + "grad_norm": 0.05654214693871822, + "language_loss": 0.83848637, + "learning_rate": 0.0007988106306961917, + "loss": 0.84925467, + "num_input_tokens_seen": 136612592, + "router_z_loss_mlp": 0.34130859, + "step": 1649, + "time_per_iteration": 3.1321003437042236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080036, + "balance_loss_mlp": 1.04577541, + "epoch": 0.3174297806848788, + "flos": 527153204736.0, + "grad_norm": 0.060794493166337976, + "language_loss": 0.84529203, + "learning_rate": 0.0007985607847695014, + "loss": 0.85609239, + "num_input_tokens_seen": 136684336, + "router_z_loss_mlp": 0.34301758, + "step": 1650, + "time_per_iteration": 2.6306049823760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081772, + "balance_loss_mlp": 1.04851258, + "epoch": 0.31762216237014235, + "flos": 712855544832.0, + "grad_norm": 0.05325998456044798, + "language_loss": 0.82638443, + "learning_rate": 0.0007983108229312345, + "loss": 0.83720207, + "num_input_tokens_seen": 136766400, + "router_z_loss_mlp": 0.33276367, + "step": 1651, + "time_per_iteration": 2.909571647644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108965, + "balance_loss_mlp": 1.05567503, + "epoch": 0.31781454405540593, + "flos": 483567736320.0, + "grad_norm": 0.0653784528473409, + "language_loss": 0.86672306, + "learning_rate": 0.0007980607452784351, + "loss": 0.87761962, + "num_input_tokens_seen": 136834016, + "router_z_loss_mlp": 0.33984375, + "step": 1652, + "time_per_iteration": 2.5339765548706055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100389, + "balance_loss_mlp": 1.06639075, + "epoch": 0.31800692574066947, + "flos": 548483249664.0, + "grad_norm": 0.0685029555550019, + "language_loss": 0.90562367, + "learning_rate": 0.0007978105519081919, + "loss": 0.91662765, + "num_input_tokens_seen": 136906288, + "router_z_loss_mlp": 0.34008789, + "step": 1653, + "time_per_iteration": 2.6916213035583496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096542, + "balance_loss_mlp": 1.06213784, + "epoch": 0.31819930742593305, + "flos": 516640951296.0, + "grad_norm": 0.07941193091019123, + "language_loss": 0.87969935, + "learning_rate": 0.0007975602429176385, + "loss": 0.89066482, + "num_input_tokens_seen": 136972416, + "router_z_loss_mlp": 0.34423828, + "step": 1654, + "time_per_iteration": 2.5997297763824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100835, + "balance_loss_mlp": 1.06695616, + "epoch": 0.31839168911119664, + "flos": 455748456960.0, + "grad_norm": 0.07129171690745044, + "language_loss": 0.81582803, + "learning_rate": 0.0007973098184039536, + "loss": 0.82683635, + "num_input_tokens_seen": 137044576, + "router_z_loss_mlp": 0.33911133, + "step": 1655, + "time_per_iteration": 2.654914140701294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096724, + "balance_loss_mlp": 1.06284511, + "epoch": 0.3185840707964602, + "flos": 625719513600.0, + "grad_norm": 0.05658637496419385, + "language_loss": 0.86710656, + "learning_rate": 0.0007970592784643602, + "loss": 0.87807381, + "num_input_tokens_seen": 137125120, + "router_z_loss_mlp": 0.33911133, + "step": 1656, + "time_per_iteration": 2.846390962600708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090807, + "balance_loss_mlp": 1.05719042, + "epoch": 0.31877645248172376, + "flos": 567213249024.0, + "grad_norm": 0.058346793379709355, + "language_loss": 0.85032123, + "learning_rate": 0.0007968086231961272, + "loss": 0.8612293, + "num_input_tokens_seen": 137195344, + "router_z_loss_mlp": 0.33642578, + "step": 1657, + "time_per_iteration": 2.652986526489258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094073, + "balance_loss_mlp": 1.0593828, + "epoch": 0.3189688341669873, + "flos": 489338205696.0, + "grad_norm": 0.08644842740903268, + "language_loss": 0.836254, + "learning_rate": 0.0007965578526965671, + "loss": 0.84719473, + "num_input_tokens_seen": 137261040, + "router_z_loss_mlp": 0.34741211, + "step": 1658, + "time_per_iteration": 2.5607872009277344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092531, + "balance_loss_mlp": 1.05860353, + "epoch": 0.3191612158522509, + "flos": 575948938752.0, + "grad_norm": 0.04707712809776705, + "language_loss": 0.86020696, + "learning_rate": 0.0007963069670630377, + "loss": 0.87113225, + "num_input_tokens_seen": 137334400, + "router_z_loss_mlp": 0.33959961, + "step": 1659, + "time_per_iteration": 2.7435247898101807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097687, + "balance_loss_mlp": 1.06447566, + "epoch": 0.3193535975375144, + "flos": 537867689472.0, + "grad_norm": 0.062321727217464123, + "language_loss": 0.87956834, + "learning_rate": 0.0007960559663929416, + "loss": 0.89054519, + "num_input_tokens_seen": 137405344, + "router_z_loss_mlp": 0.33227539, + "step": 1660, + "time_per_iteration": 2.6282846927642822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096682, + "balance_loss_mlp": 1.06265998, + "epoch": 0.319545979222778, + "flos": 733954245120.0, + "grad_norm": 0.07201541894751945, + "language_loss": 0.87465358, + "learning_rate": 0.0007958048507837259, + "loss": 0.88562042, + "num_input_tokens_seen": 137486016, + "router_z_loss_mlp": 0.34057617, + "step": 1661, + "time_per_iteration": 2.9250974655151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099132, + "balance_loss_mlp": 1.0647999, + "epoch": 0.31973836090804153, + "flos": 764136433152.0, + "grad_norm": 0.0721917610669121, + "language_loss": 0.87230003, + "learning_rate": 0.0007955536203328822, + "loss": 0.8832913, + "num_input_tokens_seen": 137562304, + "router_z_loss_mlp": 0.34375, + "step": 1662, + "time_per_iteration": 2.899735450744629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109485, + "balance_loss_mlp": 1.06109047, + "epoch": 0.3199307425933051, + "flos": 560252560896.0, + "grad_norm": 0.06666532975578916, + "language_loss": 0.83308822, + "learning_rate": 0.0007953022751379469, + "loss": 0.84403676, + "num_input_tokens_seen": 137639248, + "router_z_loss_mlp": 0.33789062, + "step": 1663, + "time_per_iteration": 2.7743418216705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093375, + "balance_loss_mlp": 1.05899549, + "epoch": 0.3201231242785687, + "flos": 751019751936.0, + "grad_norm": 0.058114271957014456, + "language_loss": 0.81677037, + "learning_rate": 0.000795050815296501, + "loss": 0.82770407, + "num_input_tokens_seen": 137718256, + "router_z_loss_mlp": 0.34399414, + "step": 1664, + "time_per_iteration": 2.9620323181152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091229, + "balance_loss_mlp": 1.05768323, + "epoch": 0.32031550596383224, + "flos": 496157709312.0, + "grad_norm": 0.061791342299560625, + "language_loss": 0.93274921, + "learning_rate": 0.0007947992409061695, + "loss": 0.94366151, + "num_input_tokens_seen": 137785216, + "router_z_loss_mlp": 0.33569336, + "step": 1665, + "time_per_iteration": 2.6097099781036377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083934, + "balance_loss_mlp": 1.05022132, + "epoch": 0.3205078876490958, + "flos": 731294562816.0, + "grad_norm": 0.05133774923717053, + "language_loss": 0.86471802, + "learning_rate": 0.0007945475520646226, + "loss": 0.8755573, + "num_input_tokens_seen": 137863424, + "router_z_loss_mlp": 0.33740234, + "step": 1666, + "time_per_iteration": 2.9224231243133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088823, + "balance_loss_mlp": 1.05487204, + "epoch": 0.32070026933435936, + "flos": 549177283584.0, + "grad_norm": 0.1345109768982335, + "language_loss": 0.8496111, + "learning_rate": 0.0007942957488695743, + "loss": 0.86049932, + "num_input_tokens_seen": 137930384, + "router_z_loss_mlp": 0.33959961, + "step": 1667, + "time_per_iteration": 2.6267666816711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086537, + "balance_loss_mlp": 1.05265749, + "epoch": 0.32089265101962294, + "flos": 744949536768.0, + "grad_norm": 0.061316479944915916, + "language_loss": 0.80963373, + "learning_rate": 0.0007940438314187833, + "loss": 0.82049918, + "num_input_tokens_seen": 138017200, + "router_z_loss_mlp": 0.33886719, + "step": 1668, + "time_per_iteration": 3.00421142578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089646, + "balance_loss_mlp": 1.05638647, + "epoch": 0.3210850327048865, + "flos": 493937395200.0, + "grad_norm": 0.05864654089818211, + "language_loss": 0.80047274, + "learning_rate": 0.0007937917998100529, + "loss": 0.81136918, + "num_input_tokens_seen": 138084048, + "router_z_loss_mlp": 0.33276367, + "step": 1669, + "time_per_iteration": 2.607917070388794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100772, + "balance_loss_mlp": 1.06610548, + "epoch": 0.32127741439015006, + "flos": 530383265280.0, + "grad_norm": 0.060159342011431034, + "language_loss": 0.78680766, + "learning_rate": 0.0007935396541412302, + "loss": 0.79781532, + "num_input_tokens_seen": 138153280, + "router_z_loss_mlp": 0.34692383, + "step": 1670, + "time_per_iteration": 2.6022346019744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108678, + "balance_loss_mlp": 1.07458389, + "epoch": 0.3214697960754136, + "flos": 500948955648.0, + "grad_norm": 0.07085567852213893, + "language_loss": 0.85879421, + "learning_rate": 0.0007932873945102068, + "loss": 0.86988097, + "num_input_tokens_seen": 138222320, + "router_z_loss_mlp": 0.34130859, + "step": 1671, + "time_per_iteration": 2.581815719604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120776, + "balance_loss_mlp": 1.10942781, + "epoch": 0.3216621777606772, + "flos": 1382579394048.0, + "grad_norm": 0.04969555951860313, + "language_loss": 0.75761777, + "learning_rate": 0.0007930350210149188, + "loss": 0.76882553, + "num_input_tokens_seen": 138449488, + "router_z_loss_mlp": 0.11328125, + "step": 1672, + "time_per_iteration": 4.821724891662598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106649, + "balance_loss_mlp": 1.07193518, + "epoch": 0.32185455944594077, + "flos": 571260999168.0, + "grad_norm": 0.05896773993357689, + "language_loss": 0.86527109, + "learning_rate": 0.0007927825337533461, + "loss": 0.87633765, + "num_input_tokens_seen": 138522496, + "router_z_loss_mlp": 0.34765625, + "step": 1673, + "time_per_iteration": 2.6640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105888, + "balance_loss_mlp": 1.07184219, + "epoch": 0.3220469411312043, + "flos": 543652715520.0, + "grad_norm": 0.06618360944756078, + "language_loss": 0.84761298, + "learning_rate": 0.0007925299328235131, + "loss": 0.85867184, + "num_input_tokens_seen": 138590096, + "router_z_loss_mlp": 0.34057617, + "step": 1674, + "time_per_iteration": 2.6524405479431152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102416, + "balance_loss_mlp": 1.06705832, + "epoch": 0.3222393228164679, + "flos": 490884834816.0, + "grad_norm": 0.05872681692102293, + "language_loss": 0.85148364, + "learning_rate": 0.000792277218323488, + "loss": 0.86250782, + "num_input_tokens_seen": 138658224, + "router_z_loss_mlp": 0.35424805, + "step": 1675, + "time_per_iteration": 2.557460069656372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100116, + "balance_loss_mlp": 1.06523526, + "epoch": 0.3224317045017314, + "flos": 490145720832.0, + "grad_norm": 0.05188137415598196, + "language_loss": 0.84647608, + "learning_rate": 0.0007920243903513833, + "loss": 0.85747719, + "num_input_tokens_seen": 138722864, + "router_z_loss_mlp": 0.34912109, + "step": 1676, + "time_per_iteration": 2.5208208560943604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092752, + "balance_loss_mlp": 1.05813313, + "epoch": 0.322624086186995, + "flos": 575505188352.0, + "grad_norm": 0.06429192544800656, + "language_loss": 0.83992624, + "learning_rate": 0.0007917714490053556, + "loss": 0.8508538, + "num_input_tokens_seen": 138791472, + "router_z_loss_mlp": 0.34667969, + "step": 1677, + "time_per_iteration": 2.653686761856079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109123, + "balance_loss_mlp": 1.05668271, + "epoch": 0.32281646787225854, + "flos": 628974305280.0, + "grad_norm": 0.048890211607645416, + "language_loss": 0.86094737, + "learning_rate": 0.0007915183943836055, + "loss": 0.87185967, + "num_input_tokens_seen": 138873424, + "router_z_loss_mlp": 0.34594727, + "step": 1678, + "time_per_iteration": 2.852612018585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083818, + "balance_loss_mlp": 1.04950905, + "epoch": 0.3230088495575221, + "flos": 781036024320.0, + "grad_norm": 0.05620908679364121, + "language_loss": 0.83880055, + "learning_rate": 0.0007912652265843773, + "loss": 0.8496387, + "num_input_tokens_seen": 138956880, + "router_z_loss_mlp": 0.34350586, + "step": 1679, + "time_per_iteration": 3.006805419921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077767, + "balance_loss_mlp": 1.0433867, + "epoch": 0.3232012312427857, + "flos": 535839432192.0, + "grad_norm": 0.04762836982551939, + "language_loss": 0.81587136, + "learning_rate": 0.0007910119457059597, + "loss": 0.82664907, + "num_input_tokens_seen": 139031296, + "router_z_loss_mlp": 0.34423828, + "step": 1680, + "time_per_iteration": 2.6930737495422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076411, + "balance_loss_mlp": 1.04129148, + "epoch": 0.32339361292804925, + "flos": 704515553280.0, + "grad_norm": 0.06110281418881611, + "language_loss": 0.80031025, + "learning_rate": 0.0007907585518466849, + "loss": 0.81107438, + "num_input_tokens_seen": 139109776, + "router_z_loss_mlp": 0.35180664, + "step": 1681, + "time_per_iteration": 2.940950870513916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081204, + "balance_loss_mlp": 1.04603744, + "epoch": 0.32358599461331283, + "flos": 452099377152.0, + "grad_norm": 0.0474614445796137, + "language_loss": 0.90124965, + "learning_rate": 0.000790505045104929, + "loss": 0.91206169, + "num_input_tokens_seen": 139174736, + "router_z_loss_mlp": 0.35205078, + "step": 1682, + "time_per_iteration": 2.4919469356536865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078264, + "balance_loss_mlp": 1.0435977, + "epoch": 0.32377837629857636, + "flos": 600597794304.0, + "grad_norm": 0.057051782898604, + "language_loss": 0.86989701, + "learning_rate": 0.0007902514255791125, + "loss": 0.88067961, + "num_input_tokens_seen": 139252064, + "router_z_loss_mlp": 0.34692383, + "step": 1683, + "time_per_iteration": 2.7545859813690186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108269, + "balance_loss_mlp": 1.04721308, + "epoch": 0.32397075798383995, + "flos": 807180636672.0, + "grad_norm": 0.05145240385219177, + "language_loss": 0.87981123, + "learning_rate": 0.0007899976933676986, + "loss": 0.89063811, + "num_input_tokens_seen": 139333328, + "router_z_loss_mlp": 0.35498047, + "step": 1684, + "time_per_iteration": 2.97807240486145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081365, + "balance_loss_mlp": 1.04638934, + "epoch": 0.3241631396691035, + "flos": 601414073856.0, + "grad_norm": 0.06429290680378846, + "language_loss": 0.8767072, + "learning_rate": 0.0007897438485691955, + "loss": 0.88752091, + "num_input_tokens_seen": 139400976, + "router_z_loss_mlp": 0.3503418, + "step": 1685, + "time_per_iteration": 2.704326868057251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083548, + "balance_loss_mlp": 1.04826176, + "epoch": 0.32435552135436707, + "flos": 473980861440.0, + "grad_norm": 0.058364951070402814, + "language_loss": 0.82023847, + "learning_rate": 0.0007894898912821542, + "loss": 0.831074, + "num_input_tokens_seen": 139465664, + "router_z_loss_mlp": 0.3527832, + "step": 1686, + "time_per_iteration": 2.5206680297851562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076257, + "balance_loss_mlp": 1.04178166, + "epoch": 0.3245479030396306, + "flos": 537824019456.0, + "grad_norm": 0.04476181031616706, + "language_loss": 0.86661267, + "learning_rate": 0.0007892358216051695, + "loss": 0.87737525, + "num_input_tokens_seen": 139541984, + "router_z_loss_mlp": 0.3449707, + "step": 1687, + "time_per_iteration": 2.7332026958465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075747, + "balance_loss_mlp": 1.04108071, + "epoch": 0.3247402847248942, + "flos": 547394927616.0, + "grad_norm": 0.05643246072623682, + "language_loss": 0.92275292, + "learning_rate": 0.0007889816396368803, + "loss": 0.93351042, + "num_input_tokens_seen": 139607408, + "router_z_loss_mlp": 0.34692383, + "step": 1688, + "time_per_iteration": 2.6158432960510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077832, + "balance_loss_mlp": 1.04388082, + "epoch": 0.3249326664101578, + "flos": 377941814784.0, + "grad_norm": 0.04953960067471088, + "language_loss": 0.85575634, + "learning_rate": 0.0007887273454759687, + "loss": 0.86653465, + "num_input_tokens_seen": 139670000, + "router_z_loss_mlp": 0.33984375, + "step": 1689, + "time_per_iteration": 2.4958317279815674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075156, + "balance_loss_mlp": 1.04051399, + "epoch": 0.3251250480954213, + "flos": 527818125312.0, + "grad_norm": 0.050587956688220255, + "language_loss": 0.82717729, + "learning_rate": 0.0007884729392211603, + "loss": 0.83792883, + "num_input_tokens_seen": 139739872, + "router_z_loss_mlp": 0.34692383, + "step": 1690, + "time_per_iteration": 2.6325736045837402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075634, + "balance_loss_mlp": 1.04113472, + "epoch": 0.3253174297806849, + "flos": 449435312640.0, + "grad_norm": 0.06211432544239721, + "language_loss": 0.85214412, + "learning_rate": 0.0007882184209712245, + "loss": 0.8629005, + "num_input_tokens_seen": 139802032, + "router_z_loss_mlp": 0.34545898, + "step": 1691, + "time_per_iteration": 2.5199012756347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076936, + "balance_loss_mlp": 1.04303288, + "epoch": 0.32550981146594843, + "flos": 703855014912.0, + "grad_norm": 0.0444021152083115, + "language_loss": 0.85646939, + "learning_rate": 0.000787963790824974, + "loss": 0.86723876, + "num_input_tokens_seen": 139885648, + "router_z_loss_mlp": 0.33935547, + "step": 1692, + "time_per_iteration": 2.9585483074188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076864, + "balance_loss_mlp": 1.04217362, + "epoch": 0.325702193151212, + "flos": 392491643904.0, + "grad_norm": 0.06035071191190156, + "language_loss": 0.89588344, + "learning_rate": 0.0007877090488812651, + "loss": 0.90665203, + "num_input_tokens_seen": 139947920, + "router_z_loss_mlp": 0.34716797, + "step": 1693, + "time_per_iteration": 2.4247992038726807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073167, + "balance_loss_mlp": 1.0379529, + "epoch": 0.32589457483647555, + "flos": 577223525376.0, + "grad_norm": 0.051335929306222446, + "language_loss": 0.83377099, + "learning_rate": 0.0007874541952389973, + "loss": 0.84450269, + "num_input_tokens_seen": 140020048, + "router_z_loss_mlp": 0.35253906, + "step": 1694, + "time_per_iteration": 2.679868459701538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074307, + "balance_loss_mlp": 1.03947401, + "epoch": 0.32608695652173914, + "flos": 498092834304.0, + "grad_norm": 0.051580366849716015, + "language_loss": 0.86795568, + "learning_rate": 0.0007871992299971136, + "loss": 0.87869877, + "num_input_tokens_seen": 140085600, + "router_z_loss_mlp": 0.34887695, + "step": 1695, + "time_per_iteration": 2.6005072593688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079265, + "balance_loss_mlp": 1.04431272, + "epoch": 0.32627933820700267, + "flos": 590858150400.0, + "grad_norm": 0.054409905067417906, + "language_loss": 0.84529006, + "learning_rate": 0.0007869441532546001, + "loss": 0.85608268, + "num_input_tokens_seen": 140155152, + "router_z_loss_mlp": 0.34985352, + "step": 1696, + "time_per_iteration": 2.7373292446136475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071229, + "balance_loss_mlp": 1.03749299, + "epoch": 0.32647171989226625, + "flos": 608790809088.0, + "grad_norm": 0.05196776598603691, + "language_loss": 0.79551816, + "learning_rate": 0.0007866889651104867, + "loss": 0.80623043, + "num_input_tokens_seen": 140228560, + "router_z_loss_mlp": 0.33764648, + "step": 1697, + "time_per_iteration": 2.768869638442993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069598, + "balance_loss_mlp": 1.03464603, + "epoch": 0.32666410157752984, + "flos": 476896619520.0, + "grad_norm": 0.05699082390473629, + "language_loss": 0.83313, + "learning_rate": 0.000786433665663846, + "loss": 0.84382606, + "num_input_tokens_seen": 140297952, + "router_z_loss_mlp": 0.34985352, + "step": 1698, + "time_per_iteration": 2.6574184894561768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070595, + "balance_loss_mlp": 1.03664398, + "epoch": 0.3268564832627934, + "flos": 718060018176.0, + "grad_norm": 0.0499104315286651, + "language_loss": 0.86441195, + "learning_rate": 0.0007861782550137942, + "loss": 0.8751179, + "num_input_tokens_seen": 140373408, + "router_z_loss_mlp": 0.33984375, + "step": 1699, + "time_per_iteration": 2.8897016048431396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071541, + "balance_loss_mlp": 1.0379957, + "epoch": 0.32704886494805696, + "flos": 768469372416.0, + "grad_norm": 0.05892131453680714, + "language_loss": 0.85990739, + "learning_rate": 0.0007859227332594901, + "loss": 0.87062275, + "num_input_tokens_seen": 140451840, + "router_z_loss_mlp": 0.33569336, + "step": 1700, + "time_per_iteration": 2.8941755294799805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080492, + "balance_loss_mlp": 1.046803, + "epoch": 0.3272412466333205, + "flos": 849540980736.0, + "grad_norm": 0.0647173620985618, + "language_loss": 0.84537613, + "learning_rate": 0.0007856671005001365, + "loss": 0.85618103, + "num_input_tokens_seen": 140537696, + "router_z_loss_mlp": 0.3371582, + "step": 1701, + "time_per_iteration": 3.1362555027008057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107778, + "balance_loss_mlp": 1.04373336, + "epoch": 0.3274336283185841, + "flos": 831224208384.0, + "grad_norm": 0.055785838120560656, + "language_loss": 0.81608075, + "learning_rate": 0.0007854113568349787, + "loss": 0.82685852, + "num_input_tokens_seen": 140623536, + "router_z_loss_mlp": 0.34082031, + "step": 1702, + "time_per_iteration": 3.0684425830841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108348, + "balance_loss_mlp": 1.04900455, + "epoch": 0.3276260100038476, + "flos": 691721938944.0, + "grad_norm": 0.059478075679183354, + "language_loss": 0.80008304, + "learning_rate": 0.0007851555023633052, + "loss": 0.81091785, + "num_input_tokens_seen": 140700688, + "router_z_loss_mlp": 0.34521484, + "step": 1703, + "time_per_iteration": 2.829838991165161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083551, + "balance_loss_mlp": 1.04974365, + "epoch": 0.3278183916891112, + "flos": 435831211008.0, + "grad_norm": 0.05938301584715095, + "language_loss": 0.82290888, + "learning_rate": 0.0007848995371844474, + "loss": 0.83374435, + "num_input_tokens_seen": 140765808, + "router_z_loss_mlp": 0.33837891, + "step": 1704, + "time_per_iteration": 2.498462200164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108751, + "balance_loss_mlp": 1.0532254, + "epoch": 0.3280107733743748, + "flos": 460883119104.0, + "grad_norm": 0.06015932024064871, + "language_loss": 0.80622214, + "learning_rate": 0.0007846434613977801, + "loss": 0.81709725, + "num_input_tokens_seen": 140830512, + "router_z_loss_mlp": 0.34326172, + "step": 1705, + "time_per_iteration": 2.4933369159698486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087045, + "balance_loss_mlp": 1.05330932, + "epoch": 0.3282031550596383, + "flos": 679018484736.0, + "grad_norm": 0.05558890685700398, + "language_loss": 0.78265798, + "learning_rate": 0.0007843872751027203, + "loss": 0.7935285, + "num_input_tokens_seen": 140902816, + "router_z_loss_mlp": 0.33764648, + "step": 1706, + "time_per_iteration": 2.81091046333313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090191, + "balance_loss_mlp": 1.05621648, + "epoch": 0.3283955367449019, + "flos": 544821023232.0, + "grad_norm": 0.10097233050810657, + "language_loss": 0.87312186, + "learning_rate": 0.0007841309783987287, + "loss": 0.88402379, + "num_input_tokens_seen": 140975488, + "router_z_loss_mlp": 0.34008789, + "step": 1707, + "time_per_iteration": 2.7456212043762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083118, + "balance_loss_mlp": 1.0490005, + "epoch": 0.32858791843016544, + "flos": 481017153024.0, + "grad_norm": 0.06288690811568091, + "language_loss": 0.89185357, + "learning_rate": 0.0007838745713853084, + "loss": 0.90268475, + "num_input_tokens_seen": 141043248, + "router_z_loss_mlp": 0.34155273, + "step": 1708, + "time_per_iteration": 2.5734565258026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086944, + "balance_loss_mlp": 1.0529933, + "epoch": 0.328780300115429, + "flos": 566529389568.0, + "grad_norm": 0.059735917623485235, + "language_loss": 0.84101981, + "learning_rate": 0.0007836180541620053, + "loss": 0.85188925, + "num_input_tokens_seen": 141119408, + "router_z_loss_mlp": 0.33984375, + "step": 1709, + "time_per_iteration": 2.6734848022460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082239, + "balance_loss_mlp": 1.04843152, + "epoch": 0.32897268180069256, + "flos": 475787948544.0, + "grad_norm": 0.06557165815913592, + "language_loss": 0.86666405, + "learning_rate": 0.0007833614268284082, + "loss": 0.87748647, + "num_input_tokens_seen": 141184112, + "router_z_loss_mlp": 0.33813477, + "step": 1710, + "time_per_iteration": 2.5004236698150635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109204, + "balance_loss_mlp": 1.07611382, + "epoch": 0.32916506348595614, + "flos": 1576517008896.0, + "grad_norm": 0.028486921929439343, + "language_loss": 0.74109769, + "learning_rate": 0.0007831046894841489, + "loss": 0.75201809, + "num_input_tokens_seen": 141414960, + "router_z_loss_mlp": 0.15917969, + "step": 1711, + "time_per_iteration": 4.857421159744263 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084065, + "balance_loss_mlp": 1.05011439, + "epoch": 0.3293574451712197, + "flos": 482646739968.0, + "grad_norm": 0.05383776069577274, + "language_loss": 0.78376174, + "learning_rate": 0.0007828478422289016, + "loss": 0.79460239, + "num_input_tokens_seen": 141485744, + "router_z_loss_mlp": 0.33984375, + "step": 1712, + "time_per_iteration": 2.5763661861419678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089524, + "balance_loss_mlp": 1.05438161, + "epoch": 0.32954982685648326, + "flos": 622266872832.0, + "grad_norm": 0.05220026625301518, + "language_loss": 0.89185119, + "learning_rate": 0.0007825908851623833, + "loss": 0.90274644, + "num_input_tokens_seen": 141560592, + "router_z_loss_mlp": 0.35205078, + "step": 1713, + "time_per_iteration": 2.7262768745422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081531, + "balance_loss_mlp": 1.04648352, + "epoch": 0.32974220854174685, + "flos": 544697367552.0, + "grad_norm": 0.06806070360888057, + "language_loss": 0.85360491, + "learning_rate": 0.0007823338183843533, + "loss": 0.86442018, + "num_input_tokens_seen": 141630400, + "router_z_loss_mlp": 0.35083008, + "step": 1714, + "time_per_iteration": 2.652278184890747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078166, + "balance_loss_mlp": 1.0447638, + "epoch": 0.3299345902270104, + "flos": 981740708352.0, + "grad_norm": 0.05603975865081876, + "language_loss": 0.8075726, + "learning_rate": 0.0007820766419946141, + "loss": 0.81835425, + "num_input_tokens_seen": 141721552, + "router_z_loss_mlp": 0.33422852, + "step": 1715, + "time_per_iteration": 3.282278537750244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087722, + "balance_loss_mlp": 1.07227242, + "epoch": 0.33012697191227397, + "flos": 1402857432576.0, + "grad_norm": 0.02753251532821737, + "language_loss": 0.7967248, + "learning_rate": 0.0007818193560930102, + "loss": 0.80760199, + "num_input_tokens_seen": 141956464, + "router_z_loss_mlp": 0.15429688, + "step": 1716, + "time_per_iteration": 4.925649881362915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081747, + "balance_loss_mlp": 1.04789162, + "epoch": 0.3303193535975375, + "flos": 504897781248.0, + "grad_norm": 0.09582479469105179, + "language_loss": 0.75968826, + "learning_rate": 0.0007815619607794288, + "loss": 0.77050573, + "num_input_tokens_seen": 142029552, + "router_z_loss_mlp": 0.33886719, + "step": 1717, + "time_per_iteration": 2.653355598449707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077451, + "balance_loss_mlp": 1.04423952, + "epoch": 0.3305117352828011, + "flos": 937602390528.0, + "grad_norm": 0.059316474336830904, + "language_loss": 0.8254683, + "learning_rate": 0.0007813044561538001, + "loss": 0.83624279, + "num_input_tokens_seen": 142117344, + "router_z_loss_mlp": 0.33227539, + "step": 1718, + "time_per_iteration": 3.1251535415649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084283, + "balance_loss_mlp": 1.05035567, + "epoch": 0.3307041169680646, + "flos": 721176597504.0, + "grad_norm": 0.08429030846847434, + "language_loss": 0.88411027, + "learning_rate": 0.0007810468423160958, + "loss": 0.89495313, + "num_input_tokens_seen": 142190096, + "router_z_loss_mlp": 0.33959961, + "step": 1719, + "time_per_iteration": 2.8598783016204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090007, + "balance_loss_mlp": 1.05760598, + "epoch": 0.3308964986533282, + "flos": 583315499520.0, + "grad_norm": 0.04547421634197757, + "language_loss": 0.81920642, + "learning_rate": 0.0007807891193663306, + "loss": 0.8301065, + "num_input_tokens_seen": 142265584, + "router_z_loss_mlp": 0.32397461, + "step": 1720, + "time_per_iteration": 2.7775802612304688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092448, + "balance_loss_mlp": 1.0582583, + "epoch": 0.33108888033859174, + "flos": 473340672000.0, + "grad_norm": 0.07591254280459368, + "language_loss": 0.82440275, + "learning_rate": 0.0007805312874045614, + "loss": 0.83532727, + "num_input_tokens_seen": 142330352, + "router_z_loss_mlp": 0.34228516, + "step": 1721, + "time_per_iteration": 2.5138351917266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100808, + "balance_loss_mlp": 1.06657076, + "epoch": 0.3312812620238553, + "flos": 385913659392.0, + "grad_norm": 0.08101052667778896, + "language_loss": 0.86919391, + "learning_rate": 0.0007802733465308874, + "loss": 0.88020205, + "num_input_tokens_seen": 142392208, + "router_z_loss_mlp": 0.34277344, + "step": 1722, + "time_per_iteration": 2.440809726715088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106056, + "balance_loss_mlp": 1.07074606, + "epoch": 0.3314736437091189, + "flos": 494292395520.0, + "grad_norm": 0.0567806329299034, + "language_loss": 0.8509872, + "learning_rate": 0.0007800152968454501, + "loss": 0.86204773, + "num_input_tokens_seen": 142462112, + "router_z_loss_mlp": 0.35375977, + "step": 1723, + "time_per_iteration": 2.61602520942688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090428, + "balance_loss_mlp": 1.056072, + "epoch": 0.33166602539438245, + "flos": 653346736128.0, + "grad_norm": 0.038882210578800376, + "language_loss": 0.90476918, + "learning_rate": 0.0007797571384484334, + "loss": 0.91567349, + "num_input_tokens_seen": 142539120, + "router_z_loss_mlp": 0.34399414, + "step": 1724, + "time_per_iteration": 2.857562780380249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090291, + "balance_loss_mlp": 1.05586314, + "epoch": 0.33185840707964603, + "flos": 520550489088.0, + "grad_norm": 0.04870849772261114, + "language_loss": 0.91599178, + "learning_rate": 0.0007794988714400633, + "loss": 0.92689478, + "num_input_tokens_seen": 142611520, + "router_z_loss_mlp": 0.34448242, + "step": 1725, + "time_per_iteration": 2.5884361267089844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097227, + "balance_loss_mlp": 1.06077266, + "epoch": 0.33205078876490957, + "flos": 436712919552.0, + "grad_norm": 0.05260760436426434, + "language_loss": 0.85199809, + "learning_rate": 0.0007792404959206079, + "loss": 0.86297035, + "num_input_tokens_seen": 142676064, + "router_z_loss_mlp": 0.36474609, + "step": 1726, + "time_per_iteration": 2.4855122566223145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095942, + "balance_loss_mlp": 1.05965447, + "epoch": 0.33224317045017315, + "flos": 768400971264.0, + "grad_norm": 0.052329818141719754, + "language_loss": 0.81527805, + "learning_rate": 0.0007789820119903774, + "loss": 0.82623744, + "num_input_tokens_seen": 142750944, + "router_z_loss_mlp": 0.36279297, + "step": 1727, + "time_per_iteration": 2.945405960083008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105991, + "balance_loss_mlp": 1.04579556, + "epoch": 0.3324355521354367, + "flos": 1465656090624.0, + "grad_norm": 0.02932642968329903, + "language_loss": 0.78492665, + "learning_rate": 0.0007787234197497242, + "loss": 0.79552573, + "num_input_tokens_seen": 142974032, + "router_z_loss_mlp": 0.14160156, + "step": 1728, + "time_per_iteration": 4.810296297073364 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084477, + "balance_loss_mlp": 1.04880977, + "epoch": 0.3326279338207003, + "flos": 496415195136.0, + "grad_norm": 0.05339720943334808, + "language_loss": 0.83919221, + "learning_rate": 0.0007784647192990428, + "loss": 0.85003698, + "num_input_tokens_seen": 143047280, + "router_z_loss_mlp": 0.35693359, + "step": 1729, + "time_per_iteration": 2.6848132610321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083753, + "balance_loss_mlp": 1.04844344, + "epoch": 0.33282031550596386, + "flos": 635600342016.0, + "grad_norm": 0.05212570885713578, + "language_loss": 0.80661625, + "learning_rate": 0.0007782059107387696, + "loss": 0.81745386, + "num_input_tokens_seen": 143124224, + "router_z_loss_mlp": 0.35351562, + "step": 1730, + "time_per_iteration": 2.874356269836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078294, + "balance_loss_mlp": 1.04329371, + "epoch": 0.3330126971912274, + "flos": 689210643456.0, + "grad_norm": 0.05636936917064103, + "language_loss": 0.88407743, + "learning_rate": 0.0007779469941693826, + "loss": 0.89486033, + "num_input_tokens_seen": 143194048, + "router_z_loss_mlp": 0.3503418, + "step": 1731, + "time_per_iteration": 2.7914862632751465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079351, + "balance_loss_mlp": 1.04511368, + "epoch": 0.333205078876491, + "flos": 566184563712.0, + "grad_norm": 0.05730145040609657, + "language_loss": 0.77017218, + "learning_rate": 0.0007776879696914029, + "loss": 0.78096569, + "num_input_tokens_seen": 143272976, + "router_z_loss_mlp": 0.34277344, + "step": 1732, + "time_per_iteration": 2.8158769607543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081, + "balance_loss_mlp": 1.04666734, + "epoch": 0.3333974605617545, + "flos": 640618550784.0, + "grad_norm": 0.044212495165629015, + "language_loss": 0.8903594, + "learning_rate": 0.000777428837405392, + "loss": 0.90116942, + "num_input_tokens_seen": 143346496, + "router_z_loss_mlp": 0.34375, + "step": 1733, + "time_per_iteration": 2.8417906761169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079717, + "balance_loss_mlp": 1.04536092, + "epoch": 0.3335898422470181, + "flos": 461597501952.0, + "grad_norm": 0.05109390766697835, + "language_loss": 0.87070495, + "learning_rate": 0.0007771695974119544, + "loss": 0.88150203, + "num_input_tokens_seen": 143410448, + "router_z_loss_mlp": 0.34399414, + "step": 1734, + "time_per_iteration": 2.4995057582855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078764, + "balance_loss_mlp": 1.04514742, + "epoch": 0.33378222393228163, + "flos": 852504791040.0, + "grad_norm": 0.05825672376588237, + "language_loss": 0.75576115, + "learning_rate": 0.0007769102498117359, + "loss": 0.76654887, + "num_input_tokens_seen": 143492416, + "router_z_loss_mlp": 0.33642578, + "step": 1735, + "time_per_iteration": 3.0868663787841797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083295, + "balance_loss_mlp": 1.04991651, + "epoch": 0.3339746056175452, + "flos": 954256080384.0, + "grad_norm": 0.05069255593645712, + "language_loss": 0.79858601, + "learning_rate": 0.000776650794705424, + "loss": 0.80941892, + "num_input_tokens_seen": 143590096, + "router_z_loss_mlp": 0.33398438, + "step": 1736, + "time_per_iteration": 3.2665328979492188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108809, + "balance_loss_mlp": 1.05387688, + "epoch": 0.33416698730280875, + "flos": 544559155200.0, + "grad_norm": 0.045819605067785145, + "language_loss": 0.82160866, + "learning_rate": 0.0007763912321937483, + "loss": 0.83248949, + "num_input_tokens_seen": 143663344, + "router_z_loss_mlp": 0.34228516, + "step": 1737, + "time_per_iteration": 2.677316665649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081816, + "balance_loss_mlp": 1.04798412, + "epoch": 0.33435936898807234, + "flos": 1013652817920.0, + "grad_norm": 0.053421386657792044, + "language_loss": 0.82471478, + "learning_rate": 0.0007761315623774799, + "loss": 0.8355329, + "num_input_tokens_seen": 143753072, + "router_z_loss_mlp": 0.33862305, + "step": 1738, + "time_per_iteration": 3.4182283878326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089571, + "balance_loss_mlp": 1.05554879, + "epoch": 0.3345517506733359, + "flos": 614935217664.0, + "grad_norm": 0.051536505858366714, + "language_loss": 0.87671852, + "learning_rate": 0.0007758717853574313, + "loss": 0.88761419, + "num_input_tokens_seen": 143827280, + "router_z_loss_mlp": 0.34057617, + "step": 1739, + "time_per_iteration": 2.7348380088806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084561, + "balance_loss_mlp": 1.05125391, + "epoch": 0.33474413235859946, + "flos": 494350622208.0, + "grad_norm": 0.06141180611747274, + "language_loss": 0.9002257, + "learning_rate": 0.0007756119012344571, + "loss": 0.91107136, + "num_input_tokens_seen": 143895072, + "router_z_loss_mlp": 0.33325195, + "step": 1740, + "time_per_iteration": 2.536121129989624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091135, + "balance_loss_mlp": 1.05754209, + "epoch": 0.33493651404386304, + "flos": 628105743360.0, + "grad_norm": 0.06662069566578578, + "language_loss": 0.84404671, + "learning_rate": 0.0007753519101094535, + "loss": 0.85495806, + "num_input_tokens_seen": 143965728, + "router_z_loss_mlp": 0.33618164, + "step": 1741, + "time_per_iteration": 2.753371238708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082833, + "balance_loss_mlp": 1.04945421, + "epoch": 0.3351288957291266, + "flos": 513474909696.0, + "grad_norm": 0.05750412427252262, + "language_loss": 0.86366677, + "learning_rate": 0.0007750918120833575, + "loss": 0.87449515, + "num_input_tokens_seen": 144030272, + "router_z_loss_mlp": 0.33398438, + "step": 1742, + "time_per_iteration": 2.56093168258667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082276, + "balance_loss_mlp": 1.05037546, + "epoch": 0.33532127741439016, + "flos": 647008860672.0, + "grad_norm": 0.0676260973392943, + "language_loss": 0.87342101, + "learning_rate": 0.0007748316072571485, + "loss": 0.88424373, + "num_input_tokens_seen": 144104048, + "router_z_loss_mlp": 0.31884766, + "step": 1743, + "time_per_iteration": 2.7759556770324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086123, + "balance_loss_mlp": 1.05193388, + "epoch": 0.3355136590996537, + "flos": 768134721024.0, + "grad_norm": 0.047185436483198326, + "language_loss": 0.79306734, + "learning_rate": 0.0007745712957318467, + "loss": 0.80392861, + "num_input_tokens_seen": 144180432, + "router_z_loss_mlp": 0.34228516, + "step": 1744, + "time_per_iteration": 2.959686756134033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085284, + "balance_loss_mlp": 1.05119014, + "epoch": 0.3357060407849173, + "flos": 595259490816.0, + "grad_norm": 0.046948111550021425, + "language_loss": 0.86506951, + "learning_rate": 0.0007743108776085141, + "loss": 0.87592232, + "num_input_tokens_seen": 144258704, + "router_z_loss_mlp": 0.34106445, + "step": 1745, + "time_per_iteration": 2.7391204833984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089169, + "balance_loss_mlp": 1.05462217, + "epoch": 0.3358984224701808, + "flos": 598288730112.0, + "grad_norm": 0.04983419543630797, + "language_loss": 0.82728243, + "learning_rate": 0.0007740503529882543, + "loss": 0.8381741, + "num_input_tokens_seen": 144335104, + "router_z_loss_mlp": 0.34594727, + "step": 1746, + "time_per_iteration": 2.788041114807129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108953, + "balance_loss_mlp": 1.05474496, + "epoch": 0.3360908041554444, + "flos": 578055771648.0, + "grad_norm": 0.05677254755827829, + "language_loss": 0.91252941, + "learning_rate": 0.0007737897219722114, + "loss": 0.92342472, + "num_input_tokens_seen": 144402912, + "router_z_loss_mlp": 0.34790039, + "step": 1747, + "time_per_iteration": 2.6752376556396484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083715, + "balance_loss_mlp": 1.04945374, + "epoch": 0.336283185840708, + "flos": 513332315136.0, + "grad_norm": 0.05427874766165502, + "language_loss": 0.81146061, + "learning_rate": 0.0007735289846615716, + "loss": 0.82229781, + "num_input_tokens_seen": 144475328, + "router_z_loss_mlp": 0.34301758, + "step": 1748, + "time_per_iteration": 2.670315742492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083181, + "balance_loss_mlp": 1.04984999, + "epoch": 0.3364755675259715, + "flos": 524716102656.0, + "grad_norm": 0.05445380235157479, + "language_loss": 0.81899059, + "learning_rate": 0.0007732681411575621, + "loss": 0.82982242, + "num_input_tokens_seen": 144548288, + "router_z_loss_mlp": 0.33349609, + "step": 1749, + "time_per_iteration": 2.644740104675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079502, + "balance_loss_mlp": 1.04567027, + "epoch": 0.3366679492112351, + "flos": 554594162688.0, + "grad_norm": 0.05291201013717534, + "language_loss": 0.87517959, + "learning_rate": 0.0007730071915614514, + "loss": 0.88597459, + "num_input_tokens_seen": 144619488, + "router_z_loss_mlp": 0.33862305, + "step": 1750, + "time_per_iteration": 2.6605777740478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082694, + "balance_loss_mlp": 1.04874277, + "epoch": 0.33686033089649864, + "flos": 427051851264.0, + "grad_norm": 0.07867660779661921, + "language_loss": 0.88976741, + "learning_rate": 0.0007727461359745489, + "loss": 0.90059435, + "num_input_tokens_seen": 144682560, + "router_z_loss_mlp": 0.33984375, + "step": 1751, + "time_per_iteration": 2.4562768936157227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082471, + "balance_loss_mlp": 1.04987907, + "epoch": 0.3370527125817622, + "flos": 541452750336.0, + "grad_norm": 0.05472309390748721, + "language_loss": 0.86156446, + "learning_rate": 0.0007724849744982056, + "loss": 0.87238914, + "num_input_tokens_seen": 144753328, + "router_z_loss_mlp": 0.32592773, + "step": 1752, + "time_per_iteration": 2.683575391769409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086284, + "balance_loss_mlp": 1.05295336, + "epoch": 0.33724509426702576, + "flos": 541836864000.0, + "grad_norm": 0.052181206472060114, + "language_loss": 0.81578106, + "learning_rate": 0.0007722237072338131, + "loss": 0.82664388, + "num_input_tokens_seen": 144827312, + "router_z_loss_mlp": 0.33349609, + "step": 1753, + "time_per_iteration": 2.7059788703918457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108949, + "balance_loss_mlp": 1.05563486, + "epoch": 0.33743747595228935, + "flos": 472557888000.0, + "grad_norm": 0.063588606701447, + "language_loss": 0.85402888, + "learning_rate": 0.0007719623342828046, + "loss": 0.86492383, + "num_input_tokens_seen": 144893488, + "router_z_loss_mlp": 0.33886719, + "step": 1754, + "time_per_iteration": 2.5117459297180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090728, + "balance_loss_mlp": 1.05708706, + "epoch": 0.33762985763755293, + "flos": 469564964352.0, + "grad_norm": 0.05602573096673387, + "language_loss": 0.84115714, + "learning_rate": 0.000771700855746654, + "loss": 0.85206437, + "num_input_tokens_seen": 144961152, + "router_z_loss_mlp": 0.33666992, + "step": 1755, + "time_per_iteration": 2.5685064792633057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085423, + "balance_loss_mlp": 1.05214, + "epoch": 0.33782223932281646, + "flos": 492002270208.0, + "grad_norm": 0.05352941428578995, + "language_loss": 0.88329422, + "learning_rate": 0.0007714392717268763, + "loss": 0.89414847, + "num_input_tokens_seen": 145030576, + "router_z_loss_mlp": 0.33300781, + "step": 1756, + "time_per_iteration": 2.568432569503784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084365, + "balance_loss_mlp": 1.04981852, + "epoch": 0.33801462100808005, + "flos": 464827562496.0, + "grad_norm": 0.051807056092833426, + "language_loss": 0.86368155, + "learning_rate": 0.0007711775823250273, + "loss": 0.87452519, + "num_input_tokens_seen": 145095648, + "router_z_loss_mlp": 0.34594727, + "step": 1757, + "time_per_iteration": 2.5542263984680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010902, + "balance_loss_mlp": 1.05660665, + "epoch": 0.3382070026933436, + "flos": 795319603200.0, + "grad_norm": 0.05510084593487172, + "language_loss": 0.83019066, + "learning_rate": 0.0007709157876427039, + "loss": 0.84109271, + "num_input_tokens_seen": 145181248, + "router_z_loss_mlp": 0.33618164, + "step": 1758, + "time_per_iteration": 3.07852840423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082149, + "balance_loss_mlp": 1.04903245, + "epoch": 0.33839938437860717, + "flos": 508181686272.0, + "grad_norm": 0.0524958838474987, + "language_loss": 0.85602981, + "learning_rate": 0.0007706538877815439, + "loss": 0.86685127, + "num_input_tokens_seen": 145252944, + "router_z_loss_mlp": 0.33129883, + "step": 1759, + "time_per_iteration": 2.6002085208892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082716, + "balance_loss_mlp": 1.05021918, + "epoch": 0.3385917660638707, + "flos": 483986755584.0, + "grad_norm": 0.05079207863068971, + "language_loss": 0.83150595, + "learning_rate": 0.0007703918828432259, + "loss": 0.84233308, + "num_input_tokens_seen": 145323168, + "router_z_loss_mlp": 0.32495117, + "step": 1760, + "time_per_iteration": 2.5961215496063232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086286, + "balance_loss_mlp": 1.05297899, + "epoch": 0.3387841477491343, + "flos": 545071306752.0, + "grad_norm": 0.0542286668270813, + "language_loss": 0.89021361, + "learning_rate": 0.000770129772929469, + "loss": 0.9010765, + "num_input_tokens_seen": 145395776, + "router_z_loss_mlp": 0.33325195, + "step": 1761, + "time_per_iteration": 2.6393394470214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076289, + "balance_loss_mlp": 1.04264784, + "epoch": 0.3389765294343978, + "flos": 719487373824.0, + "grad_norm": 0.057381721603975526, + "language_loss": 0.88803625, + "learning_rate": 0.0007698675581420334, + "loss": 0.89879912, + "num_input_tokens_seen": 145470576, + "router_z_loss_mlp": 0.33666992, + "step": 1762, + "time_per_iteration": 2.8959014415740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073065, + "balance_loss_mlp": 1.03968656, + "epoch": 0.3391689111196614, + "flos": 699596269056.0, + "grad_norm": 0.05381480837735757, + "language_loss": 0.78922743, + "learning_rate": 0.0007696052385827199, + "loss": 0.79995811, + "num_input_tokens_seen": 145548896, + "router_z_loss_mlp": 0.33398438, + "step": 1763, + "time_per_iteration": 2.9110584259033203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068767, + "balance_loss_mlp": 1.03519773, + "epoch": 0.339361292804925, + "flos": 626806425600.0, + "grad_norm": 0.05521588721088573, + "language_loss": 0.78407156, + "learning_rate": 0.00076934281435337, + "loss": 0.79475927, + "num_input_tokens_seen": 145617136, + "router_z_loss_mlp": 0.33569336, + "step": 1764, + "time_per_iteration": 2.7673115730285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073935, + "balance_loss_mlp": 1.04043674, + "epoch": 0.33955367449018853, + "flos": 609302960640.0, + "grad_norm": 0.0615155635578628, + "language_loss": 0.85995364, + "learning_rate": 0.0007690802855558658, + "loss": 0.87069303, + "num_input_tokens_seen": 145696416, + "router_z_loss_mlp": 0.33520508, + "step": 1765, + "time_per_iteration": 2.871255397796631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083714, + "balance_loss_mlp": 1.07131636, + "epoch": 0.3397460561754521, + "flos": 1452494177280.0, + "grad_norm": 0.03113174858532202, + "language_loss": 0.76374954, + "learning_rate": 0.0007688176522921302, + "loss": 0.77458668, + "num_input_tokens_seen": 145919680, + "router_z_loss_mlp": 0.12402344, + "step": 1766, + "time_per_iteration": 4.906975746154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089831, + "balance_loss_mlp": 1.05485463, + "epoch": 0.33993843786071565, + "flos": 487068429312.0, + "grad_norm": 0.059784397062932884, + "language_loss": 0.89060128, + "learning_rate": 0.0007685549146641262, + "loss": 0.90149957, + "num_input_tokens_seen": 145984272, + "router_z_loss_mlp": 0.35009766, + "step": 1767, + "time_per_iteration": 2.5172948837280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081834, + "balance_loss_mlp": 1.04683375, + "epoch": 0.34013081954597923, + "flos": 417115768320.0, + "grad_norm": 0.05470212710373979, + "language_loss": 0.88085568, + "learning_rate": 0.0007682920727738579, + "loss": 0.89167398, + "num_input_tokens_seen": 146047248, + "router_z_loss_mlp": 0.35058594, + "step": 1768, + "time_per_iteration": 2.4423539638519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084324, + "balance_loss_mlp": 1.04939604, + "epoch": 0.34032320123124277, + "flos": 437293472256.0, + "grad_norm": 0.06228189549734304, + "language_loss": 0.84428132, + "learning_rate": 0.000768029126723369, + "loss": 0.85512453, + "num_input_tokens_seen": 146111872, + "router_z_loss_mlp": 0.34960938, + "step": 1769, + "time_per_iteration": 2.5054280757904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082733, + "balance_loss_mlp": 1.04811513, + "epoch": 0.34051558291650635, + "flos": 457353312768.0, + "grad_norm": 0.058774755629116764, + "language_loss": 0.81489038, + "learning_rate": 0.0007677660766147447, + "loss": 0.82571769, + "num_input_tokens_seen": 146172608, + "router_z_loss_mlp": 0.34667969, + "step": 1770, + "time_per_iteration": 2.524327039718628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01030223, + "balance_loss_mlp": 1.01844561, + "epoch": 0.3407079646017699, + "flos": 1558029938688.0, + "grad_norm": 0.017684799672329513, + "language_loss": 0.72470945, + "learning_rate": 0.0007675029225501102, + "loss": 0.7350117, + "num_input_tokens_seen": 146413584, + "router_z_loss_mlp": 0.11767578, + "step": 1771, + "time_per_iteration": 4.924427032470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081114, + "balance_loss_mlp": 1.04666233, + "epoch": 0.3409003462870335, + "flos": 492312190464.0, + "grad_norm": 0.06375677891517043, + "language_loss": 0.79619604, + "learning_rate": 0.0007672396646316306, + "loss": 0.80700719, + "num_input_tokens_seen": 146476992, + "router_z_loss_mlp": 0.3449707, + "step": 1772, + "time_per_iteration": 2.5239012241363525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081305, + "balance_loss_mlp": 1.04589987, + "epoch": 0.34109272797229706, + "flos": 808145303040.0, + "grad_norm": 0.06003817873980187, + "language_loss": 0.80518734, + "learning_rate": 0.000766976302961512, + "loss": 0.81600046, + "num_input_tokens_seen": 146552848, + "router_z_loss_mlp": 0.35424805, + "step": 1773, + "time_per_iteration": 2.9730074405670166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083157, + "balance_loss_mlp": 1.04834807, + "epoch": 0.3412851096575606, + "flos": 469903997952.0, + "grad_norm": 0.05958263274361502, + "language_loss": 0.81420594, + "learning_rate": 0.0007667128376420003, + "loss": 0.82503754, + "num_input_tokens_seen": 146617504, + "router_z_loss_mlp": 0.34863281, + "step": 1774, + "time_per_iteration": 2.521329879760742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073378, + "balance_loss_mlp": 1.03842556, + "epoch": 0.3414774913428242, + "flos": 595402085376.0, + "grad_norm": 0.09709010294240925, + "language_loss": 0.84563607, + "learning_rate": 0.0007664492687753817, + "loss": 0.85636985, + "num_input_tokens_seen": 146691568, + "router_z_loss_mlp": 0.34985352, + "step": 1775, + "time_per_iteration": 2.6744766235351562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072901, + "balance_loss_mlp": 1.03976059, + "epoch": 0.3416698730280877, + "flos": 527202667008.0, + "grad_norm": 0.05030413358353647, + "language_loss": 0.81513566, + "learning_rate": 0.000766185596463983, + "loss": 0.82586467, + "num_input_tokens_seen": 146764208, + "router_z_loss_mlp": 0.33154297, + "step": 1776, + "time_per_iteration": 2.6050221920013428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073672, + "balance_loss_mlp": 1.03962612, + "epoch": 0.3418622547133513, + "flos": 874272794112.0, + "grad_norm": 0.05039515754698922, + "language_loss": 0.76683038, + "learning_rate": 0.0007659218208101706, + "loss": 0.77756709, + "num_input_tokens_seen": 146847744, + "router_z_loss_mlp": 0.34082031, + "step": 1777, + "time_per_iteration": 3.0900516510009766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079189, + "balance_loss_mlp": 1.04504752, + "epoch": 0.34205463639861483, + "flos": 603462680064.0, + "grad_norm": 0.04915159817243754, + "language_loss": 0.84680861, + "learning_rate": 0.0007656579419163515, + "loss": 0.85760045, + "num_input_tokens_seen": 146918336, + "router_z_loss_mlp": 0.34179688, + "step": 1778, + "time_per_iteration": 2.7328884601593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081812, + "balance_loss_mlp": 1.04774237, + "epoch": 0.3422470180838784, + "flos": 463547183616.0, + "grad_norm": 0.05230649511498847, + "language_loss": 0.76939148, + "learning_rate": 0.0007653939598849724, + "loss": 0.7802096, + "num_input_tokens_seen": 146982496, + "router_z_loss_mlp": 0.34106445, + "step": 1779, + "time_per_iteration": 2.5020573139190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102719, + "balance_loss_mlp": 1.01441097, + "epoch": 0.34243939976914195, + "flos": 1585584377856.0, + "grad_norm": 0.019842751190116498, + "language_loss": 0.82880205, + "learning_rate": 0.0007651298748185204, + "loss": 0.83907396, + "num_input_tokens_seen": 147213600, + "router_z_loss_mlp": 0.12792969, + "step": 1780, + "time_per_iteration": 4.919352054595947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107992, + "balance_loss_mlp": 1.04656482, + "epoch": 0.34263178145440554, + "flos": 872662146048.0, + "grad_norm": 0.0514393238831889, + "language_loss": 0.80344206, + "learning_rate": 0.000764865686819522, + "loss": 0.81424129, + "num_input_tokens_seen": 147287664, + "router_z_loss_mlp": 0.33374023, + "step": 1781, + "time_per_iteration": 3.059682846069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089318, + "balance_loss_mlp": 1.05546236, + "epoch": 0.3428241631396691, + "flos": 506630674944.0, + "grad_norm": 0.04318417455303755, + "language_loss": 0.85701579, + "learning_rate": 0.0007646013959905449, + "loss": 0.86790895, + "num_input_tokens_seen": 147356800, + "router_z_loss_mlp": 0.33886719, + "step": 1782, + "time_per_iteration": 2.572772741317749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085097, + "balance_loss_mlp": 1.05162311, + "epoch": 0.34301654482493266, + "flos": 879669324288.0, + "grad_norm": 0.05640606275212692, + "language_loss": 0.80626374, + "learning_rate": 0.0007643370024341949, + "loss": 0.81711471, + "num_input_tokens_seen": 147432496, + "router_z_loss_mlp": 0.33496094, + "step": 1783, + "time_per_iteration": 3.0805578231811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089012, + "balance_loss_mlp": 1.05472708, + "epoch": 0.34320892651019624, + "flos": 431537559552.0, + "grad_norm": 0.05116039291223259, + "language_loss": 0.82947731, + "learning_rate": 0.0007640725062531195, + "loss": 0.84036732, + "num_input_tokens_seen": 147495856, + "router_z_loss_mlp": 0.34326172, + "step": 1784, + "time_per_iteration": 2.497859239578247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083577, + "balance_loss_mlp": 1.05010295, + "epoch": 0.3434013081954598, + "flos": 463404589056.0, + "grad_norm": 0.06763804466989645, + "language_loss": 0.86272931, + "learning_rate": 0.0007638079075500047, + "loss": 0.87356508, + "num_input_tokens_seen": 147559632, + "router_z_loss_mlp": 0.33496094, + "step": 1785, + "time_per_iteration": 2.516842842102051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017655, + "balance_loss_mlp": 1.0058769, + "epoch": 0.34359368988072336, + "flos": 1556499276288.0, + "grad_norm": 0.01279941843938601, + "language_loss": 0.75180668, + "learning_rate": 0.0007635432064275772, + "loss": 0.76198322, + "num_input_tokens_seen": 147794576, + "router_z_loss_mlp": 0.11767578, + "step": 1786, + "time_per_iteration": 4.979317665100098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077073, + "balance_loss_mlp": 1.04417157, + "epoch": 0.3437860715659869, + "flos": 495267236352.0, + "grad_norm": 0.04590480874587016, + "language_loss": 0.83035767, + "learning_rate": 0.0007632784029886026, + "loss": 0.84112841, + "num_input_tokens_seen": 147866960, + "router_z_loss_mlp": 0.32910156, + "step": 1787, + "time_per_iteration": 2.6075103282928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079505, + "balance_loss_mlp": 1.04617453, + "epoch": 0.3439784532512505, + "flos": 717942154752.0, + "grad_norm": 0.04559278353066439, + "language_loss": 0.85611933, + "learning_rate": 0.0007630134973358873, + "loss": 0.86691439, + "num_input_tokens_seen": 147947808, + "router_z_loss_mlp": 0.33349609, + "step": 1788, + "time_per_iteration": 2.917405366897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086657, + "balance_loss_mlp": 1.05327868, + "epoch": 0.34417083493651407, + "flos": 565598218752.0, + "grad_norm": 0.05301353071730806, + "language_loss": 0.86864436, + "learning_rate": 0.0007627484895722763, + "loss": 0.879511, + "num_input_tokens_seen": 148015936, + "router_z_loss_mlp": 0.33398438, + "step": 1789, + "time_per_iteration": 2.6353273391723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081939, + "balance_loss_mlp": 1.04834569, + "epoch": 0.3443632166217776, + "flos": 795988905984.0, + "grad_norm": 0.057022653970397005, + "language_loss": 0.80155563, + "learning_rate": 0.0007624833798006552, + "loss": 0.81237495, + "num_input_tokens_seen": 148099776, + "router_z_loss_mlp": 0.3359375, + "step": 1790, + "time_per_iteration": 3.039126396179199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090848, + "balance_loss_mlp": 1.05692101, + "epoch": 0.3445555983070412, + "flos": 569045067264.0, + "grad_norm": 0.05940117534987587, + "language_loss": 0.84113955, + "learning_rate": 0.0007622181681239483, + "loss": 0.85204804, + "num_input_tokens_seen": 148169616, + "router_z_loss_mlp": 0.33935547, + "step": 1791, + "time_per_iteration": 2.6392083168029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083434, + "balance_loss_mlp": 1.04903054, + "epoch": 0.3447479799923047, + "flos": 568524151296.0, + "grad_norm": 0.04492792711883196, + "language_loss": 0.84501636, + "learning_rate": 0.0007619528546451202, + "loss": 0.8558507, + "num_input_tokens_seen": 148247824, + "router_z_loss_mlp": 0.34448242, + "step": 1792, + "time_per_iteration": 2.776982069015503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080197, + "balance_loss_mlp": 1.04708052, + "epoch": 0.3449403616775683, + "flos": 967323299328.0, + "grad_norm": 0.05878857203246004, + "language_loss": 0.8358798, + "learning_rate": 0.0007616874394671745, + "loss": 0.84668171, + "num_input_tokens_seen": 148333040, + "router_z_loss_mlp": 0.33129883, + "step": 1793, + "time_per_iteration": 3.3427693843841553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074615, + "balance_loss_mlp": 1.04128361, + "epoch": 0.34513274336283184, + "flos": 568340858880.0, + "grad_norm": 0.05893035372227358, + "language_loss": 0.84961653, + "learning_rate": 0.0007614219226931547, + "loss": 0.86036265, + "num_input_tokens_seen": 148401840, + "router_z_loss_mlp": 0.33349609, + "step": 1794, + "time_per_iteration": 2.6591315269470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070169, + "balance_loss_mlp": 1.03783977, + "epoch": 0.3453251250480954, + "flos": 460715793408.0, + "grad_norm": 0.06432823181520617, + "language_loss": 0.84724808, + "learning_rate": 0.0007611563044261435, + "loss": 0.85794979, + "num_input_tokens_seen": 148466576, + "router_z_loss_mlp": 0.32324219, + "step": 1795, + "time_per_iteration": 2.51755690574646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078106, + "balance_loss_mlp": 1.0443697, + "epoch": 0.34551750673335896, + "flos": 415397431296.0, + "grad_norm": 0.0640589434438139, + "language_loss": 0.87120652, + "learning_rate": 0.0007608905847692631, + "loss": 0.88198757, + "num_input_tokens_seen": 148530016, + "router_z_loss_mlp": 0.33764648, + "step": 1796, + "time_per_iteration": 2.47190260887146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074227, + "balance_loss_mlp": 1.04103947, + "epoch": 0.34570988841862255, + "flos": 587540749824.0, + "grad_norm": 0.04642061059617041, + "language_loss": 0.86689866, + "learning_rate": 0.0007606247638256749, + "loss": 0.8776409, + "num_input_tokens_seen": 148610064, + "router_z_loss_mlp": 0.33203125, + "step": 1797, + "time_per_iteration": 2.956444025039673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094984, + "balance_loss_mlp": 1.08373046, + "epoch": 0.34590227010388613, + "flos": 1566835439616.0, + "grad_norm": 0.041839887126655914, + "language_loss": 0.78170294, + "learning_rate": 0.0007603588416985798, + "loss": 0.79265279, + "num_input_tokens_seen": 148835872, + "router_z_loss_mlp": 0.11230469, + "step": 1798, + "time_per_iteration": 4.9134039878845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064584, + "balance_loss_mlp": 1.05352104, + "epoch": 0.34609465178914967, + "flos": 1536950177280.0, + "grad_norm": 0.029939636480755576, + "language_loss": 0.79327202, + "learning_rate": 0.0007600928184912179, + "loss": 0.80391788, + "num_input_tokens_seen": 149066864, + "router_z_loss_mlp": 0.11083984, + "step": 1799, + "time_per_iteration": 4.743322849273682 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083924, + "balance_loss_mlp": 1.05054486, + "epoch": 0.34628703347441325, + "flos": 609075998208.0, + "grad_norm": 0.0564087129204809, + "language_loss": 0.85731971, + "learning_rate": 0.0007598266943068686, + "loss": 0.86815894, + "num_input_tokens_seen": 149141600, + "router_z_loss_mlp": 0.33398438, + "step": 1800, + "time_per_iteration": 2.7374043464660645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077797, + "balance_loss_mlp": 1.04603946, + "epoch": 0.3464794151596768, + "flos": 473084596224.0, + "grad_norm": 0.06346922489791823, + "language_loss": 0.83911705, + "learning_rate": 0.0007595604692488507, + "loss": 0.849895, + "num_input_tokens_seen": 149205888, + "router_z_loss_mlp": 0.31738281, + "step": 1801, + "time_per_iteration": 2.5296249389648438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074116, + "balance_loss_mlp": 1.04147625, + "epoch": 0.34667179684494037, + "flos": 605397805056.0, + "grad_norm": 0.05750507090521113, + "language_loss": 0.83014846, + "learning_rate": 0.0007592941434205215, + "loss": 0.84088963, + "num_input_tokens_seen": 149281280, + "router_z_loss_mlp": 0.32641602, + "step": 1802, + "time_per_iteration": 2.758260488510132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017015, + "balance_loss_mlp": 1.00628662, + "epoch": 0.3468641785302039, + "flos": 1564053511680.0, + "grad_norm": 0.014489769518708178, + "language_loss": 0.73571062, + "learning_rate": 0.0007590277169252782, + "loss": 0.74588072, + "num_input_tokens_seen": 149525008, + "router_z_loss_mlp": 0.10742188, + "step": 1803, + "time_per_iteration": 5.078529119491577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069541, + "balance_loss_mlp": 1.0363059, + "epoch": 0.3470565602154675, + "flos": 906902258688.0, + "grad_norm": 0.0666829693597375, + "language_loss": 0.79937375, + "learning_rate": 0.0007587611898665566, + "loss": 0.81006914, + "num_input_tokens_seen": 149600624, + "router_z_loss_mlp": 0.33251953, + "step": 1804, + "time_per_iteration": 3.05087947845459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078664, + "balance_loss_mlp": 1.04621565, + "epoch": 0.347248941900731, + "flos": 638613614592.0, + "grad_norm": 0.050247612363814816, + "language_loss": 0.8218019, + "learning_rate": 0.0007584945623478315, + "loss": 0.83258855, + "num_input_tokens_seen": 149674224, + "router_z_loss_mlp": 0.32446289, + "step": 1805, + "time_per_iteration": 2.8188822269439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071735, + "balance_loss_mlp": 1.03940511, + "epoch": 0.3474413235859946, + "flos": 847009336320.0, + "grad_norm": 0.06830759319763476, + "language_loss": 0.81376302, + "learning_rate": 0.000758227834472617, + "loss": 0.82448041, + "num_input_tokens_seen": 149758688, + "router_z_loss_mlp": 0.32324219, + "step": 1806, + "time_per_iteration": 3.049736976623535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080854, + "balance_loss_mlp": 1.04771423, + "epoch": 0.3476337052712582, + "flos": 515395478016.0, + "grad_norm": 0.0580200838122141, + "language_loss": 0.77365351, + "learning_rate": 0.0007579610063444664, + "loss": 0.78446203, + "num_input_tokens_seen": 149831648, + "router_z_loss_mlp": 0.33154297, + "step": 1807, + "time_per_iteration": 2.768986701965332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072697, + "balance_loss_mlp": 1.03993857, + "epoch": 0.34782608695652173, + "flos": 913161558528.0, + "grad_norm": 0.05804810611861273, + "language_loss": 0.8735044, + "learning_rate": 0.0007576940780669712, + "loss": 0.88423139, + "num_input_tokens_seen": 149919440, + "router_z_loss_mlp": 0.32763672, + "step": 1808, + "time_per_iteration": 3.200984477996826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073414, + "balance_loss_mlp": 1.04041636, + "epoch": 0.3480184686417853, + "flos": 773374099968.0, + "grad_norm": 0.05336970886803796, + "language_loss": 0.84611619, + "learning_rate": 0.0007574270497437624, + "loss": 0.85685027, + "num_input_tokens_seen": 150001632, + "router_z_loss_mlp": 0.33007812, + "step": 1809, + "time_per_iteration": 2.9432260990142822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069124, + "balance_loss_mlp": 1.03619814, + "epoch": 0.34821085032704885, + "flos": 576549840384.0, + "grad_norm": 0.04930975616190813, + "language_loss": 0.87883413, + "learning_rate": 0.000757159921478509, + "loss": 0.88952535, + "num_input_tokens_seen": 150077552, + "router_z_loss_mlp": 0.3293457, + "step": 1810, + "time_per_iteration": 2.769669771194458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079936, + "balance_loss_mlp": 1.06887364, + "epoch": 0.34840323201231244, + "flos": 1524176911872.0, + "grad_norm": 0.03214902088053901, + "language_loss": 0.74450636, + "learning_rate": 0.0007568926933749201, + "loss": 0.75530577, + "num_input_tokens_seen": 150295328, + "router_z_loss_mlp": 0.11083984, + "step": 1811, + "time_per_iteration": 4.764174222946167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107769, + "balance_loss_mlp": 1.04469275, + "epoch": 0.34859561369757597, + "flos": 508910625792.0, + "grad_norm": 0.059132347701423886, + "language_loss": 0.87255216, + "learning_rate": 0.0007566253655367423, + "loss": 0.88332909, + "num_input_tokens_seen": 150360496, + "router_z_loss_mlp": 0.33007812, + "step": 1812, + "time_per_iteration": 2.578930616378784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073116, + "balance_loss_mlp": 1.04014218, + "epoch": 0.34878799538283956, + "flos": 548390117376.0, + "grad_norm": 0.051501554075800156, + "language_loss": 0.89574003, + "learning_rate": 0.000756357938067762, + "loss": 0.90647119, + "num_input_tokens_seen": 150432064, + "router_z_loss_mlp": 0.32983398, + "step": 1813, + "time_per_iteration": 2.6508982181549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079394, + "balance_loss_mlp": 1.04673076, + "epoch": 0.34898037706810314, + "flos": 983251021824.0, + "grad_norm": 0.051360492330316726, + "language_loss": 0.82609868, + "learning_rate": 0.0007560904110718033, + "loss": 0.8368926, + "num_input_tokens_seen": 150512176, + "router_z_loss_mlp": 0.32666016, + "step": 1814, + "time_per_iteration": 3.236894130706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075398, + "balance_loss_mlp": 1.04185271, + "epoch": 0.3491727587533667, + "flos": 681298435584.0, + "grad_norm": 0.05446392192761228, + "language_loss": 0.83478653, + "learning_rate": 0.0007558227846527297, + "loss": 0.84554052, + "num_input_tokens_seen": 150586416, + "router_z_loss_mlp": 0.33569336, + "step": 1815, + "time_per_iteration": 2.8674044609069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074987, + "balance_loss_mlp": 1.04175162, + "epoch": 0.34936514043863026, + "flos": 393811310592.0, + "grad_norm": 0.0691488486506015, + "language_loss": 0.83195454, + "learning_rate": 0.0007555550589144429, + "loss": 0.84270442, + "num_input_tokens_seen": 150648944, + "router_z_loss_mlp": 0.33251953, + "step": 1816, + "time_per_iteration": 2.421494722366333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071292, + "balance_loss_mlp": 1.03917694, + "epoch": 0.3495575221238938, + "flos": 461120256000.0, + "grad_norm": 0.07868701205222765, + "language_loss": 0.8463372, + "learning_rate": 0.000755287233960883, + "loss": 0.85705012, + "num_input_tokens_seen": 150717200, + "router_z_loss_mlp": 0.32104492, + "step": 1817, + "time_per_iteration": 2.54315185546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072544, + "balance_loss_mlp": 1.04023862, + "epoch": 0.3497499038091574, + "flos": 723859600896.0, + "grad_norm": 0.06602653795060065, + "language_loss": 0.77636009, + "learning_rate": 0.0007550193098960292, + "loss": 0.78708553, + "num_input_tokens_seen": 150790368, + "router_z_loss_mlp": 0.32299805, + "step": 1818, + "time_per_iteration": 2.848236560821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076763, + "balance_loss_mlp": 1.04452837, + "epoch": 0.3499422854944209, + "flos": 827364132864.0, + "grad_norm": 0.049816715297611704, + "language_loss": 0.86387616, + "learning_rate": 0.0007547512868238988, + "loss": 0.8746438, + "num_input_tokens_seen": 150879872, + "router_z_loss_mlp": 0.32226562, + "step": 1819, + "time_per_iteration": 3.140552043914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076553, + "balance_loss_mlp": 1.0441277, + "epoch": 0.3501346671796845, + "flos": 493214247936.0, + "grad_norm": 0.049810070694169546, + "language_loss": 0.83196282, + "learning_rate": 0.0007544831648485473, + "loss": 0.84272838, + "num_input_tokens_seen": 150953712, + "router_z_loss_mlp": 0.32421875, + "step": 1820, + "time_per_iteration": 2.7085797786712646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074414, + "balance_loss_mlp": 1.04179859, + "epoch": 0.35032704886494803, + "flos": 578479173120.0, + "grad_norm": 0.05987447994889705, + "language_loss": 0.81237, + "learning_rate": 0.0007542149440740694, + "loss": 0.82311416, + "num_input_tokens_seen": 151026192, + "router_z_loss_mlp": 0.32617188, + "step": 1821, + "time_per_iteration": 2.6648108959198 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075377, + "balance_loss_mlp": 1.0426898, + "epoch": 0.3505194305502116, + "flos": 584383472640.0, + "grad_norm": 0.06285767185927299, + "language_loss": 0.85454488, + "learning_rate": 0.000753946624604597, + "loss": 0.86529863, + "num_input_tokens_seen": 151100720, + "router_z_loss_mlp": 0.3269043, + "step": 1822, + "time_per_iteration": 2.7114102840423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080366, + "balance_loss_mlp": 1.04722571, + "epoch": 0.3507118122354752, + "flos": 526705072128.0, + "grad_norm": 0.056571758739544044, + "language_loss": 0.88259315, + "learning_rate": 0.0007536782065443015, + "loss": 0.89339685, + "num_input_tokens_seen": 151166032, + "router_z_loss_mlp": 0.33154297, + "step": 1823, + "time_per_iteration": 2.6336190700531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077518, + "balance_loss_mlp": 1.04576099, + "epoch": 0.35090419392073874, + "flos": 511269152256.0, + "grad_norm": 0.06612506998948281, + "language_loss": 0.74917412, + "learning_rate": 0.0007534096899973919, + "loss": 0.75994933, + "num_input_tokens_seen": 151232208, + "router_z_loss_mlp": 0.31738281, + "step": 1824, + "time_per_iteration": 2.5683584213256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108279, + "balance_loss_mlp": 1.05069852, + "epoch": 0.3510965756060023, + "flos": 563728522752.0, + "grad_norm": 0.05207355522992398, + "language_loss": 0.82511663, + "learning_rate": 0.0007531410750681154, + "loss": 0.83594453, + "num_input_tokens_seen": 151308128, + "router_z_loss_mlp": 0.32080078, + "step": 1825, + "time_per_iteration": 2.7370071411132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108428, + "balance_loss_mlp": 1.05207014, + "epoch": 0.35128895729126586, + "flos": 1020107146752.0, + "grad_norm": 0.05855996344544413, + "language_loss": 0.86223209, + "learning_rate": 0.0007528723618607575, + "loss": 0.87307489, + "num_input_tokens_seen": 151402560, + "router_z_loss_mlp": 0.32202148, + "step": 1826, + "time_per_iteration": 3.4230828285217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080919, + "balance_loss_mlp": 1.04889941, + "epoch": 0.35148133897652944, + "flos": 587972915712.0, + "grad_norm": 0.06514472806491804, + "language_loss": 0.82370871, + "learning_rate": 0.0007526035504796422, + "loss": 0.8345179, + "num_input_tokens_seen": 151478816, + "router_z_loss_mlp": 0.32006836, + "step": 1827, + "time_per_iteration": 2.7472023963928223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083394, + "balance_loss_mlp": 1.05011046, + "epoch": 0.351673720661793, + "flos": 495054830592.0, + "grad_norm": 0.13631276803870807, + "language_loss": 0.86120903, + "learning_rate": 0.0007523346410291312, + "loss": 0.87204289, + "num_input_tokens_seen": 151554528, + "router_z_loss_mlp": 0.33300781, + "step": 1828, + "time_per_iteration": 2.7665555477142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080213, + "balance_loss_mlp": 1.04757404, + "epoch": 0.35186610234705656, + "flos": 762339520512.0, + "grad_norm": 0.04983334941453678, + "language_loss": 0.85021639, + "learning_rate": 0.0007520656336136245, + "loss": 0.86101854, + "num_input_tokens_seen": 151629440, + "router_z_loss_mlp": 0.32641602, + "step": 1829, + "time_per_iteration": 2.9405102729797363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080442, + "balance_loss_mlp": 1.04847038, + "epoch": 0.3520584840323201, + "flos": 625822820352.0, + "grad_norm": 0.049266285647792965, + "language_loss": 0.87560928, + "learning_rate": 0.0007517965283375599, + "loss": 0.88641369, + "num_input_tokens_seen": 151708544, + "router_z_loss_mlp": 0.31958008, + "step": 1830, + "time_per_iteration": 2.8742456436157227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076907, + "balance_loss_mlp": 1.04429162, + "epoch": 0.3522508657175837, + "flos": 537124193280.0, + "grad_norm": 0.05152278098600794, + "language_loss": 0.8913554, + "learning_rate": 0.0007515273253054132, + "loss": 0.90212452, + "num_input_tokens_seen": 151779152, + "router_z_loss_mlp": 0.32617188, + "step": 1831, + "time_per_iteration": 2.647270917892456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085251, + "balance_loss_mlp": 1.0506562, + "epoch": 0.35244324740284727, + "flos": 567105560064.0, + "grad_norm": 0.052396269804254075, + "language_loss": 0.82697165, + "learning_rate": 0.0007512580246216988, + "loss": 0.83782411, + "num_input_tokens_seen": 151853216, + "router_z_loss_mlp": 0.34643555, + "step": 1832, + "time_per_iteration": 2.6887552738189697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079591, + "balance_loss_mlp": 1.04673672, + "epoch": 0.3526356290881108, + "flos": 512809989120.0, + "grad_norm": 0.05749675796225481, + "language_loss": 0.85263908, + "learning_rate": 0.000750988626390968, + "loss": 0.86343497, + "num_input_tokens_seen": 151920416, + "router_z_loss_mlp": 0.32861328, + "step": 1833, + "time_per_iteration": 2.6013457775115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080641, + "balance_loss_mlp": 1.04781032, + "epoch": 0.3528280107733744, + "flos": 595496627712.0, + "grad_norm": 0.05344239959905588, + "language_loss": 0.84880912, + "learning_rate": 0.0007507191307178108, + "loss": 0.8596155, + "num_input_tokens_seen": 151990848, + "router_z_loss_mlp": 0.32836914, + "step": 1834, + "time_per_iteration": 2.7490363121032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080747, + "balance_loss_mlp": 1.04682052, + "epoch": 0.3530203924586379, + "flos": 550969814016.0, + "grad_norm": 0.06515988244691072, + "language_loss": 0.74431223, + "learning_rate": 0.0007504495377068543, + "loss": 0.75511968, + "num_input_tokens_seen": 152064864, + "router_z_loss_mlp": 0.33959961, + "step": 1835, + "time_per_iteration": 2.729029417037964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084115, + "balance_loss_mlp": 1.04925871, + "epoch": 0.3532127741439015, + "flos": 652662876672.0, + "grad_norm": 0.06759605529963146, + "language_loss": 0.81589389, + "learning_rate": 0.0007501798474627642, + "loss": 0.82673502, + "num_input_tokens_seen": 152150096, + "router_z_loss_mlp": 0.34912109, + "step": 1836, + "time_per_iteration": 2.9048030376434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080775, + "balance_loss_mlp": 1.04708695, + "epoch": 0.35340515582916504, + "flos": 722452594176.0, + "grad_norm": 0.055893281392717674, + "language_loss": 0.83221173, + "learning_rate": 0.0007499100600902433, + "loss": 0.84301955, + "num_input_tokens_seen": 152232528, + "router_z_loss_mlp": 0.3371582, + "step": 1837, + "time_per_iteration": 2.9900574684143066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080937, + "balance_loss_mlp": 1.0464375, + "epoch": 0.35359753751442863, + "flos": 594619301376.0, + "grad_norm": 0.06113982905710786, + "language_loss": 0.84191763, + "learning_rate": 0.0007496401756940324, + "loss": 0.852727, + "num_input_tokens_seen": 152299584, + "router_z_loss_mlp": 0.34545898, + "step": 1838, + "time_per_iteration": 2.6746203899383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079632, + "balance_loss_mlp": 1.04575253, + "epoch": 0.3537899191996922, + "flos": 632384838144.0, + "grad_norm": 0.05956961248716192, + "language_loss": 0.82392603, + "learning_rate": 0.0007493701943789098, + "loss": 0.8347224, + "num_input_tokens_seen": 152370368, + "router_z_loss_mlp": 0.33886719, + "step": 1839, + "time_per_iteration": 2.773550510406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089423, + "balance_loss_mlp": 1.05590141, + "epoch": 0.35398230088495575, + "flos": 506118523392.0, + "grad_norm": 0.05410374630174333, + "language_loss": 0.82311571, + "learning_rate": 0.000749100116249692, + "loss": 0.83400989, + "num_input_tokens_seen": 152436928, + "router_z_loss_mlp": 0.33544922, + "step": 1840, + "time_per_iteration": 2.6255862712860107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089923, + "balance_loss_mlp": 1.05649722, + "epoch": 0.35417468257021933, + "flos": 507783015936.0, + "grad_norm": 0.06109989504264522, + "language_loss": 0.86315167, + "learning_rate": 0.0007488299414112321, + "loss": 0.87405092, + "num_input_tokens_seen": 152505952, + "router_z_loss_mlp": 0.33447266, + "step": 1841, + "time_per_iteration": 2.5875229835510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088685, + "balance_loss_mlp": 1.05552149, + "epoch": 0.35436706425548287, + "flos": 656133046272.0, + "grad_norm": 0.05742985112465967, + "language_loss": 0.77833533, + "learning_rate": 0.0007485596699684215, + "loss": 0.78922212, + "num_input_tokens_seen": 152577408, + "router_z_loss_mlp": 0.33178711, + "step": 1842, + "time_per_iteration": 2.819591760635376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092886, + "balance_loss_mlp": 1.05903101, + "epoch": 0.35455944594074645, + "flos": 652322433024.0, + "grad_norm": 0.047878329403948795, + "language_loss": 0.85455877, + "learning_rate": 0.000748289302026189, + "loss": 0.86548758, + "num_input_tokens_seen": 152654480, + "router_z_loss_mlp": 0.33886719, + "step": 1843, + "time_per_iteration": 2.829897880554199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088118, + "balance_loss_mlp": 1.05569351, + "epoch": 0.35475182762601, + "flos": 848240252928.0, + "grad_norm": 0.06279452498251797, + "language_loss": 0.85658133, + "learning_rate": 0.0007480188376895004, + "loss": 0.86746252, + "num_input_tokens_seen": 152732304, + "router_z_loss_mlp": 0.32421875, + "step": 1844, + "time_per_iteration": 3.067828893661499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085552, + "balance_loss_mlp": 1.07358336, + "epoch": 0.3549442093112736, + "flos": 1520644133376.0, + "grad_norm": 0.027370210450034033, + "language_loss": 0.7381134, + "learning_rate": 0.0007477482770633596, + "loss": 0.7489689, + "num_input_tokens_seen": 152965952, + "router_z_loss_mlp": 0.11962891, + "step": 1845, + "time_per_iteration": 4.860119342803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087227, + "balance_loss_mlp": 1.05401564, + "epoch": 0.3551365909965371, + "flos": 651087134208.0, + "grad_norm": 0.057022057365061586, + "language_loss": 0.7840451, + "learning_rate": 0.0007474776202528074, + "loss": 0.79491735, + "num_input_tokens_seen": 153053088, + "router_z_loss_mlp": 0.33227539, + "step": 1846, + "time_per_iteration": 2.924600601196289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081626, + "balance_loss_mlp": 1.04877198, + "epoch": 0.3553289726818007, + "flos": 897094213632.0, + "grad_norm": 0.05655103479540665, + "language_loss": 0.81245291, + "learning_rate": 0.000747206867362922, + "loss": 0.82326913, + "num_input_tokens_seen": 153129216, + "router_z_loss_mlp": 0.32861328, + "step": 1847, + "time_per_iteration": 3.0635437965393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078587, + "balance_loss_mlp": 1.0454942, + "epoch": 0.3555213543670643, + "flos": 688181958144.0, + "grad_norm": 0.057996459019562165, + "language_loss": 0.83748043, + "learning_rate": 0.0007469360184988194, + "loss": 0.84826624, + "num_input_tokens_seen": 153199360, + "router_z_loss_mlp": 0.33105469, + "step": 1848, + "time_per_iteration": 2.816774606704712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072989, + "balance_loss_mlp": 1.03977752, + "epoch": 0.3557137360523278, + "flos": 538305647616.0, + "grad_norm": 0.0578078380794177, + "language_loss": 0.87284935, + "learning_rate": 0.0007466650737656518, + "loss": 0.88357925, + "num_input_tokens_seen": 153269168, + "router_z_loss_mlp": 0.33203125, + "step": 1849, + "time_per_iteration": 2.611743927001953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075842, + "balance_loss_mlp": 1.04208231, + "epoch": 0.3559061177375914, + "flos": 402039230976.0, + "grad_norm": 0.05251231214094578, + "language_loss": 0.90093362, + "learning_rate": 0.0007463940332686098, + "loss": 0.91169202, + "num_input_tokens_seen": 153333120, + "router_z_loss_mlp": 0.33789062, + "step": 1850, + "time_per_iteration": 2.4692726135253906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073866, + "balance_loss_mlp": 1.04017735, + "epoch": 0.35609849942285493, + "flos": 696238170624.0, + "grad_norm": 0.04795835093932571, + "language_loss": 0.84167922, + "learning_rate": 0.0007461228971129205, + "loss": 0.85241795, + "num_input_tokens_seen": 153407600, + "router_z_loss_mlp": 0.33691406, + "step": 1851, + "time_per_iteration": 2.894505023956299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106997, + "balance_loss_mlp": 1.0372591, + "epoch": 0.3562908811081185, + "flos": 568660953600.0, + "grad_norm": 0.055081415669052246, + "language_loss": 0.85513294, + "learning_rate": 0.0007458516654038483, + "loss": 0.86583257, + "num_input_tokens_seen": 153477408, + "router_z_loss_mlp": 0.32714844, + "step": 1852, + "time_per_iteration": 2.678018569946289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076636, + "balance_loss_mlp": 1.0421133, + "epoch": 0.35648326279338205, + "flos": 682081219584.0, + "grad_norm": 0.04842584798560518, + "language_loss": 0.8668319, + "learning_rate": 0.0007455803382466946, + "loss": 0.87759829, + "num_input_tokens_seen": 153551888, + "router_z_loss_mlp": 0.34545898, + "step": 1853, + "time_per_iteration": 2.795799493789673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081318, + "balance_loss_mlp": 1.04674757, + "epoch": 0.35667564447864564, + "flos": 628840475136.0, + "grad_norm": 0.04891463031827082, + "language_loss": 0.87319207, + "learning_rate": 0.0007453089157467979, + "loss": 0.88400525, + "num_input_tokens_seen": 153626912, + "router_z_loss_mlp": 0.34594727, + "step": 1854, + "time_per_iteration": 2.7683348655700684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084525, + "balance_loss_mlp": 1.05000162, + "epoch": 0.35686802616390917, + "flos": 813685837824.0, + "grad_norm": 0.04901692214928195, + "language_loss": 0.81941634, + "learning_rate": 0.0007450373980095341, + "loss": 0.83026159, + "num_input_tokens_seen": 153711312, + "router_z_loss_mlp": 0.34545898, + "step": 1855, + "time_per_iteration": 3.069664716720581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088761, + "balance_loss_mlp": 1.0541904, + "epoch": 0.35706040784917276, + "flos": 525922288128.0, + "grad_norm": 0.06393454459125486, + "language_loss": 0.86792582, + "learning_rate": 0.0007447657851403155, + "loss": 0.87881339, + "num_input_tokens_seen": 153780208, + "router_z_loss_mlp": 0.34619141, + "step": 1856, + "time_per_iteration": 2.6120662689208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081268, + "balance_loss_mlp": 1.04793692, + "epoch": 0.35725278953443634, + "flos": 511698345984.0, + "grad_norm": 0.060959809088696394, + "language_loss": 0.78963649, + "learning_rate": 0.0007444940772445915, + "loss": 0.80044913, + "num_input_tokens_seen": 153853152, + "router_z_loss_mlp": 0.33349609, + "step": 1857, + "time_per_iteration": 2.802053689956665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079861, + "balance_loss_mlp": 1.04653037, + "epoch": 0.3574451712196999, + "flos": 487162971648.0, + "grad_norm": 0.06448223618511208, + "language_loss": 0.80338144, + "learning_rate": 0.0007442222744278484, + "loss": 0.81418002, + "num_input_tokens_seen": 153924160, + "router_z_loss_mlp": 0.33349609, + "step": 1858, + "time_per_iteration": 2.660689353942871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080529, + "balance_loss_mlp": 1.04765141, + "epoch": 0.35763755290496346, + "flos": 550384879104.0, + "grad_norm": 0.061962253699798436, + "language_loss": 0.84126002, + "learning_rate": 0.0007439503767956099, + "loss": 0.85206527, + "num_input_tokens_seen": 153998688, + "router_z_loss_mlp": 0.32885742, + "step": 1859, + "time_per_iteration": 2.7479875087738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080547, + "balance_loss_mlp": 1.06767237, + "epoch": 0.357829934590227, + "flos": 1503300791808.0, + "grad_norm": 0.035903025748828234, + "language_loss": 0.79671603, + "learning_rate": 0.0007436783844534352, + "loss": 0.80752152, + "num_input_tokens_seen": 154230960, + "router_z_loss_mlp": 0.12890625, + "step": 1860, + "time_per_iteration": 4.900041580200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077433, + "balance_loss_mlp": 1.04479325, + "epoch": 0.3580223162754906, + "flos": 568410670080.0, + "grad_norm": 0.040558802150678905, + "language_loss": 0.85799539, + "learning_rate": 0.000743406297506922, + "loss": 0.86876976, + "num_input_tokens_seen": 154309104, + "router_z_loss_mlp": 0.32641602, + "step": 1861, + "time_per_iteration": 2.701162576675415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084428, + "balance_loss_mlp": 1.05107355, + "epoch": 0.3582146979607541, + "flos": 626153089536.0, + "grad_norm": 0.04686630584337546, + "language_loss": 0.8419295, + "learning_rate": 0.0007431341160617031, + "loss": 0.85277379, + "num_input_tokens_seen": 154387424, + "router_z_loss_mlp": 0.33374023, + "step": 1862, + "time_per_iteration": 2.860173463821411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085309, + "balance_loss_mlp": 1.05266929, + "epoch": 0.3584070796460177, + "flos": 507010406400.0, + "grad_norm": 0.04939599291948986, + "language_loss": 0.88143289, + "learning_rate": 0.0007428618402234491, + "loss": 0.89228594, + "num_input_tokens_seen": 154459952, + "router_z_loss_mlp": 0.32641602, + "step": 1863, + "time_per_iteration": 2.62233567237854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083488, + "balance_loss_mlp": 1.05030036, + "epoch": 0.3585994613312813, + "flos": 606190763520.0, + "grad_norm": 0.051497717495276533, + "language_loss": 0.80248374, + "learning_rate": 0.0007425894700978668, + "loss": 0.81331861, + "num_input_tokens_seen": 154535456, + "router_z_loss_mlp": 0.33203125, + "step": 1864, + "time_per_iteration": 2.711484670639038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087216, + "balance_loss_mlp": 1.05424261, + "epoch": 0.3587918430165448, + "flos": 1412338484736.0, + "grad_norm": 0.047877863134497434, + "language_loss": 0.79232943, + "learning_rate": 0.0007423170057906996, + "loss": 0.80320162, + "num_input_tokens_seen": 154627568, + "router_z_loss_mlp": 0.32983398, + "step": 1865, + "time_per_iteration": 3.852776527404785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091453, + "balance_loss_mlp": 1.05821717, + "epoch": 0.3589842247018084, + "flos": 478313800704.0, + "grad_norm": 0.06431447428318769, + "language_loss": 0.85827845, + "learning_rate": 0.0007420444474077275, + "loss": 0.86919296, + "num_input_tokens_seen": 154694640, + "router_z_loss_mlp": 0.33251953, + "step": 1866, + "time_per_iteration": 2.6104037761688232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101796, + "balance_loss_mlp": 1.06829846, + "epoch": 0.35917660638707194, + "flos": 504464205312.0, + "grad_norm": 0.06438653143979521, + "language_loss": 0.89830631, + "learning_rate": 0.0007417717950547671, + "loss": 0.90932429, + "num_input_tokens_seen": 154762048, + "router_z_loss_mlp": 0.33520508, + "step": 1867, + "time_per_iteration": 2.5619330406188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108848, + "balance_loss_mlp": 1.09687901, + "epoch": 0.3593689880723355, + "flos": 1491294191616.0, + "grad_norm": 0.037524520389889536, + "language_loss": 0.75996608, + "learning_rate": 0.0007414990488376713, + "loss": 0.77105457, + "num_input_tokens_seen": 154989952, + "router_z_loss_mlp": 0.11962891, + "step": 1868, + "time_per_iteration": 4.971943378448486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096427, + "balance_loss_mlp": 1.06388319, + "epoch": 0.35956136975759906, + "flos": 528369564672.0, + "grad_norm": 0.050983088733796166, + "language_loss": 0.84612024, + "learning_rate": 0.0007412262088623299, + "loss": 0.85708451, + "num_input_tokens_seen": 155066992, + "router_z_loss_mlp": 0.32543945, + "step": 1869, + "time_per_iteration": 2.7295072078704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104656, + "balance_loss_mlp": 1.07142007, + "epoch": 0.35975375144286265, + "flos": 534647803392.0, + "grad_norm": 0.057848782745497714, + "language_loss": 0.79012549, + "learning_rate": 0.0007409532752346684, + "loss": 0.80117208, + "num_input_tokens_seen": 155137616, + "router_z_loss_mlp": 0.33251953, + "step": 1870, + "time_per_iteration": 2.74664568901062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107858, + "balance_loss_mlp": 1.07464623, + "epoch": 0.3599461331281262, + "flos": 504695549952.0, + "grad_norm": 0.054621035664709404, + "language_loss": 0.88661271, + "learning_rate": 0.0007406802480606491, + "loss": 0.89769125, + "num_input_tokens_seen": 155209248, + "router_z_loss_mlp": 0.33227539, + "step": 1871, + "time_per_iteration": 2.636101722717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102516, + "balance_loss_mlp": 1.06923246, + "epoch": 0.36013851481338977, + "flos": 511283708928.0, + "grad_norm": 0.05849515281409536, + "language_loss": 0.90592384, + "learning_rate": 0.0007404071274462707, + "loss": 0.91694903, + "num_input_tokens_seen": 155274176, + "router_z_loss_mlp": 0.33300781, + "step": 1872, + "time_per_iteration": 2.559588670730591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098641, + "balance_loss_mlp": 1.06533384, + "epoch": 0.36033089649865335, + "flos": 547330908672.0, + "grad_norm": 0.06237198940644659, + "language_loss": 0.8363173, + "learning_rate": 0.0007401339134975682, + "loss": 0.84730369, + "num_input_tokens_seen": 155343232, + "router_z_loss_mlp": 0.33325195, + "step": 1873, + "time_per_iteration": 2.6156845092773438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100197, + "balance_loss_mlp": 1.06617522, + "epoch": 0.3605232781839169, + "flos": 458416903680.0, + "grad_norm": 0.05108892475659159, + "language_loss": 0.84187275, + "learning_rate": 0.0007398606063206122, + "loss": 0.85287476, + "num_input_tokens_seen": 155410080, + "router_z_loss_mlp": 0.34033203, + "step": 1874, + "time_per_iteration": 2.6152756214141846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090341, + "balance_loss_mlp": 1.05729628, + "epoch": 0.36071565986918047, + "flos": 509309296128.0, + "grad_norm": 0.05589329807105905, + "language_loss": 0.7857852, + "learning_rate": 0.0007395872060215101, + "loss": 0.79668868, + "num_input_tokens_seen": 155476240, + "router_z_loss_mlp": 0.33056641, + "step": 1875, + "time_per_iteration": 2.592906951904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095107, + "balance_loss_mlp": 1.06230044, + "epoch": 0.360908041554444, + "flos": 558931484160.0, + "grad_norm": 0.12468103825296885, + "language_loss": 0.88329792, + "learning_rate": 0.0007393137127064056, + "loss": 0.89424896, + "num_input_tokens_seen": 155543392, + "router_z_loss_mlp": 0.328125, + "step": 1876, + "time_per_iteration": 2.629368782043457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109653, + "balance_loss_mlp": 1.06300879, + "epoch": 0.3611004232397076, + "flos": 523588492800.0, + "grad_norm": 0.05189881397754868, + "language_loss": 0.84167802, + "learning_rate": 0.0007390401264814779, + "loss": 0.85264337, + "num_input_tokens_seen": 155613264, + "router_z_loss_mlp": 0.33544922, + "step": 1877, + "time_per_iteration": 2.644322156906128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084049, + "balance_loss_mlp": 1.05179131, + "epoch": 0.3612928049249711, + "flos": 540728193024.0, + "grad_norm": 0.07312313725982984, + "language_loss": 0.84472072, + "learning_rate": 0.0007387664474529427, + "loss": 0.8555612, + "num_input_tokens_seen": 155683712, + "router_z_loss_mlp": 0.32250977, + "step": 1878, + "time_per_iteration": 2.6105034351348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094149, + "balance_loss_mlp": 1.06131935, + "epoch": 0.3614851866102347, + "flos": 552289480704.0, + "grad_norm": 0.06338398309504269, + "language_loss": 0.9129535, + "learning_rate": 0.0007384926757270518, + "loss": 0.923895, + "num_input_tokens_seen": 155751760, + "router_z_loss_mlp": 0.32836914, + "step": 1879, + "time_per_iteration": 2.6268200874328613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082708, + "balance_loss_mlp": 1.05023539, + "epoch": 0.36167756829549824, + "flos": 771734338560.0, + "grad_norm": 0.048672507925477976, + "language_loss": 0.79680419, + "learning_rate": 0.0007382188114100924, + "loss": 0.80763125, + "num_input_tokens_seen": 155830464, + "router_z_loss_mlp": 0.32470703, + "step": 1880, + "time_per_iteration": 2.9548373222351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078144, + "balance_loss_mlp": 1.04509938, + "epoch": 0.36186994998076183, + "flos": 711560609280.0, + "grad_norm": 0.04804943389379678, + "language_loss": 0.81544787, + "learning_rate": 0.0007379448546083884, + "loss": 0.82622933, + "num_input_tokens_seen": 155906208, + "router_z_loss_mlp": 0.33056641, + "step": 1881, + "time_per_iteration": 2.900480031967163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082413, + "balance_loss_mlp": 1.04884315, + "epoch": 0.3620623316660254, + "flos": 747209138688.0, + "grad_norm": 0.049920719635936736, + "language_loss": 0.88019323, + "learning_rate": 0.0007376708054282992, + "loss": 0.89101738, + "num_input_tokens_seen": 155983584, + "router_z_loss_mlp": 0.3359375, + "step": 1882, + "time_per_iteration": 2.9482829570770264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075385, + "balance_loss_mlp": 1.04286492, + "epoch": 0.36225471335128895, + "flos": 482312088576.0, + "grad_norm": 0.04692483307288239, + "language_loss": 0.83908749, + "learning_rate": 0.0007373966639762201, + "loss": 0.84984136, + "num_input_tokens_seen": 156052464, + "router_z_loss_mlp": 0.32519531, + "step": 1883, + "time_per_iteration": 2.597809076309204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078049, + "balance_loss_mlp": 1.0448606, + "epoch": 0.36244709503655254, + "flos": 506655406080.0, + "grad_norm": 0.0703007209611724, + "language_loss": 0.8835175, + "learning_rate": 0.0007371224303585822, + "loss": 0.89429802, + "num_input_tokens_seen": 156121424, + "router_z_loss_mlp": 0.33203125, + "step": 1884, + "time_per_iteration": 2.5686471462249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046189, + "balance_loss_mlp": 1.03360081, + "epoch": 0.36263947672181607, + "flos": 1393302643200.0, + "grad_norm": 0.020620128786032376, + "language_loss": 0.80357069, + "learning_rate": 0.0007368481046818524, + "loss": 0.81403255, + "num_input_tokens_seen": 156346144, + "router_z_loss_mlp": 0.12597656, + "step": 1885, + "time_per_iteration": 4.68831205368042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073343, + "balance_loss_mlp": 1.03939199, + "epoch": 0.36283185840707965, + "flos": 652991735808.0, + "grad_norm": 0.05943029236677907, + "language_loss": 0.82845902, + "learning_rate": 0.0007365736870525335, + "loss": 0.83919251, + "num_input_tokens_seen": 156420880, + "router_z_loss_mlp": 0.33959961, + "step": 1886, + "time_per_iteration": 2.8566346168518066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070323, + "balance_loss_mlp": 1.0373739, + "epoch": 0.3630242400923432, + "flos": 488619440640.0, + "grad_norm": 0.06703223685064427, + "language_loss": 0.82574463, + "learning_rate": 0.000736299177577164, + "loss": 0.83644783, + "num_input_tokens_seen": 156485616, + "router_z_loss_mlp": 0.32958984, + "step": 1887, + "time_per_iteration": 2.5848894119262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074233, + "balance_loss_mlp": 1.04130709, + "epoch": 0.3632166217776068, + "flos": 516892644864.0, + "grad_norm": 0.0626455482667494, + "language_loss": 0.83844066, + "learning_rate": 0.0007360245763623174, + "loss": 0.84918302, + "num_input_tokens_seen": 156557840, + "router_z_loss_mlp": 0.3293457, + "step": 1888, + "time_per_iteration": 2.6179397106170654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067525, + "balance_loss_mlp": 1.03564882, + "epoch": 0.36340900346287036, + "flos": 645881250816.0, + "grad_norm": 0.06111369810549259, + "language_loss": 0.89794236, + "learning_rate": 0.0007357498835146039, + "loss": 0.90861762, + "num_input_tokens_seen": 156632496, + "router_z_loss_mlp": 0.31860352, + "step": 1889, + "time_per_iteration": 2.8311662673950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070281, + "balance_loss_mlp": 1.03854752, + "epoch": 0.3636013851481339, + "flos": 553057708032.0, + "grad_norm": 0.0568549422608731, + "language_loss": 0.86402494, + "learning_rate": 0.0007354750991406684, + "loss": 0.87472773, + "num_input_tokens_seen": 156705296, + "router_z_loss_mlp": 0.31713867, + "step": 1890, + "time_per_iteration": 2.6922197341918945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072041, + "balance_loss_mlp": 1.03952074, + "epoch": 0.3637937668333975, + "flos": 546395355648.0, + "grad_norm": 0.053455628499382915, + "language_loss": 0.80957252, + "learning_rate": 0.0007352002233471919, + "loss": 0.82029295, + "num_input_tokens_seen": 156773376, + "router_z_loss_mlp": 0.32519531, + "step": 1891, + "time_per_iteration": 2.621241569519043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066281, + "balance_loss_mlp": 1.03371286, + "epoch": 0.363986148518661, + "flos": 537838576128.0, + "grad_norm": 0.07508945751401845, + "language_loss": 0.79549944, + "learning_rate": 0.0007349252562408906, + "loss": 0.80616224, + "num_input_tokens_seen": 156844336, + "router_z_loss_mlp": 0.32543945, + "step": 1892, + "time_per_iteration": 2.674318552017212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069175, + "balance_loss_mlp": 1.0372504, + "epoch": 0.3641785302039246, + "flos": 659895607296.0, + "grad_norm": 0.04761623500947703, + "language_loss": 0.81258041, + "learning_rate": 0.0007346501979285158, + "loss": 0.82327211, + "num_input_tokens_seen": 156918848, + "router_z_loss_mlp": 0.3190918, + "step": 1893, + "time_per_iteration": 2.8671226501464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035364, + "balance_loss_mlp": 1.0238719, + "epoch": 0.36437091188918813, + "flos": 1467911158272.0, + "grad_norm": 0.02143179240630706, + "language_loss": 0.80539101, + "learning_rate": 0.0007343750485168551, + "loss": 0.81574464, + "num_input_tokens_seen": 157134736, + "router_z_loss_mlp": 0.11474609, + "step": 1894, + "time_per_iteration": 4.7720019817352295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075531, + "balance_loss_mlp": 1.04248571, + "epoch": 0.3645632935744517, + "flos": 597012733440.0, + "grad_norm": 0.049808711864839754, + "language_loss": 0.85850054, + "learning_rate": 0.0007340998081127308, + "loss": 0.8692559, + "num_input_tokens_seen": 157211920, + "router_z_loss_mlp": 0.33056641, + "step": 1895, + "time_per_iteration": 2.753730058670044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081037, + "balance_loss_mlp": 1.0486834, + "epoch": 0.36475567525971525, + "flos": 599214108672.0, + "grad_norm": 0.05384470863640996, + "language_loss": 0.9063257, + "learning_rate": 0.0007338244768230007, + "loss": 0.91713607, + "num_input_tokens_seen": 157284224, + "router_z_loss_mlp": 0.32348633, + "step": 1896, + "time_per_iteration": 2.749844551086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084345, + "balance_loss_mlp": 1.05189669, + "epoch": 0.36494805694497884, + "flos": 798047686656.0, + "grad_norm": 0.12041633701688108, + "language_loss": 0.88843018, + "learning_rate": 0.0007335490547545578, + "loss": 0.89927363, + "num_input_tokens_seen": 157367920, + "router_z_loss_mlp": 0.32446289, + "step": 1897, + "time_per_iteration": 3.0181519985198975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089996, + "balance_loss_mlp": 1.05795288, + "epoch": 0.3651404386302424, + "flos": 637023315456.0, + "grad_norm": 0.06340749789439089, + "language_loss": 0.82377589, + "learning_rate": 0.0007332735420143308, + "loss": 0.83467579, + "num_input_tokens_seen": 157438672, + "router_z_loss_mlp": 0.3203125, + "step": 1898, + "time_per_iteration": 2.7370855808258057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077681, + "balance_loss_mlp": 1.04442132, + "epoch": 0.36533282031550596, + "flos": 491337349632.0, + "grad_norm": 0.05751458989837244, + "language_loss": 0.8663426, + "learning_rate": 0.0007329979387092826, + "loss": 0.87711942, + "num_input_tokens_seen": 157505888, + "router_z_loss_mlp": 0.33276367, + "step": 1899, + "time_per_iteration": 2.552072048187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076766, + "balance_loss_mlp": 1.0440551, + "epoch": 0.36552520200076954, + "flos": 855587874816.0, + "grad_norm": 0.050366197091212025, + "language_loss": 0.83863711, + "learning_rate": 0.0007327222449464124, + "loss": 0.84940481, + "num_input_tokens_seen": 157601568, + "router_z_loss_mlp": 0.32714844, + "step": 1900, + "time_per_iteration": 3.2450594902038574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072697, + "balance_loss_mlp": 1.03936601, + "epoch": 0.3657175836860331, + "flos": 483449872896.0, + "grad_norm": 0.053278478248789174, + "language_loss": 0.88864619, + "learning_rate": 0.0007324464608327538, + "loss": 0.89937317, + "num_input_tokens_seen": 157670992, + "router_z_loss_mlp": 0.33349609, + "step": 1901, + "time_per_iteration": 2.6027348041534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072333, + "balance_loss_mlp": 1.03957391, + "epoch": 0.36590996537129666, + "flos": 434561006592.0, + "grad_norm": 0.058664113400220264, + "language_loss": 0.88440275, + "learning_rate": 0.0007321705864753758, + "loss": 0.8951261, + "num_input_tokens_seen": 157743616, + "router_z_loss_mlp": 0.32763672, + "step": 1902, + "time_per_iteration": 2.668935537338257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073172, + "balance_loss_mlp": 1.03950715, + "epoch": 0.3661023470565602, + "flos": 711880704000.0, + "grad_norm": 0.047699393186438684, + "language_loss": 0.84307706, + "learning_rate": 0.0007318946219813823, + "loss": 0.85380876, + "num_input_tokens_seen": 157823520, + "router_z_loss_mlp": 0.33691406, + "step": 1903, + "time_per_iteration": 3.025866985321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074477, + "balance_loss_mlp": 1.04100275, + "epoch": 0.3662947287418238, + "flos": 564495340032.0, + "grad_norm": 0.05797091317965262, + "language_loss": 0.90078342, + "learning_rate": 0.000731618567457912, + "loss": 0.91152817, + "num_input_tokens_seen": 157893248, + "router_z_loss_mlp": 0.3347168, + "step": 1904, + "time_per_iteration": 2.6391115188598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073585, + "balance_loss_mlp": 1.03937197, + "epoch": 0.3664871104270873, + "flos": 789391982592.0, + "grad_norm": 0.05925410463566973, + "language_loss": 0.87083924, + "learning_rate": 0.000731342423012139, + "loss": 0.88157511, + "num_input_tokens_seen": 157973216, + "router_z_loss_mlp": 0.3425293, + "step": 1905, + "time_per_iteration": 3.020660400390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070848, + "balance_loss_mlp": 1.03682566, + "epoch": 0.3666794921123509, + "flos": 752202616320.0, + "grad_norm": 0.06601024748857935, + "language_loss": 0.82244205, + "learning_rate": 0.0007310661887512722, + "loss": 0.83315057, + "num_input_tokens_seen": 158051088, + "router_z_loss_mlp": 0.34033203, + "step": 1906, + "time_per_iteration": 3.0128185749053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068568, + "balance_loss_mlp": 1.03571391, + "epoch": 0.3668718737976145, + "flos": 523264015872.0, + "grad_norm": 0.04853340441162438, + "language_loss": 0.82115662, + "learning_rate": 0.0007307898647825549, + "loss": 0.8318423, + "num_input_tokens_seen": 158124368, + "router_z_loss_mlp": 0.32861328, + "step": 1907, + "time_per_iteration": 2.6610257625579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075142, + "balance_loss_mlp": 1.04126275, + "epoch": 0.367064255482878, + "flos": 571698957312.0, + "grad_norm": 0.05773956677348378, + "language_loss": 0.89470363, + "learning_rate": 0.0007305134512132659, + "loss": 0.90545505, + "num_input_tokens_seen": 158191472, + "router_z_loss_mlp": 0.33886719, + "step": 1908, + "time_per_iteration": 2.706658124923706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076976, + "balance_loss_mlp": 1.04309678, + "epoch": 0.3672566371681416, + "flos": 446880347136.0, + "grad_norm": 0.0894454707503668, + "language_loss": 0.83388865, + "learning_rate": 0.0007302369481507183, + "loss": 0.84465849, + "num_input_tokens_seen": 158254384, + "router_z_loss_mlp": 0.33911133, + "step": 1909, + "time_per_iteration": 2.499483346939087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049259, + "balance_loss_mlp": 1.03838694, + "epoch": 0.36744901885340514, + "flos": 1539275208192.0, + "grad_norm": 0.032302576214162944, + "language_loss": 0.79961759, + "learning_rate": 0.00072996035570226, + "loss": 0.81011015, + "num_input_tokens_seen": 158486160, + "router_z_loss_mlp": 0.10888672, + "step": 1910, + "time_per_iteration": 4.86514949798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088721, + "balance_loss_mlp": 1.05476999, + "epoch": 0.36764140053866873, + "flos": 563417192448.0, + "grad_norm": 0.061805914783829616, + "language_loss": 0.85575247, + "learning_rate": 0.000729683673975274, + "loss": 0.86663967, + "num_input_tokens_seen": 158555616, + "router_z_loss_mlp": 0.33984375, + "step": 1911, + "time_per_iteration": 2.6907522678375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091619, + "balance_loss_mlp": 1.05709589, + "epoch": 0.36783378222393226, + "flos": 1216168971264.0, + "grad_norm": 0.04498413319979697, + "language_loss": 0.82746279, + "learning_rate": 0.0007294069030771774, + "loss": 0.83837891, + "num_input_tokens_seen": 158653984, + "router_z_loss_mlp": 0.34570312, + "step": 1912, + "time_per_iteration": 3.6445353031158447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109623, + "balance_loss_mlp": 1.06196952, + "epoch": 0.36802616390919585, + "flos": 498476947968.0, + "grad_norm": 0.055898807174015214, + "language_loss": 0.90671504, + "learning_rate": 0.0007291300431154224, + "loss": 0.9176774, + "num_input_tokens_seen": 158719728, + "router_z_loss_mlp": 0.34301758, + "step": 1913, + "time_per_iteration": 2.5600366592407227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01025736, + "balance_loss_mlp": 1.0155319, + "epoch": 0.36821854559445943, + "flos": 1581281961984.0, + "grad_norm": 0.015307788275572325, + "language_loss": 0.70389736, + "learning_rate": 0.0007288530941974955, + "loss": 0.71415472, + "num_input_tokens_seen": 158952544, + "router_z_loss_mlp": 0.10205078, + "step": 1914, + "time_per_iteration": 4.9577555656433105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110633, + "balance_loss_mlp": 1.07287991, + "epoch": 0.36841092727972297, + "flos": 835261784064.0, + "grad_norm": 0.06223209716338702, + "language_loss": 0.79735458, + "learning_rate": 0.0007285760564309179, + "loss": 0.80841786, + "num_input_tokens_seen": 159039680, + "router_z_loss_mlp": 0.3347168, + "step": 1915, + "time_per_iteration": 3.1251590251922607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101251, + "balance_loss_mlp": 1.06811082, + "epoch": 0.36860330896498655, + "flos": 689517591552.0, + "grad_norm": 0.05672479428696366, + "language_loss": 0.85010201, + "learning_rate": 0.0007282989299232448, + "loss": 0.8611145, + "num_input_tokens_seen": 159128128, + "router_z_loss_mlp": 0.33154297, + "step": 1916, + "time_per_iteration": 3.062971353530884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096553, + "balance_loss_mlp": 1.06381774, + "epoch": 0.3687956906502501, + "flos": 553919067648.0, + "grad_norm": 0.05955658020637064, + "language_loss": 0.83600092, + "learning_rate": 0.0007280217147820668, + "loss": 0.84696645, + "num_input_tokens_seen": 159193248, + "router_z_loss_mlp": 0.32739258, + "step": 1917, + "time_per_iteration": 2.6169495582580566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097845, + "balance_loss_mlp": 1.06465673, + "epoch": 0.3689880723355137, + "flos": 576426184704.0, + "grad_norm": 0.05443515430960571, + "language_loss": 0.79137111, + "learning_rate": 0.0007277444111150079, + "loss": 0.80234957, + "num_input_tokens_seen": 159265824, + "router_z_loss_mlp": 0.33203125, + "step": 1918, + "time_per_iteration": 2.672696828842163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101204, + "balance_loss_mlp": 1.06820679, + "epoch": 0.3691804540207772, + "flos": 528615465984.0, + "grad_norm": 0.06564490716140688, + "language_loss": 0.84340626, + "learning_rate": 0.0007274670190297272, + "loss": 0.85441828, + "num_input_tokens_seen": 159332992, + "router_z_loss_mlp": 0.33007812, + "step": 1919, + "time_per_iteration": 2.569920539855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091279, + "balance_loss_mlp": 1.0575906, + "epoch": 0.3693728357060408, + "flos": 560729806848.0, + "grad_norm": 0.06475948680742319, + "language_loss": 0.81988895, + "learning_rate": 0.0007271895386339179, + "loss": 0.83080173, + "num_input_tokens_seen": 159409808, + "router_z_loss_mlp": 0.3371582, + "step": 1920, + "time_per_iteration": 2.765470027923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086642, + "balance_loss_mlp": 1.05350161, + "epoch": 0.3695652173913043, + "flos": 579488919552.0, + "grad_norm": 0.0536525739451854, + "language_loss": 0.82950377, + "learning_rate": 0.0007269119700353073, + "loss": 0.84037018, + "num_input_tokens_seen": 159486128, + "router_z_loss_mlp": 0.33154297, + "step": 1921, + "time_per_iteration": 2.703117847442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089729, + "balance_loss_mlp": 1.05756688, + "epoch": 0.3697575990765679, + "flos": 512629516800.0, + "grad_norm": 0.04104943724396866, + "language_loss": 0.84983069, + "learning_rate": 0.0007266343133416571, + "loss": 0.86072791, + "num_input_tokens_seen": 159562224, + "router_z_loss_mlp": 0.3215332, + "step": 1922, + "time_per_iteration": 2.7371909618377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060973, + "balance_loss_mlp": 1.04967153, + "epoch": 0.3699499807618315, + "flos": 1569826953216.0, + "grad_norm": 0.023907464900796205, + "language_loss": 0.77116919, + "learning_rate": 0.0007263565686607632, + "loss": 0.78177893, + "num_input_tokens_seen": 159784768, + "router_z_loss_mlp": 0.11279297, + "step": 1923, + "time_per_iteration": 4.827981233596802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084577, + "balance_loss_mlp": 1.05136538, + "epoch": 0.37014236244709503, + "flos": 497093262336.0, + "grad_norm": 0.06739223877154035, + "language_loss": 0.84575641, + "learning_rate": 0.0007260787361004556, + "loss": 0.85660219, + "num_input_tokens_seen": 159848608, + "router_z_loss_mlp": 0.33227539, + "step": 1924, + "time_per_iteration": 2.6221601963043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048708, + "balance_loss_mlp": 1.03764546, + "epoch": 0.3703347441323586, + "flos": 1443562048512.0, + "grad_norm": 0.02017040526472397, + "language_loss": 0.73761505, + "learning_rate": 0.0007258008157685987, + "loss": 0.74810213, + "num_input_tokens_seen": 160080928, + "router_z_loss_mlp": 0.11083984, + "step": 1925, + "time_per_iteration": 4.909639120101929 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083689, + "balance_loss_mlp": 1.04997683, + "epoch": 0.37052712581762215, + "flos": 563324060160.0, + "grad_norm": 0.19489786122972683, + "language_loss": 0.87265027, + "learning_rate": 0.0007255228077730903, + "loss": 0.88348716, + "num_input_tokens_seen": 160148976, + "router_z_loss_mlp": 0.33740234, + "step": 1926, + "time_per_iteration": 2.726412773132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092339, + "balance_loss_mlp": 1.05850744, + "epoch": 0.37071950750288574, + "flos": 925706451456.0, + "grad_norm": 0.06639539702607969, + "language_loss": 0.81730163, + "learning_rate": 0.0007252447122218632, + "loss": 0.82822502, + "num_input_tokens_seen": 160233504, + "router_z_loss_mlp": 0.33862305, + "step": 1927, + "time_per_iteration": 3.157439708709717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090678, + "balance_loss_mlp": 1.05710912, + "epoch": 0.37091188918814927, + "flos": 418090609152.0, + "grad_norm": 0.06586667444600991, + "language_loss": 0.87736213, + "learning_rate": 0.0007249665292228834, + "loss": 0.88826889, + "num_input_tokens_seen": 160299696, + "router_z_loss_mlp": 0.3359375, + "step": 1928, + "time_per_iteration": 2.569284677505493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086968, + "balance_loss_mlp": 1.05208778, + "epoch": 0.37110427087341286, + "flos": 462941899776.0, + "grad_norm": 0.056849308308669105, + "language_loss": 0.83676869, + "learning_rate": 0.000724688258884151, + "loss": 0.84763837, + "num_input_tokens_seen": 160367904, + "router_z_loss_mlp": 0.34912109, + "step": 1929, + "time_per_iteration": 2.522596597671509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085634, + "balance_loss_mlp": 1.05204105, + "epoch": 0.3712966525586764, + "flos": 849303843840.0, + "grad_norm": 0.0484214736208702, + "language_loss": 0.86208755, + "learning_rate": 0.0007244099013137002, + "loss": 0.87294388, + "num_input_tokens_seen": 160453600, + "router_z_loss_mlp": 0.33618164, + "step": 1930, + "time_per_iteration": 3.055302619934082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089345, + "balance_loss_mlp": 1.05370176, + "epoch": 0.37148903424394, + "flos": 925555092480.0, + "grad_norm": 0.05147814185741214, + "language_loss": 0.88918859, + "learning_rate": 0.0007241314566195993, + "loss": 0.90008199, + "num_input_tokens_seen": 160543472, + "router_z_loss_mlp": 0.35693359, + "step": 1931, + "time_per_iteration": 3.249950408935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108542, + "balance_loss_mlp": 1.05020559, + "epoch": 0.37168141592920356, + "flos": 519565473792.0, + "grad_norm": 0.061459583473066896, + "language_loss": 0.85347825, + "learning_rate": 0.0007238529249099496, + "loss": 0.86433244, + "num_input_tokens_seen": 160614016, + "router_z_loss_mlp": 0.35253906, + "step": 1932, + "time_per_iteration": 2.603287696838379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068675, + "balance_loss_mlp": 1.05599129, + "epoch": 0.3718737976144671, + "flos": 1445107267584.0, + "grad_norm": 0.02721294021284605, + "language_loss": 0.77856874, + "learning_rate": 0.0007235743062928872, + "loss": 0.7892555, + "num_input_tokens_seen": 160828640, + "router_z_loss_mlp": 0.12695312, + "step": 1933, + "time_per_iteration": 4.850685358047485 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089729, + "balance_loss_mlp": 1.05346537, + "epoch": 0.3720661792997307, + "flos": 759218558976.0, + "grad_norm": 0.08735029164116491, + "language_loss": 0.80658156, + "learning_rate": 0.000723295600876581, + "loss": 0.81747884, + "num_input_tokens_seen": 160913088, + "router_z_loss_mlp": 0.36279297, + "step": 1934, + "time_per_iteration": 3.0082099437713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092646, + "balance_loss_mlp": 1.05690694, + "epoch": 0.3722585609849942, + "flos": 516686031360.0, + "grad_norm": 0.1760204301041219, + "language_loss": 0.8798061, + "learning_rate": 0.0007230168087692344, + "loss": 0.89073259, + "num_input_tokens_seen": 160982960, + "router_z_loss_mlp": 0.35791016, + "step": 1935, + "time_per_iteration": 2.7076756954193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095918, + "balance_loss_mlp": 1.06070328, + "epoch": 0.3724509426702578, + "flos": 782114171904.0, + "grad_norm": 0.058450977170247324, + "language_loss": 0.82290804, + "learning_rate": 0.0007227379300790839, + "loss": 0.83386725, + "num_input_tokens_seen": 161066000, + "router_z_loss_mlp": 0.35205078, + "step": 1936, + "time_per_iteration": 3.0381107330322266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108913, + "balance_loss_mlp": 1.07262528, + "epoch": 0.37264332435552133, + "flos": 391502246400.0, + "grad_norm": 0.062314619417064634, + "language_loss": 0.85779369, + "learning_rate": 0.0007224589649143997, + "loss": 0.86888283, + "num_input_tokens_seen": 161131040, + "router_z_loss_mlp": 0.36328125, + "step": 1937, + "time_per_iteration": 2.5413918495178223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118094, + "balance_loss_mlp": 1.08223581, + "epoch": 0.3728357060407849, + "flos": 542599299072.0, + "grad_norm": 0.08241458585549921, + "language_loss": 0.80921531, + "learning_rate": 0.0007221799133834861, + "loss": 0.82039624, + "num_input_tokens_seen": 161201248, + "router_z_loss_mlp": 0.35839844, + "step": 1938, + "time_per_iteration": 2.620593309402466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122902, + "balance_loss_mlp": 1.08682895, + "epoch": 0.3730280877260485, + "flos": 433344646656.0, + "grad_norm": 0.05702640818290307, + "language_loss": 0.81373966, + "learning_rate": 0.00072190077559468, + "loss": 0.8249687, + "num_input_tokens_seen": 161266288, + "router_z_loss_mlp": 0.36083984, + "step": 1939, + "time_per_iteration": 2.512871026992798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133095, + "balance_loss_mlp": 1.09587836, + "epoch": 0.37322046941131204, + "flos": 531230068224.0, + "grad_norm": 0.0616329871980105, + "language_loss": 0.89228082, + "learning_rate": 0.0007216215516563527, + "loss": 0.90361178, + "num_input_tokens_seen": 161335648, + "router_z_loss_mlp": 0.37207031, + "step": 1940, + "time_per_iteration": 2.6655144691467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113311, + "balance_loss_mlp": 1.09534478, + "epoch": 0.3734128510965756, + "flos": 531294087168.0, + "grad_norm": 0.05659412158312536, + "language_loss": 0.83479297, + "learning_rate": 0.0007213422416769083, + "loss": 0.84612405, + "num_input_tokens_seen": 161403440, + "router_z_loss_mlp": 0.37744141, + "step": 1941, + "time_per_iteration": 2.6088144779205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127109, + "balance_loss_mlp": 1.09022546, + "epoch": 0.37360523278183916, + "flos": 500195284992.0, + "grad_norm": 0.05910558413712496, + "language_loss": 0.74991721, + "learning_rate": 0.0007210628457647849, + "loss": 0.76118833, + "num_input_tokens_seen": 161472864, + "router_z_loss_mlp": 0.36889648, + "step": 1942, + "time_per_iteration": 2.57867169380188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131479, + "balance_loss_mlp": 1.09366596, + "epoch": 0.37379761446710275, + "flos": 547652413440.0, + "grad_norm": 0.06364761456819781, + "language_loss": 0.79148316, + "learning_rate": 0.000720783364028453, + "loss": 0.80279785, + "num_input_tokens_seen": 161548096, + "router_z_loss_mlp": 0.37768555, + "step": 1943, + "time_per_iteration": 2.744575023651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121556, + "balance_loss_mlp": 1.0834806, + "epoch": 0.3739899961523663, + "flos": 475517316096.0, + "grad_norm": 0.05406366318559307, + "language_loss": 0.87411249, + "learning_rate": 0.0007205037965764177, + "loss": 0.88532799, + "num_input_tokens_seen": 161615600, + "router_z_loss_mlp": 0.38061523, + "step": 1944, + "time_per_iteration": 2.5253238677978516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122588, + "balance_loss_mlp": 1.0851568, + "epoch": 0.37418237783762986, + "flos": 611626581504.0, + "grad_norm": 0.05571778703090581, + "language_loss": 0.85614675, + "learning_rate": 0.0007202241435172161, + "loss": 0.86737263, + "num_input_tokens_seen": 161687408, + "router_z_loss_mlp": 0.37426758, + "step": 1945, + "time_per_iteration": 2.7462716102600098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117651, + "balance_loss_mlp": 1.07931328, + "epoch": 0.3743747595228934, + "flos": 765953694720.0, + "grad_norm": 0.05225192391609906, + "language_loss": 0.88148731, + "learning_rate": 0.0007199444049594198, + "loss": 0.89266384, + "num_input_tokens_seen": 161764224, + "router_z_loss_mlp": 0.38330078, + "step": 1946, + "time_per_iteration": 2.9533469676971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116752, + "balance_loss_mlp": 1.07798529, + "epoch": 0.374567141208157, + "flos": 524120993280.0, + "grad_norm": 0.06150523549490838, + "language_loss": 0.83402771, + "learning_rate": 0.0007196645810116322, + "loss": 0.84519523, + "num_input_tokens_seen": 161835520, + "router_z_loss_mlp": 0.38720703, + "step": 1947, + "time_per_iteration": 2.709965705871582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106001, + "balance_loss_mlp": 1.06735349, + "epoch": 0.37475952289342057, + "flos": 681067090944.0, + "grad_norm": 0.05833074938802531, + "language_loss": 0.83909506, + "learning_rate": 0.0007193846717824912, + "loss": 0.850155, + "num_input_tokens_seen": 161912000, + "router_z_loss_mlp": 0.38598633, + "step": 1948, + "time_per_iteration": 2.854522705078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094783, + "balance_loss_mlp": 1.05682671, + "epoch": 0.3749519045786841, + "flos": 460061047296.0, + "grad_norm": 0.06673844071937801, + "language_loss": 0.88263041, + "learning_rate": 0.0007191046773806669, + "loss": 0.89357823, + "num_input_tokens_seen": 161977296, + "router_z_loss_mlp": 0.37915039, + "step": 1949, + "time_per_iteration": 2.575989007949829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085738, + "balance_loss_mlp": 1.04682803, + "epoch": 0.3751442862639477, + "flos": 954471458304.0, + "grad_norm": 0.06638817682476543, + "language_loss": 0.83010924, + "learning_rate": 0.0007188245979148631, + "loss": 0.84096658, + "num_input_tokens_seen": 162051888, + "router_z_loss_mlp": 0.38867188, + "step": 1950, + "time_per_iteration": 3.1386518478393555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082888, + "balance_loss_mlp": 1.04462171, + "epoch": 0.3753366679492112, + "flos": 527483473920.0, + "grad_norm": 0.05996025340147905, + "language_loss": 0.8766306, + "learning_rate": 0.0007185444334938157, + "loss": 0.88745946, + "num_input_tokens_seen": 162124384, + "router_z_loss_mlp": 0.38232422, + "step": 1951, + "time_per_iteration": 2.644848585128784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082635, + "balance_loss_mlp": 1.04501283, + "epoch": 0.3755290496344748, + "flos": 521535504384.0, + "grad_norm": 0.05938829335869994, + "language_loss": 0.84891546, + "learning_rate": 0.0007182641842262947, + "loss": 0.85974181, + "num_input_tokens_seen": 162191440, + "router_z_loss_mlp": 0.3762207, + "step": 1952, + "time_per_iteration": 2.6239395141601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081192, + "balance_loss_mlp": 1.04361689, + "epoch": 0.37572143131973834, + "flos": 620810403840.0, + "grad_norm": 0.06544951097265184, + "language_loss": 0.77827752, + "learning_rate": 0.0007179838502211022, + "loss": 0.78908944, + "num_input_tokens_seen": 162268480, + "router_z_loss_mlp": 0.37524414, + "step": 1953, + "time_per_iteration": 2.8444712162017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076992, + "balance_loss_mlp": 1.03958416, + "epoch": 0.37591381300500193, + "flos": 770635842048.0, + "grad_norm": 0.05616797515781331, + "language_loss": 0.86183697, + "learning_rate": 0.0007177034315870738, + "loss": 0.87260687, + "num_input_tokens_seen": 162346752, + "router_z_loss_mlp": 0.37402344, + "step": 1954, + "time_per_iteration": 2.9628727436065674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078673, + "balance_loss_mlp": 1.0411936, + "epoch": 0.37610619469026546, + "flos": 520191106560.0, + "grad_norm": 0.05872311076525267, + "language_loss": 0.9098376, + "learning_rate": 0.0007174229284330773, + "loss": 0.92062426, + "num_input_tokens_seen": 162415120, + "router_z_loss_mlp": 0.37402344, + "step": 1955, + "time_per_iteration": 2.579792022705078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010784, + "balance_loss_mlp": 1.0412302, + "epoch": 0.37629857637552905, + "flos": 598524456960.0, + "grad_norm": 0.050284285498010506, + "language_loss": 0.86896843, + "learning_rate": 0.0007171423408680141, + "loss": 0.8797524, + "num_input_tokens_seen": 162493280, + "router_z_loss_mlp": 0.37133789, + "step": 1956, + "time_per_iteration": 2.7764384746551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079026, + "balance_loss_mlp": 1.04211903, + "epoch": 0.37649095806079264, + "flos": 564687396864.0, + "grad_norm": 0.058102078307858664, + "language_loss": 0.89614129, + "learning_rate": 0.0007168616690008176, + "loss": 0.90693152, + "num_input_tokens_seen": 162560736, + "router_z_loss_mlp": 0.36889648, + "step": 1957, + "time_per_iteration": 2.646986246109009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083688, + "balance_loss_mlp": 1.04606569, + "epoch": 0.37668333974605617, + "flos": 592196755968.0, + "grad_norm": 0.10223927136981294, + "language_loss": 0.86142451, + "learning_rate": 0.0007165809129404545, + "loss": 0.8722614, + "num_input_tokens_seen": 162630688, + "router_z_loss_mlp": 0.37573242, + "step": 1958, + "time_per_iteration": 2.756287097930908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081219, + "balance_loss_mlp": 1.04440713, + "epoch": 0.37687572143131975, + "flos": 419257506816.0, + "grad_norm": 0.0560584683493853, + "language_loss": 0.85760534, + "learning_rate": 0.0007163000727959239, + "loss": 0.8684175, + "num_input_tokens_seen": 162694304, + "router_z_loss_mlp": 0.36791992, + "step": 1959, + "time_per_iteration": 2.5415151119232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105237, + "balance_loss_mlp": 1.03587127, + "epoch": 0.3770681031165833, + "flos": 1356484243968.0, + "grad_norm": 0.028402472736143748, + "language_loss": 0.77959073, + "learning_rate": 0.0007160191486762575, + "loss": 0.7901144, + "num_input_tokens_seen": 162920336, + "router_z_loss_mlp": 0.16503906, + "step": 1960, + "time_per_iteration": 4.892657518386841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086261, + "balance_loss_mlp": 1.04892445, + "epoch": 0.3772604848018469, + "flos": 644592107520.0, + "grad_norm": 0.04530874218926827, + "language_loss": 0.84377986, + "learning_rate": 0.00071573814069052, + "loss": 0.85464251, + "num_input_tokens_seen": 163000720, + "router_z_loss_mlp": 0.37329102, + "step": 1961, + "time_per_iteration": 2.898301839828491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088156, + "balance_loss_mlp": 1.05098641, + "epoch": 0.3774528664871104, + "flos": 901265619456.0, + "grad_norm": 0.052585227845940184, + "language_loss": 0.87987518, + "learning_rate": 0.0007154570489478081, + "loss": 0.89075673, + "num_input_tokens_seen": 163085680, + "router_z_loss_mlp": 0.37158203, + "step": 1962, + "time_per_iteration": 3.1638717651367188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088198, + "balance_loss_mlp": 1.05048013, + "epoch": 0.377645248172374, + "flos": 787717315584.0, + "grad_norm": 0.047624248218528294, + "language_loss": 0.864995, + "learning_rate": 0.0007151758735572514, + "loss": 0.87587702, + "num_input_tokens_seen": 163162224, + "router_z_loss_mlp": 0.37695312, + "step": 1963, + "time_per_iteration": 2.985558271408081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090796, + "balance_loss_mlp": 1.05236292, + "epoch": 0.3778376298576376, + "flos": 586417522176.0, + "grad_norm": 0.06598015027050642, + "language_loss": 0.80448836, + "learning_rate": 0.0007148946146280119, + "loss": 0.81539631, + "num_input_tokens_seen": 163237920, + "router_z_loss_mlp": 0.38427734, + "step": 1964, + "time_per_iteration": 2.784947395324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053975, + "balance_loss_mlp": 1.03938425, + "epoch": 0.3780300115429011, + "flos": 1396014759936.0, + "grad_norm": 0.018109037433210438, + "language_loss": 0.72192144, + "learning_rate": 0.000714613272269284, + "loss": 0.73246121, + "num_input_tokens_seen": 163455760, + "router_z_loss_mlp": 0.14550781, + "step": 1965, + "time_per_iteration": 4.85606050491333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052214, + "balance_loss_mlp": 1.03838599, + "epoch": 0.3782223932281647, + "flos": 1356935348736.0, + "grad_norm": 0.019183634636191996, + "language_loss": 0.75341946, + "learning_rate": 0.0007143318465902943, + "loss": 0.76394159, + "num_input_tokens_seen": 163678064, + "router_z_loss_mlp": 0.13867188, + "step": 1966, + "time_per_iteration": 4.946903467178345 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082797, + "balance_loss_mlp": 1.04517484, + "epoch": 0.37841477491342823, + "flos": 703811344896.0, + "grad_norm": 0.05921890511558738, + "language_loss": 0.83782387, + "learning_rate": 0.0007140503377003022, + "loss": 0.84865183, + "num_input_tokens_seen": 163764320, + "router_z_loss_mlp": 0.3762207, + "step": 1967, + "time_per_iteration": 2.984163761138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089165, + "balance_loss_mlp": 1.0504458, + "epoch": 0.3786071565986918, + "flos": 528856985088.0, + "grad_norm": 0.047303083725180994, + "language_loss": 0.84754062, + "learning_rate": 0.000713768745708599, + "loss": 0.85843223, + "num_input_tokens_seen": 163831808, + "router_z_loss_mlp": 0.38696289, + "step": 1968, + "time_per_iteration": 2.6251039505004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086627, + "balance_loss_mlp": 1.04881418, + "epoch": 0.37879953828395535, + "flos": 992872802304.0, + "grad_norm": 0.053091209869219315, + "language_loss": 0.76740122, + "learning_rate": 0.0007134870707245085, + "loss": 0.7782675, + "num_input_tokens_seen": 163918128, + "router_z_loss_mlp": 0.37792969, + "step": 1969, + "time_per_iteration": 3.252840995788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082313, + "balance_loss_mlp": 1.04435682, + "epoch": 0.37899191996921894, + "flos": 626358292992.0, + "grad_norm": 0.06088981396741891, + "language_loss": 0.8454808, + "learning_rate": 0.0007132053128573864, + "loss": 0.85630393, + "num_input_tokens_seen": 163987552, + "router_z_loss_mlp": 0.37915039, + "step": 1970, + "time_per_iteration": 2.739210844039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078955, + "balance_loss_mlp": 1.0417614, + "epoch": 0.37918430165448247, + "flos": 686005314048.0, + "grad_norm": 0.05972304110224919, + "language_loss": 0.83631253, + "learning_rate": 0.0007129234722166211, + "loss": 0.84710205, + "num_input_tokens_seen": 164063248, + "router_z_loss_mlp": 0.37182617, + "step": 1971, + "time_per_iteration": 2.814235210418701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086171, + "balance_loss_mlp": 1.05012178, + "epoch": 0.37937668333974606, + "flos": 475374721536.0, + "grad_norm": 0.05230765101952506, + "language_loss": 0.91063309, + "learning_rate": 0.0007126415489116328, + "loss": 0.92149478, + "num_input_tokens_seen": 164133776, + "router_z_loss_mlp": 0.3605957, + "step": 1972, + "time_per_iteration": 2.657435178756714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091568, + "balance_loss_mlp": 1.05413604, + "epoch": 0.37956906502500964, + "flos": 707271340032.0, + "grad_norm": 0.05210329015751025, + "language_loss": 0.81174934, + "learning_rate": 0.0007123595430518736, + "loss": 0.82266498, + "num_input_tokens_seen": 164206672, + "router_z_loss_mlp": 0.37402344, + "step": 1973, + "time_per_iteration": 2.832801103591919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108628, + "balance_loss_mlp": 1.04856205, + "epoch": 0.3797614467102732, + "flos": 426421836288.0, + "grad_norm": 0.07403044475037865, + "language_loss": 0.8602494, + "learning_rate": 0.0007120774547468282, + "loss": 0.87111217, + "num_input_tokens_seen": 164271968, + "router_z_loss_mlp": 0.37695312, + "step": 1974, + "time_per_iteration": 2.523059844970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087165, + "balance_loss_mlp": 1.05016232, + "epoch": 0.37995382839553676, + "flos": 481588941312.0, + "grad_norm": 0.05250431859571283, + "language_loss": 0.81228226, + "learning_rate": 0.0007117952841060128, + "loss": 0.82315391, + "num_input_tokens_seen": 164342800, + "router_z_loss_mlp": 0.36962891, + "step": 1975, + "time_per_iteration": 2.648947238922119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080927, + "balance_loss_mlp": 1.04409158, + "epoch": 0.3801462100808003, + "flos": 560286056448.0, + "grad_norm": 0.0511194255012935, + "language_loss": 0.83466387, + "learning_rate": 0.0007115130312389756, + "loss": 0.84547317, + "num_input_tokens_seen": 164414928, + "router_z_loss_mlp": 0.3684082, + "step": 1976, + "time_per_iteration": 2.6648154258728027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086722, + "balance_loss_mlp": 1.0505538, + "epoch": 0.3803385917660639, + "flos": 464699524608.0, + "grad_norm": 0.06028169205400359, + "language_loss": 0.79143679, + "learning_rate": 0.0007112306962552973, + "loss": 0.80230403, + "num_input_tokens_seen": 164483312, + "router_z_loss_mlp": 0.36181641, + "step": 1977, + "time_per_iteration": 2.5715434551239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086468, + "balance_loss_mlp": 1.04934657, + "epoch": 0.3805309734513274, + "flos": 521614080000.0, + "grad_norm": 0.055719330197324175, + "language_loss": 0.8517288, + "learning_rate": 0.0007109482792645896, + "loss": 0.86259341, + "num_input_tokens_seen": 164555760, + "router_z_loss_mlp": 0.37084961, + "step": 1978, + "time_per_iteration": 2.6932663917541504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088226, + "balance_loss_mlp": 1.05222487, + "epoch": 0.380723355136591, + "flos": 591128782848.0, + "grad_norm": 0.06665517257748008, + "language_loss": 0.83491528, + "learning_rate": 0.0007106657803764969, + "loss": 0.84579754, + "num_input_tokens_seen": 164626768, + "router_z_loss_mlp": 0.36010742, + "step": 1979, + "time_per_iteration": 2.710658311843872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079049, + "balance_loss_mlp": 1.04204643, + "epoch": 0.38091573682185453, + "flos": 622394910720.0, + "grad_norm": 0.05735071115872701, + "language_loss": 0.81648314, + "learning_rate": 0.0007103831997006948, + "loss": 0.82727367, + "num_input_tokens_seen": 164698016, + "router_z_loss_mlp": 0.36987305, + "step": 1980, + "time_per_iteration": 2.7589223384857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107835, + "balance_loss_mlp": 1.04168153, + "epoch": 0.3811081185071181, + "flos": 568716208128.0, + "grad_norm": 0.047165366669346453, + "language_loss": 0.85731214, + "learning_rate": 0.0007101005373468908, + "loss": 0.86809564, + "num_input_tokens_seen": 164780320, + "router_z_loss_mlp": 0.36669922, + "step": 1981, + "time_per_iteration": 2.85588002204895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077478, + "balance_loss_mlp": 1.04161954, + "epoch": 0.3813005001923817, + "flos": 584550798336.0, + "grad_norm": 0.055048826019740454, + "language_loss": 0.86394024, + "learning_rate": 0.0007098177934248242, + "loss": 0.87471503, + "num_input_tokens_seen": 164854400, + "router_z_loss_mlp": 0.35888672, + "step": 1982, + "time_per_iteration": 2.7226805686950684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077944, + "balance_loss_mlp": 1.04160953, + "epoch": 0.38149288187764524, + "flos": 621287649792.0, + "grad_norm": 0.056689743602043985, + "language_loss": 0.85823661, + "learning_rate": 0.0007095349680442661, + "loss": 0.86901605, + "num_input_tokens_seen": 164932896, + "router_z_loss_mlp": 0.36352539, + "step": 1983, + "time_per_iteration": 2.8454253673553467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075326, + "balance_loss_mlp": 1.03829932, + "epoch": 0.3816852635629088, + "flos": 570414196224.0, + "grad_norm": 0.07971741755252446, + "language_loss": 0.78927159, + "learning_rate": 0.0007092520613150188, + "loss": 0.80002487, + "num_input_tokens_seen": 165002896, + "router_z_loss_mlp": 0.37036133, + "step": 1984, + "time_per_iteration": 2.7279529571533203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081451, + "balance_loss_mlp": 1.04368556, + "epoch": 0.38187764524817236, + "flos": 565313029632.0, + "grad_norm": 0.06238598748372814, + "language_loss": 0.81304747, + "learning_rate": 0.0007089690733469165, + "loss": 0.82386196, + "num_input_tokens_seen": 165074704, + "router_z_loss_mlp": 0.37719727, + "step": 1985, + "time_per_iteration": 2.7343544960021973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077201, + "balance_loss_mlp": 1.04115212, + "epoch": 0.38207002693343595, + "flos": 630932751360.0, + "grad_norm": 0.07832972313002672, + "language_loss": 0.82561398, + "learning_rate": 0.000708686004249825, + "loss": 0.83638602, + "num_input_tokens_seen": 165149136, + "router_z_loss_mlp": 0.36035156, + "step": 1986, + "time_per_iteration": 2.7691054344177246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082228, + "balance_loss_mlp": 1.04572582, + "epoch": 0.3822624086186995, + "flos": 548507980800.0, + "grad_norm": 0.053849318526496194, + "language_loss": 0.9147824, + "learning_rate": 0.0007084028541336413, + "loss": 0.9256047, + "num_input_tokens_seen": 165220864, + "router_z_loss_mlp": 0.36499023, + "step": 1987, + "time_per_iteration": 2.7131056785583496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083994, + "balance_loss_mlp": 1.04753971, + "epoch": 0.38245479030396307, + "flos": 613571880960.0, + "grad_norm": 0.06787860515410171, + "language_loss": 0.86709088, + "learning_rate": 0.0007081196231082942, + "loss": 0.87793082, + "num_input_tokens_seen": 165301568, + "router_z_loss_mlp": 0.36450195, + "step": 1988, + "time_per_iteration": 2.7983548641204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083168, + "balance_loss_mlp": 1.04621339, + "epoch": 0.38264717198922665, + "flos": 667787466240.0, + "grad_norm": 0.05230973877590939, + "language_loss": 0.80107033, + "learning_rate": 0.0007078363112837436, + "loss": 0.81190205, + "num_input_tokens_seen": 165373152, + "router_z_loss_mlp": 0.36938477, + "step": 1989, + "time_per_iteration": 2.8133270740509033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087048, + "balance_loss_mlp": 1.04935408, + "epoch": 0.3828395536744902, + "flos": 454521922560.0, + "grad_norm": 0.077904410907181, + "language_loss": 0.84988701, + "learning_rate": 0.000707552918769981, + "loss": 0.86075753, + "num_input_tokens_seen": 165439136, + "router_z_loss_mlp": 0.37646484, + "step": 1990, + "time_per_iteration": 2.5100817680358887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089992, + "balance_loss_mlp": 1.05213106, + "epoch": 0.3830319353597538, + "flos": 499191330816.0, + "grad_norm": 0.06242573245055077, + "language_loss": 0.83457661, + "learning_rate": 0.000707269445677029, + "loss": 0.84547657, + "num_input_tokens_seen": 165514624, + "router_z_loss_mlp": 0.37817383, + "step": 1991, + "time_per_iteration": 2.7737526893615723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099946, + "balance_loss_mlp": 1.06327772, + "epoch": 0.3832243170450173, + "flos": 743787021312.0, + "grad_norm": 0.05437985066129539, + "language_loss": 0.84858984, + "learning_rate": 0.0007069858921149416, + "loss": 0.85958934, + "num_input_tokens_seen": 165594512, + "router_z_loss_mlp": 0.36694336, + "step": 1992, + "time_per_iteration": 2.9642581939697266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095684, + "balance_loss_mlp": 1.05868101, + "epoch": 0.3834166987302809, + "flos": 577937908224.0, + "grad_norm": 0.10762195872615073, + "language_loss": 0.85869837, + "learning_rate": 0.0007067022581938043, + "loss": 0.86965525, + "num_input_tokens_seen": 165673968, + "router_z_loss_mlp": 0.36987305, + "step": 1993, + "time_per_iteration": 2.805201292037964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101221, + "balance_loss_mlp": 1.06531525, + "epoch": 0.3836090804155444, + "flos": 536194432512.0, + "grad_norm": 0.06280477504596697, + "language_loss": 0.831635, + "learning_rate": 0.0007064185440237334, + "loss": 0.84264719, + "num_input_tokens_seen": 165747664, + "router_z_loss_mlp": 0.359375, + "step": 1994, + "time_per_iteration": 2.7297706604003906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101916, + "balance_loss_mlp": 1.06527066, + "epoch": 0.383801462100808, + "flos": 601587191808.0, + "grad_norm": 0.05513764490979663, + "language_loss": 0.84278905, + "learning_rate": 0.0007061347497148764, + "loss": 0.85380822, + "num_input_tokens_seen": 165824624, + "router_z_loss_mlp": 0.36621094, + "step": 1995, + "time_per_iteration": 2.725632429122925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094031, + "balance_loss_mlp": 1.05876923, + "epoch": 0.38399384378607154, + "flos": 572427896832.0, + "grad_norm": 0.06604776765413087, + "language_loss": 0.86282277, + "learning_rate": 0.0007058508753774122, + "loss": 0.87376308, + "num_input_tokens_seen": 165896304, + "router_z_loss_mlp": 0.35302734, + "step": 1996, + "time_per_iteration": 2.760045051574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092829, + "balance_loss_mlp": 1.05773425, + "epoch": 0.38418622547133513, + "flos": 536513117184.0, + "grad_norm": 0.058737109015633, + "language_loss": 0.86788458, + "learning_rate": 0.0007055669211215505, + "loss": 0.87881291, + "num_input_tokens_seen": 165961312, + "router_z_loss_mlp": 0.35131836, + "step": 1997, + "time_per_iteration": 2.6091408729553223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088772, + "balance_loss_mlp": 1.05238962, + "epoch": 0.3843786071565987, + "flos": 572673798144.0, + "grad_norm": 0.06483433205315106, + "language_loss": 0.77687544, + "learning_rate": 0.0007052828870575322, + "loss": 0.78776312, + "num_input_tokens_seen": 166028064, + "router_z_loss_mlp": 0.36376953, + "step": 1998, + "time_per_iteration": 2.671349048614502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109131, + "balance_loss_mlp": 1.0558331, + "epoch": 0.38457098884186225, + "flos": 728361275904.0, + "grad_norm": 0.05010154832824161, + "language_loss": 0.86955881, + "learning_rate": 0.0007049987732956291, + "loss": 0.88047194, + "num_input_tokens_seen": 166110272, + "router_z_loss_mlp": 0.35498047, + "step": 1999, + "time_per_iteration": 2.9918439388275146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108718, + "balance_loss_mlp": 1.05189395, + "epoch": 0.38476337052712584, + "flos": 583123442688.0, + "grad_norm": 0.047224279388360366, + "language_loss": 0.82623643, + "learning_rate": 0.0007047145799461439, + "loss": 0.83710825, + "num_input_tokens_seen": 166193088, + "router_z_loss_mlp": 0.35327148, + "step": 2000, + "time_per_iteration": 2.84328293800354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092374, + "balance_loss_mlp": 1.05656374, + "epoch": 0.38495575221238937, + "flos": 552787075584.0, + "grad_norm": 0.05254385134155795, + "language_loss": 0.82269979, + "learning_rate": 0.00070443030711941, + "loss": 0.83362353, + "num_input_tokens_seen": 166271776, + "router_z_loss_mlp": 0.3581543, + "step": 2001, + "time_per_iteration": 2.753903865814209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092226, + "balance_loss_mlp": 1.0559628, + "epoch": 0.38514813389765296, + "flos": 654173190144.0, + "grad_norm": 0.05896823149323879, + "language_loss": 0.8241961, + "learning_rate": 0.0007041459549257924, + "loss": 0.83511841, + "num_input_tokens_seen": 166350000, + "router_z_loss_mlp": 0.36254883, + "step": 2002, + "time_per_iteration": 2.85963773727417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086715, + "balance_loss_mlp": 1.05030835, + "epoch": 0.3853405155829165, + "flos": 867715158528.0, + "grad_norm": 0.0671523306708724, + "language_loss": 0.78569824, + "learning_rate": 0.0007038615234756859, + "loss": 0.79656541, + "num_input_tokens_seen": 166434336, + "router_z_loss_mlp": 0.36425781, + "step": 2003, + "time_per_iteration": 3.1452226638793945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080594, + "balance_loss_mlp": 1.04440188, + "epoch": 0.3855328972681801, + "flos": 546164011008.0, + "grad_norm": 0.05736478292188374, + "language_loss": 0.83675313, + "learning_rate": 0.000703577012879517, + "loss": 0.84755898, + "num_input_tokens_seen": 166503952, + "router_z_loss_mlp": 0.36230469, + "step": 2004, + "time_per_iteration": 2.6308705806732178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075931, + "balance_loss_mlp": 1.04040706, + "epoch": 0.3857252789534436, + "flos": 533819939328.0, + "grad_norm": 0.0602394573591363, + "language_loss": 0.8843599, + "learning_rate": 0.0007032924232477423, + "loss": 0.89511919, + "num_input_tokens_seen": 166575168, + "router_z_loss_mlp": 0.35595703, + "step": 2005, + "time_per_iteration": 2.6188220977783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079495, + "balance_loss_mlp": 1.04337406, + "epoch": 0.3859176606387072, + "flos": 491514849792.0, + "grad_norm": 0.055511202055775664, + "language_loss": 0.80448711, + "learning_rate": 0.0007030077546908493, + "loss": 0.81528199, + "num_input_tokens_seen": 166647552, + "router_z_loss_mlp": 0.36132812, + "step": 2006, + "time_per_iteration": 2.6309516429901123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088067, + "balance_loss_mlp": 1.07609844, + "epoch": 0.3861100423239708, + "flos": 1486278955008.0, + "grad_norm": 0.032522163132150485, + "language_loss": 0.83064663, + "learning_rate": 0.0007027230073193561, + "loss": 0.84152722, + "num_input_tokens_seen": 166875088, + "router_z_loss_mlp": 0.11962891, + "step": 2007, + "time_per_iteration": 4.736604452133179 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083286, + "balance_loss_mlp": 1.04816651, + "epoch": 0.3863024240092343, + "flos": 473493441024.0, + "grad_norm": 0.05514866045126494, + "language_loss": 0.79152983, + "learning_rate": 0.0007024381812438117, + "loss": 0.80236268, + "num_input_tokens_seen": 166939344, + "router_z_loss_mlp": 0.3515625, + "step": 2008, + "time_per_iteration": 2.5392396450042725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108868, + "balance_loss_mlp": 1.05306053, + "epoch": 0.3864948056944979, + "flos": 716258723328.0, + "grad_norm": 0.059806412844581394, + "language_loss": 0.83199877, + "learning_rate": 0.0007021532765747951, + "loss": 0.84288561, + "num_input_tokens_seen": 167014992, + "router_z_loss_mlp": 0.35668945, + "step": 2009, + "time_per_iteration": 2.9926528930664062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087876, + "balance_loss_mlp": 1.0511353, + "epoch": 0.38668718737976143, + "flos": 727302067200.0, + "grad_norm": 0.05631620148302912, + "language_loss": 0.7933259, + "learning_rate": 0.0007018682934229162, + "loss": 0.8042047, + "num_input_tokens_seen": 167092096, + "router_z_loss_mlp": 0.36743164, + "step": 2010, + "time_per_iteration": 2.924781322479248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087301, + "balance_loss_mlp": 1.05103779, + "epoch": 0.386879569065025, + "flos": 525218079744.0, + "grad_norm": 0.05794664731816873, + "language_loss": 0.82387936, + "learning_rate": 0.0007015832318988152, + "loss": 0.83475244, + "num_input_tokens_seen": 167162144, + "router_z_loss_mlp": 0.36328125, + "step": 2011, + "time_per_iteration": 2.668565511703491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032799, + "balance_loss_mlp": 1.02173615, + "epoch": 0.38707195075028855, + "flos": 1527036005376.0, + "grad_norm": 0.019732384975687786, + "language_loss": 0.73890078, + "learning_rate": 0.000701298092113163, + "loss": 0.74922872, + "num_input_tokens_seen": 167391536, + "router_z_loss_mlp": 0.11083984, + "step": 2012, + "time_per_iteration": 4.948081970214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085105, + "balance_loss_mlp": 1.04886508, + "epoch": 0.38726433243555214, + "flos": 557045821440.0, + "grad_norm": 0.049164425244227684, + "language_loss": 0.84333575, + "learning_rate": 0.0007010128741766604, + "loss": 0.85418677, + "num_input_tokens_seen": 167466000, + "router_z_loss_mlp": 0.36230469, + "step": 2013, + "time_per_iteration": 2.7263684272766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073123, + "balance_loss_mlp": 1.03695476, + "epoch": 0.38745671412081567, + "flos": 553431647232.0, + "grad_norm": 0.06787190277242791, + "language_loss": 0.84107876, + "learning_rate": 0.0007007275782000391, + "loss": 0.85181004, + "num_input_tokens_seen": 167536144, + "router_z_loss_mlp": 0.36181641, + "step": 2014, + "time_per_iteration": 2.6458756923675537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082824, + "balance_loss_mlp": 1.04679942, + "epoch": 0.38764909580607926, + "flos": 458175384576.0, + "grad_norm": 0.05583745089019265, + "language_loss": 0.85148585, + "learning_rate": 0.0007004422042940605, + "loss": 0.86231411, + "num_input_tokens_seen": 167600064, + "router_z_loss_mlp": 0.3605957, + "step": 2015, + "time_per_iteration": 2.5374679565429688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074881, + "balance_loss_mlp": 1.03847444, + "epoch": 0.38784147749134285, + "flos": 521973462528.0, + "grad_norm": 0.056017537147394686, + "language_loss": 0.89528251, + "learning_rate": 0.0007001567525695169, + "loss": 0.90603131, + "num_input_tokens_seen": 167666576, + "router_z_loss_mlp": 0.36425781, + "step": 2016, + "time_per_iteration": 2.5863571166992188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081449, + "balance_loss_mlp": 1.04504275, + "epoch": 0.3880338591766064, + "flos": 665696600064.0, + "grad_norm": 0.05583938490392423, + "language_loss": 0.839926, + "learning_rate": 0.0006998712231372303, + "loss": 0.85074055, + "num_input_tokens_seen": 167753296, + "router_z_loss_mlp": 0.36425781, + "step": 2017, + "time_per_iteration": 2.998652219772339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076964, + "balance_loss_mlp": 1.04103458, + "epoch": 0.38822624086186996, + "flos": 593660427264.0, + "grad_norm": 0.044278068469259586, + "language_loss": 0.86088806, + "learning_rate": 0.0006995856161080532, + "loss": 0.87165773, + "num_input_tokens_seen": 167834080, + "router_z_loss_mlp": 0.35961914, + "step": 2018, + "time_per_iteration": 2.870619297027588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077822, + "balance_loss_mlp": 1.03972268, + "epoch": 0.3884186225471335, + "flos": 612256596480.0, + "grad_norm": 0.10653426783792587, + "language_loss": 0.8221643, + "learning_rate": 0.0006992999315928679, + "loss": 0.83294249, + "num_input_tokens_seen": 167912368, + "router_z_loss_mlp": 0.38061523, + "step": 2019, + "time_per_iteration": 2.7867438793182373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074258, + "balance_loss_mlp": 1.03782773, + "epoch": 0.3886110042323971, + "flos": 606737820672.0, + "grad_norm": 0.05830260104080337, + "language_loss": 0.85476196, + "learning_rate": 0.0006990141697025871, + "loss": 0.8655045, + "num_input_tokens_seen": 167991968, + "router_z_loss_mlp": 0.36401367, + "step": 2020, + "time_per_iteration": 2.7722346782684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008633, + "balance_loss_mlp": 0.99642587, + "epoch": 0.3888033859176606, + "flos": 1527289108992.0, + "grad_norm": 0.011259985776032525, + "language_loss": 0.76359642, + "learning_rate": 0.0006987283305481533, + "loss": 0.77368271, + "num_input_tokens_seen": 168212128, + "router_z_loss_mlp": 0.12207031, + "step": 2021, + "time_per_iteration": 4.71975302696228 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069739, + "balance_loss_mlp": 1.03368998, + "epoch": 0.3889957676029242, + "flos": 692145340416.0, + "grad_norm": 0.08577921290538253, + "language_loss": 0.82040119, + "learning_rate": 0.0006984424142405392, + "loss": 0.83109856, + "num_input_tokens_seen": 168287440, + "router_z_loss_mlp": 0.36035156, + "step": 2022, + "time_per_iteration": 2.8003616333007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070636, + "balance_loss_mlp": 1.03413415, + "epoch": 0.3891881492881878, + "flos": 514937170944.0, + "grad_norm": 0.06357614890279897, + "language_loss": 0.81860286, + "learning_rate": 0.0006981564208907474, + "loss": 0.82930923, + "num_input_tokens_seen": 168354704, + "router_z_loss_mlp": 0.36499023, + "step": 2023, + "time_per_iteration": 2.581556558609009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074791, + "balance_loss_mlp": 1.03933799, + "epoch": 0.3893805309734513, + "flos": 628770663936.0, + "grad_norm": 0.04985256691663517, + "language_loss": 0.90055227, + "learning_rate": 0.0006978703506098102, + "loss": 0.91130018, + "num_input_tokens_seen": 168424272, + "router_z_loss_mlp": 0.35498047, + "step": 2024, + "time_per_iteration": 2.7220654487609863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080786, + "balance_loss_mlp": 1.04497564, + "epoch": 0.3895729126587149, + "flos": 543894234624.0, + "grad_norm": 0.06500254639996711, + "language_loss": 0.88078821, + "learning_rate": 0.00069758420350879, + "loss": 0.89159608, + "num_input_tokens_seen": 168488912, + "router_z_loss_mlp": 0.35839844, + "step": 2025, + "time_per_iteration": 2.6044023036956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086131, + "balance_loss_mlp": 1.05012965, + "epoch": 0.38976529434397844, + "flos": 617987778048.0, + "grad_norm": 0.06153368317516065, + "language_loss": 0.86008936, + "learning_rate": 0.000697297979698779, + "loss": 0.87095064, + "num_input_tokens_seen": 168563248, + "router_z_loss_mlp": 0.36010742, + "step": 2026, + "time_per_iteration": 2.7051053047180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091671, + "balance_loss_mlp": 1.05628932, + "epoch": 0.38995767602924203, + "flos": 834518287872.0, + "grad_norm": 0.05732441037152358, + "language_loss": 0.83766049, + "learning_rate": 0.0006970116792908992, + "loss": 0.84857726, + "num_input_tokens_seen": 168648272, + "router_z_loss_mlp": 0.35400391, + "step": 2027, + "time_per_iteration": 3.086228847503662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096114, + "balance_loss_mlp": 1.06032705, + "epoch": 0.39015005771450556, + "flos": 541343651328.0, + "grad_norm": 0.060477391230123065, + "language_loss": 0.8159399, + "learning_rate": 0.000696725302396302, + "loss": 0.82690096, + "num_input_tokens_seen": 168721760, + "router_z_loss_mlp": 0.3581543, + "step": 2028, + "time_per_iteration": 2.6521902084350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093769, + "balance_loss_mlp": 1.05867422, + "epoch": 0.39034243939976915, + "flos": 1007102536704.0, + "grad_norm": 0.04866281229524116, + "language_loss": 0.85781944, + "learning_rate": 0.0006964388491261692, + "loss": 0.86875713, + "num_input_tokens_seen": 168803664, + "router_z_loss_mlp": 0.35131836, + "step": 2029, + "time_per_iteration": 3.2338647842407227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099777, + "balance_loss_mlp": 1.06401408, + "epoch": 0.3905348210850327, + "flos": 678723121152.0, + "grad_norm": 0.05278281932643199, + "language_loss": 0.87335277, + "learning_rate": 0.0006961523195917114, + "loss": 0.88435054, + "num_input_tokens_seen": 168879184, + "router_z_loss_mlp": 0.35791016, + "step": 2030, + "time_per_iteration": 2.8414504528045654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098277, + "balance_loss_mlp": 1.06256151, + "epoch": 0.39072720277029627, + "flos": 548606905344.0, + "grad_norm": 0.05643291477722073, + "language_loss": 0.77850938, + "learning_rate": 0.0006958657139041696, + "loss": 0.78949213, + "num_input_tokens_seen": 168957808, + "router_z_loss_mlp": 0.35742188, + "step": 2031, + "time_per_iteration": 2.7278060913085938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091715, + "balance_loss_mlp": 1.07807708, + "epoch": 0.39091958445555985, + "flos": 1546912401408.0, + "grad_norm": 0.03426807627657635, + "language_loss": 0.76712966, + "learning_rate": 0.0006955790321748136, + "loss": 0.77804685, + "num_input_tokens_seen": 169194416, + "router_z_loss_mlp": 0.13671875, + "step": 2032, + "time_per_iteration": 4.927444696426392 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099209, + "balance_loss_mlp": 1.06399429, + "epoch": 0.3911119661408234, + "flos": 503741058048.0, + "grad_norm": 0.050822615130563034, + "language_loss": 0.78371578, + "learning_rate": 0.0006952922745149434, + "loss": 0.79470789, + "num_input_tokens_seen": 169263552, + "router_z_loss_mlp": 0.35253906, + "step": 2033, + "time_per_iteration": 2.6730306148529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095244, + "balance_loss_mlp": 1.05933857, + "epoch": 0.391304347826087, + "flos": 556967245824.0, + "grad_norm": 0.05118619150019999, + "language_loss": 0.87770367, + "learning_rate": 0.000695005441035888, + "loss": 0.88865614, + "num_input_tokens_seen": 169333696, + "router_z_loss_mlp": 0.359375, + "step": 2034, + "time_per_iteration": 2.6585283279418945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038632, + "balance_loss_mlp": 1.02461255, + "epoch": 0.3914967295113505, + "flos": 1499309858304.0, + "grad_norm": 0.00946210886057752, + "language_loss": 0.73723435, + "learning_rate": 0.0006947185318490064, + "loss": 0.74762058, + "num_input_tokens_seen": 169556416, + "router_z_loss_mlp": 0.140625, + "step": 2035, + "time_per_iteration": 4.8464648723602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087203, + "balance_loss_mlp": 1.05182171, + "epoch": 0.3916891111966141, + "flos": 706715518464.0, + "grad_norm": 0.07007748344060821, + "language_loss": 0.81416976, + "learning_rate": 0.0006944315470656863, + "loss": 0.82504177, + "num_input_tokens_seen": 169643312, + "router_z_loss_mlp": 0.35424805, + "step": 2036, + "time_per_iteration": 2.9289584159851074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010868, + "balance_loss_mlp": 1.05051255, + "epoch": 0.3918814928818776, + "flos": 556085537280.0, + "grad_norm": 0.05570869743183256, + "language_loss": 0.91007531, + "learning_rate": 0.000694144486797345, + "loss": 0.92094326, + "num_input_tokens_seen": 169712560, + "router_z_loss_mlp": 0.36303711, + "step": 2037, + "time_per_iteration": 0.0013861656188964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043662, + "balance_loss_mlp": 1.02954793, + "epoch": 0.3920738745671412, + "flos": 1537845032448.0, + "grad_norm": 0.018656318140729232, + "language_loss": 0.79520434, + "learning_rate": 0.0006938573511554296, + "loss": 0.80564094, + "num_input_tokens_seen": 169914912, + "router_z_loss_mlp": 0.14160156, + "step": 2038, + "time_per_iteration": 4.6249003410339355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091153, + "balance_loss_mlp": 1.05586696, + "epoch": 0.39226625625240474, + "flos": 498594811392.0, + "grad_norm": 0.06764177247916761, + "language_loss": 0.89479941, + "learning_rate": 0.0006935701402514156, + "loss": 0.90571094, + "num_input_tokens_seen": 169978848, + "router_z_loss_mlp": 0.35327148, + "step": 2039, + "time_per_iteration": 2.535269260406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033365, + "balance_loss_mlp": 1.01963174, + "epoch": 0.39245863793766833, + "flos": 1346465203200.0, + "grad_norm": 0.017448285120256823, + "language_loss": 0.73034894, + "learning_rate": 0.0006932828541968083, + "loss": 0.7406826, + "num_input_tokens_seen": 170211488, + "router_z_loss_mlp": 0.13769531, + "step": 2040, + "time_per_iteration": 4.913180589675903 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086444, + "balance_loss_mlp": 1.05072939, + "epoch": 0.3926510196229319, + "flos": 1345614474240.0, + "grad_norm": 0.055350347752650936, + "language_loss": 0.8453002, + "learning_rate": 0.0006929954931031422, + "loss": 0.85616457, + "num_input_tokens_seen": 170298528, + "router_z_loss_mlp": 0.35742188, + "step": 2041, + "time_per_iteration": 3.6796491146087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081967, + "balance_loss_mlp": 1.04722965, + "epoch": 0.39284340130819545, + "flos": 499333925376.0, + "grad_norm": 0.05434437059814268, + "language_loss": 0.88856936, + "learning_rate": 0.0006927080570819805, + "loss": 0.89938903, + "num_input_tokens_seen": 170365680, + "router_z_loss_mlp": 0.34790039, + "step": 2042, + "time_per_iteration": 2.634052038192749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087578, + "balance_loss_mlp": 1.05217266, + "epoch": 0.39303578299345904, + "flos": 520077625344.0, + "grad_norm": 0.06468620716873735, + "language_loss": 0.80649555, + "learning_rate": 0.0006924205462449161, + "loss": 0.81737131, + "num_input_tokens_seen": 170432224, + "router_z_loss_mlp": 0.35473633, + "step": 2043, + "time_per_iteration": 2.6021780967712402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078315, + "balance_loss_mlp": 1.04288566, + "epoch": 0.39322816467872257, + "flos": 907529301504.0, + "grad_norm": 0.05516365318311268, + "language_loss": 0.82013571, + "learning_rate": 0.0006921329607035702, + "loss": 0.83091891, + "num_input_tokens_seen": 170517920, + "router_z_loss_mlp": 0.35473633, + "step": 2044, + "time_per_iteration": 3.2195992469787598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078665, + "balance_loss_mlp": 1.04473805, + "epoch": 0.39342054636398616, + "flos": 517330603008.0, + "grad_norm": 0.046703626280748714, + "language_loss": 0.88374329, + "learning_rate": 0.0006918453005695938, + "loss": 0.89452994, + "num_input_tokens_seen": 170589072, + "router_z_loss_mlp": 0.33959961, + "step": 2045, + "time_per_iteration": 2.6319282054901123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080714, + "balance_loss_mlp": 1.04435515, + "epoch": 0.3936129280492497, + "flos": 547646621184.0, + "grad_norm": 0.04434497339872072, + "language_loss": 0.8422206, + "learning_rate": 0.0006915575659546662, + "loss": 0.8530277, + "num_input_tokens_seen": 170657856, + "router_z_loss_mlp": 0.36401367, + "step": 2046, + "time_per_iteration": 2.648895263671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081267, + "balance_loss_mlp": 1.04519427, + "epoch": 0.3938053097345133, + "flos": 525858269184.0, + "grad_norm": 0.0524234289418272, + "language_loss": 0.80648899, + "learning_rate": 0.0006912697569704959, + "loss": 0.81730163, + "num_input_tokens_seen": 170723696, + "router_z_loss_mlp": 0.36083984, + "step": 2047, + "time_per_iteration": 2.6330111026763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085798, + "balance_loss_mlp": 1.04934382, + "epoch": 0.39399769141977686, + "flos": 471390990336.0, + "grad_norm": 0.0542278175669412, + "language_loss": 0.86721706, + "learning_rate": 0.0006909818737288205, + "loss": 0.87807506, + "num_input_tokens_seen": 170789536, + "router_z_loss_mlp": 0.36450195, + "step": 2048, + "time_per_iteration": 2.537581205368042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090734, + "balance_loss_mlp": 1.05468488, + "epoch": 0.3941900731050404, + "flos": 501490220544.0, + "grad_norm": 0.056383256559611315, + "language_loss": 0.80660325, + "learning_rate": 0.000690693916341406, + "loss": 0.8175106, + "num_input_tokens_seen": 170859232, + "router_z_loss_mlp": 0.3605957, + "step": 2049, + "time_per_iteration": 2.622183084487915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096224, + "balance_loss_mlp": 1.05898309, + "epoch": 0.394382454790304, + "flos": 580577241600.0, + "grad_norm": 0.11468284139168772, + "language_loss": 0.82465422, + "learning_rate": 0.0006904058849200475, + "loss": 0.83561641, + "num_input_tokens_seen": 170931568, + "router_z_loss_mlp": 0.37255859, + "step": 2050, + "time_per_iteration": 2.7216436862945557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087551, + "balance_loss_mlp": 1.05088186, + "epoch": 0.3945748364755675, + "flos": 513563659776.0, + "grad_norm": 0.05187056217607278, + "language_loss": 0.84988606, + "learning_rate": 0.0006901177795765683, + "loss": 0.86076152, + "num_input_tokens_seen": 170999856, + "router_z_loss_mlp": 0.36694336, + "step": 2051, + "time_per_iteration": 2.5725293159484863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079371, + "balance_loss_mlp": 1.04291642, + "epoch": 0.3947672181608311, + "flos": 593683748352.0, + "grad_norm": 0.0518129521666432, + "language_loss": 0.8131091, + "learning_rate": 0.0006898296004228213, + "loss": 0.82390279, + "num_input_tokens_seen": 171072320, + "router_z_loss_mlp": 0.36450195, + "step": 2052, + "time_per_iteration": 2.6969447135925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050959, + "balance_loss_mlp": 1.0379895, + "epoch": 0.39495959984609463, + "flos": 1546829443584.0, + "grad_norm": 0.029989620736742544, + "language_loss": 0.7812674, + "learning_rate": 0.0006895413475706873, + "loss": 0.79177701, + "num_input_tokens_seen": 171304128, + "router_z_loss_mlp": 0.12988281, + "step": 2053, + "time_per_iteration": 4.8486199378967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078173, + "balance_loss_mlp": 1.04233825, + "epoch": 0.3951519815313582, + "flos": 496271190528.0, + "grad_norm": 0.06693197740080077, + "language_loss": 0.80026031, + "learning_rate": 0.0006892530211320763, + "loss": 0.81104207, + "num_input_tokens_seen": 171377392, + "router_z_loss_mlp": 0.35864258, + "step": 2054, + "time_per_iteration": 2.6731534004211426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082597, + "balance_loss_mlp": 1.04657161, + "epoch": 0.39534436321662175, + "flos": 530934704640.0, + "grad_norm": 0.06400198340094926, + "language_loss": 0.8367995, + "learning_rate": 0.000688964621218926, + "loss": 0.84762549, + "num_input_tokens_seen": 171447424, + "router_z_loss_mlp": 0.36010742, + "step": 2055, + "time_per_iteration": 2.623870611190796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107818, + "balance_loss_mlp": 1.0422982, + "epoch": 0.39553674490188534, + "flos": 702224017920.0, + "grad_norm": 0.05929287076568038, + "language_loss": 0.80154717, + "learning_rate": 0.0006886761479432037, + "loss": 0.81232893, + "num_input_tokens_seen": 171519920, + "router_z_loss_mlp": 0.35864258, + "step": 2056, + "time_per_iteration": 2.810593366622925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088441, + "balance_loss_mlp": 1.05263042, + "epoch": 0.3957291265871489, + "flos": 409552768512.0, + "grad_norm": 0.05784227470554994, + "language_loss": 0.84645867, + "learning_rate": 0.0006883876014169045, + "loss": 0.85734308, + "num_input_tokens_seen": 171583856, + "router_z_loss_mlp": 0.3581543, + "step": 2057, + "time_per_iteration": 2.464358329772949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088007, + "balance_loss_mlp": 1.05121863, + "epoch": 0.39592150827241246, + "flos": 618204566016.0, + "grad_norm": 0.05454135250908964, + "language_loss": 0.90161431, + "learning_rate": 0.000688098981752052, + "loss": 0.91249436, + "num_input_tokens_seen": 171656064, + "router_z_loss_mlp": 0.36791992, + "step": 2058, + "time_per_iteration": 2.742589235305786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094817, + "balance_loss_mlp": 1.05819607, + "epoch": 0.39611388995767605, + "flos": 820986969600.0, + "grad_norm": 0.05656267147709111, + "language_loss": 0.80105305, + "learning_rate": 0.0006878102890606982, + "loss": 0.81200117, + "num_input_tokens_seen": 171738800, + "router_z_loss_mlp": 0.36621094, + "step": 2059, + "time_per_iteration": 3.0725343227386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096787, + "balance_loss_mlp": 1.06018949, + "epoch": 0.3963062716429396, + "flos": 491977539072.0, + "grad_norm": 0.0710153527902746, + "language_loss": 0.81321216, + "learning_rate": 0.0006875215234549239, + "loss": 0.82418001, + "num_input_tokens_seen": 171803664, + "router_z_loss_mlp": 0.3659668, + "step": 2060, + "time_per_iteration": 2.542090654373169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097398, + "balance_loss_mlp": 1.06015694, + "epoch": 0.39649865332820317, + "flos": 584466430464.0, + "grad_norm": 0.08966956269211096, + "language_loss": 0.8554219, + "learning_rate": 0.0006872326850468376, + "loss": 0.86639589, + "num_input_tokens_seen": 171871968, + "router_z_loss_mlp": 0.37231445, + "step": 2061, + "time_per_iteration": 2.7194440364837646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010999, + "balance_loss_mlp": 1.06139588, + "epoch": 0.3966910350134667, + "flos": 458328153600.0, + "grad_norm": 0.05276871533818733, + "language_loss": 0.78609985, + "learning_rate": 0.0006869437739485762, + "loss": 0.79709888, + "num_input_tokens_seen": 171942368, + "router_z_loss_mlp": 0.38476562, + "step": 2062, + "time_per_iteration": 2.6032114028930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089079, + "balance_loss_mlp": 1.05281568, + "epoch": 0.3968834166987303, + "flos": 508388299776.0, + "grad_norm": 0.05735909750828215, + "language_loss": 0.93035084, + "learning_rate": 0.0006866547902723053, + "loss": 0.94124162, + "num_input_tokens_seen": 172012336, + "router_z_loss_mlp": 0.36279297, + "step": 2063, + "time_per_iteration": 2.666294813156128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097295, + "balance_loss_mlp": 1.06110287, + "epoch": 0.3970757983839938, + "flos": 572349321216.0, + "grad_norm": 0.05819495660266105, + "language_loss": 0.79961425, + "learning_rate": 0.000686365734130218, + "loss": 0.81058717, + "num_input_tokens_seen": 172084640, + "router_z_loss_mlp": 0.36206055, + "step": 2064, + "time_per_iteration": 2.667443037033081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093878, + "balance_loss_mlp": 1.05639839, + "epoch": 0.3972681800692574, + "flos": 481391092224.0, + "grad_norm": 0.051061390118103664, + "language_loss": 0.84029245, + "learning_rate": 0.000686076605634536, + "loss": 0.85123128, + "num_input_tokens_seen": 172152992, + "router_z_loss_mlp": 0.37475586, + "step": 2065, + "time_per_iteration": 2.605252981185913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091284, + "balance_loss_mlp": 1.05406737, + "epoch": 0.397460561754521, + "flos": 487683887616.0, + "grad_norm": 0.060621892107923396, + "language_loss": 0.84327424, + "learning_rate": 0.0006857874048975088, + "loss": 0.85418713, + "num_input_tokens_seen": 172219312, + "router_z_loss_mlp": 0.37207031, + "step": 2066, + "time_per_iteration": 2.5779786109924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090658, + "balance_loss_mlp": 1.05329823, + "epoch": 0.3976529434397845, + "flos": 421768802304.0, + "grad_norm": 0.05929679602335689, + "language_loss": 0.86975348, + "learning_rate": 0.0006854981320314142, + "loss": 0.88066006, + "num_input_tokens_seen": 172282112, + "router_z_loss_mlp": 0.37353516, + "step": 2067, + "time_per_iteration": 2.4723403453826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082754, + "balance_loss_mlp": 1.04591799, + "epoch": 0.3978453251250481, + "flos": 545331764736.0, + "grad_norm": 0.058605050617464606, + "language_loss": 0.86758339, + "learning_rate": 0.0006852087871485579, + "loss": 0.87841094, + "num_input_tokens_seen": 172347872, + "router_z_loss_mlp": 0.36816406, + "step": 2068, + "time_per_iteration": 2.6007602214813232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082239, + "balance_loss_mlp": 1.04602289, + "epoch": 0.39803770681031164, + "flos": 650548841472.0, + "grad_norm": 0.06821675645960798, + "language_loss": 0.81689966, + "learning_rate": 0.0006849193703612735, + "loss": 0.82772201, + "num_input_tokens_seen": 172418560, + "router_z_loss_mlp": 0.36206055, + "step": 2069, + "time_per_iteration": 2.7661337852478027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083569, + "balance_loss_mlp": 1.04661417, + "epoch": 0.39823008849557523, + "flos": 739734888960.0, + "grad_norm": 0.059947122372600754, + "language_loss": 0.77649361, + "learning_rate": 0.0006846298817819225, + "loss": 0.78732932, + "num_input_tokens_seen": 172497984, + "router_z_loss_mlp": 0.36987305, + "step": 2070, + "time_per_iteration": 2.9364843368530273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091151, + "balance_loss_mlp": 1.05436325, + "epoch": 0.39842247018083876, + "flos": 384825337344.0, + "grad_norm": 0.0736862776590319, + "language_loss": 0.80732799, + "learning_rate": 0.0006843403215228945, + "loss": 0.81823957, + "num_input_tokens_seen": 172560112, + "router_z_loss_mlp": 0.36767578, + "step": 2071, + "time_per_iteration": 2.4597537517547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078334, + "balance_loss_mlp": 1.04133189, + "epoch": 0.39861485186610235, + "flos": 533431443456.0, + "grad_norm": 0.052578385162892496, + "language_loss": 0.80366135, + "learning_rate": 0.0006840506896966065, + "loss": 0.81444472, + "num_input_tokens_seen": 172636192, + "router_z_loss_mlp": 0.36962891, + "step": 2072, + "time_per_iteration": 2.6826841831207275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081243, + "balance_loss_mlp": 1.04397774, + "epoch": 0.39880723355136594, + "flos": 642834482688.0, + "grad_norm": 0.055481383737447196, + "language_loss": 0.82090193, + "learning_rate": 0.0006837609864155038, + "loss": 0.83171439, + "num_input_tokens_seen": 172715264, + "router_z_loss_mlp": 0.37255859, + "step": 2073, + "time_per_iteration": 2.8541154861450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075844, + "balance_loss_mlp": 1.0408442, + "epoch": 0.39899961523662947, + "flos": 515587534848.0, + "grad_norm": 0.07686004257588779, + "language_loss": 0.83464944, + "learning_rate": 0.0006834712117920592, + "loss": 0.84540784, + "num_input_tokens_seen": 172783456, + "router_z_loss_mlp": 0.3503418, + "step": 2074, + "time_per_iteration": 2.6023800373077393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107731, + "balance_loss_mlp": 1.04025948, + "epoch": 0.39919199692189306, + "flos": 464148085248.0, + "grad_norm": 0.0625246810856132, + "language_loss": 0.85794407, + "learning_rate": 0.0006831813659387729, + "loss": 0.86871719, + "num_input_tokens_seen": 172848928, + "router_z_loss_mlp": 0.37036133, + "step": 2075, + "time_per_iteration": 2.5916242599487305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071695, + "balance_loss_mlp": 1.0353837, + "epoch": 0.3993843786071566, + "flos": 531382837248.0, + "grad_norm": 0.05588277312371317, + "language_loss": 0.84130096, + "learning_rate": 0.0006828914489681733, + "loss": 0.852018, + "num_input_tokens_seen": 172921152, + "router_z_loss_mlp": 0.36303711, + "step": 2076, + "time_per_iteration": 2.7014079093933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078543, + "balance_loss_mlp": 1.04168355, + "epoch": 0.3995767602924202, + "flos": 503701770240.0, + "grad_norm": 0.05616101270921505, + "language_loss": 0.85284638, + "learning_rate": 0.0006826014609928162, + "loss": 0.86363184, + "num_input_tokens_seen": 172998864, + "router_z_loss_mlp": 0.36816406, + "step": 2077, + "time_per_iteration": 2.6714975833892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089101, + "balance_loss_mlp": 1.07622623, + "epoch": 0.3997691419776837, + "flos": 1453780500480.0, + "grad_norm": 0.03492718818999835, + "language_loss": 0.83199388, + "learning_rate": 0.0006823114021252846, + "loss": 0.8428849, + "num_input_tokens_seen": 173219216, + "router_z_loss_mlp": 0.12890625, + "step": 2078, + "time_per_iteration": 4.8198041915893555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075675, + "balance_loss_mlp": 1.03943551, + "epoch": 0.3999615236629473, + "flos": 530418170880.0, + "grad_norm": 0.06060184252075253, + "language_loss": 0.79984158, + "learning_rate": 0.0006820212724781896, + "loss": 0.81059831, + "num_input_tokens_seen": 173292000, + "router_z_loss_mlp": 0.36279297, + "step": 2079, + "time_per_iteration": 2.725576400756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077383, + "balance_loss_mlp": 1.04209709, + "epoch": 0.4001539053482108, + "flos": 694823961600.0, + "grad_norm": 0.12722864956638674, + "language_loss": 0.83843565, + "learning_rate": 0.0006817310721641694, + "loss": 0.84920955, + "num_input_tokens_seen": 173365568, + "router_z_loss_mlp": 0.35302734, + "step": 2080, + "time_per_iteration": 2.808981418609619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083492, + "balance_loss_mlp": 1.04732358, + "epoch": 0.4003462870334744, + "flos": 520102356480.0, + "grad_norm": 0.09864886938158902, + "language_loss": 0.84025592, + "learning_rate": 0.00068144080129589, + "loss": 0.85109079, + "num_input_tokens_seen": 173430144, + "router_z_loss_mlp": 0.36181641, + "step": 2081, + "time_per_iteration": 2.61795973777771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090466, + "balance_loss_mlp": 1.05403543, + "epoch": 0.400538668718738, + "flos": 492272902656.0, + "grad_norm": 0.05814134634807872, + "language_loss": 0.83103502, + "learning_rate": 0.0006811504599860441, + "loss": 0.84193969, + "num_input_tokens_seen": 173494464, + "router_z_loss_mlp": 0.36450195, + "step": 2082, + "time_per_iteration": 2.5586163997650146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109661, + "balance_loss_mlp": 1.06161022, + "epoch": 0.40073105040400153, + "flos": 490083111936.0, + "grad_norm": 0.05292967428813452, + "language_loss": 0.85547149, + "learning_rate": 0.0006808600483473526, + "loss": 0.86643761, + "num_input_tokens_seen": 173577168, + "router_z_loss_mlp": 0.35058594, + "step": 2083, + "time_per_iteration": 2.8549985885620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110158, + "balance_loss_mlp": 1.06584144, + "epoch": 0.4009234320892651, + "flos": 562088761344.0, + "grad_norm": 0.051341860757237005, + "language_loss": 0.85926497, + "learning_rate": 0.0006805695664925629, + "loss": 0.87028074, + "num_input_tokens_seen": 173655632, + "router_z_loss_mlp": 0.35791016, + "step": 2084, + "time_per_iteration": 2.7807514667510986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111507, + "balance_loss_mlp": 1.07619727, + "epoch": 0.40111581377452865, + "flos": 425786029056.0, + "grad_norm": 0.07139972521672847, + "language_loss": 0.84098327, + "learning_rate": 0.0006802790145344506, + "loss": 0.85209835, + "num_input_tokens_seen": 173719040, + "router_z_loss_mlp": 0.35327148, + "step": 2085, + "time_per_iteration": 2.4653491973876953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106176, + "balance_loss_mlp": 1.07024658, + "epoch": 0.40130819545979224, + "flos": 612148907520.0, + "grad_norm": 0.09859033966702202, + "language_loss": 0.87080699, + "learning_rate": 0.0006799883925858176, + "loss": 0.88186872, + "num_input_tokens_seen": 173796704, + "router_z_loss_mlp": 0.35961914, + "step": 2086, + "time_per_iteration": 2.8432652950286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101686, + "balance_loss_mlp": 1.06580365, + "epoch": 0.40150057714505577, + "flos": 523179648000.0, + "grad_norm": 0.06735788816740666, + "language_loss": 0.85303611, + "learning_rate": 0.0006796977007594933, + "loss": 0.86405295, + "num_input_tokens_seen": 173862352, + "router_z_loss_mlp": 0.35913086, + "step": 2087, + "time_per_iteration": 2.597883701324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109903, + "balance_loss_mlp": 1.06240904, + "epoch": 0.40169295883031936, + "flos": 561143033856.0, + "grad_norm": 0.0524220318715257, + "language_loss": 0.86402881, + "learning_rate": 0.0006794069391683345, + "loss": 0.87501919, + "num_input_tokens_seen": 173935408, + "router_z_loss_mlp": 0.36621094, + "step": 2088, + "time_per_iteration": 2.7313365936279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101072, + "balance_loss_mlp": 1.06414104, + "epoch": 0.4018853405155829, + "flos": 518743401984.0, + "grad_norm": 0.056795041649419745, + "language_loss": 0.80919069, + "learning_rate": 0.0006791161079252248, + "loss": 0.8202014, + "num_input_tokens_seen": 174007152, + "router_z_loss_mlp": 0.36914062, + "step": 2089, + "time_per_iteration": 2.57450532913208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109207, + "balance_loss_mlp": 1.05652201, + "epoch": 0.4020777222008465, + "flos": 525957193728.0, + "grad_norm": 0.05166370887572794, + "language_loss": 0.82473212, + "learning_rate": 0.0006788252071430747, + "loss": 0.83565277, + "num_input_tokens_seen": 174074976, + "router_z_loss_mlp": 0.35546875, + "step": 2090, + "time_per_iteration": 2.6603012084960938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097817, + "balance_loss_mlp": 1.06100535, + "epoch": 0.40227010388611006, + "flos": 525494504448.0, + "grad_norm": 0.056931817338158205, + "language_loss": 0.86595076, + "learning_rate": 0.0006785342369348222, + "loss": 0.87692893, + "num_input_tokens_seen": 174149392, + "router_z_loss_mlp": 0.3684082, + "step": 2091, + "time_per_iteration": 2.807980537414551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091263, + "balance_loss_mlp": 1.05566692, + "epoch": 0.4024624855713736, + "flos": 432074442240.0, + "grad_norm": 0.0736357586886409, + "language_loss": 0.79799104, + "learning_rate": 0.0006782431974134316, + "loss": 0.80890369, + "num_input_tokens_seen": 174214656, + "router_z_loss_mlp": 0.35668945, + "step": 2092, + "time_per_iteration": 2.5331132411956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097919, + "balance_loss_mlp": 1.06044006, + "epoch": 0.4026548672566372, + "flos": 766304312832.0, + "grad_norm": 0.05288336614740697, + "language_loss": 0.89230573, + "learning_rate": 0.0006779520886918949, + "loss": 0.90328491, + "num_input_tokens_seen": 174296064, + "router_z_loss_mlp": 0.375, + "step": 2093, + "time_per_iteration": 3.014895439147949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093032, + "balance_loss_mlp": 1.0560298, + "epoch": 0.4028472489419007, + "flos": 642636633600.0, + "grad_norm": 0.05102527643704043, + "language_loss": 0.8125242, + "learning_rate": 0.0006776609108832301, + "loss": 0.8234545, + "num_input_tokens_seen": 174370896, + "router_z_loss_mlp": 0.36987305, + "step": 2094, + "time_per_iteration": 2.7778923511505127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089446, + "balance_loss_mlp": 1.05311072, + "epoch": 0.4030396306271643, + "flos": 491593425408.0, + "grad_norm": 0.053262929353227066, + "language_loss": 0.84942901, + "learning_rate": 0.0006773696641004828, + "loss": 0.86032349, + "num_input_tokens_seen": 174438448, + "router_z_loss_mlp": 0.36352539, + "step": 2095, + "time_per_iteration": 2.580313205718994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089479, + "balance_loss_mlp": 1.05238152, + "epoch": 0.40323201231242783, + "flos": 901363133952.0, + "grad_norm": 0.05931554649921985, + "language_loss": 0.77618563, + "learning_rate": 0.0006770783484567247, + "loss": 0.78708041, + "num_input_tokens_seen": 174525952, + "router_z_loss_mlp": 0.37109375, + "step": 2096, + "time_per_iteration": 3.0955684185028076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089916, + "balance_loss_mlp": 1.0536046, + "epoch": 0.4034243939976914, + "flos": 570267219456.0, + "grad_norm": 0.07944545156942663, + "language_loss": 0.8587091, + "learning_rate": 0.000676786964065055, + "loss": 0.86960828, + "num_input_tokens_seen": 174607200, + "router_z_loss_mlp": 0.36303711, + "step": 2097, + "time_per_iteration": 2.742293119430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084372, + "balance_loss_mlp": 1.04829895, + "epoch": 0.403616775682955, + "flos": 507206845440.0, + "grad_norm": 0.04869402927646331, + "language_loss": 0.78305566, + "learning_rate": 0.0006764955110385986, + "loss": 0.79389936, + "num_input_tokens_seen": 174680976, + "router_z_loss_mlp": 0.3605957, + "step": 2098, + "time_per_iteration": 2.708390235900879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086865, + "balance_loss_mlp": 1.05055428, + "epoch": 0.40380915736821854, + "flos": 519127515648.0, + "grad_norm": 0.06727344126892942, + "language_loss": 0.80247992, + "learning_rate": 0.0006762039894905083, + "loss": 0.81334853, + "num_input_tokens_seen": 174753152, + "router_z_loss_mlp": 0.36328125, + "step": 2099, + "time_per_iteration": 2.6428377628326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095654, + "balance_loss_mlp": 1.05812716, + "epoch": 0.40400153905348213, + "flos": 441686048256.0, + "grad_norm": 0.06575852305434472, + "language_loss": 0.80233693, + "learning_rate": 0.000675912399533962, + "loss": 0.81329346, + "num_input_tokens_seen": 174817184, + "router_z_loss_mlp": 0.375, + "step": 2100, + "time_per_iteration": 2.5560812950134277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088947, + "balance_loss_mlp": 1.05249298, + "epoch": 0.40419392073874566, + "flos": 771961300992.0, + "grad_norm": 0.1036114098840327, + "language_loss": 0.85183066, + "learning_rate": 0.0006756207412821656, + "loss": 0.86272013, + "num_input_tokens_seen": 174898128, + "router_z_loss_mlp": 0.36450195, + "step": 2101, + "time_per_iteration": 2.986583709716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086168, + "balance_loss_mlp": 1.05021429, + "epoch": 0.40438630242400925, + "flos": 766215562752.0, + "grad_norm": 0.06055449439143942, + "language_loss": 0.80025709, + "learning_rate": 0.0006753290148483505, + "loss": 0.81111872, + "num_input_tokens_seen": 174981872, + "router_z_loss_mlp": 0.36010742, + "step": 2102, + "time_per_iteration": 3.0076749324798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080415, + "balance_loss_mlp": 1.04491425, + "epoch": 0.4045786841092728, + "flos": 415013317632.0, + "grad_norm": 0.052033945118291625, + "language_loss": 0.7866869, + "learning_rate": 0.0006750372203457752, + "loss": 0.79749095, + "num_input_tokens_seen": 175044976, + "router_z_loss_mlp": 0.35546875, + "step": 2103, + "time_per_iteration": 2.490941286087036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083553, + "balance_loss_mlp": 1.04767144, + "epoch": 0.40477106579453637, + "flos": 538941454848.0, + "grad_norm": 0.07087529891902919, + "language_loss": 0.86455047, + "learning_rate": 0.0006747453578877242, + "loss": 0.875386, + "num_input_tokens_seen": 175121104, + "router_z_loss_mlp": 0.35864258, + "step": 2104, + "time_per_iteration": 2.6906399726867676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083688, + "balance_loss_mlp": 1.04766345, + "epoch": 0.4049634474797999, + "flos": 826358768640.0, + "grad_norm": 0.07644078595746046, + "language_loss": 0.82677126, + "learning_rate": 0.0006744534275875085, + "loss": 0.83760816, + "num_input_tokens_seen": 175194512, + "router_z_loss_mlp": 0.3605957, + "step": 2105, + "time_per_iteration": 2.9925642013549805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081927, + "balance_loss_mlp": 1.0459255, + "epoch": 0.4051558291650635, + "flos": 572417722368.0, + "grad_norm": 0.07127110995979934, + "language_loss": 0.8562066, + "learning_rate": 0.0006741614295584657, + "loss": 0.86702585, + "num_input_tokens_seen": 175264176, + "router_z_loss_mlp": 0.36010742, + "step": 2106, + "time_per_iteration": 2.6289658546447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078264, + "balance_loss_mlp": 1.04321659, + "epoch": 0.4053482108503271, + "flos": 731541874176.0, + "grad_norm": 0.07814638610947379, + "language_loss": 0.78334522, + "learning_rate": 0.0006738693639139595, + "loss": 0.79412782, + "num_input_tokens_seen": 175347488, + "router_z_loss_mlp": 0.35083008, + "step": 2107, + "time_per_iteration": 3.0381481647491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078707, + "balance_loss_mlp": 1.04234815, + "epoch": 0.4055405925355906, + "flos": 1212588292608.0, + "grad_norm": 0.05182127384415646, + "language_loss": 0.77652568, + "learning_rate": 0.0006735772307673796, + "loss": 0.78731275, + "num_input_tokens_seen": 175438336, + "router_z_loss_mlp": 0.36376953, + "step": 2108, + "time_per_iteration": 3.5424931049346924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075462, + "balance_loss_mlp": 1.03998494, + "epoch": 0.4057329742208542, + "flos": 715553104896.0, + "grad_norm": 0.0496802449600099, + "language_loss": 0.83129466, + "learning_rate": 0.0006732850302321421, + "loss": 0.84204924, + "num_input_tokens_seen": 175510912, + "router_z_loss_mlp": 0.35498047, + "step": 2109, + "time_per_iteration": 2.902758836746216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081445, + "balance_loss_mlp": 1.04506207, + "epoch": 0.4059253559061177, + "flos": 564623377920.0, + "grad_norm": 0.054690107844022846, + "language_loss": 0.84019876, + "learning_rate": 0.00067299276242169, + "loss": 0.85101312, + "num_input_tokens_seen": 175583040, + "router_z_loss_mlp": 0.36376953, + "step": 2110, + "time_per_iteration": 2.6453192234039307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108684, + "balance_loss_mlp": 1.07272601, + "epoch": 0.4061177375913813, + "flos": 1592886919680.0, + "grad_norm": 0.03852995701507201, + "language_loss": 0.74382168, + "learning_rate": 0.0006727004274494908, + "loss": 0.75469011, + "num_input_tokens_seen": 175817952, + "router_z_loss_mlp": 0.14160156, + "step": 2111, + "time_per_iteration": 4.936276197433472 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092328, + "balance_loss_mlp": 1.05587411, + "epoch": 0.40631011927664484, + "flos": 615122892288.0, + "grad_norm": 0.05227822307204106, + "language_loss": 0.77911901, + "learning_rate": 0.0006724080254290395, + "loss": 0.79004228, + "num_input_tokens_seen": 175896352, + "router_z_loss_mlp": 0.36425781, + "step": 2112, + "time_per_iteration": 2.804931402206421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085219, + "balance_loss_mlp": 1.04893136, + "epoch": 0.40650250096190843, + "flos": 557390647296.0, + "grad_norm": 0.056265148252134925, + "language_loss": 0.89716649, + "learning_rate": 0.0006721155564738566, + "loss": 0.90801871, + "num_input_tokens_seen": 175967152, + "router_z_loss_mlp": 0.36303711, + "step": 2113, + "time_per_iteration": 2.756901502609253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050781, + "balance_loss_mlp": 1.03676188, + "epoch": 0.40669488264717196, + "flos": 1579301756928.0, + "grad_norm": 0.015026311101099392, + "language_loss": 0.78622639, + "learning_rate": 0.0006718230206974884, + "loss": 0.79673421, + "num_input_tokens_seen": 176205248, + "router_z_loss_mlp": 0.140625, + "step": 2114, + "time_per_iteration": 4.975963354110718 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109673, + "balance_loss_mlp": 1.0599184, + "epoch": 0.40688726433243555, + "flos": 507398902272.0, + "grad_norm": 0.07464761746525102, + "language_loss": 0.85648221, + "learning_rate": 0.0006715304182135078, + "loss": 0.86744952, + "num_input_tokens_seen": 176276208, + "router_z_loss_mlp": 0.36816406, + "step": 2115, + "time_per_iteration": 2.5924360752105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104151, + "balance_loss_mlp": 1.06726742, + "epoch": 0.40707964601769914, + "flos": 588757109760.0, + "grad_norm": 0.06427267203463374, + "language_loss": 0.88647795, + "learning_rate": 0.0006712377491355127, + "loss": 0.89751947, + "num_input_tokens_seen": 176355072, + "router_z_loss_mlp": 0.36889648, + "step": 2116, + "time_per_iteration": 2.887439489364624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097518, + "balance_loss_mlp": 1.06135035, + "epoch": 0.40727202770296267, + "flos": 580134901248.0, + "grad_norm": 0.10612280790481599, + "language_loss": 0.81211627, + "learning_rate": 0.0006709450135771274, + "loss": 0.82309151, + "num_input_tokens_seen": 176444592, + "router_z_loss_mlp": 0.36206055, + "step": 2117, + "time_per_iteration": 2.9730725288391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101548, + "balance_loss_mlp": 1.06523705, + "epoch": 0.40746440938822626, + "flos": 503819633664.0, + "grad_norm": 0.05032701187252936, + "language_loss": 0.86683893, + "learning_rate": 0.0006706522116520023, + "loss": 0.87785447, + "num_input_tokens_seen": 176516144, + "router_z_loss_mlp": 0.36328125, + "step": 2118, + "time_per_iteration": 2.6400580406188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096769, + "balance_loss_mlp": 1.06122053, + "epoch": 0.4076567910734898, + "flos": 605323611648.0, + "grad_norm": 0.05658204986861598, + "language_loss": 0.82839441, + "learning_rate": 0.0006703593434738127, + "loss": 0.83936214, + "num_input_tokens_seen": 176585712, + "router_z_loss_mlp": 0.35571289, + "step": 2119, + "time_per_iteration": 2.77944016456604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091629, + "balance_loss_mlp": 1.05622339, + "epoch": 0.4078491727587534, + "flos": 479313372672.0, + "grad_norm": 0.0532477275953574, + "language_loss": 0.78150344, + "learning_rate": 0.0006700664091562604, + "loss": 0.79241967, + "num_input_tokens_seen": 176654736, + "router_z_loss_mlp": 0.35449219, + "step": 2120, + "time_per_iteration": 2.580658435821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093922, + "balance_loss_mlp": 1.05780149, + "epoch": 0.4080415544440169, + "flos": 510126985728.0, + "grad_norm": 0.045251762284626275, + "language_loss": 0.85188484, + "learning_rate": 0.0006697734088130725, + "loss": 0.86282408, + "num_input_tokens_seen": 176722800, + "router_z_loss_mlp": 0.36157227, + "step": 2121, + "time_per_iteration": 2.5990941524505615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108927, + "balance_loss_mlp": 1.05329287, + "epoch": 0.4082339361292805, + "flos": 734318009856.0, + "grad_norm": 0.06207508790269206, + "language_loss": 0.85326135, + "learning_rate": 0.0006694803425580018, + "loss": 0.86415404, + "num_input_tokens_seen": 176800320, + "router_z_loss_mlp": 0.36010742, + "step": 2122, + "time_per_iteration": 2.9514336585998535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093256, + "balance_loss_mlp": 1.05687356, + "epoch": 0.4084263178145441, + "flos": 457239831552.0, + "grad_norm": 0.08260422277145335, + "language_loss": 0.84467387, + "learning_rate": 0.0006691872105048268, + "loss": 0.85560644, + "num_input_tokens_seen": 176867440, + "router_z_loss_mlp": 0.36401367, + "step": 2123, + "time_per_iteration": 2.584765672683716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093552, + "balance_loss_mlp": 1.05762231, + "epoch": 0.4086186994998076, + "flos": 562659139584.0, + "grad_norm": 0.056985949085160005, + "language_loss": 0.84641832, + "learning_rate": 0.0006688940127673513, + "loss": 0.85735387, + "num_input_tokens_seen": 176942048, + "router_z_loss_mlp": 0.35961914, + "step": 2124, + "time_per_iteration": 2.698777675628662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100727, + "balance_loss_mlp": 1.06446397, + "epoch": 0.4088110811850712, + "flos": 573364859904.0, + "grad_norm": 0.04747345440626025, + "language_loss": 0.85754699, + "learning_rate": 0.0006686007494594049, + "loss": 0.86855423, + "num_input_tokens_seen": 177025104, + "router_z_loss_mlp": 0.36279297, + "step": 2125, + "time_per_iteration": 2.8035151958465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101938, + "balance_loss_mlp": 1.06538868, + "epoch": 0.40900346287033473, + "flos": 456702948864.0, + "grad_norm": 0.06322616011827766, + "language_loss": 0.80074888, + "learning_rate": 0.0006683074206948425, + "loss": 0.81176829, + "num_input_tokens_seen": 177089296, + "router_z_loss_mlp": 0.36547852, + "step": 2126, + "time_per_iteration": 2.4856953620910645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103432, + "balance_loss_mlp": 1.06697774, + "epoch": 0.4091958445555983, + "flos": 617097305088.0, + "grad_norm": 0.05684118517242104, + "language_loss": 0.8146261, + "learning_rate": 0.0006680140265875443, + "loss": 0.82566047, + "num_input_tokens_seen": 177163648, + "router_z_loss_mlp": 0.36474609, + "step": 2127, + "time_per_iteration": 2.772571325302124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111548, + "balance_loss_mlp": 1.07564259, + "epoch": 0.40938822624086185, + "flos": 472159217664.0, + "grad_norm": 0.051537767424008556, + "language_loss": 0.95483583, + "learning_rate": 0.0006677205672514162, + "loss": 0.96595132, + "num_input_tokens_seen": 177233856, + "router_z_loss_mlp": 0.35888672, + "step": 2128, + "time_per_iteration": 2.6006312370300293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114142, + "balance_loss_mlp": 1.07642448, + "epoch": 0.40958060792612544, + "flos": 569734718976.0, + "grad_norm": 0.04853999942998699, + "language_loss": 0.88646978, + "learning_rate": 0.000667427042800389, + "loss": 0.8976112, + "num_input_tokens_seen": 177309824, + "router_z_loss_mlp": 0.37670898, + "step": 2129, + "time_per_iteration": 2.742804765701294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107096, + "balance_loss_mlp": 1.07030797, + "epoch": 0.40977298961138897, + "flos": 609065823744.0, + "grad_norm": 0.053374560930054, + "language_loss": 0.8288517, + "learning_rate": 0.0006671334533484192, + "loss": 0.83992267, + "num_input_tokens_seen": 177380592, + "router_z_loss_mlp": 0.36767578, + "step": 2130, + "time_per_iteration": 2.7175474166870117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105432, + "balance_loss_mlp": 1.06854916, + "epoch": 0.40996537129665256, + "flos": 581463332352.0, + "grad_norm": 0.10187828374301312, + "language_loss": 0.83427989, + "learning_rate": 0.0006668397990094881, + "loss": 0.84533429, + "num_input_tokens_seen": 177454720, + "router_z_loss_mlp": 0.36889648, + "step": 2131, + "time_per_iteration": 2.718189239501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102513, + "balance_loss_mlp": 1.06438994, + "epoch": 0.41015775298191615, + "flos": 516296125440.0, + "grad_norm": 0.05088305967580112, + "language_loss": 0.84777439, + "learning_rate": 0.0006665460798976027, + "loss": 0.85879958, + "num_input_tokens_seen": 177528224, + "router_z_loss_mlp": 0.38134766, + "step": 2132, + "time_per_iteration": 2.754838228225708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103514, + "balance_loss_mlp": 1.06448531, + "epoch": 0.4103501346671797, + "flos": 510083315712.0, + "grad_norm": 0.04980971333778078, + "language_loss": 0.81075269, + "learning_rate": 0.0006662522961267947, + "loss": 0.82178783, + "num_input_tokens_seen": 177598176, + "router_z_loss_mlp": 0.38989258, + "step": 2133, + "time_per_iteration": 2.630645513534546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105465, + "balance_loss_mlp": 1.06514883, + "epoch": 0.41054251635244327, + "flos": 549459500544.0, + "grad_norm": 0.047627275091831754, + "language_loss": 0.87016159, + "learning_rate": 0.0006659584478111211, + "loss": 0.88121629, + "num_input_tokens_seen": 177675840, + "router_z_loss_mlp": 0.40307617, + "step": 2134, + "time_per_iteration": 2.7775702476501465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114637, + "balance_loss_mlp": 1.07408166, + "epoch": 0.4107348980377068, + "flos": 839549643264.0, + "grad_norm": 0.06581962625194586, + "language_loss": 0.82464856, + "learning_rate": 0.000665664535064664, + "loss": 0.83579493, + "num_input_tokens_seen": 177751376, + "router_z_loss_mlp": 0.40551758, + "step": 2135, + "time_per_iteration": 3.0234854221343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011149, + "balance_loss_mlp": 1.07501245, + "epoch": 0.4109272797229704, + "flos": 503445694464.0, + "grad_norm": 0.05498766410062668, + "language_loss": 0.82554698, + "learning_rate": 0.0006653705580015303, + "loss": 0.83669591, + "num_input_tokens_seen": 177825264, + "router_z_loss_mlp": 0.39892578, + "step": 2136, + "time_per_iteration": 2.740478992462158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110871, + "balance_loss_mlp": 1.06786942, + "epoch": 0.4111196614082339, + "flos": 610533877248.0, + "grad_norm": 0.1069583069182241, + "language_loss": 0.86098707, + "learning_rate": 0.0006650765167358523, + "loss": 0.87207425, + "num_input_tokens_seen": 177901680, + "router_z_loss_mlp": 0.40844727, + "step": 2137, + "time_per_iteration": 2.7766735553741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112768, + "balance_loss_mlp": 1.07264185, + "epoch": 0.4113120430934975, + "flos": 452931623424.0, + "grad_norm": 0.06240188984530218, + "language_loss": 0.8998509, + "learning_rate": 0.0006647824113817864, + "loss": 0.91097856, + "num_input_tokens_seen": 177965264, + "router_z_loss_mlp": 0.40112305, + "step": 2138, + "time_per_iteration": 2.558088779449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109501, + "balance_loss_mlp": 1.06992376, + "epoch": 0.41150442477876104, + "flos": 541324712448.0, + "grad_norm": 0.06351755199965968, + "language_loss": 0.81488299, + "learning_rate": 0.000664488242053515, + "loss": 0.82597804, + "num_input_tokens_seen": 178039712, + "router_z_loss_mlp": 0.39550781, + "step": 2139, + "time_per_iteration": 2.7064287662506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102585, + "balance_loss_mlp": 1.06405628, + "epoch": 0.4116968064640246, + "flos": 576017339904.0, + "grad_norm": 0.052717271070364294, + "language_loss": 0.8372525, + "learning_rate": 0.0006641940088652445, + "loss": 0.8482784, + "num_input_tokens_seen": 178114080, + "router_z_loss_mlp": 0.38500977, + "step": 2140, + "time_per_iteration": 2.8360941410064697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107136, + "balance_loss_mlp": 1.0685842, + "epoch": 0.4118891881492882, + "flos": 495857963520.0, + "grad_norm": 0.05632128251923113, + "language_loss": 0.82241237, + "learning_rate": 0.0006638997119312065, + "loss": 0.83348376, + "num_input_tokens_seen": 178188032, + "router_z_loss_mlp": 0.38500977, + "step": 2141, + "time_per_iteration": 2.695482015609741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01432807, + "balance_loss_mlp": 1.41773903, + "epoch": 0.41208156983455174, + "flos": 1537604923392.0, + "grad_norm": 0.12335560313674339, + "language_loss": 0.75063306, + "learning_rate": 0.0006636053513656568, + "loss": 0.76496112, + "num_input_tokens_seen": 178395328, + "router_z_loss_mlp": 0.15039062, + "step": 2142, + "time_per_iteration": 4.938086032867432 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096248, + "balance_loss_mlp": 1.05800605, + "epoch": 0.41227395151981533, + "flos": 584697775104.0, + "grad_norm": 0.06073263389064812, + "language_loss": 0.84852999, + "learning_rate": 0.000663310927282877, + "loss": 0.85949242, + "num_input_tokens_seen": 178471952, + "router_z_loss_mlp": 0.38208008, + "step": 2143, + "time_per_iteration": 2.776041269302368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098979, + "balance_loss_mlp": 1.06183362, + "epoch": 0.41246633320507886, + "flos": 442685620224.0, + "grad_norm": 0.05843533128868507, + "language_loss": 0.85999441, + "learning_rate": 0.000663016439797172, + "loss": 0.8709842, + "num_input_tokens_seen": 178542192, + "router_z_loss_mlp": 0.37109375, + "step": 2144, + "time_per_iteration": 2.6550843715667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099188, + "balance_loss_mlp": 1.06280541, + "epoch": 0.41265871489034245, + "flos": 579680976384.0, + "grad_norm": 0.05476235673703619, + "language_loss": 0.80718118, + "learning_rate": 0.0006627218890228724, + "loss": 0.81817305, + "num_input_tokens_seen": 178622736, + "router_z_loss_mlp": 0.36376953, + "step": 2145, + "time_per_iteration": 2.748966693878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098226, + "balance_loss_mlp": 1.06139088, + "epoch": 0.412851096575606, + "flos": 760906372608.0, + "grad_norm": 0.06511227414480983, + "language_loss": 0.83519912, + "learning_rate": 0.0006624272750743326, + "loss": 0.84618139, + "num_input_tokens_seen": 178705808, + "router_z_loss_mlp": 0.3684082, + "step": 2146, + "time_per_iteration": 2.987541913986206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098071, + "balance_loss_mlp": 1.05994785, + "epoch": 0.41304347826086957, + "flos": 555062644224.0, + "grad_norm": 0.04596756157996359, + "language_loss": 0.82878035, + "learning_rate": 0.0006621325980659322, + "loss": 0.83976108, + "num_input_tokens_seen": 178781200, + "router_z_loss_mlp": 0.38061523, + "step": 2147, + "time_per_iteration": 2.821556568145752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104625, + "balance_loss_mlp": 1.0655247, + "epoch": 0.41323585994613315, + "flos": 665418765312.0, + "grad_norm": 0.06740751064613239, + "language_loss": 0.8204211, + "learning_rate": 0.000661837858112075, + "loss": 0.83146733, + "num_input_tokens_seen": 178855072, + "router_z_loss_mlp": 0.390625, + "step": 2148, + "time_per_iteration": 2.7922754287719727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089584, + "balance_loss_mlp": 1.05136561, + "epoch": 0.4134282416313967, + "flos": 548429405184.0, + "grad_norm": 0.050771109286751076, + "language_loss": 0.88476944, + "learning_rate": 0.0006615430553271888, + "loss": 0.89566529, + "num_input_tokens_seen": 178927936, + "router_z_loss_mlp": 0.38208008, + "step": 2149, + "time_per_iteration": 2.7367136478424072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091169, + "balance_loss_mlp": 1.05326056, + "epoch": 0.4136206233166603, + "flos": 645951062016.0, + "grad_norm": 0.056682848656222896, + "language_loss": 0.85300201, + "learning_rate": 0.0006612481898257264, + "loss": 0.86391366, + "num_input_tokens_seen": 179007792, + "router_z_loss_mlp": 0.37866211, + "step": 2150, + "time_per_iteration": 2.862969160079956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082558, + "balance_loss_mlp": 1.04398179, + "epoch": 0.4138130050019238, + "flos": 517103640576.0, + "grad_norm": 0.07190872816549171, + "language_loss": 0.85216105, + "learning_rate": 0.000660953261722165, + "loss": 0.86298662, + "num_input_tokens_seen": 179075200, + "router_z_loss_mlp": 0.38549805, + "step": 2151, + "time_per_iteration": 2.608966588973999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072414, + "balance_loss_mlp": 1.03379023, + "epoch": 0.4140053866871874, + "flos": 608977073664.0, + "grad_norm": 0.05213877076699988, + "language_loss": 0.82764488, + "learning_rate": 0.0006606582711310055, + "loss": 0.83836901, + "num_input_tokens_seen": 179144448, + "router_z_loss_mlp": 0.38574219, + "step": 2152, + "time_per_iteration": 2.704941511154175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081545, + "balance_loss_mlp": 1.04287302, + "epoch": 0.4141977683724509, + "flos": 579493301760.0, + "grad_norm": 0.0573275470165796, + "language_loss": 0.83345616, + "learning_rate": 0.0006603632181667736, + "loss": 0.8442716, + "num_input_tokens_seen": 179215776, + "router_z_loss_mlp": 0.38671875, + "step": 2153, + "time_per_iteration": 2.670036792755127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157558, + "balance_loss_mlp": 1.14086878, + "epoch": 0.4143901500577145, + "flos": 1306598777856.0, + "grad_norm": 0.04466441147089705, + "language_loss": 0.78943324, + "learning_rate": 0.0006600681029440187, + "loss": 0.80100882, + "num_input_tokens_seen": 179436688, + "router_z_loss_mlp": 0.16699219, + "step": 2154, + "time_per_iteration": 4.936178684234619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088281, + "balance_loss_mlp": 1.04989576, + "epoch": 0.41458253174297804, + "flos": 459957740544.0, + "grad_norm": 0.05825483779723247, + "language_loss": 0.81504506, + "learning_rate": 0.0006597729255773153, + "loss": 0.82592785, + "num_input_tokens_seen": 179503264, + "router_z_loss_mlp": 0.38354492, + "step": 2155, + "time_per_iteration": 2.5436675548553467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095127, + "balance_loss_mlp": 1.056885, + "epoch": 0.41477491342824163, + "flos": 553096995840.0, + "grad_norm": 0.14369101348323118, + "language_loss": 0.82126498, + "learning_rate": 0.0006594776861812608, + "loss": 0.83221632, + "num_input_tokens_seen": 179574864, + "router_z_loss_mlp": 0.38183594, + "step": 2156, + "time_per_iteration": 2.6603870391845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102341, + "balance_loss_mlp": 1.06414664, + "epoch": 0.4149672951135052, + "flos": 697444356096.0, + "grad_norm": 0.09619651786969989, + "language_loss": 0.86957002, + "learning_rate": 0.0006591823848704776, + "loss": 0.88059342, + "num_input_tokens_seen": 179658208, + "router_z_loss_mlp": 0.38183594, + "step": 2157, + "time_per_iteration": 2.888523578643799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112154, + "balance_loss_mlp": 1.07362556, + "epoch": 0.41515967679876875, + "flos": 565480355328.0, + "grad_norm": 0.06180894820080996, + "language_loss": 0.81514823, + "learning_rate": 0.0006588870217596117, + "loss": 0.82626975, + "num_input_tokens_seen": 179732320, + "router_z_loss_mlp": 0.38500977, + "step": 2158, + "time_per_iteration": 2.7872376441955566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124674, + "balance_loss_mlp": 1.08497691, + "epoch": 0.41535205848403234, + "flos": 500938781184.0, + "grad_norm": 0.08519942481898463, + "language_loss": 0.85712391, + "learning_rate": 0.0006585915969633334, + "loss": 0.86837065, + "num_input_tokens_seen": 179801616, + "router_z_loss_mlp": 0.39672852, + "step": 2159, + "time_per_iteration": 2.5857338905334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135799, + "balance_loss_mlp": 1.09703159, + "epoch": 0.41554444016929587, + "flos": 607268911104.0, + "grad_norm": 0.06479316283343547, + "language_loss": 0.89294302, + "learning_rate": 0.0006582961105963366, + "loss": 0.90430105, + "num_input_tokens_seen": 179876112, + "router_z_loss_mlp": 0.38720703, + "step": 2160, + "time_per_iteration": 2.7831602096557617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153796, + "balance_loss_mlp": 1.11493373, + "epoch": 0.41573682185455946, + "flos": 528856985088.0, + "grad_norm": 0.06215124272048543, + "language_loss": 0.77626073, + "learning_rate": 0.0006580005627733395, + "loss": 0.7877987, + "num_input_tokens_seen": 179949936, + "router_z_loss_mlp": 0.38818359, + "step": 2161, + "time_per_iteration": 2.6620304584503174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152884, + "balance_loss_mlp": 1.11349678, + "epoch": 0.415929203539823, + "flos": 504686785536.0, + "grad_norm": 0.0577168801928891, + "language_loss": 0.81587994, + "learning_rate": 0.0006577049536090838, + "loss": 0.82740879, + "num_input_tokens_seen": 180023184, + "router_z_loss_mlp": 0.39355469, + "step": 2162, + "time_per_iteration": 2.6678874492645264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144655, + "balance_loss_mlp": 1.10693753, + "epoch": 0.4161215852250866, + "flos": 582467286528.0, + "grad_norm": 0.07160302952697103, + "language_loss": 0.85415941, + "learning_rate": 0.000657409283218335, + "loss": 0.86560595, + "num_input_tokens_seen": 180091728, + "router_z_loss_mlp": 0.37695312, + "step": 2163, + "time_per_iteration": 2.6405746936798096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134301, + "balance_loss_mlp": 1.09570062, + "epoch": 0.4163139669103501, + "flos": 490432320000.0, + "grad_norm": 0.051386242205519156, + "language_loss": 0.80774486, + "learning_rate": 0.0006571135517158829, + "loss": 0.81908786, + "num_input_tokens_seen": 180162096, + "router_z_loss_mlp": 0.38549805, + "step": 2164, + "time_per_iteration": 2.6496996879577637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218225, + "balance_loss_mlp": 1.20143986, + "epoch": 0.4165063485956137, + "flos": 1287445377024.0, + "grad_norm": 0.06520745435981959, + "language_loss": 0.76764059, + "learning_rate": 0.0006568177592165404, + "loss": 0.77982283, + "num_input_tokens_seen": 180380912, + "router_z_loss_mlp": 0.16796875, + "step": 2165, + "time_per_iteration": 4.76560640335083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127203, + "balance_loss_mlp": 1.09003401, + "epoch": 0.4166987302808773, + "flos": 495015542784.0, + "grad_norm": 0.07154886739030113, + "language_loss": 0.83213758, + "learning_rate": 0.0006565219058351444, + "loss": 0.8434096, + "num_input_tokens_seen": 180447424, + "router_z_loss_mlp": 0.37133789, + "step": 2166, + "time_per_iteration": 2.539856433868408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112149, + "balance_loss_mlp": 1.07397866, + "epoch": 0.4168911119661408, + "flos": 463823608320.0, + "grad_norm": 0.0764039854303378, + "language_loss": 0.83196324, + "learning_rate": 0.0006562259916865553, + "loss": 0.84308469, + "num_input_tokens_seen": 180516336, + "router_z_loss_mlp": 0.38110352, + "step": 2167, + "time_per_iteration": 2.5938220024108887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106062, + "balance_loss_mlp": 1.06939304, + "epoch": 0.4170834936514044, + "flos": 536499970560.0, + "grad_norm": 0.052882286550722295, + "language_loss": 0.7941224, + "learning_rate": 0.0006559300168856573, + "loss": 0.80518305, + "num_input_tokens_seen": 180589824, + "router_z_loss_mlp": 0.36694336, + "step": 2168, + "time_per_iteration": 2.7382309436798096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100629, + "balance_loss_mlp": 1.0633167, + "epoch": 0.41727587533666793, + "flos": 550418374656.0, + "grad_norm": 0.05257418188896324, + "language_loss": 0.85768378, + "learning_rate": 0.0006556339815473577, + "loss": 0.86869007, + "num_input_tokens_seen": 180661296, + "router_z_loss_mlp": 0.37280273, + "step": 2169, + "time_per_iteration": 2.6762564182281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110352, + "balance_loss_mlp": 1.06501567, + "epoch": 0.4174682570219315, + "flos": 630795949056.0, + "grad_norm": 0.0440641640787593, + "language_loss": 0.85913342, + "learning_rate": 0.000655337885786588, + "loss": 0.87016863, + "num_input_tokens_seen": 180744896, + "router_z_loss_mlp": 0.38452148, + "step": 2170, + "time_per_iteration": 2.8669848442077637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098716, + "balance_loss_mlp": 1.06068778, + "epoch": 0.41766063870719505, + "flos": 519501454848.0, + "grad_norm": 0.07103396575336611, + "language_loss": 0.84732234, + "learning_rate": 0.0006550417297183025, + "loss": 0.85830951, + "num_input_tokens_seen": 180813008, + "router_z_loss_mlp": 0.37988281, + "step": 2171, + "time_per_iteration": 2.6471290588378906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110284, + "balance_loss_mlp": 1.0640254, + "epoch": 0.41785302039245864, + "flos": 557656897536.0, + "grad_norm": 0.051327988161677204, + "language_loss": 0.8175863, + "learning_rate": 0.0006547455134574793, + "loss": 0.82861477, + "num_input_tokens_seen": 180886480, + "router_z_loss_mlp": 0.38793945, + "step": 2172, + "time_per_iteration": 2.71508526802063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100888, + "balance_loss_mlp": 1.06338453, + "epoch": 0.41804540207772223, + "flos": 788156683776.0, + "grad_norm": 0.052280747851499734, + "language_loss": 0.84377366, + "learning_rate": 0.0006544492371191198, + "loss": 0.85478258, + "num_input_tokens_seen": 180973776, + "router_z_loss_mlp": 0.37475586, + "step": 2173, + "time_per_iteration": 3.114607810974121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096475, + "balance_loss_mlp": 1.05775642, + "epoch": 0.41823778376298576, + "flos": 903944240640.0, + "grad_norm": 0.04972167781175626, + "language_loss": 0.83103442, + "learning_rate": 0.0006541529008182485, + "loss": 0.84199917, + "num_input_tokens_seen": 181062768, + "router_z_loss_mlp": 0.38696289, + "step": 2174, + "time_per_iteration": 3.165484666824341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094456, + "balance_loss_mlp": 1.0563333, + "epoch": 0.41843016544824935, + "flos": 511308440064.0, + "grad_norm": 0.05116159603840096, + "language_loss": 0.8702668, + "learning_rate": 0.0006538565046699136, + "loss": 0.88121128, + "num_input_tokens_seen": 181129872, + "router_z_loss_mlp": 0.38085938, + "step": 2175, + "time_per_iteration": 2.5701253414154053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101136, + "balance_loss_mlp": 1.06389487, + "epoch": 0.4186225471335129, + "flos": 652774947840.0, + "grad_norm": 0.05537675869017034, + "language_loss": 0.81610411, + "learning_rate": 0.0006535600487891862, + "loss": 0.82711548, + "num_input_tokens_seen": 181208112, + "router_z_loss_mlp": 0.37231445, + "step": 2176, + "time_per_iteration": 2.7980031967163086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096315, + "balance_loss_mlp": 1.05900216, + "epoch": 0.41881492881877647, + "flos": 568892298240.0, + "grad_norm": 0.05573219506936483, + "language_loss": 0.89184308, + "learning_rate": 0.0006532635332911603, + "loss": 0.90280616, + "num_input_tokens_seen": 181278736, + "router_z_loss_mlp": 0.37304688, + "step": 2177, + "time_per_iteration": 2.64104962348938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092004, + "balance_loss_mlp": 1.05495393, + "epoch": 0.41900731050404, + "flos": 911478127104.0, + "grad_norm": 0.05325324025552218, + "language_loss": 0.80538237, + "learning_rate": 0.0006529669582909541, + "loss": 0.81630242, + "num_input_tokens_seen": 181362512, + "router_z_loss_mlp": 0.37011719, + "step": 2178, + "time_per_iteration": 3.21323299407959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108647, + "balance_loss_mlp": 1.04896641, + "epoch": 0.4191996921893036, + "flos": 535498988544.0, + "grad_norm": 0.06510625194491998, + "language_loss": 0.85975909, + "learning_rate": 0.0006526703239037077, + "loss": 0.87062377, + "num_input_tokens_seen": 181432080, + "router_z_loss_mlp": 0.375, + "step": 2179, + "time_per_iteration": 2.630338430404663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086501, + "balance_loss_mlp": 1.0496887, + "epoch": 0.4193920738745671, + "flos": 582363979776.0, + "grad_norm": 0.04783092813648227, + "language_loss": 0.86411011, + "learning_rate": 0.0006523736302445851, + "loss": 0.8749752, + "num_input_tokens_seen": 181507296, + "router_z_loss_mlp": 0.36816406, + "step": 2180, + "time_per_iteration": 2.7710120677948 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083583, + "balance_loss_mlp": 1.04681921, + "epoch": 0.4195844555598307, + "flos": 1335279720960.0, + "grad_norm": 0.05415818779113344, + "language_loss": 0.77215266, + "learning_rate": 0.0006520768774287728, + "loss": 0.78298849, + "num_input_tokens_seen": 181599408, + "router_z_loss_mlp": 0.36743164, + "step": 2181, + "time_per_iteration": 3.738273859024048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082946, + "balance_loss_mlp": 1.04642057, + "epoch": 0.4197768372450943, + "flos": 598480786944.0, + "grad_norm": 0.04672312513315136, + "language_loss": 0.85467362, + "learning_rate": 0.0006517800655714806, + "loss": 0.86550307, + "num_input_tokens_seen": 181674944, + "router_z_loss_mlp": 0.36547852, + "step": 2182, + "time_per_iteration": 2.796132802963257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076263, + "balance_loss_mlp": 1.04016638, + "epoch": 0.4199692189303578, + "flos": 734929085952.0, + "grad_norm": 0.05966366646918548, + "language_loss": 0.84806752, + "learning_rate": 0.0006514831947879407, + "loss": 0.85883021, + "num_input_tokens_seen": 181756704, + "router_z_loss_mlp": 0.36132812, + "step": 2183, + "time_per_iteration": 2.9417624473571777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077956, + "balance_loss_mlp": 1.04243183, + "epoch": 0.4201616006156214, + "flos": 749854264320.0, + "grad_norm": 0.05811307518141115, + "language_loss": 0.78259802, + "learning_rate": 0.0006511862651934091, + "loss": 0.79337758, + "num_input_tokens_seen": 181837952, + "router_z_loss_mlp": 0.35522461, + "step": 2184, + "time_per_iteration": 3.0546512603759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082116, + "balance_loss_mlp": 1.04601932, + "epoch": 0.42035398230088494, + "flos": 546764912640.0, + "grad_norm": 0.041926600273946305, + "language_loss": 0.82459891, + "learning_rate": 0.0006508892769031638, + "loss": 0.83542007, + "num_input_tokens_seen": 181906896, + "router_z_loss_mlp": 0.36083984, + "step": 2185, + "time_per_iteration": 2.7021775245666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085419, + "balance_loss_mlp": 1.04972804, + "epoch": 0.42054636398614853, + "flos": 616628823552.0, + "grad_norm": 0.31605549573939495, + "language_loss": 0.86902821, + "learning_rate": 0.000650592230032506, + "loss": 0.87988245, + "num_input_tokens_seen": 181974976, + "router_z_loss_mlp": 0.35742188, + "step": 2186, + "time_per_iteration": 2.725625514984131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090024, + "balance_loss_mlp": 1.05175829, + "epoch": 0.42073874567141206, + "flos": 640077285888.0, + "grad_norm": 0.04878826269588872, + "language_loss": 0.84995645, + "learning_rate": 0.0006502951246967595, + "loss": 0.86085677, + "num_input_tokens_seen": 182054704, + "router_z_loss_mlp": 0.38256836, + "step": 2187, + "time_per_iteration": 2.8762335777282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092344, + "balance_loss_mlp": 1.05517459, + "epoch": 0.42093112735667565, + "flos": 493524168192.0, + "grad_norm": 0.05435264660880543, + "language_loss": 0.86905056, + "learning_rate": 0.0006499979610112706, + "loss": 0.87997395, + "num_input_tokens_seen": 182129696, + "router_z_loss_mlp": 0.37158203, + "step": 2188, + "time_per_iteration": 2.7210283279418945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105519, + "balance_loss_mlp": 1.06615603, + "epoch": 0.4211235090419392, + "flos": 542097321984.0, + "grad_norm": 0.05832158753777823, + "language_loss": 0.84076196, + "learning_rate": 0.000649700739091409, + "loss": 0.85181713, + "num_input_tokens_seen": 182203792, + "router_z_loss_mlp": 0.39331055, + "step": 2189, + "time_per_iteration": 2.70627498626709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109273, + "balance_loss_mlp": 1.09582591, + "epoch": 0.42131589072720277, + "flos": 1531342651392.0, + "grad_norm": 0.0317680876714807, + "language_loss": 0.73836273, + "learning_rate": 0.0006494034590525657, + "loss": 0.74945545, + "num_input_tokens_seen": 182432080, + "router_z_loss_mlp": 0.13476562, + "step": 2190, + "time_per_iteration": 4.8291919231414795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103729, + "balance_loss_mlp": 1.0656538, + "epoch": 0.42150827241246636, + "flos": 566583234048.0, + "grad_norm": 0.055290985630161965, + "language_loss": 0.85335857, + "learning_rate": 0.0006491061210101557, + "loss": 0.86439586, + "num_input_tokens_seen": 182500256, + "router_z_loss_mlp": 0.38037109, + "step": 2191, + "time_per_iteration": 2.669895887374878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096378, + "balance_loss_mlp": 1.05770612, + "epoch": 0.4217006540977299, + "flos": 707242226688.0, + "grad_norm": 0.050091435221191714, + "language_loss": 0.83998156, + "learning_rate": 0.0006488087250796157, + "loss": 0.85094529, + "num_input_tokens_seen": 182582912, + "router_z_loss_mlp": 0.38623047, + "step": 2192, + "time_per_iteration": 2.951594352722168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098297, + "balance_loss_mlp": 1.05864835, + "epoch": 0.4218930357829935, + "flos": 626975161344.0, + "grad_norm": 0.047618767001194696, + "language_loss": 0.81377089, + "learning_rate": 0.0006485112713764049, + "loss": 0.82475388, + "num_input_tokens_seen": 182670304, + "router_z_loss_mlp": 0.39624023, + "step": 2193, + "time_per_iteration": 2.943021535873413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095527, + "balance_loss_mlp": 1.05592585, + "epoch": 0.422085417468257, + "flos": 460110509568.0, + "grad_norm": 0.051159508672241207, + "language_loss": 0.83686495, + "learning_rate": 0.0006482137600160051, + "loss": 0.84782028, + "num_input_tokens_seen": 182735024, + "router_z_loss_mlp": 0.39575195, + "step": 2194, + "time_per_iteration": 2.5134236812591553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095144, + "balance_loss_mlp": 1.05590069, + "epoch": 0.4222777991535206, + "flos": 473788804608.0, + "grad_norm": 0.10490890222415104, + "language_loss": 0.84473735, + "learning_rate": 0.0006479161911139206, + "loss": 0.85568881, + "num_input_tokens_seen": 182805024, + "router_z_loss_mlp": 0.39208984, + "step": 2195, + "time_per_iteration": 2.577578544616699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096098, + "balance_loss_mlp": 1.05754566, + "epoch": 0.4224701808387841, + "flos": 470647494144.0, + "grad_norm": 0.0782943385788455, + "language_loss": 0.85684174, + "learning_rate": 0.0006476185647856778, + "loss": 0.86780274, + "num_input_tokens_seen": 182871360, + "router_z_loss_mlp": 0.38500977, + "step": 2196, + "time_per_iteration": 2.578495740890503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102208, + "balance_loss_mlp": 1.06286871, + "epoch": 0.4226625625240477, + "flos": 677202633216.0, + "grad_norm": 0.22187176821456261, + "language_loss": 0.81400013, + "learning_rate": 0.0006473208811468255, + "loss": 0.82502222, + "num_input_tokens_seen": 182952912, + "router_z_loss_mlp": 0.39306641, + "step": 2197, + "time_per_iteration": 2.870922088623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099017, + "balance_loss_mlp": 1.05984497, + "epoch": 0.4228549442093113, + "flos": 503268194304.0, + "grad_norm": 0.05214229642018916, + "language_loss": 0.8430717, + "learning_rate": 0.0006470231403129347, + "loss": 0.85406196, + "num_input_tokens_seen": 183022016, + "router_z_loss_mlp": 0.39135742, + "step": 2198, + "time_per_iteration": 2.5834295749664307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098125, + "balance_loss_mlp": 1.05959654, + "epoch": 0.42304732589457483, + "flos": 611543623680.0, + "grad_norm": 0.055955286861533095, + "language_loss": 0.81645906, + "learning_rate": 0.0006467253423995988, + "loss": 0.82744032, + "num_input_tokens_seen": 183101776, + "router_z_loss_mlp": 0.38500977, + "step": 2199, + "time_per_iteration": 2.8634603023529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097002, + "balance_loss_mlp": 1.05854511, + "epoch": 0.4232397075798384, + "flos": 515302345728.0, + "grad_norm": 0.05326479811347408, + "language_loss": 0.79026473, + "learning_rate": 0.000646427487522433, + "loss": 0.80123472, + "num_input_tokens_seen": 183171392, + "router_z_loss_mlp": 0.38452148, + "step": 2200, + "time_per_iteration": 2.649003744125366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102129, + "balance_loss_mlp": 1.063815, + "epoch": 0.42343208926510195, + "flos": 589513752576.0, + "grad_norm": 0.053706873495154336, + "language_loss": 0.83035368, + "learning_rate": 0.0006461295757970749, + "loss": 0.84137499, + "num_input_tokens_seen": 183253936, + "router_z_loss_mlp": 0.3828125, + "step": 2201, + "time_per_iteration": 2.8269903659820557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103583, + "balance_loss_mlp": 1.06379044, + "epoch": 0.42362447095036554, + "flos": 640342126080.0, + "grad_norm": 0.05615670023579285, + "language_loss": 0.8144629, + "learning_rate": 0.0006458316073391839, + "loss": 0.8254987, + "num_input_tokens_seen": 183333744, + "router_z_loss_mlp": 0.39770508, + "step": 2202, + "time_per_iteration": 2.9145257472991943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094508, + "balance_loss_mlp": 1.05595589, + "epoch": 0.42381685263562907, + "flos": 512421493248.0, + "grad_norm": 0.05176927409450969, + "language_loss": 0.87622833, + "learning_rate": 0.0006455335822644422, + "loss": 0.88717341, + "num_input_tokens_seen": 183401904, + "router_z_loss_mlp": 0.38525391, + "step": 2203, + "time_per_iteration": 2.596822500228882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099235, + "balance_loss_mlp": 1.06032515, + "epoch": 0.42400923432089266, + "flos": 546523393536.0, + "grad_norm": 0.08269999762480702, + "language_loss": 0.77441901, + "learning_rate": 0.0006452355006885527, + "loss": 0.78541136, + "num_input_tokens_seen": 183471312, + "router_z_loss_mlp": 0.38867188, + "step": 2204, + "time_per_iteration": 2.6238672733306885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105054, + "balance_loss_mlp": 1.06533396, + "epoch": 0.4242016160061562, + "flos": 621872584704.0, + "grad_norm": 0.06279334467905663, + "language_loss": 0.86963212, + "learning_rate": 0.0006449373627272412, + "loss": 0.88068271, + "num_input_tokens_seen": 183539184, + "router_z_loss_mlp": 0.39697266, + "step": 2205, + "time_per_iteration": 2.715792417526245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094037, + "balance_loss_mlp": 1.05515122, + "epoch": 0.4243939976914198, + "flos": 571649495040.0, + "grad_norm": 0.055815664393925046, + "language_loss": 0.82368463, + "learning_rate": 0.0006446391684962553, + "loss": 0.83462495, + "num_input_tokens_seen": 183607504, + "router_z_loss_mlp": 0.38867188, + "step": 2206, + "time_per_iteration": 2.642230987548828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096131, + "balance_loss_mlp": 1.05822253, + "epoch": 0.42458637937668336, + "flos": 448509934080.0, + "grad_norm": 0.05868479731789126, + "language_loss": 0.83175069, + "learning_rate": 0.000644340918111364, + "loss": 0.84271193, + "num_input_tokens_seen": 183674720, + "router_z_loss_mlp": 0.37841797, + "step": 2207, + "time_per_iteration": 2.5489144325256348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096536, + "balance_loss_mlp": 1.0566721, + "epoch": 0.4247787610619469, + "flos": 435176464896.0, + "grad_norm": 0.05469710752121124, + "language_loss": 0.84862429, + "learning_rate": 0.0006440426116883585, + "loss": 0.8595897, + "num_input_tokens_seen": 183740448, + "router_z_loss_mlp": 0.3984375, + "step": 2208, + "time_per_iteration": 2.5027823448181152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106001, + "balance_loss_mlp": 1.06563711, + "epoch": 0.4249711427472105, + "flos": 495818675712.0, + "grad_norm": 0.04694631121992161, + "language_loss": 0.86197406, + "learning_rate": 0.0006437442493430519, + "loss": 0.87303412, + "num_input_tokens_seen": 183812640, + "router_z_loss_mlp": 0.40356445, + "step": 2209, + "time_per_iteration": 2.624462127685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111613, + "balance_loss_mlp": 1.0711534, + "epoch": 0.425163524432474, + "flos": 655498649088.0, + "grad_norm": 0.06243114219893557, + "language_loss": 0.86437929, + "learning_rate": 0.000643445831191278, + "loss": 0.87549543, + "num_input_tokens_seen": 183895312, + "router_z_loss_mlp": 0.40454102, + "step": 2210, + "time_per_iteration": 2.883671760559082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110528, + "balance_loss_mlp": 1.06544065, + "epoch": 0.4253559061177376, + "flos": 650317496832.0, + "grad_norm": 0.059150918853506505, + "language_loss": 0.81800103, + "learning_rate": 0.0006431473573488937, + "loss": 0.82905388, + "num_input_tokens_seen": 183966384, + "router_z_loss_mlp": 0.39819336, + "step": 2211, + "time_per_iteration": 2.723308563232422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098753, + "balance_loss_mlp": 1.05807877, + "epoch": 0.42554828780300114, + "flos": 553894336512.0, + "grad_norm": 0.05841858860857517, + "language_loss": 0.84883767, + "learning_rate": 0.0006428488279317765, + "loss": 0.85982525, + "num_input_tokens_seen": 184031728, + "router_z_loss_mlp": 0.40673828, + "step": 2212, + "time_per_iteration": 2.628831148147583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098786, + "balance_loss_mlp": 1.05904126, + "epoch": 0.4257406694882647, + "flos": 514154386944.0, + "grad_norm": 0.056764121975701104, + "language_loss": 0.87647104, + "learning_rate": 0.0006425502430558259, + "loss": 0.88745892, + "num_input_tokens_seen": 184096160, + "router_z_loss_mlp": 0.39746094, + "step": 2213, + "time_per_iteration": 2.604146718978882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094618, + "balance_loss_mlp": 1.0550406, + "epoch": 0.42593305117352825, + "flos": 515380921344.0, + "grad_norm": 0.05046529876809897, + "language_loss": 0.84638417, + "learning_rate": 0.0006422516028369628, + "loss": 0.85733032, + "num_input_tokens_seen": 184169664, + "router_z_loss_mlp": 0.39550781, + "step": 2214, + "time_per_iteration": 2.6178741455078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088889, + "balance_loss_mlp": 1.04864407, + "epoch": 0.42612543285879184, + "flos": 587766302208.0, + "grad_norm": 0.04660283784017015, + "language_loss": 0.83496028, + "learning_rate": 0.0006419529073911296, + "loss": 0.84584916, + "num_input_tokens_seen": 184249152, + "router_z_loss_mlp": 0.40234375, + "step": 2215, + "time_per_iteration": 2.8105666637420654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085879, + "balance_loss_mlp": 1.04515672, + "epoch": 0.42631781454405543, + "flos": 635153619456.0, + "grad_norm": 0.05277435964401644, + "language_loss": 0.85660267, + "learning_rate": 0.0006416541568342901, + "loss": 0.86746144, + "num_input_tokens_seen": 184326816, + "router_z_loss_mlp": 0.40722656, + "step": 2216, + "time_per_iteration": 2.880662441253662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080832, + "balance_loss_mlp": 1.040277, + "epoch": 0.42651019622931896, + "flos": 540891136512.0, + "grad_norm": 0.04969535335028593, + "language_loss": 0.84409285, + "learning_rate": 0.0006413553512824297, + "loss": 0.85490113, + "num_input_tokens_seen": 184404336, + "router_z_loss_mlp": 0.40551758, + "step": 2217, + "time_per_iteration": 2.7169618606567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108871, + "balance_loss_mlp": 1.0485599, + "epoch": 0.42670257791458255, + "flos": 557892624384.0, + "grad_norm": 0.052410461022671016, + "language_loss": 0.84532559, + "learning_rate": 0.0006410564908515549, + "loss": 0.85621268, + "num_input_tokens_seen": 184472320, + "router_z_loss_mlp": 0.40136719, + "step": 2218, + "time_per_iteration": 2.657231092453003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077064, + "balance_loss_mlp": 1.03710461, + "epoch": 0.4268949595998461, + "flos": 621025781760.0, + "grad_norm": 0.054635208049088675, + "language_loss": 0.8539567, + "learning_rate": 0.0006407575756576935, + "loss": 0.86472738, + "num_input_tokens_seen": 184544704, + "router_z_loss_mlp": 0.39941406, + "step": 2219, + "time_per_iteration": 2.7336490154266357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089202, + "balance_loss_mlp": 1.04921913, + "epoch": 0.42708734128510967, + "flos": 537646519296.0, + "grad_norm": 0.04674173481591379, + "language_loss": 0.8770538, + "learning_rate": 0.0006404586058168951, + "loss": 0.88794577, + "num_input_tokens_seen": 184622544, + "router_z_loss_mlp": 0.3996582, + "step": 2220, + "time_per_iteration": 2.757380723953247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080306, + "balance_loss_mlp": 1.0401566, + "epoch": 0.4272797229703732, + "flos": 502617830400.0, + "grad_norm": 0.05080694298179496, + "language_loss": 0.86598134, + "learning_rate": 0.0006401595814452296, + "loss": 0.87678444, + "num_input_tokens_seen": 184692544, + "router_z_loss_mlp": 0.40136719, + "step": 2221, + "time_per_iteration": 2.583448886871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082627, + "balance_loss_mlp": 1.04252505, + "epoch": 0.4274721046556368, + "flos": 492208883712.0, + "grad_norm": 0.05244104927134987, + "language_loss": 0.80640519, + "learning_rate": 0.000639860502658789, + "loss": 0.81723142, + "num_input_tokens_seen": 184760480, + "router_z_loss_mlp": 0.40087891, + "step": 2222, + "time_per_iteration": 2.6454262733459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080551, + "balance_loss_mlp": 1.04149842, + "epoch": 0.4276644863409004, + "flos": 568094957568.0, + "grad_norm": 0.049852493850949496, + "language_loss": 0.84906983, + "learning_rate": 0.0006395613695736853, + "loss": 0.85987538, + "num_input_tokens_seen": 184834080, + "router_z_loss_mlp": 0.39038086, + "step": 2223, + "time_per_iteration": 2.6607768535614014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108883, + "balance_loss_mlp": 1.04841852, + "epoch": 0.4278568680261639, + "flos": 607155429888.0, + "grad_norm": 0.052366739862963044, + "language_loss": 0.8181783, + "learning_rate": 0.0006392621823060529, + "loss": 0.82906657, + "num_input_tokens_seen": 184905872, + "router_z_loss_mlp": 0.40405273, + "step": 2224, + "time_per_iteration": 2.7084245681762695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085727, + "balance_loss_mlp": 1.045434, + "epoch": 0.4280492497114275, + "flos": 560265707520.0, + "grad_norm": 0.062247479017330604, + "language_loss": 0.85044312, + "learning_rate": 0.0006389629409720465, + "loss": 0.86130041, + "num_input_tokens_seen": 184972320, + "router_z_loss_mlp": 0.40307617, + "step": 2225, + "time_per_iteration": 2.6494481563568115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083023, + "balance_loss_mlp": 1.04451835, + "epoch": 0.428241631396691, + "flos": 720334176768.0, + "grad_norm": 0.05784613309553924, + "language_loss": 0.88236213, + "learning_rate": 0.0006386636456878417, + "loss": 0.89319241, + "num_input_tokens_seen": 185051040, + "router_z_loss_mlp": 0.38452148, + "step": 2226, + "time_per_iteration": 2.8575398921966553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086633, + "balance_loss_mlp": 1.04643595, + "epoch": 0.4284340130819546, + "flos": 429243052032.0, + "grad_norm": 0.05660062263134159, + "language_loss": 0.9185167, + "learning_rate": 0.0006383642965696353, + "loss": 0.92938304, + "num_input_tokens_seen": 185113552, + "router_z_loss_mlp": 0.40185547, + "step": 2227, + "time_per_iteration": 2.436495065689087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093242, + "balance_loss_mlp": 1.05240059, + "epoch": 0.42862639476721814, + "flos": 524732069376.0, + "grad_norm": 0.06503204597883332, + "language_loss": 0.82736492, + "learning_rate": 0.000638064893733645, + "loss": 0.83829737, + "num_input_tokens_seen": 185185056, + "router_z_loss_mlp": 0.40844727, + "step": 2228, + "time_per_iteration": 2.737835645675659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097834, + "balance_loss_mlp": 1.05937719, + "epoch": 0.42881877645248173, + "flos": 465089430528.0, + "grad_norm": 0.05835798065495767, + "language_loss": 0.90023828, + "learning_rate": 0.000637765437296109, + "loss": 0.91121662, + "num_input_tokens_seen": 185257248, + "router_z_loss_mlp": 0.38427734, + "step": 2229, + "time_per_iteration": 2.6694185733795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102575, + "balance_loss_mlp": 1.06383204, + "epoch": 0.42901115813774526, + "flos": 560034362880.0, + "grad_norm": 0.048777417646368525, + "language_loss": 0.85443366, + "learning_rate": 0.000637465927373287, + "loss": 0.86545944, + "num_input_tokens_seen": 185324800, + "router_z_loss_mlp": 0.38720703, + "step": 2230, + "time_per_iteration": 2.608868360519409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097095, + "balance_loss_mlp": 1.05942452, + "epoch": 0.42920353982300885, + "flos": 561186703872.0, + "grad_norm": 0.058529600310023314, + "language_loss": 0.78994036, + "learning_rate": 0.000637166364081459, + "loss": 0.80091131, + "num_input_tokens_seen": 185393408, + "router_z_loss_mlp": 0.37670898, + "step": 2231, + "time_per_iteration": 2.6343741416931152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109752, + "balance_loss_mlp": 1.06089842, + "epoch": 0.42939592150827244, + "flos": 555982230528.0, + "grad_norm": 0.06635954042372831, + "language_loss": 0.84122705, + "learning_rate": 0.0006368667475369256, + "loss": 0.8522023, + "num_input_tokens_seen": 185467968, + "router_z_loss_mlp": 0.36621094, + "step": 2232, + "time_per_iteration": 2.719153881072998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01385097, + "balance_loss_mlp": 1.36373484, + "epoch": 0.42958830319353597, + "flos": 1520796902400.0, + "grad_norm": 0.10507214536659652, + "language_loss": 0.78527778, + "learning_rate": 0.0006365670778560084, + "loss": 0.79912877, + "num_input_tokens_seen": 185705232, + "router_z_loss_mlp": 0.21386719, + "step": 2233, + "time_per_iteration": 4.869459390640259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222541, + "balance_loss_mlp": 1.20547056, + "epoch": 0.42978068487879956, + "flos": 1495052522496.0, + "grad_norm": 0.06278147410173565, + "language_loss": 0.78895426, + "learning_rate": 0.0006362673551550494, + "loss": 0.80117965, + "num_input_tokens_seen": 185932672, + "router_z_loss_mlp": 0.17089844, + "step": 2234, + "time_per_iteration": 4.809493780136108 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101654, + "balance_loss_mlp": 1.06386471, + "epoch": 0.4299730665640631, + "flos": 546725624832.0, + "grad_norm": 0.047028007384334866, + "language_loss": 0.86220634, + "learning_rate": 0.0006359675795504112, + "loss": 0.87322283, + "num_input_tokens_seen": 186006288, + "router_z_loss_mlp": 0.37744141, + "step": 2235, + "time_per_iteration": 2.644548177719116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104888, + "balance_loss_mlp": 1.06671751, + "epoch": 0.4301654482493267, + "flos": 1128839473152.0, + "grad_norm": 0.053864842268977364, + "language_loss": 0.7475214, + "learning_rate": 0.0006356677511584775, + "loss": 0.75857025, + "num_input_tokens_seen": 186097168, + "router_z_loss_mlp": 0.38134766, + "step": 2236, + "time_per_iteration": 3.473637580871582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104941, + "balance_loss_mlp": 1.06784356, + "epoch": 0.4303578299345902, + "flos": 495502963200.0, + "grad_norm": 0.07035023985335077, + "language_loss": 0.8582648, + "learning_rate": 0.0006353678700956511, + "loss": 0.86931419, + "num_input_tokens_seen": 186163904, + "router_z_loss_mlp": 0.37084961, + "step": 2237, + "time_per_iteration": 2.5412683486938477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110161, + "balance_loss_mlp": 1.0728724, + "epoch": 0.4305502116198538, + "flos": 615472100352.0, + "grad_norm": 0.048926528615743585, + "language_loss": 0.83597398, + "learning_rate": 0.0006350679364783569, + "loss": 0.84707558, + "num_input_tokens_seen": 186233888, + "router_z_loss_mlp": 0.37255859, + "step": 2238, + "time_per_iteration": 2.7351441383361816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108038, + "balance_loss_mlp": 1.0704397, + "epoch": 0.4307425933051173, + "flos": 558995503104.0, + "grad_norm": 0.05635941331688695, + "language_loss": 0.85586011, + "learning_rate": 0.0006347679504230393, + "loss": 0.8669405, + "num_input_tokens_seen": 186301168, + "router_z_loss_mlp": 0.37573242, + "step": 2239, + "time_per_iteration": 2.628014326095581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108584, + "balance_loss_mlp": 1.06981754, + "epoch": 0.4309349749903809, + "flos": 971755163136.0, + "grad_norm": 0.06390031403556296, + "language_loss": 0.75844669, + "learning_rate": 0.0006344679120461632, + "loss": 0.76953256, + "num_input_tokens_seen": 186392096, + "router_z_loss_mlp": 0.38745117, + "step": 2240, + "time_per_iteration": 3.325970411300659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099219, + "balance_loss_mlp": 1.06123924, + "epoch": 0.4311273566756445, + "flos": 541663746048.0, + "grad_norm": 0.07957466882071795, + "language_loss": 0.79994094, + "learning_rate": 0.0006341678214642134, + "loss": 0.81093317, + "num_input_tokens_seen": 186458000, + "router_z_loss_mlp": 0.37963867, + "step": 2241, + "time_per_iteration": 2.598954916000366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098329, + "balance_loss_mlp": 1.06118321, + "epoch": 0.43131973836090803, + "flos": 761316627456.0, + "grad_norm": 0.06316124390987561, + "language_loss": 0.82909411, + "learning_rate": 0.0006338676787936963, + "loss": 0.8400774, + "num_input_tokens_seen": 186544992, + "router_z_loss_mlp": 0.37133789, + "step": 2242, + "time_per_iteration": 3.057990074157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092454, + "balance_loss_mlp": 1.0547359, + "epoch": 0.4315121200461716, + "flos": 554263893504.0, + "grad_norm": 0.058630582948494374, + "language_loss": 0.83799654, + "learning_rate": 0.0006335674841511367, + "loss": 0.84892106, + "num_input_tokens_seen": 186614960, + "router_z_loss_mlp": 0.37670898, + "step": 2243, + "time_per_iteration": 2.667917490005493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152602, + "balance_loss_mlp": 1.1380111, + "epoch": 0.43170450173143515, + "flos": 1484499419136.0, + "grad_norm": 0.03105866471095203, + "language_loss": 0.7918117, + "learning_rate": 0.000633267237653081, + "loss": 0.80333769, + "num_input_tokens_seen": 186854288, + "router_z_loss_mlp": 0.14550781, + "step": 2244, + "time_per_iteration": 4.996346473693848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147416, + "balance_loss_mlp": 1.13225269, + "epoch": 0.43189688341669874, + "flos": 1472897433600.0, + "grad_norm": 0.02634625536346193, + "language_loss": 0.77365553, + "learning_rate": 0.0006329669394160953, + "loss": 0.78512967, + "num_input_tokens_seen": 187090272, + "router_z_loss_mlp": 0.15136719, + "step": 2245, + "time_per_iteration": 4.925641775131226 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090243, + "balance_loss_mlp": 1.05293071, + "epoch": 0.43208926510196227, + "flos": 492677365248.0, + "grad_norm": 0.04832922480589342, + "language_loss": 0.82476389, + "learning_rate": 0.0006326665895567652, + "loss": 0.83566636, + "num_input_tokens_seen": 187157584, + "router_z_loss_mlp": 0.37304688, + "step": 2246, + "time_per_iteration": 2.6338651180267334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108663, + "balance_loss_mlp": 1.04876888, + "epoch": 0.43228164678722586, + "flos": 519969936384.0, + "grad_norm": 0.06353903654252775, + "language_loss": 0.86891162, + "learning_rate": 0.0006323661881916976, + "loss": 0.87977791, + "num_input_tokens_seen": 187229408, + "router_z_loss_mlp": 0.37841797, + "step": 2247, + "time_per_iteration": 2.7270143032073975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088946, + "balance_loss_mlp": 1.05082273, + "epoch": 0.4324740284724894, + "flos": 795722655744.0, + "grad_norm": 0.06655581665723238, + "language_loss": 0.81039822, + "learning_rate": 0.0006320657354375179, + "loss": 0.82128775, + "num_input_tokens_seen": 187304384, + "router_z_loss_mlp": 0.38134766, + "step": 2248, + "time_per_iteration": 2.9334113597869873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090387, + "balance_loss_mlp": 1.05183434, + "epoch": 0.432666410157753, + "flos": 481917800448.0, + "grad_norm": 0.05858711608638651, + "language_loss": 0.87308645, + "learning_rate": 0.0006317652314108726, + "loss": 0.88399029, + "num_input_tokens_seen": 187368064, + "router_z_loss_mlp": 0.38500977, + "step": 2249, + "time_per_iteration": 2.5155436992645264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083427, + "balance_loss_mlp": 1.04508948, + "epoch": 0.43285879184301657, + "flos": 499963940352.0, + "grad_norm": 0.06176153995331203, + "language_loss": 0.91197717, + "learning_rate": 0.0006314646762284277, + "loss": 0.92281145, + "num_input_tokens_seen": 187436320, + "router_z_loss_mlp": 0.38305664, + "step": 2250, + "time_per_iteration": 2.5938589572906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151081, + "balance_loss_mlp": 1.13324702, + "epoch": 0.4330511735282801, + "flos": 1509615346176.0, + "grad_norm": 0.03602865793169688, + "language_loss": 0.75425828, + "learning_rate": 0.0006311640700068691, + "loss": 0.76576912, + "num_input_tokens_seen": 187670912, + "router_z_loss_mlp": 0.17871094, + "step": 2251, + "time_per_iteration": 4.858763217926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082209, + "balance_loss_mlp": 1.04322791, + "epoch": 0.4332435552135437, + "flos": 699270382080.0, + "grad_norm": 0.07106828010915285, + "language_loss": 0.77364099, + "learning_rate": 0.0006308634128629022, + "loss": 0.78446311, + "num_input_tokens_seen": 187746432, + "router_z_loss_mlp": 0.3894043, + "step": 2252, + "time_per_iteration": 2.857311487197876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081116, + "balance_loss_mlp": 1.04163396, + "epoch": 0.4334359368988072, + "flos": 591995934720.0, + "grad_norm": 0.05494240381392999, + "language_loss": 0.87411273, + "learning_rate": 0.0006305627049132531, + "loss": 0.88492393, + "num_input_tokens_seen": 187820032, + "router_z_loss_mlp": 0.39453125, + "step": 2253, + "time_per_iteration": 2.7931392192840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074672, + "balance_loss_mlp": 1.03628647, + "epoch": 0.4336283185840708, + "flos": 842440670208.0, + "grad_norm": 0.045544810523015906, + "language_loss": 0.85602796, + "learning_rate": 0.0006302619462746662, + "loss": 0.86677468, + "num_input_tokens_seen": 187904400, + "router_z_loss_mlp": 0.38330078, + "step": 2254, + "time_per_iteration": 3.137031078338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072053, + "balance_loss_mlp": 1.03521752, + "epoch": 0.43382070026933434, + "flos": 625974179328.0, + "grad_norm": 0.05597321467051534, + "language_loss": 0.90273923, + "learning_rate": 0.0006299611370639069, + "loss": 0.91345972, + "num_input_tokens_seen": 187973264, + "router_z_loss_mlp": 0.36816406, + "step": 2255, + "time_per_iteration": 2.7370500564575195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078678, + "balance_loss_mlp": 1.04029226, + "epoch": 0.4340130819545979, + "flos": 590837801472.0, + "grad_norm": 0.05249156720482198, + "language_loss": 0.7960273, + "learning_rate": 0.0006296602773977593, + "loss": 0.80681407, + "num_input_tokens_seen": 188039984, + "router_z_loss_mlp": 0.38354492, + "step": 2256, + "time_per_iteration": 2.671543836593628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082288, + "balance_loss_mlp": 1.04387856, + "epoch": 0.4342054636398615, + "flos": 490624376832.0, + "grad_norm": 0.047941706130753194, + "language_loss": 0.87283635, + "learning_rate": 0.0006293593673930277, + "loss": 0.88365924, + "num_input_tokens_seen": 188113456, + "router_z_loss_mlp": 0.3840332, + "step": 2257, + "time_per_iteration": 2.622807741165161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084566, + "balance_loss_mlp": 1.04694366, + "epoch": 0.43439784532512504, + "flos": 698679654912.0, + "grad_norm": 0.05256563639723818, + "language_loss": 0.78625226, + "learning_rate": 0.0006290584071665358, + "loss": 0.79709792, + "num_input_tokens_seen": 188192480, + "router_z_loss_mlp": 0.3762207, + "step": 2258, + "time_per_iteration": 2.8814268112182617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084583, + "balance_loss_mlp": 1.0463171, + "epoch": 0.43459022701038863, + "flos": 485581436928.0, + "grad_norm": 0.05582719483060078, + "language_loss": 0.82315511, + "learning_rate": 0.0006287573968351266, + "loss": 0.83400095, + "num_input_tokens_seen": 188258784, + "router_z_loss_mlp": 0.38256836, + "step": 2259, + "time_per_iteration": 2.530107259750366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093075, + "balance_loss_mlp": 1.05585814, + "epoch": 0.43478260869565216, + "flos": 642818515968.0, + "grad_norm": 0.06362082652150813, + "language_loss": 0.82416236, + "learning_rate": 0.0006284563365156626, + "loss": 0.83509314, + "num_input_tokens_seen": 188331312, + "router_z_loss_mlp": 0.37182617, + "step": 2260, + "time_per_iteration": 2.798595905303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088803, + "balance_loss_mlp": 1.05103791, + "epoch": 0.43497499038091575, + "flos": 425870396928.0, + "grad_norm": 0.05655312611086985, + "language_loss": 0.87709838, + "learning_rate": 0.0006281552263250261, + "loss": 0.88798642, + "num_input_tokens_seen": 188393712, + "router_z_loss_mlp": 0.37719727, + "step": 2261, + "time_per_iteration": 2.452665090560913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160927, + "balance_loss_mlp": 1.14223516, + "epoch": 0.4351673720661793, + "flos": 1537594748928.0, + "grad_norm": 0.04176446008295971, + "language_loss": 0.80691534, + "learning_rate": 0.000627854066380118, + "loss": 0.8185246, + "num_input_tokens_seen": 188621152, + "router_z_loss_mlp": 0.18652344, + "step": 2262, + "time_per_iteration": 4.821255207061768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101716, + "balance_loss_mlp": 1.0650475, + "epoch": 0.43535975375144287, + "flos": 748828551168.0, + "grad_norm": 0.06957692587484587, + "language_loss": 0.81302369, + "learning_rate": 0.0006275528567978593, + "loss": 0.82404089, + "num_input_tokens_seen": 188697120, + "router_z_loss_mlp": 0.36669922, + "step": 2263, + "time_per_iteration": 2.9021594524383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105224, + "balance_loss_mlp": 1.06710052, + "epoch": 0.4355521354367064, + "flos": 860914593792.0, + "grad_norm": 0.05359116837259303, + "language_loss": 0.8251968, + "learning_rate": 0.0006272515976951898, + "loss": 0.83624899, + "num_input_tokens_seen": 188778480, + "router_z_loss_mlp": 0.38134766, + "step": 2264, + "time_per_iteration": 3.051140546798706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100567, + "balance_loss_mlp": 1.06160915, + "epoch": 0.43574451712197, + "flos": 734200146432.0, + "grad_norm": 0.04085362180640218, + "language_loss": 0.79003727, + "learning_rate": 0.0006269502891890687, + "loss": 0.80104291, + "num_input_tokens_seen": 188863616, + "router_z_loss_mlp": 0.38916016, + "step": 2265, + "time_per_iteration": 2.987435817718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109612, + "balance_loss_mlp": 1.05899858, + "epoch": 0.4359368988072336, + "flos": 570296332800.0, + "grad_norm": 0.04646658934269887, + "language_loss": 0.88059056, + "learning_rate": 0.0006266489313964743, + "loss": 0.89155173, + "num_input_tokens_seen": 188933984, + "router_z_loss_mlp": 0.37109375, + "step": 2266, + "time_per_iteration": 2.718259572982788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098621, + "balance_loss_mlp": 1.06040287, + "epoch": 0.4361292804924971, + "flos": 555244526592.0, + "grad_norm": 0.06168340797293566, + "language_loss": 0.85241735, + "learning_rate": 0.0006263475244344041, + "loss": 0.86340356, + "num_input_tokens_seen": 189012976, + "router_z_loss_mlp": 0.38183594, + "step": 2267, + "time_per_iteration": 2.822174072265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099998, + "balance_loss_mlp": 1.06232774, + "epoch": 0.4363216621777607, + "flos": 557021090304.0, + "grad_norm": 0.06545155195827496, + "language_loss": 0.84663981, + "learning_rate": 0.0006260460684198746, + "loss": 0.85763973, + "num_input_tokens_seen": 189079664, + "router_z_loss_mlp": 0.37646484, + "step": 2268, + "time_per_iteration": 2.652629852294922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092951, + "balance_loss_mlp": 1.05556679, + "epoch": 0.4365140438630242, + "flos": 477979149312.0, + "grad_norm": 0.06144025960698331, + "language_loss": 0.84485406, + "learning_rate": 0.0006257445634699213, + "loss": 0.85578358, + "num_input_tokens_seen": 189144688, + "router_z_loss_mlp": 0.3737793, + "step": 2269, + "time_per_iteration": 2.526547431945801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091306, + "balance_loss_mlp": 1.05506659, + "epoch": 0.4367064255482878, + "flos": 578646498816.0, + "grad_norm": 0.047950904811088546, + "language_loss": 0.82840669, + "learning_rate": 0.0006254430097015993, + "loss": 0.83931977, + "num_input_tokens_seen": 189213984, + "router_z_loss_mlp": 0.36279297, + "step": 2270, + "time_per_iteration": 2.6397740840911865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121077, + "balance_loss_mlp": 1.1094898, + "epoch": 0.43689880723355135, + "flos": 1458117669888.0, + "grad_norm": 0.029995875979849037, + "language_loss": 0.76479089, + "learning_rate": 0.0006251414072319815, + "loss": 0.77600169, + "num_input_tokens_seen": 189434416, + "router_z_loss_mlp": 0.11572266, + "step": 2271, + "time_per_iteration": 4.781012535095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093451, + "balance_loss_mlp": 1.0559721, + "epoch": 0.43709118891881493, + "flos": 667295663616.0, + "grad_norm": 0.05579821190743498, + "language_loss": 0.85169244, + "learning_rate": 0.0006248397561781609, + "loss": 0.86262697, + "num_input_tokens_seen": 189513248, + "router_z_loss_mlp": 0.37426758, + "step": 2272, + "time_per_iteration": 2.8750343322753906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109427, + "balance_loss_mlp": 1.05617118, + "epoch": 0.43728357060407846, + "flos": 544612999680.0, + "grad_norm": 0.06638881020832643, + "language_loss": 0.86299849, + "learning_rate": 0.0006245380566572482, + "loss": 0.87394118, + "num_input_tokens_seen": 189585392, + "router_z_loss_mlp": 0.38085938, + "step": 2273, + "time_per_iteration": 2.667287826538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095571, + "balance_loss_mlp": 1.05873561, + "epoch": 0.43747595228934205, + "flos": 746504930304.0, + "grad_norm": 0.06509502789500103, + "language_loss": 0.75652242, + "learning_rate": 0.0006242363087863744, + "loss": 0.76747811, + "num_input_tokens_seen": 189667552, + "router_z_loss_mlp": 0.36816406, + "step": 2274, + "time_per_iteration": 2.948168992996216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088988, + "balance_loss_mlp": 1.05060267, + "epoch": 0.43766833397460564, + "flos": 631060789248.0, + "grad_norm": 0.0773983629565932, + "language_loss": 0.85681164, + "learning_rate": 0.0006239345126826878, + "loss": 0.86770147, + "num_input_tokens_seen": 189742048, + "router_z_loss_mlp": 0.38354492, + "step": 2275, + "time_per_iteration": 2.7522637844085693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084857, + "balance_loss_mlp": 1.04682946, + "epoch": 0.43786071565986917, + "flos": 530709152256.0, + "grad_norm": 0.05397848209837344, + "language_loss": 0.84028137, + "learning_rate": 0.0006236326684633561, + "loss": 0.85112989, + "num_input_tokens_seen": 189817968, + "router_z_loss_mlp": 0.37988281, + "step": 2276, + "time_per_iteration": 2.8013172149658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083155, + "balance_loss_mlp": 1.04479384, + "epoch": 0.43805309734513276, + "flos": 538295473152.0, + "grad_norm": 0.057720697432170794, + "language_loss": 0.74613291, + "learning_rate": 0.0006233307762455658, + "loss": 0.75696445, + "num_input_tokens_seen": 189882608, + "router_z_loss_mlp": 0.38354492, + "step": 2277, + "time_per_iteration": 4.090092658996582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088607, + "balance_loss_mlp": 1.05057979, + "epoch": 0.4382454790303963, + "flos": 864188324352.0, + "grad_norm": 0.052083504639934525, + "language_loss": 0.83232701, + "learning_rate": 0.0006230288361465216, + "loss": 0.84321308, + "num_input_tokens_seen": 189960608, + "router_z_loss_mlp": 0.37988281, + "step": 2278, + "time_per_iteration": 3.0360679626464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092026, + "balance_loss_mlp": 1.05368817, + "epoch": 0.4384378607156599, + "flos": 765175292928.0, + "grad_norm": 0.0765632057362916, + "language_loss": 0.85051048, + "learning_rate": 0.0006227268482834473, + "loss": 0.86143076, + "num_input_tokens_seen": 190035472, + "router_z_loss_mlp": 0.38305664, + "step": 2279, + "time_per_iteration": 2.875603437423706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092125, + "balance_loss_mlp": 1.05369186, + "epoch": 0.4386302424009234, + "flos": 668260329984.0, + "grad_norm": 0.06746087226793605, + "language_loss": 0.87309432, + "learning_rate": 0.000622424812773585, + "loss": 0.88401562, + "num_input_tokens_seen": 190109312, + "router_z_loss_mlp": 0.3840332, + "step": 2280, + "time_per_iteration": 2.815737724304199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091003, + "balance_loss_mlp": 1.05335641, + "epoch": 0.438822624086187, + "flos": 484941247488.0, + "grad_norm": 0.06660247150401381, + "language_loss": 0.7952022, + "learning_rate": 0.000622122729734195, + "loss": 0.80611223, + "num_input_tokens_seen": 190174176, + "router_z_loss_mlp": 0.3762207, + "step": 2281, + "time_per_iteration": 2.528907060623169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010937, + "balance_loss_mlp": 1.05653024, + "epoch": 0.4390150057714506, + "flos": 498959986176.0, + "grad_norm": 0.07198447175498815, + "language_loss": 0.87400854, + "learning_rate": 0.0006218205992825566, + "loss": 0.88494551, + "num_input_tokens_seen": 190243888, + "router_z_loss_mlp": 0.37158203, + "step": 2282, + "time_per_iteration": 2.6437437534332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086909, + "balance_loss_mlp": 1.04895234, + "epoch": 0.4392073874567141, + "flos": 557937704448.0, + "grad_norm": 0.0537918663445124, + "language_loss": 0.81690598, + "learning_rate": 0.0006215184215359671, + "loss": 0.82777506, + "num_input_tokens_seen": 190317504, + "router_z_loss_mlp": 0.37939453, + "step": 2283, + "time_per_iteration": 2.7374680042266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082056, + "balance_loss_mlp": 1.04531598, + "epoch": 0.4393997691419777, + "flos": 605028248064.0, + "grad_norm": 0.053438963610997155, + "language_loss": 0.86718416, + "learning_rate": 0.0006212161966117425, + "loss": 0.87800473, + "num_input_tokens_seen": 190390160, + "router_z_loss_mlp": 0.36743164, + "step": 2284, + "time_per_iteration": 2.7031607627868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082719, + "balance_loss_mlp": 1.04476333, + "epoch": 0.43959215082724123, + "flos": 803812363776.0, + "grad_norm": 0.05414488390239245, + "language_loss": 0.81261152, + "learning_rate": 0.0006209139246272164, + "loss": 0.8234387, + "num_input_tokens_seen": 190467600, + "router_z_loss_mlp": 0.37915039, + "step": 2285, + "time_per_iteration": 2.942938804626465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081504, + "balance_loss_mlp": 1.04354775, + "epoch": 0.4397845325125048, + "flos": 487403080704.0, + "grad_norm": 0.06213580776851028, + "language_loss": 0.8193686, + "learning_rate": 0.0006206116056997421, + "loss": 0.83018363, + "num_input_tokens_seen": 190534192, + "router_z_loss_mlp": 0.37939453, + "step": 2286, + "time_per_iteration": 2.549246072769165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083431, + "balance_loss_mlp": 1.04671431, + "epoch": 0.43997691419776835, + "flos": 480569020416.0, + "grad_norm": 0.047189645190622125, + "language_loss": 0.82737786, + "learning_rate": 0.0006203092399466892, + "loss": 0.83821213, + "num_input_tokens_seen": 190601440, + "router_z_loss_mlp": 0.36694336, + "step": 2287, + "time_per_iteration": 2.533667802810669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079141, + "balance_loss_mlp": 1.04259157, + "epoch": 0.44016929588303194, + "flos": 482873702400.0, + "grad_norm": 0.04521232958061075, + "language_loss": 0.85280973, + "learning_rate": 0.0006200068274854473, + "loss": 0.86360115, + "num_input_tokens_seen": 190672528, + "router_z_loss_mlp": 0.36523438, + "step": 2288, + "time_per_iteration": 2.6336212158203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087714, + "balance_loss_mlp": 1.05013943, + "epoch": 0.4403616775682955, + "flos": 571562155008.0, + "grad_norm": 0.04238785738832165, + "language_loss": 0.85822582, + "learning_rate": 0.0006197043684334229, + "loss": 0.86910295, + "num_input_tokens_seen": 190750704, + "router_z_loss_mlp": 0.37548828, + "step": 2289, + "time_per_iteration": 2.7420616149902344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108734, + "balance_loss_mlp": 1.05028939, + "epoch": 0.44055405925355906, + "flos": 630563194368.0, + "grad_norm": 0.0573866619632787, + "language_loss": 0.79627317, + "learning_rate": 0.0006194018629080411, + "loss": 0.80714655, + "num_input_tokens_seen": 190821664, + "router_z_loss_mlp": 0.37036133, + "step": 2290, + "time_per_iteration": 2.7804791927337646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108993, + "balance_loss_mlp": 1.0514729, + "epoch": 0.44074644093882265, + "flos": 536523291648.0, + "grad_norm": 0.052070709818396434, + "language_loss": 0.81445479, + "learning_rate": 0.0006190993110267451, + "loss": 0.82535404, + "num_input_tokens_seen": 190893888, + "router_z_loss_mlp": 0.38427734, + "step": 2291, + "time_per_iteration": 2.6991255283355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079676, + "balance_loss_mlp": 1.04317451, + "epoch": 0.4409388226240862, + "flos": 462995744256.0, + "grad_norm": 0.05365602748785357, + "language_loss": 0.84155387, + "learning_rate": 0.0006187967129069958, + "loss": 0.85235059, + "num_input_tokens_seen": 190956800, + "router_z_loss_mlp": 0.36523438, + "step": 2292, + "time_per_iteration": 2.558609962463379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082437, + "balance_loss_mlp": 1.04569674, + "epoch": 0.44113120430934977, + "flos": 565717492224.0, + "grad_norm": 0.05065606510830679, + "language_loss": 0.87013716, + "learning_rate": 0.0006184940686662722, + "loss": 0.88096148, + "num_input_tokens_seen": 191032048, + "router_z_loss_mlp": 0.36743164, + "step": 2293, + "time_per_iteration": 2.753314733505249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078141, + "balance_loss_mlp": 1.04125786, + "epoch": 0.4413235859946133, + "flos": 543313681920.0, + "grad_norm": 0.05240936044313176, + "language_loss": 0.89929485, + "learning_rate": 0.0006181913784220714, + "loss": 0.91007626, + "num_input_tokens_seen": 191099952, + "router_z_loss_mlp": 0.36865234, + "step": 2294, + "time_per_iteration": 2.6420986652374268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111961, + "balance_loss_mlp": 1.09889555, + "epoch": 0.4415159676798769, + "flos": 1569016465920.0, + "grad_norm": 0.03544098021349555, + "language_loss": 0.80553782, + "learning_rate": 0.0006178886422919078, + "loss": 0.81665742, + "num_input_tokens_seen": 191335968, + "router_z_loss_mlp": 0.13085938, + "step": 2295, + "time_per_iteration": 4.864506483078003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085965, + "balance_loss_mlp": 1.04831886, + "epoch": 0.4417083493651404, + "flos": 658423171584.0, + "grad_norm": 0.06256258413724265, + "language_loss": 0.79847091, + "learning_rate": 0.0006175858603933146, + "loss": 0.80933058, + "num_input_tokens_seen": 191410112, + "router_z_loss_mlp": 0.3762207, + "step": 2296, + "time_per_iteration": 2.8739333152770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079624, + "balance_loss_mlp": 1.04328871, + "epoch": 0.441900731050404, + "flos": 740119002624.0, + "grad_norm": 0.05454759239937102, + "language_loss": 0.80644178, + "learning_rate": 0.0006172830328438416, + "loss": 0.81723803, + "num_input_tokens_seen": 191491552, + "router_z_loss_mlp": 0.36352539, + "step": 2297, + "time_per_iteration": 2.9661777019500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082208, + "balance_loss_mlp": 1.0437274, + "epoch": 0.44209311273566754, + "flos": 539153860608.0, + "grad_norm": 0.05386131456834753, + "language_loss": 0.87081188, + "learning_rate": 0.0006169801597610572, + "loss": 0.88163394, + "num_input_tokens_seen": 191567872, + "router_z_loss_mlp": 0.38452148, + "step": 2298, + "time_per_iteration": 2.732304573059082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107238, + "balance_loss_mlp": 1.03604531, + "epoch": 0.4422854944209311, + "flos": 621335702016.0, + "grad_norm": 0.07013675434202182, + "language_loss": 0.89663231, + "learning_rate": 0.0006166772412625469, + "loss": 0.90735614, + "num_input_tokens_seen": 191638032, + "router_z_loss_mlp": 0.36328125, + "step": 2299, + "time_per_iteration": 2.70890736579895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075195, + "balance_loss_mlp": 1.03793061, + "epoch": 0.4424778761061947, + "flos": 658516303872.0, + "grad_norm": 0.06419018913135732, + "language_loss": 0.81816357, + "learning_rate": 0.0006163742774659141, + "loss": 0.8289156, + "num_input_tokens_seen": 191709104, + "router_z_loss_mlp": 0.37255859, + "step": 2300, + "time_per_iteration": 2.830306053161621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081661, + "balance_loss_mlp": 1.0454216, + "epoch": 0.44267025779145824, + "flos": 568297188864.0, + "grad_norm": 0.05261241955347018, + "language_loss": 0.85695601, + "learning_rate": 0.0006160712684887801, + "loss": 0.86777264, + "num_input_tokens_seen": 191787072, + "router_z_loss_mlp": 0.36279297, + "step": 2301, + "time_per_iteration": 2.7931785583496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010826, + "balance_loss_mlp": 1.04600239, + "epoch": 0.44286263947672183, + "flos": 496469039616.0, + "grad_norm": 0.05340137710748247, + "language_loss": 0.81907189, + "learning_rate": 0.0006157682144487832, + "loss": 0.82989788, + "num_input_tokens_seen": 191863040, + "router_z_loss_mlp": 0.36572266, + "step": 2302, + "time_per_iteration": 2.7355551719665527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108501, + "balance_loss_mlp": 1.04793596, + "epoch": 0.44305502116198536, + "flos": 609096347136.0, + "grad_norm": 0.060309070663334345, + "language_loss": 0.82788789, + "learning_rate": 0.0006154651154635793, + "loss": 0.83873796, + "num_input_tokens_seen": 191940352, + "router_z_loss_mlp": 0.37084961, + "step": 2303, + "time_per_iteration": 2.8048007488250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088624, + "balance_loss_mlp": 1.05150199, + "epoch": 0.44324740284724895, + "flos": 470558744064.0, + "grad_norm": 0.05169590776144269, + "language_loss": 0.84867418, + "learning_rate": 0.0006151619716508421, + "loss": 0.85956049, + "num_input_tokens_seen": 192006896, + "router_z_loss_mlp": 0.37084961, + "step": 2304, + "time_per_iteration": 2.5419833660125732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087828, + "balance_loss_mlp": 1.05046785, + "epoch": 0.4434397845325125, + "flos": 578454441984.0, + "grad_norm": 0.05720417651641939, + "language_loss": 0.86974978, + "learning_rate": 0.0006148587831282625, + "loss": 0.88062799, + "num_input_tokens_seen": 192075312, + "router_z_loss_mlp": 0.37353516, + "step": 2305, + "time_per_iteration": 2.689751386642456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055473, + "balance_loss_mlp": 1.04326594, + "epoch": 0.44363216621777607, + "flos": 1495765343232.0, + "grad_norm": 0.012762307031937271, + "language_loss": 0.79176068, + "learning_rate": 0.0006145555500135483, + "loss": 0.80231541, + "num_input_tokens_seen": 192304816, + "router_z_loss_mlp": 0.12207031, + "step": 2306, + "time_per_iteration": 4.886535406112671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092659, + "balance_loss_mlp": 1.05699158, + "epoch": 0.44382454790303966, + "flos": 477082884096.0, + "grad_norm": 0.06286570611305137, + "language_loss": 0.86913157, + "learning_rate": 0.0006142522724244255, + "loss": 0.88005817, + "num_input_tokens_seen": 192369232, + "router_z_loss_mlp": 0.35693359, + "step": 2307, + "time_per_iteration": 2.499870777130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054077, + "balance_loss_mlp": 1.04177487, + "epoch": 0.4440169295883032, + "flos": 1543321548288.0, + "grad_norm": 0.013017387525484581, + "language_loss": 0.76484716, + "learning_rate": 0.0006139489504786368, + "loss": 0.775388, + "num_input_tokens_seen": 192600176, + "router_z_loss_mlp": 0.12255859, + "step": 2308, + "time_per_iteration": 4.8646886348724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087414, + "balance_loss_mlp": 1.05115092, + "epoch": 0.4442093112735668, + "flos": 590789749248.0, + "grad_norm": 0.050195382328210664, + "language_loss": 0.77274799, + "learning_rate": 0.000613645584293942, + "loss": 0.78362215, + "num_input_tokens_seen": 192675424, + "router_z_loss_mlp": 0.36279297, + "step": 2309, + "time_per_iteration": 2.877244472503662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087256, + "balance_loss_mlp": 1.05056334, + "epoch": 0.4444016929588303, + "flos": 530009326080.0, + "grad_norm": 0.047114011401622066, + "language_loss": 0.83068305, + "learning_rate": 0.0006133421739881185, + "loss": 0.8415556, + "num_input_tokens_seen": 192747552, + "router_z_loss_mlp": 0.36694336, + "step": 2310, + "time_per_iteration": 2.667240858078003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081979, + "balance_loss_mlp": 1.04557252, + "epoch": 0.4445940746440939, + "flos": 619947634176.0, + "grad_norm": 0.055208144480819774, + "language_loss": 0.82587862, + "learning_rate": 0.0006130387196789605, + "loss": 0.83669835, + "num_input_tokens_seen": 192819984, + "router_z_loss_mlp": 0.36425781, + "step": 2311, + "time_per_iteration": 2.7925667762756348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082597, + "balance_loss_mlp": 1.04704881, + "epoch": 0.4447864563293574, + "flos": 628782248448.0, + "grad_norm": 0.049856185775691036, + "language_loss": 0.83914995, + "learning_rate": 0.0006127352214842795, + "loss": 0.84997582, + "num_input_tokens_seen": 192906080, + "router_z_loss_mlp": 0.35571289, + "step": 2312, + "time_per_iteration": 2.9495813846588135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079657, + "balance_loss_mlp": 1.04236865, + "epoch": 0.444978838014621, + "flos": 650548841472.0, + "grad_norm": 0.0527905378587152, + "language_loss": 0.85049295, + "learning_rate": 0.0006124316795219041, + "loss": 0.8612895, + "num_input_tokens_seen": 192972336, + "router_z_loss_mlp": 0.37255859, + "step": 2313, + "time_per_iteration": 2.760117769241333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077597, + "balance_loss_mlp": 1.04119062, + "epoch": 0.44517121969988455, + "flos": 612153289728.0, + "grad_norm": 0.047764928605774304, + "language_loss": 0.82297838, + "learning_rate": 0.0006121280939096794, + "loss": 0.8337543, + "num_input_tokens_seen": 193045744, + "router_z_loss_mlp": 0.36401367, + "step": 2314, + "time_per_iteration": 2.737471580505371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075674, + "balance_loss_mlp": 1.0385046, + "epoch": 0.44536360138514813, + "flos": 488491402752.0, + "grad_norm": 0.07620217918322614, + "language_loss": 0.87685931, + "learning_rate": 0.000611824464765468, + "loss": 0.88761604, + "num_input_tokens_seen": 193115248, + "router_z_loss_mlp": 0.37133789, + "step": 2315, + "time_per_iteration": 2.5991926193237305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0103119, + "balance_loss_mlp": 1.01922143, + "epoch": 0.4455559830704117, + "flos": 1515425255424.0, + "grad_norm": 0.013293348061684912, + "language_loss": 0.78594941, + "learning_rate": 0.0006115207922071492, + "loss": 0.79626131, + "num_input_tokens_seen": 193330816, + "router_z_loss_mlp": 0.11962891, + "step": 2316, + "time_per_iteration": 4.652711391448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079239, + "balance_loss_mlp": 1.04335713, + "epoch": 0.44574836475567525, + "flos": 615314949120.0, + "grad_norm": 0.04747333782009751, + "language_loss": 0.85680878, + "learning_rate": 0.000611217076352619, + "loss": 0.86760116, + "num_input_tokens_seen": 193407616, + "router_z_loss_mlp": 0.35913086, + "step": 2317, + "time_per_iteration": 2.7729227542877197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077005, + "balance_loss_mlp": 1.04140949, + "epoch": 0.44594074644093884, + "flos": 506070471168.0, + "grad_norm": 0.2761075259266177, + "language_loss": 0.82980591, + "learning_rate": 0.0006109133173197905, + "loss": 0.84057599, + "num_input_tokens_seen": 193482624, + "router_z_loss_mlp": 0.35620117, + "step": 2318, + "time_per_iteration": 2.6684277057647705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087623, + "balance_loss_mlp": 1.05243218, + "epoch": 0.44613312812620237, + "flos": 726647321088.0, + "grad_norm": 0.057083346058123784, + "language_loss": 0.85251284, + "learning_rate": 0.0006106095152265935, + "loss": 0.86338907, + "num_input_tokens_seen": 193555952, + "router_z_loss_mlp": 0.35229492, + "step": 2319, + "time_per_iteration": 2.9197404384613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092234, + "balance_loss_mlp": 1.05604196, + "epoch": 0.44632550981146596, + "flos": 635419869696.0, + "grad_norm": 0.048967973341694476, + "language_loss": 0.8448627, + "learning_rate": 0.0006103056701909739, + "loss": 0.85578501, + "num_input_tokens_seen": 193636672, + "router_z_loss_mlp": 0.36230469, + "step": 2320, + "time_per_iteration": 2.885965347290039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101974, + "balance_loss_mlp": 1.06604421, + "epoch": 0.4465178914967295, + "flos": 826690447872.0, + "grad_norm": 0.04429440839494469, + "language_loss": 0.82779431, + "learning_rate": 0.0006100017823308956, + "loss": 0.83881408, + "num_input_tokens_seen": 193721728, + "router_z_loss_mlp": 0.35961914, + "step": 2321, + "time_per_iteration": 3.1523914337158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110877, + "balance_loss_mlp": 1.0737319, + "epoch": 0.4467102731819931, + "flos": 665532246528.0, + "grad_norm": 0.05773147459468349, + "language_loss": 0.79802787, + "learning_rate": 0.0006096978517643377, + "loss": 0.80913663, + "num_input_tokens_seen": 193795456, + "router_z_loss_mlp": 0.37158203, + "step": 2322, + "time_per_iteration": 2.8030614852905273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123327, + "balance_loss_mlp": 1.08668184, + "epoch": 0.4469026548672566, + "flos": 512692125696.0, + "grad_norm": 0.052696901781691036, + "language_loss": 0.83731532, + "learning_rate": 0.0006093938786092968, + "loss": 0.84854853, + "num_input_tokens_seen": 193865520, + "router_z_loss_mlp": 0.3659668, + "step": 2323, + "time_per_iteration": 2.6108593940734863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112741, + "balance_loss_mlp": 1.0761435, + "epoch": 0.4470950365525202, + "flos": 683774825472.0, + "grad_norm": 0.0683875942547517, + "language_loss": 0.89724207, + "learning_rate": 0.0006090898629837857, + "loss": 0.90836942, + "num_input_tokens_seen": 193935040, + "router_z_loss_mlp": 0.3659668, + "step": 2324, + "time_per_iteration": 2.8141510486602783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121866, + "balance_loss_mlp": 1.08515, + "epoch": 0.4472874182377838, + "flos": 627018831360.0, + "grad_norm": 0.05799026068482576, + "language_loss": 0.87375617, + "learning_rate": 0.0006087858050058337, + "loss": 0.88497484, + "num_input_tokens_seen": 194009120, + "router_z_loss_mlp": 0.3671875, + "step": 2325, + "time_per_iteration": 2.8174242973327637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106391, + "balance_loss_mlp": 1.07053268, + "epoch": 0.4474797999230473, + "flos": 546946795008.0, + "grad_norm": 0.06107345330372946, + "language_loss": 0.81985253, + "learning_rate": 0.0006084817047934866, + "loss": 0.8309164, + "num_input_tokens_seen": 194076672, + "router_z_loss_mlp": 0.35888672, + "step": 2326, + "time_per_iteration": 2.627870798110962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111871, + "balance_loss_mlp": 1.08211279, + "epoch": 0.4476721816083109, + "flos": 455585513472.0, + "grad_norm": 0.09021260210248909, + "language_loss": 0.89277744, + "learning_rate": 0.0006081775624648066, + "loss": 0.90396452, + "num_input_tokens_seen": 194142320, + "router_z_loss_mlp": 0.3659668, + "step": 2327, + "time_per_iteration": 2.517587900161743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107902, + "balance_loss_mlp": 1.07154357, + "epoch": 0.44786456329357444, + "flos": 481273228800.0, + "grad_norm": 0.05788938613905733, + "language_loss": 0.8277235, + "learning_rate": 0.0006078733781378721, + "loss": 0.83880252, + "num_input_tokens_seen": 194208560, + "router_z_loss_mlp": 0.36401367, + "step": 2328, + "time_per_iteration": 2.5216193199157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102085, + "balance_loss_mlp": 1.06579816, + "epoch": 0.448056944978838, + "flos": 551822409216.0, + "grad_norm": 0.05774471450654044, + "language_loss": 0.82095438, + "learning_rate": 0.0006075691519307781, + "loss": 0.83197522, + "num_input_tokens_seen": 194288080, + "router_z_loss_mlp": 0.36303711, + "step": 2329, + "time_per_iteration": 2.8394477367401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093614, + "balance_loss_mlp": 1.05551517, + "epoch": 0.44824932666410156, + "flos": 550571143680.0, + "grad_norm": 0.05485541452922095, + "language_loss": 0.82042563, + "learning_rate": 0.0006072648839616356, + "loss": 0.83136177, + "num_input_tokens_seen": 194358464, + "router_z_loss_mlp": 0.38061523, + "step": 2330, + "time_per_iteration": 2.650087594985962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089159, + "balance_loss_mlp": 1.05229926, + "epoch": 0.44844170834936514, + "flos": 988161541632.0, + "grad_norm": 0.0454185508799419, + "language_loss": 0.82814097, + "learning_rate": 0.0006069605743485718, + "loss": 0.83903253, + "num_input_tokens_seen": 194456112, + "router_z_loss_mlp": 0.3684082, + "step": 2331, + "time_per_iteration": 3.345179319381714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085878, + "balance_loss_mlp": 1.0494473, + "epoch": 0.44863409003462873, + "flos": 591040032768.0, + "grad_norm": 0.057018102026312835, + "language_loss": 0.83470714, + "learning_rate": 0.0006066562232097303, + "loss": 0.84556592, + "num_input_tokens_seen": 194526880, + "router_z_loss_mlp": 0.36425781, + "step": 2332, + "time_per_iteration": 2.7025153636932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089148, + "balance_loss_mlp": 1.0525744, + "epoch": 0.44882647171989226, + "flos": 724313525760.0, + "grad_norm": 0.055435808375502424, + "language_loss": 0.86104345, + "learning_rate": 0.0006063518306632708, + "loss": 0.87193495, + "num_input_tokens_seen": 194606800, + "router_z_loss_mlp": 0.36572266, + "step": 2333, + "time_per_iteration": 2.934469699859619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082178, + "balance_loss_mlp": 1.04465127, + "epoch": 0.44901885340515585, + "flos": 534662360064.0, + "grad_norm": 0.061394686563490536, + "language_loss": 0.82313985, + "learning_rate": 0.0006060473968273688, + "loss": 0.83396161, + "num_input_tokens_seen": 194679856, + "router_z_loss_mlp": 0.375, + "step": 2334, + "time_per_iteration": 2.6561286449432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139417, + "balance_loss_mlp": 1.12782979, + "epoch": 0.4492112350904194, + "flos": 1554456462336.0, + "grad_norm": 0.048192148717983975, + "language_loss": 0.77879542, + "learning_rate": 0.000605742921820216, + "loss": 0.79018956, + "num_input_tokens_seen": 194906320, + "router_z_loss_mlp": 0.11572266, + "step": 2335, + "time_per_iteration": 4.895314693450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092745, + "balance_loss_mlp": 1.08144426, + "epoch": 0.44940361677568297, + "flos": 1522525413888.0, + "grad_norm": 0.0355581806637232, + "language_loss": 0.81005216, + "learning_rate": 0.0006054384057600202, + "loss": 0.8209796, + "num_input_tokens_seen": 195129152, + "router_z_loss_mlp": 0.11279297, + "step": 2336, + "time_per_iteration": 4.86665940284729 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088054, + "balance_loss_mlp": 1.05064595, + "epoch": 0.4495959984609465, + "flos": 382289310720.0, + "grad_norm": 0.06064477802371089, + "language_loss": 0.88117951, + "learning_rate": 0.0006051338487650047, + "loss": 0.89206004, + "num_input_tokens_seen": 195189792, + "router_z_loss_mlp": 0.3737793, + "step": 2337, + "time_per_iteration": 2.4159162044525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086135, + "balance_loss_mlp": 1.04777336, + "epoch": 0.4497883801462101, + "flos": 497630145024.0, + "grad_norm": 0.058257925131248826, + "language_loss": 0.82456082, + "learning_rate": 0.0006048292509534095, + "loss": 0.83542222, + "num_input_tokens_seen": 195258640, + "router_z_loss_mlp": 0.38354492, + "step": 2338, + "time_per_iteration": 2.5835769176483154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081051, + "balance_loss_mlp": 1.04392958, + "epoch": 0.4499807618314736, + "flos": 614166990336.0, + "grad_norm": 0.053787147945734054, + "language_loss": 0.77580249, + "learning_rate": 0.0006045246124434895, + "loss": 0.78661299, + "num_input_tokens_seen": 195327984, + "router_z_loss_mlp": 0.37109375, + "step": 2339, + "time_per_iteration": 2.7258870601654053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080837, + "balance_loss_mlp": 1.04311895, + "epoch": 0.4501731435167372, + "flos": 1005122331648.0, + "grad_norm": 0.06446556175990359, + "language_loss": 0.86143219, + "learning_rate": 0.0006042199333535162, + "loss": 0.87224054, + "num_input_tokens_seen": 195409504, + "router_z_loss_mlp": 0.37695312, + "step": 2340, + "time_per_iteration": 3.2644054889678955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089021, + "balance_loss_mlp": 1.05132723, + "epoch": 0.4503655252020008, + "flos": 820519898112.0, + "grad_norm": 0.05440597484835576, + "language_loss": 0.8378191, + "learning_rate": 0.0006039152138017763, + "loss": 0.84870934, + "num_input_tokens_seen": 195489424, + "router_z_loss_mlp": 0.37695312, + "step": 2341, + "time_per_iteration": 3.0747756958007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082643, + "balance_loss_mlp": 1.04566467, + "epoch": 0.4505579068872643, + "flos": 486113937408.0, + "grad_norm": 0.06051531382505287, + "language_loss": 0.83470345, + "learning_rate": 0.0006036104539065726, + "loss": 0.84552985, + "num_input_tokens_seen": 195562128, + "router_z_loss_mlp": 0.36962891, + "step": 2342, + "time_per_iteration": 2.6581151485443115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076125, + "balance_loss_mlp": 1.03812099, + "epoch": 0.4507502885725279, + "flos": 884421282816.0, + "grad_norm": 0.05288539322407846, + "language_loss": 0.845487, + "learning_rate": 0.000603305653786223, + "loss": 0.85624826, + "num_input_tokens_seen": 195646800, + "router_z_loss_mlp": 0.37963867, + "step": 2343, + "time_per_iteration": 3.1298844814300537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079349, + "balance_loss_mlp": 1.04208446, + "epoch": 0.45094267025779144, + "flos": 578070328320.0, + "grad_norm": 0.04730162576611683, + "language_loss": 0.83859873, + "learning_rate": 0.0006030008135590622, + "loss": 0.84939224, + "num_input_tokens_seen": 195719648, + "router_z_loss_mlp": 0.37255859, + "step": 2344, + "time_per_iteration": 2.685067892074585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107533, + "balance_loss_mlp": 1.03799331, + "epoch": 0.45113505194305503, + "flos": 525124947456.0, + "grad_norm": 0.051192045733620226, + "language_loss": 0.80228901, + "learning_rate": 0.0006026959333434387, + "loss": 0.81304228, + "num_input_tokens_seen": 195794800, + "router_z_loss_mlp": 0.37353516, + "step": 2345, + "time_per_iteration": 2.783407688140869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107739, + "balance_loss_mlp": 1.04014897, + "epoch": 0.45132743362831856, + "flos": 501791376384.0, + "grad_norm": 0.05199160611628431, + "language_loss": 0.77699506, + "learning_rate": 0.0006023910132577181, + "loss": 0.78776896, + "num_input_tokens_seen": 195866848, + "router_z_loss_mlp": 0.37207031, + "step": 2346, + "time_per_iteration": 2.646801233291626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077398, + "balance_loss_mlp": 1.03968024, + "epoch": 0.45151981531358215, + "flos": 431690328576.0, + "grad_norm": 0.04922592508563583, + "language_loss": 0.84707314, + "learning_rate": 0.0006020860534202806, + "loss": 0.85784709, + "num_input_tokens_seen": 195930640, + "router_z_loss_mlp": 0.37670898, + "step": 2347, + "time_per_iteration": 2.4788920879364014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078489, + "balance_loss_mlp": 1.04036641, + "epoch": 0.4517121969988457, + "flos": 711826859520.0, + "grad_norm": 0.07725824631471088, + "language_loss": 0.80951411, + "learning_rate": 0.0006017810539495224, + "loss": 0.82029903, + "num_input_tokens_seen": 196014240, + "router_z_loss_mlp": 0.38110352, + "step": 2348, + "time_per_iteration": 3.013258934020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071848, + "balance_loss_mlp": 1.03587079, + "epoch": 0.45190457868410927, + "flos": 579197938176.0, + "grad_norm": 0.052394100693581906, + "language_loss": 0.82200068, + "learning_rate": 0.0006014760149638547, + "loss": 0.83271921, + "num_input_tokens_seen": 196083296, + "router_z_loss_mlp": 0.35986328, + "step": 2349, + "time_per_iteration": 2.6988728046417236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073469, + "balance_loss_mlp": 1.03823042, + "epoch": 0.45209696036937286, + "flos": 482415395328.0, + "grad_norm": 0.04812495303687425, + "language_loss": 0.88394493, + "learning_rate": 0.000601170936581704, + "loss": 0.89467961, + "num_input_tokens_seen": 196147840, + "router_z_loss_mlp": 0.35253906, + "step": 2350, + "time_per_iteration": 2.5537099838256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108294, + "balance_loss_mlp": 1.04617548, + "epoch": 0.4522893420546364, + "flos": 539945409024.0, + "grad_norm": 0.059990427154632556, + "language_loss": 0.84346575, + "learning_rate": 0.0006008658189215121, + "loss": 0.85429513, + "num_input_tokens_seen": 196219008, + "router_z_loss_mlp": 0.36767578, + "step": 2351, + "time_per_iteration": 2.649442434310913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084325, + "balance_loss_mlp": 1.04803789, + "epoch": 0.4524817237399, + "flos": 496423959552.0, + "grad_norm": 0.09153462549619036, + "language_loss": 0.7966159, + "learning_rate": 0.0006005606621017366, + "loss": 0.80745912, + "num_input_tokens_seen": 196287792, + "router_z_loss_mlp": 0.36328125, + "step": 2352, + "time_per_iteration": 2.55026912689209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086533, + "balance_loss_mlp": 1.04891062, + "epoch": 0.4526741054251635, + "flos": 652229300736.0, + "grad_norm": 0.05116414037173521, + "language_loss": 0.80266565, + "learning_rate": 0.0006002554662408496, + "loss": 0.81353092, + "num_input_tokens_seen": 196371776, + "router_z_loss_mlp": 0.3762207, + "step": 2353, + "time_per_iteration": 2.8708717823028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089479, + "balance_loss_mlp": 1.05259538, + "epoch": 0.4528664871104271, + "flos": 570674654208.0, + "grad_norm": 0.05934636879993742, + "language_loss": 0.91137719, + "learning_rate": 0.0005999502314573388, + "loss": 0.92227197, + "num_input_tokens_seen": 196441840, + "router_z_loss_mlp": 0.36865234, + "step": 2354, + "time_per_iteration": 2.636732339859009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091866, + "balance_loss_mlp": 1.05424321, + "epoch": 0.45305886879569063, + "flos": 458480922624.0, + "grad_norm": 0.06511026561582739, + "language_loss": 0.85993183, + "learning_rate": 0.0005996449578697066, + "loss": 0.87085044, + "num_input_tokens_seen": 196510464, + "router_z_loss_mlp": 0.3762207, + "step": 2355, + "time_per_iteration": 2.6497340202331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095767, + "balance_loss_mlp": 1.05916929, + "epoch": 0.4532512504809542, + "flos": 504922512384.0, + "grad_norm": 0.05408585590104452, + "language_loss": 0.81462455, + "learning_rate": 0.0005993396455964709, + "loss": 0.82558227, + "num_input_tokens_seen": 196583888, + "router_z_loss_mlp": 0.36621094, + "step": 2356, + "time_per_iteration": 2.67404842376709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090921, + "balance_loss_mlp": 1.05360866, + "epoch": 0.4534436321662178, + "flos": 581940578304.0, + "grad_norm": 0.046652791791384825, + "language_loss": 0.81415474, + "learning_rate": 0.0005990342947561647, + "loss": 0.82506394, + "num_input_tokens_seen": 196652816, + "router_z_loss_mlp": 0.37304688, + "step": 2357, + "time_per_iteration": 2.694093942642212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092731, + "balance_loss_mlp": 1.05577612, + "epoch": 0.45363601385148133, + "flos": 549458090496.0, + "grad_norm": 0.05811050095266086, + "language_loss": 0.77914369, + "learning_rate": 0.0005987289054673351, + "loss": 0.79007101, + "num_input_tokens_seen": 196720208, + "router_z_loss_mlp": 0.36987305, + "step": 2358, + "time_per_iteration": 2.6171157360076904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187917, + "balance_loss_mlp": 1.16912949, + "epoch": 0.4538283955367449, + "flos": 1473754411008.0, + "grad_norm": 0.03301673104438644, + "language_loss": 0.76575738, + "learning_rate": 0.0005984234778485451, + "loss": 0.77763653, + "num_input_tokens_seen": 196947696, + "router_z_loss_mlp": 0.1875, + "step": 2359, + "time_per_iteration": 4.821492910385132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096443, + "balance_loss_mlp": 1.05986929, + "epoch": 0.45402077722200845, + "flos": 584441699328.0, + "grad_norm": 0.059282629275687046, + "language_loss": 0.91217041, + "learning_rate": 0.0005981180120183722, + "loss": 0.92313486, + "num_input_tokens_seen": 197015712, + "router_z_loss_mlp": 0.36572266, + "step": 2360, + "time_per_iteration": 2.6678080558776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109692, + "balance_loss_mlp": 1.05901098, + "epoch": 0.45421315890727204, + "flos": 531462822912.0, + "grad_norm": 0.0444268091974553, + "language_loss": 0.85307455, + "learning_rate": 0.0005978125080954089, + "loss": 0.86404377, + "num_input_tokens_seen": 197094880, + "router_z_loss_mlp": 0.37915039, + "step": 2361, + "time_per_iteration": 2.7723591327667236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093514, + "balance_loss_mlp": 1.05651164, + "epoch": 0.4544055405925356, + "flos": 784890307584.0, + "grad_norm": 0.08031817047800895, + "language_loss": 0.7639026, + "learning_rate": 0.000597506966198262, + "loss": 0.77483773, + "num_input_tokens_seen": 197176448, + "router_z_loss_mlp": 0.36987305, + "step": 2362, + "time_per_iteration": 2.9897196292877197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109078, + "balance_loss_mlp": 1.05389667, + "epoch": 0.45459792227779916, + "flos": 517950443520.0, + "grad_norm": 0.07752194494873299, + "language_loss": 0.84128416, + "learning_rate": 0.0005972013864455536, + "loss": 0.85219198, + "num_input_tokens_seen": 197243520, + "router_z_loss_mlp": 0.36914062, + "step": 2363, + "time_per_iteration": 2.580357074737549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091271, + "balance_loss_mlp": 1.05515027, + "epoch": 0.4547903039630627, + "flos": 537306075648.0, + "grad_norm": 0.05808697989569881, + "language_loss": 0.85570788, + "learning_rate": 0.0005968957689559203, + "loss": 0.8666206, + "num_input_tokens_seen": 197311536, + "router_z_loss_mlp": 0.36132812, + "step": 2364, + "time_per_iteration": 2.64911150932312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095782, + "balance_loss_mlp": 1.05997205, + "epoch": 0.4549826856483263, + "flos": 528423409152.0, + "grad_norm": 0.05494979115149378, + "language_loss": 0.88544732, + "learning_rate": 0.0005965901138480131, + "loss": 0.8964051, + "num_input_tokens_seen": 197382752, + "router_z_loss_mlp": 0.35839844, + "step": 2365, + "time_per_iteration": 2.61967396736145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100343, + "balance_loss_mlp": 1.06379294, + "epoch": 0.45517506733358987, + "flos": 520649413632.0, + "grad_norm": 0.0583285525672419, + "language_loss": 0.87046576, + "learning_rate": 0.0005962844212404982, + "loss": 0.88146913, + "num_input_tokens_seen": 197456592, + "router_z_loss_mlp": 0.36547852, + "step": 2366, + "time_per_iteration": 2.663799524307251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108056, + "balance_loss_mlp": 1.07067156, + "epoch": 0.4553674490188534, + "flos": 450814616064.0, + "grad_norm": 0.06095483853323617, + "language_loss": 0.86969483, + "learning_rate": 0.0005959786912520558, + "loss": 0.88077545, + "num_input_tokens_seen": 197525408, + "router_z_loss_mlp": 0.37353516, + "step": 2367, + "time_per_iteration": 2.604011058807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104168, + "balance_loss_mlp": 1.06740427, + "epoch": 0.455559830704117, + "flos": 546308015616.0, + "grad_norm": 0.04613637765687707, + "language_loss": 0.83717126, + "learning_rate": 0.0005956729240013806, + "loss": 0.84821296, + "num_input_tokens_seen": 197608480, + "router_z_loss_mlp": 0.36743164, + "step": 2368, + "time_per_iteration": 2.7852706909179688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110558, + "balance_loss_mlp": 1.06917334, + "epoch": 0.4557522123893805, + "flos": 583491589632.0, + "grad_norm": 0.05161395773765414, + "language_loss": 0.91501808, + "learning_rate": 0.0005953671196071824, + "loss": 0.92607391, + "num_input_tokens_seen": 197678416, + "router_z_loss_mlp": 0.36401367, + "step": 2369, + "time_per_iteration": 2.7515223026275635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110279, + "balance_loss_mlp": 1.06681311, + "epoch": 0.4559445940746441, + "flos": 526149250560.0, + "grad_norm": 0.05240938085212211, + "language_loss": 0.80084532, + "learning_rate": 0.0005950612781881846, + "loss": 0.8118732, + "num_input_tokens_seen": 197753424, + "router_z_loss_mlp": 0.35986328, + "step": 2370, + "time_per_iteration": 2.6867175102233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096163, + "balance_loss_mlp": 1.05873156, + "epoch": 0.45613697575990764, + "flos": 651810281472.0, + "grad_norm": 0.06280114629685846, + "language_loss": 0.7594825, + "learning_rate": 0.0005947553998631259, + "loss": 0.77044415, + "num_input_tokens_seen": 197832080, + "router_z_loss_mlp": 0.37451172, + "step": 2371, + "time_per_iteration": 2.8399033546447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096425, + "balance_loss_mlp": 1.05985141, + "epoch": 0.4563293574451712, + "flos": 866744699904.0, + "grad_norm": 0.04396235367342953, + "language_loss": 0.78598678, + "learning_rate": 0.000594449484750758, + "loss": 0.79695106, + "num_input_tokens_seen": 197919536, + "router_z_loss_mlp": 0.36572266, + "step": 2372, + "time_per_iteration": 3.140890121459961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088429, + "balance_loss_mlp": 1.05140269, + "epoch": 0.45652173913043476, + "flos": 497817819648.0, + "grad_norm": 0.06709411136792778, + "language_loss": 0.82665753, + "learning_rate": 0.0005941435329698484, + "loss": 0.83754182, + "num_input_tokens_seen": 197991872, + "router_z_loss_mlp": 0.36987305, + "step": 2373, + "time_per_iteration": 2.6316027641296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089126, + "balance_loss_mlp": 1.05238533, + "epoch": 0.45671412081569834, + "flos": 560581420032.0, + "grad_norm": 0.05173954705628188, + "language_loss": 0.82881534, + "learning_rate": 0.0005938375446391778, + "loss": 0.83970654, + "num_input_tokens_seen": 198063392, + "router_z_loss_mlp": 0.36743164, + "step": 2374, + "time_per_iteration": 2.6999659538269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096506, + "balance_loss_mlp": 1.05823994, + "epoch": 0.45690650250096193, + "flos": 502873906176.0, + "grad_norm": 0.06488189122368912, + "language_loss": 0.88693655, + "learning_rate": 0.0005935315198775415, + "loss": 0.89790159, + "num_input_tokens_seen": 198131232, + "router_z_loss_mlp": 0.38232422, + "step": 2375, + "time_per_iteration": 2.584855556488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084023, + "balance_loss_mlp": 1.04675794, + "epoch": 0.45709888418622546, + "flos": 430473968640.0, + "grad_norm": 0.054054227258136585, + "language_loss": 0.86900407, + "learning_rate": 0.0005932254588037486, + "loss": 0.87984431, + "num_input_tokens_seen": 198194944, + "router_z_loss_mlp": 0.37207031, + "step": 2376, + "time_per_iteration": 2.4713377952575684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087126, + "balance_loss_mlp": 1.04907441, + "epoch": 0.45729126587148905, + "flos": 525395579904.0, + "grad_norm": 0.22673198102288197, + "language_loss": 0.86219609, + "learning_rate": 0.000592919361536623, + "loss": 0.87306732, + "num_input_tokens_seen": 198265728, + "router_z_loss_mlp": 0.38037109, + "step": 2377, + "time_per_iteration": 2.6324362754821777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074984, + "balance_loss_mlp": 1.03821993, + "epoch": 0.4574836475567526, + "flos": 637717349376.0, + "grad_norm": 0.06562895013351942, + "language_loss": 0.88980031, + "learning_rate": 0.0005926132281950017, + "loss": 0.90055019, + "num_input_tokens_seen": 198336640, + "router_z_loss_mlp": 0.36767578, + "step": 2378, + "time_per_iteration": 2.7336690425872803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079258, + "balance_loss_mlp": 1.04194546, + "epoch": 0.45767602924201617, + "flos": 649288811520.0, + "grad_norm": 0.05221471992659685, + "language_loss": 0.84916019, + "learning_rate": 0.0005923070588977367, + "loss": 0.85995281, + "num_input_tokens_seen": 198413552, + "router_z_loss_mlp": 0.37280273, + "step": 2379, + "time_per_iteration": 2.796694755554199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073672, + "balance_loss_mlp": 1.03745568, + "epoch": 0.4578684109272797, + "flos": 746356543488.0, + "grad_norm": 0.05948192069014845, + "language_loss": 0.86265379, + "learning_rate": 0.0005920008537636931, + "loss": 0.8733905, + "num_input_tokens_seen": 198490864, + "router_z_loss_mlp": 0.36230469, + "step": 2380, + "time_per_iteration": 2.919175863265991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073893, + "balance_loss_mlp": 1.03734303, + "epoch": 0.4580607926125433, + "flos": 641155433472.0, + "grad_norm": 0.07082348059879481, + "language_loss": 0.86767799, + "learning_rate": 0.0005916946129117504, + "loss": 0.8784169, + "num_input_tokens_seen": 198571200, + "router_z_loss_mlp": 0.36523438, + "step": 2381, + "time_per_iteration": 2.8834073543548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076603, + "balance_loss_mlp": 1.03983903, + "epoch": 0.4582531742978069, + "flos": 801513474048.0, + "grad_norm": 0.06015762492268947, + "language_loss": 0.80385733, + "learning_rate": 0.0005913883364608017, + "loss": 0.81462336, + "num_input_tokens_seen": 198658624, + "router_z_loss_mlp": 0.36791992, + "step": 2382, + "time_per_iteration": 3.05711030960083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077489, + "balance_loss_mlp": 1.03984237, + "epoch": 0.4584455559830704, + "flos": 683991613440.0, + "grad_norm": 0.05122280126715116, + "language_loss": 0.88575673, + "learning_rate": 0.0005910820245297542, + "loss": 0.89653164, + "num_input_tokens_seen": 198731312, + "router_z_loss_mlp": 0.37646484, + "step": 2383, + "time_per_iteration": 2.8739712238311768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107409, + "balance_loss_mlp": 1.03682566, + "epoch": 0.458637937668334, + "flos": 517902391296.0, + "grad_norm": 0.06830932289634356, + "language_loss": 0.80442882, + "learning_rate": 0.000590775677237529, + "loss": 0.81516975, + "num_input_tokens_seen": 198805296, + "router_z_loss_mlp": 0.37231445, + "step": 2384, + "time_per_iteration": 2.7162787914276123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083903, + "balance_loss_mlp": 1.04585159, + "epoch": 0.4588303193535975, + "flos": 505242607104.0, + "grad_norm": 0.06045305543182838, + "language_loss": 0.80110037, + "learning_rate": 0.0005904692947030601, + "loss": 0.81193942, + "num_input_tokens_seen": 198872112, + "router_z_loss_mlp": 0.38012695, + "step": 2385, + "time_per_iteration": 2.615645408630371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077094, + "balance_loss_mlp": 1.04054475, + "epoch": 0.4590227010388611, + "flos": 495655732224.0, + "grad_norm": 0.07817461665700527, + "language_loss": 0.89474368, + "learning_rate": 0.0005901628770452963, + "loss": 0.90551466, + "num_input_tokens_seen": 198938480, + "router_z_loss_mlp": 0.36572266, + "step": 2386, + "time_per_iteration": 2.545145273208618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076838, + "balance_loss_mlp": 1.03952503, + "epoch": 0.45921508272412465, + "flos": 493375781376.0, + "grad_norm": 0.05719900676000999, + "language_loss": 0.87518173, + "learning_rate": 0.000589856424383199, + "loss": 0.88595015, + "num_input_tokens_seen": 199008608, + "router_z_loss_mlp": 0.37280273, + "step": 2387, + "time_per_iteration": 2.5866873264312744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078911, + "balance_loss_mlp": 1.04000092, + "epoch": 0.45940746440938823, + "flos": 691096306176.0, + "grad_norm": 0.05272732350360167, + "language_loss": 0.82854474, + "learning_rate": 0.000589549936835744, + "loss": 0.83933389, + "num_input_tokens_seen": 199084592, + "router_z_loss_mlp": 0.38867188, + "step": 2388, + "time_per_iteration": 2.886815309524536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082474, + "balance_loss_mlp": 1.04485154, + "epoch": 0.45959984609465176, + "flos": 503489364480.0, + "grad_norm": 0.061476086167368736, + "language_loss": 0.79490817, + "learning_rate": 0.0005892434145219202, + "loss": 0.80573285, + "num_input_tokens_seen": 199151504, + "router_z_loss_mlp": 0.37597656, + "step": 2389, + "time_per_iteration": 2.669055461883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078287, + "balance_loss_mlp": 1.04035497, + "epoch": 0.45979222777991535, + "flos": 676339863552.0, + "grad_norm": 0.13998924312013794, + "language_loss": 0.82966721, + "learning_rate": 0.0005889368575607303, + "loss": 0.84045005, + "num_input_tokens_seen": 199224528, + "router_z_loss_mlp": 0.37890625, + "step": 2390, + "time_per_iteration": 2.8364429473876953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075252, + "balance_loss_mlp": 1.03941786, + "epoch": 0.45998460946517894, + "flos": 777308368896.0, + "grad_norm": 0.05472501976139028, + "language_loss": 0.78496212, + "learning_rate": 0.00058863026607119, + "loss": 0.79571462, + "num_input_tokens_seen": 199312512, + "router_z_loss_mlp": 0.35864258, + "step": 2391, + "time_per_iteration": 3.104703664779663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078215, + "balance_loss_mlp": 1.04059267, + "epoch": 0.46017699115044247, + "flos": 851073053184.0, + "grad_norm": 0.06149888926191146, + "language_loss": 0.79584855, + "learning_rate": 0.0005883236401723287, + "loss": 0.80663073, + "num_input_tokens_seen": 199397216, + "router_z_loss_mlp": 0.37597656, + "step": 2392, + "time_per_iteration": 3.1967198848724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074328, + "balance_loss_mlp": 1.03603745, + "epoch": 0.46036937283570606, + "flos": 575608495104.0, + "grad_norm": 0.05401888737183198, + "language_loss": 0.84525239, + "learning_rate": 0.0005880169799831893, + "loss": 0.85599566, + "num_input_tokens_seen": 199464288, + "router_z_loss_mlp": 0.3828125, + "step": 2393, + "time_per_iteration": 2.6700267791748047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078621, + "balance_loss_mlp": 1.04049826, + "epoch": 0.4605617545209696, + "flos": 611553798144.0, + "grad_norm": 0.04760801272162673, + "language_loss": 0.81405449, + "learning_rate": 0.0005877102856228278, + "loss": 0.82484066, + "num_input_tokens_seen": 199538096, + "router_z_loss_mlp": 0.38110352, + "step": 2394, + "time_per_iteration": 2.8472628593444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079988, + "balance_loss_mlp": 1.04100633, + "epoch": 0.4607541362062332, + "flos": 532884386304.0, + "grad_norm": 0.0583897063043048, + "language_loss": 0.84685498, + "learning_rate": 0.0005874035572103133, + "loss": 0.85765481, + "num_input_tokens_seen": 199609504, + "router_z_loss_mlp": 0.38964844, + "step": 2395, + "time_per_iteration": 2.6390676498413086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081925, + "balance_loss_mlp": 1.04437459, + "epoch": 0.4609465178914967, + "flos": 647023417344.0, + "grad_norm": 0.07571396195119524, + "language_loss": 0.82582867, + "learning_rate": 0.0005870967948647288, + "loss": 0.83664787, + "num_input_tokens_seen": 199678960, + "router_z_loss_mlp": 0.37573242, + "step": 2396, + "time_per_iteration": 2.7459003925323486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115963, + "balance_loss_mlp": 1.09889209, + "epoch": 0.4611388995767603, + "flos": 1465487202816.0, + "grad_norm": 0.025541481833947964, + "language_loss": 0.743083, + "learning_rate": 0.0005867899987051693, + "loss": 0.75424266, + "num_input_tokens_seen": 199903568, + "router_z_loss_mlp": 0.17089844, + "step": 2397, + "time_per_iteration": 5.318708896636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083182, + "balance_loss_mlp": 1.04446316, + "epoch": 0.46133128126202383, + "flos": 722772688896.0, + "grad_norm": 0.0770893227760576, + "language_loss": 0.8586902, + "learning_rate": 0.0005864831688507443, + "loss": 0.86952198, + "num_input_tokens_seen": 199988672, + "router_z_loss_mlp": 0.38696289, + "step": 2398, + "time_per_iteration": 3.0177690982818604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092841, + "balance_loss_mlp": 1.05266774, + "epoch": 0.4615236629472874, + "flos": 547735371264.0, + "grad_norm": 0.05577558539065206, + "language_loss": 0.74877977, + "learning_rate": 0.0005861763054205754, + "loss": 0.75970817, + "num_input_tokens_seen": 200062304, + "router_z_loss_mlp": 0.40161133, + "step": 2399, + "time_per_iteration": 4.235994815826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089549, + "balance_loss_mlp": 1.04885101, + "epoch": 0.461716044632551, + "flos": 601942192128.0, + "grad_norm": 0.04983292023279428, + "language_loss": 0.80479169, + "learning_rate": 0.0005858694085337976, + "loss": 0.81568718, + "num_input_tokens_seen": 200138464, + "router_z_loss_mlp": 0.40698242, + "step": 2400, + "time_per_iteration": 2.807819366455078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095586, + "balance_loss_mlp": 1.0549593, + "epoch": 0.46190842631781454, + "flos": 474236937216.0, + "grad_norm": 0.0664642499777789, + "language_loss": 0.8348912, + "learning_rate": 0.0005855624783095589, + "loss": 0.84584707, + "num_input_tokens_seen": 200205728, + "router_z_loss_mlp": 0.40625, + "step": 2401, + "time_per_iteration": 2.572861909866333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088871, + "balance_loss_mlp": 1.04848242, + "epoch": 0.4621008080030781, + "flos": 437254184448.0, + "grad_norm": 0.05436683283363487, + "language_loss": 0.85176182, + "learning_rate": 0.00058525551486702, + "loss": 0.86265051, + "num_input_tokens_seen": 200269824, + "router_z_loss_mlp": 0.40405273, + "step": 2402, + "time_per_iteration": 2.5116658210754395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091528, + "balance_loss_mlp": 1.05056739, + "epoch": 0.46229318968834165, + "flos": 525203523072.0, + "grad_norm": 0.06054832474170735, + "language_loss": 0.81057394, + "learning_rate": 0.0005849485183253548, + "loss": 0.82148921, + "num_input_tokens_seen": 200341264, + "router_z_loss_mlp": 0.40942383, + "step": 2403, + "time_per_iteration": 2.6135447025299072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109357, + "balance_loss_mlp": 1.05446947, + "epoch": 0.46248557137360524, + "flos": 439395922944.0, + "grad_norm": 0.05271308957386849, + "language_loss": 0.87085575, + "learning_rate": 0.0005846414888037501, + "loss": 0.88179141, + "num_input_tokens_seen": 200405632, + "router_z_loss_mlp": 0.39086914, + "step": 2404, + "time_per_iteration": 2.479233503341675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094001, + "balance_loss_mlp": 1.05513883, + "epoch": 0.4626779530588688, + "flos": 617318475264.0, + "grad_norm": 0.05681624365321511, + "language_loss": 0.82982111, + "learning_rate": 0.0005843344264214049, + "loss": 0.84076107, + "num_input_tokens_seen": 200479312, + "router_z_loss_mlp": 0.38818359, + "step": 2405, + "time_per_iteration": 2.8025927543640137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094528, + "balance_loss_mlp": 1.05478346, + "epoch": 0.46287033474413236, + "flos": 669796784640.0, + "grad_norm": 0.07573173665893672, + "language_loss": 0.84474289, + "learning_rate": 0.0005840273312975317, + "loss": 0.8556881, + "num_input_tokens_seen": 200552976, + "router_z_loss_mlp": 0.39746094, + "step": 2406, + "time_per_iteration": 2.880143642425537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096681, + "balance_loss_mlp": 1.05705631, + "epoch": 0.46306271642939595, + "flos": 479992849920.0, + "grad_norm": 0.09801123732991168, + "language_loss": 0.90446943, + "learning_rate": 0.0005837202035513555, + "loss": 0.91543621, + "num_input_tokens_seen": 200621088, + "router_z_loss_mlp": 0.39599609, + "step": 2407, + "time_per_iteration": 2.5880489349365234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109455, + "balance_loss_mlp": 1.05583048, + "epoch": 0.4632550981146595, + "flos": 580395359232.0, + "grad_norm": 0.057934056350582984, + "language_loss": 0.81573331, + "learning_rate": 0.0005834130433021136, + "loss": 0.82667881, + "num_input_tokens_seen": 200698400, + "router_z_loss_mlp": 0.38671875, + "step": 2408, + "time_per_iteration": 2.739018201828003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100791, + "balance_loss_mlp": 1.06121325, + "epoch": 0.46344747979992307, + "flos": 523701974016.0, + "grad_norm": 0.11568384778980019, + "language_loss": 0.73278892, + "learning_rate": 0.0005831058506690563, + "loss": 0.74379677, + "num_input_tokens_seen": 200767264, + "router_z_loss_mlp": 0.39550781, + "step": 2409, + "time_per_iteration": 2.6164803504943848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109281, + "balance_loss_mlp": 1.05513954, + "epoch": 0.4636398614851866, + "flos": 746174661120.0, + "grad_norm": 0.10585491609730635, + "language_loss": 0.85966945, + "learning_rate": 0.0005827986257714464, + "loss": 0.87059754, + "num_input_tokens_seen": 200841440, + "router_z_loss_mlp": 0.3762207, + "step": 2410, + "time_per_iteration": 2.9002575874328613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108768, + "balance_loss_mlp": 1.05008137, + "epoch": 0.4638322431704502, + "flos": 596273619456.0, + "grad_norm": 0.054458395819511424, + "language_loss": 0.88645154, + "learning_rate": 0.0005824913687285591, + "loss": 0.89732838, + "num_input_tokens_seen": 200911296, + "router_z_loss_mlp": 0.37597656, + "step": 2411, + "time_per_iteration": 2.65468168258667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108455, + "balance_loss_mlp": 1.046808, + "epoch": 0.4640246248557137, + "flos": 539172799488.0, + "grad_norm": 0.10537111148670983, + "language_loss": 0.81237781, + "learning_rate": 0.0005821840796596821, + "loss": 0.82322335, + "num_input_tokens_seen": 200981920, + "router_z_loss_mlp": 0.37744141, + "step": 2412, + "time_per_iteration": 2.64800763130188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086752, + "balance_loss_mlp": 1.04979706, + "epoch": 0.4642170065409773, + "flos": 562330280448.0, + "grad_norm": 0.05022524173963101, + "language_loss": 0.80493259, + "learning_rate": 0.0005818767586841158, + "loss": 0.81580019, + "num_input_tokens_seen": 201059392, + "router_z_loss_mlp": 0.36962891, + "step": 2413, + "time_per_iteration": 2.755119800567627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081928, + "balance_loss_mlp": 1.04657054, + "epoch": 0.46440938822624084, + "flos": 530684421120.0, + "grad_norm": 0.05374997972366647, + "language_loss": 0.86088538, + "learning_rate": 0.0005815694059211726, + "loss": 0.87170464, + "num_input_tokens_seen": 201130192, + "router_z_loss_mlp": 0.35400391, + "step": 2414, + "time_per_iteration": 2.6568868160247803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112565, + "balance_loss_mlp": 1.09606647, + "epoch": 0.4646017699115044, + "flos": 1525503780864.0, + "grad_norm": 0.029698276976430914, + "language_loss": 0.80873632, + "learning_rate": 0.0005812620214901778, + "loss": 0.81986189, + "num_input_tokens_seen": 201354720, + "router_z_loss_mlp": 0.16503906, + "step": 2415, + "time_per_iteration": 4.772961378097534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103885, + "balance_loss_mlp": 1.08795917, + "epoch": 0.464794151596768, + "flos": 1539999765504.0, + "grad_norm": 0.029205098078145548, + "language_loss": 0.7694506, + "learning_rate": 0.000580954605510468, + "loss": 0.78048944, + "num_input_tokens_seen": 201592096, + "router_z_loss_mlp": 0.15917969, + "step": 2416, + "time_per_iteration": 4.972976446151733 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085394, + "balance_loss_mlp": 1.04908264, + "epoch": 0.46498653328203154, + "flos": 501200649216.0, + "grad_norm": 0.04510206741076235, + "language_loss": 0.86396641, + "learning_rate": 0.0005806471581013931, + "loss": 0.87482029, + "num_input_tokens_seen": 201666160, + "router_z_loss_mlp": 0.36328125, + "step": 2417, + "time_per_iteration": 2.6620965003967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084664, + "balance_loss_mlp": 1.04806709, + "epoch": 0.46517891496729513, + "flos": 675856825344.0, + "grad_norm": 0.06302462590955567, + "language_loss": 0.78826416, + "learning_rate": 0.0005803396793823146, + "loss": 0.79911077, + "num_input_tokens_seen": 201733552, + "router_z_loss_mlp": 0.36572266, + "step": 2418, + "time_per_iteration": 2.7901804447174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108772, + "balance_loss_mlp": 1.05190992, + "epoch": 0.46537129665255866, + "flos": 585062949888.0, + "grad_norm": 0.06339234247272847, + "language_loss": 0.85623956, + "learning_rate": 0.0005800321694726065, + "loss": 0.86711681, + "num_input_tokens_seen": 201806128, + "router_z_loss_mlp": 0.35839844, + "step": 2419, + "time_per_iteration": 2.728811740875244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085796, + "balance_loss_mlp": 1.04836476, + "epoch": 0.46556367833782225, + "flos": 587425858560.0, + "grad_norm": 0.05222204092555794, + "language_loss": 0.8708874, + "learning_rate": 0.0005797246284916545, + "loss": 0.88174534, + "num_input_tokens_seen": 201874224, + "router_z_loss_mlp": 0.37402344, + "step": 2420, + "time_per_iteration": 2.6684653759002686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045596, + "balance_loss_mlp": 1.03043234, + "epoch": 0.4657560600230858, + "flos": 1484674099200.0, + "grad_norm": 0.011675297447767578, + "language_loss": 0.77505189, + "learning_rate": 0.0005794170565588569, + "loss": 0.78550786, + "num_input_tokens_seen": 202111648, + "router_z_loss_mlp": 0.15136719, + "step": 2421, + "time_per_iteration": 4.958959102630615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109154, + "balance_loss_mlp": 1.05506182, + "epoch": 0.46594844170834937, + "flos": 579961783296.0, + "grad_norm": 0.06275032464162542, + "language_loss": 0.88184166, + "learning_rate": 0.0005791094537936233, + "loss": 0.89275706, + "num_input_tokens_seen": 202183344, + "router_z_loss_mlp": 0.36499023, + "step": 2422, + "time_per_iteration": 2.682985782623291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085598, + "balance_loss_mlp": 1.04761815, + "epoch": 0.4661408233936129, + "flos": 512322568704.0, + "grad_norm": 0.05420418194823272, + "language_loss": 0.8170498, + "learning_rate": 0.0005788018203153762, + "loss": 0.82790577, + "num_input_tokens_seen": 202252512, + "router_z_loss_mlp": 0.37988281, + "step": 2423, + "time_per_iteration": 2.5706470012664795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085407, + "balance_loss_mlp": 1.04883409, + "epoch": 0.4663332050788765, + "flos": 490839754752.0, + "grad_norm": 0.06546291293651209, + "language_loss": 0.85642946, + "learning_rate": 0.000578494156243549, + "loss": 0.86728358, + "num_input_tokens_seen": 202320096, + "router_z_loss_mlp": 0.36572266, + "step": 2424, + "time_per_iteration": 2.578847646713257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085751, + "balance_loss_mlp": 1.04746079, + "epoch": 0.4665255867641401, + "flos": 512353092096.0, + "grad_norm": 0.059152702804089866, + "language_loss": 0.89097798, + "learning_rate": 0.0005781864616975878, + "loss": 0.90183544, + "num_input_tokens_seen": 202391552, + "router_z_loss_mlp": 0.38256836, + "step": 2425, + "time_per_iteration": 2.6408798694610596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085338, + "balance_loss_mlp": 1.04585552, + "epoch": 0.4667179684494036, + "flos": 424590018048.0, + "grad_norm": 0.07480086545967683, + "language_loss": 0.84123272, + "learning_rate": 0.0005778787367969502, + "loss": 0.85208613, + "num_input_tokens_seen": 202457328, + "router_z_loss_mlp": 0.39477539, + "step": 2426, + "time_per_iteration": 2.5963637828826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077247, + "balance_loss_mlp": 1.03910041, + "epoch": 0.4669103501346672, + "flos": 707640897024.0, + "grad_norm": 0.07167303988395164, + "language_loss": 0.80844486, + "learning_rate": 0.0005775709816611053, + "loss": 0.81921738, + "num_input_tokens_seen": 202535888, + "router_z_loss_mlp": 0.38134766, + "step": 2427, + "time_per_iteration": 2.971285581588745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079771, + "balance_loss_mlp": 1.04138589, + "epoch": 0.4671027318199307, + "flos": 554554874880.0, + "grad_norm": 0.05405801443852106, + "language_loss": 0.83748919, + "learning_rate": 0.0005772631964095346, + "loss": 0.84828693, + "num_input_tokens_seen": 202608400, + "router_z_loss_mlp": 0.38354492, + "step": 2428, + "time_per_iteration": 2.709364175796509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080886, + "balance_loss_mlp": 1.04271483, + "epoch": 0.4672951135051943, + "flos": 566839309824.0, + "grad_norm": 0.060777782070244445, + "language_loss": 0.8565498, + "learning_rate": 0.000576955381161731, + "loss": 0.86735862, + "num_input_tokens_seen": 202677712, + "router_z_loss_mlp": 0.38183594, + "step": 2429, + "time_per_iteration": 2.708270311355591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083121, + "balance_loss_mlp": 1.04452121, + "epoch": 0.46748749519045785, + "flos": 424294654464.0, + "grad_norm": 0.05633631430335825, + "language_loss": 0.85906339, + "learning_rate": 0.0005766475360371985, + "loss": 0.86989462, + "num_input_tokens_seen": 202743824, + "router_z_loss_mlp": 0.38574219, + "step": 2430, + "time_per_iteration": 2.617856740951538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089079, + "balance_loss_mlp": 1.05055118, + "epoch": 0.46767987687572143, + "flos": 538088859648.0, + "grad_norm": 0.05568735360450276, + "language_loss": 0.84486759, + "learning_rate": 0.0005763396611554536, + "loss": 0.85575831, + "num_input_tokens_seen": 202813072, + "router_z_loss_mlp": 0.38476562, + "step": 2431, + "time_per_iteration": 2.6460912227630615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093376, + "balance_loss_mlp": 1.0557059, + "epoch": 0.467872258560985, + "flos": 823360052736.0, + "grad_norm": 0.05823580457003032, + "language_loss": 0.80262822, + "learning_rate": 0.0005760317566360237, + "loss": 0.81356204, + "num_input_tokens_seen": 202886576, + "router_z_loss_mlp": 0.37646484, + "step": 2432, + "time_per_iteration": 3.010744094848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104411, + "balance_loss_mlp": 1.066836, + "epoch": 0.46806464024624855, + "flos": 661366632960.0, + "grad_norm": 0.07453415962543286, + "language_loss": 0.85120392, + "learning_rate": 0.000575723822598448, + "loss": 0.86224806, + "num_input_tokens_seen": 202956736, + "router_z_loss_mlp": 0.37573242, + "step": 2433, + "time_per_iteration": 2.7999444007873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100188, + "balance_loss_mlp": 1.06232667, + "epoch": 0.46825702193151214, + "flos": 755362865664.0, + "grad_norm": 0.08922556949000433, + "language_loss": 0.81824166, + "learning_rate": 0.0005754158591622773, + "loss": 0.82924354, + "num_input_tokens_seen": 203036432, + "router_z_loss_mlp": 0.37866211, + "step": 2434, + "time_per_iteration": 3.016101837158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089201, + "balance_loss_mlp": 1.05250812, + "epoch": 0.4684494036167757, + "flos": 439164578304.0, + "grad_norm": 0.06367410837717138, + "language_loss": 0.82359827, + "learning_rate": 0.0005751078664470732, + "loss": 0.8344903, + "num_input_tokens_seen": 203101904, + "router_z_loss_mlp": 0.36694336, + "step": 2435, + "time_per_iteration": 2.5870590209960938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095131, + "balance_loss_mlp": 1.05762815, + "epoch": 0.46864178530203926, + "flos": 532446428160.0, + "grad_norm": 0.059213993455869605, + "language_loss": 0.85874772, + "learning_rate": 0.0005747998445724094, + "loss": 0.86969906, + "num_input_tokens_seen": 203170272, + "router_z_loss_mlp": 0.375, + "step": 2436, + "time_per_iteration": 2.606999397277832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088052, + "balance_loss_mlp": 1.05135953, + "epoch": 0.4688341669873028, + "flos": 576328670208.0, + "grad_norm": 0.05282393784178956, + "language_loss": 0.89627349, + "learning_rate": 0.0005744917936578707, + "loss": 0.90715402, + "num_input_tokens_seen": 203243920, + "router_z_loss_mlp": 0.3671875, + "step": 2437, + "time_per_iteration": 2.7902729511260986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075719, + "balance_loss_mlp": 1.03978968, + "epoch": 0.4690265486725664, + "flos": 539296455168.0, + "grad_norm": 0.04430533887369339, + "language_loss": 0.84245884, + "learning_rate": 0.0005741837138230526, + "loss": 0.85321605, + "num_input_tokens_seen": 203321760, + "router_z_loss_mlp": 0.35913086, + "step": 2438, + "time_per_iteration": 2.726710319519043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082309, + "balance_loss_mlp": 1.04580677, + "epoch": 0.4692189303578299, + "flos": 770168770560.0, + "grad_norm": 0.06182369714878754, + "language_loss": 0.86213875, + "learning_rate": 0.0005738756051875627, + "loss": 0.87296176, + "num_input_tokens_seen": 203409088, + "router_z_loss_mlp": 0.36547852, + "step": 2439, + "time_per_iteration": 3.07755708694458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077719, + "balance_loss_mlp": 1.04178953, + "epoch": 0.4694113120430935, + "flos": 571118404608.0, + "grad_norm": 0.047772699497207846, + "language_loss": 0.82990217, + "learning_rate": 0.0005735674678710192, + "loss": 0.84067929, + "num_input_tokens_seen": 203481680, + "router_z_loss_mlp": 0.359375, + "step": 2440, + "time_per_iteration": 2.6625607013702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080403, + "balance_loss_mlp": 1.04423499, + "epoch": 0.4696036937283571, + "flos": 748498281984.0, + "grad_norm": 0.07690297936976162, + "language_loss": 0.81414962, + "learning_rate": 0.0005732593019930517, + "loss": 0.82495368, + "num_input_tokens_seen": 203554848, + "router_z_loss_mlp": 0.36181641, + "step": 2441, + "time_per_iteration": 2.918219566345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082294, + "balance_loss_mlp": 1.04669785, + "epoch": 0.4697960754136206, + "flos": 493208455680.0, + "grad_norm": 0.061105529929901724, + "language_loss": 0.87989414, + "learning_rate": 0.0005729511076733008, + "loss": 0.89071703, + "num_input_tokens_seen": 203624816, + "router_z_loss_mlp": 0.35620117, + "step": 2442, + "time_per_iteration": 2.6301560401916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085737, + "balance_loss_mlp": 1.04909194, + "epoch": 0.4699884570988842, + "flos": 724809710592.0, + "grad_norm": 0.0773152930313349, + "language_loss": 0.84905529, + "learning_rate": 0.000572642885031418, + "loss": 0.85991269, + "num_input_tokens_seen": 203698256, + "router_z_loss_mlp": 0.36645508, + "step": 2443, + "time_per_iteration": 2.8638129234313965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081528, + "balance_loss_mlp": 1.04619479, + "epoch": 0.47018083878414774, + "flos": 555141219840.0, + "grad_norm": 0.0470926044275737, + "language_loss": 0.80651355, + "learning_rate": 0.0005723346341870662, + "loss": 0.81732887, + "num_input_tokens_seen": 203772672, + "router_z_loss_mlp": 0.35351562, + "step": 2444, + "time_per_iteration": 2.7571544647216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093224, + "balance_loss_mlp": 1.05767596, + "epoch": 0.4703732204694113, + "flos": 423846521856.0, + "grad_norm": 0.060426187781859556, + "language_loss": 0.8612802, + "learning_rate": 0.0005720263552599188, + "loss": 0.87221241, + "num_input_tokens_seen": 203835904, + "router_z_loss_mlp": 0.35595703, + "step": 2445, + "time_per_iteration": 2.457702398300171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087133, + "balance_loss_mlp": 1.05003476, + "epoch": 0.47056560215467486, + "flos": 703179919872.0, + "grad_norm": 0.05103700331104036, + "language_loss": 0.79627156, + "learning_rate": 0.0005717180483696604, + "loss": 0.80714285, + "num_input_tokens_seen": 203914704, + "router_z_loss_mlp": 0.37084961, + "step": 2446, + "time_per_iteration": 2.851597785949707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096579, + "balance_loss_mlp": 1.05981517, + "epoch": 0.47075798383993844, + "flos": 554701851648.0, + "grad_norm": 0.05942499594418206, + "language_loss": 0.82931131, + "learning_rate": 0.0005714097136359862, + "loss": 0.84027708, + "num_input_tokens_seen": 203985072, + "router_z_loss_mlp": 0.36791992, + "step": 2447, + "time_per_iteration": 2.6262872219085693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088323, + "balance_loss_mlp": 1.05203617, + "epoch": 0.470950365525202, + "flos": 564009329664.0, + "grad_norm": 0.04849265524269106, + "language_loss": 0.86289024, + "learning_rate": 0.0005711013511786027, + "loss": 0.87377352, + "num_input_tokens_seen": 204061904, + "router_z_loss_mlp": 0.36303711, + "step": 2448, + "time_per_iteration": 2.7698192596435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087117, + "balance_loss_mlp": 1.05066276, + "epoch": 0.47114274721046556, + "flos": 534189496320.0, + "grad_norm": 0.0564117191668664, + "language_loss": 0.83740294, + "learning_rate": 0.0005707929611172263, + "loss": 0.84827411, + "num_input_tokens_seen": 204137392, + "router_z_loss_mlp": 0.36450195, + "step": 2449, + "time_per_iteration": 2.679288864135742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091785, + "balance_loss_mlp": 1.0557121, + "epoch": 0.47133512889572915, + "flos": 472877982720.0, + "grad_norm": 0.05809255973733416, + "language_loss": 0.83857393, + "learning_rate": 0.000570484543571585, + "loss": 0.84949178, + "num_input_tokens_seen": 204202752, + "router_z_loss_mlp": 0.3605957, + "step": 2450, + "time_per_iteration": 2.53946852684021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085264, + "balance_loss_mlp": 1.04914355, + "epoch": 0.4715275105809927, + "flos": 458776286208.0, + "grad_norm": 0.05957003441240347, + "language_loss": 0.83003706, + "learning_rate": 0.0005701760986614171, + "loss": 0.84088969, + "num_input_tokens_seen": 204266960, + "router_z_loss_mlp": 0.36132812, + "step": 2451, + "time_per_iteration": 2.578679323196411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108432, + "balance_loss_mlp": 1.04784179, + "epoch": 0.47171989226625627, + "flos": 421783358976.0, + "grad_norm": 0.04971859173266034, + "language_loss": 0.86998093, + "learning_rate": 0.0005698676265064714, + "loss": 0.88082415, + "num_input_tokens_seen": 204331216, + "router_z_loss_mlp": 0.36499023, + "step": 2452, + "time_per_iteration": 2.5178701877593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108693, + "balance_loss_mlp": 1.04887831, + "epoch": 0.4719122739515198, + "flos": 457200543744.0, + "grad_norm": 0.06455625952921856, + "language_loss": 0.89101571, + "learning_rate": 0.0005695591272265074, + "loss": 0.90188503, + "num_input_tokens_seen": 204397216, + "router_z_loss_mlp": 0.38037109, + "step": 2453, + "time_per_iteration": 2.527940511703491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094235, + "balance_loss_mlp": 1.05601645, + "epoch": 0.4721046556367834, + "flos": 514716000768.0, + "grad_norm": 0.05921175255811472, + "language_loss": 0.81955969, + "learning_rate": 0.0005692506009412954, + "loss": 0.83050203, + "num_input_tokens_seen": 204469952, + "router_z_loss_mlp": 0.3815918, + "step": 2454, + "time_per_iteration": 2.6692135334014893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152126, + "balance_loss_mlp": 1.13209891, + "epoch": 0.4722970373220469, + "flos": 1571399723520.0, + "grad_norm": 0.04281653423243919, + "language_loss": 0.7755127, + "learning_rate": 0.0005689420477706156, + "loss": 0.78703392, + "num_input_tokens_seen": 204701152, + "router_z_loss_mlp": 0.20019531, + "step": 2455, + "time_per_iteration": 4.940452337265015 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085506, + "balance_loss_mlp": 1.04731131, + "epoch": 0.4724894190073105, + "flos": 585919927296.0, + "grad_norm": 0.06574328103666784, + "language_loss": 0.89537692, + "learning_rate": 0.0005686334678342593, + "loss": 0.906232, + "num_input_tokens_seen": 204778144, + "router_z_loss_mlp": 0.38183594, + "step": 2456, + "time_per_iteration": 2.8626763820648193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085091, + "balance_loss_mlp": 1.04816043, + "epoch": 0.4726818006925741, + "flos": 867290347008.0, + "grad_norm": 0.053689359601525224, + "language_loss": 0.81760311, + "learning_rate": 0.0005683248612520274, + "loss": 0.82845408, + "num_input_tokens_seen": 204853376, + "router_z_loss_mlp": 0.36914062, + "step": 2457, + "time_per_iteration": 3.062195301055908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079889, + "balance_loss_mlp": 1.04300618, + "epoch": 0.4728741823778376, + "flos": 752653721088.0, + "grad_norm": 0.06424431420602757, + "language_loss": 0.83881927, + "learning_rate": 0.0005680162281437321, + "loss": 0.84961808, + "num_input_tokens_seen": 204925280, + "router_z_loss_mlp": 0.36865234, + "step": 2458, + "time_per_iteration": 4.24756932258606 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081915, + "balance_loss_mlp": 1.04474509, + "epoch": 0.4730665640631012, + "flos": 538301265408.0, + "grad_norm": 0.04398827684533395, + "language_loss": 0.84583557, + "learning_rate": 0.000567707568629195, + "loss": 0.8566547, + "num_input_tokens_seen": 205000592, + "router_z_loss_mlp": 0.37158203, + "step": 2459, + "time_per_iteration": 2.678410530090332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077333, + "balance_loss_mlp": 1.04104519, + "epoch": 0.47325894574836475, + "flos": 491396986368.0, + "grad_norm": 0.04729381274413396, + "language_loss": 0.82117784, + "learning_rate": 0.0005673988828282486, + "loss": 0.83195114, + "num_input_tokens_seen": 205073968, + "router_z_loss_mlp": 0.36303711, + "step": 2460, + "time_per_iteration": 2.6379287242889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080146, + "balance_loss_mlp": 1.04397774, + "epoch": 0.47345132743362833, + "flos": 764117494272.0, + "grad_norm": 0.048508898725252214, + "language_loss": 0.80703068, + "learning_rate": 0.0005670901708607352, + "loss": 0.81783217, + "num_input_tokens_seen": 205153536, + "router_z_loss_mlp": 0.36206055, + "step": 2461, + "time_per_iteration": 2.9682881832122803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079185, + "balance_loss_mlp": 1.04366088, + "epoch": 0.47364370911889186, + "flos": 539925060096.0, + "grad_norm": 0.06522156043574484, + "language_loss": 0.84211236, + "learning_rate": 0.0005667814328465076, + "loss": 0.8529042, + "num_input_tokens_seen": 205220944, + "router_z_loss_mlp": 0.35546875, + "step": 2462, + "time_per_iteration": 2.6927719116210938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074953, + "balance_loss_mlp": 1.04031122, + "epoch": 0.47383609080415545, + "flos": 406002613248.0, + "grad_norm": 0.06749328280555515, + "language_loss": 0.81615329, + "learning_rate": 0.0005664726689054285, + "loss": 0.82690287, + "num_input_tokens_seen": 205282688, + "router_z_loss_mlp": 0.34692383, + "step": 2463, + "time_per_iteration": 2.4384853839874268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078599, + "balance_loss_mlp": 1.04345584, + "epoch": 0.474028472489419, + "flos": 453237161472.0, + "grad_norm": 0.0467114590315811, + "language_loss": 0.81182402, + "learning_rate": 0.0005661638791573704, + "loss": 0.82261002, + "num_input_tokens_seen": 205357360, + "router_z_loss_mlp": 0.35180664, + "step": 2464, + "time_per_iteration": 2.695479154586792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108113, + "balance_loss_mlp": 1.04582047, + "epoch": 0.47422085417468257, + "flos": 491923694592.0, + "grad_norm": 0.04732653708909472, + "language_loss": 0.86637986, + "learning_rate": 0.0005658550637222164, + "loss": 0.87719119, + "num_input_tokens_seen": 205424352, + "router_z_loss_mlp": 0.35327148, + "step": 2465, + "time_per_iteration": 2.6167092323303223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079374, + "balance_loss_mlp": 1.04365873, + "epoch": 0.47441323585994616, + "flos": 738537467904.0, + "grad_norm": 0.057300064889236176, + "language_loss": 0.82372761, + "learning_rate": 0.0005655462227198592, + "loss": 0.83452135, + "num_input_tokens_seen": 205502912, + "router_z_loss_mlp": 0.35742188, + "step": 2466, + "time_per_iteration": 2.9023492336273193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080505, + "balance_loss_mlp": 1.04509962, + "epoch": 0.4746056175452097, + "flos": 484439270400.0, + "grad_norm": 0.05227273448390526, + "language_loss": 0.83720088, + "learning_rate": 0.0005652373562702016, + "loss": 0.84800589, + "num_input_tokens_seen": 205571168, + "router_z_loss_mlp": 0.35449219, + "step": 2467, + "time_per_iteration": 2.5808918476104736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082419, + "balance_loss_mlp": 1.04715681, + "epoch": 0.4747979992304733, + "flos": 460814717952.0, + "grad_norm": 0.05382206625072039, + "language_loss": 0.88037241, + "learning_rate": 0.000564928464493156, + "loss": 0.89119661, + "num_input_tokens_seen": 205639648, + "router_z_loss_mlp": 0.35302734, + "step": 2468, + "time_per_iteration": 2.5377156734466553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087106, + "balance_loss_mlp": 1.05198669, + "epoch": 0.4749903809157368, + "flos": 864070460928.0, + "grad_norm": 0.0577962749951369, + "language_loss": 0.81768191, + "learning_rate": 0.000564619547508645, + "loss": 0.82855296, + "num_input_tokens_seen": 205721536, + "router_z_loss_mlp": 0.3515625, + "step": 2469, + "time_per_iteration": 3.043691396713257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086191, + "balance_loss_mlp": 1.05042827, + "epoch": 0.4751827626010004, + "flos": 505296451584.0, + "grad_norm": 0.1751373121791138, + "language_loss": 0.83049238, + "learning_rate": 0.0005643106054366008, + "loss": 0.84135431, + "num_input_tokens_seen": 205788512, + "router_z_loss_mlp": 0.3581543, + "step": 2470, + "time_per_iteration": 2.6487743854522705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085754, + "balance_loss_mlp": 1.05118382, + "epoch": 0.47537514428626393, + "flos": 559123540992.0, + "grad_norm": 0.05689297252919276, + "language_loss": 0.79414684, + "learning_rate": 0.000564001638396965, + "loss": 0.80500442, + "num_input_tokens_seen": 205863104, + "router_z_loss_mlp": 0.34594727, + "step": 2471, + "time_per_iteration": 2.749767780303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087834, + "balance_loss_mlp": 1.05228639, + "epoch": 0.4755675259715275, + "flos": 833907211776.0, + "grad_norm": 0.05462179859190678, + "language_loss": 0.81897652, + "learning_rate": 0.0005636926465096897, + "loss": 0.82985491, + "num_input_tokens_seen": 205940688, + "router_z_loss_mlp": 0.35546875, + "step": 2472, + "time_per_iteration": 3.043703556060791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091887, + "balance_loss_mlp": 1.05569541, + "epoch": 0.47575990765679105, + "flos": 507989629440.0, + "grad_norm": 0.050841736172577985, + "language_loss": 0.87258822, + "learning_rate": 0.0005633836298947363, + "loss": 0.88350713, + "num_input_tokens_seen": 206008352, + "router_z_loss_mlp": 0.36206055, + "step": 2473, + "time_per_iteration": 2.564831018447876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098688, + "balance_loss_mlp": 1.06206715, + "epoch": 0.47595228934205464, + "flos": 591566740992.0, + "grad_norm": 0.05674114123782856, + "language_loss": 0.70767033, + "learning_rate": 0.000563074588672075, + "loss": 0.7186572, + "num_input_tokens_seen": 206078240, + "router_z_loss_mlp": 0.3659668, + "step": 2474, + "time_per_iteration": 2.6735401153564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095847, + "balance_loss_mlp": 1.05960727, + "epoch": 0.4761446710273182, + "flos": 580340104704.0, + "grad_norm": 0.055780063244739476, + "language_loss": 0.84891874, + "learning_rate": 0.0005627655229616868, + "loss": 0.85987723, + "num_input_tokens_seen": 206148896, + "router_z_loss_mlp": 0.36230469, + "step": 2475, + "time_per_iteration": 2.672621488571167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096569, + "balance_loss_mlp": 1.05899405, + "epoch": 0.47633705271258175, + "flos": 672597651456.0, + "grad_norm": 0.05102987049441457, + "language_loss": 0.90229654, + "learning_rate": 0.0005624564328835616, + "loss": 0.91326219, + "num_input_tokens_seen": 206223792, + "router_z_loss_mlp": 0.37524414, + "step": 2476, + "time_per_iteration": 2.8432443141937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102069, + "balance_loss_mlp": 1.0635407, + "epoch": 0.47652943439784534, + "flos": 541580788224.0, + "grad_norm": 0.0471064217807047, + "language_loss": 0.84254396, + "learning_rate": 0.0005621473185576986, + "loss": 0.85356462, + "num_input_tokens_seen": 206299376, + "router_z_loss_mlp": 0.38525391, + "step": 2477, + "time_per_iteration": 2.702977180480957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095957, + "balance_loss_mlp": 1.05826259, + "epoch": 0.4767218160831089, + "flos": 524563333632.0, + "grad_norm": 0.057656530584244435, + "language_loss": 0.87137967, + "learning_rate": 0.0005618381801041068, + "loss": 0.88233924, + "num_input_tokens_seen": 206367936, + "router_z_loss_mlp": 0.37670898, + "step": 2478, + "time_per_iteration": 2.603593111038208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098137, + "balance_loss_mlp": 1.05953729, + "epoch": 0.47691419776837246, + "flos": 567789419520.0, + "grad_norm": 0.11168904607405869, + "language_loss": 0.82855433, + "learning_rate": 0.0005615290176428044, + "loss": 0.83953571, + "num_input_tokens_seen": 206438864, + "router_z_loss_mlp": 0.38574219, + "step": 2479, + "time_per_iteration": 2.6339292526245117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109593, + "balance_loss_mlp": 1.05959523, + "epoch": 0.477106579453636, + "flos": 530659689984.0, + "grad_norm": 0.06204032147038535, + "language_loss": 0.85517442, + "learning_rate": 0.0005612198312938187, + "loss": 0.86613369, + "num_input_tokens_seen": 206516656, + "router_z_loss_mlp": 0.36328125, + "step": 2480, + "time_per_iteration": 2.727931261062622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096505, + "balance_loss_mlp": 1.05912077, + "epoch": 0.4772989611388996, + "flos": 593980521984.0, + "grad_norm": 0.07113059060466843, + "language_loss": 0.79093325, + "learning_rate": 0.0005609106211771868, + "loss": 0.80189824, + "num_input_tokens_seen": 206595040, + "router_z_loss_mlp": 0.37402344, + "step": 2481, + "time_per_iteration": 2.8239502906799316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091646, + "balance_loss_mlp": 1.05471444, + "epoch": 0.4774913428241631, + "flos": 544352541696.0, + "grad_norm": 0.07337307686737661, + "language_loss": 0.89208174, + "learning_rate": 0.0005606013874129543, + "loss": 0.90299821, + "num_input_tokens_seen": 206670192, + "router_z_loss_mlp": 0.36914062, + "step": 2482, + "time_per_iteration": 2.7480216026306152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088167, + "balance_loss_mlp": 1.05187941, + "epoch": 0.4776837245094267, + "flos": 539817371136.0, + "grad_norm": 0.16520730257770824, + "language_loss": 0.80029452, + "learning_rate": 0.0005602921301211768, + "loss": 0.81117618, + "num_input_tokens_seen": 206746992, + "router_z_loss_mlp": 0.36303711, + "step": 2483, + "time_per_iteration": 2.6802146434783936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096429, + "balance_loss_mlp": 1.06021321, + "epoch": 0.4778761061946903, + "flos": 471543759360.0, + "grad_norm": 0.07816325562851568, + "language_loss": 0.81835008, + "learning_rate": 0.0005599828494219185, + "loss": 0.82931435, + "num_input_tokens_seen": 206813584, + "router_z_loss_mlp": 0.36206055, + "step": 2484, + "time_per_iteration": 2.546365976333618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094667, + "balance_loss_mlp": 1.05923831, + "epoch": 0.4780684878799538, + "flos": 725769994752.0, + "grad_norm": 0.05627448694129284, + "language_loss": 0.88551247, + "learning_rate": 0.0005596735454352527, + "loss": 0.89645922, + "num_input_tokens_seen": 206885840, + "router_z_loss_mlp": 0.35498047, + "step": 2485, + "time_per_iteration": 2.862647771835327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106974, + "balance_loss_mlp": 1.07054353, + "epoch": 0.4782608695652174, + "flos": 548665132032.0, + "grad_norm": 0.07015146645765026, + "language_loss": 0.85657477, + "learning_rate": 0.0005593642182812619, + "loss": 0.86764455, + "num_input_tokens_seen": 206955104, + "router_z_loss_mlp": 0.36425781, + "step": 2486, + "time_per_iteration": 2.609184741973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101968, + "balance_loss_mlp": 1.06558526, + "epoch": 0.47845325125048094, + "flos": 829555333632.0, + "grad_norm": 0.061922125379274766, + "language_loss": 0.83543551, + "learning_rate": 0.0005590548680800378, + "loss": 0.84645522, + "num_input_tokens_seen": 207039792, + "router_z_loss_mlp": 0.36401367, + "step": 2487, + "time_per_iteration": 3.089769124984741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110265, + "balance_loss_mlp": 1.0746448, + "epoch": 0.4786456329357445, + "flos": 513889546752.0, + "grad_norm": 0.2189409026834594, + "language_loss": 0.76099992, + "learning_rate": 0.0005587454949516804, + "loss": 0.77210259, + "num_input_tokens_seen": 207115632, + "router_z_loss_mlp": 0.35644531, + "step": 2488, + "time_per_iteration": 2.751055955886841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110821, + "balance_loss_mlp": 1.07187533, + "epoch": 0.47883801462100806, + "flos": 564392033280.0, + "grad_norm": 0.10409544878795325, + "language_loss": 0.87659556, + "learning_rate": 0.0005584360990162993, + "loss": 0.88767767, + "num_input_tokens_seen": 207184336, + "router_z_loss_mlp": 0.36376953, + "step": 2489, + "time_per_iteration": 2.6652133464813232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113142, + "balance_loss_mlp": 1.07563877, + "epoch": 0.47903039630627164, + "flos": 579296862720.0, + "grad_norm": 0.09667813376582209, + "language_loss": 0.8484993, + "learning_rate": 0.0005581266803940124, + "loss": 0.8596307, + "num_input_tokens_seen": 207258720, + "router_z_loss_mlp": 0.375, + "step": 2490, + "time_per_iteration": 2.736374616622925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119685, + "balance_loss_mlp": 1.08206201, + "epoch": 0.47922277799153523, + "flos": 618667255296.0, + "grad_norm": 0.050098276566308, + "language_loss": 0.87162292, + "learning_rate": 0.0005578172392049471, + "loss": 0.88281971, + "num_input_tokens_seen": 207329216, + "router_z_loss_mlp": 0.37573242, + "step": 2491, + "time_per_iteration": 2.7753453254699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011097, + "balance_loss_mlp": 1.07307923, + "epoch": 0.47941515967679876, + "flos": 639352728576.0, + "grad_norm": 0.06461059150776577, + "language_loss": 0.83998954, + "learning_rate": 0.0005575077755692386, + "loss": 0.85108656, + "num_input_tokens_seen": 207403712, + "router_z_loss_mlp": 0.3659668, + "step": 2492, + "time_per_iteration": 2.788609266281128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113104, + "balance_loss_mlp": 1.07595801, + "epoch": 0.47960754136206235, + "flos": 519561091584.0, + "grad_norm": 0.0557937811773086, + "language_loss": 0.86232179, + "learning_rate": 0.0005571982896070316, + "loss": 0.87345278, + "num_input_tokens_seen": 207477120, + "router_z_loss_mlp": 0.37158203, + "step": 2493, + "time_per_iteration": 2.6394574642181396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107707, + "balance_loss_mlp": 1.07111025, + "epoch": 0.4797999230473259, + "flos": 474798551040.0, + "grad_norm": 0.0598408121702559, + "language_loss": 0.90174985, + "learning_rate": 0.0005568887814384792, + "loss": 0.9128269, + "num_input_tokens_seen": 207544592, + "router_z_loss_mlp": 0.3659668, + "step": 2494, + "time_per_iteration": 2.534224033355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111594, + "balance_loss_mlp": 1.0754025, + "epoch": 0.47999230473258947, + "flos": 531766950912.0, + "grad_norm": 0.07246176028888049, + "language_loss": 0.87038457, + "learning_rate": 0.000556579251183743, + "loss": 0.88150048, + "num_input_tokens_seen": 207613808, + "router_z_loss_mlp": 0.36230469, + "step": 2495, + "time_per_iteration": 2.6398251056671143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094859, + "balance_loss_mlp": 1.05802298, + "epoch": 0.480184686417853, + "flos": 601207460352.0, + "grad_norm": 0.06271692106547645, + "language_loss": 0.79938626, + "learning_rate": 0.0005562696989629936, + "loss": 0.8103348, + "num_input_tokens_seen": 207684464, + "router_z_loss_mlp": 0.3684082, + "step": 2496, + "time_per_iteration": 2.6642816066741943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093996, + "balance_loss_mlp": 1.05766106, + "epoch": 0.4803770681031166, + "flos": 527931606528.0, + "grad_norm": 0.05594777531112506, + "language_loss": 0.82110333, + "learning_rate": 0.0005559601248964095, + "loss": 0.83204329, + "num_input_tokens_seen": 207754016, + "router_z_loss_mlp": 0.36352539, + "step": 2497, + "time_per_iteration": 2.636070966720581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093639, + "balance_loss_mlp": 1.05739903, + "epoch": 0.4805694497883801, + "flos": 510934500864.0, + "grad_norm": 0.054324508936697755, + "language_loss": 0.85873795, + "learning_rate": 0.0005556505291041783, + "loss": 0.86967432, + "num_input_tokens_seen": 207827104, + "router_z_loss_mlp": 0.36254883, + "step": 2498, + "time_per_iteration": 2.7246336936950684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094125, + "balance_loss_mlp": 1.05757546, + "epoch": 0.4807618314736437, + "flos": 600027416064.0, + "grad_norm": 0.37566577491106196, + "language_loss": 0.84318507, + "learning_rate": 0.0005553409117064954, + "loss": 0.85412627, + "num_input_tokens_seen": 207907824, + "router_z_loss_mlp": 0.36547852, + "step": 2499, + "time_per_iteration": 2.8535146713256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105189, + "balance_loss_mlp": 1.06770992, + "epoch": 0.4809542131589073, + "flos": 568700241408.0, + "grad_norm": 0.05235544022747109, + "language_loss": 0.84675509, + "learning_rate": 0.0005550312728235654, + "loss": 0.85780698, + "num_input_tokens_seen": 207975632, + "router_z_loss_mlp": 0.37475586, + "step": 2500, + "time_per_iteration": 2.691314697265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118964, + "balance_loss_mlp": 1.08138871, + "epoch": 0.4811465948441708, + "flos": 575703037440.0, + "grad_norm": 0.0667425977867665, + "language_loss": 0.83709896, + "learning_rate": 0.0005547216125756003, + "loss": 0.84828854, + "num_input_tokens_seen": 208048000, + "router_z_loss_mlp": 0.37573242, + "step": 2501, + "time_per_iteration": 2.7381327152252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126097, + "balance_loss_mlp": 1.08754468, + "epoch": 0.4813389765294344, + "flos": 823508439552.0, + "grad_norm": 0.052606522983796165, + "language_loss": 0.82174253, + "learning_rate": 0.0005544119310828211, + "loss": 0.83300352, + "num_input_tokens_seen": 208132592, + "router_z_loss_mlp": 0.38549805, + "step": 2502, + "time_per_iteration": 3.072216272354126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134856, + "balance_loss_mlp": 1.09632754, + "epoch": 0.48153135821469795, + "flos": 635240959488.0, + "grad_norm": 0.048230358167368766, + "language_loss": 0.84706873, + "learning_rate": 0.0005541022284654568, + "loss": 0.85841727, + "num_input_tokens_seen": 208215824, + "router_z_loss_mlp": 0.38525391, + "step": 2503, + "time_per_iteration": 2.916139602661133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128897, + "balance_loss_mlp": 1.09051132, + "epoch": 0.48172373989996153, + "flos": 503450076672.0, + "grad_norm": 0.07897645884633452, + "language_loss": 0.84086657, + "learning_rate": 0.0005537925048437446, + "loss": 0.85215557, + "num_input_tokens_seen": 208284304, + "router_z_loss_mlp": 0.38354492, + "step": 2504, + "time_per_iteration": 2.5921871662139893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110906, + "balance_loss_mlp": 1.09278584, + "epoch": 0.48191612158522507, + "flos": 1531563821568.0, + "grad_norm": 0.0372588251023387, + "language_loss": 0.75751472, + "learning_rate": 0.00055348276033793, + "loss": 0.76862371, + "num_input_tokens_seen": 208510224, + "router_z_loss_mlp": 0.18164062, + "step": 2505, + "time_per_iteration": 4.9559855461120605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141303, + "balance_loss_mlp": 1.10132027, + "epoch": 0.48210850327048865, + "flos": 702078451200.0, + "grad_norm": 0.058816464552035166, + "language_loss": 0.88463128, + "learning_rate": 0.0005531729950682664, + "loss": 0.89604431, + "num_input_tokens_seen": 208596816, + "router_z_loss_mlp": 0.3996582, + "step": 2506, + "time_per_iteration": 3.0114240646362305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132181, + "balance_loss_mlp": 1.09353316, + "epoch": 0.4823008849557522, + "flos": 439548691968.0, + "grad_norm": 0.06626147096234755, + "language_loss": 0.84781104, + "learning_rate": 0.000552863209155015, + "loss": 0.85913289, + "num_input_tokens_seen": 208659616, + "router_z_loss_mlp": 0.38598633, + "step": 2507, + "time_per_iteration": 2.5784101486206055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113394, + "balance_loss_mlp": 1.09390914, + "epoch": 0.48249326664101577, + "flos": 471622334976.0, + "grad_norm": 0.05712589242287889, + "language_loss": 0.82110274, + "learning_rate": 0.0005525534027184461, + "loss": 0.83244216, + "num_input_tokens_seen": 208728080, + "router_z_loss_mlp": 0.40014648, + "step": 2508, + "time_per_iteration": 2.552065372467041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132678, + "balance_loss_mlp": 1.09395885, + "epoch": 0.48268564832627936, + "flos": 562954503168.0, + "grad_norm": 0.04979156125943264, + "language_loss": 0.82958996, + "learning_rate": 0.0005522435758788365, + "loss": 0.84091675, + "num_input_tokens_seen": 208803376, + "router_z_loss_mlp": 0.38696289, + "step": 2509, + "time_per_iteration": 2.727841854095459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121662, + "balance_loss_mlp": 1.08210802, + "epoch": 0.4828780300115429, + "flos": 629298782208.0, + "grad_norm": 0.054057791094232886, + "language_loss": 0.79695261, + "learning_rate": 0.0005519337287564721, + "loss": 0.80816925, + "num_input_tokens_seen": 208876656, + "router_z_loss_mlp": 0.39526367, + "step": 2510, + "time_per_iteration": 2.841032028198242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111392, + "balance_loss_mlp": 1.07582068, + "epoch": 0.4830704116968065, + "flos": 631562766336.0, + "grad_norm": 0.0770242195625866, + "language_loss": 0.83640802, + "learning_rate": 0.000551623861471646, + "loss": 0.84754717, + "num_input_tokens_seen": 208950224, + "router_z_loss_mlp": 0.38061523, + "step": 2511, + "time_per_iteration": 2.7330808639526367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051582, + "balance_loss_mlp": 1.03489304, + "epoch": 0.48326279338207, + "flos": 1568434503168.0, + "grad_norm": 0.02207943535017646, + "language_loss": 0.78818834, + "learning_rate": 0.0005513139741446594, + "loss": 0.79870415, + "num_input_tokens_seen": 209173984, + "router_z_loss_mlp": 0.16699219, + "step": 2512, + "time_per_iteration": 4.847305536270142 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119635, + "balance_loss_mlp": 1.08015239, + "epoch": 0.4834551750673336, + "flos": 508989201408.0, + "grad_norm": 0.07604353740704149, + "language_loss": 0.86230296, + "learning_rate": 0.0005510040668958211, + "loss": 0.87349927, + "num_input_tokens_seen": 209242832, + "router_z_loss_mlp": 0.39453125, + "step": 2513, + "time_per_iteration": 2.6358695030212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039681, + "balance_loss_mlp": 1.02423155, + "epoch": 0.48364755675259713, + "flos": 1527875453952.0, + "grad_norm": 0.016719139942629795, + "language_loss": 0.77760583, + "learning_rate": 0.0005506941398454483, + "loss": 0.78800267, + "num_input_tokens_seen": 209473520, + "router_z_loss_mlp": 0.15429688, + "step": 2514, + "time_per_iteration": 4.8266448974609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108341, + "balance_loss_mlp": 1.06895423, + "epoch": 0.4838399384378607, + "flos": 564726684672.0, + "grad_norm": 0.05692617769518991, + "language_loss": 0.8306818, + "learning_rate": 0.0005503841931138645, + "loss": 0.84176517, + "num_input_tokens_seen": 209544208, + "router_z_loss_mlp": 0.39355469, + "step": 2515, + "time_per_iteration": 4.18599271774292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108623, + "balance_loss_mlp": 1.07073843, + "epoch": 0.4840323201231243, + "flos": 387479227392.0, + "grad_norm": 0.0681425082817114, + "language_loss": 0.81703341, + "learning_rate": 0.0005500742268214025, + "loss": 0.82811964, + "num_input_tokens_seen": 209607408, + "router_z_loss_mlp": 0.37841797, + "step": 2516, + "time_per_iteration": 2.4660089015960693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109531, + "balance_loss_mlp": 1.07116938, + "epoch": 0.48422470180838784, + "flos": 630701406720.0, + "grad_norm": 0.09015941461472031, + "language_loss": 0.85304928, + "learning_rate": 0.0005497642410884014, + "loss": 0.86414456, + "num_input_tokens_seen": 209683392, + "router_z_loss_mlp": 0.38305664, + "step": 2517, + "time_per_iteration": 2.8147974014282227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108486, + "balance_loss_mlp": 1.06845522, + "epoch": 0.4844170834936514, + "flos": 498955603968.0, + "grad_norm": 0.05998889999991439, + "language_loss": 0.8499558, + "learning_rate": 0.0005494542360352085, + "loss": 0.86104071, + "num_input_tokens_seen": 209753184, + "router_z_loss_mlp": 0.40014648, + "step": 2518, + "time_per_iteration": 2.639248847961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101242, + "balance_loss_mlp": 1.06335747, + "epoch": 0.48460946517891496, + "flos": 550798106112.0, + "grad_norm": 0.04916831458391579, + "language_loss": 0.85637897, + "learning_rate": 0.0005491442117821783, + "loss": 0.86739141, + "num_input_tokens_seen": 209829568, + "router_z_loss_mlp": 0.37866211, + "step": 2519, + "time_per_iteration": 2.7024173736572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101572, + "balance_loss_mlp": 1.06275773, + "epoch": 0.48480184686417854, + "flos": 529123235328.0, + "grad_norm": 0.05557918275255021, + "language_loss": 0.87415975, + "learning_rate": 0.0005488341684496732, + "loss": 0.88517547, + "num_input_tokens_seen": 209902176, + "router_z_loss_mlp": 0.38793945, + "step": 2520, + "time_per_iteration": 2.6733944416046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094952, + "balance_loss_mlp": 1.05732954, + "epoch": 0.4849942285494421, + "flos": 531630148608.0, + "grad_norm": 0.049677430441928086, + "language_loss": 0.91897535, + "learning_rate": 0.0005485241061580624, + "loss": 0.92992491, + "num_input_tokens_seen": 209969168, + "router_z_loss_mlp": 0.37646484, + "step": 2521, + "time_per_iteration": 2.7186949253082275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087464, + "balance_loss_mlp": 1.04802954, + "epoch": 0.48518661023470566, + "flos": 722231424000.0, + "grad_norm": 0.05969395587297076, + "language_loss": 0.84698212, + "learning_rate": 0.0005482140250277228, + "loss": 0.85785675, + "num_input_tokens_seen": 210049616, + "router_z_loss_mlp": 0.39404297, + "step": 2522, + "time_per_iteration": 3.0005805492401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084269, + "balance_loss_mlp": 1.04664636, + "epoch": 0.4853789919199692, + "flos": 505843508736.0, + "grad_norm": 0.0576168536354582, + "language_loss": 0.87382847, + "learning_rate": 0.0005479039251790387, + "loss": 0.88467115, + "num_input_tokens_seen": 210118512, + "router_z_loss_mlp": 0.37597656, + "step": 2523, + "time_per_iteration": 2.612565517425537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083753, + "balance_loss_mlp": 1.04508114, + "epoch": 0.4855713736052328, + "flos": 660185178624.0, + "grad_norm": 0.05213001441745639, + "language_loss": 0.84754556, + "learning_rate": 0.0005475938067324014, + "loss": 0.85838306, + "num_input_tokens_seen": 210193728, + "router_z_loss_mlp": 0.38647461, + "step": 2524, + "time_per_iteration": 2.7874755859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084646, + "balance_loss_mlp": 1.04556894, + "epoch": 0.48576375529049637, + "flos": 436727476224.0, + "grad_norm": 0.04741211423020534, + "language_loss": 0.83422267, + "learning_rate": 0.0005472836698082098, + "loss": 0.84506917, + "num_input_tokens_seen": 210258832, + "router_z_loss_mlp": 0.39086914, + "step": 2525, + "time_per_iteration": 2.50516676902771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076506, + "balance_loss_mlp": 1.03764343, + "epoch": 0.4859561369757599, + "flos": 581424044544.0, + "grad_norm": 0.04357292691167825, + "language_loss": 0.84170592, + "learning_rate": 0.0005469735145268694, + "loss": 0.85247099, + "num_input_tokens_seen": 210335280, + "router_z_loss_mlp": 0.38818359, + "step": 2526, + "time_per_iteration": 2.7474558353424072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076384, + "balance_loss_mlp": 1.03723574, + "epoch": 0.4861485186610235, + "flos": 487723175424.0, + "grad_norm": 0.056946126423794464, + "language_loss": 0.80818385, + "learning_rate": 0.0005466633410087933, + "loss": 0.81894767, + "num_input_tokens_seen": 210407072, + "router_z_loss_mlp": 0.39111328, + "step": 2527, + "time_per_iteration": 2.690655469894409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080703, + "balance_loss_mlp": 1.06363261, + "epoch": 0.486340900346287, + "flos": 1556893564416.0, + "grad_norm": 0.03973044492620415, + "language_loss": 0.77260822, + "learning_rate": 0.0005463531493744017, + "loss": 0.78341526, + "num_input_tokens_seen": 210644544, + "router_z_loss_mlp": 0.17089844, + "step": 2528, + "time_per_iteration": 4.852689981460571 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076, + "balance_loss_mlp": 1.03723347, + "epoch": 0.4865332820315506, + "flos": 482760221184.0, + "grad_norm": 0.04657742417719492, + "language_loss": 0.88156307, + "learning_rate": 0.0005460429397441214, + "loss": 0.89232314, + "num_input_tokens_seen": 210711760, + "router_z_loss_mlp": 0.38720703, + "step": 2529, + "time_per_iteration": 2.55281662940979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079921, + "balance_loss_mlp": 1.04053402, + "epoch": 0.48672566371681414, + "flos": 535548450816.0, + "grad_norm": 0.06549810250084472, + "language_loss": 0.86653185, + "learning_rate": 0.0005457327122383866, + "loss": 0.87733108, + "num_input_tokens_seen": 210783040, + "router_z_loss_mlp": 0.39379883, + "step": 2530, + "time_per_iteration": 2.671656847000122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035467, + "balance_loss_mlp": 1.01963639, + "epoch": 0.4869180454020777, + "flos": 1411876901376.0, + "grad_norm": 0.025637836045087663, + "language_loss": 0.74636483, + "learning_rate": 0.0005454224669776385, + "loss": 0.75671959, + "num_input_tokens_seen": 211002128, + "router_z_loss_mlp": 0.15820312, + "step": 2531, + "time_per_iteration": 4.814793348312378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081396, + "balance_loss_mlp": 1.04322505, + "epoch": 0.48711042708734126, + "flos": 572836741632.0, + "grad_norm": 0.048652424424379774, + "language_loss": 0.7607469, + "learning_rate": 0.0005451122040823244, + "loss": 0.77156091, + "num_input_tokens_seen": 211080080, + "router_z_loss_mlp": 0.38134766, + "step": 2532, + "time_per_iteration": 2.7569382190704346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081141, + "balance_loss_mlp": 1.04246926, + "epoch": 0.48730280877260485, + "flos": 626231665152.0, + "grad_norm": 0.05261384345268123, + "language_loss": 0.76949328, + "learning_rate": 0.0005448019236728997, + "loss": 0.78030467, + "num_input_tokens_seen": 211162944, + "router_z_loss_mlp": 0.38647461, + "step": 2533, + "time_per_iteration": 2.8791191577911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082063, + "balance_loss_mlp": 1.04439306, + "epoch": 0.48749519045786843, + "flos": 512233818624.0, + "grad_norm": 0.05361284003065004, + "language_loss": 0.84639871, + "learning_rate": 0.0005444916258698255, + "loss": 0.85721934, + "num_input_tokens_seen": 211230448, + "router_z_loss_mlp": 0.37670898, + "step": 2534, + "time_per_iteration": 2.584188938140869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108354, + "balance_loss_mlp": 1.04548812, + "epoch": 0.48768757214313196, + "flos": 525149678592.0, + "grad_norm": 0.044479444876285516, + "language_loss": 0.85999918, + "learning_rate": 0.0005441813107935704, + "loss": 0.87083459, + "num_input_tokens_seen": 211301248, + "router_z_loss_mlp": 0.38037109, + "step": 2535, + "time_per_iteration": 2.63484787940979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089581, + "balance_loss_mlp": 1.05141044, + "epoch": 0.48787995382839555, + "flos": 504784300032.0, + "grad_norm": 0.05225590764746468, + "language_loss": 0.85801542, + "learning_rate": 0.0005438709785646091, + "loss": 0.86891127, + "num_input_tokens_seen": 211369888, + "router_z_loss_mlp": 0.38110352, + "step": 2536, + "time_per_iteration": 2.5857274532318115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087898, + "balance_loss_mlp": 1.0496794, + "epoch": 0.4880723355136591, + "flos": 574904286720.0, + "grad_norm": 0.05427082704851873, + "language_loss": 0.8654719, + "learning_rate": 0.0005435606293034234, + "loss": 0.87635088, + "num_input_tokens_seen": 211441808, + "router_z_loss_mlp": 0.3815918, + "step": 2537, + "time_per_iteration": 2.6441421508789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082535, + "balance_loss_mlp": 1.04498374, + "epoch": 0.48826471719892267, + "flos": 561172147200.0, + "grad_norm": 0.0666705066547564, + "language_loss": 0.84424317, + "learning_rate": 0.0005432502631305016, + "loss": 0.8550685, + "num_input_tokens_seen": 211511216, + "router_z_loss_mlp": 0.37548828, + "step": 2538, + "time_per_iteration": 2.657888174057007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081573, + "balance_loss_mlp": 1.04383135, + "epoch": 0.4884570988841862, + "flos": 725849980416.0, + "grad_norm": 0.04200092081923836, + "language_loss": 0.83068514, + "learning_rate": 0.0005429398801663386, + "loss": 0.84150088, + "num_input_tokens_seen": 211589264, + "router_z_loss_mlp": 0.37744141, + "step": 2539, + "time_per_iteration": 2.926213264465332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085607, + "balance_loss_mlp": 1.04726946, + "epoch": 0.4886494805694498, + "flos": 430794063360.0, + "grad_norm": 0.05775520457519848, + "language_loss": 0.82975113, + "learning_rate": 0.0005426294805314355, + "loss": 0.84060717, + "num_input_tokens_seen": 211652928, + "router_z_loss_mlp": 0.38305664, + "step": 2540, + "time_per_iteration": 2.476100444793701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087326, + "balance_loss_mlp": 1.0497514, + "epoch": 0.4888418622547134, + "flos": 672673254912.0, + "grad_norm": 0.050739997063638825, + "language_loss": 0.79934752, + "learning_rate": 0.0005423190643463003, + "loss": 0.81022084, + "num_input_tokens_seen": 211741664, + "router_z_loss_mlp": 0.37573242, + "step": 2541, + "time_per_iteration": 2.983567953109741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108794, + "balance_loss_mlp": 1.05005538, + "epoch": 0.4890342439399769, + "flos": 541639014912.0, + "grad_norm": 0.05834464255250002, + "language_loss": 0.82589471, + "learning_rate": 0.0005420086317314473, + "loss": 0.83677411, + "num_input_tokens_seen": 211809136, + "router_z_loss_mlp": 0.37841797, + "step": 2542, + "time_per_iteration": 2.6762986183166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088795, + "balance_loss_mlp": 1.04957485, + "epoch": 0.4892266256252405, + "flos": 590380904448.0, + "grad_norm": 0.056502349447813176, + "language_loss": 0.8105309, + "learning_rate": 0.0005416981828073971, + "loss": 0.82141888, + "num_input_tokens_seen": 211883136, + "router_z_loss_mlp": 0.39208984, + "step": 2543, + "time_per_iteration": 2.798063039779663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111053, + "balance_loss_mlp": 1.0975107, + "epoch": 0.48941900731050403, + "flos": 1515460008960.0, + "grad_norm": 0.049245887260565786, + "language_loss": 0.77115011, + "learning_rate": 0.0005413877176946765, + "loss": 0.78226066, + "num_input_tokens_seen": 212117488, + "router_z_loss_mlp": 0.13574219, + "step": 2544, + "time_per_iteration": 4.86514949798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084158, + "balance_loss_mlp": 1.04632151, + "epoch": 0.4896113889957676, + "flos": 470327399424.0, + "grad_norm": 0.0633775200016376, + "language_loss": 0.84418309, + "learning_rate": 0.000541077236513819, + "loss": 0.85502464, + "num_input_tokens_seen": 212181952, + "router_z_loss_mlp": 0.37792969, + "step": 2545, + "time_per_iteration": 2.590907335281372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085016, + "balance_loss_mlp": 1.04698849, + "epoch": 0.48980377068103115, + "flos": 496310478336.0, + "grad_norm": 0.05034497234802515, + "language_loss": 0.82352334, + "learning_rate": 0.0005407667393853638, + "loss": 0.83437347, + "num_input_tokens_seen": 212252608, + "router_z_loss_mlp": 0.37988281, + "step": 2546, + "time_per_iteration": 2.6386098861694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079303, + "balance_loss_mlp": 1.04187095, + "epoch": 0.48999615236629473, + "flos": 692539628544.0, + "grad_norm": 0.05625529240804266, + "language_loss": 0.83240199, + "learning_rate": 0.0005404562264298569, + "loss": 0.84319508, + "num_input_tokens_seen": 212328560, + "router_z_loss_mlp": 0.37426758, + "step": 2547, + "time_per_iteration": 2.8305716514587402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083931, + "balance_loss_mlp": 1.04459167, + "epoch": 0.49018853405155827, + "flos": 541432401408.0, + "grad_norm": 0.05508159523705553, + "language_loss": 0.83712828, + "learning_rate": 0.0005401456977678498, + "loss": 0.84796757, + "num_input_tokens_seen": 212399616, + "router_z_loss_mlp": 0.39306641, + "step": 2548, + "time_per_iteration": 2.647726058959961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079917, + "balance_loss_mlp": 1.0415554, + "epoch": 0.49038091573682185, + "flos": 695304027648.0, + "grad_norm": 0.06449580544702971, + "language_loss": 0.77341408, + "learning_rate": 0.0005398351535199008, + "loss": 0.7842133, + "num_input_tokens_seen": 212482352, + "router_z_loss_mlp": 0.38330078, + "step": 2549, + "time_per_iteration": 3.0876851081848145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087981, + "balance_loss_mlp": 1.04976225, + "epoch": 0.49057329742208544, + "flos": 596614063104.0, + "grad_norm": 0.053976289964032184, + "language_loss": 0.83800292, + "learning_rate": 0.0005395245938065735, + "loss": 0.84888279, + "num_input_tokens_seen": 212559504, + "router_z_loss_mlp": 0.38183594, + "step": 2550, + "time_per_iteration": 2.804429769515991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082681, + "balance_loss_mlp": 1.04372382, + "epoch": 0.490765679107349, + "flos": 513154814976.0, + "grad_norm": 0.06066311696873723, + "language_loss": 0.8244735, + "learning_rate": 0.0005392140187484379, + "loss": 0.83530027, + "num_input_tokens_seen": 212625664, + "router_z_loss_mlp": 0.38916016, + "step": 2551, + "time_per_iteration": 2.597642421722412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078468, + "balance_loss_mlp": 1.04001141, + "epoch": 0.49095806079261256, + "flos": 629298782208.0, + "grad_norm": 0.0491826620467597, + "language_loss": 0.89348012, + "learning_rate": 0.0005389034284660701, + "loss": 0.90426481, + "num_input_tokens_seen": 212702000, + "router_z_loss_mlp": 0.3840332, + "step": 2552, + "time_per_iteration": 2.7942707538604736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081847, + "balance_loss_mlp": 1.04231691, + "epoch": 0.4911504424778761, + "flos": 914938122240.0, + "grad_norm": 0.07682264853807555, + "language_loss": 0.82114685, + "learning_rate": 0.000538592823080052, + "loss": 0.83196527, + "num_input_tokens_seen": 212785376, + "router_z_loss_mlp": 0.39501953, + "step": 2553, + "time_per_iteration": 3.1190438270568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079314, + "balance_loss_mlp": 1.04154849, + "epoch": 0.4913428241631397, + "flos": 438716445696.0, + "grad_norm": 0.05210768805810414, + "language_loss": 0.85049736, + "learning_rate": 0.000538282202710971, + "loss": 0.86129045, + "num_input_tokens_seen": 212848176, + "router_z_loss_mlp": 0.37768555, + "step": 2554, + "time_per_iteration": 2.5379602909088135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073496, + "balance_loss_mlp": 1.03613555, + "epoch": 0.4915352058484032, + "flos": 635806955520.0, + "grad_norm": 0.06005848629390598, + "language_loss": 0.81770831, + "learning_rate": 0.000537971567479421, + "loss": 0.82844329, + "num_input_tokens_seen": 212917888, + "router_z_loss_mlp": 0.37329102, + "step": 2555, + "time_per_iteration": 2.7403476238250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107527, + "balance_loss_mlp": 1.0371232, + "epoch": 0.4917275875336668, + "flos": 504272148480.0, + "grad_norm": 0.05941814666543565, + "language_loss": 0.87821388, + "learning_rate": 0.0005376609175060011, + "loss": 0.88896656, + "num_input_tokens_seen": 212986288, + "router_z_loss_mlp": 0.38110352, + "step": 2556, + "time_per_iteration": 2.5817511081695557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069861, + "balance_loss_mlp": 1.03192806, + "epoch": 0.49191996921893033, + "flos": 654251765760.0, + "grad_norm": 0.06032782721564886, + "language_loss": 0.80381918, + "learning_rate": 0.0005373502529113162, + "loss": 0.81451786, + "num_input_tokens_seen": 213059504, + "router_z_loss_mlp": 0.37915039, + "step": 2557, + "time_per_iteration": 2.7871665954589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077426, + "balance_loss_mlp": 1.03939795, + "epoch": 0.4921123509041939, + "flos": 492101194752.0, + "grad_norm": 0.054204772274654804, + "language_loss": 0.81538296, + "learning_rate": 0.0005370395738159773, + "loss": 0.82615721, + "num_input_tokens_seen": 213129984, + "router_z_loss_mlp": 0.38012695, + "step": 2558, + "time_per_iteration": 2.667402744293213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071328, + "balance_loss_mlp": 1.03368151, + "epoch": 0.4923047325894575, + "flos": 545907935232.0, + "grad_norm": 0.05883600684350466, + "language_loss": 0.82952267, + "learning_rate": 0.0005367288803406003, + "loss": 0.84023595, + "num_input_tokens_seen": 213199184, + "router_z_loss_mlp": 0.3762207, + "step": 2559, + "time_per_iteration": 2.626527786254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078144, + "balance_loss_mlp": 1.03937757, + "epoch": 0.49249711427472104, + "flos": 596195043840.0, + "grad_norm": 0.05079842806629368, + "language_loss": 0.8133688, + "learning_rate": 0.0005364181726058073, + "loss": 0.82415026, + "num_input_tokens_seen": 213272480, + "router_z_loss_mlp": 0.38720703, + "step": 2560, + "time_per_iteration": 2.6742072105407715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079994, + "balance_loss_mlp": 1.0413698, + "epoch": 0.4926894959599846, + "flos": 497580682752.0, + "grad_norm": 0.07402195837362009, + "language_loss": 0.8230688, + "learning_rate": 0.0005361074507322261, + "loss": 0.83386874, + "num_input_tokens_seen": 213338704, + "router_z_loss_mlp": 0.38574219, + "step": 2561, + "time_per_iteration": 2.5911788940429688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079985, + "balance_loss_mlp": 1.04226756, + "epoch": 0.49288187764524816, + "flos": 535868545536.0, + "grad_norm": 0.051530448614758514, + "language_loss": 0.81235635, + "learning_rate": 0.000535796714840489, + "loss": 0.82315624, + "num_input_tokens_seen": 213406016, + "router_z_loss_mlp": 0.37695312, + "step": 2562, + "time_per_iteration": 2.607124090194702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108504, + "balance_loss_mlp": 1.04694033, + "epoch": 0.49307425933051174, + "flos": 641267504640.0, + "grad_norm": 0.0614534794373117, + "language_loss": 0.83895457, + "learning_rate": 0.0005354859650512348, + "loss": 0.84980506, + "num_input_tokens_seen": 213474016, + "router_z_loss_mlp": 0.38037109, + "step": 2563, + "time_per_iteration": 2.757147789001465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087423, + "balance_loss_mlp": 1.04889464, + "epoch": 0.4932666410157753, + "flos": 516000761856.0, + "grad_norm": 0.06049941260890761, + "language_loss": 0.87262708, + "learning_rate": 0.0005351752014851074, + "loss": 0.88350135, + "num_input_tokens_seen": 213539696, + "router_z_loss_mlp": 0.38500977, + "step": 2564, + "time_per_iteration": 2.546381711959839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090812, + "balance_loss_mlp": 1.05190217, + "epoch": 0.49345902270103886, + "flos": 601217634816.0, + "grad_norm": 0.06075916964602771, + "language_loss": 0.83327425, + "learning_rate": 0.0005348644242627553, + "loss": 0.84418237, + "num_input_tokens_seen": 213609504, + "router_z_loss_mlp": 0.38867188, + "step": 2565, + "time_per_iteration": 2.737234592437744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080753, + "balance_loss_mlp": 1.06368184, + "epoch": 0.49365140438630245, + "flos": 1492849585152.0, + "grad_norm": 0.03629255242441858, + "language_loss": 0.75286627, + "learning_rate": 0.0005345536335048336, + "loss": 0.76367378, + "num_input_tokens_seen": 213846064, + "router_z_loss_mlp": 0.17089844, + "step": 2566, + "time_per_iteration": 4.96724271774292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093722, + "balance_loss_mlp": 1.05462122, + "epoch": 0.493843786071566, + "flos": 629303164416.0, + "grad_norm": 0.05641611710897844, + "language_loss": 0.81215966, + "learning_rate": 0.0005342428293320013, + "loss": 0.82309687, + "num_input_tokens_seen": 213923216, + "router_z_loss_mlp": 0.390625, + "step": 2567, + "time_per_iteration": 2.75099778175354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085401, + "balance_loss_mlp": 1.04722989, + "epoch": 0.49403616775682957, + "flos": 617283569664.0, + "grad_norm": 0.05682733114828458, + "language_loss": 0.83676398, + "learning_rate": 0.0005339320118649238, + "loss": 0.84761798, + "num_input_tokens_seen": 213994096, + "router_z_loss_mlp": 0.3815918, + "step": 2568, + "time_per_iteration": 2.6829991340637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087145, + "balance_loss_mlp": 1.04945099, + "epoch": 0.4942285494420931, + "flos": 577357355520.0, + "grad_norm": 0.053270861905881636, + "language_loss": 0.86332101, + "learning_rate": 0.000533621181224271, + "loss": 0.87419248, + "num_input_tokens_seen": 214069104, + "router_z_loss_mlp": 0.37646484, + "step": 2569, + "time_per_iteration": 2.777698278427124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092012, + "balance_loss_mlp": 1.0536983, + "epoch": 0.4944209311273567, + "flos": 629899683840.0, + "grad_norm": 0.059449335887268515, + "language_loss": 0.81470358, + "learning_rate": 0.0005333103375307182, + "loss": 0.82562375, + "num_input_tokens_seen": 214150368, + "router_z_loss_mlp": 0.3828125, + "step": 2570, + "time_per_iteration": 2.866680145263672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087838, + "balance_loss_mlp": 1.0502398, + "epoch": 0.4946133128126202, + "flos": 587337108480.0, + "grad_norm": 0.04632852912872097, + "language_loss": 0.86004198, + "learning_rate": 0.0005329994809049451, + "loss": 0.8709203, + "num_input_tokens_seen": 214220112, + "router_z_loss_mlp": 0.37548828, + "step": 2571, + "time_per_iteration": 2.719249963760376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089339, + "balance_loss_mlp": 1.05147839, + "epoch": 0.4948056944978838, + "flos": 583437745152.0, + "grad_norm": 0.05131083950778726, + "language_loss": 0.87596244, + "learning_rate": 0.0005326886114676375, + "loss": 0.88685584, + "num_input_tokens_seen": 214294480, + "router_z_loss_mlp": 0.37866211, + "step": 2572, + "time_per_iteration": 2.7392373085021973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083396, + "balance_loss_mlp": 1.04524934, + "epoch": 0.49499807618314734, + "flos": 481583149056.0, + "grad_norm": 0.0472919496744071, + "language_loss": 0.87958217, + "learning_rate": 0.0005323777293394854, + "loss": 0.89041615, + "num_input_tokens_seen": 214359568, + "router_z_loss_mlp": 0.38110352, + "step": 2573, + "time_per_iteration": 2.531196355819702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078942, + "balance_loss_mlp": 1.04072404, + "epoch": 0.4951904578684109, + "flos": 518714288640.0, + "grad_norm": 0.0452048253819277, + "language_loss": 0.82375443, + "learning_rate": 0.000532066834641184, + "loss": 0.83454382, + "num_input_tokens_seen": 214432032, + "router_z_loss_mlp": 0.38183594, + "step": 2574, + "time_per_iteration": 2.6414644718170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076991, + "balance_loss_mlp": 1.03939271, + "epoch": 0.4953828395536745, + "flos": 535238530560.0, + "grad_norm": 0.0513606490930485, + "language_loss": 0.84946954, + "learning_rate": 0.0005317559274934334, + "loss": 0.86023939, + "num_input_tokens_seen": 214504096, + "router_z_loss_mlp": 0.37573242, + "step": 2575, + "time_per_iteration": 2.764742374420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075902, + "balance_loss_mlp": 1.03904271, + "epoch": 0.49557522123893805, + "flos": 528305545728.0, + "grad_norm": 0.0624025017343203, + "language_loss": 0.80560994, + "learning_rate": 0.0005314450080169382, + "loss": 0.816369, + "num_input_tokens_seen": 214575920, + "router_z_loss_mlp": 0.3684082, + "step": 2576, + "time_per_iteration": 2.594782590866089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078443, + "balance_loss_mlp": 1.04017663, + "epoch": 0.49576760292420163, + "flos": 427780790784.0, + "grad_norm": 0.059991931078834576, + "language_loss": 0.80652928, + "learning_rate": 0.0005311340763324083, + "loss": 0.81731379, + "num_input_tokens_seen": 214641664, + "router_z_loss_mlp": 0.38232422, + "step": 2577, + "time_per_iteration": 2.5488879680633545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107968, + "balance_loss_mlp": 1.04232025, + "epoch": 0.49595998460946517, + "flos": 564968203776.0, + "grad_norm": 0.04956045110382575, + "language_loss": 0.81899893, + "learning_rate": 0.0005308231325605578, + "loss": 0.82979578, + "num_input_tokens_seen": 214711744, + "router_z_loss_mlp": 0.37329102, + "step": 2578, + "time_per_iteration": 2.6677722930908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077103, + "balance_loss_mlp": 1.03905153, + "epoch": 0.49615236629472875, + "flos": 702161409024.0, + "grad_norm": 0.04106026216453222, + "language_loss": 0.76928478, + "learning_rate": 0.0005305121768221061, + "loss": 0.78005582, + "num_input_tokens_seen": 214802256, + "router_z_loss_mlp": 0.38012695, + "step": 2579, + "time_per_iteration": 3.070509672164917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024013, + "balance_loss_mlp": 1.00970817, + "epoch": 0.4963447479799923, + "flos": 1440896573952.0, + "grad_norm": 0.02117966265403326, + "language_loss": 0.75038326, + "learning_rate": 0.000530201209237777, + "loss": 0.76062334, + "num_input_tokens_seen": 215023648, + "router_z_loss_mlp": 0.14257812, + "step": 2580, + "time_per_iteration": 4.802190780639648 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084007, + "balance_loss_mlp": 1.04669428, + "epoch": 0.49653712966525587, + "flos": 537370094592.0, + "grad_norm": 0.04967277918174837, + "language_loss": 0.91594803, + "learning_rate": 0.0005298902299282984, + "loss": 0.92678809, + "num_input_tokens_seen": 215094080, + "router_z_loss_mlp": 0.37304688, + "step": 2581, + "time_per_iteration": 2.5916941165924072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075168, + "balance_loss_mlp": 1.03823721, + "epoch": 0.4967295113505194, + "flos": 607002660864.0, + "grad_norm": 0.058889996692992934, + "language_loss": 0.84090436, + "learning_rate": 0.0005295792390144033, + "loss": 0.85165608, + "num_input_tokens_seen": 215165456, + "router_z_loss_mlp": 0.36889648, + "step": 2582, + "time_per_iteration": 2.731971502304077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077994, + "balance_loss_mlp": 1.04065764, + "epoch": 0.496921893035783, + "flos": 474340243968.0, + "grad_norm": 0.06551304805839393, + "language_loss": 0.83421808, + "learning_rate": 0.0005292682366168294, + "loss": 0.844998, + "num_input_tokens_seen": 215229344, + "router_z_loss_mlp": 0.37304688, + "step": 2583, + "time_per_iteration": 2.575511932373047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071469, + "balance_loss_mlp": 1.03437066, + "epoch": 0.4971142747210466, + "flos": 597180059136.0, + "grad_norm": 0.09149919184070833, + "language_loss": 0.79965729, + "learning_rate": 0.0005289572228563181, + "loss": 0.81037199, + "num_input_tokens_seen": 215305616, + "router_z_loss_mlp": 0.37084961, + "step": 2584, + "time_per_iteration": 2.7206363677978516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107391, + "balance_loss_mlp": 1.03533435, + "epoch": 0.4973066564063101, + "flos": 599321797632.0, + "grad_norm": 0.052533233614156426, + "language_loss": 0.82869196, + "learning_rate": 0.000528646197853616, + "loss": 0.83943105, + "num_input_tokens_seen": 215378128, + "router_z_loss_mlp": 0.38549805, + "step": 2585, + "time_per_iteration": 2.6923370361328125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078928, + "balance_loss_mlp": 1.04097223, + "epoch": 0.4974990380915737, + "flos": 649152009216.0, + "grad_norm": 0.05229001766272028, + "language_loss": 0.85541296, + "learning_rate": 0.0005283351617294735, + "loss": 0.86620224, + "num_input_tokens_seen": 215453536, + "router_z_loss_mlp": 0.37939453, + "step": 2586, + "time_per_iteration": 2.929431915283203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021123, + "balance_loss_mlp": 1.00719905, + "epoch": 0.49769141977683723, + "flos": 1528490912256.0, + "grad_norm": 0.01235864360091676, + "language_loss": 0.7663666, + "learning_rate": 0.0005280241146046456, + "loss": 0.77657783, + "num_input_tokens_seen": 215689440, + "router_z_loss_mlp": 0.13964844, + "step": 2587, + "time_per_iteration": 5.021655082702637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077541, + "balance_loss_mlp": 1.03977549, + "epoch": 0.4978838014621008, + "flos": 536114446848.0, + "grad_norm": 0.05582319417935397, + "language_loss": 0.866669, + "learning_rate": 0.0005277130565998916, + "loss": 0.87744439, + "num_input_tokens_seen": 215759600, + "router_z_loss_mlp": 0.37719727, + "step": 2588, + "time_per_iteration": 2.729919195175171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079282, + "balance_loss_mlp": 1.04163599, + "epoch": 0.49807618314736435, + "flos": 539335742976.0, + "grad_norm": 0.05154521563335112, + "language_loss": 0.81850547, + "learning_rate": 0.0005274019878359748, + "loss": 0.82929826, + "num_input_tokens_seen": 215833920, + "router_z_loss_mlp": 0.3762207, + "step": 2589, + "time_per_iteration": 2.692312240600586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080128, + "balance_loss_mlp": 1.04243433, + "epoch": 0.49826856483262794, + "flos": 542215185408.0, + "grad_norm": 0.0590106194524904, + "language_loss": 0.87004912, + "learning_rate": 0.0005270909084336628, + "loss": 0.88085043, + "num_input_tokens_seen": 215903616, + "router_z_loss_mlp": 0.37695312, + "step": 2590, + "time_per_iteration": 2.684134006500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085832, + "balance_loss_mlp": 1.04637384, + "epoch": 0.4984609465178915, + "flos": 522062212608.0, + "grad_norm": 0.056922673879229405, + "language_loss": 0.89000517, + "learning_rate": 0.0005267798185137276, + "loss": 0.90086353, + "num_input_tokens_seen": 215974832, + "router_z_loss_mlp": 0.39428711, + "step": 2591, + "time_per_iteration": 2.6129040718078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087547, + "balance_loss_mlp": 1.04942417, + "epoch": 0.49865332820315506, + "flos": 574255332864.0, + "grad_norm": 0.05087809825508884, + "language_loss": 0.89274907, + "learning_rate": 0.0005264687181969444, + "loss": 0.90362453, + "num_input_tokens_seen": 216045024, + "router_z_loss_mlp": 0.38085938, + "step": 2592, + "time_per_iteration": 2.7253634929656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087493, + "balance_loss_mlp": 1.04891706, + "epoch": 0.49884570988841864, + "flos": 1013207657472.0, + "grad_norm": 0.06815052907107509, + "language_loss": 0.75056839, + "learning_rate": 0.0005261576076040937, + "loss": 0.76144326, + "num_input_tokens_seen": 216129024, + "router_z_loss_mlp": 0.38525391, + "step": 2593, + "time_per_iteration": 3.2982125282287598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086885, + "balance_loss_mlp": 1.04790401, + "epoch": 0.4990380915736822, + "flos": 559315597824.0, + "grad_norm": 0.05997761702509101, + "language_loss": 0.84464318, + "learning_rate": 0.0005258464868559591, + "loss": 0.85551196, + "num_input_tokens_seen": 216197648, + "router_z_loss_mlp": 0.38964844, + "step": 2594, + "time_per_iteration": 2.650743007659912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087007, + "balance_loss_mlp": 1.04819274, + "epoch": 0.49923047325894576, + "flos": 498708292608.0, + "grad_norm": 0.060987476024219604, + "language_loss": 0.88568228, + "learning_rate": 0.0005255353560733284, + "loss": 0.89655238, + "num_input_tokens_seen": 216263904, + "router_z_loss_mlp": 0.38793945, + "step": 2595, + "time_per_iteration": 2.5599913597106934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041145, + "balance_loss_mlp": 1.02760279, + "epoch": 0.4994228549442093, + "flos": 1495851273216.0, + "grad_norm": 0.01946244961408958, + "language_loss": 0.75578642, + "learning_rate": 0.0005252242153769931, + "loss": 0.76619792, + "num_input_tokens_seen": 216493152, + "router_z_loss_mlp": 0.13574219, + "step": 2596, + "time_per_iteration": 4.769503593444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108652, + "balance_loss_mlp": 1.0481348, + "epoch": 0.4996152366294729, + "flos": 557090901504.0, + "grad_norm": 0.052826274831603945, + "language_loss": 0.83429873, + "learning_rate": 0.0005249130648877492, + "loss": 0.84516394, + "num_input_tokens_seen": 216567216, + "router_z_loss_mlp": 0.38354492, + "step": 2597, + "time_per_iteration": 2.724168300628662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085761, + "balance_loss_mlp": 1.04785287, + "epoch": 0.4998076183147364, + "flos": 415372700160.0, + "grad_norm": 0.05706521232724688, + "language_loss": 0.84317046, + "learning_rate": 0.0005246019047263953, + "loss": 0.85402811, + "num_input_tokens_seen": 216630624, + "router_z_loss_mlp": 0.37841797, + "step": 2598, + "time_per_iteration": 2.4463517665863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081855, + "balance_loss_mlp": 1.04475701, + "epoch": 0.5, + "flos": 467107513344.0, + "grad_norm": 0.6792645039501298, + "language_loss": 0.82562613, + "learning_rate": 0.0005242907350137353, + "loss": 0.83644474, + "num_input_tokens_seen": 216696576, + "router_z_loss_mlp": 0.37060547, + "step": 2599, + "time_per_iteration": 2.560786008834839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099746, + "balance_loss_mlp": 1.06193328, + "epoch": 0.5001923816852636, + "flos": 482460475392.0, + "grad_norm": 0.06436348420044716, + "language_loss": 0.78717571, + "learning_rate": 0.0005239795558705754, + "loss": 0.79817319, + "num_input_tokens_seen": 216767584, + "router_z_loss_mlp": 0.37817383, + "step": 2600, + "time_per_iteration": 2.691749095916748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103949, + "balance_loss_mlp": 1.06613564, + "epoch": 0.5003847633705272, + "flos": 533534750208.0, + "grad_norm": 0.05701005713359991, + "language_loss": 0.89229304, + "learning_rate": 0.0005236683674177264, + "loss": 0.90333253, + "num_input_tokens_seen": 216834320, + "router_z_loss_mlp": 0.37744141, + "step": 2601, + "time_per_iteration": 2.6216700077056885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118846, + "balance_loss_mlp": 1.08053231, + "epoch": 0.5005771450557907, + "flos": 737473876992.0, + "grad_norm": 0.059257141019647214, + "language_loss": 0.82444715, + "learning_rate": 0.0005233571697760021, + "loss": 0.83563566, + "num_input_tokens_seen": 216907312, + "router_z_loss_mlp": 0.3828125, + "step": 2602, + "time_per_iteration": 2.856107473373413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127913, + "balance_loss_mlp": 1.08902669, + "epoch": 0.5007695267410542, + "flos": 778646974464.0, + "grad_norm": 0.08832305121279985, + "language_loss": 0.83020616, + "learning_rate": 0.0005230459630662203, + "loss": 0.84148532, + "num_input_tokens_seen": 216979872, + "router_z_loss_mlp": 0.38842773, + "step": 2603, + "time_per_iteration": 2.954914093017578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133998, + "balance_loss_mlp": 1.09563613, + "epoch": 0.5009619084263178, + "flos": 623192251392.0, + "grad_norm": 0.09845505678723535, + "language_loss": 0.81501806, + "learning_rate": 0.0005227347474092022, + "loss": 0.82635808, + "num_input_tokens_seen": 217054000, + "router_z_loss_mlp": 0.38354492, + "step": 2604, + "time_per_iteration": 2.7330713272094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132886, + "balance_loss_mlp": 1.09223533, + "epoch": 0.5011542901115814, + "flos": 530812459008.0, + "grad_norm": 0.044602380755084235, + "language_loss": 0.83597159, + "learning_rate": 0.0005224235229257724, + "loss": 0.84730041, + "num_input_tokens_seen": 217126784, + "router_z_loss_mlp": 0.40649414, + "step": 2605, + "time_per_iteration": 2.682590961456299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134983, + "balance_loss_mlp": 1.09485674, + "epoch": 0.5013466717968449, + "flos": 527262303744.0, + "grad_norm": 0.06172408458695075, + "language_loss": 0.86453664, + "learning_rate": 0.0005221122897367589, + "loss": 0.87588644, + "num_input_tokens_seen": 217203056, + "router_z_loss_mlp": 0.40136719, + "step": 2606, + "time_per_iteration": 2.7657558917999268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130017, + "balance_loss_mlp": 1.08970046, + "epoch": 0.5015390534821085, + "flos": 565750987776.0, + "grad_norm": 0.060573415362282904, + "language_loss": 0.80914944, + "learning_rate": 0.0005218010479629932, + "loss": 0.82044959, + "num_input_tokens_seen": 217273280, + "router_z_loss_mlp": 0.40332031, + "step": 2607, + "time_per_iteration": 2.650521755218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137201, + "balance_loss_mlp": 1.09564483, + "epoch": 0.5017314351673721, + "flos": 566430465024.0, + "grad_norm": 0.062462394429491495, + "language_loss": 0.82171839, + "learning_rate": 0.0005214897977253102, + "loss": 0.83309042, + "num_input_tokens_seen": 217345568, + "router_z_loss_mlp": 0.41552734, + "step": 2608, + "time_per_iteration": 2.679605484008789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135192, + "balance_loss_mlp": 1.09222913, + "epoch": 0.5019238168526357, + "flos": 522018542592.0, + "grad_norm": 0.04524020883908707, + "language_loss": 0.84520149, + "learning_rate": 0.0005211785391445473, + "loss": 0.85655344, + "num_input_tokens_seen": 217422848, + "router_z_loss_mlp": 0.4296875, + "step": 2609, + "time_per_iteration": 2.727029323577881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133676, + "balance_loss_mlp": 1.09128523, + "epoch": 0.5021161985378992, + "flos": 641135084544.0, + "grad_norm": 0.0754859849582408, + "language_loss": 0.79190326, + "learning_rate": 0.0005208672723415467, + "loss": 0.80324006, + "num_input_tokens_seen": 217502896, + "router_z_loss_mlp": 0.42358398, + "step": 2610, + "time_per_iteration": 2.7925145626068115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132046, + "balance_loss_mlp": 1.09058475, + "epoch": 0.5023085802231627, + "flos": 591000744960.0, + "grad_norm": 0.05557553185326306, + "language_loss": 0.78870118, + "learning_rate": 0.0005205559974371525, + "loss": 0.80002165, + "num_input_tokens_seen": 217575072, + "router_z_loss_mlp": 0.41455078, + "step": 2611, + "time_per_iteration": 2.7993710041046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129577, + "balance_loss_mlp": 1.08747184, + "epoch": 0.5025009619084263, + "flos": 472134486528.0, + "grad_norm": 0.05627981978612443, + "language_loss": 0.81993866, + "learning_rate": 0.0005202447145522123, + "loss": 0.83123446, + "num_input_tokens_seen": 217644976, + "router_z_loss_mlp": 0.42089844, + "step": 2612, + "time_per_iteration": 2.6950342655181885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120122, + "balance_loss_mlp": 1.0788281, + "epoch": 0.5026933435936899, + "flos": 454906036224.0, + "grad_norm": 0.05146182880646494, + "language_loss": 0.79119051, + "learning_rate": 0.0005199334238075769, + "loss": 0.80239171, + "num_input_tokens_seen": 217712816, + "router_z_loss_mlp": 0.4128418, + "step": 2613, + "time_per_iteration": 2.533280372619629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121533, + "balance_loss_mlp": 1.08064461, + "epoch": 0.5028857252789535, + "flos": 491504675328.0, + "grad_norm": 0.049706042989329166, + "language_loss": 0.91481262, + "learning_rate": 0.0005196221253241, + "loss": 0.92602801, + "num_input_tokens_seen": 217780256, + "router_z_loss_mlp": 0.40869141, + "step": 2614, + "time_per_iteration": 2.562459707260132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125484, + "balance_loss_mlp": 1.08271146, + "epoch": 0.503078106964217, + "flos": 625280145408.0, + "grad_norm": 0.05688830610190983, + "language_loss": 0.82597703, + "learning_rate": 0.0005193108192226383, + "loss": 0.83723187, + "num_input_tokens_seen": 217848496, + "router_z_loss_mlp": 0.42797852, + "step": 2615, + "time_per_iteration": 2.7700836658477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124223, + "balance_loss_mlp": 1.08054483, + "epoch": 0.5032704886494805, + "flos": 578774536704.0, + "grad_norm": 0.07123141067873749, + "language_loss": 0.87046134, + "learning_rate": 0.000518999505624052, + "loss": 0.88170362, + "num_input_tokens_seen": 217919216, + "router_z_loss_mlp": 0.43701172, + "step": 2616, + "time_per_iteration": 2.6920361518859863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110793, + "balance_loss_mlp": 1.06897473, + "epoch": 0.5034628703347441, + "flos": 471481150464.0, + "grad_norm": 0.07512500822512953, + "language_loss": 0.83250809, + "learning_rate": 0.000518688184649203, + "loss": 0.84361595, + "num_input_tokens_seen": 217996096, + "router_z_loss_mlp": 0.41845703, + "step": 2617, + "time_per_iteration": 2.8107755184173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109828, + "balance_loss_mlp": 1.06786621, + "epoch": 0.5036552520200077, + "flos": 489594281472.0, + "grad_norm": 0.05241889370213675, + "language_loss": 0.83636624, + "learning_rate": 0.0005183768564189577, + "loss": 0.84746444, + "num_input_tokens_seen": 218063072, + "router_z_loss_mlp": 0.41967773, + "step": 2618, + "time_per_iteration": 2.5401604175567627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117501, + "balance_loss_mlp": 1.07649279, + "epoch": 0.5038476337052713, + "flos": 493991239680.0, + "grad_norm": 0.05660213632560354, + "language_loss": 0.8184489, + "learning_rate": 0.0005180655210541838, + "loss": 0.82962382, + "num_input_tokens_seen": 218131056, + "router_z_loss_mlp": 0.40991211, + "step": 2619, + "time_per_iteration": 2.603214979171753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111785, + "balance_loss_mlp": 1.06829762, + "epoch": 0.5040400153905348, + "flos": 600321369600.0, + "grad_norm": 0.06441755274122189, + "language_loss": 0.83548617, + "learning_rate": 0.0005177541786757527, + "loss": 0.84660405, + "num_input_tokens_seen": 218203536, + "router_z_loss_mlp": 0.43481445, + "step": 2620, + "time_per_iteration": 2.760035276412964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122898, + "balance_loss_mlp": 1.07759881, + "epoch": 0.5042323970757984, + "flos": 811178924544.0, + "grad_norm": 0.05307882661131351, + "language_loss": 0.82779682, + "learning_rate": 0.000517442829404538, + "loss": 0.8390258, + "num_input_tokens_seen": 218283008, + "router_z_loss_mlp": 0.453125, + "step": 2621, + "time_per_iteration": 2.9839560985565186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110947, + "balance_loss_mlp": 1.06581521, + "epoch": 0.504424778761062, + "flos": 626985335808.0, + "grad_norm": 0.08823829105457728, + "language_loss": 0.87315869, + "learning_rate": 0.0005171314733614166, + "loss": 0.88425338, + "num_input_tokens_seen": 218362096, + "router_z_loss_mlp": 0.43676758, + "step": 2622, + "time_per_iteration": 2.901881456375122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101867, + "balance_loss_mlp": 1.05961967, + "epoch": 0.5046171604463255, + "flos": 515651553792.0, + "grad_norm": 0.052612789537889, + "language_loss": 0.78039354, + "learning_rate": 0.0005168201106672671, + "loss": 0.79141223, + "num_input_tokens_seen": 218439440, + "router_z_loss_mlp": 0.42236328, + "step": 2623, + "time_per_iteration": 2.7674055099487305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111898, + "balance_loss_mlp": 1.07046056, + "epoch": 0.504809542131589, + "flos": 527576606208.0, + "grad_norm": 0.08464756430959838, + "language_loss": 0.8495788, + "learning_rate": 0.0005165087414429717, + "loss": 0.86069775, + "num_input_tokens_seen": 218505936, + "router_z_loss_mlp": 0.41430664, + "step": 2624, + "time_per_iteration": 2.602158546447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117075, + "balance_loss_mlp": 1.07261038, + "epoch": 0.5050019238168526, + "flos": 553855048704.0, + "grad_norm": 0.23140620797494316, + "language_loss": 0.83667731, + "learning_rate": 0.0005161973658094144, + "loss": 0.84784812, + "num_input_tokens_seen": 218573824, + "router_z_loss_mlp": 0.44458008, + "step": 2625, + "time_per_iteration": 2.6992454528808594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108998, + "balance_loss_mlp": 1.06834817, + "epoch": 0.5051943055021162, + "flos": 574486677504.0, + "grad_norm": 0.05317382862924398, + "language_loss": 0.82239455, + "learning_rate": 0.000515885983887482, + "loss": 0.83348453, + "num_input_tokens_seen": 218648016, + "router_z_loss_mlp": 0.40649414, + "step": 2626, + "time_per_iteration": 2.7204251289367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110206, + "balance_loss_mlp": 1.06781507, + "epoch": 0.5053866871873798, + "flos": 496438516224.0, + "grad_norm": 0.08071327634258786, + "language_loss": 0.84119672, + "learning_rate": 0.0005155745957980636, + "loss": 0.85229874, + "num_input_tokens_seen": 218714128, + "router_z_loss_mlp": 0.42382812, + "step": 2627, + "time_per_iteration": 2.5813376903533936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118048, + "balance_loss_mlp": 1.0760628, + "epoch": 0.5055790688726434, + "flos": 501963084288.0, + "grad_norm": 0.04526623404133713, + "language_loss": 0.88577604, + "learning_rate": 0.000515263201662051, + "loss": 0.89695656, + "num_input_tokens_seen": 218784800, + "router_z_loss_mlp": 0.41992188, + "step": 2628, + "time_per_iteration": 2.6876380443573 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111719, + "balance_loss_mlp": 1.07625389, + "epoch": 0.5057714505579068, + "flos": 844844276736.0, + "grad_norm": 0.05588400488715087, + "language_loss": 0.82233381, + "learning_rate": 0.0005149518016003378, + "loss": 0.83350569, + "num_input_tokens_seen": 218868256, + "router_z_loss_mlp": 0.40942383, + "step": 2629, + "time_per_iteration": 3.1858632564544678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124651, + "balance_loss_mlp": 1.0810678, + "epoch": 0.5059638322431704, + "flos": 497580682752.0, + "grad_norm": 0.0555737706891176, + "language_loss": 0.82261145, + "learning_rate": 0.0005146403957338206, + "loss": 0.83385789, + "num_input_tokens_seen": 218932496, + "router_z_loss_mlp": 0.43603516, + "step": 2630, + "time_per_iteration": 2.548497438430786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118356, + "balance_loss_mlp": 1.07703853, + "epoch": 0.506156213928434, + "flos": 617526498816.0, + "grad_norm": 0.05055767229530262, + "language_loss": 0.82073247, + "learning_rate": 0.0005143289841833975, + "loss": 0.83191609, + "num_input_tokens_seen": 219010672, + "router_z_loss_mlp": 0.41308594, + "step": 2631, + "time_per_iteration": 2.847142457962036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116463, + "balance_loss_mlp": 1.07500172, + "epoch": 0.5063485956136976, + "flos": 424624923648.0, + "grad_norm": 0.06986911289391046, + "language_loss": 0.81789684, + "learning_rate": 0.0005140175670699696, + "loss": 0.82906151, + "num_input_tokens_seen": 219077104, + "router_z_loss_mlp": 0.41455078, + "step": 2632, + "time_per_iteration": 2.6268298625946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116361, + "balance_loss_mlp": 1.0729686, + "epoch": 0.5065409772989612, + "flos": 569641586688.0, + "grad_norm": 0.04802770333155415, + "language_loss": 0.8255887, + "learning_rate": 0.0005137061445144395, + "loss": 0.8367523, + "num_input_tokens_seen": 219164880, + "router_z_loss_mlp": 0.43383789, + "step": 2633, + "time_per_iteration": 2.93361759185791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106091, + "balance_loss_mlp": 1.06458259, + "epoch": 0.5067333589842247, + "flos": 628510205952.0, + "grad_norm": 0.0826873370301202, + "language_loss": 0.86646289, + "learning_rate": 0.000513394716637712, + "loss": 0.87752378, + "num_input_tokens_seen": 219237376, + "router_z_loss_mlp": 0.4152832, + "step": 2634, + "time_per_iteration": 2.8372714519500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083943, + "balance_loss_mlp": 1.06868434, + "epoch": 0.5069257406694883, + "flos": 1447062741504.0, + "grad_norm": 0.03147096823206272, + "language_loss": 0.79191709, + "learning_rate": 0.0005130832835606946, + "loss": 0.80275649, + "num_input_tokens_seen": 219467632, + "router_z_loss_mlp": 0.15234375, + "step": 2635, + "time_per_iteration": 4.893187046051025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109252, + "balance_loss_mlp": 1.06812489, + "epoch": 0.5071181223547518, + "flos": 638530656768.0, + "grad_norm": 0.046825638192595165, + "language_loss": 0.80415404, + "learning_rate": 0.0005127718454042958, + "loss": 0.81524646, + "num_input_tokens_seen": 219545392, + "router_z_loss_mlp": 0.41113281, + "step": 2636, + "time_per_iteration": 2.8583669662475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104227, + "balance_loss_mlp": 1.06250417, + "epoch": 0.5073105040400154, + "flos": 713239658496.0, + "grad_norm": 0.061804914120772665, + "language_loss": 0.84210312, + "learning_rate": 0.0005124604022894269, + "loss": 0.85314542, + "num_input_tokens_seen": 219623104, + "router_z_loss_mlp": 0.41723633, + "step": 2637, + "time_per_iteration": 2.924973726272583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047259, + "balance_loss_mlp": 1.03228605, + "epoch": 0.5075028857252789, + "flos": 1435658605056.0, + "grad_norm": 0.01918715016894911, + "language_loss": 0.77188224, + "learning_rate": 0.000512148954337001, + "loss": 0.78235483, + "num_input_tokens_seen": 219853328, + "router_z_loss_mlp": 0.14941406, + "step": 2638, + "time_per_iteration": 4.856257915496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103908, + "balance_loss_mlp": 1.06115913, + "epoch": 0.5076952674105425, + "flos": 570857946624.0, + "grad_norm": 0.0603044028086303, + "language_loss": 0.83185166, + "learning_rate": 0.0005118375016679325, + "loss": 0.84289074, + "num_input_tokens_seen": 219925024, + "router_z_loss_mlp": 0.42749023, + "step": 2639, + "time_per_iteration": 2.788266897201538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108523, + "balance_loss_mlp": 1.06651402, + "epoch": 0.5078876490958061, + "flos": 516463451136.0, + "grad_norm": 0.06423032366665075, + "language_loss": 0.8059274, + "learning_rate": 0.0005115260444031382, + "loss": 0.81701261, + "num_input_tokens_seen": 219992752, + "router_z_loss_mlp": 0.42016602, + "step": 2640, + "time_per_iteration": 2.5973188877105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036794, + "balance_loss_mlp": 1.02191687, + "epoch": 0.5080800307810697, + "flos": 1583378620416.0, + "grad_norm": 0.017407415587129545, + "language_loss": 0.78731823, + "learning_rate": 0.000511214582663537, + "loss": 0.7976861, + "num_input_tokens_seen": 220224160, + "router_z_loss_mlp": 0.1484375, + "step": 2641, + "time_per_iteration": 4.9824395179748535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107231, + "balance_loss_mlp": 1.06340933, + "epoch": 0.5082724124663333, + "flos": 484965978624.0, + "grad_norm": 0.05963770496992207, + "language_loss": 0.8711704, + "learning_rate": 0.0005109031165700483, + "loss": 0.88224268, + "num_input_tokens_seen": 220289504, + "router_z_loss_mlp": 0.43823242, + "step": 2642, + "time_per_iteration": 2.5530447959899902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103344, + "balance_loss_mlp": 1.05997539, + "epoch": 0.5084647941515967, + "flos": 681928450560.0, + "grad_norm": 0.05207490997788611, + "language_loss": 0.8334229, + "learning_rate": 0.0005105916462435945, + "loss": 0.84445643, + "num_input_tokens_seen": 220361376, + "router_z_loss_mlp": 0.43359375, + "step": 2643, + "time_per_iteration": 2.8092200756073 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101863, + "balance_loss_mlp": 1.05863762, + "epoch": 0.5086571758368603, + "flos": 548468692992.0, + "grad_norm": 0.0494294374601552, + "language_loss": 0.85464209, + "learning_rate": 0.0005102801718050989, + "loss": 0.86566073, + "num_input_tokens_seen": 220434720, + "router_z_loss_mlp": 0.43261719, + "step": 2644, + "time_per_iteration": 2.6660444736480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111917, + "balance_loss_mlp": 1.06735659, + "epoch": 0.5088495575221239, + "flos": 563751843840.0, + "grad_norm": 0.0695979688507087, + "language_loss": 0.88942361, + "learning_rate": 0.0005099686933754867, + "loss": 0.9005428, + "num_input_tokens_seen": 220506208, + "router_z_loss_mlp": 0.44580078, + "step": 2645, + "time_per_iteration": 2.673337697982788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108704, + "balance_loss_mlp": 1.06283236, + "epoch": 0.5090419392073875, + "flos": 551132757504.0, + "grad_norm": 0.05355859457172443, + "language_loss": 0.84209561, + "learning_rate": 0.0005096572110756845, + "loss": 0.85318267, + "num_input_tokens_seen": 220577456, + "router_z_loss_mlp": 0.45874023, + "step": 2646, + "time_per_iteration": 2.6638782024383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112209, + "balance_loss_mlp": 1.06686139, + "epoch": 0.509234320892651, + "flos": 567504230400.0, + "grad_norm": 0.04874041351849401, + "language_loss": 0.85460532, + "learning_rate": 0.0005093457250266205, + "loss": 0.86572737, + "num_input_tokens_seen": 220649648, + "router_z_loss_mlp": 0.45361328, + "step": 2647, + "time_per_iteration": 2.6637892723083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107252, + "balance_loss_mlp": 1.0633595, + "epoch": 0.5094267025779146, + "flos": 582339248640.0, + "grad_norm": 0.05998717956466229, + "language_loss": 0.8317883, + "learning_rate": 0.000509034235349224, + "loss": 0.84286082, + "num_input_tokens_seen": 220721168, + "router_z_loss_mlp": 0.43920898, + "step": 2648, + "time_per_iteration": 2.6878888607025146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100829, + "balance_loss_mlp": 1.05846214, + "epoch": 0.5096190842631781, + "flos": 591704953344.0, + "grad_norm": 0.05244355272630434, + "language_loss": 0.812711, + "learning_rate": 0.0005087227421644266, + "loss": 0.82371926, + "num_input_tokens_seen": 220796464, + "router_z_loss_mlp": 0.42407227, + "step": 2649, + "time_per_iteration": 2.7117576599121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106927, + "balance_loss_mlp": 1.06346333, + "epoch": 0.5098114659484417, + "flos": 513307584000.0, + "grad_norm": 0.052249476616985355, + "language_loss": 0.8603372, + "learning_rate": 0.0005084112455931602, + "loss": 0.87140644, + "num_input_tokens_seen": 220862976, + "router_z_loss_mlp": 0.43457031, + "step": 2650, + "time_per_iteration": 2.6070332527160645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106986, + "balance_loss_mlp": 1.06578696, + "epoch": 0.5100038476337053, + "flos": 484389808128.0, + "grad_norm": 0.053750245063259934, + "language_loss": 0.85138631, + "learning_rate": 0.0005080997457563586, + "loss": 0.8624562, + "num_input_tokens_seen": 220926432, + "router_z_loss_mlp": 0.41210938, + "step": 2651, + "time_per_iteration": 2.53045654296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105107, + "balance_loss_mlp": 1.06374109, + "epoch": 0.5101962293189688, + "flos": 461366157312.0, + "grad_norm": 0.06332454651149101, + "language_loss": 0.79166603, + "learning_rate": 0.0005077882427749569, + "loss": 0.80271709, + "num_input_tokens_seen": 220993008, + "router_z_loss_mlp": 0.41381836, + "step": 2652, + "time_per_iteration": 2.4946300983428955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113144, + "balance_loss_mlp": 1.07084906, + "epoch": 0.5103886110042324, + "flos": 586760937984.0, + "grad_norm": 0.06191877346451425, + "language_loss": 0.8487432, + "learning_rate": 0.0005074767367698913, + "loss": 0.85987473, + "num_input_tokens_seen": 221059248, + "router_z_loss_mlp": 0.42285156, + "step": 2653, + "time_per_iteration": 2.6763722896575928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105422, + "balance_loss_mlp": 1.06455684, + "epoch": 0.510580992689496, + "flos": 844906885632.0, + "grad_norm": 0.056937070163659766, + "language_loss": 0.83570945, + "learning_rate": 0.0005071652278620988, + "loss": 0.84676373, + "num_input_tokens_seen": 221133712, + "router_z_loss_mlp": 0.40869141, + "step": 2654, + "time_per_iteration": 3.0378835201263428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109149, + "balance_loss_mlp": 1.06706858, + "epoch": 0.5107733743747596, + "flos": 658328629248.0, + "grad_norm": 0.057649397656864075, + "language_loss": 0.83013982, + "learning_rate": 0.0005068537161725186, + "loss": 0.84123135, + "num_input_tokens_seen": 221202192, + "router_z_loss_mlp": 0.42041016, + "step": 2655, + "time_per_iteration": 2.7623610496520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105478, + "balance_loss_mlp": 1.06385016, + "epoch": 0.510965756060023, + "flos": 701426677248.0, + "grad_norm": 0.05708536741035134, + "language_loss": 0.8435111, + "learning_rate": 0.0005065422018220893, + "loss": 0.85456586, + "num_input_tokens_seen": 221277104, + "router_z_loss_mlp": 0.41601562, + "step": 2656, + "time_per_iteration": 2.823542833328247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102091, + "balance_loss_mlp": 1.06096351, + "epoch": 0.5111581377452866, + "flos": 559430489088.0, + "grad_norm": 0.05217113074905386, + "language_loss": 0.80225503, + "learning_rate": 0.0005062306849317521, + "loss": 0.81327593, + "num_input_tokens_seen": 221352320, + "router_z_loss_mlp": 0.41113281, + "step": 2657, + "time_per_iteration": 2.8275818824768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084461, + "balance_loss_mlp": 1.04314327, + "epoch": 0.5113505194305502, + "flos": 608745729024.0, + "grad_norm": 0.05701327198704139, + "language_loss": 0.83469534, + "learning_rate": 0.0005059191656224487, + "loss": 0.84553993, + "num_input_tokens_seen": 221421056, + "router_z_loss_mlp": 0.41308594, + "step": 2658, + "time_per_iteration": 2.7243552207946777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094832, + "balance_loss_mlp": 1.05158317, + "epoch": 0.5115429011158138, + "flos": 534214227456.0, + "grad_norm": 0.0458707137929394, + "language_loss": 0.89186656, + "learning_rate": 0.0005056076440151212, + "loss": 0.90281487, + "num_input_tokens_seen": 221492064, + "router_z_loss_mlp": 0.43237305, + "step": 2659, + "time_per_iteration": 2.663668632507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047323, + "balance_loss_mlp": 1.0349257, + "epoch": 0.5117352828010774, + "flos": 1361451580416.0, + "grad_norm": 0.020991592608455897, + "language_loss": 0.76288116, + "learning_rate": 0.0005052961202307133, + "loss": 0.77335441, + "num_input_tokens_seen": 221724672, + "router_z_loss_mlp": 0.12402344, + "step": 2660, + "time_per_iteration": 4.851064205169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095712, + "balance_loss_mlp": 1.05420339, + "epoch": 0.5119276644863409, + "flos": 633444046848.0, + "grad_norm": 0.05508509945890564, + "language_loss": 0.87153888, + "learning_rate": 0.0005049845943901691, + "loss": 0.882496, + "num_input_tokens_seen": 221800144, + "router_z_loss_mlp": 0.41479492, + "step": 2661, + "time_per_iteration": 2.827824831008911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085148, + "balance_loss_mlp": 1.04459286, + "epoch": 0.5121200461716044, + "flos": 585304468992.0, + "grad_norm": 0.05132624096148621, + "language_loss": 0.86219436, + "learning_rate": 0.0005046730666144338, + "loss": 0.8730458, + "num_input_tokens_seen": 221877168, + "router_z_loss_mlp": 0.40527344, + "step": 2662, + "time_per_iteration": 2.75281023979187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096524, + "balance_loss_mlp": 1.05542088, + "epoch": 0.512312427856868, + "flos": 1032081661440.0, + "grad_norm": 0.048177160037868025, + "language_loss": 0.87700105, + "learning_rate": 0.0005043615370244532, + "loss": 0.88796628, + "num_input_tokens_seen": 221964208, + "router_z_loss_mlp": 0.41113281, + "step": 2663, + "time_per_iteration": 3.3618671894073486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01028213, + "balance_loss_mlp": 1.01524341, + "epoch": 0.5125048095421316, + "flos": 1537257277440.0, + "grad_norm": 0.012858425268609664, + "language_loss": 0.78244388, + "learning_rate": 0.0005040500057411736, + "loss": 0.79272604, + "num_input_tokens_seen": 222179264, + "router_z_loss_mlp": 0.12988281, + "step": 2664, + "time_per_iteration": 4.658047914505005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093765, + "balance_loss_mlp": 1.05292368, + "epoch": 0.5126971912273951, + "flos": 590814480384.0, + "grad_norm": 0.04944817886166227, + "language_loss": 0.85279715, + "learning_rate": 0.0005037384728855425, + "loss": 0.86373478, + "num_input_tokens_seen": 222259504, + "router_z_loss_mlp": 0.40820312, + "step": 2665, + "time_per_iteration": 2.8461544513702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098547, + "balance_loss_mlp": 1.05620384, + "epoch": 0.5128895729126587, + "flos": 551393215488.0, + "grad_norm": 0.158979293172939, + "language_loss": 0.84343994, + "learning_rate": 0.0005034269385785075, + "loss": 0.85442543, + "num_input_tokens_seen": 222330512, + "router_z_loss_mlp": 0.42333984, + "step": 2666, + "time_per_iteration": 2.651714563369751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092703, + "balance_loss_mlp": 1.05222011, + "epoch": 0.5130819545979223, + "flos": 481031709696.0, + "grad_norm": 0.06506731950678159, + "language_loss": 0.84809029, + "learning_rate": 0.0005031154029410168, + "loss": 0.85901731, + "num_input_tokens_seen": 222394000, + "router_z_loss_mlp": 0.40478516, + "step": 2667, + "time_per_iteration": 2.5316364765167236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095809, + "balance_loss_mlp": 1.05368042, + "epoch": 0.5132743362831859, + "flos": 475556603904.0, + "grad_norm": 0.06903413954772, + "language_loss": 0.86695576, + "learning_rate": 0.0005028038660940197, + "loss": 0.87791383, + "num_input_tokens_seen": 222459344, + "router_z_loss_mlp": 0.42138672, + "step": 2668, + "time_per_iteration": 2.521328926086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090159, + "balance_loss_mlp": 1.04962766, + "epoch": 0.5134667179684494, + "flos": 503559175680.0, + "grad_norm": 0.047102953885103854, + "language_loss": 0.84545898, + "learning_rate": 0.0005024923281584648, + "loss": 0.85636055, + "num_input_tokens_seen": 222528912, + "router_z_loss_mlp": 0.4050293, + "step": 2669, + "time_per_iteration": 2.6462371349334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092047, + "balance_loss_mlp": 1.05330372, + "epoch": 0.5136590996537129, + "flos": 503647925760.0, + "grad_norm": 0.04719667862832961, + "language_loss": 0.82488692, + "learning_rate": 0.0005021807892553026, + "loss": 0.83580744, + "num_input_tokens_seen": 222604704, + "router_z_loss_mlp": 0.38696289, + "step": 2670, + "time_per_iteration": 2.732416868209839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094857, + "balance_loss_mlp": 1.05370605, + "epoch": 0.5138514813389765, + "flos": 624330035712.0, + "grad_norm": 0.05149766622145395, + "language_loss": 0.84497285, + "learning_rate": 0.0005018692495054828, + "loss": 0.85592139, + "num_input_tokens_seen": 222677888, + "router_z_loss_mlp": 0.41137695, + "step": 2671, + "time_per_iteration": 2.760014533996582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092038, + "balance_loss_mlp": 1.05174494, + "epoch": 0.5140438630242401, + "flos": 583274801664.0, + "grad_norm": 0.05511271146100304, + "language_loss": 0.80692601, + "learning_rate": 0.0005015577090299561, + "loss": 0.81784636, + "num_input_tokens_seen": 222751936, + "router_z_loss_mlp": 0.40283203, + "step": 2672, + "time_per_iteration": 2.6871819496154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102168, + "balance_loss_mlp": 1.06046844, + "epoch": 0.5142362447095037, + "flos": 487683887616.0, + "grad_norm": 0.05906966789334332, + "language_loss": 0.86718851, + "learning_rate": 0.0005012461679496729, + "loss": 0.87821019, + "num_input_tokens_seen": 222819616, + "router_z_loss_mlp": 0.41674805, + "step": 2673, + "time_per_iteration": 2.573075771331787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111889, + "balance_loss_mlp": 1.06968939, + "epoch": 0.5144286263947672, + "flos": 526601765376.0, + "grad_norm": 0.050226260736663565, + "language_loss": 0.87357539, + "learning_rate": 0.0005009346263855848, + "loss": 0.88469428, + "num_input_tokens_seen": 222888448, + "router_z_loss_mlp": 0.42211914, + "step": 2674, + "time_per_iteration": 2.6014504432678223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100642, + "balance_loss_mlp": 1.06106424, + "epoch": 0.5146210080800308, + "flos": 486252149760.0, + "grad_norm": 0.047502810841318265, + "language_loss": 0.8393209, + "learning_rate": 0.0005006230844586422, + "loss": 0.85032737, + "num_input_tokens_seen": 222964736, + "router_z_loss_mlp": 0.39599609, + "step": 2675, + "time_per_iteration": 2.7817234992980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102843, + "balance_loss_mlp": 1.06152487, + "epoch": 0.5148133897652943, + "flos": 515622440448.0, + "grad_norm": 0.04472754928085029, + "language_loss": 0.79101396, + "learning_rate": 0.0005003115422897968, + "loss": 0.80204242, + "num_input_tokens_seen": 223040944, + "router_z_loss_mlp": 0.4128418, + "step": 2676, + "time_per_iteration": 2.72664213180542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102609, + "balance_loss_mlp": 1.06243563, + "epoch": 0.5150057714505579, + "flos": 510963614208.0, + "grad_norm": 0.061230997357755966, + "language_loss": 0.86760038, + "learning_rate": 0.0005, + "loss": 0.87862647, + "num_input_tokens_seen": 223109632, + "router_z_loss_mlp": 0.40161133, + "step": 2677, + "time_per_iteration": 2.6518850326538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095319, + "balance_loss_mlp": 1.05648041, + "epoch": 0.5151981531358215, + "flos": 910541164032.0, + "grad_norm": 0.056847893934042666, + "language_loss": 0.79409456, + "learning_rate": 0.0004996884577102033, + "loss": 0.80504775, + "num_input_tokens_seen": 223191648, + "router_z_loss_mlp": 0.38818359, + "step": 2678, + "time_per_iteration": 3.0679850578308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096526, + "balance_loss_mlp": 1.05623293, + "epoch": 0.515390534821085, + "flos": 471599013888.0, + "grad_norm": 0.047432465044858714, + "language_loss": 0.8447082, + "learning_rate": 0.000499376915541358, + "loss": 0.85567349, + "num_input_tokens_seen": 223265920, + "router_z_loss_mlp": 0.40283203, + "step": 2679, + "time_per_iteration": 2.6785037517547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099943, + "balance_loss_mlp": 1.06086659, + "epoch": 0.5155829165063486, + "flos": 649811137536.0, + "grad_norm": 0.04795230358992159, + "language_loss": 0.81296241, + "learning_rate": 0.0004990653736144155, + "loss": 0.82396191, + "num_input_tokens_seen": 223340688, + "router_z_loss_mlp": 0.390625, + "step": 2680, + "time_per_iteration": 2.840188980102539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100494, + "balance_loss_mlp": 1.06072533, + "epoch": 0.5157752981916122, + "flos": 414038476800.0, + "grad_norm": 0.062126395708719404, + "language_loss": 0.86077356, + "learning_rate": 0.0004987538320503271, + "loss": 0.87177849, + "num_input_tokens_seen": 223404064, + "router_z_loss_mlp": 0.3972168, + "step": 2681, + "time_per_iteration": 2.4594664573669434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100598, + "balance_loss_mlp": 1.06054354, + "epoch": 0.5159676798768758, + "flos": 553569859584.0, + "grad_norm": 0.05537703124714055, + "language_loss": 0.82735646, + "learning_rate": 0.0004984422909700442, + "loss": 0.83836246, + "num_input_tokens_seen": 223476784, + "router_z_loss_mlp": 0.39990234, + "step": 2682, + "time_per_iteration": 2.66052508354187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091816, + "balance_loss_mlp": 1.05292952, + "epoch": 0.5161600615621393, + "flos": 586234229760.0, + "grad_norm": 0.051780542585777085, + "language_loss": 0.83951235, + "learning_rate": 0.0004981307504945173, + "loss": 0.85043043, + "num_input_tokens_seen": 223542832, + "router_z_loss_mlp": 0.38867188, + "step": 2683, + "time_per_iteration": 2.6698381900787354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109715, + "balance_loss_mlp": 1.05766809, + "epoch": 0.5163524432474028, + "flos": 588568025088.0, + "grad_norm": 0.05164690349476628, + "language_loss": 0.8939817, + "learning_rate": 0.0004978192107446976, + "loss": 0.90495312, + "num_input_tokens_seen": 223617968, + "router_z_loss_mlp": 0.39428711, + "step": 2684, + "time_per_iteration": 2.7249348163604736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095053, + "balance_loss_mlp": 1.05325842, + "epoch": 0.5165448249326664, + "flos": 503642133504.0, + "grad_norm": 0.05677264338484585, + "language_loss": 0.87172639, + "learning_rate": 0.0004975076718415353, + "loss": 0.8826769, + "num_input_tokens_seen": 223689504, + "router_z_loss_mlp": 0.41796875, + "step": 2685, + "time_per_iteration": 2.599235773086548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086892, + "balance_loss_mlp": 1.04676652, + "epoch": 0.51673720661793, + "flos": 416539597824.0, + "grad_norm": 0.05087662124677675, + "language_loss": 0.90954995, + "learning_rate": 0.0004971961339059806, + "loss": 0.92041892, + "num_input_tokens_seen": 223752288, + "router_z_loss_mlp": 0.40112305, + "step": 2686, + "time_per_iteration": 2.4647631645202637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091735, + "balance_loss_mlp": 1.04986906, + "epoch": 0.5169295883031936, + "flos": 598696164864.0, + "grad_norm": 0.1190187036629449, + "language_loss": 0.83923638, + "learning_rate": 0.0004968845970589832, + "loss": 0.85015374, + "num_input_tokens_seen": 223822304, + "router_z_loss_mlp": 0.41870117, + "step": 2687, + "time_per_iteration": 2.6631908416748047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087993, + "balance_loss_mlp": 1.04793859, + "epoch": 0.517121969988457, + "flos": 556543844352.0, + "grad_norm": 0.06869038553700607, + "language_loss": 0.8455354, + "learning_rate": 0.0004965730614214926, + "loss": 0.85641533, + "num_input_tokens_seen": 223888592, + "router_z_loss_mlp": 0.40039062, + "step": 2688, + "time_per_iteration": 2.628286361694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098374, + "balance_loss_mlp": 1.05576849, + "epoch": 0.5173143516737206, + "flos": 469214346240.0, + "grad_norm": 0.05001993876024353, + "language_loss": 0.85256827, + "learning_rate": 0.0004962615271144576, + "loss": 0.86355197, + "num_input_tokens_seen": 223952880, + "router_z_loss_mlp": 0.42602539, + "step": 2689, + "time_per_iteration": 2.5224428176879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091114, + "balance_loss_mlp": 1.05017805, + "epoch": 0.5175067333589842, + "flos": 719739067392.0, + "grad_norm": 0.0600896413832987, + "language_loss": 0.82435369, + "learning_rate": 0.0004959499942588264, + "loss": 0.8352648, + "num_input_tokens_seen": 224030000, + "router_z_loss_mlp": 0.40917969, + "step": 2690, + "time_per_iteration": 2.923792600631714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043722, + "balance_loss_mlp": 1.02932107, + "epoch": 0.5176991150442478, + "flos": 1465402834944.0, + "grad_norm": 0.02659438930583784, + "language_loss": 0.78200024, + "learning_rate": 0.0004956384629755469, + "loss": 0.79243743, + "num_input_tokens_seen": 224252384, + "router_z_loss_mlp": 0.14355469, + "step": 2691, + "time_per_iteration": 4.779648542404175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089552, + "balance_loss_mlp": 1.04863954, + "epoch": 0.5178914967295114, + "flos": 612345346560.0, + "grad_norm": 0.05374555179207371, + "language_loss": 0.85215712, + "learning_rate": 0.0004953269333855661, + "loss": 0.86305267, + "num_input_tokens_seen": 224324640, + "router_z_loss_mlp": 0.40917969, + "step": 2692, + "time_per_iteration": 2.7646090984344482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086121, + "balance_loss_mlp": 1.04604328, + "epoch": 0.5180838784147749, + "flos": 500663766528.0, + "grad_norm": 0.05670677168127033, + "language_loss": 0.84148359, + "learning_rate": 0.0004950154056098309, + "loss": 0.85234475, + "num_input_tokens_seen": 224398368, + "router_z_loss_mlp": 0.40039062, + "step": 2693, + "time_per_iteration": 2.7038145065307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088689, + "balance_loss_mlp": 1.0469892, + "epoch": 0.5182762601000385, + "flos": 688531166208.0, + "grad_norm": 0.05599909013755839, + "language_loss": 0.84343493, + "learning_rate": 0.0004947038797692867, + "loss": 0.85432184, + "num_input_tokens_seen": 224465456, + "router_z_loss_mlp": 0.41699219, + "step": 2694, + "time_per_iteration": 2.8155903816223145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092198, + "balance_loss_mlp": 1.05147612, + "epoch": 0.518468641785302, + "flos": 665315458560.0, + "grad_norm": 0.046372715162849826, + "language_loss": 0.77593923, + "learning_rate": 0.0004943923559848789, + "loss": 0.7868613, + "num_input_tokens_seen": 224540960, + "router_z_loss_mlp": 0.40698242, + "step": 2695, + "time_per_iteration": 2.787229061126709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086627, + "balance_loss_mlp": 1.04714453, + "epoch": 0.5186610234705656, + "flos": 566440639488.0, + "grad_norm": 0.05332286724917534, + "language_loss": 0.89972508, + "learning_rate": 0.0004940808343775515, + "loss": 0.9105913, + "num_input_tokens_seen": 224613200, + "router_z_loss_mlp": 0.39453125, + "step": 2696, + "time_per_iteration": 2.6648201942443848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083742, + "balance_loss_mlp": 1.04292464, + "epoch": 0.5188534051558291, + "flos": 428652324864.0, + "grad_norm": 0.055572994373314345, + "language_loss": 0.82251114, + "learning_rate": 0.0004937693150682479, + "loss": 0.83334857, + "num_input_tokens_seen": 224677456, + "router_z_loss_mlp": 0.40820312, + "step": 2697, + "time_per_iteration": 2.5013554096221924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089603, + "balance_loss_mlp": 1.04804635, + "epoch": 0.5190457868410927, + "flos": 546085435392.0, + "grad_norm": 0.05634548635888483, + "language_loss": 0.7652837, + "learning_rate": 0.0004934577981779107, + "loss": 0.77617967, + "num_input_tokens_seen": 224745600, + "router_z_loss_mlp": 0.41552734, + "step": 2698, + "time_per_iteration": 2.7512943744659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092838, + "balance_loss_mlp": 1.04958856, + "epoch": 0.5192381685263563, + "flos": 548321716224.0, + "grad_norm": 0.04670174030259061, + "language_loss": 0.81419832, + "learning_rate": 0.0004931462838274817, + "loss": 0.82512677, + "num_input_tokens_seen": 224826944, + "router_z_loss_mlp": 0.43237305, + "step": 2699, + "time_per_iteration": 2.8294084072113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082299, + "balance_loss_mlp": 1.04296041, + "epoch": 0.5194305502116199, + "flos": 574993036800.0, + "grad_norm": 0.05440575131052059, + "language_loss": 0.83835357, + "learning_rate": 0.0004928347721379011, + "loss": 0.84917653, + "num_input_tokens_seen": 224895280, + "router_z_loss_mlp": 0.39331055, + "step": 2700, + "time_per_iteration": 2.643941879272461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084407, + "balance_loss_mlp": 1.04485357, + "epoch": 0.5196229318968835, + "flos": 434019741696.0, + "grad_norm": 0.054958496552239416, + "language_loss": 0.81611145, + "learning_rate": 0.0004925232632301089, + "loss": 0.8269555, + "num_input_tokens_seen": 224961632, + "router_z_loss_mlp": 0.39526367, + "step": 2701, + "time_per_iteration": 2.5408122539520264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084434, + "balance_loss_mlp": 1.04638255, + "epoch": 0.5198153135821469, + "flos": 558607007232.0, + "grad_norm": 0.05193596738822722, + "language_loss": 0.79534626, + "learning_rate": 0.0004922117572250431, + "loss": 0.80619061, + "num_input_tokens_seen": 225032816, + "router_z_loss_mlp": 0.38037109, + "step": 2702, + "time_per_iteration": 2.6687467098236084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078961, + "balance_loss_mlp": 1.04152906, + "epoch": 0.5200076952674105, + "flos": 565397397504.0, + "grad_norm": 0.04814908286006495, + "language_loss": 0.80652344, + "learning_rate": 0.0004919002542436414, + "loss": 0.81731308, + "num_input_tokens_seen": 225112736, + "router_z_loss_mlp": 0.37451172, + "step": 2703, + "time_per_iteration": 2.811460256576538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085168, + "balance_loss_mlp": 1.04644859, + "epoch": 0.5202000769526741, + "flos": 570916173312.0, + "grad_norm": 0.05555982935463854, + "language_loss": 0.81149572, + "learning_rate": 0.0004915887544068399, + "loss": 0.8223474, + "num_input_tokens_seen": 225182672, + "router_z_loss_mlp": 0.38720703, + "step": 2704, + "time_per_iteration": 2.6499714851379395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093505, + "balance_loss_mlp": 1.05199671, + "epoch": 0.5203924586379377, + "flos": 693898583040.0, + "grad_norm": 0.050837486186397586, + "language_loss": 0.77994883, + "learning_rate": 0.0004912772578355736, + "loss": 0.7908839, + "num_input_tokens_seen": 225260272, + "router_z_loss_mlp": 0.41503906, + "step": 2705, + "time_per_iteration": 2.8637514114379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094973, + "balance_loss_mlp": 1.0555619, + "epoch": 0.5205848403232012, + "flos": 566215087104.0, + "grad_norm": 0.054100857686445215, + "language_loss": 0.8301729, + "learning_rate": 0.000490965764650776, + "loss": 0.84112263, + "num_input_tokens_seen": 225337120, + "router_z_loss_mlp": 0.39404297, + "step": 2706, + "time_per_iteration": 2.8644323348999023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085669, + "balance_loss_mlp": 1.04661632, + "epoch": 0.5207772220084648, + "flos": 1213775539200.0, + "grad_norm": 0.05228956126941533, + "language_loss": 0.82813179, + "learning_rate": 0.0004906542749733798, + "loss": 0.83898848, + "num_input_tokens_seen": 225433984, + "router_z_loss_mlp": 0.39013672, + "step": 2707, + "time_per_iteration": 3.6128242015838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084528, + "balance_loss_mlp": 1.04635715, + "epoch": 0.5209696036937284, + "flos": 592547374080.0, + "grad_norm": 0.12708447176708407, + "language_loss": 0.84871459, + "learning_rate": 0.0004903427889243156, + "loss": 0.85955989, + "num_input_tokens_seen": 225512112, + "router_z_loss_mlp": 0.38134766, + "step": 2708, + "time_per_iteration": 2.86226487159729 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109093, + "balance_loss_mlp": 1.05211544, + "epoch": 0.5211619853789919, + "flos": 522623826432.0, + "grad_norm": 0.05348625186790992, + "language_loss": 0.85548282, + "learning_rate": 0.0004900313066245134, + "loss": 0.86639208, + "num_input_tokens_seen": 225586944, + "router_z_loss_mlp": 0.38818359, + "step": 2709, + "time_per_iteration": 2.662485122680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081893, + "balance_loss_mlp": 1.0432452, + "epoch": 0.5213543670642555, + "flos": 502534872576.0, + "grad_norm": 0.050688452880556414, + "language_loss": 0.80490649, + "learning_rate": 0.0004897198281949012, + "loss": 0.81572545, + "num_input_tokens_seen": 225657184, + "router_z_loss_mlp": 0.38647461, + "step": 2710, + "time_per_iteration": 2.6449263095855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085994, + "balance_loss_mlp": 1.04636908, + "epoch": 0.521546748749519, + "flos": 585682790400.0, + "grad_norm": 0.05860885905894002, + "language_loss": 0.77534401, + "learning_rate": 0.0004894083537564057, + "loss": 0.78620392, + "num_input_tokens_seen": 225729968, + "router_z_loss_mlp": 0.39599609, + "step": 2711, + "time_per_iteration": 2.7473373413085938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083493, + "balance_loss_mlp": 1.04458284, + "epoch": 0.5217391304347826, + "flos": 569833643520.0, + "grad_norm": 0.04954385524753536, + "language_loss": 0.80801934, + "learning_rate": 0.0004890968834299519, + "loss": 0.81885427, + "num_input_tokens_seen": 225801808, + "router_z_loss_mlp": 0.38867188, + "step": 2712, + "time_per_iteration": 2.7709779739379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084865, + "balance_loss_mlp": 1.04621696, + "epoch": 0.5219315121200462, + "flos": 542501784576.0, + "grad_norm": 0.06807472429400872, + "language_loss": 0.78801876, + "learning_rate": 0.0004887854173364633, + "loss": 0.7988674, + "num_input_tokens_seen": 225878576, + "router_z_loss_mlp": 0.38623047, + "step": 2713, + "time_per_iteration": 2.710489273071289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084971, + "balance_loss_mlp": 1.04713416, + "epoch": 0.5221238938053098, + "flos": 550006557696.0, + "grad_norm": 0.048000843690728094, + "language_loss": 0.81816071, + "learning_rate": 0.0004884739555968617, + "loss": 0.82901043, + "num_input_tokens_seen": 225960096, + "router_z_loss_mlp": 0.37866211, + "step": 2714, + "time_per_iteration": 2.8097493648529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01030446, + "balance_loss_mlp": 1.01785719, + "epoch": 0.5223162754905732, + "flos": 1354373028864.0, + "grad_norm": 0.016208306264550634, + "language_loss": 0.78977054, + "learning_rate": 0.0004881624983320676, + "loss": 0.80007499, + "num_input_tokens_seen": 226184960, + "router_z_loss_mlp": 0.12597656, + "step": 2715, + "time_per_iteration": 4.9789557456970215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083164, + "balance_loss_mlp": 1.04444456, + "epoch": 0.5225086571758368, + "flos": 567441621504.0, + "grad_norm": 0.04806245104826077, + "language_loss": 0.86670554, + "learning_rate": 0.0004878510456629992, + "loss": 0.87753725, + "num_input_tokens_seen": 226271328, + "router_z_loss_mlp": 0.38696289, + "step": 2716, + "time_per_iteration": 3.015443801879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084237, + "balance_loss_mlp": 1.0459466, + "epoch": 0.5227010388611004, + "flos": 499914478080.0, + "grad_norm": 0.051081355886524536, + "language_loss": 0.85046101, + "learning_rate": 0.00048753959771057314, + "loss": 0.86130333, + "num_input_tokens_seen": 226340080, + "router_z_loss_mlp": 0.3828125, + "step": 2717, + "time_per_iteration": 2.623352289199829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084285, + "balance_loss_mlp": 1.04539871, + "epoch": 0.522893420546364, + "flos": 597372115968.0, + "grad_norm": 0.0531417340924391, + "language_loss": 0.82181746, + "learning_rate": 0.0004872281545957044, + "loss": 0.83266038, + "num_input_tokens_seen": 226415120, + "router_z_loss_mlp": 0.38842773, + "step": 2718, + "time_per_iteration": 2.7300612926483154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080864, + "balance_loss_mlp": 1.04154897, + "epoch": 0.5230858022316276, + "flos": 664278008832.0, + "grad_norm": 0.05093940259468129, + "language_loss": 0.85964847, + "learning_rate": 0.0004869167164393055, + "loss": 0.87045711, + "num_input_tokens_seen": 226501200, + "router_z_loss_mlp": 0.39306641, + "step": 2719, + "time_per_iteration": 2.9219412803649902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108043, + "balance_loss_mlp": 1.04206884, + "epoch": 0.5232781839168911, + "flos": 603547047936.0, + "grad_norm": 0.04294663688852852, + "language_loss": 0.89195794, + "learning_rate": 0.00048660528336228793, + "loss": 0.90276217, + "num_input_tokens_seen": 226582064, + "router_z_loss_mlp": 0.38330078, + "step": 2720, + "time_per_iteration": 2.7792000770568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076128, + "balance_loss_mlp": 1.03781438, + "epoch": 0.5234705656021547, + "flos": 550438723584.0, + "grad_norm": 0.04780199229625597, + "language_loss": 0.90052795, + "learning_rate": 0.0004862938554855606, + "loss": 0.91128922, + "num_input_tokens_seen": 226656448, + "router_z_loss_mlp": 0.3828125, + "step": 2721, + "time_per_iteration": 2.781075954437256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083526, + "balance_loss_mlp": 1.04509294, + "epoch": 0.5236629472874182, + "flos": 504026247168.0, + "grad_norm": 0.06026541291367098, + "language_loss": 0.85920995, + "learning_rate": 0.0004859824329300304, + "loss": 0.87004519, + "num_input_tokens_seen": 226725568, + "router_z_loss_mlp": 0.3840332, + "step": 2722, + "time_per_iteration": 2.5523464679718018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078682, + "balance_loss_mlp": 1.04043949, + "epoch": 0.5238553289726818, + "flos": 547394927616.0, + "grad_norm": 0.04759572809953804, + "language_loss": 0.83678633, + "learning_rate": 0.00048567101581660244, + "loss": 0.84757316, + "num_input_tokens_seen": 226795728, + "router_z_loss_mlp": 0.38208008, + "step": 2723, + "time_per_iteration": 2.62168288230896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081139, + "balance_loss_mlp": 1.04208636, + "epoch": 0.5240477106579453, + "flos": 531702931968.0, + "grad_norm": 0.060086559712579084, + "language_loss": 0.87061596, + "learning_rate": 0.00048535960426617956, + "loss": 0.88142729, + "num_input_tokens_seen": 226865344, + "router_z_loss_mlp": 0.39038086, + "step": 2724, + "time_per_iteration": 2.5913078784942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081015, + "balance_loss_mlp": 1.04208124, + "epoch": 0.5242400923432089, + "flos": 617653126656.0, + "grad_norm": 0.05554996608046291, + "language_loss": 0.81582165, + "learning_rate": 0.0004850481983996621, + "loss": 0.82663178, + "num_input_tokens_seen": 226936800, + "router_z_loss_mlp": 0.3894043, + "step": 2725, + "time_per_iteration": 2.744001865386963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083025, + "balance_loss_mlp": 1.04366207, + "epoch": 0.5244324740284725, + "flos": 416461022208.0, + "grad_norm": 0.051041166575027594, + "language_loss": 0.87690443, + "learning_rate": 0.0004847367983379492, + "loss": 0.88773465, + "num_input_tokens_seen": 226998448, + "router_z_loss_mlp": 0.39331055, + "step": 2726, + "time_per_iteration": 2.452622652053833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081299, + "balance_loss_mlp": 1.04327154, + "epoch": 0.5246248557137361, + "flos": 626113801728.0, + "grad_norm": 0.0465947896589182, + "language_loss": 0.7866348, + "learning_rate": 0.00048442540420193643, + "loss": 0.7974478, + "num_input_tokens_seen": 227081872, + "router_z_loss_mlp": 0.38012695, + "step": 2727, + "time_per_iteration": 2.8958897590637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085515, + "balance_loss_mlp": 1.04524565, + "epoch": 0.5248172373989997, + "flos": 1247980746240.0, + "grad_norm": 0.0639927904505779, + "language_loss": 0.79006433, + "learning_rate": 0.0004841140161125182, + "loss": 0.80091947, + "num_input_tokens_seen": 227167744, + "router_z_loss_mlp": 0.40234375, + "step": 2728, + "time_per_iteration": 3.5769736766815186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109207, + "balance_loss_mlp": 1.05370796, + "epoch": 0.5250096190842631, + "flos": 506616118272.0, + "grad_norm": 0.05909227072060698, + "language_loss": 0.84801137, + "learning_rate": 0.0004838026341905857, + "loss": 0.85893214, + "num_input_tokens_seen": 227239136, + "router_z_loss_mlp": 0.38354492, + "step": 2729, + "time_per_iteration": 2.6979076862335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082745, + "balance_loss_mlp": 1.04476523, + "epoch": 0.5252020007695267, + "flos": 611021297664.0, + "grad_norm": 0.0531469423300266, + "language_loss": 0.85391581, + "learning_rate": 0.00048349125855702844, + "loss": 0.86474323, + "num_input_tokens_seen": 227311968, + "router_z_loss_mlp": 0.37915039, + "step": 2730, + "time_per_iteration": 2.7757534980773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084625, + "balance_loss_mlp": 1.04669309, + "epoch": 0.5253943824547903, + "flos": 538970568192.0, + "grad_norm": 0.04649712268604906, + "language_loss": 0.81255782, + "learning_rate": 0.00048317988933273287, + "loss": 0.82340407, + "num_input_tokens_seen": 227385248, + "router_z_loss_mlp": 0.37939453, + "step": 2731, + "time_per_iteration": 2.7401769161224365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094943, + "balance_loss_mlp": 1.05476904, + "epoch": 0.5255867641400539, + "flos": 697714988544.0, + "grad_norm": 0.05136039584795155, + "language_loss": 0.82178587, + "learning_rate": 0.00048286852663858367, + "loss": 0.8327353, + "num_input_tokens_seen": 227464640, + "router_z_loss_mlp": 0.40161133, + "step": 2732, + "time_per_iteration": 2.9572720527648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088204, + "balance_loss_mlp": 1.05084419, + "epoch": 0.5257791458253175, + "flos": 666975568896.0, + "grad_norm": 0.08443038207797475, + "language_loss": 0.83823925, + "learning_rate": 0.000482557170595462, + "loss": 0.84912133, + "num_input_tokens_seen": 227542192, + "router_z_loss_mlp": 0.37304688, + "step": 2733, + "time_per_iteration": 2.881659746170044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092705, + "balance_loss_mlp": 1.05443931, + "epoch": 0.525971527510581, + "flos": 483375679488.0, + "grad_norm": 0.04826672636793544, + "language_loss": 0.87744856, + "learning_rate": 0.0004822458213242475, + "loss": 0.88837564, + "num_input_tokens_seen": 227606096, + "router_z_loss_mlp": 0.38232422, + "step": 2734, + "time_per_iteration": 2.5599043369293213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090689, + "balance_loss_mlp": 1.05270863, + "epoch": 0.5261639091958445, + "flos": 829559715840.0, + "grad_norm": 0.055467035242162094, + "language_loss": 0.85945731, + "learning_rate": 0.00048193447894581627, + "loss": 0.87036419, + "num_input_tokens_seen": 227689552, + "router_z_loss_mlp": 0.37988281, + "step": 2735, + "time_per_iteration": 3.1253552436828613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102448, + "balance_loss_mlp": 1.06258464, + "epoch": 0.5263562908811081, + "flos": 520461739008.0, + "grad_norm": 0.05936611315903256, + "language_loss": 0.87591684, + "learning_rate": 0.00048162314358104243, + "loss": 0.88694137, + "num_input_tokens_seen": 227760784, + "router_z_loss_mlp": 0.39868164, + "step": 2736, + "time_per_iteration": 2.5996334552764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094957, + "balance_loss_mlp": 1.05704832, + "epoch": 0.5265486725663717, + "flos": 574722404352.0, + "grad_norm": 0.047689297469847035, + "language_loss": 0.82871807, + "learning_rate": 0.0004813118153507969, + "loss": 0.83966762, + "num_input_tokens_seen": 227834304, + "router_z_loss_mlp": 0.37890625, + "step": 2737, + "time_per_iteration": 2.7455976009368896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058466, + "balance_loss_mlp": 1.04540098, + "epoch": 0.5267410542516352, + "flos": 1546439537664.0, + "grad_norm": 0.021507379855054985, + "language_loss": 0.82447124, + "learning_rate": 0.0004810004943759482, + "loss": 0.83505595, + "num_input_tokens_seen": 228057232, + "router_z_loss_mlp": 0.13085938, + "step": 2738, + "time_per_iteration": 4.774937629699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110016, + "balance_loss_mlp": 1.06184578, + "epoch": 0.5269334359368988, + "flos": 929576701440.0, + "grad_norm": 0.045277698895202834, + "language_loss": 0.83199632, + "learning_rate": 0.00048068918077736163, + "loss": 0.84299791, + "num_input_tokens_seen": 228140816, + "router_z_loss_mlp": 0.38305664, + "step": 2739, + "time_per_iteration": 3.253458261489868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102121, + "balance_loss_mlp": 1.06256771, + "epoch": 0.5271258176221624, + "flos": 655079629824.0, + "grad_norm": 0.05720476143842487, + "language_loss": 0.81167477, + "learning_rate": 0.0004803778746759001, + "loss": 0.82269597, + "num_input_tokens_seen": 228216208, + "router_z_loss_mlp": 0.39526367, + "step": 2740, + "time_per_iteration": 2.890253782272339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095422, + "balance_loss_mlp": 1.05777621, + "epoch": 0.527318199307426, + "flos": 542781181440.0, + "grad_norm": 0.064499445698322, + "language_loss": 0.81573081, + "learning_rate": 0.00048006657619242317, + "loss": 0.82668501, + "num_input_tokens_seen": 228283184, + "router_z_loss_mlp": 0.37646484, + "step": 2741, + "time_per_iteration": 2.696274518966675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104347, + "balance_loss_mlp": 1.06419694, + "epoch": 0.5275105809926895, + "flos": 447629635584.0, + "grad_norm": 0.05845576302131632, + "language_loss": 0.78272831, + "learning_rate": 0.00047975528544778775, + "loss": 0.79377174, + "num_input_tokens_seen": 228351328, + "router_z_loss_mlp": 0.40112305, + "step": 2742, + "time_per_iteration": 2.6140294075012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094189, + "balance_loss_mlp": 1.05508804, + "epoch": 0.527702962677953, + "flos": 578656673280.0, + "grad_norm": 0.058395918180573554, + "language_loss": 0.88265073, + "learning_rate": 0.00047944400256284754, + "loss": 0.89359266, + "num_input_tokens_seen": 228423632, + "router_z_loss_mlp": 0.39086914, + "step": 2743, + "time_per_iteration": 2.7063393592834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097827, + "balance_loss_mlp": 1.0614922, + "epoch": 0.5278953443632166, + "flos": 652465027584.0, + "grad_norm": 0.07282412653967131, + "language_loss": 0.79796684, + "learning_rate": 0.0004791327276584532, + "loss": 0.80894512, + "num_input_tokens_seen": 228498736, + "router_z_loss_mlp": 0.36352539, + "step": 2744, + "time_per_iteration": 2.8260412216186523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109844, + "balance_loss_mlp": 1.06031692, + "epoch": 0.5280877260484802, + "flos": 513741159936.0, + "grad_norm": 0.04991281876590649, + "language_loss": 0.80703586, + "learning_rate": 0.00047882146085545264, + "loss": 0.81802028, + "num_input_tokens_seen": 228569056, + "router_z_loss_mlp": 0.38061523, + "step": 2745, + "time_per_iteration": 2.6051464080810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018989, + "balance_loss_mlp": 1.00611436, + "epoch": 0.5282801077337438, + "flos": 1444650370560.0, + "grad_norm": 0.010819489631099216, + "language_loss": 0.75402379, + "learning_rate": 0.00047851020227469, + "loss": 0.76421368, + "num_input_tokens_seen": 228800560, + "router_z_loss_mlp": 0.12890625, + "step": 2746, + "time_per_iteration": 4.9944517612457275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084488, + "balance_loss_mlp": 1.0470562, + "epoch": 0.5284724894190073, + "flos": 604580115456.0, + "grad_norm": 0.058273426421755106, + "language_loss": 0.79290295, + "learning_rate": 0.00047819895203700684, + "loss": 0.80374789, + "num_input_tokens_seen": 228869216, + "router_z_loss_mlp": 0.37451172, + "step": 2747, + "time_per_iteration": 2.728018045425415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016603, + "balance_loss_mlp": 1.00410998, + "epoch": 0.5286648711042709, + "flos": 1494172224000.0, + "grad_norm": 0.012264329558562137, + "language_loss": 0.75512433, + "learning_rate": 0.0004778877102632412, + "loss": 0.76529038, + "num_input_tokens_seen": 229085520, + "router_z_loss_mlp": 0.125, + "step": 2748, + "time_per_iteration": 4.659038782119751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077352, + "balance_loss_mlp": 1.03860867, + "epoch": 0.5288572527895344, + "flos": 597313889280.0, + "grad_norm": 0.056212558578819974, + "language_loss": 0.88259304, + "learning_rate": 0.0004775764770742277, + "loss": 0.89336658, + "num_input_tokens_seen": 229160912, + "router_z_loss_mlp": 0.38720703, + "step": 2749, + "time_per_iteration": 2.845102548599243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086383, + "balance_loss_mlp": 1.04699659, + "epoch": 0.529049634474798, + "flos": 557041439232.0, + "grad_norm": 0.05924821658857843, + "language_loss": 0.86565638, + "learning_rate": 0.00047726525259079777, + "loss": 0.87652022, + "num_input_tokens_seen": 229235792, + "router_z_loss_mlp": 0.39404297, + "step": 2750, + "time_per_iteration": 2.773296356201172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085746, + "balance_loss_mlp": 1.04793251, + "epoch": 0.5292420161600616, + "flos": 580986086400.0, + "grad_norm": 0.05670035904014211, + "language_loss": 0.885436, + "learning_rate": 0.0004769540369337798, + "loss": 0.89629346, + "num_input_tokens_seen": 229309984, + "router_z_loss_mlp": 0.37792969, + "step": 2751, + "time_per_iteration": 2.715921401977539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084379, + "balance_loss_mlp": 1.04563594, + "epoch": 0.5294343978453251, + "flos": 607989086208.0, + "grad_norm": 0.05448198338431079, + "language_loss": 0.86051679, + "learning_rate": 0.00047664283022399794, + "loss": 0.87136054, + "num_input_tokens_seen": 229394000, + "router_z_loss_mlp": 0.38720703, + "step": 2752, + "time_per_iteration": 2.8683502674102783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078933, + "balance_loss_mlp": 1.04078627, + "epoch": 0.5296267795305887, + "flos": 646226076672.0, + "grad_norm": 0.05827570747642561, + "language_loss": 0.81129229, + "learning_rate": 0.00047633163258227376, + "loss": 0.82208163, + "num_input_tokens_seen": 229474320, + "router_z_loss_mlp": 0.38110352, + "step": 2753, + "time_per_iteration": 2.8427987098693848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084727, + "balance_loss_mlp": 1.04595971, + "epoch": 0.5298191612158523, + "flos": 559482923520.0, + "grad_norm": 0.14342502720880523, + "language_loss": 0.85232151, + "learning_rate": 0.0004760204441294247, + "loss": 0.86316884, + "num_input_tokens_seen": 229543072, + "router_z_loss_mlp": 0.38745117, + "step": 2754, + "time_per_iteration": 2.644049882888794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088138, + "balance_loss_mlp": 1.05096865, + "epoch": 0.5300115429011159, + "flos": 513776065536.0, + "grad_norm": 0.052931776937271004, + "language_loss": 0.86139393, + "learning_rate": 0.00047570926498626486, + "loss": 0.87227535, + "num_input_tokens_seen": 229615296, + "router_z_loss_mlp": 0.37133789, + "step": 2755, + "time_per_iteration": 2.6872901916503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092723, + "balance_loss_mlp": 1.05402756, + "epoch": 0.5302039245863793, + "flos": 672475405824.0, + "grad_norm": 0.0470441247054563, + "language_loss": 0.81654894, + "learning_rate": 0.00047539809527360474, + "loss": 0.82747614, + "num_input_tokens_seen": 229693728, + "router_z_loss_mlp": 0.38696289, + "step": 2756, + "time_per_iteration": 2.865553379058838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093267, + "balance_loss_mlp": 1.05488133, + "epoch": 0.5303963062716429, + "flos": 730507396608.0, + "grad_norm": 0.04188022637432273, + "language_loss": 0.82037127, + "learning_rate": 0.0004750869351122511, + "loss": 0.83130395, + "num_input_tokens_seen": 229772144, + "router_z_loss_mlp": 0.38330078, + "step": 2757, + "time_per_iteration": 2.9792187213897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093906, + "balance_loss_mlp": 1.0563792, + "epoch": 0.5305886879569065, + "flos": 573156836352.0, + "grad_norm": 0.0631181134246054, + "language_loss": 0.81604397, + "learning_rate": 0.00047477578462300685, + "loss": 0.82698298, + "num_input_tokens_seen": 229847024, + "router_z_loss_mlp": 0.37524414, + "step": 2758, + "time_per_iteration": 2.6986684799194336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093632, + "balance_loss_mlp": 1.05553293, + "epoch": 0.5307810696421701, + "flos": 694988315136.0, + "grad_norm": 0.050985358767642326, + "language_loss": 0.79166949, + "learning_rate": 0.0004744646439266718, + "loss": 0.80260581, + "num_input_tokens_seen": 229932416, + "router_z_loss_mlp": 0.38085938, + "step": 2759, + "time_per_iteration": 2.978621006011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091884, + "balance_loss_mlp": 1.05342746, + "epoch": 0.5309734513274337, + "flos": 648629683200.0, + "grad_norm": 0.042424952199748935, + "language_loss": 0.92400765, + "learning_rate": 0.000474153513144041, + "loss": 0.93492657, + "num_input_tokens_seen": 230010976, + "router_z_loss_mlp": 0.38427734, + "step": 2760, + "time_per_iteration": 2.8996803760528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109538, + "balance_loss_mlp": 1.05534935, + "epoch": 0.5311658330126972, + "flos": 604517506560.0, + "grad_norm": 0.048779343359875056, + "language_loss": 0.86932075, + "learning_rate": 0.00047384239239590633, + "loss": 0.88027459, + "num_input_tokens_seen": 230093344, + "router_z_loss_mlp": 0.39990234, + "step": 2761, + "time_per_iteration": 2.8649730682373047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090925, + "balance_loss_mlp": 1.05342138, + "epoch": 0.5313582146979607, + "flos": 557995931136.0, + "grad_norm": 0.062125162710189655, + "language_loss": 0.88300002, + "learning_rate": 0.0004735312818030556, + "loss": 0.89390922, + "num_input_tokens_seen": 230165520, + "router_z_loss_mlp": 0.37475586, + "step": 2762, + "time_per_iteration": 2.664534091949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108596, + "balance_loss_mlp": 1.04776537, + "epoch": 0.5315505963832243, + "flos": 508152572928.0, + "grad_norm": 0.04725442501000759, + "language_loss": 0.82514352, + "learning_rate": 0.0004732201814862727, + "loss": 0.83600307, + "num_input_tokens_seen": 230237808, + "router_z_loss_mlp": 0.38183594, + "step": 2763, + "time_per_iteration": 2.7150583267211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100901, + "balance_loss_mlp": 1.06113279, + "epoch": 0.5317429780684879, + "flos": 626132740608.0, + "grad_norm": 0.050347986684343975, + "language_loss": 0.81810606, + "learning_rate": 0.0004729090915663373, + "loss": 0.82911509, + "num_input_tokens_seen": 230321568, + "router_z_loss_mlp": 0.39746094, + "step": 2764, + "time_per_iteration": 2.837186336517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093533, + "balance_loss_mlp": 1.05509973, + "epoch": 0.5319353597537514, + "flos": 476506713600.0, + "grad_norm": 0.06358705333883939, + "language_loss": 0.85396516, + "learning_rate": 0.00047259801216402534, + "loss": 0.86490047, + "num_input_tokens_seen": 230385376, + "router_z_loss_mlp": 0.38427734, + "step": 2765, + "time_per_iteration": 2.5005743503570557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094119, + "balance_loss_mlp": 1.05592442, + "epoch": 0.532127741439015, + "flos": 501386913792.0, + "grad_norm": 0.06543180937467778, + "language_loss": 0.8612839, + "learning_rate": 0.00047228694340010845, + "loss": 0.87222505, + "num_input_tokens_seen": 230449760, + "router_z_loss_mlp": 0.38183594, + "step": 2766, + "time_per_iteration": 2.549018144607544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096891, + "balance_loss_mlp": 1.0578146, + "epoch": 0.5323201231242786, + "flos": 1164114063360.0, + "grad_norm": 0.04837235133211893, + "language_loss": 0.85614288, + "learning_rate": 0.0004719758853953544, + "loss": 0.8671118, + "num_input_tokens_seen": 230536592, + "router_z_loss_mlp": 0.390625, + "step": 2767, + "time_per_iteration": 3.568779468536377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095476, + "balance_loss_mlp": 1.05709052, + "epoch": 0.5325125048095422, + "flos": 378493254144.0, + "grad_norm": 0.06740098585195309, + "language_loss": 0.84098738, + "learning_rate": 0.00047166483827052645, + "loss": 0.85194218, + "num_input_tokens_seen": 230596688, + "router_z_loss_mlp": 0.38354492, + "step": 2768, + "time_per_iteration": 2.4389522075653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01031004, + "balance_loss_mlp": 1.01784337, + "epoch": 0.5327048864948057, + "flos": 1540507534848.0, + "grad_norm": 0.01937833439113787, + "language_loss": 0.77078491, + "learning_rate": 0.00047135380214638413, + "loss": 0.78109497, + "num_input_tokens_seen": 230829408, + "router_z_loss_mlp": 0.13183594, + "step": 2769, + "time_per_iteration": 4.967049837112427 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093166, + "balance_loss_mlp": 1.05320704, + "epoch": 0.5328972681800692, + "flos": 910877225472.0, + "grad_norm": 0.052506511923680964, + "language_loss": 0.83564013, + "learning_rate": 0.000471042777143682, + "loss": 0.8465718, + "num_input_tokens_seen": 230912528, + "router_z_loss_mlp": 0.3996582, + "step": 2770, + "time_per_iteration": 3.2065277099609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083074, + "balance_loss_mlp": 1.04530883, + "epoch": 0.5330896498653328, + "flos": 473660766720.0, + "grad_norm": 0.0519747156636442, + "language_loss": 0.79680347, + "learning_rate": 0.0004707317633831707, + "loss": 0.80763417, + "num_input_tokens_seen": 230979424, + "router_z_loss_mlp": 0.37744141, + "step": 2771, + "time_per_iteration": 2.5498273372650146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091325, + "balance_loss_mlp": 1.05408382, + "epoch": 0.5332820315505964, + "flos": 501386913792.0, + "grad_norm": 0.05598064533442757, + "language_loss": 0.77608013, + "learning_rate": 0.00047042076098559673, + "loss": 0.78699338, + "num_input_tokens_seen": 231046416, + "router_z_loss_mlp": 0.37231445, + "step": 2772, + "time_per_iteration": 2.5759775638580322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091384, + "balance_loss_mlp": 1.05323732, + "epoch": 0.53347441323586, + "flos": 924043368960.0, + "grad_norm": 0.060675625301583505, + "language_loss": 0.73884845, + "learning_rate": 0.00047010977007170174, + "loss": 0.7497623, + "num_input_tokens_seen": 231136064, + "router_z_loss_mlp": 0.38110352, + "step": 2773, + "time_per_iteration": 3.257833957672119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089952, + "balance_loss_mlp": 1.05099463, + "epoch": 0.5336667949211235, + "flos": 574185521664.0, + "grad_norm": 0.06246333407972351, + "language_loss": 0.82451814, + "learning_rate": 0.00046979879076222334, + "loss": 0.83541769, + "num_input_tokens_seen": 231203616, + "router_z_loss_mlp": 0.38916016, + "step": 2774, + "time_per_iteration": 2.6394476890563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091431, + "balance_loss_mlp": 1.05306923, + "epoch": 0.533859176606387, + "flos": 1064233880064.0, + "grad_norm": 0.044878758318980805, + "language_loss": 0.85063684, + "learning_rate": 0.0004694878231778939, + "loss": 0.86155117, + "num_input_tokens_seen": 231287008, + "router_z_loss_mlp": 0.38330078, + "step": 2775, + "time_per_iteration": 3.3668456077575684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085244, + "balance_loss_mlp": 1.04695392, + "epoch": 0.5340515582916506, + "flos": 746277967872.0, + "grad_norm": 0.04760082973405309, + "language_loss": 0.84270054, + "learning_rate": 0.0004691768674394423, + "loss": 0.85355294, + "num_input_tokens_seen": 231365296, + "router_z_loss_mlp": 0.38305664, + "step": 2776, + "time_per_iteration": 2.9580860137939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038557, + "balance_loss_mlp": 1.02644587, + "epoch": 0.5342439399769142, + "flos": 1444905036288.0, + "grad_norm": 0.01780260433895519, + "language_loss": 0.84484011, + "learning_rate": 0.0004688659236675918, + "loss": 0.85522568, + "num_input_tokens_seen": 231579040, + "router_z_loss_mlp": 0.12109375, + "step": 2777, + "time_per_iteration": 4.798782825469971 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036301, + "balance_loss_mlp": 1.02423704, + "epoch": 0.5344363216621778, + "flos": 1426790495232.0, + "grad_norm": 0.016806659478265918, + "language_loss": 0.76653534, + "learning_rate": 0.00046855499198306187, + "loss": 0.77689832, + "num_input_tokens_seen": 231812736, + "router_z_loss_mlp": 0.12060547, + "step": 2778, + "time_per_iteration": 4.971946477890015 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083696, + "balance_loss_mlp": 1.04650259, + "epoch": 0.5346287033474413, + "flos": 527355436032.0, + "grad_norm": 0.27028176168378437, + "language_loss": 0.79060376, + "learning_rate": 0.00046824407250656676, + "loss": 0.80144072, + "num_input_tokens_seen": 231883840, + "router_z_loss_mlp": 0.37158203, + "step": 2779, + "time_per_iteration": 2.639554738998413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083286, + "balance_loss_mlp": 1.04528189, + "epoch": 0.5348210850327049, + "flos": 510515481600.0, + "grad_norm": 0.04912000707376091, + "language_loss": 0.83288354, + "learning_rate": 0.0004679331653588161, + "loss": 0.84371638, + "num_input_tokens_seen": 231955360, + "router_z_loss_mlp": 0.37988281, + "step": 2780, + "time_per_iteration": 2.590897560119629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082747, + "balance_loss_mlp": 1.04388487, + "epoch": 0.5350134667179685, + "flos": 462429748224.0, + "grad_norm": 0.07636572739089499, + "language_loss": 0.8547262, + "learning_rate": 0.0004676222706605147, + "loss": 0.86555368, + "num_input_tokens_seen": 232027088, + "router_z_loss_mlp": 0.38867188, + "step": 2781, + "time_per_iteration": 2.606795310974121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088005, + "balance_loss_mlp": 1.04647303, + "epoch": 0.535205848403232, + "flos": 708566275584.0, + "grad_norm": 0.05667741573580048, + "language_loss": 0.84751678, + "learning_rate": 0.0004673113885323626, + "loss": 0.85839683, + "num_input_tokens_seen": 232099472, + "router_z_loss_mlp": 0.4152832, + "step": 2782, + "time_per_iteration": 2.813957691192627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084509, + "balance_loss_mlp": 1.04507411, + "epoch": 0.5353982300884956, + "flos": 893855388672.0, + "grad_norm": 0.04933634097838137, + "language_loss": 0.78395712, + "learning_rate": 0.00046700051909505494, + "loss": 0.79480219, + "num_input_tokens_seen": 232182528, + "router_z_loss_mlp": 0.39404297, + "step": 2783, + "time_per_iteration": 3.151244878768921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089347, + "balance_loss_mlp": 1.0476948, + "epoch": 0.5355906117737591, + "flos": 535701219840.0, + "grad_norm": 0.06378381527079717, + "language_loss": 0.83984947, + "learning_rate": 0.000466689662469282, + "loss": 0.85074294, + "num_input_tokens_seen": 232253344, + "router_z_loss_mlp": 0.41650391, + "step": 2784, + "time_per_iteration": 2.6275248527526855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081581, + "balance_loss_mlp": 1.04159856, + "epoch": 0.5357829934590227, + "flos": 868477593600.0, + "grad_norm": 0.05202541270375375, + "language_loss": 0.83895493, + "learning_rate": 0.00046637881877572917, + "loss": 0.84977078, + "num_input_tokens_seen": 232337232, + "router_z_loss_mlp": 0.3996582, + "step": 2785, + "time_per_iteration": 3.069645404815674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085059, + "balance_loss_mlp": 1.04481411, + "epoch": 0.5359753751442863, + "flos": 552999481344.0, + "grad_norm": 0.08844651025983005, + "language_loss": 0.8452431, + "learning_rate": 0.0004660679881350764, + "loss": 0.85609365, + "num_input_tokens_seen": 232412864, + "router_z_loss_mlp": 0.40234375, + "step": 2786, + "time_per_iteration": 2.7307839393615723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059616, + "balance_loss_mlp": 1.04531133, + "epoch": 0.5361677568295499, + "flos": 1479687823872.0, + "grad_norm": 0.02226240505672553, + "language_loss": 0.75608146, + "learning_rate": 0.0004657571706679988, + "loss": 0.76667762, + "num_input_tokens_seen": 232639888, + "router_z_loss_mlp": 0.14257812, + "step": 2787, + "time_per_iteration": 5.010236740112305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083366, + "balance_loss_mlp": 1.04352605, + "epoch": 0.5363601385148133, + "flos": 805910432256.0, + "grad_norm": 0.0562451411020875, + "language_loss": 0.78052628, + "learning_rate": 0.0004654463664951667, + "loss": 0.79135996, + "num_input_tokens_seen": 232719248, + "router_z_loss_mlp": 0.3984375, + "step": 2788, + "time_per_iteration": 2.9822394847869873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090776, + "balance_loss_mlp": 1.05076993, + "epoch": 0.5365525202000769, + "flos": 507630246912.0, + "grad_norm": 0.05204597911301594, + "language_loss": 0.82849109, + "learning_rate": 0.0004651355757372447, + "loss": 0.83939886, + "num_input_tokens_seen": 232788464, + "router_z_loss_mlp": 0.39990234, + "step": 2789, + "time_per_iteration": 2.615691900253296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089332, + "balance_loss_mlp": 1.04937315, + "epoch": 0.5367449018853405, + "flos": 528660546048.0, + "grad_norm": 0.0871364316310779, + "language_loss": 0.854258, + "learning_rate": 0.00046482479851489274, + "loss": 0.86515129, + "num_input_tokens_seen": 232859792, + "router_z_loss_mlp": 0.39941406, + "step": 2790, + "time_per_iteration": 2.7088706493377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089329, + "balance_loss_mlp": 1.04853582, + "epoch": 0.5369372835706041, + "flos": 649614698496.0, + "grad_norm": 0.059769288934836705, + "language_loss": 0.78002077, + "learning_rate": 0.00046451403494876525, + "loss": 0.79091412, + "num_input_tokens_seen": 232941472, + "router_z_loss_mlp": 0.40795898, + "step": 2791, + "time_per_iteration": 2.8624680042266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082264, + "balance_loss_mlp": 1.04254341, + "epoch": 0.5371296652558677, + "flos": 584205972480.0, + "grad_norm": 0.05423678017273499, + "language_loss": 0.84187895, + "learning_rate": 0.0004642032851595111, + "loss": 0.8527016, + "num_input_tokens_seen": 233017120, + "router_z_loss_mlp": 0.3972168, + "step": 2792, + "time_per_iteration": 2.7222046852111816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090422, + "balance_loss_mlp": 1.04877055, + "epoch": 0.5373220469411312, + "flos": 595570821120.0, + "grad_norm": 0.05596231110481221, + "language_loss": 0.84764576, + "learning_rate": 0.00046389254926777404, + "loss": 0.85855001, + "num_input_tokens_seen": 233095408, + "router_z_loss_mlp": 0.41674805, + "step": 2793, + "time_per_iteration": 2.8049495220184326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083638, + "balance_loss_mlp": 1.04286838, + "epoch": 0.5375144286263948, + "flos": 1113965167104.0, + "grad_norm": 0.05603938595076487, + "language_loss": 0.78227508, + "learning_rate": 0.0004635818273941926, + "loss": 0.79311144, + "num_input_tokens_seen": 233191056, + "router_z_loss_mlp": 0.4074707, + "step": 2794, + "time_per_iteration": 3.506617307662964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085514, + "balance_loss_mlp": 1.04495919, + "epoch": 0.5377068103116583, + "flos": 595319127552.0, + "grad_norm": 0.07610950885477011, + "language_loss": 0.81443048, + "learning_rate": 0.0004632711196593997, + "loss": 0.82528561, + "num_input_tokens_seen": 233265536, + "router_z_loss_mlp": 0.40527344, + "step": 2795, + "time_per_iteration": 2.7142324447631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083447, + "balance_loss_mlp": 1.04377437, + "epoch": 0.5378991919969219, + "flos": 883839320064.0, + "grad_norm": 0.061986224183990205, + "language_loss": 0.85229117, + "learning_rate": 0.00046296042618402297, + "loss": 0.86312562, + "num_input_tokens_seen": 233348224, + "router_z_loss_mlp": 0.39697266, + "step": 2796, + "time_per_iteration": 3.0699656009674072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077763, + "balance_loss_mlp": 1.03801823, + "epoch": 0.5380915736821854, + "flos": 710344249344.0, + "grad_norm": 0.04828732184108336, + "language_loss": 0.792054, + "learning_rate": 0.0004626497470886839, + "loss": 0.80283165, + "num_input_tokens_seen": 233429344, + "router_z_loss_mlp": 0.39746094, + "step": 2797, + "time_per_iteration": 2.9337801933288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085503, + "balance_loss_mlp": 1.04444742, + "epoch": 0.538283955367449, + "flos": 556721344512.0, + "grad_norm": 0.04667541599746409, + "language_loss": 0.8208226, + "learning_rate": 0.00046233908249399897, + "loss": 0.83167768, + "num_input_tokens_seen": 233504944, + "router_z_loss_mlp": 0.41040039, + "step": 2798, + "time_per_iteration": 2.736253023147583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086245, + "balance_loss_mlp": 1.04585731, + "epoch": 0.5384763370527126, + "flos": 513218833920.0, + "grad_norm": 0.05904964511977083, + "language_loss": 0.78162259, + "learning_rate": 0.00046202843252057905, + "loss": 0.79248506, + "num_input_tokens_seen": 233573072, + "router_z_loss_mlp": 0.40380859, + "step": 2799, + "time_per_iteration": 2.5839316844940186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085466, + "balance_loss_mlp": 1.04503012, + "epoch": 0.5386687187379762, + "flos": 489490974720.0, + "grad_norm": 0.06428119470797507, + "language_loss": 0.83220208, + "learning_rate": 0.00046171779728902896, + "loss": 0.8430568, + "num_input_tokens_seen": 233640896, + "router_z_loss_mlp": 0.40405273, + "step": 2800, + "time_per_iteration": 2.6141908168792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087168, + "balance_loss_mlp": 1.04801977, + "epoch": 0.5388611004232398, + "flos": 482415395328.0, + "grad_norm": 0.12344174959648258, + "language_loss": 0.86207569, + "learning_rate": 0.000461407176919948, + "loss": 0.87294734, + "num_input_tokens_seen": 233703904, + "router_z_loss_mlp": 0.39111328, + "step": 2801, + "time_per_iteration": 2.503673791885376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108035, + "balance_loss_mlp": 1.04158366, + "epoch": 0.5390534821085032, + "flos": 560709457920.0, + "grad_norm": 0.05013064620145656, + "language_loss": 0.85174656, + "learning_rate": 0.00046109657153392997, + "loss": 0.86255008, + "num_input_tokens_seen": 233779248, + "router_z_loss_mlp": 0.38720703, + "step": 2802, + "time_per_iteration": 2.6549510955810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086897, + "balance_loss_mlp": 1.04624677, + "epoch": 0.5392458637937668, + "flos": 488132020224.0, + "grad_norm": 0.05351248634305854, + "language_loss": 0.82771289, + "learning_rate": 0.0004607859812515622, + "loss": 0.8385818, + "num_input_tokens_seen": 233847520, + "router_z_loss_mlp": 0.40649414, + "step": 2803, + "time_per_iteration": 2.592742681503296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085335, + "balance_loss_mlp": 1.0456624, + "epoch": 0.5394382454790304, + "flos": 511810417152.0, + "grad_norm": 0.06156300752407298, + "language_loss": 0.87926197, + "learning_rate": 0.00046047540619342667, + "loss": 0.89011538, + "num_input_tokens_seen": 233911328, + "router_z_loss_mlp": 0.39648438, + "step": 2804, + "time_per_iteration": 2.566542863845825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108144, + "balance_loss_mlp": 1.04343605, + "epoch": 0.539630627164294, + "flos": 567312173568.0, + "grad_norm": 0.04852529488921132, + "language_loss": 0.7995888, + "learning_rate": 0.00046016484648009933, + "loss": 0.81040317, + "num_input_tokens_seen": 233987104, + "router_z_loss_mlp": 0.38012695, + "step": 2805, + "time_per_iteration": 2.693988561630249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108233, + "balance_loss_mlp": 1.04415882, + "epoch": 0.5398230088495575, + "flos": 526203095040.0, + "grad_norm": 0.058780411040176145, + "language_loss": 0.8077246, + "learning_rate": 0.0004598543022321501, + "loss": 0.81854796, + "num_input_tokens_seen": 234057216, + "router_z_loss_mlp": 0.38134766, + "step": 2806, + "time_per_iteration": 2.635873317718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079563, + "balance_loss_mlp": 1.0410111, + "epoch": 0.5400153905348211, + "flos": 538493322240.0, + "grad_norm": 0.05389643439716648, + "language_loss": 0.7979452, + "learning_rate": 0.0004595437735701433, + "loss": 0.80874085, + "num_input_tokens_seen": 234129984, + "router_z_loss_mlp": 0.38500977, + "step": 2807, + "time_per_iteration": 2.671004056930542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082107, + "balance_loss_mlp": 1.04252934, + "epoch": 0.5402077722200846, + "flos": 513259531776.0, + "grad_norm": 0.056977099557855106, + "language_loss": 0.83333278, + "learning_rate": 0.00045923326061463623, + "loss": 0.84415388, + "num_input_tokens_seen": 234203920, + "router_z_loss_mlp": 0.39575195, + "step": 2808, + "time_per_iteration": 2.748844861984253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108444, + "balance_loss_mlp": 1.04519629, + "epoch": 0.5404001539053482, + "flos": 675932428800.0, + "grad_norm": 0.053531678156081904, + "language_loss": 0.81448805, + "learning_rate": 0.00045892276348618113, + "loss": 0.82533252, + "num_input_tokens_seen": 234285440, + "router_z_loss_mlp": 0.39208984, + "step": 2809, + "time_per_iteration": 2.9712717533111572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01034069, + "balance_loss_mlp": 1.02195704, + "epoch": 0.5405925355906118, + "flos": 1553998155264.0, + "grad_norm": 0.02221665300745606, + "language_loss": 0.78260827, + "learning_rate": 0.0004586122823053235, + "loss": 0.79294896, + "num_input_tokens_seen": 234521424, + "router_z_loss_mlp": 0.12109375, + "step": 2810, + "time_per_iteration": 4.987140893936157 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085482, + "balance_loss_mlp": 1.04697728, + "epoch": 0.5407849172758753, + "flos": 647004478464.0, + "grad_norm": 0.050822756134718025, + "language_loss": 0.80942833, + "learning_rate": 0.000458301817192603, + "loss": 0.82028317, + "num_input_tokens_seen": 234601632, + "router_z_loss_mlp": 0.38500977, + "step": 2811, + "time_per_iteration": 2.826511859893799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01028161, + "balance_loss_mlp": 1.01576281, + "epoch": 0.5409772989611389, + "flos": 1406641904640.0, + "grad_norm": 0.017319914930323605, + "language_loss": 0.8084178, + "learning_rate": 0.00045799136826855263, + "loss": 0.81869948, + "num_input_tokens_seen": 234825776, + "router_z_loss_mlp": 0.12353516, + "step": 2812, + "time_per_iteration": 4.797938346862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083094, + "balance_loss_mlp": 1.04525733, + "epoch": 0.5411696806464025, + "flos": 554102360064.0, + "grad_norm": 0.08517188397837483, + "language_loss": 0.87214613, + "learning_rate": 0.00045768093565369983, + "loss": 0.88297707, + "num_input_tokens_seen": 234901504, + "router_z_loss_mlp": 0.37817383, + "step": 2813, + "time_per_iteration": 2.716890811920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082803, + "balance_loss_mlp": 1.04441762, + "epoch": 0.5413620623316661, + "flos": 527853030912.0, + "grad_norm": 0.05234072905155942, + "language_loss": 0.81825578, + "learning_rate": 0.0004573705194685646, + "loss": 0.8290838, + "num_input_tokens_seen": 234970288, + "router_z_loss_mlp": 0.38330078, + "step": 2814, + "time_per_iteration": 2.6517584323883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082477, + "balance_loss_mlp": 1.04380536, + "epoch": 0.5415544440169295, + "flos": 598464820224.0, + "grad_norm": 0.054888895455983605, + "language_loss": 0.84797984, + "learning_rate": 0.00045706011983366157, + "loss": 0.85880458, + "num_input_tokens_seen": 235039984, + "router_z_loss_mlp": 0.38623047, + "step": 2815, + "time_per_iteration": 2.670135974884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088358, + "balance_loss_mlp": 1.050807, + "epoch": 0.5417468257021931, + "flos": 470519456256.0, + "grad_norm": 0.06349065912195655, + "language_loss": 0.82603323, + "learning_rate": 0.00045674973686949847, + "loss": 0.8369168, + "num_input_tokens_seen": 235105232, + "router_z_loss_mlp": 0.37524414, + "step": 2816, + "time_per_iteration": 2.51487398147583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085537, + "balance_loss_mlp": 1.04710388, + "epoch": 0.5419392073874567, + "flos": 680477773824.0, + "grad_norm": 0.04802331030108417, + "language_loss": 0.85519576, + "learning_rate": 0.0004564393706965766, + "loss": 0.86605108, + "num_input_tokens_seen": 235192560, + "router_z_loss_mlp": 0.3840332, + "step": 2817, + "time_per_iteration": 2.9650819301605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088505, + "balance_loss_mlp": 1.05031061, + "epoch": 0.5421315890727203, + "flos": 462134384640.0, + "grad_norm": 0.11431790588446349, + "language_loss": 0.81361973, + "learning_rate": 0.00045612902143539116, + "loss": 0.82450485, + "num_input_tokens_seen": 235258448, + "router_z_loss_mlp": 0.3815918, + "step": 2818, + "time_per_iteration": 2.5874366760253906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083753, + "balance_loss_mlp": 1.04620242, + "epoch": 0.5423239707579839, + "flos": 436727476224.0, + "grad_norm": 0.06287409893753121, + "language_loss": 0.81734043, + "learning_rate": 0.00045581868920642986, + "loss": 0.82817793, + "num_input_tokens_seen": 235322176, + "router_z_loss_mlp": 0.375, + "step": 2819, + "time_per_iteration": 2.4778597354888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085621, + "balance_loss_mlp": 1.04818964, + "epoch": 0.5425163524432474, + "flos": 458067695616.0, + "grad_norm": 0.0556653381868651, + "language_loss": 0.79541689, + "learning_rate": 0.00045550837413017457, + "loss": 0.8062731, + "num_input_tokens_seen": 235390960, + "router_z_loss_mlp": 0.37402344, + "step": 2820, + "time_per_iteration": 2.653878688812256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087265, + "balance_loss_mlp": 1.04873669, + "epoch": 0.542708734128511, + "flos": 419267681280.0, + "grad_norm": 0.047652791336190936, + "language_loss": 0.85203838, + "learning_rate": 0.0004551980763271005, + "loss": 0.86291105, + "num_input_tokens_seen": 235460976, + "router_z_loss_mlp": 0.38500977, + "step": 2821, + "time_per_iteration": 2.6410272121429443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088978, + "balance_loss_mlp": 1.04942417, + "epoch": 0.5429011158137745, + "flos": 678142568448.0, + "grad_norm": 0.047512644994480734, + "language_loss": 0.83545935, + "learning_rate": 0.0004548877959176756, + "loss": 0.84634912, + "num_input_tokens_seen": 235540912, + "router_z_loss_mlp": 0.39550781, + "step": 2822, + "time_per_iteration": 2.8824410438537598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083878, + "balance_loss_mlp": 1.04542077, + "epoch": 0.5430934974990381, + "flos": 540664174080.0, + "grad_norm": 0.05440283794038225, + "language_loss": 0.8588357, + "learning_rate": 0.00045457753302236166, + "loss": 0.86967444, + "num_input_tokens_seen": 235608736, + "router_z_loss_mlp": 0.3840332, + "step": 2823, + "time_per_iteration": 2.665828227996826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078196, + "balance_loss_mlp": 1.04069233, + "epoch": 0.5432858791843016, + "flos": 658175860224.0, + "grad_norm": 0.053164692369765, + "language_loss": 0.86939847, + "learning_rate": 0.00045426728776161353, + "loss": 0.88018048, + "num_input_tokens_seen": 235678720, + "router_z_loss_mlp": 0.37475586, + "step": 2824, + "time_per_iteration": 2.79662823677063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082032, + "balance_loss_mlp": 1.04367089, + "epoch": 0.5434782608695652, + "flos": 531678200832.0, + "grad_norm": 0.051257131946256196, + "language_loss": 0.81339788, + "learning_rate": 0.00045395706025587863, + "loss": 0.82421821, + "num_input_tokens_seen": 235748704, + "router_z_loss_mlp": 0.38330078, + "step": 2825, + "time_per_iteration": 2.612839698791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083107, + "balance_loss_mlp": 1.04298067, + "epoch": 0.5436706425548288, + "flos": 608219020800.0, + "grad_norm": 0.0654215378261843, + "language_loss": 0.8246271, + "learning_rate": 0.00045364685062559843, + "loss": 0.83545816, + "num_input_tokens_seen": 235828224, + "router_z_loss_mlp": 0.40112305, + "step": 2826, + "time_per_iteration": 2.8304717540740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077545, + "balance_loss_mlp": 1.03863502, + "epoch": 0.5438630242400924, + "flos": 705081549312.0, + "grad_norm": 0.05153461088450525, + "language_loss": 0.91323566, + "learning_rate": 0.0004533366589912067, + "loss": 0.92401117, + "num_input_tokens_seen": 235909392, + "router_z_loss_mlp": 0.38891602, + "step": 2827, + "time_per_iteration": 2.9909794330596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083973, + "balance_loss_mlp": 1.04399014, + "epoch": 0.544055405925356, + "flos": 856073885184.0, + "grad_norm": 0.06162926864421369, + "language_loss": 0.77631354, + "learning_rate": 0.0004530264854731306, + "loss": 0.78715324, + "num_input_tokens_seen": 235983888, + "router_z_loss_mlp": 0.3996582, + "step": 2828, + "time_per_iteration": 3.0477852821350098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079886, + "balance_loss_mlp": 1.0402137, + "epoch": 0.5442477876106194, + "flos": 571483579392.0, + "grad_norm": 0.04880017685382554, + "language_loss": 0.83835936, + "learning_rate": 0.00045271633019179034, + "loss": 0.84915829, + "num_input_tokens_seen": 236063056, + "router_z_loss_mlp": 0.39648438, + "step": 2829, + "time_per_iteration": 2.8048830032348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108649, + "balance_loss_mlp": 1.04684114, + "epoch": 0.544440169295883, + "flos": 625246649856.0, + "grad_norm": 0.05731672371216008, + "language_loss": 0.87693858, + "learning_rate": 0.0004524061932675986, + "loss": 0.88780355, + "num_input_tokens_seen": 236141104, + "router_z_loss_mlp": 0.39624023, + "step": 2830, + "time_per_iteration": 2.880328893661499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081175, + "balance_loss_mlp": 1.0420748, + "epoch": 0.5446325509811466, + "flos": 835896181248.0, + "grad_norm": 0.061736377466748704, + "language_loss": 0.8659271, + "learning_rate": 0.00045209607482096125, + "loss": 0.87673885, + "num_input_tokens_seen": 236220320, + "router_z_loss_mlp": 0.390625, + "step": 2831, + "time_per_iteration": 2.9996933937072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080099, + "balance_loss_mlp": 1.04016387, + "epoch": 0.5448249326664102, + "flos": 483129778176.0, + "grad_norm": 0.057163759026562816, + "language_loss": 0.8399148, + "learning_rate": 0.0004517859749722772, + "loss": 0.85071582, + "num_input_tokens_seen": 236288208, + "router_z_loss_mlp": 0.39892578, + "step": 2832, + "time_per_iteration": 2.6431195735931396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085662, + "balance_loss_mlp": 1.04606068, + "epoch": 0.5450173143516738, + "flos": 560799618048.0, + "grad_norm": 0.061436781325619555, + "language_loss": 0.78688192, + "learning_rate": 0.0004514758938419376, + "loss": 0.79773855, + "num_input_tokens_seen": 236366864, + "router_z_loss_mlp": 0.39575195, + "step": 2833, + "time_per_iteration": 2.811894655227661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058831, + "balance_loss_mlp": 1.04280972, + "epoch": 0.5452096960369373, + "flos": 1469632467456.0, + "grad_norm": 0.020133642361800857, + "language_loss": 0.76920587, + "learning_rate": 0.0004511658315503268, + "loss": 0.77979416, + "num_input_tokens_seen": 236597120, + "router_z_loss_mlp": 0.16015625, + "step": 2834, + "time_per_iteration": 4.920469760894775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077415, + "balance_loss_mlp": 1.03798103, + "epoch": 0.5454020777222008, + "flos": 464827562496.0, + "grad_norm": 0.051503170745990534, + "language_loss": 0.83848447, + "learning_rate": 0.00045085578821782175, + "loss": 0.84925866, + "num_input_tokens_seen": 236664192, + "router_z_loss_mlp": 0.39404297, + "step": 2835, + "time_per_iteration": 2.523089647293091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048396, + "balance_loss_mlp": 1.03246999, + "epoch": 0.5455944594074644, + "flos": 1468921056768.0, + "grad_norm": 0.01613355837810212, + "language_loss": 0.76134741, + "learning_rate": 0.0004505457639647917, + "loss": 0.77183139, + "num_input_tokens_seen": 236888784, + "router_z_loss_mlp": 0.15917969, + "step": 2836, + "time_per_iteration": 4.865030288696289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082059, + "balance_loss_mlp": 1.0422194, + "epoch": 0.545786841092728, + "flos": 532900353024.0, + "grad_norm": 0.04532447535161293, + "language_loss": 0.81224561, + "learning_rate": 0.00045023575891159866, + "loss": 0.82306617, + "num_input_tokens_seen": 236962528, + "router_z_loss_mlp": 0.3984375, + "step": 2837, + "time_per_iteration": 2.7024872303009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038306, + "balance_loss_mlp": 1.02285683, + "epoch": 0.5459792227779915, + "flos": 1351633360896.0, + "grad_norm": 0.01633471064412587, + "language_loss": 0.74763811, + "learning_rate": 0.00044992577317859764, + "loss": 0.75802112, + "num_input_tokens_seen": 237179360, + "router_z_loss_mlp": 0.15429688, + "step": 2838, + "time_per_iteration": 4.88713812828064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072439, + "balance_loss_mlp": 1.03436387, + "epoch": 0.5461716044632551, + "flos": 637584929280.0, + "grad_norm": 0.044187924464620755, + "language_loss": 0.77777064, + "learning_rate": 0.0004496158068861354, + "loss": 0.788495, + "num_input_tokens_seen": 237256240, + "router_z_loss_mlp": 0.38037109, + "step": 2839, + "time_per_iteration": 2.7734854221343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083666, + "balance_loss_mlp": 1.04451799, + "epoch": 0.5463639861485187, + "flos": 602458725888.0, + "grad_norm": 0.04916115853202861, + "language_loss": 0.80780178, + "learning_rate": 0.00044930586015455207, + "loss": 0.81863844, + "num_input_tokens_seen": 237334272, + "router_z_loss_mlp": 0.39111328, + "step": 2840, + "time_per_iteration": 2.776756525039673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079059, + "balance_loss_mlp": 1.04105484, + "epoch": 0.5465563678337823, + "flos": 642208849920.0, + "grad_norm": 0.047638532734035705, + "language_loss": 0.89027333, + "learning_rate": 0.000448995933104179, + "loss": 0.90106392, + "num_input_tokens_seen": 237415408, + "router_z_loss_mlp": 0.37939453, + "step": 2841, + "time_per_iteration": 2.835770606994629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084722, + "balance_loss_mlp": 1.04526389, + "epoch": 0.5467487495190458, + "flos": 613852687872.0, + "grad_norm": 0.05241434980763647, + "language_loss": 0.79585081, + "learning_rate": 0.00044868602585534077, + "loss": 0.80669802, + "num_input_tokens_seen": 237493232, + "router_z_loss_mlp": 0.39428711, + "step": 2842, + "time_per_iteration": 2.8165202140808105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081214, + "balance_loss_mlp": 1.04297209, + "epoch": 0.5469411312043093, + "flos": 460957312512.0, + "grad_norm": 0.05377375824052972, + "language_loss": 0.88703167, + "learning_rate": 0.0004483761385283541, + "loss": 0.89784384, + "num_input_tokens_seen": 237556624, + "router_z_loss_mlp": 0.38183594, + "step": 2843, + "time_per_iteration": 2.5191187858581543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085655, + "balance_loss_mlp": 1.04705536, + "epoch": 0.5471335128895729, + "flos": 560930628096.0, + "grad_norm": 0.05339183941738246, + "language_loss": 0.82029176, + "learning_rate": 0.0004480662712435281, + "loss": 0.83114827, + "num_input_tokens_seen": 237632048, + "router_z_loss_mlp": 0.38549805, + "step": 2844, + "time_per_iteration": 2.7347452640533447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084571, + "balance_loss_mlp": 1.046996, + "epoch": 0.5473258945748365, + "flos": 518437863936.0, + "grad_norm": 0.05481278216627967, + "language_loss": 0.88263971, + "learning_rate": 0.0004477564241211635, + "loss": 0.89348543, + "num_input_tokens_seen": 237699840, + "router_z_loss_mlp": 0.37548828, + "step": 2845, + "time_per_iteration": 2.566675901412964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085991, + "balance_loss_mlp": 1.0476774, + "epoch": 0.5475182762601001, + "flos": 433600722432.0, + "grad_norm": 0.05360762168993706, + "language_loss": 0.87165999, + "learning_rate": 0.0004474465972815541, + "loss": 0.88251984, + "num_input_tokens_seen": 237762560, + "router_z_loss_mlp": 0.38256836, + "step": 2846, + "time_per_iteration": 2.458261489868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085193, + "balance_loss_mlp": 1.04754686, + "epoch": 0.5477106579453636, + "flos": 511308440064.0, + "grad_norm": 0.04786363547278841, + "language_loss": 0.87439841, + "learning_rate": 0.000447136790844985, + "loss": 0.88525033, + "num_input_tokens_seen": 237837152, + "router_z_loss_mlp": 0.37646484, + "step": 2847, + "time_per_iteration": 2.667609214782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108262, + "balance_loss_mlp": 1.04547465, + "epoch": 0.5479030396306271, + "flos": 675606541824.0, + "grad_norm": 0.050829406458998395, + "language_loss": 0.80589354, + "learning_rate": 0.00044682700493173385, + "loss": 0.81671977, + "num_input_tokens_seen": 237909488, + "router_z_loss_mlp": 0.37133789, + "step": 2848, + "time_per_iteration": 2.83048677444458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088336, + "balance_loss_mlp": 1.04978406, + "epoch": 0.5480954213158907, + "flos": 875720498688.0, + "grad_norm": 0.057674115143319986, + "language_loss": 0.80473161, + "learning_rate": 0.00044651723966207004, + "loss": 0.81561506, + "num_input_tokens_seen": 237991056, + "router_z_loss_mlp": 0.38500977, + "step": 2849, + "time_per_iteration": 3.1320085525512695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084454, + "balance_loss_mlp": 1.04780865, + "epoch": 0.5482878030011543, + "flos": 621715433472.0, + "grad_norm": 0.04900831188074684, + "language_loss": 0.78059959, + "learning_rate": 0.00044620749515625536, + "loss": 0.79144412, + "num_input_tokens_seen": 238064576, + "router_z_loss_mlp": 0.36669922, + "step": 2850, + "time_per_iteration": 2.784318447113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091667, + "balance_loss_mlp": 1.05404472, + "epoch": 0.5484801846864179, + "flos": 496946285568.0, + "grad_norm": 0.05697086220906577, + "language_loss": 0.84891641, + "learning_rate": 0.00044589777153454334, + "loss": 0.85983306, + "num_input_tokens_seen": 238136464, + "router_z_loss_mlp": 0.37597656, + "step": 2851, + "time_per_iteration": 2.7432825565338135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087741, + "balance_loss_mlp": 1.04973722, + "epoch": 0.5486725663716814, + "flos": 442202582016.0, + "grad_norm": 0.05425914558119235, + "language_loss": 0.83565009, + "learning_rate": 0.00044558806891717895, + "loss": 0.84652746, + "num_input_tokens_seen": 238198912, + "router_z_loss_mlp": 0.37963867, + "step": 2852, + "time_per_iteration": 2.486581563949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093078, + "balance_loss_mlp": 1.05528831, + "epoch": 0.548864948056945, + "flos": 654867224064.0, + "grad_norm": 0.04695408394518552, + "language_loss": 0.79779923, + "learning_rate": 0.0004452783874243998, + "loss": 0.80873001, + "num_input_tokens_seen": 238275184, + "router_z_loss_mlp": 0.37817383, + "step": 2853, + "time_per_iteration": 2.823004722595215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088751, + "balance_loss_mlp": 1.05246305, + "epoch": 0.5490573297422086, + "flos": 545760958464.0, + "grad_norm": 0.06406980317061135, + "language_loss": 0.84579176, + "learning_rate": 0.00044496872717643475, + "loss": 0.85667926, + "num_input_tokens_seen": 238348496, + "router_z_loss_mlp": 0.36279297, + "step": 2854, + "time_per_iteration": 2.6582207679748535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104165, + "balance_loss_mlp": 1.02906144, + "epoch": 0.5492497114274721, + "flos": 1589450245632.0, + "grad_norm": 0.019738925867794382, + "language_loss": 0.77089292, + "learning_rate": 0.00044465908829350453, + "loss": 0.78130943, + "num_input_tokens_seen": 238578464, + "router_z_loss_mlp": 0.12597656, + "step": 2855, + "time_per_iteration": 4.917479991912842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086161, + "balance_loss_mlp": 1.0507319, + "epoch": 0.5494420931127356, + "flos": 750567237120.0, + "grad_norm": 0.05097157568088764, + "language_loss": 0.82032043, + "learning_rate": 0.0004443494708958217, + "loss": 0.83118206, + "num_input_tokens_seen": 238660256, + "router_z_loss_mlp": 0.35473633, + "step": 2856, + "time_per_iteration": 2.944794178009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087153, + "balance_loss_mlp": 1.04860103, + "epoch": 0.5496344747979992, + "flos": 625704956928.0, + "grad_norm": 0.05077616299787212, + "language_loss": 0.80950212, + "learning_rate": 0.0004440398751035906, + "loss": 0.82037365, + "num_input_tokens_seen": 238745856, + "router_z_loss_mlp": 0.38549805, + "step": 2857, + "time_per_iteration": 2.8557775020599365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083742, + "balance_loss_mlp": 1.04707289, + "epoch": 0.5498268564832628, + "flos": 522859553280.0, + "grad_norm": 0.07234504005195413, + "language_loss": 0.83526963, + "learning_rate": 0.00044373030103700645, + "loss": 0.84610707, + "num_input_tokens_seen": 238813888, + "router_z_loss_mlp": 0.3671875, + "step": 2858, + "time_per_iteration": 2.5718507766723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079618, + "balance_loss_mlp": 1.04337823, + "epoch": 0.5500192381685264, + "flos": 604290544128.0, + "grad_norm": 0.05047837894946753, + "language_loss": 0.79457223, + "learning_rate": 0.000443420748816257, + "loss": 0.80536836, + "num_input_tokens_seen": 238885440, + "router_z_loss_mlp": 0.36279297, + "step": 2859, + "time_per_iteration": 2.791083335876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084468, + "balance_loss_mlp": 1.0475843, + "epoch": 0.55021161985379, + "flos": 520246361088.0, + "grad_norm": 0.05245161408681963, + "language_loss": 0.78267741, + "learning_rate": 0.0004431112185615208, + "loss": 0.79352212, + "num_input_tokens_seen": 238960944, + "router_z_loss_mlp": 0.36914062, + "step": 2860, + "time_per_iteration": 2.755300760269165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084785, + "balance_loss_mlp": 1.04873633, + "epoch": 0.5504040015390534, + "flos": 489426955776.0, + "grad_norm": 0.05433061967205067, + "language_loss": 0.79769695, + "learning_rate": 0.00044280171039296845, + "loss": 0.80854475, + "num_input_tokens_seen": 239030592, + "router_z_loss_mlp": 0.3605957, + "step": 2861, + "time_per_iteration": 2.611142873764038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086738, + "balance_loss_mlp": 1.04925907, + "epoch": 0.550596383224317, + "flos": 575519745024.0, + "grad_norm": 0.06168485457456991, + "language_loss": 0.88482428, + "learning_rate": 0.0004424922244307616, + "loss": 0.89569169, + "num_input_tokens_seen": 239097440, + "router_z_loss_mlp": 0.375, + "step": 2862, + "time_per_iteration": 2.673872470855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085076, + "balance_loss_mlp": 1.04750168, + "epoch": 0.5507887649095806, + "flos": 642149213184.0, + "grad_norm": 0.06448144785997337, + "language_loss": 0.82166171, + "learning_rate": 0.00044218276079505315, + "loss": 0.83251244, + "num_input_tokens_seen": 239179872, + "router_z_loss_mlp": 0.37524414, + "step": 2863, + "time_per_iteration": 2.8468000888824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088216, + "balance_loss_mlp": 1.05126143, + "epoch": 0.5509811465948442, + "flos": 531589450752.0, + "grad_norm": 0.050966073807123834, + "language_loss": 0.7469635, + "learning_rate": 0.0004418733196059876, + "loss": 0.7578457, + "num_input_tokens_seen": 239251264, + "router_z_loss_mlp": 0.36938477, + "step": 2864, + "time_per_iteration": 2.662949323654175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088219, + "balance_loss_mlp": 1.05174112, + "epoch": 0.5511735282801077, + "flos": 654439440384.0, + "grad_norm": 0.054186590964919915, + "language_loss": 0.79709429, + "learning_rate": 0.0004415639009837008, + "loss": 0.80797648, + "num_input_tokens_seen": 239326688, + "router_z_loss_mlp": 0.36474609, + "step": 2865, + "time_per_iteration": 2.8164796829223633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080612, + "balance_loss_mlp": 1.04503989, + "epoch": 0.5513659099653713, + "flos": 529222159872.0, + "grad_norm": 0.05095499883513892, + "language_loss": 0.81590974, + "learning_rate": 0.00044125450504831955, + "loss": 0.82671583, + "num_input_tokens_seen": 239401248, + "router_z_loss_mlp": 0.35620117, + "step": 2866, + "time_per_iteration": 2.7417778968811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088604, + "balance_loss_mlp": 1.05162513, + "epoch": 0.5515582916506349, + "flos": 554594162688.0, + "grad_norm": 0.05682958193324047, + "language_loss": 0.82243145, + "learning_rate": 0.0004409451319199622, + "loss": 0.83331752, + "num_input_tokens_seen": 239471600, + "router_z_loss_mlp": 0.36987305, + "step": 2867, + "time_per_iteration": 2.6530325412750244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082608, + "balance_loss_mlp": 1.04608202, + "epoch": 0.5517506733358984, + "flos": 735067298304.0, + "grad_norm": 0.04759427919913488, + "language_loss": 0.84027618, + "learning_rate": 0.0004406357817187381, + "loss": 0.85110223, + "num_input_tokens_seen": 239548592, + "router_z_loss_mlp": 0.36572266, + "step": 2868, + "time_per_iteration": 2.9475574493408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083501, + "balance_loss_mlp": 1.04590225, + "epoch": 0.551943055021162, + "flos": 1114861432320.0, + "grad_norm": 0.043872910920917114, + "language_loss": 0.80878294, + "learning_rate": 0.0004403264545647474, + "loss": 0.81961799, + "num_input_tokens_seen": 239644432, + "router_z_loss_mlp": 0.37597656, + "step": 2869, + "time_per_iteration": 3.5124435424804688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080703, + "balance_loss_mlp": 1.04422534, + "epoch": 0.5521354367064255, + "flos": 544092083712.0, + "grad_norm": 0.0550168733336382, + "language_loss": 0.84926724, + "learning_rate": 0.00044001715057808154, + "loss": 0.86007428, + "num_input_tokens_seen": 239723392, + "router_z_loss_mlp": 0.36499023, + "step": 2870, + "time_per_iteration": 2.7501060962677 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085896, + "balance_loss_mlp": 1.04855943, + "epoch": 0.5523278183916891, + "flos": 935889845760.0, + "grad_norm": 0.05461062340152541, + "language_loss": 0.81539249, + "learning_rate": 0.0004397078698788232, + "loss": 0.82625151, + "num_input_tokens_seen": 239806896, + "router_z_loss_mlp": 0.3737793, + "step": 2871, + "time_per_iteration": 3.2084577083587646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026675, + "balance_loss_mlp": 1.01427722, + "epoch": 0.5525202000769527, + "flos": 1465117645824.0, + "grad_norm": 0.012296141252344654, + "language_loss": 0.80442369, + "learning_rate": 0.0004393986125870456, + "loss": 0.81469035, + "num_input_tokens_seen": 240037824, + "router_z_loss_mlp": 0.12353516, + "step": 2872, + "time_per_iteration": 4.909080266952515 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087659, + "balance_loss_mlp": 1.05115747, + "epoch": 0.5527125817622163, + "flos": 489554993664.0, + "grad_norm": 0.06201182150044637, + "language_loss": 0.78260124, + "learning_rate": 0.00043908937882281343, + "loss": 0.79347777, + "num_input_tokens_seen": 240107952, + "router_z_loss_mlp": 0.36523438, + "step": 2873, + "time_per_iteration": 2.5999958515167236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078657, + "balance_loss_mlp": 1.0410111, + "epoch": 0.5529049634474797, + "flos": 634606562304.0, + "grad_norm": 0.05626101072807578, + "language_loss": 0.82624078, + "learning_rate": 0.0004387801687061814, + "loss": 0.83702731, + "num_input_tokens_seen": 240183824, + "router_z_loss_mlp": 0.37573242, + "step": 2874, + "time_per_iteration": 2.816607713699341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082085, + "balance_loss_mlp": 1.04310322, + "epoch": 0.5530973451327433, + "flos": 580986086400.0, + "grad_norm": 0.04886656520386433, + "language_loss": 0.80143493, + "learning_rate": 0.0004384709823571958, + "loss": 0.8122558, + "num_input_tokens_seen": 240259296, + "router_z_loss_mlp": 0.38964844, + "step": 2875, + "time_per_iteration": 2.7270736694335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079953, + "balance_loss_mlp": 1.04113841, + "epoch": 0.5532897268180069, + "flos": 1122030144000.0, + "grad_norm": 0.06103557908182598, + "language_loss": 0.83129716, + "learning_rate": 0.0004381618198958932, + "loss": 0.84209669, + "num_input_tokens_seen": 240346768, + "router_z_loss_mlp": 0.38793945, + "step": 2876, + "time_per_iteration": 3.4826347827911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085381, + "balance_loss_mlp": 1.04721045, + "epoch": 0.5534821085032705, + "flos": 636965088768.0, + "grad_norm": 0.05070554688334561, + "language_loss": 0.83524168, + "learning_rate": 0.00043785268144230137, + "loss": 0.84609544, + "num_input_tokens_seen": 240429344, + "router_z_loss_mlp": 0.38183594, + "step": 2877, + "time_per_iteration": 2.8850836753845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080685, + "balance_loss_mlp": 1.04332519, + "epoch": 0.5536744901885341, + "flos": 570837597696.0, + "grad_norm": 0.056027333180870484, + "language_loss": 0.82300985, + "learning_rate": 0.00043754356711643837, + "loss": 0.83381677, + "num_input_tokens_seen": 240497008, + "router_z_loss_mlp": 0.37353516, + "step": 2878, + "time_per_iteration": 2.6629955768585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079329, + "balance_loss_mlp": 1.04180145, + "epoch": 0.5538668718737976, + "flos": 595418052096.0, + "grad_norm": 0.051053801448504514, + "language_loss": 0.84143484, + "learning_rate": 0.0004372344770383132, + "loss": 0.85222816, + "num_input_tokens_seen": 240578432, + "router_z_loss_mlp": 0.37475586, + "step": 2879, + "time_per_iteration": 2.809924364089966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080427, + "balance_loss_mlp": 1.04220867, + "epoch": 0.5540592535590612, + "flos": 532324182528.0, + "grad_norm": 0.054354704442993965, + "language_loss": 0.83048761, + "learning_rate": 0.00043692541132792507, + "loss": 0.8412919, + "num_input_tokens_seen": 240649136, + "router_z_loss_mlp": 0.38183594, + "step": 2880, + "time_per_iteration": 2.6826112270355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076517, + "balance_loss_mlp": 1.03915703, + "epoch": 0.5542516352443247, + "flos": 412398715392.0, + "grad_norm": 0.060842521075957015, + "language_loss": 0.83359361, + "learning_rate": 0.00043661637010526384, + "loss": 0.84435874, + "num_input_tokens_seen": 240714240, + "router_z_loss_mlp": 0.37329102, + "step": 2881, + "time_per_iteration": 2.5412843227386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077156, + "balance_loss_mlp": 1.03946209, + "epoch": 0.5544440169295883, + "flos": 547341083136.0, + "grad_norm": 0.06506612292228302, + "language_loss": 0.82828653, + "learning_rate": 0.00043630735349031025, + "loss": 0.83905804, + "num_input_tokens_seen": 240786928, + "router_z_loss_mlp": 0.37646484, + "step": 2882, + "time_per_iteration": 2.6428792476654053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079595, + "balance_loss_mlp": 1.04132843, + "epoch": 0.5546363986148518, + "flos": 621518994432.0, + "grad_norm": 0.04746548389090053, + "language_loss": 0.8146224, + "learning_rate": 0.00043599836160303495, + "loss": 0.82541835, + "num_input_tokens_seen": 240865328, + "router_z_loss_mlp": 0.38232422, + "step": 2883, + "time_per_iteration": 2.836928367614746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076033, + "balance_loss_mlp": 1.03833902, + "epoch": 0.5548287803001154, + "flos": 704972450304.0, + "grad_norm": 0.05191443424956408, + "language_loss": 0.77216405, + "learning_rate": 0.0004356893945633995, + "loss": 0.78292441, + "num_input_tokens_seen": 240945680, + "router_z_loss_mlp": 0.37719727, + "step": 2884, + "time_per_iteration": 2.959998846054077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077354, + "balance_loss_mlp": 1.03877735, + "epoch": 0.555021161985379, + "flos": 503952053760.0, + "grad_norm": 0.04795057861891694, + "language_loss": 0.8143183, + "learning_rate": 0.0004353804524913551, + "loss": 0.82509184, + "num_input_tokens_seen": 241010800, + "router_z_loss_mlp": 0.38549805, + "step": 2885, + "time_per_iteration": 2.587458848953247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076676, + "balance_loss_mlp": 1.03960204, + "epoch": 0.5552135436706426, + "flos": 615782020608.0, + "grad_norm": 0.060100634137020215, + "language_loss": 0.81801999, + "learning_rate": 0.0004350715355068441, + "loss": 0.82878673, + "num_input_tokens_seen": 241085328, + "router_z_loss_mlp": 0.37109375, + "step": 2886, + "time_per_iteration": 2.739311933517456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080638, + "balance_loss_mlp": 1.04227662, + "epoch": 0.5554059253559062, + "flos": 463635933696.0, + "grad_norm": 0.06732751663430354, + "language_loss": 0.79759407, + "learning_rate": 0.00043476264372979847, + "loss": 0.80840045, + "num_input_tokens_seen": 241149600, + "router_z_loss_mlp": 0.38305664, + "step": 2887, + "time_per_iteration": 2.5322625637054443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081996, + "balance_loss_mlp": 1.04425478, + "epoch": 0.5555983070411696, + "flos": 1561923813888.0, + "grad_norm": 0.05205208802168105, + "language_loss": 0.78767329, + "learning_rate": 0.0004344537772801408, + "loss": 0.79849327, + "num_input_tokens_seen": 241244832, + "router_z_loss_mlp": 0.37744141, + "step": 2888, + "time_per_iteration": 3.8099794387817383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01022363, + "balance_loss_mlp": 1.00986981, + "epoch": 0.5557906887264332, + "flos": 1467093468672.0, + "grad_norm": 0.012872465654446894, + "language_loss": 0.73422456, + "learning_rate": 0.0004341449362777836, + "loss": 0.74444818, + "num_input_tokens_seen": 241479728, + "router_z_loss_mlp": 0.12451172, + "step": 2889, + "time_per_iteration": 4.8980872631073 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080645, + "balance_loss_mlp": 1.04373789, + "epoch": 0.5559830704116968, + "flos": 529575750144.0, + "grad_norm": 0.056518477254008576, + "language_loss": 0.83232135, + "learning_rate": 0.0004338361208426298, + "loss": 0.84312785, + "num_input_tokens_seen": 241545616, + "router_z_loss_mlp": 0.36889648, + "step": 2890, + "time_per_iteration": 2.596644163131714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108101, + "balance_loss_mlp": 1.04312527, + "epoch": 0.5561754520969604, + "flos": 650895077376.0, + "grad_norm": 0.04719414959796351, + "language_loss": 0.81189138, + "learning_rate": 0.00043352733109457164, + "loss": 0.82270145, + "num_input_tokens_seen": 241629040, + "router_z_loss_mlp": 0.37841797, + "step": 2891, + "time_per_iteration": 2.8776957988739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079084, + "balance_loss_mlp": 1.04158103, + "epoch": 0.556367833782224, + "flos": 733968801792.0, + "grad_norm": 0.04510399892940866, + "language_loss": 0.84577823, + "learning_rate": 0.00043321856715349244, + "loss": 0.85656911, + "num_input_tokens_seen": 241706272, + "router_z_loss_mlp": 0.37451172, + "step": 2892, + "time_per_iteration": 2.9247210025787354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107932, + "balance_loss_mlp": 1.04243708, + "epoch": 0.5565602154674875, + "flos": 672120405504.0, + "grad_norm": 0.04457708587394983, + "language_loss": 0.80344868, + "learning_rate": 0.00043290982913926466, + "loss": 0.81424183, + "num_input_tokens_seen": 241782304, + "router_z_loss_mlp": 0.36889648, + "step": 2893, + "time_per_iteration": 2.791151285171509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087078, + "balance_loss_mlp": 1.04919362, + "epoch": 0.556752597152751, + "flos": 585911162880.0, + "grad_norm": 0.05091942660655845, + "language_loss": 0.84425044, + "learning_rate": 0.0004326011171717514, + "loss": 0.8551212, + "num_input_tokens_seen": 241868576, + "router_z_loss_mlp": 0.37866211, + "step": 2894, + "time_per_iteration": 2.8832085132598877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085909, + "balance_loss_mlp": 1.04788101, + "epoch": 0.5569449788380146, + "flos": 437549548032.0, + "grad_norm": 0.04808991967010034, + "language_loss": 0.81074953, + "learning_rate": 0.0004322924313708051, + "loss": 0.82160866, + "num_input_tokens_seen": 241933696, + "router_z_loss_mlp": 0.38012695, + "step": 2895, + "time_per_iteration": 2.5033986568450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079423, + "balance_loss_mlp": 1.04315972, + "epoch": 0.5571373605232782, + "flos": 502002372096.0, + "grad_norm": 0.057289668121921454, + "language_loss": 0.84257507, + "learning_rate": 0.0004319837718562681, + "loss": 0.85336924, + "num_input_tokens_seen": 242003056, + "router_z_loss_mlp": 0.36254883, + "step": 2896, + "time_per_iteration": 2.55461049079895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086447, + "balance_loss_mlp": 1.04856229, + "epoch": 0.5573297422085417, + "flos": 577126010880.0, + "grad_norm": 0.05427319641394577, + "language_loss": 0.83001935, + "learning_rate": 0.0004316751387479726, + "loss": 0.84088391, + "num_input_tokens_seen": 242076368, + "router_z_loss_mlp": 0.37841797, + "step": 2897, + "time_per_iteration": 2.726621150970459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010828, + "balance_loss_mlp": 1.04622626, + "epoch": 0.5575221238938053, + "flos": 1343536754688.0, + "grad_norm": 0.07147882998338702, + "language_loss": 0.82389295, + "learning_rate": 0.0004313665321657409, + "loss": 0.83472097, + "num_input_tokens_seen": 242161600, + "router_z_loss_mlp": 0.36572266, + "step": 2898, + "time_per_iteration": 3.705557107925415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084994, + "balance_loss_mlp": 1.04756212, + "epoch": 0.5577145055790689, + "flos": 601680324096.0, + "grad_norm": 0.06263472170874507, + "language_loss": 0.80018216, + "learning_rate": 0.00043105795222938436, + "loss": 0.81103212, + "num_input_tokens_seen": 242237904, + "router_z_loss_mlp": 0.37451172, + "step": 2899, + "time_per_iteration": 2.7069091796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082839, + "balance_loss_mlp": 1.04500163, + "epoch": 0.5579068872643325, + "flos": 562353601536.0, + "grad_norm": 0.0921941925102754, + "language_loss": 0.78331131, + "learning_rate": 0.00043074939905870467, + "loss": 0.79413968, + "num_input_tokens_seen": 242306736, + "router_z_loss_mlp": 0.37817383, + "step": 2900, + "time_per_iteration": 2.6597537994384766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108264, + "balance_loss_mlp": 1.04468393, + "epoch": 0.558099268949596, + "flos": 544292904960.0, + "grad_norm": 0.05487003421557055, + "language_loss": 0.80032802, + "learning_rate": 0.0004304408727734927, + "loss": 0.81115448, + "num_input_tokens_seen": 242376000, + "router_z_loss_mlp": 0.37939453, + "step": 2901, + "time_per_iteration": 2.61590838432312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077829, + "balance_loss_mlp": 1.04120803, + "epoch": 0.5582916506348595, + "flos": 552520825344.0, + "grad_norm": 0.05406538300276566, + "language_loss": 0.88821226, + "learning_rate": 0.0004301323734935288, + "loss": 0.89899063, + "num_input_tokens_seen": 242447056, + "router_z_loss_mlp": 0.36645508, + "step": 2902, + "time_per_iteration": 2.6357102394104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082783, + "balance_loss_mlp": 1.04573286, + "epoch": 0.5584840323201231, + "flos": 543126007296.0, + "grad_norm": 0.054631389421551546, + "language_loss": 0.87217975, + "learning_rate": 0.000429823901338583, + "loss": 0.88300759, + "num_input_tokens_seen": 242514400, + "router_z_loss_mlp": 0.37011719, + "step": 2903, + "time_per_iteration": 2.6050922870635986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073691, + "balance_loss_mlp": 1.03678417, + "epoch": 0.5586764140053867, + "flos": 815212118016.0, + "grad_norm": 0.05529085617610277, + "language_loss": 0.86446041, + "learning_rate": 0.00042951545642841513, + "loss": 0.87519729, + "num_input_tokens_seen": 242601616, + "router_z_loss_mlp": 0.36914062, + "step": 2904, + "time_per_iteration": 3.0609569549560547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076695, + "balance_loss_mlp": 1.03981209, + "epoch": 0.5588687956906503, + "flos": 486196895232.0, + "grad_norm": 0.04557850009306157, + "language_loss": 0.86361349, + "learning_rate": 0.0004292070388827737, + "loss": 0.87438047, + "num_input_tokens_seen": 242669648, + "router_z_loss_mlp": 0.3684082, + "step": 2905, + "time_per_iteration": 2.5549428462982178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107784, + "balance_loss_mlp": 1.04017019, + "epoch": 0.5590611773759138, + "flos": 451809805824.0, + "grad_norm": 0.04842795237529701, + "language_loss": 0.8078168, + "learning_rate": 0.00042889864882139753, + "loss": 0.81859523, + "num_input_tokens_seen": 242737456, + "router_z_loss_mlp": 0.37646484, + "step": 2906, + "time_per_iteration": 2.6019363403320312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072955, + "balance_loss_mlp": 1.03662026, + "epoch": 0.5592535590611774, + "flos": 520693083648.0, + "grad_norm": 0.04884179046821603, + "language_loss": 0.81762469, + "learning_rate": 0.0004285902863640139, + "loss": 0.8283543, + "num_input_tokens_seen": 242807008, + "router_z_loss_mlp": 0.36352539, + "step": 2907, + "time_per_iteration": 2.5899524688720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072848, + "balance_loss_mlp": 1.03622651, + "epoch": 0.5594459407464409, + "flos": 552250192896.0, + "grad_norm": 0.048074009249812255, + "language_loss": 0.8615104, + "learning_rate": 0.00042828195163033966, + "loss": 0.87223887, + "num_input_tokens_seen": 242877328, + "router_z_loss_mlp": 0.36645508, + "step": 2908, + "time_per_iteration": 2.676518440246582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072859, + "balance_loss_mlp": 1.03585625, + "epoch": 0.5596383224317045, + "flos": 484596421632.0, + "grad_norm": 0.0512741694464887, + "language_loss": 0.79307508, + "learning_rate": 0.0004279736447400812, + "loss": 0.80380368, + "num_input_tokens_seen": 242943152, + "router_z_loss_mlp": 0.36987305, + "step": 2909, + "time_per_iteration": 2.590859889984131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074509, + "balance_loss_mlp": 1.03676748, + "epoch": 0.5598307041169681, + "flos": 610976217600.0, + "grad_norm": 0.05469922136848912, + "language_loss": 0.78325337, + "learning_rate": 0.00042766536581293385, + "loss": 0.79399848, + "num_input_tokens_seen": 243014656, + "router_z_loss_mlp": 0.37695312, + "step": 2910, + "time_per_iteration": 2.7034008502960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074194, + "balance_loss_mlp": 1.03654802, + "epoch": 0.5600230858022316, + "flos": 488585945088.0, + "grad_norm": 0.05207227245540468, + "language_loss": 0.79564762, + "learning_rate": 0.0004273571149685819, + "loss": 0.80638957, + "num_input_tokens_seen": 243089040, + "router_z_loss_mlp": 0.37597656, + "step": 2911, + "time_per_iteration": 2.7075796127319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074918, + "balance_loss_mlp": 1.03650868, + "epoch": 0.5602154674874952, + "flos": 598592858112.0, + "grad_norm": 0.04994756976596268, + "language_loss": 0.84006047, + "learning_rate": 0.00042704889232669937, + "loss": 0.85080969, + "num_input_tokens_seen": 243162480, + "router_z_loss_mlp": 0.38354492, + "step": 2912, + "time_per_iteration": 2.6922175884246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071101, + "balance_loss_mlp": 1.03431344, + "epoch": 0.5604078491727588, + "flos": 585697347072.0, + "grad_norm": 0.05437848146357707, + "language_loss": 0.85302234, + "learning_rate": 0.0004267406980069484, + "loss": 0.86373341, + "num_input_tokens_seen": 243232880, + "router_z_loss_mlp": 0.36791992, + "step": 2913, + "time_per_iteration": 2.70796537399292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067512, + "balance_loss_mlp": 1.03077149, + "epoch": 0.5606002308580224, + "flos": 540926042112.0, + "grad_norm": 0.045341959008097614, + "language_loss": 0.79753983, + "learning_rate": 0.0004264325321289808, + "loss": 0.80821496, + "num_input_tokens_seen": 243309168, + "router_z_loss_mlp": 0.3671875, + "step": 2914, + "time_per_iteration": 2.761362314224243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107061, + "balance_loss_mlp": 1.03241491, + "epoch": 0.5607926125432858, + "flos": 583654533120.0, + "grad_norm": 0.0532534560102953, + "language_loss": 0.85864502, + "learning_rate": 0.00042612439481243736, + "loss": 0.86935115, + "num_input_tokens_seen": 243382064, + "router_z_loss_mlp": 0.38183594, + "step": 2915, + "time_per_iteration": 2.745008945465088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073585, + "balance_loss_mlp": 1.03655863, + "epoch": 0.5609849942285494, + "flos": 627205095936.0, + "grad_norm": 0.06454697115510677, + "language_loss": 0.90024638, + "learning_rate": 0.00042581628617694735, + "loss": 0.91098225, + "num_input_tokens_seen": 243452064, + "router_z_loss_mlp": 0.37036133, + "step": 2916, + "time_per_iteration": 2.7654495239257812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072441, + "balance_loss_mlp": 1.0346992, + "epoch": 0.561177375913813, + "flos": 588095161344.0, + "grad_norm": 0.05235254168005436, + "language_loss": 0.81651318, + "learning_rate": 0.0004255082063421296, + "loss": 0.82723755, + "num_input_tokens_seen": 243525600, + "router_z_loss_mlp": 0.37719727, + "step": 2917, + "time_per_iteration": 2.674204111099243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107424, + "balance_loss_mlp": 1.03726149, + "epoch": 0.5613697575990766, + "flos": 526774883328.0, + "grad_norm": 0.05687183599046208, + "language_loss": 0.8481921, + "learning_rate": 0.00042520015542759065, + "loss": 0.85893452, + "num_input_tokens_seen": 243605536, + "router_z_loss_mlp": 0.36987305, + "step": 2918, + "time_per_iteration": 2.8309459686279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079966, + "balance_loss_mlp": 1.04134226, + "epoch": 0.5615621392843402, + "flos": 642351444480.0, + "grad_norm": 0.05024796403090353, + "language_loss": 0.88020825, + "learning_rate": 0.00042489213355292687, + "loss": 0.89100802, + "num_input_tokens_seen": 243684208, + "router_z_loss_mlp": 0.38598633, + "step": 2919, + "time_per_iteration": 2.8605942726135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083327, + "balance_loss_mlp": 1.04444087, + "epoch": 0.5617545209696037, + "flos": 427524715008.0, + "grad_norm": 0.05130722807003229, + "language_loss": 0.8097831, + "learning_rate": 0.00042458414083772276, + "loss": 0.82061636, + "num_input_tokens_seen": 243749376, + "router_z_loss_mlp": 0.38842773, + "step": 2920, + "time_per_iteration": 2.5186893939971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076823, + "balance_loss_mlp": 1.03920078, + "epoch": 0.5619469026548672, + "flos": 568140037632.0, + "grad_norm": 0.04280127072200588, + "language_loss": 0.84787017, + "learning_rate": 0.000424276177401552, + "loss": 0.85863835, + "num_input_tokens_seen": 243828096, + "router_z_loss_mlp": 0.37597656, + "step": 2921, + "time_per_iteration": 2.773881435394287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079464, + "balance_loss_mlp": 1.04203272, + "epoch": 0.5621392843401308, + "flos": 504947243520.0, + "grad_norm": 0.056711430924252765, + "language_loss": 0.85714108, + "learning_rate": 0.0004239682433639763, + "loss": 0.86793578, + "num_input_tokens_seen": 243896752, + "router_z_loss_mlp": 0.37426758, + "step": 2922, + "time_per_iteration": 2.714646816253662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081153, + "balance_loss_mlp": 1.04477036, + "epoch": 0.5623316660253944, + "flos": 516744258048.0, + "grad_norm": 0.060505090734525195, + "language_loss": 0.85348099, + "learning_rate": 0.0004236603388445467, + "loss": 0.8642925, + "num_input_tokens_seen": 243964592, + "router_z_loss_mlp": 0.36425781, + "step": 2923, + "time_per_iteration": 2.6141107082366943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075818, + "balance_loss_mlp": 1.03905368, + "epoch": 0.5625240477106579, + "flos": 605732456448.0, + "grad_norm": 0.05369747698254185, + "language_loss": 0.81871819, + "learning_rate": 0.00042335246396280166, + "loss": 0.82947636, + "num_input_tokens_seen": 244036656, + "router_z_loss_mlp": 0.3671875, + "step": 2924, + "time_per_iteration": 2.7129671573638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081443, + "balance_loss_mlp": 1.0438447, + "epoch": 0.5627164293959215, + "flos": 450203539968.0, + "grad_norm": 0.06323509209264203, + "language_loss": 0.89955974, + "learning_rate": 0.0004230446188382693, + "loss": 0.9103741, + "num_input_tokens_seen": 244102704, + "router_z_loss_mlp": 0.3762207, + "step": 2925, + "time_per_iteration": 2.5567660331726074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077212, + "balance_loss_mlp": 1.04101968, + "epoch": 0.5629088110811851, + "flos": 741734032896.0, + "grad_norm": 0.055420573846539395, + "language_loss": 0.80082184, + "learning_rate": 0.0004227368035904654, + "loss": 0.81159395, + "num_input_tokens_seen": 244186640, + "router_z_loss_mlp": 0.36181641, + "step": 2926, + "time_per_iteration": 2.947251319885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108299, + "balance_loss_mlp": 1.04610705, + "epoch": 0.5631011927664487, + "flos": 496719323136.0, + "grad_norm": 0.04719463019166682, + "language_loss": 0.82913107, + "learning_rate": 0.00042242901833889474, + "loss": 0.83996093, + "num_input_tokens_seen": 244257680, + "router_z_loss_mlp": 0.36889648, + "step": 2927, + "time_per_iteration": 2.6429412364959717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085596, + "balance_loss_mlp": 1.0498333, + "epoch": 0.5632935744517122, + "flos": 885774445056.0, + "grad_norm": 0.055780235249339845, + "language_loss": 0.85862845, + "learning_rate": 0.0004221212632030501, + "loss": 0.86948442, + "num_input_tokens_seen": 244331248, + "router_z_loss_mlp": 0.35791016, + "step": 2928, + "time_per_iteration": 3.0935142040252686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085595, + "balance_loss_mlp": 1.04897451, + "epoch": 0.5634859561369757, + "flos": 604516096512.0, + "grad_norm": 0.08179321361553939, + "language_loss": 0.80431306, + "learning_rate": 0.0004218135383024124, + "loss": 0.81516898, + "num_input_tokens_seen": 244403920, + "router_z_loss_mlp": 0.3659668, + "step": 2929, + "time_per_iteration": 2.688404083251953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079905, + "balance_loss_mlp": 1.04359436, + "epoch": 0.5636783378222393, + "flos": 453670737408.0, + "grad_norm": 0.05341288147748167, + "language_loss": 0.85107243, + "learning_rate": 0.0004215058437564511, + "loss": 0.86187148, + "num_input_tokens_seen": 244470464, + "router_z_loss_mlp": 0.36352539, + "step": 2930, + "time_per_iteration": 2.5591979026794434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083948, + "balance_loss_mlp": 1.04725528, + "epoch": 0.5638707195075029, + "flos": 518206519296.0, + "grad_norm": 0.06241038231461263, + "language_loss": 0.82415265, + "learning_rate": 0.00042119817968462397, + "loss": 0.83499211, + "num_input_tokens_seen": 244536864, + "router_z_loss_mlp": 0.36694336, + "step": 2931, + "time_per_iteration": 2.5755324363708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075664, + "balance_loss_mlp": 1.03916192, + "epoch": 0.5640631011927665, + "flos": 564632142336.0, + "grad_norm": 0.06755883510394861, + "language_loss": 0.87004125, + "learning_rate": 0.0004208905462063766, + "loss": 0.88079786, + "num_input_tokens_seen": 244603344, + "router_z_loss_mlp": 0.36499023, + "step": 2932, + "time_per_iteration": 2.6330130100250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077211, + "balance_loss_mlp": 1.04097116, + "epoch": 0.56425548287803, + "flos": 516783545856.0, + "grad_norm": 0.04875434703648171, + "language_loss": 0.84473455, + "learning_rate": 0.00042058294344114315, + "loss": 0.85550666, + "num_input_tokens_seen": 244671984, + "router_z_loss_mlp": 0.36254883, + "step": 2933, + "time_per_iteration": 2.60188627243042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081111, + "balance_loss_mlp": 1.04477572, + "epoch": 0.5644478645632935, + "flos": 853907415552.0, + "grad_norm": 0.05278955631679875, + "language_loss": 0.77495515, + "learning_rate": 0.0004202753715083456, + "loss": 0.78576624, + "num_input_tokens_seen": 244754000, + "router_z_loss_mlp": 0.36352539, + "step": 2934, + "time_per_iteration": 3.0625100135803223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084889, + "balance_loss_mlp": 1.04860175, + "epoch": 0.5646402462485571, + "flos": 553175571456.0, + "grad_norm": 0.05717629686508025, + "language_loss": 0.81433523, + "learning_rate": 0.0004199678305273936, + "loss": 0.82518411, + "num_input_tokens_seen": 244820896, + "router_z_loss_mlp": 0.36279297, + "step": 2935, + "time_per_iteration": 2.6390254497528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082496, + "balance_loss_mlp": 1.04587531, + "epoch": 0.5648326279338207, + "flos": 685661898240.0, + "grad_norm": 0.05411523361189988, + "language_loss": 0.81180829, + "learning_rate": 0.0004196603206176854, + "loss": 0.82263327, + "num_input_tokens_seen": 244904464, + "router_z_loss_mlp": 0.36669922, + "step": 2936, + "time_per_iteration": 2.9184954166412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079161, + "balance_loss_mlp": 1.04354107, + "epoch": 0.5650250096190843, + "flos": 802990291968.0, + "grad_norm": 0.04902014595353554, + "language_loss": 0.83833814, + "learning_rate": 0.000419352841898607, + "loss": 0.84912974, + "num_input_tokens_seen": 244983760, + "router_z_loss_mlp": 0.35644531, + "step": 2937, + "time_per_iteration": 2.963693618774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078443, + "balance_loss_mlp": 1.04248953, + "epoch": 0.5652173913043478, + "flos": 581787809280.0, + "grad_norm": 0.05926519799053672, + "language_loss": 0.77107543, + "learning_rate": 0.000419045394489532, + "loss": 0.78185987, + "num_input_tokens_seen": 245053184, + "router_z_loss_mlp": 0.359375, + "step": 2938, + "time_per_iteration": 2.727398633956909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076353, + "balance_loss_mlp": 1.03975606, + "epoch": 0.5654097729896114, + "flos": 820269614592.0, + "grad_norm": 0.053889258634032246, + "language_loss": 0.76768535, + "learning_rate": 0.0004187379785098224, + "loss": 0.77844894, + "num_input_tokens_seen": 245137408, + "router_z_loss_mlp": 0.3659668, + "step": 2939, + "time_per_iteration": 3.1188313961029053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079563, + "balance_loss_mlp": 1.04339492, + "epoch": 0.565602154674875, + "flos": 783826716672.0, + "grad_norm": 0.05512056097545077, + "language_loss": 0.83633238, + "learning_rate": 0.00041843059407882744, + "loss": 0.84712803, + "num_input_tokens_seen": 245215504, + "router_z_loss_mlp": 0.36206055, + "step": 2940, + "time_per_iteration": 2.983302116394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076269, + "balance_loss_mlp": 1.04072082, + "epoch": 0.5657945363601385, + "flos": 549418802688.0, + "grad_norm": 0.05159052201649483, + "language_loss": 0.82491434, + "learning_rate": 0.0004181232413158842, + "loss": 0.83567703, + "num_input_tokens_seen": 245286032, + "router_z_loss_mlp": 0.35571289, + "step": 2941, + "time_per_iteration": 2.6737120151519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076838, + "balance_loss_mlp": 1.04028893, + "epoch": 0.5659869180454021, + "flos": 667826754048.0, + "grad_norm": 0.06466569325042074, + "language_loss": 0.82093412, + "learning_rate": 0.0004178159203403179, + "loss": 0.83170253, + "num_input_tokens_seen": 245359040, + "router_z_loss_mlp": 0.36547852, + "step": 2942, + "time_per_iteration": 2.8263752460479736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077242, + "balance_loss_mlp": 1.0423857, + "epoch": 0.5661792997306656, + "flos": 499707864576.0, + "grad_norm": 0.05486974364690197, + "language_loss": 0.81532693, + "learning_rate": 0.0004175086312714409, + "loss": 0.82609934, + "num_input_tokens_seen": 245426384, + "router_z_loss_mlp": 0.34912109, + "step": 2943, + "time_per_iteration": 2.5581164360046387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084461, + "balance_loss_mlp": 1.04848337, + "epoch": 0.5663716814159292, + "flos": 600922271232.0, + "grad_norm": 0.04881995286740945, + "language_loss": 0.83686805, + "learning_rate": 0.00041720137422855366, + "loss": 0.84771264, + "num_input_tokens_seen": 245501216, + "router_z_loss_mlp": 0.35961914, + "step": 2944, + "time_per_iteration": 2.7574734687805176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080125, + "balance_loss_mlp": 1.04390931, + "epoch": 0.5665640631011928, + "flos": 540728193024.0, + "grad_norm": 0.05214507443979086, + "language_loss": 0.79004753, + "learning_rate": 0.00041689414933094383, + "loss": 0.80084872, + "num_input_tokens_seen": 245571600, + "router_z_loss_mlp": 0.36230469, + "step": 2945, + "time_per_iteration": 2.6470541954040527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080942, + "balance_loss_mlp": 1.0463953, + "epoch": 0.5667564447864564, + "flos": 601655592960.0, + "grad_norm": 0.06146311821637782, + "language_loss": 0.80673099, + "learning_rate": 0.00041658695669788653, + "loss": 0.81754035, + "num_input_tokens_seen": 245645632, + "router_z_loss_mlp": 0.34594727, + "step": 2946, + "time_per_iteration": 2.721078872680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083586, + "balance_loss_mlp": 1.04791868, + "epoch": 0.5669488264717198, + "flos": 659224894464.0, + "grad_norm": 0.05891401598443517, + "language_loss": 0.80939281, + "learning_rate": 0.00041627979644864453, + "loss": 0.82022864, + "num_input_tokens_seen": 245715776, + "router_z_loss_mlp": 0.35717773, + "step": 2947, + "time_per_iteration": 2.877037286758423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085181, + "balance_loss_mlp": 1.04941845, + "epoch": 0.5671412081569834, + "flos": 485158035456.0, + "grad_norm": 0.042998309327625356, + "language_loss": 0.809735, + "learning_rate": 0.0004159726687024683, + "loss": 0.8205868, + "num_input_tokens_seen": 245785328, + "router_z_loss_mlp": 0.35791016, + "step": 2948, + "time_per_iteration": 2.617147207260132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084185, + "balance_loss_mlp": 1.04832673, + "epoch": 0.567333589842247, + "flos": 729487475712.0, + "grad_norm": 0.049875608566737006, + "language_loss": 0.79203111, + "learning_rate": 0.00041566557357859506, + "loss": 0.80287302, + "num_input_tokens_seen": 245858000, + "router_z_loss_mlp": 0.35888672, + "step": 2949, + "time_per_iteration": 2.859217882156372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080493, + "balance_loss_mlp": 1.04494464, + "epoch": 0.5675259715275106, + "flos": 968471258112.0, + "grad_norm": 0.06410563873068757, + "language_loss": 0.79063594, + "learning_rate": 0.0004153585111962502, + "loss": 0.80144083, + "num_input_tokens_seen": 245950640, + "router_z_loss_mlp": 0.35571289, + "step": 2950, + "time_per_iteration": 3.3080387115478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084603, + "balance_loss_mlp": 1.04767203, + "epoch": 0.5677183532127742, + "flos": 564879453696.0, + "grad_norm": 0.058242755990822084, + "language_loss": 0.84030402, + "learning_rate": 0.0004150514816746453, + "loss": 0.85115004, + "num_input_tokens_seen": 246019568, + "router_z_loss_mlp": 0.36938477, + "step": 2951, + "time_per_iteration": 2.66630220413208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080246, + "balance_loss_mlp": 1.04517412, + "epoch": 0.5679107348980377, + "flos": 551432503296.0, + "grad_norm": 0.05117838990465897, + "language_loss": 0.85669959, + "learning_rate": 0.0004147444851329802, + "loss": 0.86750209, + "num_input_tokens_seen": 246089520, + "router_z_loss_mlp": 0.35107422, + "step": 2952, + "time_per_iteration": 2.645735502243042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108119, + "balance_loss_mlp": 1.04585648, + "epoch": 0.5681031165833013, + "flos": 819115863552.0, + "grad_norm": 0.04931619960622222, + "language_loss": 0.85395974, + "learning_rate": 0.00041443752169044126, + "loss": 0.8647716, + "num_input_tokens_seen": 246165920, + "router_z_loss_mlp": 0.35351562, + "step": 2953, + "time_per_iteration": 3.025468349456787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087019, + "balance_loss_mlp": 1.05116129, + "epoch": 0.5682954982685648, + "flos": 617731702272.0, + "grad_norm": 0.05138113495872943, + "language_loss": 0.84811544, + "learning_rate": 0.0004141305914662025, + "loss": 0.85898566, + "num_input_tokens_seen": 246238672, + "router_z_loss_mlp": 0.35888672, + "step": 2954, + "time_per_iteration": 2.7767860889434814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108515, + "balance_loss_mlp": 1.04848099, + "epoch": 0.5684878799538284, + "flos": 647625729024.0, + "grad_norm": 0.04880277930525614, + "language_loss": 0.80257368, + "learning_rate": 0.0004138236945794246, + "loss": 0.81342518, + "num_input_tokens_seen": 246320208, + "router_z_loss_mlp": 0.36645508, + "step": 2955, + "time_per_iteration": 2.9492557048797607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079722, + "balance_loss_mlp": 1.04434061, + "epoch": 0.5686802616390919, + "flos": 805615068672.0, + "grad_norm": 0.060523381383535066, + "language_loss": 0.83239132, + "learning_rate": 0.00041351683114925576, + "loss": 0.84318852, + "num_input_tokens_seen": 246406464, + "router_z_loss_mlp": 0.35424805, + "step": 2956, + "time_per_iteration": 3.0558693408966064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080356, + "balance_loss_mlp": 1.0441637, + "epoch": 0.5688726433243555, + "flos": 546882776064.0, + "grad_norm": 0.06102379875806974, + "language_loss": 0.86688364, + "learning_rate": 0.0004132100012948308, + "loss": 0.87768722, + "num_input_tokens_seen": 246477456, + "router_z_loss_mlp": 0.36230469, + "step": 2957, + "time_per_iteration": 2.6131510734558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084018, + "balance_loss_mlp": 1.04689598, + "epoch": 0.5690650250096191, + "flos": 486324933120.0, + "grad_norm": 0.05856765821562534, + "language_loss": 0.84111595, + "learning_rate": 0.00041290320513527145, + "loss": 0.85195613, + "num_input_tokens_seen": 246541744, + "router_z_loss_mlp": 0.37133789, + "step": 2958, + "time_per_iteration": 2.584434986114502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077095, + "balance_loss_mlp": 1.04154706, + "epoch": 0.5692574066948827, + "flos": 577184237568.0, + "grad_norm": 0.04674501738886335, + "language_loss": 0.85154927, + "learning_rate": 0.0004125964427896867, + "loss": 0.86232018, + "num_input_tokens_seen": 246611440, + "router_z_loss_mlp": 0.35571289, + "step": 2959, + "time_per_iteration": 2.6582295894622803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071399, + "balance_loss_mlp": 1.03551733, + "epoch": 0.5694497883801463, + "flos": 454005388800.0, + "grad_norm": 0.055082869163009494, + "language_loss": 0.79042369, + "learning_rate": 0.0004122897143771723, + "loss": 0.80113769, + "num_input_tokens_seen": 246676496, + "router_z_loss_mlp": 0.35888672, + "step": 2960, + "time_per_iteration": 2.555941104888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075778, + "balance_loss_mlp": 1.0394429, + "epoch": 0.5696421700654097, + "flos": 559251578880.0, + "grad_norm": 0.0498118595632428, + "language_loss": 0.81253064, + "learning_rate": 0.0004119830200168109, + "loss": 0.82328844, + "num_input_tokens_seen": 246746464, + "router_z_loss_mlp": 0.36376953, + "step": 2961, + "time_per_iteration": 2.6521012783050537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073119, + "balance_loss_mlp": 1.03780937, + "epoch": 0.5698345517506733, + "flos": 465314982912.0, + "grad_norm": 0.05616905034177488, + "language_loss": 0.8830415, + "learning_rate": 0.0004116763598276714, + "loss": 0.89377272, + "num_input_tokens_seen": 246811808, + "router_z_loss_mlp": 0.35327148, + "step": 2962, + "time_per_iteration": 2.5006790161132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073408, + "balance_loss_mlp": 1.03702545, + "epoch": 0.5700269334359369, + "flos": 605645116416.0, + "grad_norm": 0.05368070912324084, + "language_loss": 0.8055867, + "learning_rate": 0.00041136973392881017, + "loss": 0.81632078, + "num_input_tokens_seen": 246890432, + "router_z_loss_mlp": 0.36376953, + "step": 2963, + "time_per_iteration": 2.8011715412139893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073783, + "balance_loss_mlp": 1.03852105, + "epoch": 0.5702193151212005, + "flos": 562423412736.0, + "grad_norm": 0.05977105557008513, + "language_loss": 0.81818962, + "learning_rate": 0.00041106314243926983, + "loss": 0.82892752, + "num_input_tokens_seen": 246959616, + "router_z_loss_mlp": 0.35302734, + "step": 2964, + "time_per_iteration": 2.7296242713928223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070445, + "balance_loss_mlp": 1.03558779, + "epoch": 0.570411696806464, + "flos": 522983208960.0, + "grad_norm": 0.05693204807949615, + "language_loss": 0.87045705, + "learning_rate": 0.0004107565854780798, + "loss": 0.88116145, + "num_input_tokens_seen": 247030656, + "router_z_loss_mlp": 0.34887695, + "step": 2965, + "time_per_iteration": 2.5964605808258057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075214, + "balance_loss_mlp": 1.04002357, + "epoch": 0.5706040784917276, + "flos": 717911631360.0, + "grad_norm": 0.05031367362382368, + "language_loss": 0.80980343, + "learning_rate": 0.000410450063164256, + "loss": 0.82055557, + "num_input_tokens_seen": 247105872, + "router_z_loss_mlp": 0.35229492, + "step": 2966, + "time_per_iteration": 2.8248300552368164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076636, + "balance_loss_mlp": 1.04127812, + "epoch": 0.5707964601769911, + "flos": 476467425792.0, + "grad_norm": 0.059966750204006415, + "language_loss": 0.8167066, + "learning_rate": 0.00041014357561680115, + "loss": 0.82747293, + "num_input_tokens_seen": 247170448, + "router_z_loss_mlp": 0.35351562, + "step": 2967, + "time_per_iteration": 2.4996910095214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077241, + "balance_loss_mlp": 1.04278946, + "epoch": 0.5709888418622547, + "flos": 579823570944.0, + "grad_norm": 0.05891056148222195, + "language_loss": 0.85875672, + "learning_rate": 0.0004098371229547039, + "loss": 0.86952913, + "num_input_tokens_seen": 247240400, + "router_z_loss_mlp": 0.3449707, + "step": 2968, + "time_per_iteration": 2.6908459663391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131838, + "balance_loss_mlp": 1.11677039, + "epoch": 0.5711812235475183, + "flos": 1579108290048.0, + "grad_norm": 0.050443633584492734, + "language_loss": 0.80010808, + "learning_rate": 0.0004095307052969399, + "loss": 0.81142646, + "num_input_tokens_seen": 247469136, + "router_z_loss_mlp": 0.15039062, + "step": 2969, + "time_per_iteration": 4.709675550460815 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107846, + "balance_loss_mlp": 1.04233932, + "epoch": 0.5713736052327818, + "flos": 468259854336.0, + "grad_norm": 0.04864564090032181, + "language_loss": 0.80513656, + "learning_rate": 0.00040922432276247107, + "loss": 0.81592119, + "num_input_tokens_seen": 247537712, + "router_z_loss_mlp": 0.36132812, + "step": 2970, + "time_per_iteration": 2.554276466369629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078647, + "balance_loss_mlp": 1.04412448, + "epoch": 0.5715659869180454, + "flos": 537390443520.0, + "grad_norm": 0.06858717783230618, + "language_loss": 0.84265316, + "learning_rate": 0.0004089179754702457, + "loss": 0.85343957, + "num_input_tokens_seen": 247613872, + "router_z_loss_mlp": 0.34570312, + "step": 2971, + "time_per_iteration": 2.7972512245178223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072054, + "balance_loss_mlp": 1.0365299, + "epoch": 0.571758368603309, + "flos": 655778045952.0, + "grad_norm": 0.0710461233457747, + "language_loss": 0.79649973, + "learning_rate": 0.00040861166353919843, + "loss": 0.80722028, + "num_input_tokens_seen": 247686064, + "router_z_loss_mlp": 0.35546875, + "step": 2972, + "time_per_iteration": 2.7805516719818115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076554, + "balance_loss_mlp": 1.04076695, + "epoch": 0.5719507502885726, + "flos": 667609966080.0, + "grad_norm": 0.05192257726698222, + "language_loss": 0.81693333, + "learning_rate": 0.00040830538708824983, + "loss": 0.82769883, + "num_input_tokens_seen": 247760384, + "router_z_loss_mlp": 0.35839844, + "step": 2973, + "time_per_iteration": 2.8635294437408447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070547, + "balance_loss_mlp": 1.03507066, + "epoch": 0.572143131973836, + "flos": 476083312128.0, + "grad_norm": 0.060626408017241236, + "language_loss": 0.81790257, + "learning_rate": 0.000407999146236307, + "loss": 0.82860804, + "num_input_tokens_seen": 247824768, + "router_z_loss_mlp": 0.35498047, + "step": 2974, + "time_per_iteration": 2.5645899772644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074889, + "balance_loss_mlp": 1.03943634, + "epoch": 0.5723355136590996, + "flos": 539255757312.0, + "grad_norm": 0.06009071322865027, + "language_loss": 0.83246768, + "learning_rate": 0.0004076929411022634, + "loss": 0.84321654, + "num_input_tokens_seen": 247894448, + "router_z_loss_mlp": 0.35449219, + "step": 2975, + "time_per_iteration": 2.655545234680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075121, + "balance_loss_mlp": 1.0383811, + "epoch": 0.5725278953443632, + "flos": 823784864256.0, + "grad_norm": 0.053970809123607175, + "language_loss": 0.79314309, + "learning_rate": 0.0004073867718049982, + "loss": 0.80389434, + "num_input_tokens_seen": 247976432, + "router_z_loss_mlp": 0.36743164, + "step": 2976, + "time_per_iteration": 3.0664896965026855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078369, + "balance_loss_mlp": 1.0429157, + "epoch": 0.5727202770296268, + "flos": 587155226112.0, + "grad_norm": 0.05912475797179562, + "language_loss": 0.82244706, + "learning_rate": 0.00040708063846337704, + "loss": 0.83323073, + "num_input_tokens_seen": 248048800, + "router_z_loss_mlp": 0.35522461, + "step": 2977, + "time_per_iteration": 2.7131478786468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083864, + "balance_loss_mlp": 1.04800642, + "epoch": 0.5729126587148904, + "flos": 446723195904.0, + "grad_norm": 0.048537452765021645, + "language_loss": 0.80846637, + "learning_rate": 0.00040677454119625143, + "loss": 0.81930506, + "num_input_tokens_seen": 248116496, + "router_z_loss_mlp": 0.35864258, + "step": 2978, + "time_per_iteration": 2.6209888458251953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078154, + "balance_loss_mlp": 1.0418427, + "epoch": 0.5731050404001539, + "flos": 519206091264.0, + "grad_norm": 0.05702144714813726, + "language_loss": 0.82471335, + "learning_rate": 0.0004064684801224587, + "loss": 0.83549494, + "num_input_tokens_seen": 248184960, + "router_z_loss_mlp": 0.36328125, + "step": 2979, + "time_per_iteration": 2.5915722846984863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077909, + "balance_loss_mlp": 1.04197955, + "epoch": 0.5732974220854175, + "flos": 504528224256.0, + "grad_norm": 0.05171310351774622, + "language_loss": 0.80115962, + "learning_rate": 0.00040616245536082224, + "loss": 0.8119387, + "num_input_tokens_seen": 248252208, + "router_z_loss_mlp": 0.35961914, + "step": 2980, + "time_per_iteration": 2.6032769680023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076097, + "balance_loss_mlp": 1.04057276, + "epoch": 0.573489803770681, + "flos": 592187991552.0, + "grad_norm": 0.049753074122949235, + "language_loss": 0.80894011, + "learning_rate": 0.00040585646703015165, + "loss": 0.81970108, + "num_input_tokens_seen": 248333312, + "router_z_loss_mlp": 0.35522461, + "step": 2981, + "time_per_iteration": 2.79546856880188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074296, + "balance_loss_mlp": 1.03891444, + "epoch": 0.5736821854559446, + "flos": 489672857088.0, + "grad_norm": 0.06088968225358262, + "language_loss": 0.78612393, + "learning_rate": 0.0004055505152492419, + "loss": 0.79686689, + "num_input_tokens_seen": 248403808, + "router_z_loss_mlp": 0.35449219, + "step": 2982, + "time_per_iteration": 2.6494040489196777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078742, + "balance_loss_mlp": 1.04283655, + "epoch": 0.5738745671412081, + "flos": 457895987712.0, + "grad_norm": 0.05054468303814383, + "language_loss": 0.74372864, + "learning_rate": 0.00040524460013687425, + "loss": 0.75451601, + "num_input_tokens_seen": 248477184, + "router_z_loss_mlp": 0.359375, + "step": 2983, + "time_per_iteration": 2.7171366214752197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078136, + "balance_loss_mlp": 1.04294515, + "epoch": 0.5740669488264717, + "flos": 580012655616.0, + "grad_norm": 0.044553783792680594, + "language_loss": 0.80828458, + "learning_rate": 0.0004049387218118155, + "loss": 0.81906593, + "num_input_tokens_seen": 248565552, + "router_z_loss_mlp": 0.35229492, + "step": 2984, + "time_per_iteration": 2.995347738265991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073612, + "balance_loss_mlp": 1.03725314, + "epoch": 0.5742593305117353, + "flos": 524155898880.0, + "grad_norm": 0.05730874981758524, + "language_loss": 0.8475495, + "learning_rate": 0.00040463288039281777, + "loss": 0.85828567, + "num_input_tokens_seen": 248635456, + "router_z_loss_mlp": 0.36328125, + "step": 2985, + "time_per_iteration": 2.715092182159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102117, + "balance_loss_mlp": 1.0106324, + "epoch": 0.5744517121969989, + "flos": 1553033488896.0, + "grad_norm": 0.021440825644231668, + "language_loss": 0.77876419, + "learning_rate": 0.0004043270759986194, + "loss": 0.78897589, + "num_input_tokens_seen": 248870160, + "router_z_loss_mlp": 0.10546875, + "step": 2986, + "time_per_iteration": 4.936111211776733 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071312, + "balance_loss_mlp": 1.03588247, + "epoch": 0.5746440938822625, + "flos": 751600304640.0, + "grad_norm": 0.05668637583843988, + "language_loss": 0.81840217, + "learning_rate": 0.0004040213087479444, + "loss": 0.82911527, + "num_input_tokens_seen": 248946960, + "router_z_loss_mlp": 0.35449219, + "step": 2987, + "time_per_iteration": 2.949164628982544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074412, + "balance_loss_mlp": 1.03955531, + "epoch": 0.5748364755675259, + "flos": 501618258432.0, + "grad_norm": 0.05762088821448085, + "language_loss": 0.84999508, + "learning_rate": 0.0004037155787595018, + "loss": 0.86073923, + "num_input_tokens_seen": 249014128, + "router_z_loss_mlp": 0.34887695, + "step": 2988, + "time_per_iteration": 2.6570816040039062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010738, + "balance_loss_mlp": 1.03863311, + "epoch": 0.5750288572527895, + "flos": 503757024768.0, + "grad_norm": 0.17757642281187902, + "language_loss": 0.80609345, + "learning_rate": 0.000403409886151987, + "loss": 0.81683147, + "num_input_tokens_seen": 249090016, + "router_z_loss_mlp": 0.35205078, + "step": 2989, + "time_per_iteration": 2.913994073867798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014651, + "balance_loss_mlp": 1.00430369, + "epoch": 0.5752212389380531, + "flos": 1540541030400.0, + "grad_norm": 0.007550989320398048, + "language_loss": 0.81999105, + "learning_rate": 0.0004031042310440799, + "loss": 0.83013755, + "num_input_tokens_seen": 249305552, + "router_z_loss_mlp": 0.10351562, + "step": 2990, + "time_per_iteration": 4.7991979122161865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020765, + "balance_loss_mlp": 1.01027453, + "epoch": 0.5754136206233167, + "flos": 1566499378176.0, + "grad_norm": 0.009415259483784648, + "language_loss": 0.781986, + "learning_rate": 0.00040279861355444656, + "loss": 0.79219365, + "num_input_tokens_seen": 249523408, + "router_z_loss_mlp": 0.10498047, + "step": 2991, + "time_per_iteration": 4.760354280471802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076847, + "balance_loss_mlp": 1.04282451, + "epoch": 0.5756060023085803, + "flos": 797806167552.0, + "grad_norm": 0.05030181344669937, + "language_loss": 0.76800382, + "learning_rate": 0.00040249303380173807, + "loss": 0.77877235, + "num_input_tokens_seen": 249616624, + "router_z_loss_mlp": 0.34057617, + "step": 2992, + "time_per_iteration": 3.083129644393921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080903, + "balance_loss_mlp": 1.04573631, + "epoch": 0.5757983839938438, + "flos": 587588802048.0, + "grad_norm": 0.05896593059815975, + "language_loss": 0.78794599, + "learning_rate": 0.00040218749190459126, + "loss": 0.79875505, + "num_input_tokens_seen": 249689936, + "router_z_loss_mlp": 0.35229492, + "step": 2993, + "time_per_iteration": 2.763256788253784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083869, + "balance_loss_mlp": 1.04884517, + "epoch": 0.5759907656791073, + "flos": 516576932352.0, + "grad_norm": 0.05409710441005256, + "language_loss": 0.82655573, + "learning_rate": 0.00040188198798162775, + "loss": 0.83739436, + "num_input_tokens_seen": 249759984, + "router_z_loss_mlp": 0.35058594, + "step": 2994, + "time_per_iteration": 2.6000871658325195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078452, + "balance_loss_mlp": 1.04333293, + "epoch": 0.5761831473643709, + "flos": 586845305856.0, + "grad_norm": 0.05831918093224265, + "language_loss": 0.85334295, + "learning_rate": 0.000401576522151455, + "loss": 0.8641274, + "num_input_tokens_seen": 249837888, + "router_z_loss_mlp": 0.3515625, + "step": 2995, + "time_per_iteration": 2.808647871017456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081176, + "balance_loss_mlp": 1.04672456, + "epoch": 0.5763755290496345, + "flos": 543619219968.0, + "grad_norm": 0.04257335582462403, + "language_loss": 0.82291412, + "learning_rate": 0.0004012710945326651, + "loss": 0.83372593, + "num_input_tokens_seen": 249913584, + "router_z_loss_mlp": 0.34472656, + "step": 2996, + "time_per_iteration": 2.7611968517303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082283, + "balance_loss_mlp": 1.04749799, + "epoch": 0.576567910734898, + "flos": 625930509312.0, + "grad_norm": 0.050767561493079726, + "language_loss": 0.80952752, + "learning_rate": 0.0004009657052438355, + "loss": 0.82035035, + "num_input_tokens_seen": 249992144, + "router_z_loss_mlp": 0.34814453, + "step": 2997, + "time_per_iteration": 2.788496971130371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107931, + "balance_loss_mlp": 1.04392815, + "epoch": 0.5767602924201616, + "flos": 537985552896.0, + "grad_norm": 0.053276481047857226, + "language_loss": 0.85359365, + "learning_rate": 0.00040066035440352904, + "loss": 0.86438668, + "num_input_tokens_seen": 250060736, + "router_z_loss_mlp": 0.35400391, + "step": 2998, + "time_per_iteration": 2.6187028884887695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010656, + "balance_loss_mlp": 1.05358338, + "epoch": 0.5769526741054252, + "flos": 1558969873920.0, + "grad_norm": 0.027624435835290975, + "language_loss": 0.79293132, + "learning_rate": 0.0004003550421302934, + "loss": 0.80358732, + "num_input_tokens_seen": 250296864, + "router_z_loss_mlp": 0.12011719, + "step": 2999, + "time_per_iteration": 4.880754470825195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085333, + "balance_loss_mlp": 1.05071473, + "epoch": 0.5771450557906888, + "flos": 467939759616.0, + "grad_norm": 0.056203987299685475, + "language_loss": 0.7605744, + "learning_rate": 0.00040004976854266145, + "loss": 0.77142775, + "num_input_tokens_seen": 250362528, + "router_z_loss_mlp": 0.34667969, + "step": 3000, + "time_per_iteration": 2.537555694580078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079139, + "balance_loss_mlp": 1.043329, + "epoch": 0.5773374374759523, + "flos": 574288828416.0, + "grad_norm": 0.059637526980377456, + "language_loss": 0.81006908, + "learning_rate": 0.0003997445337591505, + "loss": 0.82086051, + "num_input_tokens_seen": 250432768, + "router_z_loss_mlp": 0.35839844, + "step": 3001, + "time_per_iteration": 2.637199878692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072855, + "balance_loss_mlp": 1.03756905, + "epoch": 0.5775298191612158, + "flos": 528216795648.0, + "grad_norm": 0.054057225734739034, + "language_loss": 0.73747128, + "learning_rate": 0.0003994393378982635, + "loss": 0.74819982, + "num_input_tokens_seen": 250501504, + "router_z_loss_mlp": 0.35327148, + "step": 3002, + "time_per_iteration": 2.605628490447998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042054, + "balance_loss_mlp": 1.03013277, + "epoch": 0.5777222008464794, + "flos": 1303178070528.0, + "grad_norm": 0.01828159888171313, + "language_loss": 0.79538, + "learning_rate": 0.00039913418107848786, + "loss": 0.80580056, + "num_input_tokens_seen": 250733632, + "router_z_loss_mlp": 0.11914062, + "step": 3003, + "time_per_iteration": 4.791952848434448 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072518, + "balance_loss_mlp": 1.03708899, + "epoch": 0.577914582531743, + "flos": 603344816640.0, + "grad_norm": 0.05129820562397971, + "language_loss": 0.88025165, + "learning_rate": 0.0003988290634182961, + "loss": 0.89097679, + "num_input_tokens_seen": 250809152, + "router_z_loss_mlp": 0.35449219, + "step": 3004, + "time_per_iteration": 2.7482082843780518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107598, + "balance_loss_mlp": 1.04162431, + "epoch": 0.5781069642170066, + "flos": 486537338880.0, + "grad_norm": 0.060845290060135546, + "language_loss": 0.80967325, + "learning_rate": 0.0003985239850361453, + "loss": 0.82043308, + "num_input_tokens_seen": 250879152, + "router_z_loss_mlp": 0.34399414, + "step": 3005, + "time_per_iteration": 2.577929735183716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074673, + "balance_loss_mlp": 1.03933978, + "epoch": 0.5782993459022701, + "flos": 506016626688.0, + "grad_norm": 0.06787324566679709, + "language_loss": 0.84799004, + "learning_rate": 0.0003982189460504777, + "loss": 0.85873681, + "num_input_tokens_seen": 250949904, + "router_z_loss_mlp": 0.35375977, + "step": 3006, + "time_per_iteration": 2.6993815898895264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077227, + "balance_loss_mlp": 1.04179859, + "epoch": 0.5784917275875336, + "flos": 601872380928.0, + "grad_norm": 0.06968716045875477, + "language_loss": 0.79860866, + "learning_rate": 0.00039791394657971935, + "loss": 0.80938095, + "num_input_tokens_seen": 251020976, + "router_z_loss_mlp": 0.35449219, + "step": 3007, + "time_per_iteration": 2.6929664611816406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070454, + "balance_loss_mlp": 1.03616893, + "epoch": 0.5786841092727972, + "flos": 521279428608.0, + "grad_norm": 0.07090711844515878, + "language_loss": 0.84396511, + "learning_rate": 0.00039760898674228205, + "loss": 0.85466969, + "num_input_tokens_seen": 251093280, + "router_z_loss_mlp": 0.34301758, + "step": 3008, + "time_per_iteration": 2.674983501434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074007, + "balance_loss_mlp": 1.03941262, + "epoch": 0.5788764909580608, + "flos": 767047809024.0, + "grad_norm": 0.04405411396785794, + "language_loss": 0.80589879, + "learning_rate": 0.0003973040666565613, + "loss": 0.81663889, + "num_input_tokens_seen": 251181376, + "router_z_loss_mlp": 0.34619141, + "step": 3009, + "time_per_iteration": 3.0445330142974854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068256, + "balance_loss_mlp": 1.03347063, + "epoch": 0.5790688726433244, + "flos": 598786324992.0, + "grad_norm": 0.0464228238066257, + "language_loss": 0.81778955, + "learning_rate": 0.000396999186440938, + "loss": 0.82847214, + "num_input_tokens_seen": 251256176, + "router_z_loss_mlp": 0.34814453, + "step": 3010, + "time_per_iteration": 2.837510585784912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107199, + "balance_loss_mlp": 1.03594089, + "epoch": 0.5792612543285879, + "flos": 522805708800.0, + "grad_norm": 0.06076952990047212, + "language_loss": 0.8482464, + "learning_rate": 0.000396694346213777, + "loss": 0.85896629, + "num_input_tokens_seen": 251325344, + "router_z_loss_mlp": 0.36083984, + "step": 3011, + "time_per_iteration": 2.630096197128296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071847, + "balance_loss_mlp": 1.03498721, + "epoch": 0.5794536360138515, + "flos": 876178805760.0, + "grad_norm": 0.045866643068031475, + "language_loss": 0.83350897, + "learning_rate": 0.0003963895460934276, + "loss": 0.84422737, + "num_input_tokens_seen": 251406656, + "router_z_loss_mlp": 0.3684082, + "step": 3012, + "time_per_iteration": 3.144862174987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107149, + "balance_loss_mlp": 1.03555989, + "epoch": 0.5796460176991151, + "flos": 401221541376.0, + "grad_norm": 0.0681769397078292, + "language_loss": 0.84421676, + "learning_rate": 0.00039608478619822376, + "loss": 0.85493165, + "num_input_tokens_seen": 251467760, + "router_z_loss_mlp": 0.35961914, + "step": 3013, + "time_per_iteration": 2.459653854370117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071668, + "balance_loss_mlp": 1.03545213, + "epoch": 0.5798383993843786, + "flos": 618229297152.0, + "grad_norm": 0.04312849012034037, + "language_loss": 0.82395273, + "learning_rate": 0.00039578006664648394, + "loss": 0.83466941, + "num_input_tokens_seen": 251542272, + "router_z_loss_mlp": 0.36206055, + "step": 3014, + "time_per_iteration": 2.759540557861328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068998, + "balance_loss_mlp": 1.0336163, + "epoch": 0.5800307810696421, + "flos": 843966950400.0, + "grad_norm": 0.05059644865737796, + "language_loss": 0.80954117, + "learning_rate": 0.0003954753875565105, + "loss": 0.82023108, + "num_input_tokens_seen": 251625584, + "router_z_loss_mlp": 0.35424805, + "step": 3015, + "time_per_iteration": 3.102818727493286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065155, + "balance_loss_mlp": 1.02970195, + "epoch": 0.5802231627549057, + "flos": 569005779456.0, + "grad_norm": 0.049284538826036076, + "language_loss": 0.82072717, + "learning_rate": 0.00039517074904659057, + "loss": 0.83137876, + "num_input_tokens_seen": 251696704, + "router_z_loss_mlp": 0.35498047, + "step": 3016, + "time_per_iteration": 2.6733109951019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074461, + "balance_loss_mlp": 1.03884125, + "epoch": 0.5804155444401693, + "flos": 660160447488.0, + "grad_norm": 0.0506827974734746, + "language_loss": 0.84573597, + "learning_rate": 0.00039486615123499535, + "loss": 0.8564806, + "num_input_tokens_seen": 251774784, + "router_z_loss_mlp": 0.35668945, + "step": 3017, + "time_per_iteration": 2.8088088035583496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071818, + "balance_loss_mlp": 1.0354352, + "epoch": 0.5806079261254329, + "flos": 513726603264.0, + "grad_norm": 0.053399367847764105, + "language_loss": 0.84808505, + "learning_rate": 0.00039456159423997996, + "loss": 0.85880327, + "num_input_tokens_seen": 251844768, + "router_z_loss_mlp": 0.36401367, + "step": 3018, + "time_per_iteration": 2.6254379749298096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074265, + "balance_loss_mlp": 1.03747678, + "epoch": 0.5808003078106965, + "flos": 528379739136.0, + "grad_norm": 0.059071353461068586, + "language_loss": 0.89337808, + "learning_rate": 0.00039425707817978406, + "loss": 0.90412068, + "num_input_tokens_seen": 251912736, + "router_z_loss_mlp": 0.36767578, + "step": 3019, + "time_per_iteration": 2.65867280960083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071634, + "balance_loss_mlp": 1.0357995, + "epoch": 0.58099268949596, + "flos": 476787520512.0, + "grad_norm": 0.06353889490099716, + "language_loss": 0.83356857, + "learning_rate": 0.00039395260317263124, + "loss": 0.84428501, + "num_input_tokens_seen": 251979328, + "router_z_loss_mlp": 0.35839844, + "step": 3020, + "time_per_iteration": 2.554124116897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074167, + "balance_loss_mlp": 1.03666329, + "epoch": 0.5811850711812235, + "flos": 517340777472.0, + "grad_norm": 0.05166922362438639, + "language_loss": 0.84975517, + "learning_rate": 0.0003936481693367291, + "loss": 0.86049688, + "num_input_tokens_seen": 252050928, + "router_z_loss_mlp": 0.37475586, + "step": 3021, + "time_per_iteration": 2.6460227966308594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075928, + "balance_loss_mlp": 1.03976023, + "epoch": 0.5813774528664871, + "flos": 616122464256.0, + "grad_norm": 0.06649500378390247, + "language_loss": 0.876212, + "learning_rate": 0.0003933437767902697, + "loss": 0.88697129, + "num_input_tokens_seen": 252126496, + "router_z_loss_mlp": 0.36206055, + "step": 3022, + "time_per_iteration": 2.8114941120147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071818, + "balance_loss_mlp": 1.03588879, + "epoch": 0.5815698345517507, + "flos": 567194310144.0, + "grad_norm": 0.06503214921944889, + "language_loss": 0.78287327, + "learning_rate": 0.00039303942565142825, + "loss": 0.7935915, + "num_input_tokens_seen": 252203008, + "router_z_loss_mlp": 0.35961914, + "step": 3023, + "time_per_iteration": 2.7259762287139893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071639, + "balance_loss_mlp": 1.03563786, + "epoch": 0.5817622162370142, + "flos": 562886102016.0, + "grad_norm": 0.05350887168996553, + "language_loss": 0.76429439, + "learning_rate": 0.0003927351160383644, + "loss": 0.77501082, + "num_input_tokens_seen": 252283440, + "router_z_loss_mlp": 0.36035156, + "step": 3024, + "time_per_iteration": 2.8155934810638428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071996, + "balance_loss_mlp": 1.03730595, + "epoch": 0.5819545979222778, + "flos": 458982899712.0, + "grad_norm": 0.05396860990467202, + "language_loss": 0.77624023, + "learning_rate": 0.000392430848069222, + "loss": 0.78696012, + "num_input_tokens_seen": 252351760, + "router_z_loss_mlp": 0.34741211, + "step": 3025, + "time_per_iteration": 2.5123956203460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069758, + "balance_loss_mlp": 1.03387606, + "epoch": 0.5821469796075414, + "flos": 541215613440.0, + "grad_norm": 0.05894861582094883, + "language_loss": 0.82395303, + "learning_rate": 0.00039212662186212795, + "loss": 0.83465064, + "num_input_tokens_seen": 252418480, + "router_z_loss_mlp": 0.35913086, + "step": 3026, + "time_per_iteration": 2.6423861980438232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075924, + "balance_loss_mlp": 1.03930306, + "epoch": 0.582339361292805, + "flos": 551994117120.0, + "grad_norm": 0.060293393109458415, + "language_loss": 0.77264106, + "learning_rate": 0.0003918224375351934, + "loss": 0.7834003, + "num_input_tokens_seen": 252493712, + "router_z_loss_mlp": 0.36621094, + "step": 3027, + "time_per_iteration": 2.691378593444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075199, + "balance_loss_mlp": 1.04029393, + "epoch": 0.5825317429780685, + "flos": 496138770432.0, + "grad_norm": 0.05191318265313257, + "language_loss": 0.78248543, + "learning_rate": 0.0003915182952065135, + "loss": 0.79323745, + "num_input_tokens_seen": 252566096, + "router_z_loss_mlp": 0.34936523, + "step": 3028, + "time_per_iteration": 2.718275308609009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073019, + "balance_loss_mlp": 1.03732777, + "epoch": 0.582724124663332, + "flos": 563890056192.0, + "grad_norm": 0.0482119369127772, + "language_loss": 0.87499475, + "learning_rate": 0.0003912141949941664, + "loss": 0.8857249, + "num_input_tokens_seen": 252639424, + "router_z_loss_mlp": 0.35766602, + "step": 3029, + "time_per_iteration": 2.6762070655822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075023, + "balance_loss_mlp": 1.03852117, + "epoch": 0.5829165063485956, + "flos": 491888788992.0, + "grad_norm": 0.06336756881053687, + "language_loss": 0.82355005, + "learning_rate": 0.0003909101370162143, + "loss": 0.83430028, + "num_input_tokens_seen": 252706672, + "router_z_loss_mlp": 0.36499023, + "step": 3030, + "time_per_iteration": 2.6055908203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035193, + "balance_loss_mlp": 1.02432156, + "epoch": 0.5831088880338592, + "flos": 1528134501888.0, + "grad_norm": 0.025423566517204055, + "language_loss": 0.72433889, + "learning_rate": 0.00039060612139070326, + "loss": 0.7346909, + "num_input_tokens_seen": 252932464, + "router_z_loss_mlp": 0.10888672, + "step": 3031, + "time_per_iteration": 4.88014817237854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071232, + "balance_loss_mlp": 1.03558815, + "epoch": 0.5833012697191228, + "flos": 617712763392.0, + "grad_norm": 0.04799878735573131, + "language_loss": 0.82774729, + "learning_rate": 0.0003903021482356622, + "loss": 0.83845961, + "num_input_tokens_seen": 253011920, + "router_z_loss_mlp": 0.35693359, + "step": 3032, + "time_per_iteration": 2.7778074741363525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071187, + "balance_loss_mlp": 1.03542447, + "epoch": 0.5834936514043862, + "flos": 767578899456.0, + "grad_norm": 0.04830091888101656, + "language_loss": 0.82788891, + "learning_rate": 0.00038999821766910465, + "loss": 0.83860075, + "num_input_tokens_seen": 253091552, + "router_z_loss_mlp": 0.35791016, + "step": 3033, + "time_per_iteration": 2.9640953540802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070566, + "balance_loss_mlp": 1.03496981, + "epoch": 0.5836860330896498, + "flos": 458136096768.0, + "grad_norm": 0.045708981442043065, + "language_loss": 0.85570675, + "learning_rate": 0.00038969432980902606, + "loss": 0.8664124, + "num_input_tokens_seen": 253158608, + "router_z_loss_mlp": 0.35620117, + "step": 3034, + "time_per_iteration": 2.520869255065918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01029447, + "balance_loss_mlp": 1.01819336, + "epoch": 0.5838784147749134, + "flos": 1360485504000.0, + "grad_norm": 0.023110513117977256, + "language_loss": 0.79784501, + "learning_rate": 0.0003893904847734068, + "loss": 0.80813944, + "num_input_tokens_seen": 253381184, + "router_z_loss_mlp": 0.11230469, + "step": 3035, + "time_per_iteration": 4.791047811508179 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076539, + "balance_loss_mlp": 1.04125297, + "epoch": 0.584070796460177, + "flos": 566942616576.0, + "grad_norm": 0.048603623386797364, + "language_loss": 0.82340151, + "learning_rate": 0.00038908668268020953, + "loss": 0.83416688, + "num_input_tokens_seen": 253452880, + "router_z_loss_mlp": 0.35302734, + "step": 3036, + "time_per_iteration": 2.6480767726898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073606, + "balance_loss_mlp": 1.03781927, + "epoch": 0.5842631781454406, + "flos": 611188623360.0, + "grad_norm": 0.04937423588772942, + "language_loss": 0.84850454, + "learning_rate": 0.00038878292364738097, + "loss": 0.85924065, + "num_input_tokens_seen": 253530000, + "router_z_loss_mlp": 0.3581543, + "step": 3037, + "time_per_iteration": 2.7739527225494385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070708, + "balance_loss_mlp": 1.03418183, + "epoch": 0.5844555598307041, + "flos": 463148513280.0, + "grad_norm": 0.05602443207387838, + "language_loss": 0.86980963, + "learning_rate": 0.0003884792077928508, + "loss": 0.88051671, + "num_input_tokens_seen": 253593504, + "router_z_loss_mlp": 0.36523438, + "step": 3038, + "time_per_iteration": 2.488044500350952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076481, + "balance_loss_mlp": 1.04083705, + "epoch": 0.5846479415159677, + "flos": 410005283328.0, + "grad_norm": 0.06107663121836191, + "language_loss": 0.76691568, + "learning_rate": 0.0003881755352345322, + "loss": 0.77768052, + "num_input_tokens_seen": 253657904, + "router_z_loss_mlp": 0.35644531, + "step": 3039, + "time_per_iteration": 2.4996848106384277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076016, + "balance_loss_mlp": 1.03944278, + "epoch": 0.5848403232012312, + "flos": 491056542720.0, + "grad_norm": 0.04475599589029588, + "language_loss": 0.86940634, + "learning_rate": 0.0003878719060903207, + "loss": 0.88016653, + "num_input_tokens_seen": 253725280, + "router_z_loss_mlp": 0.36572266, + "step": 3040, + "time_per_iteration": 2.5631661415100098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107371, + "balance_loss_mlp": 1.03823376, + "epoch": 0.5850327048864948, + "flos": 584146335744.0, + "grad_norm": 0.06623374989281658, + "language_loss": 0.82883763, + "learning_rate": 0.0003875683204780961, + "loss": 0.83957475, + "num_input_tokens_seen": 253795040, + "router_z_loss_mlp": 0.35522461, + "step": 3041, + "time_per_iteration": 2.7194101810455322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074225, + "balance_loss_mlp": 1.03765166, + "epoch": 0.5852250865717584, + "flos": 651253049856.0, + "grad_norm": 0.05546398592496706, + "language_loss": 0.84983653, + "learning_rate": 0.00038726477851572043, + "loss": 0.86057878, + "num_input_tokens_seen": 253866384, + "router_z_loss_mlp": 0.36572266, + "step": 3042, + "time_per_iteration": 2.809687376022339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072999, + "balance_loss_mlp": 1.03659296, + "epoch": 0.5854174682570219, + "flos": 534332090880.0, + "grad_norm": 0.07237686853447298, + "language_loss": 0.80418718, + "learning_rate": 0.0003869612803210395, + "loss": 0.81491715, + "num_input_tokens_seen": 253935712, + "router_z_loss_mlp": 0.36401367, + "step": 3043, + "time_per_iteration": 2.6141133308410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074965, + "balance_loss_mlp": 1.03872585, + "epoch": 0.5856098499422855, + "flos": 509501352960.0, + "grad_norm": 0.08321780378599658, + "language_loss": 0.83029413, + "learning_rate": 0.0003866578260118817, + "loss": 0.84104383, + "num_input_tokens_seen": 254003152, + "router_z_loss_mlp": 0.36254883, + "step": 3044, + "time_per_iteration": 2.5739400386810303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070627, + "balance_loss_mlp": 1.03438699, + "epoch": 0.5858022316275491, + "flos": 593619729408.0, + "grad_norm": 0.061750802810204855, + "language_loss": 0.83199847, + "learning_rate": 0.0003863544157060581, + "loss": 0.84270471, + "num_input_tokens_seen": 254072816, + "router_z_loss_mlp": 0.36254883, + "step": 3045, + "time_per_iteration": 2.662442207336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077309, + "balance_loss_mlp": 1.04083109, + "epoch": 0.5859946133128127, + "flos": 558829587456.0, + "grad_norm": 0.0566139046566934, + "language_loss": 0.82210046, + "learning_rate": 0.0003860510495213634, + "loss": 0.83287358, + "num_input_tokens_seen": 254152800, + "router_z_loss_mlp": 0.36499023, + "step": 3046, + "time_per_iteration": 2.817676305770874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086311, + "balance_loss_mlp": 1.04885542, + "epoch": 0.5861869949980761, + "flos": 553431647232.0, + "grad_norm": 0.06969052760403557, + "language_loss": 0.77781415, + "learning_rate": 0.0003857477275755746, + "loss": 0.78867728, + "num_input_tokens_seen": 254224384, + "router_z_loss_mlp": 0.37451172, + "step": 3047, + "time_per_iteration": 2.645547389984131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076852, + "balance_loss_mlp": 1.03994477, + "epoch": 0.5863793766833397, + "flos": 718321886208.0, + "grad_norm": 0.060152245737565335, + "language_loss": 0.83672923, + "learning_rate": 0.00038544444998645167, + "loss": 0.84749776, + "num_input_tokens_seen": 254310960, + "router_z_loss_mlp": 0.36914062, + "step": 3048, + "time_per_iteration": 2.995572090148926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080654, + "balance_loss_mlp": 1.04410434, + "epoch": 0.5865717583686033, + "flos": 472041354240.0, + "grad_norm": 0.05877541838315078, + "language_loss": 0.81869525, + "learning_rate": 0.00038514121687173767, + "loss": 0.82950181, + "num_input_tokens_seen": 254378336, + "router_z_loss_mlp": 0.36572266, + "step": 3049, + "time_per_iteration": 2.5653092861175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085484, + "balance_loss_mlp": 1.04819572, + "epoch": 0.5867641400538669, + "flos": 813143162880.0, + "grad_norm": 0.060327128014073625, + "language_loss": 0.82117838, + "learning_rate": 0.00038483802834915807, + "loss": 0.83203322, + "num_input_tokens_seen": 254454352, + "router_z_loss_mlp": 0.37280273, + "step": 3050, + "time_per_iteration": 2.9661922454833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074132, + "balance_loss_mlp": 1.03755879, + "epoch": 0.5869565217391305, + "flos": 486285645312.0, + "grad_norm": 0.05442603126978945, + "language_loss": 0.78767669, + "learning_rate": 0.00038453488453642074, + "loss": 0.79841799, + "num_input_tokens_seen": 254526352, + "router_z_loss_mlp": 0.36547852, + "step": 3051, + "time_per_iteration": 2.6421701908111572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076937, + "balance_loss_mlp": 1.0401963, + "epoch": 0.587148903424394, + "flos": 569104704000.0, + "grad_norm": 0.050403805084847125, + "language_loss": 0.86714828, + "learning_rate": 0.00038423178555121697, + "loss": 0.87791765, + "num_input_tokens_seen": 254598720, + "router_z_loss_mlp": 0.36743164, + "step": 3052, + "time_per_iteration": 2.689039945602417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079498, + "balance_loss_mlp": 1.04239988, + "epoch": 0.5873412851096576, + "flos": 746948680704.0, + "grad_norm": 0.04537735372020953, + "language_loss": 0.85335124, + "learning_rate": 0.00038392873151121994, + "loss": 0.86414617, + "num_input_tokens_seen": 254683664, + "router_z_loss_mlp": 0.37084961, + "step": 3053, + "time_per_iteration": 3.0252749919891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071586, + "balance_loss_mlp": 1.03510821, + "epoch": 0.5875336667949211, + "flos": 527882144256.0, + "grad_norm": 0.0531573443466337, + "language_loss": 0.82837141, + "learning_rate": 0.0003836257225340859, + "loss": 0.83908725, + "num_input_tokens_seen": 254754688, + "router_z_loss_mlp": 0.36474609, + "step": 3054, + "time_per_iteration": 2.6028475761413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074191, + "balance_loss_mlp": 1.03728426, + "epoch": 0.5877260484801847, + "flos": 823799420928.0, + "grad_norm": 0.057535155706969474, + "language_loss": 0.81870168, + "learning_rate": 0.00038332275873745336, + "loss": 0.82944363, + "num_input_tokens_seen": 254838976, + "router_z_loss_mlp": 0.36889648, + "step": 3055, + "time_per_iteration": 3.1007511615753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074496, + "balance_loss_mlp": 1.03682637, + "epoch": 0.5879184301654482, + "flos": 591325221888.0, + "grad_norm": 0.0460079349498171, + "language_loss": 0.82943761, + "learning_rate": 0.0003830198402389431, + "loss": 0.84018254, + "num_input_tokens_seen": 254912912, + "router_z_loss_mlp": 0.37646484, + "step": 3056, + "time_per_iteration": 2.6919126510620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010412, + "balance_loss_mlp": 1.02975643, + "epoch": 0.5881108118507118, + "flos": 1544953955328.0, + "grad_norm": 0.021887470100806234, + "language_loss": 0.77348936, + "learning_rate": 0.0003827169671561585, + "loss": 0.78390133, + "num_input_tokens_seen": 255151488, + "router_z_loss_mlp": 0.11425781, + "step": 3057, + "time_per_iteration": 4.971444368362427 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072291, + "balance_loss_mlp": 1.03576517, + "epoch": 0.5883031935359754, + "flos": 489348380160.0, + "grad_norm": 0.055950804718103285, + "language_loss": 0.82692897, + "learning_rate": 0.0003824141396066855, + "loss": 0.83765185, + "num_input_tokens_seen": 255218896, + "router_z_loss_mlp": 0.36572266, + "step": 3058, + "time_per_iteration": 2.5848963260650635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074876, + "balance_loss_mlp": 1.03842139, + "epoch": 0.588495575221239, + "flos": 582551654400.0, + "grad_norm": 0.05305150563857962, + "language_loss": 0.82647693, + "learning_rate": 0.000382111357708092, + "loss": 0.83722568, + "num_input_tokens_seen": 255287408, + "router_z_loss_mlp": 0.36499023, + "step": 3059, + "time_per_iteration": 2.750030279159546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071916, + "balance_loss_mlp": 1.03558111, + "epoch": 0.5886879569065026, + "flos": 660751174656.0, + "grad_norm": 0.05165433097502605, + "language_loss": 0.83451211, + "learning_rate": 0.00038180862157792864, + "loss": 0.84523129, + "num_input_tokens_seen": 255358432, + "router_z_loss_mlp": 0.36303711, + "step": 3060, + "time_per_iteration": 2.7654812335968018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070431, + "balance_loss_mlp": 1.03414369, + "epoch": 0.588880338591766, + "flos": 562392889344.0, + "grad_norm": 0.05703427459216956, + "language_loss": 0.82004499, + "learning_rate": 0.0003815059313337279, + "loss": 0.83074933, + "num_input_tokens_seen": 255425744, + "router_z_loss_mlp": 0.36279297, + "step": 3061, + "time_per_iteration": 2.659722089767456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072147, + "balance_loss_mlp": 1.03585935, + "epoch": 0.5890727202770296, + "flos": 554451568128.0, + "grad_norm": 0.04901881896382658, + "language_loss": 0.77886307, + "learning_rate": 0.00038120328709300436, + "loss": 0.78958452, + "num_input_tokens_seen": 255505808, + "router_z_loss_mlp": 0.36279297, + "step": 3062, + "time_per_iteration": 2.8264663219451904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076904, + "balance_loss_mlp": 1.04114151, + "epoch": 0.5892651019622932, + "flos": 655226606592.0, + "grad_norm": 0.057794453116502664, + "language_loss": 0.83449113, + "learning_rate": 0.0003809006889732549, + "loss": 0.84526014, + "num_input_tokens_seen": 255580160, + "router_z_loss_mlp": 0.35766602, + "step": 3063, + "time_per_iteration": 2.780714511871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073572, + "balance_loss_mlp": 1.03680801, + "epoch": 0.5894574836475568, + "flos": 452970911232.0, + "grad_norm": 0.048397381644471126, + "language_loss": 0.87604314, + "learning_rate": 0.0003805981370919589, + "loss": 0.88677883, + "num_input_tokens_seen": 255644016, + "router_z_loss_mlp": 0.36743164, + "step": 3064, + "time_per_iteration": 2.497511386871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077136, + "balance_loss_mlp": 1.03965652, + "epoch": 0.5896498653328203, + "flos": 518763750912.0, + "grad_norm": 0.05535483461806511, + "language_loss": 0.83910584, + "learning_rate": 0.0003802956315665771, + "loss": 0.84987724, + "num_input_tokens_seen": 255718192, + "router_z_loss_mlp": 0.37475586, + "step": 3065, + "time_per_iteration": 2.6540539264678955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075706, + "balance_loss_mlp": 1.03965688, + "epoch": 0.5898422470180839, + "flos": 548793169920.0, + "grad_norm": 0.06978967624296899, + "language_loss": 0.81621277, + "learning_rate": 0.0003799931725145529, + "loss": 0.82696986, + "num_input_tokens_seen": 255787696, + "router_z_loss_mlp": 0.3605957, + "step": 3066, + "time_per_iteration": 2.5999929904937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075756, + "balance_loss_mlp": 1.04015982, + "epoch": 0.5900346287033474, + "flos": 524046799872.0, + "grad_norm": 0.06178961053063138, + "language_loss": 0.85556895, + "learning_rate": 0.00037969076005331083, + "loss": 0.86632651, + "num_input_tokens_seen": 255862992, + "router_z_loss_mlp": 0.35571289, + "step": 3067, + "time_per_iteration": 2.7505955696105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080728, + "balance_loss_mlp": 1.04372525, + "epoch": 0.590227010388611, + "flos": 566893154304.0, + "grad_norm": 0.059517883137225745, + "language_loss": 0.88041914, + "learning_rate": 0.00037938839430025817, + "loss": 0.89122641, + "num_input_tokens_seen": 255931872, + "router_z_loss_mlp": 0.36962891, + "step": 3068, + "time_per_iteration": 2.6254634857177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072667, + "balance_loss_mlp": 1.03714228, + "epoch": 0.5904193920738746, + "flos": 583053631488.0, + "grad_norm": 0.05094647187222568, + "language_loss": 0.85285151, + "learning_rate": 0.0003790860753727835, + "loss": 0.8635782, + "num_input_tokens_seen": 256004656, + "router_z_loss_mlp": 0.35546875, + "step": 3069, + "time_per_iteration": 2.790996551513672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076132, + "balance_loss_mlp": 1.04056025, + "epoch": 0.5906117737591381, + "flos": 529428773376.0, + "grad_norm": 0.06487433034023032, + "language_loss": 0.82915914, + "learning_rate": 0.00037878380338825766, + "loss": 0.83992046, + "num_input_tokens_seen": 256076944, + "router_z_loss_mlp": 0.35644531, + "step": 3070, + "time_per_iteration": 2.6697611808776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078223, + "balance_loss_mlp": 1.04276967, + "epoch": 0.5908041554444017, + "flos": 683908655616.0, + "grad_norm": 0.053205750192721994, + "language_loss": 0.81560326, + "learning_rate": 0.00037848157846403287, + "loss": 0.8263855, + "num_input_tokens_seen": 256154768, + "router_z_loss_mlp": 0.35473633, + "step": 3071, + "time_per_iteration": 2.92523193359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077534, + "balance_loss_mlp": 1.04246306, + "epoch": 0.5909965371296653, + "flos": 549719958528.0, + "grad_norm": 0.04683417834560967, + "language_loss": 0.83405554, + "learning_rate": 0.0003781794007174435, + "loss": 0.84483093, + "num_input_tokens_seen": 256230896, + "router_z_loss_mlp": 0.35107422, + "step": 3072, + "time_per_iteration": 2.7881455421447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01022638, + "balance_loss_mlp": 1.01200461, + "epoch": 0.5911889188149289, + "flos": 1491544475136.0, + "grad_norm": 0.008695883247199268, + "language_loss": 0.74074531, + "learning_rate": 0.0003778772702658051, + "loss": 0.75097167, + "num_input_tokens_seen": 256462336, + "router_z_loss_mlp": 0.10644531, + "step": 3073, + "time_per_iteration": 4.864701509475708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078671, + "balance_loss_mlp": 1.04293227, + "epoch": 0.5913813005001923, + "flos": 487630043136.0, + "grad_norm": 0.053099165858615995, + "language_loss": 0.80592149, + "learning_rate": 0.0003775751872264152, + "loss": 0.81670815, + "num_input_tokens_seen": 256539376, + "router_z_loss_mlp": 0.35766602, + "step": 3074, + "time_per_iteration": 2.7932956218719482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079498, + "balance_loss_mlp": 1.04409289, + "epoch": 0.5915736821854559, + "flos": 573034590720.0, + "grad_norm": 0.04575078918426429, + "language_loss": 0.86981148, + "learning_rate": 0.0003772731517165527, + "loss": 0.88060653, + "num_input_tokens_seen": 256617728, + "router_z_loss_mlp": 0.35449219, + "step": 3075, + "time_per_iteration": 2.7613656520843506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075792, + "balance_loss_mlp": 1.04060149, + "epoch": 0.5917660638707195, + "flos": 789183959040.0, + "grad_norm": 0.06797753963070947, + "language_loss": 0.84194851, + "learning_rate": 0.0003769711638534784, + "loss": 0.85270643, + "num_input_tokens_seen": 256696032, + "router_z_loss_mlp": 0.35205078, + "step": 3076, + "time_per_iteration": 2.991854190826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076527, + "balance_loss_mlp": 1.04181361, + "epoch": 0.5919584455559831, + "flos": 528487428096.0, + "grad_norm": 0.06227325112589354, + "language_loss": 0.78677326, + "learning_rate": 0.00037666922375443446, + "loss": 0.79753852, + "num_input_tokens_seen": 256767360, + "router_z_loss_mlp": 0.34765625, + "step": 3077, + "time_per_iteration": 2.591597557067871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072268, + "balance_loss_mlp": 1.03757811, + "epoch": 0.5921508272412467, + "flos": 560320962048.0, + "grad_norm": 0.056716138151229355, + "language_loss": 0.81505013, + "learning_rate": 0.00037636733153664396, + "loss": 0.82577276, + "num_input_tokens_seen": 256844848, + "router_z_loss_mlp": 0.34716797, + "step": 3078, + "time_per_iteration": 2.854278802871704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075513, + "balance_loss_mlp": 1.04144311, + "epoch": 0.5923432089265102, + "flos": 563008347648.0, + "grad_norm": 0.061835614307010005, + "language_loss": 0.79824865, + "learning_rate": 0.0003760654873173124, + "loss": 0.80900383, + "num_input_tokens_seen": 256916688, + "router_z_loss_mlp": 0.34082031, + "step": 3079, + "time_per_iteration": 2.66091251373291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079327, + "balance_loss_mlp": 1.04387426, + "epoch": 0.5925355906117737, + "flos": 495488406528.0, + "grad_norm": 0.052514491856325576, + "language_loss": 0.81763887, + "learning_rate": 0.00037576369121362566, + "loss": 0.8284322, + "num_input_tokens_seen": 256985520, + "router_z_loss_mlp": 0.35498047, + "step": 3080, + "time_per_iteration": 2.5847787857055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079309, + "balance_loss_mlp": 1.04473865, + "epoch": 0.5927279722970373, + "flos": 565940072448.0, + "grad_norm": 0.05276703199883553, + "language_loss": 0.81885982, + "learning_rate": 0.0003754619433427516, + "loss": 0.82965291, + "num_input_tokens_seen": 257067552, + "router_z_loss_mlp": 0.34570312, + "step": 3081, + "time_per_iteration": 2.898594856262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108187, + "balance_loss_mlp": 1.04682267, + "epoch": 0.5929203539823009, + "flos": 666674413056.0, + "grad_norm": 0.06717854488830324, + "language_loss": 0.77682364, + "learning_rate": 0.0003751602438218392, + "loss": 0.78764236, + "num_input_tokens_seen": 257138896, + "router_z_loss_mlp": 0.35083008, + "step": 3082, + "time_per_iteration": 2.7553367614746094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083555, + "balance_loss_mlp": 1.0486505, + "epoch": 0.5931127356675644, + "flos": 555484635648.0, + "grad_norm": 0.05625551140275949, + "language_loss": 0.83254004, + "learning_rate": 0.0003748585927680186, + "loss": 0.84337556, + "num_input_tokens_seen": 257210592, + "router_z_loss_mlp": 0.34912109, + "step": 3083, + "time_per_iteration": 2.6493966579437256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089532, + "balance_loss_mlp": 1.0530777, + "epoch": 0.593305117352828, + "flos": 534932992512.0, + "grad_norm": 0.07512877248395429, + "language_loss": 0.82828176, + "learning_rate": 0.00037455699029840086, + "loss": 0.83917707, + "num_input_tokens_seen": 257276208, + "router_z_loss_mlp": 0.36450195, + "step": 3084, + "time_per_iteration": 2.674532890319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079674, + "balance_loss_mlp": 1.04488921, + "epoch": 0.5934974990380916, + "flos": 593683748352.0, + "grad_norm": 0.05984158390569505, + "language_loss": 0.84177965, + "learning_rate": 0.0003742554365300787, + "loss": 0.85257638, + "num_input_tokens_seen": 257351920, + "router_z_loss_mlp": 0.34838867, + "step": 3085, + "time_per_iteration": 2.712371587753296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085917, + "balance_loss_mlp": 1.05044067, + "epoch": 0.5936898807233552, + "flos": 712339011072.0, + "grad_norm": 0.05068184961629974, + "language_loss": 0.78978491, + "learning_rate": 0.0003739539315801255, + "loss": 0.80064404, + "num_input_tokens_seen": 257430016, + "router_z_loss_mlp": 0.35473633, + "step": 3086, + "time_per_iteration": 2.916006565093994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088952, + "balance_loss_mlp": 1.05345142, + "epoch": 0.5938822624086187, + "flos": 391684128768.0, + "grad_norm": 0.05263578767135529, + "language_loss": 0.9165324, + "learning_rate": 0.000373652475565596, + "loss": 0.92742193, + "num_input_tokens_seen": 257492224, + "router_z_loss_mlp": 0.35522461, + "step": 3087, + "time_per_iteration": 2.470960855484009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094024, + "balance_loss_mlp": 1.05900025, + "epoch": 0.5940746440938822, + "flos": 480023373312.0, + "grad_norm": 0.060850763929597464, + "language_loss": 0.81550741, + "learning_rate": 0.00037335106860352587, + "loss": 0.82644761, + "num_input_tokens_seen": 257567824, + "router_z_loss_mlp": 0.35083008, + "step": 3088, + "time_per_iteration": 2.666472911834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097243, + "balance_loss_mlp": 1.06100357, + "epoch": 0.5942670257791458, + "flos": 483094872576.0, + "grad_norm": 0.049324641114684424, + "language_loss": 0.83196813, + "learning_rate": 0.00037304971081093146, + "loss": 0.84294057, + "num_input_tokens_seen": 257635488, + "router_z_loss_mlp": 0.36230469, + "step": 3089, + "time_per_iteration": 2.521000862121582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093812, + "balance_loss_mlp": 1.05967069, + "epoch": 0.5944594074644094, + "flos": 547656795648.0, + "grad_norm": 0.0533670066305608, + "language_loss": 0.81061506, + "learning_rate": 0.00037274840230481024, + "loss": 0.82155317, + "num_input_tokens_seen": 257709552, + "router_z_loss_mlp": 0.34179688, + "step": 3090, + "time_per_iteration": 2.7134556770324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092625, + "balance_loss_mlp": 1.05700517, + "epoch": 0.594651789149673, + "flos": 448943510016.0, + "grad_norm": 0.055393993008082114, + "language_loss": 0.78753984, + "learning_rate": 0.00037244714320214077, + "loss": 0.79846609, + "num_input_tokens_seen": 257775520, + "router_z_loss_mlp": 0.35620117, + "step": 3091, + "time_per_iteration": 2.5576789379119873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092048, + "balance_loss_mlp": 1.05640459, + "epoch": 0.5948441708349365, + "flos": 595969491456.0, + "grad_norm": 0.050698130573270175, + "language_loss": 0.83444929, + "learning_rate": 0.000372145933619882, + "loss": 0.84536982, + "num_input_tokens_seen": 257858560, + "router_z_loss_mlp": 0.35668945, + "step": 3092, + "time_per_iteration": 2.8742141723632812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091606, + "balance_loss_mlp": 1.05636811, + "epoch": 0.5950365525202, + "flos": 548251905024.0, + "grad_norm": 0.05419961551348069, + "language_loss": 0.82168603, + "learning_rate": 0.000371844773674974, + "loss": 0.83260214, + "num_input_tokens_seen": 257928048, + "router_z_loss_mlp": 0.3527832, + "step": 3093, + "time_per_iteration": 2.6228530406951904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094358, + "balance_loss_mlp": 1.05890489, + "epoch": 0.5952289342054636, + "flos": 654385595904.0, + "grad_norm": 0.05844341434318606, + "language_loss": 0.81673229, + "learning_rate": 0.0003715436634843375, + "loss": 0.82767594, + "num_input_tokens_seen": 258003088, + "router_z_loss_mlp": 0.35498047, + "step": 3094, + "time_per_iteration": 2.8496577739715576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084873, + "balance_loss_mlp": 1.04951525, + "epoch": 0.5954213158907272, + "flos": 603055245312.0, + "grad_norm": 0.0455107572696148, + "language_loss": 0.80728281, + "learning_rate": 0.00037124260316487355, + "loss": 0.81813157, + "num_input_tokens_seen": 258084880, + "router_z_loss_mlp": 0.35375977, + "step": 3095, + "time_per_iteration": 2.83181095123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084995, + "balance_loss_mlp": 1.05044806, + "epoch": 0.5956136975759908, + "flos": 486097970688.0, + "grad_norm": 0.0493360128544523, + "language_loss": 0.89028478, + "learning_rate": 0.0003709415928334643, + "loss": 0.90113473, + "num_input_tokens_seen": 258152032, + "router_z_loss_mlp": 0.34570312, + "step": 3096, + "time_per_iteration": 2.5334527492523193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081969, + "balance_loss_mlp": 1.0465641, + "epoch": 0.5958060792612543, + "flos": 658462459392.0, + "grad_norm": 0.05334894182240255, + "language_loss": 0.80644953, + "learning_rate": 0.00037064063260697233, + "loss": 0.81726921, + "num_input_tokens_seen": 258228896, + "router_z_loss_mlp": 0.35424805, + "step": 3097, + "time_per_iteration": 2.868948221206665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085643, + "balance_loss_mlp": 1.05004668, + "epoch": 0.5959984609465179, + "flos": 723201882624.0, + "grad_norm": 0.05441892470065276, + "language_loss": 0.78413296, + "learning_rate": 0.0003703397226022407, + "loss": 0.79498935, + "num_input_tokens_seen": 258311152, + "router_z_loss_mlp": 0.35595703, + "step": 3098, + "time_per_iteration": 3.0486435890197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054277, + "balance_loss_mlp": 1.04254675, + "epoch": 0.5961908426317815, + "flos": 1519010164224.0, + "grad_norm": 0.031936086773479797, + "language_loss": 0.75499874, + "learning_rate": 0.00037003886293609335, + "loss": 0.76554149, + "num_input_tokens_seen": 258540656, + "router_z_loss_mlp": 0.1171875, + "step": 3099, + "time_per_iteration": 4.9141762256622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082532, + "balance_loss_mlp": 1.04822397, + "epoch": 0.596383224317045, + "flos": 532357678080.0, + "grad_norm": 0.04537931846822051, + "language_loss": 0.83096731, + "learning_rate": 0.0003697380537253339, + "loss": 0.84179258, + "num_input_tokens_seen": 258608960, + "router_z_loss_mlp": 0.34350586, + "step": 3100, + "time_per_iteration": 2.6156232357025146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082884, + "balance_loss_mlp": 1.04766929, + "epoch": 0.5965756060023086, + "flos": 590922169344.0, + "grad_norm": 0.060003355935897486, + "language_loss": 0.81679451, + "learning_rate": 0.0003694372950867471, + "loss": 0.82762337, + "num_input_tokens_seen": 258684304, + "router_z_loss_mlp": 0.3527832, + "step": 3101, + "time_per_iteration": 2.746100902557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084672, + "balance_loss_mlp": 1.04967189, + "epoch": 0.5967679876875721, + "flos": 861701760000.0, + "grad_norm": 0.05796500812003716, + "language_loss": 0.77373374, + "learning_rate": 0.0003691365871370976, + "loss": 0.78458047, + "num_input_tokens_seen": 258769472, + "router_z_loss_mlp": 0.3503418, + "step": 3102, + "time_per_iteration": 3.0448250770568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082291, + "balance_loss_mlp": 1.04710054, + "epoch": 0.5969603693728357, + "flos": 553574241792.0, + "grad_norm": 0.05791620467430745, + "language_loss": 0.854276, + "learning_rate": 0.00036883592999313093, + "loss": 0.86509889, + "num_input_tokens_seen": 258841696, + "router_z_loss_mlp": 0.35229492, + "step": 3103, + "time_per_iteration": 2.650810718536377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082187, + "balance_loss_mlp": 1.04666269, + "epoch": 0.5971527510580993, + "flos": 718345207296.0, + "grad_norm": 0.05277795957282848, + "language_loss": 0.79037023, + "learning_rate": 0.0003685353237715722, + "loss": 0.80119205, + "num_input_tokens_seen": 258915616, + "router_z_loss_mlp": 0.35546875, + "step": 3104, + "time_per_iteration": 2.87162184715271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082645, + "balance_loss_mlp": 1.04812241, + "epoch": 0.5973451327433629, + "flos": 647324573184.0, + "grad_norm": 0.05039525348103138, + "language_loss": 0.81437027, + "learning_rate": 0.0003682347685891274, + "loss": 0.82519674, + "num_input_tokens_seen": 258994080, + "router_z_loss_mlp": 0.34570312, + "step": 3105, + "time_per_iteration": 2.844632863998413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078322, + "balance_loss_mlp": 1.04284513, + "epoch": 0.5975375144286263, + "flos": 721374446592.0, + "grad_norm": 0.053848168408106474, + "language_loss": 0.80436707, + "learning_rate": 0.0003679342645624822, + "loss": 0.81515038, + "num_input_tokens_seen": 259075968, + "router_z_loss_mlp": 0.35498047, + "step": 3106, + "time_per_iteration": 2.961121082305908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079962, + "balance_loss_mlp": 1.04374671, + "epoch": 0.5977298961138899, + "flos": 750616699392.0, + "grad_norm": 0.04889819009677852, + "language_loss": 0.8164891, + "learning_rate": 0.0003676338118083025, + "loss": 0.82728875, + "num_input_tokens_seen": 259162512, + "router_z_loss_mlp": 0.36230469, + "step": 3107, + "time_per_iteration": 2.997671127319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076541, + "balance_loss_mlp": 1.04161251, + "epoch": 0.5979222777991535, + "flos": 530703360000.0, + "grad_norm": 0.05034919609110883, + "language_loss": 0.79592144, + "learning_rate": 0.0003673334104432347, + "loss": 0.80668688, + "num_input_tokens_seen": 259228752, + "router_z_loss_mlp": 0.34960938, + "step": 3108, + "time_per_iteration": 2.5946898460388184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079709, + "balance_loss_mlp": 1.04461432, + "epoch": 0.5981146594844171, + "flos": 621459357696.0, + "grad_norm": 0.04952863942356172, + "language_loss": 0.83337331, + "learning_rate": 0.0003670330605839048, + "loss": 0.84417045, + "num_input_tokens_seen": 259303440, + "router_z_loss_mlp": 0.35131836, + "step": 3109, + "time_per_iteration": 2.7955031394958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080275, + "balance_loss_mlp": 1.04470301, + "epoch": 0.5983070411696807, + "flos": 603309911040.0, + "grad_norm": 0.05233505638894281, + "language_loss": 0.76384044, + "learning_rate": 0.0003667327623469191, + "loss": 0.77464318, + "num_input_tokens_seen": 259378752, + "router_z_loss_mlp": 0.35571289, + "step": 3110, + "time_per_iteration": 2.7939095497131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080046, + "balance_loss_mlp": 1.04516506, + "epoch": 0.5984994228549442, + "flos": 633187971072.0, + "grad_norm": 0.05191698416970628, + "language_loss": 0.7765972, + "learning_rate": 0.00036643251584886333, + "loss": 0.78739762, + "num_input_tokens_seen": 259454336, + "router_z_loss_mlp": 0.34912109, + "step": 3111, + "time_per_iteration": 2.821956157684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076682, + "balance_loss_mlp": 1.0426122, + "epoch": 0.5986918045402078, + "flos": 525026022912.0, + "grad_norm": 0.05255438672232182, + "language_loss": 0.81679058, + "learning_rate": 0.00036613232120630393, + "loss": 0.82755744, + "num_input_tokens_seen": 259518960, + "router_z_loss_mlp": 0.34106445, + "step": 3112, + "time_per_iteration": 2.61639142036438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072562, + "balance_loss_mlp": 1.03751469, + "epoch": 0.5988841862254713, + "flos": 482942103552.0, + "grad_norm": 0.06309856820969045, + "language_loss": 0.8010537, + "learning_rate": 0.00036583217853578643, + "loss": 0.81177926, + "num_input_tokens_seen": 259584352, + "router_z_loss_mlp": 0.35083008, + "step": 3113, + "time_per_iteration": 2.544152021408081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076935, + "balance_loss_mlp": 1.04241252, + "epoch": 0.5990765679107349, + "flos": 1139658674688.0, + "grad_norm": 0.05746596179478014, + "language_loss": 0.7739538, + "learning_rate": 0.000365532087953837, + "loss": 0.78472316, + "num_input_tokens_seen": 259693152, + "router_z_loss_mlp": 0.34545898, + "step": 3114, + "time_per_iteration": 3.6210074424743652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074738, + "balance_loss_mlp": 1.04104948, + "epoch": 0.5992689495959984, + "flos": 516729701376.0, + "grad_norm": 0.0590793434639382, + "language_loss": 0.89283043, + "learning_rate": 0.00036523204957696065, + "loss": 0.9035778, + "num_input_tokens_seen": 259762048, + "router_z_loss_mlp": 0.3371582, + "step": 3115, + "time_per_iteration": 2.5835559368133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079472, + "balance_loss_mlp": 1.0447346, + "epoch": 0.599461331281262, + "flos": 744288998400.0, + "grad_norm": 0.05148674480480004, + "language_loss": 0.80590332, + "learning_rate": 0.00036493206352164324, + "loss": 0.81669807, + "num_input_tokens_seen": 259843184, + "router_z_loss_mlp": 0.34790039, + "step": 3116, + "time_per_iteration": 2.9135849475860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073646, + "balance_loss_mlp": 1.03960013, + "epoch": 0.5996537129665256, + "flos": 592078892544.0, + "grad_norm": 0.05828379622393402, + "language_loss": 0.85252976, + "learning_rate": 0.000364632129904349, + "loss": 0.86326623, + "num_input_tokens_seen": 259912720, + "router_z_loss_mlp": 0.34082031, + "step": 3117, + "time_per_iteration": 2.7019104957580566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107531, + "balance_loss_mlp": 1.03997648, + "epoch": 0.5998460946517892, + "flos": 558735045120.0, + "grad_norm": 0.05080253376139345, + "language_loss": 0.77507442, + "learning_rate": 0.00036433224884152283, + "loss": 0.78582752, + "num_input_tokens_seen": 259985472, + "router_z_loss_mlp": 0.35375977, + "step": 3118, + "time_per_iteration": 2.698032855987549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082073, + "balance_loss_mlp": 1.04814649, + "epoch": 0.6000384763370528, + "flos": 484325789184.0, + "grad_norm": 0.058104830427354655, + "language_loss": 0.77595496, + "learning_rate": 0.00036403242044958875, + "loss": 0.78677565, + "num_input_tokens_seen": 260050336, + "router_z_loss_mlp": 0.33959961, + "step": 3119, + "time_per_iteration": 2.5694661140441895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082708, + "balance_loss_mlp": 1.04763699, + "epoch": 0.6002308580223162, + "flos": 596490407424.0, + "grad_norm": 0.05350136271967441, + "language_loss": 0.91317761, + "learning_rate": 0.0003637326448449507, + "loss": 0.92400473, + "num_input_tokens_seen": 260120304, + "router_z_loss_mlp": 0.35083008, + "step": 3120, + "time_per_iteration": 2.7095799446105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083063, + "balance_loss_mlp": 1.04808724, + "epoch": 0.6004232397075798, + "flos": 544879249920.0, + "grad_norm": 0.044412764387293725, + "language_loss": 0.86037177, + "learning_rate": 0.00036343292214399177, + "loss": 0.87120235, + "num_input_tokens_seen": 260198304, + "router_z_loss_mlp": 0.34985352, + "step": 3121, + "time_per_iteration": 2.760568618774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074765, + "balance_loss_mlp": 1.04112399, + "epoch": 0.6006156213928434, + "flos": 629647990272.0, + "grad_norm": 0.05788035172914192, + "language_loss": 0.770136, + "learning_rate": 0.00036313325246307456, + "loss": 0.78088361, + "num_input_tokens_seen": 260277664, + "router_z_loss_mlp": 0.33666992, + "step": 3122, + "time_per_iteration": 2.7645843029022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081915, + "balance_loss_mlp": 1.0479641, + "epoch": 0.600808003078107, + "flos": 582043885056.0, + "grad_norm": 0.05339440368403648, + "language_loss": 0.8713336, + "learning_rate": 0.0003628336359185411, + "loss": 0.8821528, + "num_input_tokens_seen": 260350096, + "router_z_loss_mlp": 0.33984375, + "step": 3123, + "time_per_iteration": 2.704559803009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084761, + "balance_loss_mlp": 1.04961848, + "epoch": 0.6010003847633705, + "flos": 634984883712.0, + "grad_norm": 0.051464767664237604, + "language_loss": 0.7543686, + "learning_rate": 0.000362534072626713, + "loss": 0.76521623, + "num_input_tokens_seen": 260421888, + "router_z_loss_mlp": 0.35180664, + "step": 3124, + "time_per_iteration": 2.767263174057007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082659, + "balance_loss_mlp": 1.04837453, + "epoch": 0.6011927664486341, + "flos": 718448514048.0, + "grad_norm": 0.05118450522862765, + "language_loss": 0.80810112, + "learning_rate": 0.00036223456270389093, + "loss": 0.81892776, + "num_input_tokens_seen": 260499616, + "router_z_loss_mlp": 0.34326172, + "step": 3125, + "time_per_iteration": 2.972226858139038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077124, + "balance_loss_mlp": 1.04272032, + "epoch": 0.6013851481338977, + "flos": 498782486016.0, + "grad_norm": 0.0486392008074567, + "language_loss": 0.81048089, + "learning_rate": 0.00036193510626635517, + "loss": 0.82125211, + "num_input_tokens_seen": 260572048, + "router_z_loss_mlp": 0.34423828, + "step": 3126, + "time_per_iteration": 2.6381988525390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080205, + "balance_loss_mlp": 1.04620612, + "epoch": 0.6015775298191612, + "flos": 749266509312.0, + "grad_norm": 0.057928922724073975, + "language_loss": 0.81419915, + "learning_rate": 0.0003616357034303649, + "loss": 0.82500118, + "num_input_tokens_seen": 260644720, + "router_z_loss_mlp": 0.34033203, + "step": 3127, + "time_per_iteration": 2.910590887069702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077974, + "balance_loss_mlp": 1.04380846, + "epoch": 0.6017699115044248, + "flos": 592764162048.0, + "grad_norm": 0.06444067726606947, + "language_loss": 0.7886622, + "learning_rate": 0.0003613363543121584, + "loss": 0.79944193, + "num_input_tokens_seen": 260724864, + "router_z_loss_mlp": 0.34204102, + "step": 3128, + "time_per_iteration": 2.8243367671966553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081251, + "balance_loss_mlp": 1.04627466, + "epoch": 0.6019622931896883, + "flos": 514839656448.0, + "grad_norm": 0.05655060163799935, + "language_loss": 0.8488009, + "learning_rate": 0.00036103705902795357, + "loss": 0.85961336, + "num_input_tokens_seen": 260800896, + "router_z_loss_mlp": 0.35009766, + "step": 3129, + "time_per_iteration": 2.691652297973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078555, + "balance_loss_mlp": 1.0440799, + "epoch": 0.6021546748749519, + "flos": 490219914240.0, + "grad_norm": 0.11187816626328603, + "language_loss": 0.79397345, + "learning_rate": 0.0003607378176939471, + "loss": 0.80475903, + "num_input_tokens_seen": 260872736, + "router_z_loss_mlp": 0.3449707, + "step": 3130, + "time_per_iteration": 2.59126353263855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080272, + "balance_loss_mlp": 1.0459156, + "epoch": 0.6023470565602155, + "flos": 540763098624.0, + "grad_norm": 0.584663234761047, + "language_loss": 0.81865788, + "learning_rate": 0.00036043863042631465, + "loss": 0.82946062, + "num_input_tokens_seen": 260943264, + "router_z_loss_mlp": 0.34399414, + "step": 3131, + "time_per_iteration": 2.631758689880371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082979, + "balance_loss_mlp": 1.04716837, + "epoch": 0.6025394382454791, + "flos": 844660984320.0, + "grad_norm": 0.054894708667503185, + "language_loss": 0.76558393, + "learning_rate": 0.00036013949734121133, + "loss": 0.77641368, + "num_input_tokens_seen": 261030064, + "router_z_loss_mlp": 0.3581543, + "step": 3132, + "time_per_iteration": 3.091432809829712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077376, + "balance_loss_mlp": 1.04249549, + "epoch": 0.6027318199307425, + "flos": 576903430656.0, + "grad_norm": 0.05648602970445555, + "language_loss": 0.82430494, + "learning_rate": 0.00035984041855477043, + "loss": 0.83507866, + "num_input_tokens_seen": 261106496, + "router_z_loss_mlp": 0.34912109, + "step": 3133, + "time_per_iteration": 2.707841396331787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045948, + "balance_loss_mlp": 1.03345478, + "epoch": 0.6029242016160061, + "flos": 1470160585728.0, + "grad_norm": 0.017118275971869903, + "language_loss": 0.78709894, + "learning_rate": 0.00035954139418310495, + "loss": 0.79755843, + "num_input_tokens_seen": 261343248, + "router_z_loss_mlp": 0.125, + "step": 3134, + "time_per_iteration": 4.929067373275757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077811, + "balance_loss_mlp": 1.0416429, + "epoch": 0.6031165833012697, + "flos": 480486062592.0, + "grad_norm": 0.057341971523643794, + "language_loss": 0.79656577, + "learning_rate": 0.00035924242434230637, + "loss": 0.80734384, + "num_input_tokens_seen": 261416704, + "router_z_loss_mlp": 0.36181641, + "step": 3135, + "time_per_iteration": 2.6362884044647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078775, + "balance_loss_mlp": 1.04294014, + "epoch": 0.6033089649865333, + "flos": 499220444160.0, + "grad_norm": 0.48805573037273664, + "language_loss": 0.78477532, + "learning_rate": 0.00035894350914844516, + "loss": 0.79556304, + "num_input_tokens_seen": 261486688, + "router_z_loss_mlp": 0.35864258, + "step": 3136, + "time_per_iteration": 2.5889768600463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095737, + "balance_loss_mlp": 1.05961668, + "epoch": 0.6035013466717969, + "flos": 556337230848.0, + "grad_norm": 0.06198645185938339, + "language_loss": 0.828888, + "learning_rate": 0.0003586446487175703, + "loss": 0.83984536, + "num_input_tokens_seen": 261557344, + "router_z_loss_mlp": 0.36132812, + "step": 3137, + "time_per_iteration": 2.6805853843688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105357, + "balance_loss_mlp": 1.06690025, + "epoch": 0.6036937283570604, + "flos": 594536343552.0, + "grad_norm": 0.04857529981882101, + "language_loss": 0.85242814, + "learning_rate": 0.0003583458431657099, + "loss": 0.86348164, + "num_input_tokens_seen": 261626240, + "router_z_loss_mlp": 0.3840332, + "step": 3138, + "time_per_iteration": 2.8694372177124023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107329, + "balance_loss_mlp": 1.0691824, + "epoch": 0.603886110042324, + "flos": 540684523008.0, + "grad_norm": 0.0686265379907432, + "language_loss": 0.82493383, + "learning_rate": 0.00035804709260887056, + "loss": 0.83600712, + "num_input_tokens_seen": 261696368, + "router_z_loss_mlp": 0.38110352, + "step": 3139, + "time_per_iteration": 2.6613197326660156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111658, + "balance_loss_mlp": 1.07664514, + "epoch": 0.6040784917275875, + "flos": 518315618304.0, + "grad_norm": 0.04727969625034485, + "language_loss": 0.89413351, + "learning_rate": 0.0003577483971630373, + "loss": 0.90529931, + "num_input_tokens_seen": 261769104, + "router_z_loss_mlp": 0.39916992, + "step": 3140, + "time_per_iteration": 2.6468544006347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112858, + "balance_loss_mlp": 1.08752418, + "epoch": 0.6042708734128511, + "flos": 660436872192.0, + "grad_norm": 0.0491739702694389, + "language_loss": 0.84699506, + "learning_rate": 0.00035744975694417414, + "loss": 0.8582809, + "num_input_tokens_seen": 261844880, + "router_z_loss_mlp": 0.41064453, + "step": 3141, + "time_per_iteration": 2.8567256927490234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128353, + "balance_loss_mlp": 1.0867728, + "epoch": 0.6044632550981146, + "flos": 572035018752.0, + "grad_norm": 0.05704066286420323, + "language_loss": 0.82333231, + "learning_rate": 0.00035715117206822344, + "loss": 0.83461583, + "num_input_tokens_seen": 261923280, + "router_z_loss_mlp": 0.41577148, + "step": 3142, + "time_per_iteration": 2.7504515647888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141414, + "balance_loss_mlp": 1.09892821, + "epoch": 0.6046556367833782, + "flos": 546420086784.0, + "grad_norm": 0.06612582666460322, + "language_loss": 0.80943495, + "learning_rate": 0.0003568526426511065, + "loss": 0.82084912, + "num_input_tokens_seen": 261990832, + "router_z_loss_mlp": 0.42456055, + "step": 3143, + "time_per_iteration": 2.6085774898529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140491, + "balance_loss_mlp": 1.09817219, + "epoch": 0.6048480184686418, + "flos": 776505235968.0, + "grad_norm": 0.064383973380027, + "language_loss": 0.82750165, + "learning_rate": 0.000356554168808722, + "loss": 0.83890665, + "num_input_tokens_seen": 262063760, + "router_z_loss_mlp": 0.42358398, + "step": 3144, + "time_per_iteration": 2.9655168056488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140449, + "balance_loss_mlp": 1.09834385, + "epoch": 0.6050404001539054, + "flos": 656837254656.0, + "grad_norm": 0.05900200764303625, + "language_loss": 0.85025299, + "learning_rate": 0.00035625575065694837, + "loss": 0.8616575, + "num_input_tokens_seen": 262137968, + "router_z_loss_mlp": 0.42114258, + "step": 3145, + "time_per_iteration": 2.826193332672119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134203, + "balance_loss_mlp": 1.09159803, + "epoch": 0.605232781839169, + "flos": 548710212096.0, + "grad_norm": 0.05530707742448767, + "language_loss": 0.77449524, + "learning_rate": 0.0003559573883116415, + "loss": 0.78583729, + "num_input_tokens_seen": 262211264, + "router_z_loss_mlp": 0.42626953, + "step": 3146, + "time_per_iteration": 2.6936702728271484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114118, + "balance_loss_mlp": 1.0976212, + "epoch": 0.6054251635244324, + "flos": 605093677056.0, + "grad_norm": 0.08058095897808437, + "language_loss": 0.85587645, + "learning_rate": 0.00035565908188863604, + "loss": 0.86728823, + "num_input_tokens_seen": 262289648, + "router_z_loss_mlp": 0.43579102, + "step": 3147, + "time_per_iteration": 2.8229072093963623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113968, + "balance_loss_mlp": 1.09488153, + "epoch": 0.605617545209696, + "flos": 613398763008.0, + "grad_norm": 0.05127524075744011, + "language_loss": 0.79730809, + "learning_rate": 0.00035536083150374464, + "loss": 0.80870491, + "num_input_tokens_seen": 262362704, + "router_z_loss_mlp": 0.44799805, + "step": 3148, + "time_per_iteration": 2.782287836074829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139029, + "balance_loss_mlp": 1.12310266, + "epoch": 0.6058099268949596, + "flos": 1497477888000.0, + "grad_norm": 0.03498965475006418, + "language_loss": 0.74747956, + "learning_rate": 0.00035506263727275893, + "loss": 0.75886977, + "num_input_tokens_seen": 262596864, + "router_z_loss_mlp": 0.15917969, + "step": 3149, + "time_per_iteration": 4.813022613525391 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128616, + "balance_loss_mlp": 1.08696485, + "epoch": 0.6060023085802232, + "flos": 670170723840.0, + "grad_norm": 0.053702261720826414, + "language_loss": 0.85731369, + "learning_rate": 0.0003547644993114475, + "loss": 0.86859989, + "num_input_tokens_seen": 262671088, + "router_z_loss_mlp": 0.41650391, + "step": 3150, + "time_per_iteration": 2.7940874099731445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118224, + "balance_loss_mlp": 1.07688236, + "epoch": 0.6061946902654868, + "flos": 605885225472.0, + "grad_norm": 0.05286284770127293, + "language_loss": 0.79495448, + "learning_rate": 0.00035446641773555806, + "loss": 0.80613673, + "num_input_tokens_seen": 262743888, + "router_z_loss_mlp": 0.41357422, + "step": 3151, + "time_per_iteration": 2.7147328853607178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116917, + "balance_loss_mlp": 1.07567072, + "epoch": 0.6063870719507503, + "flos": 557568147456.0, + "grad_norm": 0.052762165498596546, + "language_loss": 0.86798322, + "learning_rate": 0.000354168392660816, + "loss": 0.87915242, + "num_input_tokens_seen": 262819616, + "router_z_loss_mlp": 0.41235352, + "step": 3152, + "time_per_iteration": 2.7346954345703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115838, + "balance_loss_mlp": 1.07583165, + "epoch": 0.6065794536360138, + "flos": 556874113536.0, + "grad_norm": 0.05405599690586098, + "language_loss": 0.82799989, + "learning_rate": 0.0003538704242029252, + "loss": 0.8391583, + "num_input_tokens_seen": 262893984, + "router_z_loss_mlp": 0.39990234, + "step": 3153, + "time_per_iteration": 2.705004930496216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112343, + "balance_loss_mlp": 1.07169282, + "epoch": 0.6067718353212774, + "flos": 689836276224.0, + "grad_norm": 0.05919499383434511, + "language_loss": 0.77963281, + "learning_rate": 0.0003535725124775672, + "loss": 0.79075623, + "num_input_tokens_seen": 262969648, + "router_z_loss_mlp": 0.40649414, + "step": 3154, + "time_per_iteration": 2.8201727867126465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110719, + "balance_loss_mlp": 1.07147574, + "epoch": 0.606964217006541, + "flos": 521531122176.0, + "grad_norm": 0.06643297661580516, + "language_loss": 0.86598241, + "learning_rate": 0.00035327465760040126, + "loss": 0.87708956, + "num_input_tokens_seen": 263042048, + "router_z_loss_mlp": 0.39233398, + "step": 3155, + "time_per_iteration": 2.6584889888763428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100039, + "balance_loss_mlp": 1.06201148, + "epoch": 0.6071565986918045, + "flos": 641267504640.0, + "grad_norm": 0.0597836437175205, + "language_loss": 0.84776556, + "learning_rate": 0.00035297685968706526, + "loss": 0.85876596, + "num_input_tokens_seen": 263108032, + "router_z_loss_mlp": 0.37988281, + "step": 3156, + "time_per_iteration": 2.752196788787842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109676, + "balance_loss_mlp": 1.05708754, + "epoch": 0.6073489803770681, + "flos": 560315169792.0, + "grad_norm": 0.05609890059594196, + "language_loss": 0.8300876, + "learning_rate": 0.00035267911885317454, + "loss": 0.84105527, + "num_input_tokens_seen": 263175184, + "router_z_loss_mlp": 0.39672852, + "step": 3157, + "time_per_iteration": 2.629136562347412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109981, + "balance_loss_mlp": 1.06121039, + "epoch": 0.6075413620623317, + "flos": 585810828288.0, + "grad_norm": 0.05476186910904592, + "language_loss": 0.81797791, + "learning_rate": 0.0003523814352143222, + "loss": 0.82897604, + "num_input_tokens_seen": 263252768, + "router_z_loss_mlp": 0.38598633, + "step": 3158, + "time_per_iteration": 2.8239855766296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087443, + "balance_loss_mlp": 1.04953456, + "epoch": 0.6077337437475953, + "flos": 630523906560.0, + "grad_norm": 0.060962114442721135, + "language_loss": 0.90981984, + "learning_rate": 0.00035208380888607937, + "loss": 0.92069423, + "num_input_tokens_seen": 263328720, + "router_z_loss_mlp": 0.37866211, + "step": 3159, + "time_per_iteration": 2.754648208618164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068868, + "balance_loss_mlp": 1.05542111, + "epoch": 0.6079261254328588, + "flos": 1467726455808.0, + "grad_norm": 0.024644792756990472, + "language_loss": 0.79461986, + "learning_rate": 0.000351786239983995, + "loss": 0.80530852, + "num_input_tokens_seen": 263554656, + "router_z_loss_mlp": 0.13476562, + "step": 3160, + "time_per_iteration": 4.849771022796631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066394, + "balance_loss_mlp": 1.05323327, + "epoch": 0.6081185071181223, + "flos": 1522233022464.0, + "grad_norm": 0.022600356712689354, + "language_loss": 0.7569223, + "learning_rate": 0.00035148872862359517, + "loss": 0.76758623, + "num_input_tokens_seen": 263791600, + "router_z_loss_mlp": 0.13183594, + "step": 3161, + "time_per_iteration": 5.017123699188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081969, + "balance_loss_mlp": 1.04530025, + "epoch": 0.6083108888033859, + "flos": 556041867264.0, + "grad_norm": 0.07058889288065262, + "language_loss": 0.81635529, + "learning_rate": 0.00035119127492038446, + "loss": 0.82717502, + "num_input_tokens_seen": 263869744, + "router_z_loss_mlp": 0.3671875, + "step": 3162, + "time_per_iteration": 2.7839951515197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083576, + "balance_loss_mlp": 1.0463115, + "epoch": 0.6085032704886495, + "flos": 840819847680.0, + "grad_norm": 0.052086088834636966, + "language_loss": 0.82480276, + "learning_rate": 0.00035089387898984436, + "loss": 0.83563852, + "num_input_tokens_seen": 263946624, + "router_z_loss_mlp": 0.37207031, + "step": 3163, + "time_per_iteration": 3.0475828647613525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079298, + "balance_loss_mlp": 1.04267716, + "epoch": 0.6086956521739131, + "flos": 684493590528.0, + "grad_norm": 0.05636679470966986, + "language_loss": 0.81840444, + "learning_rate": 0.0003505965409474343, + "loss": 0.82919747, + "num_input_tokens_seen": 264022064, + "router_z_loss_mlp": 0.36621094, + "step": 3164, + "time_per_iteration": 2.8719167709350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081512, + "balance_loss_mlp": 1.04453373, + "epoch": 0.6088880338591766, + "flos": 535533894144.0, + "grad_norm": 0.05767475367988954, + "language_loss": 0.86591709, + "learning_rate": 0.0003502992609085913, + "loss": 0.87673223, + "num_input_tokens_seen": 264089520, + "router_z_loss_mlp": 0.36962891, + "step": 3165, + "time_per_iteration": 2.6596477031707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076933, + "balance_loss_mlp": 1.04007339, + "epoch": 0.6090804155444401, + "flos": 731197048320.0, + "grad_norm": 0.05479022562545965, + "language_loss": 0.82799208, + "learning_rate": 0.00035000203898872954, + "loss": 0.83876145, + "num_input_tokens_seen": 264173056, + "router_z_loss_mlp": 0.3684082, + "step": 3166, + "time_per_iteration": 2.985320568084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076201, + "balance_loss_mlp": 1.03845954, + "epoch": 0.6092727972297037, + "flos": 698708768256.0, + "grad_norm": 0.05187712745412687, + "language_loss": 0.84401566, + "learning_rate": 0.0003497048753032406, + "loss": 0.85477769, + "num_input_tokens_seen": 264250912, + "router_z_loss_mlp": 0.37695312, + "step": 3167, + "time_per_iteration": 2.876997470855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079403, + "balance_loss_mlp": 1.04213786, + "epoch": 0.6094651789149673, + "flos": 1051515869184.0, + "grad_norm": 0.16368682108793797, + "language_loss": 0.81000876, + "learning_rate": 0.000349407769967494, + "loss": 0.82080269, + "num_input_tokens_seen": 264342800, + "router_z_loss_mlp": 0.37255859, + "step": 3168, + "time_per_iteration": 3.376215696334839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074675, + "balance_loss_mlp": 1.03721976, + "epoch": 0.6096575606002309, + "flos": 502834618368.0, + "grad_norm": 0.047663268241493265, + "language_loss": 0.84680313, + "learning_rate": 0.0003491107230968361, + "loss": 0.85754991, + "num_input_tokens_seen": 264413664, + "router_z_loss_mlp": 0.37475586, + "step": 3169, + "time_per_iteration": 2.660513401031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076232, + "balance_loss_mlp": 1.03872895, + "epoch": 0.6098499422854944, + "flos": 585339374592.0, + "grad_norm": 0.13699074886281146, + "language_loss": 0.81564283, + "learning_rate": 0.00034881373480659085, + "loss": 0.82640517, + "num_input_tokens_seen": 264494944, + "router_z_loss_mlp": 0.37475586, + "step": 3170, + "time_per_iteration": 2.831681728363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081961, + "balance_loss_mlp": 1.04364741, + "epoch": 0.610042323970758, + "flos": 468968444928.0, + "grad_norm": 0.06190459758057804, + "language_loss": 0.77871358, + "learning_rate": 0.0003485168052120594, + "loss": 0.78953326, + "num_input_tokens_seen": 264561664, + "router_z_loss_mlp": 0.3828125, + "step": 3171, + "time_per_iteration": 2.5600767135620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081834, + "balance_loss_mlp": 1.04387796, + "epoch": 0.6102347056560216, + "flos": 513923042304.0, + "grad_norm": 0.0838552496522472, + "language_loss": 0.80047345, + "learning_rate": 0.00034821993442851973, + "loss": 0.81129181, + "num_input_tokens_seen": 264626256, + "router_z_loss_mlp": 0.37890625, + "step": 3172, + "time_per_iteration": 2.564009666442871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082985, + "balance_loss_mlp": 1.0452435, + "epoch": 0.6104270873412851, + "flos": 468776388096.0, + "grad_norm": 0.05938555160639068, + "language_loss": 0.82216555, + "learning_rate": 0.00034792312257122735, + "loss": 0.83299541, + "num_input_tokens_seen": 264692768, + "router_z_loss_mlp": 0.37719727, + "step": 3173, + "time_per_iteration": 2.6151862144470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078889, + "balance_loss_mlp": 1.04012203, + "epoch": 0.6106194690265486, + "flos": 549610859520.0, + "grad_norm": 0.05423157525738513, + "language_loss": 0.80451965, + "learning_rate": 0.00034762636975541506, + "loss": 0.81530857, + "num_input_tokens_seen": 264764816, + "router_z_loss_mlp": 0.38720703, + "step": 3174, + "time_per_iteration": 2.627699375152588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107833, + "balance_loss_mlp": 1.03965902, + "epoch": 0.6108118507118122, + "flos": 472602968064.0, + "grad_norm": 0.06986619017952604, + "language_loss": 0.80950004, + "learning_rate": 0.0003473296760962923, + "loss": 0.82028335, + "num_input_tokens_seen": 264837968, + "router_z_loss_mlp": 0.38647461, + "step": 3175, + "time_per_iteration": 2.6790359020233154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073231, + "balance_loss_mlp": 1.06111896, + "epoch": 0.6110042323970758, + "flos": 1444416205824.0, + "grad_norm": 0.03162499472670903, + "language_loss": 0.78533739, + "learning_rate": 0.00034703304170904617, + "loss": 0.79606968, + "num_input_tokens_seen": 265058336, + "router_z_loss_mlp": 0.12109375, + "step": 3176, + "time_per_iteration": 4.660337924957275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078339, + "balance_loss_mlp": 1.03966713, + "epoch": 0.6111966140823394, + "flos": 793807879680.0, + "grad_norm": 0.05300706067189078, + "language_loss": 0.8120122, + "learning_rate": 0.00034673646670883976, + "loss": 0.82279563, + "num_input_tokens_seen": 265135920, + "router_z_loss_mlp": 0.38623047, + "step": 3177, + "time_per_iteration": 2.9990971088409424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046334, + "balance_loss_mlp": 1.03431749, + "epoch": 0.611388995767603, + "flos": 1556800432128.0, + "grad_norm": 0.020411675518342276, + "language_loss": 0.75715023, + "learning_rate": 0.0003464399512108141, + "loss": 0.76761359, + "num_input_tokens_seen": 265374464, + "router_z_loss_mlp": 0.12011719, + "step": 3178, + "time_per_iteration": 5.060986280441284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078249, + "balance_loss_mlp": 1.03948236, + "epoch": 0.6115813774528664, + "flos": 711841416192.0, + "grad_norm": 0.052313854365800355, + "language_loss": 0.81582487, + "learning_rate": 0.0003461434953300865, + "loss": 0.82660735, + "num_input_tokens_seen": 265450112, + "router_z_loss_mlp": 0.38745117, + "step": 3179, + "time_per_iteration": 2.8902480602264404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073999, + "balance_loss_mlp": 1.03535175, + "epoch": 0.61177375913813, + "flos": 683963910144.0, + "grad_norm": 0.0432149263415984, + "language_loss": 0.81232655, + "learning_rate": 0.0003458470991817515, + "loss": 0.82306653, + "num_input_tokens_seen": 265534336, + "router_z_loss_mlp": 0.38598633, + "step": 3180, + "time_per_iteration": 2.9921305179595947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078708, + "balance_loss_mlp": 1.04068065, + "epoch": 0.6119661408233936, + "flos": 511411746816.0, + "grad_norm": 0.056171077714967085, + "language_loss": 0.84767073, + "learning_rate": 0.0003455507628808802, + "loss": 0.8584578, + "num_input_tokens_seen": 265604480, + "router_z_loss_mlp": 0.38012695, + "step": 3181, + "time_per_iteration": 2.5818896293640137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073399, + "balance_loss_mlp": 1.03527629, + "epoch": 0.6121585225086572, + "flos": 556548226560.0, + "grad_norm": 0.057403680596608046, + "language_loss": 0.8451159, + "learning_rate": 0.00034525448654252076, + "loss": 0.85584986, + "num_input_tokens_seen": 265670848, + "router_z_loss_mlp": 0.38085938, + "step": 3182, + "time_per_iteration": 2.6865382194519043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072137, + "balance_loss_mlp": 1.03384721, + "epoch": 0.6123509041939207, + "flos": 561585374208.0, + "grad_norm": 0.07466059986871497, + "language_loss": 0.82914555, + "learning_rate": 0.0003449582702816976, + "loss": 0.83986694, + "num_input_tokens_seen": 265739584, + "router_z_loss_mlp": 0.3828125, + "step": 3183, + "time_per_iteration": 2.6590259075164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079784, + "balance_loss_mlp": 1.0416131, + "epoch": 0.6125432858791843, + "flos": 557789317632.0, + "grad_norm": 0.05504997733679025, + "language_loss": 0.82930607, + "learning_rate": 0.0003446621142134122, + "loss": 0.84010386, + "num_input_tokens_seen": 265810368, + "router_z_loss_mlp": 0.3815918, + "step": 3184, + "time_per_iteration": 2.7104709148406982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075053, + "balance_loss_mlp": 1.03776431, + "epoch": 0.6127356675644479, + "flos": 414796529664.0, + "grad_norm": 0.05785245107541848, + "language_loss": 0.84189403, + "learning_rate": 0.0003443660184526424, + "loss": 0.85264462, + "num_input_tokens_seen": 265871616, + "router_z_loss_mlp": 0.37255859, + "step": 3185, + "time_per_iteration": 2.441305160522461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078818, + "balance_loss_mlp": 1.04048026, + "epoch": 0.6129280492497114, + "flos": 603547047936.0, + "grad_norm": 0.04628969176382701, + "language_loss": 0.86441582, + "learning_rate": 0.0003440699831143429, + "loss": 0.87520397, + "num_input_tokens_seen": 265946672, + "router_z_loss_mlp": 0.38305664, + "step": 3186, + "time_per_iteration": 2.81016206741333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081465, + "balance_loss_mlp": 1.04474831, + "epoch": 0.613120430934975, + "flos": 519492690432.0, + "grad_norm": 0.05115957600907009, + "language_loss": 0.82288289, + "learning_rate": 0.0003437740083134449, + "loss": 0.83369744, + "num_input_tokens_seen": 266020640, + "router_z_loss_mlp": 0.3671875, + "step": 3187, + "time_per_iteration": 2.695181369781494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075008, + "balance_loss_mlp": 1.03798163, + "epoch": 0.6133128126202385, + "flos": 510835576320.0, + "grad_norm": 0.06733229983475184, + "language_loss": 0.83452654, + "learning_rate": 0.00034347809416485574, + "loss": 0.84527659, + "num_input_tokens_seen": 266085776, + "router_z_loss_mlp": 0.37011719, + "step": 3188, + "time_per_iteration": 2.5900075435638428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081504, + "balance_loss_mlp": 1.04402518, + "epoch": 0.6135051943055021, + "flos": 607264528896.0, + "grad_norm": 0.053668382142468496, + "language_loss": 0.81688702, + "learning_rate": 0.0003431822407834597, + "loss": 0.82770205, + "num_input_tokens_seen": 266157104, + "router_z_loss_mlp": 0.37475586, + "step": 3189, + "time_per_iteration": 2.8129723072052 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107806, + "balance_loss_mlp": 1.04062855, + "epoch": 0.6136975759907657, + "flos": 1159750600704.0, + "grad_norm": 0.0555928311696248, + "language_loss": 0.84534049, + "learning_rate": 0.00034288644828411706, + "loss": 0.85612106, + "num_input_tokens_seen": 266244144, + "router_z_loss_mlp": 0.37426758, + "step": 3190, + "time_per_iteration": 3.4628307819366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076796, + "balance_loss_mlp": 1.03931642, + "epoch": 0.6138899576760293, + "flos": 706631150592.0, + "grad_norm": 0.05334960591036923, + "language_loss": 0.75148171, + "learning_rate": 0.0003425907167816649, + "loss": 0.76224971, + "num_input_tokens_seen": 266319040, + "router_z_loss_mlp": 0.37475586, + "step": 3191, + "time_per_iteration": 2.867506265640259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072461, + "balance_loss_mlp": 1.03510118, + "epoch": 0.6140823393612928, + "flos": 586151271936.0, + "grad_norm": 0.05066562210294406, + "language_loss": 0.84692401, + "learning_rate": 0.00034229504639091623, + "loss": 0.85764861, + "num_input_tokens_seen": 266390784, + "router_z_loss_mlp": 0.37329102, + "step": 3192, + "time_per_iteration": 2.757969617843628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075038, + "balance_loss_mlp": 1.03722489, + "epoch": 0.6142747210465563, + "flos": 803759929344.0, + "grad_norm": 0.052233657686543596, + "language_loss": 0.79899156, + "learning_rate": 0.0003419994372266606, + "loss": 0.80974191, + "num_input_tokens_seen": 266483216, + "router_z_loss_mlp": 0.37792969, + "step": 3193, + "time_per_iteration": 3.113477945327759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074279, + "balance_loss_mlp": 1.03651392, + "epoch": 0.6144671027318199, + "flos": 529158140928.0, + "grad_norm": 0.04106506245407052, + "language_loss": 0.81734288, + "learning_rate": 0.00034170388940366335, + "loss": 0.82808566, + "num_input_tokens_seen": 266557344, + "router_z_loss_mlp": 0.37744141, + "step": 3194, + "time_per_iteration": 2.6896331310272217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078888, + "balance_loss_mlp": 1.04190898, + "epoch": 0.6146594844170835, + "flos": 805054864896.0, + "grad_norm": 0.05108636633203802, + "language_loss": 0.80077958, + "learning_rate": 0.0003414084030366667, + "loss": 0.8115685, + "num_input_tokens_seen": 266639488, + "router_z_loss_mlp": 0.36987305, + "step": 3195, + "time_per_iteration": 3.083922863006592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078724, + "balance_loss_mlp": 1.04134059, + "epoch": 0.6148518661023471, + "flos": 501431993856.0, + "grad_norm": 0.05057450968707768, + "language_loss": 0.82827139, + "learning_rate": 0.0003411129782403883, + "loss": 0.83905864, + "num_input_tokens_seen": 266711168, + "router_z_loss_mlp": 0.3737793, + "step": 3196, + "time_per_iteration": 2.641129970550537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107445, + "balance_loss_mlp": 1.03720951, + "epoch": 0.6150442477876106, + "flos": 510436905984.0, + "grad_norm": 0.062166834979967195, + "language_loss": 0.84822834, + "learning_rate": 0.0003408176151295225, + "loss": 0.85897291, + "num_input_tokens_seen": 266777632, + "router_z_loss_mlp": 0.37207031, + "step": 3197, + "time_per_iteration": 2.5532026290893555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071235, + "balance_loss_mlp": 1.03425658, + "epoch": 0.6152366294728742, + "flos": 526758916608.0, + "grad_norm": 0.06002763695561428, + "language_loss": 0.770096, + "learning_rate": 0.00034052231381873944, + "loss": 0.78080833, + "num_input_tokens_seen": 266842880, + "router_z_loss_mlp": 0.36962891, + "step": 3198, + "time_per_iteration": 2.6175601482391357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107449, + "balance_loss_mlp": 1.03746367, + "epoch": 0.6154290111581378, + "flos": 473055482880.0, + "grad_norm": 0.053906213257321506, + "language_loss": 0.85027397, + "learning_rate": 0.00034022707442268494, + "loss": 0.86101884, + "num_input_tokens_seen": 266909504, + "router_z_loss_mlp": 0.37060547, + "step": 3199, + "time_per_iteration": 2.5418269634246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075667, + "balance_loss_mlp": 1.03985643, + "epoch": 0.6156213928434013, + "flos": 550542030336.0, + "grad_norm": 0.04138117039405258, + "language_loss": 0.81766355, + "learning_rate": 0.0003399318970559813, + "loss": 0.82842016, + "num_input_tokens_seen": 266988880, + "router_z_loss_mlp": 0.35864258, + "step": 3200, + "time_per_iteration": 2.8180348873138428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074649, + "balance_loss_mlp": 1.03795648, + "epoch": 0.6158137745286649, + "flos": 750587586048.0, + "grad_norm": 0.04925803113162635, + "language_loss": 0.84793299, + "learning_rate": 0.00033963678183322656, + "loss": 0.85867941, + "num_input_tokens_seen": 267074512, + "router_z_loss_mlp": 0.36694336, + "step": 3201, + "time_per_iteration": 3.032935857772827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107717, + "balance_loss_mlp": 1.04035842, + "epoch": 0.6160061562139284, + "flos": 555544272384.0, + "grad_norm": 0.0447157472200271, + "language_loss": 0.82589877, + "learning_rate": 0.0003393417288689945, + "loss": 0.8366704, + "num_input_tokens_seen": 267147952, + "router_z_loss_mlp": 0.36816406, + "step": 3202, + "time_per_iteration": 2.675895929336548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076788, + "balance_loss_mlp": 1.03976154, + "epoch": 0.616198537899192, + "flos": 741856278528.0, + "grad_norm": 0.0597641092397592, + "language_loss": 0.75911278, + "learning_rate": 0.00033904673827783504, + "loss": 0.76988065, + "num_input_tokens_seen": 267224368, + "router_z_loss_mlp": 0.37060547, + "step": 3203, + "time_per_iteration": 2.930006265640259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078121, + "balance_loss_mlp": 1.04111826, + "epoch": 0.6163909195844556, + "flos": 478569876480.0, + "grad_norm": 0.09425885378712065, + "language_loss": 0.8152014, + "learning_rate": 0.00033875181017427357, + "loss": 0.82598263, + "num_input_tokens_seen": 267292688, + "router_z_loss_mlp": 0.36962891, + "step": 3204, + "time_per_iteration": 2.624331474304199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071675, + "balance_loss_mlp": 1.03524435, + "epoch": 0.6165833012697192, + "flos": 531231478272.0, + "grad_norm": 0.05217722063945812, + "language_loss": 0.80865437, + "learning_rate": 0.00033845694467281133, + "loss": 0.8193711, + "num_input_tokens_seen": 267371888, + "router_z_loss_mlp": 0.36450195, + "step": 3205, + "time_per_iteration": 2.8210368156433105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107918, + "balance_loss_mlp": 1.0422256, + "epoch": 0.6167756829549826, + "flos": 807384278016.0, + "grad_norm": 0.04964273497495854, + "language_loss": 0.83231258, + "learning_rate": 0.00033816214188792516, + "loss": 0.84310448, + "num_input_tokens_seen": 267458784, + "router_z_loss_mlp": 0.36938477, + "step": 3206, + "time_per_iteration": 3.148005485534668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074637, + "balance_loss_mlp": 1.03782535, + "epoch": 0.6169680646402462, + "flos": 488683459584.0, + "grad_norm": 0.053298610353503126, + "language_loss": 0.85231054, + "learning_rate": 0.00033786740193406784, + "loss": 0.8630569, + "num_input_tokens_seen": 267528528, + "router_z_loss_mlp": 0.36791992, + "step": 3207, + "time_per_iteration": 2.576956272125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083512, + "balance_loss_mlp": 1.04693818, + "epoch": 0.6171604463255098, + "flos": 618643934208.0, + "grad_norm": 0.05970709396928862, + "language_loss": 0.81620336, + "learning_rate": 0.00033757272492566736, + "loss": 0.82703847, + "num_input_tokens_seen": 267611152, + "router_z_loss_mlp": 0.3659668, + "step": 3208, + "time_per_iteration": 2.8902554512023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077083, + "balance_loss_mlp": 1.04070079, + "epoch": 0.6173528280107734, + "flos": 528600909312.0, + "grad_norm": 0.043205070358092235, + "language_loss": 0.87206829, + "learning_rate": 0.0003372781109771278, + "loss": 0.88283914, + "num_input_tokens_seen": 267681520, + "router_z_loss_mlp": 0.36401367, + "step": 3209, + "time_per_iteration": 2.688534736633301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077515, + "balance_loss_mlp": 1.04036927, + "epoch": 0.617545209696037, + "flos": 596293968384.0, + "grad_norm": 0.05036658648833462, + "language_loss": 0.76489538, + "learning_rate": 0.0003369835602028281, + "loss": 0.77567053, + "num_input_tokens_seen": 267758768, + "router_z_loss_mlp": 0.37158203, + "step": 3210, + "time_per_iteration": 2.7890372276306152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073752, + "balance_loss_mlp": 1.03763127, + "epoch": 0.6177375913813005, + "flos": 474848013312.0, + "grad_norm": 0.06457582449248328, + "language_loss": 0.7954967, + "learning_rate": 0.0003366890727171232, + "loss": 0.80623418, + "num_input_tokens_seen": 267831056, + "router_z_loss_mlp": 0.36132812, + "step": 3211, + "time_per_iteration": 2.6358649730682373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076437, + "balance_loss_mlp": 1.03983986, + "epoch": 0.617929973066564, + "flos": 529546636800.0, + "grad_norm": 0.051638543668130914, + "language_loss": 0.78236932, + "learning_rate": 0.00033639464863434313, + "loss": 0.79313374, + "num_input_tokens_seen": 267898416, + "router_z_loss_mlp": 0.36621094, + "step": 3212, + "time_per_iteration": 2.6086573600769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104581, + "balance_loss_mlp": 1.03403246, + "epoch": 0.6181223547518276, + "flos": 1419361477632.0, + "grad_norm": 0.031029800070293646, + "language_loss": 0.78442466, + "learning_rate": 0.00033610028806879363, + "loss": 0.79488277, + "num_input_tokens_seen": 268112864, + "router_z_loss_mlp": 0.11767578, + "step": 3213, + "time_per_iteration": 4.67001748085022 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082687, + "balance_loss_mlp": 1.04608989, + "epoch": 0.6183147364370912, + "flos": 739976408064.0, + "grad_norm": 0.057199257803381136, + "language_loss": 0.79338527, + "learning_rate": 0.00033580599113475543, + "loss": 0.80421209, + "num_input_tokens_seen": 268198368, + "router_z_loss_mlp": 0.36572266, + "step": 3214, + "time_per_iteration": 2.9583098888397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084038, + "balance_loss_mlp": 1.04791784, + "epoch": 0.6185071181223547, + "flos": 381442507776.0, + "grad_norm": 0.04917291397631135, + "language_loss": 0.85787857, + "learning_rate": 0.00033551175794648507, + "loss": 0.86871898, + "num_input_tokens_seen": 268260704, + "router_z_loss_mlp": 0.36108398, + "step": 3215, + "time_per_iteration": 2.450173854827881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079107, + "balance_loss_mlp": 1.04191399, + "epoch": 0.6186994998076183, + "flos": 463109225472.0, + "grad_norm": 0.05232146419695497, + "language_loss": 0.8178426, + "learning_rate": 0.00033521758861821365, + "loss": 0.82863367, + "num_input_tokens_seen": 268328256, + "router_z_loss_mlp": 0.37158203, + "step": 3216, + "time_per_iteration": 2.566434144973755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107802, + "balance_loss_mlp": 1.04132736, + "epoch": 0.6188918814928819, + "flos": 485029997568.0, + "grad_norm": 0.044556879100730015, + "language_loss": 0.88947988, + "learning_rate": 0.0003349234832641479, + "loss": 0.90026009, + "num_input_tokens_seen": 268394016, + "router_z_loss_mlp": 0.36669922, + "step": 3217, + "time_per_iteration": 2.5626957416534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087401, + "balance_loss_mlp": 1.05027926, + "epoch": 0.6190842631781455, + "flos": 656985641472.0, + "grad_norm": 0.056610001609600974, + "language_loss": 0.81178546, + "learning_rate": 0.00033462944199846975, + "loss": 0.82265949, + "num_input_tokens_seen": 268478512, + "router_z_loss_mlp": 0.37109375, + "step": 3218, + "time_per_iteration": 3.038856267929077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091964, + "balance_loss_mlp": 1.054842, + "epoch": 0.619276644863409, + "flos": 403388011008.0, + "grad_norm": 0.051099399179052, + "language_loss": 0.86047733, + "learning_rate": 0.00033433546493533606, + "loss": 0.87139696, + "num_input_tokens_seen": 268540304, + "router_z_loss_mlp": 0.37109375, + "step": 3219, + "time_per_iteration": 2.4660589694976807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086562, + "balance_loss_mlp": 1.04913092, + "epoch": 0.6194690265486725, + "flos": 582807730176.0, + "grad_norm": 0.07929462737326079, + "language_loss": 0.84635407, + "learning_rate": 0.00033404155218887897, + "loss": 0.8572197, + "num_input_tokens_seen": 268611136, + "router_z_loss_mlp": 0.37402344, + "step": 3220, + "time_per_iteration": 2.7270491123199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087776, + "balance_loss_mlp": 1.05127466, + "epoch": 0.6196614082339361, + "flos": 503963638272.0, + "grad_norm": 0.04746710197063832, + "language_loss": 0.87405616, + "learning_rate": 0.00033374770387320534, + "loss": 0.88493389, + "num_input_tokens_seen": 268684992, + "router_z_loss_mlp": 0.36499023, + "step": 3221, + "time_per_iteration": 2.7464041709899902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086025, + "balance_loss_mlp": 1.04957032, + "epoch": 0.6198537899191997, + "flos": 575131249152.0, + "grad_norm": 0.04828799044899351, + "language_loss": 0.84905434, + "learning_rate": 0.00033345392010239737, + "loss": 0.85991454, + "num_input_tokens_seen": 268758096, + "router_z_loss_mlp": 0.36425781, + "step": 3222, + "time_per_iteration": 2.7124643325805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090712, + "balance_loss_mlp": 1.05432916, + "epoch": 0.6200461716044633, + "flos": 592871851008.0, + "grad_norm": 0.05455186914626242, + "language_loss": 0.8191222, + "learning_rate": 0.0003331602009905118, + "loss": 0.83002931, + "num_input_tokens_seen": 268834432, + "router_z_loss_mlp": 0.36376953, + "step": 3223, + "time_per_iteration": 2.7330005168914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084321, + "balance_loss_mlp": 1.04696107, + "epoch": 0.6202385532897268, + "flos": 665765001216.0, + "grad_norm": 0.046947333266423794, + "language_loss": 0.83694303, + "learning_rate": 0.00033286654665158085, + "loss": 0.84778625, + "num_input_tokens_seen": 268921168, + "router_z_loss_mlp": 0.37329102, + "step": 3224, + "time_per_iteration": 2.937727689743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087568, + "balance_loss_mlp": 1.0515908, + "epoch": 0.6204309349749904, + "flos": 484709902848.0, + "grad_norm": 0.0575064293586871, + "language_loss": 0.87672997, + "learning_rate": 0.0003325729571996109, + "loss": 0.88760567, + "num_input_tokens_seen": 268991440, + "router_z_loss_mlp": 0.36010742, + "step": 3225, + "time_per_iteration": 2.6319355964660645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085695, + "balance_loss_mlp": 1.04919314, + "epoch": 0.6206233166602539, + "flos": 583768014336.0, + "grad_norm": 0.048737024704114895, + "language_loss": 0.83402115, + "learning_rate": 0.000332279432748584, + "loss": 0.84487808, + "num_input_tokens_seen": 269061024, + "router_z_loss_mlp": 0.36523438, + "step": 3226, + "time_per_iteration": 2.733870029449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010819, + "balance_loss_mlp": 1.04656696, + "epoch": 0.6208156983455175, + "flos": 476669657088.0, + "grad_norm": 0.0460557240454385, + "language_loss": 0.87514353, + "learning_rate": 0.00033198597341245576, + "loss": 0.88596255, + "num_input_tokens_seen": 269130560, + "router_z_loss_mlp": 0.35375977, + "step": 3227, + "time_per_iteration": 2.567084789276123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081012, + "balance_loss_mlp": 1.04420066, + "epoch": 0.6210080800307811, + "flos": 788716887552.0, + "grad_norm": 0.07539791999679457, + "language_loss": 0.81657004, + "learning_rate": 0.00033169257930515763, + "loss": 0.82738018, + "num_input_tokens_seen": 269213280, + "router_z_loss_mlp": 0.36816406, + "step": 3228, + "time_per_iteration": 3.074739694595337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084245, + "balance_loss_mlp": 1.04655147, + "epoch": 0.6212004617160446, + "flos": 607514812416.0, + "grad_norm": 0.05269169473042375, + "language_loss": 0.82430172, + "learning_rate": 0.0003313992505405951, + "loss": 0.83514416, + "num_input_tokens_seen": 269286384, + "router_z_loss_mlp": 0.37695312, + "step": 3229, + "time_per_iteration": 2.711282730102539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079533, + "balance_loss_mlp": 1.04305458, + "epoch": 0.6213928434013082, + "flos": 586248786432.0, + "grad_norm": 0.05753494770574613, + "language_loss": 0.8075214, + "learning_rate": 0.0003311059872326487, + "loss": 0.81831676, + "num_input_tokens_seen": 269353296, + "router_z_loss_mlp": 0.36474609, + "step": 3230, + "time_per_iteration": 2.6755940914154053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082159, + "balance_loss_mlp": 1.04467952, + "epoch": 0.6215852250865718, + "flos": 535819083264.0, + "grad_norm": 0.04907016045640681, + "language_loss": 0.79111725, + "learning_rate": 0.0003308127894951734, + "loss": 0.80193883, + "num_input_tokens_seen": 269422304, + "router_z_loss_mlp": 0.37426758, + "step": 3231, + "time_per_iteration": 2.612122058868408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086128, + "balance_loss_mlp": 1.04893494, + "epoch": 0.6217776067718354, + "flos": 617884471296.0, + "grad_norm": 0.0640423801123885, + "language_loss": 0.86435384, + "learning_rate": 0.00033051965744199834, + "loss": 0.87521511, + "num_input_tokens_seen": 269498784, + "router_z_loss_mlp": 0.37133789, + "step": 3232, + "time_per_iteration": 2.734384059906006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107468, + "balance_loss_mlp": 1.03913224, + "epoch": 0.6219699884570988, + "flos": 545570311680.0, + "grad_norm": 0.045255868700115984, + "language_loss": 0.90312266, + "learning_rate": 0.0003302265911869276, + "loss": 0.91386944, + "num_input_tokens_seen": 269581264, + "router_z_loss_mlp": 0.35571289, + "step": 3233, + "time_per_iteration": 2.9088501930236816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079236, + "balance_loss_mlp": 1.04216146, + "epoch": 0.6221623701423624, + "flos": 480899289600.0, + "grad_norm": 0.054924545254622516, + "language_loss": 0.83717418, + "learning_rate": 0.0003299335908437397, + "loss": 0.84796649, + "num_input_tokens_seen": 269649408, + "router_z_loss_mlp": 0.37060547, + "step": 3234, + "time_per_iteration": 2.5804450511932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077942, + "balance_loss_mlp": 1.04062915, + "epoch": 0.622354751827626, + "flos": 379812920832.0, + "grad_norm": 0.0810547632839198, + "language_loss": 0.80174738, + "learning_rate": 0.0003296406565261873, + "loss": 0.81252682, + "num_input_tokens_seen": 269711648, + "router_z_loss_mlp": 0.37304688, + "step": 3235, + "time_per_iteration": 2.480074405670166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072254, + "balance_loss_mlp": 1.03610981, + "epoch": 0.6225471335128896, + "flos": 667570678272.0, + "grad_norm": 0.04590561718028109, + "language_loss": 0.84757555, + "learning_rate": 0.0003293477883479978, + "loss": 0.85829806, + "num_input_tokens_seen": 269787376, + "router_z_loss_mlp": 0.36181641, + "step": 3236, + "time_per_iteration": 2.8077552318573 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107691, + "balance_loss_mlp": 1.03909636, + "epoch": 0.6227395151981532, + "flos": 770995224576.0, + "grad_norm": 0.06134325459280444, + "language_loss": 0.79419619, + "learning_rate": 0.0003290549864228727, + "loss": 0.80496532, + "num_input_tokens_seen": 269863008, + "router_z_loss_mlp": 0.37768555, + "step": 3237, + "time_per_iteration": 2.9485511779785156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078391, + "balance_loss_mlp": 1.04084027, + "epoch": 0.6229318968834167, + "flos": 484104619008.0, + "grad_norm": 0.04787340801425507, + "language_loss": 0.86647016, + "learning_rate": 0.0003287622508644875, + "loss": 0.87725413, + "num_input_tokens_seen": 269939552, + "router_z_loss_mlp": 0.37548828, + "step": 3238, + "time_per_iteration": 2.723003387451172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072928, + "balance_loss_mlp": 1.0360688, + "epoch": 0.6231242785686802, + "flos": 462700380672.0, + "grad_norm": 0.08533340323107003, + "language_loss": 0.86471462, + "learning_rate": 0.0003284695817864923, + "loss": 0.87544394, + "num_input_tokens_seen": 270002752, + "router_z_loss_mlp": 0.36865234, + "step": 3239, + "time_per_iteration": 2.4788854122161865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079076, + "balance_loss_mlp": 1.04231155, + "epoch": 0.6233166602539438, + "flos": 608809747968.0, + "grad_norm": 0.06356990340371446, + "language_loss": 0.83732104, + "learning_rate": 0.0003281769793025116, + "loss": 0.84811181, + "num_input_tokens_seen": 270075696, + "router_z_loss_mlp": 0.36791992, + "step": 3240, + "time_per_iteration": 2.68833065032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071809, + "balance_loss_mlp": 1.03542674, + "epoch": 0.6235090419392074, + "flos": 438972521472.0, + "grad_norm": 0.05773237210342904, + "language_loss": 0.89384484, + "learning_rate": 0.00032788444352614346, + "loss": 0.90456295, + "num_input_tokens_seen": 270139872, + "router_z_loss_mlp": 0.36425781, + "step": 3241, + "time_per_iteration": 2.485630512237549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073948, + "balance_loss_mlp": 1.03706515, + "epoch": 0.6237014236244709, + "flos": 504656262144.0, + "grad_norm": 0.05916154923857777, + "language_loss": 0.80431205, + "learning_rate": 0.0003275919745709606, + "loss": 0.81505156, + "num_input_tokens_seen": 270206752, + "router_z_loss_mlp": 0.36889648, + "step": 3242, + "time_per_iteration": 2.557732582092285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073489, + "balance_loss_mlp": 1.03710628, + "epoch": 0.6238938053097345, + "flos": 512648455680.0, + "grad_norm": 0.047494752086082274, + "language_loss": 0.82139623, + "learning_rate": 0.00032729957255050936, + "loss": 0.83213103, + "num_input_tokens_seen": 270275472, + "router_z_loss_mlp": 0.36376953, + "step": 3243, + "time_per_iteration": 2.653381586074829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075548, + "balance_loss_mlp": 1.03799748, + "epoch": 0.6240861869949981, + "flos": 736435017216.0, + "grad_norm": 0.07878714918390893, + "language_loss": 0.81488502, + "learning_rate": 0.0003270072375783102, + "loss": 0.8256405, + "num_input_tokens_seen": 270348336, + "router_z_loss_mlp": 0.37524414, + "step": 3244, + "time_per_iteration": 2.893857717514038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068875, + "balance_loss_mlp": 1.03244424, + "epoch": 0.6242785686802617, + "flos": 494464103424.0, + "grad_norm": 0.05659954005953207, + "language_loss": 0.79646188, + "learning_rate": 0.00032671496976785774, + "loss": 0.8071506, + "num_input_tokens_seen": 270416496, + "router_z_loss_mlp": 0.36425781, + "step": 3245, + "time_per_iteration": 2.5836338996887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072633, + "balance_loss_mlp": 1.03536868, + "epoch": 0.6244709503655252, + "flos": 745500976128.0, + "grad_norm": 0.04683044918703509, + "language_loss": 0.75894988, + "learning_rate": 0.0003264227692326205, + "loss": 0.76967621, + "num_input_tokens_seen": 270501680, + "router_z_loss_mlp": 0.37231445, + "step": 3246, + "time_per_iteration": 3.0129404067993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071862, + "balance_loss_mlp": 1.03524101, + "epoch": 0.6246633320507887, + "flos": 492366034944.0, + "grad_norm": 0.053825278075075034, + "language_loss": 0.85644072, + "learning_rate": 0.00032613063608604055, + "loss": 0.86715937, + "num_input_tokens_seen": 270568656, + "router_z_loss_mlp": 0.36645508, + "step": 3247, + "time_per_iteration": 2.5503756999969482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078462, + "balance_loss_mlp": 1.0416261, + "epoch": 0.6248557137360523, + "flos": 517142928384.0, + "grad_norm": 0.04781520773103446, + "language_loss": 0.8331461, + "learning_rate": 0.0003258385704415343, + "loss": 0.84393072, + "num_input_tokens_seen": 270636160, + "router_z_loss_mlp": 0.36816406, + "step": 3248, + "time_per_iteration": 2.560483455657959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079309, + "balance_loss_mlp": 1.04161501, + "epoch": 0.6250480954213159, + "flos": 519098402304.0, + "grad_norm": 0.04627181605828338, + "language_loss": 0.83052945, + "learning_rate": 0.0003255465724124915, + "loss": 0.84132254, + "num_input_tokens_seen": 270708816, + "router_z_loss_mlp": 0.37670898, + "step": 3249, + "time_per_iteration": 2.7024102210998535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075461, + "balance_loss_mlp": 1.03776741, + "epoch": 0.6252404771065795, + "flos": 515808705024.0, + "grad_norm": 0.04699281003283387, + "language_loss": 0.82845968, + "learning_rate": 0.00032525464211227587, + "loss": 0.83921427, + "num_input_tokens_seen": 270778016, + "router_z_loss_mlp": 0.37646484, + "step": 3250, + "time_per_iteration": 2.5934925079345703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073962, + "balance_loss_mlp": 1.03712666, + "epoch": 0.6254328587918431, + "flos": 576647354880.0, + "grad_norm": 0.05335085924079445, + "language_loss": 0.85498369, + "learning_rate": 0.0003249627796542249, + "loss": 0.86572331, + "num_input_tokens_seen": 270847072, + "router_z_loss_mlp": 0.36816406, + "step": 3251, + "time_per_iteration": 2.6473746299743652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107209, + "balance_loss_mlp": 1.03472972, + "epoch": 0.6256252404771065, + "flos": 597638366208.0, + "grad_norm": 0.05949551618026705, + "language_loss": 0.83974731, + "learning_rate": 0.00032467098515164943, + "loss": 0.85046822, + "num_input_tokens_seen": 270926320, + "router_z_loss_mlp": 0.37353516, + "step": 3252, + "time_per_iteration": 2.8618545532226562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074504, + "balance_loss_mlp": 1.03776419, + "epoch": 0.6258176221623701, + "flos": 508034709504.0, + "grad_norm": 0.05339688957223288, + "language_loss": 0.83978283, + "learning_rate": 0.00032437925871783456, + "loss": 0.85052788, + "num_input_tokens_seen": 270997904, + "router_z_loss_mlp": 0.36767578, + "step": 3253, + "time_per_iteration": 2.6301941871643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074357, + "balance_loss_mlp": 1.03680658, + "epoch": 0.6260100038476337, + "flos": 639357110784.0, + "grad_norm": 0.06013661875979651, + "language_loss": 0.84100354, + "learning_rate": 0.00032408760046603803, + "loss": 0.85174716, + "num_input_tokens_seen": 271074256, + "router_z_loss_mlp": 0.37548828, + "step": 3254, + "time_per_iteration": 2.798520565032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076909, + "balance_loss_mlp": 1.03923869, + "epoch": 0.6262023855328973, + "flos": 840648139776.0, + "grad_norm": 0.05406777705406554, + "language_loss": 0.77436024, + "learning_rate": 0.00032379601050949193, + "loss": 0.78512931, + "num_input_tokens_seen": 271155152, + "router_z_loss_mlp": 0.3762207, + "step": 3255, + "time_per_iteration": 3.0876083374023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075377, + "balance_loss_mlp": 1.03746879, + "epoch": 0.6263947672181608, + "flos": 521884712448.0, + "grad_norm": 0.05001529336146337, + "language_loss": 0.8825866, + "learning_rate": 0.0003235044889614013, + "loss": 0.89334035, + "num_input_tokens_seen": 271224784, + "router_z_loss_mlp": 0.37866211, + "step": 3256, + "time_per_iteration": 2.616588592529297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079768, + "balance_loss_mlp": 1.04221702, + "epoch": 0.6265871489034244, + "flos": 606747995136.0, + "grad_norm": 0.049239400336598835, + "language_loss": 0.83356363, + "learning_rate": 0.0003232130359349451, + "loss": 0.84436131, + "num_input_tokens_seen": 271303584, + "router_z_loss_mlp": 0.37524414, + "step": 3257, + "time_per_iteration": 2.8224074840545654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083632, + "balance_loss_mlp": 1.04474616, + "epoch": 0.626779530588688, + "flos": 588208642560.0, + "grad_norm": 0.04846319258982293, + "language_loss": 0.81674659, + "learning_rate": 0.0003229216515432751, + "loss": 0.8275829, + "num_input_tokens_seen": 271379632, + "router_z_loss_mlp": 0.38842773, + "step": 3258, + "time_per_iteration": 2.78884220123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081861, + "balance_loss_mlp": 1.0438329, + "epoch": 0.6269719122739515, + "flos": 438381794304.0, + "grad_norm": 0.061777321686694836, + "language_loss": 0.79815853, + "learning_rate": 0.0003226303358995174, + "loss": 0.80897713, + "num_input_tokens_seen": 271447808, + "router_z_loss_mlp": 0.37988281, + "step": 3259, + "time_per_iteration": 2.625014305114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108494, + "balance_loss_mlp": 1.0462687, + "epoch": 0.6271642939592151, + "flos": 562590738432.0, + "grad_norm": 0.04793696937542698, + "language_loss": 0.8911407, + "learning_rate": 0.00032233908911677, + "loss": 0.90199006, + "num_input_tokens_seen": 271526768, + "router_z_loss_mlp": 0.38623047, + "step": 3260, + "time_per_iteration": 2.8619987964630127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081144, + "balance_loss_mlp": 1.04194832, + "epoch": 0.6273566756444786, + "flos": 514288217088.0, + "grad_norm": 0.06578723917558563, + "language_loss": 0.80680311, + "learning_rate": 0.0003220479113081053, + "loss": 0.81761456, + "num_input_tokens_seen": 271597840, + "router_z_loss_mlp": 0.3918457, + "step": 3261, + "time_per_iteration": 2.7102510929107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080352, + "balance_loss_mlp": 1.04270554, + "epoch": 0.6275490573297422, + "flos": 585195369984.0, + "grad_norm": 0.0548628003226281, + "language_loss": 0.78727174, + "learning_rate": 0.00032175680258656836, + "loss": 0.79807532, + "num_input_tokens_seen": 271668352, + "router_z_loss_mlp": 0.3762207, + "step": 3262, + "time_per_iteration": 2.696701765060425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083558, + "balance_loss_mlp": 1.04600739, + "epoch": 0.6277414390150058, + "flos": 559143889920.0, + "grad_norm": 0.044574681461427054, + "language_loss": 0.80117631, + "learning_rate": 0.00032146576306517794, + "loss": 0.81201196, + "num_input_tokens_seen": 271743936, + "router_z_loss_mlp": 0.37524414, + "step": 3263, + "time_per_iteration": 2.764273166656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077314, + "balance_loss_mlp": 1.03873789, + "epoch": 0.6279338207002694, + "flos": 612423922176.0, + "grad_norm": 0.04659103791946159, + "language_loss": 0.80601645, + "learning_rate": 0.0003211747928569255, + "loss": 0.81678957, + "num_input_tokens_seen": 271817008, + "router_z_loss_mlp": 0.38525391, + "step": 3264, + "time_per_iteration": 2.741144895553589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077333, + "balance_loss_mlp": 1.03906703, + "epoch": 0.6281262023855329, + "flos": 625374687744.0, + "grad_norm": 0.044995138284684974, + "language_loss": 0.81407869, + "learning_rate": 0.0003208838920747754, + "loss": 0.82485199, + "num_input_tokens_seen": 271896960, + "router_z_loss_mlp": 0.38208008, + "step": 3265, + "time_per_iteration": 2.8458306789398193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075806, + "balance_loss_mlp": 1.03753948, + "epoch": 0.6283185840707964, + "flos": 1123147579392.0, + "grad_norm": 0.051347706918532285, + "language_loss": 0.76555598, + "learning_rate": 0.0003205930608316656, + "loss": 0.77631402, + "num_input_tokens_seen": 271985008, + "router_z_loss_mlp": 0.38232422, + "step": 3266, + "time_per_iteration": 3.5019400119781494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074561, + "balance_loss_mlp": 1.03631854, + "epoch": 0.62851096575606, + "flos": 514967694336.0, + "grad_norm": 0.055036634994397565, + "language_loss": 0.84812629, + "learning_rate": 0.00032030229924050673, + "loss": 0.85887194, + "num_input_tokens_seen": 272056368, + "router_z_loss_mlp": 0.38183594, + "step": 3267, + "time_per_iteration": 2.6514573097229004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107346, + "balance_loss_mlp": 1.03495502, + "epoch": 0.6287033474413236, + "flos": 403949624832.0, + "grad_norm": 0.06092252438961513, + "language_loss": 0.79938138, + "learning_rate": 0.00032001160741418247, + "loss": 0.81011593, + "num_input_tokens_seen": 272123424, + "router_z_loss_mlp": 0.38452148, + "step": 3268, + "time_per_iteration": 2.6364564895629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076302, + "balance_loss_mlp": 1.03765488, + "epoch": 0.6288957291265872, + "flos": 525459598848.0, + "grad_norm": 0.06432688235517753, + "language_loss": 0.81921297, + "learning_rate": 0.0003197209854655494, + "loss": 0.82997596, + "num_input_tokens_seen": 272193008, + "router_z_loss_mlp": 0.38623047, + "step": 3269, + "time_per_iteration": 2.6190736293792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072941, + "balance_loss_mlp": 1.03531849, + "epoch": 0.6290881108118507, + "flos": 603414627840.0, + "grad_norm": 0.059396512475175293, + "language_loss": 0.74762654, + "learning_rate": 0.0003194304335074371, + "loss": 0.75835598, + "num_input_tokens_seen": 272275328, + "router_z_loss_mlp": 0.3762207, + "step": 3270, + "time_per_iteration": 2.829658031463623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069412, + "balance_loss_mlp": 1.03190899, + "epoch": 0.6292804924971143, + "flos": 437446241280.0, + "grad_norm": 0.057734053913976915, + "language_loss": 0.8848114, + "learning_rate": 0.0003191399516526475, + "loss": 0.89550555, + "num_input_tokens_seen": 272339328, + "router_z_loss_mlp": 0.37451172, + "step": 3271, + "time_per_iteration": 2.520371675491333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107074, + "balance_loss_mlp": 1.03369021, + "epoch": 0.6294728741823779, + "flos": 606368263680.0, + "grad_norm": 0.05065852355738081, + "language_loss": 0.79438859, + "learning_rate": 0.0003188495400139559, + "loss": 0.80509603, + "num_input_tokens_seen": 272416336, + "router_z_loss_mlp": 0.37060547, + "step": 3272, + "time_per_iteration": 2.771045207977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070525, + "balance_loss_mlp": 1.03354681, + "epoch": 0.6296652558676414, + "flos": 701220063744.0, + "grad_norm": 0.05978567707870047, + "language_loss": 0.84609801, + "learning_rate": 0.00031855919870411013, + "loss": 0.8568033, + "num_input_tokens_seen": 272490368, + "router_z_loss_mlp": 0.36987305, + "step": 3273, + "time_per_iteration": 2.8209264278411865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072859, + "balance_loss_mlp": 1.03516483, + "epoch": 0.6298576375529049, + "flos": 523652511744.0, + "grad_norm": 0.05543489609660157, + "language_loss": 0.85005689, + "learning_rate": 0.0003182689278358305, + "loss": 0.86078548, + "num_input_tokens_seen": 272562992, + "router_z_loss_mlp": 0.37646484, + "step": 3274, + "time_per_iteration": 2.6735117435455322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069939, + "balance_loss_mlp": 1.03360391, + "epoch": 0.6300500192381685, + "flos": 475723929600.0, + "grad_norm": 0.06241690898076668, + "language_loss": 0.79779917, + "learning_rate": 0.0003179787275218105, + "loss": 0.80849856, + "num_input_tokens_seen": 272629456, + "router_z_loss_mlp": 0.36352539, + "step": 3275, + "time_per_iteration": 2.5281076431274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071447, + "balance_loss_mlp": 1.03394365, + "epoch": 0.6302424009234321, + "flos": 520629064704.0, + "grad_norm": 0.04860664523501564, + "language_loss": 0.83985364, + "learning_rate": 0.0003176885978747155, + "loss": 0.85056806, + "num_input_tokens_seen": 272697440, + "router_z_loss_mlp": 0.375, + "step": 3276, + "time_per_iteration": 2.590137243270874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073206, + "balance_loss_mlp": 1.03594065, + "epoch": 0.6304347826086957, + "flos": 694282696704.0, + "grad_norm": 0.06745994429641342, + "language_loss": 0.82557893, + "learning_rate": 0.0003173985390071839, + "loss": 0.83631098, + "num_input_tokens_seen": 272774080, + "router_z_loss_mlp": 0.37207031, + "step": 3277, + "time_per_iteration": 2.835454225540161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026014, + "balance_loss_mlp": 1.01476038, + "epoch": 0.6306271642939593, + "flos": 1466067755520.0, + "grad_norm": 0.018393176098041853, + "language_loss": 0.77900457, + "learning_rate": 0.00031710855103182675, + "loss": 0.78926468, + "num_input_tokens_seen": 272998512, + "router_z_loss_mlp": 0.11230469, + "step": 3278, + "time_per_iteration": 4.83237099647522 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071213, + "balance_loss_mlp": 1.03440166, + "epoch": 0.6308195459792227, + "flos": 601444597248.0, + "grad_norm": 0.05391474190589518, + "language_loss": 0.8122592, + "learning_rate": 0.00031681863406122704, + "loss": 0.82297128, + "num_input_tokens_seen": 273074672, + "router_z_loss_mlp": 0.36816406, + "step": 3279, + "time_per_iteration": 2.7689826488494873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071032, + "balance_loss_mlp": 1.03381503, + "epoch": 0.6310119276644863, + "flos": 726514900992.0, + "grad_norm": 0.04523972239140451, + "language_loss": 0.85147464, + "learning_rate": 0.00031652878820794087, + "loss": 0.86218488, + "num_input_tokens_seen": 273157904, + "router_z_loss_mlp": 0.37207031, + "step": 3280, + "time_per_iteration": 2.973525047302246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074703, + "balance_loss_mlp": 1.03762913, + "epoch": 0.6312043093497499, + "flos": 519482515968.0, + "grad_norm": 0.0661931076661352, + "language_loss": 0.85199058, + "learning_rate": 0.00031623901358449627, + "loss": 0.86273754, + "num_input_tokens_seen": 273228160, + "router_z_loss_mlp": 0.37060547, + "step": 3281, + "time_per_iteration": 2.6226651668548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074897, + "balance_loss_mlp": 1.03860974, + "epoch": 0.6313966910350135, + "flos": 530934704640.0, + "grad_norm": 0.050825479700673346, + "language_loss": 0.88810539, + "learning_rate": 0.0003159493103033936, + "loss": 0.89885437, + "num_input_tokens_seen": 273295872, + "router_z_loss_mlp": 0.36303711, + "step": 3282, + "time_per_iteration": 2.601001262664795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01022599, + "balance_loss_mlp": 1.01163197, + "epoch": 0.631589072720277, + "flos": 1379113606656.0, + "grad_norm": 0.015722809882928884, + "language_loss": 0.79919052, + "learning_rate": 0.00031565967847710564, + "loss": 0.80941653, + "num_input_tokens_seen": 273524320, + "router_z_loss_mlp": 0.10986328, + "step": 3283, + "time_per_iteration": 4.848982334136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075752, + "balance_loss_mlp": 1.03774858, + "epoch": 0.6317814544055406, + "flos": 624379497984.0, + "grad_norm": 0.05495466978446473, + "language_loss": 0.82262814, + "learning_rate": 0.0003153701182180776, + "loss": 0.83338571, + "num_input_tokens_seen": 273598544, + "router_z_loss_mlp": 0.37939453, + "step": 3284, + "time_per_iteration": 2.767197608947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074338, + "balance_loss_mlp": 1.03759754, + "epoch": 0.6319738360908042, + "flos": 497876046336.0, + "grad_norm": 0.052075562898617506, + "language_loss": 0.81654066, + "learning_rate": 0.00031508062963872655, + "loss": 0.82728398, + "num_input_tokens_seen": 273666000, + "router_z_loss_mlp": 0.36743164, + "step": 3285, + "time_per_iteration": 2.6035704612731934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076725, + "balance_loss_mlp": 1.03836393, + "epoch": 0.6321662177760677, + "flos": 579474362880.0, + "grad_norm": 0.07288308638623867, + "language_loss": 0.79200375, + "learning_rate": 0.0003147912128514423, + "loss": 0.80277097, + "num_input_tokens_seen": 273742672, + "router_z_loss_mlp": 0.38330078, + "step": 3286, + "time_per_iteration": 2.716641426086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076397, + "balance_loss_mlp": 1.04046774, + "epoch": 0.6323585994613313, + "flos": 601207460352.0, + "grad_norm": 0.06971940923573844, + "language_loss": 0.8695125, + "learning_rate": 0.0003145018679685859, + "loss": 0.88027644, + "num_input_tokens_seen": 273813984, + "router_z_loss_mlp": 0.35913086, + "step": 3287, + "time_per_iteration": 2.7455978393554688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107146, + "balance_loss_mlp": 1.03579235, + "epoch": 0.6325509811465948, + "flos": 528261875712.0, + "grad_norm": 0.04384193895060619, + "language_loss": 0.8763777, + "learning_rate": 0.00031421259510249134, + "loss": 0.88709229, + "num_input_tokens_seen": 273892848, + "router_z_loss_mlp": 0.35717773, + "step": 3288, + "time_per_iteration": 2.760524034500122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070765, + "balance_loss_mlp": 1.03397667, + "epoch": 0.6327433628318584, + "flos": 573993464832.0, + "grad_norm": 0.05235334627417233, + "language_loss": 0.81302404, + "learning_rate": 0.00031392339436546414, + "loss": 0.82373166, + "num_input_tokens_seen": 273971696, + "router_z_loss_mlp": 0.36791992, + "step": 3289, + "time_per_iteration": 2.8397610187530518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075193, + "balance_loss_mlp": 1.03876281, + "epoch": 0.632935744517122, + "flos": 516833008128.0, + "grad_norm": 0.06389388194591325, + "language_loss": 0.83106172, + "learning_rate": 0.00031363426586978205, + "loss": 0.84181368, + "num_input_tokens_seen": 274048096, + "router_z_loss_mlp": 0.36450195, + "step": 3290, + "time_per_iteration": 2.7519772052764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071007, + "balance_loss_mlp": 1.03438592, + "epoch": 0.6331281262023856, + "flos": 617180262912.0, + "grad_norm": 0.051172787966305235, + "language_loss": 0.84358442, + "learning_rate": 0.0003133452097276947, + "loss": 0.85429454, + "num_input_tokens_seen": 274122848, + "router_z_loss_mlp": 0.3659668, + "step": 3291, + "time_per_iteration": 2.7666964530944824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066726, + "balance_loss_mlp": 1.03060579, + "epoch": 0.633320507887649, + "flos": 592665237504.0, + "grad_norm": 0.04649406007551123, + "language_loss": 0.84316128, + "learning_rate": 0.0003130562260514238, + "loss": 0.85382849, + "num_input_tokens_seen": 274198320, + "router_z_loss_mlp": 0.36132812, + "step": 3292, + "time_per_iteration": 2.7349252700805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107055, + "balance_loss_mlp": 1.03373802, + "epoch": 0.6335128895729126, + "flos": 582064233984.0, + "grad_norm": 0.04554083300278307, + "language_loss": 0.81461787, + "learning_rate": 0.0003127673149531626, + "loss": 0.82532346, + "num_input_tokens_seen": 274274944, + "router_z_loss_mlp": 0.36791992, + "step": 3293, + "time_per_iteration": 2.777203321456909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068218, + "balance_loss_mlp": 1.03150177, + "epoch": 0.6337052712581762, + "flos": 452803585536.0, + "grad_norm": 0.06587876286418191, + "language_loss": 0.83099329, + "learning_rate": 0.0003124784765450762, + "loss": 0.84167558, + "num_input_tokens_seen": 274342384, + "router_z_loss_mlp": 0.3671875, + "step": 3294, + "time_per_iteration": 2.5272936820983887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076823, + "balance_loss_mlp": 1.0392009, + "epoch": 0.6338976529434398, + "flos": 573132105216.0, + "grad_norm": 0.07565645338931325, + "language_loss": 0.80265319, + "learning_rate": 0.0003121897109393017, + "loss": 0.81342143, + "num_input_tokens_seen": 274417568, + "router_z_loss_mlp": 0.37597656, + "step": 3295, + "time_per_iteration": 2.7182729244232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069809, + "balance_loss_mlp": 1.03318739, + "epoch": 0.6340900346287034, + "flos": 508497398784.0, + "grad_norm": 0.45372890194936744, + "language_loss": 0.89147079, + "learning_rate": 0.0003119010182479481, + "loss": 0.90216893, + "num_input_tokens_seen": 274488960, + "router_z_loss_mlp": 0.36621094, + "step": 3296, + "time_per_iteration": 2.613863706588745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076319, + "balance_loss_mlp": 1.0396266, + "epoch": 0.6342824163139669, + "flos": 479505429504.0, + "grad_norm": 0.05534198375005729, + "language_loss": 0.82468164, + "learning_rate": 0.00031161239858309563, + "loss": 0.83544481, + "num_input_tokens_seen": 274556880, + "router_z_loss_mlp": 0.36669922, + "step": 3297, + "time_per_iteration": 2.581540822982788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107642, + "balance_loss_mlp": 1.03917897, + "epoch": 0.6344747979992305, + "flos": 571762976256.0, + "grad_norm": 0.05796983524113203, + "language_loss": 0.8309406, + "learning_rate": 0.0003113238520567964, + "loss": 0.84170485, + "num_input_tokens_seen": 274624944, + "router_z_loss_mlp": 0.37182617, + "step": 3298, + "time_per_iteration": 2.666191816329956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077442, + "balance_loss_mlp": 1.04082084, + "epoch": 0.634667179684494, + "flos": 605629149696.0, + "grad_norm": 0.056114886928888365, + "language_loss": 0.81702375, + "learning_rate": 0.00031103537878107403, + "loss": 0.82779819, + "num_input_tokens_seen": 274695152, + "router_z_loss_mlp": 0.36621094, + "step": 3299, + "time_per_iteration": 2.7362561225891113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080646, + "balance_loss_mlp": 1.04311848, + "epoch": 0.6348595613697576, + "flos": 646649478144.0, + "grad_norm": 0.06007496440704036, + "language_loss": 0.80261421, + "learning_rate": 0.0003107469788679238, + "loss": 0.81342065, + "num_input_tokens_seen": 274767840, + "router_z_loss_mlp": 0.37475586, + "step": 3300, + "time_per_iteration": 2.756533622741699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073194, + "balance_loss_mlp": 1.03597736, + "epoch": 0.6350519430550212, + "flos": 638776558080.0, + "grad_norm": 0.05358633946635087, + "language_loss": 0.86808562, + "learning_rate": 0.00031045865242931267, + "loss": 0.87881756, + "num_input_tokens_seen": 274839312, + "router_z_loss_mlp": 0.37207031, + "step": 3301, + "time_per_iteration": 2.829094171524048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080134, + "balance_loss_mlp": 1.043203, + "epoch": 0.6352443247402847, + "flos": 686091091968.0, + "grad_norm": 0.0476034793432377, + "language_loss": 0.83036846, + "learning_rate": 0.00031017039957717877, + "loss": 0.84116983, + "num_input_tokens_seen": 274922704, + "router_z_loss_mlp": 0.36938477, + "step": 3302, + "time_per_iteration": 2.9974441528320312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073106, + "balance_loss_mlp": 1.03588891, + "epoch": 0.6354367064255483, + "flos": 559173003264.0, + "grad_norm": 0.056110934582374906, + "language_loss": 0.88712031, + "learning_rate": 0.0003098822204234318, + "loss": 0.89785135, + "num_input_tokens_seen": 274992848, + "router_z_loss_mlp": 0.37207031, + "step": 3303, + "time_per_iteration": 2.6585702896118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076324, + "balance_loss_mlp": 1.03984571, + "epoch": 0.6356290881108119, + "flos": 979095582720.0, + "grad_norm": 0.062320507603927815, + "language_loss": 0.8736068, + "learning_rate": 0.00030959411507995273, + "loss": 0.88437009, + "num_input_tokens_seen": 275071456, + "router_z_loss_mlp": 0.36499023, + "step": 3304, + "time_per_iteration": 3.2019383907318115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079953, + "balance_loss_mlp": 1.04299855, + "epoch": 0.6358214697960755, + "flos": 528005799936.0, + "grad_norm": 0.05770730921560322, + "language_loss": 0.80951726, + "learning_rate": 0.00030930608365859407, + "loss": 0.82031679, + "num_input_tokens_seen": 275140512, + "router_z_loss_mlp": 0.36938477, + "step": 3305, + "time_per_iteration": 2.6649279594421387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073678, + "balance_loss_mlp": 1.03793883, + "epoch": 0.6360138514813389, + "flos": 516547819008.0, + "grad_norm": 0.050398763649548706, + "language_loss": 0.87612951, + "learning_rate": 0.00030901812627117943, + "loss": 0.88686621, + "num_input_tokens_seen": 275210896, + "router_z_loss_mlp": 0.35791016, + "step": 3306, + "time_per_iteration": 2.6524715423583984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072331, + "balance_loss_mlp": 1.0352571, + "epoch": 0.6362062331666025, + "flos": 466289823744.0, + "grad_norm": 0.06392175432949986, + "language_loss": 0.84607399, + "learning_rate": 0.000308730243029504, + "loss": 0.85679734, + "num_input_tokens_seen": 275279888, + "router_z_loss_mlp": 0.37084961, + "step": 3307, + "time_per_iteration": 2.619936943054199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080745, + "balance_loss_mlp": 1.04407644, + "epoch": 0.6363986148518661, + "flos": 549458090496.0, + "grad_norm": 0.0791847929194259, + "language_loss": 0.79674953, + "learning_rate": 0.0003084424340453339, + "loss": 0.80755699, + "num_input_tokens_seen": 275357056, + "router_z_loss_mlp": 0.36669922, + "step": 3308, + "time_per_iteration": 2.847384214401245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074674, + "balance_loss_mlp": 1.03688467, + "epoch": 0.6365909965371297, + "flos": 582772824576.0, + "grad_norm": 0.10517797210671455, + "language_loss": 0.82179588, + "learning_rate": 0.0003081546994304064, + "loss": 0.8325426, + "num_input_tokens_seen": 275428240, + "router_z_loss_mlp": 0.37744141, + "step": 3309, + "time_per_iteration": 2.745880365371704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073786, + "balance_loss_mlp": 1.03644967, + "epoch": 0.6367833782223933, + "flos": 530998723584.0, + "grad_norm": 0.05446787183102227, + "language_loss": 0.8192482, + "learning_rate": 0.0003078670392964298, + "loss": 0.8299861, + "num_input_tokens_seen": 275497568, + "router_z_loss_mlp": 0.37304688, + "step": 3310, + "time_per_iteration": 2.6298861503601074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075946, + "balance_loss_mlp": 1.03896689, + "epoch": 0.6369757599076568, + "flos": 569237124096.0, + "grad_norm": 0.05047878610686386, + "language_loss": 0.82755494, + "learning_rate": 0.00030757945375508406, + "loss": 0.83831441, + "num_input_tokens_seen": 275569616, + "router_z_loss_mlp": 0.36938477, + "step": 3311, + "time_per_iteration": 2.6519951820373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074481, + "balance_loss_mlp": 1.03652477, + "epoch": 0.6371681415929203, + "flos": 539684951040.0, + "grad_norm": 0.05551115328113397, + "language_loss": 0.81331229, + "learning_rate": 0.00030729194291801944, + "loss": 0.8240571, + "num_input_tokens_seen": 275641408, + "router_z_loss_mlp": 0.37915039, + "step": 3312, + "time_per_iteration": 2.6647114753723145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078542, + "balance_loss_mlp": 1.04089594, + "epoch": 0.6373605232781839, + "flos": 483326217216.0, + "grad_norm": 0.05317823086949404, + "language_loss": 0.76999873, + "learning_rate": 0.00030700450689685787, + "loss": 0.78078413, + "num_input_tokens_seen": 275706608, + "router_z_loss_mlp": 0.37646484, + "step": 3313, + "time_per_iteration": 2.517679452896118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072791, + "balance_loss_mlp": 1.03700447, + "epoch": 0.6375529049634475, + "flos": 578273969664.0, + "grad_norm": 0.05477509929208262, + "language_loss": 0.85436654, + "learning_rate": 0.00030671714580319186, + "loss": 0.86509454, + "num_input_tokens_seen": 275785952, + "router_z_loss_mlp": 0.35839844, + "step": 3314, + "time_per_iteration": 2.83425235748291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078643, + "balance_loss_mlp": 1.04118717, + "epoch": 0.637745286648711, + "flos": 681953181696.0, + "grad_norm": 0.05493703572973109, + "language_loss": 0.83096623, + "learning_rate": 0.0003064298597485846, + "loss": 0.84175265, + "num_input_tokens_seen": 275866240, + "router_z_loss_mlp": 0.37426758, + "step": 3315, + "time_per_iteration": 2.8374462127685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107089, + "balance_loss_mlp": 1.03472173, + "epoch": 0.6379376683339746, + "flos": 504385629696.0, + "grad_norm": 0.05328451247600945, + "language_loss": 0.83983094, + "learning_rate": 0.00030614264884457054, + "loss": 0.8505398, + "num_input_tokens_seen": 275936176, + "router_z_loss_mlp": 0.36181641, + "step": 3316, + "time_per_iteration": 2.6181318759918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076564, + "balance_loss_mlp": 1.03896546, + "epoch": 0.6381300500192382, + "flos": 501771027456.0, + "grad_norm": 0.05692902887495298, + "language_loss": 0.77128184, + "learning_rate": 0.000305855513202655, + "loss": 0.78204751, + "num_input_tokens_seen": 276004608, + "router_z_loss_mlp": 0.37573242, + "step": 3317, + "time_per_iteration": 2.570751190185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072223, + "balance_loss_mlp": 1.03574491, + "epoch": 0.6383224317045018, + "flos": 400271431680.0, + "grad_norm": 0.0603897585499684, + "language_loss": 0.77435303, + "learning_rate": 0.0003055684529343138, + "loss": 0.78507531, + "num_input_tokens_seen": 276066688, + "router_z_loss_mlp": 0.36474609, + "step": 3318, + "time_per_iteration": 2.4171056747436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068524, + "balance_loss_mlp": 1.03249943, + "epoch": 0.6385148133897653, + "flos": 499131694080.0, + "grad_norm": 0.06663651312006989, + "language_loss": 0.78354919, + "learning_rate": 0.00030528146815099374, + "loss": 0.79423445, + "num_input_tokens_seen": 276140000, + "router_z_loss_mlp": 0.36010742, + "step": 3319, + "time_per_iteration": 2.5991523265838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072343, + "balance_loss_mlp": 1.03603208, + "epoch": 0.6387071950750288, + "flos": 527409280512.0, + "grad_norm": 0.04641062645518834, + "language_loss": 0.71934807, + "learning_rate": 0.00030499455896411203, + "loss": 0.73007143, + "num_input_tokens_seen": 276209840, + "router_z_loss_mlp": 0.36376953, + "step": 3320, + "time_per_iteration": 2.601541519165039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047736, + "balance_loss_mlp": 1.03734136, + "epoch": 0.6388995767602924, + "flos": 1455200501760.0, + "grad_norm": 0.026504664974818824, + "language_loss": 0.76300812, + "learning_rate": 0.0003047077254850568, + "loss": 0.77348548, + "num_input_tokens_seen": 276444784, + "router_z_loss_mlp": 0.10400391, + "step": 3321, + "time_per_iteration": 4.919625997543335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070963, + "balance_loss_mlp": 1.03417492, + "epoch": 0.639091958445556, + "flos": 603577571328.0, + "grad_norm": 0.051172481389266875, + "language_loss": 0.76476693, + "learning_rate": 0.0003044209678251865, + "loss": 0.77547657, + "num_input_tokens_seen": 276522768, + "router_z_loss_mlp": 0.36791992, + "step": 3322, + "time_per_iteration": 2.9173965454101562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070731, + "balance_loss_mlp": 1.03406262, + "epoch": 0.6392843401308196, + "flos": 584230703616.0, + "grad_norm": 0.062017563043543965, + "language_loss": 0.84732592, + "learning_rate": 0.0003041342860958306, + "loss": 0.85803324, + "num_input_tokens_seen": 276597104, + "router_z_loss_mlp": 0.36694336, + "step": 3323, + "time_per_iteration": 2.751882791519165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069123, + "balance_loss_mlp": 1.03269315, + "epoch": 0.6394767218160831, + "flos": 514420637184.0, + "grad_norm": 0.054747759386293726, + "language_loss": 0.91800594, + "learning_rate": 0.00030384768040828857, + "loss": 0.92869711, + "num_input_tokens_seen": 276670256, + "router_z_loss_mlp": 0.36425781, + "step": 3324, + "time_per_iteration": 2.6570470333099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070533, + "balance_loss_mlp": 1.03314865, + "epoch": 0.6396691035013466, + "flos": 541471689216.0, + "grad_norm": 0.049915114464213116, + "language_loss": 0.85503262, + "learning_rate": 0.00030356115087383094, + "loss": 0.86573792, + "num_input_tokens_seen": 276737680, + "router_z_loss_mlp": 0.3737793, + "step": 3325, + "time_per_iteration": 2.620593309402466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068087, + "balance_loss_mlp": 1.03115582, + "epoch": 0.6398614851866102, + "flos": 525282098688.0, + "grad_norm": 0.05721597206599544, + "language_loss": 0.84746885, + "learning_rate": 0.00030327469760369803, + "loss": 0.85814971, + "num_input_tokens_seen": 276803808, + "router_z_loss_mlp": 0.36938477, + "step": 3326, + "time_per_iteration": 2.600210428237915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070931, + "balance_loss_mlp": 1.03342783, + "epoch": 0.6400538668718738, + "flos": 622704830976.0, + "grad_norm": 0.3477735947082266, + "language_loss": 0.85250199, + "learning_rate": 0.0003029883207091009, + "loss": 0.86321133, + "num_input_tokens_seen": 276874752, + "router_z_loss_mlp": 0.375, + "step": 3327, + "time_per_iteration": 2.7323827743530273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065435, + "balance_loss_mlp": 1.02910042, + "epoch": 0.6402462485571374, + "flos": 503096486400.0, + "grad_norm": 0.053886182744941745, + "language_loss": 0.78170431, + "learning_rate": 0.00030270202030122095, + "loss": 0.79235864, + "num_input_tokens_seen": 276947200, + "router_z_loss_mlp": 0.36328125, + "step": 3328, + "time_per_iteration": 2.6563096046447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107159, + "balance_loss_mlp": 1.03310895, + "epoch": 0.6404386302424009, + "flos": 818894693376.0, + "grad_norm": 0.06347117361698136, + "language_loss": 0.85806334, + "learning_rate": 0.00030241579649121, + "loss": 0.86877924, + "num_input_tokens_seen": 277025712, + "router_z_loss_mlp": 0.38476562, + "step": 3329, + "time_per_iteration": 2.9936435222625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066015, + "balance_loss_mlp": 1.02901256, + "epoch": 0.6406310119276645, + "flos": 471568490496.0, + "grad_norm": 0.05226197441387588, + "language_loss": 0.79091239, + "learning_rate": 0.00030212964939018994, + "loss": 0.8015725, + "num_input_tokens_seen": 277091264, + "router_z_loss_mlp": 0.37011719, + "step": 3330, + "time_per_iteration": 2.5639078617095947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107031, + "balance_loss_mlp": 1.0323776, + "epoch": 0.6408233936129281, + "flos": 425358245376.0, + "grad_norm": 0.06341229452326952, + "language_loss": 0.85196972, + "learning_rate": 0.0003018435791092527, + "loss": 0.86267287, + "num_input_tokens_seen": 277154608, + "router_z_loss_mlp": 0.37890625, + "step": 3331, + "time_per_iteration": 2.4909286499023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067837, + "balance_loss_mlp": 1.0288794, + "epoch": 0.6410157752981916, + "flos": 549522109440.0, + "grad_norm": 0.052178008313766185, + "language_loss": 0.81084096, + "learning_rate": 0.00030155758575946083, + "loss": 0.82151937, + "num_input_tokens_seen": 277222176, + "router_z_loss_mlp": 0.3894043, + "step": 3332, + "time_per_iteration": 2.64400315284729 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069681, + "balance_loss_mlp": 1.03246343, + "epoch": 0.6412081569834551, + "flos": 475659910656.0, + "grad_norm": 0.056966090936169146, + "language_loss": 0.83717507, + "learning_rate": 0.0003012716694518467, + "loss": 0.84787184, + "num_input_tokens_seen": 277289600, + "router_z_loss_mlp": 0.37231445, + "step": 3333, + "time_per_iteration": 2.5760622024536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068182, + "balance_loss_mlp": 1.02998757, + "epoch": 0.6414005386687187, + "flos": 540645235200.0, + "grad_norm": 0.0733128954911655, + "language_loss": 0.85120058, + "learning_rate": 0.000300985830297413, + "loss": 0.86188245, + "num_input_tokens_seen": 277362784, + "router_z_loss_mlp": 0.3815918, + "step": 3334, + "time_per_iteration": 2.6769511699676514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068689, + "balance_loss_mlp": 1.03187692, + "epoch": 0.6415929203539823, + "flos": 1040909073408.0, + "grad_norm": 0.0544756341035146, + "language_loss": 0.87377876, + "learning_rate": 0.00030070006840713205, + "loss": 0.88446569, + "num_input_tokens_seen": 277449728, + "router_z_loss_mlp": 0.36865234, + "step": 3335, + "time_per_iteration": 3.3541831970214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070768, + "balance_loss_mlp": 1.03398037, + "epoch": 0.6417853020392459, + "flos": 648028781568.0, + "grad_norm": 0.051565037343947635, + "language_loss": 0.73971063, + "learning_rate": 0.000300414383891947, + "loss": 0.75041831, + "num_input_tokens_seen": 277527552, + "router_z_loss_mlp": 0.36791992, + "step": 3336, + "time_per_iteration": 2.802199602127075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072657, + "balance_loss_mlp": 1.03536844, + "epoch": 0.6419776837245095, + "flos": 500639035392.0, + "grad_norm": 0.04995187191956455, + "language_loss": 0.88918942, + "learning_rate": 0.00030012877686276973, + "loss": 0.89991605, + "num_input_tokens_seen": 277603568, + "router_z_loss_mlp": 0.37280273, + "step": 3337, + "time_per_iteration": 2.69291090965271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107373, + "balance_loss_mlp": 1.03677511, + "epoch": 0.642170065409773, + "flos": 620331747840.0, + "grad_norm": 0.054761035667788324, + "language_loss": 0.86300218, + "learning_rate": 0.0002998432474304832, + "loss": 0.87373948, + "num_input_tokens_seen": 277679696, + "router_z_loss_mlp": 0.36914062, + "step": 3338, + "time_per_iteration": 2.773374319076538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015993, + "balance_loss_mlp": 1.00283277, + "epoch": 0.6423624470950365, + "flos": 1422767476224.0, + "grad_norm": 0.016749722719595034, + "language_loss": 0.79237342, + "learning_rate": 0.0002995577957059395, + "loss": 0.80253339, + "num_input_tokens_seen": 277913056, + "router_z_loss_mlp": 0.13183594, + "step": 3339, + "time_per_iteration": 4.874187231063843 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073859, + "balance_loss_mlp": 1.03788161, + "epoch": 0.6425548287803001, + "flos": 562082969088.0, + "grad_norm": 0.04482420298263986, + "language_loss": 0.88213849, + "learning_rate": 0.00029927242179996107, + "loss": 0.8928771, + "num_input_tokens_seen": 277983168, + "router_z_loss_mlp": 0.36010742, + "step": 3340, + "time_per_iteration": 2.665893077850342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068858, + "balance_loss_mlp": 1.03240371, + "epoch": 0.6427472104655637, + "flos": 585151699968.0, + "grad_norm": 0.04629279799595454, + "language_loss": 0.83241612, + "learning_rate": 0.0002989871258233398, + "loss": 0.84310472, + "num_input_tokens_seen": 278057600, + "router_z_loss_mlp": 0.36474609, + "step": 3341, + "time_per_iteration": 2.7554104328155518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076157, + "balance_loss_mlp": 1.03927386, + "epoch": 0.6429395921508272, + "flos": 404067488256.0, + "grad_norm": 0.0587599408215441, + "language_loss": 0.82722974, + "learning_rate": 0.0002987019078868373, + "loss": 0.8379913, + "num_input_tokens_seen": 278119232, + "router_z_loss_mlp": 0.36865234, + "step": 3342, + "time_per_iteration": 2.4214284420013428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074694, + "balance_loss_mlp": 1.03742945, + "epoch": 0.6431319738360908, + "flos": 548522537472.0, + "grad_norm": 0.05743775119998274, + "language_loss": 0.8159622, + "learning_rate": 0.00029841676810118484, + "loss": 0.82670915, + "num_input_tokens_seen": 278187456, + "router_z_loss_mlp": 0.37231445, + "step": 3343, + "time_per_iteration": 2.6899335384368896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070744, + "balance_loss_mlp": 1.03390789, + "epoch": 0.6433243555213544, + "flos": 793044034560.0, + "grad_norm": 0.05135608784833761, + "language_loss": 0.87229836, + "learning_rate": 0.0002981317065770839, + "loss": 0.8830058, + "num_input_tokens_seen": 278262176, + "router_z_loss_mlp": 0.36816406, + "step": 3344, + "time_per_iteration": 3.038647413253784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075006, + "balance_loss_mlp": 1.03771782, + "epoch": 0.643516737206618, + "flos": 582762650112.0, + "grad_norm": 0.05966061417455641, + "language_loss": 0.80907631, + "learning_rate": 0.00029784672342520493, + "loss": 0.81982636, + "num_input_tokens_seen": 278328816, + "router_z_loss_mlp": 0.37231445, + "step": 3345, + "time_per_iteration": 2.6487960815429688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106957, + "balance_loss_mlp": 1.03244793, + "epoch": 0.6437091188918815, + "flos": 518501882880.0, + "grad_norm": 0.05291983306106443, + "language_loss": 0.83733785, + "learning_rate": 0.00029756181875618834, + "loss": 0.84803355, + "num_input_tokens_seen": 278395824, + "router_z_loss_mlp": 0.37133789, + "step": 3346, + "time_per_iteration": 2.5655863285064697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073171, + "balance_loss_mlp": 1.03671718, + "epoch": 0.643901500577145, + "flos": 384736587264.0, + "grad_norm": 0.05666313029634666, + "language_loss": 0.83381206, + "learning_rate": 0.0002972769926806439, + "loss": 0.84454376, + "num_input_tokens_seen": 278457696, + "router_z_loss_mlp": 0.36474609, + "step": 3347, + "time_per_iteration": 2.456300735473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078252, + "balance_loss_mlp": 1.04122531, + "epoch": 0.6440938822624086, + "flos": 483478986240.0, + "grad_norm": 0.05181671155605703, + "language_loss": 0.88556045, + "learning_rate": 0.0002969922453091508, + "loss": 0.89634299, + "num_input_tokens_seen": 278526992, + "router_z_loss_mlp": 0.37036133, + "step": 3348, + "time_per_iteration": 2.5434532165527344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078332, + "balance_loss_mlp": 1.04104328, + "epoch": 0.6442862639476722, + "flos": 540178163712.0, + "grad_norm": 0.04671333484936929, + "language_loss": 0.85028982, + "learning_rate": 0.00029670757675225777, + "loss": 0.86107314, + "num_input_tokens_seen": 278601120, + "router_z_loss_mlp": 0.37255859, + "step": 3349, + "time_per_iteration": 2.7254116535186768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073632, + "balance_loss_mlp": 1.03715396, + "epoch": 0.6444786456329358, + "flos": 526651227648.0, + "grad_norm": 0.06388805390045102, + "language_loss": 0.7939328, + "learning_rate": 0.0002964229871204831, + "loss": 0.80466914, + "num_input_tokens_seen": 278668208, + "router_z_loss_mlp": 0.36474609, + "step": 3350, + "time_per_iteration": 2.623533248901367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107868, + "balance_loss_mlp": 1.04274988, + "epoch": 0.6446710273181993, + "flos": 697576776192.0, + "grad_norm": 0.05363118847235426, + "language_loss": 0.83167213, + "learning_rate": 0.00029613847652431403, + "loss": 0.84245896, + "num_input_tokens_seen": 278742832, + "router_z_loss_mlp": 0.35961914, + "step": 3351, + "time_per_iteration": 2.835373640060425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081252, + "balance_loss_mlp": 1.04536986, + "epoch": 0.6448634090034628, + "flos": 624705384960.0, + "grad_norm": 0.04827389624860956, + "language_loss": 0.79376614, + "learning_rate": 0.0002958540450742078, + "loss": 0.8045786, + "num_input_tokens_seen": 278829744, + "router_z_loss_mlp": 0.35864258, + "step": 3352, + "time_per_iteration": 2.905045986175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078994, + "balance_loss_mlp": 1.04175305, + "epoch": 0.6450557906887264, + "flos": 600647256576.0, + "grad_norm": 0.04612026708575604, + "language_loss": 0.77379197, + "learning_rate": 0.0002955696928805901, + "loss": 0.7845819, + "num_input_tokens_seen": 278908592, + "router_z_loss_mlp": 0.37231445, + "step": 3353, + "time_per_iteration": 2.899186372756958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081177, + "balance_loss_mlp": 1.04536617, + "epoch": 0.64524817237399, + "flos": 645905981952.0, + "grad_norm": 0.050963182313219675, + "language_loss": 0.86320436, + "learning_rate": 0.0002952854200538563, + "loss": 0.87401617, + "num_input_tokens_seen": 278986960, + "router_z_loss_mlp": 0.35839844, + "step": 3354, + "time_per_iteration": 2.7646782398223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107986, + "balance_loss_mlp": 1.04366803, + "epoch": 0.6454405540592536, + "flos": 473173346304.0, + "grad_norm": 0.05160537421710046, + "language_loss": 0.82000065, + "learning_rate": 0.000295001226704371, + "loss": 0.83079934, + "num_input_tokens_seen": 279054896, + "router_z_loss_mlp": 0.36206055, + "step": 3355, + "time_per_iteration": 2.5571465492248535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080502, + "balance_loss_mlp": 1.04357088, + "epoch": 0.6456329357445171, + "flos": 611548005888.0, + "grad_norm": 0.052373080936441004, + "language_loss": 0.8272965, + "learning_rate": 0.00029471711294246783, + "loss": 0.83810151, + "num_input_tokens_seen": 279126816, + "router_z_loss_mlp": 0.36914062, + "step": 3356, + "time_per_iteration": 2.829554796218872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075402, + "balance_loss_mlp": 1.03890061, + "epoch": 0.6458253174297807, + "flos": 731373138432.0, + "grad_norm": 0.05569683801855411, + "language_loss": 0.82248133, + "learning_rate": 0.0002944330788784494, + "loss": 0.83323538, + "num_input_tokens_seen": 279197552, + "router_z_loss_mlp": 0.36499023, + "step": 3357, + "time_per_iteration": 2.93203067779541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079005, + "balance_loss_mlp": 1.04276562, + "epoch": 0.6460176991150443, + "flos": 570129007104.0, + "grad_norm": 0.050517424210504216, + "language_loss": 0.84506869, + "learning_rate": 0.00029414912462258786, + "loss": 0.8558588, + "num_input_tokens_seen": 279275440, + "router_z_loss_mlp": 0.36254883, + "step": 3358, + "time_per_iteration": 2.819854259490967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077222, + "balance_loss_mlp": 1.0391469, + "epoch": 0.6462100808003078, + "flos": 582890688000.0, + "grad_norm": 0.05841825537720819, + "language_loss": 0.81327105, + "learning_rate": 0.00029386525028512366, + "loss": 0.82404327, + "num_input_tokens_seen": 279349168, + "router_z_loss_mlp": 0.38037109, + "step": 3359, + "time_per_iteration": 2.698640823364258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081177, + "balance_loss_mlp": 1.04388809, + "epoch": 0.6464024624855714, + "flos": 483647721984.0, + "grad_norm": 0.05190666328104424, + "language_loss": 0.87126404, + "learning_rate": 0.0002935814559762666, + "loss": 0.88207585, + "num_input_tokens_seen": 279427600, + "router_z_loss_mlp": 0.37329102, + "step": 3360, + "time_per_iteration": 2.768366575241089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081829, + "balance_loss_mlp": 1.0439682, + "epoch": 0.6465948441708349, + "flos": 527508205056.0, + "grad_norm": 0.050745239197886684, + "language_loss": 0.79334629, + "learning_rate": 0.0002932977418061957, + "loss": 0.80416453, + "num_input_tokens_seen": 279496608, + "router_z_loss_mlp": 0.37841797, + "step": 3361, + "time_per_iteration": 2.632948637008667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082687, + "balance_loss_mlp": 1.04582703, + "epoch": 0.6467872258560985, + "flos": 669121689600.0, + "grad_norm": 0.06228301103005666, + "language_loss": 0.80853021, + "learning_rate": 0.00029301410788505833, + "loss": 0.81935704, + "num_input_tokens_seen": 279568448, + "router_z_loss_mlp": 0.3684082, + "step": 3362, + "time_per_iteration": 2.7769224643707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084618, + "balance_loss_mlp": 1.04833102, + "epoch": 0.6469796075413621, + "flos": 431867828736.0, + "grad_norm": 0.06087250960931665, + "language_loss": 0.8065362, + "learning_rate": 0.00029273055432297126, + "loss": 0.81738234, + "num_input_tokens_seen": 279631952, + "router_z_loss_mlp": 0.36328125, + "step": 3363, + "time_per_iteration": 2.484450101852417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084285, + "balance_loss_mlp": 1.04611397, + "epoch": 0.6471719892266257, + "flos": 803413693440.0, + "grad_norm": 0.05541447784561029, + "language_loss": 0.80514741, + "learning_rate": 0.00029244708123001917, + "loss": 0.81599021, + "num_input_tokens_seen": 279706880, + "router_z_loss_mlp": 0.3815918, + "step": 3364, + "time_per_iteration": 2.9762370586395264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081353, + "balance_loss_mlp": 1.04387355, + "epoch": 0.6473643709118891, + "flos": 576923779584.0, + "grad_norm": 0.051117290397423236, + "language_loss": 0.84345543, + "learning_rate": 0.0002921636887162565, + "loss": 0.85426897, + "num_input_tokens_seen": 279778864, + "router_z_loss_mlp": 0.37451172, + "step": 3365, + "time_per_iteration": 2.72733736038208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085471, + "balance_loss_mlp": 1.04930282, + "epoch": 0.6475567525971527, + "flos": 761079490560.0, + "grad_norm": 0.06137767127044858, + "language_loss": 0.83554536, + "learning_rate": 0.00029188037689170595, + "loss": 0.84640002, + "num_input_tokens_seen": 279853328, + "router_z_loss_mlp": 0.36181641, + "step": 3366, + "time_per_iteration": 2.962611675262451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081175, + "balance_loss_mlp": 1.04474497, + "epoch": 0.6477491342824163, + "flos": 842754972672.0, + "grad_norm": 0.05371519731752011, + "language_loss": 0.83465898, + "learning_rate": 0.0002915971458663586, + "loss": 0.84547073, + "num_input_tokens_seen": 279928464, + "router_z_loss_mlp": 0.36450195, + "step": 3367, + "time_per_iteration": 3.043851137161255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082146, + "balance_loss_mlp": 1.04545331, + "epoch": 0.6479415159676799, + "flos": 884431457280.0, + "grad_norm": 0.05567471086198027, + "language_loss": 0.81976676, + "learning_rate": 0.00029131399575017494, + "loss": 0.83058822, + "num_input_tokens_seen": 280015680, + "router_z_loss_mlp": 0.36669922, + "step": 3368, + "time_per_iteration": 3.16506290435791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072939, + "balance_loss_mlp": 1.0362463, + "epoch": 0.6481338976529435, + "flos": 615211642368.0, + "grad_norm": 0.04146272732833695, + "language_loss": 0.85776877, + "learning_rate": 0.0002910309266530836, + "loss": 0.86849815, + "num_input_tokens_seen": 280093904, + "router_z_loss_mlp": 0.36694336, + "step": 3369, + "time_per_iteration": 2.810415267944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082428, + "balance_loss_mlp": 1.04485345, + "epoch": 0.648326279338207, + "flos": 509757428736.0, + "grad_norm": 0.047563398394336556, + "language_loss": 0.85364866, + "learning_rate": 0.0002907479386849814, + "loss": 0.86447287, + "num_input_tokens_seen": 280161584, + "router_z_loss_mlp": 0.37573242, + "step": 3370, + "time_per_iteration": 2.6234049797058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079087, + "balance_loss_mlp": 1.04258549, + "epoch": 0.6485186610234706, + "flos": 702157026816.0, + "grad_norm": 0.05547979254265798, + "language_loss": 0.79903388, + "learning_rate": 0.0002904650319557339, + "loss": 0.80982471, + "num_input_tokens_seen": 280248016, + "router_z_loss_mlp": 0.36523438, + "step": 3371, + "time_per_iteration": 3.052445411682129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077959, + "balance_loss_mlp": 1.04148114, + "epoch": 0.6487110427087341, + "flos": 560418476544.0, + "grad_norm": 0.10081589784895977, + "language_loss": 0.80853498, + "learning_rate": 0.0002901822065751758, + "loss": 0.81931454, + "num_input_tokens_seen": 280319024, + "router_z_loss_mlp": 0.36499023, + "step": 3372, + "time_per_iteration": 2.679738759994507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072935, + "balance_loss_mlp": 1.03614688, + "epoch": 0.6489034243939977, + "flos": 679801268736.0, + "grad_norm": 0.07558571237012199, + "language_loss": 0.85327506, + "learning_rate": 0.0002898994626531093, + "loss": 0.86400437, + "num_input_tokens_seen": 280393200, + "router_z_loss_mlp": 0.36767578, + "step": 3373, + "time_per_iteration": 2.8318021297454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078194, + "balance_loss_mlp": 1.04131091, + "epoch": 0.6490958060792612, + "flos": 474172918272.0, + "grad_norm": 0.04995126846613369, + "language_loss": 0.87709844, + "learning_rate": 0.00028961680029930526, + "loss": 0.88788044, + "num_input_tokens_seen": 280456944, + "router_z_loss_mlp": 0.36865234, + "step": 3374, + "time_per_iteration": 2.550858736038208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107378, + "balance_loss_mlp": 1.03751612, + "epoch": 0.6492881877645248, + "flos": 588563642880.0, + "grad_norm": 0.053073331698041674, + "language_loss": 0.76720631, + "learning_rate": 0.00028933421962350317, + "loss": 0.77794409, + "num_input_tokens_seen": 280534352, + "router_z_loss_mlp": 0.36279297, + "step": 3375, + "time_per_iteration": 2.7313249111175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073863, + "balance_loss_mlp": 1.0367415, + "epoch": 0.6494805694497884, + "flos": 642139038720.0, + "grad_norm": 0.0646432947435949, + "language_loss": 0.84017503, + "learning_rate": 0.0002890517207354104, + "loss": 0.8509137, + "num_input_tokens_seen": 280608912, + "router_z_loss_mlp": 0.37109375, + "step": 3376, + "time_per_iteration": 2.8168907165527344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071643, + "balance_loss_mlp": 1.0345453, + "epoch": 0.649672951135052, + "flos": 531550162944.0, + "grad_norm": 0.054117289013755926, + "language_loss": 0.81647491, + "learning_rate": 0.0002887693037447029, + "loss": 0.82719135, + "num_input_tokens_seen": 280678848, + "router_z_loss_mlp": 0.37084961, + "step": 3377, + "time_per_iteration": 2.59980845451355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068068, + "balance_loss_mlp": 1.03170967, + "epoch": 0.6498653328203156, + "flos": 547124295168.0, + "grad_norm": 0.05861346811628937, + "language_loss": 0.82201707, + "learning_rate": 0.00028848696876102443, + "loss": 0.83269775, + "num_input_tokens_seen": 280750224, + "router_z_loss_mlp": 0.36352539, + "step": 3378, + "time_per_iteration": 2.6153130531311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071004, + "balance_loss_mlp": 1.03333366, + "epoch": 0.650057714505579, + "flos": 461996172288.0, + "grad_norm": 0.0689678336471058, + "language_loss": 0.83211708, + "learning_rate": 0.00028820471589398723, + "loss": 0.84282708, + "num_input_tokens_seen": 280817488, + "router_z_loss_mlp": 0.37646484, + "step": 3379, + "time_per_iteration": 2.553159236907959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068457, + "balance_loss_mlp": 1.03100109, + "epoch": 0.6502500961908426, + "flos": 509905815552.0, + "grad_norm": 0.06047763604232794, + "language_loss": 0.77722514, + "learning_rate": 0.00028792254525317196, + "loss": 0.78790975, + "num_input_tokens_seen": 280887440, + "router_z_loss_mlp": 0.37451172, + "step": 3380, + "time_per_iteration": 2.680063009262085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072989, + "balance_loss_mlp": 1.03519976, + "epoch": 0.6504424778761062, + "flos": 579557320704.0, + "grad_norm": 0.05541331386031739, + "language_loss": 0.81432557, + "learning_rate": 0.00028764045694812645, + "loss": 0.82505548, + "num_input_tokens_seen": 280959072, + "router_z_loss_mlp": 0.37768555, + "step": 3381, + "time_per_iteration": 2.7398667335510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069777, + "balance_loss_mlp": 1.03186822, + "epoch": 0.6506348595613698, + "flos": 519206091264.0, + "grad_norm": 0.0812129580253802, + "language_loss": 0.76837122, + "learning_rate": 0.0002873584510883671, + "loss": 0.77906895, + "num_input_tokens_seen": 281025376, + "router_z_loss_mlp": 0.37915039, + "step": 3382, + "time_per_iteration": 2.565248727798462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070048, + "balance_loss_mlp": 1.03302145, + "epoch": 0.6508272412466333, + "flos": 510048410112.0, + "grad_norm": 0.048965932841550305, + "language_loss": 0.85894716, + "learning_rate": 0.0002870765277833788, + "loss": 0.86964768, + "num_input_tokens_seen": 281097616, + "router_z_loss_mlp": 0.37011719, + "step": 3383, + "time_per_iteration": 2.7287330627441406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070639, + "balance_loss_mlp": 1.03366053, + "epoch": 0.6510196229318969, + "flos": 625329607680.0, + "grad_norm": 0.07719936634316926, + "language_loss": 0.80431008, + "learning_rate": 0.00028679468714261347, + "loss": 0.81501651, + "num_input_tokens_seen": 281170192, + "router_z_loss_mlp": 0.36938477, + "step": 3384, + "time_per_iteration": 2.73777437210083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068083, + "balance_loss_mlp": 1.03141391, + "epoch": 0.6512120046171604, + "flos": 474453725184.0, + "grad_norm": 0.05390133741953619, + "language_loss": 0.77104408, + "learning_rate": 0.0002865129292754918, + "loss": 0.78172493, + "num_input_tokens_seen": 281238832, + "router_z_loss_mlp": 0.36645508, + "step": 3385, + "time_per_iteration": 2.570709228515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107061, + "balance_loss_mlp": 1.03396475, + "epoch": 0.651404386302424, + "flos": 551561951232.0, + "grad_norm": 0.04665998226112413, + "language_loss": 0.81778049, + "learning_rate": 0.00028623125429140105, + "loss": 0.82848656, + "num_input_tokens_seen": 281319472, + "router_z_loss_mlp": 0.36621094, + "step": 3386, + "time_per_iteration": 2.8083431720733643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067177, + "balance_loss_mlp": 1.02964997, + "epoch": 0.6515967679876876, + "flos": 523047227904.0, + "grad_norm": 0.06778513311562764, + "language_loss": 0.86781728, + "learning_rate": 0.00028594966229969785, + "loss": 0.87848902, + "num_input_tokens_seen": 281391168, + "router_z_loss_mlp": 0.37524414, + "step": 3387, + "time_per_iteration": 2.652562379837036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068807, + "balance_loss_mlp": 1.03237641, + "epoch": 0.6517891496729511, + "flos": 573590412288.0, + "grad_norm": 0.04915205130547935, + "language_loss": 0.81361043, + "learning_rate": 0.00028566815340970577, + "loss": 0.82429844, + "num_input_tokens_seen": 281465664, + "router_z_loss_mlp": 0.36450195, + "step": 3388, + "time_per_iteration": 2.7212326526641846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069055, + "balance_loss_mlp": 1.0323149, + "epoch": 0.6519815313582147, + "flos": 555662135808.0, + "grad_norm": 0.05372700409854334, + "language_loss": 0.80874032, + "learning_rate": 0.0002853867277307162, + "loss": 0.81943083, + "num_input_tokens_seen": 281532928, + "router_z_loss_mlp": 0.36743164, + "step": 3389, + "time_per_iteration": 2.645580291748047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072292, + "balance_loss_mlp": 1.03564715, + "epoch": 0.6521739130434783, + "flos": 480229986816.0, + "grad_norm": 0.04994212123605962, + "language_loss": 0.82347226, + "learning_rate": 0.00028510538537198824, + "loss": 0.8341952, + "num_input_tokens_seen": 281601680, + "router_z_loss_mlp": 0.36669922, + "step": 3390, + "time_per_iteration": 2.6053972244262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071186, + "balance_loss_mlp": 1.03456497, + "epoch": 0.6523662947287419, + "flos": 665380887552.0, + "grad_norm": 0.052060213121620263, + "language_loss": 0.86389101, + "learning_rate": 0.00028482412644274867, + "loss": 0.87460279, + "num_input_tokens_seen": 281679488, + "router_z_loss_mlp": 0.36621094, + "step": 3391, + "time_per_iteration": 2.9146382808685303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071108, + "balance_loss_mlp": 1.03408146, + "epoch": 0.6525586764140053, + "flos": 548394499584.0, + "grad_norm": 0.05233101091155523, + "language_loss": 0.74427474, + "learning_rate": 0.00028454295105219207, + "loss": 0.75498581, + "num_input_tokens_seen": 281751056, + "router_z_loss_mlp": 0.37011719, + "step": 3392, + "time_per_iteration": 2.653144598007202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072074, + "balance_loss_mlp": 1.03457081, + "epoch": 0.6527510580992689, + "flos": 802529012736.0, + "grad_norm": 0.044337250552145664, + "language_loss": 0.7951991, + "learning_rate": 0.0002842618593094802, + "loss": 0.80591983, + "num_input_tokens_seen": 281841008, + "router_z_loss_mlp": 0.37475586, + "step": 3393, + "time_per_iteration": 3.1016182899475098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075529, + "balance_loss_mlp": 1.0390985, + "epoch": 0.6529434397845325, + "flos": 670864757760.0, + "grad_norm": 0.06313497545988733, + "language_loss": 0.80366606, + "learning_rate": 0.00028398085132374243, + "loss": 0.81442136, + "num_input_tokens_seen": 281908016, + "router_z_loss_mlp": 0.36425781, + "step": 3394, + "time_per_iteration": 2.81162691116333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070268, + "balance_loss_mlp": 1.03338432, + "epoch": 0.6531358214697961, + "flos": 828043610112.0, + "grad_norm": 0.05205360505405607, + "language_loss": 0.84108675, + "learning_rate": 0.0002836999272040761, + "loss": 0.85178936, + "num_input_tokens_seen": 281989072, + "router_z_loss_mlp": 0.36865234, + "step": 3395, + "time_per_iteration": 3.086585283279419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073433, + "balance_loss_mlp": 1.03607285, + "epoch": 0.6533282031550597, + "flos": 487157179392.0, + "grad_norm": 0.06347573427267852, + "language_loss": 0.8364076, + "learning_rate": 0.00028341908705954575, + "loss": 0.84714192, + "num_input_tokens_seen": 282053152, + "router_z_loss_mlp": 0.37353516, + "step": 3396, + "time_per_iteration": 2.63339900970459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101777, + "balance_loss_mlp": 1.00317848, + "epoch": 0.6535205848403232, + "flos": 1556908121088.0, + "grad_norm": 0.01725431962534194, + "language_loss": 0.81761813, + "learning_rate": 0.00028313833099918265, + "loss": 0.82779574, + "num_input_tokens_seen": 282283984, + "router_z_loss_mlp": 0.14550781, + "step": 3397, + "time_per_iteration": 4.886535167694092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107024, + "balance_loss_mlp": 1.03342795, + "epoch": 0.6537129665255867, + "flos": 493464531456.0, + "grad_norm": 0.0583640657945681, + "language_loss": 0.78047717, + "learning_rate": 0.00028285765913198604, + "loss": 0.79117954, + "num_input_tokens_seen": 282353008, + "router_z_loss_mlp": 0.36816406, + "step": 3398, + "time_per_iteration": 2.5336763858795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075265, + "balance_loss_mlp": 1.03771448, + "epoch": 0.6539053482108503, + "flos": 604718327808.0, + "grad_norm": 0.10018787672366053, + "language_loss": 0.81953001, + "learning_rate": 0.0002825770715669227, + "loss": 0.83028269, + "num_input_tokens_seen": 282427648, + "router_z_loss_mlp": 0.37548828, + "step": 3399, + "time_per_iteration": 2.7225871086120605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073476, + "balance_loss_mlp": 1.03656852, + "epoch": 0.6540977298961139, + "flos": 577504332288.0, + "grad_norm": 0.054796705255158284, + "language_loss": 0.81529284, + "learning_rate": 0.00028229656841292634, + "loss": 0.82602763, + "num_input_tokens_seen": 282502128, + "router_z_loss_mlp": 0.36938477, + "step": 3400, + "time_per_iteration": 2.7136409282684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074116, + "balance_loss_mlp": 1.03675604, + "epoch": 0.6542901115813774, + "flos": 511500496896.0, + "grad_norm": 0.09810959054820141, + "language_loss": 0.76415372, + "learning_rate": 0.0002820161497788979, + "loss": 0.77489489, + "num_input_tokens_seen": 282569360, + "router_z_loss_mlp": 0.37304688, + "step": 3401, + "time_per_iteration": 2.561142921447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107247, + "balance_loss_mlp": 1.03656387, + "epoch": 0.654482493266641, + "flos": 625201569792.0, + "grad_norm": 0.05065630966567836, + "language_loss": 0.86865586, + "learning_rate": 0.00028173581577370545, + "loss": 0.87938058, + "num_input_tokens_seen": 282645472, + "router_z_loss_mlp": 0.35913086, + "step": 3402, + "time_per_iteration": 2.771660327911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074844, + "balance_loss_mlp": 1.0377934, + "epoch": 0.6546748749519046, + "flos": 523712148480.0, + "grad_norm": 0.04769798618105731, + "language_loss": 0.78826487, + "learning_rate": 0.0002814555665061844, + "loss": 0.79901326, + "num_input_tokens_seen": 282717568, + "router_z_loss_mlp": 0.37011719, + "step": 3403, + "time_per_iteration": 2.6541905403137207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070804, + "balance_loss_mlp": 1.03351498, + "epoch": 0.6548672566371682, + "flos": 478945225728.0, + "grad_norm": 0.05625408135925951, + "language_loss": 0.77440852, + "learning_rate": 0.00028117540208513715, + "loss": 0.78511655, + "num_input_tokens_seen": 282791408, + "router_z_loss_mlp": 0.37280273, + "step": 3404, + "time_per_iteration": 2.7175214290618896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070835, + "balance_loss_mlp": 1.03428507, + "epoch": 0.6550596383224317, + "flos": 615732558336.0, + "grad_norm": 0.05404961750978507, + "language_loss": 0.84969914, + "learning_rate": 0.00028089532261933313, + "loss": 0.86040747, + "num_input_tokens_seen": 282862992, + "router_z_loss_mlp": 0.36523438, + "step": 3405, + "time_per_iteration": 2.6872446537017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079163, + "balance_loss_mlp": 1.04197001, + "epoch": 0.6552520200076952, + "flos": 488594709504.0, + "grad_norm": 0.0680253030817501, + "language_loss": 0.85329425, + "learning_rate": 0.0002806153282175087, + "loss": 0.86408579, + "num_input_tokens_seen": 282930448, + "router_z_loss_mlp": 0.37182617, + "step": 3406, + "time_per_iteration": 2.573843479156494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069625, + "balance_loss_mlp": 1.0329802, + "epoch": 0.6554444016929588, + "flos": 687310424064.0, + "grad_norm": 0.0894093410202252, + "language_loss": 0.82802272, + "learning_rate": 0.0002803354189883679, + "loss": 0.83871901, + "num_input_tokens_seen": 283010864, + "router_z_loss_mlp": 0.36669922, + "step": 3407, + "time_per_iteration": 2.824995279312134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076204, + "balance_loss_mlp": 1.04017901, + "epoch": 0.6556367833782224, + "flos": 542772417024.0, + "grad_norm": 0.05173629873734528, + "language_loss": 0.85629022, + "learning_rate": 0.00028005559504058053, + "loss": 0.86705232, + "num_input_tokens_seen": 283082240, + "router_z_loss_mlp": 0.3605957, + "step": 3408, + "time_per_iteration": 2.709195852279663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074603, + "balance_loss_mlp": 1.03860188, + "epoch": 0.655829165063486, + "flos": 673237840896.0, + "grad_norm": 0.05391320536337509, + "language_loss": 0.76764786, + "learning_rate": 0.0002797758564827838, + "loss": 0.77839386, + "num_input_tokens_seen": 283156656, + "router_z_loss_mlp": 0.36010742, + "step": 3409, + "time_per_iteration": 2.7769269943237305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073177, + "balance_loss_mlp": 1.03624606, + "epoch": 0.6560215467487496, + "flos": 531550162944.0, + "grad_norm": 0.059937965776424594, + "language_loss": 0.8368215, + "learning_rate": 0.0002794962034235824, + "loss": 0.84755325, + "num_input_tokens_seen": 283223584, + "router_z_loss_mlp": 0.36889648, + "step": 3410, + "time_per_iteration": 2.599886417388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072982, + "balance_loss_mlp": 1.03588414, + "epoch": 0.656213928434013, + "flos": 591025476096.0, + "grad_norm": 0.13531884717327836, + "language_loss": 0.74423587, + "learning_rate": 0.00027921663597154695, + "loss": 0.75496566, + "num_input_tokens_seen": 283297680, + "router_z_loss_mlp": 0.37084961, + "step": 3411, + "time_per_iteration": 2.7206108570098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107222, + "balance_loss_mlp": 1.03686285, + "epoch": 0.6564063101192766, + "flos": 415564756992.0, + "grad_norm": 0.08609193384147822, + "language_loss": 0.80696797, + "learning_rate": 0.00027893715423521525, + "loss": 0.81769013, + "num_input_tokens_seen": 283359744, + "router_z_loss_mlp": 0.35375977, + "step": 3412, + "time_per_iteration": 2.4493868350982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067552, + "balance_loss_mlp": 1.03183699, + "epoch": 0.6565986918045402, + "flos": 453084392448.0, + "grad_norm": 0.05044036578156056, + "language_loss": 0.8354848, + "learning_rate": 0.00027865775832309163, + "loss": 0.84616029, + "num_input_tokens_seen": 283430688, + "router_z_loss_mlp": 0.35742188, + "step": 3413, + "time_per_iteration": 2.665999174118042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074779, + "balance_loss_mlp": 1.03899264, + "epoch": 0.6567910734898038, + "flos": 547483677696.0, + "grad_norm": 0.060493690389786, + "language_loss": 0.85984117, + "learning_rate": 0.00027837844834364733, + "loss": 0.87058896, + "num_input_tokens_seen": 283498048, + "router_z_loss_mlp": 0.35839844, + "step": 3414, + "time_per_iteration": 2.6195499897003174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072987, + "balance_loss_mlp": 1.03677094, + "epoch": 0.6569834551750673, + "flos": 655207667712.0, + "grad_norm": 0.11318049634335087, + "language_loss": 0.86511016, + "learning_rate": 0.00027809922440532, + "loss": 0.87583995, + "num_input_tokens_seen": 283573040, + "router_z_loss_mlp": 0.36254883, + "step": 3415, + "time_per_iteration": 2.823486566543579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072977, + "balance_loss_mlp": 1.03664172, + "epoch": 0.6571758368603309, + "flos": 539399761920.0, + "grad_norm": 0.08390902906870049, + "language_loss": 0.80793774, + "learning_rate": 0.00027782008661651406, + "loss": 0.81866741, + "num_input_tokens_seen": 283651696, + "router_z_loss_mlp": 0.36352539, + "step": 3416, + "time_per_iteration": 2.762639045715332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071461, + "balance_loss_mlp": 1.03588891, + "epoch": 0.6573682185455945, + "flos": 497088880128.0, + "grad_norm": 0.049698407396127284, + "language_loss": 0.87283665, + "learning_rate": 0.00027754103508560013, + "loss": 0.8835513, + "num_input_tokens_seen": 283721824, + "router_z_loss_mlp": 0.35620117, + "step": 3417, + "time_per_iteration": 2.5768332481384277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070469, + "balance_loss_mlp": 1.03389549, + "epoch": 0.657560600230858, + "flos": 447244111872.0, + "grad_norm": 0.06621650904732551, + "language_loss": 0.8256399, + "learning_rate": 0.0002772620699209163, + "loss": 0.83634454, + "num_input_tokens_seen": 283786960, + "router_z_loss_mlp": 0.36572266, + "step": 3418, + "time_per_iteration": 2.5885636806488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072181, + "balance_loss_mlp": 1.03606033, + "epoch": 0.6577529819161216, + "flos": 481696630272.0, + "grad_norm": 0.053979947748841836, + "language_loss": 0.80128914, + "learning_rate": 0.0002769831912307658, + "loss": 0.81201094, + "num_input_tokens_seen": 283853808, + "router_z_loss_mlp": 0.36157227, + "step": 3419, + "time_per_iteration": 2.51863169670105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010709, + "balance_loss_mlp": 1.0346607, + "epoch": 0.6579453636013851, + "flos": 530589878784.0, + "grad_norm": 0.061422994023147534, + "language_loss": 0.80013275, + "learning_rate": 0.00027670439912341917, + "loss": 0.81084168, + "num_input_tokens_seen": 283920960, + "router_z_loss_mlp": 0.36254883, + "step": 3420, + "time_per_iteration": 2.595789670944214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067388, + "balance_loss_mlp": 1.03117275, + "epoch": 0.6581377452866487, + "flos": 627737596416.0, + "grad_norm": 0.0471415503067176, + "language_loss": 0.8344667, + "learning_rate": 0.0002764256937071129, + "loss": 0.84514058, + "num_input_tokens_seen": 283992416, + "router_z_loss_mlp": 0.36230469, + "step": 3421, + "time_per_iteration": 2.7812321186065674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075792, + "balance_loss_mlp": 1.03886116, + "epoch": 0.6583301269719123, + "flos": 548355211776.0, + "grad_norm": 0.05116368726028845, + "language_loss": 0.86894339, + "learning_rate": 0.00027614707509005036, + "loss": 0.87970132, + "num_input_tokens_seen": 284061760, + "router_z_loss_mlp": 0.36889648, + "step": 3422, + "time_per_iteration": 2.6573753356933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069799, + "balance_loss_mlp": 1.03401232, + "epoch": 0.6585225086571759, + "flos": 427268639232.0, + "grad_norm": 0.053946906539649876, + "language_loss": 0.7900126, + "learning_rate": 0.0002758685433804008, + "loss": 0.80071056, + "num_input_tokens_seen": 284124848, + "router_z_loss_mlp": 0.35839844, + "step": 3423, + "time_per_iteration": 2.4556972980499268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075371, + "balance_loss_mlp": 1.03758192, + "epoch": 0.6587148903424394, + "flos": 859264657920.0, + "grad_norm": 0.05746906751203771, + "language_loss": 0.79022425, + "learning_rate": 0.00027559009868630005, + "loss": 0.80097795, + "num_input_tokens_seen": 284206272, + "router_z_loss_mlp": 0.37768555, + "step": 3424, + "time_per_iteration": 3.0918102264404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068275, + "balance_loss_mlp": 1.03067625, + "epoch": 0.6589072720277029, + "flos": 805280417280.0, + "grad_norm": 0.05909134726698472, + "language_loss": 0.7990104, + "learning_rate": 0.0002753117411158491, + "loss": 0.8096931, + "num_input_tokens_seen": 284293696, + "router_z_loss_mlp": 0.37573242, + "step": 3425, + "time_per_iteration": 3.0557546615600586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074885, + "balance_loss_mlp": 1.03769183, + "epoch": 0.6590996537129665, + "flos": 548355211776.0, + "grad_norm": 0.0487398796366246, + "language_loss": 0.89624393, + "learning_rate": 0.0002750334707771168, + "loss": 0.90699285, + "num_input_tokens_seen": 284360192, + "router_z_loss_mlp": 0.37158203, + "step": 3426, + "time_per_iteration": 2.6186933517456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107782, + "balance_loss_mlp": 1.03991175, + "epoch": 0.6592920353982301, + "flos": 453931195392.0, + "grad_norm": 0.09520851451243123, + "language_loss": 0.81130987, + "learning_rate": 0.0002747552877781369, + "loss": 0.82208812, + "num_input_tokens_seen": 284423680, + "router_z_loss_mlp": 0.37866211, + "step": 3427, + "time_per_iteration": 2.4979238510131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068038, + "balance_loss_mlp": 1.03086865, + "epoch": 0.6594844170834937, + "flos": 566903328768.0, + "grad_norm": 0.04689884727267459, + "language_loss": 0.81804323, + "learning_rate": 0.0002744771922269097, + "loss": 0.82872361, + "num_input_tokens_seen": 284495712, + "router_z_loss_mlp": 0.37158203, + "step": 3428, + "time_per_iteration": 2.740729808807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075113, + "balance_loss_mlp": 1.03768158, + "epoch": 0.6596767987687572, + "flos": 1187452016640.0, + "grad_norm": 0.05881296297664234, + "language_loss": 0.81886125, + "learning_rate": 0.0002741991842314015, + "loss": 0.82961237, + "num_input_tokens_seen": 284583440, + "router_z_loss_mlp": 0.37426758, + "step": 3429, + "time_per_iteration": 3.4745006561279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071953, + "balance_loss_mlp": 1.03506947, + "epoch": 0.6598691804540208, + "flos": 503247845376.0, + "grad_norm": 0.05507751278667406, + "language_loss": 0.85868287, + "learning_rate": 0.0002739212638995445, + "loss": 0.86940235, + "num_input_tokens_seen": 284649168, + "router_z_loss_mlp": 0.3684082, + "step": 3430, + "time_per_iteration": 2.532402515411377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070704, + "balance_loss_mlp": 1.033463, + "epoch": 0.6600615621392844, + "flos": 531072916992.0, + "grad_norm": 0.05565442756862113, + "language_loss": 0.83027416, + "learning_rate": 0.00027364343133923696, + "loss": 0.84098119, + "num_input_tokens_seen": 284723136, + "router_z_loss_mlp": 0.37231445, + "step": 3431, + "time_per_iteration": 2.630985736846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077517, + "balance_loss_mlp": 1.0396086, + "epoch": 0.6602539438245479, + "flos": 565170435072.0, + "grad_norm": 0.06720345334853779, + "language_loss": 0.82615936, + "learning_rate": 0.0002733656866583431, + "loss": 0.83693457, + "num_input_tokens_seen": 284792752, + "router_z_loss_mlp": 0.37890625, + "step": 3432, + "time_per_iteration": 2.6693778038024902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072775, + "balance_loss_mlp": 1.0354147, + "epoch": 0.6604463255098114, + "flos": 856802824704.0, + "grad_norm": 0.05437523875977016, + "language_loss": 0.82810867, + "learning_rate": 0.0002730880299646927, + "loss": 0.83883643, + "num_input_tokens_seen": 284871008, + "router_z_loss_mlp": 0.37329102, + "step": 3433, + "time_per_iteration": 3.047272205352783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072216, + "balance_loss_mlp": 1.03540444, + "epoch": 0.660638707195075, + "flos": 674158837248.0, + "grad_norm": 0.05169361023924996, + "language_loss": 0.85458863, + "learning_rate": 0.0002728104613660821, + "loss": 0.86531085, + "num_input_tokens_seen": 284945184, + "router_z_loss_mlp": 0.36791992, + "step": 3434, + "time_per_iteration": 2.8202831745147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010658, + "balance_loss_mlp": 1.02879786, + "epoch": 0.6608310888803386, + "flos": 888572339712.0, + "grad_norm": 0.05115304739976813, + "language_loss": 0.83194226, + "learning_rate": 0.0002725329809702729, + "loss": 0.84260029, + "num_input_tokens_seen": 285029296, + "router_z_loss_mlp": 0.36962891, + "step": 3435, + "time_per_iteration": 3.228891134262085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071505, + "balance_loss_mlp": 1.03376281, + "epoch": 0.6610234705656022, + "flos": 1135909260288.0, + "grad_norm": 0.06628416389045559, + "language_loss": 0.75631964, + "learning_rate": 0.0002722555888849921, + "loss": 0.76703465, + "num_input_tokens_seen": 285124720, + "router_z_loss_mlp": 0.37695312, + "step": 3436, + "time_per_iteration": 3.422288179397583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068961, + "balance_loss_mlp": 1.03212583, + "epoch": 0.6612158522508658, + "flos": 467776816128.0, + "grad_norm": 0.05048111401896507, + "language_loss": 0.80400562, + "learning_rate": 0.00027197828521793334, + "loss": 0.81469518, + "num_input_tokens_seen": 285191360, + "router_z_loss_mlp": 0.36816406, + "step": 3437, + "time_per_iteration": 2.4787607192993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073991, + "balance_loss_mlp": 1.03686941, + "epoch": 0.6614082339361292, + "flos": 571374480384.0, + "grad_norm": 0.05876416837727376, + "language_loss": 0.84865153, + "learning_rate": 0.0002717010700767552, + "loss": 0.85939145, + "num_input_tokens_seen": 285262624, + "router_z_loss_mlp": 0.37109375, + "step": 3438, + "time_per_iteration": 2.740835189819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071354, + "balance_loss_mlp": 1.03444707, + "epoch": 0.6616006156213928, + "flos": 498220872192.0, + "grad_norm": 0.06865546708014894, + "language_loss": 0.75838953, + "learning_rate": 0.00027142394356908226, + "loss": 0.76910305, + "num_input_tokens_seen": 285328512, + "router_z_loss_mlp": 0.36889648, + "step": 3439, + "time_per_iteration": 2.5476725101470947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067365, + "balance_loss_mlp": 1.03021967, + "epoch": 0.6617929973066564, + "flos": 602124074496.0, + "grad_norm": 0.05819778232686783, + "language_loss": 0.85115051, + "learning_rate": 0.00027114690580250456, + "loss": 0.86182415, + "num_input_tokens_seen": 285406128, + "router_z_loss_mlp": 0.37133789, + "step": 3440, + "time_per_iteration": 2.746610403060913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072245, + "balance_loss_mlp": 1.03562403, + "epoch": 0.66198537899192, + "flos": 522731515392.0, + "grad_norm": 0.053821887104205664, + "language_loss": 0.86748421, + "learning_rate": 0.0002708699568845776, + "loss": 0.87820661, + "num_input_tokens_seen": 285474704, + "router_z_loss_mlp": 0.36621094, + "step": 3441, + "time_per_iteration": 2.6001980304718018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046259, + "balance_loss_mlp": 1.0328126, + "epoch": 0.6621777606771835, + "flos": 1565421230592.0, + "grad_norm": 0.030021604030083596, + "language_loss": 0.79287779, + "learning_rate": 0.00027059309692282265, + "loss": 0.80334044, + "num_input_tokens_seen": 285698704, + "router_z_loss_mlp": 0.13476562, + "step": 3442, + "time_per_iteration": 4.909358263015747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075884, + "balance_loss_mlp": 1.03933442, + "epoch": 0.6623701423624471, + "flos": 526409708544.0, + "grad_norm": 0.050122845180299073, + "language_loss": 0.83157456, + "learning_rate": 0.0002703163260247261, + "loss": 0.84233344, + "num_input_tokens_seen": 285767936, + "router_z_loss_mlp": 0.36547852, + "step": 3443, + "time_per_iteration": 2.600733757019043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074894, + "balance_loss_mlp": 1.03853548, + "epoch": 0.6625625240477107, + "flos": 527921432064.0, + "grad_norm": 0.07644437952185021, + "language_loss": 0.81613672, + "learning_rate": 0.0002700396442977399, + "loss": 0.8268857, + "num_input_tokens_seen": 285839456, + "router_z_loss_mlp": 0.36376953, + "step": 3444, + "time_per_iteration": 2.598722457885742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080077, + "balance_loss_mlp": 1.04312193, + "epoch": 0.6627549057329742, + "flos": 472854661632.0, + "grad_norm": 0.05132438186678615, + "language_loss": 0.84284377, + "learning_rate": 0.0002697630518492817, + "loss": 0.85364461, + "num_input_tokens_seen": 285905904, + "router_z_loss_mlp": 0.36938477, + "step": 3445, + "time_per_iteration": 2.6794075965881348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078368, + "balance_loss_mlp": 1.04253387, + "epoch": 0.6629472874182378, + "flos": 527743931904.0, + "grad_norm": 0.05491144350541831, + "language_loss": 0.8564226, + "learning_rate": 0.0002694865487867343, + "loss": 0.86720634, + "num_input_tokens_seen": 285975520, + "router_z_loss_mlp": 0.35888672, + "step": 3446, + "time_per_iteration": 2.643427848815918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081911, + "balance_loss_mlp": 1.04540932, + "epoch": 0.6631396691035013, + "flos": 612906960384.0, + "grad_norm": 0.04980385474467639, + "language_loss": 0.84496373, + "learning_rate": 0.0002692101352174453, + "loss": 0.85578281, + "num_input_tokens_seen": 286050320, + "router_z_loss_mlp": 0.36499023, + "step": 3447, + "time_per_iteration": 2.750990629196167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077753, + "balance_loss_mlp": 1.04106009, + "epoch": 0.6633320507887649, + "flos": 609041092608.0, + "grad_norm": 0.05216047224803115, + "language_loss": 0.8459692, + "learning_rate": 0.00026893381124872787, + "loss": 0.85674667, + "num_input_tokens_seen": 286120672, + "router_z_loss_mlp": 0.3671875, + "step": 3448, + "time_per_iteration": 2.7701821327209473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074873, + "balance_loss_mlp": 1.03839493, + "epoch": 0.6635244324740285, + "flos": 749342112768.0, + "grad_norm": 0.05521376247242365, + "language_loss": 0.80839992, + "learning_rate": 0.00026865757698786097, + "loss": 0.81914866, + "num_input_tokens_seen": 286201152, + "router_z_loss_mlp": 0.36499023, + "step": 3449, + "time_per_iteration": 3.046751022338867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079305, + "balance_loss_mlp": 1.04382825, + "epoch": 0.6637168141592921, + "flos": 664222754304.0, + "grad_norm": 0.05057031991468663, + "language_loss": 0.8206256, + "learning_rate": 0.000268381432542088, + "loss": 0.83141863, + "num_input_tokens_seen": 286274512, + "router_z_loss_mlp": 0.35546875, + "step": 3450, + "time_per_iteration": 2.7903122901916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078156, + "balance_loss_mlp": 1.04117751, + "epoch": 0.6639091958445555, + "flos": 606500683776.0, + "grad_norm": 0.05221239612866202, + "language_loss": 0.7978282, + "learning_rate": 0.00026810537801861807, + "loss": 0.80860978, + "num_input_tokens_seen": 286349808, + "router_z_loss_mlp": 0.36938477, + "step": 3451, + "time_per_iteration": 2.7744555473327637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078626, + "balance_loss_mlp": 1.04200482, + "epoch": 0.6641015775298191, + "flos": 476452869120.0, + "grad_norm": 0.04982593193554921, + "language_loss": 0.81320304, + "learning_rate": 0.0002678294135246243, + "loss": 0.82398927, + "num_input_tokens_seen": 286422912, + "router_z_loss_mlp": 0.36621094, + "step": 3452, + "time_per_iteration": 2.748623847961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107526, + "balance_loss_mlp": 1.03902042, + "epoch": 0.6642939592150827, + "flos": 903746391552.0, + "grad_norm": 0.05075048748752087, + "language_loss": 0.86122698, + "learning_rate": 0.0002675535391672463, + "loss": 0.87197959, + "num_input_tokens_seen": 286501072, + "router_z_loss_mlp": 0.36230469, + "step": 3453, + "time_per_iteration": 3.0941269397735596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075595, + "balance_loss_mlp": 1.03995168, + "epoch": 0.6644863409003463, + "flos": 581527351296.0, + "grad_norm": 0.04705931875685086, + "language_loss": 0.85942483, + "learning_rate": 0.0002672777550535877, + "loss": 0.87018085, + "num_input_tokens_seen": 286580480, + "router_z_loss_mlp": 0.35668945, + "step": 3454, + "time_per_iteration": 2.782492160797119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077961, + "balance_loss_mlp": 1.04222202, + "epoch": 0.6646787225856099, + "flos": 478761933312.0, + "grad_norm": 0.05883776733050642, + "language_loss": 0.84943002, + "learning_rate": 0.00026700206129071747, + "loss": 0.86020958, + "num_input_tokens_seen": 286646208, + "router_z_loss_mlp": 0.35791016, + "step": 3455, + "time_per_iteration": 2.524601697921753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074329, + "balance_loss_mlp": 1.0389235, + "epoch": 0.6648711042708734, + "flos": 449676831744.0, + "grad_norm": 0.058012568255648024, + "language_loss": 0.88879943, + "learning_rate": 0.00026672645798566925, + "loss": 0.89954275, + "num_input_tokens_seen": 286710624, + "router_z_loss_mlp": 0.35449219, + "step": 3456, + "time_per_iteration": 2.532412528991699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072913, + "balance_loss_mlp": 1.03745985, + "epoch": 0.665063485956137, + "flos": 858553095168.0, + "grad_norm": 0.053261627047558845, + "language_loss": 0.79371452, + "learning_rate": 0.00026645094524544225, + "loss": 0.8044436, + "num_input_tokens_seen": 286799472, + "router_z_loss_mlp": 0.35473633, + "step": 3457, + "time_per_iteration": 3.2936151027679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068323, + "balance_loss_mlp": 1.03229845, + "epoch": 0.6652558676414005, + "flos": 604024293888.0, + "grad_norm": 0.04836928796010222, + "language_loss": 0.75254017, + "learning_rate": 0.00026617552317699945, + "loss": 0.76322341, + "num_input_tokens_seen": 286874752, + "router_z_loss_mlp": 0.36035156, + "step": 3458, + "time_per_iteration": 2.781972646713257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072016, + "balance_loss_mlp": 1.03651559, + "epoch": 0.6654482493266641, + "flos": 510141542400.0, + "grad_norm": 0.05402195072483101, + "language_loss": 0.87006921, + "learning_rate": 0.0002659001918872693, + "loss": 0.88078934, + "num_input_tokens_seen": 286943312, + "router_z_loss_mlp": 0.35546875, + "step": 3459, + "time_per_iteration": 2.586364507675171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073402, + "balance_loss_mlp": 1.03790104, + "epoch": 0.6656406310119277, + "flos": 565342142976.0, + "grad_norm": 0.06009221273725258, + "language_loss": 0.80872095, + "learning_rate": 0.0002656249514831449, + "loss": 0.81945497, + "num_input_tokens_seen": 287010000, + "router_z_loss_mlp": 0.35522461, + "step": 3460, + "time_per_iteration": 2.6385302543640137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072951, + "balance_loss_mlp": 1.03652048, + "epoch": 0.6658330126971912, + "flos": 1023859533312.0, + "grad_norm": 0.05794846268474579, + "language_loss": 0.86832029, + "learning_rate": 0.00026534980207148416, + "loss": 0.87904978, + "num_input_tokens_seen": 287101456, + "router_z_loss_mlp": 0.36425781, + "step": 3461, + "time_per_iteration": 3.388073205947876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074265, + "balance_loss_mlp": 1.03869295, + "epoch": 0.6660253943824548, + "flos": 816472147968.0, + "grad_norm": 0.06339025189442228, + "language_loss": 0.7302506, + "learning_rate": 0.0002650747437591097, + "loss": 0.74099326, + "num_input_tokens_seen": 287182848, + "router_z_loss_mlp": 0.35595703, + "step": 3462, + "time_per_iteration": 2.980158567428589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021261, + "balance_loss_mlp": 1.00810075, + "epoch": 0.6662177760677184, + "flos": 1495331767296.0, + "grad_norm": 0.02097535909927297, + "language_loss": 0.8187958, + "learning_rate": 0.00026479977665280806, + "loss": 0.82900834, + "num_input_tokens_seen": 287417920, + "router_z_loss_mlp": 0.13183594, + "step": 3463, + "time_per_iteration": 5.0071799755096436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070948, + "balance_loss_mlp": 1.0354948, + "epoch": 0.666410157752982, + "flos": 499875190272.0, + "grad_norm": 0.04521050671951116, + "language_loss": 0.86503369, + "learning_rate": 0.00026452490085933155, + "loss": 0.87574315, + "num_input_tokens_seen": 287483776, + "router_z_loss_mlp": 0.35473633, + "step": 3464, + "time_per_iteration": 2.5450592041015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067553, + "balance_loss_mlp": 1.03212357, + "epoch": 0.6666025394382454, + "flos": 480928402944.0, + "grad_norm": 0.05339724932754041, + "language_loss": 0.89435887, + "learning_rate": 0.00026425011648539614, + "loss": 0.90503436, + "num_input_tokens_seen": 287548176, + "router_z_loss_mlp": 0.35424805, + "step": 3465, + "time_per_iteration": 2.5414719581604004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070322, + "balance_loss_mlp": 1.03377271, + "epoch": 0.666794921123509, + "flos": 546395355648.0, + "grad_norm": 0.05247467659401075, + "language_loss": 0.82117605, + "learning_rate": 0.00026397542363768267, + "loss": 0.83187926, + "num_input_tokens_seen": 287618496, + "router_z_loss_mlp": 0.36547852, + "step": 3466, + "time_per_iteration": 2.659952402114868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071457, + "balance_loss_mlp": 1.03533673, + "epoch": 0.6669873028087726, + "flos": 471750372864.0, + "grad_norm": 0.052441453711620734, + "language_loss": 0.81731021, + "learning_rate": 0.0002637008224228362, + "loss": 0.82802474, + "num_input_tokens_seen": 287684032, + "router_z_loss_mlp": 0.36132812, + "step": 3467, + "time_per_iteration": 2.5569608211517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073874, + "balance_loss_mlp": 1.03875458, + "epoch": 0.6671796844940362, + "flos": 547119912960.0, + "grad_norm": 0.04638174393206939, + "language_loss": 0.84333348, + "learning_rate": 0.00026342631294746653, + "loss": 0.85407221, + "num_input_tokens_seen": 287757680, + "router_z_loss_mlp": 0.3515625, + "step": 3468, + "time_per_iteration": 2.7492995262145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068836, + "balance_loss_mlp": 1.03300142, + "epoch": 0.6673720661792998, + "flos": 1069867547136.0, + "grad_norm": 0.06886465160601114, + "language_loss": 0.80601752, + "learning_rate": 0.0002631518953181476, + "loss": 0.81670582, + "num_input_tokens_seen": 287848992, + "router_z_loss_mlp": 0.35839844, + "step": 3469, + "time_per_iteration": 3.4849367141723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017385, + "balance_loss_mlp": 1.0047015, + "epoch": 0.6675644478645633, + "flos": 1522963372032.0, + "grad_norm": 0.011284556376000376, + "language_loss": 0.76325285, + "learning_rate": 0.000262877569641418, + "loss": 0.77342671, + "num_input_tokens_seen": 288085680, + "router_z_loss_mlp": 0.12695312, + "step": 3470, + "time_per_iteration": 4.8896119594573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073777, + "balance_loss_mlp": 1.03775215, + "epoch": 0.6677568295498268, + "flos": 579410343936.0, + "grad_norm": 0.05100561036949307, + "language_loss": 0.8019954, + "learning_rate": 0.00026260333602377985, + "loss": 0.81273311, + "num_input_tokens_seen": 288161568, + "router_z_loss_mlp": 0.36035156, + "step": 3471, + "time_per_iteration": 2.7527613639831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069612, + "balance_loss_mlp": 1.03370583, + "epoch": 0.6679492112350904, + "flos": 383722458624.0, + "grad_norm": 0.06457573009444674, + "language_loss": 0.86992371, + "learning_rate": 0.0002623291945717007, + "loss": 0.88061988, + "num_input_tokens_seen": 288224032, + "router_z_loss_mlp": 0.35913086, + "step": 3472, + "time_per_iteration": 2.4496309757232666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067849, + "balance_loss_mlp": 1.03158569, + "epoch": 0.668141592920354, + "flos": 1150297555968.0, + "grad_norm": 0.0483341926082761, + "language_loss": 0.83728033, + "learning_rate": 0.00026205514539161175, + "loss": 0.84795886, + "num_input_tokens_seen": 288312912, + "router_z_loss_mlp": 0.36254883, + "step": 3473, + "time_per_iteration": 3.518329620361328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072278, + "balance_loss_mlp": 1.03682494, + "epoch": 0.6683339746056175, + "flos": 560804000256.0, + "grad_norm": 0.054398972389199884, + "language_loss": 0.84145987, + "learning_rate": 0.00026178118858990773, + "loss": 0.85218263, + "num_input_tokens_seen": 288394224, + "router_z_loss_mlp": 0.35449219, + "step": 3474, + "time_per_iteration": 2.848719596862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068236, + "balance_loss_mlp": 1.0318768, + "epoch": 0.6685263562908811, + "flos": 514051080192.0, + "grad_norm": 0.060039795644517814, + "language_loss": 0.84093618, + "learning_rate": 0.0002615073242729483, + "loss": 0.85161853, + "num_input_tokens_seen": 288462976, + "router_z_loss_mlp": 0.36352539, + "step": 3475, + "time_per_iteration": 2.648353099822998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070134, + "balance_loss_mlp": 1.03382277, + "epoch": 0.6687187379761447, + "flos": 629466107904.0, + "grad_norm": 0.05046564119076302, + "language_loss": 0.84281248, + "learning_rate": 0.0002612335525470573, + "loss": 0.85351384, + "num_input_tokens_seen": 288542032, + "router_z_loss_mlp": 0.36352539, + "step": 3476, + "time_per_iteration": 2.792809247970581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065913, + "balance_loss_mlp": 1.03096104, + "epoch": 0.6689111196614083, + "flos": 535312723968.0, + "grad_norm": 0.05473638804270082, + "language_loss": 0.78341687, + "learning_rate": 0.0002609598735185221, + "loss": 0.79407597, + "num_input_tokens_seen": 288610704, + "router_z_loss_mlp": 0.35009766, + "step": 3477, + "time_per_iteration": 2.64404559135437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070177, + "balance_loss_mlp": 1.03489089, + "epoch": 0.6691035013466718, + "flos": 602758471680.0, + "grad_norm": 0.0937067542198485, + "language_loss": 0.82979453, + "learning_rate": 0.00026068628729359445, + "loss": 0.84049624, + "num_input_tokens_seen": 288686080, + "router_z_loss_mlp": 0.35327148, + "step": 3478, + "time_per_iteration": 2.749631404876709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071211, + "balance_loss_mlp": 1.03640211, + "epoch": 0.6692958830319353, + "flos": 632539017216.0, + "grad_norm": 0.04937335272714273, + "language_loss": 0.7616291, + "learning_rate": 0.00026041279397848996, + "loss": 0.77234125, + "num_input_tokens_seen": 288764944, + "router_z_loss_mlp": 0.34838867, + "step": 3479, + "time_per_iteration": 2.839651584625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072082, + "balance_loss_mlp": 1.03693914, + "epoch": 0.6694882647171989, + "flos": 645153721344.0, + "grad_norm": 0.04802288968176994, + "language_loss": 0.8253727, + "learning_rate": 0.00026013939367938797, + "loss": 0.83609354, + "num_input_tokens_seen": 288847856, + "router_z_loss_mlp": 0.35180664, + "step": 3480, + "time_per_iteration": 2.8756163120269775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072592, + "balance_loss_mlp": 1.03861761, + "epoch": 0.6696806464024625, + "flos": 569292378624.0, + "grad_norm": 0.05111387659739007, + "language_loss": 0.81035048, + "learning_rate": 0.00025986608650243204, + "loss": 0.82107639, + "num_input_tokens_seen": 288929360, + "router_z_loss_mlp": 0.34008789, + "step": 3481, + "time_per_iteration": 2.780930757522583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107265, + "balance_loss_mlp": 1.03762627, + "epoch": 0.6698730280877261, + "flos": 622386146304.0, + "grad_norm": 0.11620710974574953, + "language_loss": 0.79299992, + "learning_rate": 0.0002595928725537293, + "loss": 0.80372643, + "num_input_tokens_seen": 289010160, + "router_z_loss_mlp": 0.35058594, + "step": 3482, + "time_per_iteration": 2.8551175594329834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071879, + "balance_loss_mlp": 1.03642654, + "epoch": 0.6700654097729896, + "flos": 502258447872.0, + "grad_norm": 0.05059450730585095, + "language_loss": 0.88189447, + "learning_rate": 0.0002593197519393509, + "loss": 0.89261329, + "num_input_tokens_seen": 289077392, + "router_z_loss_mlp": 0.35449219, + "step": 3483, + "time_per_iteration": 2.556617021560669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070709, + "balance_loss_mlp": 1.03637671, + "epoch": 0.6702577914582531, + "flos": 623567600640.0, + "grad_norm": 0.05152577773762556, + "language_loss": 0.79466176, + "learning_rate": 0.00025904672476533165, + "loss": 0.8053689, + "num_input_tokens_seen": 289157248, + "router_z_loss_mlp": 0.34375, + "step": 3484, + "time_per_iteration": 2.8806934356689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072584, + "balance_loss_mlp": 1.03794122, + "epoch": 0.6704501731435167, + "flos": 456033646080.0, + "grad_norm": 0.06330154522458538, + "language_loss": 0.82820839, + "learning_rate": 0.0002587737911376704, + "loss": 0.83893424, + "num_input_tokens_seen": 289224864, + "router_z_loss_mlp": 0.34643555, + "step": 3485, + "time_per_iteration": 2.6385717391967773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073373, + "balance_loss_mlp": 1.03789639, + "epoch": 0.6706425548287803, + "flos": 542973238272.0, + "grad_norm": 0.04882372942075566, + "language_loss": 0.83671743, + "learning_rate": 0.00025850095116232885, + "loss": 0.84745121, + "num_input_tokens_seen": 289293488, + "router_z_loss_mlp": 0.35498047, + "step": 3486, + "time_per_iteration": 2.6404170989990234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073473, + "balance_loss_mlp": 1.03873491, + "epoch": 0.6708349365140439, + "flos": 633631721472.0, + "grad_norm": 0.0500263981223685, + "language_loss": 0.77869016, + "learning_rate": 0.000258228204945233, + "loss": 0.7894249, + "num_input_tokens_seen": 289370560, + "router_z_loss_mlp": 0.34765625, + "step": 3487, + "time_per_iteration": 2.934980630874634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107374, + "balance_loss_mlp": 1.03964591, + "epoch": 0.6710273181993074, + "flos": 640459989504.0, + "grad_norm": 0.05519065712818486, + "language_loss": 0.84700072, + "learning_rate": 0.00025795555259227254, + "loss": 0.85773814, + "num_input_tokens_seen": 289440096, + "router_z_loss_mlp": 0.34130859, + "step": 3488, + "time_per_iteration": 2.7644948959350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071185, + "balance_loss_mlp": 1.03720999, + "epoch": 0.671219699884571, + "flos": 553673166336.0, + "grad_norm": 0.13608492094864486, + "language_loss": 0.8373906, + "learning_rate": 0.00025768299420930046, + "loss": 0.84810245, + "num_input_tokens_seen": 289515808, + "router_z_loss_mlp": 0.33984375, + "step": 3489, + "time_per_iteration": 2.718442916870117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072555, + "balance_loss_mlp": 1.03700686, + "epoch": 0.6714120815698346, + "flos": 731191256064.0, + "grad_norm": 0.05259417787616518, + "language_loss": 0.83743513, + "learning_rate": 0.0002574105299021332, + "loss": 0.84816062, + "num_input_tokens_seen": 289591344, + "router_z_loss_mlp": 0.35571289, + "step": 3490, + "time_per_iteration": 2.8551361560821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069241, + "balance_loss_mlp": 1.03440833, + "epoch": 0.6716044632550981, + "flos": 688344901632.0, + "grad_norm": 0.0512424310438266, + "language_loss": 0.84138238, + "learning_rate": 0.00025713815977655084, + "loss": 0.85207486, + "num_input_tokens_seen": 289672032, + "router_z_loss_mlp": 0.34863281, + "step": 3491, + "time_per_iteration": 2.8758041858673096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107046, + "balance_loss_mlp": 1.03700948, + "epoch": 0.6717968449403616, + "flos": 460391316480.0, + "grad_norm": 0.05311776823475344, + "language_loss": 0.84021199, + "learning_rate": 0.0002568658839382969, + "loss": 0.85091662, + "num_input_tokens_seen": 289738304, + "router_z_loss_mlp": 0.3347168, + "step": 3492, + "time_per_iteration": 2.5461535453796387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066732, + "balance_loss_mlp": 1.03259087, + "epoch": 0.6719892266256252, + "flos": 501362182656.0, + "grad_norm": 0.0636144820373753, + "language_loss": 0.84432656, + "learning_rate": 0.00025659370249307814, + "loss": 0.85499388, + "num_input_tokens_seen": 289804304, + "router_z_loss_mlp": 0.34179688, + "step": 3493, + "time_per_iteration": 2.5833051204681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066615, + "balance_loss_mlp": 1.03094745, + "epoch": 0.6721816083108888, + "flos": 683223386112.0, + "grad_norm": 0.056507935755291845, + "language_loss": 0.84795702, + "learning_rate": 0.00025632161554656473, + "loss": 0.85862321, + "num_input_tokens_seen": 289877696, + "router_z_loss_mlp": 0.35717773, + "step": 3494, + "time_per_iteration": 2.852865219116211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065933, + "balance_loss_mlp": 1.03067088, + "epoch": 0.6723739899961524, + "flos": 585544578048.0, + "grad_norm": 0.05119219920681276, + "language_loss": 0.82001173, + "learning_rate": 0.00025604962320439017, + "loss": 0.83067107, + "num_input_tokens_seen": 289947296, + "router_z_loss_mlp": 0.35327148, + "step": 3495, + "time_per_iteration": 2.6681125164031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068608, + "balance_loss_mlp": 1.03334618, + "epoch": 0.672566371681416, + "flos": 506336721408.0, + "grad_norm": 0.06376768707456672, + "language_loss": 0.82132721, + "learning_rate": 0.0002557777255721516, + "loss": 0.83201331, + "num_input_tokens_seen": 290020080, + "router_z_loss_mlp": 0.35302734, + "step": 3496, + "time_per_iteration": 2.688211441040039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066305, + "balance_loss_mlp": 1.03142464, + "epoch": 0.6727587533666795, + "flos": 535405856256.0, + "grad_norm": 0.061511790914054676, + "language_loss": 0.80550486, + "learning_rate": 0.0002555059227554087, + "loss": 0.81616795, + "num_input_tokens_seen": 290094544, + "router_z_loss_mlp": 0.34912109, + "step": 3497, + "time_per_iteration": 2.6755480766296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107069, + "balance_loss_mlp": 1.03588057, + "epoch": 0.672951135051943, + "flos": 602532919296.0, + "grad_norm": 0.08077616236025223, + "language_loss": 0.77663779, + "learning_rate": 0.00025523421485968453, + "loss": 0.78734469, + "num_input_tokens_seen": 290173520, + "router_z_loss_mlp": 0.34838867, + "step": 3498, + "time_per_iteration": 2.782900333404541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067117, + "balance_loss_mlp": 1.0330708, + "epoch": 0.6731435167372066, + "flos": 810976693248.0, + "grad_norm": 0.05548957560218429, + "language_loss": 0.85524929, + "learning_rate": 0.00025496260199046585, + "loss": 0.86592042, + "num_input_tokens_seen": 290248240, + "router_z_loss_mlp": 0.34082031, + "step": 3499, + "time_per_iteration": 2.9468865394592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070869, + "balance_loss_mlp": 1.0354166, + "epoch": 0.6733358984224702, + "flos": 611306486784.0, + "grad_norm": 0.05533117407316435, + "language_loss": 0.84011221, + "learning_rate": 0.000254691084253202, + "loss": 0.8508209, + "num_input_tokens_seen": 290326288, + "router_z_loss_mlp": 0.35473633, + "step": 3500, + "time_per_iteration": 2.7936129570007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107048, + "balance_loss_mlp": 1.03607607, + "epoch": 0.6735282801077337, + "flos": 558636120576.0, + "grad_norm": 0.06619060652022955, + "language_loss": 0.77001846, + "learning_rate": 0.00025441966175330567, + "loss": 0.78072333, + "num_input_tokens_seen": 290395984, + "router_z_loss_mlp": 0.34423828, + "step": 3501, + "time_per_iteration": 2.7096900939941406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073276, + "balance_loss_mlp": 1.03737032, + "epoch": 0.6737206617929973, + "flos": 672134962176.0, + "grad_norm": 0.04835122337119983, + "language_loss": 0.79766667, + "learning_rate": 0.00025414833459615183, + "loss": 0.80839938, + "num_input_tokens_seen": 290470224, + "router_z_loss_mlp": 0.35913086, + "step": 3502, + "time_per_iteration": 2.787539482116699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075889, + "balance_loss_mlp": 1.03933966, + "epoch": 0.6739130434782609, + "flos": 633148683264.0, + "grad_norm": 0.05358836017753152, + "language_loss": 0.80260807, + "learning_rate": 0.0002538771028870796, + "loss": 0.81336701, + "num_input_tokens_seen": 290542864, + "router_z_loss_mlp": 0.36547852, + "step": 3503, + "time_per_iteration": 2.7826414108276367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077652, + "balance_loss_mlp": 1.04224694, + "epoch": 0.6741054251635245, + "flos": 531171841536.0, + "grad_norm": 0.07580622934543835, + "language_loss": 0.81591624, + "learning_rate": 0.0002536059667313903, + "loss": 0.82669276, + "num_input_tokens_seen": 290617248, + "router_z_loss_mlp": 0.35424805, + "step": 3504, + "time_per_iteration": 2.7296247482299805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107121, + "balance_loss_mlp": 1.03551888, + "epoch": 0.674297806848788, + "flos": 542343223296.0, + "grad_norm": 0.056073772887399426, + "language_loss": 0.8900978, + "learning_rate": 0.0002533349262343483, + "loss": 0.90080988, + "num_input_tokens_seen": 290690112, + "router_z_loss_mlp": 0.35742188, + "step": 3505, + "time_per_iteration": 2.674409866333008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107348, + "balance_loss_mlp": 1.03828955, + "epoch": 0.6744901885340515, + "flos": 463291107840.0, + "grad_norm": 0.05947075073095298, + "language_loss": 0.81730378, + "learning_rate": 0.0002530639815011807, + "loss": 0.82803857, + "num_input_tokens_seen": 290756352, + "router_z_loss_mlp": 0.35229492, + "step": 3506, + "time_per_iteration": 2.497544765472412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068375, + "balance_loss_mlp": 1.0326128, + "epoch": 0.6746825702193151, + "flos": 631533652992.0, + "grad_norm": 0.07086052765097473, + "language_loss": 0.84639049, + "learning_rate": 0.0002527931326370781, + "loss": 0.85707426, + "num_input_tokens_seen": 290829776, + "router_z_loss_mlp": 0.35791016, + "step": 3507, + "time_per_iteration": 2.7526142597198486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069527, + "balance_loss_mlp": 1.03395462, + "epoch": 0.6748749519045787, + "flos": 670835644416.0, + "grad_norm": 0.05093445347334381, + "language_loss": 0.82660782, + "learning_rate": 0.00025252237974719276, + "loss": 0.83730316, + "num_input_tokens_seen": 290900736, + "router_z_loss_mlp": 0.35595703, + "step": 3508, + "time_per_iteration": 2.8549742698669434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107359, + "balance_loss_mlp": 1.03782725, + "epoch": 0.6750673335898423, + "flos": 766756827648.0, + "grad_norm": 0.05329285448866526, + "language_loss": 0.80265921, + "learning_rate": 0.00025225172293664056, + "loss": 0.81339508, + "num_input_tokens_seen": 290981696, + "router_z_loss_mlp": 0.3581543, + "step": 3509, + "time_per_iteration": 2.974613904953003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026572, + "balance_loss_mlp": 1.01465082, + "epoch": 0.6752597152751059, + "flos": 1511786198016.0, + "grad_norm": 0.015514835233315651, + "language_loss": 0.76933134, + "learning_rate": 0.00025198116231049954, + "loss": 0.77959704, + "num_input_tokens_seen": 291217888, + "router_z_loss_mlp": 0.11914062, + "step": 3510, + "time_per_iteration": 4.91582179069519 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072206, + "balance_loss_mlp": 1.03637218, + "epoch": 0.6754520969603693, + "flos": 686990329344.0, + "grad_norm": 0.06350153745428545, + "language_loss": 0.84804261, + "learning_rate": 0.00025171069797381106, + "loss": 0.85876471, + "num_input_tokens_seen": 291287856, + "router_z_loss_mlp": 0.35864258, + "step": 3511, + "time_per_iteration": 2.7993617057800293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066581, + "balance_loss_mlp": 1.0310328, + "epoch": 0.6756444786456329, + "flos": 500318940672.0, + "grad_norm": 0.06118900000736982, + "language_loss": 0.81987178, + "learning_rate": 0.00025144033003157864, + "loss": 0.83053756, + "num_input_tokens_seen": 291354912, + "router_z_loss_mlp": 0.35620117, + "step": 3512, + "time_per_iteration": 2.5873219966888428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069288, + "balance_loss_mlp": 1.03450298, + "epoch": 0.6758368603308965, + "flos": 492357270528.0, + "grad_norm": 0.060009957038895716, + "language_loss": 0.78680366, + "learning_rate": 0.00025117005858876806, + "loss": 0.7974965, + "num_input_tokens_seen": 291426816, + "router_z_loss_mlp": 0.34838867, + "step": 3513, + "time_per_iteration": 2.6835427284240723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069658, + "balance_loss_mlp": 1.03427649, + "epoch": 0.6760292420161601, + "flos": 555657753600.0, + "grad_norm": 0.15540830916665044, + "language_loss": 0.8478874, + "learning_rate": 0.000250899883750308, + "loss": 0.85858399, + "num_input_tokens_seen": 291497648, + "router_z_loss_mlp": 0.35400391, + "step": 3514, + "time_per_iteration": 2.650256395339966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070046, + "balance_loss_mlp": 1.03478396, + "epoch": 0.6762216237014236, + "flos": 607322755584.0, + "grad_norm": 0.06069446103583955, + "language_loss": 0.8186444, + "learning_rate": 0.00025062980562109006, + "loss": 0.82934481, + "num_input_tokens_seen": 291568080, + "router_z_loss_mlp": 0.35302734, + "step": 3515, + "time_per_iteration": 2.7015137672424316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066431, + "balance_loss_mlp": 1.0309782, + "epoch": 0.6764140053866872, + "flos": 533501254656.0, + "grad_norm": 0.06011919218972519, + "language_loss": 0.82936066, + "learning_rate": 0.0002503598243059677, + "loss": 0.84002495, + "num_input_tokens_seen": 291644896, + "router_z_loss_mlp": 0.35473633, + "step": 3516, + "time_per_iteration": 2.7936599254608154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066759, + "balance_loss_mlp": 1.03221166, + "epoch": 0.6766063870719508, + "flos": 504548573184.0, + "grad_norm": 0.0538086785967606, + "language_loss": 0.79831243, + "learning_rate": 0.0002500899399097568, + "loss": 0.80897999, + "num_input_tokens_seen": 291716864, + "router_z_loss_mlp": 0.34594727, + "step": 3517, + "time_per_iteration": 2.647766351699829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068577, + "balance_loss_mlp": 1.03340983, + "epoch": 0.6767987687572143, + "flos": 512923470336.0, + "grad_norm": 0.05682834446853688, + "language_loss": 0.85193241, + "learning_rate": 0.0002498201525372359, + "loss": 0.86261815, + "num_input_tokens_seen": 291786000, + "router_z_loss_mlp": 0.35205078, + "step": 3518, + "time_per_iteration": 2.5557949542999268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064417, + "balance_loss_mlp": 1.03029943, + "epoch": 0.6769911504424779, + "flos": 524780121600.0, + "grad_norm": 0.05092560749530118, + "language_loss": 0.83158201, + "learning_rate": 0.00024955046229314584, + "loss": 0.84222615, + "num_input_tokens_seen": 291854768, + "router_z_loss_mlp": 0.34130859, + "step": 3519, + "time_per_iteration": 2.578089475631714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069846, + "balance_loss_mlp": 1.03422618, + "epoch": 0.6771835321277414, + "flos": 449662275072.0, + "grad_norm": 0.05617502004048809, + "language_loss": 0.87603748, + "learning_rate": 0.00024928086928218947, + "loss": 0.88673592, + "num_input_tokens_seen": 291918096, + "router_z_loss_mlp": 0.35644531, + "step": 3520, + "time_per_iteration": 2.490943193435669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068484, + "balance_loss_mlp": 1.03322208, + "epoch": 0.677375913813005, + "flos": 709020200448.0, + "grad_norm": 0.051602142671676454, + "language_loss": 0.75993657, + "learning_rate": 0.00024901137360903216, + "loss": 0.77062142, + "num_input_tokens_seen": 291998752, + "router_z_loss_mlp": 0.35302734, + "step": 3521, + "time_per_iteration": 2.9075634479522705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073222, + "balance_loss_mlp": 1.03817451, + "epoch": 0.6775682954982686, + "flos": 428189635584.0, + "grad_norm": 0.10231641973637204, + "language_loss": 0.81175685, + "learning_rate": 0.00024874197537830115, + "loss": 0.82248902, + "num_input_tokens_seen": 292065056, + "router_z_loss_mlp": 0.35083008, + "step": 3522, + "time_per_iteration": 2.5057058334350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069237, + "balance_loss_mlp": 1.03478503, + "epoch": 0.6777606771835322, + "flos": 437677585920.0, + "grad_norm": 0.060253133761597404, + "language_loss": 0.83087361, + "learning_rate": 0.00024847267469458684, + "loss": 0.84156603, + "num_input_tokens_seen": 292129248, + "router_z_loss_mlp": 0.3449707, + "step": 3523, + "time_per_iteration": 2.5406739711761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067339, + "balance_loss_mlp": 1.03210068, + "epoch": 0.6779530588687956, + "flos": 775106993664.0, + "grad_norm": 0.0551254373136415, + "language_loss": 0.78231275, + "learning_rate": 0.00024820347166244034, + "loss": 0.79298615, + "num_input_tokens_seen": 292206080, + "router_z_loss_mlp": 0.35302734, + "step": 3524, + "time_per_iteration": 3.021663188934326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064562, + "balance_loss_mlp": 1.03013432, + "epoch": 0.6781454405540592, + "flos": 571502518272.0, + "grad_norm": 0.04412805225967261, + "language_loss": 0.84577274, + "learning_rate": 0.0002479343663863755, + "loss": 0.85641837, + "num_input_tokens_seen": 292280192, + "router_z_loss_mlp": 0.34448242, + "step": 3525, + "time_per_iteration": 2.760934352874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070433, + "balance_loss_mlp": 1.03395486, + "epoch": 0.6783378222393228, + "flos": 484788478464.0, + "grad_norm": 0.051123449842866715, + "language_loss": 0.76749617, + "learning_rate": 0.00024766535897086876, + "loss": 0.77820051, + "num_input_tokens_seen": 292347792, + "router_z_loss_mlp": 0.36474609, + "step": 3526, + "time_per_iteration": 2.5466532707214355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071181, + "balance_loss_mlp": 1.03584695, + "epoch": 0.6785302039245864, + "flos": 482592895488.0, + "grad_norm": 0.04922293189317912, + "language_loss": 0.78913069, + "learning_rate": 0.0002473964495203578, + "loss": 0.79984254, + "num_input_tokens_seen": 292420032, + "router_z_loss_mlp": 0.35351562, + "step": 3527, + "time_per_iteration": 2.65765118598938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072886, + "balance_loss_mlp": 1.03609788, + "epoch": 0.67872258560985, + "flos": 524451262464.0, + "grad_norm": 0.04942804135010068, + "language_loss": 0.85464156, + "learning_rate": 0.0002471276381392425, + "loss": 0.86537039, + "num_input_tokens_seen": 292497792, + "router_z_loss_mlp": 0.36791992, + "step": 3528, + "time_per_iteration": 2.75915265083313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038837, + "balance_loss_mlp": 1.02634406, + "epoch": 0.6789149672951135, + "flos": 1551786605568.0, + "grad_norm": 0.02259283228752806, + "language_loss": 0.78188634, + "learning_rate": 0.0002468589249318848, + "loss": 0.79227471, + "num_input_tokens_seen": 292726704, + "router_z_loss_mlp": 0.125, + "step": 3529, + "time_per_iteration": 4.964378356933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069996, + "balance_loss_mlp": 1.0344243, + "epoch": 0.6791073489803771, + "flos": 741088051200.0, + "grad_norm": 0.05189094051618866, + "language_loss": 0.84224343, + "learning_rate": 0.00024659031000260826, + "loss": 0.85294336, + "num_input_tokens_seen": 292802320, + "router_z_loss_mlp": 0.35595703, + "step": 3530, + "time_per_iteration": 2.8634302616119385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072509, + "balance_loss_mlp": 1.03638899, + "epoch": 0.6792997306656406, + "flos": 576095915520.0, + "grad_norm": 0.055023533803773034, + "language_loss": 0.80543637, + "learning_rate": 0.0002463217934556985, + "loss": 0.81616145, + "num_input_tokens_seen": 292870480, + "router_z_loss_mlp": 0.36132812, + "step": 3531, + "time_per_iteration": 2.632070541381836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01031238, + "balance_loss_mlp": 1.01884079, + "epoch": 0.6794921123509042, + "flos": 1502538356736.0, + "grad_norm": 0.018779116568333653, + "language_loss": 0.7653209, + "learning_rate": 0.000246053375395403, + "loss": 0.77563328, + "num_input_tokens_seen": 293100752, + "router_z_loss_mlp": 0.12402344, + "step": 3532, + "time_per_iteration": 4.7274627685546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073531, + "balance_loss_mlp": 1.03836441, + "epoch": 0.6796844940361677, + "flos": 698620018176.0, + "grad_norm": 0.05756666047667581, + "language_loss": 0.8354668, + "learning_rate": 0.0002457850559259306, + "loss": 0.84620214, + "num_input_tokens_seen": 293178192, + "router_z_loss_mlp": 0.35205078, + "step": 3533, + "time_per_iteration": 2.8860280513763428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074469, + "balance_loss_mlp": 1.03901649, + "epoch": 0.6798768757214313, + "flos": 552496094208.0, + "grad_norm": 0.05133054826538493, + "language_loss": 0.81485093, + "learning_rate": 0.00024551683515145275, + "loss": 0.82559562, + "num_input_tokens_seen": 293246368, + "router_z_loss_mlp": 0.35498047, + "step": 3534, + "time_per_iteration": 2.620476722717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072102, + "balance_loss_mlp": 1.03610086, + "epoch": 0.6800692574066949, + "flos": 522677670912.0, + "grad_norm": 0.04887500327812814, + "language_loss": 0.86479199, + "learning_rate": 0.0002452487131761014, + "loss": 0.87551308, + "num_input_tokens_seen": 293320656, + "router_z_loss_mlp": 0.35986328, + "step": 3535, + "time_per_iteration": 2.7402584552764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069949, + "balance_loss_mlp": 1.03523564, + "epoch": 0.6802616390919585, + "flos": 573747563520.0, + "grad_norm": 0.05056319210769973, + "language_loss": 0.79672563, + "learning_rate": 0.00024498069010397093, + "loss": 0.80742508, + "num_input_tokens_seen": 293388592, + "router_z_loss_mlp": 0.34741211, + "step": 3536, + "time_per_iteration": 2.6493327617645264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076551, + "balance_loss_mlp": 1.04109788, + "epoch": 0.6804540207772221, + "flos": 487915232256.0, + "grad_norm": 0.08967027587321133, + "language_loss": 0.85052317, + "learning_rate": 0.00024471276603911697, + "loss": 0.86128873, + "num_input_tokens_seen": 293453936, + "router_z_loss_mlp": 0.35449219, + "step": 3537, + "time_per_iteration": 2.5946011543273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074683, + "balance_loss_mlp": 1.03946912, + "epoch": 0.6806464024624855, + "flos": 578307465216.0, + "grad_norm": 0.050744450088680546, + "language_loss": 0.78934067, + "learning_rate": 0.0002444449410855572, + "loss": 0.80008757, + "num_input_tokens_seen": 293527664, + "router_z_loss_mlp": 0.35229492, + "step": 3538, + "time_per_iteration": 2.7160799503326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073592, + "balance_loss_mlp": 1.03778172, + "epoch": 0.6808387841477491, + "flos": 553456378368.0, + "grad_norm": 0.0415443850681439, + "language_loss": 0.84257662, + "learning_rate": 0.00024417721534727033, + "loss": 0.85331261, + "num_input_tokens_seen": 293599344, + "router_z_loss_mlp": 0.35864258, + "step": 3539, + "time_per_iteration": 2.6316590309143066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067702, + "balance_loss_mlp": 1.03220177, + "epoch": 0.6810311658330127, + "flos": 426613893120.0, + "grad_norm": 0.06268112342212401, + "language_loss": 0.82995272, + "learning_rate": 0.00024390958892819687, + "loss": 0.8406297, + "num_input_tokens_seen": 293663088, + "router_z_loss_mlp": 0.35546875, + "step": 3540, + "time_per_iteration": 2.4619975090026855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071831, + "balance_loss_mlp": 1.03518569, + "epoch": 0.6812235475182763, + "flos": 571956443136.0, + "grad_norm": 0.047330457395290515, + "language_loss": 0.80951297, + "learning_rate": 0.0002436420619322381, + "loss": 0.82023126, + "num_input_tokens_seen": 293741296, + "router_z_loss_mlp": 0.36645508, + "step": 3541, + "time_per_iteration": 2.814427614212036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070446, + "balance_loss_mlp": 1.03515983, + "epoch": 0.6814159292035398, + "flos": 501648781824.0, + "grad_norm": 0.0608425293250951, + "language_loss": 0.82551098, + "learning_rate": 0.0002433746344632577, + "loss": 0.83621544, + "num_input_tokens_seen": 293815840, + "router_z_loss_mlp": 0.35327148, + "step": 3542, + "time_per_iteration": 2.6463205814361572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069714, + "balance_loss_mlp": 1.03340268, + "epoch": 0.6816083108888034, + "flos": 765176702976.0, + "grad_norm": 0.05597669105837374, + "language_loss": 0.7998035, + "learning_rate": 0.00024310730662508006, + "loss": 0.81050068, + "num_input_tokens_seen": 293896368, + "router_z_loss_mlp": 0.36303711, + "step": 3543, + "time_per_iteration": 3.0262770652770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107287, + "balance_loss_mlp": 1.03787053, + "epoch": 0.681800692574067, + "flos": 479205683712.0, + "grad_norm": 0.05246394950285061, + "language_loss": 0.87412894, + "learning_rate": 0.0002428400785214911, + "loss": 0.88485765, + "num_input_tokens_seen": 293963344, + "router_z_loss_mlp": 0.3503418, + "step": 3544, + "time_per_iteration": 2.6026573181152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072118, + "balance_loss_mlp": 1.03547359, + "epoch": 0.6819930742593305, + "flos": 691298537472.0, + "grad_norm": 0.057535239065408805, + "language_loss": 0.8261283, + "learning_rate": 0.00024257295025623794, + "loss": 0.83684945, + "num_input_tokens_seen": 294035440, + "router_z_loss_mlp": 0.36645508, + "step": 3545, + "time_per_iteration": 2.813525915145874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066625, + "balance_loss_mlp": 1.03059971, + "epoch": 0.6821854559445941, + "flos": 677783185920.0, + "grad_norm": 0.051890775320829655, + "language_loss": 0.80731034, + "learning_rate": 0.00024230592193302892, + "loss": 0.81797659, + "num_input_tokens_seen": 294116944, + "router_z_loss_mlp": 0.3605957, + "step": 3546, + "time_per_iteration": 2.852640151977539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069168, + "balance_loss_mlp": 1.03378654, + "epoch": 0.6823778376298576, + "flos": 461956884480.0, + "grad_norm": 0.04826922291722955, + "language_loss": 0.84192979, + "learning_rate": 0.00024203899365553372, + "loss": 0.85262144, + "num_input_tokens_seen": 294178976, + "router_z_loss_mlp": 0.35424805, + "step": 3547, + "time_per_iteration": 2.51088285446167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018198, + "balance_loss_mlp": 1.00651574, + "epoch": 0.6825702193151212, + "flos": 1474582427136.0, + "grad_norm": 0.01234117563256537, + "language_loss": 0.76734358, + "learning_rate": 0.00024177216552738302, + "loss": 0.77752554, + "num_input_tokens_seen": 294384960, + "router_z_loss_mlp": 0.11669922, + "step": 3548, + "time_per_iteration": 4.512159824371338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069713, + "balance_loss_mlp": 1.03397429, + "epoch": 0.6827626010003848, + "flos": 722791627776.0, + "grad_norm": 0.05201405662428197, + "language_loss": 0.83068311, + "learning_rate": 0.00024150543765216848, + "loss": 0.84138024, + "num_input_tokens_seen": 294461408, + "router_z_loss_mlp": 0.35766602, + "step": 3549, + "time_per_iteration": 2.9022421836853027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066879, + "balance_loss_mlp": 1.03066325, + "epoch": 0.6829549826856484, + "flos": 558596832768.0, + "grad_norm": 0.050492877395882395, + "language_loss": 0.83153272, + "learning_rate": 0.00024123881013344352, + "loss": 0.84220147, + "num_input_tokens_seen": 294530624, + "router_z_loss_mlp": 0.36230469, + "step": 3550, + "time_per_iteration": 2.663245677947998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070068, + "balance_loss_mlp": 1.03509164, + "epoch": 0.6831473643709118, + "flos": 624635573760.0, + "grad_norm": 0.06049149203697264, + "language_loss": 0.79663515, + "learning_rate": 0.00024097228307472202, + "loss": 0.80733585, + "num_input_tokens_seen": 294606784, + "router_z_loss_mlp": 0.35009766, + "step": 3551, + "time_per_iteration": 2.7762739658355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070299, + "balance_loss_mlp": 1.03458428, + "epoch": 0.6833397460561754, + "flos": 713553960960.0, + "grad_norm": 0.05841581019215986, + "language_loss": 0.81410074, + "learning_rate": 0.00024070585657947846, + "loss": 0.82480371, + "num_input_tokens_seen": 294686960, + "router_z_loss_mlp": 0.35717773, + "step": 3552, + "time_per_iteration": 2.8573665618896484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070846, + "balance_loss_mlp": 1.03470206, + "epoch": 0.683532127741439, + "flos": 464449241088.0, + "grad_norm": 0.042320338748993415, + "language_loss": 0.85217428, + "learning_rate": 0.00024043953075114934, + "loss": 0.86288273, + "num_input_tokens_seen": 294759712, + "router_z_loss_mlp": 0.36157227, + "step": 3553, + "time_per_iteration": 2.6308178901672363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106716, + "balance_loss_mlp": 1.03230345, + "epoch": 0.6837245094267026, + "flos": 581979866112.0, + "grad_norm": 0.06353851780596993, + "language_loss": 0.88855463, + "learning_rate": 0.00024017330569313128, + "loss": 0.89922619, + "num_input_tokens_seen": 294830592, + "router_z_loss_mlp": 0.34912109, + "step": 3554, + "time_per_iteration": 2.691176176071167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070226, + "balance_loss_mlp": 1.03415298, + "epoch": 0.6839168911119662, + "flos": 793836993024.0, + "grad_norm": 0.05307417263054524, + "language_loss": 0.74880016, + "learning_rate": 0.0002399071815087821, + "loss": 0.75950241, + "num_input_tokens_seen": 294907504, + "router_z_loss_mlp": 0.36108398, + "step": 3555, + "time_per_iteration": 2.990910530090332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073324, + "balance_loss_mlp": 1.03803802, + "epoch": 0.6841092727972297, + "flos": 579734820864.0, + "grad_norm": 0.05505515245095852, + "language_loss": 0.83355868, + "learning_rate": 0.00023964115830142025, + "loss": 0.84429193, + "num_input_tokens_seen": 294977600, + "router_z_loss_mlp": 0.35327148, + "step": 3556, + "time_per_iteration": 2.6737208366394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070632, + "balance_loss_mlp": 1.03522646, + "epoch": 0.6843016544824932, + "flos": 383530401792.0, + "grad_norm": 0.06254442302238046, + "language_loss": 0.8747263, + "learning_rate": 0.00023937523617432522, + "loss": 0.8854326, + "num_input_tokens_seen": 295039408, + "router_z_loss_mlp": 0.35449219, + "step": 3557, + "time_per_iteration": 2.4377589225769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066745, + "balance_loss_mlp": 1.03176904, + "epoch": 0.6844940361677568, + "flos": 1438474332672.0, + "grad_norm": 0.05391810386575329, + "language_loss": 0.86953497, + "learning_rate": 0.00023910941523073705, + "loss": 0.88020241, + "num_input_tokens_seen": 295142928, + "router_z_loss_mlp": 0.34985352, + "step": 3558, + "time_per_iteration": 3.854933738708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072327, + "balance_loss_mlp": 1.03739882, + "epoch": 0.6846864178530204, + "flos": 520614508032.0, + "grad_norm": 0.05572945475530707, + "language_loss": 0.86660743, + "learning_rate": 0.0002388436955738566, + "loss": 0.87733072, + "num_input_tokens_seen": 295215504, + "router_z_loss_mlp": 0.34960938, + "step": 3559, + "time_per_iteration": 2.6673743724823 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072956, + "balance_loss_mlp": 1.03874326, + "epoch": 0.6848787995382839, + "flos": 717626442240.0, + "grad_norm": 0.051092768918582485, + "language_loss": 0.81714153, + "learning_rate": 0.00023857807730684523, + "loss": 0.82787108, + "num_input_tokens_seen": 295291024, + "router_z_loss_mlp": 0.3425293, + "step": 3560, + "time_per_iteration": 2.8930888175964355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074328, + "balance_loss_mlp": 1.03956604, + "epoch": 0.6850711812235475, + "flos": 510787524096.0, + "grad_norm": 0.06174671890156068, + "language_loss": 0.82387376, + "learning_rate": 0.00023831256053282547, + "loss": 0.83461708, + "num_input_tokens_seen": 295363248, + "router_z_loss_mlp": 0.34790039, + "step": 3561, + "time_per_iteration": 2.6872005462646484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073119, + "balance_loss_mlp": 1.03923941, + "epoch": 0.6852635629088111, + "flos": 667832546304.0, + "grad_norm": 0.051363024529254335, + "language_loss": 0.78085375, + "learning_rate": 0.00023804714535488003, + "loss": 0.79158491, + "num_input_tokens_seen": 295442032, + "router_z_loss_mlp": 0.33911133, + "step": 3562, + "time_per_iteration": 4.3489556312561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008548, + "balance_loss_mlp": 0.9979142, + "epoch": 0.6854559445940747, + "flos": 1522136918016.0, + "grad_norm": 0.005165223405227486, + "language_loss": 0.7980963, + "learning_rate": 0.0002377818318760519, + "loss": 0.80818176, + "num_input_tokens_seen": 295680560, + "router_z_loss_mlp": 0.10644531, + "step": 3563, + "time_per_iteration": 4.906137704849243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072342, + "balance_loss_mlp": 1.03812885, + "epoch": 0.6856483262793382, + "flos": 453970483200.0, + "grad_norm": 0.05119141259642537, + "language_loss": 0.80591673, + "learning_rate": 0.00023751662019934488, + "loss": 0.81664014, + "num_input_tokens_seen": 295745712, + "router_z_loss_mlp": 0.3425293, + "step": 3564, + "time_per_iteration": 2.4906551837921143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071978, + "balance_loss_mlp": 1.03745532, + "epoch": 0.6858407079646017, + "flos": 615269869056.0, + "grad_norm": 0.08282945217506828, + "language_loss": 0.79188418, + "learning_rate": 0.00023725151042772364, + "loss": 0.80260396, + "num_input_tokens_seen": 295815104, + "router_z_loss_mlp": 0.34545898, + "step": 3565, + "time_per_iteration": 2.7048499584198 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075078, + "balance_loss_mlp": 1.04065084, + "epoch": 0.6860330896498653, + "flos": 465793638912.0, + "grad_norm": 0.05470196692680893, + "language_loss": 0.82981157, + "learning_rate": 0.00023698650266411276, + "loss": 0.8405624, + "num_input_tokens_seen": 295882928, + "router_z_loss_mlp": 0.34472656, + "step": 3566, + "time_per_iteration": 2.6011905670166016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072281, + "balance_loss_mlp": 1.03909349, + "epoch": 0.6862254713351289, + "flos": 863879814144.0, + "grad_norm": 0.05579586531854514, + "language_loss": 0.82876581, + "learning_rate": 0.00023672159701139755, + "loss": 0.83948863, + "num_input_tokens_seen": 295970960, + "router_z_loss_mlp": 0.33203125, + "step": 3567, + "time_per_iteration": 3.1918952465057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078201, + "balance_loss_mlp": 1.0438447, + "epoch": 0.6864178530203925, + "flos": 446905078272.0, + "grad_norm": 0.06805670760386738, + "language_loss": 0.85873824, + "learning_rate": 0.00023645679357242296, + "loss": 0.86952031, + "num_input_tokens_seen": 296036128, + "router_z_loss_mlp": 0.34399414, + "step": 3568, + "time_per_iteration": 2.4888172149658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074879, + "balance_loss_mlp": 1.04128623, + "epoch": 0.6866102347056561, + "flos": 424034196480.0, + "grad_norm": 0.05006770232648597, + "language_loss": 0.83895862, + "learning_rate": 0.00023619209244999534, + "loss": 0.84970748, + "num_input_tokens_seen": 296101440, + "router_z_loss_mlp": 0.33618164, + "step": 3569, + "time_per_iteration": 2.502540111541748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107562, + "balance_loss_mlp": 1.04150224, + "epoch": 0.6868026163909196, + "flos": 472134486528.0, + "grad_norm": 0.060913037985659245, + "language_loss": 0.85054779, + "learning_rate": 0.0002359274937468806, + "loss": 0.86130404, + "num_input_tokens_seen": 296165504, + "router_z_loss_mlp": 0.34155273, + "step": 3570, + "time_per_iteration": 2.5016539096832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076781, + "balance_loss_mlp": 1.04263973, + "epoch": 0.6869949980761831, + "flos": 463937089536.0, + "grad_norm": 0.04774464497453654, + "language_loss": 0.778054, + "learning_rate": 0.00023566299756580512, + "loss": 0.78882182, + "num_input_tokens_seen": 296236880, + "router_z_loss_mlp": 0.34179688, + "step": 3571, + "time_per_iteration": 2.6037425994873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076706, + "balance_loss_mlp": 1.04194498, + "epoch": 0.6871873797614467, + "flos": 426012991488.0, + "grad_norm": 0.056784915958369084, + "language_loss": 0.7818104, + "learning_rate": 0.0002353986040094551, + "loss": 0.79257739, + "num_input_tokens_seen": 296299776, + "router_z_loss_mlp": 0.34765625, + "step": 3572, + "time_per_iteration": 2.4650750160217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077273, + "balance_loss_mlp": 1.04286885, + "epoch": 0.6873797614467103, + "flos": 443394210816.0, + "grad_norm": 0.05696789275443238, + "language_loss": 0.7911824, + "learning_rate": 0.00023513431318047796, + "loss": 0.8019551, + "num_input_tokens_seen": 296365408, + "router_z_loss_mlp": 0.34448242, + "step": 3573, + "time_per_iteration": 2.5429108142852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072767, + "balance_loss_mlp": 1.03912568, + "epoch": 0.6875721431319738, + "flos": 991927074816.0, + "grad_norm": 0.06588497554546605, + "language_loss": 0.76656246, + "learning_rate": 0.00023487012518147977, + "loss": 0.77729011, + "num_input_tokens_seen": 296445488, + "router_z_loss_mlp": 0.33666992, + "step": 3574, + "time_per_iteration": 3.2478342056274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074542, + "balance_loss_mlp": 1.03985214, + "epoch": 0.6877645248172374, + "flos": 1285031900160.0, + "grad_norm": 0.05648016172081939, + "language_loss": 0.84123796, + "learning_rate": 0.00023460604011502772, + "loss": 0.85198337, + "num_input_tokens_seen": 296529936, + "router_z_loss_mlp": 0.34692383, + "step": 3575, + "time_per_iteration": 3.6104493141174316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073345, + "balance_loss_mlp": 1.03946543, + "epoch": 0.687956906502501, + "flos": 876360688128.0, + "grad_norm": 0.05234067730424214, + "language_loss": 0.8542276, + "learning_rate": 0.00023434205808364845, + "loss": 0.86496103, + "num_input_tokens_seen": 296607488, + "router_z_loss_mlp": 0.33911133, + "step": 3576, + "time_per_iteration": 3.1311981678009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107559, + "balance_loss_mlp": 1.04142499, + "epoch": 0.6881492881877646, + "flos": 563038871040.0, + "grad_norm": 0.05805523475293479, + "language_loss": 0.8543247, + "learning_rate": 0.00023407817918982932, + "loss": 0.86508065, + "num_input_tokens_seen": 296678672, + "router_z_loss_mlp": 0.34204102, + "step": 3577, + "time_per_iteration": 2.76940655708313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075416, + "balance_loss_mlp": 1.04101276, + "epoch": 0.6883416698730281, + "flos": 794782720512.0, + "grad_norm": 0.05454368675547281, + "language_loss": 0.7852968, + "learning_rate": 0.00023381440353601718, + "loss": 0.79605091, + "num_input_tokens_seen": 296758896, + "router_z_loss_mlp": 0.34448242, + "step": 3578, + "time_per_iteration": 2.987713098526001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078404, + "balance_loss_mlp": 1.04295087, + "epoch": 0.6885340515582916, + "flos": 723308161536.0, + "grad_norm": 0.1550034716178633, + "language_loss": 0.8585633, + "learning_rate": 0.00023355073122461822, + "loss": 0.86934739, + "num_input_tokens_seen": 296830736, + "router_z_loss_mlp": 0.35449219, + "step": 3579, + "time_per_iteration": 2.8689723014831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073162, + "balance_loss_mlp": 1.03866315, + "epoch": 0.6887264332435552, + "flos": 1010529036288.0, + "grad_norm": 0.05073405937769219, + "language_loss": 0.82913256, + "learning_rate": 0.00023328716235799973, + "loss": 0.83986419, + "num_input_tokens_seen": 296911504, + "router_z_loss_mlp": 0.34545898, + "step": 3580, + "time_per_iteration": 3.2760398387908936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077211, + "balance_loss_mlp": 1.04292655, + "epoch": 0.6889188149288188, + "flos": 584993138688.0, + "grad_norm": 0.0642868391556551, + "language_loss": 0.83958888, + "learning_rate": 0.00023302369703848803, + "loss": 0.85036099, + "num_input_tokens_seen": 296981488, + "router_z_loss_mlp": 0.34326172, + "step": 3581, + "time_per_iteration": 2.6795780658721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075736, + "balance_loss_mlp": 1.04121315, + "epoch": 0.6891111966140824, + "flos": 635831686656.0, + "grad_norm": 0.05830003162798764, + "language_loss": 0.79951459, + "learning_rate": 0.00023276033536836937, + "loss": 0.81027198, + "num_input_tokens_seen": 297054896, + "router_z_loss_mlp": 0.34570312, + "step": 3582, + "time_per_iteration": 2.7684953212738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074456, + "balance_loss_mlp": 1.03964663, + "epoch": 0.6893035782993459, + "flos": 495011160576.0, + "grad_norm": 0.04509310145442872, + "language_loss": 0.84428883, + "learning_rate": 0.00023249707744988984, + "loss": 0.8550334, + "num_input_tokens_seen": 297128224, + "router_z_loss_mlp": 0.34838867, + "step": 3583, + "time_per_iteration": 2.6324620246887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074677, + "balance_loss_mlp": 1.04041624, + "epoch": 0.6894959599846094, + "flos": 457983327744.0, + "grad_norm": 0.06541043788965, + "language_loss": 0.81646812, + "learning_rate": 0.00023223392338525529, + "loss": 0.8272149, + "num_input_tokens_seen": 297191312, + "router_z_loss_mlp": 0.34301758, + "step": 3584, + "time_per_iteration": 2.496835231781006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070744, + "balance_loss_mlp": 1.03614986, + "epoch": 0.689688341669873, + "flos": 504740630016.0, + "grad_norm": 0.0500959825049001, + "language_loss": 0.78515136, + "learning_rate": 0.00023197087327663107, + "loss": 0.7958588, + "num_input_tokens_seen": 297261904, + "router_z_loss_mlp": 0.34643555, + "step": 3585, + "time_per_iteration": 2.6497855186462402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107737, + "balance_loss_mlp": 1.04349089, + "epoch": 0.6898807233551366, + "flos": 763584993792.0, + "grad_norm": 0.05545986059450925, + "language_loss": 0.81721687, + "learning_rate": 0.00023170792722614243, + "loss": 0.82799053, + "num_input_tokens_seen": 297338352, + "router_z_loss_mlp": 0.33911133, + "step": 3586, + "time_per_iteration": 2.8789288997650146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071576, + "balance_loss_mlp": 1.0367434, + "epoch": 0.6900731050404002, + "flos": 583030310400.0, + "grad_norm": 0.05029766249236532, + "language_loss": 0.83530807, + "learning_rate": 0.00023144508533587377, + "loss": 0.84602392, + "num_input_tokens_seen": 297416688, + "router_z_loss_mlp": 0.34863281, + "step": 3587, + "time_per_iteration": 2.8913052082061768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074935, + "balance_loss_mlp": 1.03998244, + "epoch": 0.6902654867256637, + "flos": 711531495936.0, + "grad_norm": 0.0709422421698616, + "language_loss": 0.7865144, + "learning_rate": 0.0002311823477078698, + "loss": 0.79726374, + "num_input_tokens_seen": 297499968, + "router_z_loss_mlp": 0.35009766, + "step": 3588, + "time_per_iteration": 2.923501491546631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068421, + "balance_loss_mlp": 1.03446984, + "epoch": 0.6904578684109273, + "flos": 596816294400.0, + "grad_norm": 0.26453664714217867, + "language_loss": 0.8501482, + "learning_rate": 0.00023091971444413428, + "loss": 0.86083239, + "num_input_tokens_seen": 297574480, + "router_z_loss_mlp": 0.33984375, + "step": 3589, + "time_per_iteration": 2.779235363006592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076229, + "balance_loss_mlp": 1.04056144, + "epoch": 0.6906502500961909, + "flos": 584757411840.0, + "grad_norm": 0.051361873763105706, + "language_loss": 0.82785845, + "learning_rate": 0.00023065718564663012, + "loss": 0.83862066, + "num_input_tokens_seen": 297645360, + "router_z_loss_mlp": 0.35668945, + "step": 3590, + "time_per_iteration": 2.7035253047943115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020688, + "balance_loss_mlp": 1.00957787, + "epoch": 0.6908426317814544, + "flos": 1587001559040.0, + "grad_norm": 0.009423557970014077, + "language_loss": 0.73911589, + "learning_rate": 0.00023039476141728011, + "loss": 0.74932277, + "num_input_tokens_seen": 297879472, + "router_z_loss_mlp": 0.11132812, + "step": 3591, + "time_per_iteration": 4.9744603633880615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073352, + "balance_loss_mlp": 1.03901935, + "epoch": 0.6910350134667179, + "flos": 500525554176.0, + "grad_norm": 0.048031169148873155, + "language_loss": 0.80940306, + "learning_rate": 0.0002301324418579666, + "loss": 0.82013655, + "num_input_tokens_seen": 297950672, + "router_z_loss_mlp": 0.34350586, + "step": 3592, + "time_per_iteration": 2.673436403274536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016267, + "balance_loss_mlp": 1.00534713, + "epoch": 0.6912273951519815, + "flos": 1408462138368.0, + "grad_norm": 0.006132313228220279, + "language_loss": 0.78688473, + "learning_rate": 0.00022987022707053107, + "loss": 0.79704738, + "num_input_tokens_seen": 298171728, + "router_z_loss_mlp": 0.109375, + "step": 3593, + "time_per_iteration": 4.7109363079071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074657, + "balance_loss_mlp": 1.04053962, + "epoch": 0.6914197768372451, + "flos": 634961562624.0, + "grad_norm": 0.056049498625347735, + "language_loss": 0.80705756, + "learning_rate": 0.00022960811715677415, + "loss": 0.8178041, + "num_input_tokens_seen": 298250304, + "router_z_loss_mlp": 0.34155273, + "step": 3594, + "time_per_iteration": 2.830838918685913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107036, + "balance_loss_mlp": 1.03686213, + "epoch": 0.6916121585225087, + "flos": 557755822080.0, + "grad_norm": 0.05478776736586074, + "language_loss": 0.81540507, + "learning_rate": 0.00022934611221845608, + "loss": 0.82610869, + "num_input_tokens_seen": 298328000, + "router_z_loss_mlp": 0.33520508, + "step": 3595, + "time_per_iteration": 2.800851583480835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074607, + "balance_loss_mlp": 1.04127622, + "epoch": 0.6918045402077723, + "flos": 528887508480.0, + "grad_norm": 0.051880347807473304, + "language_loss": 0.77869982, + "learning_rate": 0.00022908421235729609, + "loss": 0.78944588, + "num_input_tokens_seen": 298406832, + "router_z_loss_mlp": 0.33349609, + "step": 3596, + "time_per_iteration": 2.7151432037353516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071648, + "balance_loss_mlp": 1.03645778, + "epoch": 0.6919969218930357, + "flos": 570083927040.0, + "grad_norm": 0.044849912113491465, + "language_loss": 0.85305548, + "learning_rate": 0.0002288224176749728, + "loss": 0.86377192, + "num_input_tokens_seen": 298477584, + "router_z_loss_mlp": 0.35205078, + "step": 3597, + "time_per_iteration": 2.634561061859131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075011, + "balance_loss_mlp": 1.04005897, + "epoch": 0.6921893035782993, + "flos": 683006598144.0, + "grad_norm": 0.0536844380747242, + "language_loss": 0.78127837, + "learning_rate": 0.00022856072827312385, + "loss": 0.79202843, + "num_input_tokens_seen": 298551872, + "router_z_loss_mlp": 0.34936523, + "step": 3598, + "time_per_iteration": 2.8242592811584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072467, + "balance_loss_mlp": 1.03830183, + "epoch": 0.6923816852635629, + "flos": 546484105728.0, + "grad_norm": 0.13391006913463419, + "language_loss": 0.76835263, + "learning_rate": 0.00022829914425334598, + "loss": 0.77907735, + "num_input_tokens_seen": 298619680, + "router_z_loss_mlp": 0.34204102, + "step": 3599, + "time_per_iteration": 2.634923219680786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074851, + "balance_loss_mlp": 1.04051888, + "epoch": 0.6925740669488265, + "flos": 509782159872.0, + "grad_norm": 0.0539133277986469, + "language_loss": 0.80556238, + "learning_rate": 0.0002280376657171956, + "loss": 0.81631094, + "num_input_tokens_seen": 298690080, + "router_z_loss_mlp": 0.34350586, + "step": 3600, + "time_per_iteration": 2.6054348945617676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071386, + "balance_loss_mlp": 1.03662419, + "epoch": 0.69276644863409, + "flos": 869053764096.0, + "grad_norm": 0.05194865310511828, + "language_loss": 0.76575196, + "learning_rate": 0.00022777629276618706, + "loss": 0.77646577, + "num_input_tokens_seen": 298777712, + "router_z_loss_mlp": 0.34765625, + "step": 3601, + "time_per_iteration": 3.1115190982818604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077383, + "balance_loss_mlp": 1.04219222, + "epoch": 0.6929588303193536, + "flos": 625486758912.0, + "grad_norm": 0.05453934109077095, + "language_loss": 0.77726191, + "learning_rate": 0.0002275150255017947, + "loss": 0.78803569, + "num_input_tokens_seen": 298854368, + "router_z_loss_mlp": 0.35205078, + "step": 3602, + "time_per_iteration": 2.7954330444335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013352, + "balance_loss_mlp": 1.00333869, + "epoch": 0.6931512120046172, + "flos": 1544530553856.0, + "grad_norm": 0.00865021754788789, + "language_loss": 0.75732672, + "learning_rate": 0.0002272538640254511, + "loss": 0.76746023, + "num_input_tokens_seen": 299091664, + "router_z_loss_mlp": 0.10009766, + "step": 3603, + "time_per_iteration": 4.98169469833374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016608, + "balance_loss_mlp": 1.00664246, + "epoch": 0.6933435936898807, + "flos": 1447460001792.0, + "grad_norm": 0.007581021196043067, + "language_loss": 0.75127101, + "learning_rate": 0.0002269928084385487, + "loss": 0.76143718, + "num_input_tokens_seen": 299312656, + "router_z_loss_mlp": 0.09960938, + "step": 3604, + "time_per_iteration": 4.666281223297119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071493, + "balance_loss_mlp": 1.03739882, + "epoch": 0.6935359753751443, + "flos": 540639442944.0, + "grad_norm": 0.05365572329513203, + "language_loss": 0.84348619, + "learning_rate": 0.0002267318588424379, + "loss": 0.85420108, + "num_input_tokens_seen": 299381136, + "router_z_loss_mlp": 0.34130859, + "step": 3605, + "time_per_iteration": 2.5876171588897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071367, + "balance_loss_mlp": 1.03755951, + "epoch": 0.6937283570604078, + "flos": 719074146816.0, + "grad_norm": 0.0635324341399035, + "language_loss": 0.87573755, + "learning_rate": 0.00022647101533842845, + "loss": 0.8864513, + "num_input_tokens_seen": 299455216, + "router_z_loss_mlp": 0.33837891, + "step": 3606, + "time_per_iteration": 2.873445510864258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072563, + "balance_loss_mlp": 1.03825426, + "epoch": 0.6939207387456714, + "flos": 521909443584.0, + "grad_norm": 0.05554055490203988, + "language_loss": 0.76844239, + "learning_rate": 0.00022621027802778872, + "loss": 0.77916795, + "num_input_tokens_seen": 299524352, + "router_z_loss_mlp": 0.34350586, + "step": 3607, + "time_per_iteration": 2.607332706451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074645, + "balance_loss_mlp": 1.04086149, + "epoch": 0.694113120430935, + "flos": 535100318208.0, + "grad_norm": 0.058788257779223134, + "language_loss": 0.78766942, + "learning_rate": 0.00022594964701174586, + "loss": 0.79841584, + "num_input_tokens_seen": 299594960, + "router_z_loss_mlp": 0.33813477, + "step": 3608, + "time_per_iteration": 2.6019680500030518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074213, + "balance_loss_mlp": 1.03985715, + "epoch": 0.6943055021161986, + "flos": 523101072384.0, + "grad_norm": 0.052336959457674984, + "language_loss": 0.84605336, + "learning_rate": 0.00022568912239148586, + "loss": 0.85679555, + "num_input_tokens_seen": 299662560, + "router_z_loss_mlp": 0.34399414, + "step": 3609, + "time_per_iteration": 2.6037116050720215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073862, + "balance_loss_mlp": 1.03943467, + "epoch": 0.694497883801462, + "flos": 484637119488.0, + "grad_norm": 0.05428318108102923, + "language_loss": 0.81688815, + "learning_rate": 0.00022542870426815344, + "loss": 0.82762676, + "num_input_tokens_seen": 299734896, + "router_z_loss_mlp": 0.34472656, + "step": 3610, + "time_per_iteration": 2.723229169845581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080613, + "balance_loss_mlp": 1.04518366, + "epoch": 0.6946902654867256, + "flos": 461238119424.0, + "grad_norm": 0.06119674491487997, + "language_loss": 0.86244833, + "learning_rate": 0.00022516839274285173, + "loss": 0.87325442, + "num_input_tokens_seen": 299799424, + "router_z_loss_mlp": 0.35449219, + "step": 3611, + "time_per_iteration": 2.540647268295288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073799, + "balance_loss_mlp": 1.03832269, + "epoch": 0.6948826471719892, + "flos": 512603375616.0, + "grad_norm": 0.054515273937313154, + "language_loss": 0.74971861, + "learning_rate": 0.00022490818791664265, + "loss": 0.76045656, + "num_input_tokens_seen": 299868272, + "router_z_loss_mlp": 0.35522461, + "step": 3612, + "time_per_iteration": 2.577448844909668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074613, + "balance_loss_mlp": 1.03989887, + "epoch": 0.6950750288572528, + "flos": 556917783552.0, + "grad_norm": 0.04771365069249161, + "language_loss": 0.85378981, + "learning_rate": 0.00022464808989054676, + "loss": 0.86453593, + "num_input_tokens_seen": 299939136, + "router_z_loss_mlp": 0.34741211, + "step": 3613, + "time_per_iteration": 2.6405351161956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071033, + "balance_loss_mlp": 1.03646183, + "epoch": 0.6952674105425164, + "flos": 542215185408.0, + "grad_norm": 0.06079183455352582, + "language_loss": 0.75739813, + "learning_rate": 0.00022438809876554284, + "loss": 0.76810849, + "num_input_tokens_seen": 300009472, + "router_z_loss_mlp": 0.34594727, + "step": 3614, + "time_per_iteration": 2.613945484161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075685, + "balance_loss_mlp": 1.04128122, + "epoch": 0.6954597922277799, + "flos": 546465166848.0, + "grad_norm": 0.05561683748761922, + "language_loss": 0.80328143, + "learning_rate": 0.00022412821464256873, + "loss": 0.81403828, + "num_input_tokens_seen": 300081008, + "router_z_loss_mlp": 0.34448242, + "step": 3615, + "time_per_iteration": 2.7260682582855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073152, + "balance_loss_mlp": 1.03922486, + "epoch": 0.6956521739130435, + "flos": 519255553536.0, + "grad_norm": 0.0593468724066596, + "language_loss": 0.82113886, + "learning_rate": 0.00022386843762252023, + "loss": 0.83187044, + "num_input_tokens_seen": 300149856, + "router_z_loss_mlp": 0.33959961, + "step": 3616, + "time_per_iteration": 2.6294190883636475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070918, + "balance_loss_mlp": 1.03622794, + "epoch": 0.695844555598307, + "flos": 466029365760.0, + "grad_norm": 0.055313153128714786, + "language_loss": 0.79384601, + "learning_rate": 0.00022360876780625193, + "loss": 0.80455518, + "num_input_tokens_seen": 300217344, + "router_z_loss_mlp": 0.34741211, + "step": 3617, + "time_per_iteration": 2.590061664581299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071648, + "balance_loss_mlp": 1.03741097, + "epoch": 0.6960369372835706, + "flos": 600347510784.0, + "grad_norm": 0.044171001480645455, + "language_loss": 0.79755616, + "learning_rate": 0.00022334920529457604, + "loss": 0.8082726, + "num_input_tokens_seen": 300305584, + "router_z_loss_mlp": 0.34277344, + "step": 3618, + "time_per_iteration": 2.9306209087371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071307, + "balance_loss_mlp": 1.0369513, + "epoch": 0.6962293189688342, + "flos": 643927186944.0, + "grad_norm": 0.0535379410757751, + "language_loss": 0.87326622, + "learning_rate": 0.00022308975018826423, + "loss": 0.88397926, + "num_input_tokens_seen": 300386480, + "router_z_loss_mlp": 0.34399414, + "step": 3619, + "time_per_iteration": 2.888936758041382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074623, + "balance_loss_mlp": 1.03967083, + "epoch": 0.6964217006540977, + "flos": 638524864512.0, + "grad_norm": 0.061080983554533244, + "language_loss": 0.84665489, + "learning_rate": 0.00022283040258804564, + "loss": 0.85740113, + "num_input_tokens_seen": 300461840, + "router_z_loss_mlp": 0.34985352, + "step": 3620, + "time_per_iteration": 2.777407169342041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073748, + "balance_loss_mlp": 1.04005957, + "epoch": 0.6966140823393613, + "flos": 651864125952.0, + "grad_norm": 0.05227227103704651, + "language_loss": 0.83467555, + "learning_rate": 0.00022257116259460802, + "loss": 0.84541297, + "num_input_tokens_seen": 300540400, + "router_z_loss_mlp": 0.3371582, + "step": 3621, + "time_per_iteration": 2.8371803760528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107145, + "balance_loss_mlp": 1.03802419, + "epoch": 0.6968064640246249, + "flos": 704160552960.0, + "grad_norm": 0.054247578312955166, + "language_loss": 0.8137657, + "learning_rate": 0.00022231203030859725, + "loss": 0.82448018, + "num_input_tokens_seen": 300624240, + "router_z_loss_mlp": 0.33447266, + "step": 3622, + "time_per_iteration": 2.9509494304656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077752, + "balance_loss_mlp": 1.04361081, + "epoch": 0.6969988457098885, + "flos": 492312190464.0, + "grad_norm": 0.06806535076017864, + "language_loss": 0.83473521, + "learning_rate": 0.00022205300583061737, + "loss": 0.84551275, + "num_input_tokens_seen": 300689728, + "router_z_loss_mlp": 0.34179688, + "step": 3623, + "time_per_iteration": 2.564910888671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006317, + "balance_loss_mlp": 0.99630374, + "epoch": 0.6971912273951519, + "flos": 1351839974400.0, + "grad_norm": 0.005946878920226346, + "language_loss": 0.82838202, + "learning_rate": 0.00022179408926123063, + "loss": 0.83844519, + "num_input_tokens_seen": 300913152, + "router_z_loss_mlp": 0.10009766, + "step": 3624, + "time_per_iteration": 4.894897937774658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107407, + "balance_loss_mlp": 1.04030991, + "epoch": 0.6973836090804155, + "flos": 602182301184.0, + "grad_norm": 0.052322011442081255, + "language_loss": 0.77296048, + "learning_rate": 0.00022153528070095735, + "loss": 0.78370118, + "num_input_tokens_seen": 300985824, + "router_z_loss_mlp": 0.33789062, + "step": 3625, + "time_per_iteration": 2.6873764991760254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074827, + "balance_loss_mlp": 1.04056633, + "epoch": 0.6975759907656791, + "flos": 523805280768.0, + "grad_norm": 0.05344661809943597, + "language_loss": 0.88087487, + "learning_rate": 0.00022127658025027568, + "loss": 0.89162308, + "num_input_tokens_seen": 301058048, + "router_z_loss_mlp": 0.34301758, + "step": 3626, + "time_per_iteration": 2.6872076988220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077609, + "balance_loss_mlp": 1.04291928, + "epoch": 0.6977683724509427, + "flos": 480672327168.0, + "grad_norm": 0.05134929974551719, + "language_loss": 0.84773469, + "learning_rate": 0.00022101798800962258, + "loss": 0.85851079, + "num_input_tokens_seen": 301127472, + "router_z_loss_mlp": 0.34741211, + "step": 3627, + "time_per_iteration": 2.592256546020508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074479, + "balance_loss_mlp": 1.03933573, + "epoch": 0.6979607541362063, + "flos": 522372132864.0, + "grad_norm": 0.06417164030840651, + "language_loss": 0.78953862, + "learning_rate": 0.00022075950407939227, + "loss": 0.80028337, + "num_input_tokens_seen": 301193920, + "router_z_loss_mlp": 0.35180664, + "step": 3628, + "time_per_iteration": 2.616570234298706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107407, + "balance_loss_mlp": 1.04023814, + "epoch": 0.6981531358214698, + "flos": 547818329088.0, + "grad_norm": 0.05532420233787888, + "language_loss": 0.82282603, + "learning_rate": 0.0002205011285599367, + "loss": 0.83356667, + "num_input_tokens_seen": 301264256, + "router_z_loss_mlp": 0.33862305, + "step": 3629, + "time_per_iteration": 2.612488269805908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073299, + "balance_loss_mlp": 1.03925288, + "epoch": 0.6983455175067333, + "flos": 699747628032.0, + "grad_norm": 0.05532386422624981, + "language_loss": 0.80727249, + "learning_rate": 0.00022024286155156658, + "loss": 0.8180055, + "num_input_tokens_seen": 301337696, + "router_z_loss_mlp": 0.34082031, + "step": 3630, + "time_per_iteration": 2.8387677669525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070371, + "balance_loss_mlp": 1.03632545, + "epoch": 0.6985378991919969, + "flos": 484819001856.0, + "grad_norm": 0.047952910030837306, + "language_loss": 0.85720146, + "learning_rate": 0.00021998470315454994, + "loss": 0.8679052, + "num_input_tokens_seen": 301407776, + "router_z_loss_mlp": 0.34057617, + "step": 3631, + "time_per_iteration": 2.635730743408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071863, + "balance_loss_mlp": 1.03843713, + "epoch": 0.6987302808772605, + "flos": 558503700480.0, + "grad_norm": 0.05280665579931524, + "language_loss": 0.86521721, + "learning_rate": 0.00021972665346911275, + "loss": 0.87593591, + "num_input_tokens_seen": 301475120, + "router_z_loss_mlp": 0.33447266, + "step": 3632, + "time_per_iteration": 2.668616771697998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071923, + "balance_loss_mlp": 1.03763855, + "epoch": 0.698922662562524, + "flos": 483350948352.0, + "grad_norm": 0.05402222352143004, + "language_loss": 0.79431093, + "learning_rate": 0.00021946871259543877, + "loss": 0.80503017, + "num_input_tokens_seen": 301542416, + "router_z_loss_mlp": 0.34326172, + "step": 3633, + "time_per_iteration": 2.580191135406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068198, + "balance_loss_mlp": 1.03486705, + "epoch": 0.6991150442477876, + "flos": 718586726400.0, + "grad_norm": 0.05023014316790998, + "language_loss": 0.8304534, + "learning_rate": 0.00021921088063366957, + "loss": 0.84113538, + "num_input_tokens_seen": 301620672, + "router_z_loss_mlp": 0.33349609, + "step": 3634, + "time_per_iteration": 2.9607045650482178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067291, + "balance_loss_mlp": 1.03384113, + "epoch": 0.6993074259330512, + "flos": 488871134208.0, + "grad_norm": 0.05127346508888132, + "language_loss": 0.8176077, + "learning_rate": 0.00021895315768390435, + "loss": 0.82828063, + "num_input_tokens_seen": 301688016, + "router_z_loss_mlp": 0.3347168, + "step": 3635, + "time_per_iteration": 2.585498332977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107047, + "balance_loss_mlp": 1.03651941, + "epoch": 0.6994998076183148, + "flos": 717745715712.0, + "grad_norm": 0.04635500593717234, + "language_loss": 0.87909687, + "learning_rate": 0.00021869554384619999, + "loss": 0.88980162, + "num_input_tokens_seen": 301771184, + "router_z_loss_mlp": 0.33959961, + "step": 3636, + "time_per_iteration": 2.968268394470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074967, + "balance_loss_mlp": 1.0413022, + "epoch": 0.6996921893035783, + "flos": 578730866688.0, + "grad_norm": 0.05835542586274351, + "language_loss": 0.80754793, + "learning_rate": 0.00021843803922057115, + "loss": 0.81829762, + "num_input_tokens_seen": 301844528, + "router_z_loss_mlp": 0.33691406, + "step": 3637, + "time_per_iteration": 2.7109100818634033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068155, + "balance_loss_mlp": 1.0351578, + "epoch": 0.6998845709888418, + "flos": 518369462784.0, + "grad_norm": 0.06833550802909422, + "language_loss": 0.81533343, + "learning_rate": 0.00021818064390698977, + "loss": 0.826015, + "num_input_tokens_seen": 301914960, + "router_z_loss_mlp": 0.33007812, + "step": 3638, + "time_per_iteration": 2.5944924354553223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071173, + "balance_loss_mlp": 1.03726995, + "epoch": 0.7000769526741054, + "flos": 620666399232.0, + "grad_norm": 0.05517026065702434, + "language_loss": 0.86890268, + "learning_rate": 0.0002179233580053861, + "loss": 0.87961447, + "num_input_tokens_seen": 301986352, + "router_z_loss_mlp": 0.33935547, + "step": 3639, + "time_per_iteration": 2.7613229751586914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070142, + "balance_loss_mlp": 1.03652453, + "epoch": 0.700269334359369, + "flos": 559670598144.0, + "grad_norm": 0.13465593059658462, + "language_loss": 0.85617924, + "learning_rate": 0.00021766618161564688, + "loss": 0.86688066, + "num_input_tokens_seen": 302060544, + "router_z_loss_mlp": 0.33642578, + "step": 3640, + "time_per_iteration": 2.7400569915771484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071036, + "balance_loss_mlp": 1.0372045, + "epoch": 0.7004617160446326, + "flos": 483090490368.0, + "grad_norm": 0.051527698047250534, + "language_loss": 0.87097609, + "learning_rate": 0.00021740911483761677, + "loss": 0.88168645, + "num_input_tokens_seen": 302127232, + "router_z_loss_mlp": 0.33862305, + "step": 3641, + "time_per_iteration": 2.5464553833007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107241, + "balance_loss_mlp": 1.0389359, + "epoch": 0.7006540977298961, + "flos": 696647015424.0, + "grad_norm": 0.04496743490694548, + "language_loss": 0.91822404, + "learning_rate": 0.00021715215777109837, + "loss": 0.92894816, + "num_input_tokens_seen": 302207056, + "router_z_loss_mlp": 0.33496094, + "step": 3642, + "time_per_iteration": 2.9422945976257324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068413, + "balance_loss_mlp": 1.03477192, + "epoch": 0.7008464794151597, + "flos": 504528224256.0, + "grad_norm": 0.053490842325032185, + "language_loss": 0.84272158, + "learning_rate": 0.00021689531051585103, + "loss": 0.85340571, + "num_input_tokens_seen": 302275632, + "router_z_loss_mlp": 0.33642578, + "step": 3643, + "time_per_iteration": 2.609464406967163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069383, + "balance_loss_mlp": 1.03421593, + "epoch": 0.7010388611004232, + "flos": 536985980928.0, + "grad_norm": 0.06575198455651811, + "language_loss": 0.79940069, + "learning_rate": 0.00021663857317159196, + "loss": 0.81009454, + "num_input_tokens_seen": 302343600, + "router_z_loss_mlp": 0.35229492, + "step": 3644, + "time_per_iteration": 2.652776002883911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074301, + "balance_loss_mlp": 1.04006386, + "epoch": 0.7012312427856868, + "flos": 546996257280.0, + "grad_norm": 0.05180675245879084, + "language_loss": 0.8175106, + "learning_rate": 0.00021638194583799487, + "loss": 0.82825363, + "num_input_tokens_seen": 302414656, + "router_z_loss_mlp": 0.3425293, + "step": 3645, + "time_per_iteration": 2.647700071334839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072403, + "balance_loss_mlp": 1.03785658, + "epoch": 0.7014236244709504, + "flos": 941020125696.0, + "grad_norm": 0.0581240827613666, + "language_loss": 0.82057631, + "learning_rate": 0.00021612542861469176, + "loss": 0.83130032, + "num_input_tokens_seen": 302495120, + "router_z_loss_mlp": 0.34594727, + "step": 3646, + "time_per_iteration": 3.1926403045654297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070686, + "balance_loss_mlp": 1.03644955, + "epoch": 0.7016160061562139, + "flos": 524908159488.0, + "grad_norm": 0.05426451368259885, + "language_loss": 0.82171357, + "learning_rate": 0.00021586902160127135, + "loss": 0.83242047, + "num_input_tokens_seen": 302563024, + "router_z_loss_mlp": 0.34277344, + "step": 3647, + "time_per_iteration": 2.5836267471313477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074201, + "balance_loss_mlp": 1.03967857, + "epoch": 0.7018083878414775, + "flos": 373170917376.0, + "grad_norm": 0.07691887625237197, + "language_loss": 0.73860252, + "learning_rate": 0.00021561272489727974, + "loss": 0.74934447, + "num_input_tokens_seen": 302624544, + "router_z_loss_mlp": 0.34570312, + "step": 3648, + "time_per_iteration": 2.426370143890381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068543, + "balance_loss_mlp": 1.03518772, + "epoch": 0.7020007695267411, + "flos": 527522761728.0, + "grad_norm": 0.07653563490177187, + "language_loss": 0.80320156, + "learning_rate": 0.0002153565386022199, + "loss": 0.813887, + "num_input_tokens_seen": 302697856, + "router_z_loss_mlp": 0.33374023, + "step": 3649, + "time_per_iteration": 2.6524124145507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073058, + "balance_loss_mlp": 1.03822541, + "epoch": 0.7021931512120047, + "flos": 689850832896.0, + "grad_norm": 0.0770521311839047, + "language_loss": 0.82439005, + "learning_rate": 0.00021510046281555262, + "loss": 0.83512068, + "num_input_tokens_seen": 302771984, + "router_z_loss_mlp": 0.34887695, + "step": 3650, + "time_per_iteration": 2.796095609664917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069064, + "balance_loss_mlp": 1.03497052, + "epoch": 0.7023855328972681, + "flos": 639499705344.0, + "grad_norm": 0.07628366219259466, + "language_loss": 0.81408215, + "learning_rate": 0.0002148444976366949, + "loss": 0.82477278, + "num_input_tokens_seen": 302838832, + "router_z_loss_mlp": 0.34130859, + "step": 3651, + "time_per_iteration": 2.7908504009246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071482, + "balance_loss_mlp": 1.03760242, + "epoch": 0.7025779145825317, + "flos": 560674552320.0, + "grad_norm": 0.06297036166850548, + "language_loss": 0.82553816, + "learning_rate": 0.00021458864316502136, + "loss": 0.83625293, + "num_input_tokens_seen": 302909952, + "router_z_loss_mlp": 0.33911133, + "step": 3652, + "time_per_iteration": 2.7136270999908447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073226, + "balance_loss_mlp": 1.03927469, + "epoch": 0.7027702962677953, + "flos": 447214998528.0, + "grad_norm": 0.0549303916698645, + "language_loss": 0.87089896, + "learning_rate": 0.0002143328994998634, + "loss": 0.88163126, + "num_input_tokens_seen": 302973056, + "router_z_loss_mlp": 0.33959961, + "step": 3653, + "time_per_iteration": 2.4819934368133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071057, + "balance_loss_mlp": 1.03603339, + "epoch": 0.7029626779530589, + "flos": 622198471680.0, + "grad_norm": 0.05753095633291236, + "language_loss": 0.78409469, + "learning_rate": 0.00021407726674050982, + "loss": 0.79480523, + "num_input_tokens_seen": 303054656, + "router_z_loss_mlp": 0.35058594, + "step": 3654, + "time_per_iteration": 2.839901924133301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077723, + "balance_loss_mlp": 1.04312825, + "epoch": 0.7031550596383225, + "flos": 629307546624.0, + "grad_norm": 0.04660069709874721, + "language_loss": 0.87104034, + "learning_rate": 0.0002138217449862061, + "loss": 0.88181752, + "num_input_tokens_seen": 303124256, + "router_z_loss_mlp": 0.34619141, + "step": 3655, + "time_per_iteration": 2.729714870452881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074113, + "balance_loss_mlp": 1.04001868, + "epoch": 0.703347441323586, + "flos": 530589878784.0, + "grad_norm": 0.04994580933868796, + "language_loss": 0.78216398, + "learning_rate": 0.00021356633433615403, + "loss": 0.79290509, + "num_input_tokens_seen": 303192720, + "router_z_loss_mlp": 0.34130859, + "step": 3656, + "time_per_iteration": 2.578078031539917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074299, + "balance_loss_mlp": 1.04044342, + "epoch": 0.7035398230088495, + "flos": 693264185856.0, + "grad_norm": 0.0479106829759696, + "language_loss": 0.83245599, + "learning_rate": 0.0002133110348895133, + "loss": 0.84319901, + "num_input_tokens_seen": 303275968, + "router_z_loss_mlp": 0.33862305, + "step": 3657, + "time_per_iteration": 2.9648847579956055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068496, + "balance_loss_mlp": 1.03537953, + "epoch": 0.7037322046941131, + "flos": 967628837376.0, + "grad_norm": 0.048159657931533775, + "language_loss": 0.84623647, + "learning_rate": 0.0002130558467453999, + "loss": 0.85692137, + "num_input_tokens_seen": 303367296, + "router_z_loss_mlp": 0.33129883, + "step": 3658, + "time_per_iteration": 3.3155901432037354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069074, + "balance_loss_mlp": 1.03514767, + "epoch": 0.7039245863793767, + "flos": 502598891520.0, + "grad_norm": 0.045313539316245835, + "language_loss": 0.84409332, + "learning_rate": 0.0002128007700028865, + "loss": 0.85478401, + "num_input_tokens_seen": 303442768, + "router_z_loss_mlp": 0.33959961, + "step": 3659, + "time_per_iteration": 2.7024378776550293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072456, + "balance_loss_mlp": 1.03926849, + "epoch": 0.7041169680646402, + "flos": 465709271040.0, + "grad_norm": 0.056824645226565565, + "language_loss": 0.84162152, + "learning_rate": 0.00021254580476100276, + "loss": 0.85234612, + "num_input_tokens_seen": 303508304, + "router_z_loss_mlp": 0.33203125, + "step": 3660, + "time_per_iteration": 2.5560450553894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074809, + "balance_loss_mlp": 1.04097748, + "epoch": 0.7043093497499038, + "flos": 631897417728.0, + "grad_norm": 0.07471330414673147, + "language_loss": 0.78714609, + "learning_rate": 0.00021229095111873497, + "loss": 0.79789412, + "num_input_tokens_seen": 303579312, + "router_z_loss_mlp": 0.33862305, + "step": 3661, + "time_per_iteration": 2.7691423892974854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070997, + "balance_loss_mlp": 1.03704596, + "epoch": 0.7045017314351674, + "flos": 542639996928.0, + "grad_norm": 0.04471074658603975, + "language_loss": 0.86054224, + "learning_rate": 0.0002120362091750261, + "loss": 0.87125218, + "num_input_tokens_seen": 303658384, + "router_z_loss_mlp": 0.33984375, + "step": 3662, + "time_per_iteration": 2.7782440185546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073091, + "balance_loss_mlp": 1.03883076, + "epoch": 0.704694113120431, + "flos": 428012135424.0, + "grad_norm": 0.05523093470828303, + "language_loss": 0.86868262, + "learning_rate": 0.00021178157902877566, + "loss": 0.8794136, + "num_input_tokens_seen": 303721136, + "router_z_loss_mlp": 0.34301758, + "step": 3663, + "time_per_iteration": 2.440488815307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070446, + "balance_loss_mlp": 1.03682911, + "epoch": 0.7048864948056945, + "flos": 650253477888.0, + "grad_norm": 0.07482453920379879, + "language_loss": 0.87160063, + "learning_rate": 0.0002115270607788397, + "loss": 0.88230515, + "num_input_tokens_seen": 303792368, + "router_z_loss_mlp": 0.33642578, + "step": 3664, + "time_per_iteration": 2.760225772857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073203, + "balance_loss_mlp": 1.04015791, + "epoch": 0.705078876490958, + "flos": 412330314240.0, + "grad_norm": 0.05762286530441703, + "language_loss": 0.85702121, + "learning_rate": 0.00021127265452403133, + "loss": 0.86775321, + "num_input_tokens_seen": 303856336, + "router_z_loss_mlp": 0.33032227, + "step": 3665, + "time_per_iteration": 2.561060905456543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007528, + "balance_loss_mlp": 0.99813432, + "epoch": 0.7052712581762216, + "flos": 1419266783232.0, + "grad_norm": 0.0045947469063837235, + "language_loss": 0.84091628, + "learning_rate": 0.0002110183603631199, + "loss": 0.85099161, + "num_input_tokens_seen": 304089856, + "router_z_loss_mlp": 0.09375, + "step": 3666, + "time_per_iteration": 4.89429235458374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069693, + "balance_loss_mlp": 1.03714871, + "epoch": 0.7054636398614852, + "flos": 492795228672.0, + "grad_norm": 0.08921720435757349, + "language_loss": 0.82764697, + "learning_rate": 0.00021076417839483065, + "loss": 0.83834386, + "num_input_tokens_seen": 304164752, + "router_z_loss_mlp": 0.32543945, + "step": 3667, + "time_per_iteration": 2.768646240234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073838, + "balance_loss_mlp": 1.04010153, + "epoch": 0.7056560215467488, + "flos": 450228271104.0, + "grad_norm": 0.04427607909576538, + "language_loss": 0.85058916, + "learning_rate": 0.00021051010871784589, + "loss": 0.86132753, + "num_input_tokens_seen": 304229568, + "router_z_loss_mlp": 0.33764648, + "step": 3668, + "time_per_iteration": 2.567970037460327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068482, + "balance_loss_mlp": 1.03462708, + "epoch": 0.7058484032320124, + "flos": 565426510848.0, + "grad_norm": 0.048767729933519285, + "language_loss": 0.78747618, + "learning_rate": 0.0002102561514308045, + "loss": 0.79816097, + "num_input_tokens_seen": 304299408, + "router_z_loss_mlp": 0.33886719, + "step": 3669, + "time_per_iteration": 2.7534899711608887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069827, + "balance_loss_mlp": 1.03635263, + "epoch": 0.7060407849172758, + "flos": 566736003072.0, + "grad_norm": 0.04982032344187492, + "language_loss": 0.82456899, + "learning_rate": 0.00021000230663230135, + "loss": 0.83526719, + "num_input_tokens_seen": 304367936, + "router_z_loss_mlp": 0.33496094, + "step": 3670, + "time_per_iteration": 2.6715986728668213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070311, + "balance_loss_mlp": 1.03655052, + "epoch": 0.7062331666025394, + "flos": 468505755648.0, + "grad_norm": 0.07243344373146629, + "language_loss": 0.82818425, + "learning_rate": 0.00020974857442088762, + "loss": 0.83888733, + "num_input_tokens_seen": 304438368, + "router_z_loss_mlp": 0.33789062, + "step": 3671, + "time_per_iteration": 2.5750696659088135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072721, + "balance_loss_mlp": 1.03896141, + "epoch": 0.706425548287803, + "flos": 595042702848.0, + "grad_norm": 0.061680604914147966, + "language_loss": 0.88855779, + "learning_rate": 0.00020949495489507104, + "loss": 0.89928508, + "num_input_tokens_seen": 304508720, + "router_z_loss_mlp": 0.33789062, + "step": 3672, + "time_per_iteration": 2.6669857501983643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070135, + "balance_loss_mlp": 1.03680396, + "epoch": 0.7066179299730666, + "flos": 475566778368.0, + "grad_norm": 0.055232709126585705, + "language_loss": 0.8461234, + "learning_rate": 0.00020924144815331525, + "loss": 0.85682476, + "num_input_tokens_seen": 304576128, + "router_z_loss_mlp": 0.33349609, + "step": 3673, + "time_per_iteration": 2.5462799072265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067992, + "balance_loss_mlp": 1.03451765, + "epoch": 0.7068103116583301, + "flos": 506153428992.0, + "grad_norm": 0.061788729653189316, + "language_loss": 0.82846355, + "learning_rate": 0.00020898805429404044, + "loss": 0.83914346, + "num_input_tokens_seen": 304642416, + "router_z_loss_mlp": 0.33496094, + "step": 3674, + "time_per_iteration": 2.5948987007141113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073379, + "balance_loss_mlp": 1.03880787, + "epoch": 0.7070026933435937, + "flos": 679028659200.0, + "grad_norm": 0.053331350399745237, + "language_loss": 0.78217506, + "learning_rate": 0.0002087347734156228, + "loss": 0.79290879, + "num_input_tokens_seen": 304719312, + "router_z_loss_mlp": 0.34619141, + "step": 3675, + "time_per_iteration": 2.8384974002838135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069694, + "balance_loss_mlp": 1.0364821, + "epoch": 0.7071950750288573, + "flos": 471981717504.0, + "grad_norm": 0.04797263488188438, + "language_loss": 0.79430759, + "learning_rate": 0.00020848160561639452, + "loss": 0.8050046, + "num_input_tokens_seen": 304789296, + "router_z_loss_mlp": 0.33227539, + "step": 3676, + "time_per_iteration": 2.6169028282165527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067947, + "balance_loss_mlp": 1.03406775, + "epoch": 0.7073874567141208, + "flos": 473507997696.0, + "grad_norm": 0.04772517856798178, + "language_loss": 0.85496527, + "learning_rate": 0.0002082285509946445, + "loss": 0.86564475, + "num_input_tokens_seen": 304854320, + "router_z_loss_mlp": 0.33911133, + "step": 3677, + "time_per_iteration": 2.536482334136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070461, + "balance_loss_mlp": 1.03562784, + "epoch": 0.7075798383993844, + "flos": 545589250560.0, + "grad_norm": 0.05597865502328579, + "language_loss": 0.83377022, + "learning_rate": 0.00020797560964861683, + "loss": 0.84447479, + "num_input_tokens_seen": 304932784, + "router_z_loss_mlp": 0.34887695, + "step": 3678, + "time_per_iteration": 2.7888569831848145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070957, + "balance_loss_mlp": 1.03765035, + "epoch": 0.7077722200846479, + "flos": 661766713344.0, + "grad_norm": 0.05495651688887883, + "language_loss": 0.80313671, + "learning_rate": 0.0002077227816765122, + "loss": 0.81384623, + "num_input_tokens_seen": 305018080, + "router_z_loss_mlp": 0.33325195, + "step": 3679, + "time_per_iteration": 3.0229249000549316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009856, + "balance_loss_mlp": 1.00065279, + "epoch": 0.7079646017699115, + "flos": 1529128129536.0, + "grad_norm": 0.00795907908422284, + "language_loss": 0.76447725, + "learning_rate": 0.0002074700671764869, + "loss": 0.77457583, + "num_input_tokens_seen": 305241216, + "router_z_loss_mlp": 0.09179688, + "step": 3680, + "time_per_iteration": 4.766546249389648 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066391, + "balance_loss_mlp": 1.03317952, + "epoch": 0.7081569834551751, + "flos": 621217838592.0, + "grad_norm": 0.05324470770264926, + "language_loss": 0.78516078, + "learning_rate": 0.00020721746624665383, + "loss": 0.79582465, + "num_input_tokens_seen": 305311376, + "router_z_loss_mlp": 0.33203125, + "step": 3681, + "time_per_iteration": 2.7075722217559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065664, + "balance_loss_mlp": 1.03199935, + "epoch": 0.7083493651404387, + "flos": 794280743424.0, + "grad_norm": 0.05089131854365718, + "language_loss": 0.79764175, + "learning_rate": 0.00020696497898508114, + "loss": 0.80829841, + "num_input_tokens_seen": 305392736, + "router_z_loss_mlp": 0.33691406, + "step": 3682, + "time_per_iteration": 2.9950366020202637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066323, + "balance_loss_mlp": 1.03165746, + "epoch": 0.7085417468257021, + "flos": 813394856448.0, + "grad_norm": 0.05983793282747749, + "language_loss": 0.7766552, + "learning_rate": 0.00020671260548979316, + "loss": 0.78731841, + "num_input_tokens_seen": 305470896, + "router_z_loss_mlp": 0.34716797, + "step": 3683, + "time_per_iteration": 2.986528158187866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069107, + "balance_loss_mlp": 1.03503704, + "epoch": 0.7087341285109657, + "flos": 700259779584.0, + "grad_norm": 0.07395200120023371, + "language_loss": 0.84964406, + "learning_rate": 0.00020646034585876982, + "loss": 0.86033517, + "num_input_tokens_seen": 305547072, + "router_z_loss_mlp": 0.34106445, + "step": 3684, + "time_per_iteration": 2.801340341567993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068964, + "balance_loss_mlp": 1.03467929, + "epoch": 0.7089265101962293, + "flos": 596211010560.0, + "grad_norm": 0.047359686788279315, + "language_loss": 0.84225708, + "learning_rate": 0.00020620820018994718, + "loss": 0.85294676, + "num_input_tokens_seen": 305624512, + "router_z_loss_mlp": 0.34301758, + "step": 3685, + "time_per_iteration": 2.8521230220794678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069096, + "balance_loss_mlp": 1.03438258, + "epoch": 0.7091188918814929, + "flos": 486842876928.0, + "grad_norm": 0.05746562851929707, + "language_loss": 0.82886755, + "learning_rate": 0.00020595616858121675, + "loss": 0.8395586, + "num_input_tokens_seen": 305695088, + "router_z_loss_mlp": 0.34765625, + "step": 3686, + "time_per_iteration": 2.7113983631134033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064604, + "balance_loss_mlp": 1.03034306, + "epoch": 0.7093112735667565, + "flos": 599833949184.0, + "grad_norm": 0.05104944796705689, + "language_loss": 0.80622023, + "learning_rate": 0.00020570425113042586, + "loss": 0.81686622, + "num_input_tokens_seen": 305763680, + "router_z_loss_mlp": 0.34277344, + "step": 3687, + "time_per_iteration": 2.712451457977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066788, + "balance_loss_mlp": 1.03293276, + "epoch": 0.70950365525202, + "flos": 505577258496.0, + "grad_norm": 0.05729403369858188, + "language_loss": 0.85692352, + "learning_rate": 0.0002054524479353776, + "loss": 0.86759138, + "num_input_tokens_seen": 305835008, + "router_z_loss_mlp": 0.33886719, + "step": 3688, + "time_per_iteration": 2.6377811431884766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068463, + "balance_loss_mlp": 1.03446496, + "epoch": 0.7096960369372836, + "flos": 731846002176.0, + "grad_norm": 0.05774020478713443, + "language_loss": 0.81201112, + "learning_rate": 0.00020520075909383063, + "loss": 0.82269579, + "num_input_tokens_seen": 305909072, + "router_z_loss_mlp": 0.34033203, + "step": 3689, + "time_per_iteration": 2.8854405879974365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068474, + "balance_loss_mlp": 1.03409433, + "epoch": 0.7098884186225471, + "flos": 971685351936.0, + "grad_norm": 0.048806563033970844, + "language_loss": 0.8087877, + "learning_rate": 0.00020494918470349916, + "loss": 0.81947243, + "num_input_tokens_seen": 305994752, + "router_z_loss_mlp": 0.34399414, + "step": 3690, + "time_per_iteration": 3.2719247341156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069117, + "balance_loss_mlp": 1.03518987, + "epoch": 0.7100808003078107, + "flos": 504001516032.0, + "grad_norm": 0.0562848132432342, + "language_loss": 0.85595727, + "learning_rate": 0.00020469772486205297, + "loss": 0.86664844, + "num_input_tokens_seen": 306062960, + "router_z_loss_mlp": 0.33959961, + "step": 3691, + "time_per_iteration": 2.599254608154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064349, + "balance_loss_mlp": 1.03018332, + "epoch": 0.7102731819930742, + "flos": 540073446912.0, + "grad_norm": 0.052398389551748005, + "language_loss": 0.81299037, + "learning_rate": 0.0002044463796671177, + "loss": 0.82363379, + "num_input_tokens_seen": 306134224, + "router_z_loss_mlp": 0.34204102, + "step": 3692, + "time_per_iteration": 2.6676712036132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068474, + "balance_loss_mlp": 1.03502345, + "epoch": 0.7104655636783378, + "flos": 620066907648.0, + "grad_norm": 0.05724464606399067, + "language_loss": 0.80306011, + "learning_rate": 0.00020419514921627408, + "loss": 0.8137449, + "num_input_tokens_seen": 306214512, + "router_z_loss_mlp": 0.3347168, + "step": 3693, + "time_per_iteration": 2.906092643737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071382, + "balance_loss_mlp": 1.03707361, + "epoch": 0.7106579453636014, + "flos": 557060378112.0, + "grad_norm": 0.04981428600794461, + "language_loss": 0.77017659, + "learning_rate": 0.00020394403360705855, + "loss": 0.78089035, + "num_input_tokens_seen": 306283232, + "router_z_loss_mlp": 0.34350586, + "step": 3694, + "time_per_iteration": 2.69543719291687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107094, + "balance_loss_mlp": 1.03634608, + "epoch": 0.710850327048865, + "flos": 512795432448.0, + "grad_norm": 0.05615701524037797, + "language_loss": 0.8807683, + "learning_rate": 0.00020369303293696228, + "loss": 0.8914777, + "num_input_tokens_seen": 306351536, + "router_z_loss_mlp": 0.34619141, + "step": 3695, + "time_per_iteration": 2.613211154937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072906, + "balance_loss_mlp": 1.03850234, + "epoch": 0.7110427087341286, + "flos": 423398389248.0, + "grad_norm": 0.05344233224786611, + "language_loss": 0.78265321, + "learning_rate": 0.00020344214730343304, + "loss": 0.79338229, + "num_input_tokens_seen": 306419040, + "router_z_loss_mlp": 0.34448242, + "step": 3696, + "time_per_iteration": 2.60355544090271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070854, + "balance_loss_mlp": 1.03687966, + "epoch": 0.711235090419392, + "flos": 577107072000.0, + "grad_norm": 0.05731164613368461, + "language_loss": 0.79340208, + "learning_rate": 0.00020319137680387296, + "loss": 0.80411065, + "num_input_tokens_seen": 306503248, + "router_z_loss_mlp": 0.34008789, + "step": 3697, + "time_per_iteration": 2.9248886108398438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071143, + "balance_loss_mlp": 1.03712082, + "epoch": 0.7114274721046556, + "flos": 447830456832.0, + "grad_norm": 0.06826664171711681, + "language_loss": 0.80587053, + "learning_rate": 0.0002029407215356398, + "loss": 0.81658196, + "num_input_tokens_seen": 306566288, + "router_z_loss_mlp": 0.34057617, + "step": 3698, + "time_per_iteration": 2.5251829624176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066516, + "balance_loss_mlp": 1.03304207, + "epoch": 0.7116198537899192, + "flos": 621680527872.0, + "grad_norm": 0.05434937939483776, + "language_loss": 0.83318967, + "learning_rate": 0.00020269018159604663, + "loss": 0.84385484, + "num_input_tokens_seen": 306633344, + "router_z_loss_mlp": 0.33496094, + "step": 3699, + "time_per_iteration": 2.6997692584991455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062784, + "balance_loss_mlp": 1.02921486, + "epoch": 0.7118122354751828, + "flos": 498476947968.0, + "grad_norm": 0.04823068648652618, + "language_loss": 0.81931448, + "learning_rate": 0.00020243975708236162, + "loss": 0.82994235, + "num_input_tokens_seen": 306701328, + "router_z_loss_mlp": 0.3359375, + "step": 3700, + "time_per_iteration": 2.5654993057250977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071384, + "balance_loss_mlp": 1.03717113, + "epoch": 0.7120046171604463, + "flos": 572438071296.0, + "grad_norm": 0.09878181502627377, + "language_loss": 0.85897946, + "learning_rate": 0.00020218944809180818, + "loss": 0.86969334, + "num_input_tokens_seen": 306773168, + "router_z_loss_mlp": 0.3425293, + "step": 3701, + "time_per_iteration": 2.7016773223876953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070567, + "balance_loss_mlp": 1.03661633, + "epoch": 0.7121969988457099, + "flos": 572388609024.0, + "grad_norm": 0.07221648962243508, + "language_loss": 0.8452931, + "learning_rate": 0.00020193925472156493, + "loss": 0.85599875, + "num_input_tokens_seen": 306845312, + "router_z_loss_mlp": 0.33984375, + "step": 3702, + "time_per_iteration": 2.6914734840393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01034328, + "balance_loss_mlp": 1.02545857, + "epoch": 0.7123893805309734, + "flos": 1522585050624.0, + "grad_norm": 0.022091327023181177, + "language_loss": 0.74289167, + "learning_rate": 0.00020168917706876537, + "loss": 0.75323498, + "num_input_tokens_seen": 307079216, + "router_z_loss_mlp": 0.08886719, + "step": 3703, + "time_per_iteration": 4.884379148483276 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066468, + "balance_loss_mlp": 1.03304124, + "epoch": 0.712581762216237, + "flos": 614779476480.0, + "grad_norm": 0.06545400953207585, + "language_loss": 0.83676839, + "learning_rate": 0.00020143921523049863, + "loss": 0.84743309, + "num_input_tokens_seen": 307163568, + "router_z_loss_mlp": 0.33447266, + "step": 3704, + "time_per_iteration": 2.9219436645507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106542, + "balance_loss_mlp": 1.03185105, + "epoch": 0.7127741439015006, + "flos": 597504536064.0, + "grad_norm": 0.06577771502635076, + "language_loss": 0.835908, + "learning_rate": 0.00020118936930380837, + "loss": 0.84656215, + "num_input_tokens_seen": 307232800, + "router_z_loss_mlp": 0.3359375, + "step": 3705, + "time_per_iteration": 2.6833901405334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070499, + "balance_loss_mlp": 1.03635776, + "epoch": 0.7129665255867641, + "flos": 537138749952.0, + "grad_norm": 0.05242920734791126, + "language_loss": 0.80929446, + "learning_rate": 0.0002009396393856932, + "loss": 0.81999946, + "num_input_tokens_seen": 307307216, + "router_z_loss_mlp": 0.34179688, + "step": 3706, + "time_per_iteration": 2.6226556301116943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107105, + "balance_loss_mlp": 1.03707516, + "epoch": 0.7131589072720277, + "flos": 526173981696.0, + "grad_norm": 0.05578991259827158, + "language_loss": 0.82312477, + "learning_rate": 0.00020069002557310673, + "loss": 0.8338353, + "num_input_tokens_seen": 307377472, + "router_z_loss_mlp": 0.34008789, + "step": 3707, + "time_per_iteration": 2.6535470485687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064608, + "balance_loss_mlp": 1.0319922, + "epoch": 0.7133512889572913, + "flos": 530626194432.0, + "grad_norm": 0.0741438657284304, + "language_loss": 0.77105689, + "learning_rate": 0.00020044052796295807, + "loss": 0.78170288, + "num_input_tokens_seen": 307456880, + "router_z_loss_mlp": 0.32617188, + "step": 3708, + "time_per_iteration": 2.787355899810791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066524, + "balance_loss_mlp": 1.03226364, + "epoch": 0.7135436706425549, + "flos": 503282750976.0, + "grad_norm": 0.05095203093874289, + "language_loss": 0.82020175, + "learning_rate": 0.00020019114665211063, + "loss": 0.83086699, + "num_input_tokens_seen": 307524784, + "router_z_loss_mlp": 0.34301758, + "step": 3709, + "time_per_iteration": 2.5732407569885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070401, + "balance_loss_mlp": 1.03645074, + "epoch": 0.7137360523278183, + "flos": 515719954944.0, + "grad_norm": 0.04941715658479687, + "language_loss": 0.81220102, + "learning_rate": 0.00019994188173738276, + "loss": 0.82290506, + "num_input_tokens_seen": 307591408, + "router_z_loss_mlp": 0.33984375, + "step": 3710, + "time_per_iteration": 2.5564064979553223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068107, + "balance_loss_mlp": 1.03398967, + "epoch": 0.7139284340130819, + "flos": 510103664640.0, + "grad_norm": 0.05502854520341245, + "language_loss": 0.80873179, + "learning_rate": 0.0001996927333155477, + "loss": 0.81941289, + "num_input_tokens_seen": 307662912, + "router_z_loss_mlp": 0.34155273, + "step": 3711, + "time_per_iteration": 2.732224225997925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071876, + "balance_loss_mlp": 1.03825879, + "epoch": 0.7141208156983455, + "flos": 889896388608.0, + "grad_norm": 0.05033741502761429, + "language_loss": 0.85233271, + "learning_rate": 0.00019944370148333346, + "loss": 0.86305141, + "num_input_tokens_seen": 307752256, + "router_z_loss_mlp": 0.33642578, + "step": 3712, + "time_per_iteration": 3.213644504547119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107149, + "balance_loss_mlp": 1.03827798, + "epoch": 0.7143131973836091, + "flos": 535504780800.0, + "grad_norm": 0.05173411094558013, + "language_loss": 0.79739279, + "learning_rate": 0.00019919478633742278, + "loss": 0.80810767, + "num_input_tokens_seen": 307821504, + "router_z_loss_mlp": 0.33227539, + "step": 3713, + "time_per_iteration": 2.7310914993286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072393, + "balance_loss_mlp": 1.03884721, + "epoch": 0.7145055790688727, + "flos": 473429422080.0, + "grad_norm": 0.04797356179200618, + "language_loss": 0.85098791, + "learning_rate": 0.00019894598797445302, + "loss": 0.86171186, + "num_input_tokens_seen": 307886464, + "router_z_loss_mlp": 0.33569336, + "step": 3714, + "time_per_iteration": 2.5128626823425293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071822, + "balance_loss_mlp": 1.03796673, + "epoch": 0.7146979607541362, + "flos": 570227931648.0, + "grad_norm": 0.05105012604374378, + "language_loss": 0.81882799, + "learning_rate": 0.00019869730649101615, + "loss": 0.82954621, + "num_input_tokens_seen": 307962736, + "router_z_loss_mlp": 0.33886719, + "step": 3715, + "time_per_iteration": 2.7468035221099854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074372, + "balance_loss_mlp": 1.03965807, + "epoch": 0.7148903424393998, + "flos": 839299359744.0, + "grad_norm": 0.0561955521045174, + "language_loss": 0.72303152, + "learning_rate": 0.00019844874198365943, + "loss": 0.73377526, + "num_input_tokens_seen": 308046592, + "router_z_loss_mlp": 0.34765625, + "step": 3716, + "time_per_iteration": 3.0928800106048584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072381, + "balance_loss_mlp": 1.03807223, + "epoch": 0.7150827241246633, + "flos": 541560439296.0, + "grad_norm": 0.05538627322116671, + "language_loss": 0.83775991, + "learning_rate": 0.00019820029454888362, + "loss": 0.84848368, + "num_input_tokens_seen": 308119920, + "router_z_loss_mlp": 0.34326172, + "step": 3717, + "time_per_iteration": 2.6984283924102783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101976, + "balance_loss_mlp": 1.00993717, + "epoch": 0.7152751058099269, + "flos": 1582803859968.0, + "grad_norm": 0.008798476496045995, + "language_loss": 0.74521267, + "learning_rate": 0.00019795196428314455, + "loss": 0.75541025, + "num_input_tokens_seen": 308361024, + "router_z_loss_mlp": 0.09814453, + "step": 3718, + "time_per_iteration": 5.056431531906128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072324, + "balance_loss_mlp": 1.03775322, + "epoch": 0.7154674874951905, + "flos": 517167659520.0, + "grad_norm": 0.0523553620911167, + "language_loss": 0.80075788, + "learning_rate": 0.0001977037512828529, + "loss": 0.81148112, + "num_input_tokens_seen": 308429808, + "router_z_loss_mlp": 0.34594727, + "step": 3719, + "time_per_iteration": 2.57888126373291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068149, + "balance_loss_mlp": 1.03391242, + "epoch": 0.715659869180454, + "flos": 602246320128.0, + "grad_norm": 0.048902324655222526, + "language_loss": 0.86289543, + "learning_rate": 0.0001974556556443734, + "loss": 0.873577, + "num_input_tokens_seen": 308501888, + "router_z_loss_mlp": 0.3425293, + "step": 3720, + "time_per_iteration": 2.6931040287017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065993, + "balance_loss_mlp": 1.03206623, + "epoch": 0.7158522508657176, + "flos": 531403186176.0, + "grad_norm": 0.0436888691485468, + "language_loss": 0.88365716, + "learning_rate": 0.00019720767746402547, + "loss": 0.89431709, + "num_input_tokens_seen": 308576368, + "router_z_loss_mlp": 0.33959961, + "step": 3721, + "time_per_iteration": 2.7067127227783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072793, + "balance_loss_mlp": 1.03867531, + "epoch": 0.7160446325509812, + "flos": 557301897216.0, + "grad_norm": 0.0582274730279212, + "language_loss": 0.80045772, + "learning_rate": 0.00019695981683808222, + "loss": 0.8111856, + "num_input_tokens_seen": 308651936, + "router_z_loss_mlp": 0.34155273, + "step": 3722, + "time_per_iteration": 2.708950996398926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067549, + "balance_loss_mlp": 1.03405118, + "epoch": 0.7162370142362448, + "flos": 690664140288.0, + "grad_norm": 0.04509643904161843, + "language_loss": 0.84632957, + "learning_rate": 0.00019671207386277225, + "loss": 0.85700506, + "num_input_tokens_seen": 308737264, + "router_z_loss_mlp": 0.33520508, + "step": 3723, + "time_per_iteration": 2.9580013751983643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068909, + "balance_loss_mlp": 1.03462386, + "epoch": 0.7164293959215082, + "flos": 793772974080.0, + "grad_norm": 0.06707821988874196, + "language_loss": 0.77988201, + "learning_rate": 0.0001964644486342777, + "loss": 0.79057109, + "num_input_tokens_seen": 308811776, + "router_z_loss_mlp": 0.34326172, + "step": 3724, + "time_per_iteration": 2.937603712081909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067817, + "balance_loss_mlp": 1.03403354, + "epoch": 0.7166217776067718, + "flos": 493922838528.0, + "grad_norm": 0.05338190287132838, + "language_loss": 0.86470282, + "learning_rate": 0.00019621694124873524, + "loss": 0.87538099, + "num_input_tokens_seen": 308886704, + "router_z_loss_mlp": 0.33813477, + "step": 3725, + "time_per_iteration": 2.708923816680908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012685, + "balance_loss_mlp": 1.00305271, + "epoch": 0.7168141592920354, + "flos": 1400337524736.0, + "grad_norm": 0.004329548481597118, + "language_loss": 0.76540077, + "learning_rate": 0.00019596955180223557, + "loss": 0.7755276, + "num_input_tokens_seen": 309113456, + "router_z_loss_mlp": 0.09619141, + "step": 3726, + "time_per_iteration": 4.868973970413208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067858, + "balance_loss_mlp": 1.03323972, + "epoch": 0.717006540977299, + "flos": 792789368832.0, + "grad_norm": 0.04993242383663973, + "language_loss": 0.77399421, + "learning_rate": 0.00019572228039082428, + "loss": 0.78467286, + "num_input_tokens_seen": 309198768, + "router_z_loss_mlp": 0.34643555, + "step": 3727, + "time_per_iteration": 3.0444281101226807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063691, + "balance_loss_mlp": 1.02971661, + "epoch": 0.7171989226625626, + "flos": 554525761536.0, + "grad_norm": 0.045554501799563094, + "language_loss": 0.83411372, + "learning_rate": 0.0001954751271105002, + "loss": 0.84475064, + "num_input_tokens_seen": 309279680, + "router_z_loss_mlp": 0.34008789, + "step": 3728, + "time_per_iteration": 2.809967041015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065462, + "balance_loss_mlp": 1.03186858, + "epoch": 0.717391304347826, + "flos": 555628640256.0, + "grad_norm": 0.05755567657425633, + "language_loss": 0.80672932, + "learning_rate": 0.00019522809205721687, + "loss": 0.81738389, + "num_input_tokens_seen": 309359152, + "router_z_loss_mlp": 0.33618164, + "step": 3729, + "time_per_iteration": 2.7862703800201416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067947, + "balance_loss_mlp": 1.03459263, + "epoch": 0.7175836860330896, + "flos": 538582072320.0, + "grad_norm": 0.05354925450450462, + "language_loss": 0.82769603, + "learning_rate": 0.0001949811753268816, + "loss": 0.83837551, + "num_input_tokens_seen": 309432800, + "router_z_loss_mlp": 0.33374023, + "step": 3730, + "time_per_iteration": 2.6676440238952637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106439, + "balance_loss_mlp": 1.03046322, + "epoch": 0.7177760677183532, + "flos": 515385303552.0, + "grad_norm": 0.057530592847955, + "language_loss": 0.82664466, + "learning_rate": 0.00019473437701535634, + "loss": 0.8372885, + "num_input_tokens_seen": 309499456, + "router_z_loss_mlp": 0.33959961, + "step": 3731, + "time_per_iteration": 2.5901401042938232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061132, + "balance_loss_mlp": 1.02772939, + "epoch": 0.7179684494036168, + "flos": 674414913024.0, + "grad_norm": 0.05555536497682914, + "language_loss": 0.89367867, + "learning_rate": 0.00019448769721845677, + "loss": 0.90428996, + "num_input_tokens_seen": 309571056, + "router_z_loss_mlp": 0.33422852, + "step": 3732, + "time_per_iteration": 2.784381628036499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106645, + "balance_loss_mlp": 1.03192735, + "epoch": 0.7181608310888803, + "flos": 469672653312.0, + "grad_norm": 0.05444278495505657, + "language_loss": 0.85605729, + "learning_rate": 0.00019424113603195203, + "loss": 0.86672175, + "num_input_tokens_seen": 309635040, + "router_z_loss_mlp": 0.34570312, + "step": 3733, + "time_per_iteration": 2.5088841915130615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106796, + "balance_loss_mlp": 1.03343654, + "epoch": 0.7183532127741439, + "flos": 593645870592.0, + "grad_norm": 0.06008894294367452, + "language_loss": 0.79899514, + "learning_rate": 0.0001939946935515657, + "loss": 0.80967468, + "num_input_tokens_seen": 309713696, + "router_z_loss_mlp": 0.34570312, + "step": 3734, + "time_per_iteration": 2.8258321285247803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064654, + "balance_loss_mlp": 1.03065538, + "epoch": 0.7185455944594075, + "flos": 498669004800.0, + "grad_norm": 0.05732279387699742, + "language_loss": 0.80418706, + "learning_rate": 0.0001937483698729755, + "loss": 0.81483358, + "num_input_tokens_seen": 309782864, + "router_z_loss_mlp": 0.34008789, + "step": 3735, + "time_per_iteration": 2.5968332290649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065941, + "balance_loss_mlp": 1.03182328, + "epoch": 0.718737976144671, + "flos": 814590867456.0, + "grad_norm": 0.053801017075388924, + "language_loss": 0.82329178, + "learning_rate": 0.0001935021650918128, + "loss": 0.83395112, + "num_input_tokens_seen": 309867056, + "router_z_loss_mlp": 0.34155273, + "step": 3736, + "time_per_iteration": 2.982541084289551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063158, + "balance_loss_mlp": 1.02894521, + "epoch": 0.7189303578299346, + "flos": 438100987392.0, + "grad_norm": 0.06976823938990344, + "language_loss": 0.86880851, + "learning_rate": 0.0001932560793036625, + "loss": 0.87944007, + "num_input_tokens_seen": 309929744, + "router_z_loss_mlp": 0.3425293, + "step": 3737, + "time_per_iteration": 2.5138731002807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064404, + "balance_loss_mlp": 1.0309298, + "epoch": 0.7191227395151981, + "flos": 549137995776.0, + "grad_norm": 0.0607946285508029, + "language_loss": 0.8638792, + "learning_rate": 0.00019301011260406382, + "loss": 0.87452322, + "num_input_tokens_seen": 309998128, + "router_z_loss_mlp": 0.33496094, + "step": 3738, + "time_per_iteration": 2.628265619277954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065412, + "balance_loss_mlp": 1.03224778, + "epoch": 0.7193151212004617, + "flos": 626653656576.0, + "grad_norm": 0.05146382358147088, + "language_loss": 0.79296547, + "learning_rate": 0.00019276426508850936, + "loss": 0.80361962, + "num_input_tokens_seen": 310065472, + "router_z_loss_mlp": 0.33178711, + "step": 3739, + "time_per_iteration": 2.7006874084472656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065434, + "balance_loss_mlp": 1.03179288, + "epoch": 0.7195075028857253, + "flos": 740719904256.0, + "grad_norm": 0.046550971907091544, + "language_loss": 0.80166346, + "learning_rate": 0.00019251853685244564, + "loss": 0.81231779, + "num_input_tokens_seen": 310152960, + "router_z_loss_mlp": 0.33666992, + "step": 3740, + "time_per_iteration": 3.0175721645355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066355, + "balance_loss_mlp": 1.0327853, + "epoch": 0.7196998845709889, + "flos": 802523220480.0, + "grad_norm": 0.05930173376482813, + "language_loss": 0.80639338, + "learning_rate": 0.00019227292799127283, + "loss": 0.81705689, + "num_input_tokens_seen": 310234080, + "router_z_loss_mlp": 0.3359375, + "step": 3741, + "time_per_iteration": 3.074167251586914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069786, + "balance_loss_mlp": 1.03640747, + "epoch": 0.7198922662562524, + "flos": 924786865152.0, + "grad_norm": 0.05002690922956246, + "language_loss": 0.79003727, + "learning_rate": 0.00019202743860034454, + "loss": 0.80073518, + "num_input_tokens_seen": 310330208, + "router_z_loss_mlp": 0.33398438, + "step": 3742, + "time_per_iteration": 3.205714702606201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067652, + "balance_loss_mlp": 1.03448844, + "epoch": 0.7200846479415159, + "flos": 579838127616.0, + "grad_norm": 0.05345251644076864, + "language_loss": 0.83706784, + "learning_rate": 0.00019178206877496873, + "loss": 0.84774435, + "num_input_tokens_seen": 310402960, + "router_z_loss_mlp": 0.33178711, + "step": 3743, + "time_per_iteration": 2.6547601222991943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106189, + "balance_loss_mlp": 1.02834439, + "epoch": 0.7202770296267795, + "flos": 557410996224.0, + "grad_norm": 0.043135096200134324, + "language_loss": 0.85002279, + "learning_rate": 0.0001915368186104059, + "loss": 0.86064172, + "num_input_tokens_seen": 310479776, + "router_z_loss_mlp": 0.33569336, + "step": 3744, + "time_per_iteration": 2.740265130996704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066689, + "balance_loss_mlp": 1.03385842, + "epoch": 0.7204694113120431, + "flos": 672248443392.0, + "grad_norm": 0.0510098873102972, + "language_loss": 0.81037152, + "learning_rate": 0.0001912916882018706, + "loss": 0.82103842, + "num_input_tokens_seen": 310555952, + "router_z_loss_mlp": 0.32836914, + "step": 3745, + "time_per_iteration": 2.8475067615509033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068353, + "balance_loss_mlp": 1.03511715, + "epoch": 0.7206617929973067, + "flos": 798845027328.0, + "grad_norm": 0.058473767349389985, + "language_loss": 0.78699112, + "learning_rate": 0.00019104667764453125, + "loss": 0.79767466, + "num_input_tokens_seen": 310634784, + "router_z_loss_mlp": 0.33251953, + "step": 3746, + "time_per_iteration": 3.016134738922119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064862, + "balance_loss_mlp": 1.031793, + "epoch": 0.7208541746825702, + "flos": 531638913024.0, + "grad_norm": 0.04570203365425481, + "language_loss": 0.80496103, + "learning_rate": 0.00019080178703350926, + "loss": 0.81560969, + "num_input_tokens_seen": 310703216, + "router_z_loss_mlp": 0.33081055, + "step": 3747, + "time_per_iteration": 2.6047801971435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060751, + "balance_loss_mlp": 1.02682364, + "epoch": 0.7210465563678338, + "flos": 534883530240.0, + "grad_norm": 0.04791251301755464, + "language_loss": 0.82855403, + "learning_rate": 0.00019055701646387952, + "loss": 0.83916157, + "num_input_tokens_seen": 310776816, + "router_z_loss_mlp": 0.33959961, + "step": 3748, + "time_per_iteration": 2.6366617679595947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015113, + "balance_loss_mlp": 1.00548053, + "epoch": 0.7212389380530974, + "flos": 1533076955136.0, + "grad_norm": 0.0050303066243172915, + "language_loss": 0.80472684, + "learning_rate": 0.00019031236603067042, + "loss": 0.81487799, + "num_input_tokens_seen": 310987056, + "router_z_loss_mlp": 0.09619141, + "step": 3749, + "time_per_iteration": 4.800697326660156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067212, + "balance_loss_mlp": 1.03361845, + "epoch": 0.7214313197383609, + "flos": 461277407232.0, + "grad_norm": 0.05889548383130951, + "language_loss": 0.86542219, + "learning_rate": 0.00019006783582886368, + "loss": 0.87609434, + "num_input_tokens_seen": 311051648, + "router_z_loss_mlp": 0.33618164, + "step": 3750, + "time_per_iteration": 2.52746844291687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066015, + "balance_loss_mlp": 1.0318023, + "epoch": 0.7216237014236244, + "flos": 1036691025408.0, + "grad_norm": 0.046476584677382714, + "language_loss": 0.82800925, + "learning_rate": 0.00018982342595339437, + "loss": 0.83866942, + "num_input_tokens_seen": 311146272, + "router_z_loss_mlp": 0.3425293, + "step": 3751, + "time_per_iteration": 3.5170929431915283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067071, + "balance_loss_mlp": 1.03416932, + "epoch": 0.721816083108888, + "flos": 895578107904.0, + "grad_norm": 0.05167132755024372, + "language_loss": 0.81707644, + "learning_rate": 0.00018957913649915076, + "loss": 0.82774711, + "num_input_tokens_seen": 311223760, + "router_z_loss_mlp": 0.32910156, + "step": 3752, + "time_per_iteration": 3.1112849712371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010644, + "balance_loss_mlp": 1.03178465, + "epoch": 0.7220084647941516, + "flos": 523066166784.0, + "grad_norm": 0.05533376577602326, + "language_loss": 0.79672492, + "learning_rate": 0.00018933496756097428, + "loss": 0.80736887, + "num_input_tokens_seen": 311290336, + "router_z_loss_mlp": 0.32617188, + "step": 3753, + "time_per_iteration": 2.5987796783447266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064595, + "balance_loss_mlp": 1.03102577, + "epoch": 0.7222008464794152, + "flos": 815757765120.0, + "grad_norm": 0.05288107423325553, + "language_loss": 0.81242466, + "learning_rate": 0.0001890909192336603, + "loss": 0.82307053, + "num_input_tokens_seen": 311366240, + "router_z_loss_mlp": 0.3359375, + "step": 3754, + "time_per_iteration": 3.0019736289978027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065173, + "balance_loss_mlp": 1.03172278, + "epoch": 0.7223932281646788, + "flos": 748725244416.0, + "grad_norm": 0.049565047551570436, + "language_loss": 0.70085669, + "learning_rate": 0.00018884699161195623, + "loss": 0.71150839, + "num_input_tokens_seen": 311445184, + "router_z_loss_mlp": 0.3347168, + "step": 3755, + "time_per_iteration": 2.921433448791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106462, + "balance_loss_mlp": 1.03047848, + "epoch": 0.7225856098499422, + "flos": 745132829184.0, + "grad_norm": 0.05110029255023059, + "language_loss": 0.77537811, + "learning_rate": 0.00018860318479056327, + "loss": 0.78602433, + "num_input_tokens_seen": 311527280, + "router_z_loss_mlp": 0.34179688, + "step": 3756, + "time_per_iteration": 4.5331456661224365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064579, + "balance_loss_mlp": 1.03155816, + "epoch": 0.7227779915352058, + "flos": 547055894016.0, + "grad_norm": 0.047457603213344, + "language_loss": 0.835307, + "learning_rate": 0.00018835949886413555, + "loss": 0.84595281, + "num_input_tokens_seen": 311601552, + "router_z_loss_mlp": 0.33032227, + "step": 3757, + "time_per_iteration": 2.721592903137207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106988, + "balance_loss_mlp": 1.0362395, + "epoch": 0.7229703732204694, + "flos": 530230496256.0, + "grad_norm": 0.05570980366468543, + "language_loss": 0.78520513, + "learning_rate": 0.0001881159339272806, + "loss": 0.79590392, + "num_input_tokens_seen": 311670736, + "router_z_loss_mlp": 0.33666992, + "step": 3758, + "time_per_iteration": 2.6724090576171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066008, + "balance_loss_mlp": 1.03289187, + "epoch": 0.723162754905733, + "flos": 528103314432.0, + "grad_norm": 0.05510744793319723, + "language_loss": 0.7836262, + "learning_rate": 0.00018787249007455858, + "loss": 0.79428625, + "num_input_tokens_seen": 311736800, + "router_z_loss_mlp": 0.33129883, + "step": 3759, + "time_per_iteration": 2.608786106109619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065133, + "balance_loss_mlp": 1.03292298, + "epoch": 0.7233551365909965, + "flos": 654571860480.0, + "grad_norm": 0.051481631649939415, + "language_loss": 0.71461964, + "learning_rate": 0.00018762916740048302, + "loss": 0.72527099, + "num_input_tokens_seen": 311806064, + "router_z_loss_mlp": 0.32202148, + "step": 3760, + "time_per_iteration": 2.768165111541748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064991, + "balance_loss_mlp": 1.03201807, + "epoch": 0.7235475182762601, + "flos": 522097118208.0, + "grad_norm": 0.045655130957968595, + "language_loss": 0.85612011, + "learning_rate": 0.0001873859659995195, + "loss": 0.86677003, + "num_input_tokens_seen": 311881280, + "router_z_loss_mlp": 0.32983398, + "step": 3761, + "time_per_iteration": 2.749396800994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106691, + "balance_loss_mlp": 1.03357887, + "epoch": 0.7237398999615237, + "flos": 608883941376.0, + "grad_norm": 0.05437044634391734, + "language_loss": 0.83492088, + "learning_rate": 0.0001871428859660878, + "loss": 0.84559, + "num_input_tokens_seen": 311953696, + "router_z_loss_mlp": 0.33349609, + "step": 3762, + "time_per_iteration": 2.767180919647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107084, + "balance_loss_mlp": 1.03820074, + "epoch": 0.7239322816467872, + "flos": 658664690688.0, + "grad_norm": 0.04804139363705488, + "language_loss": 0.82056308, + "learning_rate": 0.00018689992739455975, + "loss": 0.83127153, + "num_input_tokens_seen": 312032752, + "router_z_loss_mlp": 0.32641602, + "step": 3763, + "time_per_iteration": 2.8873496055603027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071938, + "balance_loss_mlp": 1.03803444, + "epoch": 0.7241246633320508, + "flos": 968869928448.0, + "grad_norm": 0.04487268066979416, + "language_loss": 0.85964411, + "learning_rate": 0.00018665709037926027, + "loss": 0.87036347, + "num_input_tokens_seen": 312120800, + "router_z_loss_mlp": 0.33935547, + "step": 3764, + "time_per_iteration": 3.2812607288360596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067601, + "balance_loss_mlp": 1.03429401, + "epoch": 0.7243170450173143, + "flos": 514745114112.0, + "grad_norm": 0.06636395802329886, + "language_loss": 0.84182644, + "learning_rate": 0.00018641437501446694, + "loss": 0.85250252, + "num_input_tokens_seen": 312188416, + "router_z_loss_mlp": 0.33325195, + "step": 3765, + "time_per_iteration": 2.573697328567505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069298, + "balance_loss_mlp": 1.03668237, + "epoch": 0.7245094267025779, + "flos": 559482923520.0, + "grad_norm": 0.05849002982454381, + "language_loss": 0.82240844, + "learning_rate": 0.0001861717813944104, + "loss": 0.83310151, + "num_input_tokens_seen": 312257792, + "router_z_loss_mlp": 0.32617188, + "step": 3766, + "time_per_iteration": 2.630692481994629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070145, + "balance_loss_mlp": 1.03674293, + "epoch": 0.7247018083878415, + "flos": 612359903232.0, + "grad_norm": 0.059142078563837144, + "language_loss": 0.7934258, + "learning_rate": 0.00018592930961327365, + "loss": 0.80412722, + "num_input_tokens_seen": 312328544, + "router_z_loss_mlp": 0.33422852, + "step": 3767, + "time_per_iteration": 2.714850902557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069775, + "balance_loss_mlp": 1.03694439, + "epoch": 0.7248941900731051, + "flos": 634379599872.0, + "grad_norm": 0.04667094016302488, + "language_loss": 0.8795737, + "learning_rate": 0.00018568695976519273, + "loss": 0.89027148, + "num_input_tokens_seen": 312405888, + "router_z_loss_mlp": 0.32836914, + "step": 3768, + "time_per_iteration": 2.78951358795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067478, + "balance_loss_mlp": 1.03433776, + "epoch": 0.7250865717583687, + "flos": 424718055936.0, + "grad_norm": 0.05715863238838566, + "language_loss": 0.80076563, + "learning_rate": 0.00018544473194425593, + "loss": 0.81144047, + "num_input_tokens_seen": 312469552, + "router_z_loss_mlp": 0.33154297, + "step": 3769, + "time_per_iteration": 2.5101308822631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068926, + "balance_loss_mlp": 1.03542805, + "epoch": 0.7252789534436321, + "flos": 634794236928.0, + "grad_norm": 0.05221621035796038, + "language_loss": 0.78552115, + "learning_rate": 0.00018520262624450485, + "loss": 0.79621041, + "num_input_tokens_seen": 312548848, + "router_z_loss_mlp": 0.33520508, + "step": 3770, + "time_per_iteration": 2.851344347000122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065232, + "balance_loss_mlp": 1.03247309, + "epoch": 0.7254713351288957, + "flos": 616895073792.0, + "grad_norm": 0.05281322327607285, + "language_loss": 0.86844021, + "learning_rate": 0.00018496064275993324, + "loss": 0.87909257, + "num_input_tokens_seen": 312622016, + "router_z_loss_mlp": 0.32763672, + "step": 3771, + "time_per_iteration": 2.740528106689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065588, + "balance_loss_mlp": 1.03132713, + "epoch": 0.7256637168141593, + "flos": 766662285312.0, + "grad_norm": 0.053619752531576234, + "language_loss": 0.81698912, + "learning_rate": 0.00018471878158448686, + "loss": 0.82764494, + "num_input_tokens_seen": 312696960, + "router_z_loss_mlp": 0.34301758, + "step": 3772, + "time_per_iteration": 2.940927028656006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068836, + "balance_loss_mlp": 1.03538561, + "epoch": 0.7258560984994229, + "flos": 495268646400.0, + "grad_norm": 0.044202669157845896, + "language_loss": 0.8410005, + "learning_rate": 0.00018447704281206512, + "loss": 0.85168886, + "num_input_tokens_seen": 312774352, + "router_z_loss_mlp": 0.3347168, + "step": 3773, + "time_per_iteration": 2.9211905002593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010641, + "balance_loss_mlp": 1.03050709, + "epoch": 0.7260484801846864, + "flos": 529802712576.0, + "grad_norm": 0.0599389288946333, + "language_loss": 0.82910264, + "learning_rate": 0.0001842354265365191, + "loss": 0.83974361, + "num_input_tokens_seen": 312849600, + "router_z_loss_mlp": 0.33618164, + "step": 3774, + "time_per_iteration": 2.672297477722168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067714, + "balance_loss_mlp": 1.03478813, + "epoch": 0.72624086186995, + "flos": 624679243776.0, + "grad_norm": 0.055766679807351886, + "language_loss": 0.80738944, + "learning_rate": 0.0001839939328516526, + "loss": 0.81806654, + "num_input_tokens_seen": 312922688, + "router_z_loss_mlp": 0.32910156, + "step": 3775, + "time_per_iteration": 2.715765953063965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067661, + "balance_loss_mlp": 1.03475976, + "epoch": 0.7264332435552135, + "flos": 716203468800.0, + "grad_norm": 0.054689232806286694, + "language_loss": 0.80927253, + "learning_rate": 0.0001837525618512218, + "loss": 0.81994909, + "num_input_tokens_seen": 312997728, + "router_z_loss_mlp": 0.32910156, + "step": 3776, + "time_per_iteration": 2.9182283878326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067478, + "balance_loss_mlp": 1.03467178, + "epoch": 0.7266256252404771, + "flos": 680736821760.0, + "grad_norm": 0.056616455322331526, + "language_loss": 0.83123744, + "learning_rate": 0.00018351131362893519, + "loss": 0.84191227, + "num_input_tokens_seen": 313067168, + "router_z_loss_mlp": 0.328125, + "step": 3777, + "time_per_iteration": 2.8280246257781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065965, + "balance_loss_mlp": 1.03227687, + "epoch": 0.7268180069257407, + "flos": 518654651904.0, + "grad_norm": 0.0757528299469481, + "language_loss": 0.80649394, + "learning_rate": 0.00018327018827845364, + "loss": 0.81715357, + "num_input_tokens_seen": 313134688, + "router_z_loss_mlp": 0.3371582, + "step": 3778, + "time_per_iteration": 2.6342718601226807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065833, + "balance_loss_mlp": 1.03221643, + "epoch": 0.7270103886110042, + "flos": 512411318784.0, + "grad_norm": 0.05462394949163198, + "language_loss": 0.87201697, + "learning_rate": 0.00018302918589339036, + "loss": 0.88267529, + "num_input_tokens_seen": 313204816, + "router_z_loss_mlp": 0.33642578, + "step": 3779, + "time_per_iteration": 2.6401546001434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065353, + "balance_loss_mlp": 1.03166389, + "epoch": 0.7272027702962678, + "flos": 546395355648.0, + "grad_norm": 0.050485328839168696, + "language_loss": 0.90140432, + "learning_rate": 0.00018278830656731054, + "loss": 0.91205782, + "num_input_tokens_seen": 313274288, + "router_z_loss_mlp": 0.3371582, + "step": 3780, + "time_per_iteration": 2.6837782859802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060977, + "balance_loss_mlp": 1.02883863, + "epoch": 0.7273951519815314, + "flos": 592772926464.0, + "grad_norm": 0.04496338740790305, + "language_loss": 0.86495197, + "learning_rate": 0.00018254755039373222, + "loss": 0.87556171, + "num_input_tokens_seen": 313344800, + "router_z_loss_mlp": 0.32128906, + "step": 3781, + "time_per_iteration": 2.7322683334350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063791, + "balance_loss_mlp": 1.03084135, + "epoch": 0.727587533666795, + "flos": 605732456448.0, + "grad_norm": 0.056903164121683655, + "language_loss": 0.83278424, + "learning_rate": 0.0001823069174661252, + "loss": 0.84342206, + "num_input_tokens_seen": 313417840, + "router_z_loss_mlp": 0.32958984, + "step": 3782, + "time_per_iteration": 2.75710129737854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067494, + "balance_loss_mlp": 1.03380585, + "epoch": 0.7277799153520584, + "flos": 512770701312.0, + "grad_norm": 0.05370507093110541, + "language_loss": 0.78568602, + "learning_rate": 0.00018206640787791112, + "loss": 0.79636097, + "num_input_tokens_seen": 313485936, + "router_z_loss_mlp": 0.3371582, + "step": 3783, + "time_per_iteration": 2.61852765083313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062618, + "balance_loss_mlp": 1.02923894, + "epoch": 0.727972297037322, + "flos": 537498132480.0, + "grad_norm": 0.05379721469366117, + "language_loss": 0.85843956, + "learning_rate": 0.00018182602172246416, + "loss": 0.8690657, + "num_input_tokens_seen": 313553136, + "router_z_loss_mlp": 0.33398438, + "step": 3784, + "time_per_iteration": 2.593327522277832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061524, + "balance_loss_mlp": 1.02819335, + "epoch": 0.7281646787225856, + "flos": 534780223488.0, + "grad_norm": 0.06658957148496236, + "language_loss": 0.76393896, + "learning_rate": 0.00018158575909311075, + "loss": 0.77455419, + "num_input_tokens_seen": 313620128, + "router_z_loss_mlp": 0.33349609, + "step": 3785, + "time_per_iteration": 2.600620985031128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106257, + "balance_loss_mlp": 1.02935863, + "epoch": 0.7283570604078492, + "flos": 624767993856.0, + "grad_norm": 0.053054924881327924, + "language_loss": 0.79626518, + "learning_rate": 0.000181345620083129, + "loss": 0.80689085, + "num_input_tokens_seen": 313696432, + "router_z_loss_mlp": 0.33227539, + "step": 3786, + "time_per_iteration": 2.746778726577759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065243, + "balance_loss_mlp": 1.03255534, + "epoch": 0.7285494420931128, + "flos": 533904307200.0, + "grad_norm": 0.097300641099862, + "language_loss": 0.86717927, + "learning_rate": 0.00018110560478574927, + "loss": 0.8778317, + "num_input_tokens_seen": 313768416, + "router_z_loss_mlp": 0.3269043, + "step": 3787, + "time_per_iteration": 2.6793131828308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065255, + "balance_loss_mlp": 1.03147149, + "epoch": 0.7287418237783763, + "flos": 666251011584.0, + "grad_norm": 0.05707772132850956, + "language_loss": 0.80307966, + "learning_rate": 0.0001808657132941533, + "loss": 0.81373221, + "num_input_tokens_seen": 313839888, + "router_z_loss_mlp": 0.33813477, + "step": 3788, + "time_per_iteration": 2.7490005493164062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065453, + "balance_loss_mlp": 1.03147793, + "epoch": 0.7289342054636399, + "flos": 550344181248.0, + "grad_norm": 0.05691575768916977, + "language_loss": 0.82927215, + "learning_rate": 0.00018062594570147572, + "loss": 0.83992666, + "num_input_tokens_seen": 313908832, + "router_z_loss_mlp": 0.33984375, + "step": 3789, + "time_per_iteration": 2.584277391433716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063831, + "balance_loss_mlp": 1.03109622, + "epoch": 0.7291265871489034, + "flos": 687620344320.0, + "grad_norm": 0.05865206546440876, + "language_loss": 0.85141826, + "learning_rate": 0.00018038630210080243, + "loss": 0.86205661, + "num_input_tokens_seen": 313982672, + "router_z_loss_mlp": 0.32739258, + "step": 3790, + "time_per_iteration": 2.7913711071014404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010669, + "balance_loss_mlp": 1.03421283, + "epoch": 0.729318968834167, + "flos": 572388609024.0, + "grad_norm": 0.08871994753922169, + "language_loss": 0.8494693, + "learning_rate": 0.0001801467825851712, + "loss": 0.8601383, + "num_input_tokens_seen": 314057184, + "router_z_loss_mlp": 0.3269043, + "step": 3791, + "time_per_iteration": 2.7232275009155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064513, + "balance_loss_mlp": 1.03013325, + "epoch": 0.7295113505194305, + "flos": 585786097152.0, + "grad_norm": 0.05597763782774928, + "language_loss": 0.78437781, + "learning_rate": 0.00017990738724757172, + "loss": 0.79502296, + "num_input_tokens_seen": 314137344, + "router_z_loss_mlp": 0.34423828, + "step": 3792, + "time_per_iteration": 2.8646349906921387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070211, + "balance_loss_mlp": 1.03664136, + "epoch": 0.7297037322046941, + "flos": 706872669696.0, + "grad_norm": 0.0454122102846594, + "language_loss": 0.82281637, + "learning_rate": 0.00017966811618094598, + "loss": 0.83351851, + "num_input_tokens_seen": 314214464, + "router_z_loss_mlp": 0.3359375, + "step": 3793, + "time_per_iteration": 2.9363014698028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065082, + "balance_loss_mlp": 1.03148866, + "epoch": 0.7298961138899577, + "flos": 487039315968.0, + "grad_norm": 0.060918230322826325, + "language_loss": 0.84644252, + "learning_rate": 0.00017942896947818664, + "loss": 0.85709333, + "num_input_tokens_seen": 314280432, + "router_z_loss_mlp": 0.33618164, + "step": 3794, + "time_per_iteration": 2.634622097015381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014366, + "balance_loss_mlp": 1.00473428, + "epoch": 0.7300884955752213, + "flos": 1365102222336.0, + "grad_norm": 0.006306847562880891, + "language_loss": 0.74825054, + "learning_rate": 0.000179189947232139, + "loss": 0.75839418, + "num_input_tokens_seen": 314497152, + "router_z_loss_mlp": 0.09619141, + "step": 3795, + "time_per_iteration": 4.8498523235321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067916, + "balance_loss_mlp": 1.03434658, + "epoch": 0.7302808772604849, + "flos": 531550162944.0, + "grad_norm": 0.07784703337464734, + "language_loss": 0.85064995, + "learning_rate": 0.00017895104953559947, + "loss": 0.86132914, + "num_input_tokens_seen": 314565488, + "router_z_loss_mlp": 0.33569336, + "step": 3796, + "time_per_iteration": 2.578749418258667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069311, + "balance_loss_mlp": 1.03533602, + "epoch": 0.7304732589457483, + "flos": 435949074432.0, + "grad_norm": 0.06903187092561903, + "language_loss": 0.8945868, + "learning_rate": 0.00017871227648131672, + "loss": 0.90527987, + "num_input_tokens_seen": 314627392, + "router_z_loss_mlp": 0.34008789, + "step": 3797, + "time_per_iteration": 2.498368740081787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064089, + "balance_loss_mlp": 1.03071082, + "epoch": 0.7306656406310119, + "flos": 451376229888.0, + "grad_norm": 0.049186518116542115, + "language_loss": 0.82359099, + "learning_rate": 0.0001784736281619907, + "loss": 0.83423185, + "num_input_tokens_seen": 314695440, + "router_z_loss_mlp": 0.33398438, + "step": 3798, + "time_per_iteration": 2.5968668460845947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063054, + "balance_loss_mlp": 1.02898395, + "epoch": 0.7308580223162755, + "flos": 511756572672.0, + "grad_norm": 0.049616480799322744, + "language_loss": 0.74341989, + "learning_rate": 0.00017823510467027232, + "loss": 0.75405043, + "num_input_tokens_seen": 314772592, + "router_z_loss_mlp": 0.34106445, + "step": 3799, + "time_per_iteration": 2.733454465866089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063516, + "balance_loss_mlp": 1.02930331, + "epoch": 0.7310504040015391, + "flos": 375209349120.0, + "grad_norm": 0.0582146456406939, + "language_loss": 0.78020084, + "learning_rate": 0.00017799670609876516, + "loss": 0.79083604, + "num_input_tokens_seen": 314836192, + "router_z_loss_mlp": 0.3425293, + "step": 3800, + "time_per_iteration": 4.01823616027832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065254, + "balance_loss_mlp": 1.03135109, + "epoch": 0.7312427856868026, + "flos": 549073976832.0, + "grad_norm": 0.04960878758692363, + "language_loss": 0.8857708, + "learning_rate": 0.00017775843254002366, + "loss": 0.89642334, + "num_input_tokens_seen": 314908400, + "router_z_loss_mlp": 0.33935547, + "step": 3801, + "time_per_iteration": 2.6998913288116455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063333, + "balance_loss_mlp": 1.03014541, + "epoch": 0.7314351673720662, + "flos": 766880483328.0, + "grad_norm": 0.0540974976561695, + "language_loss": 0.84199798, + "learning_rate": 0.00017752028408655367, + "loss": 0.85263133, + "num_input_tokens_seen": 314995280, + "router_z_loss_mlp": 0.33203125, + "step": 3802, + "time_per_iteration": 3.058145523071289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064093, + "balance_loss_mlp": 1.03102422, + "epoch": 0.7316275490573297, + "flos": 486492258816.0, + "grad_norm": 0.051110561372661595, + "language_loss": 0.85141397, + "learning_rate": 0.00017728226083081272, + "loss": 0.86205482, + "num_input_tokens_seen": 315063056, + "router_z_loss_mlp": 0.33081055, + "step": 3803, + "time_per_iteration": 2.5310099124908447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065165, + "balance_loss_mlp": 1.03166723, + "epoch": 0.7318199307425933, + "flos": 473183520768.0, + "grad_norm": 0.05616081836254539, + "language_loss": 0.81485891, + "learning_rate": 0.00017704436286520965, + "loss": 0.8255105, + "num_input_tokens_seen": 315128896, + "router_z_loss_mlp": 0.33520508, + "step": 3804, + "time_per_iteration": 2.568283796310425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063426, + "balance_loss_mlp": 1.02952337, + "epoch": 0.7320123124278569, + "flos": 549202014720.0, + "grad_norm": 0.05320670127317765, + "language_loss": 0.84491169, + "learning_rate": 0.0001768065902821046, + "loss": 0.855546, + "num_input_tokens_seen": 315198464, + "router_z_loss_mlp": 0.33935547, + "step": 3805, + "time_per_iteration": 2.605682134628296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061947, + "balance_loss_mlp": 1.02751899, + "epoch": 0.7322046941131204, + "flos": 570502946304.0, + "grad_norm": 0.06611321477092025, + "language_loss": 0.8209759, + "learning_rate": 0.00017656894317380907, + "loss": 0.83159536, + "num_input_tokens_seen": 315270240, + "router_z_loss_mlp": 0.34472656, + "step": 3806, + "time_per_iteration": 2.7116403579711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010208, + "balance_loss_mlp": 1.00062358, + "epoch": 0.732397075798384, + "flos": 1468334559744.0, + "grad_norm": 0.00621008772312024, + "language_loss": 0.76031268, + "learning_rate": 0.00017633142163258565, + "loss": 0.77041477, + "num_input_tokens_seen": 315502448, + "router_z_loss_mlp": 0.09570312, + "step": 3807, + "time_per_iteration": 4.968751668930054 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061209, + "balance_loss_mlp": 1.0275209, + "epoch": 0.7325894574836476, + "flos": 464620948992.0, + "grad_norm": 0.05827651043720701, + "language_loss": 0.83991838, + "learning_rate": 0.00017609402575064875, + "loss": 0.85053051, + "num_input_tokens_seen": 315569472, + "router_z_loss_mlp": 0.3371582, + "step": 3808, + "time_per_iteration": 2.5385282039642334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063699, + "balance_loss_mlp": 1.03003407, + "epoch": 0.7327818391689112, + "flos": 495246887424.0, + "grad_norm": 0.05735407240941104, + "language_loss": 0.80858552, + "learning_rate": 0.00017585675562016367, + "loss": 0.81922251, + "num_input_tokens_seen": 315637632, + "router_z_loss_mlp": 0.33691406, + "step": 3809, + "time_per_iteration": 2.555299997329712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007774, + "balance_loss_mlp": 0.99823719, + "epoch": 0.7329742208541746, + "flos": 1432694794752.0, + "grad_norm": 0.0030976704675862504, + "language_loss": 0.77212846, + "learning_rate": 0.0001756196113332465, + "loss": 0.78220618, + "num_input_tokens_seen": 315863648, + "router_z_loss_mlp": 0.09521484, + "step": 3810, + "time_per_iteration": 4.790294647216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062704, + "balance_loss_mlp": 1.02894437, + "epoch": 0.7331666025394382, + "flos": 496645129728.0, + "grad_norm": 0.057652785058487796, + "language_loss": 0.84699941, + "learning_rate": 0.00017538259298196474, + "loss": 0.85762644, + "num_input_tokens_seen": 315930752, + "router_z_loss_mlp": 0.33789062, + "step": 3811, + "time_per_iteration": 2.5608150959014893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066548, + "balance_loss_mlp": 1.03271604, + "epoch": 0.7333589842247018, + "flos": 538247420928.0, + "grad_norm": 0.07102765773461414, + "language_loss": 0.81726062, + "learning_rate": 0.00017514570065833745, + "loss": 0.82792604, + "num_input_tokens_seen": 316006400, + "router_z_loss_mlp": 0.33862305, + "step": 3812, + "time_per_iteration": 2.733987808227539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063175, + "balance_loss_mlp": 1.03024936, + "epoch": 0.7335513659099654, + "flos": 490825198080.0, + "grad_norm": 0.0783727795203613, + "language_loss": 0.80580723, + "learning_rate": 0.00017490893445433426, + "loss": 0.81643891, + "num_input_tokens_seen": 316075824, + "router_z_loss_mlp": 0.3293457, + "step": 3813, + "time_per_iteration": 2.5801103115081787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062414, + "balance_loss_mlp": 1.02953637, + "epoch": 0.733743747595229, + "flos": 561876355584.0, + "grad_norm": 0.048847975772381425, + "language_loss": 0.81069362, + "learning_rate": 0.00017467229446187587, + "loss": 0.82131779, + "num_input_tokens_seen": 316148336, + "router_z_loss_mlp": 0.32885742, + "step": 3814, + "time_per_iteration": 2.683293104171753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060801, + "balance_loss_mlp": 1.02684999, + "epoch": 0.7339361292804925, + "flos": 538315822080.0, + "grad_norm": 0.047730041635456175, + "language_loss": 0.81664294, + "learning_rate": 0.00017443578077283424, + "loss": 0.82725096, + "num_input_tokens_seen": 316220960, + "router_z_loss_mlp": 0.33984375, + "step": 3815, + "time_per_iteration": 2.641364574432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064457, + "balance_loss_mlp": 1.03043437, + "epoch": 0.734128510965756, + "flos": 548198060544.0, + "grad_norm": 0.05243488536705766, + "language_loss": 0.85093778, + "learning_rate": 0.0001741993934790319, + "loss": 0.86158234, + "num_input_tokens_seen": 316295824, + "router_z_loss_mlp": 0.34057617, + "step": 3816, + "time_per_iteration": 2.7296290397644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060693, + "balance_loss_mlp": 1.027004, + "epoch": 0.7343208926510196, + "flos": 539783875584.0, + "grad_norm": 0.059294435662015, + "language_loss": 0.84253871, + "learning_rate": 0.00017396313267224273, + "loss": 0.85314572, + "num_input_tokens_seen": 316368064, + "router_z_loss_mlp": 0.3371582, + "step": 3817, + "time_per_iteration": 2.702885866165161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064529, + "balance_loss_mlp": 1.03141296, + "epoch": 0.7345132743362832, + "flos": 570827423232.0, + "grad_norm": 0.058276166249488254, + "language_loss": 0.88087535, + "learning_rate": 0.0001737269984441912, + "loss": 0.89152062, + "num_input_tokens_seen": 316437440, + "router_z_loss_mlp": 0.33129883, + "step": 3818, + "time_per_iteration": 2.6317105293273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064679, + "balance_loss_mlp": 1.03089499, + "epoch": 0.7347056560215467, + "flos": 545135325696.0, + "grad_norm": 0.04588849649553848, + "language_loss": 0.84933245, + "learning_rate": 0.00017349099088655263, + "loss": 0.85997921, + "num_input_tokens_seen": 316511936, + "router_z_loss_mlp": 0.33813477, + "step": 3819, + "time_per_iteration": 2.6894302368164062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063138, + "balance_loss_mlp": 1.03023624, + "epoch": 0.7348980377068103, + "flos": 595668335616.0, + "grad_norm": 0.04507487661925427, + "language_loss": 0.80804777, + "learning_rate": 0.00017325511009095375, + "loss": 0.81867915, + "num_input_tokens_seen": 316584304, + "router_z_loss_mlp": 0.32910156, + "step": 3820, + "time_per_iteration": 2.7293684482574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106202, + "balance_loss_mlp": 1.02833104, + "epoch": 0.7350904193920739, + "flos": 538291090944.0, + "grad_norm": 0.05281271554601035, + "language_loss": 0.83436865, + "learning_rate": 0.00017301935614897113, + "loss": 0.84498882, + "num_input_tokens_seen": 316659024, + "router_z_loss_mlp": 0.3371582, + "step": 3821, + "time_per_iteration": 2.727043390274048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065798, + "balance_loss_mlp": 1.03199053, + "epoch": 0.7352828010773375, + "flos": 512712474624.0, + "grad_norm": 0.049847760142976955, + "language_loss": 0.81776285, + "learning_rate": 0.00017278372915213274, + "loss": 0.82842088, + "num_input_tokens_seen": 316732544, + "router_z_loss_mlp": 0.33837891, + "step": 3822, + "time_per_iteration": 2.650468587875366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016098, + "balance_loss_mlp": 1.00732386, + "epoch": 0.735475182762601, + "flos": 1552965087744.0, + "grad_norm": 0.006919711828678118, + "language_loss": 0.79893845, + "learning_rate": 0.00017254822919191693, + "loss": 0.80909944, + "num_input_tokens_seen": 316967104, + "router_z_loss_mlp": 0.08789062, + "step": 3823, + "time_per_iteration": 4.953552007675171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064811, + "balance_loss_mlp": 1.03152812, + "epoch": 0.7356675644478645, + "flos": 680984133120.0, + "grad_norm": 0.05477130008948058, + "language_loss": 0.80415845, + "learning_rate": 0.00017231285635975314, + "loss": 0.81480658, + "num_input_tokens_seen": 317048304, + "router_z_loss_mlp": 0.33300781, + "step": 3824, + "time_per_iteration": 2.889289140701294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067221, + "balance_loss_mlp": 1.03334153, + "epoch": 0.7358599461331281, + "flos": 514961902080.0, + "grad_norm": 0.05024116025531215, + "language_loss": 0.83180618, + "learning_rate": 0.00017207761074702115, + "loss": 0.84247839, + "num_input_tokens_seen": 317115968, + "router_z_loss_mlp": 0.33911133, + "step": 3825, + "time_per_iteration": 2.5944931507110596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068957, + "balance_loss_mlp": 1.03491116, + "epoch": 0.7360523278183917, + "flos": 443739036672.0, + "grad_norm": 0.05416022756752086, + "language_loss": 0.83636504, + "learning_rate": 0.0001718424924450514, + "loss": 0.8470546, + "num_input_tokens_seen": 317185680, + "router_z_loss_mlp": 0.34082031, + "step": 3826, + "time_per_iteration": 2.6031198501586914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067291, + "balance_loss_mlp": 1.03441358, + "epoch": 0.7362447095036553, + "flos": 603142585344.0, + "grad_norm": 0.04455430936789472, + "language_loss": 0.85882723, + "learning_rate": 0.00017160750154512482, + "loss": 0.86950016, + "num_input_tokens_seen": 317258800, + "router_z_loss_mlp": 0.32885742, + "step": 3827, + "time_per_iteration": 2.702148914337158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067373, + "balance_loss_mlp": 1.03470922, + "epoch": 0.7364370911889189, + "flos": 552807424512.0, + "grad_norm": 0.06654318382518472, + "language_loss": 0.83394545, + "learning_rate": 0.0001713726381384731, + "loss": 0.84461915, + "num_input_tokens_seen": 317334608, + "router_z_loss_mlp": 0.32666016, + "step": 3828, + "time_per_iteration": 2.7451815605163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069365, + "balance_loss_mlp": 1.03622484, + "epoch": 0.7366294728741823, + "flos": 448830028800.0, + "grad_norm": 0.05260282371151395, + "language_loss": 0.81186259, + "learning_rate": 0.00017113790231627812, + "loss": 0.82255614, + "num_input_tokens_seen": 317397504, + "router_z_loss_mlp": 0.33154297, + "step": 3829, + "time_per_iteration": 2.537193775177002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017376, + "balance_loss_mlp": 1.00879276, + "epoch": 0.7368218545594459, + "flos": 1534705132032.0, + "grad_norm": 0.0074062815552694275, + "language_loss": 0.79258227, + "learning_rate": 0.0001709032941696726, + "loss": 0.80275595, + "num_input_tokens_seen": 317611472, + "router_z_loss_mlp": 0.0859375, + "step": 3830, + "time_per_iteration": 4.833421945571899 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069316, + "balance_loss_mlp": 1.03584218, + "epoch": 0.7370142362447095, + "flos": 515164133376.0, + "grad_norm": 0.05241835365741791, + "language_loss": 0.81748456, + "learning_rate": 0.00017066881378973936, + "loss": 0.82817769, + "num_input_tokens_seen": 317681328, + "router_z_loss_mlp": 0.33496094, + "step": 3831, + "time_per_iteration": 2.619849443435669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071106, + "balance_loss_mlp": 1.03808546, + "epoch": 0.7372066179299731, + "flos": 500531346432.0, + "grad_norm": 0.056102661804596575, + "language_loss": 0.82564443, + "learning_rate": 0.00017043446126751189, + "loss": 0.83635545, + "num_input_tokens_seen": 317752336, + "router_z_loss_mlp": 0.33032227, + "step": 3832, + "time_per_iteration": 2.689955711364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069455, + "balance_loss_mlp": 1.03605282, + "epoch": 0.7373989996152366, + "flos": 557814048768.0, + "grad_norm": 0.062254186962725604, + "language_loss": 0.76771331, + "learning_rate": 0.00017020023669397376, + "loss": 0.77840781, + "num_input_tokens_seen": 317824112, + "router_z_loss_mlp": 0.33422852, + "step": 3833, + "time_per_iteration": 2.7102112770080566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071175, + "balance_loss_mlp": 1.03722405, + "epoch": 0.7375913813005002, + "flos": 506527368192.0, + "grad_norm": 0.05138473189519923, + "language_loss": 0.81401753, + "learning_rate": 0.0001699661401600589, + "loss": 0.82472932, + "num_input_tokens_seen": 317889120, + "router_z_loss_mlp": 0.33984375, + "step": 3834, + "time_per_iteration": 2.5580482482910156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066317, + "balance_loss_mlp": 1.03386855, + "epoch": 0.7377837629857638, + "flos": 485940819456.0, + "grad_norm": 0.04817361691999996, + "language_loss": 0.78101605, + "learning_rate": 0.00016973217175665205, + "loss": 0.7916792, + "num_input_tokens_seen": 317953792, + "router_z_loss_mlp": 0.32446289, + "step": 3835, + "time_per_iteration": 2.5466511249542236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014272, + "balance_loss_mlp": 1.00540292, + "epoch": 0.7379761446710273, + "flos": 1413900776448.0, + "grad_norm": 0.004962525889406641, + "language_loss": 0.8116616, + "learning_rate": 0.00016949833157458755, + "loss": 0.8218044, + "num_input_tokens_seen": 318184848, + "router_z_loss_mlp": 0.08886719, + "step": 3836, + "time_per_iteration": 4.947209358215332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065038, + "balance_loss_mlp": 1.03173065, + "epoch": 0.7381685263562909, + "flos": 629445758976.0, + "grad_norm": 0.04309096718082386, + "language_loss": 0.83880627, + "learning_rate": 0.00016926461970465047, + "loss": 0.84945667, + "num_input_tokens_seen": 318259296, + "router_z_loss_mlp": 0.33325195, + "step": 3837, + "time_per_iteration": 2.7604105472564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064561, + "balance_loss_mlp": 1.03175426, + "epoch": 0.7383609080415544, + "flos": 738869147136.0, + "grad_norm": 0.046495404641084814, + "language_loss": 0.84092653, + "learning_rate": 0.00016903103623757516, + "loss": 0.8515721, + "num_input_tokens_seen": 318344704, + "router_z_loss_mlp": 0.328125, + "step": 3838, + "time_per_iteration": 3.0393178462982178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064697, + "balance_loss_mlp": 1.03165209, + "epoch": 0.738553289726818, + "flos": 549945510912.0, + "grad_norm": 0.05807903751309768, + "language_loss": 0.80044198, + "learning_rate": 0.00016879758126404738, + "loss": 0.81108892, + "num_input_tokens_seen": 318416128, + "router_z_loss_mlp": 0.33056641, + "step": 3839, + "time_per_iteration": 2.7287819385528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066157, + "balance_loss_mlp": 1.03296924, + "epoch": 0.7387456714120816, + "flos": 909925705728.0, + "grad_norm": 0.06505190297085839, + "language_loss": 0.7982837, + "learning_rate": 0.00016856425487470216, + "loss": 0.8089453, + "num_input_tokens_seen": 318498128, + "router_z_loss_mlp": 0.33203125, + "step": 3840, + "time_per_iteration": 3.088334321975708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070195, + "balance_loss_mlp": 1.03724539, + "epoch": 0.7389380530973452, + "flos": 852308352000.0, + "grad_norm": 0.054902923406453155, + "language_loss": 0.78921622, + "learning_rate": 0.00016833105716012486, + "loss": 0.79991817, + "num_input_tokens_seen": 318578048, + "router_z_loss_mlp": 0.32958984, + "step": 3841, + "time_per_iteration": 3.1420795917510986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067762, + "balance_loss_mlp": 1.03433585, + "epoch": 0.7391304347826086, + "flos": 816678761472.0, + "grad_norm": 0.0538484990097731, + "language_loss": 0.85046756, + "learning_rate": 0.00016809798821085088, + "loss": 0.86114514, + "num_input_tokens_seen": 318654784, + "router_z_loss_mlp": 0.33447266, + "step": 3842, + "time_per_iteration": 2.9748454093933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067117, + "balance_loss_mlp": 1.03321409, + "epoch": 0.7393228164678722, + "flos": 572541378048.0, + "grad_norm": 0.07853013477986996, + "language_loss": 0.88786352, + "learning_rate": 0.00016786504811736565, + "loss": 0.89853466, + "num_input_tokens_seen": 318727680, + "router_z_loss_mlp": 0.33935547, + "step": 3843, + "time_per_iteration": 2.697993516921997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107057, + "balance_loss_mlp": 1.0370723, + "epoch": 0.7395151981531358, + "flos": 684903845376.0, + "grad_norm": 0.054879027639850184, + "language_loss": 0.82676303, + "learning_rate": 0.00016763223697010442, + "loss": 0.83746874, + "num_input_tokens_seen": 318807568, + "router_z_loss_mlp": 0.33520508, + "step": 3844, + "time_per_iteration": 2.941396951675415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069204, + "balance_loss_mlp": 1.03680301, + "epoch": 0.7397075798383994, + "flos": 556095711744.0, + "grad_norm": 0.044630458439445526, + "language_loss": 0.84558266, + "learning_rate": 0.00016739955485945256, + "loss": 0.85627472, + "num_input_tokens_seen": 318881792, + "router_z_loss_mlp": 0.32397461, + "step": 3845, + "time_per_iteration": 2.6704368591308594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070852, + "balance_loss_mlp": 1.03692532, + "epoch": 0.739899961523663, + "flos": 546523393536.0, + "grad_norm": 0.16146348926095225, + "language_loss": 0.8579582, + "learning_rate": 0.00016716700187574513, + "loss": 0.86866671, + "num_input_tokens_seen": 318951552, + "router_z_loss_mlp": 0.33959961, + "step": 3846, + "time_per_iteration": 2.689548969268799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066553, + "balance_loss_mlp": 1.03400922, + "epoch": 0.7400923432089265, + "flos": 608913054720.0, + "grad_norm": 0.062089054691193496, + "language_loss": 0.83502501, + "learning_rate": 0.0001669345781092675, + "loss": 0.84569055, + "num_input_tokens_seen": 319022304, + "router_z_loss_mlp": 0.32543945, + "step": 3847, + "time_per_iteration": 2.7922914028167725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106553, + "balance_loss_mlp": 1.03286684, + "epoch": 0.7402847248941901, + "flos": 590715555840.0, + "grad_norm": 0.053588507044290926, + "language_loss": 0.86693704, + "learning_rate": 0.0001667022836502546, + "loss": 0.87759233, + "num_input_tokens_seen": 319093200, + "router_z_loss_mlp": 0.32666016, + "step": 3848, + "time_per_iteration": 2.7810423374176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106826, + "balance_loss_mlp": 1.0351913, + "epoch": 0.7404771065794536, + "flos": 477136728576.0, + "grad_norm": 0.05607520940274661, + "language_loss": 0.82591665, + "learning_rate": 0.00016647011858889077, + "loss": 0.83659923, + "num_input_tokens_seen": 319159712, + "router_z_loss_mlp": 0.33081055, + "step": 3849, + "time_per_iteration": 2.5447256565093994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068875, + "balance_loss_mlp": 1.03552043, + "epoch": 0.7406694882647172, + "flos": 496192614912.0, + "grad_norm": 0.05524374859668954, + "language_loss": 0.85861689, + "learning_rate": 0.00016623808301531056, + "loss": 0.86930567, + "num_input_tokens_seen": 319230544, + "router_z_loss_mlp": 0.33374023, + "step": 3850, + "time_per_iteration": 2.647326707839966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067961, + "balance_loss_mlp": 1.03455853, + "epoch": 0.7408618699499807, + "flos": 561925817856.0, + "grad_norm": 0.0770294501397313, + "language_loss": 0.79239726, + "learning_rate": 0.00016600617701959842, + "loss": 0.80307692, + "num_input_tokens_seen": 319305440, + "router_z_loss_mlp": 0.33422852, + "step": 3851, + "time_per_iteration": 2.724172830581665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011471, + "balance_loss_mlp": 1.00212514, + "epoch": 0.7410542516352443, + "flos": 1387421512704.0, + "grad_norm": 0.004619624955394922, + "language_loss": 0.78843814, + "learning_rate": 0.00016577440069178811, + "loss": 0.79855287, + "num_input_tokens_seen": 319534384, + "router_z_loss_mlp": 0.09326172, + "step": 3852, + "time_per_iteration": 4.94897198677063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069894, + "balance_loss_mlp": 1.03620529, + "epoch": 0.7412466333205079, + "flos": 669697860096.0, + "grad_norm": 0.05139846534823347, + "language_loss": 0.80732995, + "learning_rate": 0.00016554275412186315, + "loss": 0.81802887, + "num_input_tokens_seen": 319610960, + "router_z_loss_mlp": 0.3371582, + "step": 3853, + "time_per_iteration": 2.798964262008667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069502, + "balance_loss_mlp": 1.0356704, + "epoch": 0.7414390150057715, + "flos": 489038459904.0, + "grad_norm": 0.059331107298497686, + "language_loss": 0.80721259, + "learning_rate": 0.0001653112373997568, + "loss": 0.81790757, + "num_input_tokens_seen": 319683872, + "router_z_loss_mlp": 0.33862305, + "step": 3854, + "time_per_iteration": 2.6622824668884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071808, + "balance_loss_mlp": 1.03878713, + "epoch": 0.7416313966910351, + "flos": 599119566336.0, + "grad_norm": 0.060794627478568314, + "language_loss": 0.74696434, + "learning_rate": 0.0001650798506153517, + "loss": 0.7576825, + "num_input_tokens_seen": 319750032, + "router_z_loss_mlp": 0.33032227, + "step": 3855, + "time_per_iteration": 2.6897103786468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068866, + "balance_loss_mlp": 1.03558254, + "epoch": 0.7418237783762985, + "flos": 542279204352.0, + "grad_norm": 0.06401290816121721, + "language_loss": 0.83928871, + "learning_rate": 0.00016484859385848023, + "loss": 0.84997737, + "num_input_tokens_seen": 319818864, + "router_z_loss_mlp": 0.33276367, + "step": 3856, + "time_per_iteration": 2.6182141304016113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065034, + "balance_loss_mlp": 1.0325613, + "epoch": 0.7420161600615621, + "flos": 543865121280.0, + "grad_norm": 0.060824827203723085, + "language_loss": 0.77091217, + "learning_rate": 0.0001646174672189243, + "loss": 0.78156251, + "num_input_tokens_seen": 319888816, + "router_z_loss_mlp": 0.32470703, + "step": 3857, + "time_per_iteration": 2.639897584915161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072771, + "balance_loss_mlp": 1.039464, + "epoch": 0.7422085417468257, + "flos": 526921860096.0, + "grad_norm": 0.05508256135397888, + "language_loss": 0.80038357, + "learning_rate": 0.00016438647078641488, + "loss": 0.81111133, + "num_input_tokens_seen": 319956176, + "router_z_loss_mlp": 0.33325195, + "step": 3858, + "time_per_iteration": 2.583303213119507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071831, + "balance_loss_mlp": 1.0385952, + "epoch": 0.7424009234320893, + "flos": 508404266496.0, + "grad_norm": 0.05219884306446566, + "language_loss": 0.83017123, + "learning_rate": 0.00016415560465063344, + "loss": 0.84088957, + "num_input_tokens_seen": 320028560, + "router_z_loss_mlp": 0.33251953, + "step": 3859, + "time_per_iteration": 2.7442150115966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069864, + "balance_loss_mlp": 1.03670025, + "epoch": 0.7425933051173528, + "flos": 512347299840.0, + "grad_norm": 0.07638052287216905, + "language_loss": 0.78861916, + "learning_rate": 0.0001639248689012095, + "loss": 0.79931784, + "num_input_tokens_seen": 320096112, + "router_z_loss_mlp": 0.33154297, + "step": 3860, + "time_per_iteration": 2.5846545696258545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067753, + "balance_loss_mlp": 1.03487468, + "epoch": 0.7427856868026164, + "flos": 458034200064.0, + "grad_norm": 0.05020095318806213, + "language_loss": 0.87714618, + "learning_rate": 0.00016369426362772271, + "loss": 0.8878237, + "num_input_tokens_seen": 320168992, + "router_z_loss_mlp": 0.32885742, + "step": 3861, + "time_per_iteration": 2.7977116107940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106857, + "balance_loss_mlp": 1.03576398, + "epoch": 0.74297806848788, + "flos": 604728502272.0, + "grad_norm": 0.04367608298357755, + "language_loss": 0.80370325, + "learning_rate": 0.00016346378891970233, + "loss": 0.81438893, + "num_input_tokens_seen": 320247264, + "router_z_loss_mlp": 0.328125, + "step": 3862, + "time_per_iteration": 2.8144397735595703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067373, + "balance_loss_mlp": 1.03416157, + "epoch": 0.7431704501731435, + "flos": 890971564032.0, + "grad_norm": 0.052584770309724485, + "language_loss": 0.81109643, + "learning_rate": 0.00016323344486662633, + "loss": 0.82177019, + "num_input_tokens_seen": 320338992, + "router_z_loss_mlp": 0.33227539, + "step": 3863, + "time_per_iteration": 3.306062936782837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069805, + "balance_loss_mlp": 1.03566337, + "epoch": 0.7433628318584071, + "flos": 591867896832.0, + "grad_norm": 0.05409036708303953, + "language_loss": 0.78479373, + "learning_rate": 0.00016300323155792247, + "loss": 0.79549176, + "num_input_tokens_seen": 320422096, + "router_z_loss_mlp": 0.34179688, + "step": 3864, + "time_per_iteration": 2.881361961364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070231, + "balance_loss_mlp": 1.03756773, + "epoch": 0.7435552135436706, + "flos": 476896619520.0, + "grad_norm": 0.06261465074360906, + "language_loss": 0.88414448, + "learning_rate": 0.00016277314908296687, + "loss": 0.8948468, + "num_input_tokens_seen": 320492640, + "router_z_loss_mlp": 0.32666016, + "step": 3865, + "time_per_iteration": 2.6607327461242676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068826, + "balance_loss_mlp": 1.03497088, + "epoch": 0.7437475952289342, + "flos": 672874076160.0, + "grad_norm": 0.05871216754162407, + "language_loss": 0.75963724, + "learning_rate": 0.00016254319753108604, + "loss": 0.77032548, + "num_input_tokens_seen": 320565264, + "router_z_loss_mlp": 0.33862305, + "step": 3866, + "time_per_iteration": 2.8663392066955566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072261, + "balance_loss_mlp": 1.03881145, + "epoch": 0.7439399769141978, + "flos": 770094577152.0, + "grad_norm": 0.0657107928380086, + "language_loss": 0.76937765, + "learning_rate": 0.00016231337699155492, + "loss": 0.78010023, + "num_input_tokens_seen": 320647584, + "router_z_loss_mlp": 0.3347168, + "step": 3867, + "time_per_iteration": 3.0015652179718018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069051, + "balance_loss_mlp": 1.03579164, + "epoch": 0.7441323585994614, + "flos": 647462785536.0, + "grad_norm": 0.05480167763007067, + "language_loss": 0.781057, + "learning_rate": 0.0001620836875535977, + "loss": 0.79174751, + "num_input_tokens_seen": 320722752, + "router_z_loss_mlp": 0.33276367, + "step": 3868, + "time_per_iteration": 2.842230796813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067908, + "balance_loss_mlp": 1.03438592, + "epoch": 0.7443247402847248, + "flos": 565091859456.0, + "grad_norm": 0.08182292750671373, + "language_loss": 0.80810648, + "learning_rate": 0.00016185412930638766, + "loss": 0.81878555, + "num_input_tokens_seen": 320802496, + "router_z_loss_mlp": 0.33544922, + "step": 3869, + "time_per_iteration": 2.7977213859558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071043, + "balance_loss_mlp": 1.03761721, + "epoch": 0.7445171219699884, + "flos": 578243446272.0, + "grad_norm": 0.07110615471626963, + "language_loss": 0.82752168, + "learning_rate": 0.00016162470233904765, + "loss": 0.8382321, + "num_input_tokens_seen": 320872496, + "router_z_loss_mlp": 0.33447266, + "step": 3870, + "time_per_iteration": 2.707329273223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106823, + "balance_loss_mlp": 1.03456485, + "epoch": 0.744709503655252, + "flos": 618588679680.0, + "grad_norm": 0.08201563915437336, + "language_loss": 0.81978703, + "learning_rate": 0.00016139540674064856, + "loss": 0.83046937, + "num_input_tokens_seen": 320944992, + "router_z_loss_mlp": 0.33666992, + "step": 3871, + "time_per_iteration": 2.779015302658081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066328, + "balance_loss_mlp": 1.03349781, + "epoch": 0.7449018853405156, + "flos": 528355008000.0, + "grad_norm": 0.053737872907142804, + "language_loss": 0.77632427, + "learning_rate": 0.00016116624260021113, + "loss": 0.78698754, + "num_input_tokens_seen": 321020208, + "router_z_loss_mlp": 0.32836914, + "step": 3872, + "time_per_iteration": 2.748868942260742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068405, + "balance_loss_mlp": 1.03509796, + "epoch": 0.7450942670257792, + "flos": 433088570880.0, + "grad_norm": 0.050066249617561176, + "language_loss": 0.83786619, + "learning_rate": 0.0001609372100067046, + "loss": 0.84855032, + "num_input_tokens_seen": 321085984, + "router_z_loss_mlp": 0.33325195, + "step": 3873, + "time_per_iteration": 2.5261478424072266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068874, + "balance_loss_mlp": 1.03504205, + "epoch": 0.7452866487110427, + "flos": 696562647552.0, + "grad_norm": 0.062485843646331765, + "language_loss": 0.84858561, + "learning_rate": 0.0001607083090490475, + "loss": 0.85927439, + "num_input_tokens_seen": 321163200, + "router_z_loss_mlp": 0.33862305, + "step": 3874, + "time_per_iteration": 2.912550210952759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070703, + "balance_loss_mlp": 1.03718174, + "epoch": 0.7454790303963063, + "flos": 511944247296.0, + "grad_norm": 0.05620990191133866, + "language_loss": 0.80024898, + "learning_rate": 0.00016047953981610714, + "loss": 0.810956, + "num_input_tokens_seen": 321237328, + "router_z_loss_mlp": 0.33544922, + "step": 3875, + "time_per_iteration": 2.7009074687957764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024051, + "balance_loss_mlp": 1.01460981, + "epoch": 0.7456714120815698, + "flos": 1325221088256.0, + "grad_norm": 0.008467942690165917, + "language_loss": 0.7972964, + "learning_rate": 0.00016025090239669916, + "loss": 0.8075369, + "num_input_tokens_seen": 321456192, + "router_z_loss_mlp": 0.09423828, + "step": 3876, + "time_per_iteration": 4.952231168746948 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065537, + "balance_loss_mlp": 1.0320152, + "epoch": 0.7458637937668334, + "flos": 721397767680.0, + "grad_norm": 0.05688245720911951, + "language_loss": 0.8058607, + "learning_rate": 0.0001600223968795889, + "loss": 0.8165161, + "num_input_tokens_seen": 321530560, + "router_z_loss_mlp": 0.33544922, + "step": 3877, + "time_per_iteration": 2.87972092628479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024325, + "balance_loss_mlp": 1.014979, + "epoch": 0.746056175452097, + "flos": 1500761793024.0, + "grad_norm": 0.00806071633609759, + "language_loss": 0.75696075, + "learning_rate": 0.00015979402335349004, + "loss": 0.76720393, + "num_input_tokens_seen": 321760928, + "router_z_loss_mlp": 0.09326172, + "step": 3878, + "time_per_iteration": 4.914839029312134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065825, + "balance_loss_mlp": 1.03335285, + "epoch": 0.7462485571373605, + "flos": 519984493056.0, + "grad_norm": 0.05864389965433195, + "language_loss": 0.81840986, + "learning_rate": 0.00015956578190706483, + "loss": 0.82906812, + "num_input_tokens_seen": 321833248, + "router_z_loss_mlp": 0.32470703, + "step": 3879, + "time_per_iteration": 2.690336227416992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067732, + "balance_loss_mlp": 1.03492546, + "epoch": 0.7464409388226241, + "flos": 480967690752.0, + "grad_norm": 0.05296793730256709, + "language_loss": 0.75717044, + "learning_rate": 0.00015933767262892468, + "loss": 0.76784778, + "num_input_tokens_seen": 321905904, + "router_z_loss_mlp": 0.328125, + "step": 3880, + "time_per_iteration": 2.702094078063965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106723, + "balance_loss_mlp": 1.03418517, + "epoch": 0.7466333205078877, + "flos": 486516989952.0, + "grad_norm": 0.06088844142287201, + "language_loss": 0.81730115, + "learning_rate": 0.00015910969560762927, + "loss": 0.82797348, + "num_input_tokens_seen": 321971920, + "router_z_loss_mlp": 0.33056641, + "step": 3881, + "time_per_iteration": 2.5547542572021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066924, + "balance_loss_mlp": 1.03464174, + "epoch": 0.7468257021931513, + "flos": 611015505408.0, + "grad_norm": 0.05773306272323557, + "language_loss": 0.83265662, + "learning_rate": 0.00015888185093168727, + "loss": 0.84332585, + "num_input_tokens_seen": 322041904, + "router_z_loss_mlp": 0.32275391, + "step": 3882, + "time_per_iteration": 2.7600655555725098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069044, + "balance_loss_mlp": 1.03502131, + "epoch": 0.7470180838784147, + "flos": 533204481024.0, + "grad_norm": 0.06850625099692723, + "language_loss": 0.8104043, + "learning_rate": 0.00015865413868955581, + "loss": 0.82109475, + "num_input_tokens_seen": 322110816, + "router_z_loss_mlp": 0.34057617, + "step": 3883, + "time_per_iteration": 2.6018030643463135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066378, + "balance_loss_mlp": 1.03378606, + "epoch": 0.7472104655636783, + "flos": 739005949440.0, + "grad_norm": 0.05384081039558067, + "language_loss": 0.82672417, + "learning_rate": 0.00015842655896964054, + "loss": 0.83738798, + "num_input_tokens_seen": 322192704, + "router_z_loss_mlp": 0.32592773, + "step": 3884, + "time_per_iteration": 3.021933078765869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065038, + "balance_loss_mlp": 1.03223145, + "epoch": 0.7474028472489419, + "flos": 640007474688.0, + "grad_norm": 0.052763664912519236, + "language_loss": 0.73725951, + "learning_rate": 0.00015819911186029567, + "loss": 0.7479099, + "num_input_tokens_seen": 322263888, + "router_z_loss_mlp": 0.328125, + "step": 3885, + "time_per_iteration": 2.8068392276763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068676, + "balance_loss_mlp": 1.03577399, + "epoch": 0.7475952289342055, + "flos": 589980824064.0, + "grad_norm": 0.05740266756526494, + "language_loss": 0.8658216, + "learning_rate": 0.00015797179744982443, + "loss": 0.87650836, + "num_input_tokens_seen": 322331936, + "router_z_loss_mlp": 0.32910156, + "step": 3886, + "time_per_iteration": 2.7342216968536377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067874, + "balance_loss_mlp": 1.03492451, + "epoch": 0.7477876106194691, + "flos": 487935581184.0, + "grad_norm": 0.05063564499597122, + "language_loss": 0.79109228, + "learning_rate": 0.00015774461582647765, + "loss": 0.80177104, + "num_input_tokens_seen": 322402032, + "router_z_loss_mlp": 0.32958984, + "step": 3887, + "time_per_iteration": 2.617705821990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066709, + "balance_loss_mlp": 1.03390241, + "epoch": 0.7479799923047326, + "flos": 554470507008.0, + "grad_norm": 0.04778068214414316, + "language_loss": 0.81002998, + "learning_rate": 0.00015751756707845505, + "loss": 0.82069701, + "num_input_tokens_seen": 322472512, + "router_z_loss_mlp": 0.328125, + "step": 3888, + "time_per_iteration": 2.611276626586914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067147, + "balance_loss_mlp": 1.03505635, + "epoch": 0.7481723739899961, + "flos": 767037634560.0, + "grad_norm": 0.054687563688018546, + "language_loss": 0.88108873, + "learning_rate": 0.00015729065129390502, + "loss": 0.89176023, + "num_input_tokens_seen": 322555104, + "router_z_loss_mlp": 0.32080078, + "step": 3889, + "time_per_iteration": 3.022294759750366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069902, + "balance_loss_mlp": 1.03614235, + "epoch": 0.7483647556752597, + "flos": 495926364672.0, + "grad_norm": 0.07150557993865005, + "language_loss": 0.81957299, + "learning_rate": 0.0001570638685609241, + "loss": 0.83027202, + "num_input_tokens_seen": 322621904, + "router_z_loss_mlp": 0.33789062, + "step": 3890, + "time_per_iteration": 2.540038585662842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068666, + "balance_loss_mlp": 1.03588343, + "epoch": 0.7485571373605233, + "flos": 472607350272.0, + "grad_norm": 0.055161335390356114, + "language_loss": 0.8031671, + "learning_rate": 0.00015683721896755693, + "loss": 0.81385386, + "num_input_tokens_seen": 322688928, + "router_z_loss_mlp": 0.32788086, + "step": 3891, + "time_per_iteration": 2.5199973583221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015851, + "balance_loss_mlp": 1.00683892, + "epoch": 0.7487495190457868, + "flos": 1553619833856.0, + "grad_norm": 0.004937901566549453, + "language_loss": 0.82210493, + "learning_rate": 0.00015661070260179682, + "loss": 0.83226347, + "num_input_tokens_seen": 322928464, + "router_z_loss_mlp": 0.09033203, + "step": 3892, + "time_per_iteration": 4.912605047225952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068919, + "balance_loss_mlp": 1.03632677, + "epoch": 0.7489419007310504, + "flos": 581566639104.0, + "grad_norm": 0.04880798479848443, + "language_loss": 0.84992248, + "learning_rate": 0.00015638431955158528, + "loss": 0.86061168, + "num_input_tokens_seen": 323002672, + "router_z_loss_mlp": 0.32592773, + "step": 3893, + "time_per_iteration": 2.6795592308044434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066956, + "balance_loss_mlp": 1.03398299, + "epoch": 0.749134282416314, + "flos": 567297616896.0, + "grad_norm": 0.04606226658973748, + "language_loss": 0.80857748, + "learning_rate": 0.00015615806990481186, + "loss": 0.81924701, + "num_input_tokens_seen": 323076480, + "router_z_loss_mlp": 0.32983398, + "step": 3894, + "time_per_iteration": 2.7299861907958984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066927, + "balance_loss_mlp": 1.03433573, + "epoch": 0.7493266641015776, + "flos": 532786871808.0, + "grad_norm": 0.044395679249862555, + "language_loss": 0.8442167, + "learning_rate": 0.00015593195374931452, + "loss": 0.854886, + "num_input_tokens_seen": 323151840, + "router_z_loss_mlp": 0.32592773, + "step": 3895, + "time_per_iteration": 2.725260019302368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066101, + "balance_loss_mlp": 1.03346133, + "epoch": 0.7495190457868411, + "flos": 523338209280.0, + "grad_norm": 0.05913067332521358, + "language_loss": 0.79859447, + "learning_rate": 0.00015570597117287922, + "loss": 0.80925548, + "num_input_tokens_seen": 323223376, + "router_z_loss_mlp": 0.32641602, + "step": 3896, + "time_per_iteration": 2.6577799320220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070214, + "balance_loss_mlp": 1.03762269, + "epoch": 0.7497114274721046, + "flos": 513937598976.0, + "grad_norm": 0.0999283842203671, + "language_loss": 0.77427346, + "learning_rate": 0.0001554801222632406, + "loss": 0.78497565, + "num_input_tokens_seen": 323290288, + "router_z_loss_mlp": 0.32592773, + "step": 3897, + "time_per_iteration": 2.6006200313568115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065155, + "balance_loss_mlp": 1.03239596, + "epoch": 0.7499038091573682, + "flos": 494759467008.0, + "grad_norm": 0.050843654610054065, + "language_loss": 0.85019195, + "learning_rate": 0.00015525440710808052, + "loss": 0.86084348, + "num_input_tokens_seen": 323359568, + "router_z_loss_mlp": 0.32763672, + "step": 3898, + "time_per_iteration": 2.661421775817871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068075, + "balance_loss_mlp": 1.03586483, + "epoch": 0.7500961908426318, + "flos": 737326900224.0, + "grad_norm": 0.05107930467548482, + "language_loss": 0.77678949, + "learning_rate": 0.00015502882579502953, + "loss": 0.78747022, + "num_input_tokens_seen": 323436688, + "router_z_loss_mlp": 0.32202148, + "step": 3899, + "time_per_iteration": 2.9202702045440674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062076, + "balance_loss_mlp": 1.02931714, + "epoch": 0.7502885725278954, + "flos": 533117140992.0, + "grad_norm": 0.046214312949338116, + "language_loss": 0.84483492, + "learning_rate": 0.00015480337841166592, + "loss": 0.85545564, + "num_input_tokens_seen": 323510032, + "router_z_loss_mlp": 0.32763672, + "step": 3900, + "time_per_iteration": 2.704392194747925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070609, + "balance_loss_mlp": 1.03761196, + "epoch": 0.7504809542131589, + "flos": 589017567744.0, + "grad_norm": 0.05276594694020605, + "language_loss": 0.82456982, + "learning_rate": 0.00015457806504551647, + "loss": 0.83527595, + "num_input_tokens_seen": 323588896, + "router_z_loss_mlp": 0.33007812, + "step": 3901, + "time_per_iteration": 2.8369719982147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066454, + "balance_loss_mlp": 1.03376722, + "epoch": 0.7506733358984224, + "flos": 511293883392.0, + "grad_norm": 0.05412460278066938, + "language_loss": 0.78305542, + "learning_rate": 0.0001543528857840554, + "loss": 0.79372001, + "num_input_tokens_seen": 323661280, + "router_z_loss_mlp": 0.3269043, + "step": 3902, + "time_per_iteration": 2.679732084274292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064427, + "balance_loss_mlp": 1.03204942, + "epoch": 0.750865717583686, + "flos": 538990917120.0, + "grad_norm": 0.0614099012114921, + "language_loss": 0.80124992, + "learning_rate": 0.000154127840714705, + "loss": 0.81189418, + "num_input_tokens_seen": 323739200, + "router_z_loss_mlp": 0.32373047, + "step": 3903, + "time_per_iteration": 2.7384650707244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065338, + "balance_loss_mlp": 1.03265119, + "epoch": 0.7510580992689496, + "flos": 476339387904.0, + "grad_norm": 0.0665672194872541, + "language_loss": 0.81678092, + "learning_rate": 0.00015390292992483557, + "loss": 0.82743436, + "num_input_tokens_seen": 323802816, + "router_z_loss_mlp": 0.3269043, + "step": 3904, + "time_per_iteration": 2.489619731903076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106175, + "balance_loss_mlp": 1.02894402, + "epoch": 0.7512504809542132, + "flos": 578755597824.0, + "grad_norm": 0.06071277834491827, + "language_loss": 0.83697867, + "learning_rate": 0.00015367815350176523, + "loss": 0.84759617, + "num_input_tokens_seen": 323879488, + "router_z_loss_mlp": 0.328125, + "step": 3905, + "time_per_iteration": 2.716557025909424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062505, + "balance_loss_mlp": 1.02943611, + "epoch": 0.7514428626394767, + "flos": 418435435008.0, + "grad_norm": 0.05426428820628694, + "language_loss": 0.82564658, + "learning_rate": 0.00015345351153275987, + "loss": 0.83627158, + "num_input_tokens_seen": 323944512, + "router_z_loss_mlp": 0.33081055, + "step": 3906, + "time_per_iteration": 2.522923707962036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065336, + "balance_loss_mlp": 1.03262544, + "epoch": 0.7516352443247403, + "flos": 640736414208.0, + "grad_norm": 0.05433907321222643, + "language_loss": 0.80729043, + "learning_rate": 0.00015322900410503332, + "loss": 0.81794381, + "num_input_tokens_seen": 324020688, + "router_z_loss_mlp": 0.32714844, + "step": 3907, + "time_per_iteration": 2.793536424636841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064989, + "balance_loss_mlp": 1.03189635, + "epoch": 0.7518276260100039, + "flos": 580700897280.0, + "grad_norm": 0.05951130469098692, + "language_loss": 0.76875365, + "learning_rate": 0.00015300463130574703, + "loss": 0.77940357, + "num_input_tokens_seen": 324098080, + "router_z_loss_mlp": 0.33105469, + "step": 3908, + "time_per_iteration": 2.8399226665496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063261, + "balance_loss_mlp": 1.03045464, + "epoch": 0.7520200076952674, + "flos": 687025234944.0, + "grad_norm": 0.0651669879699934, + "language_loss": 0.81970477, + "learning_rate": 0.00015278039322201033, + "loss": 0.83033741, + "num_input_tokens_seen": 324183968, + "router_z_loss_mlp": 0.328125, + "step": 3909, + "time_per_iteration": 2.9373419284820557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063542, + "balance_loss_mlp": 1.02985382, + "epoch": 0.7522123893805309, + "flos": 486196895232.0, + "grad_norm": 0.06049213601321292, + "language_loss": 0.79440963, + "learning_rate": 0.00015255628994088004, + "loss": 0.80504501, + "num_input_tokens_seen": 324249568, + "router_z_loss_mlp": 0.3371582, + "step": 3910, + "time_per_iteration": 2.528364419937134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065253, + "balance_loss_mlp": 1.03175521, + "epoch": 0.7524047710657945, + "flos": 818581800960.0, + "grad_norm": 0.05892068173673864, + "language_loss": 0.75070155, + "learning_rate": 0.00015233232154936082, + "loss": 0.76135409, + "num_input_tokens_seen": 324345312, + "router_z_loss_mlp": 0.33520508, + "step": 3911, + "time_per_iteration": 3.230201244354248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062613, + "balance_loss_mlp": 1.02916312, + "epoch": 0.7525971527510581, + "flos": 699191806464.0, + "grad_norm": 0.055756434069827554, + "language_loss": 0.76463896, + "learning_rate": 0.0001521084881344048, + "loss": 0.7752651, + "num_input_tokens_seen": 324419056, + "router_z_loss_mlp": 0.3347168, + "step": 3912, + "time_per_iteration": 2.8348512649536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065733, + "balance_loss_mlp": 1.03216362, + "epoch": 0.7527895344363217, + "flos": 633497891328.0, + "grad_norm": 0.050850444756768094, + "language_loss": 0.86350536, + "learning_rate": 0.00015188478978291208, + "loss": 0.87416273, + "num_input_tokens_seen": 324490848, + "router_z_loss_mlp": 0.3359375, + "step": 3913, + "time_per_iteration": 2.744290828704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067061, + "balance_loss_mlp": 1.03404021, + "epoch": 0.7529819161215853, + "flos": 562555832832.0, + "grad_norm": 0.05433821617011464, + "language_loss": 0.8621949, + "learning_rate": 0.00015166122658173014, + "loss": 0.8728655, + "num_input_tokens_seen": 324565648, + "router_z_loss_mlp": 0.33032227, + "step": 3914, + "time_per_iteration": 2.8117570877075195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066949, + "balance_loss_mlp": 1.03368926, + "epoch": 0.7531742978068487, + "flos": 690344045568.0, + "grad_norm": 0.048975254587736855, + "language_loss": 0.88076222, + "learning_rate": 0.00015143779861765332, + "loss": 0.89143169, + "num_input_tokens_seen": 324642832, + "router_z_loss_mlp": 0.33251953, + "step": 3915, + "time_per_iteration": 2.8815720081329346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063708, + "balance_loss_mlp": 1.03140223, + "epoch": 0.7533666794921123, + "flos": 680800840704.0, + "grad_norm": 0.04986662461838111, + "language_loss": 0.81009239, + "learning_rate": 0.00015121450597742458, + "loss": 0.82072949, + "num_input_tokens_seen": 324718336, + "router_z_loss_mlp": 0.32299805, + "step": 3916, + "time_per_iteration": 2.80761456489563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010608, + "balance_loss_mlp": 1.02830386, + "epoch": 0.7535590611773759, + "flos": 623384308224.0, + "grad_norm": 0.05782496092002166, + "language_loss": 0.78096646, + "learning_rate": 0.00015099134874773369, + "loss": 0.79157448, + "num_input_tokens_seen": 324787744, + "router_z_loss_mlp": 0.32495117, + "step": 3917, + "time_per_iteration": 2.7233426570892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065187, + "balance_loss_mlp": 1.03149819, + "epoch": 0.7537514428626395, + "flos": 519162421248.0, + "grad_norm": 0.0518571632225719, + "language_loss": 0.80421233, + "learning_rate": 0.00015076832701521793, + "loss": 0.81486416, + "num_input_tokens_seen": 324863280, + "router_z_loss_mlp": 0.3371582, + "step": 3918, + "time_per_iteration": 2.6993284225463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062901, + "balance_loss_mlp": 1.02971327, + "epoch": 0.753943824547903, + "flos": 723309571584.0, + "grad_norm": 0.06554029395428207, + "language_loss": 0.82133907, + "learning_rate": 0.000150545440866462, + "loss": 0.83196807, + "num_input_tokens_seen": 324949600, + "router_z_loss_mlp": 0.33203125, + "step": 3919, + "time_per_iteration": 2.9902353286743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063028, + "balance_loss_mlp": 1.03050804, + "epoch": 0.7541362062331666, + "flos": 437318203392.0, + "grad_norm": 0.051833460096662155, + "language_loss": 0.78462708, + "learning_rate": 0.000150322690387998, + "loss": 0.79525733, + "num_input_tokens_seen": 325013808, + "router_z_loss_mlp": 0.32519531, + "step": 3920, + "time_per_iteration": 2.496290922164917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106303, + "balance_loss_mlp": 1.02941298, + "epoch": 0.7543285879184302, + "flos": 565007491584.0, + "grad_norm": 0.05213671641073607, + "language_loss": 0.75242233, + "learning_rate": 0.00015010007566630535, + "loss": 0.76305258, + "num_input_tokens_seen": 325084832, + "router_z_loss_mlp": 0.33642578, + "step": 3921, + "time_per_iteration": 2.7238450050354004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065892, + "balance_loss_mlp": 1.03210807, + "epoch": 0.7545209696036937, + "flos": 520781833728.0, + "grad_norm": 0.060725267986870404, + "language_loss": 0.8104378, + "learning_rate": 0.00014987759678781077, + "loss": 0.82109678, + "num_input_tokens_seen": 325155120, + "router_z_loss_mlp": 0.33813477, + "step": 3922, + "time_per_iteration": 2.596788167953491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066538, + "balance_loss_mlp": 1.03208637, + "epoch": 0.7547133512889573, + "flos": 615782020608.0, + "grad_norm": 0.05117423221869946, + "language_loss": 0.82205606, + "learning_rate": 0.00014965525383888795, + "loss": 0.83272147, + "num_input_tokens_seen": 325235632, + "router_z_loss_mlp": 0.3449707, + "step": 3923, + "time_per_iteration": 2.7719502449035645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106299, + "balance_loss_mlp": 1.0298022, + "epoch": 0.7549057329742208, + "flos": 750522157056.0, + "grad_norm": 0.05672347636966434, + "language_loss": 0.72166836, + "learning_rate": 0.00014943304690585851, + "loss": 0.73229825, + "num_input_tokens_seen": 325309696, + "router_z_loss_mlp": 0.33203125, + "step": 3924, + "time_per_iteration": 2.90588116645813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063807, + "balance_loss_mlp": 1.03069079, + "epoch": 0.7550981146594844, + "flos": 514193674752.0, + "grad_norm": 0.06004038441284508, + "language_loss": 0.79123962, + "learning_rate": 0.0001492109760749908, + "loss": 0.80187768, + "num_input_tokens_seen": 325375744, + "router_z_loss_mlp": 0.33129883, + "step": 3925, + "time_per_iteration": 2.573479652404785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062523, + "balance_loss_mlp": 1.02900124, + "epoch": 0.755290496344748, + "flos": 521756674560.0, + "grad_norm": 0.04754610420203459, + "language_loss": 0.79945302, + "learning_rate": 0.00014898904143250002, + "loss": 0.81007826, + "num_input_tokens_seen": 325448384, + "router_z_loss_mlp": 0.33544922, + "step": 3926, + "time_per_iteration": 2.6605517864227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011981, + "balance_loss_mlp": 1.00320745, + "epoch": 0.7554828780300116, + "flos": 1413845521920.0, + "grad_norm": 0.009243318676460378, + "language_loss": 0.75755203, + "learning_rate": 0.00014876724306454886, + "loss": 0.76767182, + "num_input_tokens_seen": 325678672, + "router_z_loss_mlp": 0.08789062, + "step": 3927, + "time_per_iteration": 4.911595106124878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066882, + "balance_loss_mlp": 1.03343201, + "epoch": 0.7556752597152752, + "flos": 556676264448.0, + "grad_norm": 0.06225847362151781, + "language_loss": 0.80114925, + "learning_rate": 0.0001485455810572474, + "loss": 0.81181806, + "num_input_tokens_seen": 325746656, + "router_z_loss_mlp": 0.3347168, + "step": 3928, + "time_per_iteration": 2.6221096515655518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061736, + "balance_loss_mlp": 1.02864373, + "epoch": 0.7558676414005386, + "flos": 563363347968.0, + "grad_norm": 0.0430287394272786, + "language_loss": 0.83688951, + "learning_rate": 0.00014832405549665236, + "loss": 0.84750688, + "num_input_tokens_seen": 325820304, + "router_z_loss_mlp": 0.33105469, + "step": 3929, + "time_per_iteration": 2.687077760696411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062194, + "balance_loss_mlp": 1.02898264, + "epoch": 0.7560600230858022, + "flos": 561089189376.0, + "grad_norm": 0.072300166117579, + "language_loss": 0.78684491, + "learning_rate": 0.00014810266646876746, + "loss": 0.79746687, + "num_input_tokens_seen": 325895584, + "router_z_loss_mlp": 0.33227539, + "step": 3930, + "time_per_iteration": 2.784480571746826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060871, + "balance_loss_mlp": 1.02703977, + "epoch": 0.7562524047710658, + "flos": 719232708096.0, + "grad_norm": 0.05835242926257929, + "language_loss": 0.7758401, + "learning_rate": 0.00014788141405954364, + "loss": 0.78644884, + "num_input_tokens_seen": 325976752, + "router_z_loss_mlp": 0.33862305, + "step": 3931, + "time_per_iteration": 2.9784233570098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066404, + "balance_loss_mlp": 1.03345442, + "epoch": 0.7564447864563294, + "flos": 543086719488.0, + "grad_norm": 0.059110171964688825, + "language_loss": 0.84827656, + "learning_rate": 0.00014766029835487865, + "loss": 0.85894054, + "num_input_tokens_seen": 326047152, + "router_z_loss_mlp": 0.32958984, + "step": 3932, + "time_per_iteration": 2.6907904148101807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062333, + "balance_loss_mlp": 1.02945542, + "epoch": 0.7566371681415929, + "flos": 725484805632.0, + "grad_norm": 0.06258669653948258, + "language_loss": 0.79361248, + "learning_rate": 0.0001474393194406173, + "loss": 0.80423582, + "num_input_tokens_seen": 326119056, + "router_z_loss_mlp": 0.32885742, + "step": 3933, + "time_per_iteration": 2.8968892097473145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062749, + "balance_loss_mlp": 1.02991855, + "epoch": 0.7568295498268565, + "flos": 576274825728.0, + "grad_norm": 0.05981896319872157, + "language_loss": 0.79737186, + "learning_rate": 0.00014721847740255112, + "loss": 0.80799937, + "num_input_tokens_seen": 326196736, + "router_z_loss_mlp": 0.32836914, + "step": 3934, + "time_per_iteration": 2.7890961170196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011573, + "balance_loss_mlp": 1.00279939, + "epoch": 0.75702193151212, + "flos": 1519273594368.0, + "grad_norm": 0.004234862497934677, + "language_loss": 0.73911923, + "learning_rate": 0.00014699777232641853, + "loss": 0.74923497, + "num_input_tokens_seen": 326404752, + "router_z_loss_mlp": 0.08789062, + "step": 3935, + "time_per_iteration": 4.601314544677734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061211, + "balance_loss_mlp": 1.02866662, + "epoch": 0.7572143131973836, + "flos": 525218079744.0, + "grad_norm": 0.08729501831475094, + "language_loss": 0.78364342, + "learning_rate": 0.00014677720429790526, + "loss": 0.7942555, + "num_input_tokens_seen": 326472832, + "router_z_loss_mlp": 0.32543945, + "step": 3936, + "time_per_iteration": 2.5926949977874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061936, + "balance_loss_mlp": 1.0290581, + "epoch": 0.7574066948826472, + "flos": 550467836928.0, + "grad_norm": 0.04449678335712254, + "language_loss": 0.84388995, + "learning_rate": 0.0001465567734026429, + "loss": 0.85450935, + "num_input_tokens_seen": 326546976, + "router_z_loss_mlp": 0.32885742, + "step": 3937, + "time_per_iteration": 2.673203706741333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064975, + "balance_loss_mlp": 1.03183448, + "epoch": 0.7575990765679107, + "flos": 395682416640.0, + "grad_norm": 0.06471305080336787, + "language_loss": 0.82730478, + "learning_rate": 0.00014633647972621034, + "loss": 0.83795452, + "num_input_tokens_seen": 326609296, + "router_z_loss_mlp": 0.33154297, + "step": 3938, + "time_per_iteration": 2.4455604553222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067639, + "balance_loss_mlp": 1.03399837, + "epoch": 0.7577914582531743, + "flos": 584742855168.0, + "grad_norm": 0.04609831927497642, + "language_loss": 0.86192119, + "learning_rate": 0.00014611632335413354, + "loss": 0.87259758, + "num_input_tokens_seen": 326687168, + "router_z_loss_mlp": 0.33666992, + "step": 3939, + "time_per_iteration": 2.7661402225494385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068002, + "balance_loss_mlp": 1.03526759, + "epoch": 0.7579838399384379, + "flos": 820604265984.0, + "grad_norm": 0.05221570879511052, + "language_loss": 0.82420516, + "learning_rate": 0.00014589630437188456, + "loss": 0.83488512, + "num_input_tokens_seen": 326777760, + "router_z_loss_mlp": 0.32739258, + "step": 3940, + "time_per_iteration": 3.1596429347991943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010684, + "balance_loss_mlp": 1.03578401, + "epoch": 0.7581762216237015, + "flos": 443664843264.0, + "grad_norm": 0.0650937472679739, + "language_loss": 0.78844047, + "learning_rate": 0.00014567642286488253, + "loss": 0.79912448, + "num_input_tokens_seen": 326843952, + "router_z_loss_mlp": 0.32617188, + "step": 3941, + "time_per_iteration": 2.515453577041626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067786, + "balance_loss_mlp": 1.03505135, + "epoch": 0.7583686033089649, + "flos": 540624886272.0, + "grad_norm": 0.060324478977950624, + "language_loss": 0.7890631, + "learning_rate": 0.00014545667891849258, + "loss": 0.79974091, + "num_input_tokens_seen": 326911296, + "router_z_loss_mlp": 0.32739258, + "step": 3942, + "time_per_iteration": 2.632852554321289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068794, + "balance_loss_mlp": 1.03648806, + "epoch": 0.7585609849942285, + "flos": 522332845056.0, + "grad_norm": 0.05155975595459647, + "language_loss": 0.8239159, + "learning_rate": 0.00014523707261802733, + "loss": 0.83460391, + "num_input_tokens_seen": 326977776, + "router_z_loss_mlp": 0.32299805, + "step": 3943, + "time_per_iteration": 2.6377763748168945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074539, + "balance_loss_mlp": 1.04170835, + "epoch": 0.7587533666794921, + "flos": 541599727104.0, + "grad_norm": 0.05698795626816005, + "language_loss": 0.81395125, + "learning_rate": 0.00014501760404874527, + "loss": 0.82469666, + "num_input_tokens_seen": 327050240, + "router_z_loss_mlp": 0.32836914, + "step": 3944, + "time_per_iteration": 2.690519332885742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073349, + "balance_loss_mlp": 1.04116213, + "epoch": 0.7589457483647557, + "flos": 606131126784.0, + "grad_norm": 0.06183156174415775, + "language_loss": 0.85775477, + "learning_rate": 0.00014479827329585176, + "loss": 0.86848831, + "num_input_tokens_seen": 327119952, + "router_z_loss_mlp": 0.32177734, + "step": 3945, + "time_per_iteration": 2.7058537006378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066895, + "balance_loss_mlp": 1.03449392, + "epoch": 0.7591381300500193, + "flos": 554821125120.0, + "grad_norm": 0.04920928189565755, + "language_loss": 0.84866571, + "learning_rate": 0.00014457908044449846, + "loss": 0.85933459, + "num_input_tokens_seen": 327192640, + "router_z_loss_mlp": 0.32397461, + "step": 3946, + "time_per_iteration": 2.785212516784668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071888, + "balance_loss_mlp": 1.03963017, + "epoch": 0.7593305117352828, + "flos": 529399660032.0, + "grad_norm": 0.05182175118482316, + "language_loss": 0.82816386, + "learning_rate": 0.00014436002557978371, + "loss": 0.8388828, + "num_input_tokens_seen": 327271008, + "router_z_loss_mlp": 0.32250977, + "step": 3947, + "time_per_iteration": 2.784555196762085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01029365, + "balance_loss_mlp": 1.02059126, + "epoch": 0.7595228934205464, + "flos": 1502020412928.0, + "grad_norm": 0.01048294354444643, + "language_loss": 0.76643145, + "learning_rate": 0.00014414110878675201, + "loss": 0.77672517, + "num_input_tokens_seen": 327505392, + "router_z_loss_mlp": 0.08789062, + "step": 3948, + "time_per_iteration": 4.8788769245147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072982, + "balance_loss_mlp": 1.0407002, + "epoch": 0.7597152751058099, + "flos": 455290149888.0, + "grad_norm": 0.0492093123378979, + "language_loss": 0.79732686, + "learning_rate": 0.0001439223301503945, + "loss": 0.80805671, + "num_input_tokens_seen": 327569392, + "router_z_loss_mlp": 0.32275391, + "step": 3949, + "time_per_iteration": 2.548963785171509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071385, + "balance_loss_mlp": 1.0404619, + "epoch": 0.7599076567910735, + "flos": 685135190016.0, + "grad_norm": 0.05900471318664728, + "language_loss": 0.76152921, + "learning_rate": 0.00014370368975564834, + "loss": 0.77224308, + "num_input_tokens_seen": 327648304, + "router_z_loss_mlp": 0.30883789, + "step": 3950, + "time_per_iteration": 2.913701295852661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107191, + "balance_loss_mlp": 1.03915179, + "epoch": 0.760100038476337, + "flos": 532092837888.0, + "grad_norm": 0.059009621355687734, + "language_loss": 0.83279252, + "learning_rate": 0.00014348518768739766, + "loss": 0.84351158, + "num_input_tokens_seen": 327725600, + "router_z_loss_mlp": 0.32763672, + "step": 3951, + "time_per_iteration": 2.7261831760406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01022819, + "balance_loss_mlp": 1.01409268, + "epoch": 0.7602924201616006, + "flos": 1470952134144.0, + "grad_norm": 0.0078103610005334605, + "language_loss": 0.7672804, + "learning_rate": 0.00014326682403047243, + "loss": 0.77750862, + "num_input_tokens_seen": 327954048, + "router_z_loss_mlp": 0.08740234, + "step": 3952, + "time_per_iteration": 4.8437769412994385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072298, + "balance_loss_mlp": 1.04094601, + "epoch": 0.7604848018468642, + "flos": 774280539648.0, + "grad_norm": 0.04997444218606865, + "language_loss": 0.86468828, + "learning_rate": 0.00014304859886964867, + "loss": 0.87541121, + "num_input_tokens_seen": 328034656, + "router_z_loss_mlp": 0.31323242, + "step": 3953, + "time_per_iteration": 3.0284688472747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074246, + "balance_loss_mlp": 1.04148698, + "epoch": 0.7606771835321278, + "flos": 557917355520.0, + "grad_norm": 0.06472890254950428, + "language_loss": 0.83519757, + "learning_rate": 0.00014283051228964878, + "loss": 0.84594011, + "num_input_tokens_seen": 328107264, + "router_z_loss_mlp": 0.32763672, + "step": 3954, + "time_per_iteration": 2.783090591430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067108, + "balance_loss_mlp": 1.03527939, + "epoch": 0.7608695652173914, + "flos": 525139504128.0, + "grad_norm": 0.05417243250507387, + "language_loss": 0.82754749, + "learning_rate": 0.00014261256437514197, + "loss": 0.83821857, + "num_input_tokens_seen": 328177168, + "router_z_loss_mlp": 0.31811523, + "step": 3955, + "time_per_iteration": 2.644597291946411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067658, + "balance_loss_mlp": 1.03497052, + "epoch": 0.7610619469026548, + "flos": 614757717504.0, + "grad_norm": 0.055555468337999576, + "language_loss": 0.82313621, + "learning_rate": 0.0001423947552107428, + "loss": 0.83381271, + "num_input_tokens_seen": 328245360, + "router_z_loss_mlp": 0.3269043, + "step": 3956, + "time_per_iteration": 2.7361013889312744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069431, + "balance_loss_mlp": 1.03648186, + "epoch": 0.7612543285879184, + "flos": 862992313344.0, + "grad_norm": 0.0569357592258459, + "language_loss": 0.77433807, + "learning_rate": 0.00014217708488101243, + "loss": 0.78503239, + "num_input_tokens_seen": 328326560, + "router_z_loss_mlp": 0.32958984, + "step": 3957, + "time_per_iteration": 3.050961494445801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074949, + "balance_loss_mlp": 1.04271495, + "epoch": 0.761446710273182, + "flos": 553392359424.0, + "grad_norm": 0.06767693941608623, + "language_loss": 0.77007008, + "learning_rate": 0.0001419595534704579, + "loss": 0.78081954, + "num_input_tokens_seen": 328395760, + "router_z_loss_mlp": 0.32226562, + "step": 3958, + "time_per_iteration": 2.660353899002075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062214, + "balance_loss_mlp": 1.03105259, + "epoch": 0.7616390919584456, + "flos": 467107513344.0, + "grad_norm": 0.049028323039667754, + "language_loss": 0.80953354, + "learning_rate": 0.00014174216106353237, + "loss": 0.82015562, + "num_input_tokens_seen": 328464560, + "router_z_loss_mlp": 0.3112793, + "step": 3959, + "time_per_iteration": 2.5838327407836914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068236, + "balance_loss_mlp": 1.03542924, + "epoch": 0.7618314736437091, + "flos": 498181584384.0, + "grad_norm": 0.05923666711399137, + "language_loss": 0.75957918, + "learning_rate": 0.00014152490774463512, + "loss": 0.77026153, + "num_input_tokens_seen": 328532640, + "router_z_loss_mlp": 0.328125, + "step": 3960, + "time_per_iteration": 2.629302978515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068987, + "balance_loss_mlp": 1.03639507, + "epoch": 0.7620238553289727, + "flos": 434319487488.0, + "grad_norm": 0.07059591088547341, + "language_loss": 0.8700611, + "learning_rate": 0.00014130779359811135, + "loss": 0.88075095, + "num_input_tokens_seen": 328595392, + "router_z_loss_mlp": 0.32592773, + "step": 3961, + "time_per_iteration": 2.485924243927002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067259, + "balance_loss_mlp": 1.03528666, + "epoch": 0.7622162370142362, + "flos": 663962296320.0, + "grad_norm": 0.05047068415952909, + "language_loss": 0.85704315, + "learning_rate": 0.0001410908187082521, + "loss": 0.86771578, + "num_input_tokens_seen": 328676368, + "router_z_loss_mlp": 0.31958008, + "step": 3962, + "time_per_iteration": 2.8265414237976074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065802, + "balance_loss_mlp": 1.03313875, + "epoch": 0.7624086186994998, + "flos": 557700567552.0, + "grad_norm": 0.05430861505422096, + "language_loss": 0.82810938, + "learning_rate": 0.0001408739831592949, + "loss": 0.83876741, + "num_input_tokens_seen": 328745136, + "router_z_loss_mlp": 0.32666016, + "step": 3963, + "time_per_iteration": 2.661726236343384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067961, + "balance_loss_mlp": 1.03529739, + "epoch": 0.7626010003847634, + "flos": 628844857344.0, + "grad_norm": 0.06042473159086171, + "language_loss": 0.77454793, + "learning_rate": 0.0001406572870354224, + "loss": 0.78522754, + "num_input_tokens_seen": 328820384, + "router_z_loss_mlp": 0.32666016, + "step": 3964, + "time_per_iteration": 2.7862119674682617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068961, + "balance_loss_mlp": 1.03706062, + "epoch": 0.7627933820700269, + "flos": 437716873728.0, + "grad_norm": 0.04673534263309711, + "language_loss": 0.86767244, + "learning_rate": 0.00014044073042076337, + "loss": 0.87836206, + "num_input_tokens_seen": 328884976, + "router_z_loss_mlp": 0.31884766, + "step": 3965, + "time_per_iteration": 2.4798128604888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069459, + "balance_loss_mlp": 1.03765345, + "epoch": 0.7629857637552905, + "flos": 532456602624.0, + "grad_norm": 0.04658863025626681, + "language_loss": 0.88987994, + "learning_rate": 0.00014022431339939302, + "loss": 0.90057456, + "num_input_tokens_seen": 328957792, + "router_z_loss_mlp": 0.31787109, + "step": 3966, + "time_per_iteration": 2.636894702911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067525, + "balance_loss_mlp": 1.03479052, + "epoch": 0.7631781454405541, + "flos": 679737249792.0, + "grad_norm": 0.08316975322842361, + "language_loss": 0.77961999, + "learning_rate": 0.00014000803605533163, + "loss": 0.79029524, + "num_input_tokens_seen": 329034960, + "router_z_loss_mlp": 0.32739258, + "step": 3967, + "time_per_iteration": 2.8040103912353516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065892, + "balance_loss_mlp": 1.03344274, + "epoch": 0.7633705271258177, + "flos": 507246133248.0, + "grad_norm": 0.05895392031680787, + "language_loss": 0.83634377, + "learning_rate": 0.00013979189847254553, + "loss": 0.84700263, + "num_input_tokens_seen": 329100848, + "router_z_loss_mlp": 0.32446289, + "step": 3968, + "time_per_iteration": 2.5431933403015137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067532, + "balance_loss_mlp": 1.03501129, + "epoch": 0.7635629088110811, + "flos": 618574123008.0, + "grad_norm": 0.055607531947043785, + "language_loss": 0.80514443, + "learning_rate": 0.00013957590073494674, + "loss": 0.81581974, + "num_input_tokens_seen": 329181120, + "router_z_loss_mlp": 0.32519531, + "step": 3969, + "time_per_iteration": 2.8017959594726562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064277, + "balance_loss_mlp": 1.03232884, + "epoch": 0.7637552904963447, + "flos": 638140750848.0, + "grad_norm": 0.26403384502939975, + "language_loss": 0.78649521, + "learning_rate": 0.0001393600429263931, + "loss": 0.79713798, + "num_input_tokens_seen": 329249888, + "router_z_loss_mlp": 0.31933594, + "step": 3970, + "time_per_iteration": 4.2505412101745605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0100666, + "balance_loss_mlp": 0.99793345, + "epoch": 0.7639476721816083, + "flos": 1562359905792.0, + "grad_norm": 0.004510519200430985, + "language_loss": 0.74744886, + "learning_rate": 0.00013914432513068792, + "loss": 0.75751543, + "num_input_tokens_seen": 329483824, + "router_z_loss_mlp": 0.08740234, + "step": 3971, + "time_per_iteration": 4.917391777038574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063864, + "balance_loss_mlp": 1.03112936, + "epoch": 0.7641400538668719, + "flos": 495729925632.0, + "grad_norm": 0.05348736526149064, + "language_loss": 0.81438577, + "learning_rate": 0.0001389287474315804, + "loss": 0.82502437, + "num_input_tokens_seen": 329553536, + "router_z_loss_mlp": 0.32739258, + "step": 3972, + "time_per_iteration": 2.611975908279419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063623, + "balance_loss_mlp": 1.03153205, + "epoch": 0.7643324355521355, + "flos": 578173635072.0, + "grad_norm": 0.05070273758495156, + "language_loss": 0.7976076, + "learning_rate": 0.00013871330991276505, + "loss": 0.80824381, + "num_input_tokens_seen": 329621856, + "router_z_loss_mlp": 0.32080078, + "step": 3973, + "time_per_iteration": 2.6702983379364014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106582, + "balance_loss_mlp": 1.03318095, + "epoch": 0.764524817237399, + "flos": 784472698368.0, + "grad_norm": 0.053475096213737486, + "language_loss": 0.80356216, + "learning_rate": 0.00013849801265788247, + "loss": 0.81422037, + "num_input_tokens_seen": 329708192, + "router_z_loss_mlp": 0.32641602, + "step": 3974, + "time_per_iteration": 3.00087571144104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066526, + "balance_loss_mlp": 1.03357661, + "epoch": 0.7647171989226625, + "flos": 526025594880.0, + "grad_norm": 0.054787050143816365, + "language_loss": 0.82488281, + "learning_rate": 0.00013828285575051818, + "loss": 0.83554804, + "num_input_tokens_seen": 329774704, + "router_z_loss_mlp": 0.32958984, + "step": 3975, + "time_per_iteration": 2.6055147647857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061125, + "balance_loss_mlp": 1.0279367, + "epoch": 0.7649095806079261, + "flos": 554589780480.0, + "grad_norm": 0.05436611510263978, + "language_loss": 0.84129888, + "learning_rate": 0.0001380678392742035, + "loss": 0.85191011, + "num_input_tokens_seen": 329846432, + "router_z_loss_mlp": 0.33203125, + "step": 3976, + "time_per_iteration": 2.6914188861846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106537, + "balance_loss_mlp": 1.03232551, + "epoch": 0.7651019622931897, + "flos": 648836296704.0, + "grad_norm": 0.051149264081770666, + "language_loss": 0.84838861, + "learning_rate": 0.00013785296331241526, + "loss": 0.85904235, + "num_input_tokens_seen": 329926336, + "router_z_loss_mlp": 0.33056641, + "step": 3977, + "time_per_iteration": 2.866154670715332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064487, + "balance_loss_mlp": 1.03089428, + "epoch": 0.7652943439784533, + "flos": 1046034971136.0, + "grad_norm": 0.05614197674370758, + "language_loss": 0.87043619, + "learning_rate": 0.00013763822794857583, + "loss": 0.8810811, + "num_input_tokens_seen": 330009536, + "router_z_loss_mlp": 0.33618164, + "step": 3978, + "time_per_iteration": 3.309242010116577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062044, + "balance_loss_mlp": 1.02947557, + "epoch": 0.7654867256637168, + "flos": 504085883904.0, + "grad_norm": 0.05878573704619195, + "language_loss": 0.89744586, + "learning_rate": 0.00013742363326605278, + "loss": 0.90806627, + "num_input_tokens_seen": 330083264, + "router_z_loss_mlp": 0.32568359, + "step": 3979, + "time_per_iteration": 2.687633991241455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060607, + "balance_loss_mlp": 1.02789593, + "epoch": 0.7656791073489804, + "flos": 574422658560.0, + "grad_norm": 0.055229141283006315, + "language_loss": 0.78390539, + "learning_rate": 0.00013720917934815935, + "loss": 0.79451144, + "num_input_tokens_seen": 330157120, + "router_z_loss_mlp": 0.32714844, + "step": 3980, + "time_per_iteration": 2.7192299365997314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106176, + "balance_loss_mlp": 1.02876329, + "epoch": 0.765871489034244, + "flos": 492568266240.0, + "grad_norm": 0.11784191582460708, + "language_loss": 0.82716662, + "learning_rate": 0.00013699486627815344, + "loss": 0.83778423, + "num_input_tokens_seen": 330224560, + "router_z_loss_mlp": 0.33007812, + "step": 3981, + "time_per_iteration": 2.5523879528045654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066983, + "balance_loss_mlp": 1.03386712, + "epoch": 0.7660638707195075, + "flos": 485769111552.0, + "grad_norm": 0.048709081947545384, + "language_loss": 0.82393169, + "learning_rate": 0.00013678069413923928, + "loss": 0.83460152, + "num_input_tokens_seen": 330292000, + "router_z_loss_mlp": 0.33129883, + "step": 3982, + "time_per_iteration": 2.5948498249053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062511, + "balance_loss_mlp": 1.03034854, + "epoch": 0.766256252404771, + "flos": 444059131392.0, + "grad_norm": 0.05195057178385164, + "language_loss": 0.81826979, + "learning_rate": 0.00013656666301456555, + "loss": 0.82889485, + "num_input_tokens_seen": 330357472, + "router_z_loss_mlp": 0.3215332, + "step": 3983, + "time_per_iteration": 2.5596601963043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063593, + "balance_loss_mlp": 1.02980876, + "epoch": 0.7664486340900346, + "flos": 484922308608.0, + "grad_norm": 0.08343651185872063, + "language_loss": 0.84138393, + "learning_rate": 0.0001363527729872267, + "loss": 0.85201979, + "num_input_tokens_seen": 330427792, + "router_z_loss_mlp": 0.33813477, + "step": 3984, + "time_per_iteration": 2.6182045936584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065831, + "balance_loss_mlp": 1.03354931, + "epoch": 0.7666410157752982, + "flos": 645905981952.0, + "grad_norm": 0.1262618740109736, + "language_loss": 0.76256335, + "learning_rate": 0.00013613902414026207, + "loss": 0.77322161, + "num_input_tokens_seen": 330500320, + "router_z_loss_mlp": 0.32275391, + "step": 3985, + "time_per_iteration": 2.7776031494140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106424, + "balance_loss_mlp": 1.03079021, + "epoch": 0.7668333974605618, + "flos": 773964827136.0, + "grad_norm": 0.050561982196081254, + "language_loss": 0.8239125, + "learning_rate": 0.00013592541655665642, + "loss": 0.83455491, + "num_input_tokens_seen": 330581696, + "router_z_loss_mlp": 0.3347168, + "step": 3986, + "time_per_iteration": 2.952242374420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064817, + "balance_loss_mlp": 1.03136706, + "epoch": 0.7670257791458254, + "flos": 613200913920.0, + "grad_norm": 0.052879642645961566, + "language_loss": 0.85094202, + "learning_rate": 0.00013571195031933947, + "loss": 0.86159021, + "num_input_tokens_seen": 330648000, + "router_z_loss_mlp": 0.33447266, + "step": 3987, + "time_per_iteration": 2.7266581058502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01005099, + "balance_loss_mlp": 0.9958964, + "epoch": 0.7672181608310888, + "flos": 1484608670208.0, + "grad_norm": 0.011043844961489012, + "language_loss": 0.80481339, + "learning_rate": 0.00013549862551118626, + "loss": 0.8148644, + "num_input_tokens_seen": 330873872, + "router_z_loss_mlp": 0.09179688, + "step": 3988, + "time_per_iteration": 4.669104814529419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063461, + "balance_loss_mlp": 1.03079784, + "epoch": 0.7674105425163524, + "flos": 610449509376.0, + "grad_norm": 0.05355294055383006, + "language_loss": 0.85597003, + "learning_rate": 0.00013528544221501655, + "loss": 0.86660457, + "num_input_tokens_seen": 330945760, + "router_z_loss_mlp": 0.32666016, + "step": 3989, + "time_per_iteration": 2.7729666233062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063434, + "balance_loss_mlp": 1.02960289, + "epoch": 0.767602924201616, + "flos": 844857423360.0, + "grad_norm": 0.05868617722535175, + "language_loss": 0.81521833, + "learning_rate": 0.00013507240051359586, + "loss": 0.82585269, + "num_input_tokens_seen": 331025584, + "router_z_loss_mlp": 0.33837891, + "step": 3990, + "time_per_iteration": 3.0997486114501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065906, + "balance_loss_mlp": 1.0340054, + "epoch": 0.7677953058868796, + "flos": 526857841152.0, + "grad_norm": 0.07003191043706981, + "language_loss": 0.8601203, + "learning_rate": 0.00013485950048963425, + "loss": 0.8707794, + "num_input_tokens_seen": 331093008, + "router_z_loss_mlp": 0.31884766, + "step": 3991, + "time_per_iteration": 2.5849506855010986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063228, + "balance_loss_mlp": 1.03039789, + "epoch": 0.7679876875721431, + "flos": 923161660416.0, + "grad_norm": 0.07243254290057845, + "language_loss": 0.82772785, + "learning_rate": 0.00013464674222578643, + "loss": 0.83836013, + "num_input_tokens_seen": 331177120, + "router_z_loss_mlp": 0.32836914, + "step": 3992, + "time_per_iteration": 3.2332818508148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106772, + "balance_loss_mlp": 1.03410292, + "epoch": 0.7681800692574067, + "flos": 457855289856.0, + "grad_norm": 0.05271812769462788, + "language_loss": 0.83249938, + "learning_rate": 0.00013443412580465292, + "loss": 0.8431766, + "num_input_tokens_seen": 331245424, + "router_z_loss_mlp": 0.33642578, + "step": 3993, + "time_per_iteration": 2.5794618129730225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062971, + "balance_loss_mlp": 1.03033197, + "epoch": 0.7683724509426703, + "flos": 658113251328.0, + "grad_norm": 0.050288127283744266, + "language_loss": 0.83906549, + "learning_rate": 0.00013422165130877857, + "loss": 0.84969521, + "num_input_tokens_seen": 331327504, + "router_z_loss_mlp": 0.32641602, + "step": 3994, + "time_per_iteration": 2.8854472637176514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060909, + "balance_loss_mlp": 1.028723, + "epoch": 0.7685648326279338, + "flos": 555021946368.0, + "grad_norm": 0.05841740887579896, + "language_loss": 0.80092537, + "learning_rate": 0.00013400931882065327, + "loss": 0.81153446, + "num_input_tokens_seen": 331398464, + "router_z_loss_mlp": 0.32177734, + "step": 3995, + "time_per_iteration": 2.6247458457946777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066631, + "balance_loss_mlp": 1.03337145, + "epoch": 0.7687572143131974, + "flos": 687070315008.0, + "grad_norm": 0.0471892049079333, + "language_loss": 0.8085227, + "learning_rate": 0.0001337971284227118, + "loss": 0.81918901, + "num_input_tokens_seen": 331484592, + "router_z_loss_mlp": 0.33276367, + "step": 3996, + "time_per_iteration": 3.0075807571411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003827, + "balance_loss_mlp": 0.99471956, + "epoch": 0.7689495959984609, + "flos": 1488653448192.0, + "grad_norm": 0.013752910811902266, + "language_loss": 0.76118422, + "learning_rate": 0.00013358508019733388, + "loss": 0.77122247, + "num_input_tokens_seen": 331721360, + "router_z_loss_mlp": 0.09130859, + "step": 3997, + "time_per_iteration": 4.915713787078857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060268, + "balance_loss_mlp": 1.02698493, + "epoch": 0.7691419776837245, + "flos": 570133389312.0, + "grad_norm": 0.05931733235007729, + "language_loss": 0.79872787, + "learning_rate": 0.0001333731742268438, + "loss": 0.80933058, + "num_input_tokens_seen": 331794240, + "router_z_loss_mlp": 0.33276367, + "step": 3998, + "time_per_iteration": 2.7005136013031006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063521, + "balance_loss_mlp": 1.03033328, + "epoch": 0.7693343593689881, + "flos": 519812785152.0, + "grad_norm": 0.05123464057208785, + "language_loss": 0.8547945, + "learning_rate": 0.0001331614105935109, + "loss": 0.8654297, + "num_input_tokens_seen": 331866496, + "router_z_loss_mlp": 0.33203125, + "step": 3999, + "time_per_iteration": 2.6618032455444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062851, + "balance_loss_mlp": 1.0290674, + "epoch": 0.7695267410542517, + "flos": 660086254080.0, + "grad_norm": 0.04349114240195965, + "language_loss": 0.84291816, + "learning_rate": 0.00013294978937954883, + "loss": 0.85354662, + "num_input_tokens_seen": 331936592, + "router_z_loss_mlp": 0.33813477, + "step": 4000, + "time_per_iteration": 2.787548780441284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106608, + "balance_loss_mlp": 1.03336918, + "epoch": 0.7697191227395151, + "flos": 546548124672.0, + "grad_norm": 0.06371806812200402, + "language_loss": 0.85203207, + "learning_rate": 0.00013273831066711655, + "loss": 0.86269283, + "num_input_tokens_seen": 332003536, + "router_z_loss_mlp": 0.32714844, + "step": 4001, + "time_per_iteration": 2.603930950164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066325, + "balance_loss_mlp": 1.03387642, + "epoch": 0.7699115044247787, + "flos": 540339697152.0, + "grad_norm": 0.04713288479352539, + "language_loss": 0.80269563, + "learning_rate": 0.00013252697453831747, + "loss": 0.8133589, + "num_input_tokens_seen": 332075248, + "router_z_loss_mlp": 0.32446289, + "step": 4002, + "time_per_iteration": 2.681474447250366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065185, + "balance_loss_mlp": 1.03230727, + "epoch": 0.7701038861100423, + "flos": 562635818496.0, + "grad_norm": 0.05017266789361132, + "language_loss": 0.82595527, + "learning_rate": 0.00013231578107519916, + "loss": 0.8366071, + "num_input_tokens_seen": 332158944, + "router_z_loss_mlp": 0.32885742, + "step": 4003, + "time_per_iteration": 2.910759210586548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106487, + "balance_loss_mlp": 1.03289843, + "epoch": 0.7702962677953059, + "flos": 481490016768.0, + "grad_norm": 0.05443168691462721, + "language_loss": 0.82779682, + "learning_rate": 0.00013210473035975422, + "loss": 0.83844554, + "num_input_tokens_seen": 332226368, + "router_z_loss_mlp": 0.31958008, + "step": 4004, + "time_per_iteration": 2.574204444885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106935, + "balance_loss_mlp": 1.03656733, + "epoch": 0.7704886494805695, + "flos": 770036350464.0, + "grad_norm": 0.05675172766442488, + "language_loss": 0.85354382, + "learning_rate": 0.0001318938224739201, + "loss": 0.86423731, + "num_input_tokens_seen": 332314784, + "router_z_loss_mlp": 0.32788086, + "step": 4005, + "time_per_iteration": 3.032860279083252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067294, + "balance_loss_mlp": 1.03417802, + "epoch": 0.770681031165833, + "flos": 600912096768.0, + "grad_norm": 0.04532626069780256, + "language_loss": 0.83667225, + "learning_rate": 0.00013168305749957843, + "loss": 0.84734517, + "num_input_tokens_seen": 332387952, + "router_z_loss_mlp": 0.33129883, + "step": 4006, + "time_per_iteration": 2.7624073028564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066913, + "balance_loss_mlp": 1.03379726, + "epoch": 0.7708734128510966, + "flos": 495862345728.0, + "grad_norm": 0.05222212765251844, + "language_loss": 0.82636768, + "learning_rate": 0.00013147243551855532, + "loss": 0.83703679, + "num_input_tokens_seen": 332456352, + "router_z_loss_mlp": 0.33129883, + "step": 4007, + "time_per_iteration": 2.5816714763641357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063599, + "balance_loss_mlp": 1.03115058, + "epoch": 0.7710657945363601, + "flos": 567012427776.0, + "grad_norm": 0.057481422314481036, + "language_loss": 0.80578291, + "learning_rate": 0.00013126195661262148, + "loss": 0.81641883, + "num_input_tokens_seen": 332534288, + "router_z_loss_mlp": 0.32446289, + "step": 4008, + "time_per_iteration": 2.7452778816223145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064326, + "balance_loss_mlp": 1.03190088, + "epoch": 0.7712581762216237, + "flos": 604251256320.0, + "grad_norm": 0.05872708876253251, + "language_loss": 0.86326575, + "learning_rate": 0.00013105162086349216, + "loss": 0.873909, + "num_input_tokens_seen": 332615440, + "router_z_loss_mlp": 0.32421875, + "step": 4009, + "time_per_iteration": 2.8586156368255615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066916, + "balance_loss_mlp": 1.03530204, + "epoch": 0.7714505579068872, + "flos": 530620402176.0, + "grad_norm": 0.047861775046014535, + "language_loss": 0.86009622, + "learning_rate": 0.00013084142835282687, + "loss": 0.87076533, + "num_input_tokens_seen": 332687360, + "router_z_loss_mlp": 0.31591797, + "step": 4010, + "time_per_iteration": 2.704119920730591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01005244, + "balance_loss_mlp": 0.99647039, + "epoch": 0.7716429395921508, + "flos": 1421414313984.0, + "grad_norm": 0.012063998338178145, + "language_loss": 0.79884362, + "learning_rate": 0.00013063137916222956, + "loss": 0.80889606, + "num_input_tokens_seen": 332919936, + "router_z_loss_mlp": 0.08789062, + "step": 4011, + "time_per_iteration": 4.7817652225494385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065556, + "balance_loss_mlp": 1.03301144, + "epoch": 0.7718353212774144, + "flos": 578140139520.0, + "grad_norm": 0.051053206649878655, + "language_loss": 0.89366746, + "learning_rate": 0.0001304214733732485, + "loss": 0.90432304, + "num_input_tokens_seen": 332990096, + "router_z_loss_mlp": 0.32543945, + "step": 4012, + "time_per_iteration": 2.7189698219299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067191, + "balance_loss_mlp": 1.0337882, + "epoch": 0.772027702962678, + "flos": 510486368256.0, + "grad_norm": 0.053964234671719305, + "language_loss": 0.82622194, + "learning_rate": 0.00013021171106737672, + "loss": 0.8368938, + "num_input_tokens_seen": 333063616, + "router_z_loss_mlp": 0.33422852, + "step": 4013, + "time_per_iteration": 2.695345401763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062166, + "balance_loss_mlp": 1.03031349, + "epoch": 0.7722200846479416, + "flos": 525391197696.0, + "grad_norm": 0.05051004242016687, + "language_loss": 0.79927659, + "learning_rate": 0.00013000209232605071, + "loss": 0.80989826, + "num_input_tokens_seen": 333136368, + "router_z_loss_mlp": 0.31835938, + "step": 4014, + "time_per_iteration": 2.6742262840270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062585, + "balance_loss_mlp": 1.03049421, + "epoch": 0.772412466333205, + "flos": 479348278272.0, + "grad_norm": 0.06883144067650042, + "language_loss": 0.79881573, + "learning_rate": 0.0001297926172306519, + "loss": 0.80944163, + "num_input_tokens_seen": 333207136, + "router_z_loss_mlp": 0.32080078, + "step": 4015, + "time_per_iteration": 2.5998587608337402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106658, + "balance_loss_mlp": 1.03420317, + "epoch": 0.7726048480184686, + "flos": 905284256256.0, + "grad_norm": 0.049021978478966305, + "language_loss": 0.7864179, + "learning_rate": 0.0001295832858625055, + "loss": 0.79708374, + "num_input_tokens_seen": 333291920, + "router_z_loss_mlp": 0.32373047, + "step": 4016, + "time_per_iteration": 3.241476535797119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064551, + "balance_loss_mlp": 1.03195906, + "epoch": 0.7727972297037322, + "flos": 631085520384.0, + "grad_norm": 0.050738578814051916, + "language_loss": 0.69703871, + "learning_rate": 0.00012937409830288154, + "loss": 0.70768428, + "num_input_tokens_seen": 333369824, + "router_z_loss_mlp": 0.32592773, + "step": 4017, + "time_per_iteration": 2.7928261756896973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060438, + "balance_loss_mlp": 1.02868032, + "epoch": 0.7729896113889958, + "flos": 414565185024.0, + "grad_norm": 0.11993476807725541, + "language_loss": 0.84959614, + "learning_rate": 0.00012916505463299362, + "loss": 0.86020052, + "num_input_tokens_seen": 333434192, + "router_z_loss_mlp": 0.31738281, + "step": 4018, + "time_per_iteration": 2.4724020957946777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061884, + "balance_loss_mlp": 1.03012657, + "epoch": 0.7731819930742593, + "flos": 668609538048.0, + "grad_norm": 0.07815379187745079, + "language_loss": 0.78152752, + "learning_rate": 0.00012895615493399972, + "loss": 0.79214638, + "num_input_tokens_seen": 333509696, + "router_z_loss_mlp": 0.31738281, + "step": 4019, + "time_per_iteration": 2.7819771766662598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105935, + "balance_loss_mlp": 1.02704406, + "epoch": 0.7733743747595229, + "flos": 489604455936.0, + "grad_norm": 0.06361322846277707, + "language_loss": 0.82174695, + "learning_rate": 0.00012874739928700192, + "loss": 0.83234048, + "num_input_tokens_seen": 333575184, + "router_z_loss_mlp": 0.32299805, + "step": 4020, + "time_per_iteration": 2.577558755874634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063745, + "balance_loss_mlp": 1.03046131, + "epoch": 0.7735667564447865, + "flos": 659294705664.0, + "grad_norm": 0.0626070053016161, + "language_loss": 0.79737717, + "learning_rate": 0.00012853878777304624, + "loss": 0.80801463, + "num_input_tokens_seen": 333651568, + "router_z_loss_mlp": 0.33300781, + "step": 4021, + "time_per_iteration": 2.868053674697876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064138, + "balance_loss_mlp": 1.03140283, + "epoch": 0.77375913813005, + "flos": 533106966528.0, + "grad_norm": 0.04737550155927703, + "language_loss": 0.84463626, + "learning_rate": 0.000128330320473123, + "loss": 0.85527766, + "num_input_tokens_seen": 333726400, + "router_z_loss_mlp": 0.32739258, + "step": 4022, + "time_per_iteration": 2.668313503265381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008425, + "balance_loss_mlp": 0.99988997, + "epoch": 0.7739515198153136, + "flos": 1519260447744.0, + "grad_norm": 0.005844569838786065, + "language_loss": 0.783319, + "learning_rate": 0.00012812199746816628, + "loss": 0.79340327, + "num_input_tokens_seen": 333960224, + "router_z_loss_mlp": 0.08544922, + "step": 4023, + "time_per_iteration": 4.965493202209473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063459, + "balance_loss_mlp": 1.03053296, + "epoch": 0.7741439015005771, + "flos": 639819800064.0, + "grad_norm": 0.08130494829641424, + "language_loss": 0.81473714, + "learning_rate": 0.0001279138188390543, + "loss": 0.82537174, + "num_input_tokens_seen": 334033904, + "router_z_loss_mlp": 0.3293457, + "step": 4024, + "time_per_iteration": 2.7925288677215576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063718, + "balance_loss_mlp": 1.03122211, + "epoch": 0.7743362831858407, + "flos": 665546803200.0, + "grad_norm": 0.05426924538048376, + "language_loss": 0.86122662, + "learning_rate": 0.00012770578466660915, + "loss": 0.87186384, + "num_input_tokens_seen": 334107904, + "router_z_loss_mlp": 0.32495117, + "step": 4025, + "time_per_iteration": 2.8743951320648193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067837, + "balance_loss_mlp": 1.0342437, + "epoch": 0.7745286648711043, + "flos": 562453936128.0, + "grad_norm": 0.050549186901469166, + "language_loss": 0.81480557, + "learning_rate": 0.0001274978950315968, + "loss": 0.82548392, + "num_input_tokens_seen": 334184048, + "router_z_loss_mlp": 0.33618164, + "step": 4026, + "time_per_iteration": 2.7961745262145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061898, + "balance_loss_mlp": 1.02923501, + "epoch": 0.7747210465563679, + "flos": 516651125760.0, + "grad_norm": 0.06240008099647138, + "language_loss": 0.82893825, + "learning_rate": 0.00012729015001472716, + "loss": 0.83955729, + "num_input_tokens_seen": 334257152, + "router_z_loss_mlp": 0.32666016, + "step": 4027, + "time_per_iteration": 2.63754940032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065965, + "balance_loss_mlp": 1.03227663, + "epoch": 0.7749134282416313, + "flos": 633921292800.0, + "grad_norm": 0.052874284120550924, + "language_loss": 0.81483364, + "learning_rate": 0.00012708254969665418, + "loss": 0.82549322, + "num_input_tokens_seen": 334331312, + "router_z_loss_mlp": 0.3371582, + "step": 4028, + "time_per_iteration": 2.7484118938446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064367, + "balance_loss_mlp": 1.03070259, + "epoch": 0.7751058099268949, + "flos": 495118849536.0, + "grad_norm": 0.06123905199819526, + "language_loss": 0.83476496, + "learning_rate": 0.00012687509415797526, + "loss": 0.84540868, + "num_input_tokens_seen": 334397344, + "router_z_loss_mlp": 0.33691406, + "step": 4029, + "time_per_iteration": 2.5675880908966064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063972, + "balance_loss_mlp": 1.03183281, + "epoch": 0.7752981916121585, + "flos": 510048410112.0, + "grad_norm": 0.09107931997699928, + "language_loss": 0.81183356, + "learning_rate": 0.00012666778347923208, + "loss": 0.82247323, + "num_input_tokens_seen": 334467872, + "router_z_loss_mlp": 0.32128906, + "step": 4030, + "time_per_iteration": 2.632314443588257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060699, + "balance_loss_mlp": 1.02813113, + "epoch": 0.7754905732974221, + "flos": 497295493632.0, + "grad_norm": 0.04486214088641844, + "language_loss": 0.83638769, + "learning_rate": 0.0001264606177409092, + "loss": 0.84699464, + "num_input_tokens_seen": 334539088, + "router_z_loss_mlp": 0.32568359, + "step": 4031, + "time_per_iteration": 2.6301512718200684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063527, + "balance_loss_mlp": 1.03081632, + "epoch": 0.7756829549826857, + "flos": 480486062592.0, + "grad_norm": 0.0481221818679906, + "language_loss": 0.86095941, + "learning_rate": 0.00012625359702343609, + "loss": 0.87159473, + "num_input_tokens_seen": 334612576, + "router_z_loss_mlp": 0.32714844, + "step": 4032, + "time_per_iteration": 2.708512544631958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063655, + "balance_loss_mlp": 1.03125429, + "epoch": 0.7758753366679492, + "flos": 552368056320.0, + "grad_norm": 0.0642979185043706, + "language_loss": 0.84532368, + "learning_rate": 0.00012604672140718504, + "loss": 0.85596019, + "num_input_tokens_seen": 334677824, + "router_z_loss_mlp": 0.32397461, + "step": 4033, + "time_per_iteration": 2.632307529449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062755, + "balance_loss_mlp": 1.03006816, + "epoch": 0.7760677183532128, + "flos": 703529127936.0, + "grad_norm": 0.05215032719242253, + "language_loss": 0.77701473, + "learning_rate": 0.00012583999097247233, + "loss": 0.78764236, + "num_input_tokens_seen": 334751456, + "router_z_loss_mlp": 0.3269043, + "step": 4034, + "time_per_iteration": 2.8174097537994385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064658, + "balance_loss_mlp": 1.03266239, + "epoch": 0.7762601000384763, + "flos": 523218935808.0, + "grad_norm": 0.06260246603028506, + "language_loss": 0.79696673, + "learning_rate": 0.0001256334057995578, + "loss": 0.80761331, + "num_input_tokens_seen": 334823008, + "router_z_loss_mlp": 0.31982422, + "step": 4035, + "time_per_iteration": 2.69726300239563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063141, + "balance_loss_mlp": 1.03159809, + "epoch": 0.7764524817237399, + "flos": 557262609408.0, + "grad_norm": 0.048886632926304276, + "language_loss": 0.84979451, + "learning_rate": 0.000125426965968645, + "loss": 0.86042595, + "num_input_tokens_seen": 334896048, + "router_z_loss_mlp": 0.31518555, + "step": 4036, + "time_per_iteration": 2.72336483001709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066748, + "balance_loss_mlp": 1.03508615, + "epoch": 0.7766448634090035, + "flos": 579454013952.0, + "grad_norm": 0.07567948550064775, + "language_loss": 0.81946111, + "learning_rate": 0.00012522067155988092, + "loss": 0.83012855, + "num_input_tokens_seen": 334964416, + "router_z_loss_mlp": 0.31640625, + "step": 4037, + "time_per_iteration": 2.6716489791870117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063745, + "balance_loss_mlp": 1.03153515, + "epoch": 0.776837245094267, + "flos": 635300596224.0, + "grad_norm": 0.05548749189645599, + "language_loss": 0.75042689, + "learning_rate": 0.00012501452265335617, + "loss": 0.76106441, + "num_input_tokens_seen": 335043360, + "router_z_loss_mlp": 0.32202148, + "step": 4038, + "time_per_iteration": 2.798152446746826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063102, + "balance_loss_mlp": 1.03115439, + "epoch": 0.7770296267795306, + "flos": 614398334976.0, + "grad_norm": 0.04733898192839437, + "language_loss": 0.83099091, + "learning_rate": 0.0001248085193291047, + "loss": 0.84162188, + "num_input_tokens_seen": 335113216, + "router_z_loss_mlp": 0.31933594, + "step": 4039, + "time_per_iteration": 2.713104009628296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064564, + "balance_loss_mlp": 1.03287828, + "epoch": 0.7772220084647942, + "flos": 878438407680.0, + "grad_norm": 0.06729067040173044, + "language_loss": 0.8247925, + "learning_rate": 0.00012460266166710443, + "loss": 0.83543813, + "num_input_tokens_seen": 335195824, + "router_z_loss_mlp": 0.31665039, + "step": 4040, + "time_per_iteration": 3.142155408859253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061758, + "balance_loss_mlp": 1.02988183, + "epoch": 0.7774143901500578, + "flos": 839293567488.0, + "grad_norm": 0.08233225163586903, + "language_loss": 0.77612185, + "learning_rate": 0.00012439694974727633, + "loss": 0.78673941, + "num_input_tokens_seen": 335269712, + "router_z_loss_mlp": 0.31860352, + "step": 4041, + "time_per_iteration": 2.9853243827819824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065759, + "balance_loss_mlp": 1.03338194, + "epoch": 0.7776067718353212, + "flos": 567878169600.0, + "grad_norm": 0.054149054361607385, + "language_loss": 0.79806697, + "learning_rate": 0.00012419138364948458, + "loss": 0.80872452, + "num_input_tokens_seen": 335343408, + "router_z_loss_mlp": 0.32373047, + "step": 4042, + "time_per_iteration": 2.7431745529174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064082, + "balance_loss_mlp": 1.03191924, + "epoch": 0.7777991535205848, + "flos": 745627603968.0, + "grad_norm": 0.05348286137005146, + "language_loss": 0.8234185, + "learning_rate": 0.00012398596345353702, + "loss": 0.83405924, + "num_input_tokens_seen": 335415360, + "router_z_loss_mlp": 0.3215332, + "step": 4043, + "time_per_iteration": 2.896669864654541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069609, + "balance_loss_mlp": 1.03785181, + "epoch": 0.7779915352058484, + "flos": 537799288320.0, + "grad_norm": 0.048601854183842386, + "language_loss": 0.83191538, + "learning_rate": 0.0001237806892391851, + "loss": 0.84261149, + "num_input_tokens_seen": 335491568, + "router_z_loss_mlp": 0.31738281, + "step": 4044, + "time_per_iteration": 2.6875576972961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067998, + "balance_loss_mlp": 1.03523958, + "epoch": 0.778183916891112, + "flos": 634497463296.0, + "grad_norm": 0.05218142456455376, + "language_loss": 0.807693, + "learning_rate": 0.0001235755610861233, + "loss": 0.81837296, + "num_input_tokens_seen": 335567200, + "router_z_loss_mlp": 0.32763672, + "step": 4045, + "time_per_iteration": 2.7440977096557617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063341, + "balance_loss_mlp": 1.03125, + "epoch": 0.7783762985763756, + "flos": 588400699392.0, + "grad_norm": 0.06119934823569683, + "language_loss": 0.85257781, + "learning_rate": 0.0001233705790739893, + "loss": 0.86321127, + "num_input_tokens_seen": 335640512, + "router_z_loss_mlp": 0.32080078, + "step": 4046, + "time_per_iteration": 2.771397829055786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066435, + "balance_loss_mlp": 1.03398585, + "epoch": 0.7785686802616391, + "flos": 930261970944.0, + "grad_norm": 0.05518335637199763, + "language_loss": 0.74865597, + "learning_rate": 0.0001231657432823643, + "loss": 0.75932032, + "num_input_tokens_seen": 335726016, + "router_z_loss_mlp": 0.32446289, + "step": 4047, + "time_per_iteration": 3.2299704551696777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068089, + "balance_loss_mlp": 1.03618836, + "epoch": 0.7787610619469026, + "flos": 497679607296.0, + "grad_norm": 0.061331476050258626, + "language_loss": 0.78644454, + "learning_rate": 0.0001229610537907725, + "loss": 0.7971254, + "num_input_tokens_seen": 335794864, + "router_z_loss_mlp": 0.31884766, + "step": 4048, + "time_per_iteration": 2.581489324569702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062949, + "balance_loss_mlp": 1.03040469, + "epoch": 0.7789534436321662, + "flos": 515385303552.0, + "grad_norm": 0.060582734060361326, + "language_loss": 0.90193808, + "learning_rate": 0.00012275651067868143, + "loss": 0.9125675, + "num_input_tokens_seen": 335860928, + "router_z_loss_mlp": 0.32543945, + "step": 4049, + "time_per_iteration": 2.5799412727355957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066, + "balance_loss_mlp": 1.03350401, + "epoch": 0.7791458253174298, + "flos": 988081555968.0, + "grad_norm": 0.06086378000483131, + "language_loss": 0.80482578, + "learning_rate": 0.00012255211402550182, + "loss": 0.81548578, + "num_input_tokens_seen": 335945728, + "router_z_loss_mlp": 0.32495117, + "step": 4050, + "time_per_iteration": 3.228003740310669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065165, + "balance_loss_mlp": 1.03283536, + "epoch": 0.7793382070026933, + "flos": 628756107264.0, + "grad_norm": 0.1203274251701162, + "language_loss": 0.76654673, + "learning_rate": 0.00012234786391058727, + "loss": 0.77719831, + "num_input_tokens_seen": 336014848, + "router_z_loss_mlp": 0.32324219, + "step": 4051, + "time_per_iteration": 2.7767224311828613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106691, + "balance_loss_mlp": 1.03405643, + "epoch": 0.7795305886879569, + "flos": 531500700672.0, + "grad_norm": 0.06608083549317771, + "language_loss": 0.85191727, + "learning_rate": 0.0001221437604132352, + "loss": 0.86258644, + "num_input_tokens_seen": 336080096, + "router_z_loss_mlp": 0.32861328, + "step": 4052, + "time_per_iteration": 2.6072323322296143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069028, + "balance_loss_mlp": 1.03703237, + "epoch": 0.7797229703732205, + "flos": 611690600448.0, + "grad_norm": 0.06701840569046753, + "language_loss": 0.80875957, + "learning_rate": 0.0001219398036126852, + "loss": 0.8194499, + "num_input_tokens_seen": 336154640, + "router_z_loss_mlp": 0.31982422, + "step": 4053, + "time_per_iteration": 2.789151668548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069667, + "balance_loss_mlp": 1.03738546, + "epoch": 0.7799153520584841, + "flos": 871758526464.0, + "grad_norm": 0.05089113411890528, + "language_loss": 0.78444964, + "learning_rate": 0.00012173599358812027, + "loss": 0.79514629, + "num_input_tokens_seen": 336244160, + "router_z_loss_mlp": 0.32275391, + "step": 4054, + "time_per_iteration": 3.282203197479248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065975, + "balance_loss_mlp": 1.03359818, + "epoch": 0.7801077337437476, + "flos": 583348995072.0, + "grad_norm": 0.06359619445711458, + "language_loss": 0.82295758, + "learning_rate": 0.0001215323304186668, + "loss": 0.83361733, + "num_input_tokens_seen": 336317936, + "router_z_loss_mlp": 0.32373047, + "step": 4055, + "time_per_iteration": 2.751826763153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062514, + "balance_loss_mlp": 1.03073275, + "epoch": 0.7803001154290111, + "flos": 600887365632.0, + "grad_norm": 0.04750930955711312, + "language_loss": 0.8780787, + "learning_rate": 0.00012132881418339364, + "loss": 0.88870382, + "num_input_tokens_seen": 336389504, + "router_z_loss_mlp": 0.31762695, + "step": 4056, + "time_per_iteration": 2.7023940086364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016356, + "balance_loss_mlp": 1.00820196, + "epoch": 0.7804924971142747, + "flos": 1478743506432.0, + "grad_norm": 0.010148524200822068, + "language_loss": 0.77517563, + "learning_rate": 0.00012112544496131306, + "loss": 0.78533918, + "num_input_tokens_seen": 336615536, + "router_z_loss_mlp": 0.08154297, + "step": 4057, + "time_per_iteration": 4.826777458190918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066059, + "balance_loss_mlp": 1.03430223, + "epoch": 0.7806848787995383, + "flos": 630075773952.0, + "grad_norm": 0.04851285793009641, + "language_loss": 0.76570946, + "learning_rate": 0.00012092222283137944, + "loss": 0.77637005, + "num_input_tokens_seen": 336686400, + "router_z_loss_mlp": 0.31738281, + "step": 4058, + "time_per_iteration": 2.7130894660949707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014581, + "balance_loss_mlp": 1.00647449, + "epoch": 0.7808772604848019, + "flos": 1416800567808.0, + "grad_norm": 0.006919063816033351, + "language_loss": 0.7890631, + "learning_rate": 0.00012071914787249111, + "loss": 0.79920888, + "num_input_tokens_seen": 336912704, + "router_z_loss_mlp": 0.08105469, + "step": 4059, + "time_per_iteration": 4.767851114273071 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068468, + "balance_loss_mlp": 1.03575706, + "epoch": 0.7810696421700654, + "flos": 731345435136.0, + "grad_norm": 0.0468820010320679, + "language_loss": 0.83492804, + "learning_rate": 0.00012051622016348856, + "loss": 0.8456127, + "num_input_tokens_seen": 336997040, + "router_z_loss_mlp": 0.32714844, + "step": 4060, + "time_per_iteration": 3.0499465465545654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065592, + "balance_loss_mlp": 1.0336442, + "epoch": 0.781262023855329, + "flos": 424718055936.0, + "grad_norm": 0.05864420891572784, + "language_loss": 0.8411994, + "learning_rate": 0.00012031343978315539, + "loss": 0.85185528, + "num_input_tokens_seen": 337059760, + "router_z_loss_mlp": 0.31933594, + "step": 4061, + "time_per_iteration": 2.448692560195923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063026, + "balance_loss_mlp": 1.0311023, + "epoch": 0.7814544055405925, + "flos": 500767073280.0, + "grad_norm": 0.10364470659774863, + "language_loss": 0.82632732, + "learning_rate": 0.00012011080681021774, + "loss": 0.83695757, + "num_input_tokens_seen": 337128528, + "router_z_loss_mlp": 0.3190918, + "step": 4062, + "time_per_iteration": 2.611121892929077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066061, + "balance_loss_mlp": 1.03373194, + "epoch": 0.7816467872258561, + "flos": 462212960256.0, + "grad_norm": 0.09614941126191437, + "language_loss": 0.86035311, + "learning_rate": 0.00011990832132334512, + "loss": 0.87101376, + "num_input_tokens_seen": 337194112, + "router_z_loss_mlp": 0.32324219, + "step": 4063, + "time_per_iteration": 2.5123276710510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066235, + "balance_loss_mlp": 1.03354836, + "epoch": 0.7818391689111197, + "flos": 740497324032.0, + "grad_norm": 0.05603872830064661, + "language_loss": 0.8259666, + "learning_rate": 0.00011970598340114897, + "loss": 0.83662897, + "num_input_tokens_seen": 337270416, + "router_z_loss_mlp": 0.3269043, + "step": 4064, + "time_per_iteration": 2.992100238800049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062887, + "balance_loss_mlp": 1.03101015, + "epoch": 0.7820315505963832, + "flos": 547386163200.0, + "grad_norm": 0.05629095926792252, + "language_loss": 0.8402884, + "learning_rate": 0.00011950379312218396, + "loss": 0.85091722, + "num_input_tokens_seen": 337343024, + "router_z_loss_mlp": 0.31860352, + "step": 4065, + "time_per_iteration": 2.7270681858062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061717, + "balance_loss_mlp": 1.02950692, + "epoch": 0.7822239322816468, + "flos": 728665403904.0, + "grad_norm": 0.045794357656988534, + "language_loss": 0.8601073, + "learning_rate": 0.00011930175056494719, + "loss": 0.87072444, + "num_input_tokens_seen": 337417232, + "router_z_loss_mlp": 0.32202148, + "step": 4066, + "time_per_iteration": 2.8730247020721436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066726, + "balance_loss_mlp": 1.03408647, + "epoch": 0.7824163139669104, + "flos": 451774900224.0, + "grad_norm": 0.04781338865883617, + "language_loss": 0.76222277, + "learning_rate": 0.00011909985580787885, + "loss": 0.77288997, + "num_input_tokens_seen": 337488224, + "router_z_loss_mlp": 0.32641602, + "step": 4067, + "time_per_iteration": 2.6421656608581543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063309, + "balance_loss_mlp": 1.03138483, + "epoch": 0.782608695652174, + "flos": 540207277056.0, + "grad_norm": 0.05261646090903281, + "language_loss": 0.81026649, + "learning_rate": 0.00011889810892936137, + "loss": 0.82089961, + "num_input_tokens_seen": 337564928, + "router_z_loss_mlp": 0.3190918, + "step": 4068, + "time_per_iteration": 2.70185923576355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071177, + "balance_loss_mlp": 1.03813219, + "epoch": 0.7828010773374374, + "flos": 500029369344.0, + "grad_norm": 0.05419048158551631, + "language_loss": 0.7722286, + "learning_rate": 0.00011869651000771959, + "loss": 0.78294039, + "num_input_tokens_seen": 337641632, + "router_z_loss_mlp": 0.33056641, + "step": 4069, + "time_per_iteration": 2.822190523147583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060879, + "balance_loss_mlp": 1.02890754, + "epoch": 0.782993459022701, + "flos": 600542539776.0, + "grad_norm": 0.05379601018960074, + "language_loss": 0.82404703, + "learning_rate": 0.00011849505912122117, + "loss": 0.83465582, + "num_input_tokens_seen": 337711968, + "router_z_loss_mlp": 0.31958008, + "step": 4070, + "time_per_iteration": 2.7197659015655518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061129, + "balance_loss_mlp": 1.02827537, + "epoch": 0.7831858407079646, + "flos": 809702106624.0, + "grad_norm": 0.06431726516643936, + "language_loss": 0.77697992, + "learning_rate": 0.00011829375634807654, + "loss": 0.78759122, + "num_input_tokens_seen": 337795792, + "router_z_loss_mlp": 0.32861328, + "step": 4071, + "time_per_iteration": 3.0201632976531982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060522, + "balance_loss_mlp": 1.027596, + "epoch": 0.7833782223932282, + "flos": 806240701440.0, + "grad_norm": 0.09019117286711203, + "language_loss": 0.80854774, + "learning_rate": 0.00011809260176643821, + "loss": 0.81915295, + "num_input_tokens_seen": 337875584, + "router_z_loss_mlp": 0.3293457, + "step": 4072, + "time_per_iteration": 3.059041738510132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062272, + "balance_loss_mlp": 1.0295614, + "epoch": 0.7835706040784918, + "flos": 520614508032.0, + "grad_norm": 0.05845304127163334, + "language_loss": 0.83590925, + "learning_rate": 0.00011789159545440131, + "loss": 0.84653199, + "num_input_tokens_seen": 337942304, + "router_z_loss_mlp": 0.32714844, + "step": 4073, + "time_per_iteration": 2.5912578105926514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064777, + "balance_loss_mlp": 1.03199446, + "epoch": 0.7837629857637552, + "flos": 505322592768.0, + "grad_norm": 0.0488968990026523, + "language_loss": 0.82248485, + "learning_rate": 0.00011769073749000348, + "loss": 0.83313262, + "num_input_tokens_seen": 338020864, + "router_z_loss_mlp": 0.32788086, + "step": 4074, + "time_per_iteration": 2.7853548526763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067546, + "balance_loss_mlp": 1.03533578, + "epoch": 0.7839553674490188, + "flos": 515872723968.0, + "grad_norm": 0.0606411027248537, + "language_loss": 0.75941336, + "learning_rate": 0.0001174900279512246, + "loss": 0.77008879, + "num_input_tokens_seen": 338089584, + "router_z_loss_mlp": 0.32202148, + "step": 4075, + "time_per_iteration": 2.5954041481018066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065014, + "balance_loss_mlp": 1.03266096, + "epoch": 0.7841477491342824, + "flos": 506399330304.0, + "grad_norm": 0.05056809711727469, + "language_loss": 0.81398273, + "learning_rate": 0.00011728946691598707, + "loss": 0.82463288, + "num_input_tokens_seen": 338159568, + "router_z_loss_mlp": 0.32348633, + "step": 4076, + "time_per_iteration": 2.618093252182007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059278, + "balance_loss_mlp": 1.02680504, + "epoch": 0.784340130819546, + "flos": 719320048128.0, + "grad_norm": 0.06832591600294699, + "language_loss": 0.76352495, + "learning_rate": 0.00011708905446215561, + "loss": 0.77411771, + "num_input_tokens_seen": 338233952, + "router_z_loss_mlp": 0.32470703, + "step": 4077, + "time_per_iteration": 2.8518495559692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064476, + "balance_loss_mlp": 1.03228974, + "epoch": 0.7845325125048095, + "flos": 514174735872.0, + "grad_norm": 0.05162512360480059, + "language_loss": 0.79919541, + "learning_rate": 0.00011688879066753711, + "loss": 0.8098402, + "num_input_tokens_seen": 338309568, + "router_z_loss_mlp": 0.32177734, + "step": 4078, + "time_per_iteration": 2.693814516067505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107088, + "balance_loss_mlp": 1.03919387, + "epoch": 0.7847248941900731, + "flos": 465866422272.0, + "grad_norm": 0.057791720647150095, + "language_loss": 0.87164676, + "learning_rate": 0.00011668867560988122, + "loss": 0.88235557, + "num_input_tokens_seen": 338375920, + "router_z_loss_mlp": 0.31665039, + "step": 4079, + "time_per_iteration": 2.544497489929199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106489, + "balance_loss_mlp": 1.03217876, + "epoch": 0.7849172758753367, + "flos": 502766217216.0, + "grad_norm": 0.06577906092431222, + "language_loss": 0.84248155, + "learning_rate": 0.00011648870936687916, + "loss": 0.85313052, + "num_input_tokens_seen": 338452208, + "router_z_loss_mlp": 0.32714844, + "step": 4080, + "time_per_iteration": 2.73219895362854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067029, + "balance_loss_mlp": 1.03465128, + "epoch": 0.7851096575606002, + "flos": 531742219776.0, + "grad_norm": 0.07071087412215145, + "language_loss": 0.77993482, + "learning_rate": 0.00011628889201616461, + "loss": 0.79060507, + "num_input_tokens_seen": 338522864, + "router_z_loss_mlp": 0.32373047, + "step": 4081, + "time_per_iteration": 2.6256251335144043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064024, + "balance_loss_mlp": 1.03145564, + "epoch": 0.7853020392458638, + "flos": 569685256704.0, + "grad_norm": 0.054581090755724565, + "language_loss": 0.81991017, + "learning_rate": 0.00011608922363531393, + "loss": 0.83055043, + "num_input_tokens_seen": 338591024, + "router_z_loss_mlp": 0.32568359, + "step": 4082, + "time_per_iteration": 2.68129825592041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066291, + "balance_loss_mlp": 1.03522539, + "epoch": 0.7854944209311273, + "flos": 832228162560.0, + "grad_norm": 0.0528540930480431, + "language_loss": 0.83166963, + "learning_rate": 0.00011588970430184504, + "loss": 0.84233254, + "num_input_tokens_seen": 338669616, + "router_z_loss_mlp": 0.31030273, + "step": 4083, + "time_per_iteration": 3.01277494430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068509, + "balance_loss_mlp": 1.03608418, + "epoch": 0.7856868026163909, + "flos": 559660423680.0, + "grad_norm": 0.04365607087588255, + "language_loss": 0.81863219, + "learning_rate": 0.00011569033409321822, + "loss": 0.82931721, + "num_input_tokens_seen": 338740416, + "router_z_loss_mlp": 0.32421875, + "step": 4084, + "time_per_iteration": 2.6665027141571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106523, + "balance_loss_mlp": 1.03290033, + "epoch": 0.7858791843016545, + "flos": 544972382208.0, + "grad_norm": 0.05673133805325975, + "language_loss": 0.72893167, + "learning_rate": 0.00011549111308683591, + "loss": 0.73958397, + "num_input_tokens_seen": 338807664, + "router_z_loss_mlp": 0.32324219, + "step": 4085, + "time_per_iteration": 2.652221918106079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062463, + "balance_loss_mlp": 1.03111076, + "epoch": 0.7860715659869181, + "flos": 380787761664.0, + "grad_norm": 0.058608703259898844, + "language_loss": 0.80785263, + "learning_rate": 0.00011529204136004251, + "loss": 0.81847727, + "num_input_tokens_seen": 338869472, + "router_z_loss_mlp": 0.31323242, + "step": 4086, + "time_per_iteration": 2.4127490520477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069783, + "balance_loss_mlp": 1.03762007, + "epoch": 0.7862639476721817, + "flos": 567173961216.0, + "grad_norm": 0.058008459467675216, + "language_loss": 0.84520507, + "learning_rate": 0.00011509311899012459, + "loss": 0.85590291, + "num_input_tokens_seen": 338941312, + "router_z_loss_mlp": 0.3215332, + "step": 4087, + "time_per_iteration": 2.6412453651428223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067278, + "balance_loss_mlp": 1.03544927, + "epoch": 0.7864563293574451, + "flos": 544968000000.0, + "grad_norm": 0.06454830776496215, + "language_loss": 0.78072417, + "learning_rate": 0.00011489434605431053, + "loss": 0.79139692, + "num_input_tokens_seen": 339010208, + "router_z_loss_mlp": 0.31811523, + "step": 4088, + "time_per_iteration": 2.637660026550293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106782, + "balance_loss_mlp": 1.03563344, + "epoch": 0.7866487110427087, + "flos": 563260041216.0, + "grad_norm": 0.058240331432363256, + "language_loss": 0.81125653, + "learning_rate": 0.0001146957226297708, + "loss": 0.82193476, + "num_input_tokens_seen": 339081232, + "router_z_loss_mlp": 0.32177734, + "step": 4089, + "time_per_iteration": 2.6684415340423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065799, + "balance_loss_mlp": 1.03323102, + "epoch": 0.7868410927279723, + "flos": 727849124352.0, + "grad_norm": 0.04414589533004489, + "language_loss": 0.76471299, + "learning_rate": 0.00011449724879361827, + "loss": 0.77537096, + "num_input_tokens_seen": 339161040, + "router_z_loss_mlp": 0.32568359, + "step": 4090, + "time_per_iteration": 2.951436758041382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106424, + "balance_loss_mlp": 1.03236377, + "epoch": 0.7870334744132359, + "flos": 521082989568.0, + "grad_norm": 0.060886300721865946, + "language_loss": 0.73346722, + "learning_rate": 0.00011429892462290687, + "loss": 0.74410957, + "num_input_tokens_seen": 339233984, + "router_z_loss_mlp": 0.31860352, + "step": 4091, + "time_per_iteration": 2.681136131286621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063333, + "balance_loss_mlp": 1.03143215, + "epoch": 0.7872258560984994, + "flos": 451173998592.0, + "grad_norm": 0.05425416710162835, + "language_loss": 0.83261812, + "learning_rate": 0.00011410075019463295, + "loss": 0.84325141, + "num_input_tokens_seen": 339303168, + "router_z_loss_mlp": 0.31884766, + "step": 4092, + "time_per_iteration": 2.596997022628784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106721, + "balance_loss_mlp": 1.03559613, + "epoch": 0.787418237783763, + "flos": 514932788736.0, + "grad_norm": 0.06041624723999286, + "language_loss": 0.80031419, + "learning_rate": 0.00011390272558573461, + "loss": 0.81098628, + "num_input_tokens_seen": 339374512, + "router_z_loss_mlp": 0.31591797, + "step": 4093, + "time_per_iteration": 2.724531412124634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066529, + "balance_loss_mlp": 1.03422308, + "epoch": 0.7876106194690266, + "flos": 484837940736.0, + "grad_norm": 0.057479971789758694, + "language_loss": 0.79717124, + "learning_rate": 0.00011370485087309202, + "loss": 0.80783653, + "num_input_tokens_seen": 339442720, + "router_z_loss_mlp": 0.32299805, + "step": 4094, + "time_per_iteration": 2.6680920124053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066683, + "balance_loss_mlp": 1.03401947, + "epoch": 0.7878030011542901, + "flos": 542570185728.0, + "grad_norm": 0.07064536799183499, + "language_loss": 0.79107904, + "learning_rate": 0.00011350712613352688, + "loss": 0.80174589, + "num_input_tokens_seen": 339508800, + "router_z_loss_mlp": 0.32666016, + "step": 4095, + "time_per_iteration": 2.6333553791046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066243, + "balance_loss_mlp": 1.03415227, + "epoch": 0.7879953828395537, + "flos": 516488182272.0, + "grad_norm": 0.06900072412934964, + "language_loss": 0.79095006, + "learning_rate": 0.00011330955144380283, + "loss": 0.8016125, + "num_input_tokens_seen": 339578048, + "router_z_loss_mlp": 0.32080078, + "step": 4096, + "time_per_iteration": 2.5925889015197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064584, + "balance_loss_mlp": 1.03246856, + "epoch": 0.7881877645248172, + "flos": 582004597248.0, + "grad_norm": 0.054709813023541755, + "language_loss": 0.8620733, + "learning_rate": 0.00011311212688062483, + "loss": 0.87271917, + "num_input_tokens_seen": 339650176, + "router_z_loss_mlp": 0.32104492, + "step": 4097, + "time_per_iteration": 2.774585485458374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065245, + "balance_loss_mlp": 1.03279638, + "epoch": 0.7883801462100808, + "flos": 588883737600.0, + "grad_norm": 0.05950523871883677, + "language_loss": 0.77641714, + "learning_rate": 0.0001129148525206402, + "loss": 0.78706962, + "num_input_tokens_seen": 339727312, + "router_z_loss_mlp": 0.32446289, + "step": 4098, + "time_per_iteration": 2.8262319564819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066849, + "balance_loss_mlp": 1.03535402, + "epoch": 0.7885725278953444, + "flos": 481475460096.0, + "grad_norm": 0.05859958093341329, + "language_loss": 0.86361545, + "learning_rate": 0.00011271772844043759, + "loss": 0.87428391, + "num_input_tokens_seen": 339801344, + "router_z_loss_mlp": 0.31469727, + "step": 4099, + "time_per_iteration": 2.6731910705566406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064893, + "balance_loss_mlp": 1.03306413, + "epoch": 0.788764909580608, + "flos": 756470126592.0, + "grad_norm": 0.05966502266655521, + "language_loss": 0.75518525, + "learning_rate": 0.00011252075471654727, + "loss": 0.76583415, + "num_input_tokens_seen": 339877840, + "router_z_loss_mlp": 0.31811523, + "step": 4100, + "time_per_iteration": 2.919638156890869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065613, + "balance_loss_mlp": 1.03294969, + "epoch": 0.7889572912658714, + "flos": 702225427968.0, + "grad_norm": 0.050441368463949324, + "language_loss": 0.77960974, + "learning_rate": 0.00011232393142544133, + "loss": 0.79026586, + "num_input_tokens_seen": 339959568, + "router_z_loss_mlp": 0.32666016, + "step": 4101, + "time_per_iteration": 2.9371449947357178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064145, + "balance_loss_mlp": 1.03188694, + "epoch": 0.789149672951135, + "flos": 736047931392.0, + "grad_norm": 0.05824722379420924, + "language_loss": 0.83012629, + "learning_rate": 0.00011212725864353323, + "loss": 0.8407678, + "num_input_tokens_seen": 340043600, + "router_z_loss_mlp": 0.32250977, + "step": 4102, + "time_per_iteration": 3.070425033569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01019214, + "balance_loss_mlp": 1.01106, + "epoch": 0.7893420546363986, + "flos": 1480626349056.0, + "grad_norm": 0.00964834437524815, + "language_loss": 0.76335925, + "learning_rate": 0.00011193073644717822, + "loss": 0.7735514, + "num_input_tokens_seen": 340270608, + "router_z_loss_mlp": 0.08154297, + "step": 4103, + "time_per_iteration": 4.87341046333313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069024, + "balance_loss_mlp": 1.03688502, + "epoch": 0.7895344363216622, + "flos": 508821875712.0, + "grad_norm": 0.06647723888078448, + "language_loss": 0.76089919, + "learning_rate": 0.00011173436491267291, + "loss": 0.77158946, + "num_input_tokens_seen": 340338784, + "router_z_loss_mlp": 0.32128906, + "step": 4104, + "time_per_iteration": 2.579040050506592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069021, + "balance_loss_mlp": 1.036906, + "epoch": 0.7897268180069258, + "flos": 541727764992.0, + "grad_norm": 0.05890584899946244, + "language_loss": 0.81946945, + "learning_rate": 0.0001115381441162554, + "loss": 0.83015972, + "num_input_tokens_seen": 340407744, + "router_z_loss_mlp": 0.32104492, + "step": 4105, + "time_per_iteration": 2.6771240234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01019188, + "balance_loss_mlp": 1.01103461, + "epoch": 0.7899191996921893, + "flos": 1411924953600.0, + "grad_norm": 0.009593800245269755, + "language_loss": 0.73583722, + "learning_rate": 0.00011134207413410557, + "loss": 0.74602914, + "num_input_tokens_seen": 340635824, + "router_z_loss_mlp": 0.08154297, + "step": 4106, + "time_per_iteration": 4.9348978996276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067573, + "balance_loss_mlp": 1.03593516, + "epoch": 0.7901115813774529, + "flos": 622547679744.0, + "grad_norm": 0.05203428042978299, + "language_loss": 0.84845543, + "learning_rate": 0.00011114615504234465, + "loss": 0.85913116, + "num_input_tokens_seen": 340710928, + "router_z_loss_mlp": 0.31616211, + "step": 4107, + "time_per_iteration": 2.78153657913208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067357, + "balance_loss_mlp": 1.03581429, + "epoch": 0.7903039630627164, + "flos": 645232296960.0, + "grad_norm": 0.05460483755610551, + "language_loss": 0.80740857, + "learning_rate": 0.00011095038691703468, + "loss": 0.81808215, + "num_input_tokens_seen": 340786128, + "router_z_loss_mlp": 0.31518555, + "step": 4108, + "time_per_iteration": 2.83954119682312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069484, + "balance_loss_mlp": 1.03829885, + "epoch": 0.79049634474798, + "flos": 594054715392.0, + "grad_norm": 0.05143854855735133, + "language_loss": 0.82689941, + "learning_rate": 0.00011075476983417998, + "loss": 0.83759421, + "num_input_tokens_seen": 340861616, + "router_z_loss_mlp": 0.31152344, + "step": 4109, + "time_per_iteration": 2.8581154346466064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069019, + "balance_loss_mlp": 1.03792906, + "epoch": 0.7906887264332435, + "flos": 715784449536.0, + "grad_norm": 0.056450839629860305, + "language_loss": 0.77744591, + "learning_rate": 0.00011055930386972579, + "loss": 0.78813612, + "num_input_tokens_seen": 340934480, + "router_z_loss_mlp": 0.31054688, + "step": 4110, + "time_per_iteration": 2.8273229598999023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071548, + "balance_loss_mlp": 1.03855133, + "epoch": 0.7908811081185071, + "flos": 789553516032.0, + "grad_norm": 0.04891253400272343, + "language_loss": 0.78669703, + "learning_rate": 0.00011036398909955863, + "loss": 0.79741246, + "num_input_tokens_seen": 341014912, + "router_z_loss_mlp": 0.33007812, + "step": 4111, + "time_per_iteration": 2.961766004562378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069451, + "balance_loss_mlp": 1.03747857, + "epoch": 0.7910734898037707, + "flos": 641612330496.0, + "grad_norm": 0.048663438809518546, + "language_loss": 0.81452119, + "learning_rate": 0.00011016882559950648, + "loss": 0.82521558, + "num_input_tokens_seen": 341090608, + "router_z_loss_mlp": 0.31958008, + "step": 4112, + "time_per_iteration": 2.8214406967163086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068413, + "balance_loss_mlp": 1.03660822, + "epoch": 0.7912658714890343, + "flos": 669057670656.0, + "grad_norm": 0.05392137662685343, + "language_loss": 0.80067742, + "learning_rate": 0.00010997381344533853, + "loss": 0.81136161, + "num_input_tokens_seen": 341160992, + "router_z_loss_mlp": 0.31787109, + "step": 4113, + "time_per_iteration": 2.811772346496582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073852, + "balance_loss_mlp": 1.04152238, + "epoch": 0.7914582531742979, + "flos": 557504128512.0, + "grad_norm": 0.0581863083981893, + "language_loss": 0.80220509, + "learning_rate": 0.00010977895271276517, + "loss": 0.81294358, + "num_input_tokens_seen": 341232032, + "router_z_loss_mlp": 0.32324219, + "step": 4114, + "time_per_iteration": 2.719431161880493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107287, + "balance_loss_mlp": 1.0409224, + "epoch": 0.7916506348595613, + "flos": 569784181248.0, + "grad_norm": 0.05018332028611806, + "language_loss": 0.7987901, + "learning_rate": 0.00010958424347743807, + "loss": 0.80951875, + "num_input_tokens_seen": 341303888, + "router_z_loss_mlp": 0.31933594, + "step": 4115, + "time_per_iteration": 2.6972670555114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106862, + "balance_loss_mlp": 1.03724396, + "epoch": 0.7918430165448249, + "flos": 717966885888.0, + "grad_norm": 0.06933669285907723, + "language_loss": 0.80126512, + "learning_rate": 0.00010938968581494991, + "loss": 0.81195128, + "num_input_tokens_seen": 341385616, + "router_z_loss_mlp": 0.31347656, + "step": 4116, + "time_per_iteration": 2.9974632263183594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069688, + "balance_loss_mlp": 1.03750205, + "epoch": 0.7920353982300885, + "flos": 553377802752.0, + "grad_norm": 0.05941447289744039, + "language_loss": 0.78879136, + "learning_rate": 0.000109195279800835, + "loss": 0.79948825, + "num_input_tokens_seen": 341460976, + "router_z_loss_mlp": 0.32177734, + "step": 4117, + "time_per_iteration": 2.710513114929199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071029, + "balance_loss_mlp": 1.03896213, + "epoch": 0.7922277799153521, + "flos": 809766125568.0, + "grad_norm": 0.05531983375516572, + "language_loss": 0.76555854, + "learning_rate": 0.00010900102551056834, + "loss": 0.77626884, + "num_input_tokens_seen": 341537328, + "router_z_loss_mlp": 0.32055664, + "step": 4118, + "time_per_iteration": 3.0103225708007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069951, + "balance_loss_mlp": 1.03766966, + "epoch": 0.7924201616006156, + "flos": 421128612864.0, + "grad_norm": 0.05482547351078549, + "language_loss": 0.84337735, + "learning_rate": 0.00010880692301956601, + "loss": 0.85407686, + "num_input_tokens_seen": 341600272, + "router_z_loss_mlp": 0.32275391, + "step": 4119, + "time_per_iteration": 2.445122003555298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069691, + "balance_loss_mlp": 1.03707528, + "epoch": 0.7926125432858792, + "flos": 617541055488.0, + "grad_norm": 0.04369868110465695, + "language_loss": 0.86072242, + "learning_rate": 0.00010861297240318518, + "loss": 0.87141925, + "num_input_tokens_seen": 341682096, + "router_z_loss_mlp": 0.32617188, + "step": 4120, + "time_per_iteration": 2.85048508644104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067992, + "balance_loss_mlp": 1.03656876, + "epoch": 0.7928049249711427, + "flos": 602207032320.0, + "grad_norm": 0.05006458241333452, + "language_loss": 0.86780667, + "learning_rate": 0.00010841917373672444, + "loss": 0.87848663, + "num_input_tokens_seen": 341754912, + "router_z_loss_mlp": 0.31396484, + "step": 4121, + "time_per_iteration": 2.704904794692993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067696, + "balance_loss_mlp": 1.03570032, + "epoch": 0.7929973066564063, + "flos": 655724201472.0, + "grad_norm": 0.05319226556655214, + "language_loss": 0.78318095, + "learning_rate": 0.00010822552709542293, + "loss": 0.79385787, + "num_input_tokens_seen": 341831152, + "router_z_loss_mlp": 0.31982422, + "step": 4122, + "time_per_iteration": 2.8160955905914307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069953, + "balance_loss_mlp": 1.03814769, + "epoch": 0.7931896883416699, + "flos": 536139177984.0, + "grad_norm": 0.04444307991995564, + "language_loss": 0.85812402, + "learning_rate": 0.0001080320325544612, + "loss": 0.86882365, + "num_input_tokens_seen": 341903552, + "router_z_loss_mlp": 0.31787109, + "step": 4123, + "time_per_iteration": 2.6734302043914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067197, + "balance_loss_mlp": 1.03594005, + "epoch": 0.7933820700269334, + "flos": 497836758528.0, + "grad_norm": 0.04986309312867086, + "language_loss": 0.82817209, + "learning_rate": 0.00010783869018895997, + "loss": 0.838844, + "num_input_tokens_seen": 341972256, + "router_z_loss_mlp": 0.31225586, + "step": 4124, + "time_per_iteration": 2.578643321990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067226, + "balance_loss_mlp": 1.03506327, + "epoch": 0.793574451712197, + "flos": 537217325568.0, + "grad_norm": 0.05142590484857824, + "language_loss": 0.84177709, + "learning_rate": 0.00010764550007398189, + "loss": 0.8524493, + "num_input_tokens_seen": 342040496, + "router_z_loss_mlp": 0.3215332, + "step": 4125, + "time_per_iteration": 2.668468475341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065806, + "balance_loss_mlp": 1.03419125, + "epoch": 0.7937668333974606, + "flos": 488043270144.0, + "grad_norm": 0.048489850781485225, + "language_loss": 0.81036043, + "learning_rate": 0.00010745246228452982, + "loss": 0.82101846, + "num_input_tokens_seen": 342108512, + "router_z_loss_mlp": 0.31591797, + "step": 4126, + "time_per_iteration": 2.5388453006744385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106876, + "balance_loss_mlp": 1.0364542, + "epoch": 0.7939592150827242, + "flos": 527163379200.0, + "grad_norm": 0.05117583653255347, + "language_loss": 0.81550407, + "learning_rate": 0.00010725957689554771, + "loss": 0.82619166, + "num_input_tokens_seen": 342183568, + "router_z_loss_mlp": 0.32299805, + "step": 4127, + "time_per_iteration": 2.7774598598480225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065815, + "balance_loss_mlp": 1.03353345, + "epoch": 0.7941515967679876, + "flos": 541428019200.0, + "grad_norm": 0.13198996647770603, + "language_loss": 0.84346122, + "learning_rate": 0.00010706684398192013, + "loss": 0.85411942, + "num_input_tokens_seen": 342259920, + "router_z_loss_mlp": 0.32275391, + "step": 4128, + "time_per_iteration": 2.6948909759521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068056, + "balance_loss_mlp": 1.03555918, + "epoch": 0.7943439784532512, + "flos": 518104622592.0, + "grad_norm": 0.05568877803614168, + "language_loss": 0.81997395, + "learning_rate": 0.00010687426361847313, + "loss": 0.8306545, + "num_input_tokens_seen": 342330192, + "router_z_loss_mlp": 0.32495117, + "step": 4129, + "time_per_iteration": 2.693753957748413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069811, + "balance_loss_mlp": 1.0384829, + "epoch": 0.7945363601385148, + "flos": 508768031232.0, + "grad_norm": 0.052703179932938445, + "language_loss": 0.85951877, + "learning_rate": 0.00010668183587997254, + "loss": 0.87021685, + "num_input_tokens_seen": 342398944, + "router_z_loss_mlp": 0.31298828, + "step": 4130, + "time_per_iteration": 2.5763041973114014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069665, + "balance_loss_mlp": 1.03731203, + "epoch": 0.7947287418237784, + "flos": 650918398464.0, + "grad_norm": 0.061493260737887565, + "language_loss": 0.77379823, + "learning_rate": 0.0001064895608411256, + "loss": 0.78449482, + "num_input_tokens_seen": 342474000, + "router_z_loss_mlp": 0.32348633, + "step": 4131, + "time_per_iteration": 2.763904333114624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068483, + "balance_loss_mlp": 1.03620124, + "epoch": 0.794921123509042, + "flos": 695726019072.0, + "grad_norm": 0.07934957130099038, + "language_loss": 0.80297732, + "learning_rate": 0.00010629743857657998, + "loss": 0.81366217, + "num_input_tokens_seen": 342549184, + "router_z_loss_mlp": 0.32275391, + "step": 4132, + "time_per_iteration": 2.933009386062622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01019333, + "balance_loss_mlp": 1.01117909, + "epoch": 0.7951135051943055, + "flos": 1402161988608.0, + "grad_norm": 0.006928845772435826, + "language_loss": 0.70598668, + "learning_rate": 0.0001061054691609244, + "loss": 0.71618003, + "num_input_tokens_seen": 342767376, + "router_z_loss_mlp": 0.08154297, + "step": 4133, + "time_per_iteration": 4.611080884933472 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067103, + "balance_loss_mlp": 1.03560841, + "epoch": 0.795305886879569, + "flos": 809745776640.0, + "grad_norm": 0.059789926396459823, + "language_loss": 0.81835663, + "learning_rate": 0.00010591365266868802, + "loss": 0.82902765, + "num_input_tokens_seen": 342845024, + "router_z_loss_mlp": 0.31469727, + "step": 4134, + "time_per_iteration": 2.9697659015655518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016843, + "balance_loss_mlp": 1.00873721, + "epoch": 0.7954982685648326, + "flos": 1425205988352.0, + "grad_norm": 0.006305006479863361, + "language_loss": 0.75511783, + "learning_rate": 0.00010572198917434018, + "loss": 0.76528627, + "num_input_tokens_seen": 343072496, + "router_z_loss_mlp": 0.08105469, + "step": 4135, + "time_per_iteration": 4.8860838413238525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068161, + "balance_loss_mlp": 1.03547359, + "epoch": 0.7956906502500962, + "flos": 389670428160.0, + "grad_norm": 0.055642824897664006, + "language_loss": 0.79057562, + "learning_rate": 0.00010553047875229166, + "loss": 0.80125725, + "num_input_tokens_seen": 343136928, + "router_z_loss_mlp": 0.3269043, + "step": 4136, + "time_per_iteration": 2.5156140327453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065548, + "balance_loss_mlp": 1.03359985, + "epoch": 0.7958830319353598, + "flos": 515321284608.0, + "grad_norm": 0.05406078670363032, + "language_loss": 0.83169937, + "learning_rate": 0.00010533912147689328, + "loss": 0.84235483, + "num_input_tokens_seen": 343207440, + "router_z_loss_mlp": 0.31933594, + "step": 4137, + "time_per_iteration": 2.613961696624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064795, + "balance_loss_mlp": 1.03296661, + "epoch": 0.7960754136206233, + "flos": 493695876096.0, + "grad_norm": 0.050232390896865514, + "language_loss": 0.82344103, + "learning_rate": 0.00010514791742243656, + "loss": 0.83408904, + "num_input_tokens_seen": 343273744, + "router_z_loss_mlp": 0.31811523, + "step": 4138, + "time_per_iteration": 2.5978379249572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106016, + "balance_loss_mlp": 1.02813983, + "epoch": 0.7962677953058869, + "flos": 655409899008.0, + "grad_norm": 0.05370274741433686, + "language_loss": 0.82677209, + "learning_rate": 0.00010495686666315341, + "loss": 0.83737361, + "num_input_tokens_seen": 343357648, + "router_z_loss_mlp": 0.32006836, + "step": 4139, + "time_per_iteration": 2.872997283935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062257, + "balance_loss_mlp": 1.03088117, + "epoch": 0.7964601769911505, + "flos": 542126435328.0, + "grad_norm": 0.05348146063522791, + "language_loss": 0.77502406, + "learning_rate": 0.00010476596927321635, + "loss": 0.78564668, + "num_input_tokens_seen": 343425344, + "router_z_loss_mlp": 0.31347656, + "step": 4140, + "time_per_iteration": 2.620577812194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064605, + "balance_loss_mlp": 1.0327282, + "epoch": 0.796652558676414, + "flos": 537356947968.0, + "grad_norm": 0.042260612329337484, + "language_loss": 0.80177677, + "learning_rate": 0.00010457522532673835, + "loss": 0.81242287, + "num_input_tokens_seen": 343504960, + "router_z_loss_mlp": 0.31860352, + "step": 4141, + "time_per_iteration": 2.7778780460357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066312, + "balance_loss_mlp": 1.03419721, + "epoch": 0.7968449403616775, + "flos": 474852395520.0, + "grad_norm": 0.061301631429393516, + "language_loss": 0.82973599, + "learning_rate": 0.00010438463489777272, + "loss": 0.84039915, + "num_input_tokens_seen": 343570832, + "router_z_loss_mlp": 0.32104492, + "step": 4142, + "time_per_iteration": 2.579953908920288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064314, + "balance_loss_mlp": 1.03157902, + "epoch": 0.7970373220469411, + "flos": 567336904704.0, + "grad_norm": 0.06081943760353449, + "language_loss": 0.77709621, + "learning_rate": 0.00010419419806031316, + "loss": 0.7877394, + "num_input_tokens_seen": 343639808, + "router_z_loss_mlp": 0.32739258, + "step": 4143, + "time_per_iteration": 2.6624910831451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066036, + "balance_loss_mlp": 1.03477979, + "epoch": 0.7972297037322047, + "flos": 555924003840.0, + "grad_norm": 0.05994376344115418, + "language_loss": 0.83806866, + "learning_rate": 0.00010400391488829403, + "loss": 0.84872901, + "num_input_tokens_seen": 343715232, + "router_z_loss_mlp": 0.31225586, + "step": 4144, + "time_per_iteration": 2.774700880050659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063404, + "balance_loss_mlp": 1.03157544, + "epoch": 0.7974220854174683, + "flos": 575899476480.0, + "grad_norm": 0.04407421907789105, + "language_loss": 0.86373734, + "learning_rate": 0.00010381378545558984, + "loss": 0.87437141, + "num_input_tokens_seen": 343787168, + "router_z_loss_mlp": 0.31811523, + "step": 4145, + "time_per_iteration": 2.686239004135132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065318, + "balance_loss_mlp": 1.03301203, + "epoch": 0.7976144671027319, + "flos": 482824240128.0, + "grad_norm": 0.047216774900369206, + "language_loss": 0.8480643, + "learning_rate": 0.00010362380983601505, + "loss": 0.85871744, + "num_input_tokens_seen": 343853600, + "router_z_loss_mlp": 0.32299805, + "step": 4146, + "time_per_iteration": 2.533198833465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062521, + "balance_loss_mlp": 1.03102612, + "epoch": 0.7978068487879953, + "flos": 1077420372480.0, + "grad_norm": 0.04375196843804429, + "language_loss": 0.78552485, + "learning_rate": 0.00010343398810332477, + "loss": 0.79615009, + "num_input_tokens_seen": 343942816, + "router_z_loss_mlp": 0.31469727, + "step": 4147, + "time_per_iteration": 3.451004981994629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106262, + "balance_loss_mlp": 1.03007627, + "epoch": 0.7979992304732589, + "flos": 733421744640.0, + "grad_norm": 0.06305718879587498, + "language_loss": 0.84127843, + "learning_rate": 0.00010324432033121467, + "loss": 0.85190463, + "num_input_tokens_seen": 344021232, + "router_z_loss_mlp": 0.32543945, + "step": 4148, + "time_per_iteration": 2.890085220336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065874, + "balance_loss_mlp": 1.03349686, + "epoch": 0.7981916121585225, + "flos": 415531261440.0, + "grad_norm": 0.050318448147633754, + "language_loss": 0.83318138, + "learning_rate": 0.00010305480659332005, + "loss": 0.84384012, + "num_input_tokens_seen": 344089616, + "router_z_loss_mlp": 0.32373047, + "step": 4149, + "time_per_iteration": 2.588676929473877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063088, + "balance_loss_mlp": 1.03133059, + "epoch": 0.7983839938437861, + "flos": 465019619328.0, + "grad_norm": 0.06596514407169883, + "language_loss": 0.83595121, + "learning_rate": 0.00010286544696321682, + "loss": 0.84658206, + "num_input_tokens_seen": 344154992, + "router_z_loss_mlp": 0.31738281, + "step": 4150, + "time_per_iteration": 2.546215772628784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064473, + "balance_loss_mlp": 1.03304911, + "epoch": 0.7985763755290496, + "flos": 510304485888.0, + "grad_norm": 0.05480976519736011, + "language_loss": 0.79303128, + "learning_rate": 0.00010267624151442073, + "loss": 0.80367601, + "num_input_tokens_seen": 344225232, + "router_z_loss_mlp": 0.31396484, + "step": 4151, + "time_per_iteration": 2.620140790939331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062507, + "balance_loss_mlp": 1.03077376, + "epoch": 0.7987687572143132, + "flos": 1010243847168.0, + "grad_norm": 0.05583504275555366, + "language_loss": 0.81259573, + "learning_rate": 0.000102487190320388, + "loss": 0.82322085, + "num_input_tokens_seen": 344309120, + "router_z_loss_mlp": 0.31713867, + "step": 4152, + "time_per_iteration": 3.3063504695892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064782, + "balance_loss_mlp": 1.03247619, + "epoch": 0.7989611388995768, + "flos": 1020662968320.0, + "grad_norm": 0.05403781232268857, + "language_loss": 0.79678059, + "learning_rate": 0.00010229829345451475, + "loss": 0.80742842, + "num_input_tokens_seen": 344394112, + "router_z_loss_mlp": 0.32299805, + "step": 4153, + "time_per_iteration": 3.301403522491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064656, + "balance_loss_mlp": 1.03237379, + "epoch": 0.7991535205848403, + "flos": 1100915476992.0, + "grad_norm": 0.05303368831267737, + "language_loss": 0.79783893, + "learning_rate": 0.00010210955099013724, + "loss": 0.80848551, + "num_input_tokens_seen": 344476512, + "router_z_loss_mlp": 0.32275391, + "step": 4154, + "time_per_iteration": 3.383039712905884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065536, + "balance_loss_mlp": 1.03301597, + "epoch": 0.7993459022701039, + "flos": 834454268928.0, + "grad_norm": 0.06456924160427363, + "language_loss": 0.76284033, + "learning_rate": 0.00010192096300053167, + "loss": 0.77349567, + "num_input_tokens_seen": 344561088, + "router_z_loss_mlp": 0.32519531, + "step": 4155, + "time_per_iteration": 3.0697450637817383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061021, + "balance_loss_mlp": 1.02928793, + "epoch": 0.7995382839553674, + "flos": 522417212928.0, + "grad_norm": 0.04699781712080769, + "language_loss": 0.851726, + "learning_rate": 0.00010173252955891477, + "loss": 0.86233628, + "num_input_tokens_seen": 344639424, + "router_z_loss_mlp": 0.31713867, + "step": 4156, + "time_per_iteration": 2.7266414165496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106594, + "balance_loss_mlp": 1.03389633, + "epoch": 0.799730665640631, + "flos": 537562151424.0, + "grad_norm": 0.059253037565978675, + "language_loss": 0.73188376, + "learning_rate": 0.00010154425073844253, + "loss": 0.7425431, + "num_input_tokens_seen": 344710048, + "router_z_loss_mlp": 0.3203125, + "step": 4157, + "time_per_iteration": 2.6836955547332764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067435, + "balance_loss_mlp": 1.0347718, + "epoch": 0.7999230473258946, + "flos": 504809031168.0, + "grad_norm": 0.050604408560098985, + "language_loss": 0.82231861, + "learning_rate": 0.00010135612661221138, + "loss": 0.83299297, + "num_input_tokens_seen": 344776832, + "router_z_loss_mlp": 0.32666016, + "step": 4158, + "time_per_iteration": 2.5858490467071533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061337, + "balance_loss_mlp": 1.02903104, + "epoch": 0.8001154290111582, + "flos": 1026935414784.0, + "grad_norm": 0.07154666191877361, + "language_loss": 0.81335956, + "learning_rate": 0.00010116815725325751, + "loss": 0.82397294, + "num_input_tokens_seen": 344864928, + "router_z_loss_mlp": 0.32299805, + "step": 4159, + "time_per_iteration": 3.30757474899292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063379, + "balance_loss_mlp": 1.03073967, + "epoch": 0.8003078106964217, + "flos": 750567237120.0, + "grad_norm": 0.05734149006242142, + "language_loss": 0.80527955, + "learning_rate": 0.00010098034273455725, + "loss": 0.81591332, + "num_input_tokens_seen": 344944048, + "router_z_loss_mlp": 0.32641602, + "step": 4160, + "time_per_iteration": 2.9547767639160156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065922, + "balance_loss_mlp": 1.03313947, + "epoch": 0.8005001923816852, + "flos": 488201831424.0, + "grad_norm": 0.05051565727224089, + "language_loss": 0.79769969, + "learning_rate": 0.00010079268312902662, + "loss": 0.80835891, + "num_input_tokens_seen": 345015392, + "router_z_loss_mlp": 0.32788086, + "step": 4161, + "time_per_iteration": 2.6677966117858887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062976, + "balance_loss_mlp": 1.03119469, + "epoch": 0.8006925740669488, + "flos": 512983107072.0, + "grad_norm": 0.05230288400742034, + "language_loss": 0.81782764, + "learning_rate": 0.0001006051785095215, + "loss": 0.82845742, + "num_input_tokens_seen": 345086640, + "router_z_loss_mlp": 0.31762695, + "step": 4162, + "time_per_iteration": 2.642228126525879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064845, + "balance_loss_mlp": 1.03172922, + "epoch": 0.8008849557522124, + "flos": 578243446272.0, + "grad_norm": 0.05393641740779556, + "language_loss": 0.79291767, + "learning_rate": 0.0001004178289488376, + "loss": 0.8035661, + "num_input_tokens_seen": 345159616, + "router_z_loss_mlp": 0.33129883, + "step": 4163, + "time_per_iteration": 2.7046382427215576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065647, + "balance_loss_mlp": 1.03360367, + "epoch": 0.801077337437476, + "flos": 478466569728.0, + "grad_norm": 0.05246916136240305, + "language_loss": 0.83748746, + "learning_rate": 0.0001002306345197106, + "loss": 0.84814394, + "num_input_tokens_seen": 345225536, + "router_z_loss_mlp": 0.3203125, + "step": 4164, + "time_per_iteration": 2.5735926628112793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064035, + "balance_loss_mlp": 1.03115666, + "epoch": 0.8012697191227395, + "flos": 676384943616.0, + "grad_norm": 0.06395934079571464, + "language_loss": 0.79516339, + "learning_rate": 0.00010004359529481571, + "loss": 0.80580378, + "num_input_tokens_seen": 345302960, + "router_z_loss_mlp": 0.32885742, + "step": 4165, + "time_per_iteration": 3.073218822479248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070917, + "balance_loss_mlp": 1.03760982, + "epoch": 0.8014621008080031, + "flos": 1294624567296.0, + "grad_norm": 0.06013073241121916, + "language_loss": 0.81983745, + "learning_rate": 9.985671134676804e-05, + "loss": 0.83054662, + "num_input_tokens_seen": 345397792, + "router_z_loss_mlp": 0.33325195, + "step": 4166, + "time_per_iteration": 3.6898140907287598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064786, + "balance_loss_mlp": 1.03259957, + "epoch": 0.8016544824932667, + "flos": 511579072512.0, + "grad_norm": 0.0757010712096974, + "language_loss": 0.83196199, + "learning_rate": 9.966998274812234e-05, + "loss": 0.84260988, + "num_input_tokens_seen": 345465440, + "router_z_loss_mlp": 0.32177734, + "step": 4167, + "time_per_iteration": 2.61006498336792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065564, + "balance_loss_mlp": 1.03368783, + "epoch": 0.8018468641785302, + "flos": 535434969600.0, + "grad_norm": 0.06034302862122322, + "language_loss": 0.81307828, + "learning_rate": 9.948340957137308e-05, + "loss": 0.82373393, + "num_input_tokens_seen": 345533072, + "router_z_loss_mlp": 0.31860352, + "step": 4168, + "time_per_iteration": 2.615943670272827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064528, + "balance_loss_mlp": 1.03160238, + "epoch": 0.8020392458637937, + "flos": 1023025876992.0, + "grad_norm": 0.07125861886522546, + "language_loss": 0.79438949, + "learning_rate": 9.929699188895447e-05, + "loss": 0.80503476, + "num_input_tokens_seen": 345622208, + "router_z_loss_mlp": 0.3293457, + "step": 4169, + "time_per_iteration": 3.2835214138031006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011972, + "balance_loss_mlp": 1.00377011, + "epoch": 0.8022316275490573, + "flos": 1560993748992.0, + "grad_norm": 0.00911725430175954, + "language_loss": 0.78054404, + "learning_rate": 9.911072977324009e-05, + "loss": 0.79066378, + "num_input_tokens_seen": 345852544, + "router_z_loss_mlp": 0.08203125, + "step": 4170, + "time_per_iteration": 4.975283861160278 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064906, + "balance_loss_mlp": 1.03236222, + "epoch": 0.8024240092343209, + "flos": 420473866752.0, + "grad_norm": 0.05669992043917717, + "language_loss": 0.82862949, + "learning_rate": 9.89246232965435e-05, + "loss": 0.83927858, + "num_input_tokens_seen": 345917328, + "router_z_loss_mlp": 0.32543945, + "step": 4171, + "time_per_iteration": 2.5107967853546143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066001, + "balance_loss_mlp": 1.03307581, + "epoch": 0.8026163909195845, + "flos": 763506418176.0, + "grad_norm": 0.06023729307779171, + "language_loss": 0.78644282, + "learning_rate": 9.873867253111762e-05, + "loss": 0.79710281, + "num_input_tokens_seen": 345995936, + "router_z_loss_mlp": 0.3293457, + "step": 4172, + "time_per_iteration": 2.9537975788116455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011767, + "balance_loss_mlp": 1.00356519, + "epoch": 0.8028087726048481, + "flos": 1518044087808.0, + "grad_norm": 0.009851202253072271, + "language_loss": 0.80264562, + "learning_rate": 9.855287754915503e-05, + "loss": 0.81276327, + "num_input_tokens_seen": 346232720, + "router_z_loss_mlp": 0.08203125, + "step": 4173, + "time_per_iteration": 4.9082865715026855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065835, + "balance_loss_mlp": 1.03310037, + "epoch": 0.8030011542901115, + "flos": 517620174336.0, + "grad_norm": 0.06108822499581813, + "language_loss": 0.88279212, + "learning_rate": 9.836723842278733e-05, + "loss": 0.89345044, + "num_input_tokens_seen": 346298208, + "router_z_loss_mlp": 0.32739258, + "step": 4174, + "time_per_iteration": 2.58488392829895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063958, + "balance_loss_mlp": 1.03177166, + "epoch": 0.8031935359753751, + "flos": 545356495872.0, + "grad_norm": 0.05400426019462201, + "language_loss": 0.77958262, + "learning_rate": 9.818175522408646e-05, + "loss": 0.79022217, + "num_input_tokens_seen": 346370080, + "router_z_loss_mlp": 0.32177734, + "step": 4175, + "time_per_iteration": 2.6506383419036865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064557, + "balance_loss_mlp": 1.03201258, + "epoch": 0.8033859176606387, + "flos": 603266241024.0, + "grad_norm": 0.12211411732469771, + "language_loss": 0.84598446, + "learning_rate": 9.79964280250632e-05, + "loss": 0.85662997, + "num_input_tokens_seen": 346442432, + "router_z_loss_mlp": 0.32543945, + "step": 4176, + "time_per_iteration": 2.7917282581329346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066053, + "balance_loss_mlp": 1.03353262, + "epoch": 0.8035782993459023, + "flos": 565579279872.0, + "grad_norm": 0.06768154425814676, + "language_loss": 0.8119694, + "learning_rate": 9.781125689766795e-05, + "loss": 0.82262993, + "num_input_tokens_seen": 346513088, + "router_z_loss_mlp": 0.32519531, + "step": 4177, + "time_per_iteration": 2.776914358139038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062732, + "balance_loss_mlp": 1.03080821, + "epoch": 0.8037706810311658, + "flos": 538177609728.0, + "grad_norm": 0.06144959220645004, + "language_loss": 0.84109223, + "learning_rate": 9.762624191379054e-05, + "loss": 0.8517195, + "num_input_tokens_seen": 346581376, + "router_z_loss_mlp": 0.3190918, + "step": 4178, + "time_per_iteration": 2.6495769023895264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062746, + "balance_loss_mlp": 1.03122723, + "epoch": 0.8039630627164294, + "flos": 514937170944.0, + "grad_norm": 0.06604335738972231, + "language_loss": 0.79559189, + "learning_rate": 9.744138314526014e-05, + "loss": 0.80621934, + "num_input_tokens_seen": 346653328, + "router_z_loss_mlp": 0.31494141, + "step": 4179, + "time_per_iteration": 2.602325201034546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011191, + "balance_loss_mlp": 1.0028466, + "epoch": 0.804155444401693, + "flos": 1478061209088.0, + "grad_norm": 0.008490114201074244, + "language_loss": 0.74733561, + "learning_rate": 9.725668066384535e-05, + "loss": 0.75744754, + "num_input_tokens_seen": 346873264, + "router_z_loss_mlp": 0.08349609, + "step": 4180, + "time_per_iteration": 4.859116077423096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065566, + "balance_loss_mlp": 1.03337991, + "epoch": 0.8043478260869565, + "flos": 520909871616.0, + "grad_norm": 0.05865304304999897, + "language_loss": 0.77159905, + "learning_rate": 9.707213454125396e-05, + "loss": 0.7822547, + "num_input_tokens_seen": 346946272, + "router_z_loss_mlp": 0.32177734, + "step": 4181, + "time_per_iteration": 2.635836362838745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064141, + "balance_loss_mlp": 1.03131068, + "epoch": 0.8045402077722201, + "flos": 545170231296.0, + "grad_norm": 0.0530956365986092, + "language_loss": 0.80491257, + "learning_rate": 9.688774484913298e-05, + "loss": 0.81555402, + "num_input_tokens_seen": 347024048, + "router_z_loss_mlp": 0.32836914, + "step": 4182, + "time_per_iteration": 2.751748561859131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106557, + "balance_loss_mlp": 1.03438473, + "epoch": 0.8047325894574836, + "flos": 678059610624.0, + "grad_norm": 0.05198093608069738, + "language_loss": 0.740538, + "learning_rate": 9.670351165906921e-05, + "loss": 0.75119376, + "num_input_tokens_seen": 347108736, + "router_z_loss_mlp": 0.31152344, + "step": 4183, + "time_per_iteration": 2.9374914169311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063909, + "balance_loss_mlp": 1.03146052, + "epoch": 0.8049249711427472, + "flos": 586952994816.0, + "grad_norm": 0.051787161978078845, + "language_loss": 0.78289056, + "learning_rate": 9.65194350425882e-05, + "loss": 0.79352963, + "num_input_tokens_seen": 347184192, + "router_z_loss_mlp": 0.32446289, + "step": 4184, + "time_per_iteration": 2.7513604164123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066837, + "balance_loss_mlp": 1.03467417, + "epoch": 0.8051173528280108, + "flos": 813824050176.0, + "grad_norm": 0.049086248297275724, + "language_loss": 0.77875173, + "learning_rate": 9.633551507115452e-05, + "loss": 0.78942013, + "num_input_tokens_seen": 347282336, + "router_z_loss_mlp": 0.3215332, + "step": 4185, + "time_per_iteration": 3.129385232925415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064139, + "balance_loss_mlp": 1.03254879, + "epoch": 0.8053097345132744, + "flos": 725371324416.0, + "grad_norm": 0.05310426447425182, + "language_loss": 0.77401984, + "learning_rate": 9.615175181617259e-05, + "loss": 0.78466123, + "num_input_tokens_seen": 347364800, + "router_z_loss_mlp": 0.31567383, + "step": 4186, + "time_per_iteration": 2.9424233436584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062352, + "balance_loss_mlp": 1.02980769, + "epoch": 0.805502116198538, + "flos": 747706733568.0, + "grad_norm": 0.12719309664505637, + "language_loss": 0.81555432, + "learning_rate": 9.596814534898552e-05, + "loss": 0.82617784, + "num_input_tokens_seen": 347443328, + "router_z_loss_mlp": 0.32543945, + "step": 4187, + "time_per_iteration": 2.9859328269958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063734, + "balance_loss_mlp": 1.03204834, + "epoch": 0.8056944978838014, + "flos": 639953630208.0, + "grad_norm": 0.05354487168987546, + "language_loss": 0.87387693, + "learning_rate": 9.578469574087561e-05, + "loss": 0.88451427, + "num_input_tokens_seen": 347522064, + "router_z_loss_mlp": 0.31665039, + "step": 4188, + "time_per_iteration": 2.7995247840881348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065054, + "balance_loss_mlp": 1.03255761, + "epoch": 0.805886879569065, + "flos": 644344796160.0, + "grad_norm": 0.056241473497873305, + "language_loss": 0.77584517, + "learning_rate": 9.560140306306436e-05, + "loss": 0.78649569, + "num_input_tokens_seen": 347597200, + "router_z_loss_mlp": 0.32495117, + "step": 4189, + "time_per_iteration": 2.7428812980651855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106136, + "balance_loss_mlp": 1.02964997, + "epoch": 0.8060792612543286, + "flos": 660928674816.0, + "grad_norm": 0.0522294077132656, + "language_loss": 0.81488943, + "learning_rate": 9.541826738671233e-05, + "loss": 0.82550299, + "num_input_tokens_seen": 347676928, + "router_z_loss_mlp": 0.31689453, + "step": 4190, + "time_per_iteration": 2.8001224994659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062885, + "balance_loss_mlp": 1.03172374, + "epoch": 0.8062716429395922, + "flos": 454842017280.0, + "grad_norm": 0.057773567854599146, + "language_loss": 0.82252741, + "learning_rate": 9.523528878291904e-05, + "loss": 0.83315623, + "num_input_tokens_seen": 347741552, + "router_z_loss_mlp": 0.3112793, + "step": 4191, + "time_per_iteration": 2.521599531173706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065048, + "balance_loss_mlp": 1.03267062, + "epoch": 0.8064640246248557, + "flos": 526153632768.0, + "grad_norm": 0.06481758336131359, + "language_loss": 0.85109866, + "learning_rate": 9.50524673227231e-05, + "loss": 0.86174917, + "num_input_tokens_seen": 347807008, + "router_z_loss_mlp": 0.32373047, + "step": 4192, + "time_per_iteration": 2.5771045684814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106484, + "balance_loss_mlp": 1.03239143, + "epoch": 0.8066564063101193, + "flos": 864726617088.0, + "grad_norm": 0.04614815892553539, + "language_loss": 0.82226491, + "learning_rate": 9.486980307710208e-05, + "loss": 0.83291328, + "num_input_tokens_seen": 347895728, + "router_z_loss_mlp": 0.32446289, + "step": 4193, + "time_per_iteration": 3.1550650596618652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106366, + "balance_loss_mlp": 1.03142595, + "epoch": 0.8068487879953828, + "flos": 530261019648.0, + "grad_norm": 0.04778244873492842, + "language_loss": 0.81720543, + "learning_rate": 9.468729611697246e-05, + "loss": 0.82784206, + "num_input_tokens_seen": 347970368, + "router_z_loss_mlp": 0.32226562, + "step": 4194, + "time_per_iteration": 2.68117618560791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063088, + "balance_loss_mlp": 1.03094923, + "epoch": 0.8070411696806464, + "flos": 565918313472.0, + "grad_norm": 0.05165640646024543, + "language_loss": 0.81600553, + "learning_rate": 9.450494651319003e-05, + "loss": 0.82663637, + "num_input_tokens_seen": 348039040, + "router_z_loss_mlp": 0.32128906, + "step": 4195, + "time_per_iteration": 2.633653402328491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068382, + "balance_loss_mlp": 1.03557551, + "epoch": 0.80723355136591, + "flos": 986176954368.0, + "grad_norm": 0.07405936380462144, + "language_loss": 0.7911948, + "learning_rate": 9.432275433654885e-05, + "loss": 0.80187857, + "num_input_tokens_seen": 348126064, + "router_z_loss_mlp": 0.328125, + "step": 4196, + "time_per_iteration": 3.312209129333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066123, + "balance_loss_mlp": 1.0333643, + "epoch": 0.8074259330511735, + "flos": 566682158592.0, + "grad_norm": 0.05744842453598568, + "language_loss": 0.82666802, + "learning_rate": 9.414071965778221e-05, + "loss": 0.83732927, + "num_input_tokens_seen": 348205888, + "router_z_loss_mlp": 0.32763672, + "step": 4197, + "time_per_iteration": 2.8450043201446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062207, + "balance_loss_mlp": 1.02997255, + "epoch": 0.8076183147364371, + "flos": 494391320064.0, + "grad_norm": 0.049534177459162165, + "language_loss": 0.79876429, + "learning_rate": 9.395884254756242e-05, + "loss": 0.80938637, + "num_input_tokens_seen": 348278608, + "router_z_loss_mlp": 0.32226562, + "step": 4198, + "time_per_iteration": 2.742640733718872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071014, + "balance_loss_mlp": 1.03858888, + "epoch": 0.8078106964217007, + "flos": 419798771712.0, + "grad_norm": 0.05248878052625419, + "language_loss": 0.79987884, + "learning_rate": 9.377712307650044e-05, + "loss": 0.81058896, + "num_input_tokens_seen": 348341312, + "router_z_loss_mlp": 0.32421875, + "step": 4199, + "time_per_iteration": 2.483083486557007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061181, + "balance_loss_mlp": 1.02877951, + "epoch": 0.8080030781069643, + "flos": 527281242624.0, + "grad_norm": 0.05810503485387347, + "language_loss": 0.82971925, + "learning_rate": 9.359556131514602e-05, + "loss": 0.84033108, + "num_input_tokens_seen": 348409184, + "router_z_loss_mlp": 0.32397461, + "step": 4200, + "time_per_iteration": 2.6200740337371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068361, + "balance_loss_mlp": 1.0357213, + "epoch": 0.8081954597922277, + "flos": 543898616832.0, + "grad_norm": 0.05004595325075464, + "language_loss": 0.81427366, + "learning_rate": 9.341415733398733e-05, + "loss": 0.82495731, + "num_input_tokens_seen": 348480832, + "router_z_loss_mlp": 0.32641602, + "step": 4201, + "time_per_iteration": 2.621709108352661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066327, + "balance_loss_mlp": 1.03383064, + "epoch": 0.8083878414774913, + "flos": 640593819648.0, + "grad_norm": 0.050042429423884156, + "language_loss": 0.75564444, + "learning_rate": 9.323291120345207e-05, + "loss": 0.76630765, + "num_input_tokens_seen": 348559232, + "router_z_loss_mlp": 0.32495117, + "step": 4202, + "time_per_iteration": 2.843702793121338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062749, + "balance_loss_mlp": 1.03072977, + "epoch": 0.8085802231627549, + "flos": 705292545024.0, + "grad_norm": 0.0932476800908623, + "language_loss": 0.72727638, + "learning_rate": 9.305182299390614e-05, + "loss": 0.73790395, + "num_input_tokens_seen": 348638960, + "router_z_loss_mlp": 0.32006836, + "step": 4203, + "time_per_iteration": 2.8963630199432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062026, + "balance_loss_mlp": 1.02960098, + "epoch": 0.8087726048480185, + "flos": 419538313728.0, + "grad_norm": 0.05589938001894908, + "language_loss": 0.88482207, + "learning_rate": 9.287089277565409e-05, + "loss": 0.89544237, + "num_input_tokens_seen": 348704816, + "router_z_loss_mlp": 0.32421875, + "step": 4204, + "time_per_iteration": 2.5090560913085938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063669, + "balance_loss_mlp": 1.03205466, + "epoch": 0.8089649865332821, + "flos": 508493016576.0, + "grad_norm": 0.05129598135232067, + "language_loss": 0.87062168, + "learning_rate": 9.269012061893922e-05, + "loss": 0.88125837, + "num_input_tokens_seen": 348783504, + "router_z_loss_mlp": 0.31591797, + "step": 4205, + "time_per_iteration": 2.7403717041015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064175, + "balance_loss_mlp": 1.03174961, + "epoch": 0.8091573682185456, + "flos": 456960434688.0, + "grad_norm": 0.051102923464018915, + "language_loss": 0.84725749, + "learning_rate": 9.250950659394386e-05, + "loss": 0.85789919, + "num_input_tokens_seen": 348858272, + "router_z_loss_mlp": 0.32421875, + "step": 4206, + "time_per_iteration": 2.657475709915161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064522, + "balance_loss_mlp": 1.03204989, + "epoch": 0.8093497499038091, + "flos": 524977970688.0, + "grad_norm": 0.05067893035392799, + "language_loss": 0.76910889, + "learning_rate": 9.232905077078824e-05, + "loss": 0.77975416, + "num_input_tokens_seen": 348934432, + "router_z_loss_mlp": 0.32470703, + "step": 4207, + "time_per_iteration": 2.7274835109710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069405, + "balance_loss_mlp": 1.03726602, + "epoch": 0.8095421315890727, + "flos": 489377493504.0, + "grad_norm": 0.07892159060292517, + "language_loss": 0.77032375, + "learning_rate": 9.214875321953164e-05, + "loss": 0.78101778, + "num_input_tokens_seen": 349003856, + "router_z_loss_mlp": 0.32128906, + "step": 4208, + "time_per_iteration": 2.5977513790130615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064316, + "balance_loss_mlp": 1.03205764, + "epoch": 0.8097345132743363, + "flos": 624817456128.0, + "grad_norm": 0.04941967434183097, + "language_loss": 0.80449629, + "learning_rate": 9.196861401017164e-05, + "loss": 0.81513947, + "num_input_tokens_seen": 349080544, + "router_z_loss_mlp": 0.32250977, + "step": 4209, + "time_per_iteration": 2.735485792160034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067095, + "balance_loss_mlp": 1.03416991, + "epoch": 0.8099268949595998, + "flos": 615393524736.0, + "grad_norm": 0.0567514842087568, + "language_loss": 0.79628795, + "learning_rate": 9.178863321264475e-05, + "loss": 0.80695891, + "num_input_tokens_seen": 349159072, + "router_z_loss_mlp": 0.3293457, + "step": 4210, + "time_per_iteration": 2.764686346054077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106462, + "balance_loss_mlp": 1.03245759, + "epoch": 0.8101192766448634, + "flos": 479383183872.0, + "grad_norm": 0.04955977006686596, + "language_loss": 0.80138111, + "learning_rate": 9.160881089682566e-05, + "loss": 0.81202734, + "num_input_tokens_seen": 349230176, + "router_z_loss_mlp": 0.3215332, + "step": 4211, + "time_per_iteration": 2.6480655670166016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066314, + "balance_loss_mlp": 1.03434241, + "epoch": 0.810311658330127, + "flos": 517078909440.0, + "grad_norm": 0.05012077281177707, + "language_loss": 0.86578828, + "learning_rate": 9.142914713252725e-05, + "loss": 0.87645137, + "num_input_tokens_seen": 349299760, + "router_z_loss_mlp": 0.31958008, + "step": 4212, + "time_per_iteration": 2.6052680015563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106724, + "balance_loss_mlp": 1.03467226, + "epoch": 0.8105040400153906, + "flos": 575481867264.0, + "grad_norm": 0.044856961843391975, + "language_loss": 0.84047955, + "learning_rate": 9.124964198950159e-05, + "loss": 0.85115194, + "num_input_tokens_seen": 349379712, + "router_z_loss_mlp": 0.32568359, + "step": 4213, + "time_per_iteration": 2.824300765991211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063471, + "balance_loss_mlp": 1.03147483, + "epoch": 0.8106964217006541, + "flos": 638658694656.0, + "grad_norm": 0.05372438091559442, + "language_loss": 0.85053265, + "learning_rate": 9.107029553743862e-05, + "loss": 0.86116743, + "num_input_tokens_seen": 349460320, + "router_z_loss_mlp": 0.31982422, + "step": 4214, + "time_per_iteration": 2.8542444705963135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061569, + "balance_loss_mlp": 1.02997828, + "epoch": 0.8108888033859176, + "flos": 579237225984.0, + "grad_norm": 0.06664809164052823, + "language_loss": 0.81298697, + "learning_rate": 9.089110784596672e-05, + "loss": 0.82360268, + "num_input_tokens_seen": 349527648, + "router_z_loss_mlp": 0.31567383, + "step": 4215, + "time_per_iteration": 2.680931329727173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062307, + "balance_loss_mlp": 1.03040659, + "epoch": 0.8110811850711812, + "flos": 559612371456.0, + "grad_norm": 0.04909557742541004, + "language_loss": 0.8349334, + "learning_rate": 9.071207898465284e-05, + "loss": 0.8455565, + "num_input_tokens_seen": 349606912, + "router_z_loss_mlp": 0.31884766, + "step": 4216, + "time_per_iteration": 2.774233102798462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012014, + "balance_loss_mlp": 1.00366914, + "epoch": 0.8112735667564448, + "flos": 1517160969216.0, + "grad_norm": 0.0037601729160873353, + "language_loss": 0.77260417, + "learning_rate": 9.053320902300205e-05, + "loss": 0.78272432, + "num_input_tokens_seen": 349827040, + "router_z_loss_mlp": 0.08349609, + "step": 4217, + "time_per_iteration": 4.66510534286499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065826, + "balance_loss_mlp": 1.03375852, + "epoch": 0.8114659484417084, + "flos": 616048270848.0, + "grad_norm": 0.06288981625786437, + "language_loss": 0.85243338, + "learning_rate": 9.035449803045792e-05, + "loss": 0.86309159, + "num_input_tokens_seen": 349900080, + "router_z_loss_mlp": 0.32055664, + "step": 4218, + "time_per_iteration": 2.800776958465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059611, + "balance_loss_mlp": 1.02835429, + "epoch": 0.8116583301269719, + "flos": 649624872960.0, + "grad_norm": 0.05605772970333741, + "language_loss": 0.78812903, + "learning_rate": 9.017594607640211e-05, + "loss": 0.79872513, + "num_input_tokens_seen": 349983568, + "router_z_loss_mlp": 0.31225586, + "step": 4219, + "time_per_iteration": 2.9232895374298096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067588, + "balance_loss_mlp": 1.0353061, + "epoch": 0.8118507118122354, + "flos": 552811806720.0, + "grad_norm": 0.055355121711081465, + "language_loss": 0.80514246, + "learning_rate": 8.999755323015463e-05, + "loss": 0.81581837, + "num_input_tokens_seen": 350054928, + "router_z_loss_mlp": 0.32275391, + "step": 4220, + "time_per_iteration": 2.668212413787842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063714, + "balance_loss_mlp": 1.03186107, + "epoch": 0.812043093497499, + "flos": 543854946816.0, + "grad_norm": 0.048800759009726725, + "language_loss": 0.87706423, + "learning_rate": 8.981931956097384e-05, + "loss": 0.88770139, + "num_input_tokens_seen": 350127872, + "router_z_loss_mlp": 0.31835938, + "step": 4221, + "time_per_iteration": 2.6155126094818115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065942, + "balance_loss_mlp": 1.03370762, + "epoch": 0.8122354751827626, + "flos": 583113268224.0, + "grad_norm": 0.04761865072569173, + "language_loss": 0.83265173, + "learning_rate": 8.964124513805628e-05, + "loss": 0.84331113, + "num_input_tokens_seen": 350206592, + "router_z_loss_mlp": 0.32226562, + "step": 4222, + "time_per_iteration": 2.774775505065918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013622, + "balance_loss_mlp": 1.00532508, + "epoch": 0.8124278568680262, + "flos": 1529747970048.0, + "grad_norm": 0.0043191347413629065, + "language_loss": 0.78250074, + "learning_rate": 8.94633300305363e-05, + "loss": 0.79263699, + "num_input_tokens_seen": 350436048, + "router_z_loss_mlp": 0.08300781, + "step": 4223, + "time_per_iteration": 4.961513519287109 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063048, + "balance_loss_mlp": 1.03012204, + "epoch": 0.8126202385532897, + "flos": 432640438272.0, + "grad_norm": 0.05869447426107839, + "language_loss": 0.80021381, + "learning_rate": 8.928557430748668e-05, + "loss": 0.8108443, + "num_input_tokens_seen": 350501376, + "router_z_loss_mlp": 0.3293457, + "step": 4224, + "time_per_iteration": 2.574615240097046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013546, + "balance_loss_mlp": 1.00524938, + "epoch": 0.8128126202385533, + "flos": 1547098665984.0, + "grad_norm": 0.004316183823014201, + "language_loss": 0.76495624, + "learning_rate": 8.910797803791854e-05, + "loss": 0.77509177, + "num_input_tokens_seen": 350735232, + "router_z_loss_mlp": 0.08300781, + "step": 4225, + "time_per_iteration": 4.941166639328003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063113, + "balance_loss_mlp": 1.03166568, + "epoch": 0.8130050019238169, + "flos": 528064026624.0, + "grad_norm": 0.05209839082843741, + "language_loss": 0.89101052, + "learning_rate": 8.893054129078077e-05, + "loss": 0.90164173, + "num_input_tokens_seen": 350805088, + "router_z_loss_mlp": 0.31420898, + "step": 4226, + "time_per_iteration": 2.6181585788726807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066672, + "balance_loss_mlp": 1.03441381, + "epoch": 0.8131973836090804, + "flos": 542850992640.0, + "grad_norm": 0.07190535116471519, + "language_loss": 0.80237639, + "learning_rate": 8.875326413496037e-05, + "loss": 0.81304312, + "num_input_tokens_seen": 350876896, + "router_z_loss_mlp": 0.32250977, + "step": 4227, + "time_per_iteration": 2.753739595413208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064339, + "balance_loss_mlp": 1.03219986, + "epoch": 0.8133897652943439, + "flos": 576223953408.0, + "grad_norm": 0.052554063935664315, + "language_loss": 0.82211399, + "learning_rate": 8.857614663928249e-05, + "loss": 0.83275741, + "num_input_tokens_seen": 350948400, + "router_z_loss_mlp": 0.32128906, + "step": 4228, + "time_per_iteration": 2.775632858276367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063809, + "balance_loss_mlp": 1.03245687, + "epoch": 0.8135821469796075, + "flos": 578937480192.0, + "grad_norm": 0.05954408838402955, + "language_loss": 0.78956014, + "learning_rate": 8.839918887251025e-05, + "loss": 0.80019832, + "num_input_tokens_seen": 351023328, + "router_z_loss_mlp": 0.31323242, + "step": 4229, + "time_per_iteration": 2.7360551357269287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063217, + "balance_loss_mlp": 1.0319128, + "epoch": 0.8137745286648711, + "flos": 650023543296.0, + "grad_norm": 0.049012494433050675, + "language_loss": 0.83987892, + "learning_rate": 8.822239090334472e-05, + "loss": 0.85051107, + "num_input_tokens_seen": 351108672, + "router_z_loss_mlp": 0.31274414, + "step": 4230, + "time_per_iteration": 2.9183671474456787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063188, + "balance_loss_mlp": 1.03076267, + "epoch": 0.8139669103501347, + "flos": 701579446272.0, + "grad_norm": 0.050773528250442714, + "language_loss": 0.7552613, + "learning_rate": 8.804575280042493e-05, + "loss": 0.7658931, + "num_input_tokens_seen": 351185056, + "router_z_loss_mlp": 0.32421875, + "step": 4231, + "time_per_iteration": 2.892979860305786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106874, + "balance_loss_mlp": 1.0362674, + "epoch": 0.8141592920353983, + "flos": 649933383168.0, + "grad_norm": 0.0634360445287057, + "language_loss": 0.83362955, + "learning_rate": 8.786927463232774e-05, + "loss": 0.84431696, + "num_input_tokens_seen": 351255856, + "router_z_loss_mlp": 0.32470703, + "step": 4232, + "time_per_iteration": 2.747666120529175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067521, + "balance_loss_mlp": 1.03521562, + "epoch": 0.8143516737206618, + "flos": 536577136128.0, + "grad_norm": 0.05551196564912533, + "language_loss": 0.81381476, + "learning_rate": 8.769295646756853e-05, + "loss": 0.82448995, + "num_input_tokens_seen": 351322336, + "router_z_loss_mlp": 0.32299805, + "step": 4233, + "time_per_iteration": 2.6436498165130615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063586, + "balance_loss_mlp": 1.03082705, + "epoch": 0.8145440554059253, + "flos": 508117667328.0, + "grad_norm": 0.050777916612580305, + "language_loss": 0.82274687, + "learning_rate": 8.751679837459963e-05, + "loss": 0.83338279, + "num_input_tokens_seen": 351387440, + "router_z_loss_mlp": 0.32763672, + "step": 4234, + "time_per_iteration": 2.5614001750946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066442, + "balance_loss_mlp": 1.03454125, + "epoch": 0.8147364370911889, + "flos": 634720043520.0, + "grad_norm": 0.05344342526685846, + "language_loss": 0.86183035, + "learning_rate": 8.734080042181181e-05, + "loss": 0.87249482, + "num_input_tokens_seen": 351464192, + "router_z_loss_mlp": 0.31884766, + "step": 4235, + "time_per_iteration": 2.8306832313537598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065616, + "balance_loss_mlp": 1.0336678, + "epoch": 0.8149288187764525, + "flos": 422576317440.0, + "grad_norm": 0.05510479621984829, + "language_loss": 0.7812373, + "learning_rate": 8.716496267753343e-05, + "loss": 0.79189348, + "num_input_tokens_seen": 351528016, + "router_z_loss_mlp": 0.31933594, + "step": 4236, + "time_per_iteration": 2.4926445484161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066427, + "balance_loss_mlp": 1.03512251, + "epoch": 0.8151212004617161, + "flos": 597150945792.0, + "grad_norm": 0.05070421466516423, + "language_loss": 0.81572199, + "learning_rate": 8.698928521003097e-05, + "loss": 0.82638621, + "num_input_tokens_seen": 351601648, + "router_z_loss_mlp": 0.31274414, + "step": 4237, + "time_per_iteration": 2.7581071853637695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015484, + "balance_loss_mlp": 1.00713933, + "epoch": 0.8153135821469796, + "flos": 1478563186176.0, + "grad_norm": 0.0058140182195042965, + "language_loss": 0.77852845, + "learning_rate": 8.681376808750835e-05, + "loss": 0.7886833, + "num_input_tokens_seen": 351826720, + "router_z_loss_mlp": 0.08349609, + "step": 4238, + "time_per_iteration": 4.994212627410889 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064499, + "balance_loss_mlp": 1.03255105, + "epoch": 0.8155059638322432, + "flos": 436870070784.0, + "grad_norm": 0.0778292766394767, + "language_loss": 0.82763624, + "learning_rate": 8.663841137810741e-05, + "loss": 0.83828127, + "num_input_tokens_seen": 351891760, + "router_z_loss_mlp": 0.31933594, + "step": 4239, + "time_per_iteration": 2.522731065750122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063565, + "balance_loss_mlp": 1.03149724, + "epoch": 0.8156983455175068, + "flos": 794034842112.0, + "grad_norm": 0.05452846456487443, + "language_loss": 0.85091472, + "learning_rate": 8.646321514990763e-05, + "loss": 0.86155033, + "num_input_tokens_seen": 351977504, + "router_z_loss_mlp": 0.32055664, + "step": 4240, + "time_per_iteration": 3.029479742050171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067898, + "balance_loss_mlp": 1.0353775, + "epoch": 0.8158907272027703, + "flos": 685685219328.0, + "grad_norm": 0.05011553315331007, + "language_loss": 0.81715024, + "learning_rate": 8.628817947092616e-05, + "loss": 0.82782924, + "num_input_tokens_seen": 352050176, + "router_z_loss_mlp": 0.32519531, + "step": 4241, + "time_per_iteration": 2.8111627101898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063687, + "balance_loss_mlp": 1.03140509, + "epoch": 0.8160831088880338, + "flos": 486812353536.0, + "grad_norm": 0.0676026015098648, + "language_loss": 0.84249878, + "learning_rate": 8.611330440911797e-05, + "loss": 0.85313565, + "num_input_tokens_seen": 352116848, + "router_z_loss_mlp": 0.32275391, + "step": 4242, + "time_per_iteration": 2.6123099327087402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066489, + "balance_loss_mlp": 1.03439796, + "epoch": 0.8162754905732974, + "flos": 464635505664.0, + "grad_norm": 0.05452366301324058, + "language_loss": 0.80174685, + "learning_rate": 8.593859003237558e-05, + "loss": 0.81241173, + "num_input_tokens_seen": 352185056, + "router_z_loss_mlp": 0.32080078, + "step": 4243, + "time_per_iteration": 2.628828287124634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014713, + "balance_loss_mlp": 1.00617814, + "epoch": 0.816467872258561, + "flos": 1238879577600.0, + "grad_norm": 0.006298566861144672, + "language_loss": 0.75285125, + "learning_rate": 8.576403640852904e-05, + "loss": 0.76299834, + "num_input_tokens_seen": 352397648, + "router_z_loss_mlp": 0.08544922, + "step": 4244, + "time_per_iteration": 4.721359014511108 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066399, + "balance_loss_mlp": 1.03497541, + "epoch": 0.8166602539438246, + "flos": 686862291456.0, + "grad_norm": 0.04683457091377271, + "language_loss": 0.86992514, + "learning_rate": 8.558964360534615e-05, + "loss": 0.88058907, + "num_input_tokens_seen": 352478272, + "router_z_loss_mlp": 0.31396484, + "step": 4245, + "time_per_iteration": 2.9027247428894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014799, + "balance_loss_mlp": 1.00626397, + "epoch": 0.8168526356290882, + "flos": 1489674779136.0, + "grad_norm": 0.006322038265945877, + "language_loss": 0.72974741, + "learning_rate": 8.541541169053219e-05, + "loss": 0.7398954, + "num_input_tokens_seen": 352707104, + "router_z_loss_mlp": 0.08544922, + "step": 4246, + "time_per_iteration": 4.944149017333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063972, + "balance_loss_mlp": 1.03269172, + "epoch": 0.8170450173143516, + "flos": 577927733760.0, + "grad_norm": 0.07058072733424971, + "language_loss": 0.84509909, + "learning_rate": 8.524134073172984e-05, + "loss": 0.85573876, + "num_input_tokens_seen": 352779248, + "router_z_loss_mlp": 0.3125, + "step": 4247, + "time_per_iteration": 2.7045445442199707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066255, + "balance_loss_mlp": 1.03476024, + "epoch": 0.8172373989996152, + "flos": 570985984512.0, + "grad_norm": 0.10797119683150772, + "language_loss": 0.84231156, + "learning_rate": 8.506743079651974e-05, + "loss": 0.85297412, + "num_input_tokens_seen": 352856784, + "router_z_loss_mlp": 0.31469727, + "step": 4248, + "time_per_iteration": 2.775876760482788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066682, + "balance_loss_mlp": 1.03528225, + "epoch": 0.8174297806848788, + "flos": 528576178176.0, + "grad_norm": 0.05516186993789668, + "language_loss": 0.80867159, + "learning_rate": 8.489368195241948e-05, + "loss": 0.81933844, + "num_input_tokens_seen": 352926496, + "router_z_loss_mlp": 0.3137207, + "step": 4249, + "time_per_iteration": 2.6222243309020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064974, + "balance_loss_mlp": 1.03300214, + "epoch": 0.8176221623701424, + "flos": 568819514880.0, + "grad_norm": 0.05332500176787013, + "language_loss": 0.78964639, + "learning_rate": 8.47200942668846e-05, + "loss": 0.80029613, + "num_input_tokens_seen": 353005312, + "router_z_loss_mlp": 0.31958008, + "step": 4250, + "time_per_iteration": 2.788813829421997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062759, + "balance_loss_mlp": 1.0308584, + "epoch": 0.8178145440554059, + "flos": 656226178560.0, + "grad_norm": 0.0759018879957778, + "language_loss": 0.80471349, + "learning_rate": 8.454666780730735e-05, + "loss": 0.81534111, + "num_input_tokens_seen": 353085120, + "router_z_loss_mlp": 0.31884766, + "step": 4251, + "time_per_iteration": 2.870534896850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064692, + "balance_loss_mlp": 1.03293502, + "epoch": 0.8180069257406695, + "flos": 545643095040.0, + "grad_norm": 0.05126338036877886, + "language_loss": 0.87462568, + "learning_rate": 8.437340264101828e-05, + "loss": 0.88527262, + "num_input_tokens_seen": 353160992, + "router_z_loss_mlp": 0.31738281, + "step": 4252, + "time_per_iteration": 2.7038302421569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067347, + "balance_loss_mlp": 1.0348742, + "epoch": 0.818199307425933, + "flos": 618987350016.0, + "grad_norm": 0.059044445551817724, + "language_loss": 0.84837854, + "learning_rate": 8.420029883528474e-05, + "loss": 0.85905206, + "num_input_tokens_seen": 353233328, + "router_z_loss_mlp": 0.32470703, + "step": 4253, + "time_per_iteration": 2.713452100753784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062773, + "balance_loss_mlp": 1.03082526, + "epoch": 0.8183916891111966, + "flos": 647291077632.0, + "grad_norm": 0.08799557625316533, + "language_loss": 0.77167904, + "learning_rate": 8.402735645731157e-05, + "loss": 0.78230679, + "num_input_tokens_seen": 353310592, + "router_z_loss_mlp": 0.31933594, + "step": 4254, + "time_per_iteration": 2.958004951477051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065309, + "balance_loss_mlp": 1.03305101, + "epoch": 0.8185840707964602, + "flos": 498875618304.0, + "grad_norm": 0.05690863085587787, + "language_loss": 0.78464049, + "learning_rate": 8.385457557424098e-05, + "loss": 0.79529357, + "num_input_tokens_seen": 353376544, + "router_z_loss_mlp": 0.32250977, + "step": 4255, + "time_per_iteration": 2.570077896118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061679, + "balance_loss_mlp": 1.0300889, + "epoch": 0.8187764524817237, + "flos": 785885497344.0, + "grad_norm": 0.049659571224279345, + "language_loss": 0.79487193, + "learning_rate": 8.368195625315251e-05, + "loss": 0.80548877, + "num_input_tokens_seen": 353461200, + "router_z_loss_mlp": 0.31567383, + "step": 4256, + "time_per_iteration": 3.077765464782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066604, + "balance_loss_mlp": 1.03422654, + "epoch": 0.8189688341669873, + "flos": 550443105792.0, + "grad_norm": 0.048188663930783165, + "language_loss": 0.80790627, + "learning_rate": 8.350949856106283e-05, + "loss": 0.81857234, + "num_input_tokens_seen": 353538608, + "router_z_loss_mlp": 0.32373047, + "step": 4257, + "time_per_iteration": 2.7946553230285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013006, + "balance_loss_mlp": 1.00451815, + "epoch": 0.8191612158522509, + "flos": 1351247837184.0, + "grad_norm": 0.004934638976943409, + "language_loss": 0.71149343, + "learning_rate": 8.333720256492599e-05, + "loss": 0.72162348, + "num_input_tokens_seen": 353766960, + "router_z_loss_mlp": 0.08496094, + "step": 4258, + "time_per_iteration": 4.824636697769165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064916, + "balance_loss_mlp": 1.03308725, + "epoch": 0.8193535975375145, + "flos": 543997541376.0, + "grad_norm": 0.061566617876948435, + "language_loss": 0.83655453, + "learning_rate": 8.316506833163318e-05, + "loss": 0.84720367, + "num_input_tokens_seen": 353833552, + "router_z_loss_mlp": 0.31811523, + "step": 4259, + "time_per_iteration": 2.652660846710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061066, + "balance_loss_mlp": 1.02978587, + "epoch": 0.8195459792227779, + "flos": 865361014272.0, + "grad_norm": 0.04568174068597017, + "language_loss": 0.85518008, + "learning_rate": 8.299309592801297e-05, + "loss": 0.86579072, + "num_input_tokens_seen": 353915520, + "router_z_loss_mlp": 0.3125, + "step": 4260, + "time_per_iteration": 3.0853793621063232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066817, + "balance_loss_mlp": 1.03479743, + "epoch": 0.8197383609080415, + "flos": 569015953920.0, + "grad_norm": 0.0626065931580156, + "language_loss": 0.81053776, + "learning_rate": 8.282128542083101e-05, + "loss": 0.82120585, + "num_input_tokens_seen": 353992048, + "router_z_loss_mlp": 0.32006836, + "step": 4261, + "time_per_iteration": 2.69217848777771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064059, + "balance_loss_mlp": 1.03232574, + "epoch": 0.8199307425933051, + "flos": 530546208768.0, + "grad_norm": 0.051394925331661186, + "language_loss": 0.84971988, + "learning_rate": 8.264963687678978e-05, + "loss": 0.86036044, + "num_input_tokens_seen": 354064848, + "router_z_loss_mlp": 0.31713867, + "step": 4262, + "time_per_iteration": 2.63204288482666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066158, + "balance_loss_mlp": 1.03342354, + "epoch": 0.8201231242785687, + "flos": 566781083136.0, + "grad_norm": 0.047344222679516414, + "language_loss": 0.85058379, + "learning_rate": 8.247815036252921e-05, + "loss": 0.86124539, + "num_input_tokens_seen": 354138848, + "router_z_loss_mlp": 0.32739258, + "step": 4263, + "time_per_iteration": 2.725421905517578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064288, + "balance_loss_mlp": 1.03281677, + "epoch": 0.8203155059638323, + "flos": 1230037913088.0, + "grad_norm": 0.057193475382645485, + "language_loss": 0.83039153, + "learning_rate": 8.230682594462652e-05, + "loss": 0.84103441, + "num_input_tokens_seen": 354227696, + "router_z_loss_mlp": 0.31445312, + "step": 4264, + "time_per_iteration": 3.5159566402435303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064005, + "balance_loss_mlp": 1.03148425, + "epoch": 0.8205078876490958, + "flos": 573929445888.0, + "grad_norm": 0.11088210201568381, + "language_loss": 0.7990979, + "learning_rate": 8.213566368959558e-05, + "loss": 0.80973792, + "num_input_tokens_seen": 354298400, + "router_z_loss_mlp": 0.32519531, + "step": 4265, + "time_per_iteration": 2.6384832859039307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064287, + "balance_loss_mlp": 1.03198123, + "epoch": 0.8207002693343594, + "flos": 931005467136.0, + "grad_norm": 0.05472798836871878, + "language_loss": 0.78268075, + "learning_rate": 8.196466366388744e-05, + "loss": 0.79332364, + "num_input_tokens_seen": 354385024, + "router_z_loss_mlp": 0.32299805, + "step": 4266, + "time_per_iteration": 3.189403533935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066434, + "balance_loss_mlp": 1.03419971, + "epoch": 0.8208926510196229, + "flos": 549300939264.0, + "grad_norm": 0.05160040567241084, + "language_loss": 0.80250126, + "learning_rate": 8.179382593389029e-05, + "loss": 0.81316555, + "num_input_tokens_seen": 354456384, + "router_z_loss_mlp": 0.32226562, + "step": 4267, + "time_per_iteration": 2.6618669033050537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067589, + "balance_loss_mlp": 1.03592706, + "epoch": 0.8210850327048865, + "flos": 647876012544.0, + "grad_norm": 0.06251068315447797, + "language_loss": 0.81684315, + "learning_rate": 8.162315056592918e-05, + "loss": 0.827519, + "num_input_tokens_seen": 354531296, + "router_z_loss_mlp": 0.31640625, + "step": 4268, + "time_per_iteration": 2.8171610832214355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063634, + "balance_loss_mlp": 1.03230619, + "epoch": 0.82127741439015, + "flos": 601227809280.0, + "grad_norm": 0.04884521994961647, + "language_loss": 0.81217563, + "learning_rate": 8.145263762626615e-05, + "loss": 0.82281196, + "num_input_tokens_seen": 354605680, + "router_z_loss_mlp": 0.31298828, + "step": 4269, + "time_per_iteration": 2.7463865280151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060593, + "balance_loss_mlp": 1.02955055, + "epoch": 0.8214697960754136, + "flos": 474577380864.0, + "grad_norm": 0.08185779566994117, + "language_loss": 0.83571124, + "learning_rate": 8.128228718110015e-05, + "loss": 0.84631717, + "num_input_tokens_seen": 354678160, + "router_z_loss_mlp": 0.31005859, + "step": 4270, + "time_per_iteration": 2.7095983028411865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062123, + "balance_loss_mlp": 1.03058004, + "epoch": 0.8216621777606772, + "flos": 903288084480.0, + "grad_norm": 0.054129185570740465, + "language_loss": 0.84336656, + "learning_rate": 8.11120992965671e-05, + "loss": 0.85398781, + "num_input_tokens_seen": 354751024, + "router_z_loss_mlp": 0.31518555, + "step": 4271, + "time_per_iteration": 3.082247257232666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064562, + "balance_loss_mlp": 1.03232741, + "epoch": 0.8218545594459408, + "flos": 514203849216.0, + "grad_norm": 0.04982374432283012, + "language_loss": 0.81935704, + "learning_rate": 8.094207403873998e-05, + "loss": 0.83000261, + "num_input_tokens_seen": 354819408, + "router_z_loss_mlp": 0.32226562, + "step": 4272, + "time_per_iteration": 2.5820279121398926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064387, + "balance_loss_mlp": 1.0330354, + "epoch": 0.8220469411312044, + "flos": 494282221056.0, + "grad_norm": 0.0504374120036925, + "language_loss": 0.85846829, + "learning_rate": 8.077221147362829e-05, + "loss": 0.86911225, + "num_input_tokens_seen": 354887376, + "router_z_loss_mlp": 0.31323242, + "step": 4273, + "time_per_iteration": 2.584005355834961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064804, + "balance_loss_mlp": 1.0325458, + "epoch": 0.8222393228164678, + "flos": 386223579648.0, + "grad_norm": 0.057889479724380286, + "language_loss": 0.89503336, + "learning_rate": 8.060251166717835e-05, + "loss": 0.90568137, + "num_input_tokens_seen": 354948288, + "router_z_loss_mlp": 0.32250977, + "step": 4274, + "time_per_iteration": 2.382969379425049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065829, + "balance_loss_mlp": 1.03388083, + "epoch": 0.8224317045017314, + "flos": 536331234816.0, + "grad_norm": 0.054333881103771485, + "language_loss": 0.86981678, + "learning_rate": 8.043297468527383e-05, + "loss": 0.88047504, + "num_input_tokens_seen": 355016912, + "router_z_loss_mlp": 0.31933594, + "step": 4275, + "time_per_iteration": 2.648383855819702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062123, + "balance_loss_mlp": 1.0305084, + "epoch": 0.822624086186995, + "flos": 554637832704.0, + "grad_norm": 0.06589028036614175, + "language_loss": 0.82127655, + "learning_rate": 8.02636005937346e-05, + "loss": 0.83189774, + "num_input_tokens_seen": 355085936, + "router_z_loss_mlp": 0.31591797, + "step": 4276, + "time_per_iteration": 2.634843587875366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060304, + "balance_loss_mlp": 1.02861845, + "epoch": 0.8228164678722586, + "flos": 539296455168.0, + "grad_norm": 0.04688779470500233, + "language_loss": 0.79762036, + "learning_rate": 8.009438945831771e-05, + "loss": 0.80822337, + "num_input_tokens_seen": 355161984, + "router_z_loss_mlp": 0.31665039, + "step": 4277, + "time_per_iteration": 2.6846394538879395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106328, + "balance_loss_mlp": 1.0314033, + "epoch": 0.8230088495575221, + "flos": 473001638400.0, + "grad_norm": 0.04881634129650232, + "language_loss": 0.79050267, + "learning_rate": 7.992534134471641e-05, + "loss": 0.80113542, + "num_input_tokens_seen": 355234544, + "router_z_loss_mlp": 0.31860352, + "step": 4278, + "time_per_iteration": 2.6381237506866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065439, + "balance_loss_mlp": 1.03332353, + "epoch": 0.8232012312427857, + "flos": 591403797504.0, + "grad_norm": 0.06782075009284445, + "language_loss": 0.82754999, + "learning_rate": 7.975645631856127e-05, + "loss": 0.83820438, + "num_input_tokens_seen": 355302896, + "router_z_loss_mlp": 0.32104492, + "step": 4279, + "time_per_iteration": 2.653944969177246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065308, + "balance_loss_mlp": 1.03345561, + "epoch": 0.8233936129280492, + "flos": 572359495680.0, + "grad_norm": 0.04992965269342151, + "language_loss": 0.7469328, + "learning_rate": 7.958773444541916e-05, + "loss": 0.75758588, + "num_input_tokens_seen": 355377040, + "router_z_loss_mlp": 0.31835938, + "step": 4280, + "time_per_iteration": 2.7473456859588623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066109, + "balance_loss_mlp": 1.03375518, + "epoch": 0.8235859946133128, + "flos": 730986052608.0, + "grad_norm": 0.041840919211567525, + "language_loss": 0.78355992, + "learning_rate": 7.941917579079383e-05, + "loss": 0.79422104, + "num_input_tokens_seen": 355461616, + "router_z_loss_mlp": 0.32348633, + "step": 4281, + "time_per_iteration": 3.0071098804473877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066616, + "balance_loss_mlp": 1.03485882, + "epoch": 0.8237783762985764, + "flos": 570044639232.0, + "grad_norm": 0.056480020127142934, + "language_loss": 0.81059593, + "learning_rate": 7.92507804201253e-05, + "loss": 0.82126206, + "num_input_tokens_seen": 355532480, + "router_z_loss_mlp": 0.31738281, + "step": 4282, + "time_per_iteration": 2.6841516494750977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004566, + "balance_loss_mlp": 0.99603093, + "epoch": 0.8239707579838399, + "flos": 1465437740544.0, + "grad_norm": 0.009020840107608945, + "language_loss": 0.75297678, + "learning_rate": 7.908254839879092e-05, + "loss": 0.76302242, + "num_input_tokens_seen": 355768752, + "router_z_loss_mlp": 0.08544922, + "step": 4283, + "time_per_iteration": 5.020637512207031 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062901, + "balance_loss_mlp": 1.03035665, + "epoch": 0.8241631396691035, + "flos": 467068225536.0, + "grad_norm": 0.08084450716262151, + "language_loss": 0.8038317, + "learning_rate": 7.89144797921037e-05, + "loss": 0.81446069, + "num_input_tokens_seen": 355838800, + "router_z_loss_mlp": 0.32543945, + "step": 4284, + "time_per_iteration": 2.624439001083374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01002031, + "balance_loss_mlp": 0.99354351, + "epoch": 0.8243555213543671, + "flos": 1538648165376.0, + "grad_norm": 0.010143921205099075, + "language_loss": 0.77934271, + "learning_rate": 7.874657466531388e-05, + "loss": 0.78936303, + "num_input_tokens_seen": 356069280, + "router_z_loss_mlp": 0.08496094, + "step": 4285, + "time_per_iteration": 4.928821086883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061224, + "balance_loss_mlp": 1.02958596, + "epoch": 0.8245479030396307, + "flos": 797072845824.0, + "grad_norm": 0.044332510336863265, + "language_loss": 0.82397342, + "learning_rate": 7.85788330836078e-05, + "loss": 0.83458561, + "num_input_tokens_seen": 356164528, + "router_z_loss_mlp": 0.31616211, + "step": 4286, + "time_per_iteration": 3.14546537399292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063052, + "balance_loss_mlp": 1.03150964, + "epoch": 0.8247402847248941, + "flos": 645793910784.0, + "grad_norm": 0.045147974337775294, + "language_loss": 0.76310241, + "learning_rate": 7.841125511210878e-05, + "loss": 0.77373296, + "num_input_tokens_seen": 356243600, + "router_z_loss_mlp": 0.31518555, + "step": 4287, + "time_per_iteration": 2.898638963699341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064179, + "balance_loss_mlp": 1.03242135, + "epoch": 0.8249326664101577, + "flos": 604123218432.0, + "grad_norm": 0.06674407920035312, + "language_loss": 0.79493982, + "learning_rate": 7.824384081587637e-05, + "loss": 0.80558157, + "num_input_tokens_seen": 356320320, + "router_z_loss_mlp": 0.31738281, + "step": 4288, + "time_per_iteration": 2.7821269035339355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064292, + "balance_loss_mlp": 1.03186679, + "epoch": 0.8251250480954213, + "flos": 824006034432.0, + "grad_norm": 0.06583021420859499, + "language_loss": 0.86280286, + "learning_rate": 7.807659025990637e-05, + "loss": 0.87344575, + "num_input_tokens_seen": 356406928, + "router_z_loss_mlp": 0.32421875, + "step": 4289, + "time_per_iteration": 3.1109914779663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060406, + "balance_loss_mlp": 1.02848136, + "epoch": 0.8253174297806849, + "flos": 757060853760.0, + "grad_norm": 0.058384222358934625, + "language_loss": 0.77839482, + "learning_rate": 7.790950350913112e-05, + "loss": 0.78899884, + "num_input_tokens_seen": 356481456, + "router_z_loss_mlp": 0.3190918, + "step": 4290, + "time_per_iteration": 2.9492805004119873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106367, + "balance_loss_mlp": 1.03117323, + "epoch": 0.8255098114659485, + "flos": 794090096640.0, + "grad_norm": 0.05143125962292425, + "language_loss": 0.87082183, + "learning_rate": 7.774258062841971e-05, + "loss": 0.88145852, + "num_input_tokens_seen": 356568736, + "router_z_loss_mlp": 0.32495117, + "step": 4291, + "time_per_iteration": 3.2069146633148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062351, + "balance_loss_mlp": 1.03111804, + "epoch": 0.825702193151212, + "flos": 710102730240.0, + "grad_norm": 0.12023426920878982, + "language_loss": 0.77597296, + "learning_rate": 7.757582168257731e-05, + "loss": 0.78659642, + "num_input_tokens_seen": 356643328, + "router_z_loss_mlp": 0.31201172, + "step": 4292, + "time_per_iteration": 2.850250244140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061486, + "balance_loss_mlp": 1.02918029, + "epoch": 0.8258945748364755, + "flos": 683076409344.0, + "grad_norm": 0.048890635163102686, + "language_loss": 0.80603421, + "learning_rate": 7.740922673634537e-05, + "loss": 0.81664902, + "num_input_tokens_seen": 356723824, + "router_z_loss_mlp": 0.32299805, + "step": 4293, + "time_per_iteration": 2.910806179046631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060982, + "balance_loss_mlp": 1.02924871, + "epoch": 0.8260869565217391, + "flos": 594284649984.0, + "grad_norm": 0.05591520410462236, + "language_loss": 0.78902394, + "learning_rate": 7.724279585440186e-05, + "loss": 0.7996338, + "num_input_tokens_seen": 356796512, + "router_z_loss_mlp": 0.31713867, + "step": 4294, + "time_per_iteration": 2.704671859741211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060187, + "balance_loss_mlp": 1.02816761, + "epoch": 0.8262793382070027, + "flos": 651189030912.0, + "grad_norm": 0.04861629765656741, + "language_loss": 0.84998047, + "learning_rate": 7.707652910136098e-05, + "loss": 0.86058241, + "num_input_tokens_seen": 356868624, + "router_z_loss_mlp": 0.32006836, + "step": 4295, + "time_per_iteration": 2.823174238204956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060095, + "balance_loss_mlp": 1.02807498, + "epoch": 0.8264717198922663, + "flos": 538665030144.0, + "grad_norm": 0.05706956652691896, + "language_loss": 0.84780651, + "learning_rate": 7.691042654177315e-05, + "loss": 0.85840744, + "num_input_tokens_seen": 356934368, + "router_z_loss_mlp": 0.32006836, + "step": 4296, + "time_per_iteration": 2.6320347785949707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061015, + "balance_loss_mlp": 1.02949572, + "epoch": 0.8266641015775298, + "flos": 538689761280.0, + "grad_norm": 0.053774403970765, + "language_loss": 0.74997044, + "learning_rate": 7.674448824012514e-05, + "loss": 0.7605806, + "num_input_tokens_seen": 357005536, + "router_z_loss_mlp": 0.31494141, + "step": 4297, + "time_per_iteration": 2.6300857067108154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064228, + "balance_loss_mlp": 1.03199387, + "epoch": 0.8268564832627934, + "flos": 585077506560.0, + "grad_norm": 0.046756565982589225, + "language_loss": 0.84059066, + "learning_rate": 7.657871426083979e-05, + "loss": 0.85123295, + "num_input_tokens_seen": 357082160, + "router_z_loss_mlp": 0.32226562, + "step": 4298, + "time_per_iteration": 2.7552220821380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056678, + "balance_loss_mlp": 1.02449179, + "epoch": 0.827048864948057, + "flos": 430434680832.0, + "grad_norm": 0.061322212544335376, + "language_loss": 0.83956921, + "learning_rate": 7.641310466827667e-05, + "loss": 0.85013604, + "num_input_tokens_seen": 357146928, + "router_z_loss_mlp": 0.32177734, + "step": 4299, + "time_per_iteration": 2.44634747505188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063761, + "balance_loss_mlp": 1.03214669, + "epoch": 0.8272412466333205, + "flos": 1387915181568.0, + "grad_norm": 0.051483866399296904, + "language_loss": 0.85022169, + "learning_rate": 7.624765952673069e-05, + "loss": 0.86085927, + "num_input_tokens_seen": 357236768, + "router_z_loss_mlp": 0.31591797, + "step": 4300, + "time_per_iteration": 3.7406394481658936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061571, + "balance_loss_mlp": 1.02978992, + "epoch": 0.827433628318584, + "flos": 537952057344.0, + "grad_norm": 0.06027744747687877, + "language_loss": 0.82495129, + "learning_rate": 7.608237890043335e-05, + "loss": 0.835567, + "num_input_tokens_seen": 357307568, + "router_z_loss_mlp": 0.31762695, + "step": 4301, + "time_per_iteration": 2.707937479019165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062303, + "balance_loss_mlp": 1.0309273, + "epoch": 0.8276260100038476, + "flos": 730404089856.0, + "grad_norm": 0.051103301822031, + "language_loss": 0.77301157, + "learning_rate": 7.59172628535526e-05, + "loss": 0.78363454, + "num_input_tokens_seen": 357387712, + "router_z_loss_mlp": 0.31347656, + "step": 4302, + "time_per_iteration": 2.938246488571167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062788, + "balance_loss_mlp": 1.0308876, + "epoch": 0.8278183916891112, + "flos": 870713874432.0, + "grad_norm": 0.042056828562468666, + "language_loss": 0.82270902, + "learning_rate": 7.575231145019196e-05, + "loss": 0.83333695, + "num_input_tokens_seen": 357473360, + "router_z_loss_mlp": 0.31884766, + "step": 4303, + "time_per_iteration": 3.2142274379730225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064009, + "balance_loss_mlp": 1.03122663, + "epoch": 0.8280107733743748, + "flos": 594255536640.0, + "grad_norm": 0.05242827706405345, + "language_loss": 0.77579129, + "learning_rate": 7.558752475439134e-05, + "loss": 0.78643137, + "num_input_tokens_seen": 357548432, + "router_z_loss_mlp": 0.32788086, + "step": 4304, + "time_per_iteration": 2.814234972000122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061162, + "balance_loss_mlp": 1.02957141, + "epoch": 0.8282031550596384, + "flos": 768253994496.0, + "grad_norm": 0.059346298032511166, + "language_loss": 0.84176564, + "learning_rate": 7.542290283012653e-05, + "loss": 0.85237724, + "num_input_tokens_seen": 357625968, + "router_z_loss_mlp": 0.31567383, + "step": 4305, + "time_per_iteration": 3.011970281600952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062221, + "balance_loss_mlp": 1.0302968, + "epoch": 0.8283955367449019, + "flos": 695775481344.0, + "grad_norm": 0.050284408342684474, + "language_loss": 0.77980834, + "learning_rate": 7.525844574130947e-05, + "loss": 0.79043055, + "num_input_tokens_seen": 357705824, + "router_z_loss_mlp": 0.3190918, + "step": 4306, + "time_per_iteration": 2.8967409133911133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062089, + "balance_loss_mlp": 1.0303793, + "epoch": 0.8285879184301654, + "flos": 660304452096.0, + "grad_norm": 0.051650533406218724, + "language_loss": 0.82582647, + "learning_rate": 7.509415355178806e-05, + "loss": 0.83644736, + "num_input_tokens_seen": 357787040, + "router_z_loss_mlp": 0.31689453, + "step": 4307, + "time_per_iteration": 2.919273853302002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060613, + "balance_loss_mlp": 1.02835536, + "epoch": 0.828780300115429, + "flos": 558444063744.0, + "grad_norm": 0.06767226386788444, + "language_loss": 0.78127337, + "learning_rate": 7.493002632534618e-05, + "loss": 0.79187953, + "num_input_tokens_seen": 357856960, + "router_z_loss_mlp": 0.32250977, + "step": 4308, + "time_per_iteration": 2.6678566932678223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063076, + "balance_loss_mlp": 1.03141367, + "epoch": 0.8289726818006926, + "flos": 830613132288.0, + "grad_norm": 0.05551844794996671, + "language_loss": 0.81721139, + "learning_rate": 7.476606412570352e-05, + "loss": 0.82784212, + "num_input_tokens_seen": 357937760, + "router_z_loss_mlp": 0.31640625, + "step": 4309, + "time_per_iteration": 3.086724042892456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063148, + "balance_loss_mlp": 1.03050852, + "epoch": 0.8291650634859561, + "flos": 731974040064.0, + "grad_norm": 0.06581932804634757, + "language_loss": 0.80861235, + "learning_rate": 7.460226701651624e-05, + "loss": 0.81924385, + "num_input_tokens_seen": 358012480, + "router_z_loss_mlp": 0.32641602, + "step": 4310, + "time_per_iteration": 2.8933186531066895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067077, + "balance_loss_mlp": 1.03384113, + "epoch": 0.8293574451712197, + "flos": 860521715712.0, + "grad_norm": 0.04893141587643439, + "language_loss": 0.81319165, + "learning_rate": 7.443863506137566e-05, + "loss": 0.82386243, + "num_input_tokens_seen": 358100720, + "router_z_loss_mlp": 0.33251953, + "step": 4311, + "time_per_iteration": 3.168560743331909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064421, + "balance_loss_mlp": 1.03168607, + "epoch": 0.8295498268564833, + "flos": 494874358272.0, + "grad_norm": 0.0449610760938366, + "language_loss": 0.81700766, + "learning_rate": 7.427516832380948e-05, + "loss": 0.82765186, + "num_input_tokens_seen": 358180496, + "router_z_loss_mlp": 0.32739258, + "step": 4312, + "time_per_iteration": 2.8094916343688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060381, + "balance_loss_mlp": 1.02867162, + "epoch": 0.8297422085417469, + "flos": 554176553472.0, + "grad_norm": 0.04659314008447996, + "language_loss": 0.777403, + "learning_rate": 7.4111866867281e-05, + "loss": 0.78800684, + "num_input_tokens_seen": 358261104, + "router_z_loss_mlp": 0.31689453, + "step": 4313, + "time_per_iteration": 2.776169538497925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060377, + "balance_loss_mlp": 1.02849996, + "epoch": 0.8299345902270104, + "flos": 1247001523200.0, + "grad_norm": 0.04777485610881539, + "language_loss": 0.77525687, + "learning_rate": 7.39487307551896e-05, + "loss": 0.78586066, + "num_input_tokens_seen": 358356368, + "router_z_loss_mlp": 0.31860352, + "step": 4314, + "time_per_iteration": 3.6645400524139404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063672, + "balance_loss_mlp": 1.03217673, + "epoch": 0.8301269719122739, + "flos": 584974199808.0, + "grad_norm": 0.056657764932407616, + "language_loss": 0.83212584, + "learning_rate": 7.378576005087034e-05, + "loss": 0.84276259, + "num_input_tokens_seen": 358429104, + "router_z_loss_mlp": 0.31469727, + "step": 4315, + "time_per_iteration": 2.713848352432251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060295, + "balance_loss_mlp": 1.02834654, + "epoch": 0.8303193535975375, + "flos": 509472239616.0, + "grad_norm": 0.057881745487426015, + "language_loss": 0.84784532, + "learning_rate": 7.362295481759412e-05, + "loss": 0.85844827, + "num_input_tokens_seen": 358501344, + "router_z_loss_mlp": 0.31933594, + "step": 4316, + "time_per_iteration": 2.6434786319732666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106228, + "balance_loss_mlp": 1.03061819, + "epoch": 0.8305117352828011, + "flos": 580375010304.0, + "grad_norm": 0.06045853162415408, + "language_loss": 0.83559334, + "learning_rate": 7.346031511856722e-05, + "loss": 0.84621614, + "num_input_tokens_seen": 358575584, + "router_z_loss_mlp": 0.31640625, + "step": 4317, + "time_per_iteration": 2.6957998275756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060074, + "balance_loss_mlp": 1.02750635, + "epoch": 0.8307041169680647, + "flos": 481372153344.0, + "grad_norm": 0.047579165766144, + "language_loss": 0.78932106, + "learning_rate": 7.329784101693232e-05, + "loss": 0.79992181, + "num_input_tokens_seen": 358644304, + "router_z_loss_mlp": 0.32568359, + "step": 4318, + "time_per_iteration": 2.626657724380493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060149, + "balance_loss_mlp": 1.0276053, + "epoch": 0.8308964986533282, + "flos": 624319861248.0, + "grad_norm": 0.06476981139443477, + "language_loss": 0.82870758, + "learning_rate": 7.313553257576727e-05, + "loss": 0.8393091, + "num_input_tokens_seen": 358712384, + "router_z_loss_mlp": 0.32543945, + "step": 4319, + "time_per_iteration": 2.7071280479431152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059715, + "balance_loss_mlp": 1.02738571, + "epoch": 0.8310888803385917, + "flos": 826974226944.0, + "grad_norm": 0.05413916081766935, + "language_loss": 0.78281611, + "learning_rate": 7.297338985808589e-05, + "loss": 0.79341328, + "num_input_tokens_seen": 358789264, + "router_z_loss_mlp": 0.32324219, + "step": 4320, + "time_per_iteration": 2.990934133529663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106001, + "balance_loss_mlp": 1.02813339, + "epoch": 0.8312812620238553, + "flos": 583443537408.0, + "grad_norm": 0.04487229329770065, + "language_loss": 0.8192122, + "learning_rate": 7.281141292683746e-05, + "loss": 0.82981229, + "num_input_tokens_seen": 358868976, + "router_z_loss_mlp": 0.31860352, + "step": 4321, + "time_per_iteration": 2.77132248878479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060374, + "balance_loss_mlp": 1.02818751, + "epoch": 0.8314736437091189, + "flos": 1115165560320.0, + "grad_norm": 0.05537156492885857, + "language_loss": 0.7462157, + "learning_rate": 7.26496018449071e-05, + "loss": 0.75681943, + "num_input_tokens_seen": 358953600, + "router_z_loss_mlp": 0.32177734, + "step": 4322, + "time_per_iteration": 3.414076328277588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060533, + "balance_loss_mlp": 1.0283463, + "epoch": 0.8316660253943825, + "flos": 517295697408.0, + "grad_norm": 0.051649264825651166, + "language_loss": 0.81687033, + "learning_rate": 7.248795667511543e-05, + "loss": 0.82747567, + "num_input_tokens_seen": 359028768, + "router_z_loss_mlp": 0.32177734, + "step": 4323, + "time_per_iteration": 2.763958692550659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061491, + "balance_loss_mlp": 1.02944708, + "epoch": 0.831858407079646, + "flos": 794989334016.0, + "grad_norm": 0.059163760563895655, + "language_loss": 0.77817202, + "learning_rate": 7.232647748021864e-05, + "loss": 0.78878695, + "num_input_tokens_seen": 359116208, + "router_z_loss_mlp": 0.3203125, + "step": 4324, + "time_per_iteration": 2.988997459411621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106194, + "balance_loss_mlp": 1.02980113, + "epoch": 0.8320507887649096, + "flos": 549699609600.0, + "grad_norm": 0.05611218346767701, + "language_loss": 0.83145595, + "learning_rate": 7.216516432290843e-05, + "loss": 0.84207541, + "num_input_tokens_seen": 359189552, + "router_z_loss_mlp": 0.32128906, + "step": 4325, + "time_per_iteration": 2.6479220390319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061886, + "balance_loss_mlp": 1.03029585, + "epoch": 0.8322431704501732, + "flos": 479160603648.0, + "grad_norm": 0.06615664891911413, + "language_loss": 0.81790996, + "learning_rate": 7.20040172658123e-05, + "loss": 0.82852876, + "num_input_tokens_seen": 359253008, + "router_z_loss_mlp": 0.31567383, + "step": 4326, + "time_per_iteration": 2.569667339324951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060893, + "balance_loss_mlp": 1.029374, + "epoch": 0.8324355521354367, + "flos": 572157264384.0, + "grad_norm": 0.05577163656635302, + "language_loss": 0.85195124, + "learning_rate": 7.184303637149308e-05, + "loss": 0.86256015, + "num_input_tokens_seen": 359326368, + "router_z_loss_mlp": 0.31494141, + "step": 4327, + "time_per_iteration": 2.7528669834136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059731, + "balance_loss_mlp": 1.02763987, + "epoch": 0.8326279338207002, + "flos": 503208557568.0, + "grad_norm": 0.044215822441669876, + "language_loss": 0.82180458, + "learning_rate": 7.168222170244888e-05, + "loss": 0.83240187, + "num_input_tokens_seen": 359394192, + "router_z_loss_mlp": 0.32080078, + "step": 4328, + "time_per_iteration": 2.6244540214538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061835, + "balance_loss_mlp": 1.02972031, + "epoch": 0.8328203155059638, + "flos": 605442885120.0, + "grad_norm": 0.054206971603661426, + "language_loss": 0.81084836, + "learning_rate": 7.152157332111364e-05, + "loss": 0.82146668, + "num_input_tokens_seen": 359476016, + "router_z_loss_mlp": 0.32104492, + "step": 4329, + "time_per_iteration": 2.9348015785217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060654, + "balance_loss_mlp": 1.02818131, + "epoch": 0.8330126971912274, + "flos": 697469087232.0, + "grad_norm": 0.04560103027765181, + "language_loss": 0.85918784, + "learning_rate": 7.136109128985663e-05, + "loss": 0.86979437, + "num_input_tokens_seen": 359554048, + "router_z_loss_mlp": 0.32470703, + "step": 4330, + "time_per_iteration": 2.9106035232543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062656, + "balance_loss_mlp": 1.03032613, + "epoch": 0.833205078876491, + "flos": 493799182848.0, + "grad_norm": 0.05705382865518944, + "language_loss": 0.86475688, + "learning_rate": 7.120077567098249e-05, + "loss": 0.8753835, + "num_input_tokens_seen": 359621440, + "router_z_loss_mlp": 0.32324219, + "step": 4331, + "time_per_iteration": 2.5757062435150146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064064, + "balance_loss_mlp": 1.03237844, + "epoch": 0.8333974605617546, + "flos": 482568164352.0, + "grad_norm": 0.06444033025960733, + "language_loss": 0.82880324, + "learning_rate": 7.104062652673115e-05, + "loss": 0.83944392, + "num_input_tokens_seen": 359690320, + "router_z_loss_mlp": 0.31665039, + "step": 4332, + "time_per_iteration": 2.6238632202148438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060115, + "balance_loss_mlp": 1.02788115, + "epoch": 0.833589842247018, + "flos": 686517465600.0, + "grad_norm": 0.059078619019291526, + "language_loss": 0.82772213, + "learning_rate": 7.088064391927818e-05, + "loss": 0.8383233, + "num_input_tokens_seen": 359759888, + "router_z_loss_mlp": 0.32226562, + "step": 4333, + "time_per_iteration": 2.832642078399658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061254, + "balance_loss_mlp": 1.02868593, + "epoch": 0.8337822239322816, + "flos": 881377486848.0, + "grad_norm": 0.05463560677088328, + "language_loss": 0.82398927, + "learning_rate": 7.072082791073419e-05, + "loss": 0.83460188, + "num_input_tokens_seen": 359836544, + "router_z_loss_mlp": 0.32568359, + "step": 4334, + "time_per_iteration": 3.058436632156372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062782, + "balance_loss_mlp": 1.03023815, + "epoch": 0.8339746056175452, + "flos": 496940493312.0, + "grad_norm": 0.05988457548558227, + "language_loss": 0.82327098, + "learning_rate": 7.056117856314531e-05, + "loss": 0.83389878, + "num_input_tokens_seen": 359903024, + "router_z_loss_mlp": 0.32543945, + "step": 4335, + "time_per_iteration": 2.6513023376464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059506, + "balance_loss_mlp": 1.02810621, + "epoch": 0.8341669873028088, + "flos": 510244849152.0, + "grad_norm": 0.06293164805467606, + "language_loss": 0.85905898, + "learning_rate": 7.040169593849289e-05, + "loss": 0.869654, + "num_input_tokens_seen": 359971200, + "router_z_loss_mlp": 0.3137207, + "step": 4336, + "time_per_iteration": 2.5953714847564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060837, + "balance_loss_mlp": 1.02924645, + "epoch": 0.8343593689880723, + "flos": 692017302528.0, + "grad_norm": 0.05109928618703078, + "language_loss": 0.84164715, + "learning_rate": 7.024238009869366e-05, + "loss": 0.85225552, + "num_input_tokens_seen": 360042560, + "router_z_loss_mlp": 0.31567383, + "step": 4337, + "time_per_iteration": 2.83786940574646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061215, + "balance_loss_mlp": 1.02909958, + "epoch": 0.8345517506733359, + "flos": 552132329472.0, + "grad_norm": 0.0511172744686772, + "language_loss": 0.78007007, + "learning_rate": 7.008323110559956e-05, + "loss": 0.79068226, + "num_input_tokens_seen": 360118048, + "router_z_loss_mlp": 0.32104492, + "step": 4338, + "time_per_iteration": 2.7188031673431396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063317, + "balance_loss_mlp": 1.03082061, + "epoch": 0.8347441323585995, + "flos": 591750033408.0, + "grad_norm": 0.060812686933994074, + "language_loss": 0.7611599, + "learning_rate": 6.992424902099754e-05, + "loss": 0.77179301, + "num_input_tokens_seen": 360192528, + "router_z_loss_mlp": 0.32495117, + "step": 4339, + "time_per_iteration": 2.7962260246276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060806, + "balance_loss_mlp": 1.02838063, + "epoch": 0.834936514043863, + "flos": 614625297408.0, + "grad_norm": 0.05105659953358199, + "language_loss": 0.84234512, + "learning_rate": 6.976543390660983e-05, + "loss": 0.85295308, + "num_input_tokens_seen": 360266880, + "router_z_loss_mlp": 0.32421875, + "step": 4340, + "time_per_iteration": 2.727919101715088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061694, + "balance_loss_mlp": 1.03050888, + "epoch": 0.8351288957291266, + "flos": 467590551552.0, + "grad_norm": 0.05876177315186617, + "language_loss": 0.79671931, + "learning_rate": 6.960678582409424e-05, + "loss": 0.80733621, + "num_input_tokens_seen": 360336336, + "router_z_loss_mlp": 0.31152344, + "step": 4341, + "time_per_iteration": 2.5811197757720947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064328, + "balance_loss_mlp": 1.03276157, + "epoch": 0.8353212774143901, + "flos": 509063394816.0, + "grad_norm": 0.0467408399393633, + "language_loss": 0.78745788, + "learning_rate": 6.944830483504328e-05, + "loss": 0.79810113, + "num_input_tokens_seen": 360409776, + "router_z_loss_mlp": 0.31542969, + "step": 4342, + "time_per_iteration": 2.6592719554901123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061639, + "balance_loss_mlp": 1.02957189, + "epoch": 0.8355136590996537, + "flos": 687477749760.0, + "grad_norm": 0.047752406673021996, + "language_loss": 0.80783325, + "learning_rate": 6.928999100098483e-05, + "loss": 0.81844962, + "num_input_tokens_seen": 360486800, + "router_z_loss_mlp": 0.32055664, + "step": 4343, + "time_per_iteration": 2.8339645862579346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060012, + "balance_loss_mlp": 1.0284456, + "epoch": 0.8357060407849173, + "flos": 984019249152.0, + "grad_norm": 0.06155280246036442, + "language_loss": 0.83912945, + "learning_rate": 6.913184438338138e-05, + "loss": 0.84972966, + "num_input_tokens_seen": 360568624, + "router_z_loss_mlp": 0.31542969, + "step": 4344, + "time_per_iteration": 3.216474771499634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063293, + "balance_loss_mlp": 1.03132069, + "epoch": 0.8358984224701809, + "flos": 842657458176.0, + "grad_norm": 0.0551356082042311, + "language_loss": 0.84887034, + "learning_rate": 6.89738650436313e-05, + "loss": 0.85950327, + "num_input_tokens_seen": 360652384, + "router_z_loss_mlp": 0.31958008, + "step": 4345, + "time_per_iteration": 3.195633888244629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060346, + "balance_loss_mlp": 1.02849305, + "epoch": 0.8360908041554445, + "flos": 625945065984.0, + "grad_norm": 0.047281454191363835, + "language_loss": 0.81882936, + "learning_rate": 6.881605304306748e-05, + "loss": 0.82943279, + "num_input_tokens_seen": 360723200, + "router_z_loss_mlp": 0.31835938, + "step": 4346, + "time_per_iteration": 2.7578022480010986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061434, + "balance_loss_mlp": 1.02884197, + "epoch": 0.8362831858407079, + "flos": 575781613056.0, + "grad_norm": 0.047125075234917546, + "language_loss": 0.84707719, + "learning_rate": 6.865840844295796e-05, + "loss": 0.85769153, + "num_input_tokens_seen": 360798240, + "router_z_loss_mlp": 0.32592773, + "step": 4347, + "time_per_iteration": 2.7195725440979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063205, + "balance_loss_mlp": 1.03120947, + "epoch": 0.8364755675259715, + "flos": 833434348032.0, + "grad_norm": 0.05954085482957289, + "language_loss": 0.80601609, + "learning_rate": 6.850093130450569e-05, + "loss": 0.81664807, + "num_input_tokens_seen": 360873552, + "router_z_loss_mlp": 0.31982422, + "step": 4348, + "time_per_iteration": 3.0620546340942383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061293, + "balance_loss_mlp": 1.02963078, + "epoch": 0.8366679492112351, + "flos": 582211210752.0, + "grad_norm": 0.0624278254364128, + "language_loss": 0.86158174, + "learning_rate": 6.834362168884912e-05, + "loss": 0.87219471, + "num_input_tokens_seen": 360940800, + "router_z_loss_mlp": 0.31665039, + "step": 4349, + "time_per_iteration": 2.67112135887146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066667, + "balance_loss_mlp": 1.03438473, + "epoch": 0.8368603308964987, + "flos": 611434524672.0, + "grad_norm": 0.06986465121037809, + "language_loss": 0.87439007, + "learning_rate": 6.818647965706076e-05, + "loss": 0.88505673, + "num_input_tokens_seen": 361014368, + "router_z_loss_mlp": 0.32275391, + "step": 4350, + "time_per_iteration": 2.751300573348999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061351, + "balance_loss_mlp": 1.03030932, + "epoch": 0.8370527125817622, + "flos": 507014788608.0, + "grad_norm": 0.04892721462190198, + "language_loss": 0.85628557, + "learning_rate": 6.802950527014884e-05, + "loss": 0.86689907, + "num_input_tokens_seen": 361087184, + "router_z_loss_mlp": 0.31005859, + "step": 4351, + "time_per_iteration": 2.7105777263641357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062361, + "balance_loss_mlp": 1.03017473, + "epoch": 0.8372450942670258, + "flos": 770621285376.0, + "grad_norm": 0.06534150847957279, + "language_loss": 0.82583368, + "learning_rate": 6.787269858905603e-05, + "loss": 0.83645725, + "num_input_tokens_seen": 361160720, + "router_z_loss_mlp": 0.32177734, + "step": 4352, + "time_per_iteration": 2.9064080715179443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065324, + "balance_loss_mlp": 1.03306603, + "epoch": 0.8374374759522893, + "flos": 579005881344.0, + "grad_norm": 0.05598661276707463, + "language_loss": 0.84989685, + "learning_rate": 6.771605967466033e-05, + "loss": 0.86055005, + "num_input_tokens_seen": 361234432, + "router_z_loss_mlp": 0.32250977, + "step": 4353, + "time_per_iteration": 2.6632039546966553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061526, + "balance_loss_mlp": 1.0297451, + "epoch": 0.8376298576375529, + "flos": 787781334528.0, + "grad_norm": 0.06243401457394196, + "language_loss": 0.82518673, + "learning_rate": 6.755958858777434e-05, + "loss": 0.83580196, + "num_input_tokens_seen": 361309376, + "router_z_loss_mlp": 0.31762695, + "step": 4354, + "time_per_iteration": 2.9677281379699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010626, + "balance_loss_mlp": 1.03024638, + "epoch": 0.8378222393228165, + "flos": 577337006592.0, + "grad_norm": 0.10666459553025696, + "language_loss": 0.80665678, + "learning_rate": 6.74032853891452e-05, + "loss": 0.8172828, + "num_input_tokens_seen": 361386768, + "router_z_loss_mlp": 0.32348633, + "step": 4355, + "time_per_iteration": 2.7135212421417236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063218, + "balance_loss_mlp": 1.03084075, + "epoch": 0.83801462100808, + "flos": 480618482688.0, + "grad_norm": 0.05787052388359443, + "language_loss": 0.81662172, + "learning_rate": 6.724715013945548e-05, + "loss": 0.82725382, + "num_input_tokens_seen": 361456704, + "router_z_loss_mlp": 0.32373047, + "step": 4356, + "time_per_iteration": 2.58859920501709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061947, + "balance_loss_mlp": 1.03021395, + "epoch": 0.8382070026933436, + "flos": 550523091456.0, + "grad_norm": 0.060545576710462894, + "language_loss": 0.89191318, + "learning_rate": 6.709118289932226e-05, + "loss": 0.90253264, + "num_input_tokens_seen": 361533648, + "router_z_loss_mlp": 0.31713867, + "step": 4357, + "time_per_iteration": 2.770278215408325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063858, + "balance_loss_mlp": 1.032125, + "epoch": 0.8383993843786072, + "flos": 624655922688.0, + "grad_norm": 0.06062312450424789, + "language_loss": 0.81920969, + "learning_rate": 6.693538372929725e-05, + "loss": 0.82984829, + "num_input_tokens_seen": 361614256, + "router_z_loss_mlp": 0.31713867, + "step": 4358, + "time_per_iteration": 2.9120824337005615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064332, + "balance_loss_mlp": 1.031955, + "epoch": 0.8385917660638708, + "flos": 490928504832.0, + "grad_norm": 0.050165446506244216, + "language_loss": 0.86263275, + "learning_rate": 6.677975268986719e-05, + "loss": 0.87327605, + "num_input_tokens_seen": 361679008, + "router_z_loss_mlp": 0.32373047, + "step": 4359, + "time_per_iteration": 2.5493242740631104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064879, + "balance_loss_mlp": 1.0325731, + "epoch": 0.8387841477491342, + "flos": 466659380736.0, + "grad_norm": 0.05193023362700978, + "language_loss": 0.87059301, + "learning_rate": 6.662428984145336e-05, + "loss": 0.8812418, + "num_input_tokens_seen": 361747600, + "router_z_loss_mlp": 0.32299805, + "step": 4360, + "time_per_iteration": 2.61664080619812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006292, + "balance_loss_mlp": 0.99775666, + "epoch": 0.8389765294343978, + "flos": 1563339128832.0, + "grad_norm": 0.00919489122759599, + "language_loss": 0.71780187, + "learning_rate": 6.646899524441175e-05, + "loss": 0.7278648, + "num_input_tokens_seen": 361983104, + "router_z_loss_mlp": 0.08544922, + "step": 4361, + "time_per_iteration": 5.009884357452393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060823, + "balance_loss_mlp": 1.02956581, + "epoch": 0.8391689111196614, + "flos": 601849059840.0, + "grad_norm": 0.04367937475787612, + "language_loss": 0.83063507, + "learning_rate": 6.631386895903308e-05, + "loss": 0.84124339, + "num_input_tokens_seen": 362065824, + "router_z_loss_mlp": 0.31225586, + "step": 4362, + "time_per_iteration": 2.8306806087493896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106322, + "balance_loss_mlp": 1.0315814, + "epoch": 0.839361292804925, + "flos": 442818040320.0, + "grad_norm": 0.052955552359322186, + "language_loss": 0.79883057, + "learning_rate": 6.615891104554261e-05, + "loss": 0.80946279, + "num_input_tokens_seen": 362128240, + "router_z_loss_mlp": 0.31616211, + "step": 4363, + "time_per_iteration": 2.479904890060425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062818, + "balance_loss_mlp": 1.02994013, + "epoch": 0.8395536744901886, + "flos": 593885979648.0, + "grad_norm": 0.04635728781901914, + "language_loss": 0.82487506, + "learning_rate": 6.600412156410057e-05, + "loss": 0.83550322, + "num_input_tokens_seen": 362198256, + "router_z_loss_mlp": 0.32885742, + "step": 4364, + "time_per_iteration": 2.7378604412078857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059223, + "balance_loss_mlp": 1.02803802, + "epoch": 0.8397460561754521, + "flos": 889462812672.0, + "grad_norm": 0.058311818484936, + "language_loss": 0.84599465, + "learning_rate": 6.58495005748016e-05, + "loss": 0.85658687, + "num_input_tokens_seen": 362279792, + "router_z_loss_mlp": 0.31152344, + "step": 4365, + "time_per_iteration": 3.1387250423431396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066412, + "balance_loss_mlp": 1.0339396, + "epoch": 0.8399384378607156, + "flos": 553239590400.0, + "grad_norm": 0.07373316547529772, + "language_loss": 0.88838422, + "learning_rate": 6.569504813767463e-05, + "loss": 0.89904833, + "num_input_tokens_seen": 362351712, + "router_z_loss_mlp": 0.32470703, + "step": 4366, + "time_per_iteration": 2.594947576522827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106208, + "balance_loss_mlp": 1.02955997, + "epoch": 0.8401308195459792, + "flos": 518664826368.0, + "grad_norm": 0.04997889714704866, + "language_loss": 0.83415538, + "learning_rate": 6.554076431268341e-05, + "loss": 0.84477615, + "num_input_tokens_seen": 362423424, + "router_z_loss_mlp": 0.32519531, + "step": 4367, + "time_per_iteration": 2.6347951889038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063314, + "balance_loss_mlp": 1.0316515, + "epoch": 0.8403232012312428, + "flos": 684593925120.0, + "grad_norm": 0.058557481945258026, + "language_loss": 0.81210721, + "learning_rate": 6.538664915972648e-05, + "loss": 0.82274044, + "num_input_tokens_seen": 362514704, + "router_z_loss_mlp": 0.31640625, + "step": 4368, + "time_per_iteration": 3.0035693645477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064984, + "balance_loss_mlp": 1.03313088, + "epoch": 0.8405155829165063, + "flos": 577424346624.0, + "grad_norm": 0.05437483826569731, + "language_loss": 0.77127528, + "learning_rate": 6.523270273863652e-05, + "loss": 0.78192508, + "num_input_tokens_seen": 362581296, + "router_z_loss_mlp": 0.31835938, + "step": 4369, + "time_per_iteration": 2.682255983352661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065622, + "balance_loss_mlp": 1.03350711, + "epoch": 0.8407079646017699, + "flos": 456393028608.0, + "grad_norm": 0.059338918722140754, + "language_loss": 0.87703532, + "learning_rate": 6.507892510918079e-05, + "loss": 0.88769156, + "num_input_tokens_seen": 362648304, + "router_z_loss_mlp": 0.32104492, + "step": 4370, + "time_per_iteration": 2.5616416931152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062128, + "balance_loss_mlp": 1.03032303, + "epoch": 0.8409003462870335, + "flos": 534647803392.0, + "grad_norm": 0.06484553532482988, + "language_loss": 0.81577432, + "learning_rate": 6.492531633106114e-05, + "loss": 0.82639563, + "num_input_tokens_seen": 362721264, + "router_z_loss_mlp": 0.31787109, + "step": 4371, + "time_per_iteration": 2.758342981338501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063458, + "balance_loss_mlp": 1.03165269, + "epoch": 0.8410927279722971, + "flos": 556475443200.0, + "grad_norm": 0.060016335878015956, + "language_loss": 0.77717382, + "learning_rate": 6.477187646391374e-05, + "loss": 0.78780836, + "num_input_tokens_seen": 362795312, + "router_z_loss_mlp": 0.31787109, + "step": 4372, + "time_per_iteration": 2.6928319931030273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006701, + "balance_loss_mlp": 0.99816585, + "epoch": 0.8412851096575606, + "flos": 1548963979776.0, + "grad_norm": 0.007825134945211695, + "language_loss": 0.77679121, + "learning_rate": 6.461860556730925e-05, + "loss": 0.7868582, + "num_input_tokens_seen": 363026272, + "router_z_loss_mlp": 0.08544922, + "step": 4373, + "time_per_iteration": 4.882466793060303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062183, + "balance_loss_mlp": 1.03075981, + "epoch": 0.8414774913428241, + "flos": 551777329152.0, + "grad_norm": 0.0528843207904054, + "language_loss": 0.78680658, + "learning_rate": 6.446550370075271e-05, + "loss": 0.79742843, + "num_input_tokens_seen": 363098384, + "router_z_loss_mlp": 0.31396484, + "step": 4374, + "time_per_iteration": 2.6880640983581543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066012, + "balance_loss_mlp": 1.03404009, + "epoch": 0.8416698730280877, + "flos": 572752373760.0, + "grad_norm": 0.0786696469695305, + "language_loss": 0.77104962, + "learning_rate": 6.431257092368336e-05, + "loss": 0.78170967, + "num_input_tokens_seen": 363170960, + "router_z_loss_mlp": 0.31958008, + "step": 4375, + "time_per_iteration": 2.669539213180542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061463, + "balance_loss_mlp": 1.02946734, + "epoch": 0.8418622547133513, + "flos": 758405251584.0, + "grad_norm": 0.05603873808243246, + "language_loss": 0.79780394, + "learning_rate": 6.415980729547543e-05, + "loss": 0.80841863, + "num_input_tokens_seen": 363242000, + "router_z_loss_mlp": 0.31982422, + "step": 4376, + "time_per_iteration": 2.902561664581299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064279, + "balance_loss_mlp": 1.03223574, + "epoch": 0.8420546363986149, + "flos": 1073717448192.0, + "grad_norm": 0.08072292873132886, + "language_loss": 0.72504145, + "learning_rate": 6.40072128754366e-05, + "loss": 0.73568422, + "num_input_tokens_seen": 363340288, + "router_z_loss_mlp": 0.3203125, + "step": 4377, + "time_per_iteration": 3.4048268795013428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064955, + "balance_loss_mlp": 1.03353119, + "epoch": 0.8422470180838784, + "flos": 525632716800.0, + "grad_norm": 0.05128837603691623, + "language_loss": 0.8274287, + "learning_rate": 6.385478772280933e-05, + "loss": 0.83807814, + "num_input_tokens_seen": 363416208, + "router_z_loss_mlp": 0.31396484, + "step": 4378, + "time_per_iteration": 2.7541863918304443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061165, + "balance_loss_mlp": 1.02928829, + "epoch": 0.842439399769142, + "flos": 600552714240.0, + "grad_norm": 0.07361287537672946, + "language_loss": 0.82019341, + "learning_rate": 6.370253189677038e-05, + "loss": 0.83080506, + "num_input_tokens_seen": 363492864, + "router_z_loss_mlp": 0.31860352, + "step": 4379, + "time_per_iteration": 2.717729330062866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062548, + "balance_loss_mlp": 1.03007579, + "epoch": 0.8426317814544055, + "flos": 551935890432.0, + "grad_norm": 0.05730022029420674, + "language_loss": 0.86337304, + "learning_rate": 6.355044545643073e-05, + "loss": 0.87399852, + "num_input_tokens_seen": 363572000, + "router_z_loss_mlp": 0.32470703, + "step": 4380, + "time_per_iteration": 2.8673453330993652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059699, + "balance_loss_mlp": 1.02810872, + "epoch": 0.8428241631396691, + "flos": 678531064320.0, + "grad_norm": 0.05645772213934178, + "language_loss": 0.77814853, + "learning_rate": 6.33985284608356e-05, + "loss": 0.78874558, + "num_input_tokens_seen": 363646480, + "router_z_loss_mlp": 0.31567383, + "step": 4381, + "time_per_iteration": 2.797013759613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066522, + "balance_loss_mlp": 1.03512263, + "epoch": 0.8430165448249327, + "flos": 753365131776.0, + "grad_norm": 0.06324404810028335, + "language_loss": 0.79739416, + "learning_rate": 6.324678096896435e-05, + "loss": 0.80805933, + "num_input_tokens_seen": 363737552, + "router_z_loss_mlp": 0.3137207, + "step": 4382, + "time_per_iteration": 3.068904399871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061548, + "balance_loss_mlp": 1.03002882, + "epoch": 0.8432089265101962, + "flos": 698817867264.0, + "grad_norm": 0.052650923658821146, + "language_loss": 0.80755234, + "learning_rate": 6.30952030397306e-05, + "loss": 0.81816781, + "num_input_tokens_seen": 363816016, + "router_z_loss_mlp": 0.31518555, + "step": 4383, + "time_per_iteration": 2.910621166229248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064072, + "balance_loss_mlp": 1.03200483, + "epoch": 0.8434013081954598, + "flos": 485513035776.0, + "grad_norm": 0.0532202077177964, + "language_loss": 0.8463192, + "learning_rate": 6.294379473198208e-05, + "loss": 0.85695994, + "num_input_tokens_seen": 363888192, + "router_z_loss_mlp": 0.32055664, + "step": 4384, + "time_per_iteration": 2.6428587436676025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061742, + "balance_loss_mlp": 1.03017521, + "epoch": 0.8435936898807234, + "flos": 520372988928.0, + "grad_norm": 0.05180994761233764, + "language_loss": 0.85464537, + "learning_rate": 6.279255610450068e-05, + "loss": 0.86526275, + "num_input_tokens_seen": 363953904, + "router_z_loss_mlp": 0.31542969, + "step": 4385, + "time_per_iteration": 2.6025028228759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062937, + "balance_loss_mlp": 1.03146529, + "epoch": 0.843786071565987, + "flos": 785604690432.0, + "grad_norm": 0.05052030046697327, + "language_loss": 0.80340213, + "learning_rate": 6.264148721600254e-05, + "loss": 0.81403148, + "num_input_tokens_seen": 364031552, + "router_z_loss_mlp": 0.31445312, + "step": 4386, + "time_per_iteration": 2.9918956756591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003602, + "balance_loss_mlp": 0.99516225, + "epoch": 0.8439784532512504, + "flos": 1445472442368.0, + "grad_norm": 0.005014150757635798, + "language_loss": 0.75836509, + "learning_rate": 6.24905881251378e-05, + "loss": 0.76840121, + "num_input_tokens_seen": 364256480, + "router_z_loss_mlp": 0.08447266, + "step": 4387, + "time_per_iteration": 4.906704664230347 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062352, + "balance_loss_mlp": 1.03026116, + "epoch": 0.844170834936514, + "flos": 708384393216.0, + "grad_norm": 0.0608021946942925, + "language_loss": 0.82278877, + "learning_rate": 6.23398588904906e-05, + "loss": 0.83341229, + "num_input_tokens_seen": 364329696, + "router_z_loss_mlp": 0.32080078, + "step": 4388, + "time_per_iteration": 2.8366615772247314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062411, + "balance_loss_mlp": 1.03039169, + "epoch": 0.8443632166217776, + "flos": 483183622656.0, + "grad_norm": 0.06339245452093885, + "language_loss": 0.79611492, + "learning_rate": 6.218929957057922e-05, + "loss": 0.80673903, + "num_input_tokens_seen": 364400944, + "router_z_loss_mlp": 0.32006836, + "step": 4389, + "time_per_iteration": 2.6750078201293945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064765, + "balance_loss_mlp": 1.03262651, + "epoch": 0.8445555983070412, + "flos": 678388469760.0, + "grad_norm": 0.10070557471224953, + "language_loss": 0.80328929, + "learning_rate": 6.2038910223856e-05, + "loss": 0.81393689, + "num_input_tokens_seen": 364475744, + "router_z_loss_mlp": 0.32128906, + "step": 4390, + "time_per_iteration": 2.855630874633789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062723, + "balance_loss_mlp": 1.03094149, + "epoch": 0.8447479799923048, + "flos": 741143305728.0, + "grad_norm": 0.055868322560849205, + "language_loss": 0.74313754, + "learning_rate": 6.18886909087073e-05, + "loss": 0.75376475, + "num_input_tokens_seen": 364557248, + "router_z_loss_mlp": 0.31762695, + "step": 4391, + "time_per_iteration": 2.9666664600372314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061502, + "balance_loss_mlp": 1.02984023, + "epoch": 0.8449403616775683, + "flos": 952897125888.0, + "grad_norm": 0.08754318552441484, + "language_loss": 0.80220729, + "learning_rate": 6.173864168345344e-05, + "loss": 0.81282234, + "num_input_tokens_seen": 364647856, + "router_z_loss_mlp": 0.31640625, + "step": 4392, + "time_per_iteration": 3.2447478771209717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106655, + "balance_loss_mlp": 1.03405356, + "epoch": 0.8451327433628318, + "flos": 657054042624.0, + "grad_norm": 0.061100470877336555, + "language_loss": 0.72091293, + "learning_rate": 6.158876260634871e-05, + "loss": 0.73157841, + "num_input_tokens_seen": 364728848, + "router_z_loss_mlp": 0.32495117, + "step": 4393, + "time_per_iteration": 2.870314598083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062012, + "balance_loss_mlp": 1.03011155, + "epoch": 0.8453251250480954, + "flos": 445880775168.0, + "grad_norm": 0.056584995707747415, + "language_loss": 0.83570069, + "learning_rate": 6.143905373558112e-05, + "loss": 0.84632081, + "num_input_tokens_seen": 364794032, + "router_z_loss_mlp": 0.31884766, + "step": 4394, + "time_per_iteration": 2.5169620513916016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065302, + "balance_loss_mlp": 1.0335449, + "epoch": 0.845517506733359, + "flos": 542491610112.0, + "grad_norm": 0.06893065847374383, + "language_loss": 0.70728701, + "learning_rate": 6.128951512927305e-05, + "loss": 0.71794009, + "num_input_tokens_seen": 364868624, + "router_z_loss_mlp": 0.31738281, + "step": 4395, + "time_per_iteration": 2.6561121940612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061554, + "balance_loss_mlp": 1.02962959, + "epoch": 0.8457098884186226, + "flos": 502175490048.0, + "grad_norm": 0.0520725454143225, + "language_loss": 0.84400153, + "learning_rate": 6.114014684548046e-05, + "loss": 0.854617, + "num_input_tokens_seen": 364938208, + "router_z_loss_mlp": 0.3190918, + "step": 4396, + "time_per_iteration": 2.607789993286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065204, + "balance_loss_mlp": 1.03335178, + "epoch": 0.8459022701038861, + "flos": 448643764224.0, + "grad_norm": 0.06588987467547251, + "language_loss": 0.79514521, + "learning_rate": 6.099094894219326e-05, + "loss": 0.80579728, + "num_input_tokens_seen": 365009440, + "router_z_loss_mlp": 0.31835938, + "step": 4397, + "time_per_iteration": 2.749403953552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105971, + "balance_loss_mlp": 1.02838171, + "epoch": 0.8460946517891497, + "flos": 742855850496.0, + "grad_norm": 0.053779165984043746, + "language_loss": 0.74806583, + "learning_rate": 6.0841921477335194e-05, + "loss": 0.75866288, + "num_input_tokens_seen": 365085904, + "router_z_loss_mlp": 0.31298828, + "step": 4398, + "time_per_iteration": 2.9234650135040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060698, + "balance_loss_mlp": 1.02929795, + "epoch": 0.8462870334744133, + "flos": 552939844608.0, + "grad_norm": 0.053512872285819295, + "language_loss": 0.79614019, + "learning_rate": 6.069306450876389e-05, + "loss": 0.80674708, + "num_input_tokens_seen": 365163600, + "router_z_loss_mlp": 0.3137207, + "step": 4399, + "time_per_iteration": 2.733107089996338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01000293, + "balance_loss_mlp": 0.99199617, + "epoch": 0.8464794151596768, + "flos": 1564033162752.0, + "grad_norm": 0.008637773298876451, + "language_loss": 0.81708568, + "learning_rate": 6.054437809427071e-05, + "loss": 0.82708859, + "num_input_tokens_seen": 365384528, + "router_z_loss_mlp": 0.08300781, + "step": 4400, + "time_per_iteration": 4.845052719116211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062975, + "balance_loss_mlp": 1.03066921, + "epoch": 0.8466717968449403, + "flos": 549930954240.0, + "grad_norm": 0.05555564096889626, + "language_loss": 0.79862118, + "learning_rate": 6.039586229158084e-05, + "loss": 0.80925095, + "num_input_tokens_seen": 365453760, + "router_z_loss_mlp": 0.32299805, + "step": 4401, + "time_per_iteration": 2.6397857666015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062133, + "balance_loss_mlp": 1.02954078, + "epoch": 0.8468641785302039, + "flos": 551625970176.0, + "grad_norm": 0.05665481648862394, + "language_loss": 0.84565353, + "learning_rate": 6.024751715835314e-05, + "loss": 0.85627484, + "num_input_tokens_seen": 365532416, + "router_z_loss_mlp": 0.32592773, + "step": 4402, + "time_per_iteration": 2.751540422439575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060788, + "balance_loss_mlp": 1.02891159, + "epoch": 0.8470565602154675, + "flos": 572384226816.0, + "grad_norm": 0.05542155513246256, + "language_loss": 0.8710115, + "learning_rate": 6.009934275218049e-05, + "loss": 0.88161939, + "num_input_tokens_seen": 365603776, + "router_z_loss_mlp": 0.31860352, + "step": 4403, + "time_per_iteration": 2.699852705001831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063, + "balance_loss_mlp": 1.0309329, + "epoch": 0.8472489419007311, + "flos": 472597175808.0, + "grad_norm": 0.06255636958013598, + "language_loss": 0.84244227, + "learning_rate": 5.995133913058936e-05, + "loss": 0.85307229, + "num_input_tokens_seen": 365670432, + "router_z_loss_mlp": 0.32055664, + "step": 4404, + "time_per_iteration": 2.5569348335266113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059261, + "balance_loss_mlp": 1.02776599, + "epoch": 0.8474413235859947, + "flos": 797682511872.0, + "grad_norm": 0.06074469603252205, + "language_loss": 0.79307783, + "learning_rate": 5.980350635103954e-05, + "loss": 0.80367047, + "num_input_tokens_seen": 365741584, + "router_z_loss_mlp": 0.31469727, + "step": 4405, + "time_per_iteration": 2.9716339111328125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066394, + "balance_loss_mlp": 1.03525686, + "epoch": 0.8476337052712581, + "flos": 502130409984.0, + "grad_norm": 0.05964249854785047, + "language_loss": 0.80296123, + "learning_rate": 5.9655844470924866e-05, + "loss": 0.81362522, + "num_input_tokens_seen": 365805344, + "router_z_loss_mlp": 0.31103516, + "step": 4406, + "time_per_iteration": 2.5619914531707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061086, + "balance_loss_mlp": 1.02990091, + "epoch": 0.8478260869565217, + "flos": 931586019840.0, + "grad_norm": 0.047034388507541325, + "language_loss": 0.82831132, + "learning_rate": 5.9508353547573e-05, + "loss": 0.83892226, + "num_input_tokens_seen": 365890976, + "router_z_loss_mlp": 0.31152344, + "step": 4407, + "time_per_iteration": 3.201432228088379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062926, + "balance_loss_mlp": 1.03207469, + "epoch": 0.8480184686417853, + "flos": 708502256640.0, + "grad_norm": 0.0520483514476875, + "language_loss": 0.80806863, + "learning_rate": 5.9361033638244855e-05, + "loss": 0.81869787, + "num_input_tokens_seen": 365968912, + "router_z_loss_mlp": 0.30810547, + "step": 4408, + "time_per_iteration": 2.8537440299987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061734, + "balance_loss_mlp": 1.03047752, + "epoch": 0.8482108503270489, + "flos": 614152433664.0, + "grad_norm": 0.048904900371612575, + "language_loss": 0.82296753, + "learning_rate": 5.9213884800135066e-05, + "loss": 0.8335849, + "num_input_tokens_seen": 366047680, + "router_z_loss_mlp": 0.31225586, + "step": 4409, + "time_per_iteration": 2.8240151405334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063069, + "balance_loss_mlp": 1.03200269, + "epoch": 0.8484032320123124, + "flos": 530752822272.0, + "grad_norm": 0.12602145996095604, + "language_loss": 0.82197714, + "learning_rate": 5.906690709037194e-05, + "loss": 0.83260781, + "num_input_tokens_seen": 366118720, + "router_z_loss_mlp": 0.31030273, + "step": 4410, + "time_per_iteration": 2.600715398788452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01001074, + "balance_loss_mlp": 0.99291962, + "epoch": 0.848595613697576, + "flos": 1541930508288.0, + "grad_norm": 0.0076180940143389065, + "language_loss": 0.76296914, + "learning_rate": 5.892010056601726e-05, + "loss": 0.77297986, + "num_input_tokens_seen": 366346928, + "router_z_loss_mlp": 0.08154297, + "step": 4411, + "time_per_iteration": 4.879023551940918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061447, + "balance_loss_mlp": 1.03061938, + "epoch": 0.8487879953828396, + "flos": 677025133056.0, + "grad_norm": 0.05602222185131278, + "language_loss": 0.7385751, + "learning_rate": 5.877346528406635e-05, + "loss": 0.7491895, + "num_input_tokens_seen": 366422848, + "router_z_loss_mlp": 0.30786133, + "step": 4412, + "time_per_iteration": 2.880748987197876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061373, + "balance_loss_mlp": 1.03035462, + "epoch": 0.8489803770681031, + "flos": 503425345536.0, + "grad_norm": 0.05707259676031019, + "language_loss": 0.79403526, + "learning_rate": 5.8627001301448105e-05, + "loss": 0.80464894, + "num_input_tokens_seen": 366492016, + "router_z_loss_mlp": 0.30981445, + "step": 4413, + "time_per_iteration": 2.5811662673950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010618, + "balance_loss_mlp": 1.03063893, + "epoch": 0.8491727587533667, + "flos": 562896276480.0, + "grad_norm": 0.051344751965668234, + "language_loss": 0.76542878, + "learning_rate": 5.84807086750247e-05, + "loss": 0.77604681, + "num_input_tokens_seen": 366566400, + "router_z_loss_mlp": 0.3112793, + "step": 4414, + "time_per_iteration": 2.7214043140411377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063085, + "balance_loss_mlp": 1.03044593, + "epoch": 0.8493651404386302, + "flos": 459544513536.0, + "grad_norm": 0.0639628244470696, + "language_loss": 0.77723747, + "learning_rate": 5.833458746159243e-05, + "loss": 0.78786838, + "num_input_tokens_seen": 366634016, + "router_z_loss_mlp": 0.32641602, + "step": 4415, + "time_per_iteration": 2.603907823562622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062645, + "balance_loss_mlp": 1.03086436, + "epoch": 0.8495575221238938, + "flos": 460928199168.0, + "grad_norm": 0.06700935131460924, + "language_loss": 0.81717062, + "learning_rate": 5.818863771788013e-05, + "loss": 0.82779706, + "num_input_tokens_seen": 366704384, + "router_z_loss_mlp": 0.31762695, + "step": 4416, + "time_per_iteration": 2.6823158264160156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061617, + "balance_loss_mlp": 1.03016961, + "epoch": 0.8497499038091574, + "flos": 870353081856.0, + "grad_norm": 0.05272559442866759, + "language_loss": 0.81311977, + "learning_rate": 5.8042859500550604e-05, + "loss": 0.82373595, + "num_input_tokens_seen": 366785456, + "router_z_loss_mlp": 0.31420898, + "step": 4417, + "time_per_iteration": 3.0989885330200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061164, + "balance_loss_mlp": 1.029598, + "epoch": 0.849942285494421, + "flos": 779258050560.0, + "grad_norm": 0.12053257380548967, + "language_loss": 0.78102922, + "learning_rate": 5.789725286620018e-05, + "loss": 0.79164088, + "num_input_tokens_seen": 366862848, + "router_z_loss_mlp": 0.31542969, + "step": 4418, + "time_per_iteration": 3.0168724060058594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061302, + "balance_loss_mlp": 1.0296638, + "epoch": 0.8501346671796844, + "flos": 513544720896.0, + "grad_norm": 0.05675897396855917, + "language_loss": 0.84916025, + "learning_rate": 5.775181787135819e-05, + "loss": 0.85977328, + "num_input_tokens_seen": 366934800, + "router_z_loss_mlp": 0.31616211, + "step": 4419, + "time_per_iteration": 2.651323080062866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062383, + "balance_loss_mlp": 1.0311023, + "epoch": 0.850327048864948, + "flos": 621149437440.0, + "grad_norm": 0.04589752872200537, + "language_loss": 0.83418536, + "learning_rate": 5.76065545724877e-05, + "loss": 0.84480917, + "num_input_tokens_seen": 367015152, + "router_z_loss_mlp": 0.3125, + "step": 4420, + "time_per_iteration": 2.8431475162506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061049, + "balance_loss_mlp": 1.02974463, + "epoch": 0.8505194305502116, + "flos": 773890633728.0, + "grad_norm": 0.05328835761082586, + "language_loss": 0.79577553, + "learning_rate": 5.746146302598454e-05, + "loss": 0.80638599, + "num_input_tokens_seen": 367092192, + "router_z_loss_mlp": 0.31274414, + "step": 4421, + "time_per_iteration": 3.0034854412078857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059109, + "balance_loss_mlp": 1.02761436, + "epoch": 0.8507118122354752, + "flos": 465019619328.0, + "grad_norm": 0.05394674589529038, + "language_loss": 0.8635028, + "learning_rate": 5.731654328817859e-05, + "loss": 0.87409389, + "num_input_tokens_seen": 367159744, + "router_z_loss_mlp": 0.31469727, + "step": 4422, + "time_per_iteration": 2.6123807430267334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061076, + "balance_loss_mlp": 1.02958083, + "epoch": 0.8509041939207388, + "flos": 534150208512.0, + "grad_norm": 0.05638570018802557, + "language_loss": 0.847974, + "learning_rate": 5.717179541533257e-05, + "loss": 0.85858476, + "num_input_tokens_seen": 367226384, + "router_z_loss_mlp": 0.31469727, + "step": 4423, + "time_per_iteration": 2.6444549560546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061706, + "balance_loss_mlp": 1.03002, + "epoch": 0.8510965756060023, + "flos": 583466858496.0, + "grad_norm": 0.05656669272742153, + "language_loss": 0.84546876, + "learning_rate": 5.702721946364264e-05, + "loss": 0.85608578, + "num_input_tokens_seen": 367294768, + "router_z_loss_mlp": 0.31665039, + "step": 4424, + "time_per_iteration": 2.6550204753875732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062538, + "balance_loss_mlp": 1.03092384, + "epoch": 0.8512889572912659, + "flos": 600548332032.0, + "grad_norm": 0.06195322404547727, + "language_loss": 0.77515125, + "learning_rate": 5.688281548923796e-05, + "loss": 0.78577662, + "num_input_tokens_seen": 367372368, + "router_z_loss_mlp": 0.31591797, + "step": 4425, + "time_per_iteration": 2.788872003555298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062864, + "balance_loss_mlp": 1.03124976, + "epoch": 0.8514813389765294, + "flos": 654474345984.0, + "grad_norm": 0.06031925061221975, + "language_loss": 0.78652239, + "learning_rate": 5.673858354818151e-05, + "loss": 0.79715109, + "num_input_tokens_seen": 367452656, + "router_z_loss_mlp": 0.31591797, + "step": 4426, + "time_per_iteration": 2.8395063877105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062488, + "balance_loss_mlp": 1.03132665, + "epoch": 0.851673720661793, + "flos": 429538415616.0, + "grad_norm": 0.06630539351517445, + "language_loss": 0.7778796, + "learning_rate": 5.6594523696468726e-05, + "loss": 0.78850448, + "num_input_tokens_seen": 367517808, + "router_z_loss_mlp": 0.3112793, + "step": 4427, + "time_per_iteration": 2.52323842048645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063991, + "balance_loss_mlp": 1.03256702, + "epoch": 0.8518661023470565, + "flos": 641277679104.0, + "grad_norm": 0.054901162952162144, + "language_loss": 0.79352564, + "learning_rate": 5.645063599002875e-05, + "loss": 0.80416554, + "num_input_tokens_seen": 367591728, + "router_z_loss_mlp": 0.31396484, + "step": 4428, + "time_per_iteration": 2.7660624980926514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066318, + "balance_loss_mlp": 1.03451312, + "epoch": 0.8520584840323201, + "flos": 561880737792.0, + "grad_norm": 0.05488389146689813, + "language_loss": 0.79418516, + "learning_rate": 5.630692048472363e-05, + "loss": 0.80484831, + "num_input_tokens_seen": 367664496, + "router_z_loss_mlp": 0.31787109, + "step": 4429, + "time_per_iteration": 2.6520426273345947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060507, + "balance_loss_mlp": 1.0296793, + "epoch": 0.8522508657175837, + "flos": 526793822208.0, + "grad_norm": 0.059198790333543354, + "language_loss": 0.78811502, + "learning_rate": 5.61633772363489e-05, + "loss": 0.79872012, + "num_input_tokens_seen": 367735584, + "router_z_loss_mlp": 0.30786133, + "step": 4430, + "time_per_iteration": 2.639409065246582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061431, + "balance_loss_mlp": 1.03053164, + "epoch": 0.8524432474028473, + "flos": 498875618304.0, + "grad_norm": 0.0498357465810583, + "language_loss": 0.80618191, + "learning_rate": 5.602000630063298e-05, + "loss": 0.81679624, + "num_input_tokens_seen": 367801136, + "router_z_loss_mlp": 0.30859375, + "step": 4431, + "time_per_iteration": 2.5802340507507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063948, + "balance_loss_mlp": 1.03176129, + "epoch": 0.8526356290881109, + "flos": 421089325056.0, + "grad_norm": 0.06741417888605215, + "language_loss": 0.79432404, + "learning_rate": 5.587680773323706e-05, + "loss": 0.80496353, + "num_input_tokens_seen": 367865312, + "router_z_loss_mlp": 0.32177734, + "step": 4432, + "time_per_iteration": 2.5548770427703857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064662, + "balance_loss_mlp": 1.03373873, + "epoch": 0.8528280107733743, + "flos": 507078807552.0, + "grad_norm": 0.06908943944415502, + "language_loss": 0.80579317, + "learning_rate": 5.5733781589756115e-05, + "loss": 0.81643981, + "num_input_tokens_seen": 367931104, + "router_z_loss_mlp": 0.30883789, + "step": 4433, + "time_per_iteration": 2.6177783012390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062039, + "balance_loss_mlp": 1.03111625, + "epoch": 0.8530203924586379, + "flos": 445663987200.0, + "grad_norm": 0.061652443542314195, + "language_loss": 0.82560652, + "learning_rate": 5.5590927925717684e-05, + "loss": 0.83622682, + "num_input_tokens_seen": 367995520, + "router_z_loss_mlp": 0.30883789, + "step": 4434, + "time_per_iteration": 2.497509479522705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063718, + "balance_loss_mlp": 1.03157926, + "epoch": 0.8532127741439015, + "flos": 657452712960.0, + "grad_norm": 0.07889599950997465, + "language_loss": 0.83384633, + "learning_rate": 5.54482467965825e-05, + "loss": 0.84448349, + "num_input_tokens_seen": 368073664, + "router_z_loss_mlp": 0.32128906, + "step": 4435, + "time_per_iteration": 2.8157849311828613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062143, + "balance_loss_mlp": 1.03076673, + "epoch": 0.8534051558291651, + "flos": 535750682112.0, + "grad_norm": 0.05205501193179772, + "language_loss": 0.8285321, + "learning_rate": 5.5305738257744264e-05, + "loss": 0.83915353, + "num_input_tokens_seen": 368147536, + "router_z_loss_mlp": 0.31347656, + "step": 4436, + "time_per_iteration": 2.7104005813598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065615, + "balance_loss_mlp": 1.03404808, + "epoch": 0.8535975375144286, + "flos": 532741791744.0, + "grad_norm": 0.06598498607456474, + "language_loss": 0.79102892, + "learning_rate": 5.5163402364529655e-05, + "loss": 0.80168509, + "num_input_tokens_seen": 368218672, + "router_z_loss_mlp": 0.31542969, + "step": 4437, + "time_per_iteration": 2.6187076568603516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059712, + "balance_loss_mlp": 1.02819347, + "epoch": 0.8537899191996922, + "flos": 573861044736.0, + "grad_norm": 0.06908451740926473, + "language_loss": 0.82492721, + "learning_rate": 5.502123917219848e-05, + "loss": 0.83552432, + "num_input_tokens_seen": 368287056, + "router_z_loss_mlp": 0.31494141, + "step": 4438, + "time_per_iteration": 2.676590919494629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067428, + "balance_loss_mlp": 1.03602839, + "epoch": 0.8539823008849557, + "flos": 464759161344.0, + "grad_norm": 0.05953176501348747, + "language_loss": 0.83260107, + "learning_rate": 5.48792487359433e-05, + "loss": 0.84327531, + "num_input_tokens_seen": 368358400, + "router_z_loss_mlp": 0.3137207, + "step": 4439, + "time_per_iteration": 2.6769707202911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059953, + "balance_loss_mlp": 1.0280292, + "epoch": 0.8541746825702193, + "flos": 554441393664.0, + "grad_norm": 0.059973770389771766, + "language_loss": 0.8156724, + "learning_rate": 5.4737431110889745e-05, + "loss": 0.82627189, + "num_input_tokens_seen": 368427168, + "router_z_loss_mlp": 0.3190918, + "step": 4440, + "time_per_iteration": 2.642137050628662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064084, + "balance_loss_mlp": 1.0323509, + "epoch": 0.8543670642554829, + "flos": 546101402112.0, + "grad_norm": 0.04886224100797358, + "language_loss": 0.7693212, + "learning_rate": 5.4595786352096165e-05, + "loss": 0.77996206, + "num_input_tokens_seen": 368503584, + "router_z_loss_mlp": 0.31713867, + "step": 4441, + "time_per_iteration": 2.737921714782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061215, + "balance_loss_mlp": 1.02988696, + "epoch": 0.8545594459407464, + "flos": 511766747136.0, + "grad_norm": 0.05039354849040429, + "language_loss": 0.81989372, + "learning_rate": 5.4454314514554236e-05, + "loss": 0.83050585, + "num_input_tokens_seen": 368576976, + "router_z_loss_mlp": 0.31298828, + "step": 4442, + "time_per_iteration": 2.7490930557250977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059972, + "balance_loss_mlp": 1.02814305, + "epoch": 0.85475182762601, + "flos": 420961287168.0, + "grad_norm": 0.05564199021069246, + "language_loss": 0.81687701, + "learning_rate": 5.431301565318786e-05, + "loss": 0.82747674, + "num_input_tokens_seen": 368641664, + "router_z_loss_mlp": 0.31811523, + "step": 4443, + "time_per_iteration": 2.5382466316223145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063197, + "balance_loss_mlp": 1.03174961, + "epoch": 0.8549442093112736, + "flos": 389222295552.0, + "grad_norm": 0.07918770769320456, + "language_loss": 0.7730273, + "learning_rate": 5.41718898228542e-05, + "loss": 0.78365928, + "num_input_tokens_seen": 368705616, + "router_z_loss_mlp": 0.31420898, + "step": 4444, + "time_per_iteration": 2.478496551513672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105818, + "balance_loss_mlp": 1.02589846, + "epoch": 0.8551365909965372, + "flos": 605620385280.0, + "grad_norm": 0.3659120344191015, + "language_loss": 0.795048, + "learning_rate": 5.403093707834334e-05, + "loss": 0.80562979, + "num_input_tokens_seen": 368779664, + "router_z_loss_mlp": 0.32275391, + "step": 4445, + "time_per_iteration": 2.8390157222747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063635, + "balance_loss_mlp": 1.03149629, + "epoch": 0.8553289726818007, + "flos": 503912765952.0, + "grad_norm": 0.1926069572015181, + "language_loss": 0.78790247, + "learning_rate": 5.3890157474377865e-05, + "loss": 0.7985388, + "num_input_tokens_seen": 368846656, + "router_z_loss_mlp": 0.32128906, + "step": 4446, + "time_per_iteration": 2.548919677734375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063127, + "balance_loss_mlp": 1.03189397, + "epoch": 0.8555213543670642, + "flos": 556735901184.0, + "grad_norm": 0.056427781498625505, + "language_loss": 0.76149607, + "learning_rate": 5.374955106561324e-05, + "loss": 0.77212739, + "num_input_tokens_seen": 368923712, + "router_z_loss_mlp": 0.31201172, + "step": 4447, + "time_per_iteration": 2.7372312545776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106394, + "balance_loss_mlp": 1.03263545, + "epoch": 0.8557137360523278, + "flos": 547843060224.0, + "grad_norm": 0.052660516430393885, + "language_loss": 0.74772626, + "learning_rate": 5.360911790663775e-05, + "loss": 0.75836563, + "num_input_tokens_seen": 368994496, + "router_z_loss_mlp": 0.31274414, + "step": 4448, + "time_per_iteration": 2.6941676139831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060031, + "balance_loss_mlp": 1.02848864, + "epoch": 0.8559061177375914, + "flos": 727853506560.0, + "grad_norm": 0.04843590517934965, + "language_loss": 0.78747225, + "learning_rate": 5.346885805197238e-05, + "loss": 0.79807258, + "num_input_tokens_seen": 369077088, + "router_z_loss_mlp": 0.31518555, + "step": 4449, + "time_per_iteration": 2.9643099308013916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061228, + "balance_loss_mlp": 1.02951789, + "epoch": 0.856098499422855, + "flos": 535608087552.0, + "grad_norm": 0.08512499863393, + "language_loss": 0.8254863, + "learning_rate": 5.332877155607085e-05, + "loss": 0.83609855, + "num_input_tokens_seen": 369147680, + "router_z_loss_mlp": 0.31689453, + "step": 4450, + "time_per_iteration": 2.645606517791748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066721, + "balance_loss_mlp": 1.03372383, + "epoch": 0.8562908811081185, + "flos": 573388180992.0, + "grad_norm": 0.05548360969534156, + "language_loss": 0.83302569, + "learning_rate": 5.3188858473319504e-05, + "loss": 0.8436929, + "num_input_tokens_seen": 369224320, + "router_z_loss_mlp": 0.33007812, + "step": 4451, + "time_per_iteration": 2.685065507888794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059945, + "balance_loss_mlp": 1.02825868, + "epoch": 0.856483262793382, + "flos": 781391024640.0, + "grad_norm": 0.057471374374553505, + "language_loss": 0.80552411, + "learning_rate": 5.3049118858037426e-05, + "loss": 0.81612355, + "num_input_tokens_seen": 369315744, + "router_z_loss_mlp": 0.31665039, + "step": 4452, + "time_per_iteration": 3.104637861251831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061008, + "balance_loss_mlp": 1.02944088, + "epoch": 0.8566756444786456, + "flos": 455585513472.0, + "grad_norm": 0.046190458281021356, + "language_loss": 0.84324169, + "learning_rate": 5.290955276447651e-05, + "loss": 0.85385174, + "num_input_tokens_seen": 369382800, + "router_z_loss_mlp": 0.31542969, + "step": 4453, + "time_per_iteration": 2.52528715133667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060297, + "balance_loss_mlp": 1.02844453, + "epoch": 0.8568680261639092, + "flos": 449150123520.0, + "grad_norm": 0.05730147141848336, + "language_loss": 0.84070498, + "learning_rate": 5.277016024682091e-05, + "loss": 0.85130793, + "num_input_tokens_seen": 369447312, + "router_z_loss_mlp": 0.31835938, + "step": 4454, + "time_per_iteration": 2.5523717403411865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061824, + "balance_loss_mlp": 1.02999473, + "epoch": 0.8570604078491728, + "flos": 479736774144.0, + "grad_norm": 0.06420834276058747, + "language_loss": 0.8262471, + "learning_rate": 5.2630941359187665e-05, + "loss": 0.83686531, + "num_input_tokens_seen": 369512800, + "router_z_loss_mlp": 0.31811523, + "step": 4455, + "time_per_iteration": 2.5419952869415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064325, + "balance_loss_mlp": 1.03206658, + "epoch": 0.8572527895344363, + "flos": 505695121920.0, + "grad_norm": 0.047691689677810546, + "language_loss": 0.84493929, + "learning_rate": 5.249189615562627e-05, + "loss": 0.85558259, + "num_input_tokens_seen": 369580720, + "router_z_loss_mlp": 0.32250977, + "step": 4456, + "time_per_iteration": 2.61643648147583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062107, + "balance_loss_mlp": 1.03077888, + "epoch": 0.8574451712196999, + "flos": 786688630272.0, + "grad_norm": 0.0541100916451422, + "language_loss": 0.82981288, + "learning_rate": 5.235302469011905e-05, + "loss": 0.8404339, + "num_input_tokens_seen": 369672544, + "router_z_loss_mlp": 0.31298828, + "step": 4457, + "time_per_iteration": 3.0279483795166016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061214, + "balance_loss_mlp": 1.02933741, + "epoch": 0.8576375529049635, + "flos": 508980436992.0, + "grad_norm": 0.050346317822273064, + "language_loss": 0.75190985, + "learning_rate": 5.2214327016580575e-05, + "loss": 0.76252198, + "num_input_tokens_seen": 369745776, + "router_z_loss_mlp": 0.31860352, + "step": 4458, + "time_per_iteration": 2.7106165885925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009578, + "balance_loss_mlp": 1.00147212, + "epoch": 0.857829934590227, + "flos": 1459996130304.0, + "grad_norm": 0.00950278300704673, + "language_loss": 0.84767288, + "learning_rate": 5.207580318885802e-05, + "loss": 0.85776865, + "num_input_tokens_seen": 369975200, + "router_z_loss_mlp": 0.08105469, + "step": 4459, + "time_per_iteration": 4.9368908405303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062088, + "balance_loss_mlp": 1.02963924, + "epoch": 0.8580223162754905, + "flos": 479057296896.0, + "grad_norm": 0.050243088547339124, + "language_loss": 0.88987887, + "learning_rate": 5.193745326073118e-05, + "loss": 0.9004997, + "num_input_tokens_seen": 370043296, + "router_z_loss_mlp": 0.32446289, + "step": 4460, + "time_per_iteration": 2.6464526653289795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106302, + "balance_loss_mlp": 1.03142953, + "epoch": 0.8582146979607541, + "flos": 705926942208.0, + "grad_norm": 0.0551820595184576, + "language_loss": 0.79153854, + "learning_rate": 5.179927728591227e-05, + "loss": 0.80216873, + "num_input_tokens_seen": 370111152, + "router_z_loss_mlp": 0.31567383, + "step": 4461, + "time_per_iteration": 2.8269202709198 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062228, + "balance_loss_mlp": 1.03023219, + "epoch": 0.8584070796460177, + "flos": 764826084864.0, + "grad_norm": 0.05216333699988601, + "language_loss": 0.82483435, + "learning_rate": 5.1661275318045874e-05, + "loss": 0.83545661, + "num_input_tokens_seen": 370190272, + "router_z_loss_mlp": 0.31982422, + "step": 4462, + "time_per_iteration": 3.035334825515747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060691, + "balance_loss_mlp": 1.02862346, + "epoch": 0.8585994613312813, + "flos": 586535385600.0, + "grad_norm": 0.04772965278083779, + "language_loss": 0.85539973, + "learning_rate": 5.152344741070919e-05, + "loss": 0.86600661, + "num_input_tokens_seen": 370267056, + "router_z_loss_mlp": 0.32055664, + "step": 4463, + "time_per_iteration": 2.7710516452789307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061667, + "balance_loss_mlp": 1.02917063, + "epoch": 0.8587918430165449, + "flos": 607993468416.0, + "grad_norm": 0.052538127660877086, + "language_loss": 0.78715736, + "learning_rate": 5.138579361741169e-05, + "loss": 0.79777402, + "num_input_tokens_seen": 370344176, + "router_z_loss_mlp": 0.32495117, + "step": 4464, + "time_per_iteration": 2.799131393432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061744, + "balance_loss_mlp": 1.02929556, + "epoch": 0.8589842247018084, + "flos": 588710619648.0, + "grad_norm": 0.052020809474610616, + "language_loss": 0.81095582, + "learning_rate": 5.124831399159535e-05, + "loss": 0.82157326, + "num_input_tokens_seen": 370414224, + "router_z_loss_mlp": 0.32446289, + "step": 4465, + "time_per_iteration": 2.6786587238311768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064242, + "balance_loss_mlp": 1.03172147, + "epoch": 0.8591766063870719, + "flos": 543609045504.0, + "grad_norm": 0.07015130917214298, + "language_loss": 0.78573036, + "learning_rate": 5.1111008586634475e-05, + "loss": 0.79637277, + "num_input_tokens_seen": 370484736, + "router_z_loss_mlp": 0.32519531, + "step": 4466, + "time_per_iteration": 2.6729421615600586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059412, + "balance_loss_mlp": 1.02724934, + "epoch": 0.8593689880723355, + "flos": 493499437056.0, + "grad_norm": 0.057052492442745496, + "language_loss": 0.80829519, + "learning_rate": 5.0973877455835816e-05, + "loss": 0.81888938, + "num_input_tokens_seen": 370556512, + "router_z_loss_mlp": 0.3215332, + "step": 4467, + "time_per_iteration": 2.665745496749878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060448, + "balance_loss_mlp": 1.02892888, + "epoch": 0.8595613697575991, + "flos": 533652613632.0, + "grad_norm": 0.08486411489909984, + "language_loss": 0.83856833, + "learning_rate": 5.083692065243822e-05, + "loss": 0.84917283, + "num_input_tokens_seen": 370622880, + "router_z_loss_mlp": 0.31494141, + "step": 4468, + "time_per_iteration": 2.605087995529175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065534, + "balance_loss_mlp": 1.03279936, + "epoch": 0.8597537514428626, + "flos": 617347588608.0, + "grad_norm": 0.05427930741428609, + "language_loss": 0.7589013, + "learning_rate": 5.070013822961328e-05, + "loss": 0.7695567, + "num_input_tokens_seen": 370691632, + "router_z_loss_mlp": 0.32739258, + "step": 4469, + "time_per_iteration": 2.726418972015381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063558, + "balance_loss_mlp": 1.03208721, + "epoch": 0.8599461331281262, + "flos": 608450365440.0, + "grad_norm": 0.050935024727546276, + "language_loss": 0.83729804, + "learning_rate": 5.056353024046462e-05, + "loss": 0.84793365, + "num_input_tokens_seen": 370764848, + "router_z_loss_mlp": 0.31445312, + "step": 4470, + "time_per_iteration": 2.757049798965454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062047, + "balance_loss_mlp": 1.02978909, + "epoch": 0.8601385148133898, + "flos": 550979988480.0, + "grad_norm": 0.05051789331606474, + "language_loss": 0.83390927, + "learning_rate": 5.042709673802786e-05, + "loss": 0.84452975, + "num_input_tokens_seen": 370832496, + "router_z_loss_mlp": 0.32250977, + "step": 4471, + "time_per_iteration": 2.6580097675323486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064584, + "balance_loss_mlp": 1.03182566, + "epoch": 0.8603308964986534, + "flos": 580907510784.0, + "grad_norm": 0.0485236684203926, + "language_loss": 0.81116891, + "learning_rate": 5.0290837775271494e-05, + "loss": 0.82181472, + "num_input_tokens_seen": 370917104, + "router_z_loss_mlp": 0.32763672, + "step": 4472, + "time_per_iteration": 2.839219808578491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060787, + "balance_loss_mlp": 1.02836204, + "epoch": 0.8605232781839169, + "flos": 628731376128.0, + "grad_norm": 0.06326689914609517, + "language_loss": 0.7511692, + "learning_rate": 5.0154753405095846e-05, + "loss": 0.76177704, + "num_input_tokens_seen": 370984512, + "router_z_loss_mlp": 0.32421875, + "step": 4473, + "time_per_iteration": 2.7530417442321777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062597, + "balance_loss_mlp": 1.0308156, + "epoch": 0.8607156598691804, + "flos": 467904854016.0, + "grad_norm": 0.05697633265814371, + "language_loss": 0.76700628, + "learning_rate": 5.0018843680333604e-05, + "loss": 0.77763224, + "num_input_tokens_seen": 371049664, + "router_z_loss_mlp": 0.31762695, + "step": 4474, + "time_per_iteration": 2.497021198272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063581, + "balance_loss_mlp": 1.03177595, + "epoch": 0.860908041554444, + "flos": 488142194688.0, + "grad_norm": 0.05343941183350629, + "language_loss": 0.8279053, + "learning_rate": 4.988310865374945e-05, + "loss": 0.83854115, + "num_input_tokens_seen": 371120704, + "router_z_loss_mlp": 0.31787109, + "step": 4475, + "time_per_iteration": 2.661044120788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062188, + "balance_loss_mlp": 1.03143167, + "epoch": 0.8611004232397076, + "flos": 591827198976.0, + "grad_norm": 0.06189407919584074, + "language_loss": 0.79964399, + "learning_rate": 4.974754837804057e-05, + "loss": 0.8102659, + "num_input_tokens_seen": 371189376, + "router_z_loss_mlp": 0.30712891, + "step": 4476, + "time_per_iteration": 2.6757631301879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059104, + "balance_loss_mlp": 1.0272038, + "epoch": 0.8612928049249712, + "flos": 773857138176.0, + "grad_norm": 0.052722096051547256, + "language_loss": 0.85997331, + "learning_rate": 4.9612162905836036e-05, + "loss": 0.87056434, + "num_input_tokens_seen": 371275184, + "router_z_loss_mlp": 0.31884766, + "step": 4477, + "time_per_iteration": 3.0340847969055176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063746, + "balance_loss_mlp": 1.03148818, + "epoch": 0.8614851866102347, + "flos": 537291518976.0, + "grad_norm": 0.06050404360645883, + "language_loss": 0.82557905, + "learning_rate": 4.947695228969718e-05, + "loss": 0.83621651, + "num_input_tokens_seen": 371347920, + "router_z_loss_mlp": 0.32250977, + "step": 4478, + "time_per_iteration": 2.700917959213257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057319, + "balance_loss_mlp": 1.02632427, + "epoch": 0.8616775682954982, + "flos": 565647681024.0, + "grad_norm": 0.04893161007491131, + "language_loss": 0.79213041, + "learning_rate": 4.934191658211729e-05, + "loss": 0.80270362, + "num_input_tokens_seen": 371419728, + "router_z_loss_mlp": 0.30957031, + "step": 4479, + "time_per_iteration": 2.638625144958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064584, + "balance_loss_mlp": 1.03211105, + "epoch": 0.8618699499807618, + "flos": 481351804416.0, + "grad_norm": 0.06177653986234914, + "language_loss": 0.81433135, + "learning_rate": 4.92070558355221e-05, + "loss": 0.82497722, + "num_input_tokens_seen": 371488768, + "router_z_loss_mlp": 0.32470703, + "step": 4480, + "time_per_iteration": 2.5831832885742188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063637, + "balance_loss_mlp": 1.03095031, + "epoch": 0.8620623316660254, + "flos": 649214618112.0, + "grad_norm": 0.06454644868462169, + "language_loss": 0.74008536, + "learning_rate": 4.9072370102269226e-05, + "loss": 0.75072169, + "num_input_tokens_seen": 371560144, + "router_z_loss_mlp": 0.3269043, + "step": 4481, + "time_per_iteration": 2.800652503967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062478, + "balance_loss_mlp": 1.03045893, + "epoch": 0.862254713351289, + "flos": 751457710080.0, + "grad_norm": 0.060736689413149525, + "language_loss": 0.85903275, + "learning_rate": 4.893785943464801e-05, + "loss": 0.86965752, + "num_input_tokens_seen": 371635920, + "router_z_loss_mlp": 0.32006836, + "step": 4482, + "time_per_iteration": 2.9607954025268555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062729, + "balance_loss_mlp": 1.03061461, + "epoch": 0.8624470950365525, + "flos": 841147144704.0, + "grad_norm": 0.053974741732672415, + "language_loss": 0.77722287, + "learning_rate": 4.880352388488024e-05, + "loss": 0.78785014, + "num_input_tokens_seen": 371727664, + "router_z_loss_mlp": 0.32104492, + "step": 4483, + "time_per_iteration": 3.226982355117798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061835, + "balance_loss_mlp": 1.0292666, + "epoch": 0.8626394767218161, + "flos": 754470982656.0, + "grad_norm": 0.062075712884062044, + "language_loss": 0.8284198, + "learning_rate": 4.866936350511969e-05, + "loss": 0.83903813, + "num_input_tokens_seen": 371800832, + "router_z_loss_mlp": 0.32568359, + "step": 4484, + "time_per_iteration": 2.9436373710632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060867, + "balance_loss_mlp": 1.02875233, + "epoch": 0.8628318584070797, + "flos": 703268669952.0, + "grad_norm": 0.06969941920153218, + "language_loss": 0.82350802, + "learning_rate": 4.853537834745203e-05, + "loss": 0.8341167, + "num_input_tokens_seen": 371871472, + "router_z_loss_mlp": 0.32104492, + "step": 4485, + "time_per_iteration": 2.8440961837768555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059193, + "balance_loss_mlp": 1.0271498, + "epoch": 0.8630242400923432, + "flos": 471006876672.0, + "grad_norm": 0.06648371861207134, + "language_loss": 0.77326876, + "learning_rate": 4.840156846389487e-05, + "loss": 0.78386068, + "num_input_tokens_seen": 371936512, + "router_z_loss_mlp": 0.3203125, + "step": 4486, + "time_per_iteration": 2.5322835445404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061159, + "balance_loss_mlp": 1.02921081, + "epoch": 0.8632166217776067, + "flos": 963965200896.0, + "grad_norm": 0.06128857742360526, + "language_loss": 0.77070493, + "learning_rate": 4.826793390639783e-05, + "loss": 0.78131652, + "num_input_tokens_seen": 372018032, + "router_z_loss_mlp": 0.31933594, + "step": 4487, + "time_per_iteration": 3.189706563949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060666, + "balance_loss_mlp": 1.02886093, + "epoch": 0.8634090034628703, + "flos": 767583281664.0, + "grad_norm": 0.07112701478219728, + "language_loss": 0.78677434, + "learning_rate": 4.813447472684246e-05, + "loss": 0.79738104, + "num_input_tokens_seen": 372092176, + "router_z_loss_mlp": 0.31787109, + "step": 4488, + "time_per_iteration": 2.954084634780884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064986, + "balance_loss_mlp": 1.03203702, + "epoch": 0.8636013851481339, + "flos": 520310380032.0, + "grad_norm": 0.0522005534966177, + "language_loss": 0.83078921, + "learning_rate": 4.800119097704214e-05, + "loss": 0.84143913, + "num_input_tokens_seen": 372166880, + "router_z_loss_mlp": 0.32958984, + "step": 4489, + "time_per_iteration": 2.7687644958496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059066, + "balance_loss_mlp": 1.02695084, + "epoch": 0.8637937668333975, + "flos": 631858129920.0, + "grad_norm": 0.055171692637559167, + "language_loss": 0.80351138, + "learning_rate": 4.7868082708742324e-05, + "loss": 0.81410205, + "num_input_tokens_seen": 372234608, + "router_z_loss_mlp": 0.32104492, + "step": 4490, + "time_per_iteration": 2.71951961517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063816, + "balance_loss_mlp": 1.03227329, + "epoch": 0.8639861485186611, + "flos": 855739233792.0, + "grad_norm": 0.04993299201791118, + "language_loss": 0.76146114, + "learning_rate": 4.773514997362e-05, + "loss": 0.77209932, + "num_input_tokens_seen": 372314704, + "router_z_loss_mlp": 0.31542969, + "step": 4491, + "time_per_iteration": 3.069485664367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060774, + "balance_loss_mlp": 1.02968383, + "epoch": 0.8641785302039245, + "flos": 481017153024.0, + "grad_norm": 0.0570401502594965, + "language_loss": 0.77674156, + "learning_rate": 4.7602392823284605e-05, + "loss": 0.78734934, + "num_input_tokens_seen": 372374848, + "router_z_loss_mlp": 0.31054688, + "step": 4492, + "time_per_iteration": 2.533755302429199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063273, + "balance_loss_mlp": 1.03168309, + "epoch": 0.8643709118891881, + "flos": 504385629696.0, + "grad_norm": 0.04924498248309733, + "language_loss": 0.80397034, + "learning_rate": 4.746981130927675e-05, + "loss": 0.81460309, + "num_input_tokens_seen": 372442432, + "router_z_loss_mlp": 0.31567383, + "step": 4493, + "time_per_iteration": 2.587989568710327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059069, + "balance_loss_mlp": 1.02762127, + "epoch": 0.8645632935744517, + "flos": 552074102784.0, + "grad_norm": 0.05394896105958079, + "language_loss": 0.82090062, + "learning_rate": 4.733740548306908e-05, + "loss": 0.83149135, + "num_input_tokens_seen": 372520048, + "router_z_loss_mlp": 0.31420898, + "step": 4494, + "time_per_iteration": 2.7614352703094482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061639, + "balance_loss_mlp": 1.02978659, + "epoch": 0.8647556752597153, + "flos": 524489140224.0, + "grad_norm": 0.055631203926486274, + "language_loss": 0.83849758, + "learning_rate": 4.7205175396066336e-05, + "loss": 0.849114, + "num_input_tokens_seen": 372587968, + "router_z_loss_mlp": 0.31835938, + "step": 4495, + "time_per_iteration": 2.5548112392425537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060323, + "balance_loss_mlp": 1.02830327, + "epoch": 0.8649480569449788, + "flos": 787403013120.0, + "grad_norm": 0.057759633350782755, + "language_loss": 0.82145321, + "learning_rate": 4.707312109960471e-05, + "loss": 0.83205652, + "num_input_tokens_seen": 372672544, + "router_z_loss_mlp": 0.32006836, + "step": 4496, + "time_per_iteration": 3.0689432621002197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058381, + "balance_loss_mlp": 1.02605116, + "epoch": 0.8651404386302424, + "flos": 763531149312.0, + "grad_norm": 0.0515665037941851, + "language_loss": 0.76591146, + "learning_rate": 4.694124264495225e-05, + "loss": 0.77649528, + "num_input_tokens_seen": 372751296, + "router_z_loss_mlp": 0.32324219, + "step": 4497, + "time_per_iteration": 3.0204567909240723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063092, + "balance_loss_mlp": 1.03135872, + "epoch": 0.865332820315506, + "flos": 539620932096.0, + "grad_norm": 0.07535101034145897, + "language_loss": 0.8228246, + "learning_rate": 4.680954008330851e-05, + "loss": 0.83345556, + "num_input_tokens_seen": 372825264, + "router_z_loss_mlp": 0.31713867, + "step": 4498, + "time_per_iteration": 2.704850196838379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010122, + "balance_loss_mlp": 1.0021112, + "epoch": 0.8655252020007695, + "flos": 1475874390528.0, + "grad_norm": 0.008064108702508102, + "language_loss": 0.79174447, + "learning_rate": 4.667801346580519e-05, + "loss": 0.80184567, + "num_input_tokens_seen": 373052000, + "router_z_loss_mlp": 0.08007812, + "step": 4499, + "time_per_iteration": 4.744141101837158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063596, + "balance_loss_mlp": 1.0319581, + "epoch": 0.8657175836860331, + "flos": 517094876160.0, + "grad_norm": 0.06467601085738069, + "language_loss": 0.82661498, + "learning_rate": 4.6546662843505396e-05, + "loss": 0.83725095, + "num_input_tokens_seen": 373124128, + "router_z_loss_mlp": 0.31616211, + "step": 4500, + "time_per_iteration": 2.6995272636413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106124, + "balance_loss_mlp": 1.02957797, + "epoch": 0.8659099653712966, + "flos": 590247074304.0, + "grad_norm": 0.0488325766448074, + "language_loss": 0.79730713, + "learning_rate": 4.641548826740394e-05, + "loss": 0.8079195, + "num_input_tokens_seen": 373195472, + "router_z_loss_mlp": 0.31640625, + "step": 4501, + "time_per_iteration": 2.756542921066284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061576, + "balance_loss_mlp": 1.02979493, + "epoch": 0.8661023470565602, + "flos": 590168498688.0, + "grad_norm": 0.04885372872328607, + "language_loss": 0.87834525, + "learning_rate": 4.628448978842731e-05, + "loss": 0.88896096, + "num_input_tokens_seen": 373273504, + "router_z_loss_mlp": 0.31762695, + "step": 4502, + "time_per_iteration": 2.8756601810455322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061181, + "balance_loss_mlp": 1.02963829, + "epoch": 0.8662947287418238, + "flos": 567405305856.0, + "grad_norm": 0.0556252853798282, + "language_loss": 0.79367119, + "learning_rate": 4.61536674574336e-05, + "loss": 0.80428302, + "num_input_tokens_seen": 373346032, + "router_z_loss_mlp": 0.31518555, + "step": 4503, + "time_per_iteration": 2.730353832244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061547, + "balance_loss_mlp": 1.02995646, + "epoch": 0.8664871104270874, + "flos": 515661728256.0, + "grad_norm": 0.06306513847558601, + "language_loss": 0.82253635, + "learning_rate": 4.6023021325212636e-05, + "loss": 0.83315182, + "num_input_tokens_seen": 373419968, + "router_z_loss_mlp": 0.31567383, + "step": 4504, + "time_per_iteration": 2.8205671310424805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062618, + "balance_loss_mlp": 1.03100419, + "epoch": 0.866679492112351, + "flos": 556973038080.0, + "grad_norm": 0.05196414276050884, + "language_loss": 0.78062767, + "learning_rate": 4.589255144248561e-05, + "loss": 0.79125381, + "num_input_tokens_seen": 373502448, + "router_z_loss_mlp": 0.31591797, + "step": 4505, + "time_per_iteration": 2.7927794456481934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061347, + "balance_loss_mlp": 1.03013778, + "epoch": 0.8668718737976144, + "flos": 722145646080.0, + "grad_norm": 0.06563340698886916, + "language_loss": 0.81826079, + "learning_rate": 4.57622578599054e-05, + "loss": 0.82887423, + "num_input_tokens_seen": 373581184, + "router_z_loss_mlp": 0.31176758, + "step": 4506, + "time_per_iteration": 2.884320020675659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060244, + "balance_loss_mlp": 1.02860546, + "epoch": 0.867064255482878, + "flos": 600424676352.0, + "grad_norm": 0.06596119953742512, + "language_loss": 0.8435837, + "learning_rate": 4.5632140628056705e-05, + "loss": 0.85418612, + "num_input_tokens_seen": 373652272, + "router_z_loss_mlp": 0.31616211, + "step": 4507, + "time_per_iteration": 2.7202742099761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060491, + "balance_loss_mlp": 1.02847147, + "epoch": 0.8672566371681416, + "flos": 803177966592.0, + "grad_norm": 0.05619429968013786, + "language_loss": 0.75715232, + "learning_rate": 4.550219979745529e-05, + "loss": 0.76775718, + "num_input_tokens_seen": 373734896, + "router_z_loss_mlp": 0.32006836, + "step": 4508, + "time_per_iteration": 3.0621252059936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059917, + "balance_loss_mlp": 1.02877963, + "epoch": 0.8674490188534052, + "flos": 627072675840.0, + "grad_norm": 0.04631178506085383, + "language_loss": 0.837807, + "learning_rate": 4.5372435418548905e-05, + "loss": 0.8484062, + "num_input_tokens_seen": 373806960, + "router_z_loss_mlp": 0.31103516, + "step": 4509, + "time_per_iteration": 2.726402759552002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062583, + "balance_loss_mlp": 1.03075373, + "epoch": 0.8676414005386687, + "flos": 727489741824.0, + "grad_norm": 0.04521100568191671, + "language_loss": 0.8632676, + "learning_rate": 4.524284754171615e-05, + "loss": 0.87389338, + "num_input_tokens_seen": 373888352, + "router_z_loss_mlp": 0.31811523, + "step": 4510, + "time_per_iteration": 2.9605391025543213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063445, + "balance_loss_mlp": 1.03199792, + "epoch": 0.8678337822239323, + "flos": 539676186624.0, + "grad_norm": 0.05450838794945064, + "language_loss": 0.80499184, + "learning_rate": 4.5113436217267765e-05, + "loss": 0.81562626, + "num_input_tokens_seen": 373962112, + "router_z_loss_mlp": 0.31420898, + "step": 4511, + "time_per_iteration": 2.7507681846618652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063839, + "balance_loss_mlp": 1.03251052, + "epoch": 0.8680261639091958, + "flos": 507270864384.0, + "grad_norm": 0.06627366254356618, + "language_loss": 0.7913667, + "learning_rate": 4.4984201495445744e-05, + "loss": 0.80200505, + "num_input_tokens_seen": 374028256, + "router_z_loss_mlp": 0.31323242, + "step": 4512, + "time_per_iteration": 2.5611917972564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062078, + "balance_loss_mlp": 1.0304879, + "epoch": 0.8682185455944594, + "flos": 486871990272.0, + "grad_norm": 0.05491483118349731, + "language_loss": 0.80959839, + "learning_rate": 4.4855143426423275e-05, + "loss": 0.82021916, + "num_input_tokens_seen": 374100080, + "router_z_loss_mlp": 0.31567383, + "step": 4513, + "time_per_iteration": 2.7118194103240967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060989, + "balance_loss_mlp": 1.03016114, + "epoch": 0.868410927279723, + "flos": 603413217792.0, + "grad_norm": 0.06031526727588095, + "language_loss": 0.80724663, + "learning_rate": 4.472626206030528e-05, + "loss": 0.81785655, + "num_input_tokens_seen": 374174368, + "router_z_loss_mlp": 0.30786133, + "step": 4514, + "time_per_iteration": 2.7290971279144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061083, + "balance_loss_mlp": 1.02884901, + "epoch": 0.8686033089649865, + "flos": 1118552772096.0, + "grad_norm": 0.057770146114941426, + "language_loss": 0.84628344, + "learning_rate": 4.4597557447127846e-05, + "loss": 0.85689425, + "num_input_tokens_seen": 374257328, + "router_z_loss_mlp": 0.32226562, + "step": 4515, + "time_per_iteration": 3.4016566276550293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063551, + "balance_loss_mlp": 1.03179383, + "epoch": 0.8687956906502501, + "flos": 567750131712.0, + "grad_norm": 0.059882180592515495, + "language_loss": 0.83618152, + "learning_rate": 4.446902963685862e-05, + "loss": 0.84681702, + "num_input_tokens_seen": 374327936, + "router_z_loss_mlp": 0.31738281, + "step": 4516, + "time_per_iteration": 2.6526734828948975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106162, + "balance_loss_mlp": 1.03033924, + "epoch": 0.8689880723355137, + "flos": 544071734784.0, + "grad_norm": 0.05519247318645579, + "language_loss": 0.84240782, + "learning_rate": 4.4340678679396454e-05, + "loss": 0.85302395, + "num_input_tokens_seen": 374400496, + "router_z_loss_mlp": 0.3125, + "step": 4517, + "time_per_iteration": 2.685844898223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062904, + "balance_loss_mlp": 1.03143275, + "epoch": 0.8691804540207773, + "flos": 457185987072.0, + "grad_norm": 0.05019728971382003, + "language_loss": 0.85942495, + "learning_rate": 4.4212504624571495e-05, + "loss": 0.87005395, + "num_input_tokens_seen": 374470528, + "router_z_loss_mlp": 0.31445312, + "step": 4518, + "time_per_iteration": 2.6083579063415527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063422, + "balance_loss_mlp": 1.03216529, + "epoch": 0.8693728357060407, + "flos": 591591472128.0, + "grad_norm": 0.05314480965772763, + "language_loss": 0.79870903, + "learning_rate": 4.40845075221456e-05, + "loss": 0.80934334, + "num_input_tokens_seen": 374542656, + "router_z_loss_mlp": 0.31225586, + "step": 4519, + "time_per_iteration": 2.686267614364624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058795, + "balance_loss_mlp": 1.02801549, + "epoch": 0.8695652173913043, + "flos": 679949655552.0, + "grad_norm": 0.06051907660879658, + "language_loss": 0.79408765, + "learning_rate": 4.395668742181164e-05, + "loss": 0.80467558, + "num_input_tokens_seen": 374617232, + "router_z_loss_mlp": 0.30737305, + "step": 4520, + "time_per_iteration": 2.954463005065918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059756, + "balance_loss_mlp": 1.02873731, + "epoch": 0.8697575990765679, + "flos": 492120133632.0, + "grad_norm": 0.12778474077428317, + "language_loss": 0.78053939, + "learning_rate": 4.38290443731934e-05, + "loss": 0.79113692, + "num_input_tokens_seen": 374681888, + "router_z_loss_mlp": 0.30981445, + "step": 4521, + "time_per_iteration": 2.5474846363067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065092, + "balance_loss_mlp": 1.03412151, + "epoch": 0.8699499807618315, + "flos": 526690515456.0, + "grad_norm": 0.0489157541753566, + "language_loss": 0.81840932, + "learning_rate": 4.370157842584671e-05, + "loss": 0.82906032, + "num_input_tokens_seen": 374750464, + "router_z_loss_mlp": 0.30932617, + "step": 4522, + "time_per_iteration": 2.6422176361083984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106357, + "balance_loss_mlp": 1.0320034, + "epoch": 0.8701423624470951, + "flos": 813981201408.0, + "grad_norm": 0.05775570775583278, + "language_loss": 0.79841703, + "learning_rate": 4.357428962925808e-05, + "loss": 0.80905277, + "num_input_tokens_seen": 374836064, + "router_z_loss_mlp": 0.31542969, + "step": 4523, + "time_per_iteration": 3.1012167930603027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059208, + "balance_loss_mlp": 1.02723622, + "epoch": 0.8703347441323586, + "flos": 556519113216.0, + "grad_norm": 0.05599933750738037, + "language_loss": 0.88216102, + "learning_rate": 4.344717803284542e-05, + "loss": 0.89275301, + "num_input_tokens_seen": 374903392, + "router_z_loss_mlp": 0.31958008, + "step": 4524, + "time_per_iteration": 2.6371071338653564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059655, + "balance_loss_mlp": 1.02835059, + "epoch": 0.8705271258176221, + "flos": 585151699968.0, + "grad_norm": 0.05141240379057952, + "language_loss": 0.84252208, + "learning_rate": 4.3320243685957825e-05, + "loss": 0.85311866, + "num_input_tokens_seen": 374985904, + "router_z_loss_mlp": 0.31274414, + "step": 4525, + "time_per_iteration": 2.8390161991119385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062694, + "balance_loss_mlp": 1.03136575, + "epoch": 0.8707195075028857, + "flos": 668896137216.0, + "grad_norm": 0.04600181508448894, + "language_loss": 0.85132861, + "learning_rate": 4.3193486637875536e-05, + "loss": 0.86195552, + "num_input_tokens_seen": 375062992, + "router_z_loss_mlp": 0.31298828, + "step": 4526, + "time_per_iteration": 2.9073646068573 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106146, + "balance_loss_mlp": 1.02955973, + "epoch": 0.8709118891881493, + "flos": 520122705408.0, + "grad_norm": 0.052761141095982234, + "language_loss": 0.83831137, + "learning_rate": 4.306690693781007e-05, + "loss": 0.84892601, + "num_input_tokens_seen": 375139296, + "router_z_loss_mlp": 0.31884766, + "step": 4527, + "time_per_iteration": 2.767987012863159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061555, + "balance_loss_mlp": 1.03008366, + "epoch": 0.8711042708734128, + "flos": 552944226816.0, + "grad_norm": 0.06573110271460979, + "language_loss": 0.8149147, + "learning_rate": 4.294050463490401e-05, + "loss": 0.82553029, + "num_input_tokens_seen": 375206576, + "router_z_loss_mlp": 0.31445312, + "step": 4528, + "time_per_iteration": 2.665611505508423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063718, + "balance_loss_mlp": 1.03305793, + "epoch": 0.8712966525586764, + "flos": 501933970944.0, + "grad_norm": 0.08156910290101471, + "language_loss": 0.82087851, + "learning_rate": 4.281427977823094e-05, + "loss": 0.83151567, + "num_input_tokens_seen": 375279008, + "router_z_loss_mlp": 0.30615234, + "step": 4529, + "time_per_iteration": 2.6962764263153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063911, + "balance_loss_mlp": 1.0326066, + "epoch": 0.87148903424394, + "flos": 803739580416.0, + "grad_norm": 0.09272960269684108, + "language_loss": 0.73765504, + "learning_rate": 4.268823241679593e-05, + "loss": 0.74829412, + "num_input_tokens_seen": 375368512, + "router_z_loss_mlp": 0.31274414, + "step": 4530, + "time_per_iteration": 3.0716986656188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060824, + "balance_loss_mlp": 1.02975786, + "epoch": 0.8716814159292036, + "flos": 773088910848.0, + "grad_norm": 0.041732057671581745, + "language_loss": 0.86070085, + "learning_rate": 4.256236259953489e-05, + "loss": 0.87130916, + "num_input_tokens_seen": 375450528, + "router_z_loss_mlp": 0.31030273, + "step": 4531, + "time_per_iteration": 3.009658098220825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063716, + "balance_loss_mlp": 1.03300774, + "epoch": 0.8718737976144671, + "flos": 486595565568.0, + "grad_norm": 0.057775826764998164, + "language_loss": 0.85169399, + "learning_rate": 4.243667037531468e-05, + "loss": 0.86233115, + "num_input_tokens_seen": 375518256, + "router_z_loss_mlp": 0.30664062, + "step": 4532, + "time_per_iteration": 2.5984034538269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062579, + "balance_loss_mlp": 1.03122652, + "epoch": 0.8720661792997306, + "flos": 583850972160.0, + "grad_norm": 0.050781364973923986, + "language_loss": 0.78413302, + "learning_rate": 4.2311155792933264e-05, + "loss": 0.7947588, + "num_input_tokens_seen": 375588112, + "router_z_loss_mlp": 0.31323242, + "step": 4533, + "time_per_iteration": 2.710092306137085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013214, + "balance_loss_mlp": 1.00510764, + "epoch": 0.8722585609849942, + "flos": 1495180560384.0, + "grad_norm": 0.005859621779777296, + "language_loss": 0.80966806, + "learning_rate": 4.2185818901119946e-05, + "loss": 0.81980014, + "num_input_tokens_seen": 375814496, + "router_z_loss_mlp": 0.08105469, + "step": 4534, + "time_per_iteration": 4.804488897323608 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059822, + "balance_loss_mlp": 1.02785015, + "epoch": 0.8724509426702578, + "flos": 595885123584.0, + "grad_norm": 0.05535231433932961, + "language_loss": 0.87492794, + "learning_rate": 4.206065974853479e-05, + "loss": 0.88552618, + "num_input_tokens_seen": 375885440, + "router_z_loss_mlp": 0.31958008, + "step": 4535, + "time_per_iteration": 2.7415826320648193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062201, + "balance_loss_mlp": 1.03077722, + "epoch": 0.8726433243555214, + "flos": 443408767488.0, + "grad_norm": 0.05376597495630459, + "language_loss": 0.80928969, + "learning_rate": 4.193567838376888e-05, + "loss": 0.81991172, + "num_input_tokens_seen": 375952640, + "router_z_loss_mlp": 0.31396484, + "step": 4536, + "time_per_iteration": 2.5765163898468018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061895, + "balance_loss_mlp": 1.03101945, + "epoch": 0.8728357060407849, + "flos": 552919495680.0, + "grad_norm": 0.06265478182979331, + "language_loss": 0.82042164, + "learning_rate": 4.181087485534402e-05, + "loss": 0.83104056, + "num_input_tokens_seen": 376021648, + "router_z_loss_mlp": 0.30834961, + "step": 4537, + "time_per_iteration": 2.6786723136901855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058304, + "balance_loss_mlp": 1.02654707, + "epoch": 0.8730280877260485, + "flos": 627506251776.0, + "grad_norm": 0.050518176985259906, + "language_loss": 0.78287077, + "learning_rate": 4.16862492117136e-05, + "loss": 0.79345381, + "num_input_tokens_seen": 376102304, + "router_z_loss_mlp": 0.31738281, + "step": 4538, + "time_per_iteration": 2.8118271827697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061457, + "balance_loss_mlp": 1.02948546, + "epoch": 0.873220469411312, + "flos": 535106110464.0, + "grad_norm": 0.05971217476802488, + "language_loss": 0.80033374, + "learning_rate": 4.156180150126143e-05, + "loss": 0.81094825, + "num_input_tokens_seen": 376177072, + "router_z_loss_mlp": 0.31958008, + "step": 4539, + "time_per_iteration": 2.6918630599975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106634, + "balance_loss_mlp": 1.0347259, + "epoch": 0.8734128510965756, + "flos": 561605723136.0, + "grad_norm": 0.05091768450849498, + "language_loss": 0.8370958, + "learning_rate": 4.143753177230242e-05, + "loss": 0.84775919, + "num_input_tokens_seen": 376251376, + "router_z_loss_mlp": 0.31591797, + "step": 4540, + "time_per_iteration": 2.6893396377563477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106051, + "balance_loss_mlp": 1.0292058, + "epoch": 0.8736052327818392, + "flos": 686134761984.0, + "grad_norm": 0.06496183714869043, + "language_loss": 0.79499495, + "learning_rate": 4.131344007308224e-05, + "loss": 0.80560005, + "num_input_tokens_seen": 376337104, + "router_z_loss_mlp": 0.31274414, + "step": 4541, + "time_per_iteration": 2.944713830947876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062058, + "balance_loss_mlp": 1.03046799, + "epoch": 0.8737976144671027, + "flos": 531384247296.0, + "grad_norm": 0.05340963644738213, + "language_loss": 0.81737614, + "learning_rate": 4.1189526451777816e-05, + "loss": 0.82799673, + "num_input_tokens_seen": 376415456, + "router_z_loss_mlp": 0.31567383, + "step": 4542, + "time_per_iteration": 2.820056676864624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061357, + "balance_loss_mlp": 1.02981448, + "epoch": 0.8739899961523663, + "flos": 575308749312.0, + "grad_norm": 0.051498816346912536, + "language_loss": 0.81901187, + "learning_rate": 4.106579095649649e-05, + "loss": 0.82962549, + "num_input_tokens_seen": 376494880, + "router_z_loss_mlp": 0.31518555, + "step": 4543, + "time_per_iteration": 2.8078243732452393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063084, + "balance_loss_mlp": 1.03137445, + "epoch": 0.8741823778376299, + "flos": 731009373696.0, + "grad_norm": 0.08412552836981564, + "language_loss": 0.76142043, + "learning_rate": 4.094223363527666e-05, + "loss": 0.77205127, + "num_input_tokens_seen": 376571760, + "router_z_loss_mlp": 0.31689453, + "step": 4544, + "time_per_iteration": 2.895599365234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062313, + "balance_loss_mlp": 1.0306983, + "epoch": 0.8743747595228935, + "flos": 566795639808.0, + "grad_norm": 0.06200041308589868, + "language_loss": 0.83552009, + "learning_rate": 4.081885453608747e-05, + "loss": 0.84614325, + "num_input_tokens_seen": 376644464, + "router_z_loss_mlp": 0.31591797, + "step": 4545, + "time_per_iteration": 2.728745222091675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062007, + "balance_loss_mlp": 1.03020191, + "epoch": 0.8745671412081569, + "flos": 493115323392.0, + "grad_norm": 0.053444102465167606, + "language_loss": 0.81948709, + "learning_rate": 4.0695653706829095e-05, + "loss": 0.83010715, + "num_input_tokens_seen": 376709584, + "router_z_loss_mlp": 0.31787109, + "step": 4546, + "time_per_iteration": 2.5782153606414795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060239, + "balance_loss_mlp": 1.02814758, + "epoch": 0.8747595228934205, + "flos": 523883856384.0, + "grad_norm": 0.04577609874107169, + "language_loss": 0.83400089, + "learning_rate": 4.057263119533233e-05, + "loss": 0.8446033, + "num_input_tokens_seen": 376779472, + "router_z_loss_mlp": 0.32080078, + "step": 4547, + "time_per_iteration": 2.6548120975494385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061892, + "balance_loss_mlp": 1.03011048, + "epoch": 0.8749519045786841, + "flos": 743999427072.0, + "grad_norm": 0.06038252152055413, + "language_loss": 0.79740083, + "learning_rate": 4.044978704935853e-05, + "loss": 0.80801976, + "num_input_tokens_seen": 376863408, + "router_z_loss_mlp": 0.31762695, + "step": 4548, + "time_per_iteration": 3.041475534439087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106142, + "balance_loss_mlp": 1.03061604, + "epoch": 0.8751442862639477, + "flos": 594003843072.0, + "grad_norm": 0.048567013140779235, + "language_loss": 0.80103874, + "learning_rate": 4.032712131660027e-05, + "loss": 0.81165296, + "num_input_tokens_seen": 376942080, + "router_z_loss_mlp": 0.30786133, + "step": 4549, + "time_per_iteration": 2.8483879566192627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063327, + "balance_loss_mlp": 1.03190303, + "epoch": 0.8753366679492113, + "flos": 496285747200.0, + "grad_norm": 0.05234931726187161, + "language_loss": 0.78447485, + "learning_rate": 4.020463404468055e-05, + "loss": 0.79510808, + "num_input_tokens_seen": 377015696, + "router_z_loss_mlp": 0.31396484, + "step": 4550, + "time_per_iteration": 2.732851982116699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060994, + "balance_loss_mlp": 1.02890277, + "epoch": 0.8755290496344748, + "flos": 489619012608.0, + "grad_norm": 0.0557326561034337, + "language_loss": 0.81853771, + "learning_rate": 4.0082325281153074e-05, + "loss": 0.8291477, + "num_input_tokens_seen": 377081424, + "router_z_loss_mlp": 0.32080078, + "step": 4551, + "time_per_iteration": 2.5726113319396973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061371, + "balance_loss_mlp": 1.02987576, + "epoch": 0.8757214313197383, + "flos": 591557976576.0, + "grad_norm": 0.04987162654706675, + "language_loss": 0.81259084, + "learning_rate": 3.9960195073502345e-05, + "loss": 0.82320452, + "num_input_tokens_seen": 377159360, + "router_z_loss_mlp": 0.31469727, + "step": 4552, + "time_per_iteration": 2.817884683609009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062068, + "balance_loss_mlp": 1.03012002, + "epoch": 0.8759138130050019, + "flos": 976456249344.0, + "grad_norm": 0.06584475086644327, + "language_loss": 0.77716434, + "learning_rate": 3.9838243469143555e-05, + "loss": 0.78778505, + "num_input_tokens_seen": 377240704, + "router_z_loss_mlp": 0.31933594, + "step": 4553, + "time_per_iteration": 3.2275702953338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059915, + "balance_loss_mlp": 1.02808642, + "epoch": 0.8761061946902655, + "flos": 802405357056.0, + "grad_norm": 0.04329284311928099, + "language_loss": 0.77679586, + "learning_rate": 3.971647051542243e-05, + "loss": 0.78739506, + "num_input_tokens_seen": 377324176, + "router_z_loss_mlp": 0.31811523, + "step": 4554, + "time_per_iteration": 3.0647592544555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058788, + "balance_loss_mlp": 1.02731705, + "epoch": 0.8762985763755291, + "flos": 698158738944.0, + "grad_norm": 0.048765571498963003, + "language_loss": 0.74434495, + "learning_rate": 3.95948762596155e-05, + "loss": 0.75493276, + "num_input_tokens_seen": 377403440, + "router_z_loss_mlp": 0.31445312, + "step": 4555, + "time_per_iteration": 2.961852550506592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106342, + "balance_loss_mlp": 1.03130519, + "epoch": 0.8764909580607926, + "flos": 629416645632.0, + "grad_norm": 0.05400398397225891, + "language_loss": 0.80043375, + "learning_rate": 3.9473460748929765e-05, + "loss": 0.81106794, + "num_input_tokens_seen": 377483440, + "router_z_loss_mlp": 0.32104492, + "step": 4556, + "time_per_iteration": 2.833293914794922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059523, + "balance_loss_mlp": 1.028934, + "epoch": 0.8766833397460562, + "flos": 481297959936.0, + "grad_norm": 0.05296826742775525, + "language_loss": 0.80469096, + "learning_rate": 3.935222403050304e-05, + "loss": 0.81528622, + "num_input_tokens_seen": 377554688, + "router_z_loss_mlp": 0.30541992, + "step": 4557, + "time_per_iteration": 2.6411917209625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059244, + "balance_loss_mlp": 1.02808261, + "epoch": 0.8768757214313198, + "flos": 407514336768.0, + "grad_norm": 0.05694908001629326, + "language_loss": 0.78118753, + "learning_rate": 3.923116615140354e-05, + "loss": 0.79177999, + "num_input_tokens_seen": 377617616, + "router_z_loss_mlp": 0.3112793, + "step": 4558, + "time_per_iteration": 2.472745180130005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061431, + "balance_loss_mlp": 1.02931571, + "epoch": 0.8770681031165833, + "flos": 582314517504.0, + "grad_norm": 0.06519133015059232, + "language_loss": 0.8193962, + "learning_rate": 3.9110287158630076e-05, + "loss": 0.83001053, + "num_input_tokens_seen": 377685888, + "router_z_loss_mlp": 0.32104492, + "step": 4559, + "time_per_iteration": 2.6806676387786865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062883, + "balance_loss_mlp": 1.03086364, + "epoch": 0.8772604848018468, + "flos": 508437762048.0, + "grad_norm": 0.06392536215328089, + "language_loss": 0.80933923, + "learning_rate": 3.8989587099111875e-05, + "loss": 0.81996804, + "num_input_tokens_seen": 377755744, + "router_z_loss_mlp": 0.32006836, + "step": 4560, + "time_per_iteration": 2.5991218090057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059632, + "balance_loss_mlp": 1.02837586, + "epoch": 0.8774528664871104, + "flos": 408617215488.0, + "grad_norm": 0.06067903743456465, + "language_loss": 0.84571135, + "learning_rate": 3.886906601970913e-05, + "loss": 0.85630763, + "num_input_tokens_seen": 377818880, + "router_z_loss_mlp": 0.31225586, + "step": 4561, + "time_per_iteration": 2.455996513366699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061512, + "balance_loss_mlp": 1.02975512, + "epoch": 0.877645248172374, + "flos": 500589573120.0, + "grad_norm": 0.05408973543487403, + "language_loss": 0.83434474, + "learning_rate": 3.8748723967212184e-05, + "loss": 0.84495986, + "num_input_tokens_seen": 377893280, + "router_z_loss_mlp": 0.31738281, + "step": 4562, + "time_per_iteration": 2.6538524627685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061289, + "balance_loss_mlp": 1.02948415, + "epoch": 0.8778376298576376, + "flos": 632857701888.0, + "grad_norm": 0.05369014995175808, + "language_loss": 0.77912921, + "learning_rate": 3.862856098834189e-05, + "loss": 0.78974211, + "num_input_tokens_seen": 377972912, + "router_z_loss_mlp": 0.31787109, + "step": 4563, + "time_per_iteration": 2.8910348415374756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063792, + "balance_loss_mlp": 1.03274965, + "epoch": 0.8780300115429012, + "flos": 533707868160.0, + "grad_norm": 0.053856474502613036, + "language_loss": 0.79521894, + "learning_rate": 3.850857712974976e-05, + "loss": 0.80585694, + "num_input_tokens_seen": 378054000, + "router_z_loss_mlp": 0.31005859, + "step": 4564, + "time_per_iteration": 2.7875571250915527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059359, + "balance_loss_mlp": 1.02855527, + "epoch": 0.8782223932281646, + "flos": 511411746816.0, + "grad_norm": 0.04753076591808214, + "language_loss": 0.7683506, + "learning_rate": 3.838877243801758e-05, + "loss": 0.77894419, + "num_input_tokens_seen": 378120336, + "router_z_loss_mlp": 0.30761719, + "step": 4565, + "time_per_iteration": 2.6198067665100098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061378, + "balance_loss_mlp": 1.02950168, + "epoch": 0.8784147749134282, + "flos": 780333225984.0, + "grad_norm": 0.05851858406426915, + "language_loss": 0.69561017, + "learning_rate": 3.826914695965766e-05, + "loss": 0.70622396, + "num_input_tokens_seen": 378216672, + "router_z_loss_mlp": 0.31860352, + "step": 4566, + "time_per_iteration": 3.172079563140869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063568, + "balance_loss_mlp": 1.03228772, + "epoch": 0.8786071565986918, + "flos": 560738571264.0, + "grad_norm": 0.06257605820389481, + "language_loss": 0.75450444, + "learning_rate": 3.814970074111279e-05, + "loss": 0.76514018, + "num_input_tokens_seen": 378287536, + "router_z_loss_mlp": 0.3125, + "step": 4567, + "time_per_iteration": 2.718393087387085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060158, + "balance_loss_mlp": 1.02818608, + "epoch": 0.8787995382839554, + "flos": 603148377600.0, + "grad_norm": 0.04978565158238451, + "language_loss": 0.77231061, + "learning_rate": 3.8030433828755926e-05, + "loss": 0.78291219, + "num_input_tokens_seen": 378362128, + "router_z_loss_mlp": 0.31958008, + "step": 4568, + "time_per_iteration": 2.783825159072876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059599, + "balance_loss_mlp": 1.02838981, + "epoch": 0.8789919199692189, + "flos": 559970343936.0, + "grad_norm": 0.04489278278007527, + "language_loss": 0.84756523, + "learning_rate": 3.7911346268890924e-05, + "loss": 0.85816121, + "num_input_tokens_seen": 378435696, + "router_z_loss_mlp": 0.31176758, + "step": 4569, + "time_per_iteration": 2.6692512035369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059281, + "balance_loss_mlp": 1.0283581, + "epoch": 0.8791843016544825, + "flos": 538857086976.0, + "grad_norm": 0.056088812281162366, + "language_loss": 0.81766403, + "learning_rate": 3.7792438107751405e-05, + "loss": 0.82825685, + "num_input_tokens_seen": 378505664, + "router_z_loss_mlp": 0.30883789, + "step": 4570, + "time_per_iteration": 2.6016881465911865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060689, + "balance_loss_mlp": 1.02938414, + "epoch": 0.8793766833397461, + "flos": 1008275226624.0, + "grad_norm": 0.05357677729025578, + "language_loss": 0.79291123, + "learning_rate": 3.767370939150167e-05, + "loss": 0.80351812, + "num_input_tokens_seen": 378598016, + "router_z_loss_mlp": 0.31274414, + "step": 4571, + "time_per_iteration": 3.325495481491089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065188, + "balance_loss_mlp": 1.03319228, + "epoch": 0.8795690650250096, + "flos": 678320068608.0, + "grad_norm": 0.04870756479019928, + "language_loss": 0.80827546, + "learning_rate": 3.755516016623628e-05, + "loss": 0.81892741, + "num_input_tokens_seen": 378676176, + "router_z_loss_mlp": 0.31982422, + "step": 4572, + "time_per_iteration": 2.8708269596099854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058214, + "balance_loss_mlp": 1.02669477, + "epoch": 0.8797614467102732, + "flos": 453202255872.0, + "grad_norm": 0.06319465276598665, + "language_loss": 0.88573319, + "learning_rate": 3.7436790477980157e-05, + "loss": 0.89631534, + "num_input_tokens_seen": 378737952, + "router_z_loss_mlp": 0.31494141, + "step": 4573, + "time_per_iteration": 2.5026752948760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059994, + "balance_loss_mlp": 1.02866578, + "epoch": 0.8799538283955367, + "flos": 550649719296.0, + "grad_norm": 0.04909208675254753, + "language_loss": 0.8408621, + "learning_rate": 3.7318600372688526e-05, + "loss": 0.85146207, + "num_input_tokens_seen": 378806704, + "router_z_loss_mlp": 0.31298828, + "step": 4574, + "time_per_iteration": 2.6635913848876953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062455, + "balance_loss_mlp": 1.03079283, + "epoch": 0.8801462100808003, + "flos": 807072947712.0, + "grad_norm": 0.06010438341663848, + "language_loss": 0.8430174, + "learning_rate": 3.720058989624681e-05, + "loss": 0.85364199, + "num_input_tokens_seen": 378887616, + "router_z_loss_mlp": 0.31640625, + "step": 4575, + "time_per_iteration": 3.0828750133514404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106221, + "balance_loss_mlp": 1.03057218, + "epoch": 0.8803385917660639, + "flos": 768366065664.0, + "grad_norm": 0.047129972836998074, + "language_loss": 0.84180987, + "learning_rate": 3.708275909447079e-05, + "loss": 0.85243201, + "num_input_tokens_seen": 378964656, + "router_z_loss_mlp": 0.31616211, + "step": 4576, + "time_per_iteration": 2.9739205837249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105957, + "balance_loss_mlp": 1.02814686, + "epoch": 0.8805309734513275, + "flos": 567070654464.0, + "grad_norm": 0.053878873022787786, + "language_loss": 0.8110702, + "learning_rate": 3.696510801310632e-05, + "loss": 0.82166588, + "num_input_tokens_seen": 379036752, + "router_z_loss_mlp": 0.31396484, + "step": 4577, + "time_per_iteration": 2.696599006652832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060244, + "balance_loss_mlp": 1.02851081, + "epoch": 0.880723355136591, + "flos": 679481174016.0, + "grad_norm": 0.05262790478746066, + "language_loss": 0.8144868, + "learning_rate": 3.6847636697829755e-05, + "loss": 0.82508922, + "num_input_tokens_seen": 379106480, + "router_z_loss_mlp": 0.31713867, + "step": 4578, + "time_per_iteration": 2.802515745162964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106303, + "balance_loss_mlp": 1.03151155, + "epoch": 0.8809157368218545, + "flos": 565347935232.0, + "grad_norm": 0.05499943484212242, + "language_loss": 0.78842521, + "learning_rate": 3.673034519424734e-05, + "loss": 0.79905552, + "num_input_tokens_seen": 379182544, + "router_z_loss_mlp": 0.31494141, + "step": 4579, + "time_per_iteration": 2.7430505752563477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060042, + "balance_loss_mlp": 1.02816534, + "epoch": 0.8811081185071181, + "flos": 515153958912.0, + "grad_norm": 0.04849663850018554, + "language_loss": 0.7603749, + "learning_rate": 3.661323354789586e-05, + "loss": 0.77097535, + "num_input_tokens_seen": 379255856, + "router_z_loss_mlp": 0.31860352, + "step": 4580, + "time_per_iteration": 2.7311344146728516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061477, + "balance_loss_mlp": 1.03117371, + "epoch": 0.8813005001923817, + "flos": 594067862016.0, + "grad_norm": 0.06983405724673822, + "language_loss": 0.8144145, + "learning_rate": 3.649630180424191e-05, + "loss": 0.82502925, + "num_input_tokens_seen": 379322704, + "router_z_loss_mlp": 0.30273438, + "step": 4581, + "time_per_iteration": 2.7526886463165283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055676, + "balance_loss_mlp": 1.02415729, + "epoch": 0.8814928818776453, + "flos": 666630743040.0, + "grad_norm": 0.055477837077192144, + "language_loss": 0.79182696, + "learning_rate": 3.637955000868254e-05, + "loss": 0.80238372, + "num_input_tokens_seen": 379395008, + "router_z_loss_mlp": 0.31494141, + "step": 4582, + "time_per_iteration": 2.8307814598083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060037, + "balance_loss_mlp": 1.02868462, + "epoch": 0.8816852635629088, + "flos": 608873766912.0, + "grad_norm": 0.048277670038562745, + "language_loss": 0.85441208, + "learning_rate": 3.626297820654467e-05, + "loss": 0.86501247, + "num_input_tokens_seen": 379465824, + "router_z_loss_mlp": 0.31323242, + "step": 4583, + "time_per_iteration": 2.7170026302337646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061128, + "balance_loss_mlp": 1.02958536, + "epoch": 0.8818776452481724, + "flos": 480131062272.0, + "grad_norm": 0.06483310109620229, + "language_loss": 0.82067692, + "learning_rate": 3.614658644308572e-05, + "loss": 0.83128822, + "num_input_tokens_seen": 379534960, + "router_z_loss_mlp": 0.31518555, + "step": 4584, + "time_per_iteration": 2.618037223815918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064716, + "balance_loss_mlp": 1.0326488, + "epoch": 0.882070026933436, + "flos": 1044985936896.0, + "grad_norm": 0.06883560573017064, + "language_loss": 0.73482269, + "learning_rate": 3.60303747634928e-05, + "loss": 0.74546981, + "num_input_tokens_seen": 379617456, + "router_z_loss_mlp": 0.32055664, + "step": 4585, + "time_per_iteration": 3.3003652095794678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060323, + "balance_loss_mlp": 1.02880442, + "epoch": 0.8822624086186995, + "flos": 474153979392.0, + "grad_norm": 0.05196830736419867, + "language_loss": 0.79747665, + "learning_rate": 3.591434321288345e-05, + "loss": 0.80807984, + "num_input_tokens_seen": 379687792, + "router_z_loss_mlp": 0.31494141, + "step": 4586, + "time_per_iteration": 2.6248860359191895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059576, + "balance_loss_mlp": 1.02862895, + "epoch": 0.882454790303963, + "flos": 653725057536.0, + "grad_norm": 0.06065088286401367, + "language_loss": 0.81563091, + "learning_rate": 3.579849183630485e-05, + "loss": 0.82622671, + "num_input_tokens_seen": 379761120, + "router_z_loss_mlp": 0.30932617, + "step": 4587, + "time_per_iteration": 2.7769362926483154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061148, + "balance_loss_mlp": 1.02922332, + "epoch": 0.8826471719892266, + "flos": 470081498112.0, + "grad_norm": 0.04739147053606439, + "language_loss": 0.78241807, + "learning_rate": 3.568282067873468e-05, + "loss": 0.79302955, + "num_input_tokens_seen": 379829008, + "router_z_loss_mlp": 0.3190918, + "step": 4588, + "time_per_iteration": 2.570162773132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061217, + "balance_loss_mlp": 1.02879214, + "epoch": 0.8828395536744902, + "flos": 468501373440.0, + "grad_norm": 0.047057706534628874, + "language_loss": 0.83742458, + "learning_rate": 3.556732978508048e-05, + "loss": 0.84803677, + "num_input_tokens_seen": 379899584, + "router_z_loss_mlp": 0.32421875, + "step": 4589, + "time_per_iteration": 2.686675786972046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105933, + "balance_loss_mlp": 1.02783465, + "epoch": 0.8830319353597538, + "flos": 721044177408.0, + "grad_norm": 0.04950856406845548, + "language_loss": 0.81078488, + "learning_rate": 3.545201920017971e-05, + "loss": 0.82137823, + "num_input_tokens_seen": 379979440, + "router_z_loss_mlp": 0.31469727, + "step": 4590, + "time_per_iteration": 2.9431474208831787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064053, + "balance_loss_mlp": 1.03282046, + "epoch": 0.8832243170450174, + "flos": 443049384960.0, + "grad_norm": 0.10786996917657111, + "language_loss": 0.81250805, + "learning_rate": 3.5336888968799996e-05, + "loss": 0.82314861, + "num_input_tokens_seen": 380046944, + "router_z_loss_mlp": 0.31201172, + "step": 4591, + "time_per_iteration": 2.5478732585906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061662, + "balance_loss_mlp": 1.02983332, + "epoch": 0.8834166987302808, + "flos": 566293662720.0, + "grad_norm": 0.05506928252909138, + "language_loss": 0.82108343, + "learning_rate": 3.5221939135638756e-05, + "loss": 0.83170003, + "num_input_tokens_seen": 380118048, + "router_z_loss_mlp": 0.31811523, + "step": 4592, + "time_per_iteration": 2.7421905994415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059531, + "balance_loss_mlp": 1.02796483, + "epoch": 0.8836090804155444, + "flos": 609022153728.0, + "grad_norm": 0.0589985976139412, + "language_loss": 0.81940699, + "learning_rate": 3.510716974532352e-05, + "loss": 0.83000231, + "num_input_tokens_seen": 380192416, + "router_z_loss_mlp": 0.31542969, + "step": 4593, + "time_per_iteration": 2.7895593643188477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061044, + "balance_loss_mlp": 1.02964461, + "epoch": 0.883801462100808, + "flos": 556804302336.0, + "grad_norm": 0.05233139357243583, + "language_loss": 0.80372763, + "learning_rate": 3.4992580842411745e-05, + "loss": 0.81433809, + "num_input_tokens_seen": 380264432, + "router_z_loss_mlp": 0.3137207, + "step": 4594, + "time_per_iteration": 2.6803102493286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060001, + "balance_loss_mlp": 1.02843499, + "epoch": 0.8839938437860716, + "flos": 515936742912.0, + "grad_norm": 0.07064799718920817, + "language_loss": 0.77353942, + "learning_rate": 3.487817247139064e-05, + "loss": 0.78413939, + "num_input_tokens_seen": 380334192, + "router_z_loss_mlp": 0.31542969, + "step": 4595, + "time_per_iteration": 2.6111834049224854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106149, + "balance_loss_mlp": 1.02989948, + "epoch": 0.8841862254713351, + "flos": 713386635264.0, + "grad_norm": 0.09630405385113956, + "language_loss": 0.7850247, + "learning_rate": 3.47639446766777e-05, + "loss": 0.79563963, + "num_input_tokens_seen": 380407504, + "router_z_loss_mlp": 0.31567383, + "step": 4596, + "time_per_iteration": 2.863654375076294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061691, + "balance_loss_mlp": 1.03012478, + "epoch": 0.8843786071565987, + "flos": 833626404864.0, + "grad_norm": 0.05760354635847824, + "language_loss": 0.82457066, + "learning_rate": 3.4649897502620095e-05, + "loss": 0.83518755, + "num_input_tokens_seen": 380486272, + "router_z_loss_mlp": 0.31542969, + "step": 4597, + "time_per_iteration": 2.9976413249969482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062715, + "balance_loss_mlp": 1.03188777, + "epoch": 0.8845709888418622, + "flos": 656562240000.0, + "grad_norm": 0.048945613215712114, + "language_loss": 0.82873589, + "learning_rate": 3.453603099349462e-05, + "loss": 0.83936304, + "num_input_tokens_seen": 380568480, + "router_z_loss_mlp": 0.30810547, + "step": 4598, + "time_per_iteration": 2.8781182765960693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061116, + "balance_loss_mlp": 1.03000224, + "epoch": 0.8847633705271258, + "flos": 523038463488.0, + "grad_norm": 0.0499308034693015, + "language_loss": 0.80891055, + "learning_rate": 3.442234519350823e-05, + "loss": 0.81952167, + "num_input_tokens_seen": 380643088, + "router_z_loss_mlp": 0.31079102, + "step": 4599, + "time_per_iteration": 2.723100423812866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063587, + "balance_loss_mlp": 1.03261626, + "epoch": 0.8849557522123894, + "flos": 548330480640.0, + "grad_norm": 0.05641788542621963, + "language_loss": 0.84390503, + "learning_rate": 3.430884014679786e-05, + "loss": 0.85454094, + "num_input_tokens_seen": 380714512, + "router_z_loss_mlp": 0.30932617, + "step": 4600, + "time_per_iteration": 2.6754045486450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062521, + "balance_loss_mlp": 1.03088272, + "epoch": 0.8851481338976529, + "flos": 622070433792.0, + "grad_norm": 0.0608435446742517, + "language_loss": 0.83655906, + "learning_rate": 3.4195515897429974e-05, + "loss": 0.84718424, + "num_input_tokens_seen": 380789168, + "router_z_loss_mlp": 0.31616211, + "step": 4601, + "time_per_iteration": 2.7671477794647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106026, + "balance_loss_mlp": 1.02902758, + "epoch": 0.8853405155829165, + "flos": 444123150336.0, + "grad_norm": 0.05872328742975128, + "language_loss": 0.80575866, + "learning_rate": 3.408237248940088e-05, + "loss": 0.81636125, + "num_input_tokens_seen": 380856992, + "router_z_loss_mlp": 0.31201172, + "step": 4602, + "time_per_iteration": 2.560318946838379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062734, + "balance_loss_mlp": 1.03095329, + "epoch": 0.8855328972681801, + "flos": 730152396288.0, + "grad_norm": 0.056216541125300654, + "language_loss": 0.77893907, + "learning_rate": 3.396940996663683e-05, + "loss": 0.7895664, + "num_input_tokens_seen": 380930480, + "router_z_loss_mlp": 0.31762695, + "step": 4603, + "time_per_iteration": 2.8867790699005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063127, + "balance_loss_mlp": 1.03201365, + "epoch": 0.8857252789534437, + "flos": 487132448256.0, + "grad_norm": 0.07079921333147207, + "language_loss": 0.78746498, + "learning_rate": 3.385662837299375e-05, + "loss": 0.7980963, + "num_input_tokens_seen": 380994192, + "router_z_loss_mlp": 0.31079102, + "step": 4604, + "time_per_iteration": 2.5524046421051025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062191, + "balance_loss_mlp": 1.03117263, + "epoch": 0.8859176606387072, + "flos": 508290785280.0, + "grad_norm": 0.05238353409776557, + "language_loss": 0.81618583, + "learning_rate": 3.374402775225727e-05, + "loss": 0.82680774, + "num_input_tokens_seen": 381066848, + "router_z_loss_mlp": 0.31005859, + "step": 4605, + "time_per_iteration": 2.6777870655059814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062144, + "balance_loss_mlp": 1.03043461, + "epoch": 0.8861100423239707, + "flos": 516370318848.0, + "grad_norm": 0.055497975758408605, + "language_loss": 0.85710311, + "learning_rate": 3.3631608148142925e-05, + "loss": 0.8677246, + "num_input_tokens_seen": 381138816, + "router_z_loss_mlp": 0.31689453, + "step": 4606, + "time_per_iteration": 2.6625237464904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064071, + "balance_loss_mlp": 1.03319585, + "epoch": 0.8863024240092343, + "flos": 626692944384.0, + "grad_norm": 0.05509705271526416, + "language_loss": 0.79623628, + "learning_rate": 3.3519369604295746e-05, + "loss": 0.80687696, + "num_input_tokens_seen": 381208448, + "router_z_loss_mlp": 0.30834961, + "step": 4607, + "time_per_iteration": 2.7269294261932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106404, + "balance_loss_mlp": 1.03230667, + "epoch": 0.8864948056944979, + "flos": 766564770816.0, + "grad_norm": 0.10040451996396124, + "language_loss": 0.83269691, + "learning_rate": 3.340731216429083e-05, + "loss": 0.8433373, + "num_input_tokens_seen": 381289712, + "router_z_loss_mlp": 0.31713867, + "step": 4608, + "time_per_iteration": 2.991093397140503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018938, + "balance_loss_mlp": 1.01102269, + "epoch": 0.8866871873797615, + "flos": 1501500907008.0, + "grad_norm": 0.009535247872241597, + "language_loss": 0.78830957, + "learning_rate": 3.329543587163253e-05, + "loss": 0.79849893, + "num_input_tokens_seen": 381520848, + "router_z_loss_mlp": 0.07910156, + "step": 4609, + "time_per_iteration": 4.8284571170806885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061082, + "balance_loss_mlp": 1.02837062, + "epoch": 0.886879569065025, + "flos": 811164367872.0, + "grad_norm": 0.13586161840975353, + "language_loss": 0.81234121, + "learning_rate": 3.3183740769755e-05, + "loss": 0.82295209, + "num_input_tokens_seen": 381603008, + "router_z_loss_mlp": 0.32714844, + "step": 4610, + "time_per_iteration": 3.0232110023498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018727, + "balance_loss_mlp": 1.01081121, + "epoch": 0.8870719507502886, + "flos": 1581994934784.0, + "grad_norm": 0.009521282732020938, + "language_loss": 0.7691083, + "learning_rate": 3.307222690202238e-05, + "loss": 0.77929556, + "num_input_tokens_seen": 381844336, + "router_z_loss_mlp": 0.07910156, + "step": 4611, + "time_per_iteration": 5.034501552581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106293, + "balance_loss_mlp": 1.03231668, + "epoch": 0.8872643324355521, + "flos": 633743792640.0, + "grad_norm": 0.05784261037220574, + "language_loss": 0.74835932, + "learning_rate": 3.296089431172811e-05, + "loss": 0.75898862, + "num_input_tokens_seen": 381918576, + "router_z_loss_mlp": 0.30566406, + "step": 4612, + "time_per_iteration": 2.8261477947235107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062432, + "balance_loss_mlp": 1.031461, + "epoch": 0.8874567141208157, + "flos": 535498988544.0, + "grad_norm": 0.0754643632292133, + "language_loss": 0.8301453, + "learning_rate": 3.284974304209532e-05, + "loss": 0.84076959, + "num_input_tokens_seen": 381987296, + "router_z_loss_mlp": 0.30932617, + "step": 4613, + "time_per_iteration": 2.6077656745910645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058933, + "balance_loss_mlp": 1.02801013, + "epoch": 0.8876490958060793, + "flos": 1565700931584.0, + "grad_norm": 0.05499745508093668, + "language_loss": 0.79193819, + "learning_rate": 3.27387731362766e-05, + "loss": 0.80252743, + "num_input_tokens_seen": 382091744, + "router_z_loss_mlp": 0.30883789, + "step": 4614, + "time_per_iteration": 3.8746235370635986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064375, + "balance_loss_mlp": 1.03335643, + "epoch": 0.8878414774913428, + "flos": 636343838208.0, + "grad_norm": 0.05793142822201318, + "language_loss": 0.84617949, + "learning_rate": 3.2627984637354444e-05, + "loss": 0.85682321, + "num_input_tokens_seen": 382169600, + "router_z_loss_mlp": 0.30981445, + "step": 4615, + "time_per_iteration": 2.779799461364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061212, + "balance_loss_mlp": 1.03014576, + "epoch": 0.8880338591766064, + "flos": 496182440448.0, + "grad_norm": 0.06017785119690372, + "language_loss": 0.81558824, + "learning_rate": 3.251737758834084e-05, + "loss": 0.82620031, + "num_input_tokens_seen": 382238336, + "router_z_loss_mlp": 0.31030273, + "step": 4616, + "time_per_iteration": 2.609734058380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063093, + "balance_loss_mlp": 1.03126431, + "epoch": 0.88822624086187, + "flos": 542599299072.0, + "grad_norm": 0.05444758565813165, + "language_loss": 0.79956746, + "learning_rate": 3.2406952032177086e-05, + "loss": 0.81019837, + "num_input_tokens_seen": 382308560, + "router_z_loss_mlp": 0.31811523, + "step": 4617, + "time_per_iteration": 2.63232684135437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061015, + "balance_loss_mlp": 1.02890027, + "epoch": 0.8884186225471336, + "flos": 551560541184.0, + "grad_norm": 0.06903875760224201, + "language_loss": 0.83818024, + "learning_rate": 3.229670801173418e-05, + "loss": 0.84879041, + "num_input_tokens_seen": 382377504, + "router_z_loss_mlp": 0.32104492, + "step": 4618, + "time_per_iteration": 2.589545488357544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013111, + "balance_loss_mlp": 1.00505221, + "epoch": 0.888611004232397, + "flos": 1564417276416.0, + "grad_norm": 0.0068369418927251785, + "language_loss": 0.78512192, + "learning_rate": 3.218664556981288e-05, + "loss": 0.79525304, + "num_input_tokens_seen": 382615728, + "router_z_loss_mlp": 0.08056641, + "step": 4619, + "time_per_iteration": 5.003114938735962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062648, + "balance_loss_mlp": 1.03170085, + "epoch": 0.8888033859176606, + "flos": 766678252032.0, + "grad_norm": 0.057281222385008684, + "language_loss": 0.82745749, + "learning_rate": 3.207676474914301e-05, + "loss": 0.83808392, + "num_input_tokens_seen": 382695552, + "router_z_loss_mlp": 0.30908203, + "step": 4620, + "time_per_iteration": 2.990114212036133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061842, + "balance_loss_mlp": 1.0310626, + "epoch": 0.8889957676029242, + "flos": 933727758336.0, + "grad_norm": 0.053752902191243575, + "language_loss": 0.84139264, + "learning_rate": 3.1967065592384105e-05, + "loss": 0.85201108, + "num_input_tokens_seen": 382775824, + "router_z_loss_mlp": 0.30761719, + "step": 4621, + "time_per_iteration": 3.1363883018493652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064357, + "balance_loss_mlp": 1.03295684, + "epoch": 0.8891881492881878, + "flos": 589317313536.0, + "grad_norm": 0.057360134783463114, + "language_loss": 0.81454372, + "learning_rate": 3.1857548142125104e-05, + "loss": 0.82518733, + "num_input_tokens_seen": 382854464, + "router_z_loss_mlp": 0.3137207, + "step": 4622, + "time_per_iteration": 2.7618589401245117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060903, + "balance_loss_mlp": 1.02976584, + "epoch": 0.8893805309734514, + "flos": 540438621696.0, + "grad_norm": 0.06850653634595572, + "language_loss": 0.82143193, + "learning_rate": 3.174821244088466e-05, + "loss": 0.83204097, + "num_input_tokens_seen": 382925088, + "router_z_loss_mlp": 0.3112793, + "step": 4623, + "time_per_iteration": 2.7498483657836914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061404, + "balance_loss_mlp": 1.02990842, + "epoch": 0.8895729126587149, + "flos": 559827749376.0, + "grad_norm": 0.17707667007827743, + "language_loss": 0.81648695, + "learning_rate": 3.163905853111054e-05, + "loss": 0.82710099, + "num_input_tokens_seen": 382998640, + "router_z_loss_mlp": 0.31469727, + "step": 4624, + "time_per_iteration": 2.650419235229492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106132, + "balance_loss_mlp": 1.03027821, + "epoch": 0.8897652943439784, + "flos": 609873338880.0, + "grad_norm": 0.04962289154740808, + "language_loss": 0.81375515, + "learning_rate": 3.153008645517996e-05, + "loss": 0.82436836, + "num_input_tokens_seen": 383076000, + "router_z_loss_mlp": 0.31005859, + "step": 4625, + "time_per_iteration": 2.7451446056365967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062136, + "balance_loss_mlp": 1.03047383, + "epoch": 0.889957676029242, + "flos": 917455209984.0, + "grad_norm": 0.051652869550322736, + "language_loss": 0.77054471, + "learning_rate": 3.142129625539969e-05, + "loss": 0.78116608, + "num_input_tokens_seen": 383166640, + "router_z_loss_mlp": 0.31640625, + "step": 4626, + "time_per_iteration": 3.1661722660064697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063731, + "balance_loss_mlp": 1.03202164, + "epoch": 0.8901500577145056, + "flos": 488452114944.0, + "grad_norm": 0.056002822435115965, + "language_loss": 0.80131978, + "learning_rate": 3.131268797400588e-05, + "loss": 0.81195712, + "num_input_tokens_seen": 383232928, + "router_z_loss_mlp": 0.31689453, + "step": 4627, + "time_per_iteration": 2.566154718399048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062359, + "balance_loss_mlp": 1.03029203, + "epoch": 0.8903424393997691, + "flos": 733332994560.0, + "grad_norm": 0.06173508777641705, + "language_loss": 0.80719995, + "learning_rate": 3.120426165316398e-05, + "loss": 0.81782359, + "num_input_tokens_seen": 383314352, + "router_z_loss_mlp": 0.32055664, + "step": 4628, + "time_per_iteration": 2.9888558387756348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105903, + "balance_loss_mlp": 1.02784455, + "epoch": 0.8905348210850327, + "flos": 519546534912.0, + "grad_norm": 0.04930821670569134, + "language_loss": 0.81522822, + "learning_rate": 3.109601733496881e-05, + "loss": 0.82581854, + "num_input_tokens_seen": 383384848, + "router_z_loss_mlp": 0.31152344, + "step": 4629, + "time_per_iteration": 2.6437489986419678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060464, + "balance_loss_mlp": 1.02958894, + "epoch": 0.8907272027702963, + "flos": 578672640000.0, + "grad_norm": 0.05385866355437149, + "language_loss": 0.79690862, + "learning_rate": 3.098795506144458e-05, + "loss": 0.8075133, + "num_input_tokens_seen": 383463360, + "router_z_loss_mlp": 0.30834961, + "step": 4630, + "time_per_iteration": 2.810612916946411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061651, + "balance_loss_mlp": 1.03070378, + "epoch": 0.8909195844555599, + "flos": 893258869248.0, + "grad_norm": 0.052849257039567936, + "language_loss": 0.79265952, + "learning_rate": 3.088007487454475e-05, + "loss": 0.803276, + "num_input_tokens_seen": 383542080, + "router_z_loss_mlp": 0.30908203, + "step": 4631, + "time_per_iteration": 3.088334321975708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062189, + "balance_loss_mlp": 1.03069353, + "epoch": 0.8911119661408234, + "flos": 549596302848.0, + "grad_norm": 0.06712203160274297, + "language_loss": 0.84319258, + "learning_rate": 3.077237681615208e-05, + "loss": 0.85381448, + "num_input_tokens_seen": 383613056, + "router_z_loss_mlp": 0.31469727, + "step": 4632, + "time_per_iteration": 2.6473772525787354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061159, + "balance_loss_mlp": 1.02980685, + "epoch": 0.8913043478260869, + "flos": 480884732928.0, + "grad_norm": 0.07195593938803238, + "language_loss": 0.83490551, + "learning_rate": 3.066486092807874e-05, + "loss": 0.84551716, + "num_input_tokens_seen": 383683280, + "router_z_loss_mlp": 0.31323242, + "step": 4633, + "time_per_iteration": 2.620727777481079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062888, + "balance_loss_mlp": 1.03086805, + "epoch": 0.8914967295113505, + "flos": 484317024768.0, + "grad_norm": 0.04555285940128422, + "language_loss": 0.84773296, + "learning_rate": 3.055752725206601e-05, + "loss": 0.85836184, + "num_input_tokens_seen": 383754624, + "router_z_loss_mlp": 0.32006836, + "step": 4634, + "time_per_iteration": 2.618859052658081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060843, + "balance_loss_mlp": 1.02932405, + "epoch": 0.8916891111966141, + "flos": 445432642560.0, + "grad_norm": 0.0523806827340635, + "language_loss": 0.81158233, + "learning_rate": 3.0450375829784714e-05, + "loss": 0.82219076, + "num_input_tokens_seen": 383821984, + "router_z_loss_mlp": 0.31494141, + "step": 4635, + "time_per_iteration": 2.5323636531829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060849, + "balance_loss_mlp": 1.03002167, + "epoch": 0.8918814928818777, + "flos": 563751843840.0, + "grad_norm": 0.0513354141188765, + "language_loss": 0.78050125, + "learning_rate": 3.034340670283453e-05, + "loss": 0.79110974, + "num_input_tokens_seen": 383890880, + "router_z_loss_mlp": 0.30786133, + "step": 4636, + "time_per_iteration": 2.6924479007720947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060607, + "balance_loss_mlp": 1.03030384, + "epoch": 0.8920738745671412, + "flos": 575672514048.0, + "grad_norm": 0.04845445615899239, + "language_loss": 0.81120145, + "learning_rate": 3.0236619912744513e-05, + "loss": 0.8218075, + "num_input_tokens_seen": 383962480, + "router_z_loss_mlp": 0.30249023, + "step": 4637, + "time_per_iteration": 2.693192481994629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061314, + "balance_loss_mlp": 1.0303911, + "epoch": 0.8922662562524047, + "flos": 619898171904.0, + "grad_norm": 0.049196243556278496, + "language_loss": 0.84060216, + "learning_rate": 3.0130015500973163e-05, + "loss": 0.8512153, + "num_input_tokens_seen": 384033616, + "router_z_loss_mlp": 0.30883789, + "step": 4638, + "time_per_iteration": 2.7037692070007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060568, + "balance_loss_mlp": 1.02969277, + "epoch": 0.8924586379376683, + "flos": 583330056192.0, + "grad_norm": 0.05184193670463406, + "language_loss": 0.79242623, + "learning_rate": 3.0023593508907877e-05, + "loss": 0.80303186, + "num_input_tokens_seen": 384108848, + "router_z_loss_mlp": 0.30834961, + "step": 4639, + "time_per_iteration": 2.748689889907837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062785, + "balance_loss_mlp": 1.03164768, + "epoch": 0.8926510196229319, + "flos": 524922716160.0, + "grad_norm": 0.04558515504354127, + "language_loss": 0.8157109, + "learning_rate": 2.991735397786538e-05, + "loss": 0.82633877, + "num_input_tokens_seen": 384185728, + "router_z_loss_mlp": 0.31103516, + "step": 4640, + "time_per_iteration": 2.780665874481201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064798, + "balance_loss_mlp": 1.03318334, + "epoch": 0.8928434013081955, + "flos": 486428239872.0, + "grad_norm": 0.05672028214333359, + "language_loss": 0.80730885, + "learning_rate": 2.981129694909146e-05, + "loss": 0.81795681, + "num_input_tokens_seen": 384251552, + "router_z_loss_mlp": 0.31591797, + "step": 4641, + "time_per_iteration": 2.545320749282837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012709, + "balance_loss_mlp": 1.00441241, + "epoch": 0.893035782993459, + "flos": 1447580837376.0, + "grad_norm": 0.005693234754152928, + "language_loss": 0.80330861, + "learning_rate": 2.970542246376118e-05, + "loss": 0.81343567, + "num_input_tokens_seen": 384472176, + "router_z_loss_mlp": 0.08300781, + "step": 4642, + "time_per_iteration": 4.690560817718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061836, + "balance_loss_mlp": 1.03088951, + "epoch": 0.8932281646787226, + "flos": 611040236544.0, + "grad_norm": 0.05793428976399419, + "language_loss": 0.8072226, + "learning_rate": 2.95997305629786e-05, + "loss": 0.81784093, + "num_input_tokens_seen": 384544224, + "router_z_loss_mlp": 0.30908203, + "step": 4643, + "time_per_iteration": 2.758070945739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062801, + "balance_loss_mlp": 1.03104377, + "epoch": 0.8934205463639862, + "flos": 565494912000.0, + "grad_norm": 0.04973706186555829, + "language_loss": 0.84834957, + "learning_rate": 2.9494221287776957e-05, + "loss": 0.85897756, + "num_input_tokens_seen": 384611728, + "router_z_loss_mlp": 0.31738281, + "step": 4644, + "time_per_iteration": 2.6707870960235596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063096, + "balance_loss_mlp": 1.03217363, + "epoch": 0.8936129280492497, + "flos": 488181482496.0, + "grad_norm": 0.09316028593492325, + "language_loss": 0.77998525, + "learning_rate": 2.9388894679118484e-05, + "loss": 0.79061615, + "num_input_tokens_seen": 384678048, + "router_z_loss_mlp": 0.30883789, + "step": 4645, + "time_per_iteration": 2.5601553916931152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063396, + "balance_loss_mlp": 1.03168607, + "epoch": 0.8938053097345132, + "flos": 886095949824.0, + "grad_norm": 0.05248493446128753, + "language_loss": 0.8068549, + "learning_rate": 2.9283750777894912e-05, + "loss": 0.81748885, + "num_input_tokens_seen": 384766768, + "router_z_loss_mlp": 0.31689453, + "step": 4646, + "time_per_iteration": 3.2007439136505127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060405, + "balance_loss_mlp": 1.02924371, + "epoch": 0.8939976914197768, + "flos": 592999888896.0, + "grad_norm": 0.05511284894633522, + "language_loss": 0.83739501, + "learning_rate": 2.9178789624926427e-05, + "loss": 0.8479991, + "num_input_tokens_seen": 384842352, + "router_z_loss_mlp": 0.3112793, + "step": 4647, + "time_per_iteration": 2.709075927734375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065548, + "balance_loss_mlp": 1.03357601, + "epoch": 0.8941900731050404, + "flos": 522983208960.0, + "grad_norm": 0.056894932724212664, + "language_loss": 0.80778831, + "learning_rate": 2.9074011260962706e-05, + "loss": 0.81844378, + "num_input_tokens_seen": 384912048, + "router_z_loss_mlp": 0.31958008, + "step": 4648, + "time_per_iteration": 2.6082539558410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061533, + "balance_loss_mlp": 1.03051448, + "epoch": 0.894382454790304, + "flos": 800247651840.0, + "grad_norm": 0.04566115166749404, + "language_loss": 0.80567217, + "learning_rate": 2.8969415726682158e-05, + "loss": 0.8162874, + "num_input_tokens_seen": 384986560, + "router_z_loss_mlp": 0.30981445, + "step": 4649, + "time_per_iteration": 2.979668140411377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106435, + "balance_loss_mlp": 1.03302193, + "epoch": 0.8945748364755676, + "flos": 478782282240.0, + "grad_norm": 0.05084175765827824, + "language_loss": 0.84974194, + "learning_rate": 2.8865003062692517e-05, + "loss": 0.86038542, + "num_input_tokens_seen": 385057376, + "router_z_loss_mlp": 0.31298828, + "step": 4650, + "time_per_iteration": 2.5919971466064453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061126, + "balance_loss_mlp": 1.03068006, + "epoch": 0.894767218160831, + "flos": 508507573248.0, + "grad_norm": 0.050809321075872965, + "language_loss": 0.82988006, + "learning_rate": 2.876077330953042e-05, + "loss": 0.84049129, + "num_input_tokens_seen": 385130880, + "router_z_loss_mlp": 0.30395508, + "step": 4651, + "time_per_iteration": 2.7233057022094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058609, + "balance_loss_mlp": 1.02763844, + "epoch": 0.8949595998460946, + "flos": 685557181440.0, + "grad_norm": 0.06487677306684464, + "language_loss": 0.81605327, + "learning_rate": 2.8656726507661378e-05, + "loss": 0.82663941, + "num_input_tokens_seen": 385205808, + "router_z_loss_mlp": 0.30932617, + "step": 4652, + "time_per_iteration": 2.82380747795105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061293, + "balance_loss_mlp": 1.02941608, + "epoch": 0.8951519815313582, + "flos": 799578349056.0, + "grad_norm": 0.05081684186853934, + "language_loss": 0.7694239, + "learning_rate": 2.855286269747981e-05, + "loss": 0.78003681, + "num_input_tokens_seen": 385283616, + "router_z_loss_mlp": 0.31860352, + "step": 4653, + "time_per_iteration": 2.9739062786102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059724, + "balance_loss_mlp": 1.02849102, + "epoch": 0.8953443632166218, + "flos": 666443068416.0, + "grad_norm": 0.06375205061358989, + "language_loss": 0.85606253, + "learning_rate": 2.8449181919309398e-05, + "loss": 0.86665976, + "num_input_tokens_seen": 385357488, + "router_z_loss_mlp": 0.31201172, + "step": 4654, + "time_per_iteration": 2.8078479766845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057999, + "balance_loss_mlp": 1.02690959, + "epoch": 0.8955367449018854, + "flos": 644670683136.0, + "grad_norm": 0.04984422394174067, + "language_loss": 0.83020389, + "learning_rate": 2.8345684213402556e-05, + "loss": 0.84078383, + "num_input_tokens_seen": 385431280, + "router_z_loss_mlp": 0.31054688, + "step": 4655, + "time_per_iteration": 2.814558506011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062912, + "balance_loss_mlp": 1.03113103, + "epoch": 0.8957291265871489, + "flos": 808353326592.0, + "grad_norm": 0.053021459210243815, + "language_loss": 0.77264309, + "learning_rate": 2.8242369619940644e-05, + "loss": 0.78327227, + "num_input_tokens_seen": 385509840, + "router_z_loss_mlp": 0.31762695, + "step": 4656, + "time_per_iteration": 3.0364105701446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062996, + "balance_loss_mlp": 1.03104842, + "epoch": 0.8959215082724125, + "flos": 518664826368.0, + "grad_norm": 0.06969643798779511, + "language_loss": 0.77000499, + "learning_rate": 2.813923817903391e-05, + "loss": 0.78063488, + "num_input_tokens_seen": 385580384, + "router_z_loss_mlp": 0.31933594, + "step": 4657, + "time_per_iteration": 2.6151626110076904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106056, + "balance_loss_mlp": 1.02889752, + "epoch": 0.896113889957676, + "flos": 476669657088.0, + "grad_norm": 0.04964287244699384, + "language_loss": 0.76999301, + "learning_rate": 2.8036289930721603e-05, + "loss": 0.78059864, + "num_input_tokens_seen": 385649184, + "router_z_loss_mlp": 0.31640625, + "step": 4658, + "time_per_iteration": 2.5889346599578857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062157, + "balance_loss_mlp": 1.03051877, + "epoch": 0.8963062716429396, + "flos": 517911155712.0, + "grad_norm": 0.05573202137448351, + "language_loss": 0.82991636, + "learning_rate": 2.7933524914971697e-05, + "loss": 0.84053797, + "num_input_tokens_seen": 385717072, + "router_z_loss_mlp": 0.31616211, + "step": 4659, + "time_per_iteration": 2.6229076385498047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059463, + "balance_loss_mlp": 1.02868307, + "epoch": 0.8964986533282031, + "flos": 508231148544.0, + "grad_norm": 0.05335291293119473, + "language_loss": 0.81595254, + "learning_rate": 2.7830943171681113e-05, + "loss": 0.82654721, + "num_input_tokens_seen": 385788880, + "router_z_loss_mlp": 0.30737305, + "step": 4660, + "time_per_iteration": 2.6678032875061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061446, + "balance_loss_mlp": 1.03007066, + "epoch": 0.8966910350134667, + "flos": 535819083264.0, + "grad_norm": 0.05953051105641742, + "language_loss": 0.81392318, + "learning_rate": 2.77285447406756e-05, + "loss": 0.82453763, + "num_input_tokens_seen": 385854240, + "router_z_loss_mlp": 0.31347656, + "step": 4661, + "time_per_iteration": 2.5935118198394775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061865, + "balance_loss_mlp": 1.03110909, + "epoch": 0.8968834166987303, + "flos": 722909491200.0, + "grad_norm": 0.053847981098818644, + "language_loss": 0.83905041, + "learning_rate": 2.7626329661709914e-05, + "loss": 0.8496691, + "num_input_tokens_seen": 385926080, + "router_z_loss_mlp": 0.30712891, + "step": 4662, + "time_per_iteration": 2.8665292263031006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064722, + "balance_loss_mlp": 1.0333935, + "epoch": 0.8970757983839939, + "flos": 681372628992.0, + "grad_norm": 0.04437914022124262, + "language_loss": 0.83813488, + "learning_rate": 2.7524297974467372e-05, + "loss": 0.84878206, + "num_input_tokens_seen": 386005696, + "router_z_loss_mlp": 0.31298828, + "step": 4663, + "time_per_iteration": 2.8976876735687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059748, + "balance_loss_mlp": 1.02832484, + "epoch": 0.8972681800692575, + "flos": 612758573568.0, + "grad_norm": 0.06417585918010674, + "language_loss": 0.75612116, + "learning_rate": 2.742244971856006e-05, + "loss": 0.76671863, + "num_input_tokens_seen": 386073248, + "router_z_loss_mlp": 0.31396484, + "step": 4664, + "time_per_iteration": 2.703761577606201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061274, + "balance_loss_mlp": 1.0300653, + "epoch": 0.8974605617545209, + "flos": 572064132096.0, + "grad_norm": 0.05003512602131667, + "language_loss": 0.83072126, + "learning_rate": 2.732078493352913e-05, + "loss": 0.84133399, + "num_input_tokens_seen": 386148528, + "router_z_loss_mlp": 0.31176758, + "step": 4665, + "time_per_iteration": 2.7152178287506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062922, + "balance_loss_mlp": 1.03080714, + "epoch": 0.8976529434397845, + "flos": 520147436544.0, + "grad_norm": 0.04771841444398887, + "language_loss": 0.87391418, + "learning_rate": 2.721930365884434e-05, + "loss": 0.88454342, + "num_input_tokens_seen": 386218528, + "router_z_loss_mlp": 0.32104492, + "step": 4666, + "time_per_iteration": 2.6735920906066895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057794, + "balance_loss_mlp": 1.02670431, + "epoch": 0.8978453251250481, + "flos": 471124740096.0, + "grad_norm": 0.0485725814683155, + "language_loss": 0.82510161, + "learning_rate": 2.7118005933904176e-05, + "loss": 0.83567965, + "num_input_tokens_seen": 386284704, + "router_z_loss_mlp": 0.31054688, + "step": 4667, + "time_per_iteration": 2.604840040206909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058903, + "balance_loss_mlp": 1.02862406, + "epoch": 0.8980377068103117, + "flos": 591370301952.0, + "grad_norm": 0.051378272948665586, + "language_loss": 0.81776893, + "learning_rate": 2.7016891798035904e-05, + "loss": 0.82835793, + "num_input_tokens_seen": 386356128, + "router_z_loss_mlp": 0.30249023, + "step": 4668, + "time_per_iteration": 2.750239372253418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063468, + "balance_loss_mlp": 1.0324496, + "epoch": 0.8982300884955752, + "flos": 767287918080.0, + "grad_norm": 0.06911870880947439, + "language_loss": 0.82571542, + "learning_rate": 2.691596129049556e-05, + "loss": 0.83635008, + "num_input_tokens_seen": 386434048, + "router_z_loss_mlp": 0.31005859, + "step": 4669, + "time_per_iteration": 2.945383071899414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106161, + "balance_loss_mlp": 1.03099728, + "epoch": 0.8984224701808388, + "flos": 844189530624.0, + "grad_norm": 0.06637017917952584, + "language_loss": 0.7722441, + "learning_rate": 2.681521445046775e-05, + "loss": 0.78286028, + "num_input_tokens_seen": 386532384, + "router_z_loss_mlp": 0.30566406, + "step": 4670, + "time_per_iteration": 3.198310613632202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062976, + "balance_loss_mlp": 1.03155208, + "epoch": 0.8986148518661023, + "flos": 757303782912.0, + "grad_norm": 0.07008375711524328, + "language_loss": 0.75845528, + "learning_rate": 2.6714651317065963e-05, + "loss": 0.76908505, + "num_input_tokens_seen": 386627120, + "router_z_loss_mlp": 0.31396484, + "step": 4671, + "time_per_iteration": 3.1156165599823 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061856, + "balance_loss_mlp": 1.03047979, + "epoch": 0.8988072335513659, + "flos": 562801734144.0, + "grad_norm": 0.05163883650190103, + "language_loss": 0.76486373, + "learning_rate": 2.6614271929332133e-05, + "loss": 0.7754823, + "num_input_tokens_seen": 386700192, + "router_z_loss_mlp": 0.31347656, + "step": 4672, + "time_per_iteration": 2.671839475631714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062538, + "balance_loss_mlp": 1.03097177, + "epoch": 0.8989996152366295, + "flos": 492440228352.0, + "grad_norm": 0.05196577286527717, + "language_loss": 0.86882824, + "learning_rate": 2.6514076326237147e-05, + "loss": 0.87945366, + "num_input_tokens_seen": 386764256, + "router_z_loss_mlp": 0.31542969, + "step": 4673, + "time_per_iteration": 2.5203633308410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060918, + "balance_loss_mlp": 1.02963722, + "epoch": 0.899191996921893, + "flos": 542303935488.0, + "grad_norm": 0.06061502607868415, + "language_loss": 0.758295, + "learning_rate": 2.6414064546680438e-05, + "loss": 0.76890421, + "num_input_tokens_seen": 386835792, + "router_z_loss_mlp": 0.3125, + "step": 4674, + "time_per_iteration": 2.6241261959075928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060398, + "balance_loss_mlp": 1.02914178, + "epoch": 0.8993843786071566, + "flos": 471081070080.0, + "grad_norm": 0.052429553353469285, + "language_loss": 0.80238754, + "learning_rate": 2.631423662948984e-05, + "loss": 0.8129915, + "num_input_tokens_seen": 386904368, + "router_z_loss_mlp": 0.31225586, + "step": 4675, + "time_per_iteration": 2.5443856716156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062814, + "balance_loss_mlp": 1.03170013, + "epoch": 0.8995767602924202, + "flos": 526454788608.0, + "grad_norm": 0.04980258004254359, + "language_loss": 0.82579398, + "learning_rate": 2.621459261342196e-05, + "loss": 0.83642209, + "num_input_tokens_seen": 386977872, + "router_z_loss_mlp": 0.31079102, + "step": 4676, + "time_per_iteration": 2.721583127975464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061451, + "balance_loss_mlp": 1.02997994, + "epoch": 0.8997691419776838, + "flos": 557365916160.0, + "grad_norm": 0.08112559791576887, + "language_loss": 0.84614646, + "learning_rate": 2.6115132537162245e-05, + "loss": 0.85676098, + "num_input_tokens_seen": 387052080, + "router_z_loss_mlp": 0.31445312, + "step": 4677, + "time_per_iteration": 2.678091049194336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064491, + "balance_loss_mlp": 1.03287649, + "epoch": 0.8999615236629472, + "flos": 638722713600.0, + "grad_norm": 0.05386852580878479, + "language_loss": 0.8060981, + "learning_rate": 2.601585643932436e-05, + "loss": 0.81674302, + "num_input_tokens_seen": 387129712, + "router_z_loss_mlp": 0.31591797, + "step": 4678, + "time_per_iteration": 2.8065719604492188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008515, + "balance_loss_mlp": 1.00031304, + "epoch": 0.9001539053482108, + "flos": 1430743703040.0, + "grad_norm": 0.0043436947203847566, + "language_loss": 0.85784018, + "learning_rate": 2.5916764358450862e-05, + "loss": 0.86792541, + "num_input_tokens_seen": 387356560, + "router_z_loss_mlp": 0.08203125, + "step": 4679, + "time_per_iteration": 4.774789810180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063627, + "balance_loss_mlp": 1.03268027, + "epoch": 0.9003462870334744, + "flos": 566589026304.0, + "grad_norm": 0.06797302187822865, + "language_loss": 0.79665619, + "learning_rate": 2.5817856333012425e-05, + "loss": 0.80729246, + "num_input_tokens_seen": 387438640, + "router_z_loss_mlp": 0.30908203, + "step": 4680, + "time_per_iteration": 2.839365243911743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063395, + "balance_loss_mlp": 1.03173351, + "epoch": 0.900538668718738, + "flos": 538394397696.0, + "grad_norm": 0.051508441084865235, + "language_loss": 0.78311312, + "learning_rate": 2.5719132401408883e-05, + "loss": 0.79374701, + "num_input_tokens_seen": 387507088, + "router_z_loss_mlp": 0.31640625, + "step": 4681, + "time_per_iteration": 2.6403775215148926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062917, + "balance_loss_mlp": 1.03225613, + "epoch": 0.9007310504040016, + "flos": 488146576896.0, + "grad_norm": 0.06865687057695076, + "language_loss": 0.85842234, + "learning_rate": 2.5620592601968028e-05, + "loss": 0.86905152, + "num_input_tokens_seen": 387574160, + "router_z_loss_mlp": 0.30615234, + "step": 4682, + "time_per_iteration": 2.5301215648651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061072, + "balance_loss_mlp": 1.02983928, + "epoch": 0.9009234320892651, + "flos": 652593065472.0, + "grad_norm": 0.06244574695746065, + "language_loss": 0.78513706, + "learning_rate": 2.5522236972946532e-05, + "loss": 0.79574782, + "num_input_tokens_seen": 387652528, + "router_z_loss_mlp": 0.31201172, + "step": 4683, + "time_per_iteration": 2.8237221240997314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061087, + "balance_loss_mlp": 1.02878177, + "epoch": 0.9011158137745287, + "flos": 545302651392.0, + "grad_norm": 0.04806790950859059, + "language_loss": 0.85241842, + "learning_rate": 2.5424065552529295e-05, + "loss": 0.86302924, + "num_input_tokens_seen": 387723520, + "router_z_loss_mlp": 0.32299805, + "step": 4684, + "time_per_iteration": 2.6552274227142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059354, + "balance_loss_mlp": 1.02890825, + "epoch": 0.9013081954597922, + "flos": 559429079040.0, + "grad_norm": 0.06508023738808166, + "language_loss": 0.82589149, + "learning_rate": 2.532607837883011e-05, + "loss": 0.83648503, + "num_input_tokens_seen": 387793664, + "router_z_loss_mlp": 0.30395508, + "step": 4685, + "time_per_iteration": 2.766566753387451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060398, + "balance_loss_mlp": 1.02921259, + "epoch": 0.9015005771450558, + "flos": 728330752512.0, + "grad_norm": 0.05594908680106596, + "language_loss": 0.81363046, + "learning_rate": 2.5228275489890706e-05, + "loss": 0.82423443, + "num_input_tokens_seen": 387871008, + "router_z_loss_mlp": 0.31176758, + "step": 4686, + "time_per_iteration": 2.903522491455078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060619, + "balance_loss_mlp": 1.02867079, + "epoch": 0.9016929588303193, + "flos": 517148720640.0, + "grad_norm": 0.04850157903091311, + "language_loss": 0.80952024, + "learning_rate": 2.5130656923681605e-05, + "loss": 0.82012641, + "num_input_tokens_seen": 387950832, + "router_z_loss_mlp": 0.31933594, + "step": 4687, + "time_per_iteration": 2.770630121231079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060384, + "balance_loss_mlp": 1.02941346, + "epoch": 0.9018853405155829, + "flos": 622031145984.0, + "grad_norm": 0.05618517422813967, + "language_loss": 0.8593204, + "learning_rate": 2.503322271810171e-05, + "loss": 0.86992431, + "num_input_tokens_seen": 388029792, + "router_z_loss_mlp": 0.30932617, + "step": 4688, + "time_per_iteration": 2.8023810386657715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106025, + "balance_loss_mlp": 1.02908909, + "epoch": 0.9020777222008465, + "flos": 523022496768.0, + "grad_norm": 0.05048030300979413, + "language_loss": 0.77799124, + "learning_rate": 2.4935972910978378e-05, + "loss": 0.78859371, + "num_input_tokens_seen": 388095872, + "router_z_loss_mlp": 0.3112793, + "step": 4689, + "time_per_iteration": 2.626427412033081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059848, + "balance_loss_mlp": 1.02890086, + "epoch": 0.9022701038861101, + "flos": 633419315712.0, + "grad_norm": 0.05593641687528262, + "language_loss": 0.81798267, + "learning_rate": 2.4838907540067346e-05, + "loss": 0.82858115, + "num_input_tokens_seen": 388171632, + "router_z_loss_mlp": 0.30908203, + "step": 4690, + "time_per_iteration": 2.8088419437408447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065299, + "balance_loss_mlp": 1.03471041, + "epoch": 0.9024624855713737, + "flos": 513036951552.0, + "grad_norm": 0.04860641304257661, + "language_loss": 0.84015805, + "learning_rate": 2.474202664305253e-05, + "loss": 0.850811, + "num_input_tokens_seen": 388242240, + "router_z_loss_mlp": 0.30541992, + "step": 4691, + "time_per_iteration": 2.6090428829193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060526, + "balance_loss_mlp": 1.02977014, + "epoch": 0.9026548672566371, + "flos": 477152695296.0, + "grad_norm": 0.07265058382258091, + "language_loss": 0.86403483, + "learning_rate": 2.464533025754673e-05, + "loss": 0.87464011, + "num_input_tokens_seen": 388310960, + "router_z_loss_mlp": 0.30712891, + "step": 4692, + "time_per_iteration": 2.6414620876312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063388, + "balance_loss_mlp": 1.03158331, + "epoch": 0.9028472489419007, + "flos": 661701284352.0, + "grad_norm": 0.050677487333482145, + "language_loss": 0.73312789, + "learning_rate": 2.454881842109058e-05, + "loss": 0.74376178, + "num_input_tokens_seen": 388387280, + "router_z_loss_mlp": 0.31787109, + "step": 4693, + "time_per_iteration": 2.8417153358459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106063, + "balance_loss_mlp": 1.02927816, + "epoch": 0.9030396306271643, + "flos": 534332090880.0, + "grad_norm": 0.052476495010180334, + "language_loss": 0.8169986, + "learning_rate": 2.4452491171153445e-05, + "loss": 0.82760489, + "num_input_tokens_seen": 388456992, + "router_z_loss_mlp": 0.31323242, + "step": 4694, + "time_per_iteration": 2.6444764137268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062559, + "balance_loss_mlp": 1.03151679, + "epoch": 0.9032320123124279, + "flos": 800695784448.0, + "grad_norm": 0.05285845656928848, + "language_loss": 0.82164681, + "learning_rate": 2.43563485451328e-05, + "loss": 0.83227229, + "num_input_tokens_seen": 388534896, + "router_z_loss_mlp": 0.31005859, + "step": 4695, + "time_per_iteration": 2.9648303985595703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105879, + "balance_loss_mlp": 1.02741396, + "epoch": 0.9034243939976914, + "flos": 553673166336.0, + "grad_norm": 0.06654317576841562, + "language_loss": 0.76353633, + "learning_rate": 2.426039058035451e-05, + "loss": 0.77412426, + "num_input_tokens_seen": 388606640, + "router_z_loss_mlp": 0.31347656, + "step": 4696, + "time_per_iteration": 2.6323938369750977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061699, + "balance_loss_mlp": 1.03065646, + "epoch": 0.903616775682955, + "flos": 503656690176.0, + "grad_norm": 0.05234220926358092, + "language_loss": 0.82479656, + "learning_rate": 2.4164617314072823e-05, + "loss": 0.83541358, + "num_input_tokens_seen": 388675920, + "router_z_loss_mlp": 0.31005859, + "step": 4697, + "time_per_iteration": 2.598928928375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060807, + "balance_loss_mlp": 1.02962184, + "epoch": 0.9038091573682185, + "flos": 436058173440.0, + "grad_norm": 0.05082677150360358, + "language_loss": 0.7861774, + "learning_rate": 2.406902878347017e-05, + "loss": 0.79678547, + "num_input_tokens_seen": 388743968, + "router_z_loss_mlp": 0.31176758, + "step": 4698, + "time_per_iteration": 2.60606050491333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059475, + "balance_loss_mlp": 1.0276463, + "epoch": 0.9040015390534821, + "flos": 532648659456.0, + "grad_norm": 0.06214700158023469, + "language_loss": 0.81251138, + "learning_rate": 2.3973625025657253e-05, + "loss": 0.82310611, + "num_input_tokens_seen": 388810784, + "router_z_loss_mlp": 0.31811523, + "step": 4699, + "time_per_iteration": 2.6206655502319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057598, + "balance_loss_mlp": 1.02665126, + "epoch": 0.9041939207387457, + "flos": 564028268544.0, + "grad_norm": 0.06726582605850791, + "language_loss": 0.80017805, + "learning_rate": 2.3878406077673275e-05, + "loss": 0.810754, + "num_input_tokens_seen": 388885072, + "router_z_loss_mlp": 0.30908203, + "step": 4700, + "time_per_iteration": 2.755746364593506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059864, + "balance_loss_mlp": 1.02913213, + "epoch": 0.9043863024240092, + "flos": 515257265664.0, + "grad_norm": 0.06510896632722754, + "language_loss": 0.77814531, + "learning_rate": 2.3783371976485447e-05, + "loss": 0.78874397, + "num_input_tokens_seen": 388951184, + "router_z_loss_mlp": 0.30688477, + "step": 4701, + "time_per_iteration": 2.5619757175445557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008403, + "balance_loss_mlp": 1.00024879, + "epoch": 0.9045786841092728, + "flos": 1277243043840.0, + "grad_norm": 0.003958799533886951, + "language_loss": 0.72929788, + "learning_rate": 2.368852275898914e-05, + "loss": 0.73938191, + "num_input_tokens_seen": 389170752, + "router_z_loss_mlp": 0.08154297, + "step": 4702, + "time_per_iteration": 4.942458152770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062191, + "balance_loss_mlp": 1.03133917, + "epoch": 0.9047710657945364, + "flos": 585569309184.0, + "grad_norm": 0.05494675450974493, + "language_loss": 0.82736337, + "learning_rate": 2.3593858462008178e-05, + "loss": 0.83798528, + "num_input_tokens_seen": 389239600, + "router_z_loss_mlp": 0.30810547, + "step": 4703, + "time_per_iteration": 2.676253080368042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061667, + "balance_loss_mlp": 1.03017187, + "epoch": 0.9049634474798, + "flos": 571655287296.0, + "grad_norm": 0.05299767963476469, + "language_loss": 0.79625463, + "learning_rate": 2.3499379122294495e-05, + "loss": 0.80687135, + "num_input_tokens_seen": 389316032, + "router_z_loss_mlp": 0.31469727, + "step": 4704, + "time_per_iteration": 2.7089710235595703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061381, + "balance_loss_mlp": 1.03081548, + "epoch": 0.9051558291650635, + "flos": 572353703424.0, + "grad_norm": 0.06198220417102737, + "language_loss": 0.74331594, + "learning_rate": 2.3405084776528307e-05, + "loss": 0.75392973, + "num_input_tokens_seen": 389383504, + "router_z_loss_mlp": 0.30517578, + "step": 4705, + "time_per_iteration": 2.657379388809204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061059, + "balance_loss_mlp": 1.03013611, + "epoch": 0.905348210850327, + "flos": 540280060416.0, + "grad_norm": 0.07947376611636264, + "language_loss": 0.79365158, + "learning_rate": 2.331097546131783e-05, + "loss": 0.80426216, + "num_input_tokens_seen": 389454592, + "router_z_loss_mlp": 0.30883789, + "step": 4706, + "time_per_iteration": 2.6540727615356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064249, + "balance_loss_mlp": 1.03370762, + "epoch": 0.9055405925355906, + "flos": 516128799744.0, + "grad_norm": 0.057615129617973604, + "language_loss": 0.81330758, + "learning_rate": 2.321705121319956e-05, + "loss": 0.82395005, + "num_input_tokens_seen": 389519696, + "router_z_loss_mlp": 0.30493164, + "step": 4707, + "time_per_iteration": 2.5897743701934814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056914, + "balance_loss_mlp": 1.02506149, + "epoch": 0.9057329742208542, + "flos": 914249880576.0, + "grad_norm": 0.04603068937294546, + "language_loss": 0.84972519, + "learning_rate": 2.3123312068638104e-05, + "loss": 0.86029434, + "num_input_tokens_seen": 389603568, + "router_z_loss_mlp": 0.31835938, + "step": 4708, + "time_per_iteration": 3.160703420639038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010632, + "balance_loss_mlp": 1.03206229, + "epoch": 0.9059253559061178, + "flos": 904884175872.0, + "grad_norm": 0.16660465263722607, + "language_loss": 0.82760024, + "learning_rate": 2.3029758064026295e-05, + "loss": 0.83823222, + "num_input_tokens_seen": 389687504, + "router_z_loss_mlp": 0.31103516, + "step": 4709, + "time_per_iteration": 3.179295301437378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059241, + "balance_loss_mlp": 1.02836561, + "epoch": 0.9061177375913813, + "flos": 664218372096.0, + "grad_norm": 0.06166960355776129, + "language_loss": 0.77393854, + "learning_rate": 2.2936389235684918e-05, + "loss": 0.78453094, + "num_input_tokens_seen": 389764880, + "router_z_loss_mlp": 0.30834961, + "step": 4710, + "time_per_iteration": 2.8492090702056885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063495, + "balance_loss_mlp": 1.03223789, + "epoch": 0.9063101192766448, + "flos": 565318821888.0, + "grad_norm": 0.05907794054329625, + "language_loss": 0.82644868, + "learning_rate": 2.2843205619862972e-05, + "loss": 0.8370837, + "num_input_tokens_seen": 389838304, + "router_z_loss_mlp": 0.31225586, + "step": 4711, + "time_per_iteration": 2.7433969974517822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106258, + "balance_loss_mlp": 1.03234863, + "epoch": 0.9065025009619084, + "flos": 727064930304.0, + "grad_norm": 0.06819697260441993, + "language_loss": 0.78757668, + "learning_rate": 2.2750207252737742e-05, + "loss": 0.79820251, + "num_input_tokens_seen": 389908592, + "router_z_loss_mlp": 0.30175781, + "step": 4712, + "time_per_iteration": 2.885631799697876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061608, + "balance_loss_mlp": 1.03123391, + "epoch": 0.906694882647172, + "flos": 531254799360.0, + "grad_norm": 0.06086888254866861, + "language_loss": 0.79970586, + "learning_rate": 2.265739417041418e-05, + "loss": 0.81032193, + "num_input_tokens_seen": 389979040, + "router_z_loss_mlp": 0.30322266, + "step": 4713, + "time_per_iteration": 2.6492934226989746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065061, + "balance_loss_mlp": 1.03363752, + "epoch": 0.9068872643324356, + "flos": 429563146752.0, + "grad_norm": 0.05345280060341207, + "language_loss": 0.84838974, + "learning_rate": 2.2564766408925574e-05, + "loss": 0.85904038, + "num_input_tokens_seen": 390046080, + "router_z_loss_mlp": 0.31396484, + "step": 4714, + "time_per_iteration": 2.578385591506958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061674, + "balance_loss_mlp": 1.03034616, + "epoch": 0.9070796460176991, + "flos": 588095161344.0, + "grad_norm": 0.054473857957546834, + "language_loss": 0.79786414, + "learning_rate": 2.2472324004233214e-05, + "loss": 0.80848086, + "num_input_tokens_seen": 390122176, + "router_z_loss_mlp": 0.31298828, + "step": 4715, + "time_per_iteration": 2.7411398887634277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062125, + "balance_loss_mlp": 1.03082108, + "epoch": 0.9072720277029627, + "flos": 571314843648.0, + "grad_norm": 0.06136918280584941, + "language_loss": 0.7556839, + "learning_rate": 2.2380066992226446e-05, + "loss": 0.76630509, + "num_input_tokens_seen": 390195216, + "router_z_loss_mlp": 0.31274414, + "step": 4716, + "time_per_iteration": 2.717400550842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065363, + "balance_loss_mlp": 1.03432047, + "epoch": 0.9074644093882263, + "flos": 555534097920.0, + "grad_norm": 0.05054647084062828, + "language_loss": 0.88467407, + "learning_rate": 2.2287995408722617e-05, + "loss": 0.89532775, + "num_input_tokens_seen": 390263216, + "router_z_loss_mlp": 0.31005859, + "step": 4717, + "time_per_iteration": 2.626262664794922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065114, + "balance_loss_mlp": 1.0337857, + "epoch": 0.9076567910734898, + "flos": 640701508608.0, + "grad_norm": 0.05014211489878531, + "language_loss": 0.82399035, + "learning_rate": 2.2196109289467083e-05, + "loss": 0.83464146, + "num_input_tokens_seen": 390337360, + "router_z_loss_mlp": 0.31298828, + "step": 4718, + "time_per_iteration": 2.8218960762023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063784, + "balance_loss_mlp": 1.03300405, + "epoch": 0.9078491727587533, + "flos": 733635560448.0, + "grad_norm": 0.05294662816605839, + "language_loss": 0.81557, + "learning_rate": 2.2104408670133193e-05, + "loss": 0.82620788, + "num_input_tokens_seen": 390427728, + "router_z_loss_mlp": 0.30737305, + "step": 4719, + "time_per_iteration": 3.107689142227173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063177, + "balance_loss_mlp": 1.0316577, + "epoch": 0.9080415544440169, + "flos": 654464171520.0, + "grad_norm": 0.05391534489649744, + "language_loss": 0.86544436, + "learning_rate": 2.2012893586322245e-05, + "loss": 0.8760761, + "num_input_tokens_seen": 390504736, + "router_z_loss_mlp": 0.31494141, + "step": 4720, + "time_per_iteration": 2.8423755168914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060877, + "balance_loss_mlp": 1.02964377, + "epoch": 0.9082339361292805, + "flos": 597180059136.0, + "grad_norm": 0.051732261345012694, + "language_loss": 0.79402268, + "learning_rate": 2.1921564073563604e-05, + "loss": 0.80463141, + "num_input_tokens_seen": 390582048, + "router_z_loss_mlp": 0.31201172, + "step": 4721, + "time_per_iteration": 2.7443206310272217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063085, + "balance_loss_mlp": 1.03211474, + "epoch": 0.9084263178145441, + "flos": 504154285056.0, + "grad_norm": 0.05049795376918643, + "language_loss": 0.84334135, + "learning_rate": 2.183042016731457e-05, + "loss": 0.8539722, + "num_input_tokens_seen": 390652976, + "router_z_loss_mlp": 0.30932617, + "step": 4722, + "time_per_iteration": 2.6413490772247314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063234, + "balance_loss_mlp": 1.03235853, + "epoch": 0.9086186994998077, + "flos": 549763628544.0, + "grad_norm": 0.052887401454076326, + "language_loss": 0.8025831, + "learning_rate": 2.1739461902960223e-05, + "loss": 0.81321543, + "num_input_tokens_seen": 390726832, + "router_z_loss_mlp": 0.30834961, + "step": 4723, + "time_per_iteration": 2.7238101959228516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059634, + "balance_loss_mlp": 1.02861619, + "epoch": 0.9088110811850711, + "flos": 1133620545024.0, + "grad_norm": 0.049238077529050184, + "language_loss": 0.75059247, + "learning_rate": 2.1648689315813763e-05, + "loss": 0.76118881, + "num_input_tokens_seen": 390824480, + "router_z_loss_mlp": 0.30981445, + "step": 4724, + "time_per_iteration": 3.576720952987671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063217, + "balance_loss_mlp": 1.03165007, + "epoch": 0.9090034628703347, + "flos": 556725726720.0, + "grad_norm": 0.0503925655199207, + "language_loss": 0.76640475, + "learning_rate": 2.155810244111628e-05, + "loss": 0.77703691, + "num_input_tokens_seen": 390897552, + "router_z_loss_mlp": 0.31542969, + "step": 4725, + "time_per_iteration": 2.6426239013671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061991, + "balance_loss_mlp": 1.03090096, + "epoch": 0.9091958445555983, + "flos": 543697795584.0, + "grad_norm": 0.05168476660108364, + "language_loss": 0.84019625, + "learning_rate": 2.146770131403658e-05, + "loss": 0.85081613, + "num_input_tokens_seen": 390969008, + "router_z_loss_mlp": 0.31054688, + "step": 4726, + "time_per_iteration": 2.671800374984741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062231, + "balance_loss_mlp": 1.03128409, + "epoch": 0.9093882262408619, + "flos": 525858269184.0, + "grad_norm": 0.057180053188060825, + "language_loss": 0.81223357, + "learning_rate": 2.1377485969671594e-05, + "loss": 0.82285595, + "num_input_tokens_seen": 391038880, + "router_z_loss_mlp": 0.30908203, + "step": 4727, + "time_per_iteration": 2.626508951187134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059711, + "balance_loss_mlp": 1.02893078, + "epoch": 0.9095806079261254, + "flos": 548266461696.0, + "grad_norm": 0.059368213087244666, + "language_loss": 0.81565529, + "learning_rate": 2.1287456443046084e-05, + "loss": 0.82625234, + "num_input_tokens_seen": 391106720, + "router_z_loss_mlp": 0.30737305, + "step": 4728, + "time_per_iteration": 2.679184913635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062462, + "balance_loss_mlp": 1.03063333, + "epoch": 0.909772989611389, + "flos": 572260571136.0, + "grad_norm": 0.11685858483587666, + "language_loss": 0.8463881, + "learning_rate": 2.1197612769112528e-05, + "loss": 0.85701275, + "num_input_tokens_seen": 391178128, + "router_z_loss_mlp": 0.31811523, + "step": 4729, + "time_per_iteration": 2.692808151245117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062924, + "balance_loss_mlp": 1.03154778, + "epoch": 0.9099653712966526, + "flos": 561546086400.0, + "grad_norm": 0.06067152965418052, + "language_loss": 0.79611409, + "learning_rate": 2.1107954982751254e-05, + "loss": 0.80674326, + "num_input_tokens_seen": 391248848, + "router_z_loss_mlp": 0.31347656, + "step": 4730, + "time_per_iteration": 2.662307024002075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059671, + "balance_loss_mlp": 1.02843797, + "epoch": 0.9101577529819161, + "flos": 1093377208320.0, + "grad_norm": 0.05540170289696782, + "language_loss": 0.79978657, + "learning_rate": 2.101848311877069e-05, + "loss": 0.81038332, + "num_input_tokens_seen": 391328000, + "router_z_loss_mlp": 0.31201172, + "step": 4731, + "time_per_iteration": 3.3738834857940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063156, + "balance_loss_mlp": 1.03147006, + "epoch": 0.9103501346671797, + "flos": 445215854592.0, + "grad_norm": 0.05697116916892201, + "language_loss": 0.81553221, + "learning_rate": 2.092919721190678e-05, + "loss": 0.82616377, + "num_input_tokens_seen": 391391616, + "router_z_loss_mlp": 0.31665039, + "step": 4732, + "time_per_iteration": 2.5527071952819824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062088, + "balance_loss_mlp": 1.03056908, + "epoch": 0.9105425163524432, + "flos": 500510997504.0, + "grad_norm": 0.06484045285144836, + "language_loss": 0.7739293, + "learning_rate": 2.0840097296823346e-05, + "loss": 0.78455019, + "num_input_tokens_seen": 391461312, + "router_z_loss_mlp": 0.31494141, + "step": 4733, + "time_per_iteration": 2.650042772293091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062465, + "balance_loss_mlp": 1.03080285, + "epoch": 0.9107348980377068, + "flos": 657206811648.0, + "grad_norm": 0.04976495879335239, + "language_loss": 0.83918369, + "learning_rate": 2.0751183408112162e-05, + "loss": 0.84980834, + "num_input_tokens_seen": 391542192, + "router_z_loss_mlp": 0.31640625, + "step": 4734, + "time_per_iteration": 2.8551266193389893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065441, + "balance_loss_mlp": 1.03442264, + "epoch": 0.9109272797229704, + "flos": 553406916096.0, + "grad_norm": 0.06703437522614884, + "language_loss": 0.84643781, + "learning_rate": 2.066245558029256e-05, + "loss": 0.85709226, + "num_input_tokens_seen": 391609968, + "router_z_loss_mlp": 0.30981445, + "step": 4735, + "time_per_iteration": 2.6464221477508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060664, + "balance_loss_mlp": 1.03045595, + "epoch": 0.911119661408234, + "flos": 518757958656.0, + "grad_norm": 0.05362209566963938, + "language_loss": 0.84261322, + "learning_rate": 2.057391384781182e-05, + "loss": 0.85321987, + "num_input_tokens_seen": 391681264, + "router_z_loss_mlp": 0.30175781, + "step": 4736, + "time_per_iteration": 2.6503520011901855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060922, + "balance_loss_mlp": 1.02937949, + "epoch": 0.9113120430934974, + "flos": 554111124480.0, + "grad_norm": 0.056779441339490845, + "language_loss": 0.83084607, + "learning_rate": 2.0485558245044834e-05, + "loss": 0.84145528, + "num_input_tokens_seen": 391751392, + "router_z_loss_mlp": 0.31518555, + "step": 4737, + "time_per_iteration": 2.6577727794647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062197, + "balance_loss_mlp": 1.03105998, + "epoch": 0.911504424778761, + "flos": 501624050688.0, + "grad_norm": 0.056537694741311456, + "language_loss": 0.81219387, + "learning_rate": 2.0397388806294216e-05, + "loss": 0.82281584, + "num_input_tokens_seen": 391823952, + "router_z_loss_mlp": 0.31103516, + "step": 4738, + "time_per_iteration": 2.62200927734375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063366, + "balance_loss_mlp": 1.03237128, + "epoch": 0.9116968064640246, + "flos": 610823448576.0, + "grad_norm": 0.05036513509417674, + "language_loss": 0.82349581, + "learning_rate": 2.0309405565790527e-05, + "loss": 0.83412945, + "num_input_tokens_seen": 391895264, + "router_z_loss_mlp": 0.30957031, + "step": 4739, + "time_per_iteration": 2.7241289615631104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061012, + "balance_loss_mlp": 1.02968431, + "epoch": 0.9118891881492882, + "flos": 572625745920.0, + "grad_norm": 0.05909426557587561, + "language_loss": 0.82400405, + "learning_rate": 2.0221608557691895e-05, + "loss": 0.83461416, + "num_input_tokens_seen": 391973040, + "router_z_loss_mlp": 0.31298828, + "step": 4740, + "time_per_iteration": 2.800021171569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059842, + "balance_loss_mlp": 1.02877665, + "epoch": 0.9120815698345518, + "flos": 635659978752.0, + "grad_norm": 0.06447832705175557, + "language_loss": 0.77531219, + "learning_rate": 2.0133997816083992e-05, + "loss": 0.78591061, + "num_input_tokens_seen": 392048160, + "router_z_loss_mlp": 0.31030273, + "step": 4741, + "time_per_iteration": 2.816603183746338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062793, + "balance_loss_mlp": 1.03186965, + "epoch": 0.9122739515198153, + "flos": 701988291072.0, + "grad_norm": 0.05201992294054252, + "language_loss": 0.85963714, + "learning_rate": 2.0046573374980447e-05, + "loss": 0.87026513, + "num_input_tokens_seen": 392128960, + "router_z_loss_mlp": 0.30883789, + "step": 4742, + "time_per_iteration": 2.8994803428649902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065258, + "balance_loss_mlp": 1.03369117, + "epoch": 0.9124663332050789, + "flos": 524435295744.0, + "grad_norm": 0.05856400150605942, + "language_loss": 0.87501878, + "learning_rate": 1.995933526832239e-05, + "loss": 0.88567138, + "num_input_tokens_seen": 392195008, + "router_z_loss_mlp": 0.31542969, + "step": 4743, + "time_per_iteration": 2.594181776046753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061888, + "balance_loss_mlp": 1.03077435, + "epoch": 0.9126587148903424, + "flos": 563033078784.0, + "grad_norm": 0.05280528716664947, + "language_loss": 0.8234272, + "learning_rate": 1.9872283529978662e-05, + "loss": 0.83404607, + "num_input_tokens_seen": 392265168, + "router_z_loss_mlp": 0.31079102, + "step": 4744, + "time_per_iteration": 2.6380250453948975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060202, + "balance_loss_mlp": 1.02942252, + "epoch": 0.912851096575606, + "flos": 505695121920.0, + "grad_norm": 0.05435021172866501, + "language_loss": 0.79992861, + "learning_rate": 1.978541819374574e-05, + "loss": 0.8105306, + "num_input_tokens_seen": 392329456, + "router_z_loss_mlp": 0.30737305, + "step": 4745, + "time_per_iteration": 2.591541290283203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060289, + "balance_loss_mlp": 1.02939034, + "epoch": 0.9130434782608695, + "flos": 550472219136.0, + "grad_norm": 0.0649422024104131, + "language_loss": 0.82114339, + "learning_rate": 1.9698739293347755e-05, + "loss": 0.83174634, + "num_input_tokens_seen": 392397792, + "router_z_loss_mlp": 0.30859375, + "step": 4746, + "time_per_iteration": 2.655029773712158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066365, + "balance_loss_mlp": 1.03584766, + "epoch": 0.9132358599461331, + "flos": 468737100288.0, + "grad_norm": 0.12332969566626222, + "language_loss": 0.83492082, + "learning_rate": 1.9612246862436456e-05, + "loss": 0.84558451, + "num_input_tokens_seen": 392462928, + "router_z_loss_mlp": 0.3046875, + "step": 4747, + "time_per_iteration": 2.555858850479126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062525, + "balance_loss_mlp": 1.03150725, + "epoch": 0.9134282416313967, + "flos": 505847890944.0, + "grad_norm": 0.05396566993602361, + "language_loss": 0.79646921, + "learning_rate": 1.9525940934591148e-05, + "loss": 0.80709445, + "num_input_tokens_seen": 392531840, + "router_z_loss_mlp": 0.30981445, + "step": 4748, + "time_per_iteration": 2.614349365234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063896, + "balance_loss_mlp": 1.03297329, + "epoch": 0.9136206233166603, + "flos": 604540827648.0, + "grad_norm": 0.0546299136084745, + "language_loss": 0.8396163, + "learning_rate": 1.9439821543318748e-05, + "loss": 0.85025525, + "num_input_tokens_seen": 392602464, + "router_z_loss_mlp": 0.30883789, + "step": 4749, + "time_per_iteration": 2.7605695724487305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060881, + "balance_loss_mlp": 1.0290997, + "epoch": 0.9138130050019239, + "flos": 561467510784.0, + "grad_norm": 0.05257527121038526, + "language_loss": 0.82906801, + "learning_rate": 1.9353888722053793e-05, + "loss": 0.8396768, + "num_input_tokens_seen": 392669872, + "router_z_loss_mlp": 0.31762695, + "step": 4750, + "time_per_iteration": 2.6780149936676025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066432, + "balance_loss_mlp": 1.03536606, + "epoch": 0.9140053866871873, + "flos": 689811545088.0, + "grad_norm": 0.12355226926767966, + "language_loss": 0.89985728, + "learning_rate": 1.9268142504158426e-05, + "loss": 0.91052163, + "num_input_tokens_seen": 392744256, + "router_z_loss_mlp": 0.31030273, + "step": 4751, + "time_per_iteration": 2.8180720806121826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059231, + "balance_loss_mlp": 1.02840388, + "epoch": 0.9141977683724509, + "flos": 550734087168.0, + "grad_norm": 0.05041700860442144, + "language_loss": 0.84207261, + "learning_rate": 1.9182582922922186e-05, + "loss": 0.85266495, + "num_input_tokens_seen": 392816832, + "router_z_loss_mlp": 0.30786133, + "step": 4752, + "time_per_iteration": 2.6917872428894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067812, + "balance_loss_mlp": 1.03634083, + "epoch": 0.9143901500577145, + "flos": 539831927808.0, + "grad_norm": 0.04860414083954547, + "language_loss": 0.75207782, + "learning_rate": 1.9097210011562228e-05, + "loss": 0.76275599, + "num_input_tokens_seen": 392886304, + "router_z_loss_mlp": 0.31445312, + "step": 4753, + "time_per_iteration": 2.653679370880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106309, + "balance_loss_mlp": 1.03102279, + "epoch": 0.9145825317429781, + "flos": 528512159232.0, + "grad_norm": 0.05770192006998304, + "language_loss": 0.80789167, + "learning_rate": 1.9012023803223366e-05, + "loss": 0.81852257, + "num_input_tokens_seen": 392955872, + "router_z_loss_mlp": 0.32055664, + "step": 4754, + "time_per_iteration": 2.645815849304199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066571, + "balance_loss_mlp": 1.03557611, + "epoch": 0.9147749134282416, + "flos": 514538500608.0, + "grad_norm": 0.05242685059037384, + "language_loss": 0.79065865, + "learning_rate": 1.892702433097776e-05, + "loss": 0.80132431, + "num_input_tokens_seen": 393025776, + "router_z_loss_mlp": 0.30957031, + "step": 4755, + "time_per_iteration": 2.668991804122925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062006, + "balance_loss_mlp": 1.03136897, + "epoch": 0.9149672951135052, + "flos": 514174735872.0, + "grad_norm": 0.0565157200230722, + "language_loss": 0.85695755, + "learning_rate": 1.8842211627825233e-05, + "loss": 0.86757755, + "num_input_tokens_seen": 393095936, + "router_z_loss_mlp": 0.3059082, + "step": 4756, + "time_per_iteration": 2.7136027812957764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061248, + "balance_loss_mlp": 1.02951407, + "epoch": 0.9151596767987688, + "flos": 576781185024.0, + "grad_norm": 0.07352542931591961, + "language_loss": 0.80928689, + "learning_rate": 1.8757585726692727e-05, + "loss": 0.81989938, + "num_input_tokens_seen": 393166816, + "router_z_loss_mlp": 0.31713867, + "step": 4757, + "time_per_iteration": 2.7354605197906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060549, + "balance_loss_mlp": 1.02974486, + "epoch": 0.9153520584840323, + "flos": 619051368960.0, + "grad_norm": 0.044801284055131146, + "language_loss": 0.82543564, + "learning_rate": 1.8673146660435182e-05, + "loss": 0.83604121, + "num_input_tokens_seen": 393242176, + "router_z_loss_mlp": 0.30761719, + "step": 4758, + "time_per_iteration": 2.726820707321167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065566, + "balance_loss_mlp": 1.03359389, + "epoch": 0.9155444401692959, + "flos": 468687638016.0, + "grad_norm": 0.05141972147747453, + "language_loss": 0.82493746, + "learning_rate": 1.8588894461834704e-05, + "loss": 0.8355931, + "num_input_tokens_seen": 393311792, + "router_z_loss_mlp": 0.31958008, + "step": 4759, + "time_per_iteration": 2.5751149654388428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101691, + "balance_loss_mlp": 1.00904226, + "epoch": 0.9157368218545594, + "flos": 1409931601920.0, + "grad_norm": 0.008900792110931678, + "language_loss": 0.7481907, + "learning_rate": 1.8504829163600855e-05, + "loss": 0.75835979, + "num_input_tokens_seen": 393535648, + "router_z_loss_mlp": 0.07861328, + "step": 4760, + "time_per_iteration": 4.846553325653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016898, + "balance_loss_mlp": 1.00903058, + "epoch": 0.915929203539823, + "flos": 1521195572736.0, + "grad_norm": 0.008902417917998095, + "language_loss": 0.79576051, + "learning_rate": 1.8420950798370584e-05, + "loss": 0.80592954, + "num_input_tokens_seen": 393767040, + "router_z_loss_mlp": 0.07861328, + "step": 4761, + "time_per_iteration": 4.906817674636841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066282, + "balance_loss_mlp": 1.03528786, + "epoch": 0.9161215852250866, + "flos": 535480049664.0, + "grad_norm": 0.061604051041974375, + "language_loss": 0.80440938, + "learning_rate": 1.8337259398708616e-05, + "loss": 0.81507224, + "num_input_tokens_seen": 393841232, + "router_z_loss_mlp": 0.30957031, + "step": 4762, + "time_per_iteration": 2.75858473777771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063073, + "balance_loss_mlp": 1.03234076, + "epoch": 0.9163139669103502, + "flos": 590350381056.0, + "grad_norm": 0.050240655434021286, + "language_loss": 0.80299342, + "learning_rate": 1.8253754997106632e-05, + "loss": 0.81362408, + "num_input_tokens_seen": 393910512, + "router_z_loss_mlp": 0.30688477, + "step": 4763, + "time_per_iteration": 2.6782495975494385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061226, + "balance_loss_mlp": 1.03011227, + "epoch": 0.9165063485956138, + "flos": 821627159040.0, + "grad_norm": 0.07609877775920502, + "language_loss": 0.84720802, + "learning_rate": 1.817043762598397e-05, + "loss": 0.85782027, + "num_input_tokens_seen": 393988624, + "router_z_loss_mlp": 0.31079102, + "step": 4764, + "time_per_iteration": 3.0433642864227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061144, + "balance_loss_mlp": 1.03069818, + "epoch": 0.9166987302808772, + "flos": 524932890624.0, + "grad_norm": 0.05222854338861463, + "language_loss": 0.82242793, + "learning_rate": 1.8087307317687264e-05, + "loss": 0.83303934, + "num_input_tokens_seen": 394059184, + "router_z_loss_mlp": 0.30395508, + "step": 4765, + "time_per_iteration": 2.6640570163726807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060211, + "balance_loss_mlp": 1.02890635, + "epoch": 0.9168911119661408, + "flos": 654784266240.0, + "grad_norm": 0.07195922114466717, + "language_loss": 0.84169734, + "learning_rate": 1.800436410449058e-05, + "loss": 0.85229945, + "num_input_tokens_seen": 394142160, + "router_z_loss_mlp": 0.31274414, + "step": 4766, + "time_per_iteration": 2.899909257888794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064104, + "balance_loss_mlp": 1.03327632, + "epoch": 0.9170834936514044, + "flos": 491504675328.0, + "grad_norm": 0.07194392234567955, + "language_loss": 0.84633625, + "learning_rate": 1.7921608018595436e-05, + "loss": 0.85697722, + "num_input_tokens_seen": 394207056, + "router_z_loss_mlp": 0.30786133, + "step": 4767, + "time_per_iteration": 2.571272611618042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061618, + "balance_loss_mlp": 1.03043294, + "epoch": 0.917275875336668, + "flos": 627756535296.0, + "grad_norm": 0.057558765907327766, + "language_loss": 0.80572951, + "learning_rate": 1.7839039092130415e-05, + "loss": 0.81634569, + "num_input_tokens_seen": 394275456, + "router_z_loss_mlp": 0.31152344, + "step": 4768, + "time_per_iteration": 2.8055806159973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017406, + "balance_loss_mlp": 1.00949097, + "epoch": 0.9174682570219315, + "flos": 1517176935936.0, + "grad_norm": 0.0087369718613956, + "language_loss": 0.78180236, + "learning_rate": 1.7756657357151762e-05, + "loss": 0.79197639, + "num_input_tokens_seen": 394503808, + "router_z_loss_mlp": 0.07910156, + "step": 4769, + "time_per_iteration": 4.936990976333618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060292, + "balance_loss_mlp": 1.02846277, + "epoch": 0.917660638707195, + "flos": 559749173760.0, + "grad_norm": 0.05006448592361951, + "language_loss": 0.84848541, + "learning_rate": 1.7674462845642835e-05, + "loss": 0.8590883, + "num_input_tokens_seen": 394573776, + "router_z_loss_mlp": 0.31811523, + "step": 4770, + "time_per_iteration": 2.677330255508423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061055, + "balance_loss_mlp": 1.03022778, + "epoch": 0.9178530203924586, + "flos": 447022941696.0, + "grad_norm": 0.0519833907610009, + "language_loss": 0.84258509, + "learning_rate": 1.7592455589514387e-05, + "loss": 0.85319561, + "num_input_tokens_seen": 394637600, + "router_z_loss_mlp": 0.30786133, + "step": 4771, + "time_per_iteration": 2.495462656021118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063599, + "balance_loss_mlp": 1.03255713, + "epoch": 0.9180454020777222, + "flos": 465734002176.0, + "grad_norm": 0.04964919418416434, + "language_loss": 0.80612022, + "learning_rate": 1.7510635620604453e-05, + "loss": 0.81675619, + "num_input_tokens_seen": 394707344, + "router_z_loss_mlp": 0.31005859, + "step": 4772, + "time_per_iteration": 2.5905964374542236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057828, + "balance_loss_mlp": 1.02702451, + "epoch": 0.9182377837629858, + "flos": 596023335936.0, + "grad_norm": 0.05330480791985852, + "language_loss": 0.87082404, + "learning_rate": 1.74290029706784e-05, + "loss": 0.88140237, + "num_input_tokens_seen": 394786368, + "router_z_loss_mlp": 0.30761719, + "step": 4773, + "time_per_iteration": 2.7483558654785156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061724, + "balance_loss_mlp": 1.03030062, + "epoch": 0.9184301654482493, + "flos": 996251249664.0, + "grad_norm": 0.04999010093437848, + "language_loss": 0.82507402, + "learning_rate": 1.734755767142876e-05, + "loss": 0.83569121, + "num_input_tokens_seen": 394876976, + "router_z_loss_mlp": 0.31396484, + "step": 4774, + "time_per_iteration": 3.3252460956573486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061886, + "balance_loss_mlp": 1.03043866, + "epoch": 0.9186225471335129, + "flos": 508600705536.0, + "grad_norm": 0.043103948269501015, + "language_loss": 0.84609812, + "learning_rate": 1.7266299754475467e-05, + "loss": 0.85671699, + "num_input_tokens_seen": 394949024, + "router_z_loss_mlp": 0.31420898, + "step": 4775, + "time_per_iteration": 2.6413958072662354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063705, + "balance_loss_mlp": 1.03211498, + "epoch": 0.9188149288187765, + "flos": 940011789312.0, + "grad_norm": 0.05598618240977498, + "language_loss": 0.78646922, + "learning_rate": 1.718522925136551e-05, + "loss": 0.79710621, + "num_input_tokens_seen": 395044352, + "router_z_loss_mlp": 0.31567383, + "step": 4776, + "time_per_iteration": 3.2665579319000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064215, + "balance_loss_mlp": 1.03322053, + "epoch": 0.91900731050404, + "flos": 583402839552.0, + "grad_norm": 0.0464124427178186, + "language_loss": 0.84131777, + "learning_rate": 1.7104346193573484e-05, + "loss": 0.85195988, + "num_input_tokens_seen": 395113824, + "router_z_loss_mlp": 0.30981445, + "step": 4777, + "time_per_iteration": 2.707064151763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062419, + "balance_loss_mlp": 1.03173482, + "epoch": 0.9191996921893035, + "flos": 580941006336.0, + "grad_norm": 0.06415459977537942, + "language_loss": 0.79562324, + "learning_rate": 1.7023650612500828e-05, + "loss": 0.80624747, + "num_input_tokens_seen": 395184496, + "router_z_loss_mlp": 0.30639648, + "step": 4778, + "time_per_iteration": 2.6951544284820557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059351, + "balance_loss_mlp": 1.02845156, + "epoch": 0.9193920738745671, + "flos": 908566751232.0, + "grad_norm": 0.05239795011711653, + "language_loss": 0.79845613, + "learning_rate": 1.6943142539476374e-05, + "loss": 0.80904967, + "num_input_tokens_seen": 395263760, + "router_z_loss_mlp": 0.30859375, + "step": 4779, + "time_per_iteration": 3.128244638442993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014928, + "balance_loss_mlp": 1.0069648, + "epoch": 0.9195844555598307, + "flos": 1557557074944.0, + "grad_norm": 0.006881946591681044, + "language_loss": 0.79795396, + "learning_rate": 1.686282200575606e-05, + "loss": 0.8081032, + "num_input_tokens_seen": 395482384, + "router_z_loss_mlp": 0.07958984, + "step": 4780, + "time_per_iteration": 4.738587379455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060578, + "balance_loss_mlp": 1.02970314, + "epoch": 0.9197768372450943, + "flos": 473813535744.0, + "grad_norm": 0.05955041173862442, + "language_loss": 0.7853713, + "learning_rate": 1.678268904252317e-05, + "loss": 0.79597706, + "num_input_tokens_seen": 395550384, + "router_z_loss_mlp": 0.30834961, + "step": 4781, + "time_per_iteration": 2.539076805114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063538, + "balance_loss_mlp": 1.03287697, + "epoch": 0.9199692189303579, + "flos": 856622352384.0, + "grad_norm": 0.0707934897138979, + "language_loss": 0.83959591, + "learning_rate": 1.6702743680888088e-05, + "loss": 0.85023129, + "num_input_tokens_seen": 395632320, + "router_z_loss_mlp": 0.30615234, + "step": 4782, + "time_per_iteration": 3.2116215229034424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064036, + "balance_loss_mlp": 1.03299415, + "epoch": 0.9201616006156214, + "flos": 504144110592.0, + "grad_norm": 0.06738288492919854, + "language_loss": 0.77458489, + "learning_rate": 1.6622985951888327e-05, + "loss": 0.78522527, + "num_input_tokens_seen": 395703856, + "router_z_loss_mlp": 0.31005859, + "step": 4783, + "time_per_iteration": 2.632368803024292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058894, + "balance_loss_mlp": 1.02828109, + "epoch": 0.9203539823008849, + "flos": 548503598592.0, + "grad_norm": 0.09947574411564165, + "language_loss": 0.85094196, + "learning_rate": 1.6543415886488554e-05, + "loss": 0.8615309, + "num_input_tokens_seen": 395779456, + "router_z_loss_mlp": 0.30566406, + "step": 4784, + "time_per_iteration": 2.7621445655822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059931, + "balance_loss_mlp": 1.02903175, + "epoch": 0.9205463639861485, + "flos": 539738795520.0, + "grad_norm": 0.05215962907165292, + "language_loss": 0.82254821, + "learning_rate": 1.6464033515580624e-05, + "loss": 0.83314753, + "num_input_tokens_seen": 395849584, + "router_z_loss_mlp": 0.30859375, + "step": 4785, + "time_per_iteration": 2.6422579288482666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061681, + "balance_loss_mlp": 1.03109241, + "epoch": 0.9207387456714121, + "flos": 799367353344.0, + "grad_norm": 0.05717975074851554, + "language_loss": 0.78002059, + "learning_rate": 1.6384838869983488e-05, + "loss": 0.79063737, + "num_input_tokens_seen": 395943712, + "router_z_loss_mlp": 0.30541992, + "step": 4786, + "time_per_iteration": 3.083732843399048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061487, + "balance_loss_mlp": 1.0302304, + "epoch": 0.9209311273566756, + "flos": 502607655936.0, + "grad_norm": 0.057025299382847054, + "language_loss": 0.78579599, + "learning_rate": 1.630583198044333e-05, + "loss": 0.79641086, + "num_input_tokens_seen": 396013168, + "router_z_loss_mlp": 0.31225586, + "step": 4787, + "time_per_iteration": 2.648620367050171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059126, + "balance_loss_mlp": 1.02777362, + "epoch": 0.9211235090419392, + "flos": 569059623936.0, + "grad_norm": 0.06012963691374042, + "language_loss": 0.82516944, + "learning_rate": 1.6227012877633173e-05, + "loss": 0.83576071, + "num_input_tokens_seen": 396082032, + "router_z_loss_mlp": 0.31323242, + "step": 4788, + "time_per_iteration": 2.6646766662597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063827, + "balance_loss_mlp": 1.03249896, + "epoch": 0.9213158907272028, + "flos": 806205795840.0, + "grad_norm": 0.06327373467774539, + "language_loss": 0.82420582, + "learning_rate": 1.6148381592153538e-05, + "loss": 0.83484411, + "num_input_tokens_seen": 396157984, + "router_z_loss_mlp": 0.31298828, + "step": 4789, + "time_per_iteration": 2.979316473007202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062579, + "balance_loss_mlp": 1.03115511, + "epoch": 0.9215082724124664, + "flos": 490441084416.0, + "grad_norm": 0.09581366649695534, + "language_loss": 0.76114941, + "learning_rate": 1.6069938154531618e-05, + "loss": 0.77177519, + "num_input_tokens_seen": 396223840, + "router_z_loss_mlp": 0.31396484, + "step": 4790, + "time_per_iteration": 2.5435032844543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010433, + "balance_loss_mlp": 1.00242269, + "epoch": 0.9217006540977299, + "flos": 1513648539648.0, + "grad_norm": 0.004451009217126261, + "language_loss": 0.77070266, + "learning_rate": 1.599168259522188e-05, + "loss": 0.78080696, + "num_input_tokens_seen": 396458288, + "router_z_loss_mlp": 0.08007812, + "step": 4791, + "time_per_iteration": 5.021566867828369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059671, + "balance_loss_mlp": 1.02851009, + "epoch": 0.9218930357829934, + "flos": 743471308800.0, + "grad_norm": 0.04726382939179337, + "language_loss": 0.76547706, + "learning_rate": 1.5913614944605804e-05, + "loss": 0.77607369, + "num_input_tokens_seen": 396536208, + "router_z_loss_mlp": 0.3112793, + "step": 4792, + "time_per_iteration": 2.9518425464630127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059575, + "balance_loss_mlp": 1.02936769, + "epoch": 0.922085417468257, + "flos": 452803585536.0, + "grad_norm": 0.06742500071670546, + "language_loss": 0.8039397, + "learning_rate": 1.5835735232992032e-05, + "loss": 0.81453544, + "num_input_tokens_seen": 396599984, + "router_z_loss_mlp": 0.30151367, + "step": 4793, + "time_per_iteration": 2.4930520057678223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106032, + "balance_loss_mlp": 1.02949262, + "epoch": 0.9222777991535206, + "flos": 500003228160.0, + "grad_norm": 0.06238609397119238, + "language_loss": 0.84686369, + "learning_rate": 1.575804349061616e-05, + "loss": 0.85746688, + "num_input_tokens_seen": 396664592, + "router_z_loss_mlp": 0.30810547, + "step": 4794, + "time_per_iteration": 2.6074256896972656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061377, + "balance_loss_mlp": 1.02976298, + "epoch": 0.9224701808387842, + "flos": 527704644096.0, + "grad_norm": 0.053967741899602094, + "language_loss": 0.78791153, + "learning_rate": 1.5680539747640722e-05, + "loss": 0.79852533, + "num_input_tokens_seen": 396729472, + "router_z_loss_mlp": 0.31591797, + "step": 4795, + "time_per_iteration": 2.697005033493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063499, + "balance_loss_mlp": 1.03229022, + "epoch": 0.9226625625240477, + "flos": 874272794112.0, + "grad_norm": 0.04984598897704024, + "language_loss": 0.75265729, + "learning_rate": 1.5603224034155315e-05, + "loss": 0.76329225, + "num_input_tokens_seen": 396810384, + "router_z_loss_mlp": 0.31176758, + "step": 4796, + "time_per_iteration": 3.1383020877838135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064444, + "balance_loss_mlp": 1.03321099, + "epoch": 0.9228549442093112, + "flos": 502529080320.0, + "grad_norm": 0.06658857929604714, + "language_loss": 0.87877327, + "learning_rate": 1.5526096380176657e-05, + "loss": 0.88941771, + "num_input_tokens_seen": 396875472, + "router_z_loss_mlp": 0.31201172, + "step": 4797, + "time_per_iteration": 2.576430559158325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106157, + "balance_loss_mlp": 1.0299077, + "epoch": 0.9230473258945748, + "flos": 599705911296.0, + "grad_norm": 0.04563191808794579, + "language_loss": 0.84813899, + "learning_rate": 1.544915681564829e-05, + "loss": 0.85875475, + "num_input_tokens_seen": 396949888, + "router_z_loss_mlp": 0.31640625, + "step": 4798, + "time_per_iteration": 2.83512544631958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061344, + "balance_loss_mlp": 1.03018308, + "epoch": 0.9232397075798384, + "flos": 822168423936.0, + "grad_norm": 0.05160964536593656, + "language_loss": 0.7911216, + "learning_rate": 1.5372405370440822e-05, + "loss": 0.80173504, + "num_input_tokens_seen": 397027504, + "router_z_loss_mlp": 0.3112793, + "step": 4799, + "time_per_iteration": 3.1468732357025146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059473, + "balance_loss_mlp": 1.02850246, + "epoch": 0.923432089265102, + "flos": 706719900672.0, + "grad_norm": 0.05428627979787911, + "language_loss": 0.8464976, + "learning_rate": 1.5295842074351805e-05, + "loss": 0.85709232, + "num_input_tokens_seen": 397101600, + "router_z_loss_mlp": 0.30932617, + "step": 4800, + "time_per_iteration": 2.8784780502319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060913, + "balance_loss_mlp": 1.02951312, + "epoch": 0.9236244709503655, + "flos": 701554715136.0, + "grad_norm": 0.06190533279611968, + "language_loss": 0.76700497, + "learning_rate": 1.5219466957105798e-05, + "loss": 0.77761406, + "num_input_tokens_seen": 397170880, + "router_z_loss_mlp": 0.3137207, + "step": 4801, + "time_per_iteration": 2.8618581295013428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065155, + "balance_loss_mlp": 1.03466153, + "epoch": 0.9238168526356291, + "flos": 514780019712.0, + "grad_norm": 0.05113589786176994, + "language_loss": 0.83695769, + "learning_rate": 1.5143280048354136e-05, + "loss": 0.84760928, + "num_input_tokens_seen": 397242272, + "router_z_loss_mlp": 0.30444336, + "step": 4802, + "time_per_iteration": 2.5920772552490234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059251, + "balance_loss_mlp": 1.02842343, + "epoch": 0.9240092343208927, + "flos": 491789864448.0, + "grad_norm": 0.061659751026456815, + "language_loss": 0.8127811, + "learning_rate": 1.5067281377675213e-05, + "loss": 0.82337356, + "num_input_tokens_seen": 397308032, + "router_z_loss_mlp": 0.30786133, + "step": 4803, + "time_per_iteration": 2.563819646835327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058284, + "balance_loss_mlp": 1.02755177, + "epoch": 0.9242016160061562, + "flos": 646915728384.0, + "grad_norm": 0.05582004577056025, + "language_loss": 0.73311841, + "learning_rate": 1.4991470974574484e-05, + "loss": 0.74370122, + "num_input_tokens_seen": 397390944, + "router_z_loss_mlp": 0.30688477, + "step": 4804, + "time_per_iteration": 2.8679351806640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061479, + "balance_loss_mlp": 1.0312233, + "epoch": 0.9243939976914197, + "flos": 729094597632.0, + "grad_norm": 0.056639828384697895, + "language_loss": 0.78709513, + "learning_rate": 1.4915848868484016e-05, + "loss": 0.79770994, + "num_input_tokens_seen": 397468128, + "router_z_loss_mlp": 0.30200195, + "step": 4805, + "time_per_iteration": 3.022678852081299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060975, + "balance_loss_mlp": 1.03014719, + "epoch": 0.9245863793766833, + "flos": 452006244864.0, + "grad_norm": 0.0445420112549805, + "language_loss": 0.90410256, + "learning_rate": 1.4840415088763048e-05, + "loss": 0.91471231, + "num_input_tokens_seen": 397538976, + "router_z_loss_mlp": 0.30786133, + "step": 4806, + "time_per_iteration": 2.6259498596191406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063869, + "balance_loss_mlp": 1.03213537, + "epoch": 0.9247787610619469, + "flos": 754697945088.0, + "grad_norm": 0.052517724780417725, + "language_loss": 0.76738948, + "learning_rate": 1.476516966469732e-05, + "loss": 0.77802819, + "num_input_tokens_seen": 397612944, + "router_z_loss_mlp": 0.31713867, + "step": 4807, + "time_per_iteration": 2.9332311153411865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062399, + "balance_loss_mlp": 1.03049862, + "epoch": 0.9249711427472105, + "flos": 561640628736.0, + "grad_norm": 0.044414575006585695, + "language_loss": 0.84793067, + "learning_rate": 1.4690112625499908e-05, + "loss": 0.85855472, + "num_input_tokens_seen": 397690848, + "router_z_loss_mlp": 0.31884766, + "step": 4808, + "time_per_iteration": 2.7425179481506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062085, + "balance_loss_mlp": 1.02982748, + "epoch": 0.9251635244324741, + "flos": 526430057472.0, + "grad_norm": 0.052440534962070226, + "language_loss": 0.85194021, + "learning_rate": 1.4615244000310501e-05, + "loss": 0.86256105, + "num_input_tokens_seen": 397761008, + "router_z_loss_mlp": 0.32250977, + "step": 4809, + "time_per_iteration": 2.6689164638519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062775, + "balance_loss_mlp": 1.03118443, + "epoch": 0.9253559061177375, + "flos": 610982009856.0, + "grad_norm": 0.057022740257233, + "language_loss": 0.79165608, + "learning_rate": 1.4540563818195685e-05, + "loss": 0.80228388, + "num_input_tokens_seen": 397840640, + "router_z_loss_mlp": 0.31567383, + "step": 4810, + "time_per_iteration": 2.81392240524292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01005244, + "balance_loss_mlp": 0.99713796, + "epoch": 0.9255482878030011, + "flos": 1550461146624.0, + "grad_norm": 0.004507621566339502, + "language_loss": 0.76925391, + "learning_rate": 1.446607210814882e-05, + "loss": 0.77930635, + "num_input_tokens_seen": 398060096, + "router_z_loss_mlp": 0.08105469, + "step": 4811, + "time_per_iteration": 4.72790789604187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062769, + "balance_loss_mlp": 1.03141689, + "epoch": 0.9257406694882647, + "flos": 766008949248.0, + "grad_norm": 0.06092581196020588, + "language_loss": 0.8103286, + "learning_rate": 1.4391768899090219e-05, + "loss": 0.82095635, + "num_input_tokens_seen": 398143680, + "router_z_loss_mlp": 0.31323242, + "step": 4812, + "time_per_iteration": 3.064039707183838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064787, + "balance_loss_mlp": 1.03302956, + "epoch": 0.9259330511735283, + "flos": 497748008448.0, + "grad_norm": 0.053196549248037406, + "language_loss": 0.83248472, + "learning_rate": 1.431765421986686e-05, + "loss": 0.84313262, + "num_input_tokens_seen": 398207056, + "router_z_loss_mlp": 0.31738281, + "step": 4813, + "time_per_iteration": 2.5401344299316406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060071, + "balance_loss_mlp": 1.02910006, + "epoch": 0.9261254328587919, + "flos": 626589637632.0, + "grad_norm": 0.0906762335238156, + "language_loss": 0.78651297, + "learning_rate": 1.424372809925273e-05, + "loss": 0.79711372, + "num_input_tokens_seen": 398277472, + "router_z_loss_mlp": 0.30932617, + "step": 4814, + "time_per_iteration": 2.7242367267608643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105935, + "balance_loss_mlp": 1.02818894, + "epoch": 0.9263178145440554, + "flos": 597105865728.0, + "grad_norm": 0.05489571993390207, + "language_loss": 0.85417783, + "learning_rate": 1.416999056594831e-05, + "loss": 0.86477137, + "num_input_tokens_seen": 398346544, + "router_z_loss_mlp": 0.3112793, + "step": 4815, + "time_per_iteration": 2.7542569637298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059465, + "balance_loss_mlp": 1.02861321, + "epoch": 0.926510196229319, + "flos": 388350761472.0, + "grad_norm": 0.05346963676557139, + "language_loss": 0.83451992, + "learning_rate": 1.4096441648581259e-05, + "loss": 0.84511459, + "num_input_tokens_seen": 398409344, + "router_z_loss_mlp": 0.30810547, + "step": 4816, + "time_per_iteration": 2.5149407386779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060992, + "balance_loss_mlp": 1.02940106, + "epoch": 0.9267025779145825, + "flos": 545533996032.0, + "grad_norm": 0.056029117232846586, + "language_loss": 0.84429115, + "learning_rate": 1.4023081375705737e-05, + "loss": 0.85490108, + "num_input_tokens_seen": 398478816, + "router_z_loss_mlp": 0.31567383, + "step": 4817, + "time_per_iteration": 2.630242109298706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061904, + "balance_loss_mlp": 1.03036106, + "epoch": 0.9268949595998461, + "flos": 499540538880.0, + "grad_norm": 0.05209040991874763, + "language_loss": 0.81881189, + "learning_rate": 1.3949909775802682e-05, + "loss": 0.82943094, + "num_input_tokens_seen": 398550384, + "router_z_loss_mlp": 0.31518555, + "step": 4818, + "time_per_iteration": 2.7070581912994385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063037, + "balance_loss_mlp": 1.03037405, + "epoch": 0.9270873412851096, + "flos": 432601150464.0, + "grad_norm": 0.07453857754542507, + "language_loss": 0.82538891, + "learning_rate": 1.3876926877279817e-05, + "loss": 0.83601934, + "num_input_tokens_seen": 398620832, + "router_z_loss_mlp": 0.32666016, + "step": 4819, + "time_per_iteration": 2.63124942779541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057056, + "balance_loss_mlp": 1.02644277, + "epoch": 0.9272797229703732, + "flos": 466512403968.0, + "grad_norm": 0.05984200943619328, + "language_loss": 0.86118358, + "learning_rate": 1.380413270847164e-05, + "loss": 0.87175417, + "num_input_tokens_seen": 398689776, + "router_z_loss_mlp": 0.30566406, + "step": 4820, + "time_per_iteration": 2.6563737392425537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106259, + "balance_loss_mlp": 1.0314765, + "epoch": 0.9274721046556368, + "flos": 704486439936.0, + "grad_norm": 0.05327632622082771, + "language_loss": 0.78716862, + "learning_rate": 1.373152729763938e-05, + "loss": 0.79779452, + "num_input_tokens_seen": 398775072, + "router_z_loss_mlp": 0.31079102, + "step": 4821, + "time_per_iteration": 3.0101308822631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01005202, + "balance_loss_mlp": 0.99709588, + "epoch": 0.9276644863409004, + "flos": 1401486893568.0, + "grad_norm": 0.004162007556462311, + "language_loss": 0.82380462, + "learning_rate": 1.3659110672970931e-05, + "loss": 0.83385664, + "num_input_tokens_seen": 399002016, + "router_z_loss_mlp": 0.08105469, + "step": 4822, + "time_per_iteration": 4.932299375534058 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061034, + "balance_loss_mlp": 1.02965784, + "epoch": 0.927856868026164, + "flos": 741370268160.0, + "grad_norm": 0.044770471632053395, + "language_loss": 0.79991037, + "learning_rate": 1.3586882862580917e-05, + "loss": 0.81052071, + "num_input_tokens_seen": 399085808, + "router_z_loss_mlp": 0.31347656, + "step": 4823, + "time_per_iteration": 3.075979709625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062708, + "balance_loss_mlp": 1.03152299, + "epoch": 0.9280492497114274, + "flos": 412000045056.0, + "grad_norm": 0.05794564929676867, + "language_loss": 0.73926902, + "learning_rate": 1.3514843894510686e-05, + "loss": 0.74989611, + "num_input_tokens_seen": 399146768, + "router_z_loss_mlp": 0.31152344, + "step": 4824, + "time_per_iteration": 2.475565195083618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061263, + "balance_loss_mlp": 1.03003049, + "epoch": 0.928241631396691, + "flos": 646215902208.0, + "grad_norm": 0.05880246406469922, + "language_loss": 0.84044743, + "learning_rate": 1.3442993796728254e-05, + "loss": 0.85106003, + "num_input_tokens_seen": 399220192, + "router_z_loss_mlp": 0.31201172, + "step": 4825, + "time_per_iteration": 2.7478461265563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060328, + "balance_loss_mlp": 1.02926219, + "epoch": 0.9284340130819546, + "flos": 696537916416.0, + "grad_norm": 0.052202361516365085, + "language_loss": 0.80711192, + "learning_rate": 1.3371332597128249e-05, + "loss": 0.81771523, + "num_input_tokens_seen": 399300064, + "router_z_loss_mlp": 0.31030273, + "step": 4826, + "time_per_iteration": 2.9145355224609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062726, + "balance_loss_mlp": 1.03082526, + "epoch": 0.9286263947672182, + "flos": 758780600832.0, + "grad_norm": 0.04756980301049446, + "language_loss": 0.83686376, + "learning_rate": 1.3299860323532032e-05, + "loss": 0.84749097, + "num_input_tokens_seen": 399383200, + "router_z_loss_mlp": 0.31884766, + "step": 4827, + "time_per_iteration": 3.0382120609283447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061304, + "balance_loss_mlp": 1.03050017, + "epoch": 0.9288187764524817, + "flos": 672495754752.0, + "grad_norm": 0.05853151917870524, + "language_loss": 0.80073225, + "learning_rate": 1.3228577003687681e-05, + "loss": 0.81134522, + "num_input_tokens_seen": 399466400, + "router_z_loss_mlp": 0.30761719, + "step": 4828, + "time_per_iteration": 2.977632761001587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059808, + "balance_loss_mlp": 1.02857471, + "epoch": 0.9290111581377453, + "flos": 500220016128.0, + "grad_norm": 0.05187531918585966, + "language_loss": 0.83971095, + "learning_rate": 1.3157482665269727e-05, + "loss": 0.85030901, + "num_input_tokens_seen": 399533504, + "router_z_loss_mlp": 0.31201172, + "step": 4829, + "time_per_iteration": 2.5926513671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01005303, + "balance_loss_mlp": 0.99719697, + "epoch": 0.9292035398230089, + "flos": 1562773132800.0, + "grad_norm": 0.004143837665711613, + "language_loss": 0.72122061, + "learning_rate": 1.3086577335879424e-05, + "loss": 0.73127365, + "num_input_tokens_seen": 399769872, + "router_z_loss_mlp": 0.08105469, + "step": 4830, + "time_per_iteration": 4.936404228210449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01005303, + "balance_loss_mlp": 0.99719697, + "epoch": 0.9293959215082724, + "flos": 1517828709888.0, + "grad_norm": 0.004144572143444307, + "language_loss": 0.79511833, + "learning_rate": 1.3015861043044753e-05, + "loss": 0.80517137, + "num_input_tokens_seen": 399997760, + "router_z_loss_mlp": 0.08105469, + "step": 4831, + "time_per_iteration": 4.858310222625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061504, + "balance_loss_mlp": 1.03012788, + "epoch": 0.929588303193536, + "flos": 557572529664.0, + "grad_norm": 0.06711772734465098, + "language_loss": 0.84252775, + "learning_rate": 1.2945333814220195e-05, + "loss": 0.8531428, + "num_input_tokens_seen": 400063872, + "router_z_loss_mlp": 0.31347656, + "step": 4832, + "time_per_iteration": 2.642258644104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062693, + "balance_loss_mlp": 1.03134084, + "epoch": 0.9297806848787995, + "flos": 478338531840.0, + "grad_norm": 0.06848432277172195, + "language_loss": 0.80321014, + "learning_rate": 1.2874995676786905e-05, + "loss": 0.81383705, + "num_input_tokens_seen": 400126064, + "router_z_loss_mlp": 0.31323242, + "step": 4833, + "time_per_iteration": 2.535723924636841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064305, + "balance_loss_mlp": 1.03290522, + "epoch": 0.9299730665640631, + "flos": 564259613184.0, + "grad_norm": 0.04598376752179828, + "language_loss": 0.80186009, + "learning_rate": 1.2804846658052372e-05, + "loss": 0.8125031, + "num_input_tokens_seen": 400201776, + "router_z_loss_mlp": 0.3137207, + "step": 4834, + "time_per_iteration": 2.8007967472076416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061052, + "balance_loss_mlp": 1.03041565, + "epoch": 0.9301654482493267, + "flos": 559883003904.0, + "grad_norm": 0.050887682082653, + "language_loss": 0.82550877, + "learning_rate": 1.2734886785251032e-05, + "loss": 0.83611929, + "num_input_tokens_seen": 400279504, + "router_z_loss_mlp": 0.3059082, + "step": 4835, + "time_per_iteration": 2.780515193939209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004579, + "balance_loss_mlp": 0.9964726, + "epoch": 0.9303578299345903, + "flos": 1519251683328.0, + "grad_norm": 0.004150284998796472, + "language_loss": 0.76852441, + "learning_rate": 1.2665116085543715e-05, + "loss": 0.77857018, + "num_input_tokens_seen": 400514800, + "router_z_loss_mlp": 0.08105469, + "step": 4836, + "time_per_iteration": 4.956341981887817 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106311, + "balance_loss_mlp": 1.03147149, + "epoch": 0.9305502116198537, + "flos": 530589878784.0, + "grad_norm": 0.05359665120507271, + "language_loss": 0.82833552, + "learning_rate": 1.2595534586017698e-05, + "loss": 0.83896661, + "num_input_tokens_seen": 400582640, + "router_z_loss_mlp": 0.31640625, + "step": 4837, + "time_per_iteration": 2.6186673641204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106216, + "balance_loss_mlp": 1.03049755, + "epoch": 0.9307425933051173, + "flos": 474660338688.0, + "grad_norm": 0.06180694800901607, + "language_loss": 0.81545842, + "learning_rate": 1.2526142313686983e-05, + "loss": 0.82607996, + "num_input_tokens_seen": 400646912, + "router_z_loss_mlp": 0.31640625, + "step": 4838, + "time_per_iteration": 2.63519549369812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064317, + "balance_loss_mlp": 1.03258371, + "epoch": 0.9309349749903809, + "flos": 584600260608.0, + "grad_norm": 0.1559896918594763, + "language_loss": 0.86706674, + "learning_rate": 1.245693929549213e-05, + "loss": 0.87770993, + "num_input_tokens_seen": 400722128, + "router_z_loss_mlp": 0.31713867, + "step": 4839, + "time_per_iteration": 2.7265617847442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105852, + "balance_loss_mlp": 1.02771592, + "epoch": 0.9311273566756445, + "flos": 861298707456.0, + "grad_norm": 0.0464248741671958, + "language_loss": 0.76823103, + "learning_rate": 1.2387925558299984e-05, + "loss": 0.77881616, + "num_input_tokens_seen": 400801440, + "router_z_loss_mlp": 0.30761719, + "step": 4840, + "time_per_iteration": 3.1054060459136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062325, + "balance_loss_mlp": 1.0304966, + "epoch": 0.9313197383609081, + "flos": 547828503552.0, + "grad_norm": 0.05682803021706539, + "language_loss": 0.82184482, + "learning_rate": 1.231910112890411e-05, + "loss": 0.83246803, + "num_input_tokens_seen": 400873008, + "router_z_loss_mlp": 0.31835938, + "step": 4841, + "time_per_iteration": 2.6747379302978516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060001, + "balance_loss_mlp": 1.02833903, + "epoch": 0.9315121200461716, + "flos": 468520312320.0, + "grad_norm": 0.0713585448689604, + "language_loss": 0.81151795, + "learning_rate": 1.2250466034024522e-05, + "loss": 0.82211792, + "num_input_tokens_seen": 400935328, + "router_z_loss_mlp": 0.31640625, + "step": 4842, + "time_per_iteration": 2.541785955429077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061026, + "balance_loss_mlp": 1.03003192, + "epoch": 0.9317045017314352, + "flos": 417435863040.0, + "grad_norm": 0.05813435250991612, + "language_loss": 0.77865148, + "learning_rate": 1.2182020300307684e-05, + "loss": 0.7892617, + "num_input_tokens_seen": 401000720, + "router_z_loss_mlp": 0.30957031, + "step": 4843, + "time_per_iteration": 2.506622552871704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061434, + "balance_loss_mlp": 1.03063035, + "epoch": 0.9318968834166987, + "flos": 540207277056.0, + "grad_norm": 0.0508930555691298, + "language_loss": 0.76882333, + "learning_rate": 1.2113763954326729e-05, + "loss": 0.77943766, + "num_input_tokens_seen": 401079664, + "router_z_loss_mlp": 0.30761719, + "step": 4844, + "time_per_iteration": 2.7364871501922607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060895, + "balance_loss_mlp": 1.02944803, + "epoch": 0.9320892651019623, + "flos": 521077197312.0, + "grad_norm": 0.06090788976916376, + "language_loss": 0.80515504, + "learning_rate": 1.2045697022581015e-05, + "loss": 0.81576395, + "num_input_tokens_seen": 401146160, + "router_z_loss_mlp": 0.31420898, + "step": 4845, + "time_per_iteration": 2.6302125453948975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067349, + "balance_loss_mlp": 1.03566349, + "epoch": 0.9322816467872258, + "flos": 581779044864.0, + "grad_norm": 0.05303023243065918, + "language_loss": 0.80538929, + "learning_rate": 1.1977819531496348e-05, + "loss": 0.81606281, + "num_input_tokens_seen": 401223264, + "router_z_loss_mlp": 0.31665039, + "step": 4846, + "time_per_iteration": 2.740966796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057851, + "balance_loss_mlp": 1.02647471, + "epoch": 0.9324740284724894, + "flos": 484484350464.0, + "grad_norm": 0.06270216233520148, + "language_loss": 0.82024521, + "learning_rate": 1.191013150742537e-05, + "loss": 0.83082366, + "num_input_tokens_seen": 401296368, + "router_z_loss_mlp": 0.31347656, + "step": 4847, + "time_per_iteration": 2.6899847984313965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059256, + "balance_loss_mlp": 1.02733231, + "epoch": 0.932666410157753, + "flos": 732227143680.0, + "grad_norm": 0.056276578673258616, + "language_loss": 0.82572961, + "learning_rate": 1.1842632976646672e-05, + "loss": 0.83632219, + "num_input_tokens_seen": 401383936, + "router_z_loss_mlp": 0.3190918, + "step": 4848, + "time_per_iteration": 3.029046058654785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063015, + "balance_loss_mlp": 1.03149569, + "epoch": 0.9328587918430166, + "flos": 965127716352.0, + "grad_norm": 0.055194771265743715, + "language_loss": 0.78700304, + "learning_rate": 1.1775323965365681e-05, + "loss": 0.79763317, + "num_input_tokens_seen": 401468784, + "router_z_loss_mlp": 0.31494141, + "step": 4849, + "time_per_iteration": 3.231687545776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060415, + "balance_loss_mlp": 1.02865744, + "epoch": 0.9330511735282802, + "flos": 614270297088.0, + "grad_norm": 0.052004387996905495, + "language_loss": 0.80041909, + "learning_rate": 1.1708204499713936e-05, + "loss": 0.81102324, + "num_input_tokens_seen": 401539712, + "router_z_loss_mlp": 0.31738281, + "step": 4850, + "time_per_iteration": 2.6882708072662354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066853, + "balance_loss_mlp": 1.03516674, + "epoch": 0.9332435552135436, + "flos": 558823795200.0, + "grad_norm": 0.048091399288315254, + "language_loss": 0.8570627, + "learning_rate": 1.1641274605749653e-05, + "loss": 0.86773121, + "num_input_tokens_seen": 401610432, + "router_z_loss_mlp": 0.31665039, + "step": 4851, + "time_per_iteration": 2.7675979137420654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106148, + "balance_loss_mlp": 1.02984154, + "epoch": 0.9334359368988072, + "flos": 515281996800.0, + "grad_norm": 0.053519206023987825, + "language_loss": 0.82029992, + "learning_rate": 1.1574534309457208e-05, + "loss": 0.83091474, + "num_input_tokens_seen": 401677344, + "router_z_loss_mlp": 0.31616211, + "step": 4852, + "time_per_iteration": 2.5811235904693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061165, + "balance_loss_mlp": 1.03019428, + "epoch": 0.9336283185840708, + "flos": 539527799808.0, + "grad_norm": 0.043954400636649044, + "language_loss": 0.82742274, + "learning_rate": 1.1507983636747488e-05, + "loss": 0.83803439, + "num_input_tokens_seen": 401756864, + "router_z_loss_mlp": 0.30932617, + "step": 4853, + "time_per_iteration": 2.8195674419403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01005382, + "balance_loss_mlp": 0.99732333, + "epoch": 0.9338207002693344, + "flos": 1562003495424.0, + "grad_norm": 0.004487342279678937, + "language_loss": 0.78455019, + "learning_rate": 1.1441622613457824e-05, + "loss": 0.79460394, + "num_input_tokens_seen": 401983664, + "router_z_loss_mlp": 0.08056641, + "step": 4854, + "time_per_iteration": 4.930206298828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060266, + "balance_loss_mlp": 1.02898586, + "epoch": 0.9340130819545979, + "flos": 644951490048.0, + "grad_norm": 0.05547450276969274, + "language_loss": 0.81409979, + "learning_rate": 1.1375451265351833e-05, + "loss": 0.8247025, + "num_input_tokens_seen": 402065744, + "router_z_loss_mlp": 0.3125, + "step": 4855, + "time_per_iteration": 2.8930420875549316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063067, + "balance_loss_mlp": 1.03185797, + "epoch": 0.9342054636398615, + "flos": 503175062016.0, + "grad_norm": 0.05200041653829253, + "language_loss": 0.76766109, + "learning_rate": 1.1309469618119516e-05, + "loss": 0.77829176, + "num_input_tokens_seen": 402137728, + "router_z_loss_mlp": 0.31176758, + "step": 4856, + "time_per_iteration": 2.6574809551239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061148, + "balance_loss_mlp": 1.02962923, + "epoch": 0.934397845325125, + "flos": 592724874240.0, + "grad_norm": 0.07486950539873422, + "language_loss": 0.84321606, + "learning_rate": 1.1243677697377109e-05, + "loss": 0.8538276, + "num_input_tokens_seen": 402220160, + "router_z_loss_mlp": 0.31494141, + "step": 4857, + "time_per_iteration": 2.8692033290863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062363, + "balance_loss_mlp": 1.03198814, + "epoch": 0.9345902270103886, + "flos": 499643845632.0, + "grad_norm": 0.052677646415496285, + "language_loss": 0.80566096, + "learning_rate": 1.1178075528667453e-05, + "loss": 0.8162846, + "num_input_tokens_seen": 402285168, + "router_z_loss_mlp": 0.3034668, + "step": 4858, + "time_per_iteration": 2.6162071228027344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01005374, + "balance_loss_mlp": 0.99731594, + "epoch": 0.9347826086956522, + "flos": 1519563165696.0, + "grad_norm": 0.004492710484548213, + "language_loss": 0.7598772, + "learning_rate": 1.1112663137459566e-05, + "loss": 0.76993096, + "num_input_tokens_seen": 402504912, + "router_z_loss_mlp": 0.08056641, + "step": 4859, + "time_per_iteration": 4.6757166385650635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061693, + "balance_loss_mlp": 1.0306747, + "epoch": 0.9349749903809157, + "flos": 504273558528.0, + "grad_norm": 0.05481870385966651, + "language_loss": 0.81309426, + "learning_rate": 1.1047440549148636e-05, + "loss": 0.82371128, + "num_input_tokens_seen": 402582032, + "router_z_loss_mlp": 0.30981445, + "step": 4860, + "time_per_iteration": 2.792273998260498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059343, + "balance_loss_mlp": 1.02822995, + "epoch": 0.9351673720661793, + "flos": 568636222464.0, + "grad_norm": 0.06410233006416319, + "language_loss": 0.77857924, + "learning_rate": 1.0982407789056514e-05, + "loss": 0.78917265, + "num_input_tokens_seen": 402650144, + "router_z_loss_mlp": 0.31079102, + "step": 4861, + "time_per_iteration": 2.6576950550079346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058849, + "balance_loss_mlp": 1.02806914, + "epoch": 0.9353597537514429, + "flos": 544342367232.0, + "grad_norm": 0.05887078488817451, + "language_loss": 0.86108792, + "learning_rate": 1.0917564882430952e-05, + "loss": 0.87167645, + "num_input_tokens_seen": 402720368, + "router_z_loss_mlp": 0.30737305, + "step": 4862, + "time_per_iteration": 2.635856866836548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062763, + "balance_loss_mlp": 1.0311482, + "epoch": 0.9355521354367065, + "flos": 518743401984.0, + "grad_norm": 0.05060577749050637, + "language_loss": 0.84681392, + "learning_rate": 1.0852911854446368e-05, + "loss": 0.85744154, + "num_input_tokens_seen": 402795568, + "router_z_loss_mlp": 0.31591797, + "step": 4863, + "time_per_iteration": 2.744649887084961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060564, + "balance_loss_mlp": 1.02952147, + "epoch": 0.93574451712197, + "flos": 446087388672.0, + "grad_norm": 0.05386717349507103, + "language_loss": 0.78386593, + "learning_rate": 1.0788448730203237e-05, + "loss": 0.79447162, + "num_input_tokens_seen": 402858784, + "router_z_loss_mlp": 0.31005859, + "step": 4864, + "time_per_iteration": 2.4612977504730225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058845, + "balance_loss_mlp": 1.02804136, + "epoch": 0.9359368988072335, + "flos": 480273656832.0, + "grad_norm": 0.07391698555826227, + "language_loss": 0.77168214, + "learning_rate": 1.072417553472832e-05, + "loss": 0.78227055, + "num_input_tokens_seen": 402924144, + "router_z_loss_mlp": 0.30761719, + "step": 4865, + "time_per_iteration": 2.5211689472198486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062281, + "balance_loss_mlp": 1.03181124, + "epoch": 0.9361292804924971, + "flos": 496876474368.0, + "grad_norm": 0.058428574526624755, + "language_loss": 0.85151851, + "learning_rate": 1.0660092292974766e-05, + "loss": 0.86214131, + "num_input_tokens_seen": 402987488, + "router_z_loss_mlp": 0.30419922, + "step": 4866, + "time_per_iteration": 2.622624635696411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060087, + "balance_loss_mlp": 1.0294745, + "epoch": 0.9363216621777607, + "flos": 617830626816.0, + "grad_norm": 0.055760701356742395, + "language_loss": 0.84262055, + "learning_rate": 1.059619902982184e-05, + "loss": 0.85322142, + "num_input_tokens_seen": 403058224, + "router_z_loss_mlp": 0.30566406, + "step": 4867, + "time_per_iteration": 2.7364232540130615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01005383, + "balance_loss_mlp": 0.99727678, + "epoch": 0.9365140438630243, + "flos": 1415169570816.0, + "grad_norm": 0.004497337042276508, + "language_loss": 0.79203337, + "learning_rate": 1.053249577007509e-05, + "loss": 0.80208719, + "num_input_tokens_seen": 403289072, + "router_z_loss_mlp": 0.08105469, + "step": 4868, + "time_per_iteration": 4.865636110305786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062556, + "balance_loss_mlp": 1.03108454, + "epoch": 0.9367064255482878, + "flos": 590217960960.0, + "grad_norm": 0.04987850830161197, + "language_loss": 0.81500798, + "learning_rate": 1.0468982538466287e-05, + "loss": 0.82563359, + "num_input_tokens_seen": 403361728, + "router_z_loss_mlp": 0.31445312, + "step": 4869, + "time_per_iteration": 2.699848175048828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058901, + "balance_loss_mlp": 1.02764463, + "epoch": 0.9368988072335513, + "flos": 526384977408.0, + "grad_norm": 0.05441615624063978, + "language_loss": 0.81575727, + "learning_rate": 1.0405659359653597e-05, + "loss": 0.82634622, + "num_input_tokens_seen": 403431536, + "router_z_loss_mlp": 0.31225586, + "step": 4870, + "time_per_iteration": 2.6536993980407715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063994, + "balance_loss_mlp": 1.03202248, + "epoch": 0.9370911889188149, + "flos": 742880581632.0, + "grad_norm": 0.05850362106366467, + "language_loss": 0.78898335, + "learning_rate": 1.034252625822113e-05, + "loss": 0.79962337, + "num_input_tokens_seen": 403504768, + "router_z_loss_mlp": 0.31958008, + "step": 4871, + "time_per_iteration": 2.871039867401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062533, + "balance_loss_mlp": 1.03160965, + "epoch": 0.9372835706040785, + "flos": 545779897344.0, + "grad_norm": 0.051618847830223594, + "language_loss": 0.78720641, + "learning_rate": 1.0279583258679448e-05, + "loss": 0.79783177, + "num_input_tokens_seen": 403575584, + "router_z_loss_mlp": 0.30883789, + "step": 4872, + "time_per_iteration": 2.647601842880249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106415, + "balance_loss_mlp": 1.03224909, + "epoch": 0.9374759522893421, + "flos": 491367873024.0, + "grad_norm": 0.055623553349456685, + "language_loss": 0.81515515, + "learning_rate": 1.0216830385465003e-05, + "loss": 0.82579672, + "num_input_tokens_seen": 403648720, + "router_z_loss_mlp": 0.31884766, + "step": 4873, + "time_per_iteration": 2.6937618255615234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060458, + "balance_loss_mlp": 1.02903473, + "epoch": 0.9376683339746056, + "flos": 578144521728.0, + "grad_norm": 0.055922001744963604, + "language_loss": 0.82857215, + "learning_rate": 1.0154267662940809e-05, + "loss": 0.83917665, + "num_input_tokens_seen": 403721392, + "router_z_loss_mlp": 0.31396484, + "step": 4874, + "time_per_iteration": 2.6639533042907715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060789, + "balance_loss_mlp": 1.02912736, + "epoch": 0.9378607156598692, + "flos": 506039947776.0, + "grad_norm": 0.05510447696512615, + "language_loss": 0.80243266, + "learning_rate": 1.0091895115395766e-05, + "loss": 0.81304049, + "num_input_tokens_seen": 403792112, + "router_z_loss_mlp": 0.31640625, + "step": 4875, + "time_per_iteration": 2.614619016647339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059994, + "balance_loss_mlp": 1.02852273, + "epoch": 0.9380530973451328, + "flos": 519753148416.0, + "grad_norm": 0.07676786722670897, + "language_loss": 0.77718991, + "learning_rate": 1.0029712767045062e-05, + "loss": 0.78778982, + "num_input_tokens_seen": 403860928, + "router_z_loss_mlp": 0.31445312, + "step": 4876, + "time_per_iteration": 2.632483720779419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060472, + "balance_loss_mlp": 1.02964473, + "epoch": 0.9382454790303963, + "flos": 557533241856.0, + "grad_norm": 0.09739330390757249, + "language_loss": 0.84747428, + "learning_rate": 9.967720642029999e-06, + "loss": 0.85807896, + "num_input_tokens_seen": 403928240, + "router_z_loss_mlp": 0.30786133, + "step": 4877, + "time_per_iteration": 2.667158365249634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105889, + "balance_loss_mlp": 1.02815771, + "epoch": 0.9384378607156598, + "flos": 695149848576.0, + "grad_norm": 0.38308809515223985, + "language_loss": 0.81761467, + "learning_rate": 9.905918764418153e-06, + "loss": 0.82820356, + "num_input_tokens_seen": 404004320, + "router_z_loss_mlp": 0.30688477, + "step": 4878, + "time_per_iteration": 2.888810873031616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059406, + "balance_loss_mlp": 1.0284121, + "epoch": 0.9386302424009234, + "flos": 554480681472.0, + "grad_norm": 0.0575140000640527, + "language_loss": 0.80870175, + "learning_rate": 9.844307158203058e-06, + "loss": 0.81929588, + "num_input_tokens_seen": 404077040, + "router_z_loss_mlp": 0.30957031, + "step": 4879, + "time_per_iteration": 2.649317979812622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063144, + "balance_loss_mlp": 1.03210211, + "epoch": 0.938822624086187, + "flos": 566711271936.0, + "grad_norm": 0.05421321108233327, + "language_loss": 0.79483026, + "learning_rate": 9.782885847304469e-06, + "loss": 0.80546176, + "num_input_tokens_seen": 404145248, + "router_z_loss_mlp": 0.31005859, + "step": 4880, + "time_per_iteration": 2.6381187438964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060611, + "balance_loss_mlp": 1.02975965, + "epoch": 0.9390150057714506, + "flos": 417367461888.0, + "grad_norm": 0.05143950105590942, + "language_loss": 0.8027178, + "learning_rate": 9.721654855568196e-06, + "loss": 0.81332386, + "num_input_tokens_seen": 404212000, + "router_z_loss_mlp": 0.30810547, + "step": 4881, + "time_per_iteration": 2.5627872943878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059492, + "balance_loss_mlp": 1.02883101, + "epoch": 0.9392073874567142, + "flos": 1553281256448.0, + "grad_norm": 0.05634975894400428, + "language_loss": 0.76259017, + "learning_rate": 9.660614206766394e-06, + "loss": 0.77318507, + "num_input_tokens_seen": 404305408, + "router_z_loss_mlp": 0.30615234, + "step": 4882, + "time_per_iteration": 3.692448854446411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061611, + "balance_loss_mlp": 1.03099859, + "epoch": 0.9393997691419776, + "flos": 652238065152.0, + "grad_norm": 0.06867303852107298, + "language_loss": 0.77672702, + "learning_rate": 9.59976392459705e-06, + "loss": 0.78734314, + "num_input_tokens_seen": 404383248, + "router_z_loss_mlp": 0.30566406, + "step": 4883, + "time_per_iteration": 2.7691049575805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009065, + "balance_loss_mlp": 1.00091124, + "epoch": 0.9395921508272412, + "flos": 1552480639488.0, + "grad_norm": 0.003319864834589177, + "language_loss": 0.78170681, + "learning_rate": 9.539104032684209e-06, + "loss": 0.7917974, + "num_input_tokens_seen": 404615264, + "router_z_loss_mlp": 0.08154297, + "step": 4884, + "time_per_iteration": 4.804852247238159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062685, + "balance_loss_mlp": 1.03123808, + "epoch": 0.9397845325125048, + "flos": 497881838592.0, + "grad_norm": 0.054651008950097155, + "language_loss": 0.782938, + "learning_rate": 9.478634554578314e-06, + "loss": 0.7935648, + "num_input_tokens_seen": 404684656, + "router_z_loss_mlp": 0.31420898, + "step": 4885, + "time_per_iteration": 2.6088409423828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060175, + "balance_loss_mlp": 1.02944279, + "epoch": 0.9399769141977684, + "flos": 498348910080.0, + "grad_norm": 0.05219236638184246, + "language_loss": 0.83581132, + "learning_rate": 9.418355513755638e-06, + "loss": 0.84641308, + "num_input_tokens_seen": 404752096, + "router_z_loss_mlp": 0.30688477, + "step": 4886, + "time_per_iteration": 2.6242825984954834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009087, + "balance_loss_mlp": 1.00093293, + "epoch": 0.9401692958830319, + "flos": 1401709473792.0, + "grad_norm": 0.0033293422286518343, + "language_loss": 0.79332191, + "learning_rate": 9.358266933618575e-06, + "loss": 0.8034128, + "num_input_tokens_seen": 404980944, + "router_z_loss_mlp": 0.08154297, + "step": 4887, + "time_per_iteration": 4.809056997299194 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061863, + "balance_loss_mlp": 1.03063035, + "epoch": 0.9403616775682955, + "flos": 539852276736.0, + "grad_norm": 0.04229008056200792, + "language_loss": 0.85158825, + "learning_rate": 9.298368837495575e-06, + "loss": 0.86220682, + "num_input_tokens_seen": 405056688, + "router_z_loss_mlp": 0.31201172, + "step": 4888, + "time_per_iteration": 2.714353084564209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010096, + "balance_loss_mlp": 1.00144565, + "epoch": 0.9405540592535591, + "flos": 1321340663808.0, + "grad_norm": 0.004028749544672952, + "language_loss": 0.75169432, + "learning_rate": 9.238661248641089e-06, + "loss": 0.76179039, + "num_input_tokens_seen": 405284656, + "router_z_loss_mlp": 0.08154297, + "step": 4889, + "time_per_iteration": 4.866438388824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060754, + "balance_loss_mlp": 1.02985442, + "epoch": 0.9407464409388226, + "flos": 572097627648.0, + "grad_norm": 0.05566218913854017, + "language_loss": 0.82558531, + "learning_rate": 9.179144190235799e-06, + "loss": 0.83619285, + "num_input_tokens_seen": 405351584, + "router_z_loss_mlp": 0.30859375, + "step": 4890, + "time_per_iteration": 2.618964195251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064127, + "balance_loss_mlp": 1.03220248, + "epoch": 0.9409388226240862, + "flos": 510994137600.0, + "grad_norm": 0.04766222233300551, + "language_loss": 0.7677294, + "learning_rate": 9.119817685386112e-06, + "loss": 0.77837062, + "num_input_tokens_seen": 405425712, + "router_z_loss_mlp": 0.3190918, + "step": 4891, + "time_per_iteration": 2.7037875652313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009613, + "balance_loss_mlp": 1.00145948, + "epoch": 0.9411312043093497, + "flos": 1569060135936.0, + "grad_norm": 0.0040360903132469, + "language_loss": 0.80241883, + "learning_rate": 9.06068175712471e-06, + "loss": 0.81251502, + "num_input_tokens_seen": 405655760, + "router_z_loss_mlp": 0.08154297, + "step": 4892, + "time_per_iteration": 4.862957715988159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061584, + "balance_loss_mlp": 1.03099477, + "epoch": 0.9413235859946133, + "flos": 569197836288.0, + "grad_norm": 0.06563594953788358, + "language_loss": 0.78013027, + "learning_rate": 9.001736428410234e-06, + "loss": 0.79074609, + "num_input_tokens_seen": 405731664, + "router_z_loss_mlp": 0.30541992, + "step": 4893, + "time_per_iteration": 2.72495436668396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060652, + "balance_loss_mlp": 1.02989554, + "epoch": 0.9415159676798769, + "flos": 781567114752.0, + "grad_norm": 0.07880011282868978, + "language_loss": 0.80272818, + "learning_rate": 8.942981722127263e-06, + "loss": 0.8133347, + "num_input_tokens_seen": 405808128, + "router_z_loss_mlp": 0.30712891, + "step": 4894, + "time_per_iteration": 3.00886869430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064323, + "balance_loss_mlp": 1.03375769, + "epoch": 0.9417083493651405, + "flos": 848960428032.0, + "grad_norm": 0.06735892348971287, + "language_loss": 0.80011809, + "learning_rate": 8.884417661086331e-06, + "loss": 0.81076133, + "num_input_tokens_seen": 405892448, + "router_z_loss_mlp": 0.30517578, + "step": 4895, + "time_per_iteration": 3.1446125507354736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058314, + "balance_loss_mlp": 1.02741504, + "epoch": 0.941900731050404, + "flos": 529054834176.0, + "grad_norm": 0.05884920935493865, + "language_loss": 0.85655093, + "learning_rate": 8.826044268024025e-06, + "loss": 0.86713409, + "num_input_tokens_seen": 405966736, + "router_z_loss_mlp": 0.30859375, + "step": 4896, + "time_per_iteration": 2.6839241981506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061174, + "balance_loss_mlp": 1.02986979, + "epoch": 0.9420931127356675, + "flos": 556799920128.0, + "grad_norm": 0.05000327104616657, + "language_loss": 0.80053771, + "learning_rate": 8.767861565602997e-06, + "loss": 0.81114948, + "num_input_tokens_seen": 406043264, + "router_z_loss_mlp": 0.31274414, + "step": 4897, + "time_per_iteration": 2.7563629150390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061483, + "balance_loss_mlp": 1.03072667, + "epoch": 0.9422854944209311, + "flos": 652233682944.0, + "grad_norm": 0.05764227997043217, + "language_loss": 0.86452037, + "learning_rate": 8.709869576411733e-06, + "loss": 0.87513518, + "num_input_tokens_seen": 406119552, + "router_z_loss_mlp": 0.30712891, + "step": 4898, + "time_per_iteration": 2.875542640686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058844, + "balance_loss_mlp": 1.02777863, + "epoch": 0.9424778761061947, + "flos": 553417090560.0, + "grad_norm": 0.054125332002499735, + "language_loss": 0.83954608, + "learning_rate": 8.65206832296478e-06, + "loss": 0.85013449, + "num_input_tokens_seen": 406192464, + "router_z_loss_mlp": 0.31030273, + "step": 4899, + "time_per_iteration": 2.676485300064087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060281, + "balance_loss_mlp": 1.02921546, + "epoch": 0.9426702577914583, + "flos": 588287218176.0, + "grad_norm": 0.05730536240438454, + "language_loss": 0.79517835, + "learning_rate": 8.594457827702406e-06, + "loss": 0.80578119, + "num_input_tokens_seen": 406262640, + "router_z_loss_mlp": 0.31030273, + "step": 4900, + "time_per_iteration": 2.698810338973999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061683, + "balance_loss_mlp": 1.03064072, + "epoch": 0.9428626394767218, + "flos": 616329077760.0, + "grad_norm": 0.07093887274859992, + "language_loss": 0.78249103, + "learning_rate": 8.537038112991114e-06, + "loss": 0.79310787, + "num_input_tokens_seen": 406341328, + "router_z_loss_mlp": 0.31005859, + "step": 4901, + "time_per_iteration": 2.7636358737945557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061896, + "balance_loss_mlp": 1.03025842, + "epoch": 0.9430550211619854, + "flos": 610129414656.0, + "grad_norm": 0.057626683505121824, + "language_loss": 0.8184545, + "learning_rate": 8.479809201123178e-06, + "loss": 0.82907343, + "num_input_tokens_seen": 406418864, + "router_z_loss_mlp": 0.31616211, + "step": 4902, + "time_per_iteration": 2.6953253746032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061661, + "balance_loss_mlp": 1.03064311, + "epoch": 0.943247402847249, + "flos": 565726256640.0, + "grad_norm": 0.056063093918735984, + "language_loss": 0.78086585, + "learning_rate": 8.422771114316885e-06, + "loss": 0.79148251, + "num_input_tokens_seen": 406492320, + "router_z_loss_mlp": 0.30981445, + "step": 4903, + "time_per_iteration": 2.6811859607696533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060874, + "balance_loss_mlp": 1.03061843, + "epoch": 0.9434397845325125, + "flos": 526779265536.0, + "grad_norm": 0.05832485336087126, + "language_loss": 0.81253076, + "learning_rate": 8.365923874716297e-06, + "loss": 0.82313943, + "num_input_tokens_seen": 406560448, + "router_z_loss_mlp": 0.30200195, + "step": 4904, + "time_per_iteration": 2.5902273654937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064715, + "balance_loss_mlp": 1.03333879, + "epoch": 0.943632166217776, + "flos": 593167214592.0, + "grad_norm": 0.0538585081715504, + "language_loss": 0.82290983, + "learning_rate": 8.309267504391593e-06, + "loss": 0.83355695, + "num_input_tokens_seen": 406631376, + "router_z_loss_mlp": 0.31347656, + "step": 4905, + "time_per_iteration": 2.7094435691833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061362, + "balance_loss_mlp": 1.03051043, + "epoch": 0.9438245479030396, + "flos": 572468594688.0, + "grad_norm": 0.04298887979827658, + "language_loss": 0.85656631, + "learning_rate": 8.252802025338623e-06, + "loss": 0.86717987, + "num_input_tokens_seen": 406713728, + "router_z_loss_mlp": 0.30810547, + "step": 4906, + "time_per_iteration": 2.802527904510498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060239, + "balance_loss_mlp": 1.02919722, + "epoch": 0.9440169295883032, + "flos": 488018539008.0, + "grad_norm": 0.05907547898577151, + "language_loss": 0.81667566, + "learning_rate": 8.196527459479242e-06, + "loss": 0.82727802, + "num_input_tokens_seen": 406779168, + "router_z_loss_mlp": 0.31005859, + "step": 4907, + "time_per_iteration": 2.5595598220825195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061232, + "balance_loss_mlp": 1.02968919, + "epoch": 0.9442093112735668, + "flos": 731399279616.0, + "grad_norm": 0.05714065590928067, + "language_loss": 0.73537469, + "learning_rate": 8.140443828661137e-06, + "loss": 0.74598706, + "num_input_tokens_seen": 406860816, + "router_z_loss_mlp": 0.31518555, + "step": 4908, + "time_per_iteration": 3.017683267593384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065511, + "balance_loss_mlp": 1.03446937, + "epoch": 0.9444016929588304, + "flos": 570763404288.0, + "grad_norm": 0.06175757947134421, + "language_loss": 0.82048225, + "learning_rate": 8.084551154658004e-06, + "loss": 0.83113736, + "num_input_tokens_seen": 406929888, + "router_z_loss_mlp": 0.31005859, + "step": 4909, + "time_per_iteration": 2.673168659210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062254, + "balance_loss_mlp": 1.03032982, + "epoch": 0.9445940746440938, + "flos": 509038663680.0, + "grad_norm": 0.0760284691333908, + "language_loss": 0.85880816, + "learning_rate": 8.028849459169318e-06, + "loss": 0.86943078, + "num_input_tokens_seen": 406998224, + "router_z_loss_mlp": 0.3190918, + "step": 4910, + "time_per_iteration": 2.5887327194213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060208, + "balance_loss_mlp": 1.02926159, + "epoch": 0.9447864563293574, + "flos": 624247077888.0, + "grad_norm": 0.0535010023544938, + "language_loss": 0.80667341, + "learning_rate": 7.97333876382028e-06, + "loss": 0.81727552, + "num_input_tokens_seen": 407075088, + "router_z_loss_mlp": 0.30908203, + "step": 4911, + "time_per_iteration": 2.823601245880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059967, + "balance_loss_mlp": 1.02799463, + "epoch": 0.944978838014621, + "flos": 505011262464.0, + "grad_norm": 0.06633809570148991, + "language_loss": 0.8041541, + "learning_rate": 7.918019090162098e-06, + "loss": 0.81475377, + "num_input_tokens_seen": 407147792, + "router_z_loss_mlp": 0.31958008, + "step": 4912, + "time_per_iteration": 2.7652816772460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009718, + "balance_loss_mlp": 1.00156379, + "epoch": 0.9451712196998846, + "flos": 1483371809280.0, + "grad_norm": 0.00406090983810477, + "language_loss": 0.78287339, + "learning_rate": 7.862890459671812e-06, + "loss": 0.7929706, + "num_input_tokens_seen": 407387216, + "router_z_loss_mlp": 0.08154297, + "step": 4913, + "time_per_iteration": 4.969675779342651 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058714, + "balance_loss_mlp": 1.02781546, + "epoch": 0.9453636013851482, + "flos": 520885140480.0, + "grad_norm": 0.0550476227772574, + "language_loss": 0.9011662, + "learning_rate": 7.80795289375219e-06, + "loss": 0.9117533, + "num_input_tokens_seen": 407457664, + "router_z_loss_mlp": 0.30859375, + "step": 4914, + "time_per_iteration": 2.6254100799560547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009716, + "balance_loss_mlp": 1.00156236, + "epoch": 0.9455559830704117, + "flos": 1496060706816.0, + "grad_norm": 0.004058659107795025, + "language_loss": 0.8356235, + "learning_rate": 7.75320641373195e-06, + "loss": 0.84572065, + "num_input_tokens_seen": 407700256, + "router_z_loss_mlp": 0.08154297, + "step": 4915, + "time_per_iteration": 4.945310831069946 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062356, + "balance_loss_mlp": 1.03174341, + "epoch": 0.9457483647556753, + "flos": 497871664128.0, + "grad_norm": 0.05064963619563798, + "language_loss": 0.81528383, + "learning_rate": 7.698651040865534e-06, + "loss": 0.82590735, + "num_input_tokens_seen": 407770080, + "router_z_loss_mlp": 0.30566406, + "step": 4916, + "time_per_iteration": 2.631326913833618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065049, + "balance_loss_mlp": 1.03434074, + "epoch": 0.9459407464409388, + "flos": 1018979536896.0, + "grad_norm": 0.047302370588053054, + "language_loss": 0.82041919, + "learning_rate": 7.644286796333222e-06, + "loss": 0.83106971, + "num_input_tokens_seen": 407854640, + "router_z_loss_mlp": 0.30664062, + "step": 4917, + "time_per_iteration": 3.3984169960021973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065516, + "balance_loss_mlp": 1.0340929, + "epoch": 0.9461331281262024, + "flos": 513332315136.0, + "grad_norm": 0.06405124336828617, + "language_loss": 0.81178218, + "learning_rate": 7.590113701241075e-06, + "loss": 0.82243741, + "num_input_tokens_seen": 407922704, + "router_z_loss_mlp": 0.31396484, + "step": 4918, + "time_per_iteration": 2.6147992610931396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061487, + "balance_loss_mlp": 1.03058767, + "epoch": 0.9463255098114659, + "flos": 527768663040.0, + "grad_norm": 0.061511478598610246, + "language_loss": 0.77884614, + "learning_rate": 7.536131776620936e-06, + "loss": 0.78946102, + "num_input_tokens_seen": 407991136, + "router_z_loss_mlp": 0.30859375, + "step": 4919, + "time_per_iteration": 2.567692756652832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060745, + "balance_loss_mlp": 1.02972698, + "epoch": 0.9465178914967295, + "flos": 505798428672.0, + "grad_norm": 0.06093265985390115, + "language_loss": 0.83455086, + "learning_rate": 7.482341043430485e-06, + "loss": 0.84515834, + "num_input_tokens_seen": 408056576, + "router_z_loss_mlp": 0.30981445, + "step": 4920, + "time_per_iteration": 2.5662806034088135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058812, + "balance_loss_mlp": 1.02672136, + "epoch": 0.9467102731819931, + "flos": 659934895104.0, + "grad_norm": 0.05112335734363304, + "language_loss": 0.85568339, + "learning_rate": 7.428741522553184e-06, + "loss": 0.8662715, + "num_input_tokens_seen": 408136960, + "router_z_loss_mlp": 0.32080078, + "step": 4921, + "time_per_iteration": 2.886445999145508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059962, + "balance_loss_mlp": 1.02877688, + "epoch": 0.9469026548672567, + "flos": 674854281216.0, + "grad_norm": 0.04908759273743658, + "language_loss": 0.89305884, + "learning_rate": 7.375333234798054e-06, + "loss": 0.90365845, + "num_input_tokens_seen": 408218304, + "router_z_loss_mlp": 0.31152344, + "step": 4922, + "time_per_iteration": 2.920933961868286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061885, + "balance_loss_mlp": 1.03074789, + "epoch": 0.9470950365525203, + "flos": 513701872128.0, + "grad_norm": 0.06478709373660563, + "language_loss": 0.79722393, + "learning_rate": 7.32211620090012e-06, + "loss": 0.80784273, + "num_input_tokens_seen": 408287936, + "router_z_loss_mlp": 0.31103516, + "step": 4923, + "time_per_iteration": 2.6531217098236084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106274, + "balance_loss_mlp": 1.03122091, + "epoch": 0.9472874182377837, + "flos": 549823265280.0, + "grad_norm": 0.05014649935871143, + "language_loss": 0.81039178, + "learning_rate": 7.269090441520132e-06, + "loss": 0.82101917, + "num_input_tokens_seen": 408365568, + "router_z_loss_mlp": 0.31494141, + "step": 4924, + "time_per_iteration": 2.7968811988830566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060368, + "balance_loss_mlp": 1.0297308, + "epoch": 0.9474797999230473, + "flos": 542510548992.0, + "grad_norm": 0.05010652307615933, + "language_loss": 0.80013597, + "learning_rate": 7.216255977244457e-06, + "loss": 0.81073964, + "num_input_tokens_seen": 408431248, + "router_z_loss_mlp": 0.3059082, + "step": 4925, + "time_per_iteration": 2.610203266143799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106326, + "balance_loss_mlp": 1.03155029, + "epoch": 0.9476721816083109, + "flos": 844291427328.0, + "grad_norm": 0.056324444552239616, + "language_loss": 0.85505098, + "learning_rate": 7.163612828585242e-06, + "loss": 0.86568356, + "num_input_tokens_seen": 408514112, + "router_z_loss_mlp": 0.31689453, + "step": 4926, + "time_per_iteration": 3.0810704231262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061593, + "balance_loss_mlp": 1.03052688, + "epoch": 0.9478645632935745, + "flos": 637717349376.0, + "grad_norm": 0.05782852110229401, + "language_loss": 0.79276693, + "learning_rate": 7.1111610159803605e-06, + "loss": 0.80338287, + "num_input_tokens_seen": 408585968, + "router_z_loss_mlp": 0.31030273, + "step": 4927, + "time_per_iteration": 2.7429778575897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060754, + "balance_loss_mlp": 1.02968776, + "epoch": 0.948056944978838, + "flos": 656531716608.0, + "grad_norm": 0.05213979762076608, + "language_loss": 0.75814188, + "learning_rate": 7.058900559793469e-06, + "loss": 0.76874942, + "num_input_tokens_seen": 408665456, + "router_z_loss_mlp": 0.31030273, + "step": 4928, + "time_per_iteration": 2.811953067779541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010646, + "balance_loss_mlp": 1.03343904, + "epoch": 0.9482493266641016, + "flos": 440676301824.0, + "grad_norm": 0.06159460987031525, + "language_loss": 0.83276188, + "learning_rate": 7.00683148031378e-06, + "loss": 0.84340787, + "num_input_tokens_seen": 408730192, + "router_z_loss_mlp": 0.3112793, + "step": 4929, + "time_per_iteration": 2.560638666152954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059451, + "balance_loss_mlp": 1.02864742, + "epoch": 0.9484417083493651, + "flos": 545707113984.0, + "grad_norm": 0.05635941131383605, + "language_loss": 0.77704895, + "learning_rate": 6.9549537977564024e-06, + "loss": 0.78764343, + "num_input_tokens_seen": 408807616, + "router_z_loss_mlp": 0.30761719, + "step": 4930, + "time_per_iteration": 2.7607994079589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061911, + "balance_loss_mlp": 1.03070188, + "epoch": 0.9486340900346287, + "flos": 538325996544.0, + "grad_norm": 0.08183540435838546, + "language_loss": 0.7971375, + "learning_rate": 6.903267532262003e-06, + "loss": 0.80775654, + "num_input_tokens_seen": 408883552, + "router_z_loss_mlp": 0.31176758, + "step": 4931, + "time_per_iteration": 2.6748297214508057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060277, + "balance_loss_mlp": 1.02899647, + "epoch": 0.9488264717198923, + "flos": 681362454528.0, + "grad_norm": 0.05258246646499724, + "language_loss": 0.85742575, + "learning_rate": 6.851772703896975e-06, + "loss": 0.86802852, + "num_input_tokens_seen": 408956400, + "router_z_loss_mlp": 0.3125, + "step": 4932, + "time_per_iteration": 2.8221163749694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061977, + "balance_loss_mlp": 1.03086352, + "epoch": 0.9490188534051558, + "flos": 462365729280.0, + "grad_norm": 0.060862795367112775, + "language_loss": 0.88211328, + "learning_rate": 6.8004693326533805e-06, + "loss": 0.8927331, + "num_input_tokens_seen": 409019904, + "router_z_loss_mlp": 0.31079102, + "step": 4933, + "time_per_iteration": 2.5040442943573 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059018, + "balance_loss_mlp": 1.02904928, + "epoch": 0.9492112350904194, + "flos": 542865549312.0, + "grad_norm": 0.05128444619283133, + "language_loss": 0.82685566, + "learning_rate": 6.7493574384489e-06, + "loss": 0.83744586, + "num_input_tokens_seen": 409094288, + "router_z_loss_mlp": 0.29907227, + "step": 4934, + "time_per_iteration": 2.680206537246704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060329, + "balance_loss_mlp": 1.02938271, + "epoch": 0.949403616775683, + "flos": 550040053248.0, + "grad_norm": 0.05239087368411042, + "language_loss": 0.84056008, + "learning_rate": 6.698437041126992e-06, + "loss": 0.85116339, + "num_input_tokens_seen": 409169120, + "router_z_loss_mlp": 0.30908203, + "step": 4935, + "time_per_iteration": 2.693420648574829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061379, + "balance_loss_mlp": 1.03074229, + "epoch": 0.9495959984609466, + "flos": 598105437696.0, + "grad_norm": 0.04662425876845386, + "language_loss": 0.82682049, + "learning_rate": 6.647708160456678e-06, + "loss": 0.83743429, + "num_input_tokens_seen": 409243200, + "router_z_loss_mlp": 0.3059082, + "step": 4936, + "time_per_iteration": 2.7124640941619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063372, + "balance_loss_mlp": 1.03273535, + "epoch": 0.94978838014621, + "flos": 608130270720.0, + "grad_norm": 0.07024683277055332, + "language_loss": 0.82249677, + "learning_rate": 6.597170816132702e-06, + "loss": 0.8331306, + "num_input_tokens_seen": 409319264, + "router_z_loss_mlp": 0.3059082, + "step": 4937, + "time_per_iteration": 2.810114622116089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063007, + "balance_loss_mlp": 1.03132105, + "epoch": 0.9499807618314736, + "flos": 540575424000.0, + "grad_norm": 0.04925833827066514, + "language_loss": 0.8649928, + "learning_rate": 6.546825027775427e-06, + "loss": 0.87562287, + "num_input_tokens_seen": 409389840, + "router_z_loss_mlp": 0.31665039, + "step": 4938, + "time_per_iteration": 2.6485135555267334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065564, + "balance_loss_mlp": 1.03521299, + "epoch": 0.9501731435167372, + "flos": 594323937792.0, + "grad_norm": 0.046317056563734006, + "language_loss": 0.82939029, + "learning_rate": 6.496670814930717e-06, + "loss": 0.84004581, + "num_input_tokens_seen": 409458752, + "router_z_loss_mlp": 0.30297852, + "step": 4939, + "time_per_iteration": 2.687056303024292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063883, + "balance_loss_mlp": 1.03310299, + "epoch": 0.9503655252020008, + "flos": 453906464256.0, + "grad_norm": 0.06011478879513061, + "language_loss": 0.80201852, + "learning_rate": 6.446708197070161e-06, + "loss": 0.81265736, + "num_input_tokens_seen": 409525008, + "router_z_loss_mlp": 0.30737305, + "step": 4940, + "time_per_iteration": 2.528292179107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061453, + "balance_loss_mlp": 1.03041101, + "epoch": 0.9505579068872644, + "flos": 667649253888.0, + "grad_norm": 0.055113851279690214, + "language_loss": 0.846946, + "learning_rate": 6.396937193591079e-06, + "loss": 0.85756052, + "num_input_tokens_seen": 409603376, + "router_z_loss_mlp": 0.31005859, + "step": 4941, + "time_per_iteration": 2.823488235473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061943, + "balance_loss_mlp": 1.03085279, + "epoch": 0.9507502885725279, + "flos": 401989768704.0, + "grad_norm": 0.05825643192840984, + "language_loss": 0.81586736, + "learning_rate": 6.347357823816235e-06, + "loss": 0.82648677, + "num_input_tokens_seen": 409667168, + "router_z_loss_mlp": 0.31054688, + "step": 4942, + "time_per_iteration": 2.495529890060425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058501, + "balance_loss_mlp": 1.02719688, + "epoch": 0.9509426702577914, + "flos": 700015288320.0, + "grad_norm": 0.051863922569766165, + "language_loss": 0.79421437, + "learning_rate": 6.297970106994011e-06, + "loss": 0.80479932, + "num_input_tokens_seen": 409746832, + "router_z_loss_mlp": 0.31274414, + "step": 4943, + "time_per_iteration": 2.978654384613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058704, + "balance_loss_mlp": 1.02778077, + "epoch": 0.951135051943055, + "flos": 501170125824.0, + "grad_norm": 0.057314237178262395, + "language_loss": 0.82580554, + "learning_rate": 6.2487740622985126e-06, + "loss": 0.83639264, + "num_input_tokens_seen": 409813792, + "router_z_loss_mlp": 0.30883789, + "step": 4944, + "time_per_iteration": 2.5696232318878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059453, + "balance_loss_mlp": 1.02831542, + "epoch": 0.9513274336283186, + "flos": 614310994944.0, + "grad_norm": 0.06995172115852209, + "language_loss": 0.81611979, + "learning_rate": 6.1997697088292395e-06, + "loss": 0.82671428, + "num_input_tokens_seen": 409898848, + "router_z_loss_mlp": 0.31103516, + "step": 4945, + "time_per_iteration": 2.8979203701019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062862, + "balance_loss_mlp": 1.03184378, + "epoch": 0.9515198153135821, + "flos": 519334129152.0, + "grad_norm": 0.06029032553674963, + "language_loss": 0.81646979, + "learning_rate": 6.150957065611363e-06, + "loss": 0.82709849, + "num_input_tokens_seen": 409966368, + "router_z_loss_mlp": 0.30981445, + "step": 4946, + "time_per_iteration": 2.574259042739868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010644, + "balance_loss_mlp": 1.03381109, + "epoch": 0.9517121969988457, + "flos": 664622834688.0, + "grad_norm": 0.054183669348516254, + "language_loss": 0.76488018, + "learning_rate": 6.102336151595667e-06, + "loss": 0.7755242, + "num_input_tokens_seen": 410048496, + "router_z_loss_mlp": 0.30566406, + "step": 4947, + "time_per_iteration": 2.927349805831909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059457, + "balance_loss_mlp": 1.02791381, + "epoch": 0.9519045786841093, + "flos": 676108518912.0, + "grad_norm": 0.05907314462519681, + "language_loss": 0.75945526, + "learning_rate": 6.053906985658553e-06, + "loss": 0.77004981, + "num_input_tokens_seen": 410121840, + "router_z_loss_mlp": 0.31518555, + "step": 4948, + "time_per_iteration": 2.787550210952759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059452, + "balance_loss_mlp": 1.02836227, + "epoch": 0.9520969603693729, + "flos": 652593065472.0, + "grad_norm": 0.05009872847852139, + "language_loss": 0.80296874, + "learning_rate": 6.005669586601814e-06, + "loss": 0.81356323, + "num_input_tokens_seen": 410199152, + "router_z_loss_mlp": 0.31054688, + "step": 4949, + "time_per_iteration": 2.8136212825775146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062928, + "balance_loss_mlp": 1.03183794, + "epoch": 0.9522893420546364, + "flos": 742935836160.0, + "grad_norm": 0.046449409854488935, + "language_loss": 0.8303045, + "learning_rate": 5.957623973152748e-06, + "loss": 0.8409338, + "num_input_tokens_seen": 410285392, + "router_z_loss_mlp": 0.31054688, + "step": 4950, + "time_per_iteration": 3.0413155555725098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061178, + "balance_loss_mlp": 1.03013575, + "epoch": 0.9524817237398999, + "flos": 761364679680.0, + "grad_norm": 0.06237526863901178, + "language_loss": 0.80663252, + "learning_rate": 5.909770163964545e-06, + "loss": 0.81724423, + "num_input_tokens_seen": 410359872, + "router_z_loss_mlp": 0.31005859, + "step": 4951, + "time_per_iteration": 2.926941156387329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060308, + "balance_loss_mlp": 1.02936172, + "epoch": 0.9526741054251635, + "flos": 528871541760.0, + "grad_norm": 0.05333292784304159, + "language_loss": 0.82116485, + "learning_rate": 5.8621081776155105e-06, + "loss": 0.83176786, + "num_input_tokens_seen": 410425728, + "router_z_loss_mlp": 0.30908203, + "step": 4952, + "time_per_iteration": 2.572510004043579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060149, + "balance_loss_mlp": 1.02896416, + "epoch": 0.9528664871104271, + "flos": 488196039168.0, + "grad_norm": 0.06449124257612378, + "language_loss": 0.80692679, + "learning_rate": 5.814638032609787e-06, + "loss": 0.81752825, + "num_input_tokens_seen": 410496080, + "router_z_loss_mlp": 0.31152344, + "step": 4953, + "time_per_iteration": 2.5699658393859863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062492, + "balance_loss_mlp": 1.03142655, + "epoch": 0.9530588687956907, + "flos": 517464433152.0, + "grad_norm": 0.04487824642282098, + "language_loss": 0.8520484, + "learning_rate": 5.76735974737691e-06, + "loss": 0.86267328, + "num_input_tokens_seen": 410576448, + "router_z_loss_mlp": 0.31030273, + "step": 4954, + "time_per_iteration": 2.7860260009765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059671, + "balance_loss_mlp": 1.02831888, + "epoch": 0.9532512504809542, + "flos": 674833932288.0, + "grad_norm": 0.06088985073975803, + "language_loss": 0.80344075, + "learning_rate": 5.720273340271864e-06, + "loss": 0.81403744, + "num_input_tokens_seen": 410655792, + "router_z_loss_mlp": 0.31323242, + "step": 4955, + "time_per_iteration": 2.83512544631958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106091, + "balance_loss_mlp": 1.02960563, + "epoch": 0.9534436321662177, + "flos": 489269804544.0, + "grad_norm": 0.05399623732044483, + "language_loss": 0.83765268, + "learning_rate": 5.673378829575249e-06, + "loss": 0.84826177, + "num_input_tokens_seen": 410725440, + "router_z_loss_mlp": 0.31274414, + "step": 4956, + "time_per_iteration": 2.5622565746307373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064731, + "balance_loss_mlp": 1.03325951, + "epoch": 0.9536360138514813, + "flos": 496335209472.0, + "grad_norm": 0.05639798333533893, + "language_loss": 0.82038826, + "learning_rate": 5.626676233493167e-06, + "loss": 0.83103555, + "num_input_tokens_seen": 410797552, + "router_z_loss_mlp": 0.31445312, + "step": 4957, + "time_per_iteration": 2.6522533893585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106195, + "balance_loss_mlp": 1.03119373, + "epoch": 0.9538283955367449, + "flos": 801114803712.0, + "grad_norm": 0.052998960329489544, + "language_loss": 0.8405599, + "learning_rate": 5.580165570157114e-06, + "loss": 0.85117936, + "num_input_tokens_seen": 410876736, + "router_z_loss_mlp": 0.30712891, + "step": 4958, + "time_per_iteration": 3.0696020126342773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061291, + "balance_loss_mlp": 1.03003478, + "epoch": 0.9540207772220085, + "flos": 556386693120.0, + "grad_norm": 0.050926837280592614, + "language_loss": 0.79829109, + "learning_rate": 5.533846857624203e-06, + "loss": 0.80890399, + "num_input_tokens_seen": 410955632, + "router_z_loss_mlp": 0.31225586, + "step": 4959, + "time_per_iteration": 2.758847951889038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061461, + "balance_loss_mlp": 1.03044283, + "epoch": 0.954213158907272, + "flos": 684193844736.0, + "grad_norm": 0.056254773205571866, + "language_loss": 0.81409031, + "learning_rate": 5.487720113876882e-06, + "loss": 0.82470489, + "num_input_tokens_seen": 411038480, + "router_z_loss_mlp": 0.30981445, + "step": 4960, + "time_per_iteration": 2.8848278522491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059866, + "balance_loss_mlp": 1.02913427, + "epoch": 0.9544055405925356, + "flos": 535480049664.0, + "grad_norm": 0.06620853899260781, + "language_loss": 0.8256973, + "learning_rate": 5.441785356823214e-06, + "loss": 0.83629596, + "num_input_tokens_seen": 411109744, + "router_z_loss_mlp": 0.30688477, + "step": 4961, + "time_per_iteration": 2.7178971767425537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063811, + "balance_loss_mlp": 1.03272176, + "epoch": 0.9545979222777992, + "flos": 825025955328.0, + "grad_norm": 0.06182369055058564, + "language_loss": 0.80590069, + "learning_rate": 5.3960426042965476e-06, + "loss": 0.81653881, + "num_input_tokens_seen": 411202192, + "router_z_loss_mlp": 0.31054688, + "step": 4962, + "time_per_iteration": 3.109530448913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063297, + "balance_loss_mlp": 1.03232658, + "epoch": 0.9547903039630627, + "flos": 761326801920.0, + "grad_norm": 0.052315265148481546, + "language_loss": 0.77177644, + "learning_rate": 5.3504918740558405e-06, + "loss": 0.78240943, + "num_input_tokens_seen": 411289248, + "router_z_loss_mlp": 0.30932617, + "step": 4963, + "time_per_iteration": 3.0747199058532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064785, + "balance_loss_mlp": 1.03367114, + "epoch": 0.9549826856483262, + "flos": 515050652160.0, + "grad_norm": 0.05862888222606825, + "language_loss": 0.82517004, + "learning_rate": 5.3051331837855045e-06, + "loss": 0.83581787, + "num_input_tokens_seen": 411355232, + "router_z_loss_mlp": 0.31079102, + "step": 4964, + "time_per_iteration": 2.5942893028259277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058928, + "balance_loss_mlp": 1.02857697, + "epoch": 0.9551750673335898, + "flos": 642818515968.0, + "grad_norm": 0.08814451705350242, + "language_loss": 0.82504213, + "learning_rate": 5.259966551095341e-06, + "loss": 0.83563137, + "num_input_tokens_seen": 411432288, + "router_z_loss_mlp": 0.30297852, + "step": 4965, + "time_per_iteration": 2.8214685916900635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106332, + "balance_loss_mlp": 1.03199232, + "epoch": 0.9553674490188534, + "flos": 471967160832.0, + "grad_norm": 0.06036166806665559, + "language_loss": 0.82877362, + "learning_rate": 5.214991993520546e-06, + "loss": 0.83940685, + "num_input_tokens_seen": 411499376, + "router_z_loss_mlp": 0.31298828, + "step": 4966, + "time_per_iteration": 2.5931785106658936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063165, + "balance_loss_mlp": 1.03209853, + "epoch": 0.955559830704117, + "flos": 528064026624.0, + "grad_norm": 0.057495617143447204, + "language_loss": 0.81656486, + "learning_rate": 5.170209528521763e-06, + "loss": 0.82719648, + "num_input_tokens_seen": 411564976, + "router_z_loss_mlp": 0.31030273, + "step": 4967, + "time_per_iteration": 2.5922937393188477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062233, + "balance_loss_mlp": 1.03064275, + "epoch": 0.9557522123893806, + "flos": 547907079168.0, + "grad_norm": 0.05879724431171985, + "language_loss": 0.83928549, + "learning_rate": 5.125619173485196e-06, + "loss": 0.84990788, + "num_input_tokens_seen": 411636464, + "router_z_loss_mlp": 0.31567383, + "step": 4968, + "time_per_iteration": 2.5986125469207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059963, + "balance_loss_mlp": 1.02887332, + "epoch": 0.955944594074644, + "flos": 509201607168.0, + "grad_norm": 0.04811907274650132, + "language_loss": 0.81663108, + "learning_rate": 5.08122094572222e-06, + "loss": 0.82723069, + "num_input_tokens_seen": 411710672, + "router_z_loss_mlp": 0.31054688, + "step": 4969, + "time_per_iteration": 2.6879472732543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062537, + "balance_loss_mlp": 1.03163767, + "epoch": 0.9561369757599076, + "flos": 527297209344.0, + "grad_norm": 0.0632836097408621, + "language_loss": 0.7964825, + "learning_rate": 5.037014862469824e-06, + "loss": 0.80710787, + "num_input_tokens_seen": 411785616, + "router_z_loss_mlp": 0.30859375, + "step": 4970, + "time_per_iteration": 2.7789974212646484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064602, + "balance_loss_mlp": 1.03363097, + "epoch": 0.9563293574451712, + "flos": 497950239744.0, + "grad_norm": 0.05808374909339728, + "language_loss": 0.79714704, + "learning_rate": 4.993000940890391e-06, + "loss": 0.80779302, + "num_input_tokens_seen": 411854832, + "router_z_loss_mlp": 0.30932617, + "step": 4971, + "time_per_iteration": 2.5803020000457764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010642, + "balance_loss_mlp": 1.00253558, + "epoch": 0.9565217391304348, + "flos": 1408160982528.0, + "grad_norm": 0.0036852795244430637, + "language_loss": 0.81773561, + "learning_rate": 4.949179198071585e-06, + "loss": 0.82784206, + "num_input_tokens_seen": 412081856, + "router_z_loss_mlp": 0.08105469, + "step": 4972, + "time_per_iteration": 4.861463785171509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060113, + "balance_loss_mlp": 1.02949977, + "epoch": 0.9567141208156984, + "flos": 503588289024.0, + "grad_norm": 0.0458139454633811, + "language_loss": 0.78067911, + "learning_rate": 4.905549651026464e-06, + "loss": 0.79128021, + "num_input_tokens_seen": 412155600, + "router_z_loss_mlp": 0.30566406, + "step": 4973, + "time_per_iteration": 2.7627005577087402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062012, + "balance_loss_mlp": 1.03063631, + "epoch": 0.9569065025009619, + "flos": 432985264128.0, + "grad_norm": 0.0685337363619886, + "language_loss": 0.79855549, + "learning_rate": 4.86211231669359e-06, + "loss": 0.80917561, + "num_input_tokens_seen": 412219584, + "router_z_loss_mlp": 0.31347656, + "step": 4974, + "time_per_iteration": 2.4670193195343018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061097, + "balance_loss_mlp": 1.03055596, + "epoch": 0.9570988841862255, + "flos": 589662139392.0, + "grad_norm": 0.08497367515573702, + "language_loss": 0.78133345, + "learning_rate": 4.818867211936806e-06, + "loss": 0.79194438, + "num_input_tokens_seen": 412295088, + "router_z_loss_mlp": 0.30493164, + "step": 4975, + "time_per_iteration": 2.7725143432617188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106144, + "balance_loss_mlp": 1.03037453, + "epoch": 0.957291265871489, + "flos": 766938710016.0, + "grad_norm": 0.09684450321681563, + "language_loss": 0.78521204, + "learning_rate": 4.7758143535454045e-06, + "loss": 0.79582649, + "num_input_tokens_seen": 412376992, + "router_z_loss_mlp": 0.31030273, + "step": 4976, + "time_per_iteration": 3.0132195949554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062638, + "balance_loss_mlp": 1.031739, + "epoch": 0.9574836475567526, + "flos": 638820228096.0, + "grad_norm": 0.058082259437561116, + "language_loss": 0.84378779, + "learning_rate": 4.732953758233849e-06, + "loss": 0.85441422, + "num_input_tokens_seen": 412450064, + "router_z_loss_mlp": 0.30883789, + "step": 4977, + "time_per_iteration": 2.775636672973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010609, + "balance_loss_mlp": 1.00250316, + "epoch": 0.9576760292420161, + "flos": 1575077916672.0, + "grad_norm": 0.0036801546071174532, + "language_loss": 0.78607261, + "learning_rate": 4.690285442642272e-06, + "loss": 0.7961787, + "num_input_tokens_seen": 412676896, + "router_z_loss_mlp": 0.08105469, + "step": 4978, + "time_per_iteration": 4.957335710525513 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064079, + "balance_loss_mlp": 1.03224993, + "epoch": 0.9578684109272797, + "flos": 496089308160.0, + "grad_norm": 0.05585876202637502, + "language_loss": 0.86942095, + "learning_rate": 4.6478094233358695e-06, + "loss": 0.88006175, + "num_input_tokens_seen": 412746848, + "router_z_loss_mlp": 0.31811523, + "step": 4979, + "time_per_iteration": 2.6361520290374756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063178, + "balance_loss_mlp": 1.03173101, + "epoch": 0.9580607926125433, + "flos": 429730472448.0, + "grad_norm": 0.06248863095156401, + "language_loss": 0.84962738, + "learning_rate": 4.605525716805337e-06, + "loss": 0.86025918, + "num_input_tokens_seen": 412810144, + "router_z_loss_mlp": 0.31420898, + "step": 4980, + "time_per_iteration": 2.473877191543579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062381, + "balance_loss_mlp": 1.03129125, + "epoch": 0.9582531742978069, + "flos": 1126796659200.0, + "grad_norm": 0.05496423205034748, + "language_loss": 0.79914278, + "learning_rate": 4.563434339466599e-06, + "loss": 0.80976653, + "num_input_tokens_seen": 412904768, + "router_z_loss_mlp": 0.31054688, + "step": 4981, + "time_per_iteration": 3.5336828231811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062471, + "balance_loss_mlp": 1.03061819, + "epoch": 0.9584455559830705, + "flos": 524185012224.0, + "grad_norm": 0.049558760655255135, + "language_loss": 0.78986633, + "learning_rate": 4.521535307661085e-06, + "loss": 0.80049098, + "num_input_tokens_seen": 412974592, + "router_z_loss_mlp": 0.31835938, + "step": 4982, + "time_per_iteration": 2.6623659133911133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063285, + "balance_loss_mlp": 1.03198099, + "epoch": 0.9586379376683339, + "flos": 633873240576.0, + "grad_norm": 0.05452483002568014, + "language_loss": 0.80709702, + "learning_rate": 4.479828637655392e-06, + "loss": 0.81772989, + "num_input_tokens_seen": 413052848, + "router_z_loss_mlp": 0.31274414, + "step": 4983, + "time_per_iteration": 2.8733677864074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106139, + "balance_loss_mlp": 1.03025281, + "epoch": 0.9588303193535975, + "flos": 415831007232.0, + "grad_norm": 0.05618867456801452, + "language_loss": 0.83768463, + "learning_rate": 4.438314345641459e-06, + "loss": 0.84829855, + "num_input_tokens_seen": 413118000, + "router_z_loss_mlp": 0.31103516, + "step": 4984, + "time_per_iteration": 2.485130548477173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061229, + "balance_loss_mlp": 1.03006768, + "epoch": 0.9590227010388611, + "flos": 481440554496.0, + "grad_norm": 0.05863368736289354, + "language_loss": 0.78077298, + "learning_rate": 4.3969924477365585e-06, + "loss": 0.79138523, + "num_input_tokens_seen": 413185616, + "router_z_loss_mlp": 0.3112793, + "step": 4985, + "time_per_iteration": 2.5876057147979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060016, + "balance_loss_mlp": 1.02904499, + "epoch": 0.9592150827241247, + "flos": 684214193664.0, + "grad_norm": 0.05594536844017473, + "language_loss": 0.80234873, + "learning_rate": 4.355862959983359e-06, + "loss": 0.81294882, + "num_input_tokens_seen": 413265616, + "router_z_loss_mlp": 0.30932617, + "step": 4986, + "time_per_iteration": 2.9769937992095947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058834, + "balance_loss_mlp": 1.02800655, + "epoch": 0.9594074644093882, + "flos": 574205870592.0, + "grad_norm": 0.05549029707808997, + "language_loss": 0.70972621, + "learning_rate": 4.314925898349642e-06, + "loss": 0.72031456, + "num_input_tokens_seen": 413341248, + "router_z_loss_mlp": 0.30786133, + "step": 4987, + "time_per_iteration": 2.7206904888153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059627, + "balance_loss_mlp": 1.02856088, + "epoch": 0.9595998460946518, + "flos": 546593204736.0, + "grad_norm": 0.06100041490887349, + "language_loss": 0.7789138, + "learning_rate": 4.2741812787286395e-06, + "loss": 0.78951007, + "num_input_tokens_seen": 413416080, + "router_z_loss_mlp": 0.31030273, + "step": 4988, + "time_per_iteration": 2.7511510848999023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061435, + "balance_loss_mlp": 1.02998781, + "epoch": 0.9597922277799154, + "flos": 473798979072.0, + "grad_norm": 0.06528513984211147, + "language_loss": 0.78195035, + "learning_rate": 4.233629116938809e-06, + "loss": 0.79256475, + "num_input_tokens_seen": 413482336, + "router_z_loss_mlp": 0.31420898, + "step": 4989, + "time_per_iteration": 2.5284314155578613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106279, + "balance_loss_mlp": 1.03122306, + "epoch": 0.9599846094651789, + "flos": 514435193856.0, + "grad_norm": 0.0526648321296654, + "language_loss": 0.85647953, + "learning_rate": 4.193269428723889e-06, + "loss": 0.86710739, + "num_input_tokens_seen": 413553248, + "router_z_loss_mlp": 0.31542969, + "step": 4990, + "time_per_iteration": 2.629483938217163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063356, + "balance_loss_mlp": 1.03233767, + "epoch": 0.9601769911504425, + "flos": 594689112576.0, + "grad_norm": 0.06076750451887931, + "language_loss": 0.7845335, + "learning_rate": 4.1531022297529035e-06, + "loss": 0.79516703, + "num_input_tokens_seen": 413625776, + "router_z_loss_mlp": 0.30981445, + "step": 4991, + "time_per_iteration": 2.767305850982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064329, + "balance_loss_mlp": 1.03381109, + "epoch": 0.960369372835706, + "flos": 492755940864.0, + "grad_norm": 0.043940554516399624, + "language_loss": 0.7891221, + "learning_rate": 4.1131275356201536e-06, + "loss": 0.79976535, + "num_input_tokens_seen": 413693056, + "router_z_loss_mlp": 0.3046875, + "step": 4992, + "time_per_iteration": 2.632108211517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059952, + "balance_loss_mlp": 1.02862382, + "epoch": 0.9605617545209696, + "flos": 579016055808.0, + "grad_norm": 0.05380542544477012, + "language_loss": 0.82685798, + "learning_rate": 4.073345361845171e-06, + "loss": 0.83745754, + "num_input_tokens_seen": 413765616, + "router_z_loss_mlp": 0.31298828, + "step": 4993, + "time_per_iteration": 2.7149124145507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059008, + "balance_loss_mlp": 1.0279901, + "epoch": 0.9607541362062332, + "flos": 927312717312.0, + "grad_norm": 0.052236302750863224, + "language_loss": 0.86238164, + "learning_rate": 4.033755723872767e-06, + "loss": 0.87297165, + "num_input_tokens_seen": 413850976, + "router_z_loss_mlp": 0.30981445, + "step": 4994, + "time_per_iteration": 3.2594830989837646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064119, + "balance_loss_mlp": 1.03255212, + "epoch": 0.9609465178914968, + "flos": 572832359424.0, + "grad_norm": 0.052246235549541574, + "language_loss": 0.75361091, + "learning_rate": 3.994358637073036e-06, + "loss": 0.76425207, + "num_input_tokens_seen": 413931648, + "router_z_loss_mlp": 0.31542969, + "step": 4995, + "time_per_iteration": 2.821571111679077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058679, + "balance_loss_mlp": 1.02792275, + "epoch": 0.9611388995767602, + "flos": 530585496576.0, + "grad_norm": 0.05476477352747421, + "language_loss": 0.85442054, + "learning_rate": 3.955154116741244e-06, + "loss": 0.86500728, + "num_input_tokens_seen": 414003216, + "router_z_loss_mlp": 0.30712891, + "step": 4996, + "time_per_iteration": 2.6323540210723877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057031, + "balance_loss_mlp": 1.0260129, + "epoch": 0.9613312812620238, + "flos": 645959826432.0, + "grad_norm": 0.05351133483249604, + "language_loss": 0.81781733, + "learning_rate": 3.916142178097881e-06, + "loss": 0.82838762, + "num_input_tokens_seen": 414077072, + "router_z_loss_mlp": 0.30981445, + "step": 4997, + "time_per_iteration": 2.7790870666503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106076, + "balance_loss_mlp": 1.03000379, + "epoch": 0.9615236629472874, + "flos": 495897251328.0, + "grad_norm": 0.05497898796109047, + "language_loss": 0.77663916, + "learning_rate": 3.877322836288888e-06, + "loss": 0.78724676, + "num_input_tokens_seen": 414157600, + "router_z_loss_mlp": 0.30712891, + "step": 4998, + "time_per_iteration": 2.888197183609009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063994, + "balance_loss_mlp": 1.03237975, + "epoch": 0.961716044632551, + "flos": 512716856832.0, + "grad_norm": 0.050530888251375694, + "language_loss": 0.75301838, + "learning_rate": 3.838696106385153e-06, + "loss": 0.76365829, + "num_input_tokens_seen": 414224880, + "router_z_loss_mlp": 0.31591797, + "step": 4999, + "time_per_iteration": 2.595407009124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065785, + "balance_loss_mlp": 1.03431368, + "epoch": 0.9619084263178146, + "flos": 500835474432.0, + "grad_norm": 0.07679880414039399, + "language_loss": 0.80363154, + "learning_rate": 3.800262003382904e-06, + "loss": 0.81428945, + "num_input_tokens_seen": 414291728, + "router_z_loss_mlp": 0.31445312, + "step": 5000, + "time_per_iteration": 2.579832077026367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106128, + "balance_loss_mlp": 1.02935529, + "epoch": 0.9621008080030781, + "flos": 595343858688.0, + "grad_norm": 0.060590971935696514, + "language_loss": 0.74907798, + "learning_rate": 3.7620205422035923e-06, + "loss": 0.75969076, + "num_input_tokens_seen": 414369568, + "router_z_loss_mlp": 0.3190918, + "step": 5001, + "time_per_iteration": 2.754000425338745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061748, + "balance_loss_mlp": 1.0305872, + "epoch": 0.9622931896883417, + "flos": 502002372096.0, + "grad_norm": 0.05936854573228492, + "language_loss": 0.8188566, + "learning_rate": 3.723971737693899e-06, + "loss": 0.82947409, + "num_input_tokens_seen": 414441424, + "router_z_loss_mlp": 0.3112793, + "step": 5002, + "time_per_iteration": 2.609647274017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059702, + "balance_loss_mlp": 1.02827847, + "epoch": 0.9624855713736052, + "flos": 606998278656.0, + "grad_norm": 0.05580409062839421, + "language_loss": 0.80881554, + "learning_rate": 3.6861156046256728e-06, + "loss": 0.81941253, + "num_input_tokens_seen": 414512960, + "router_z_loss_mlp": 0.31396484, + "step": 5003, + "time_per_iteration": 2.761650800704956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061746, + "balance_loss_mlp": 1.03075206, + "epoch": 0.9626779530588688, + "flos": 510461637120.0, + "grad_norm": 0.0549718452181321, + "language_loss": 0.8447454, + "learning_rate": 3.648452157695936e-06, + "loss": 0.85536283, + "num_input_tokens_seen": 414577392, + "router_z_loss_mlp": 0.30957031, + "step": 5004, + "time_per_iteration": 2.553931951522827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060462, + "balance_loss_mlp": 1.02946782, + "epoch": 0.9628703347441323, + "flos": 626994100224.0, + "grad_norm": 0.05616371519000272, + "language_loss": 0.8239938, + "learning_rate": 3.610981411526937e-06, + "loss": 0.83459842, + "num_input_tokens_seen": 414655152, + "router_z_loss_mlp": 0.30957031, + "step": 5005, + "time_per_iteration": 2.8171026706695557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060597, + "balance_loss_mlp": 1.02938795, + "epoch": 0.9630627164293959, + "flos": 630474444288.0, + "grad_norm": 0.057826844797486406, + "language_loss": 0.77271199, + "learning_rate": 3.573703380666149e-06, + "loss": 0.78331804, + "num_input_tokens_seen": 414730432, + "router_z_loss_mlp": 0.31176758, + "step": 5006, + "time_per_iteration": 2.7363760471343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064818, + "balance_loss_mlp": 1.03368068, + "epoch": 0.9632550981146595, + "flos": 570267219456.0, + "grad_norm": 0.049148732947391416, + "language_loss": 0.78805315, + "learning_rate": 3.5366180795861622e-06, + "loss": 0.79870129, + "num_input_tokens_seen": 414810688, + "router_z_loss_mlp": 0.31103516, + "step": 5007, + "time_per_iteration": 2.798482894897461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062148, + "balance_loss_mlp": 1.03000939, + "epoch": 0.9634474797999231, + "flos": 465857657856.0, + "grad_norm": 0.061293995921825266, + "language_loss": 0.80628145, + "learning_rate": 3.4997255226847937e-06, + "loss": 0.81690294, + "num_input_tokens_seen": 414880544, + "router_z_loss_mlp": 0.32128906, + "step": 5008, + "time_per_iteration": 2.642397403717041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106517, + "balance_loss_mlp": 1.03365088, + "epoch": 0.9636398614851867, + "flos": 526345689600.0, + "grad_norm": 0.06183115681843797, + "language_loss": 0.85305095, + "learning_rate": 3.463025724284974e-06, + "loss": 0.86370265, + "num_input_tokens_seen": 414949920, + "router_z_loss_mlp": 0.31494141, + "step": 5009, + "time_per_iteration": 4.074778079986572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060705, + "balance_loss_mlp": 1.03002095, + "epoch": 0.9638322431704501, + "flos": 564554976768.0, + "grad_norm": 0.05273989252047438, + "language_loss": 0.75239956, + "learning_rate": 3.4265186986348618e-06, + "loss": 0.76300663, + "num_input_tokens_seen": 415024288, + "router_z_loss_mlp": 0.30639648, + "step": 5010, + "time_per_iteration": 2.758338689804077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062801, + "balance_loss_mlp": 1.03113854, + "epoch": 0.9640246248557137, + "flos": 477531016704.0, + "grad_norm": 0.05564071805678616, + "language_loss": 0.84424335, + "learning_rate": 3.3902044599076754e-06, + "loss": 0.85487133, + "num_input_tokens_seen": 415092032, + "router_z_loss_mlp": 0.31640625, + "step": 5011, + "time_per_iteration": 2.572166919708252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061468, + "balance_loss_mlp": 1.03083074, + "epoch": 0.9642170065409773, + "flos": 539063700480.0, + "grad_norm": 0.05146867217669957, + "language_loss": 0.88495445, + "learning_rate": 3.354083022201859e-06, + "loss": 0.89556915, + "num_input_tokens_seen": 415158544, + "router_z_loss_mlp": 0.3059082, + "step": 5012, + "time_per_iteration": 2.6278939247131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060669, + "balance_loss_mlp": 1.02934122, + "epoch": 0.9644093882262409, + "flos": 523499742720.0, + "grad_norm": 0.053579427527373706, + "language_loss": 0.8370012, + "learning_rate": 3.3181543995410843e-06, + "loss": 0.84760791, + "num_input_tokens_seen": 415225088, + "router_z_loss_mlp": 0.31298828, + "step": 5013, + "time_per_iteration": 2.619873523712158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061188, + "balance_loss_mlp": 1.0306704, + "epoch": 0.9646017699115044, + "flos": 574018195968.0, + "grad_norm": 0.05027392216499071, + "language_loss": 0.78493142, + "learning_rate": 3.2824186058740268e-06, + "loss": 0.79554331, + "num_input_tokens_seen": 415300224, + "router_z_loss_mlp": 0.30493164, + "step": 5014, + "time_per_iteration": 2.701974630355835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105829, + "balance_loss_mlp": 1.02779675, + "epoch": 0.964794151596768, + "flos": 636511163904.0, + "grad_norm": 0.05861201015581005, + "language_loss": 0.84411347, + "learning_rate": 3.246875655074588e-06, + "loss": 0.85469639, + "num_input_tokens_seen": 415368784, + "router_z_loss_mlp": 0.30444336, + "step": 5015, + "time_per_iteration": 2.7228355407714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062943, + "balance_loss_mlp": 1.03218722, + "epoch": 0.9649865332820315, + "flos": 617155531776.0, + "grad_norm": 0.06123490133092928, + "language_loss": 0.86001122, + "learning_rate": 3.211525560941675e-06, + "loss": 0.87064075, + "num_input_tokens_seen": 415440752, + "router_z_loss_mlp": 0.30712891, + "step": 5016, + "time_per_iteration": 2.718139171600342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106481, + "balance_loss_mlp": 1.03317225, + "epoch": 0.9651789149672951, + "flos": 515898865152.0, + "grad_norm": 0.05141317284262244, + "language_loss": 0.80746591, + "learning_rate": 3.1763683371994754e-06, + "loss": 0.81811404, + "num_input_tokens_seen": 415516128, + "router_z_loss_mlp": 0.31616211, + "step": 5017, + "time_per_iteration": 2.7883141040802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059591, + "balance_loss_mlp": 1.0283339, + "epoch": 0.9653712966525587, + "flos": 492696304128.0, + "grad_norm": 0.05540130147565109, + "language_loss": 0.79782176, + "learning_rate": 3.1414039974972385e-06, + "loss": 0.80841768, + "num_input_tokens_seen": 415583744, + "router_z_loss_mlp": 0.31225586, + "step": 5018, + "time_per_iteration": 2.5674331188201904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059766, + "balance_loss_mlp": 1.02824712, + "epoch": 0.9655636783378222, + "flos": 536287564800.0, + "grad_norm": 0.0402183906982743, + "language_loss": 0.82745731, + "learning_rate": 3.106632555409328e-06, + "loss": 0.83805501, + "num_input_tokens_seen": 415659856, + "router_z_loss_mlp": 0.31494141, + "step": 5019, + "time_per_iteration": 2.816701650619507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057217, + "balance_loss_mlp": 1.02615166, + "epoch": 0.9657560600230858, + "flos": 458790842880.0, + "grad_norm": 0.05354585998337552, + "language_loss": 0.82026023, + "learning_rate": 3.072054024435167e-06, + "loss": 0.83083236, + "num_input_tokens_seen": 415731792, + "router_z_loss_mlp": 0.31030273, + "step": 5020, + "time_per_iteration": 2.6295535564422607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059918, + "balance_loss_mlp": 1.02970994, + "epoch": 0.9659484417083494, + "flos": 685877276160.0, + "grad_norm": 0.06971172530491482, + "language_loss": 0.83589661, + "learning_rate": 3.0376684179994064e-06, + "loss": 0.84649581, + "num_input_tokens_seen": 415809536, + "router_z_loss_mlp": 0.30151367, + "step": 5021, + "time_per_iteration": 2.790837049484253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101064, + "balance_loss_mlp": 1.00253367, + "epoch": 0.966140823393613, + "flos": 1501503879168.0, + "grad_norm": 0.0036733761011580493, + "language_loss": 0.80694246, + "learning_rate": 3.0034757494516453e-06, + "loss": 0.81704885, + "num_input_tokens_seen": 416027600, + "router_z_loss_mlp": 0.08105469, + "step": 5022, + "time_per_iteration": 4.662962198257446 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061518, + "balance_loss_mlp": 1.03085709, + "epoch": 0.9663332050788765, + "flos": 464660236800.0, + "grad_norm": 0.07115272426240939, + "language_loss": 0.80814624, + "learning_rate": 2.9694760320667093e-06, + "loss": 0.81876141, + "num_input_tokens_seen": 416096128, + "router_z_loss_mlp": 0.30615234, + "step": 5023, + "time_per_iteration": 2.6125125885009766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058747, + "balance_loss_mlp": 1.02787185, + "epoch": 0.96652558676414, + "flos": 500575016448.0, + "grad_norm": 0.05166050311684865, + "language_loss": 0.85474747, + "learning_rate": 2.9356692790444283e-06, + "loss": 0.86533493, + "num_input_tokens_seen": 416164256, + "router_z_loss_mlp": 0.30834961, + "step": 5024, + "time_per_iteration": 2.638561487197876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062477, + "balance_loss_mlp": 1.03222132, + "epoch": 0.9667179684494036, + "flos": 424614749184.0, + "grad_norm": 0.10681545927260369, + "language_loss": 0.82711923, + "learning_rate": 2.9020555035097484e-06, + "loss": 0.837744, + "num_input_tokens_seen": 416227296, + "router_z_loss_mlp": 0.30200195, + "step": 5025, + "time_per_iteration": 2.499992609024048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060102, + "balance_loss_mlp": 1.02946556, + "epoch": 0.9669103501346672, + "flos": 516744258048.0, + "grad_norm": 0.047288791646070916, + "language_loss": 0.8577379, + "learning_rate": 2.8686347185127305e-06, + "loss": 0.86833894, + "num_input_tokens_seen": 416297184, + "router_z_loss_mlp": 0.3059082, + "step": 5026, + "time_per_iteration": 2.684466600418091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064062, + "balance_loss_mlp": 1.03242362, + "epoch": 0.9671027318199308, + "flos": 456008914944.0, + "grad_norm": 0.060249957140447924, + "language_loss": 0.75349987, + "learning_rate": 2.8354069370284396e-06, + "loss": 0.76414049, + "num_input_tokens_seen": 416363056, + "router_z_loss_mlp": 0.31616211, + "step": 5027, + "time_per_iteration": 2.581566572189331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062603, + "balance_loss_mlp": 1.03151333, + "epoch": 0.9672951135051943, + "flos": 524809234944.0, + "grad_norm": 0.07736308326368693, + "language_loss": 0.80242217, + "learning_rate": 2.802372171957057e-06, + "loss": 0.81304818, + "num_input_tokens_seen": 416430688, + "router_z_loss_mlp": 0.31054688, + "step": 5028, + "time_per_iteration": 2.620943546295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062615, + "balance_loss_mlp": 1.0313108, + "epoch": 0.9674874951904578, + "flos": 573708275712.0, + "grad_norm": 0.05924446990021998, + "language_loss": 0.79946339, + "learning_rate": 2.7695304361237682e-06, + "loss": 0.81008953, + "num_input_tokens_seen": 416505248, + "router_z_loss_mlp": 0.31274414, + "step": 5029, + "time_per_iteration": 2.7776339054107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106092, + "balance_loss_mlp": 1.02990174, + "epoch": 0.9676798768757214, + "flos": 628875380736.0, + "grad_norm": 0.0401751924772168, + "language_loss": 0.79843652, + "learning_rate": 2.7368817422789848e-06, + "loss": 0.80904567, + "num_input_tokens_seen": 416592640, + "router_z_loss_mlp": 0.30981445, + "step": 5030, + "time_per_iteration": 2.9464609622955322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011242, + "balance_loss_mlp": 1.00313604, + "epoch": 0.967872258560985, + "flos": 1463074831872.0, + "grad_norm": 0.0037536728734851557, + "language_loss": 0.75563359, + "learning_rate": 2.7044261030979566e-06, + "loss": 0.765746, + "num_input_tokens_seen": 416808560, + "router_z_loss_mlp": 0.08105469, + "step": 5031, + "time_per_iteration": 4.694348573684692 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106361, + "balance_loss_mlp": 1.03314054, + "epoch": 0.9680646402462486, + "flos": 565238836224.0, + "grad_norm": 0.06560457611419962, + "language_loss": 0.79323578, + "learning_rate": 2.672163531181049e-06, + "loss": 0.80387187, + "num_input_tokens_seen": 416878208, + "router_z_loss_mlp": 0.30444336, + "step": 5032, + "time_per_iteration": 2.663724184036255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101124, + "balance_loss_mlp": 1.00313365, + "epoch": 0.9682570219315121, + "flos": 1433669635584.0, + "grad_norm": 0.0037552357874484797, + "language_loss": 0.78074801, + "learning_rate": 2.6400940390537976e-06, + "loss": 0.79086041, + "num_input_tokens_seen": 417105968, + "router_z_loss_mlp": 0.08105469, + "step": 5033, + "time_per_iteration": 4.814163446426392 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106369, + "balance_loss_mlp": 1.03186107, + "epoch": 0.9684494036167757, + "flos": 584338392576.0, + "grad_norm": 0.07564514705526598, + "language_loss": 0.81710398, + "learning_rate": 2.608217639166688e-06, + "loss": 0.82774091, + "num_input_tokens_seen": 417175168, + "router_z_loss_mlp": 0.31811523, + "step": 5034, + "time_per_iteration": 2.738064765930176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059011, + "balance_loss_mlp": 1.02849364, + "epoch": 0.9686417853020393, + "flos": 558784507392.0, + "grad_norm": 0.051069776715125234, + "language_loss": 0.83945799, + "learning_rate": 2.5765343438950982e-06, + "loss": 0.85004807, + "num_input_tokens_seen": 417247760, + "router_z_loss_mlp": 0.3046875, + "step": 5035, + "time_per_iteration": 2.717803716659546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062645, + "balance_loss_mlp": 1.03143644, + "epoch": 0.9688341669873028, + "flos": 784594944000.0, + "grad_norm": 0.06911477440096066, + "language_loss": 0.83235759, + "learning_rate": 2.545044165539745e-06, + "loss": 0.84298402, + "num_input_tokens_seen": 417324080, + "router_z_loss_mlp": 0.31176758, + "step": 5036, + "time_per_iteration": 2.9943130016326904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106053, + "balance_loss_mlp": 1.02903473, + "epoch": 0.9690265486725663, + "flos": 395682416640.0, + "grad_norm": 0.0635388023408017, + "language_loss": 0.79152626, + "learning_rate": 2.513747116326126e-06, + "loss": 0.80213153, + "num_input_tokens_seen": 417386416, + "router_z_loss_mlp": 0.31469727, + "step": 5037, + "time_per_iteration": 2.5086729526519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061541, + "balance_loss_mlp": 1.03090429, + "epoch": 0.9692189303578299, + "flos": 476113835520.0, + "grad_norm": 0.05740046745509432, + "language_loss": 0.77356291, + "learning_rate": 2.4826432084048002e-06, + "loss": 0.78417832, + "num_input_tokens_seen": 417459648, + "router_z_loss_mlp": 0.3059082, + "step": 5038, + "time_per_iteration": 2.705591917037964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059038, + "balance_loss_mlp": 1.02885425, + "epoch": 0.9694113120430935, + "flos": 597297922560.0, + "grad_norm": 0.05798206678897628, + "language_loss": 0.78522074, + "learning_rate": 2.451732453851385e-06, + "loss": 0.79581112, + "num_input_tokens_seen": 417530512, + "router_z_loss_mlp": 0.30126953, + "step": 5039, + "time_per_iteration": 2.6998066902160645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059417, + "balance_loss_mlp": 1.02892351, + "epoch": 0.9696036937283571, + "flos": 500628860928.0, + "grad_norm": 0.04971001407251187, + "language_loss": 0.82436031, + "learning_rate": 2.4210148646665598e-06, + "loss": 0.8349545, + "num_input_tokens_seen": 417597600, + "router_z_loss_mlp": 0.30444336, + "step": 5040, + "time_per_iteration": 2.585838556289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062586, + "balance_loss_mlp": 1.0316397, + "epoch": 0.9697960754136207, + "flos": 432049711104.0, + "grad_norm": 0.06475972731655387, + "language_loss": 0.8689853, + "learning_rate": 2.3904904527758952e-06, + "loss": 0.87961119, + "num_input_tokens_seen": 417659616, + "router_z_loss_mlp": 0.30908203, + "step": 5041, + "time_per_iteration": 2.440291166305542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060775, + "balance_loss_mlp": 1.02982795, + "epoch": 0.9699884570988841, + "flos": 568257901056.0, + "grad_norm": 0.04793585880961143, + "language_loss": 0.85172904, + "learning_rate": 2.3601592300300235e-06, + "loss": 0.86233675, + "num_input_tokens_seen": 417730896, + "router_z_loss_mlp": 0.30908203, + "step": 5042, + "time_per_iteration": 2.706629991531372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064477, + "balance_loss_mlp": 1.03293455, + "epoch": 0.9701808387841477, + "flos": 515961474048.0, + "grad_norm": 0.054405736603249856, + "language_loss": 0.81407428, + "learning_rate": 2.33002120820458e-06, + "loss": 0.82471901, + "num_input_tokens_seen": 417803296, + "router_z_loss_mlp": 0.31518555, + "step": 5043, + "time_per_iteration": 2.6756155490875244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106029, + "balance_loss_mlp": 1.02970135, + "epoch": 0.9703732204694113, + "flos": 491273330688.0, + "grad_norm": 0.0672507732770155, + "language_loss": 0.76003706, + "learning_rate": 2.300076399000206e-06, + "loss": 0.77063996, + "num_input_tokens_seen": 417870208, + "router_z_loss_mlp": 0.30541992, + "step": 5044, + "time_per_iteration": 2.5917348861694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061641, + "balance_loss_mlp": 1.0302887, + "epoch": 0.9705656021546749, + "flos": 625831584768.0, + "grad_norm": 0.05433746286859287, + "language_loss": 0.80137366, + "learning_rate": 2.2703248140424348e-06, + "loss": 0.81199008, + "num_input_tokens_seen": 417944464, + "router_z_loss_mlp": 0.31323242, + "step": 5045, + "time_per_iteration": 2.808703899383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106209, + "balance_loss_mlp": 1.0320015, + "epoch": 0.9707579838399384, + "flos": 471198933504.0, + "grad_norm": 0.054204076995614914, + "language_loss": 0.82907468, + "learning_rate": 2.2407664648819715e-06, + "loss": 0.83969557, + "num_input_tokens_seen": 418010480, + "router_z_loss_mlp": 0.30029297, + "step": 5046, + "time_per_iteration": 2.595574140548706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063429, + "balance_loss_mlp": 1.03255379, + "epoch": 0.970950365525202, + "flos": 491845118976.0, + "grad_norm": 0.05790005154915511, + "language_loss": 0.80455661, + "learning_rate": 2.2114013629942475e-06, + "loss": 0.81519091, + "num_input_tokens_seen": 418083952, + "router_z_loss_mlp": 0.30834961, + "step": 5047, + "time_per_iteration": 2.6507327556610107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060135, + "balance_loss_mlp": 1.02894998, + "epoch": 0.9711427472104656, + "flos": 557060378112.0, + "grad_norm": 0.07608957719483044, + "language_loss": 0.80362004, + "learning_rate": 2.1822295197799213e-06, + "loss": 0.81422138, + "num_input_tokens_seen": 418156672, + "router_z_loss_mlp": 0.31152344, + "step": 5048, + "time_per_iteration": 2.72220778465271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059825, + "balance_loss_mlp": 1.02973652, + "epoch": 0.9713351288957291, + "flos": 625527456768.0, + "grad_norm": 0.04573208991369369, + "language_loss": 0.83717644, + "learning_rate": 2.153250946564489e-06, + "loss": 0.84777468, + "num_input_tokens_seen": 418242160, + "router_z_loss_mlp": 0.30029297, + "step": 5049, + "time_per_iteration": 2.9444804191589355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062024, + "balance_loss_mlp": 1.03064787, + "epoch": 0.9715275105809927, + "flos": 498821773824.0, + "grad_norm": 0.05121797008138963, + "language_loss": 0.80890942, + "learning_rate": 2.1244656545983397e-06, + "loss": 0.81952965, + "num_input_tokens_seen": 418316960, + "router_z_loss_mlp": 0.31347656, + "step": 5050, + "time_per_iteration": 2.765965461730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062944, + "balance_loss_mlp": 1.03149652, + "epoch": 0.9717198922662562, + "flos": 477274940928.0, + "grad_norm": 0.0759374256858869, + "language_loss": 0.77641714, + "learning_rate": 2.0958736550570345e-06, + "loss": 0.78704655, + "num_input_tokens_seen": 418383888, + "router_z_loss_mlp": 0.31420898, + "step": 5051, + "time_per_iteration": 2.534787178039551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058864, + "balance_loss_mlp": 1.02794087, + "epoch": 0.9719122739515198, + "flos": 553171189248.0, + "grad_norm": 0.04369700859439487, + "language_loss": 0.78430104, + "learning_rate": 2.067474959040916e-06, + "loss": 0.79488969, + "num_input_tokens_seen": 418453776, + "router_z_loss_mlp": 0.30883789, + "step": 5052, + "time_per_iteration": 2.6652493476867676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063725, + "balance_loss_mlp": 1.03237319, + "epoch": 0.9721046556367834, + "flos": 565583662080.0, + "grad_norm": 0.05627450979959505, + "language_loss": 0.80065435, + "learning_rate": 2.0392695775753312e-06, + "loss": 0.81129158, + "num_input_tokens_seen": 418521984, + "router_z_loss_mlp": 0.31323242, + "step": 5053, + "time_per_iteration": 2.645796537399292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061537, + "balance_loss_mlp": 1.03051889, + "epoch": 0.972297037322047, + "flos": 560044537344.0, + "grad_norm": 0.056018858365713145, + "language_loss": 0.78160405, + "learning_rate": 2.0112575216105766e-06, + "loss": 0.7922194, + "num_input_tokens_seen": 418598768, + "router_z_loss_mlp": 0.30981445, + "step": 5054, + "time_per_iteration": 2.7389419078826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063362, + "balance_loss_mlp": 1.0327971, + "epoch": 0.9724894190073105, + "flos": 512175591936.0, + "grad_norm": 0.056389892105133975, + "language_loss": 0.78999579, + "learning_rate": 1.9834388020218974e-06, + "loss": 0.80062938, + "num_input_tokens_seen": 418670064, + "router_z_loss_mlp": 0.30517578, + "step": 5055, + "time_per_iteration": 2.677356719970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061934, + "balance_loss_mlp": 1.0307492, + "epoch": 0.972681800692574, + "flos": 613532593152.0, + "grad_norm": 0.05719839880369088, + "language_loss": 0.80146527, + "learning_rate": 1.9558134296094875e-06, + "loss": 0.81208467, + "num_input_tokens_seen": 418745216, + "router_z_loss_mlp": 0.31152344, + "step": 5056, + "time_per_iteration": 2.780590295791626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059791, + "balance_loss_mlp": 1.02882087, + "epoch": 0.9728741823778376, + "flos": 833562385920.0, + "grad_norm": 0.04737798788694853, + "language_loss": 0.83823931, + "learning_rate": 1.92838141509849e-06, + "loss": 0.84883726, + "num_input_tokens_seen": 418824224, + "router_z_loss_mlp": 0.30932617, + "step": 5057, + "time_per_iteration": 3.111090660095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062986, + "balance_loss_mlp": 1.03118145, + "epoch": 0.9730665640631012, + "flos": 571167866880.0, + "grad_norm": 0.06372320617901352, + "language_loss": 0.84056395, + "learning_rate": 1.9011427691389415e-06, + "loss": 0.85119379, + "num_input_tokens_seen": 418899712, + "router_z_loss_mlp": 0.31787109, + "step": 5058, + "time_per_iteration": 2.716362237930298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062442, + "balance_loss_mlp": 1.03116202, + "epoch": 0.9732589457483648, + "flos": 506271292416.0, + "grad_norm": 0.049152595370916714, + "language_loss": 0.77131605, + "learning_rate": 1.8740975023057715e-06, + "loss": 0.78194046, + "num_input_tokens_seen": 418964912, + "router_z_loss_mlp": 0.31274414, + "step": 5059, + "time_per_iteration": 2.567103624343872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105856, + "balance_loss_mlp": 1.02799499, + "epoch": 0.9734513274336283, + "flos": 926602716672.0, + "grad_norm": 0.04959205985991603, + "language_loss": 0.80284786, + "learning_rate": 1.84724562509897e-06, + "loss": 0.81343341, + "num_input_tokens_seen": 419040032, + "router_z_loss_mlp": 0.30517578, + "step": 5060, + "time_per_iteration": 3.1037087440490723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061035, + "balance_loss_mlp": 1.03011179, + "epoch": 0.9736437091188919, + "flos": 491682175488.0, + "grad_norm": 0.053469499820537766, + "language_loss": 0.77895665, + "learning_rate": 1.8205871479433089e-06, + "loss": 0.78956699, + "num_input_tokens_seen": 419112672, + "router_z_loss_mlp": 0.30883789, + "step": 5061, + "time_per_iteration": 2.7472124099731445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106441, + "balance_loss_mlp": 1.03355861, + "epoch": 0.9738360908041555, + "flos": 613039380480.0, + "grad_norm": 0.05773032133011401, + "language_loss": 0.83614433, + "learning_rate": 1.7941220811885096e-06, + "loss": 0.84678841, + "num_input_tokens_seen": 419183408, + "router_z_loss_mlp": 0.30810547, + "step": 5062, + "time_per_iteration": 2.727924108505249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011241, + "balance_loss_mlp": 1.00313449, + "epoch": 0.974028472489419, + "flos": 1548771922944.0, + "grad_norm": 0.0037481101456261563, + "language_loss": 0.75992095, + "learning_rate": 1.7678504351092972e-06, + "loss": 0.77003336, + "num_input_tokens_seen": 419415472, + "router_z_loss_mlp": 0.08105469, + "step": 5063, + "time_per_iteration": 5.0528404712677 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011241, + "balance_loss_mlp": 1.0031352, + "epoch": 0.9742208541746825, + "flos": 1410403055616.0, + "grad_norm": 0.0037495136804295438, + "language_loss": 0.79677713, + "learning_rate": 1.7417722199051245e-06, + "loss": 0.80688953, + "num_input_tokens_seen": 419651840, + "router_z_loss_mlp": 0.08105469, + "step": 5064, + "time_per_iteration": 4.989039182662964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061659, + "balance_loss_mlp": 1.03114104, + "epoch": 0.9744132358599461, + "flos": 674582238720.0, + "grad_norm": 0.04691018591826747, + "language_loss": 0.76823747, + "learning_rate": 1.7158874457005592e-06, + "loss": 0.77885401, + "num_input_tokens_seen": 419729424, + "router_z_loss_mlp": 0.3046875, + "step": 5065, + "time_per_iteration": 2.84334659576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059101, + "balance_loss_mlp": 1.02741492, + "epoch": 0.9746056175452097, + "flos": 598111229952.0, + "grad_norm": 0.06794562847276414, + "language_loss": 0.77235073, + "learning_rate": 1.690196122544896e-06, + "loss": 0.78294176, + "num_input_tokens_seen": 419803616, + "router_z_loss_mlp": 0.31665039, + "step": 5066, + "time_per_iteration": 2.7670531272888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063312, + "balance_loss_mlp": 1.03234136, + "epoch": 0.9747979992304733, + "flos": 731837237760.0, + "grad_norm": 0.0506469462866736, + "language_loss": 0.82077444, + "learning_rate": 1.6646982604123784e-06, + "loss": 0.83140755, + "num_input_tokens_seen": 419883536, + "router_z_loss_mlp": 0.30932617, + "step": 5067, + "time_per_iteration": 2.989997148513794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010609, + "balance_loss_mlp": 1.02926183, + "epoch": 0.9749903809157369, + "flos": 616219978752.0, + "grad_norm": 0.06891073101993225, + "language_loss": 0.76269442, + "learning_rate": 1.6393938692022548e-06, + "loss": 0.77330339, + "num_input_tokens_seen": 419956816, + "router_z_loss_mlp": 0.31616211, + "step": 5068, + "time_per_iteration": 2.6933865547180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010603, + "balance_loss_mlp": 1.02963936, + "epoch": 0.9751827626010003, + "flos": 468160929792.0, + "grad_norm": 0.05062684211706446, + "language_loss": 0.83640307, + "learning_rate": 1.6142829587384443e-06, + "loss": 0.84700608, + "num_input_tokens_seen": 420022096, + "router_z_loss_mlp": 0.30615234, + "step": 5069, + "time_per_iteration": 2.548443078994751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106123, + "balance_loss_mlp": 1.03006864, + "epoch": 0.9753751442862639, + "flos": 598918745088.0, + "grad_norm": 0.08322455471388275, + "language_loss": 0.8529315, + "learning_rate": 1.5893655387698713e-06, + "loss": 0.86354387, + "num_input_tokens_seen": 420097008, + "router_z_loss_mlp": 0.3112793, + "step": 5070, + "time_per_iteration": 2.7602720260620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060615, + "balance_loss_mlp": 1.02931058, + "epoch": 0.9755675259715275, + "flos": 650486232576.0, + "grad_norm": 0.05115135160859089, + "language_loss": 0.82068843, + "learning_rate": 1.5646416189704637e-06, + "loss": 0.83129454, + "num_input_tokens_seen": 420174960, + "router_z_loss_mlp": 0.31274414, + "step": 5071, + "time_per_iteration": 2.878183126449585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061905, + "balance_loss_mlp": 1.03138733, + "epoch": 0.9757599076567911, + "flos": 563392461312.0, + "grad_norm": 0.056802057532983834, + "language_loss": 0.7912972, + "learning_rate": 1.5401112089387659e-06, + "loss": 0.80191624, + "num_input_tokens_seen": 420245248, + "router_z_loss_mlp": 0.3046875, + "step": 5072, + "time_per_iteration": 2.6892030239105225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106052, + "balance_loss_mlp": 1.02931106, + "epoch": 0.9759522893420547, + "flos": 504385629696.0, + "grad_norm": 0.06614272886181409, + "language_loss": 0.80103958, + "learning_rate": 1.5157743181983819e-06, + "loss": 0.81164479, + "num_input_tokens_seen": 420310688, + "router_z_loss_mlp": 0.31176758, + "step": 5073, + "time_per_iteration": 2.6456804275512695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058437, + "balance_loss_mlp": 1.02741873, + "epoch": 0.9761446710273182, + "flos": 583452301824.0, + "grad_norm": 0.05438660473134642, + "language_loss": 0.81815821, + "learning_rate": 1.4916309561976982e-06, + "loss": 0.82874256, + "num_input_tokens_seen": 420379008, + "router_z_loss_mlp": 0.30981445, + "step": 5074, + "time_per_iteration": 2.6796131134033203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063789, + "balance_loss_mlp": 1.03289032, + "epoch": 0.9763370527125818, + "flos": 481967262720.0, + "grad_norm": 0.062046674624534275, + "language_loss": 0.82287705, + "learning_rate": 1.4676811323099947e-06, + "loss": 0.83351487, + "num_input_tokens_seen": 420445504, + "router_z_loss_mlp": 0.30859375, + "step": 5075, + "time_per_iteration": 2.5883021354675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060183, + "balance_loss_mlp": 1.02911687, + "epoch": 0.9765294343978453, + "flos": 618706543104.0, + "grad_norm": 0.06359470146777534, + "language_loss": 0.78232706, + "learning_rate": 1.4439248558335561e-06, + "loss": 0.79292893, + "num_input_tokens_seen": 420520528, + "router_z_loss_mlp": 0.31030273, + "step": 5076, + "time_per_iteration": 2.7379183769226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058637, + "balance_loss_mlp": 1.02776134, + "epoch": 0.9767218160831089, + "flos": 526320958464.0, + "grad_norm": 0.06717350649320492, + "language_loss": 0.85320723, + "learning_rate": 1.4203621359911712e-06, + "loss": 0.86379361, + "num_input_tokens_seen": 420586224, + "router_z_loss_mlp": 0.30834961, + "step": 5077, + "time_per_iteration": 2.630486249923706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061717, + "balance_loss_mlp": 1.03119898, + "epoch": 0.9769141977683724, + "flos": 524932890624.0, + "grad_norm": 0.047528387695096014, + "language_loss": 0.84178358, + "learning_rate": 1.3969929819308557e-06, + "loss": 0.85240072, + "num_input_tokens_seen": 420655456, + "router_z_loss_mlp": 0.3046875, + "step": 5078, + "time_per_iteration": 2.630990505218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064302, + "balance_loss_mlp": 1.03373718, + "epoch": 0.977106579453636, + "flos": 457359105024.0, + "grad_norm": 0.05153436611336294, + "language_loss": 0.80598915, + "learning_rate": 1.3738174027252416e-06, + "loss": 0.81663209, + "num_input_tokens_seen": 420733216, + "router_z_loss_mlp": 0.30517578, + "step": 5079, + "time_per_iteration": 2.8142943382263184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063034, + "balance_loss_mlp": 1.03156233, + "epoch": 0.9772989611388996, + "flos": 531830969856.0, + "grad_norm": 0.06258721314145185, + "language_loss": 0.81607276, + "learning_rate": 1.3508354073719642e-06, + "loss": 0.82670313, + "num_input_tokens_seen": 420803376, + "router_z_loss_mlp": 0.31445312, + "step": 5080, + "time_per_iteration": 2.601149797439575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061495, + "balance_loss_mlp": 1.03030968, + "epoch": 0.9774913428241632, + "flos": 754999100928.0, + "grad_norm": 0.058179386830925724, + "language_loss": 0.86116189, + "learning_rate": 1.3280470047933313e-06, + "loss": 0.87177682, + "num_input_tokens_seen": 420886256, + "router_z_loss_mlp": 0.31152344, + "step": 5081, + "time_per_iteration": 2.989039182662964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101122, + "balance_loss_mlp": 1.00311351, + "epoch": 0.9776837245094268, + "flos": 1553486003712.0, + "grad_norm": 0.003746354895581978, + "language_loss": 0.78895497, + "learning_rate": 1.3054522038366544e-06, + "loss": 0.79906714, + "num_input_tokens_seen": 421123728, + "router_z_loss_mlp": 0.08105469, + "step": 5082, + "time_per_iteration": 4.968156576156616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062166, + "balance_loss_mlp": 1.0312196, + "epoch": 0.9778761061946902, + "flos": 592260774912.0, + "grad_norm": 0.06333002037000754, + "language_loss": 0.84055161, + "learning_rate": 1.2830510132739725e-06, + "loss": 0.85117328, + "num_input_tokens_seen": 421192576, + "router_z_loss_mlp": 0.30908203, + "step": 5083, + "time_per_iteration": 2.6840903759002686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063545, + "balance_loss_mlp": 1.032336, + "epoch": 0.9780684878799538, + "flos": 414732510720.0, + "grad_norm": 0.051194594259949974, + "language_loss": 0.81744003, + "learning_rate": 1.2608434418022175e-06, + "loss": 0.82807547, + "num_input_tokens_seen": 421256272, + "router_z_loss_mlp": 0.31176758, + "step": 5084, + "time_per_iteration": 2.4817535877227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063953, + "balance_loss_mlp": 1.03260052, + "epoch": 0.9782608695652174, + "flos": 568129863168.0, + "grad_norm": 0.05618141703355523, + "language_loss": 0.84910816, + "learning_rate": 1.2388294980431036e-06, + "loss": 0.85974771, + "num_input_tokens_seen": 421332880, + "router_z_loss_mlp": 0.31323242, + "step": 5085, + "time_per_iteration": 2.7052948474884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061201, + "balance_loss_mlp": 1.03020716, + "epoch": 0.978453251250481, + "flos": 690151988736.0, + "grad_norm": 0.06611452324560538, + "language_loss": 0.83097386, + "learning_rate": 1.217009190543239e-06, + "loss": 0.84158587, + "num_input_tokens_seen": 421406160, + "router_z_loss_mlp": 0.30957031, + "step": 5086, + "time_per_iteration": 2.8580446243286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057022, + "balance_loss_mlp": 1.02657628, + "epoch": 0.9786456329357445, + "flos": 502239508992.0, + "grad_norm": 0.04725270380406371, + "language_loss": 0.77273715, + "learning_rate": 1.1953825277740694e-06, + "loss": 0.78330743, + "num_input_tokens_seen": 421476208, + "router_z_loss_mlp": 0.30395508, + "step": 5087, + "time_per_iteration": 2.644757032394409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063526, + "balance_loss_mlp": 1.03222179, + "epoch": 0.9788380146210081, + "flos": 862829369856.0, + "grad_norm": 0.06946428392980135, + "language_loss": 0.80393237, + "learning_rate": 1.1739495181317117e-06, + "loss": 0.81456769, + "num_input_tokens_seen": 421549232, + "router_z_loss_mlp": 0.31274414, + "step": 5088, + "time_per_iteration": 3.011176109313965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062233, + "balance_loss_mlp": 1.03173923, + "epoch": 0.9790303963062716, + "flos": 512460781056.0, + "grad_norm": 0.05754819894183326, + "language_loss": 0.84162724, + "learning_rate": 1.1527101699371767e-06, + "loss": 0.85224962, + "num_input_tokens_seen": 421617056, + "router_z_loss_mlp": 0.30444336, + "step": 5089, + "time_per_iteration": 2.5700507164001465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062724, + "balance_loss_mlp": 1.03218281, + "epoch": 0.9792227779915352, + "flos": 494183296512.0, + "grad_norm": 0.07818701968274684, + "language_loss": 0.8623296, + "learning_rate": 1.1316644914364237e-06, + "loss": 0.87295687, + "num_input_tokens_seen": 421683424, + "router_z_loss_mlp": 0.30493164, + "step": 5090, + "time_per_iteration": 2.578331470489502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106283, + "balance_loss_mlp": 1.03171659, + "epoch": 0.9794151596767988, + "flos": 608037138432.0, + "grad_norm": 0.0640945477186255, + "language_loss": 0.81165767, + "learning_rate": 1.1108124908000838e-06, + "loss": 0.82228601, + "num_input_tokens_seen": 421761200, + "router_z_loss_mlp": 0.31079102, + "step": 5091, + "time_per_iteration": 2.8047142028808594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060042, + "balance_loss_mlp": 1.02835619, + "epoch": 0.9796075413620623, + "flos": 477979149312.0, + "grad_norm": 0.05635912773275719, + "language_loss": 0.86476392, + "learning_rate": 1.09015417612357e-06, + "loss": 0.8753643, + "num_input_tokens_seen": 421829600, + "router_z_loss_mlp": 0.31665039, + "step": 5092, + "time_per_iteration": 2.5596201419830322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010638, + "balance_loss_mlp": 1.03278232, + "epoch": 0.9797999230473259, + "flos": 591936297984.0, + "grad_norm": 0.05185035440216898, + "language_loss": 0.84320545, + "learning_rate": 1.0696895554271335e-06, + "loss": 0.85384345, + "num_input_tokens_seen": 421904928, + "router_z_loss_mlp": 0.30981445, + "step": 5093, + "time_per_iteration": 2.7417502403259277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060458, + "balance_loss_mlp": 1.02924931, + "epoch": 0.9799923047325895, + "flos": 556086947328.0, + "grad_norm": 0.057258912795892604, + "language_loss": 0.81443524, + "learning_rate": 1.049418636655919e-06, + "loss": 0.82503974, + "num_input_tokens_seen": 421989616, + "router_z_loss_mlp": 0.31176758, + "step": 5094, + "time_per_iteration": 2.9223530292510986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062101, + "balance_loss_mlp": 1.03101122, + "epoch": 0.9801846864178531, + "flos": 579164442624.0, + "grad_norm": 0.04579710583324431, + "language_loss": 0.84433246, + "learning_rate": 1.0293414276797974e-06, + "loss": 0.85495341, + "num_input_tokens_seen": 422067088, + "router_z_loss_mlp": 0.31054688, + "step": 5095, + "time_per_iteration": 2.7353591918945312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060539, + "balance_loss_mlp": 1.03030777, + "epoch": 0.9803770681031165, + "flos": 514825099776.0, + "grad_norm": 0.061566152326411605, + "language_loss": 0.79944533, + "learning_rate": 1.0094579362933677e-06, + "loss": 0.81005073, + "num_input_tokens_seen": 422141136, + "router_z_loss_mlp": 0.30175781, + "step": 5096, + "time_per_iteration": 2.65447998046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056617, + "balance_loss_mlp": 1.02643323, + "epoch": 0.9805694497883801, + "flos": 566706889728.0, + "grad_norm": 0.053549936488599306, + "language_loss": 0.77981234, + "learning_rate": 9.897681702160654e-07, + "loss": 0.79037857, + "num_input_tokens_seen": 422216400, + "router_z_loss_mlp": 0.30126953, + "step": 5097, + "time_per_iteration": 2.7695438861846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061184, + "balance_loss_mlp": 1.03038001, + "epoch": 0.9807618314736437, + "flos": 479106759168.0, + "grad_norm": 0.051835716035438115, + "language_loss": 0.73514438, + "learning_rate": 9.702721370922208e-07, + "loss": 0.74575627, + "num_input_tokens_seen": 422287664, + "router_z_loss_mlp": 0.30761719, + "step": 5098, + "time_per_iteration": 2.6515543460845947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064654, + "balance_loss_mlp": 1.03339684, + "epoch": 0.9809542131589073, + "flos": 545021844480.0, + "grad_norm": 0.05890860301588922, + "language_loss": 0.80092633, + "learning_rate": 9.509698444908344e-07, + "loss": 0.81157291, + "num_input_tokens_seen": 422357552, + "router_z_loss_mlp": 0.31225586, + "step": 5099, + "time_per_iteration": 2.622485876083374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058534, + "balance_loss_mlp": 1.02751541, + "epoch": 0.9811465948441709, + "flos": 520589776896.0, + "grad_norm": 0.05424341522111697, + "language_loss": 0.79649353, + "learning_rate": 9.318612999057452e-07, + "loss": 0.80707896, + "num_input_tokens_seen": 422425872, + "router_z_loss_mlp": 0.30981445, + "step": 5100, + "time_per_iteration": 2.591421127319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062161, + "balance_loss_mlp": 1.03142905, + "epoch": 0.9813389765294344, + "flos": 541023556608.0, + "grad_norm": 0.056964991867044526, + "language_loss": 0.79990375, + "learning_rate": 9.129465107554635e-07, + "loss": 0.81052536, + "num_input_tokens_seen": 422495760, + "router_z_loss_mlp": 0.30688477, + "step": 5101, + "time_per_iteration": 2.624356985092163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061299, + "balance_loss_mlp": 1.0304718, + "epoch": 0.981531358214698, + "flos": 567080828928.0, + "grad_norm": 0.06134435834417123, + "language_loss": 0.84231782, + "learning_rate": 8.942254843834485e-07, + "loss": 0.85293078, + "num_input_tokens_seen": 422568112, + "router_z_loss_mlp": 0.30786133, + "step": 5102, + "time_per_iteration": 2.723998546600342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060525, + "balance_loss_mlp": 1.02972126, + "epoch": 0.9817237398999615, + "flos": 576987798528.0, + "grad_norm": 0.049977089978911385, + "language_loss": 0.80795527, + "learning_rate": 8.756982280578307e-07, + "loss": 0.81856054, + "num_input_tokens_seen": 422641280, + "router_z_loss_mlp": 0.30786133, + "step": 5103, + "time_per_iteration": 2.7338523864746094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061515, + "balance_loss_mlp": 1.03006709, + "epoch": 0.9819161215852251, + "flos": 701172011520.0, + "grad_norm": 0.0557213877990239, + "language_loss": 0.81740284, + "learning_rate": 8.573647489714676e-07, + "loss": 0.82801795, + "num_input_tokens_seen": 422720416, + "router_z_loss_mlp": 0.31420898, + "step": 5104, + "time_per_iteration": 2.931447744369507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063553, + "balance_loss_mlp": 1.03308296, + "epoch": 0.9821085032704886, + "flos": 623873138688.0, + "grad_norm": 0.05444182607070333, + "language_loss": 0.84073544, + "learning_rate": 8.392250542421653e-07, + "loss": 0.85137099, + "num_input_tokens_seen": 422800384, + "router_z_loss_mlp": 0.30419922, + "step": 5105, + "time_per_iteration": 2.8544764518737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062382, + "balance_loss_mlp": 1.03167391, + "epoch": 0.9823008849557522, + "flos": 499259731968.0, + "grad_norm": 0.059283748900889124, + "language_loss": 0.81204158, + "learning_rate": 8.212791509122353e-07, + "loss": 0.82266545, + "num_input_tokens_seen": 422870768, + "router_z_loss_mlp": 0.30664062, + "step": 5106, + "time_per_iteration": 2.7119359970092773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062348, + "balance_loss_mlp": 1.03123415, + "epoch": 0.9824932666410158, + "flos": 523544822784.0, + "grad_norm": 0.05648226297161044, + "language_loss": 0.7276091, + "learning_rate": 8.035270459489929e-07, + "loss": 0.73823255, + "num_input_tokens_seen": 422942864, + "router_z_loss_mlp": 0.31079102, + "step": 5107, + "time_per_iteration": 2.7225635051727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063572, + "balance_loss_mlp": 1.0326972, + "epoch": 0.9826856483262794, + "flos": 502411216896.0, + "grad_norm": 0.05541444658999194, + "language_loss": 0.82263827, + "learning_rate": 7.859687462443698e-07, + "loss": 0.83327401, + "num_input_tokens_seen": 423013600, + "router_z_loss_mlp": 0.30834961, + "step": 5108, + "time_per_iteration": 2.6065311431884766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063107, + "balance_loss_mlp": 1.03189802, + "epoch": 0.982878030011543, + "flos": 561768666624.0, + "grad_norm": 0.05636875798625559, + "language_loss": 0.84198737, + "learning_rate": 7.686042586151354e-07, + "loss": 0.85261846, + "num_input_tokens_seen": 423093680, + "router_z_loss_mlp": 0.31176758, + "step": 5109, + "time_per_iteration": 2.8074042797088623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061414, + "balance_loss_mlp": 1.03025222, + "epoch": 0.9830704116968064, + "flos": 536824447488.0, + "grad_norm": 0.05411078449531097, + "language_loss": 0.82744133, + "learning_rate": 7.514335898027857e-07, + "loss": 0.83805549, + "num_input_tokens_seen": 423168608, + "router_z_loss_mlp": 0.3112793, + "step": 5110, + "time_per_iteration": 2.73994517326355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060884, + "balance_loss_mlp": 1.02977061, + "epoch": 0.98326279338207, + "flos": 458712267264.0, + "grad_norm": 0.060128702369139815, + "language_loss": 0.84042352, + "learning_rate": 7.344567464735441e-07, + "loss": 0.85103238, + "num_input_tokens_seen": 423233552, + "router_z_loss_mlp": 0.31079102, + "step": 5111, + "time_per_iteration": 2.48018479347229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060072, + "balance_loss_mlp": 1.02881503, + "epoch": 0.9834551750673336, + "flos": 640672395264.0, + "grad_norm": 0.05142672263893664, + "language_loss": 0.79408097, + "learning_rate": 7.17673735218416e-07, + "loss": 0.80468172, + "num_input_tokens_seen": 423307440, + "router_z_loss_mlp": 0.31225586, + "step": 5112, + "time_per_iteration": 2.826101541519165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057314, + "balance_loss_mlp": 1.02682042, + "epoch": 0.9836475567525972, + "flos": 1071373478400.0, + "grad_norm": 0.04969099377135597, + "language_loss": 0.79194742, + "learning_rate": 7.010845625530782e-07, + "loss": 0.80252051, + "num_input_tokens_seen": 423394880, + "router_z_loss_mlp": 0.3046875, + "step": 5113, + "time_per_iteration": 3.394486427307129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106507, + "balance_loss_mlp": 1.03347969, + "epoch": 0.9838399384378607, + "flos": 564943472640.0, + "grad_norm": 0.06818796043549862, + "language_loss": 0.75512731, + "learning_rate": 6.846892349181566e-07, + "loss": 0.76577806, + "num_input_tokens_seen": 423461792, + "router_z_loss_mlp": 0.31567383, + "step": 5114, + "time_per_iteration": 2.6842877864837646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064716, + "balance_loss_mlp": 1.03333998, + "epoch": 0.9840323201231242, + "flos": 772463278080.0, + "grad_norm": 0.1018443246698012, + "language_loss": 0.79624081, + "learning_rate": 6.684877586787819e-07, + "loss": 0.80688798, + "num_input_tokens_seen": 423539952, + "router_z_loss_mlp": 0.31347656, + "step": 5115, + "time_per_iteration": 2.9627039432525635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059546, + "balance_loss_mlp": 1.02867126, + "epoch": 0.9842247018083878, + "flos": 472016623104.0, + "grad_norm": 0.05701334916356296, + "language_loss": 0.85578704, + "learning_rate": 6.524801401249225e-07, + "loss": 0.86638254, + "num_input_tokens_seen": 423607184, + "router_z_loss_mlp": 0.30859375, + "step": 5116, + "time_per_iteration": 2.5376946926116943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065055, + "balance_loss_mlp": 1.0332495, + "epoch": 0.9844170834936514, + "flos": 524996909568.0, + "grad_norm": 0.05489154338763104, + "language_loss": 0.84577304, + "learning_rate": 6.366663854713295e-07, + "loss": 0.85642362, + "num_input_tokens_seen": 423676528, + "router_z_loss_mlp": 0.31787109, + "step": 5117, + "time_per_iteration": 2.6327760219573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011198, + "balance_loss_mlp": 1.00309229, + "epoch": 0.984609465178915, + "flos": 1566406245888.0, + "grad_norm": 0.003743704164174256, + "language_loss": 0.77162516, + "learning_rate": 6.210465008574251e-07, + "loss": 0.78173721, + "num_input_tokens_seen": 423905856, + "router_z_loss_mlp": 0.08105469, + "step": 5118, + "time_per_iteration": 4.938253164291382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065245, + "balance_loss_mlp": 1.03398824, + "epoch": 0.9848018468641785, + "flos": 519294841344.0, + "grad_norm": 0.061067822522050556, + "language_loss": 0.81750166, + "learning_rate": 6.056204923473584e-07, + "loss": 0.82815415, + "num_input_tokens_seen": 423972496, + "router_z_loss_mlp": 0.31225586, + "step": 5119, + "time_per_iteration": 2.6061348915100098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062892, + "balance_loss_mlp": 1.03201687, + "epoch": 0.9849942285494421, + "flos": 492760323072.0, + "grad_norm": 0.07845667393472046, + "language_loss": 0.82774782, + "learning_rate": 5.903883659301167e-07, + "loss": 0.8383767, + "num_input_tokens_seen": 424039968, + "router_z_loss_mlp": 0.30834961, + "step": 5120, + "time_per_iteration": 2.6440351009368896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064055, + "balance_loss_mlp": 1.03282189, + "epoch": 0.9851866102347057, + "flos": 545740609536.0, + "grad_norm": 0.056123337743530885, + "language_loss": 0.80794674, + "learning_rate": 5.753501275193029e-07, + "loss": 0.8185873, + "num_input_tokens_seen": 424108096, + "router_z_loss_mlp": 0.31225586, + "step": 5121, + "time_per_iteration": 2.649475574493408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059863, + "balance_loss_mlp": 1.02941656, + "epoch": 0.9853789919199692, + "flos": 476019293184.0, + "grad_norm": 0.0562432124087145, + "language_loss": 0.79757869, + "learning_rate": 5.605057829531912e-07, + "loss": 0.80817735, + "num_input_tokens_seen": 424172256, + "router_z_loss_mlp": 0.30395508, + "step": 5122, + "time_per_iteration": 2.521296262741089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062584, + "balance_loss_mlp": 1.03139853, + "epoch": 0.9855713736052328, + "flos": 1032199524864.0, + "grad_norm": 0.055038538436744985, + "language_loss": 0.75819677, + "learning_rate": 5.458553379950049e-07, + "loss": 0.76882255, + "num_input_tokens_seen": 424261088, + "router_z_loss_mlp": 0.31152344, + "step": 5123, + "time_per_iteration": 3.337373971939087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062929, + "balance_loss_mlp": 1.03174376, + "epoch": 0.9857637552904963, + "flos": 494794372608.0, + "grad_norm": 0.05369962620538531, + "language_loss": 0.82470608, + "learning_rate": 5.31398798332472e-07, + "loss": 0.83533537, + "num_input_tokens_seen": 424329168, + "router_z_loss_mlp": 0.31152344, + "step": 5124, + "time_per_iteration": 2.5722227096557617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062656, + "balance_loss_mlp": 1.03113651, + "epoch": 0.9859561369757599, + "flos": 591990142464.0, + "grad_norm": 0.06425183371235867, + "language_loss": 0.83396256, + "learning_rate": 5.17136169578103e-07, + "loss": 0.84458917, + "num_input_tokens_seen": 424399392, + "router_z_loss_mlp": 0.31494141, + "step": 5125, + "time_per_iteration": 2.689207077026367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060728, + "balance_loss_mlp": 1.03032947, + "epoch": 0.9861485186610235, + "flos": 486719221248.0, + "grad_norm": 0.06282802238411583, + "language_loss": 0.78441298, + "learning_rate": 5.030674572691907e-07, + "loss": 0.79502022, + "num_input_tokens_seen": 424470080, + "router_z_loss_mlp": 0.3034668, + "step": 5126, + "time_per_iteration": 2.6470096111297607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061426, + "balance_loss_mlp": 1.03024125, + "epoch": 0.9863409003462871, + "flos": 518536788480.0, + "grad_norm": 0.045592923518431534, + "language_loss": 0.82589722, + "learning_rate": 4.891926668676994e-07, + "loss": 0.83651149, + "num_input_tokens_seen": 424541824, + "router_z_loss_mlp": 0.31152344, + "step": 5127, + "time_per_iteration": 2.6396868228912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011205, + "balance_loss_mlp": 1.00309873, + "epoch": 0.9865332820315506, + "flos": 1485212391936.0, + "grad_norm": 0.003744860643622383, + "language_loss": 0.79182732, + "learning_rate": 4.755118037602646e-07, + "loss": 0.80193937, + "num_input_tokens_seen": 424773408, + "router_z_loss_mlp": 0.08105469, + "step": 5128, + "time_per_iteration": 4.885936260223389 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063316, + "balance_loss_mlp": 1.03232157, + "epoch": 0.9867256637168141, + "flos": 581837271552.0, + "grad_norm": 0.05639219580829708, + "language_loss": 0.79096341, + "learning_rate": 4.620248732582488e-07, + "loss": 0.80159652, + "num_input_tokens_seen": 424840608, + "router_z_loss_mlp": 0.30981445, + "step": 5129, + "time_per_iteration": 2.697075605392456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062742, + "balance_loss_mlp": 1.0312469, + "epoch": 0.9869180454020777, + "flos": 958898939904.0, + "grad_norm": 0.054185757090889033, + "language_loss": 0.86109281, + "learning_rate": 4.487318805977969e-07, + "loss": 0.87172019, + "num_input_tokens_seen": 424926128, + "router_z_loss_mlp": 0.31469727, + "step": 5130, + "time_per_iteration": 3.233060598373413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064125, + "balance_loss_mlp": 1.03336906, + "epoch": 0.9871104270873413, + "flos": 770385558528.0, + "grad_norm": 0.05107531469569112, + "language_loss": 0.82377338, + "learning_rate": 4.3563283093966954e-07, + "loss": 0.8344146, + "num_input_tokens_seen": 425005744, + "router_z_loss_mlp": 0.30712891, + "step": 5131, + "time_per_iteration": 2.976196765899658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062884, + "balance_loss_mlp": 1.03134108, + "epoch": 0.9873028087726049, + "flos": 446215426560.0, + "grad_norm": 0.06674879005758158, + "language_loss": 0.77993727, + "learning_rate": 4.2272772936940986e-07, + "loss": 0.79056621, + "num_input_tokens_seen": 425068112, + "router_z_loss_mlp": 0.31518555, + "step": 5132, + "time_per_iteration": 2.477193593978882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106004, + "balance_loss_mlp": 1.02866411, + "epoch": 0.9874951904578684, + "flos": 507359614464.0, + "grad_norm": 0.04705074666951951, + "language_loss": 0.86384636, + "learning_rate": 4.1001658089717676e-07, + "loss": 0.87444681, + "num_input_tokens_seen": 425137408, + "router_z_loss_mlp": 0.31347656, + "step": 5133, + "time_per_iteration": 2.5848257541656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062248, + "balance_loss_mlp": 1.03211176, + "epoch": 0.987687572143132, + "flos": 716420256768.0, + "grad_norm": 0.04984001642827075, + "language_loss": 0.82130331, + "learning_rate": 3.9749939045791164e-07, + "loss": 0.83192575, + "num_input_tokens_seen": 425213504, + "router_z_loss_mlp": 0.30078125, + "step": 5134, + "time_per_iteration": 2.890923023223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011205, + "balance_loss_mlp": 1.00309896, + "epoch": 0.9878799538283956, + "flos": 1537823121408.0, + "grad_norm": 0.0037470646793171538, + "language_loss": 0.79817951, + "learning_rate": 3.851761629111716e-07, + "loss": 0.80829155, + "num_input_tokens_seen": 425451296, + "router_z_loss_mlp": 0.08105469, + "step": 5135, + "time_per_iteration": 4.827297925949097 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062735, + "balance_loss_mlp": 1.03176403, + "epoch": 0.9880723355136591, + "flos": 721098021888.0, + "grad_norm": 0.09785276178071765, + "language_loss": 0.8140105, + "learning_rate": 3.730469030412964e-07, + "loss": 0.82463777, + "num_input_tokens_seen": 425527536, + "router_z_loss_mlp": 0.30932617, + "step": 5136, + "time_per_iteration": 2.8907034397125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061795, + "balance_loss_mlp": 1.0313009, + "epoch": 0.9882647171989226, + "flos": 557085109248.0, + "grad_norm": 0.08580715505102396, + "language_loss": 0.84046173, + "learning_rate": 3.611116155572969e-07, + "loss": 0.85107958, + "num_input_tokens_seen": 425596608, + "router_z_loss_mlp": 0.30444336, + "step": 5137, + "time_per_iteration": 2.7122488021850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066137, + "balance_loss_mlp": 1.03445137, + "epoch": 0.9884570988841862, + "flos": 562541276160.0, + "grad_norm": 0.0626014477527682, + "language_loss": 0.80571049, + "learning_rate": 3.493703050927999e-07, + "loss": 0.81637186, + "num_input_tokens_seen": 425667280, + "router_z_loss_mlp": 0.31665039, + "step": 5138, + "time_per_iteration": 2.686124801635742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106034, + "balance_loss_mlp": 1.02872539, + "epoch": 0.9886494805694498, + "flos": 431537559552.0, + "grad_norm": 0.05773726314876442, + "language_loss": 0.85940892, + "learning_rate": 3.378229762062146e-07, + "loss": 0.87001228, + "num_input_tokens_seen": 425730736, + "router_z_loss_mlp": 0.31591797, + "step": 5139, + "time_per_iteration": 2.464719295501709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106163, + "balance_loss_mlp": 1.03075433, + "epoch": 0.9888418622547134, + "flos": 591793703424.0, + "grad_norm": 0.04946155729695369, + "language_loss": 0.90478563, + "learning_rate": 3.264696333806771e-07, + "loss": 0.91540194, + "num_input_tokens_seen": 425807616, + "router_z_loss_mlp": 0.30834961, + "step": 5140, + "time_per_iteration": 2.7692387104034424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105981, + "balance_loss_mlp": 1.02912521, + "epoch": 0.989034243939977, + "flos": 1134526984704.0, + "grad_norm": 0.0539141258158205, + "language_loss": 0.80669558, + "learning_rate": 3.1531028102388394e-07, + "loss": 0.81729364, + "num_input_tokens_seen": 425900880, + "router_z_loss_mlp": 0.30639648, + "step": 5141, + "time_per_iteration": 3.5516679286956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062728, + "balance_loss_mlp": 1.03128052, + "epoch": 0.9892266256252404, + "flos": 566405733888.0, + "grad_norm": 0.0708287283243169, + "language_loss": 0.81880045, + "learning_rate": 3.0434492346825824e-07, + "loss": 0.82942772, + "num_input_tokens_seen": 425973632, + "router_z_loss_mlp": 0.31420898, + "step": 5142, + "time_per_iteration": 2.699986219406128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064078, + "balance_loss_mlp": 1.03193879, + "epoch": 0.989419007310504, + "flos": 640254786048.0, + "grad_norm": 0.047861078038832494, + "language_loss": 0.836555, + "learning_rate": 2.9357356497095033e-07, + "loss": 0.8471958, + "num_input_tokens_seen": 426057088, + "router_z_loss_mlp": 0.32128906, + "step": 5143, + "time_per_iteration": 2.894883394241333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062971, + "balance_loss_mlp": 1.03216708, + "epoch": 0.9896113889957676, + "flos": 455236305408.0, + "grad_norm": 0.05543458418629213, + "language_loss": 0.81691206, + "learning_rate": 2.829962097138372e-07, + "loss": 0.82754171, + "num_input_tokens_seen": 426124336, + "router_z_loss_mlp": 0.30761719, + "step": 5144, + "time_per_iteration": 2.607186794281006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058638, + "balance_loss_mlp": 1.0274049, + "epoch": 0.9898037706810312, + "flos": 567070654464.0, + "grad_norm": 0.06893052249390286, + "language_loss": 0.80264378, + "learning_rate": 2.726128618033008e-07, + "loss": 0.81323016, + "num_input_tokens_seen": 426191888, + "router_z_loss_mlp": 0.31201172, + "step": 5145, + "time_per_iteration": 2.634690999984741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011199, + "balance_loss_mlp": 1.00309277, + "epoch": 0.9899961523662947, + "flos": 1549476131328.0, + "grad_norm": 0.0037450033001579456, + "language_loss": 0.78146422, + "learning_rate": 2.624235252706164e-07, + "loss": 0.79157621, + "num_input_tokens_seen": 426425840, + "router_z_loss_mlp": 0.08105469, + "step": 5146, + "time_per_iteration": 4.934341192245483 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061804, + "balance_loss_mlp": 1.02999902, + "epoch": 0.9901885340515583, + "flos": 610401457152.0, + "grad_norm": 0.05110479894795738, + "language_loss": 0.85054564, + "learning_rate": 2.524282040715642e-07, + "loss": 0.86116374, + "num_input_tokens_seen": 426506080, + "router_z_loss_mlp": 0.31787109, + "step": 5147, + "time_per_iteration": 2.8900723457336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059898, + "balance_loss_mlp": 1.0296669, + "epoch": 0.9903809157368219, + "flos": 517231678464.0, + "grad_norm": 0.0532812715747954, + "language_loss": 0.82843816, + "learning_rate": 2.426269020866512e-07, + "loss": 0.83903718, + "num_input_tokens_seen": 426573936, + "router_z_loss_mlp": 0.30175781, + "step": 5148, + "time_per_iteration": 2.5548617839813232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062875, + "balance_loss_mlp": 1.03233337, + "epoch": 0.9905732974220854, + "flos": 1099985716224.0, + "grad_norm": 0.05366226804981356, + "language_loss": 0.80599821, + "learning_rate": 2.3301962312122226e-07, + "loss": 0.81662691, + "num_input_tokens_seen": 426657472, + "router_z_loss_mlp": 0.30493164, + "step": 5149, + "time_per_iteration": 3.392335891723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063813, + "balance_loss_mlp": 1.03241384, + "epoch": 0.990765679107349, + "flos": 857630688768.0, + "grad_norm": 0.059297430250845336, + "language_loss": 0.84505075, + "learning_rate": 2.2360637090496073e-07, + "loss": 0.85568881, + "num_input_tokens_seen": 426740560, + "router_z_loss_mlp": 0.3137207, + "step": 5150, + "time_per_iteration": 3.1182923316955566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060351, + "balance_loss_mlp": 1.02954686, + "epoch": 0.9909580607926125, + "flos": 491041986048.0, + "grad_norm": 0.07738057790240345, + "language_loss": 0.79969645, + "learning_rate": 2.143871490925542e-07, + "loss": 0.81029999, + "num_input_tokens_seen": 426809296, + "router_z_loss_mlp": 0.30761719, + "step": 5151, + "time_per_iteration": 2.6201224327087402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061417, + "balance_loss_mlp": 1.02987456, + "epoch": 0.9911504424778761, + "flos": 584786525184.0, + "grad_norm": 0.05591386255417787, + "language_loss": 0.79255068, + "learning_rate": 2.0536196126319519e-07, + "loss": 0.80316496, + "num_input_tokens_seen": 426881056, + "router_z_loss_mlp": 0.31518555, + "step": 5152, + "time_per_iteration": 2.6853623390197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057775, + "balance_loss_mlp": 1.02711439, + "epoch": 0.9913428241631397, + "flos": 569763832320.0, + "grad_norm": 0.08412789044536739, + "language_loss": 0.81331563, + "learning_rate": 1.9653081092074753e-07, + "loss": 0.82389343, + "num_input_tokens_seen": 426949664, + "router_z_loss_mlp": 0.30615234, + "step": 5153, + "time_per_iteration": 2.678966760635376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060701, + "balance_loss_mlp": 1.03011227, + "epoch": 0.9915352058484033, + "flos": 489505531392.0, + "grad_norm": 0.050282318122502674, + "language_loss": 0.86287737, + "learning_rate": 1.8789370149374652e-07, + "loss": 0.87348437, + "num_input_tokens_seen": 427018816, + "router_z_loss_mlp": 0.30541992, + "step": 5154, + "time_per_iteration": 2.6363751888275146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059754, + "balance_loss_mlp": 1.02909327, + "epoch": 0.9917275875336667, + "flos": 743708445696.0, + "grad_norm": 0.046515503690355224, + "language_loss": 0.82832515, + "learning_rate": 1.7945063633545423e-07, + "loss": 0.83892262, + "num_input_tokens_seen": 427097984, + "router_z_loss_mlp": 0.30615234, + "step": 5155, + "time_per_iteration": 2.938014507293701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060739, + "balance_loss_mlp": 1.02829003, + "epoch": 0.9919199692189303, + "flos": 508009978368.0, + "grad_norm": 0.05259363859514861, + "language_loss": 0.79867995, + "learning_rate": 1.7120161872380412e-07, + "loss": 0.80928731, + "num_input_tokens_seen": 427169280, + "router_z_loss_mlp": 0.32446289, + "step": 5156, + "time_per_iteration": 2.7240796089172363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060457, + "balance_loss_mlp": 1.02910459, + "epoch": 0.9921123509041939, + "flos": 543702177792.0, + "grad_norm": 0.05290112083375949, + "language_loss": 0.83877754, + "learning_rate": 1.6314665186123457e-07, + "loss": 0.8493821, + "num_input_tokens_seen": 427237312, + "router_z_loss_mlp": 0.31323242, + "step": 5157, + "time_per_iteration": 2.6792891025543213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062835, + "balance_loss_mlp": 1.03200781, + "epoch": 0.9923047325894575, + "flos": 671263428096.0, + "grad_norm": 0.059402742354966065, + "language_loss": 0.77112788, + "learning_rate": 1.5528573887507724e-07, + "loss": 0.78175628, + "num_input_tokens_seen": 427305008, + "router_z_loss_mlp": 0.30786133, + "step": 5158, + "time_per_iteration": 2.7650957107543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106199, + "balance_loss_mlp": 1.03047156, + "epoch": 0.9924971142747211, + "flos": 466291233792.0, + "grad_norm": 0.05256839185105993, + "language_loss": 0.80342734, + "learning_rate": 1.4761888281711322e-07, + "loss": 0.81404722, + "num_input_tokens_seen": 427377008, + "router_z_loss_mlp": 0.31494141, + "step": 5159, + "time_per_iteration": 2.69912052154541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063209, + "balance_loss_mlp": 1.03202367, + "epoch": 0.9926894959599846, + "flos": 491337349632.0, + "grad_norm": 0.0517555441430021, + "language_loss": 0.82689339, + "learning_rate": 1.4014608666390594e-07, + "loss": 0.83752549, + "num_input_tokens_seen": 427444528, + "router_z_loss_mlp": 0.31152344, + "step": 5160, + "time_per_iteration": 2.585045099258423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060997, + "balance_loss_mlp": 1.02993083, + "epoch": 0.9928818776452482, + "flos": 492144864768.0, + "grad_norm": 0.05816835563996184, + "language_loss": 0.81448108, + "learning_rate": 1.328673533166902e-07, + "loss": 0.825091, + "num_input_tokens_seen": 427509808, + "router_z_loss_mlp": 0.31054688, + "step": 5161, + "time_per_iteration": 2.5658082962036133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059734, + "balance_loss_mlp": 1.02869153, + "epoch": 0.9930742593305117, + "flos": 546081053184.0, + "grad_norm": 0.047207524561871925, + "language_loss": 0.84269929, + "learning_rate": 1.2578268560131666e-07, + "loss": 0.85329664, + "num_input_tokens_seen": 427587936, + "router_z_loss_mlp": 0.31005859, + "step": 5162, + "time_per_iteration": 2.8079018592834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059662, + "balance_loss_mlp": 1.02890635, + "epoch": 0.9932666410157753, + "flos": 585234657792.0, + "grad_norm": 0.047366061223404234, + "language_loss": 0.85871446, + "learning_rate": 1.1889208626825188e-07, + "loss": 0.86931109, + "num_input_tokens_seen": 427662224, + "router_z_loss_mlp": 0.30737305, + "step": 5163, + "time_per_iteration": 2.760917901992798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062808, + "balance_loss_mlp": 1.03226614, + "epoch": 0.9934590227010388, + "flos": 536833211904.0, + "grad_norm": 0.04936345876995348, + "language_loss": 0.83421499, + "learning_rate": 1.1219555799268921e-07, + "loss": 0.84484303, + "num_input_tokens_seen": 427730544, + "router_z_loss_mlp": 0.30517578, + "step": 5164, + "time_per_iteration": 2.614365577697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062983, + "balance_loss_mlp": 1.03117847, + "epoch": 0.9936514043863024, + "flos": 517754004480.0, + "grad_norm": 0.056445406823462094, + "language_loss": 0.86550891, + "learning_rate": 1.0569310337443794e-07, + "loss": 0.87613875, + "num_input_tokens_seen": 427799760, + "router_z_loss_mlp": 0.31787109, + "step": 5165, + "time_per_iteration": 2.622957944869995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062574, + "balance_loss_mlp": 1.03203213, + "epoch": 0.993843786071566, + "flos": 744284616192.0, + "grad_norm": 0.10034207773061687, + "language_loss": 0.80428374, + "learning_rate": 9.938472493803419e-08, + "loss": 0.81490946, + "num_input_tokens_seen": 427881936, + "router_z_loss_mlp": 0.30493164, + "step": 5166, + "time_per_iteration": 3.043248414993286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060393, + "balance_loss_mlp": 1.02982759, + "epoch": 0.9940361677568296, + "flos": 525647273472.0, + "grad_norm": 0.05669440201180295, + "language_loss": 0.8168757, + "learning_rate": 9.327042513251893e-08, + "loss": 0.82747972, + "num_input_tokens_seen": 427951648, + "router_z_loss_mlp": 0.30517578, + "step": 5167, + "time_per_iteration": 2.6479313373565674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058196, + "balance_loss_mlp": 1.02732062, + "epoch": 0.9942285494420932, + "flos": 555376946688.0, + "grad_norm": 0.05714439163256689, + "language_loss": 0.79955453, + "learning_rate": 8.735020633177104e-08, + "loss": 0.81013644, + "num_input_tokens_seen": 428031184, + "router_z_loss_mlp": 0.30834961, + "step": 5168, + "time_per_iteration": 2.742321729660034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061675, + "balance_loss_mlp": 1.0310148, + "epoch": 0.9944209311273566, + "flos": 585722078208.0, + "grad_norm": 0.04880112875214473, + "language_loss": 0.81882125, + "learning_rate": 8.162407083411872e-08, + "loss": 0.82943803, + "num_input_tokens_seen": 428107296, + "router_z_loss_mlp": 0.30615234, + "step": 5169, + "time_per_iteration": 2.7316317558288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062047, + "balance_loss_mlp": 1.03136218, + "epoch": 0.9946133128126202, + "flos": 735185161728.0, + "grad_norm": 0.05301980101308898, + "language_loss": 0.82062948, + "learning_rate": 7.609202086272804e-08, + "loss": 0.83124995, + "num_input_tokens_seen": 428187904, + "router_z_loss_mlp": 0.30639648, + "step": 5170, + "time_per_iteration": 2.9755966663360596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059155, + "balance_loss_mlp": 1.02768385, + "epoch": 0.9948056944978838, + "flos": 645728481792.0, + "grad_norm": 0.07007508791376361, + "language_loss": 0.82043839, + "learning_rate": 7.075405856526995e-08, + "loss": 0.83102989, + "num_input_tokens_seen": 428255856, + "router_z_loss_mlp": 0.31445312, + "step": 5171, + "time_per_iteration": 4.273667812347412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060839, + "balance_loss_mlp": 1.02970123, + "epoch": 0.9949980761831474, + "flos": 445610142720.0, + "grad_norm": 0.05905922669837915, + "language_loss": 0.86205649, + "learning_rate": 6.561018601414226e-08, + "loss": 0.87266487, + "num_input_tokens_seen": 428321872, + "router_z_loss_mlp": 0.31103516, + "step": 5172, + "time_per_iteration": 2.528289318084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060626, + "balance_loss_mlp": 1.02908325, + "epoch": 0.995190457868411, + "flos": 435407809536.0, + "grad_norm": 0.04822617192166719, + "language_loss": 0.85489011, + "learning_rate": 6.066040520641414e-08, + "loss": 0.86549646, + "num_input_tokens_seen": 428389232, + "router_z_loss_mlp": 0.31518555, + "step": 5173, + "time_per_iteration": 2.575528621673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065737, + "balance_loss_mlp": 1.03366995, + "epoch": 0.9953828395536745, + "flos": 513937598976.0, + "grad_norm": 0.04981117864172083, + "language_loss": 0.8126061, + "learning_rate": 5.590471806377062e-08, + "loss": 0.82326341, + "num_input_tokens_seen": 428456128, + "router_z_loss_mlp": 0.32055664, + "step": 5174, + "time_per_iteration": 2.5552728176116943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061578, + "balance_loss_mlp": 1.03008258, + "epoch": 0.995575221238938, + "flos": 479608736256.0, + "grad_norm": 0.05922732070451884, + "language_loss": 0.81865811, + "learning_rate": 5.134312643245709e-08, + "loss": 0.82927388, + "num_input_tokens_seen": 428523504, + "router_z_loss_mlp": 0.31469727, + "step": 5175, + "time_per_iteration": 2.5534262657165527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060563, + "balance_loss_mlp": 1.0288291, + "epoch": 0.9957676029242016, + "flos": 587500051968.0, + "grad_norm": 0.06112181675379925, + "language_loss": 0.76542944, + "learning_rate": 4.6975632083445793e-08, + "loss": 0.77603507, + "num_input_tokens_seen": 428596880, + "router_z_loss_mlp": 0.31713867, + "step": 5176, + "time_per_iteration": 2.716848850250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063795, + "balance_loss_mlp": 1.03210914, + "epoch": 0.9959599846094652, + "flos": 426244336128.0, + "grad_norm": 0.05881178966801391, + "language_loss": 0.80162942, + "learning_rate": 4.280223671243588e-08, + "loss": 0.8122673, + "num_input_tokens_seen": 428659472, + "router_z_loss_mlp": 0.31665039, + "step": 5177, + "time_per_iteration": 2.500436305999756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060368, + "balance_loss_mlp": 1.02934957, + "epoch": 0.9961523662947287, + "flos": 611312279040.0, + "grad_norm": 0.048622636366890376, + "language_loss": 0.80515933, + "learning_rate": 3.8822941939575804e-08, + "loss": 0.815763, + "num_input_tokens_seen": 428736704, + "router_z_loss_mlp": 0.30981445, + "step": 5178, + "time_per_iteration": 2.828415870666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061082, + "balance_loss_mlp": 1.02937198, + "epoch": 0.9963447479799923, + "flos": 550521681408.0, + "grad_norm": 0.06697428881430652, + "language_loss": 0.73867881, + "learning_rate": 3.5037749309851927e-08, + "loss": 0.74928963, + "num_input_tokens_seen": 428808560, + "router_z_loss_mlp": 0.31689453, + "step": 5179, + "time_per_iteration": 2.6863455772399902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063311, + "balance_loss_mlp": 1.03262651, + "epoch": 0.9965371296652559, + "flos": 625590065664.0, + "grad_norm": 0.05269572886478648, + "language_loss": 0.88772953, + "learning_rate": 3.1446660292755446e-08, + "loss": 0.89836264, + "num_input_tokens_seen": 428880688, + "router_z_loss_mlp": 0.30639648, + "step": 5180, + "time_per_iteration": 2.702698230743408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061004, + "balance_loss_mlp": 1.02950907, + "epoch": 0.9967295113505195, + "flos": 639205751808.0, + "grad_norm": 0.05458985414981575, + "language_loss": 0.81910491, + "learning_rate": 2.8049676282504433e-08, + "loss": 0.82971495, + "num_input_tokens_seen": 428960096, + "router_z_loss_mlp": 0.31469727, + "step": 5181, + "time_per_iteration": 2.9029834270477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062568, + "balance_loss_mlp": 1.03159761, + "epoch": 0.996921893035783, + "flos": 607101585408.0, + "grad_norm": 0.05702338558599981, + "language_loss": 0.76808369, + "learning_rate": 2.484679859793282e-08, + "loss": 0.77870935, + "num_input_tokens_seen": 429031296, + "router_z_loss_mlp": 0.30932617, + "step": 5182, + "time_per_iteration": 2.7194111347198486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061712, + "balance_loss_mlp": 1.02990723, + "epoch": 0.9971142747210465, + "flos": 643867550208.0, + "grad_norm": 0.06530592489398579, + "language_loss": 0.81833375, + "learning_rate": 2.183802848243488e-08, + "loss": 0.82895088, + "num_input_tokens_seen": 429103312, + "router_z_loss_mlp": 0.31787109, + "step": 5183, + "time_per_iteration": 2.7606563568115234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059793, + "balance_loss_mlp": 1.02877522, + "epoch": 0.9973066564063101, + "flos": 1040353251840.0, + "grad_norm": 0.04855677840468543, + "language_loss": 0.80962884, + "learning_rate": 1.9023367104187285e-08, + "loss": 0.82022679, + "num_input_tokens_seen": 429194896, + "router_z_loss_mlp": 0.30981445, + "step": 5184, + "time_per_iteration": 3.3332931995391846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062751, + "balance_loss_mlp": 1.03201878, + "epoch": 0.9974990380915737, + "flos": 664784368128.0, + "grad_norm": 0.0558770781712101, + "language_loss": 0.83045828, + "learning_rate": 1.640281555587153e-08, + "loss": 0.84108579, + "num_input_tokens_seen": 429267664, + "router_z_loss_mlp": 0.30688477, + "step": 5185, + "time_per_iteration": 2.8848559856414795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059374, + "balance_loss_mlp": 1.0282129, + "epoch": 0.9976914197768373, + "flos": 717808324608.0, + "grad_norm": 0.06124690550469667, + "language_loss": 0.77321869, + "learning_rate": 1.3976374855007024e-08, + "loss": 0.7838124, + "num_input_tokens_seen": 429343472, + "router_z_loss_mlp": 0.3112793, + "step": 5186, + "time_per_iteration": 2.8420236110687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060861, + "balance_loss_mlp": 1.02991474, + "epoch": 0.9978838014621008, + "flos": 518078481408.0, + "grad_norm": 0.05470349904981655, + "language_loss": 0.78897154, + "learning_rate": 1.1744045943451464e-08, + "loss": 0.79958016, + "num_input_tokens_seen": 429411472, + "router_z_loss_mlp": 0.30908203, + "step": 5187, + "time_per_iteration": 2.6191396713256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106246, + "balance_loss_mlp": 1.03110838, + "epoch": 0.9980761831473643, + "flos": 603138203136.0, + "grad_norm": 0.04602158415592968, + "language_loss": 0.83977401, + "learning_rate": 9.70582968801148e-09, + "loss": 0.8503986, + "num_input_tokens_seen": 429486704, + "router_z_loss_mlp": 0.31323242, + "step": 5188, + "time_per_iteration": 2.787299871444702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058324, + "balance_loss_mlp": 1.0267812, + "epoch": 0.9982685648326279, + "flos": 453291005952.0, + "grad_norm": 0.04960660439705578, + "language_loss": 0.89004576, + "learning_rate": 7.861726879943021e-09, + "loss": 0.90062904, + "num_input_tokens_seen": 429554736, + "router_z_loss_mlp": 0.31518555, + "step": 5189, + "time_per_iteration": 2.5919430255889893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057984, + "balance_loss_mlp": 1.02744257, + "epoch": 0.9984609465178915, + "flos": 481165539840.0, + "grad_norm": 0.06078734147645417, + "language_loss": 0.7877931, + "learning_rate": 6.211738235173403e-09, + "loss": 0.79837286, + "num_input_tokens_seen": 429623216, + "router_z_loss_mlp": 0.30493164, + "step": 5190, + "time_per_iteration": 2.6413121223449707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062175, + "balance_loss_mlp": 1.03120399, + "epoch": 0.9986533282031551, + "flos": 476675449344.0, + "grad_norm": 0.045308866149240234, + "language_loss": 0.84180099, + "learning_rate": 4.755864394301312e-09, + "loss": 0.85242271, + "num_input_tokens_seen": 429695808, + "router_z_loss_mlp": 0.30957031, + "step": 5191, + "time_per_iteration": 2.6326682567596436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062646, + "balance_loss_mlp": 1.03191352, + "epoch": 0.9988457098884186, + "flos": 641647236096.0, + "grad_norm": 0.0680731275372209, + "language_loss": 0.86416614, + "learning_rate": 3.494105922541291e-09, + "loss": 0.87479258, + "num_input_tokens_seen": 429774464, + "router_z_loss_mlp": 0.30688477, + "step": 5192, + "time_per_iteration": 2.7818009853363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064252, + "balance_loss_mlp": 1.03189921, + "epoch": 0.9990380915736822, + "flos": 396105818112.0, + "grad_norm": 0.057818769382579786, + "language_loss": 0.87863171, + "learning_rate": 2.4264633097237365e-09, + "loss": 0.88927424, + "num_input_tokens_seen": 429835872, + "router_z_loss_mlp": 0.32348633, + "step": 5193, + "time_per_iteration": 2.421710252761841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061611, + "balance_loss_mlp": 1.03099763, + "epoch": 0.9992304732589458, + "flos": 575831075328.0, + "grad_norm": 0.0533802000790901, + "language_loss": 0.84760511, + "learning_rate": 1.552936970405927e-09, + "loss": 0.85822117, + "num_input_tokens_seen": 429911440, + "router_z_loss_mlp": 0.3059082, + "step": 5194, + "time_per_iteration": 2.716845989227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059499, + "balance_loss_mlp": 1.02883863, + "epoch": 0.9994228549442093, + "flos": 544017890304.0, + "grad_norm": 0.060624333126130775, + "language_loss": 0.75329232, + "learning_rate": 8.735272437054853e-10, + "loss": 0.76388735, + "num_input_tokens_seen": 429982512, + "router_z_loss_mlp": 0.30615234, + "step": 5195, + "time_per_iteration": 2.649554967880249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106175, + "balance_loss_mlp": 1.03030252, + "epoch": 0.9996152366294728, + "flos": 1470777910272.0, + "grad_norm": 0.05155957143930501, + "language_loss": 0.80520171, + "learning_rate": 3.882343933003796e-10, + "loss": 0.8158192, + "num_input_tokens_seen": 430070944, + "router_z_loss_mlp": 0.31420898, + "step": 5196, + "time_per_iteration": 3.691424608230591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048694, + "balance_loss_mlp": 1.02315903, + "epoch": 0.9998076183147364, + "flos": 618667255296.0, + "grad_norm": 0.1131411540909355, + "language_loss": 0.70138329, + "learning_rate": 9.70586077619906e-11, + "loss": 0.71187025, + "num_input_tokens_seen": 430164864, + "router_z_loss_mlp": 0.25506592, + "step": 5197, + "time_per_iteration": 4.002978086471558 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01030251, + "balance_loss_mlp": 1.01343656, + "epoch": 1.0, + "flos": 1289959492608.0, + "grad_norm": 0.02359347763858496, + "language_loss": 0.84250998, + "learning_rate": 0.0, + "loss": 0.85281241, + "num_input_tokens_seen": 430340944, + "router_z_loss_mlp": 0.16827393, + "step": 5198, + "time_per_iteration": 5.848259449005127 + } + ], + "logging_steps": 1.0, + "max_steps": 5198, + "num_input_tokens_seen": 430340944, + "num_train_epochs": 1, + "save_steps": 1040, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.1713320035811328e+16, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/training_args.bin b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..459663e238ea62a90da439e633388cc1e16cedb6 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63f07a99639c8908760dc7ac65f4d34d749c1861fc4b5a1f91cbdcc73581ce9e +size 7992 diff --git a/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/zero_to_fp32.py b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/checkpoint-5198/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/sft_pretrain/Full_smoe_sharev3/config.json b/sft_pretrain/Full_smoe_sharev3/config.json new file mode 100644 index 0000000000000000000000000000000000000000..5dea851aa1a9a167af5c492c9f5acf44b7a0cc35 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/config.json @@ -0,0 +1,200 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bal_comp_loss_coef": 0.01, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": false, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": false, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "smoe_sharev3", + "norm_softmax": false, + "normalization": false, + "num_attention_heads": 32, + "num_experts": 8, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 4, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": null, + "rate_compete": 0.2, + "rate_flip": 0.05, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": false, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": true, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft_pretrain/Full_smoe_sharev3/generation_config.json b/sft_pretrain/Full_smoe_sharev3/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft_pretrain/Full_smoe_sharev3/model-00001-of-00002.safetensors b/sft_pretrain/Full_smoe_sharev3/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29d76f5d80605301aab2bba59b53a5e2582094c4 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe6c4f6ef38e8993629091331e0bbf23484cc88bdfd038f0dd17b6ec2800d855 +size 4972489328 diff --git a/sft_pretrain/Full_smoe_sharev3/model-00002-of-00002.safetensors b/sft_pretrain/Full_smoe_sharev3/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a5850e0e33fb7be370bb086085a9d3bf29450a73 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7df115f383ed31a5a9b7c49a81df7f755bfb40161e10c12560c7bc1dd60f2330 +size 3759020544 diff --git a/sft_pretrain/Full_smoe_sharev3/model.safetensors.index.json b/sft_pretrain/Full_smoe_sharev3/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..90a56cf0153ed9062ff35ee2b372f7ab9e796c6a --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/model.safetensors.index.json @@ -0,0 +1,672 @@ +{ + "metadata": { + "total_size": 8731420128 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.mm_projector.layer_norm.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.layer_norm.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00002-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/sft_pretrain/Full_smoe_sharev3/special_tokens_map.json b/sft_pretrain/Full_smoe_sharev3/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft_pretrain/Full_smoe_sharev3/tokenizer.model b/sft_pretrain/Full_smoe_sharev3/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft_pretrain/Full_smoe_sharev3/tokenizer_config.json b/sft_pretrain/Full_smoe_sharev3/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft_pretrain/Full_smoe_sharev3/trainer_state.json b/sft_pretrain/Full_smoe_sharev3/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..21b3cd05aa67a05cfdcd1081f3ace03f4e6103e4 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/trainer_state.json @@ -0,0 +1,78013 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 5198, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03964023, + "balance_loss_mlp": 3.01339984, + "epoch": 0.00019238168526356292, + "flos": 470464353792.0, + "grad_norm": 27.10233905437441, + "language_loss": 3.72295761, + "learning_rate": 0.0, + "loss": 2.48840261, + "num_input_tokens_seen": 67104, + "router_z_loss_mlp": 9.5, + "step": 1, + "time_per_iteration": 29.606513023376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01906453, + "balance_loss_mlp": 1.25642872, + "epoch": 0.00038476337052712584, + "flos": 504311436288.0, + "grad_norm": 2.874173750989579, + "language_loss": 1.79264998, + "learning_rate": 0.00013726078121135892, + "loss": 1.81171465, + "num_input_tokens_seen": 134080, + "router_z_loss_mlp": 6.5, + "step": 2, + "time_per_iteration": 2.7078208923339844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01915611, + "balance_loss_mlp": 1.2667315, + "epoch": 0.0005771450557906887, + "flos": 598869282816.0, + "grad_norm": 2.1141296462778643, + "language_loss": 1.61429811, + "learning_rate": 0.00021755319103969496, + "loss": 1.63345432, + "num_input_tokens_seen": 205152, + "router_z_loss_mlp": 6.48828125, + "step": 3, + "time_per_iteration": 3.010409116744995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01917319, + "balance_loss_mlp": 1.26309848, + "epoch": 0.0007695267410542517, + "flos": 580133491200.0, + "grad_norm": 1.255159247360545, + "language_loss": 1.49202251, + "learning_rate": 0.00027452156242271784, + "loss": 1.51119578, + "num_input_tokens_seen": 269664, + "router_z_loss_mlp": 6.54296875, + "step": 4, + "time_per_iteration": 2.7161622047424316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0185163, + "balance_loss_mlp": 1.22144234, + "epoch": 0.0009619084263178145, + "flos": 485857861632.0, + "grad_norm": 4.267520959606063, + "language_loss": 1.57359505, + "learning_rate": 0.0003187096642208417, + "loss": 1.59211147, + "num_input_tokens_seen": 338560, + "router_z_loss_mlp": 6.296875, + "step": 5, + "time_per_iteration": 2.718417167663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01828185, + "balance_loss_mlp": 1.21211123, + "epoch": 0.0011542901115813775, + "flos": 559744791552.0, + "grad_norm": 1.225349312557607, + "language_loss": 1.4752574, + "learning_rate": 0.0003548139722510539, + "loss": 1.49353933, + "num_input_tokens_seen": 410112, + "router_z_loss_mlp": 6.15234375, + "step": 6, + "time_per_iteration": 2.6827874183654785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01821666, + "balance_loss_mlp": 1.22428453, + "epoch": 0.0013466717968449403, + "flos": 533721014784.0, + "grad_norm": 0.5025899606895544, + "language_loss": 1.33846116, + "learning_rate": 0.00038533972973918044, + "loss": 1.35667801, + "num_input_tokens_seen": 477552, + "router_z_loss_mlp": 5.96875, + "step": 7, + "time_per_iteration": 2.6889517307281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01776667, + "balance_loss_mlp": 1.2090404, + "epoch": 0.0015390534821085034, + "flos": 492037175808.0, + "grad_norm": 0.1719820928967348, + "language_loss": 1.2814672, + "learning_rate": 0.0004117823436340768, + "loss": 1.29923391, + "num_input_tokens_seen": 549184, + "router_z_loss_mlp": 5.6875, + "step": 8, + "time_per_iteration": 2.7207248210906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0177577, + "balance_loss_mlp": 1.23217535, + "epoch": 0.0017314351673720662, + "flos": 564402207744.0, + "grad_norm": 0.6128716609675008, + "language_loss": 1.39861906, + "learning_rate": 0.00043510638207938993, + "loss": 1.41637683, + "num_input_tokens_seen": 622880, + "router_z_loss_mlp": 5.44140625, + "step": 9, + "time_per_iteration": 2.887538194656372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01823371, + "balance_loss_mlp": 1.31334615, + "epoch": 0.001923816852635629, + "flos": 593132308992.0, + "grad_norm": 0.480897383035181, + "language_loss": 1.25963569, + "learning_rate": 0.00045597044543220066, + "loss": 1.27786922, + "num_input_tokens_seen": 693584, + "router_z_loss_mlp": 5.09765625, + "step": 10, + "time_per_iteration": 2.7672832012176514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01930298, + "balance_loss_mlp": 1.44621277, + "epoch": 0.002116198537899192, + "flos": 609308752896.0, + "grad_norm": 0.21803247425844502, + "language_loss": 1.22959518, + "learning_rate": 0.00047484428652143135, + "loss": 1.24889803, + "num_input_tokens_seen": 774432, + "router_z_loss_mlp": 4.83203125, + "step": 11, + "time_per_iteration": 2.9771082401275635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02130152, + "balance_loss_mlp": 1.67772901, + "epoch": 0.002308580223162755, + "flos": 544869075456.0, + "grad_norm": 0.19847359144835577, + "language_loss": 1.28057694, + "learning_rate": 0.0004920747534624128, + "loss": 1.30187845, + "num_input_tokens_seen": 844304, + "router_z_loss_mlp": 4.52734375, + "step": 12, + "time_per_iteration": 2.6094090938568115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02177014, + "balance_loss_mlp": 1.7512939, + "epoch": 0.002500961908426318, + "flos": 644458277376.0, + "grad_norm": 0.3126355826019607, + "language_loss": 1.29235363, + "learning_rate": 0.0005079252465375872, + "loss": 1.31412375, + "num_input_tokens_seen": 915104, + "router_z_loss_mlp": 4.265625, + "step": 13, + "time_per_iteration": 2.841792345046997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02141221, + "balance_loss_mlp": 1.74411082, + "epoch": 0.0026933435936898806, + "flos": 487605312000.0, + "grad_norm": 0.282411779716686, + "language_loss": 1.17459798, + "learning_rate": 0.0005226005109505393, + "loss": 1.19601011, + "num_input_tokens_seen": 982720, + "router_z_loss_mlp": 3.96875, + "step": 14, + "time_per_iteration": 2.597313165664673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02024541, + "balance_loss_mlp": 1.65890288, + "epoch": 0.0028857252789534437, + "flos": 434368949760.0, + "grad_norm": 0.2583476739022616, + "language_loss": 1.22957516, + "learning_rate": 0.0005362628552605367, + "loss": 1.24982059, + "num_input_tokens_seen": 1050528, + "router_z_loss_mlp": 3.65234375, + "step": 15, + "time_per_iteration": 2.6388704776763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01790575, + "balance_loss_mlp": 1.44687057, + "epoch": 0.0030781069642170067, + "flos": 596465676288.0, + "grad_norm": 0.18613747071639053, + "language_loss": 1.27631426, + "learning_rate": 0.0005490431248454357, + "loss": 1.29421997, + "num_input_tokens_seen": 1116512, + "router_z_loss_mlp": 3.44140625, + "step": 16, + "time_per_iteration": 2.708346128463745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01779165, + "balance_loss_mlp": 1.46941185, + "epoch": 0.0032704886494805694, + "flos": 1537360432128.0, + "grad_norm": 0.2733785965407311, + "language_loss": 0.75705111, + "learning_rate": 0.0005610483427624225, + "loss": 0.77484274, + "num_input_tokens_seen": 1351216, + "router_z_loss_mlp": 3.09375, + "step": 17, + "time_per_iteration": 6.916250705718994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01553778, + "balance_loss_mlp": 1.24955583, + "epoch": 0.0034628703347441324, + "flos": 473720403456.0, + "grad_norm": 0.11658431553946913, + "language_loss": 1.14468098, + "learning_rate": 0.0005723671632907488, + "loss": 1.16021872, + "num_input_tokens_seen": 1420512, + "router_z_loss_mlp": 3.03710938, + "step": 18, + "time_per_iteration": 2.7716212272644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01490625, + "balance_loss_mlp": 1.21005416, + "epoch": 0.0036552520200076955, + "flos": 448303320576.0, + "grad_norm": 0.11552730485963776, + "language_loss": 1.19723654, + "learning_rate": 0.0005830738490244919, + "loss": 1.21214283, + "num_input_tokens_seen": 1484976, + "router_z_loss_mlp": 2.80859375, + "step": 19, + "time_per_iteration": 2.6067557334899902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0141948, + "balance_loss_mlp": 1.16103387, + "epoch": 0.003847633705271258, + "flos": 635881148928.0, + "grad_norm": 0.11977740619668105, + "language_loss": 1.21676993, + "learning_rate": 0.0005932312266435596, + "loss": 1.23096466, + "num_input_tokens_seen": 1557392, + "router_z_loss_mlp": 2.58398438, + "step": 20, + "time_per_iteration": 2.8545703887939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01364308, + "balance_loss_mlp": 1.13084817, + "epoch": 0.004040015390534821, + "flos": 589222771200.0, + "grad_norm": 0.09935322828728523, + "language_loss": 1.16681409, + "learning_rate": 0.0006028929207788754, + "loss": 1.18045723, + "num_input_tokens_seen": 1626064, + "router_z_loss_mlp": 2.33203125, + "step": 21, + "time_per_iteration": 2.7119524478912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319718, + "balance_loss_mlp": 1.11038613, + "epoch": 0.004232397075798384, + "flos": 756253338624.0, + "grad_norm": 0.09023283304690737, + "language_loss": 1.20250762, + "learning_rate": 0.0006121050677327902, + "loss": 1.21570492, + "num_input_tokens_seen": 1696528, + "router_z_loss_mlp": 2.09667969, + "step": 22, + "time_per_iteration": 2.884739398956299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01304467, + "balance_loss_mlp": 1.1184051, + "epoch": 0.004424778761061947, + "flos": 526434439680.0, + "grad_norm": 0.08559602389751407, + "language_loss": 1.10067201, + "learning_rate": 0.0006209076479463684, + "loss": 1.1137166, + "num_input_tokens_seen": 1765936, + "router_z_loss_mlp": 1.85839844, + "step": 23, + "time_per_iteration": 2.6616718769073486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01275434, + "balance_loss_mlp": 1.10787356, + "epoch": 0.00461716044632551, + "flos": 547907079168.0, + "grad_norm": 0.07141137445072718, + "language_loss": 1.2012924, + "learning_rate": 0.0006293355346737718, + "loss": 1.21404672, + "num_input_tokens_seen": 1841632, + "router_z_loss_mlp": 1.67675781, + "step": 24, + "time_per_iteration": 2.7025952339172363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252583, + "balance_loss_mlp": 1.10476315, + "epoch": 0.004809542131589073, + "flos": 567293234688.0, + "grad_norm": 0.08524381015789384, + "language_loss": 1.16738653, + "learning_rate": 0.0006374193284416834, + "loss": 1.17991233, + "num_input_tokens_seen": 1920256, + "router_z_loss_mlp": 1.47753906, + "step": 25, + "time_per_iteration": 2.827439069747925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223638, + "balance_loss_mlp": 1.0984205, + "epoch": 0.005001923816852636, + "flos": 470391418368.0, + "grad_norm": 0.08512374611478205, + "language_loss": 1.15399337, + "learning_rate": 0.0006451860277489461, + "loss": 1.16622972, + "num_input_tokens_seen": 1986528, + "router_z_loss_mlp": 1.25097656, + "step": 26, + "time_per_iteration": 2.6214864253997803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206141, + "balance_loss_mlp": 1.10009253, + "epoch": 0.005194305502116198, + "flos": 415283950080.0, + "grad_norm": 0.07774032731783902, + "language_loss": 1.23061514, + "learning_rate": 0.0006526595731190848, + "loss": 1.2426765, + "num_input_tokens_seen": 2048016, + "router_z_loss_mlp": 1.0625, + "step": 27, + "time_per_iteration": 2.5637125968933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117904, + "balance_loss_mlp": 1.09192181, + "epoch": 0.005386687187379761, + "flos": 628466535936.0, + "grad_norm": 0.05524077436438855, + "language_loss": 1.1626848, + "learning_rate": 0.0006598612921618983, + "loss": 1.17447519, + "num_input_tokens_seen": 2127664, + "router_z_loss_mlp": 0.87158203, + "step": 28, + "time_per_iteration": 2.8202784061431885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159441, + "balance_loss_mlp": 1.08772469, + "epoch": 0.005579068872643324, + "flos": 886100332032.0, + "grad_norm": 0.07386109802626846, + "language_loss": 1.08505416, + "learning_rate": 0.0006668102665011454, + "loss": 1.09664845, + "num_input_tokens_seen": 2213952, + "router_z_loss_mlp": 0.71728516, + "step": 29, + "time_per_iteration": 3.2254040241241455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154142, + "balance_loss_mlp": 1.09520459, + "epoch": 0.005771450557906887, + "flos": 547287238656.0, + "grad_norm": 0.0797557646441396, + "language_loss": 1.18077409, + "learning_rate": 0.0006735236364718957, + "loss": 1.19231534, + "num_input_tokens_seen": 2284736, + "router_z_loss_mlp": 0.58886719, + "step": 30, + "time_per_iteration": 2.6730945110321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140737, + "balance_loss_mlp": 1.09384, + "epoch": 0.00596383224317045, + "flos": 531766950912.0, + "grad_norm": 0.060827451674393726, + "language_loss": 1.1687839, + "learning_rate": 0.0006800168558381346, + "loss": 1.18019128, + "num_input_tokens_seen": 2354384, + "router_z_loss_mlp": 0.46875, + "step": 31, + "time_per_iteration": 2.649216651916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148736, + "balance_loss_mlp": 1.11166239, + "epoch": 0.0061562139284340135, + "flos": 588813926400.0, + "grad_norm": 0.10592463777190406, + "language_loss": 1.19211543, + "learning_rate": 0.0006863039060567947, + "loss": 1.20360279, + "num_input_tokens_seen": 2419440, + "router_z_loss_mlp": 0.37084961, + "step": 32, + "time_per_iteration": 2.6697018146514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132499, + "balance_loss_mlp": 1.10136151, + "epoch": 0.006348595613697576, + "flos": 617929551360.0, + "grad_norm": 0.09812744917576391, + "language_loss": 1.1217525, + "learning_rate": 0.0006923974775611263, + "loss": 1.13307738, + "num_input_tokens_seen": 2496368, + "router_z_loss_mlp": 0.3112793, + "step": 33, + "time_per_iteration": 2.770225763320923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137532, + "balance_loss_mlp": 1.11146092, + "epoch": 0.006540977298961139, + "flos": 777564444672.0, + "grad_norm": 0.06513543096417564, + "language_loss": 1.08375585, + "learning_rate": 0.0006983091239737814, + "loss": 1.09513116, + "num_input_tokens_seen": 2573280, + "router_z_loss_mlp": 0.26086426, + "step": 34, + "time_per_iteration": 2.99418306350708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128276, + "balance_loss_mlp": 1.10578084, + "epoch": 0.006733358984224702, + "flos": 666837356544.0, + "grad_norm": 0.06344935516817307, + "language_loss": 1.07062221, + "learning_rate": 0.0007040493939600222, + "loss": 1.08190489, + "num_input_tokens_seen": 2647248, + "router_z_loss_mlp": 0.22497559, + "step": 35, + "time_per_iteration": 2.9126057624816895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119708, + "balance_loss_mlp": 1.09892988, + "epoch": 0.006925740669488265, + "flos": 564092287488.0, + "grad_norm": 0.06579143759664555, + "language_loss": 1.07960629, + "learning_rate": 0.0007096279445021078, + "loss": 1.09080338, + "num_input_tokens_seen": 2720736, + "router_z_loss_mlp": 0.20788574, + "step": 36, + "time_per_iteration": 2.7079102993011475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114855, + "balance_loss_mlp": 1.09574544, + "epoch": 0.007118122354751828, + "flos": 549583156224.0, + "grad_norm": 0.14799474820221378, + "language_loss": 1.14634764, + "learning_rate": 0.0007150536386503726, + "loss": 1.15749621, + "num_input_tokens_seen": 2800336, + "router_z_loss_mlp": 0.19104004, + "step": 37, + "time_per_iteration": 2.8290467262268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104011, + "balance_loss_mlp": 1.08533084, + "epoch": 0.007310504040015391, + "flos": 702161409024.0, + "grad_norm": 0.2513092385422617, + "language_loss": 1.08396375, + "learning_rate": 0.0007203346302358509, + "loss": 1.09500384, + "num_input_tokens_seen": 2883184, + "router_z_loss_mlp": 0.18688965, + "step": 38, + "time_per_iteration": 2.961430311203003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121274, + "balance_loss_mlp": 1.10231924, + "epoch": 0.007502885725278953, + "flos": 599022051840.0, + "grad_norm": 0.0999674626629785, + "language_loss": 1.11391926, + "learning_rate": 0.000725478437577282, + "loss": 1.12513208, + "num_input_tokens_seen": 2960736, + "router_z_loss_mlp": 0.18945312, + "step": 39, + "time_per_iteration": 2.742088556289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146989, + "balance_loss_mlp": 1.12810588, + "epoch": 0.007695267410542516, + "flos": 560000867328.0, + "grad_norm": 0.3323772184023467, + "language_loss": 1.08355689, + "learning_rate": 0.0007304920078549186, + "loss": 1.09502685, + "num_input_tokens_seen": 3033472, + "router_z_loss_mlp": 0.18884277, + "step": 40, + "time_per_iteration": 2.66943621635437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116486, + "balance_loss_mlp": 1.1452378, + "epoch": 0.007887649095806078, + "flos": 507906671616.0, + "grad_norm": 0.11539272036457353, + "language_loss": 1.09356606, + "learning_rate": 0.0007353817735343603, + "loss": 1.1052146, + "num_input_tokens_seen": 3107824, + "router_z_loss_mlp": 0.19604492, + "step": 41, + "time_per_iteration": 2.7052595615386963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132998, + "balance_loss_mlp": 1.11293542, + "epoch": 0.008080030781069641, + "flos": 503642133504.0, + "grad_norm": 0.12251683576194117, + "language_loss": 1.04851842, + "learning_rate": 0.0007401537019902344, + "loss": 1.05984843, + "num_input_tokens_seen": 3176528, + "router_z_loss_mlp": 0.20056152, + "step": 42, + "time_per_iteration": 2.590432643890381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124507, + "balance_loss_mlp": 1.10198867, + "epoch": 0.008272412466333205, + "flos": 517764178944.0, + "grad_norm": 0.09393858903586973, + "language_loss": 1.08539796, + "learning_rate": 0.0007448133392900729, + "loss": 1.09664297, + "num_input_tokens_seen": 3254256, + "router_z_loss_mlp": 0.22521973, + "step": 43, + "time_per_iteration": 2.6619081497192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112544, + "balance_loss_mlp": 1.10156202, + "epoch": 0.008464794151596768, + "flos": 607673373696.0, + "grad_norm": 0.06822323064374927, + "language_loss": 1.03845203, + "learning_rate": 0.0007493658489441491, + "loss": 1.04970646, + "num_input_tokens_seen": 3340224, + "router_z_loss_mlp": 0.23864746, + "step": 44, + "time_per_iteration": 2.861008644104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128905, + "balance_loss_mlp": 1.10477662, + "epoch": 0.00865717583686033, + "flos": 537661075968.0, + "grad_norm": 0.1413166066405165, + "language_loss": 1.08820629, + "learning_rate": 0.0007538160463002316, + "loss": 1.09949529, + "num_input_tokens_seen": 3409216, + "router_z_loss_mlp": 0.2409668, + "step": 45, + "time_per_iteration": 2.643458604812622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115676, + "balance_loss_mlp": 1.13258433, + "epoch": 0.008849557522123894, + "flos": 507758284800.0, + "grad_norm": 0.08570115972640321, + "language_loss": 1.10720444, + "learning_rate": 0.0007581684291577274, + "loss": 1.11877203, + "num_input_tokens_seen": 3478352, + "router_z_loss_mlp": 0.24157715, + "step": 46, + "time_per_iteration": 2.5904788970947266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145761, + "balance_loss_mlp": 1.12085772, + "epoch": 0.009041939207387457, + "flos": 625048800768.0, + "grad_norm": 0.06636849455276843, + "language_loss": 1.14156199, + "learning_rate": 0.0007624272050891776, + "loss": 1.15301955, + "num_input_tokens_seen": 3555616, + "router_z_loss_mlp": 0.24902344, + "step": 47, + "time_per_iteration": 2.782179594039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154374, + "balance_loss_mlp": 1.12759995, + "epoch": 0.00923432089265102, + "flos": 549124849152.0, + "grad_norm": 0.09356522507451794, + "language_loss": 1.04615343, + "learning_rate": 0.0007665963158851307, + "loss": 1.05769718, + "num_input_tokens_seen": 3634512, + "router_z_loss_mlp": 0.26806641, + "step": 48, + "time_per_iteration": 2.824540138244629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174661, + "balance_loss_mlp": 1.14738548, + "epoch": 0.009426702577914583, + "flos": 562202242560.0, + "grad_norm": 0.059100241584136314, + "language_loss": 1.12381458, + "learning_rate": 0.0007706794594783609, + "loss": 1.13556111, + "num_input_tokens_seen": 3708480, + "router_z_loss_mlp": 0.27270508, + "step": 49, + "time_per_iteration": 2.790757894515991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192673, + "balance_loss_mlp": 1.16604137, + "epoch": 0.009619084263178146, + "flos": 616486228992.0, + "grad_norm": 0.08074806779925832, + "language_loss": 1.11280799, + "learning_rate": 0.0007746801096530423, + "loss": 1.12473488, + "num_input_tokens_seen": 3783472, + "router_z_loss_mlp": 0.2668457, + "step": 50, + "time_per_iteration": 2.7235305309295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116178, + "balance_loss_mlp": 1.135149, + "epoch": 0.009811465948441709, + "flos": 541176325632.0, + "grad_norm": 0.06558886342971224, + "language_loss": 1.16576111, + "learning_rate": 0.0007786015338021173, + "loss": 1.17737889, + "num_input_tokens_seen": 3851360, + "router_z_loss_mlp": 0.26672363, + "step": 51, + "time_per_iteration": 2.6817519664764404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134628, + "balance_loss_mlp": 1.1085453, + "epoch": 0.010003847633705272, + "flos": 535608087552.0, + "grad_norm": 0.06210449580458492, + "language_loss": 1.08870959, + "learning_rate": 0.0007824468089603051, + "loss": 1.10005593, + "num_input_tokens_seen": 3923056, + "router_z_loss_mlp": 0.26098633, + "step": 52, + "time_per_iteration": 2.644577980041504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125522, + "balance_loss_mlp": 1.09910512, + "epoch": 0.010196229318968833, + "flos": 908867907072.0, + "grad_norm": 0.05864822926220488, + "language_loss": 1.07807887, + "learning_rate": 0.0007862188363098669, + "loss": 1.08933413, + "num_input_tokens_seen": 4004528, + "router_z_loss_mlp": 0.26428223, + "step": 53, + "time_per_iteration": 3.1450047492980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126237, + "balance_loss_mlp": 1.10084558, + "epoch": 0.010388611004232396, + "flos": 585594040320.0, + "grad_norm": 0.07974065634267835, + "language_loss": 1.08295977, + "learning_rate": 0.0007899203543304438, + "loss": 1.09422219, + "num_input_tokens_seen": 4078704, + "router_z_loss_mlp": 0.25390625, + "step": 54, + "time_per_iteration": 2.6822280883789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155972, + "balance_loss_mlp": 1.13315582, + "epoch": 0.01058099268949596, + "flos": 502233716736.0, + "grad_norm": 0.07014139109577967, + "language_loss": 1.22212756, + "learning_rate": 0.0007935539507422731, + "loss": 1.23368728, + "num_input_tokens_seen": 4143600, + "router_z_loss_mlp": 0.22814941, + "step": 55, + "time_per_iteration": 2.5841405391693115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117516, + "balance_loss_mlp": 1.153512, + "epoch": 0.010773374374759523, + "flos": 544170659328.0, + "grad_norm": 0.07006342440897594, + "language_loss": 1.13914931, + "learning_rate": 0.0007971220733732573, + "loss": 1.15090084, + "num_input_tokens_seen": 4217904, + "router_z_loss_mlp": 0.21643066, + "step": 56, + "time_per_iteration": 2.697427988052368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193099, + "balance_loss_mlp": 1.17267895, + "epoch": 0.010965756060023086, + "flos": 525874235904.0, + "grad_norm": 0.08125896119424647, + "language_loss": 1.0764755, + "learning_rate": 0.0008006270400641869, + "loss": 1.08840656, + "num_input_tokens_seen": 4293920, + "router_z_loss_mlp": 0.2043457, + "step": 57, + "time_per_iteration": 2.723154306411743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174019, + "balance_loss_mlp": 1.15412247, + "epoch": 0.011158137745286649, + "flos": 576653147136.0, + "grad_norm": 0.07485866075688756, + "language_loss": 1.09104013, + "learning_rate": 0.0008040710477125043, + "loss": 1.10278034, + "num_input_tokens_seen": 4370080, + "router_z_loss_mlp": 0.19897461, + "step": 58, + "time_per_iteration": 2.703186273574829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153983, + "balance_loss_mlp": 1.13440859, + "epoch": 0.011350519430550212, + "flos": 529024310784.0, + "grad_norm": 0.06764829366941465, + "language_loss": 1.09780312, + "learning_rate": 0.0008074561805429771, + "loss": 1.10934305, + "num_input_tokens_seen": 4439792, + "router_z_loss_mlp": 0.19567871, + "step": 59, + "time_per_iteration": 2.6111674308776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136624, + "balance_loss_mlp": 1.11676335, + "epoch": 0.011542901115813775, + "flos": 555608291328.0, + "grad_norm": 0.06986870516034673, + "language_loss": 1.08079648, + "learning_rate": 0.0008107844176832545, + "loss": 1.09216261, + "num_input_tokens_seen": 4510800, + "router_z_loss_mlp": 0.19848633, + "step": 60, + "time_per_iteration": 2.682687997817993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125651, + "balance_loss_mlp": 1.1056236, + "epoch": 0.011735282801077338, + "flos": 571826995200.0, + "grad_norm": 0.061548073586970495, + "language_loss": 1.09071934, + "learning_rate": 0.0008140576401132568, + "loss": 1.10197592, + "num_input_tokens_seen": 4581136, + "router_z_loss_mlp": 0.20019531, + "step": 61, + "time_per_iteration": 2.639394760131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111743, + "balance_loss_mlp": 1.09838021, + "epoch": 0.0119276644863409, + "flos": 615309156864.0, + "grad_norm": 0.06273761556608791, + "language_loss": 1.10558033, + "learning_rate": 0.0008172776370494935, + "loss": 1.11675453, + "num_input_tokens_seen": 4650352, + "router_z_loss_mlp": 0.19030762, + "step": 62, + "time_per_iteration": 2.7110230922698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134294, + "balance_loss_mlp": 1.11483955, + "epoch": 0.012120046171604464, + "flos": 500835474432.0, + "grad_norm": 0.07391589684249159, + "language_loss": 1.17346644, + "learning_rate": 0.0008204461118185703, + "loss": 1.18480933, + "num_input_tokens_seen": 4716336, + "router_z_loss_mlp": 0.19445801, + "step": 63, + "time_per_iteration": 2.5490689277648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142708, + "balance_loss_mlp": 1.12420678, + "epoch": 0.012312427856868027, + "flos": 473109327360.0, + "grad_norm": 0.05825974543220343, + "language_loss": 1.06081367, + "learning_rate": 0.0008235646872681536, + "loss": 1.07224083, + "num_input_tokens_seen": 4781648, + "router_z_loss_mlp": 0.18505859, + "step": 64, + "time_per_iteration": 2.5874247550964355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139504, + "balance_loss_mlp": 1.12069249, + "epoch": 0.012504809542131588, + "flos": 538094651904.0, + "grad_norm": 0.1040066778051144, + "language_loss": 1.06503749, + "learning_rate": 0.0008266349107584288, + "loss": 1.07643247, + "num_input_tokens_seen": 4852320, + "router_z_loss_mlp": 0.18823242, + "step": 65, + "time_per_iteration": 2.678736925125122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123492, + "balance_loss_mlp": 1.10500288, + "epoch": 0.012697191227395151, + "flos": 608450365440.0, + "grad_norm": 0.09066354406474254, + "language_loss": 1.09410381, + "learning_rate": 0.0008296582587724851, + "loss": 1.10533869, + "num_input_tokens_seen": 4922016, + "router_z_loss_mlp": 0.18481445, + "step": 66, + "time_per_iteration": 2.6937255859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121105, + "balance_loss_mlp": 1.10255599, + "epoch": 0.012889572912658714, + "flos": 767750607360.0, + "grad_norm": 0.11790618145169461, + "language_loss": 1.07982886, + "learning_rate": 0.0008326361411800136, + "loss": 1.0910399, + "num_input_tokens_seen": 5000128, + "router_z_loss_mlp": 0.1854248, + "step": 67, + "time_per_iteration": 2.9377663135528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096346, + "balance_loss_mlp": 1.07871521, + "epoch": 0.013081954597922277, + "flos": 533604561408.0, + "grad_norm": 0.09153807632987658, + "language_loss": 1.08335972, + "learning_rate": 0.0008355699051851403, + "loss": 1.09432316, + "num_input_tokens_seen": 5074512, + "router_z_loss_mlp": 0.17651367, + "step": 68, + "time_per_iteration": 2.7278473377227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104023, + "balance_loss_mlp": 1.0865227, + "epoch": 0.01327433628318584, + "flos": 572826567168.0, + "grad_norm": 0.08317322449907456, + "language_loss": 1.14837921, + "learning_rate": 0.0008384608389860635, + "loss": 1.15941942, + "num_input_tokens_seen": 5141856, + "router_z_loss_mlp": 0.1751709, + "step": 69, + "time_per_iteration": 2.7238211631774902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111418, + "balance_loss_mlp": 1.09424019, + "epoch": 0.013466717968449404, + "flos": 497029243392.0, + "grad_norm": 0.08213812906773327, + "language_loss": 1.04970825, + "learning_rate": 0.000841310175171381, + "loss": 1.06082237, + "num_input_tokens_seen": 5209280, + "router_z_loss_mlp": 0.17199707, + "step": 70, + "time_per_iteration": 2.578726291656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101987, + "balance_loss_mlp": 1.08526158, + "epoch": 0.013659099653712967, + "flos": 565234454016.0, + "grad_norm": 0.06358988870017376, + "language_loss": 1.03380442, + "learning_rate": 0.000844119093875517, + "loss": 1.04482436, + "num_input_tokens_seen": 5285424, + "router_z_loss_mlp": 0.16723633, + "step": 71, + "time_per_iteration": 2.692791223526001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103533, + "balance_loss_mlp": 1.08689094, + "epoch": 0.01385148133897653, + "flos": 573540950016.0, + "grad_norm": 0.07461407963015444, + "language_loss": 1.08098376, + "learning_rate": 0.0008468887257134666, + "loss": 1.09201908, + "num_input_tokens_seen": 5358624, + "router_z_loss_mlp": 0.16650391, + "step": 72, + "time_per_iteration": 2.6599459648132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122889, + "balance_loss_mlp": 1.10587776, + "epoch": 0.014043863024240093, + "flos": 576539665920.0, + "grad_norm": 0.05931650266846123, + "language_loss": 1.10316896, + "learning_rate": 0.0008496201545131264, + "loss": 1.11439776, + "num_input_tokens_seen": 5429792, + "router_z_loss_mlp": 0.17028809, + "step": 73, + "time_per_iteration": 2.7093684673309326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126213, + "balance_loss_mlp": 1.10950017, + "epoch": 0.014236244709503656, + "flos": 938287660032.0, + "grad_norm": 0.060718352480344094, + "language_loss": 1.08902812, + "learning_rate": 0.0008523144198617317, + "loss": 1.1002903, + "num_input_tokens_seen": 5518608, + "router_z_loss_mlp": 0.16711426, + "step": 74, + "time_per_iteration": 3.1743276119232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125614, + "balance_loss_mlp": 1.10876918, + "epoch": 0.014428626394767219, + "flos": 528231352320.0, + "grad_norm": 0.07198154728214846, + "language_loss": 1.08249164, + "learning_rate": 0.0008549725194813783, + "loss": 1.09374774, + "num_input_tokens_seen": 5590576, + "router_z_loss_mlp": 0.1685791, + "step": 75, + "time_per_iteration": 2.630387783050537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106727, + "balance_loss_mlp": 1.09047866, + "epoch": 0.014621008080030782, + "flos": 803371433472.0, + "grad_norm": 0.07553700512989577, + "language_loss": 1.06998253, + "learning_rate": 0.0008575954114472099, + "loss": 1.0810498, + "num_input_tokens_seen": 5674224, + "router_z_loss_mlp": 0.16247559, + "step": 76, + "time_per_iteration": 3.134385347366333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109532, + "balance_loss_mlp": 1.0933075, + "epoch": 0.014813389765294343, + "flos": 696588788736.0, + "grad_norm": 0.053440596513601155, + "language_loss": 1.05069363, + "learning_rate": 0.0008601840162606118, + "loss": 1.06178904, + "num_input_tokens_seen": 5757648, + "router_z_loss_mlp": 0.16223145, + "step": 77, + "time_per_iteration": 3.039991855621338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123171, + "balance_loss_mlp": 1.10660076, + "epoch": 0.015005771450557906, + "flos": 596702813184.0, + "grad_norm": 0.07894951514499118, + "language_loss": 1.1143651, + "learning_rate": 0.000862739218788641, + "loss": 1.12559676, + "num_input_tokens_seen": 5837600, + "router_z_loss_mlp": 0.16577148, + "step": 78, + "time_per_iteration": 2.867741346359253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113228, + "balance_loss_mlp": 1.11553121, + "epoch": 0.01519815313582147, + "flos": 549148170240.0, + "grad_norm": 0.0893413961860561, + "language_loss": 1.07743871, + "learning_rate": 0.0008652618700799138, + "loss": 1.08876157, + "num_input_tokens_seen": 5907248, + "router_z_loss_mlp": 0.16760254, + "step": 79, + "time_per_iteration": 2.675795555114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160152, + "balance_loss_mlp": 1.14348662, + "epoch": 0.015390534821085032, + "flos": 430306642944.0, + "grad_norm": 0.06679936706529424, + "language_loss": 1.07125092, + "learning_rate": 0.0008677527890662774, + "loss": 1.08285248, + "num_input_tokens_seen": 5970864, + "router_z_loss_mlp": 0.16662598, + "step": 80, + "time_per_iteration": 2.4765963554382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196819, + "balance_loss_mlp": 1.17889023, + "epoch": 0.015582916506348595, + "flos": 523854743040.0, + "grad_norm": 0.12362960542988827, + "language_loss": 1.09903598, + "learning_rate": 0.0008702127641587799, + "loss": 1.11100423, + "num_input_tokens_seen": 6040800, + "router_z_loss_mlp": 0.17932129, + "step": 81, + "time_per_iteration": 2.636688470840454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180455, + "balance_loss_mlp": 1.16288388, + "epoch": 0.015775298191612157, + "flos": 575151598080.0, + "grad_norm": 0.08274533442322421, + "language_loss": 1.04032063, + "learning_rate": 0.0008726425547457192, + "loss": 1.05212522, + "num_input_tokens_seen": 6111840, + "router_z_loss_mlp": 0.17565918, + "step": 82, + "time_per_iteration": 2.765179395675659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157804, + "balance_loss_mlp": 1.14051914, + "epoch": 0.01596767987687572, + "flos": 610040664576.0, + "grad_norm": 0.07618339381967684, + "language_loss": 1.03921247, + "learning_rate": 0.0008750428925998964, + "loss": 1.05079055, + "num_input_tokens_seen": 6183872, + "router_z_loss_mlp": 0.1730957, + "step": 83, + "time_per_iteration": 2.7615418434143066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159673, + "balance_loss_mlp": 1.14280462, + "epoch": 0.016160061562139283, + "flos": 566864040960.0, + "grad_norm": 0.0706757922791228, + "language_loss": 1.09743476, + "learning_rate": 0.0008774144832015932, + "loss": 1.10903156, + "num_input_tokens_seen": 6255760, + "router_z_loss_mlp": 0.16882324, + "step": 84, + "time_per_iteration": 2.694364070892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01699218, + "balance_loss_mlp": 1.68252861, + "epoch": 0.016352443247402846, + "flos": 1410557234688.0, + "grad_norm": 0.23342967148410274, + "language_loss": 0.74774313, + "learning_rate": 0.0008797580069832641, + "loss": 0.76473522, + "num_input_tokens_seen": 6472960, + "router_z_loss_mlp": 0.16699219, + "step": 85, + "time_per_iteration": 4.599137306213379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212855, + "balance_loss_mlp": 1.19580793, + "epoch": 0.01654482493266641, + "flos": 730177127424.0, + "grad_norm": 0.09253845479208671, + "language_loss": 1.04518116, + "learning_rate": 0.0008820741205014318, + "loss": 1.05730963, + "num_input_tokens_seen": 6548912, + "router_z_loss_mlp": 0.1706543, + "step": 86, + "time_per_iteration": 2.8595266342163086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01246652, + "balance_loss_mlp": 1.22939014, + "epoch": 0.016737206617929972, + "flos": 536016932352.0, + "grad_norm": 0.10044068584300966, + "language_loss": 1.06437612, + "learning_rate": 0.0008843634575408404, + "loss": 1.07684278, + "num_input_tokens_seen": 6621520, + "router_z_loss_mlp": 0.17248535, + "step": 87, + "time_per_iteration": 2.690492630004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215448, + "balance_loss_mlp": 1.19887805, + "epoch": 0.016929588303193535, + "flos": 536706584064.0, + "grad_norm": 0.0661610487366718, + "language_loss": 1.07674646, + "learning_rate": 0.0008866266301555082, + "loss": 1.08890104, + "num_input_tokens_seen": 6698432, + "router_z_loss_mlp": 0.16577148, + "step": 88, + "time_per_iteration": 2.737339496612549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203027, + "balance_loss_mlp": 1.18706512, + "epoch": 0.017121969988457098, + "flos": 526498458624.0, + "grad_norm": 0.07897226836222233, + "language_loss": 1.08543992, + "learning_rate": 0.0008888642296509615, + "loss": 1.09747016, + "num_input_tokens_seen": 6764336, + "router_z_loss_mlp": 0.1595459, + "step": 89, + "time_per_iteration": 2.576819658279419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187655, + "balance_loss_mlp": 1.17131162, + "epoch": 0.01731435167372066, + "flos": 625304876544.0, + "grad_norm": 0.0740353605135553, + "language_loss": 1.13367987, + "learning_rate": 0.0008910768275115906, + "loss": 1.14555645, + "num_input_tokens_seen": 6839392, + "router_z_loss_mlp": 0.16345215, + "step": 90, + "time_per_iteration": 2.778571128845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173459, + "balance_loss_mlp": 1.15692425, + "epoch": 0.017506733358984224, + "flos": 496157709312.0, + "grad_norm": 0.07518713147028631, + "language_loss": 1.08794332, + "learning_rate": 0.0008932649762767675, + "loss": 1.0996778, + "num_input_tokens_seen": 6907344, + "router_z_loss_mlp": 0.16540527, + "step": 91, + "time_per_iteration": 2.5931665897369385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185544, + "balance_loss_mlp": 1.16881919, + "epoch": 0.017699115044247787, + "flos": 745613047296.0, + "grad_norm": 0.07711429280558382, + "language_loss": 1.11576343, + "learning_rate": 0.0008954292103690864, + "loss": 1.12761879, + "num_input_tokens_seen": 6982464, + "router_z_loss_mlp": 0.1673584, + "step": 92, + "time_per_iteration": 2.9129488468170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194769, + "balance_loss_mlp": 1.17854476, + "epoch": 0.01789149672951135, + "flos": 515257265664.0, + "grad_norm": 0.0669718610224715, + "language_loss": 1.1343056, + "learning_rate": 0.0008975700468778296, + "loss": 1.14625335, + "num_input_tokens_seen": 7049712, + "router_z_loss_mlp": 0.16223145, + "step": 93, + "time_per_iteration": 2.576620101928711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216953, + "balance_loss_mlp": 1.20076382, + "epoch": 0.018083878414774913, + "flos": 585850116096.0, + "grad_norm": 0.11698648494194364, + "language_loss": 1.0652318, + "learning_rate": 0.0008996879863005366, + "loss": 1.07740128, + "num_input_tokens_seen": 7120288, + "router_z_loss_mlp": 0.16186523, + "step": 94, + "time_per_iteration": 2.6751108169555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217025, + "balance_loss_mlp": 1.2013253, + "epoch": 0.018276260100038477, + "flos": 497103436800.0, + "grad_norm": 0.08327491501556071, + "language_loss": 1.06208014, + "learning_rate": 0.0009017835132453337, + "loss": 1.07425046, + "num_input_tokens_seen": 7188896, + "router_z_loss_mlp": 0.15686035, + "step": 95, + "time_per_iteration": 2.5971803665161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196463, + "balance_loss_mlp": 1.1804409, + "epoch": 0.01846864178530204, + "flos": 639765955584.0, + "grad_norm": 0.09756000368948786, + "language_loss": 1.06920743, + "learning_rate": 0.0009038570970964896, + "loss": 1.08117199, + "num_input_tokens_seen": 7259536, + "router_z_loss_mlp": 0.16027832, + "step": 96, + "time_per_iteration": 2.7428832054138184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173361, + "balance_loss_mlp": 1.15723228, + "epoch": 0.018661023470565603, + "flos": 511411746816.0, + "grad_norm": 0.07053433913024812, + "language_loss": 1.04343212, + "learning_rate": 0.0009059091926454854, + "loss": 1.05516577, + "num_input_tokens_seen": 7326752, + "router_z_loss_mlp": 0.16125488, + "step": 97, + "time_per_iteration": 2.570509433746338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178324, + "balance_loss_mlp": 1.16246903, + "epoch": 0.018853405155829166, + "flos": 930710103552.0, + "grad_norm": 0.08767892767743933, + "language_loss": 1.03389072, + "learning_rate": 0.0009079402406897198, + "loss": 1.04567385, + "num_input_tokens_seen": 7417488, + "router_z_loss_mlp": 0.15844727, + "step": 98, + "time_per_iteration": 3.202298164367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179075, + "balance_loss_mlp": 1.16296983, + "epoch": 0.01904578684109273, + "flos": 576209396736.0, + "grad_norm": 0.2639136557883628, + "language_loss": 1.0596242, + "learning_rate": 0.0009099506686008212, + "loss": 1.07141495, + "num_input_tokens_seen": 7493136, + "router_z_loss_mlp": 0.16101074, + "step": 99, + "time_per_iteration": 2.812368869781494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139923, + "balance_loss_mlp": 1.12423468, + "epoch": 0.019238168526356292, + "flos": 558173431296.0, + "grad_norm": 0.12311670746354397, + "language_loss": 1.08180976, + "learning_rate": 0.0009119408908644013, + "loss": 1.09320903, + "num_input_tokens_seen": 7560896, + "router_z_loss_mlp": 0.15673828, + "step": 100, + "time_per_iteration": 2.7063775062561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150815, + "balance_loss_mlp": 1.13574743, + "epoch": 0.019430550211619855, + "flos": 723539506176.0, + "grad_norm": 0.12127606313133317, + "language_loss": 1.14121008, + "learning_rate": 0.0009139113095929519, + "loss": 1.15271831, + "num_input_tokens_seen": 7629040, + "router_z_loss_mlp": 0.15039062, + "step": 101, + "time_per_iteration": 2.840913772583008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218173, + "balance_loss_mlp": 1.20243776, + "epoch": 0.019622931896883418, + "flos": 499235000832.0, + "grad_norm": 0.1104247345061639, + "language_loss": 1.0836457, + "learning_rate": 0.0009158623150134762, + "loss": 1.09582746, + "num_input_tokens_seen": 7694256, + "router_z_loss_mlp": 0.15722656, + "step": 102, + "time_per_iteration": 2.560464859008789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01357908, + "balance_loss_mlp": 1.34204173, + "epoch": 0.01981531358214698, + "flos": 508916418048.0, + "grad_norm": 0.15164768975642337, + "language_loss": 1.07661259, + "learning_rate": 0.000917794285931332, + "loss": 1.0901916, + "num_input_tokens_seen": 7762256, + "router_z_loss_mlp": 0.15856934, + "step": 103, + "time_per_iteration": 2.6684353351593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01381196, + "balance_loss_mlp": 1.36572242, + "epoch": 0.020007695267410544, + "flos": 521087371776.0, + "grad_norm": 0.10342928287682196, + "language_loss": 0.9971087, + "learning_rate": 0.0009197075901716639, + "loss": 1.01092052, + "num_input_tokens_seen": 7834400, + "router_z_loss_mlp": 0.15454102, + "step": 104, + "time_per_iteration": 2.7250871658325195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01356986, + "balance_loss_mlp": 1.34017777, + "epoch": 0.020200076952674107, + "flos": 533013834240.0, + "grad_norm": 0.1824265866479698, + "language_loss": 1.09647703, + "learning_rate": 0.0009216025849997171, + "loss": 1.11004686, + "num_input_tokens_seen": 7911184, + "router_z_loss_mlp": 0.16809082, + "step": 105, + "time_per_iteration": 2.776764154434204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01261961, + "balance_loss_mlp": 1.24583197, + "epoch": 0.020392458637937667, + "flos": 684430981632.0, + "grad_norm": 0.06376163654280764, + "language_loss": 1.0425086, + "learning_rate": 0.0009234796175212258, + "loss": 1.05512834, + "num_input_tokens_seen": 7985280, + "router_z_loss_mlp": 0.16125488, + "step": 106, + "time_per_iteration": 2.9174978733062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01269614, + "balance_loss_mlp": 1.25201869, + "epoch": 0.02058484032320123, + "flos": 701791852032.0, + "grad_norm": 0.060044663360548714, + "language_loss": 1.08808422, + "learning_rate": 0.000925339025064007, + "loss": 1.10078037, + "num_input_tokens_seen": 8068320, + "router_z_loss_mlp": 0.17590332, + "step": 107, + "time_per_iteration": 2.975735902786255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01324579, + "balance_loss_mlp": 1.30547023, + "epoch": 0.020777222008464793, + "flos": 638772175872.0, + "grad_norm": 0.12680512225677842, + "language_loss": 1.01262307, + "learning_rate": 0.0009271811355418027, + "loss": 1.02586877, + "num_input_tokens_seen": 8148144, + "router_z_loss_mlp": 0.19128418, + "step": 108, + "time_per_iteration": 2.8408150672912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01306621, + "balance_loss_mlp": 1.28755951, + "epoch": 0.020969603693728356, + "flos": 681785856000.0, + "grad_norm": 0.06997483982989385, + "language_loss": 1.08551693, + "learning_rate": 0.0009290062678013548, + "loss": 1.09858322, + "num_input_tokens_seen": 8222256, + "router_z_loss_mlp": 0.19055176, + "step": 109, + "time_per_iteration": 2.869980812072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01309468, + "balance_loss_mlp": 1.29159832, + "epoch": 0.02116198537899192, + "flos": 533140462080.0, + "grad_norm": 0.13190855435004306, + "language_loss": 1.06647623, + "learning_rate": 0.0009308147319536321, + "loss": 1.07957077, + "num_input_tokens_seen": 8292432, + "router_z_loss_mlp": 0.17895508, + "step": 110, + "time_per_iteration": 2.6270735263824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130688, + "balance_loss_mlp": 1.29067969, + "epoch": 0.021354367064255482, + "flos": 717168135168.0, + "grad_norm": 0.10963649287068344, + "language_loss": 1.1282903, + "learning_rate": 0.0009326068296900676, + "loss": 1.14135909, + "num_input_tokens_seen": 8365024, + "router_z_loss_mlp": 0.1619873, + "step": 111, + "time_per_iteration": 2.8845341205596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01388527, + "balance_loss_mlp": 1.37200487, + "epoch": 0.021546748749519045, + "flos": 519290459136.0, + "grad_norm": 0.12406482447985402, + "language_loss": 1.03902006, + "learning_rate": 0.0009343828545846161, + "loss": 1.05290532, + "num_input_tokens_seen": 8442448, + "router_z_loss_mlp": 0.16516113, + "step": 112, + "time_per_iteration": 2.8167102336883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01548404, + "balance_loss_mlp": 1.53109479, + "epoch": 0.021739130434782608, + "flos": 504912337920.0, + "grad_norm": 0.2528517188051562, + "language_loss": 1.0722419, + "learning_rate": 0.0009361430923823841, + "loss": 1.08772588, + "num_input_tokens_seen": 8508992, + "router_z_loss_mlp": 0.1730957, + "step": 113, + "time_per_iteration": 2.664581060409546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01441472, + "balance_loss_mlp": 1.42576015, + "epoch": 0.02193151212004617, + "flos": 463251820032.0, + "grad_norm": 0.1910881492312462, + "language_loss": 1.11420846, + "learning_rate": 0.0009378878212755459, + "loss": 1.12862325, + "num_input_tokens_seen": 8574048, + "router_z_loss_mlp": 0.15710449, + "step": 114, + "time_per_iteration": 2.4851133823394775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262203, + "balance_loss_mlp": 1.24767148, + "epoch": 0.022123893805309734, + "flos": 552008673792.0, + "grad_norm": 0.09004287588953173, + "language_loss": 1.0099957, + "learning_rate": 0.0009396173121672103, + "loss": 1.0226177, + "num_input_tokens_seen": 8647808, + "router_z_loss_mlp": 0.14538574, + "step": 115, + "time_per_iteration": 2.6535162925720215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215709, + "balance_loss_mlp": 1.20165396, + "epoch": 0.022316275490573297, + "flos": 635920436736.0, + "grad_norm": 0.07849561533847389, + "language_loss": 1.07122314, + "learning_rate": 0.0009413318289238633, + "loss": 1.08338022, + "num_input_tokens_seen": 8719760, + "router_z_loss_mlp": 0.14050293, + "step": 116, + "time_per_iteration": 2.7836899757385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203544, + "balance_loss_mlp": 1.18965602, + "epoch": 0.02250865717583686, + "flos": 798535107072.0, + "grad_norm": 0.07099947506123377, + "language_loss": 0.98912275, + "learning_rate": 0.0009430316286169771, + "loss": 1.00115824, + "num_input_tokens_seen": 8798752, + "router_z_loss_mlp": 0.13891602, + "step": 117, + "time_per_iteration": 3.049468517303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01263206, + "balance_loss_mlp": 1.24786401, + "epoch": 0.022701038861100423, + "flos": 455851763712.0, + "grad_norm": 0.18808502465815918, + "language_loss": 1.04843259, + "learning_rate": 0.0009447169617543361, + "loss": 1.06106472, + "num_input_tokens_seen": 8866848, + "router_z_loss_mlp": 0.15319824, + "step": 118, + "time_per_iteration": 2.5886504650115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121023, + "balance_loss_mlp": 1.19557953, + "epoch": 0.022893420546363986, + "flos": 582812112384.0, + "grad_norm": 0.09179634719817005, + "language_loss": 1.11139297, + "learning_rate": 0.0009463880725016029, + "loss": 1.12349522, + "num_input_tokens_seen": 8935488, + "router_z_loss_mlp": 0.14648438, + "step": 119, + "time_per_iteration": 2.6861259937286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169264, + "balance_loss_mlp": 1.15572226, + "epoch": 0.02308580223162755, + "flos": 561010613760.0, + "grad_norm": 0.09164108943144146, + "language_loss": 1.05675769, + "learning_rate": 0.0009480451988946134, + "loss": 1.06845045, + "num_input_tokens_seen": 9015344, + "router_z_loss_mlp": 0.13549805, + "step": 120, + "time_per_iteration": 2.8075129985809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217336, + "balance_loss_mlp": 1.2034359, + "epoch": 0.023278183916891113, + "flos": 770966111232.0, + "grad_norm": 0.1019945076921087, + "language_loss": 1.07486713, + "learning_rate": 0.0009496885730428627, + "loss": 1.08704054, + "num_input_tokens_seen": 9094672, + "router_z_loss_mlp": 0.13903809, + "step": 121, + "time_per_iteration": 3.0081264972686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291544, + "balance_loss_mlp": 1.27698815, + "epoch": 0.023470565602154676, + "flos": 553111552512.0, + "grad_norm": 0.08478902086488087, + "language_loss": 1.05369067, + "learning_rate": 0.0009513184213246156, + "loss": 1.06660616, + "num_input_tokens_seen": 9160608, + "router_z_loss_mlp": 0.14550781, + "step": 122, + "time_per_iteration": 2.654902696609497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01406128, + "balance_loss_mlp": 1.39054775, + "epoch": 0.02366294728741824, + "flos": 559744791552.0, + "grad_norm": 0.09837859270685317, + "language_loss": 1.09463692, + "learning_rate": 0.0009529349645740552, + "loss": 1.10869825, + "num_input_tokens_seen": 9228704, + "router_z_loss_mlp": 0.15563965, + "step": 123, + "time_per_iteration": 2.6837081909179688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01484693, + "balance_loss_mlp": 1.46961284, + "epoch": 0.0238553289726818, + "flos": 468313698816.0, + "grad_norm": 0.11388616458843728, + "language_loss": 1.07573724, + "learning_rate": 0.0009545384182608524, + "loss": 1.09058416, + "num_input_tokens_seen": 9294288, + "router_z_loss_mlp": 0.1505127, + "step": 124, + "time_per_iteration": 2.5069937705993652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01411359, + "balance_loss_mlp": 1.39688659, + "epoch": 0.024047710657945365, + "flos": 559763730432.0, + "grad_norm": 0.3429043048666504, + "language_loss": 1.05057025, + "learning_rate": 0.0009561289926625252, + "loss": 1.06468379, + "num_input_tokens_seen": 9368048, + "router_z_loss_mlp": 0.14465332, + "step": 125, + "time_per_iteration": 2.6802117824554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011688, + "balance_loss_mlp": 1.15507352, + "epoch": 0.024240092343208928, + "flos": 504528224256.0, + "grad_norm": 0.18048320350440872, + "language_loss": 1.09737623, + "learning_rate": 0.0009577068930299292, + "loss": 1.10906434, + "num_input_tokens_seen": 9434848, + "router_z_loss_mlp": 0.13739014, + "step": 126, + "time_per_iteration": 2.6096670627593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163735, + "balance_loss_mlp": 1.15040147, + "epoch": 0.02443247402847249, + "flos": 435516908544.0, + "grad_norm": 0.07278748671530755, + "language_loss": 1.05931616, + "learning_rate": 0.0009592723197462087, + "loss": 1.07095349, + "num_input_tokens_seen": 9504112, + "router_z_loss_mlp": 0.13360596, + "step": 127, + "time_per_iteration": 2.6409482955932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01248107, + "balance_loss_mlp": 1.23239577, + "epoch": 0.024624855713736054, + "flos": 683445966336.0, + "grad_norm": 0.0813490266373729, + "language_loss": 1.02871299, + "learning_rate": 0.0009608254684795125, + "loss": 1.04119396, + "num_input_tokens_seen": 9590032, + "router_z_loss_mlp": 0.15710449, + "step": 128, + "time_per_iteration": 2.940600872039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01265693, + "balance_loss_mlp": 1.24772859, + "epoch": 0.024817237398999614, + "flos": 524721894912.0, + "grad_norm": 0.0804185451989367, + "language_loss": 1.06161952, + "learning_rate": 0.0009623665303297678, + "loss": 1.07427645, + "num_input_tokens_seen": 9663040, + "router_z_loss_mlp": 0.1796875, + "step": 129, + "time_per_iteration": 2.7088472843170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01256284, + "balance_loss_mlp": 1.23668599, + "epoch": 0.025009619084263177, + "flos": 655350262272.0, + "grad_norm": 0.12369480901617341, + "language_loss": 1.10218048, + "learning_rate": 0.0009638956919697878, + "loss": 1.11474347, + "num_input_tokens_seen": 9736544, + "router_z_loss_mlp": 0.19592285, + "step": 130, + "time_per_iteration": 2.857571840286255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266475, + "balance_loss_mlp": 1.24420691, + "epoch": 0.02520200076952674, + "flos": 454187271168.0, + "grad_norm": 0.08293639348197612, + "language_loss": 1.02638018, + "learning_rate": 0.0009654131357809714, + "loss": 1.03904486, + "num_input_tokens_seen": 9804656, + "router_z_loss_mlp": 0.22253418, + "step": 131, + "time_per_iteration": 2.641470432281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0128644, + "balance_loss_mlp": 1.26142943, + "epoch": 0.025394382454790303, + "flos": 839427397632.0, + "grad_norm": 0.05741461740254168, + "language_loss": 1.11002767, + "learning_rate": 0.0009669190399838441, + "loss": 1.12289214, + "num_input_tokens_seen": 9888864, + "router_z_loss_mlp": 0.25036621, + "step": 132, + "time_per_iteration": 3.133596420288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01302533, + "balance_loss_mlp": 1.27633083, + "epoch": 0.025586764140053866, + "flos": 580725628416.0, + "grad_norm": 0.06987664196058198, + "language_loss": 1.0413487, + "learning_rate": 0.0009684135787636724, + "loss": 1.05437398, + "num_input_tokens_seen": 9968208, + "router_z_loss_mlp": 0.26208496, + "step": 133, + "time_per_iteration": 2.7968075275421143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01325396, + "balance_loss_mlp": 1.29710746, + "epoch": 0.02577914582531743, + "flos": 789893959680.0, + "grad_norm": 0.07551411578012862, + "language_loss": 1.07757604, + "learning_rate": 0.0009698969223913726, + "loss": 1.09083009, + "num_input_tokens_seen": 10049664, + "router_z_loss_mlp": 0.28283691, + "step": 134, + "time_per_iteration": 3.0058987140655518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0131126, + "balance_loss_mlp": 1.28212547, + "epoch": 0.025971527510580992, + "flos": 594683320320.0, + "grad_norm": 0.0731546450398535, + "language_loss": 1.10457921, + "learning_rate": 0.0009713692373399265, + "loss": 1.11769176, + "num_input_tokens_seen": 10120096, + "router_z_loss_mlp": 0.29125977, + "step": 135, + "time_per_iteration": 2.6654229164123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02152319, + "balance_loss_mlp": 1.95700705, + "epoch": 0.026163909195844555, + "flos": 1576771522560.0, + "grad_norm": 0.26755932757436196, + "language_loss": 0.79456228, + "learning_rate": 0.0009728306863964993, + "loss": 0.81608546, + "num_input_tokens_seen": 10348976, + "router_z_loss_mlp": 1.953125, + "step": 136, + "time_per_iteration": 6.531313896179199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01724331, + "balance_loss_mlp": 1.55266988, + "epoch": 0.026356290881108118, + "flos": 1501306030080.0, + "grad_norm": 0.1444935793983717, + "language_loss": 0.77811038, + "learning_rate": 0.0009742814287704512, + "loss": 0.79535371, + "num_input_tokens_seen": 10576512, + "router_z_loss_mlp": 1.71875, + "step": 137, + "time_per_iteration": 4.966995716094971 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01371776, + "balance_loss_mlp": 1.34284425, + "epoch": 0.02654867256637168, + "flos": 596841025536.0, + "grad_norm": 0.06823267419395149, + "language_loss": 1.03539467, + "learning_rate": 0.0009757216201974225, + "loss": 1.04911256, + "num_input_tokens_seen": 10659168, + "router_z_loss_mlp": 0.28918457, + "step": 138, + "time_per_iteration": 2.7901663780212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01396345, + "balance_loss_mlp": 1.36752045, + "epoch": 0.026741054251635244, + "flos": 544761386496.0, + "grad_norm": 0.08904352821745645, + "language_loss": 1.08793342, + "learning_rate": 0.0009771514130396581, + "loss": 1.10189688, + "num_input_tokens_seen": 10731584, + "router_z_loss_mlp": 0.28833008, + "step": 139, + "time_per_iteration": 2.664384603500366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01410566, + "balance_loss_mlp": 1.38171697, + "epoch": 0.026933435936898807, + "flos": 506591387136.0, + "grad_norm": 0.09467843708761726, + "language_loss": 1.08393478, + "learning_rate": 0.00097857095638274, + "loss": 1.09804034, + "num_input_tokens_seen": 10799456, + "router_z_loss_mlp": 0.28833008, + "step": 140, + "time_per_iteration": 2.5600626468658447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01399161, + "balance_loss_mlp": 1.37263703, + "epoch": 0.02712581762216237, + "flos": 740513290752.0, + "grad_norm": 0.06303030428856128, + "language_loss": 0.99670362, + "learning_rate": 0.0009799803961288726, + "loss": 1.01069522, + "num_input_tokens_seen": 10886416, + "router_z_loss_mlp": 0.26538086, + "step": 141, + "time_per_iteration": 2.984253168106079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01354082, + "balance_loss_mlp": 1.33143175, + "epoch": 0.027318199307425933, + "flos": 848023464960.0, + "grad_norm": 0.06264638149228761, + "language_loss": 1.05898559, + "learning_rate": 0.000981379875086876, + "loss": 1.07252645, + "num_input_tokens_seen": 10966064, + "router_z_loss_mlp": 0.22644043, + "step": 142, + "time_per_iteration": 3.032597064971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01323808, + "balance_loss_mlp": 1.30553341, + "epoch": 0.027510580992689496, + "flos": 575288400384.0, + "grad_norm": 0.07028220907739285, + "language_loss": 1.01752293, + "learning_rate": 0.0009827695330590185, + "loss": 1.030761, + "num_input_tokens_seen": 11039712, + "router_z_loss_mlp": 0.18273926, + "step": 143, + "time_per_iteration": 2.626483678817749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01303402, + "balance_loss_mlp": 1.28557992, + "epoch": 0.02770296267795306, + "flos": 772079164416.0, + "grad_norm": 0.05744811954937285, + "language_loss": 1.00619161, + "learning_rate": 0.0009841495069248256, + "loss": 1.0192256, + "num_input_tokens_seen": 11123984, + "router_z_loss_mlp": 0.17822266, + "step": 144, + "time_per_iteration": 2.9495198726654053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01316023, + "balance_loss_mlp": 1.29916632, + "epoch": 0.027895344363216622, + "flos": 569123642880.0, + "grad_norm": 0.04968902291069247, + "language_loss": 0.9920603, + "learning_rate": 0.0009855199307219871, + "loss": 1.00522041, + "num_input_tokens_seen": 11192864, + "router_z_loss_mlp": 0.1685791, + "step": 145, + "time_per_iteration": 2.6407721042633057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130391, + "balance_loss_mlp": 1.28731608, + "epoch": 0.028087726048480186, + "flos": 547099564032.0, + "grad_norm": 0.10723696528856613, + "language_loss": 1.01566505, + "learning_rate": 0.0009868809357244854, + "loss": 1.02870417, + "num_input_tokens_seen": 11261760, + "router_z_loss_mlp": 0.16589355, + "step": 146, + "time_per_iteration": 2.6262452602386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01287507, + "balance_loss_mlp": 1.27153277, + "epoch": 0.02828010773374375, + "flos": 524519663616.0, + "grad_norm": 0.06991830692152445, + "language_loss": 1.05632663, + "learning_rate": 0.0009882326505180556, + "loss": 1.06920183, + "num_input_tokens_seen": 11334736, + "router_z_loss_mlp": 0.15966797, + "step": 147, + "time_per_iteration": 2.6469435691833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01270213, + "balance_loss_mlp": 1.2534517, + "epoch": 0.02847248941900731, + "flos": 772108277760.0, + "grad_norm": 0.07309095407736986, + "language_loss": 1.04486537, + "learning_rate": 0.0009895752010730906, + "loss": 1.0575676, + "num_input_tokens_seen": 11409872, + "router_z_loss_mlp": 0.16748047, + "step": 148, + "time_per_iteration": 2.9457786083221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012724, + "balance_loss_mlp": 1.25667655, + "epoch": 0.028664871104270875, + "flos": 534150208512.0, + "grad_norm": 0.048334696317449924, + "language_loss": 1.10088921, + "learning_rate": 0.0009909087108150867, + "loss": 1.11361325, + "num_input_tokens_seen": 11481024, + "router_z_loss_mlp": 0.15710449, + "step": 149, + "time_per_iteration": 2.712559700012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01309133, + "balance_loss_mlp": 1.29286051, + "epoch": 0.028857252789534438, + "flos": 367557599232.0, + "grad_norm": 0.13115053493636905, + "language_loss": 1.11238122, + "learning_rate": 0.0009922333006927371, + "loss": 1.12547255, + "num_input_tokens_seen": 11544240, + "router_z_loss_mlp": 0.16247559, + "step": 150, + "time_per_iteration": 2.4607067108154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01329212, + "balance_loss_mlp": 1.31257081, + "epoch": 0.029049634474798, + "flos": 515232534528.0, + "grad_norm": 0.06948512606819708, + "language_loss": 1.04613614, + "learning_rate": 0.0009935490892437632, + "loss": 1.05942833, + "num_input_tokens_seen": 11610416, + "router_z_loss_mlp": 0.16650391, + "step": 151, + "time_per_iteration": 2.5460238456726074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01309109, + "balance_loss_mlp": 1.29317045, + "epoch": 0.029242016160061564, + "flos": 587840495616.0, + "grad_norm": 0.11257287432569656, + "language_loss": 1.03097093, + "learning_rate": 0.0009948561926585687, + "loss": 1.04406202, + "num_input_tokens_seen": 11687488, + "router_z_loss_mlp": 0.15930176, + "step": 152, + "time_per_iteration": 2.753009557723999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01300362, + "balance_loss_mlp": 1.28555596, + "epoch": 0.029434397845325123, + "flos": 551816616960.0, + "grad_norm": 0.062223246716750634, + "language_loss": 1.06524086, + "learning_rate": 0.0009961547248418122, + "loss": 1.07824445, + "num_input_tokens_seen": 11754576, + "router_z_loss_mlp": 0.14807129, + "step": 153, + "time_per_iteration": 2.630092144012451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01308008, + "balance_loss_mlp": 1.29357219, + "epoch": 0.029626779530588686, + "flos": 603221160960.0, + "grad_norm": 0.09420536563091944, + "language_loss": 1.03062868, + "learning_rate": 0.0009974447974719707, + "loss": 1.04370856, + "num_input_tokens_seen": 11831360, + "router_z_loss_mlp": 0.14440918, + "step": 154, + "time_per_iteration": 2.6962759494781494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01312448, + "balance_loss_mlp": 1.29745138, + "epoch": 0.02981916121585225, + "flos": 620808993792.0, + "grad_norm": 0.08558703297148447, + "language_loss": 1.04985213, + "learning_rate": 0.0009987265200589763, + "loss": 1.0629766, + "num_input_tokens_seen": 11902192, + "router_z_loss_mlp": 0.15002441, + "step": 155, + "time_per_iteration": 2.7059414386749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295882, + "balance_loss_mlp": 1.28057528, + "epoch": 0.030011542901115813, + "flos": 661322962944.0, + "grad_norm": 0.09731995783752632, + "language_loss": 1.04436159, + "learning_rate": 0.001, + "loss": 1.05732036, + "num_input_tokens_seen": 11979088, + "router_z_loss_mlp": 0.1529541, + "step": 156, + "time_per_iteration": 2.856968641281128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262708, + "balance_loss_mlp": 1.24682927, + "epoch": 0.030203924586379376, + "flos": 651258842112.0, + "grad_norm": 0.05966927829613408, + "language_loss": 1.02520585, + "learning_rate": 0.0009999999029413921, + "loss": 1.03783274, + "num_input_tokens_seen": 12059200, + "router_z_loss_mlp": 0.15856934, + "step": 157, + "time_per_iteration": 2.851480722427368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01268181, + "balance_loss_mlp": 1.25150406, + "epoch": 0.03039630627164294, + "flos": 531083091456.0, + "grad_norm": 0.1034311415514979, + "language_loss": 1.04085183, + "learning_rate": 0.0009999996117656068, + "loss": 1.05353379, + "num_input_tokens_seen": 12134944, + "router_z_loss_mlp": 0.16674805, + "step": 158, + "time_per_iteration": 2.707646369934082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262524, + "balance_loss_mlp": 1.24747968, + "epoch": 0.030588687956906502, + "flos": 585914135040.0, + "grad_norm": 0.12050944658187299, + "language_loss": 0.97824669, + "learning_rate": 0.0009999991264727564, + "loss": 0.99087203, + "num_input_tokens_seen": 12207936, + "router_z_loss_mlp": 0.15039062, + "step": 159, + "time_per_iteration": 2.7575390338897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272116, + "balance_loss_mlp": 1.25716722, + "epoch": 0.030781069642170065, + "flos": 513026777088.0, + "grad_norm": 0.07020206521781955, + "language_loss": 1.08316755, + "learning_rate": 0.0009999984470630296, + "loss": 1.09588861, + "num_input_tokens_seen": 12273200, + "router_z_loss_mlp": 0.14929199, + "step": 160, + "time_per_iteration": 2.62310528755188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0128559, + "balance_loss_mlp": 1.27058172, + "epoch": 0.030973451327433628, + "flos": 717766064640.0, + "grad_norm": 0.06068839125924313, + "language_loss": 0.96528012, + "learning_rate": 0.0009999975735366902, + "loss": 0.978136, + "num_input_tokens_seen": 12359600, + "router_z_loss_mlp": 0.15002441, + "step": 161, + "time_per_iteration": 3.0823376178741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01305752, + "balance_loss_mlp": 1.29055238, + "epoch": 0.03116583301269719, + "flos": 1109312133120.0, + "grad_norm": 0.09428930343360856, + "language_loss": 0.98546314, + "learning_rate": 0.0009999965058940775, + "loss": 0.99852067, + "num_input_tokens_seen": 12443936, + "router_z_loss_mlp": 0.1517334, + "step": 162, + "time_per_iteration": 3.486618995666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01315996, + "balance_loss_mlp": 1.3010118, + "epoch": 0.031358214697960754, + "flos": 450676403712.0, + "grad_norm": 0.09976775191278689, + "language_loss": 1.04580116, + "learning_rate": 0.0009999952441356057, + "loss": 1.05896115, + "num_input_tokens_seen": 12507488, + "router_z_loss_mlp": 0.1496582, + "step": 163, + "time_per_iteration": 2.537173271179199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01300744, + "balance_loss_mlp": 1.28654623, + "epoch": 0.031550596383224314, + "flos": 1254701325312.0, + "grad_norm": 0.0838197011845512, + "language_loss": 1.05903006, + "learning_rate": 0.000999993788261765, + "loss": 1.07203746, + "num_input_tokens_seen": 12594096, + "router_z_loss_mlp": 0.14196777, + "step": 164, + "time_per_iteration": 3.5638957023620605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01270584, + "balance_loss_mlp": 1.25625503, + "epoch": 0.03174297806848788, + "flos": 667841310720.0, + "grad_norm": 0.068717417443618, + "language_loss": 1.0642612, + "learning_rate": 0.00099999213827312, + "loss": 1.076967, + "num_input_tokens_seen": 12669424, + "router_z_loss_mlp": 0.14343262, + "step": 165, + "time_per_iteration": 2.8084213733673096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01255587, + "balance_loss_mlp": 1.24152076, + "epoch": 0.03193535975375144, + "flos": 551033832960.0, + "grad_norm": 0.06892139424853191, + "language_loss": 1.0208962, + "learning_rate": 0.000999990294170312, + "loss": 1.03345203, + "num_input_tokens_seen": 12740080, + "router_z_loss_mlp": 0.14074707, + "step": 166, + "time_per_iteration": 2.6247787475585938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01259954, + "balance_loss_mlp": 1.24549401, + "epoch": 0.032127741439015006, + "flos": 543377700864.0, + "grad_norm": 0.08292396830811857, + "language_loss": 1.05774951, + "learning_rate": 0.0009999882559540566, + "loss": 1.07034898, + "num_input_tokens_seen": 12810576, + "router_z_loss_mlp": 0.14465332, + "step": 167, + "time_per_iteration": 2.654036283493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291491, + "balance_loss_mlp": 1.27790117, + "epoch": 0.032320123124278566, + "flos": 548104928256.0, + "grad_norm": 0.07217909902530589, + "language_loss": 1.02104354, + "learning_rate": 0.000999986023625145, + "loss": 1.03395844, + "num_input_tokens_seen": 12887904, + "router_z_loss_mlp": 0.13598633, + "step": 168, + "time_per_iteration": 2.696866750717163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03738194, + "balance_loss_mlp": 3.61993837, + "epoch": 0.03251250480954213, + "flos": 1305156865536.0, + "grad_norm": 0.563981464368737, + "language_loss": 0.78924417, + "learning_rate": 0.0009999835971844441, + "loss": 0.82662606, + "num_input_tokens_seen": 13107344, + "router_z_loss_mlp": 1.1796875, + "step": 169, + "time_per_iteration": 4.971506834030151 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0134723, + "balance_loss_mlp": 1.33386648, + "epoch": 0.03270488649480569, + "flos": 560866609152.0, + "grad_norm": 0.12141219581883538, + "language_loss": 1.02540469, + "learning_rate": 0.0009999809766328958, + "loss": 1.03887701, + "num_input_tokens_seen": 13175552, + "router_z_loss_mlp": 0.13391113, + "step": 170, + "time_per_iteration": 2.646425724029541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01355192, + "balance_loss_mlp": 1.34039843, + "epoch": 0.03289726818006926, + "flos": 482120031744.0, + "grad_norm": 0.08046017426621577, + "language_loss": 1.05186188, + "learning_rate": 0.0009999781619715177, + "loss": 1.06541371, + "num_input_tokens_seen": 13242384, + "router_z_loss_mlp": 0.14770508, + "step": 171, + "time_per_iteration": 2.535360336303711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01381569, + "balance_loss_mlp": 1.36640596, + "epoch": 0.03308964986533282, + "flos": 674355276288.0, + "grad_norm": 0.08789680193074563, + "language_loss": 1.04250002, + "learning_rate": 0.000999975153201402, + "loss": 1.05631578, + "num_input_tokens_seen": 13316160, + "router_z_loss_mlp": 0.15161133, + "step": 172, + "time_per_iteration": 2.8205513954162598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01433883, + "balance_loss_mlp": 1.41711044, + "epoch": 0.033282031550596385, + "flos": 608937785856.0, + "grad_norm": 0.07610360898370483, + "language_loss": 1.02505267, + "learning_rate": 0.0009999719503237174, + "loss": 1.03939152, + "num_input_tokens_seen": 13387664, + "router_z_loss_mlp": 0.16760254, + "step": 173, + "time_per_iteration": 2.738676071166992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01451195, + "balance_loss_mlp": 1.43315864, + "epoch": 0.033474413235859944, + "flos": 467801547264.0, + "grad_norm": 0.07270846083900323, + "language_loss": 1.111094, + "learning_rate": 0.0009999685533397073, + "loss": 1.12560594, + "num_input_tokens_seen": 13454528, + "router_z_loss_mlp": 0.18029785, + "step": 174, + "time_per_iteration": 2.5556905269622803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01429898, + "balance_loss_mlp": 1.41368508, + "epoch": 0.03366679492112351, + "flos": 579365263872.0, + "grad_norm": 0.09196642879711979, + "language_loss": 1.03199494, + "learning_rate": 0.00099996496225069, + "loss": 1.04629397, + "num_input_tokens_seen": 13522528, + "router_z_loss_mlp": 0.16210938, + "step": 175, + "time_per_iteration": 2.6806485652923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01432234, + "balance_loss_mlp": 1.41513896, + "epoch": 0.03385917660638707, + "flos": 637378315776.0, + "grad_norm": 0.08705990667808558, + "language_loss": 1.05897307, + "learning_rate": 0.0009999611770580604, + "loss": 1.07329535, + "num_input_tokens_seen": 13601120, + "router_z_loss_mlp": 0.17102051, + "step": 176, + "time_per_iteration": 2.830826759338379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01415158, + "balance_loss_mlp": 1.39910054, + "epoch": 0.03405155829165064, + "flos": 441587123712.0, + "grad_norm": 0.08054669051038237, + "language_loss": 1.03868258, + "learning_rate": 0.0009999571977632876, + "loss": 1.05283427, + "num_input_tokens_seen": 13666384, + "router_z_loss_mlp": 0.16052246, + "step": 177, + "time_per_iteration": 2.623309850692749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01463141, + "balance_loss_mlp": 1.44573641, + "epoch": 0.034243939976914196, + "flos": 466097766912.0, + "grad_norm": 0.08089290506220445, + "language_loss": 1.06928194, + "learning_rate": 0.0009999530243679166, + "loss": 1.08391333, + "num_input_tokens_seen": 13733968, + "router_z_loss_mlp": 0.17407227, + "step": 178, + "time_per_iteration": 2.545133113861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01451423, + "balance_loss_mlp": 1.43560433, + "epoch": 0.03443632166217776, + "flos": 778919016960.0, + "grad_norm": 0.08468734735068614, + "language_loss": 1.01505899, + "learning_rate": 0.0009999486568735675, + "loss": 1.0295732, + "num_input_tokens_seen": 13818960, + "router_z_loss_mlp": 0.15808105, + "step": 179, + "time_per_iteration": 3.0384457111358643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01433641, + "balance_loss_mlp": 1.41778612, + "epoch": 0.03462870334744132, + "flos": 1263284246016.0, + "grad_norm": 0.06997324880309466, + "language_loss": 1.01388979, + "learning_rate": 0.0009999440952819362, + "loss": 1.02822614, + "num_input_tokens_seen": 13912448, + "router_z_loss_mlp": 0.15856934, + "step": 180, + "time_per_iteration": 3.6892786026000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01401308, + "balance_loss_mlp": 1.38610911, + "epoch": 0.03482108503270489, + "flos": 606899354112.0, + "grad_norm": 0.057831512038439566, + "language_loss": 1.02027512, + "learning_rate": 0.0009999393395947935, + "loss": 1.03428817, + "num_input_tokens_seen": 13990752, + "router_z_loss_mlp": 0.15185547, + "step": 181, + "time_per_iteration": 2.826353073120117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01381551, + "balance_loss_mlp": 1.36612535, + "epoch": 0.03501346671796845, + "flos": 538010284032.0, + "grad_norm": 0.05913415109875365, + "language_loss": 1.05361927, + "learning_rate": 0.0009999343898139858, + "loss": 1.06743479, + "num_input_tokens_seen": 14058608, + "router_z_loss_mlp": 0.1541748, + "step": 182, + "time_per_iteration": 2.593250036239624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01359754, + "balance_loss_mlp": 1.33988214, + "epoch": 0.035205848403232015, + "flos": 518231250432.0, + "grad_norm": 0.05898920665253376, + "language_loss": 1.04308426, + "learning_rate": 0.0009999292459414348, + "loss": 1.05668187, + "num_input_tokens_seen": 14126656, + "router_z_loss_mlp": 0.19909668, + "step": 183, + "time_per_iteration": 2.565936326980591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01311064, + "balance_loss_mlp": 1.296103, + "epoch": 0.035398230088495575, + "flos": 472134486528.0, + "grad_norm": 0.06373248491183749, + "language_loss": 1.08499169, + "learning_rate": 0.0009999239079791374, + "loss": 1.09810233, + "num_input_tokens_seen": 14195840, + "router_z_loss_mlp": 0.14953613, + "step": 184, + "time_per_iteration": 2.552949905395508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130912, + "balance_loss_mlp": 1.29237127, + "epoch": 0.03559061177375914, + "flos": 511820591616.0, + "grad_norm": 0.056329932736213485, + "language_loss": 1.01337337, + "learning_rate": 0.0009999183759291659, + "loss": 1.02646446, + "num_input_tokens_seen": 14269936, + "router_z_loss_mlp": 0.16748047, + "step": 185, + "time_per_iteration": 2.741727113723755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291511, + "balance_loss_mlp": 1.27575147, + "epoch": 0.0357829934590227, + "flos": 477146903040.0, + "grad_norm": 0.11224085577532149, + "language_loss": 1.03738213, + "learning_rate": 0.0009999126497936682, + "loss": 1.05029726, + "num_input_tokens_seen": 14334848, + "router_z_loss_mlp": 0.1574707, + "step": 186, + "time_per_iteration": 2.4901957511901855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291515, + "balance_loss_mlp": 1.27446783, + "epoch": 0.03597537514428627, + "flos": 644350588416.0, + "grad_norm": 0.06537030709871235, + "language_loss": 1.06735992, + "learning_rate": 0.0009999067295748676, + "loss": 1.08027506, + "num_input_tokens_seen": 14407888, + "router_z_loss_mlp": 0.1706543, + "step": 187, + "time_per_iteration": 2.7923052310943604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01327575, + "balance_loss_mlp": 1.30966997, + "epoch": 0.03616775682954983, + "flos": 580916275200.0, + "grad_norm": 0.06523062893181024, + "language_loss": 1.04418302, + "learning_rate": 0.000999900615275062, + "loss": 1.05745876, + "num_input_tokens_seen": 14479072, + "router_z_loss_mlp": 0.17919922, + "step": 188, + "time_per_iteration": 2.7248637676239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295421, + "balance_loss_mlp": 1.27722955, + "epoch": 0.03636013851481339, + "flos": 382210735104.0, + "grad_norm": 0.08035209765807474, + "language_loss": 1.10347509, + "learning_rate": 0.0009998943068966256, + "loss": 1.11642933, + "num_input_tokens_seen": 14540944, + "router_z_loss_mlp": 0.18188477, + "step": 189, + "time_per_iteration": 2.429497480392456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01279097, + "balance_loss_mlp": 1.26120377, + "epoch": 0.03655252020007695, + "flos": 582954706944.0, + "grad_norm": 0.07380481555246936, + "language_loss": 1.0506779, + "learning_rate": 0.0009998878044420072, + "loss": 1.06346881, + "num_input_tokens_seen": 14611392, + "router_z_loss_mlp": 0.17907715, + "step": 190, + "time_per_iteration": 2.6878626346588135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012863, + "balance_loss_mlp": 1.26773953, + "epoch": 0.03674490188534051, + "flos": 471376433664.0, + "grad_norm": 0.10484442400689244, + "language_loss": 1.01223493, + "learning_rate": 0.0009998811079137318, + "loss": 1.02509785, + "num_input_tokens_seen": 14679776, + "router_z_loss_mlp": 0.18566895, + "step": 191, + "time_per_iteration": 2.561494827270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281775, + "balance_loss_mlp": 1.26645625, + "epoch": 0.03693728357060408, + "flos": 528113488896.0, + "grad_norm": 0.0609431296621874, + "language_loss": 1.01984763, + "learning_rate": 0.0009998742173143987, + "loss": 1.03266537, + "num_input_tokens_seen": 14749712, + "router_z_loss_mlp": 0.1529541, + "step": 192, + "time_per_iteration": 2.59798264503479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01336751, + "balance_loss_mlp": 1.32157528, + "epoch": 0.03712966525586764, + "flos": 798657352704.0, + "grad_norm": 0.10186248006293357, + "language_loss": 1.02005363, + "learning_rate": 0.0009998671326466833, + "loss": 1.03342128, + "num_input_tokens_seen": 14827136, + "router_z_loss_mlp": 0.15185547, + "step": 193, + "time_per_iteration": 2.9510865211486816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01331057, + "balance_loss_mlp": 1.3157624, + "epoch": 0.037322046941131205, + "flos": 829628116992.0, + "grad_norm": 0.06375125184008373, + "language_loss": 1.02914846, + "learning_rate": 0.0009998598539133362, + "loss": 1.04245901, + "num_input_tokens_seen": 14902880, + "router_z_loss_mlp": 0.1529541, + "step": 194, + "time_per_iteration": 2.9981300830841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01337882, + "balance_loss_mlp": 1.3235296, + "epoch": 0.037514428626394765, + "flos": 437460797952.0, + "grad_norm": 0.10181133305516413, + "language_loss": 1.03936744, + "learning_rate": 0.0009998523811171828, + "loss": 1.0527463, + "num_input_tokens_seen": 14967264, + "router_z_loss_mlp": 0.14379883, + "step": 195, + "time_per_iteration": 2.501542568206787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01296215, + "balance_loss_mlp": 1.28125429, + "epoch": 0.03770681031165833, + "flos": 511372459008.0, + "grad_norm": 0.09414845611868274, + "language_loss": 1.04584992, + "learning_rate": 0.0009998447142611248, + "loss": 1.05881214, + "num_input_tokens_seen": 15039104, + "router_z_loss_mlp": 0.14941406, + "step": 196, + "time_per_iteration": 2.6247317790985107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0128144, + "balance_loss_mlp": 1.26702762, + "epoch": 0.03789919199692189, + "flos": 807102061056.0, + "grad_norm": 0.05831249070889761, + "language_loss": 0.97701526, + "learning_rate": 0.0009998368533481387, + "loss": 0.9898296, + "num_input_tokens_seen": 15124864, + "router_z_loss_mlp": 0.14422607, + "step": 197, + "time_per_iteration": 3.01912784576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01294999, + "balance_loss_mlp": 1.27945375, + "epoch": 0.03809157368218546, + "flos": 690274234368.0, + "grad_norm": 0.06656848410147823, + "language_loss": 1.00630498, + "learning_rate": 0.0009998287983812762, + "loss": 1.01925504, + "num_input_tokens_seen": 15199680, + "router_z_loss_mlp": 0.15551758, + "step": 198, + "time_per_iteration": 2.8252804279327393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0135387, + "balance_loss_mlp": 1.33592904, + "epoch": 0.03828395536744902, + "flos": 517675428864.0, + "grad_norm": 0.06988401379713739, + "language_loss": 1.06386423, + "learning_rate": 0.0009998205493636646, + "loss": 1.07740283, + "num_input_tokens_seen": 15270176, + "router_z_loss_mlp": 0.17944336, + "step": 199, + "time_per_iteration": 2.649765729904175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01339461, + "balance_loss_mlp": 1.32242572, + "epoch": 0.038476337052712584, + "flos": 581389138944.0, + "grad_norm": 0.07184113921580974, + "language_loss": 0.9925406, + "learning_rate": 0.0009998121062985063, + "loss": 1.00593519, + "num_input_tokens_seen": 15343168, + "router_z_loss_mlp": 0.17053223, + "step": 200, + "time_per_iteration": 2.6788320541381836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01328142, + "balance_loss_mlp": 1.31167912, + "epoch": 0.03866871873797614, + "flos": 576791359488.0, + "grad_norm": 0.059667024197710104, + "language_loss": 1.01260698, + "learning_rate": 0.0009998034691890794, + "loss": 1.02588844, + "num_input_tokens_seen": 15417328, + "router_z_loss_mlp": 0.16455078, + "step": 201, + "time_per_iteration": 2.753265380859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01297644, + "balance_loss_mlp": 1.28249288, + "epoch": 0.03886110042323971, + "flos": 540472117248.0, + "grad_norm": 0.07302515973387386, + "language_loss": 1.05948424, + "learning_rate": 0.0009997946380387369, + "loss": 1.07246065, + "num_input_tokens_seen": 15489488, + "router_z_loss_mlp": 0.15136719, + "step": 202, + "time_per_iteration": 2.618546485900879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262023, + "balance_loss_mlp": 1.24746776, + "epoch": 0.03905348210850327, + "flos": 717694843392.0, + "grad_norm": 0.0775452329378228, + "language_loss": 1.08266401, + "learning_rate": 0.0009997856128509076, + "loss": 1.09528422, + "num_input_tokens_seen": 15558944, + "router_z_loss_mlp": 0.14550781, + "step": 203, + "time_per_iteration": 2.8284859657287598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267878, + "balance_loss_mlp": 1.25321579, + "epoch": 0.039245863793766836, + "flos": 427268639232.0, + "grad_norm": 0.06664318613050589, + "language_loss": 1.02886617, + "learning_rate": 0.0009997763936290952, + "loss": 1.04154491, + "num_input_tokens_seen": 15625024, + "router_z_loss_mlp": 0.14660645, + "step": 204, + "time_per_iteration": 2.516263246536255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0129892, + "balance_loss_mlp": 1.28264785, + "epoch": 0.039438245479030395, + "flos": 662804163072.0, + "grad_norm": 0.07463685050771204, + "language_loss": 1.0815413, + "learning_rate": 0.0009997669803768789, + "loss": 1.09453046, + "num_input_tokens_seen": 15697120, + "router_z_loss_mlp": 0.16271973, + "step": 205, + "time_per_iteration": 2.7576606273651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291456, + "balance_loss_mlp": 1.27812803, + "epoch": 0.03963062716429396, + "flos": 635063459328.0, + "grad_norm": 0.055878982250893716, + "language_loss": 1.03253651, + "learning_rate": 0.0009997573730979134, + "loss": 1.04545116, + "num_input_tokens_seen": 15768752, + "router_z_loss_mlp": 0.13342285, + "step": 206, + "time_per_iteration": 2.716325521469116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.07279634, + "balance_loss_mlp": 4.65512276, + "epoch": 0.03982300884955752, + "flos": 1417813286400.0, + "grad_norm": 0.533603848118922, + "language_loss": 0.79193199, + "learning_rate": 0.0009997475717959284, + "loss": 0.86472833, + "num_input_tokens_seen": 15980624, + "router_z_loss_mlp": 26.25, + "step": 207, + "time_per_iteration": 4.635821342468262 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0137482, + "balance_loss_mlp": 1.35964513, + "epoch": 0.04001539053482109, + "flos": 688769713152.0, + "grad_norm": 0.1040721574676452, + "language_loss": 1.02094078, + "learning_rate": 0.0009997375764747294, + "loss": 1.03468895, + "num_input_tokens_seen": 16067232, + "router_z_loss_mlp": 0.1517334, + "step": 208, + "time_per_iteration": 2.974442481994629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01415285, + "balance_loss_mlp": 1.40052748, + "epoch": 0.04020777222008465, + "flos": 533363042304.0, + "grad_norm": 0.08111266742266361, + "language_loss": 0.99458027, + "learning_rate": 0.0009997273871381967, + "loss": 1.00873303, + "num_input_tokens_seen": 16139808, + "router_z_loss_mlp": 0.14758301, + "step": 209, + "time_per_iteration": 2.6802144050598145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01466201, + "balance_loss_mlp": 1.44989347, + "epoch": 0.040400153905348214, + "flos": 567661381632.0, + "grad_norm": 0.06875741115436663, + "language_loss": 1.05031717, + "learning_rate": 0.0009997170037902862, + "loss": 1.0649792, + "num_input_tokens_seen": 16210848, + "router_z_loss_mlp": 0.16308594, + "step": 210, + "time_per_iteration": 2.6975836753845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01531768, + "balance_loss_mlp": 1.51399446, + "epoch": 0.040592535590611774, + "flos": 713130559488.0, + "grad_norm": 0.07197690318934227, + "language_loss": 1.07202697, + "learning_rate": 0.0009997064264350292, + "loss": 1.08734465, + "num_input_tokens_seen": 16283984, + "router_z_loss_mlp": 0.17785645, + "step": 211, + "time_per_iteration": 2.836771011352539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01531925, + "balance_loss_mlp": 1.5154984, + "epoch": 0.04078491727587533, + "flos": 577824427008.0, + "grad_norm": 0.09120436996840299, + "language_loss": 1.0146966, + "learning_rate": 0.0009996956550765317, + "loss": 1.03001595, + "num_input_tokens_seen": 16353904, + "router_z_loss_mlp": 0.16430664, + "step": 212, + "time_per_iteration": 2.671292781829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01499293, + "balance_loss_mlp": 1.4817214, + "epoch": 0.0409772989611389, + "flos": 552033404928.0, + "grad_norm": 0.11449485477945152, + "language_loss": 0.96278083, + "learning_rate": 0.0009996846897189762, + "loss": 0.97777379, + "num_input_tokens_seen": 16425488, + "router_z_loss_mlp": 0.17565918, + "step": 213, + "time_per_iteration": 2.6231424808502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.014653, + "balance_loss_mlp": 1.44753814, + "epoch": 0.04116968064640246, + "flos": 555347833344.0, + "grad_norm": 0.09512793115916172, + "language_loss": 1.02356708, + "learning_rate": 0.0009996735303666193, + "loss": 1.03822017, + "num_input_tokens_seen": 16498016, + "router_z_loss_mlp": 0.1776123, + "step": 214, + "time_per_iteration": 2.6930177211761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01477298, + "balance_loss_mlp": 1.46134758, + "epoch": 0.041362062331666026, + "flos": 578204158464.0, + "grad_norm": 0.09141123477091552, + "language_loss": 1.04750729, + "learning_rate": 0.0009996621770237937, + "loss": 1.06228042, + "num_input_tokens_seen": 16573744, + "router_z_loss_mlp": 0.15942383, + "step": 215, + "time_per_iteration": 2.7448923587799072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01578462, + "balance_loss_mlp": 1.56013966, + "epoch": 0.041554444016929586, + "flos": 611130396672.0, + "grad_norm": 0.10233552268903827, + "language_loss": 0.99822551, + "learning_rate": 0.0009996506296949073, + "loss": 1.01401007, + "num_input_tokens_seen": 16655344, + "router_z_loss_mlp": 0.18334961, + "step": 216, + "time_per_iteration": 2.8548526763916016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01609008, + "balance_loss_mlp": 1.59156775, + "epoch": 0.04174682570219315, + "flos": 527857413120.0, + "grad_norm": 0.10522858499680945, + "language_loss": 0.99888742, + "learning_rate": 0.0009996388883844428, + "loss": 1.01497757, + "num_input_tokens_seen": 16726480, + "router_z_loss_mlp": 0.17456055, + "step": 217, + "time_per_iteration": 2.618546724319458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01557164, + "balance_loss_mlp": 1.54124999, + "epoch": 0.04193920738745671, + "flos": 511258977792.0, + "grad_norm": 0.09341741551851517, + "language_loss": 1.03841758, + "learning_rate": 0.0009996269530969588, + "loss": 1.05398929, + "num_input_tokens_seen": 16792112, + "router_z_loss_mlp": 0.15905762, + "step": 218, + "time_per_iteration": 2.6204636096954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01525903, + "balance_loss_mlp": 1.50927377, + "epoch": 0.04213158907272028, + "flos": 571226093568.0, + "grad_norm": 0.09609660813155754, + "language_loss": 1.02944803, + "learning_rate": 0.0009996148238370888, + "loss": 1.04470706, + "num_input_tokens_seen": 16862960, + "router_z_loss_mlp": 0.16625977, + "step": 219, + "time_per_iteration": 2.7071943283081055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0150071, + "balance_loss_mlp": 1.48340106, + "epoch": 0.04232397075798384, + "flos": 963803667456.0, + "grad_norm": 0.05454565212769997, + "language_loss": 0.9941752, + "learning_rate": 0.0009996025006095421, + "loss": 1.00918233, + "num_input_tokens_seen": 16950416, + "router_z_loss_mlp": 0.1730957, + "step": 220, + "time_per_iteration": 3.3006374835968018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.10944285, + "balance_loss_mlp": 6.84272289, + "epoch": 0.042516352443247404, + "flos": 1468814777856.0, + "grad_norm": 0.48497418398004566, + "language_loss": 0.77783144, + "learning_rate": 0.0009995899834191028, + "loss": 0.88727427, + "num_input_tokens_seen": 17180944, + "router_z_loss_mlp": 41.0, + "step": 221, + "time_per_iteration": 5.7136383056640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01601015, + "balance_loss_mlp": 1.58291924, + "epoch": 0.042708734128510964, + "flos": 654419091456.0, + "grad_norm": 0.10763646442297387, + "language_loss": 0.99765503, + "learning_rate": 0.0009995772722706307, + "loss": 1.0136652, + "num_input_tokens_seen": 17257792, + "router_z_loss_mlp": 0.1809082, + "step": 222, + "time_per_iteration": 2.8322792053222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01658843, + "balance_loss_mlp": 1.63811278, + "epoch": 0.04290111581377453, + "flos": 431601578496.0, + "grad_norm": 0.16393394652444138, + "language_loss": 1.13557565, + "learning_rate": 0.0009995643671690604, + "loss": 1.1521641, + "num_input_tokens_seen": 17320288, + "router_z_loss_mlp": 0.20739746, + "step": 223, + "time_per_iteration": 2.470729351043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0163871, + "balance_loss_mlp": 1.61504686, + "epoch": 0.04309349749903809, + "flos": 644379701760.0, + "grad_norm": 0.08733094203359489, + "language_loss": 1.00837708, + "learning_rate": 0.0009995512681194023, + "loss": 1.02476418, + "num_input_tokens_seen": 17396672, + "router_z_loss_mlp": 0.23632812, + "step": 224, + "time_per_iteration": 2.8274452686309814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01615568, + "balance_loss_mlp": 1.58755326, + "epoch": 0.04328587918430166, + "flos": 830861853696.0, + "grad_norm": 0.12001676841435771, + "language_loss": 0.98664522, + "learning_rate": 0.0009995379751267417, + "loss": 1.00280082, + "num_input_tokens_seen": 17488096, + "router_z_loss_mlp": 0.28027344, + "step": 225, + "time_per_iteration": 3.275660991668701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01617416, + "balance_loss_mlp": 1.58639741, + "epoch": 0.043478260869565216, + "flos": 524804852736.0, + "grad_norm": 0.1467276253632499, + "language_loss": 1.0007726, + "learning_rate": 0.0009995244881962398, + "loss": 1.01694679, + "num_input_tokens_seen": 17557632, + "router_z_loss_mlp": 0.30981445, + "step": 226, + "time_per_iteration": 2.6300203800201416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01601732, + "balance_loss_mlp": 1.56787658, + "epoch": 0.04367064255482878, + "flos": 439253328384.0, + "grad_norm": 0.095918638324787, + "language_loss": 1.01389623, + "learning_rate": 0.0009995108073331323, + "loss": 1.02991343, + "num_input_tokens_seen": 17626672, + "router_z_loss_mlp": 0.33862305, + "step": 227, + "time_per_iteration": 2.667628765106201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0158134, + "balance_loss_mlp": 1.5462923, + "epoch": 0.04386302424009234, + "flos": 507109330944.0, + "grad_norm": 0.08564981186298011, + "language_loss": 1.04024279, + "learning_rate": 0.0009994969325427309, + "loss": 1.05605614, + "num_input_tokens_seen": 17698624, + "router_z_loss_mlp": 0.35058594, + "step": 228, + "time_per_iteration": 2.6454501152038574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01558795, + "balance_loss_mlp": 1.52224541, + "epoch": 0.04405540592535591, + "flos": 540432829440.0, + "grad_norm": 0.07744391701114339, + "language_loss": 1.00052619, + "learning_rate": 0.0009994828638304218, + "loss": 1.016114, + "num_input_tokens_seen": 17767760, + "router_z_loss_mlp": 0.36547852, + "step": 229, + "time_per_iteration": 2.6468071937561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01516137, + "balance_loss_mlp": 1.47794271, + "epoch": 0.04424778761061947, + "flos": 446136850944.0, + "grad_norm": 0.08052263902742013, + "language_loss": 1.06763554, + "learning_rate": 0.0009994686012016675, + "loss": 1.08279693, + "num_input_tokens_seen": 17833664, + "router_z_loss_mlp": 0.3815918, + "step": 230, + "time_per_iteration": 2.5467634201049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01483037, + "balance_loss_mlp": 1.44515228, + "epoch": 0.044440169295883035, + "flos": 700383435264.0, + "grad_norm": 0.05918307184238542, + "language_loss": 1.0518043, + "learning_rate": 0.000999454144662005, + "loss": 1.06663465, + "num_input_tokens_seen": 17908880, + "router_z_loss_mlp": 0.37866211, + "step": 231, + "time_per_iteration": 2.8704099655151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01473358, + "balance_loss_mlp": 1.43549716, + "epoch": 0.044632550981146595, + "flos": 588055873536.0, + "grad_norm": 0.08626514264815018, + "language_loss": 0.99676436, + "learning_rate": 0.0009994394942170468, + "loss": 1.01149797, + "num_input_tokens_seen": 17978208, + "router_z_loss_mlp": 0.37866211, + "step": 232, + "time_per_iteration": 2.6578898429870605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01461415, + "balance_loss_mlp": 1.4258194, + "epoch": 0.04482493266641016, + "flos": 554534525952.0, + "grad_norm": 0.07124765242066121, + "language_loss": 0.96965969, + "learning_rate": 0.0009994246498724808, + "loss": 0.98427379, + "num_input_tokens_seen": 18049296, + "router_z_loss_mlp": 0.35620117, + "step": 233, + "time_per_iteration": 2.7764015197753906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01463645, + "balance_loss_mlp": 1.42790616, + "epoch": 0.04501731435167372, + "flos": 722500646400.0, + "grad_norm": 0.07759597622956232, + "language_loss": 0.99069166, + "learning_rate": 0.00099940961163407, + "loss": 1.00532806, + "num_input_tokens_seen": 18123296, + "router_z_loss_mlp": 0.35766602, + "step": 234, + "time_per_iteration": 2.8431143760681152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01454599, + "balance_loss_mlp": 1.42098188, + "epoch": 0.04520969603693728, + "flos": 511539784704.0, + "grad_norm": 0.05931413709293958, + "language_loss": 1.02564597, + "learning_rate": 0.0009993943795076528, + "loss": 1.04019189, + "num_input_tokens_seen": 18192784, + "router_z_loss_mlp": 0.33642578, + "step": 235, + "time_per_iteration": 2.645988702774048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01444244, + "balance_loss_mlp": 1.40936303, + "epoch": 0.04540207772220085, + "flos": 364854246912.0, + "grad_norm": 0.07280953320994132, + "language_loss": 1.04776168, + "learning_rate": 0.0009993789534991427, + "loss": 1.062204, + "num_input_tokens_seen": 18254064, + "router_z_loss_mlp": 0.34912109, + "step": 236, + "time_per_iteration": 2.4084837436676025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01418385, + "balance_loss_mlp": 1.38390946, + "epoch": 0.045594459407464406, + "flos": 522407038464.0, + "grad_norm": 0.060943880380569936, + "language_loss": 0.99500269, + "learning_rate": 0.0009993633336145287, + "loss": 1.00918651, + "num_input_tokens_seen": 18325728, + "router_z_loss_mlp": 0.34472656, + "step": 237, + "time_per_iteration": 2.6044533252716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01406135, + "balance_loss_mlp": 1.3730185, + "epoch": 0.04578684109272797, + "flos": 671442338304.0, + "grad_norm": 0.06747057459653658, + "language_loss": 1.03573179, + "learning_rate": 0.0009993475198598752, + "loss": 1.04979324, + "num_input_tokens_seen": 18408608, + "router_z_loss_mlp": 0.33129883, + "step": 238, + "time_per_iteration": 2.9948084354400635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01383668, + "balance_loss_mlp": 1.35164809, + "epoch": 0.04597922277799153, + "flos": 541387321344.0, + "grad_norm": 0.07135856148897902, + "language_loss": 0.99909985, + "learning_rate": 0.0009993315122413212, + "loss": 1.01293659, + "num_input_tokens_seen": 18471920, + "router_z_loss_mlp": 0.32006836, + "step": 239, + "time_per_iteration": 2.5848827362060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01369111, + "balance_loss_mlp": 1.33773541, + "epoch": 0.0461716044632551, + "flos": 458732616192.0, + "grad_norm": 0.056000088810755834, + "language_loss": 1.0008105, + "learning_rate": 0.0009993153107650818, + "loss": 1.01450157, + "num_input_tokens_seen": 18540496, + "router_z_loss_mlp": 0.31347656, + "step": 240, + "time_per_iteration": 2.5492687225341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01338815, + "balance_loss_mlp": 1.31015706, + "epoch": 0.04636398614851866, + "flos": 455009342976.0, + "grad_norm": 0.06491754001609312, + "language_loss": 0.99534512, + "learning_rate": 0.0009992989154374468, + "loss": 1.00873327, + "num_input_tokens_seen": 18606944, + "router_z_loss_mlp": 0.28662109, + "step": 241, + "time_per_iteration": 2.511237621307373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01294622, + "balance_loss_mlp": 1.26833653, + "epoch": 0.046556367833782225, + "flos": 556558401024.0, + "grad_norm": 0.07592069792168304, + "language_loss": 1.06626534, + "learning_rate": 0.0009992823262647817, + "loss": 1.07921147, + "num_input_tokens_seen": 18679520, + "router_z_loss_mlp": 0.26293945, + "step": 242, + "time_per_iteration": 2.7618701457977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249282, + "balance_loss_mlp": 1.22240043, + "epoch": 0.046748749519045785, + "flos": 592625949696.0, + "grad_norm": 0.0687662987323222, + "language_loss": 1.00893593, + "learning_rate": 0.0009992655432535264, + "loss": 1.0214287, + "num_input_tokens_seen": 18756656, + "router_z_loss_mlp": 0.26879883, + "step": 243, + "time_per_iteration": 2.7471935749053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01255015, + "balance_loss_mlp": 1.23083937, + "epoch": 0.04694113120430935, + "flos": 569596506624.0, + "grad_norm": 0.07373455055845594, + "language_loss": 1.0054853, + "learning_rate": 0.0009992485664101973, + "loss": 1.01803541, + "num_input_tokens_seen": 18829792, + "router_z_loss_mlp": 0.24169922, + "step": 244, + "time_per_iteration": 2.635344982147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295554, + "balance_loss_mlp": 1.27291572, + "epoch": 0.04713351288957291, + "flos": 863401158144.0, + "grad_norm": 0.10584905626659928, + "language_loss": 1.03312445, + "learning_rate": 0.000999231395741385, + "loss": 1.04607987, + "num_input_tokens_seen": 18906864, + "router_z_loss_mlp": 0.22631836, + "step": 245, + "time_per_iteration": 3.093386173248291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0128706, + "balance_loss_mlp": 1.26464868, + "epoch": 0.04732589457483648, + "flos": 536961249792.0, + "grad_norm": 0.08844420521863233, + "language_loss": 1.01371169, + "learning_rate": 0.0009992140312537557, + "loss": 1.02658224, + "num_input_tokens_seen": 18973632, + "router_z_loss_mlp": 0.22412109, + "step": 246, + "time_per_iteration": 2.667579412460327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01256359, + "balance_loss_mlp": 1.23515141, + "epoch": 0.04751827626010004, + "flos": 761566910976.0, + "grad_norm": 0.052835972446563725, + "language_loss": 0.9609164, + "learning_rate": 0.000999196472954051, + "loss": 0.97347999, + "num_input_tokens_seen": 19052944, + "router_z_loss_mlp": 0.2121582, + "step": 247, + "time_per_iteration": 2.9537084102630615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02369813, + "balance_loss_mlp": 2.16687083, + "epoch": 0.0477106579453636, + "flos": 1578961313280.0, + "grad_norm": 0.2151482568863758, + "language_loss": 0.79424852, + "learning_rate": 0.0009991787208490878, + "loss": 0.81794667, + "num_input_tokens_seen": 19286288, + "router_z_loss_mlp": 2.03125, + "step": 248, + "time_per_iteration": 5.758621454238892 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01289622, + "balance_loss_mlp": 1.27137113, + "epoch": 0.04790303963062716, + "flos": 457535195136.0, + "grad_norm": 0.10849969336884063, + "language_loss": 1.03316629, + "learning_rate": 0.0009991607749457578, + "loss": 1.04606247, + "num_input_tokens_seen": 19349296, + "router_z_loss_mlp": 0.18261719, + "step": 249, + "time_per_iteration": 2.5432913303375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01334566, + "balance_loss_mlp": 1.31724536, + "epoch": 0.04809542131589073, + "flos": 782079266304.0, + "grad_norm": 0.08264534697846654, + "language_loss": 1.01180637, + "learning_rate": 0.0009991426352510286, + "loss": 1.02515209, + "num_input_tokens_seen": 19428416, + "router_z_loss_mlp": 0.17321777, + "step": 250, + "time_per_iteration": 3.1542766094207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01351096, + "balance_loss_mlp": 1.33215368, + "epoch": 0.04828780300115429, + "flos": 558995503104.0, + "grad_norm": 0.06435857362074206, + "language_loss": 1.03307557, + "learning_rate": 0.0009991243017719422, + "loss": 1.04658651, + "num_input_tokens_seen": 19498688, + "router_z_loss_mlp": 0.18933105, + "step": 251, + "time_per_iteration": 2.693882942199707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01333217, + "balance_loss_mlp": 1.31485844, + "epoch": 0.048480184686417856, + "flos": 501682277376.0, + "grad_norm": 0.09276508096019526, + "language_loss": 0.97794825, + "learning_rate": 0.0009991057745156165, + "loss": 0.99128038, + "num_input_tokens_seen": 19567568, + "router_z_loss_mlp": 0.18347168, + "step": 252, + "time_per_iteration": 2.628873109817505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01867297, + "balance_loss_mlp": 1.75514495, + "epoch": 0.048672566371681415, + "flos": 1535585430528.0, + "grad_norm": 0.16359674361847032, + "language_loss": 0.81910986, + "learning_rate": 0.0009990870534892446, + "loss": 0.8377828, + "num_input_tokens_seen": 19796368, + "router_z_loss_mlp": 1.125, + "step": 253, + "time_per_iteration": 5.060615062713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01285031, + "balance_loss_mlp": 1.26567185, + "epoch": 0.04886494805694498, + "flos": 537665458176.0, + "grad_norm": 0.07164286827098729, + "language_loss": 1.06546187, + "learning_rate": 0.0009990681387000943, + "loss": 1.07831216, + "num_input_tokens_seen": 19870480, + "router_z_loss_mlp": 0.19384766, + "step": 254, + "time_per_iteration": 2.783367395401001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01275754, + "balance_loss_mlp": 1.25606036, + "epoch": 0.04905732974220854, + "flos": 679841966592.0, + "grad_norm": 0.06618046133348403, + "language_loss": 1.01404011, + "learning_rate": 0.0009990490301555093, + "loss": 1.02679765, + "num_input_tokens_seen": 19956288, + "router_z_loss_mlp": 0.19689941, + "step": 255, + "time_per_iteration": 2.9520761966705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01936632, + "balance_loss_mlp": 1.86796737, + "epoch": 0.04924971142747211, + "flos": 1420408949760.0, + "grad_norm": 0.31562964738653715, + "language_loss": 0.79215157, + "learning_rate": 0.0009990297278629078, + "loss": 0.81151783, + "num_input_tokens_seen": 20180080, + "router_z_loss_mlp": 0.6875, + "step": 256, + "time_per_iteration": 4.825209856033325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01615246, + "balance_loss_mlp": 1.55344784, + "epoch": 0.04944209311273567, + "flos": 1557202074624.0, + "grad_norm": 0.16937574338078817, + "language_loss": 0.79242742, + "learning_rate": 0.000999010231829784, + "loss": 0.80857986, + "num_input_tokens_seen": 20413456, + "router_z_loss_mlp": 0.6171875, + "step": 257, + "time_per_iteration": 4.995501518249512 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0163115, + "balance_loss_mlp": 1.58422887, + "epoch": 0.04963447479799923, + "flos": 1569985514496.0, + "grad_norm": 0.13524925240989144, + "language_loss": 0.69975883, + "learning_rate": 0.0009989905420637066, + "loss": 0.71607035, + "num_input_tokens_seen": 20644736, + "router_z_loss_mlp": 0.46875, + "step": 258, + "time_per_iteration": 4.841471910476685 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272668, + "balance_loss_mlp": 1.24463022, + "epoch": 0.049826856483262794, + "flos": 625063357440.0, + "grad_norm": 0.06365504505971183, + "language_loss": 0.95603192, + "learning_rate": 0.0009989706585723202, + "loss": 0.96875864, + "num_input_tokens_seen": 20719040, + "router_z_loss_mlp": 0.28076172, + "step": 259, + "time_per_iteration": 2.7635786533355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130022, + "balance_loss_mlp": 1.27020288, + "epoch": 0.05001923816852635, + "flos": 503912765952.0, + "grad_norm": 0.062257698278494894, + "language_loss": 1.01846027, + "learning_rate": 0.0009989505813633442, + "loss": 1.03146255, + "num_input_tokens_seen": 20789376, + "router_z_loss_mlp": 0.29980469, + "step": 260, + "time_per_iteration": 2.6451833248138428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0131611, + "balance_loss_mlp": 1.28101516, + "epoch": 0.05021161985378992, + "flos": 587066476032.0, + "grad_norm": 0.06290514068599455, + "language_loss": 1.01911807, + "learning_rate": 0.000998930310444573, + "loss": 1.03227913, + "num_input_tokens_seen": 20857856, + "router_z_loss_mlp": 0.35083008, + "step": 261, + "time_per_iteration": 2.6989662647247314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01324978, + "balance_loss_mlp": 1.2880708, + "epoch": 0.05040400153905348, + "flos": 633029409792.0, + "grad_norm": 0.0625839964239575, + "language_loss": 1.00387836, + "learning_rate": 0.0009989098458238765, + "loss": 1.01712811, + "num_input_tokens_seen": 20931232, + "router_z_loss_mlp": 0.36914062, + "step": 262, + "time_per_iteration": 2.7581043243408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319841, + "balance_loss_mlp": 1.28395867, + "epoch": 0.050596383224317046, + "flos": 553344307200.0, + "grad_norm": 0.06067150197267865, + "language_loss": 0.99905968, + "learning_rate": 0.0009988891875091998, + "loss": 1.01225805, + "num_input_tokens_seen": 21012672, + "router_z_loss_mlp": 0.35913086, + "step": 263, + "time_per_iteration": 2.7601842880249023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0131413, + "balance_loss_mlp": 1.27793837, + "epoch": 0.050788764909580605, + "flos": 549389689344.0, + "grad_norm": 0.07440292928735547, + "language_loss": 0.94277728, + "learning_rate": 0.0009988683355085636, + "loss": 0.95591855, + "num_input_tokens_seen": 21088592, + "router_z_loss_mlp": 0.36206055, + "step": 264, + "time_per_iteration": 2.7262909412384033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01277315, + "balance_loss_mlp": 1.24248254, + "epoch": 0.05098114659484417, + "flos": 604812870144.0, + "grad_norm": 0.06984595792035174, + "language_loss": 1.02861905, + "learning_rate": 0.000998847289830063, + "loss": 1.04139221, + "num_input_tokens_seen": 21169840, + "router_z_loss_mlp": 0.34838867, + "step": 265, + "time_per_iteration": 2.8318397998809814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01256298, + "balance_loss_mlp": 1.22272849, + "epoch": 0.05117352828010773, + "flos": 438317775360.0, + "grad_norm": 0.08677906198544101, + "language_loss": 0.95779377, + "learning_rate": 0.0009988260504818682, + "loss": 0.9703567, + "num_input_tokens_seen": 21236144, + "router_z_loss_mlp": 0.3359375, + "step": 266, + "time_per_iteration": 2.5212388038635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220367, + "balance_loss_mlp": 1.19046903, + "epoch": 0.0513659099653713, + "flos": 504784300032.0, + "grad_norm": 0.09456939977029206, + "language_loss": 1.01958096, + "learning_rate": 0.000998804617472226, + "loss": 1.03178465, + "num_input_tokens_seen": 21304864, + "router_z_loss_mlp": 0.29858398, + "step": 267, + "time_per_iteration": 2.649739980697632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169139, + "balance_loss_mlp": 1.14131606, + "epoch": 0.05155829165063486, + "flos": 695183344128.0, + "grad_norm": 0.07125411147685125, + "language_loss": 0.97574937, + "learning_rate": 0.0009987829908094568, + "loss": 0.98744082, + "num_input_tokens_seen": 21377504, + "router_z_loss_mlp": 0.27856445, + "step": 268, + "time_per_iteration": 2.816098690032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119703, + "balance_loss_mlp": 1.09379935, + "epoch": 0.051750673335898424, + "flos": 1347751830528.0, + "grad_norm": 0.06583247177587333, + "language_loss": 1.04151332, + "learning_rate": 0.0009987611705019569, + "loss": 1.05271029, + "num_input_tokens_seen": 21463840, + "router_z_loss_mlp": 0.25927734, + "step": 269, + "time_per_iteration": 4.478148460388184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103433, + "balance_loss_mlp": 1.08008027, + "epoch": 0.051943055021161984, + "flos": 489362936832.0, + "grad_norm": 0.06787757239342199, + "language_loss": 1.02481639, + "learning_rate": 0.0009987391565581978, + "loss": 1.03585076, + "num_input_tokens_seen": 21531184, + "router_z_loss_mlp": 0.23364258, + "step": 270, + "time_per_iteration": 2.5662009716033936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111004, + "balance_loss_mlp": 1.08859241, + "epoch": 0.05213543670642555, + "flos": 545504882688.0, + "grad_norm": 0.08198896814085149, + "language_loss": 0.9504528, + "learning_rate": 0.000998716948986726, + "loss": 0.96156287, + "num_input_tokens_seen": 21612224, + "router_z_loss_mlp": 0.22424316, + "step": 271, + "time_per_iteration": 2.7815349102020264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158552, + "balance_loss_mlp": 1.13697529, + "epoch": 0.05232781839168911, + "flos": 603285179904.0, + "grad_norm": 0.07646156534985457, + "language_loss": 0.97641921, + "learning_rate": 0.0009986945477961633, + "loss": 0.9880048, + "num_input_tokens_seen": 21681024, + "router_z_loss_mlp": 0.21569824, + "step": 272, + "time_per_iteration": 2.694547414779663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188724, + "balance_loss_mlp": 1.16735017, + "epoch": 0.052520200076952676, + "flos": 538218307584.0, + "grad_norm": 0.07381807258867126, + "language_loss": 1.02498066, + "learning_rate": 0.0009986719529952066, + "loss": 1.03686786, + "num_input_tokens_seen": 21761616, + "router_z_loss_mlp": 0.21386719, + "step": 273, + "time_per_iteration": 2.8339192867279053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121752, + "balance_loss_mlp": 1.19785035, + "epoch": 0.052712581762216236, + "flos": 463148513280.0, + "grad_norm": 0.0738352941440963, + "language_loss": 1.01808548, + "learning_rate": 0.000998649164592628, + "loss": 1.03026068, + "num_input_tokens_seen": 21828416, + "router_z_loss_mlp": 0.19677734, + "step": 274, + "time_per_iteration": 2.60577130317688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236713, + "balance_loss_mlp": 1.21763909, + "epoch": 0.0529049634474798, + "flos": 547749927936.0, + "grad_norm": 0.08134169766286939, + "language_loss": 0.99272913, + "learning_rate": 0.0009986261825972748, + "loss": 1.00509632, + "num_input_tokens_seen": 21901600, + "router_z_loss_mlp": 0.19055176, + "step": 275, + "time_per_iteration": 2.652561664581299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196689, + "balance_loss_mlp": 1.17834246, + "epoch": 0.05309734513274336, + "flos": 617727320064.0, + "grad_norm": 0.09111845604121613, + "language_loss": 1.01860571, + "learning_rate": 0.000998603007018069, + "loss": 1.03057253, + "num_input_tokens_seen": 21979312, + "router_z_loss_mlp": 0.18334961, + "step": 276, + "time_per_iteration": 2.8293774127960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011443, + "balance_loss_mlp": 1.1273365, + "epoch": 0.05328972681800693, + "flos": 605220304896.0, + "grad_norm": 0.07377841396756965, + "language_loss": 0.99345076, + "learning_rate": 0.0009985796378640089, + "loss": 1.00489378, + "num_input_tokens_seen": 22053776, + "router_z_loss_mlp": 0.16955566, + "step": 277, + "time_per_iteration": 2.694716215133667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098185, + "balance_loss_mlp": 1.08067346, + "epoch": 0.05348210850327049, + "flos": 604197411840.0, + "grad_norm": 0.07074934963985437, + "language_loss": 0.99532163, + "learning_rate": 0.0009985560751441665, + "loss": 1.00630355, + "num_input_tokens_seen": 22134304, + "router_z_loss_mlp": 0.1751709, + "step": 278, + "time_per_iteration": 2.798563241958618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095446, + "balance_loss_mlp": 1.07736206, + "epoch": 0.053674490188534055, + "flos": 630480236544.0, + "grad_norm": 0.054749659326078955, + "language_loss": 1.01733184, + "learning_rate": 0.00099853231886769, + "loss": 1.02828622, + "num_input_tokens_seen": 22212896, + "router_z_loss_mlp": 0.1809082, + "step": 279, + "time_per_iteration": 2.780940532684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134885, + "balance_loss_mlp": 1.11744475, + "epoch": 0.053866871873797614, + "flos": 478939433472.0, + "grad_norm": 0.06375435082524677, + "language_loss": 1.01461124, + "learning_rate": 0.0009985083690438024, + "loss": 1.02595997, + "num_input_tokens_seen": 22287216, + "router_z_loss_mlp": 0.17443848, + "step": 280, + "time_per_iteration": 2.68762469291687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145965, + "balance_loss_mlp": 1.12913251, + "epoch": 0.054059253559061174, + "flos": 787673645568.0, + "grad_norm": 0.07384801764192533, + "language_loss": 0.92380941, + "learning_rate": 0.0009984842256818016, + "loss": 0.93526906, + "num_input_tokens_seen": 22370864, + "router_z_loss_mlp": 0.16845703, + "step": 281, + "time_per_iteration": 3.054032325744629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114791, + "balance_loss_mlp": 1.13080359, + "epoch": 0.05425163524432474, + "flos": 628076630016.0, + "grad_norm": 0.082175996598207, + "language_loss": 1.0314945, + "learning_rate": 0.0009984598887910613, + "loss": 1.04297376, + "num_input_tokens_seen": 22440080, + "router_z_loss_mlp": 0.17114258, + "step": 282, + "time_per_iteration": 2.7095611095428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144379, + "balance_loss_mlp": 1.12627149, + "epoch": 0.0544440169295883, + "flos": 615453161472.0, + "grad_norm": 0.06813866095032944, + "language_loss": 0.9902432, + "learning_rate": 0.0009984353583810297, + "loss": 1.00168693, + "num_input_tokens_seen": 22517936, + "router_z_loss_mlp": 0.18103027, + "step": 283, + "time_per_iteration": 2.804438829421997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124262, + "balance_loss_mlp": 1.10624945, + "epoch": 0.05463639861485187, + "flos": 647471549952.0, + "grad_norm": 0.10003204141391345, + "language_loss": 1.01340103, + "learning_rate": 0.0009984106344612302, + "loss": 1.02464366, + "num_input_tokens_seen": 22590480, + "router_z_loss_mlp": 0.18017578, + "step": 284, + "time_per_iteration": 2.7521376609802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109552, + "balance_loss_mlp": 1.07819879, + "epoch": 0.054828780300115426, + "flos": 796845883392.0, + "grad_norm": 0.07143310654982075, + "language_loss": 0.96421391, + "learning_rate": 0.0009983857170412615, + "loss": 0.97516906, + "num_input_tokens_seen": 22668144, + "router_z_loss_mlp": 0.17321777, + "step": 285, + "time_per_iteration": 2.9796621799468994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089942, + "balance_loss_mlp": 1.07363439, + "epoch": 0.05502116198537899, + "flos": 549414420480.0, + "grad_norm": 0.05224422052371224, + "language_loss": 0.95713383, + "learning_rate": 0.000998360606130798, + "loss": 0.96803325, + "num_input_tokens_seen": 22749648, + "router_z_loss_mlp": 0.16308594, + "step": 286, + "time_per_iteration": 2.7950801849365234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02908189, + "balance_loss_mlp": 2.83799911, + "epoch": 0.05521354367064255, + "flos": 1406967791616.0, + "grad_norm": 0.233188183772104, + "language_loss": 0.69073117, + "learning_rate": 0.0009983353017395877, + "loss": 0.71981305, + "num_input_tokens_seen": 22982752, + "router_z_loss_mlp": 0.703125, + "step": 287, + "time_per_iteration": 4.876653432846069 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179574, + "balance_loss_mlp": 1.1627655, + "epoch": 0.05540592535590612, + "flos": 645123197952.0, + "grad_norm": 0.17417830683261867, + "language_loss": 1.0204829, + "learning_rate": 0.0009983098038774552, + "loss": 1.03227878, + "num_input_tokens_seen": 23053584, + "router_z_loss_mlp": 0.16821289, + "step": 288, + "time_per_iteration": 2.7781550884246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02101154, + "balance_loss_mlp": 2.07540464, + "epoch": 0.05559830704116968, + "flos": 1510293413376.0, + "grad_norm": 0.1730100464590254, + "language_loss": 0.78170228, + "learning_rate": 0.0009982841125542993, + "loss": 0.80271375, + "num_input_tokens_seen": 23280256, + "router_z_loss_mlp": 0.2578125, + "step": 289, + "time_per_iteration": 4.801970481872559 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01338926, + "balance_loss_mlp": 1.32332134, + "epoch": 0.055790688726433245, + "flos": 508078379520.0, + "grad_norm": 0.11288123874753296, + "language_loss": 0.99586821, + "learning_rate": 0.0009982582277800948, + "loss": 1.00925756, + "num_input_tokens_seen": 23345760, + "router_z_loss_mlp": 0.15588379, + "step": 290, + "time_per_iteration": 2.6019012928009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01376714, + "balance_loss_mlp": 1.36076403, + "epoch": 0.055983070411696804, + "flos": 657570576384.0, + "grad_norm": 0.11158393407579077, + "language_loss": 1.06464982, + "learning_rate": 0.0009982321495648908, + "loss": 1.07841706, + "num_input_tokens_seen": 23420720, + "router_z_loss_mlp": 0.15942383, + "step": 291, + "time_per_iteration": 2.7833075523376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281101, + "balance_loss_mlp": 1.26441216, + "epoch": 0.05617545209696037, + "flos": 587051919360.0, + "grad_norm": 0.091490024999748, + "language_loss": 0.97375935, + "learning_rate": 0.0009982058779188115, + "loss": 0.98657036, + "num_input_tokens_seen": 23492576, + "router_z_loss_mlp": 0.16699219, + "step": 292, + "time_per_iteration": 2.700998067855835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223751, + "balance_loss_mlp": 1.20634639, + "epoch": 0.05636783378222393, + "flos": 611331217920.0, + "grad_norm": 0.09093545163733599, + "language_loss": 1.05090272, + "learning_rate": 0.0009981794128520567, + "loss": 1.06314015, + "num_input_tokens_seen": 23569824, + "router_z_loss_mlp": 0.17431641, + "step": 293, + "time_per_iteration": 2.769562244415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172918, + "balance_loss_mlp": 1.15501237, + "epoch": 0.0565602154674875, + "flos": 667847102976.0, + "grad_norm": 0.08200667246549262, + "language_loss": 1.02219713, + "learning_rate": 0.000998152754374901, + "loss": 1.03392649, + "num_input_tokens_seen": 23649984, + "router_z_loss_mlp": 0.17919922, + "step": 294, + "time_per_iteration": 2.8421483039855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140496, + "balance_loss_mlp": 1.12121987, + "epoch": 0.05675259715275106, + "flos": 616963474944.0, + "grad_norm": 0.06298459153201627, + "language_loss": 0.97706711, + "learning_rate": 0.0009981259024976943, + "loss": 0.98847204, + "num_input_tokens_seen": 23722032, + "router_z_loss_mlp": 0.19250488, + "step": 295, + "time_per_iteration": 2.709536552429199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131247, + "balance_loss_mlp": 1.11139894, + "epoch": 0.05694497883801462, + "flos": 751424214528.0, + "grad_norm": 0.13011693222478776, + "language_loss": 0.96307456, + "learning_rate": 0.0009980988572308612, + "loss": 0.97438705, + "num_input_tokens_seen": 23797376, + "router_z_loss_mlp": 0.19848633, + "step": 296, + "time_per_iteration": 2.9606993198394775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125313, + "balance_loss_mlp": 1.10492802, + "epoch": 0.05713736052327818, + "flos": 711669708288.0, + "grad_norm": 0.06808560063607492, + "language_loss": 0.9959082, + "learning_rate": 0.0009980716185849015, + "loss": 1.00716126, + "num_input_tokens_seen": 23880496, + "router_z_loss_mlp": 0.20385742, + "step": 297, + "time_per_iteration": 2.952467203140259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133548, + "balance_loss_mlp": 1.11424804, + "epoch": 0.05732974220854175, + "flos": 468737100288.0, + "grad_norm": 0.05570922928007862, + "language_loss": 0.95103967, + "learning_rate": 0.0009980441865703904, + "loss": 0.9623751, + "num_input_tokens_seen": 23950016, + "router_z_loss_mlp": 0.19299316, + "step": 298, + "time_per_iteration": 2.6629996299743652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125947, + "balance_loss_mlp": 1.10630131, + "epoch": 0.05752212389380531, + "flos": 601143441408.0, + "grad_norm": 0.06175770353433084, + "language_loss": 1.038656, + "learning_rate": 0.000998016561197978, + "loss": 1.04991555, + "num_input_tokens_seen": 24020064, + "router_z_loss_mlp": 0.19628906, + "step": 299, + "time_per_iteration": 2.7027034759521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122899, + "balance_loss_mlp": 1.10499382, + "epoch": 0.057714505579068875, + "flos": 678344799744.0, + "grad_norm": 0.07709513760197055, + "language_loss": 0.95715761, + "learning_rate": 0.0009979887424783895, + "loss": 0.96838653, + "num_input_tokens_seen": 24095360, + "router_z_loss_mlp": 0.17907715, + "step": 300, + "time_per_iteration": 2.8467562198638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122592, + "balance_loss_mlp": 1.10369694, + "epoch": 0.057906887264332435, + "flos": 595604316672.0, + "grad_norm": 0.05754387138467597, + "language_loss": 0.94804943, + "learning_rate": 0.0009979607304224248, + "loss": 0.95927536, + "num_input_tokens_seen": 24164608, + "router_z_loss_mlp": 0.18908691, + "step": 301, + "time_per_iteration": 2.7457566261291504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135958, + "balance_loss_mlp": 1.11577594, + "epoch": 0.058099268949596, + "flos": 551855904768.0, + "grad_norm": 0.06951393564289957, + "language_loss": 1.02452385, + "learning_rate": 0.000997932525040959, + "loss": 1.03588343, + "num_input_tokens_seen": 24233840, + "router_z_loss_mlp": 0.20166016, + "step": 302, + "time_per_iteration": 2.670464038848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123814, + "balance_loss_mlp": 1.10513425, + "epoch": 0.05829165063485956, + "flos": 507906671616.0, + "grad_norm": 0.06408930588753382, + "language_loss": 1.04041958, + "learning_rate": 0.000997904126344943, + "loss": 1.05165768, + "num_input_tokens_seen": 24302928, + "router_z_loss_mlp": 0.18676758, + "step": 303, + "time_per_iteration": 2.654275417327881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122557, + "balance_loss_mlp": 1.10432982, + "epoch": 0.05848403232012313, + "flos": 614949774336.0, + "grad_norm": 0.10902949066110783, + "language_loss": 1.00108004, + "learning_rate": 0.0009978755343454018, + "loss": 1.0123055, + "num_input_tokens_seen": 24377024, + "router_z_loss_mlp": 0.18212891, + "step": 304, + "time_per_iteration": 2.7061922550201416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118016, + "balance_loss_mlp": 1.10034943, + "epoch": 0.05867641400538669, + "flos": 499835902464.0, + "grad_norm": 0.07196511907519268, + "language_loss": 1.01183403, + "learning_rate": 0.0009978467490534355, + "loss": 1.02301419, + "num_input_tokens_seen": 24442736, + "router_z_loss_mlp": 0.17663574, + "step": 305, + "time_per_iteration": 2.5658843517303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118418, + "balance_loss_mlp": 1.09971452, + "epoch": 0.05886879569065025, + "flos": 531019072512.0, + "grad_norm": 0.05577021807863236, + "language_loss": 0.98775607, + "learning_rate": 0.00099781777048022, + "loss": 0.99894023, + "num_input_tokens_seen": 24514800, + "router_z_loss_mlp": 0.18713379, + "step": 306, + "time_per_iteration": 2.688661813735962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112614, + "balance_loss_mlp": 1.10866416, + "epoch": 0.05906117737591381, + "flos": 488811497472.0, + "grad_norm": 0.06489613907432343, + "language_loss": 0.99682212, + "learning_rate": 0.0009977885986370057, + "loss": 1.00808358, + "num_input_tokens_seen": 24581648, + "router_z_loss_mlp": 0.17480469, + "step": 307, + "time_per_iteration": 2.527008056640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129188, + "balance_loss_mlp": 1.11242771, + "epoch": 0.05925355906117737, + "flos": 591213150720.0, + "grad_norm": 0.060579194597163814, + "language_loss": 0.94911426, + "learning_rate": 0.000997759233535118, + "loss": 0.96040612, + "num_input_tokens_seen": 24658864, + "router_z_loss_mlp": 0.16772461, + "step": 308, + "time_per_iteration": 2.768683433532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108276, + "balance_loss_mlp": 1.09052539, + "epoch": 0.05944594074644094, + "flos": 563373522432.0, + "grad_norm": 0.074144120767366, + "language_loss": 1.01706028, + "learning_rate": 0.0009977296751859576, + "loss": 1.02814317, + "num_input_tokens_seen": 24735808, + "router_z_loss_mlp": 0.17749023, + "step": 309, + "time_per_iteration": 2.710550308227539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109964, + "balance_loss_mlp": 1.0817585, + "epoch": 0.0596383224317045, + "flos": 538483147776.0, + "grad_norm": 0.1012520362466171, + "language_loss": 1.03562367, + "learning_rate": 0.0009976999236009998, + "loss": 1.04662001, + "num_input_tokens_seen": 24807744, + "router_z_loss_mlp": 0.17895508, + "step": 310, + "time_per_iteration": 2.7346065044403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095396, + "balance_loss_mlp": 1.07697809, + "epoch": 0.059830704116968066, + "flos": 560684726784.0, + "grad_norm": 0.05903807060939984, + "language_loss": 1.05193245, + "learning_rate": 0.0009976699787917955, + "loss": 1.06288636, + "num_input_tokens_seen": 24876640, + "router_z_loss_mlp": 0.18408203, + "step": 311, + "time_per_iteration": 2.737165689468384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04018029, + "balance_loss_mlp": 3.94440532, + "epoch": 0.060023085802231625, + "flos": 1569759962112.0, + "grad_norm": 0.34396821433057967, + "language_loss": 0.73442996, + "learning_rate": 0.00099763984076997, + "loss": 0.77461016, + "num_input_tokens_seen": 25110864, + "router_z_loss_mlp": 0.734375, + "step": 312, + "time_per_iteration": 4.990010976791382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130575, + "balance_loss_mlp": 1.11010623, + "epoch": 0.06021546748749519, + "flos": 482415395328.0, + "grad_norm": 0.18656347991450223, + "language_loss": 0.97164261, + "learning_rate": 0.0009976095095472243, + "loss": 0.98294836, + "num_input_tokens_seen": 25179328, + "router_z_loss_mlp": 0.20458984, + "step": 313, + "time_per_iteration": 2.5596373081207275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143867, + "balance_loss_mlp": 1.12198031, + "epoch": 0.06040784917275875, + "flos": 619889407488.0, + "grad_norm": 0.10017394493353984, + "language_loss": 0.98154747, + "learning_rate": 0.0009975789851353334, + "loss": 0.9929862, + "num_input_tokens_seen": 25254128, + "router_z_loss_mlp": 0.21911621, + "step": 314, + "time_per_iteration": 2.783092498779297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113993, + "balance_loss_mlp": 1.11832976, + "epoch": 0.06060023085802232, + "flos": 483292721664.0, + "grad_norm": 0.12837029886330253, + "language_loss": 1.00706339, + "learning_rate": 0.0009975482675461487, + "loss": 1.01846266, + "num_input_tokens_seen": 25324624, + "router_z_loss_mlp": 0.21594238, + "step": 315, + "time_per_iteration": 2.6375765800476074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128184, + "balance_loss_mlp": 1.10697675, + "epoch": 0.06079261254328588, + "flos": 581620483584.0, + "grad_norm": 0.07139597701291463, + "language_loss": 0.9800331, + "learning_rate": 0.0009975173567915952, + "loss": 0.99131489, + "num_input_tokens_seen": 25393648, + "router_z_loss_mlp": 0.21228027, + "step": 316, + "time_per_iteration": 2.680223226547241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116284, + "balance_loss_mlp": 1.09438515, + "epoch": 0.060984994228549444, + "flos": 687492306432.0, + "grad_norm": 0.12898022133672052, + "language_loss": 0.92624593, + "learning_rate": 0.000997486252883674, + "loss": 0.93740869, + "num_input_tokens_seen": 25469152, + "router_z_loss_mlp": 0.21887207, + "step": 317, + "time_per_iteration": 2.835162878036499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104615, + "balance_loss_mlp": 1.08325243, + "epoch": 0.061177375913813004, + "flos": 1314284327424.0, + "grad_norm": 0.06442728945451602, + "language_loss": 0.97186124, + "learning_rate": 0.0009974549558344602, + "loss": 0.98290741, + "num_input_tokens_seen": 25560944, + "router_z_loss_mlp": 0.21350098, + "step": 318, + "time_per_iteration": 3.6293551921844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105129, + "balance_loss_mlp": 1.08439815, + "epoch": 0.06136975759907657, + "flos": 574072040448.0, + "grad_norm": 0.08131052095693254, + "language_loss": 1.07145, + "learning_rate": 0.000997423465656105, + "loss": 1.08250129, + "num_input_tokens_seen": 25631424, + "router_z_loss_mlp": 0.20715332, + "step": 319, + "time_per_iteration": 2.7070071697235107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101786, + "balance_loss_mlp": 1.08168781, + "epoch": 0.06156213928434013, + "flos": 527281242624.0, + "grad_norm": 0.059301156484267634, + "language_loss": 1.04424822, + "learning_rate": 0.0009973917823608335, + "loss": 1.0552659, + "num_input_tokens_seen": 25698176, + "router_z_loss_mlp": 0.20092773, + "step": 320, + "time_per_iteration": 2.6225128173828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110531, + "balance_loss_mlp": 1.0897882, + "epoch": 0.061754520969603696, + "flos": 495238123008.0, + "grad_norm": 0.05387649814829365, + "language_loss": 0.98383266, + "learning_rate": 0.0009973599059609462, + "loss": 0.9949379, + "num_input_tokens_seen": 25773472, + "router_z_loss_mlp": 0.20739746, + "step": 321, + "time_per_iteration": 2.692152261734009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107693, + "balance_loss_mlp": 1.08798778, + "epoch": 0.061946902654867256, + "flos": 439839673344.0, + "grad_norm": 0.06112812680296507, + "language_loss": 0.9711749, + "learning_rate": 0.000997327836468819, + "loss": 0.98225188, + "num_input_tokens_seen": 25841088, + "router_z_loss_mlp": 0.19702148, + "step": 322, + "time_per_iteration": 2.5772383213043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110285, + "balance_loss_mlp": 1.0900557, + "epoch": 0.06213928434013082, + "flos": 598490961408.0, + "grad_norm": 0.0645434874295678, + "language_loss": 0.9942351, + "learning_rate": 0.000997295573896902, + "loss": 1.00533807, + "num_input_tokens_seen": 25919424, + "router_z_loss_mlp": 0.20239258, + "step": 323, + "time_per_iteration": 2.839282274246216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02259253, + "balance_loss_mlp": 2.20088792, + "epoch": 0.06233166602539438, + "flos": 1449393716736.0, + "grad_norm": 0.19547826226404627, + "language_loss": 0.8119604, + "learning_rate": 0.000997263118257721, + "loss": 0.83455294, + "num_input_tokens_seen": 26135504, + "router_z_loss_mlp": 0.58203125, + "step": 324, + "time_per_iteration": 4.67440938949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01888161, + "balance_loss_mlp": 1.83246601, + "epoch": 0.06252404771065795, + "flos": 1462504453632.0, + "grad_norm": 0.11962022052509429, + "language_loss": 0.78571939, + "learning_rate": 0.0009972304695638763, + "loss": 0.80460101, + "num_input_tokens_seen": 26358880, + "router_z_loss_mlp": 0.55859375, + "step": 325, + "time_per_iteration": 4.860283136367798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177486, + "balance_loss_mlp": 1.15595722, + "epoch": 0.06271642939592151, + "flos": 464059335168.0, + "grad_norm": 0.06272096910143152, + "language_loss": 0.93621421, + "learning_rate": 0.000997197627828043, + "loss": 0.94798911, + "num_input_tokens_seen": 26425888, + "router_z_loss_mlp": 0.2154541, + "step": 326, + "time_per_iteration": 2.5594961643218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205877, + "balance_loss_mlp": 1.18165386, + "epoch": 0.06290881108118507, + "flos": 532111776768.0, + "grad_norm": 0.08849931028565244, + "language_loss": 0.89414704, + "learning_rate": 0.0009971645930629716, + "loss": 0.90620589, + "num_input_tokens_seen": 26500656, + "router_z_loss_mlp": 0.2421875, + "step": 327, + "time_per_iteration": 2.7163310050964355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223238, + "balance_loss_mlp": 1.19748878, + "epoch": 0.06310119276644863, + "flos": 673262572032.0, + "grad_norm": 0.09892100413683627, + "language_loss": 1.02883804, + "learning_rate": 0.0009971313652814872, + "loss": 1.04107046, + "num_input_tokens_seen": 26577408, + "router_z_loss_mlp": 0.25769043, + "step": 328, + "time_per_iteration": 2.7786266803741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228803, + "balance_loss_mlp": 1.20175433, + "epoch": 0.0632935744517122, + "flos": 770404497408.0, + "grad_norm": 0.06852265531332852, + "language_loss": 0.99799907, + "learning_rate": 0.0009970979444964903, + "loss": 1.01028717, + "num_input_tokens_seen": 26652048, + "router_z_loss_mlp": 0.27050781, + "step": 329, + "time_per_iteration": 2.952498197555542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235649, + "balance_loss_mlp": 1.2062993, + "epoch": 0.06348595613697576, + "flos": 561649393152.0, + "grad_norm": 0.09680127661829774, + "language_loss": 1.0121367, + "learning_rate": 0.0009970643307209556, + "loss": 1.02449322, + "num_input_tokens_seen": 26728192, + "router_z_loss_mlp": 0.29296875, + "step": 330, + "time_per_iteration": 2.78190541267395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240935, + "balance_loss_mlp": 1.20970178, + "epoch": 0.06367833782223932, + "flos": 675891730944.0, + "grad_norm": 0.08786526055569537, + "language_loss": 0.9788332, + "learning_rate": 0.0009970305239679334, + "loss": 0.99124253, + "num_input_tokens_seen": 26798016, + "router_z_loss_mlp": 0.31201172, + "step": 331, + "time_per_iteration": 2.805845022201538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228576, + "balance_loss_mlp": 1.19891691, + "epoch": 0.06387071950750288, + "flos": 495035891712.0, + "grad_norm": 0.10390832636325384, + "language_loss": 1.03124022, + "learning_rate": 0.0009969965242505483, + "loss": 1.04352593, + "num_input_tokens_seen": 26867536, + "router_z_loss_mlp": 0.29614258, + "step": 332, + "time_per_iteration": 2.676711082458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207199, + "balance_loss_mlp": 1.1777302, + "epoch": 0.06406310119276645, + "flos": 533170985472.0, + "grad_norm": 0.07105898063788767, + "language_loss": 0.98331362, + "learning_rate": 0.0009969623315820007, + "loss": 0.99538565, + "num_input_tokens_seen": 26941216, + "router_z_loss_mlp": 0.29418945, + "step": 333, + "time_per_iteration": 2.6556739807128906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118815, + "balance_loss_mlp": 1.16106582, + "epoch": 0.06425548287803001, + "flos": 455940513792.0, + "grad_norm": 0.08067516684621483, + "language_loss": 0.99160993, + "learning_rate": 0.000996927945975565, + "loss": 1.0034914, + "num_input_tokens_seen": 27006560, + "router_z_loss_mlp": 0.27124023, + "step": 334, + "time_per_iteration": 2.5398526191711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147495, + "balance_loss_mlp": 1.1214596, + "epoch": 0.06444786456329357, + "flos": 559817574912.0, + "grad_norm": 0.08169715789363684, + "language_loss": 0.96174645, + "learning_rate": 0.0009968933674445906, + "loss": 0.97322142, + "num_input_tokens_seen": 27076400, + "router_z_loss_mlp": 0.26062012, + "step": 335, + "time_per_iteration": 2.6592860221862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112932, + "balance_loss_mlp": 1.0879097, + "epoch": 0.06464024624855713, + "flos": 665769383424.0, + "grad_norm": 0.07104021966044574, + "language_loss": 0.97756392, + "learning_rate": 0.0009968585960025028, + "loss": 0.98869324, + "num_input_tokens_seen": 27158672, + "router_z_loss_mlp": 0.25036621, + "step": 336, + "time_per_iteration": 2.9279658794403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01860024, + "balance_loss_mlp": 1.84323907, + "epoch": 0.0648326279338207, + "flos": 1520578704384.0, + "grad_norm": 0.14426901756633248, + "language_loss": 0.77653188, + "learning_rate": 0.0009968236316628006, + "loss": 0.7951321, + "num_input_tokens_seen": 27380592, + "router_z_loss_mlp": 0.16796875, + "step": 337, + "time_per_iteration": 4.810914993286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101739, + "balance_loss_mlp": 1.07948256, + "epoch": 0.06502500961908426, + "flos": 1142872768512.0, + "grad_norm": 0.058812216055980165, + "language_loss": 0.95864177, + "learning_rate": 0.0009967884744390583, + "loss": 0.96965921, + "num_input_tokens_seen": 27469984, + "router_z_loss_mlp": 0.22265625, + "step": 338, + "time_per_iteration": 3.512282371520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146504, + "balance_loss_mlp": 1.12267399, + "epoch": 0.06521739130434782, + "flos": 582339248640.0, + "grad_norm": 0.10793578588091769, + "language_loss": 0.97449529, + "learning_rate": 0.0009967531243449256, + "loss": 0.98596036, + "num_input_tokens_seen": 27543904, + "router_z_loss_mlp": 0.23828125, + "step": 339, + "time_per_iteration": 2.712907075881958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154087, + "balance_loss_mlp": 1.12950587, + "epoch": 0.06540977298961138, + "flos": 497398800384.0, + "grad_norm": 0.06396927661276222, + "language_loss": 1.04641414, + "learning_rate": 0.000996717581394126, + "loss": 1.05795503, + "num_input_tokens_seen": 27609888, + "router_z_loss_mlp": 0.24584961, + "step": 340, + "time_per_iteration": 2.5783133506774902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168509, + "balance_loss_mlp": 1.14584756, + "epoch": 0.06560215467487496, + "flos": 542613855744.0, + "grad_norm": 0.07568553531769329, + "language_loss": 1.05092287, + "learning_rate": 0.000996681845600459, + "loss": 1.062608, + "num_input_tokens_seen": 27683936, + "router_z_loss_mlp": 0.2265625, + "step": 341, + "time_per_iteration": 2.6543757915496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115357, + "balance_loss_mlp": 1.13118291, + "epoch": 0.06579453636013852, + "flos": 413230961664.0, + "grad_norm": 0.06593832485574395, + "language_loss": 0.97027373, + "learning_rate": 0.0009966459169777982, + "loss": 0.9818095, + "num_input_tokens_seen": 27747840, + "router_z_loss_mlp": 0.22387695, + "step": 342, + "time_per_iteration": 2.5120761394500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141132, + "balance_loss_mlp": 1.11848283, + "epoch": 0.06598691804540208, + "flos": 560354457600.0, + "grad_norm": 0.055115078659976495, + "language_loss": 1.05281377, + "learning_rate": 0.0009966097955400924, + "loss": 1.0642252, + "num_input_tokens_seen": 27819728, + "router_z_loss_mlp": 0.22644043, + "step": 343, + "time_per_iteration": 2.6954751014709473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111133, + "balance_loss_mlp": 1.08904982, + "epoch": 0.06617929973066564, + "flos": 571789117440.0, + "grad_norm": 0.06176008438986438, + "language_loss": 0.99064481, + "learning_rate": 0.0009965734813013652, + "loss": 1.00175822, + "num_input_tokens_seen": 27893536, + "router_z_loss_mlp": 0.22277832, + "step": 344, + "time_per_iteration": 2.8235929012298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090293, + "balance_loss_mlp": 1.06726193, + "epoch": 0.06637168141592921, + "flos": 490234470912.0, + "grad_norm": 0.05365164831273283, + "language_loss": 1.01308548, + "learning_rate": 0.0009965369742757151, + "loss": 1.02398837, + "num_input_tokens_seen": 27960976, + "router_z_loss_mlp": 0.23022461, + "step": 345, + "time_per_iteration": 2.5708556175231934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078086, + "balance_loss_mlp": 1.05727243, + "epoch": 0.06656406310119277, + "flos": 1078735656960.0, + "grad_norm": 0.04968829319439664, + "language_loss": 0.97902787, + "learning_rate": 0.0009965002744773152, + "loss": 0.98980874, + "num_input_tokens_seen": 28050864, + "router_z_loss_mlp": 0.20812988, + "step": 346, + "time_per_iteration": 3.4984121322631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086945, + "balance_loss_mlp": 1.06450987, + "epoch": 0.06675644478645633, + "flos": 513421065216.0, + "grad_norm": 0.06258978415695335, + "language_loss": 0.95138037, + "learning_rate": 0.0009964633819204139, + "loss": 0.96224982, + "num_input_tokens_seen": 28122448, + "router_z_loss_mlp": 0.22436523, + "step": 347, + "time_per_iteration": 2.6866109371185303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01752926, + "balance_loss_mlp": 1.73108697, + "epoch": 0.06694882647171989, + "flos": 1446359943168.0, + "grad_norm": 0.11694655230354783, + "language_loss": 0.81801116, + "learning_rate": 0.0009964262966193338, + "loss": 0.83554041, + "num_input_tokens_seen": 28350352, + "router_z_loss_mlp": 0.21875, + "step": 348, + "time_per_iteration": 4.935550928115845 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01583198, + "balance_loss_mlp": 1.56116796, + "epoch": 0.06714120815698346, + "flos": 1551230784000.0, + "grad_norm": 0.0989027294649474, + "language_loss": 0.75153887, + "learning_rate": 0.000996389018588473, + "loss": 0.76737082, + "num_input_tokens_seen": 28585584, + "router_z_loss_mlp": 0.22070312, + "step": 349, + "time_per_iteration": 4.891008615493774 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149858, + "balance_loss_mlp": 1.12826955, + "epoch": 0.06733358984224702, + "flos": 879689673216.0, + "grad_norm": 0.07075764146586616, + "language_loss": 0.94920838, + "learning_rate": 0.000996351547842304, + "loss": 0.96070701, + "num_input_tokens_seen": 28672512, + "router_z_loss_mlp": 0.21582031, + "step": 350, + "time_per_iteration": 3.156322717666626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192552, + "balance_loss_mlp": 1.17055774, + "epoch": 0.06752597152751058, + "flos": 518654651904.0, + "grad_norm": 0.09040238598346795, + "language_loss": 0.93423587, + "learning_rate": 0.0009963138843953744, + "loss": 0.94616139, + "num_input_tokens_seen": 28741520, + "router_z_loss_mlp": 0.2199707, + "step": 351, + "time_per_iteration": 2.610987663269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206077, + "balance_loss_mlp": 1.18405879, + "epoch": 0.06771835321277414, + "flos": 539366266368.0, + "grad_norm": 0.08658544591035036, + "language_loss": 0.97852194, + "learning_rate": 0.000996276028262306, + "loss": 0.9905827, + "num_input_tokens_seen": 28814912, + "router_z_loss_mlp": 0.22021484, + "step": 352, + "time_per_iteration": 2.8413686752319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166048, + "balance_loss_mlp": 1.14382768, + "epoch": 0.0679107348980377, + "flos": 460430604288.0, + "grad_norm": 0.09117082479319542, + "language_loss": 1.04269946, + "learning_rate": 0.0009962379794577964, + "loss": 1.05435991, + "num_input_tokens_seen": 28882192, + "router_z_loss_mlp": 0.22216797, + "step": 353, + "time_per_iteration": 2.591372489929199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114388, + "balance_loss_mlp": 1.12227976, + "epoch": 0.06810311658330127, + "flos": 635601752064.0, + "grad_norm": 0.05781909345233015, + "language_loss": 0.94169199, + "learning_rate": 0.000996199737996617, + "loss": 0.95313084, + "num_input_tokens_seen": 28968576, + "router_z_loss_mlp": 0.21630859, + "step": 354, + "time_per_iteration": 2.9088492393493652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125411, + "balance_loss_mlp": 1.10420346, + "epoch": 0.06829549826856483, + "flos": 464443448832.0, + "grad_norm": 0.06770201052263504, + "language_loss": 1.03043509, + "learning_rate": 0.0009961613038936149, + "loss": 1.04168916, + "num_input_tokens_seen": 29036160, + "router_z_loss_mlp": 0.2121582, + "step": 355, + "time_per_iteration": 2.571904420852661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110613, + "balance_loss_mlp": 1.08917904, + "epoch": 0.06848787995382839, + "flos": 634335929856.0, + "grad_norm": 0.06097004840688574, + "language_loss": 0.95565176, + "learning_rate": 0.000996122677163711, + "loss": 0.96675789, + "num_input_tokens_seen": 29112048, + "router_z_loss_mlp": 0.21435547, + "step": 356, + "time_per_iteration": 2.794982671737671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107296, + "balance_loss_mlp": 1.08667266, + "epoch": 0.06868026163909195, + "flos": 806023913472.0, + "grad_norm": 0.08020973782133771, + "language_loss": 1.01095176, + "learning_rate": 0.000996083857821902, + "loss": 1.02202487, + "num_input_tokens_seen": 29190960, + "router_z_loss_mlp": 0.20629883, + "step": 357, + "time_per_iteration": 3.007086753845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101637, + "balance_loss_mlp": 1.08076346, + "epoch": 0.06887264332435553, + "flos": 438997252608.0, + "grad_norm": 0.08125476198078858, + "language_loss": 0.99797714, + "learning_rate": 0.0009960448458832588, + "loss": 1.00899351, + "num_input_tokens_seen": 29262832, + "router_z_loss_mlp": 0.2088623, + "step": 358, + "time_per_iteration": 2.699530601501465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098146, + "balance_loss_mlp": 1.07872701, + "epoch": 0.06906502500961909, + "flos": 484513463808.0, + "grad_norm": 0.06827746260367892, + "language_loss": 0.99188638, + "learning_rate": 0.000996005641362927, + "loss": 1.00286782, + "num_input_tokens_seen": 29329552, + "router_z_loss_mlp": 0.1940918, + "step": 359, + "time_per_iteration": 2.5541014671325684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103345, + "balance_loss_mlp": 1.0841639, + "epoch": 0.06925740669488265, + "flos": 733293706752.0, + "grad_norm": 0.08731085845928575, + "language_loss": 1.02303529, + "learning_rate": 0.0009959662442761274, + "loss": 1.0340687, + "num_input_tokens_seen": 29410784, + "router_z_loss_mlp": 0.19189453, + "step": 360, + "time_per_iteration": 2.906623363494873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093844, + "balance_loss_mlp": 1.07268476, + "epoch": 0.0694497883801462, + "flos": 552127947264.0, + "grad_norm": 0.06697663210144707, + "language_loss": 0.9595629, + "learning_rate": 0.000995926654638155, + "loss": 0.97050136, + "num_input_tokens_seen": 29486992, + "router_z_loss_mlp": 0.21179199, + "step": 361, + "time_per_iteration": 2.793663501739502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082773, + "balance_loss_mlp": 1.06236482, + "epoch": 0.06964217006540978, + "flos": 677708992512.0, + "grad_norm": 0.06860924301964295, + "language_loss": 0.98198265, + "learning_rate": 0.00099588687246438, + "loss": 0.99281037, + "num_input_tokens_seen": 29557232, + "router_z_loss_mlp": 0.20410156, + "step": 362, + "time_per_iteration": 2.828139305114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085274, + "balance_loss_mlp": 1.06330371, + "epoch": 0.06983455175067334, + "flos": 523987163136.0, + "grad_norm": 0.08747541291209461, + "language_loss": 1.04803789, + "learning_rate": 0.0009958468977702471, + "loss": 1.0588907, + "num_input_tokens_seen": 29625344, + "router_z_loss_mlp": 0.21972656, + "step": 363, + "time_per_iteration": 2.5759966373443604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02224374, + "balance_loss_mlp": 2.20682669, + "epoch": 0.0700269334359369, + "flos": 1575943658496.0, + "grad_norm": 0.2746548069890379, + "language_loss": 0.79734707, + "learning_rate": 0.0009958067305712761, + "loss": 0.81959081, + "num_input_tokens_seen": 29843664, + "router_z_loss_mlp": 0.17578125, + "step": 364, + "time_per_iteration": 4.782835245132446 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134514, + "balance_loss_mlp": 1.11340213, + "epoch": 0.07021931512120046, + "flos": 1012848274944.0, + "grad_norm": 0.08586169827549085, + "language_loss": 0.93286598, + "learning_rate": 0.0009957663708830612, + "loss": 0.94421113, + "num_input_tokens_seen": 29927152, + "router_z_loss_mlp": 0.21105957, + "step": 365, + "time_per_iteration": 3.2484283447265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116189, + "balance_loss_mlp": 1.13884652, + "epoch": 0.07041169680646403, + "flos": 822622348800.0, + "grad_norm": 0.09941073368395695, + "language_loss": 0.97043455, + "learning_rate": 0.0009957258187212714, + "loss": 0.98205346, + "num_input_tokens_seen": 30004928, + "router_z_loss_mlp": 0.23034668, + "step": 366, + "time_per_iteration": 3.009479522705078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01756688, + "balance_loss_mlp": 1.7255981, + "epoch": 0.07060407849172759, + "flos": 1413670993920.0, + "grad_norm": 0.12374795181042475, + "language_loss": 0.79194862, + "learning_rate": 0.0009956850741016502, + "loss": 0.80951542, + "num_input_tokens_seen": 30230256, + "router_z_loss_mlp": 0.31054688, + "step": 367, + "time_per_iteration": 4.82874608039856 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152073, + "balance_loss_mlp": 1.13087749, + "epoch": 0.07079646017699115, + "flos": 512652837888.0, + "grad_norm": 0.06786716904588838, + "language_loss": 0.93450886, + "learning_rate": 0.0009956441370400167, + "loss": 0.94602954, + "num_input_tokens_seen": 30301200, + "router_z_loss_mlp": 0.21191406, + "step": 368, + "time_per_iteration": 2.6226603984832764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153965, + "balance_loss_mlp": 1.13158989, + "epoch": 0.07098884186225471, + "flos": 540240772608.0, + "grad_norm": 0.08343626294497461, + "language_loss": 0.99467343, + "learning_rate": 0.0009956030075522636, + "loss": 1.00621307, + "num_input_tokens_seen": 30377024, + "router_z_loss_mlp": 0.22375488, + "step": 369, + "time_per_iteration": 2.7128794193267822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142856, + "balance_loss_mlp": 1.12137485, + "epoch": 0.07118122354751828, + "flos": 548419230720.0, + "grad_norm": 0.07464528715750075, + "language_loss": 0.98955953, + "learning_rate": 0.0009955616856543587, + "loss": 1.00098813, + "num_input_tokens_seen": 30448896, + "router_z_loss_mlp": 0.21472168, + "step": 370, + "time_per_iteration": 2.613138198852539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118555, + "balance_loss_mlp": 1.0958215, + "epoch": 0.07137360523278184, + "flos": 620612554752.0, + "grad_norm": 0.056434914921328155, + "language_loss": 0.91880834, + "learning_rate": 0.0009955201713623448, + "loss": 0.92999387, + "num_input_tokens_seen": 30523584, + "router_z_loss_mlp": 0.22717285, + "step": 371, + "time_per_iteration": 2.747133255004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01746336, + "balance_loss_mlp": 1.72154021, + "epoch": 0.0715659869180454, + "flos": 1501850115072.0, + "grad_norm": 0.08669176596007007, + "language_loss": 0.76672721, + "learning_rate": 0.000995478464692339, + "loss": 0.78419054, + "num_input_tokens_seen": 30757920, + "router_z_loss_mlp": 0.24707031, + "step": 372, + "time_per_iteration": 4.931428670883179 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102855, + "balance_loss_mlp": 1.08040774, + "epoch": 0.07175836860330896, + "flos": 495246887424.0, + "grad_norm": 0.07044890130803105, + "language_loss": 1.05121827, + "learning_rate": 0.0009954365656605333, + "loss": 1.06224692, + "num_input_tokens_seen": 30824960, + "router_z_loss_mlp": 0.22436523, + "step": 373, + "time_per_iteration": 2.550243616104126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118244, + "balance_loss_mlp": 1.09438992, + "epoch": 0.07195075028857253, + "flos": 785387902464.0, + "grad_norm": 0.05415547127036835, + "language_loss": 0.98150015, + "learning_rate": 0.0009953944742831947, + "loss": 0.99268264, + "num_input_tokens_seen": 30902224, + "router_z_loss_mlp": 0.23864746, + "step": 374, + "time_per_iteration": 2.9659459590911865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125507, + "balance_loss_mlp": 1.10202336, + "epoch": 0.0721431319738361, + "flos": 592799067648.0, + "grad_norm": 0.07003669353380264, + "language_loss": 1.01441097, + "learning_rate": 0.0009953521905766642, + "loss": 1.02566612, + "num_input_tokens_seen": 30984784, + "router_z_loss_mlp": 0.23486328, + "step": 375, + "time_per_iteration": 2.942763566970825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117119, + "balance_loss_mlp": 1.09393334, + "epoch": 0.07233551365909965, + "flos": 547981272576.0, + "grad_norm": 0.06343477824222313, + "language_loss": 0.99901861, + "learning_rate": 0.0009953097145573577, + "loss": 1.01018989, + "num_input_tokens_seen": 31055376, + "router_z_loss_mlp": 0.23193359, + "step": 376, + "time_per_iteration": 2.6275272369384766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113711, + "balance_loss_mlp": 1.09023869, + "epoch": 0.07252789534436321, + "flos": 957170428416.0, + "grad_norm": 0.0678891965164594, + "language_loss": 0.97798675, + "learning_rate": 0.000995267046241766, + "loss": 0.98912394, + "num_input_tokens_seen": 31144944, + "router_z_loss_mlp": 0.23474121, + "step": 377, + "time_per_iteration": 3.2014975547790527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096997, + "balance_loss_mlp": 1.07496762, + "epoch": 0.07272027702962677, + "flos": 507398902272.0, + "grad_norm": 0.0806519998399971, + "language_loss": 0.97275257, + "learning_rate": 0.0009952241856464547, + "loss": 0.98372257, + "num_input_tokens_seen": 31213392, + "router_z_loss_mlp": 0.22045898, + "step": 378, + "time_per_iteration": 2.6189732551574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109641, + "balance_loss_mlp": 1.0746069, + "epoch": 0.07291265871489035, + "flos": 612128558592.0, + "grad_norm": 0.0691049335661606, + "language_loss": 1.04592681, + "learning_rate": 0.0009951811327880632, + "loss": 1.05689096, + "num_input_tokens_seen": 31289840, + "router_z_loss_mlp": 0.21826172, + "step": 379, + "time_per_iteration": 2.7411558628082275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092071, + "balance_loss_mlp": 1.07025611, + "epoch": 0.0731050404001539, + "flos": 495502963200.0, + "grad_norm": 0.05765504670581196, + "language_loss": 0.97682816, + "learning_rate": 0.0009951378876833063, + "loss": 0.98774892, + "num_input_tokens_seen": 31357600, + "router_z_loss_mlp": 0.21813965, + "step": 380, + "time_per_iteration": 2.6211278438568115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081575, + "balance_loss_mlp": 1.06068945, + "epoch": 0.07329742208541747, + "flos": 639677205504.0, + "grad_norm": 0.06809750593205881, + "language_loss": 1.04190159, + "learning_rate": 0.0009950944503489736, + "loss": 1.05271733, + "num_input_tokens_seen": 31428896, + "router_z_loss_mlp": 0.20898438, + "step": 381, + "time_per_iteration": 2.7533762454986572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081401, + "balance_loss_mlp": 1.0607307, + "epoch": 0.07348980377068103, + "flos": 815999284224.0, + "grad_norm": 0.06607035824886899, + "language_loss": 0.98459697, + "learning_rate": 0.0009950508208019285, + "loss": 0.99541104, + "num_input_tokens_seen": 31507424, + "router_z_loss_mlp": 0.20678711, + "step": 382, + "time_per_iteration": 2.9885637760162354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073667, + "balance_loss_mlp": 1.05369973, + "epoch": 0.0736821854559446, + "flos": 508383917568.0, + "grad_norm": 0.05970909775769663, + "language_loss": 1.02745128, + "learning_rate": 0.0009950069990591096, + "loss": 1.03818798, + "num_input_tokens_seen": 31576768, + "router_z_loss_mlp": 0.19958496, + "step": 383, + "time_per_iteration": 2.6111788749694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01835936, + "balance_loss_mlp": 1.8101871, + "epoch": 0.07387456714120816, + "flos": 1553801716224.0, + "grad_norm": 0.167122487372618, + "language_loss": 0.76401371, + "learning_rate": 0.0009949629851375302, + "loss": 0.78237301, + "num_input_tokens_seen": 31797312, + "router_z_loss_mlp": 0.2578125, + "step": 384, + "time_per_iteration": 4.859915494918823 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116619, + "balance_loss_mlp": 1.09575748, + "epoch": 0.07406694882647172, + "flos": 525219489792.0, + "grad_norm": 0.0799084124695288, + "language_loss": 0.96017051, + "learning_rate": 0.0009949187790542777, + "loss": 0.97133672, + "num_input_tokens_seen": 31869568, + "router_z_loss_mlp": 0.20861816, + "step": 385, + "time_per_iteration": 2.6976191997528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124239, + "balance_loss_mlp": 1.10322285, + "epoch": 0.07425933051173528, + "flos": 497468611584.0, + "grad_norm": 0.08753491640442414, + "language_loss": 0.91745877, + "learning_rate": 0.0009948743808265148, + "loss": 0.92870116, + "num_input_tokens_seen": 31941712, + "router_z_loss_mlp": 0.21020508, + "step": 386, + "time_per_iteration": 2.6870572566986084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113476, + "balance_loss_mlp": 1.09249496, + "epoch": 0.07445171219699885, + "flos": 504740630016.0, + "grad_norm": 0.05063210924529089, + "language_loss": 1.0156467, + "learning_rate": 0.0009948297904714782, + "loss": 1.02678132, + "num_input_tokens_seen": 32015232, + "router_z_loss_mlp": 0.20996094, + "step": 387, + "time_per_iteration": 2.668027639389038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097529, + "balance_loss_mlp": 1.07642913, + "epoch": 0.07464409388226241, + "flos": 553693515264.0, + "grad_norm": 0.06830922509793466, + "language_loss": 0.93493366, + "learning_rate": 0.0009947850080064796, + "loss": 0.9459089, + "num_input_tokens_seen": 32094640, + "router_z_loss_mlp": 0.21105957, + "step": 388, + "time_per_iteration": 2.79836106300354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098078, + "balance_loss_mlp": 1.07695365, + "epoch": 0.07483647556752597, + "flos": 776511028224.0, + "grad_norm": 0.06471398355705121, + "language_loss": 0.98276728, + "learning_rate": 0.0009947400334489047, + "loss": 0.99374807, + "num_input_tokens_seen": 32176640, + "router_z_loss_mlp": 0.21130371, + "step": 389, + "time_per_iteration": 3.0046355724334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095267, + "balance_loss_mlp": 1.07513261, + "epoch": 0.07502885725278953, + "flos": 612256596480.0, + "grad_norm": 0.0754939105077014, + "language_loss": 0.90272582, + "learning_rate": 0.0009946948668162145, + "loss": 0.91367853, + "num_input_tokens_seen": 32246704, + "router_z_loss_mlp": 0.20141602, + "step": 390, + "time_per_iteration": 2.724792003631592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091157, + "balance_loss_mlp": 1.06946135, + "epoch": 0.0752212389380531, + "flos": 688324552704.0, + "grad_norm": 0.05626120625508035, + "language_loss": 0.9463594, + "learning_rate": 0.0009946495081259441, + "loss": 0.95727098, + "num_input_tokens_seen": 32320032, + "router_z_loss_mlp": 0.21704102, + "step": 391, + "time_per_iteration": 2.8221397399902344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101684, + "balance_loss_mlp": 1.08008361, + "epoch": 0.07541362062331666, + "flos": 765362967552.0, + "grad_norm": 0.09729902751759628, + "language_loss": 0.97468722, + "learning_rate": 0.0009946039573957035, + "loss": 0.98570406, + "num_input_tokens_seen": 32398144, + "router_z_loss_mlp": 0.21606445, + "step": 392, + "time_per_iteration": 2.958655595779419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095785, + "balance_loss_mlp": 1.07572174, + "epoch": 0.07560600230858022, + "flos": 588460336128.0, + "grad_norm": 0.06468718689622391, + "language_loss": 0.94257009, + "learning_rate": 0.000994558214643177, + "loss": 0.95352793, + "num_input_tokens_seen": 32471984, + "router_z_loss_mlp": 0.20056152, + "step": 393, + "time_per_iteration": 2.752979040145874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101382, + "balance_loss_mlp": 1.08086586, + "epoch": 0.07579838399384378, + "flos": 749508028416.0, + "grad_norm": 0.06635223139616171, + "language_loss": 0.961483, + "learning_rate": 0.000994512279886123, + "loss": 0.97249681, + "num_input_tokens_seen": 32550176, + "router_z_loss_mlp": 0.20532227, + "step": 394, + "time_per_iteration": 3.055225133895874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104661, + "balance_loss_mlp": 1.08346581, + "epoch": 0.07599076567910736, + "flos": 523185440256.0, + "grad_norm": 0.06901630142642712, + "language_loss": 0.96749192, + "learning_rate": 0.0009944661531423758, + "loss": 0.97853857, + "num_input_tokens_seen": 32620768, + "router_z_loss_mlp": 0.2121582, + "step": 395, + "time_per_iteration": 2.6922085285186768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093271, + "balance_loss_mlp": 1.07248056, + "epoch": 0.07618314736437092, + "flos": 550812662784.0, + "grad_norm": 0.07064334209039194, + "language_loss": 0.95375401, + "learning_rate": 0.000994419834429843, + "loss": 0.96468663, + "num_input_tokens_seen": 32693472, + "router_z_loss_mlp": 0.20788574, + "step": 396, + "time_per_iteration": 2.6657333374023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092352, + "balance_loss_mlp": 1.0716933, + "epoch": 0.07637552904963447, + "flos": 697901253120.0, + "grad_norm": 0.07324881108467876, + "language_loss": 0.99580455, + "learning_rate": 0.0009943733237665069, + "loss": 1.00672793, + "num_input_tokens_seen": 32764976, + "router_z_loss_mlp": 0.20654297, + "step": 397, + "time_per_iteration": 2.8662500381469727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085112, + "balance_loss_mlp": 1.06454849, + "epoch": 0.07656791073489803, + "flos": 579066928128.0, + "grad_norm": 0.04790317238997088, + "language_loss": 0.98118353, + "learning_rate": 0.0009943266211704248, + "loss": 0.99203461, + "num_input_tokens_seen": 32853104, + "router_z_loss_mlp": 0.20568848, + "step": 398, + "time_per_iteration": 2.930741786956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094784, + "balance_loss_mlp": 1.07348132, + "epoch": 0.0767602924201616, + "flos": 416923711488.0, + "grad_norm": 0.09980331544781734, + "language_loss": 1.00422275, + "learning_rate": 0.000994279726659728, + "loss": 1.01517057, + "num_input_tokens_seen": 32919376, + "router_z_loss_mlp": 0.21325684, + "step": 399, + "time_per_iteration": 2.533738851547241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109036, + "balance_loss_mlp": 1.06970143, + "epoch": 0.07695267410542517, + "flos": 482671471104.0, + "grad_norm": 0.06967700921129397, + "language_loss": 0.97985041, + "learning_rate": 0.0009942326402526231, + "loss": 0.99075395, + "num_input_tokens_seen": 32988064, + "router_z_loss_mlp": 0.20666504, + "step": 400, + "time_per_iteration": 2.51460337638855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096542, + "balance_loss_mlp": 1.07526302, + "epoch": 0.07714505579068873, + "flos": 530742647808.0, + "grad_norm": 0.052652305799428985, + "language_loss": 0.96639109, + "learning_rate": 0.0009941853619673902, + "loss": 0.97735649, + "num_input_tokens_seen": 33059024, + "router_z_loss_mlp": 0.2130127, + "step": 401, + "time_per_iteration": 2.620939016342163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101036, + "balance_loss_mlp": 1.08012676, + "epoch": 0.07733743747595229, + "flos": 804635845632.0, + "grad_norm": 0.07273299487754427, + "language_loss": 0.99959278, + "learning_rate": 0.0009941378918223844, + "loss": 1.01060319, + "num_input_tokens_seen": 33137712, + "router_z_loss_mlp": 0.20910645, + "step": 402, + "time_per_iteration": 3.036839008331299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110477, + "balance_loss_mlp": 1.08423018, + "epoch": 0.07752981916121585, + "flos": 622192679424.0, + "grad_norm": 0.05767312217272775, + "language_loss": 0.93044209, + "learning_rate": 0.0009940902298360354, + "loss": 0.94148982, + "num_input_tokens_seen": 33211296, + "router_z_loss_mlp": 0.20544434, + "step": 403, + "time_per_iteration": 2.7703943252563477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097477, + "balance_loss_mlp": 1.07694876, + "epoch": 0.07772220084647942, + "flos": 727961195520.0, + "grad_norm": 0.0686344305115436, + "language_loss": 1.02037048, + "learning_rate": 0.0009940423760268473, + "loss": 1.03134525, + "num_input_tokens_seen": 33283632, + "router_z_loss_mlp": 0.2052002, + "step": 404, + "time_per_iteration": 2.8823602199554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085621, + "balance_loss_mlp": 1.06497431, + "epoch": 0.07791458253174298, + "flos": 555149984256.0, + "grad_norm": 0.10727031409308073, + "language_loss": 0.96142864, + "learning_rate": 0.0009939943304133982, + "loss": 0.97228479, + "num_input_tokens_seen": 33350704, + "router_z_loss_mlp": 0.20654297, + "step": 405, + "time_per_iteration": 2.63908314704895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078944, + "balance_loss_mlp": 1.05944133, + "epoch": 0.07810696421700654, + "flos": 552919495680.0, + "grad_norm": 0.08981509362846728, + "language_loss": 1.0302707, + "learning_rate": 0.0009939460930143416, + "loss": 1.04106021, + "num_input_tokens_seen": 33416272, + "router_z_loss_mlp": 0.19482422, + "step": 406, + "time_per_iteration": 2.63259220123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079269, + "balance_loss_mlp": 1.05927801, + "epoch": 0.0782993459022701, + "flos": 650323289088.0, + "grad_norm": 0.07212254231156982, + "language_loss": 0.96910775, + "learning_rate": 0.0009938976638484043, + "loss": 0.97990054, + "num_input_tokens_seen": 33501824, + "router_z_loss_mlp": 0.1998291, + "step": 407, + "time_per_iteration": 2.9489452838897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073065, + "balance_loss_mlp": 1.05239439, + "epoch": 0.07849172758753367, + "flos": 495926364672.0, + "grad_norm": 0.07302041560946317, + "language_loss": 0.9619081, + "learning_rate": 0.0009938490429343887, + "loss": 0.97263873, + "num_input_tokens_seen": 33571456, + "router_z_loss_mlp": 0.20666504, + "step": 408, + "time_per_iteration": 2.541293144226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078297, + "balance_loss_mlp": 1.05823374, + "epoch": 0.07868410927279723, + "flos": 577696389120.0, + "grad_norm": 0.06961121210328268, + "language_loss": 0.96404505, + "learning_rate": 0.0009938002302911709, + "loss": 0.974828, + "num_input_tokens_seen": 33646320, + "router_z_loss_mlp": 0.20056152, + "step": 409, + "time_per_iteration": 2.7890634536743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078628, + "balance_loss_mlp": 1.05869615, + "epoch": 0.07887649095806079, + "flos": 522698019840.0, + "grad_norm": 0.10283598941623227, + "language_loss": 0.99080813, + "learning_rate": 0.0009937512259377015, + "loss": 1.00159442, + "num_input_tokens_seen": 33717664, + "router_z_loss_mlp": 0.19921875, + "step": 410, + "time_per_iteration": 2.6631360054016113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076374, + "balance_loss_mlp": 1.05739617, + "epoch": 0.07906887264332435, + "flos": 556958481408.0, + "grad_norm": 0.07518465865945036, + "language_loss": 0.97744381, + "learning_rate": 0.000993702029893006, + "loss": 0.98820746, + "num_input_tokens_seen": 33794720, + "router_z_loss_mlp": 0.18981934, + "step": 411, + "time_per_iteration": 2.762937068939209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070708, + "balance_loss_mlp": 1.0512886, + "epoch": 0.07926125432858792, + "flos": 821641715712.0, + "grad_norm": 0.06547583340109177, + "language_loss": 0.97466588, + "learning_rate": 0.0009936526421761838, + "loss": 0.98537302, + "num_input_tokens_seen": 33868304, + "router_z_loss_mlp": 0.1940918, + "step": 412, + "time_per_iteration": 3.019529342651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070741, + "balance_loss_mlp": 1.05210841, + "epoch": 0.07945363601385148, + "flos": 562072794624.0, + "grad_norm": 0.06412617323579047, + "language_loss": 0.9993977, + "learning_rate": 0.000993603062806409, + "loss": 1.01010513, + "num_input_tokens_seen": 33937424, + "router_z_loss_mlp": 0.18615723, + "step": 413, + "time_per_iteration": 2.667893409729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078833, + "balance_loss_mlp": 1.05879402, + "epoch": 0.07964601769911504, + "flos": 517615792128.0, + "grad_norm": 0.0777298152120257, + "language_loss": 1.03187037, + "learning_rate": 0.0009935532918029298, + "loss": 1.04265857, + "num_input_tokens_seen": 34003984, + "router_z_loss_mlp": 0.20031738, + "step": 414, + "time_per_iteration": 2.628847122192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079604, + "balance_loss_mlp": 1.06020916, + "epoch": 0.0798383993843786, + "flos": 538956011520.0, + "grad_norm": 0.0762846382616791, + "language_loss": 0.96381676, + "learning_rate": 0.0009935033291850694, + "loss": 0.97461283, + "num_input_tokens_seen": 34072400, + "router_z_loss_mlp": 0.19384766, + "step": 415, + "time_per_iteration": 2.6874804496765137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078311, + "balance_loss_mlp": 1.05915451, + "epoch": 0.08003078106964218, + "flos": 484901959680.0, + "grad_norm": 0.07548152614126195, + "language_loss": 0.9874112, + "learning_rate": 0.0009934531749722247, + "loss": 0.9981944, + "num_input_tokens_seen": 34142448, + "router_z_loss_mlp": 0.19177246, + "step": 416, + "time_per_iteration": 2.5752930641174316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077721, + "balance_loss_mlp": 1.0581702, + "epoch": 0.08022316275490574, + "flos": 517999905792.0, + "grad_norm": 0.07373378819853486, + "language_loss": 0.97326815, + "learning_rate": 0.0009934028291838672, + "loss": 0.98404539, + "num_input_tokens_seen": 34214080, + "router_z_loss_mlp": 0.1953125, + "step": 417, + "time_per_iteration": 2.715142011642456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078885, + "balance_loss_mlp": 1.0593344, + "epoch": 0.0804155444401693, + "flos": 493755512832.0, + "grad_norm": 0.06878732968267398, + "language_loss": 0.9290086, + "learning_rate": 0.0009933522918395433, + "loss": 0.93979746, + "num_input_tokens_seen": 34288448, + "router_z_loss_mlp": 0.19555664, + "step": 418, + "time_per_iteration": 2.7008063793182373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01673141, + "balance_loss_mlp": 1.6505394, + "epoch": 0.08060792612543285, + "flos": 1580567579136.0, + "grad_norm": 0.10865535097535944, + "language_loss": 0.782511, + "learning_rate": 0.0009933015629588731, + "loss": 0.7992425, + "num_input_tokens_seen": 34521632, + "router_z_loss_mlp": 0.22558594, + "step": 419, + "time_per_iteration": 4.854820728302002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092516, + "balance_loss_mlp": 1.07238102, + "epoch": 0.08080030781069643, + "flos": 525090041856.0, + "grad_norm": 0.07888672823303539, + "language_loss": 1.11010027, + "learning_rate": 0.000993250642561551, + "loss": 1.12102532, + "num_input_tokens_seen": 34590080, + "router_z_loss_mlp": 0.20129395, + "step": 420, + "time_per_iteration": 2.6152822971343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102725, + "balance_loss_mlp": 1.08251905, + "epoch": 0.08099268949595999, + "flos": 546459374592.0, + "grad_norm": 0.06927423279576624, + "language_loss": 0.96781242, + "learning_rate": 0.0009931995306673466, + "loss": 0.97883964, + "num_input_tokens_seen": 34660512, + "router_z_loss_mlp": 0.20202637, + "step": 421, + "time_per_iteration": 2.8378820419311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107938, + "balance_loss_mlp": 1.08725524, + "epoch": 0.08118507118122355, + "flos": 510116811264.0, + "grad_norm": 0.07245841989657228, + "language_loss": 1.01691484, + "learning_rate": 0.000993148227296103, + "loss": 1.02799416, + "num_input_tokens_seen": 34732016, + "router_z_loss_mlp": 0.20678711, + "step": 422, + "time_per_iteration": 2.6234657764434814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109153, + "balance_loss_mlp": 1.08827925, + "epoch": 0.08137745286648711, + "flos": 720339969024.0, + "grad_norm": 0.06440268991377437, + "language_loss": 0.90059143, + "learning_rate": 0.000993096732467738, + "loss": 0.91168296, + "num_input_tokens_seen": 34810416, + "router_z_loss_mlp": 0.2088623, + "step": 423, + "time_per_iteration": 2.9789979457855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107042, + "balance_loss_mlp": 1.08620405, + "epoch": 0.08156983455175067, + "flos": 679313848320.0, + "grad_norm": 0.09430690436493987, + "language_loss": 0.97591221, + "learning_rate": 0.0009930450462022435, + "loss": 0.9869827, + "num_input_tokens_seen": 34879504, + "router_z_loss_mlp": 0.20837402, + "step": 424, + "time_per_iteration": 2.7870407104492188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01731933, + "balance_loss_mlp": 1.70847309, + "epoch": 0.08176221623701424, + "flos": 1452577135104.0, + "grad_norm": 0.13164555017172178, + "language_loss": 0.79189807, + "learning_rate": 0.0009929931685196862, + "loss": 0.80921739, + "num_input_tokens_seen": 35111584, + "router_z_loss_mlp": 0.234375, + "step": 425, + "time_per_iteration": 4.870323181152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095108, + "balance_loss_mlp": 1.07456827, + "epoch": 0.0819545979222778, + "flos": 1556034071040.0, + "grad_norm": 0.10298759083167684, + "language_loss": 0.95328236, + "learning_rate": 0.0009929410994402065, + "loss": 0.9642334, + "num_input_tokens_seen": 35205664, + "router_z_loss_mlp": 0.20544434, + "step": 426, + "time_per_iteration": 3.7942585945129395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093366, + "balance_loss_mlp": 1.07214665, + "epoch": 0.08214697960754136, + "flos": 512456398848.0, + "grad_norm": 0.069672302328133, + "language_loss": 0.99507213, + "learning_rate": 0.0009928888389840196, + "loss": 1.00600576, + "num_input_tokens_seen": 35280144, + "router_z_loss_mlp": 0.21240234, + "step": 427, + "time_per_iteration": 2.684760093688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073876, + "balance_loss_mlp": 1.05376494, + "epoch": 0.08233936129280492, + "flos": 594850646016.0, + "grad_norm": 0.07796900075206671, + "language_loss": 1.01749206, + "learning_rate": 0.0009928363871714147, + "loss": 1.02823079, + "num_input_tokens_seen": 35344768, + "router_z_loss_mlp": 0.20092773, + "step": 428, + "time_per_iteration": 2.6608195304870605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078126, + "balance_loss_mlp": 1.05796742, + "epoch": 0.08253174297806849, + "flos": 571758594048.0, + "grad_norm": 0.07341701057973313, + "language_loss": 0.95524251, + "learning_rate": 0.0009927837440227556, + "loss": 0.96602374, + "num_input_tokens_seen": 35425536, + "router_z_loss_mlp": 0.20153809, + "step": 429, + "time_per_iteration": 2.824958324432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083273, + "balance_loss_mlp": 1.06413972, + "epoch": 0.08272412466333205, + "flos": 623065623552.0, + "grad_norm": 0.06194570532237157, + "language_loss": 0.90308964, + "learning_rate": 0.0009927309095584798, + "loss": 0.91392243, + "num_input_tokens_seen": 35515440, + "router_z_loss_mlp": 0.19128418, + "step": 430, + "time_per_iteration": 2.9565205574035645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105878, + "balance_loss_mlp": 1.08643484, + "epoch": 0.08291650634859561, + "flos": 513745542144.0, + "grad_norm": 0.09375416706629437, + "language_loss": 1.0225904, + "learning_rate": 0.0009926778837991, + "loss": 1.03364921, + "num_input_tokens_seen": 35580192, + "router_z_loss_mlp": 0.19433594, + "step": 431, + "time_per_iteration": 2.5606777667999268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104802, + "balance_loss_mlp": 1.08521628, + "epoch": 0.08310888803385917, + "flos": 667073083392.0, + "grad_norm": 0.09022222071598751, + "language_loss": 1.00445497, + "learning_rate": 0.000992624666765202, + "loss": 1.01550293, + "num_input_tokens_seen": 35649472, + "router_z_loss_mlp": 0.19580078, + "step": 432, + "time_per_iteration": 2.763514995574951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112312, + "balance_loss_mlp": 1.09166527, + "epoch": 0.08330126971912274, + "flos": 582995404800.0, + "grad_norm": 0.07142121215748316, + "language_loss": 0.98131895, + "learning_rate": 0.000992571258477447, + "loss": 0.99244213, + "num_input_tokens_seen": 35722848, + "router_z_loss_mlp": 0.20654297, + "step": 433, + "time_per_iteration": 2.7823588848114014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086622, + "balance_loss_mlp": 1.06731021, + "epoch": 0.0834936514043863, + "flos": 561064458240.0, + "grad_norm": 0.06618743000622296, + "language_loss": 0.92206728, + "learning_rate": 0.0009925176589565695, + "loss": 0.93293345, + "num_input_tokens_seen": 35800944, + "router_z_loss_mlp": 0.1932373, + "step": 434, + "time_per_iteration": 2.7774362564086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109069, + "balance_loss_mlp": 1.07043648, + "epoch": 0.08368603308964986, + "flos": 494272046592.0, + "grad_norm": 0.07800081613857189, + "language_loss": 1.01949787, + "learning_rate": 0.0009924638682233791, + "loss": 1.03040481, + "num_input_tokens_seen": 35866288, + "router_z_loss_mlp": 0.20251465, + "step": 435, + "time_per_iteration": 2.574716091156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236801, + "balance_loss_mlp": 1.21505737, + "epoch": 0.08387841477491342, + "flos": 1388322312192.0, + "grad_norm": 0.08820287098199171, + "language_loss": 0.79564589, + "learning_rate": 0.0009924098862987589, + "loss": 0.80801398, + "num_input_tokens_seen": 36083040, + "router_z_loss_mlp": 0.21777344, + "step": 436, + "time_per_iteration": 4.521069049835205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087939, + "balance_loss_mlp": 1.06750691, + "epoch": 0.084070796460177, + "flos": 798642796032.0, + "grad_norm": 0.09737991847895365, + "language_loss": 0.92070073, + "learning_rate": 0.0009923557132036668, + "loss": 0.93158013, + "num_input_tokens_seen": 36158816, + "router_z_loss_mlp": 0.2043457, + "step": 437, + "time_per_iteration": 3.0401971340179443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106896, + "balance_loss_mlp": 1.08635592, + "epoch": 0.08426317814544056, + "flos": 558681200640.0, + "grad_norm": 0.07082709395687636, + "language_loss": 0.96077365, + "learning_rate": 0.0009923013489591345, + "loss": 0.97184265, + "num_input_tokens_seen": 36236432, + "router_z_loss_mlp": 0.20532227, + "step": 438, + "time_per_iteration": 2.7388038635253906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138911, + "balance_loss_mlp": 1.11965871, + "epoch": 0.08445555983070412, + "flos": 810057106944.0, + "grad_norm": 0.09946092642967543, + "language_loss": 0.94659293, + "learning_rate": 0.0009922467935862681, + "loss": 0.95798206, + "num_input_tokens_seen": 36327952, + "router_z_loss_mlp": 0.19250488, + "step": 439, + "time_per_iteration": 3.0827929973602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153103, + "balance_loss_mlp": 1.13278937, + "epoch": 0.08464794151596768, + "flos": 509939311104.0, + "grad_norm": 0.08658230076015333, + "language_loss": 0.97196984, + "learning_rate": 0.0009921920471062478, + "loss": 0.9835009, + "num_input_tokens_seen": 36394896, + "router_z_loss_mlp": 0.203125, + "step": 440, + "time_per_iteration": 2.5667247772216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109293, + "balance_loss_mlp": 1.08952785, + "epoch": 0.08484032320123125, + "flos": 556149556224.0, + "grad_norm": 0.0779492699350581, + "language_loss": 0.95526892, + "learning_rate": 0.0009921371095403281, + "loss": 0.96636182, + "num_input_tokens_seen": 36464656, + "router_z_loss_mlp": 0.19763184, + "step": 441, + "time_per_iteration": 2.6504476070404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081558, + "balance_loss_mlp": 1.06137586, + "epoch": 0.08503270488649481, + "flos": 527103742464.0, + "grad_norm": 0.0823758421396894, + "language_loss": 0.98291612, + "learning_rate": 0.0009920819809098379, + "loss": 0.99373174, + "num_input_tokens_seen": 36532208, + "router_z_loss_mlp": 0.20166016, + "step": 442, + "time_per_iteration": 2.5884947776794434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076633, + "balance_loss_mlp": 1.05612862, + "epoch": 0.08522508657175837, + "flos": 613989490176.0, + "grad_norm": 0.07828377396362728, + "language_loss": 0.94043314, + "learning_rate": 0.0009920266612361798, + "loss": 0.95119947, + "num_input_tokens_seen": 36607360, + "router_z_loss_mlp": 0.20507812, + "step": 443, + "time_per_iteration": 2.7464845180511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077144, + "balance_loss_mlp": 1.05650926, + "epoch": 0.08541746825702193, + "flos": 619495119360.0, + "grad_norm": 0.07442656272719532, + "language_loss": 0.94335687, + "learning_rate": 0.0009919711505408308, + "loss": 0.95412827, + "num_input_tokens_seen": 36680688, + "router_z_loss_mlp": 0.2064209, + "step": 444, + "time_per_iteration": 2.7615623474121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092391, + "balance_loss_mlp": 1.07126665, + "epoch": 0.08560984994228549, + "flos": 482671471104.0, + "grad_norm": 0.08601843511227286, + "language_loss": 0.92049706, + "learning_rate": 0.000991915448845342, + "loss": 0.93142092, + "num_input_tokens_seen": 36746288, + "router_z_loss_mlp": 0.21130371, + "step": 445, + "time_per_iteration": 2.519644260406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103035, + "balance_loss_mlp": 1.08145857, + "epoch": 0.08580223162754906, + "flos": 516897027072.0, + "grad_norm": 0.07781715705073443, + "language_loss": 1.01207459, + "learning_rate": 0.000991859556171339, + "loss": 1.02310491, + "num_input_tokens_seen": 36812528, + "router_z_loss_mlp": 0.21569824, + "step": 446, + "time_per_iteration": 2.5678694248199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116922, + "balance_loss_mlp": 1.09462976, + "epoch": 0.08599461331281262, + "flos": 531215511552.0, + "grad_norm": 0.11213971543052093, + "language_loss": 1.02931881, + "learning_rate": 0.000991803472540521, + "loss": 1.040488, + "num_input_tokens_seen": 36879248, + "router_z_loss_mlp": 0.22302246, + "step": 447, + "time_per_iteration": 2.6309196949005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124555, + "balance_loss_mlp": 1.10302639, + "epoch": 0.08618699499807618, + "flos": 789966743040.0, + "grad_norm": 0.07287006723198586, + "language_loss": 0.97443926, + "learning_rate": 0.0009917471979746615, + "loss": 0.98568487, + "num_input_tokens_seen": 36951376, + "router_z_loss_mlp": 0.21533203, + "step": 448, + "time_per_iteration": 2.9742491245269775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134564, + "balance_loss_mlp": 1.11266506, + "epoch": 0.08637937668333974, + "flos": 565707317760.0, + "grad_norm": 0.08202115093309782, + "language_loss": 0.97199845, + "learning_rate": 0.0009916907324956086, + "loss": 0.98334408, + "num_input_tokens_seen": 37025936, + "router_z_loss_mlp": 0.21923828, + "step": 449, + "time_per_iteration": 2.704089641571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151497, + "balance_loss_mlp": 1.12693954, + "epoch": 0.08657175836860331, + "flos": 444930665472.0, + "grad_norm": 0.09325215593581063, + "language_loss": 0.93441564, + "learning_rate": 0.0009916340761252837, + "loss": 0.9459306, + "num_input_tokens_seen": 37095872, + "router_z_loss_mlp": 0.24536133, + "step": 450, + "time_per_iteration": 2.5866575241088867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158359, + "balance_loss_mlp": 1.13567328, + "epoch": 0.08676414005386687, + "flos": 843789450240.0, + "grad_norm": 0.23711660967347972, + "language_loss": 0.90976942, + "learning_rate": 0.0009915772288856832, + "loss": 0.92135304, + "num_input_tokens_seen": 37179072, + "router_z_loss_mlp": 0.22668457, + "step": 451, + "time_per_iteration": 3.109010696411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118071, + "balance_loss_mlp": 1.15827537, + "epoch": 0.08695652173913043, + "flos": 602995608576.0, + "grad_norm": 0.08699490701012727, + "language_loss": 0.92036849, + "learning_rate": 0.000991520190798877, + "loss": 0.93217564, + "num_input_tokens_seen": 37260288, + "router_z_loss_mlp": 0.22424316, + "step": 452, + "time_per_iteration": 2.8523812294006348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181191, + "balance_loss_mlp": 1.15807629, + "epoch": 0.08714890342439399, + "flos": 730423028736.0, + "grad_norm": 0.09293440668835976, + "language_loss": 1.01637089, + "learning_rate": 0.0009914629618870089, + "loss": 1.02818286, + "num_input_tokens_seen": 37331136, + "router_z_loss_mlp": 0.23095703, + "step": 453, + "time_per_iteration": 2.882887125015259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142362, + "balance_loss_mlp": 1.12891519, + "epoch": 0.08734128510965757, + "flos": 1481518232064.0, + "grad_norm": 0.0645312523276542, + "language_loss": 0.78675872, + "learning_rate": 0.0009914055421722976, + "loss": 0.79818237, + "num_input_tokens_seen": 37559040, + "router_z_loss_mlp": 0.13476562, + "step": 454, + "time_per_iteration": 4.717878103256226 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103219, + "balance_loss_mlp": 1.09034455, + "epoch": 0.08753366679492113, + "flos": 1522214083584.0, + "grad_norm": 0.04274098512475534, + "language_loss": 0.81427962, + "learning_rate": 0.0009913479316770353, + "loss": 0.82531178, + "num_input_tokens_seen": 37785136, + "router_z_loss_mlp": 0.12890625, + "step": 455, + "time_per_iteration": 4.838243246078491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171645, + "balance_loss_mlp": 1.14951944, + "epoch": 0.08772604848018468, + "flos": 720935078400.0, + "grad_norm": 0.10543082910841049, + "language_loss": 0.94423014, + "learning_rate": 0.0009912901304235883, + "loss": 0.95594656, + "num_input_tokens_seen": 37858832, + "router_z_loss_mlp": 0.22131348, + "step": 456, + "time_per_iteration": 2.9432015419006348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150762, + "balance_loss_mlp": 1.12861252, + "epoch": 0.08791843016544824, + "flos": 707926086144.0, + "grad_norm": 0.10980567381029156, + "language_loss": 0.91300154, + "learning_rate": 0.000991232138434397, + "loss": 0.92450917, + "num_input_tokens_seen": 37931856, + "router_z_loss_mlp": 0.22143555, + "step": 457, + "time_per_iteration": 2.832761526107788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113929, + "balance_loss_mlp": 1.09195828, + "epoch": 0.08811081185071182, + "flos": 472799407104.0, + "grad_norm": 0.1324680836731367, + "language_loss": 0.97845554, + "learning_rate": 0.000991173955731976, + "loss": 0.98959482, + "num_input_tokens_seen": 38002432, + "router_z_loss_mlp": 0.21960449, + "step": 458, + "time_per_iteration": 2.660696506500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100778, + "balance_loss_mlp": 1.07958269, + "epoch": 0.08830319353597538, + "flos": 684647769600.0, + "grad_norm": 0.07138233575581546, + "language_loss": 1.0178268, + "learning_rate": 0.0009911155823389137, + "loss": 1.02883458, + "num_input_tokens_seen": 38081648, + "router_z_loss_mlp": 0.21203613, + "step": 459, + "time_per_iteration": 2.9878122806549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105128, + "balance_loss_mlp": 1.08344412, + "epoch": 0.08849557522123894, + "flos": 573235411968.0, + "grad_norm": 0.0735053314112025, + "language_loss": 0.9764787, + "learning_rate": 0.000991057018277873, + "loss": 0.98752999, + "num_input_tokens_seen": 38153424, + "router_z_loss_mlp": 0.21679688, + "step": 460, + "time_per_iteration": 2.707247018814087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116963, + "balance_loss_mlp": 1.09422946, + "epoch": 0.0886879569065025, + "flos": 564303283200.0, + "grad_norm": 0.10552034142073316, + "language_loss": 0.9759655, + "learning_rate": 0.0009909982635715898, + "loss": 0.98713505, + "num_input_tokens_seen": 38223008, + "router_z_loss_mlp": 0.22729492, + "step": 461, + "time_per_iteration": 2.609016180038452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120097, + "balance_loss_mlp": 1.09760189, + "epoch": 0.08888033859176607, + "flos": 563609249280.0, + "grad_norm": 0.09185893532484944, + "language_loss": 0.96625364, + "learning_rate": 0.0009909393182428751, + "loss": 0.97745454, + "num_input_tokens_seen": 38294592, + "router_z_loss_mlp": 0.22497559, + "step": 462, + "time_per_iteration": 2.682616949081421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116703, + "balance_loss_mlp": 1.09437466, + "epoch": 0.08907272027702963, + "flos": 465517214208.0, + "grad_norm": 0.08888403374641002, + "language_loss": 0.91300213, + "learning_rate": 0.000990880182314614, + "loss": 0.92416912, + "num_input_tokens_seen": 38365792, + "router_z_loss_mlp": 0.22314453, + "step": 463, + "time_per_iteration": 2.732579469680786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122985, + "balance_loss_mlp": 1.10014486, + "epoch": 0.08926510196229319, + "flos": 681200921088.0, + "grad_norm": 0.07408309604525525, + "language_loss": 0.92294347, + "learning_rate": 0.0009908208558097643, + "loss": 0.93417335, + "num_input_tokens_seen": 38447776, + "router_z_loss_mlp": 0.22839355, + "step": 464, + "time_per_iteration": 2.910313606262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137827, + "balance_loss_mlp": 1.115273, + "epoch": 0.08945748364755675, + "flos": 596411831808.0, + "grad_norm": 0.08673846989427919, + "language_loss": 0.93827909, + "learning_rate": 0.000990761338751359, + "loss": 0.94965738, + "num_input_tokens_seen": 38521632, + "router_z_loss_mlp": 0.22546387, + "step": 465, + "time_per_iteration": 2.827570676803589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133815, + "balance_loss_mlp": 1.12222791, + "epoch": 0.08964986533282032, + "flos": 1585082400768.0, + "grad_norm": 0.06082202694548154, + "language_loss": 0.73659623, + "learning_rate": 0.0009907016311625045, + "loss": 0.74793446, + "num_input_tokens_seen": 38760528, + "router_z_loss_mlp": 0.11572266, + "step": 466, + "time_per_iteration": 4.960917234420776 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177765, + "balance_loss_mlp": 1.15419745, + "epoch": 0.08984224701808388, + "flos": 533268499968.0, + "grad_norm": 0.4900596090566038, + "language_loss": 0.96587038, + "learning_rate": 0.0009906417330663815, + "loss": 0.97764802, + "num_input_tokens_seen": 38827200, + "router_z_loss_mlp": 0.23571777, + "step": 467, + "time_per_iteration": 2.5937299728393555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157084, + "balance_loss_mlp": 1.13383865, + "epoch": 0.09003462870334744, + "flos": 478702296576.0, + "grad_norm": 0.08613132202477504, + "language_loss": 0.92798859, + "learning_rate": 0.0009905816444862442, + "loss": 0.93955946, + "num_input_tokens_seen": 38891984, + "router_z_loss_mlp": 0.23217773, + "step": 468, + "time_per_iteration": 2.6012237071990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164868, + "balance_loss_mlp": 1.14150274, + "epoch": 0.090227010388611, + "flos": 653307448320.0, + "grad_norm": 0.08218040805372613, + "language_loss": 0.90769458, + "learning_rate": 0.0009905213654454216, + "loss": 0.91934329, + "num_input_tokens_seen": 38977136, + "router_z_loss_mlp": 0.23364258, + "step": 469, + "time_per_iteration": 2.8727760314941406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176439, + "balance_loss_mlp": 1.15152478, + "epoch": 0.09041939207387456, + "flos": 617894645760.0, + "grad_norm": 0.09256259391525869, + "language_loss": 0.97864139, + "learning_rate": 0.0009904608959673158, + "loss": 0.9904058, + "num_input_tokens_seen": 39052224, + "router_z_loss_mlp": 0.24938965, + "step": 470, + "time_per_iteration": 2.7991952896118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151805, + "balance_loss_mlp": 1.12671185, + "epoch": 0.09061177375913813, + "flos": 454137808896.0, + "grad_norm": 0.09693984756275055, + "language_loss": 0.97988749, + "learning_rate": 0.000990400236075403, + "loss": 0.99140555, + "num_input_tokens_seen": 39116832, + "router_z_loss_mlp": 0.25109863, + "step": 471, + "time_per_iteration": 2.523508310317993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125394, + "balance_loss_mlp": 1.10119498, + "epoch": 0.0908041554444017, + "flos": 543982984704.0, + "grad_norm": 0.09250187628709369, + "language_loss": 0.9490509, + "learning_rate": 0.0009903393857932338, + "loss": 0.96030486, + "num_input_tokens_seen": 39190528, + "router_z_loss_mlp": 0.24194336, + "step": 472, + "time_per_iteration": 2.7065584659576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124084, + "balance_loss_mlp": 1.09912193, + "epoch": 0.09099653712966525, + "flos": 564052999680.0, + "grad_norm": 0.10897832311722938, + "language_loss": 0.93660218, + "learning_rate": 0.0009902783451444317, + "loss": 0.94784307, + "num_input_tokens_seen": 39263168, + "router_z_loss_mlp": 0.24963379, + "step": 473, + "time_per_iteration": 2.7067277431488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108649, + "balance_loss_mlp": 1.08496177, + "epoch": 0.09118891881492881, + "flos": 474300956160.0, + "grad_norm": 0.09402902414949979, + "language_loss": 0.97273493, + "learning_rate": 0.0009902171141526956, + "loss": 0.98382139, + "num_input_tokens_seen": 39330784, + "router_z_loss_mlp": 0.23693848, + "step": 474, + "time_per_iteration": 2.5281569957733154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087186, + "balance_loss_mlp": 1.06240201, + "epoch": 0.09138130050019239, + "flos": 545579076096.0, + "grad_norm": 0.06728788346792411, + "language_loss": 0.85273343, + "learning_rate": 0.000990155692841797, + "loss": 0.86360526, + "num_input_tokens_seen": 39417472, + "router_z_loss_mlp": 0.2479248, + "step": 475, + "time_per_iteration": 2.970107316970825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010835, + "balance_loss_mlp": 1.0587163, + "epoch": 0.09157368218545595, + "flos": 732397441536.0, + "grad_norm": 0.07226189405033341, + "language_loss": 0.97062063, + "learning_rate": 0.0009900940812355818, + "loss": 0.98145562, + "num_input_tokens_seen": 39488656, + "router_z_loss_mlp": 0.24768066, + "step": 476, + "time_per_iteration": 2.959184169769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096233, + "balance_loss_mlp": 1.07208097, + "epoch": 0.0917660638707195, + "flos": 610709967360.0, + "grad_norm": 0.09034653129128065, + "language_loss": 0.92824447, + "learning_rate": 0.00099003227935797, + "loss": 0.93920678, + "num_input_tokens_seen": 39558224, + "router_z_loss_mlp": 0.24157715, + "step": 477, + "time_per_iteration": 2.7553765773773193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113263, + "balance_loss_mlp": 1.08839583, + "epoch": 0.09195844555598306, + "flos": 655561257984.0, + "grad_norm": 0.09830094540804109, + "language_loss": 0.95358098, + "learning_rate": 0.000989970287232955, + "loss": 0.96471357, + "num_input_tokens_seen": 39629856, + "router_z_loss_mlp": 0.2487793, + "step": 478, + "time_per_iteration": 2.7916457653045654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112064, + "balance_loss_mlp": 1.09633327, + "epoch": 0.09215082724124664, + "flos": 476339387904.0, + "grad_norm": 0.08054303064285366, + "language_loss": 0.93560576, + "learning_rate": 0.0009899081048846043, + "loss": 0.94681215, + "num_input_tokens_seen": 39695984, + "router_z_loss_mlp": 0.24267578, + "step": 479, + "time_per_iteration": 2.554161787033081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114732, + "balance_loss_mlp": 1.12177348, + "epoch": 0.0923432089265102, + "flos": 524051182080.0, + "grad_norm": 0.1186512856896222, + "language_loss": 0.97593725, + "learning_rate": 0.0009898457323370593, + "loss": 0.98741049, + "num_input_tokens_seen": 39760256, + "router_z_loss_mlp": 0.25549316, + "step": 480, + "time_per_iteration": 2.5794191360473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131558, + "balance_loss_mlp": 1.10608315, + "epoch": 0.09253559061177376, + "flos": 545302651392.0, + "grad_norm": 0.10688941209840569, + "language_loss": 0.96892118, + "learning_rate": 0.000989783169614535, + "loss": 0.98023689, + "num_input_tokens_seen": 39827984, + "router_z_loss_mlp": 0.25512695, + "step": 481, + "time_per_iteration": 2.6676101684570312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01336494, + "balance_loss_mlp": 1.32304764, + "epoch": 0.09272797229703732, + "flos": 1537222219776.0, + "grad_norm": 0.112558059824644, + "language_loss": 0.78752756, + "learning_rate": 0.0009897204167413206, + "loss": 0.80089253, + "num_input_tokens_seen": 40056688, + "router_z_loss_mlp": 0.13476562, + "step": 482, + "time_per_iteration": 4.910710096359253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121205, + "balance_loss_mlp": 1.09537172, + "epoch": 0.09292035398230089, + "flos": 689501624832.0, + "grad_norm": 0.08905484371867754, + "language_loss": 0.93989253, + "learning_rate": 0.000989657473741779, + "loss": 0.95110452, + "num_input_tokens_seen": 40133120, + "router_z_loss_mlp": 0.25866699, + "step": 483, + "time_per_iteration": 2.8736467361450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120092, + "balance_loss_mlp": 1.09219658, + "epoch": 0.09311273566756445, + "flos": 509482414080.0, + "grad_norm": 0.10011855628381364, + "language_loss": 0.94861096, + "learning_rate": 0.0009895943406403465, + "loss": 0.95981193, + "num_input_tokens_seen": 40206464, + "router_z_loss_mlp": 0.27905273, + "step": 484, + "time_per_iteration": 2.7233312129974365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114409, + "balance_loss_mlp": 1.08641887, + "epoch": 0.09330511735282801, + "flos": 659111413248.0, + "grad_norm": 0.10884122740481975, + "language_loss": 0.87602448, + "learning_rate": 0.0009895310174615338, + "loss": 0.88716859, + "num_input_tokens_seen": 40277744, + "router_z_loss_mlp": 0.2800293, + "step": 485, + "time_per_iteration": 2.7538061141967773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098211, + "balance_loss_mlp": 1.08533621, + "epoch": 0.09349749903809157, + "flos": 1452054809088.0, + "grad_norm": 0.04867374252302138, + "language_loss": 0.75718516, + "learning_rate": 0.0009894675042299251, + "loss": 0.76816726, + "num_input_tokens_seen": 40503664, + "router_z_loss_mlp": 0.12890625, + "step": 486, + "time_per_iteration": 4.681119441986084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121456, + "balance_loss_mlp": 1.09291732, + "epoch": 0.09368988072335514, + "flos": 520614508032.0, + "grad_norm": 0.07858969791005947, + "language_loss": 0.92458618, + "learning_rate": 0.0009894038009701782, + "loss": 0.93580067, + "num_input_tokens_seen": 40571376, + "router_z_loss_mlp": 0.28515625, + "step": 487, + "time_per_iteration": 2.6114649772644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153128, + "balance_loss_mlp": 1.12148952, + "epoch": 0.0938822624086187, + "flos": 497502107136.0, + "grad_norm": 0.11959755259003642, + "language_loss": 0.91595036, + "learning_rate": 0.0009893399077070253, + "loss": 0.92748165, + "num_input_tokens_seen": 40638096, + "router_z_loss_mlp": 0.31616211, + "step": 488, + "time_per_iteration": 2.5603692531585693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127952, + "balance_loss_mlp": 1.09845996, + "epoch": 0.09407464409388226, + "flos": 532948405248.0, + "grad_norm": 0.09098963794592498, + "language_loss": 0.89760649, + "learning_rate": 0.0009892758244652718, + "loss": 0.90888608, + "num_input_tokens_seen": 40710992, + "router_z_loss_mlp": 0.29516602, + "step": 489, + "time_per_iteration": 2.65938401222229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127724, + "balance_loss_mlp": 1.09568012, + "epoch": 0.09426702577914582, + "flos": 585736634880.0, + "grad_norm": 0.09102778373185845, + "language_loss": 0.94519842, + "learning_rate": 0.0009892115512697968, + "loss": 0.95647562, + "num_input_tokens_seen": 40778896, + "router_z_loss_mlp": 0.3203125, + "step": 490, + "time_per_iteration": 2.6538186073303223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120065, + "balance_loss_mlp": 1.08926105, + "epoch": 0.0944594074644094, + "flos": 503081929728.0, + "grad_norm": 0.07724049493821064, + "language_loss": 0.96624851, + "learning_rate": 0.0009891470881455537, + "loss": 0.97744912, + "num_input_tokens_seen": 40853376, + "router_z_loss_mlp": 0.30810547, + "step": 491, + "time_per_iteration": 2.699535608291626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122711, + "balance_loss_mlp": 1.09145451, + "epoch": 0.09465178914967295, + "flos": 570748847616.0, + "grad_norm": 0.0816499633869022, + "language_loss": 0.94510269, + "learning_rate": 0.0009890824351175692, + "loss": 0.95632982, + "num_input_tokens_seen": 40923776, + "router_z_loss_mlp": 0.31225586, + "step": 492, + "time_per_iteration": 2.678191661834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125893, + "balance_loss_mlp": 1.09418344, + "epoch": 0.09484417083493651, + "flos": 549098707968.0, + "grad_norm": 0.07977284094064935, + "language_loss": 0.98609412, + "learning_rate": 0.0009890175922109435, + "loss": 0.99735302, + "num_input_tokens_seen": 40996848, + "router_z_loss_mlp": 0.31689453, + "step": 493, + "time_per_iteration": 2.6466987133026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138627, + "balance_loss_mlp": 1.10534418, + "epoch": 0.09503655252020007, + "flos": 823552109568.0, + "grad_norm": 0.09331424233507904, + "language_loss": 0.96939492, + "learning_rate": 0.0009889525594508513, + "loss": 0.9807812, + "num_input_tokens_seen": 41071280, + "router_z_loss_mlp": 0.33300781, + "step": 494, + "time_per_iteration": 3.009894371032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153225, + "balance_loss_mlp": 1.12218332, + "epoch": 0.09522893420546363, + "flos": 404397757440.0, + "grad_norm": 0.08141129996203125, + "language_loss": 0.91043431, + "learning_rate": 0.0009888873368625404, + "loss": 0.92196655, + "num_input_tokens_seen": 41136304, + "router_z_loss_mlp": 0.31030273, + "step": 495, + "time_per_iteration": 2.4904890060424805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171726, + "balance_loss_mlp": 1.14025438, + "epoch": 0.0954213158907272, + "flos": 690707810304.0, + "grad_norm": 0.08256479818708104, + "language_loss": 0.94339681, + "learning_rate": 0.0009888219244713326, + "loss": 0.95511413, + "num_input_tokens_seen": 41212384, + "router_z_loss_mlp": 0.31445312, + "step": 496, + "time_per_iteration": 2.8060483932495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181664, + "balance_loss_mlp": 1.15033531, + "epoch": 0.09561369757599077, + "flos": 518739019776.0, + "grad_norm": 0.10472312979641793, + "language_loss": 0.94370055, + "learning_rate": 0.0009887563223026229, + "loss": 0.95551717, + "num_input_tokens_seen": 41282528, + "router_z_loss_mlp": 0.31323242, + "step": 497, + "time_per_iteration": 2.6536803245544434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228939, + "balance_loss_mlp": 1.21549225, + "epoch": 0.09580607926125433, + "flos": 1384825849344.0, + "grad_norm": 0.04877985805939708, + "language_loss": 0.7906816, + "learning_rate": 0.0009886905303818805, + "loss": 0.80297101, + "num_input_tokens_seen": 41512256, + "router_z_loss_mlp": 0.13476562, + "step": 498, + "time_per_iteration": 4.874605178833008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197245, + "balance_loss_mlp": 1.16455829, + "epoch": 0.09599846094651789, + "flos": 717090969600.0, + "grad_norm": 0.08863465655244346, + "language_loss": 0.93284124, + "learning_rate": 0.0009886245487346482, + "loss": 0.94481373, + "num_input_tokens_seen": 41596816, + "router_z_loss_mlp": 0.3269043, + "step": 499, + "time_per_iteration": 3.047938108444214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011865, + "balance_loss_mlp": 1.15474319, + "epoch": 0.09619084263178146, + "flos": 385824909312.0, + "grad_norm": 0.09673466805801513, + "language_loss": 0.96238041, + "learning_rate": 0.0009885583773865422, + "loss": 0.97424543, + "num_input_tokens_seen": 41658544, + "router_z_loss_mlp": 0.31762695, + "step": 500, + "time_per_iteration": 2.402763843536377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140705, + "balance_loss_mlp": 1.1099968, + "epoch": 0.09638322431704502, + "flos": 533869401600.0, + "grad_norm": 0.08556524095898377, + "language_loss": 0.93457472, + "learning_rate": 0.0009884920163632524, + "loss": 0.94598186, + "num_input_tokens_seen": 41730736, + "router_z_loss_mlp": 0.30688477, + "step": 501, + "time_per_iteration": 2.7420296669006348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155853, + "balance_loss_mlp": 1.12373805, + "epoch": 0.09657560600230858, + "flos": 500426629632.0, + "grad_norm": 0.08462195742795481, + "language_loss": 0.95688182, + "learning_rate": 0.000988425465690543, + "loss": 0.96844035, + "num_input_tokens_seen": 41797824, + "router_z_loss_mlp": 0.32104492, + "step": 502, + "time_per_iteration": 2.5425736904144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163304, + "balance_loss_mlp": 1.13099861, + "epoch": 0.09676798768757214, + "flos": 528995197440.0, + "grad_norm": 0.07192036847451248, + "language_loss": 0.92721838, + "learning_rate": 0.0009883587253942505, + "loss": 0.93885148, + "num_input_tokens_seen": 41875520, + "router_z_loss_mlp": 0.32324219, + "step": 503, + "time_per_iteration": 2.8340742588043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188959, + "balance_loss_mlp": 1.15598607, + "epoch": 0.09696036937283571, + "flos": 463379857920.0, + "grad_norm": 0.0888689340699796, + "language_loss": 0.99166393, + "learning_rate": 0.0009882917955002862, + "loss": 1.00355351, + "num_input_tokens_seen": 41942224, + "router_z_loss_mlp": 0.32983398, + "step": 504, + "time_per_iteration": 2.560448169708252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147535, + "balance_loss_mlp": 1.11606395, + "epoch": 0.09715275105809927, + "flos": 534716204544.0, + "grad_norm": 0.07251663236407552, + "language_loss": 0.9150176, + "learning_rate": 0.0009882246760346343, + "loss": 0.92649293, + "num_input_tokens_seen": 42007552, + "router_z_loss_mlp": 0.31420898, + "step": 505, + "time_per_iteration": 2.6460299491882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114081, + "balance_loss_mlp": 1.10714495, + "epoch": 0.09734513274336283, + "flos": 454713979392.0, + "grad_norm": 0.10061537251918176, + "language_loss": 0.96100289, + "learning_rate": 0.0009881573670233533, + "loss": 0.97241098, + "num_input_tokens_seen": 42071760, + "router_z_loss_mlp": 0.33666992, + "step": 506, + "time_per_iteration": 2.5137040615081787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109977, + "balance_loss_mlp": 1.08029366, + "epoch": 0.09753751442862639, + "flos": 508551243264.0, + "grad_norm": 0.0762964042901656, + "language_loss": 0.91185808, + "learning_rate": 0.0009880898684925747, + "loss": 0.92295784, + "num_input_tokens_seen": 42140688, + "router_z_loss_mlp": 0.29663086, + "step": 507, + "time_per_iteration": 2.6571738719940186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110119, + "balance_loss_mlp": 1.07133985, + "epoch": 0.09772989611388996, + "flos": 484030425600.0, + "grad_norm": 0.07531505250568626, + "language_loss": 0.89554358, + "learning_rate": 0.0009880221804685037, + "loss": 0.90655547, + "num_input_tokens_seen": 42208544, + "router_z_loss_mlp": 0.29882812, + "step": 508, + "time_per_iteration": 2.596289873123169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01404721, + "balance_loss_mlp": 1.39136958, + "epoch": 0.09792227779915352, + "flos": 1565306339328.0, + "grad_norm": 0.10151454340945995, + "language_loss": 0.79344422, + "learning_rate": 0.000987954302977419, + "loss": 0.80749142, + "num_input_tokens_seen": 42426624, + "router_z_loss_mlp": 0.13378906, + "step": 509, + "time_per_iteration": 4.724441051483154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116621, + "balance_loss_mlp": 1.08655643, + "epoch": 0.09811465948441708, + "flos": 587529165312.0, + "grad_norm": 0.08257009801201759, + "language_loss": 0.94708043, + "learning_rate": 0.0009878862360456733, + "loss": 0.95824659, + "num_input_tokens_seen": 42494592, + "router_z_loss_mlp": 0.30029297, + "step": 510, + "time_per_iteration": 2.703011989593506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122701, + "balance_loss_mlp": 1.09406662, + "epoch": 0.09830704116968064, + "flos": 612719285760.0, + "grad_norm": 0.06191460590209878, + "language_loss": 0.88457662, + "learning_rate": 0.0009878179796996922, + "loss": 0.89580369, + "num_input_tokens_seen": 42564944, + "router_z_loss_mlp": 0.28637695, + "step": 511, + "time_per_iteration": 2.7212226390838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128587, + "balance_loss_mlp": 1.09885597, + "epoch": 0.09849942285494422, + "flos": 538528227840.0, + "grad_norm": 0.06874751685339883, + "language_loss": 0.9199326, + "learning_rate": 0.0009877495339659754, + "loss": 0.9312185, + "num_input_tokens_seen": 42645616, + "router_z_loss_mlp": 0.29724121, + "step": 512, + "time_per_iteration": 2.7520575523376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111609, + "balance_loss_mlp": 1.08826661, + "epoch": 0.09869180454020778, + "flos": 620193535488.0, + "grad_norm": 0.06953003964378547, + "language_loss": 0.87301105, + "learning_rate": 0.000987680898871096, + "loss": 0.88417196, + "num_input_tokens_seen": 42713632, + "router_z_loss_mlp": 0.27832031, + "step": 513, + "time_per_iteration": 2.7121992111206055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134292, + "balance_loss_mlp": 1.10401261, + "epoch": 0.09888418622547133, + "flos": 811375363584.0, + "grad_norm": 0.1024184057853134, + "language_loss": 0.87763435, + "learning_rate": 0.0009876120744417, + "loss": 0.88897729, + "num_input_tokens_seen": 42789088, + "router_z_loss_mlp": 0.30273438, + "step": 514, + "time_per_iteration": 2.971573829650879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123143, + "balance_loss_mlp": 1.09267306, + "epoch": 0.0990765679107349, + "flos": 535548450816.0, + "grad_norm": 0.06764912074049458, + "language_loss": 0.95588082, + "learning_rate": 0.0009875430607045078, + "loss": 0.9671123, + "num_input_tokens_seen": 42861168, + "router_z_loss_mlp": 0.3046875, + "step": 515, + "time_per_iteration": 2.6630361080169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108813, + "balance_loss_mlp": 1.08072746, + "epoch": 0.09926894959599845, + "flos": 587607740928.0, + "grad_norm": 0.06593749006245919, + "language_loss": 0.92788792, + "learning_rate": 0.000987473857686313, + "loss": 0.93897605, + "num_input_tokens_seen": 42934112, + "router_z_loss_mlp": 0.28076172, + "step": 516, + "time_per_iteration": 2.710068702697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121556, + "balance_loss_mlp": 1.09039485, + "epoch": 0.09946133128126203, + "flos": 640947409920.0, + "grad_norm": 0.08862761474564218, + "language_loss": 0.9451825, + "learning_rate": 0.0009874044654139824, + "loss": 0.95639801, + "num_input_tokens_seen": 43005248, + "router_z_loss_mlp": 0.3112793, + "step": 517, + "time_per_iteration": 2.729975461959839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117034, + "balance_loss_mlp": 1.08520555, + "epoch": 0.09965371296652559, + "flos": 465546327552.0, + "grad_norm": 0.09157938746936445, + "language_loss": 0.9250825, + "learning_rate": 0.0009873348839144563, + "loss": 0.93625283, + "num_input_tokens_seen": 43070576, + "router_z_loss_mlp": 0.31811523, + "step": 518, + "time_per_iteration": 2.5117127895355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112516, + "balance_loss_mlp": 1.09540534, + "epoch": 0.09984609465178915, + "flos": 483365505024.0, + "grad_norm": 0.07736257304557469, + "language_loss": 0.9674046, + "learning_rate": 0.000987265113214749, + "loss": 0.97865617, + "num_input_tokens_seen": 43138048, + "router_z_loss_mlp": 0.29711914, + "step": 519, + "time_per_iteration": 2.5816774368286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147544, + "balance_loss_mlp": 1.11421299, + "epoch": 0.1000384763370527, + "flos": 568764260352.0, + "grad_norm": 0.08763817133734854, + "language_loss": 0.96583092, + "learning_rate": 0.0009871951533419476, + "loss": 0.97730637, + "num_input_tokens_seen": 43207600, + "router_z_loss_mlp": 0.33325195, + "step": 520, + "time_per_iteration": 2.638664484024048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140597, + "balance_loss_mlp": 1.108482, + "epoch": 0.10023085802231628, + "flos": 545515057152.0, + "grad_norm": 0.10925869968591369, + "language_loss": 0.88377398, + "learning_rate": 0.0009871250043232132, + "loss": 0.89517999, + "num_input_tokens_seen": 43285104, + "router_z_loss_mlp": 0.32104492, + "step": 521, + "time_per_iteration": 2.70491886138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136934, + "balance_loss_mlp": 1.10555792, + "epoch": 0.10042323970757984, + "flos": 503208557568.0, + "grad_norm": 0.07694864026409119, + "language_loss": 0.87725985, + "learning_rate": 0.0009870546661857797, + "loss": 0.8886292, + "num_input_tokens_seen": 43353312, + "router_z_loss_mlp": 0.31347656, + "step": 522, + "time_per_iteration": 2.653456211090088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126678, + "balance_loss_mlp": 1.09380031, + "epoch": 0.1006156213928434, + "flos": 770084402688.0, + "grad_norm": 0.08414569380370593, + "language_loss": 0.95787346, + "learning_rate": 0.0009869841389569553, + "loss": 0.96914017, + "num_input_tokens_seen": 43427680, + "router_z_loss_mlp": 0.32885742, + "step": 523, + "time_per_iteration": 2.9442663192749023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116557, + "balance_loss_mlp": 1.08625388, + "epoch": 0.10080800307810696, + "flos": 489786338304.0, + "grad_norm": 0.06587351152736676, + "language_loss": 0.88897854, + "learning_rate": 0.0009869134226641206, + "loss": 0.90014416, + "num_input_tokens_seen": 43495200, + "router_z_loss_mlp": 0.30297852, + "step": 524, + "time_per_iteration": 2.5559868812561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110225, + "balance_loss_mlp": 1.07746601, + "epoch": 0.10100038476337053, + "flos": 454478252544.0, + "grad_norm": 0.09167866019985617, + "language_loss": 0.88383424, + "learning_rate": 0.0009868425173347303, + "loss": 0.89493656, + "num_input_tokens_seen": 43566256, + "router_z_loss_mlp": 0.32788086, + "step": 525, + "time_per_iteration": 2.645116090774536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111349, + "balance_loss_mlp": 1.08216143, + "epoch": 0.10119276644863409, + "flos": 556155348480.0, + "grad_norm": 0.07288604326691553, + "language_loss": 0.96749896, + "learning_rate": 0.0009867714229963125, + "loss": 0.97863394, + "num_input_tokens_seen": 43639696, + "router_z_loss_mlp": 0.31323242, + "step": 526, + "time_per_iteration": 2.730703592300415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106354, + "balance_loss_mlp": 1.07540703, + "epoch": 0.10138514813389765, + "flos": 515990587392.0, + "grad_norm": 0.07095113284061857, + "language_loss": 0.93916923, + "learning_rate": 0.000986700139676468, + "loss": 0.95023274, + "num_input_tokens_seen": 43703872, + "router_z_loss_mlp": 0.30932617, + "step": 527, + "time_per_iteration": 2.5836338996887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110446, + "balance_loss_mlp": 1.07833052, + "epoch": 0.10157752981916121, + "flos": 500323322880.0, + "grad_norm": 0.06933811905919615, + "language_loss": 0.91673893, + "learning_rate": 0.0009866286674028717, + "loss": 0.92784333, + "num_input_tokens_seen": 43774416, + "router_z_loss_mlp": 0.32104492, + "step": 528, + "time_per_iteration": 2.7084739208221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101831, + "balance_loss_mlp": 1.07100391, + "epoch": 0.10176991150442478, + "flos": 656444376576.0, + "grad_norm": 0.07189407365130172, + "language_loss": 0.88586026, + "learning_rate": 0.0009865570062032717, + "loss": 0.8968786, + "num_input_tokens_seen": 43853376, + "router_z_loss_mlp": 0.30810547, + "step": 529, + "time_per_iteration": 2.9141628742218018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103952, + "balance_loss_mlp": 1.07443571, + "epoch": 0.10196229318968834, + "flos": 572974953984.0, + "grad_norm": 0.06841647032337263, + "language_loss": 0.93659967, + "learning_rate": 0.0009864851561054893, + "loss": 0.94763923, + "num_input_tokens_seen": 43929632, + "router_z_loss_mlp": 0.29516602, + "step": 530, + "time_per_iteration": 2.7539894580841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090977, + "balance_loss_mlp": 1.06110358, + "epoch": 0.1021546748749519, + "flos": 517946061312.0, + "grad_norm": 0.07340246055426732, + "language_loss": 0.91722125, + "learning_rate": 0.0009864131171374191, + "loss": 0.92813098, + "num_input_tokens_seen": 44002144, + "router_z_loss_mlp": 0.29882812, + "step": 531, + "time_per_iteration": 2.6921956539154053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109944, + "balance_loss_mlp": 1.06749225, + "epoch": 0.10234705656021546, + "flos": 609470286336.0, + "grad_norm": 0.07867637119915549, + "language_loss": 0.91107762, + "learning_rate": 0.0009863408893270292, + "loss": 0.92207205, + "num_input_tokens_seen": 44078272, + "router_z_loss_mlp": 0.31933594, + "step": 532, + "time_per_iteration": 2.7911570072174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106913, + "balance_loss_mlp": 1.07396317, + "epoch": 0.10253943824547904, + "flos": 601473710592.0, + "grad_norm": 0.08191923529880715, + "language_loss": 0.86522454, + "learning_rate": 0.0009862684727023605, + "loss": 0.87629366, + "num_input_tokens_seen": 44152304, + "router_z_loss_mlp": 0.3293457, + "step": 533, + "time_per_iteration": 2.7452800273895264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105875, + "balance_loss_mlp": 1.07466602, + "epoch": 0.1027318199307426, + "flos": 662647011840.0, + "grad_norm": 0.07282647554851075, + "language_loss": 0.90315968, + "learning_rate": 0.0009861958672915283, + "loss": 0.91421843, + "num_input_tokens_seen": 44226720, + "router_z_loss_mlp": 0.31201172, + "step": 534, + "time_per_iteration": 2.8041269779205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096602, + "balance_loss_mlp": 1.0673244, + "epoch": 0.10292420161600616, + "flos": 682962928128.0, + "grad_norm": 0.058349855756870184, + "language_loss": 0.90126884, + "learning_rate": 0.0009861230731227201, + "loss": 0.9122349, + "num_input_tokens_seen": 44303600, + "router_z_loss_mlp": 0.29248047, + "step": 535, + "time_per_iteration": 2.8627805709838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108033, + "balance_loss_mlp": 1.07615674, + "epoch": 0.10311658330126972, + "flos": 490042414080.0, + "grad_norm": 0.091555564896082, + "language_loss": 0.91954774, + "learning_rate": 0.0009860500902241973, + "loss": 0.93062806, + "num_input_tokens_seen": 44370960, + "router_z_loss_mlp": 0.31884766, + "step": 536, + "time_per_iteration": 2.6052157878875732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120335, + "balance_loss_mlp": 1.08800602, + "epoch": 0.10330896498653329, + "flos": 431508446208.0, + "grad_norm": 0.0585767653270487, + "language_loss": 0.96574026, + "learning_rate": 0.0009859769186242942, + "loss": 0.97694361, + "num_input_tokens_seen": 44435584, + "router_z_loss_mlp": 0.32324219, + "step": 537, + "time_per_iteration": 2.51180362701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116517, + "balance_loss_mlp": 1.08571362, + "epoch": 0.10350134667179685, + "flos": 549330052608.0, + "grad_norm": 0.0744119924563098, + "language_loss": 0.8926785, + "learning_rate": 0.0009859035583514187, + "loss": 0.90384364, + "num_input_tokens_seen": 44505456, + "router_z_loss_mlp": 0.30834961, + "step": 538, + "time_per_iteration": 2.6369993686676025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146613, + "balance_loss_mlp": 1.11380613, + "epoch": 0.10369372835706041, + "flos": 640327569408.0, + "grad_norm": 0.09976070350989504, + "language_loss": 0.90389431, + "learning_rate": 0.0009858300094340517, + "loss": 0.91536051, + "num_input_tokens_seen": 44580208, + "router_z_loss_mlp": 0.328125, + "step": 539, + "time_per_iteration": 2.7695086002349854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150737, + "balance_loss_mlp": 1.11838388, + "epoch": 0.10388611004232397, + "flos": 521500598784.0, + "grad_norm": 0.08771902350159133, + "language_loss": 0.85304511, + "learning_rate": 0.0009857562719007473, + "loss": 0.8645525, + "num_input_tokens_seen": 44646576, + "router_z_loss_mlp": 0.32324219, + "step": 540, + "time_per_iteration": 2.59881329536438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144681, + "balance_loss_mlp": 1.11320961, + "epoch": 0.10407849172758753, + "flos": 702111946752.0, + "grad_norm": 0.07496368213999542, + "language_loss": 0.88249481, + "learning_rate": 0.0009856823457801331, + "loss": 0.89394164, + "num_input_tokens_seen": 44726752, + "router_z_loss_mlp": 0.31494141, + "step": 541, + "time_per_iteration": 2.873481035232544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119735, + "balance_loss_mlp": 1.08738184, + "epoch": 0.1042708734128511, + "flos": 502652736000.0, + "grad_norm": 0.06973546911765124, + "language_loss": 0.94998306, + "learning_rate": 0.00098560823110091, + "loss": 0.96118045, + "num_input_tokens_seen": 44795824, + "router_z_loss_mlp": 0.32373047, + "step": 542, + "time_per_iteration": 2.661374807357788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118044, + "balance_loss_mlp": 1.08757377, + "epoch": 0.10446325509811466, + "flos": 485331153408.0, + "grad_norm": 0.0792045331206184, + "language_loss": 0.95517921, + "learning_rate": 0.000985533927891851, + "loss": 0.96635967, + "num_input_tokens_seen": 44868496, + "router_z_loss_mlp": 0.30419922, + "step": 543, + "time_per_iteration": 2.7264697551727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096256, + "balance_loss_mlp": 1.06502366, + "epoch": 0.10465563678337822, + "flos": 568365590016.0, + "grad_norm": 0.0919664039836503, + "language_loss": 0.93718112, + "learning_rate": 0.0009854594361818044, + "loss": 0.94814372, + "num_input_tokens_seen": 44939888, + "router_z_loss_mlp": 0.31201172, + "step": 544, + "time_per_iteration": 2.6869821548461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099422, + "balance_loss_mlp": 1.0683322, + "epoch": 0.10484801846864178, + "flos": 625806853632.0, + "grad_norm": 0.1054615502202609, + "language_loss": 0.927598, + "learning_rate": 0.0009853847559996897, + "loss": 0.9385922, + "num_input_tokens_seen": 45012720, + "router_z_loss_mlp": 0.31103516, + "step": 545, + "time_per_iteration": 2.7953526973724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100313, + "balance_loss_mlp": 1.06772113, + "epoch": 0.10504040015390535, + "flos": 743063874048.0, + "grad_norm": 0.0768702593450629, + "language_loss": 0.92008656, + "learning_rate": 0.0009853098873745, + "loss": 0.93108964, + "num_input_tokens_seen": 45093744, + "router_z_loss_mlp": 0.32592773, + "step": 546, + "time_per_iteration": 3.0344293117523193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106321, + "balance_loss_mlp": 1.07430172, + "epoch": 0.10523278183916891, + "flos": 586382616576.0, + "grad_norm": 0.072035501246702, + "language_loss": 0.90983582, + "learning_rate": 0.0009852348303353027, + "loss": 0.92089903, + "num_input_tokens_seen": 45172784, + "router_z_loss_mlp": 0.32006836, + "step": 547, + "time_per_iteration": 2.7647972106933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110403, + "balance_loss_mlp": 1.07100892, + "epoch": 0.10542516352443247, + "flos": 869270552064.0, + "grad_norm": 0.07817580313906373, + "language_loss": 0.84611928, + "learning_rate": 0.000985159584911237, + "loss": 0.85715961, + "num_input_tokens_seen": 45255600, + "router_z_loss_mlp": 0.33007812, + "step": 548, + "time_per_iteration": 3.143122434616089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104478, + "balance_loss_mlp": 1.07212472, + "epoch": 0.10561754520969603, + "flos": 505182970368.0, + "grad_norm": 0.08898596974063745, + "language_loss": 0.91126573, + "learning_rate": 0.0009850841511315162, + "loss": 0.92231047, + "num_input_tokens_seen": 45325072, + "router_z_loss_mlp": 0.32348633, + "step": 549, + "time_per_iteration": 2.6164846420288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112982, + "balance_loss_mlp": 1.07946038, + "epoch": 0.1058099268949596, + "flos": 559690947072.0, + "grad_norm": 0.06224197989448247, + "language_loss": 0.92054999, + "learning_rate": 0.0009850085290254256, + "loss": 0.93167984, + "num_input_tokens_seen": 45401440, + "router_z_loss_mlp": 0.33520508, + "step": 550, + "time_per_iteration": 2.7473480701446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110676, + "balance_loss_mlp": 1.07431078, + "epoch": 0.10600230858022316, + "flos": 561773048832.0, + "grad_norm": 0.05678957528127819, + "language_loss": 0.88957977, + "learning_rate": 0.0009849327186223246, + "loss": 0.90064728, + "num_input_tokens_seen": 45479264, + "router_z_loss_mlp": 0.32446289, + "step": 551, + "time_per_iteration": 2.805126905441284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094878, + "balance_loss_mlp": 1.06297779, + "epoch": 0.10619469026548672, + "flos": 494079989760.0, + "grad_norm": 0.07906939671673464, + "language_loss": 0.95596325, + "learning_rate": 0.000984856719951646, + "loss": 0.96691203, + "num_input_tokens_seen": 45547328, + "router_z_loss_mlp": 0.31860352, + "step": 552, + "time_per_iteration": 2.5688273906707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105536, + "balance_loss_mlp": 1.07370734, + "epoch": 0.10638707195075028, + "flos": 675843678720.0, + "grad_norm": 0.06469368191660979, + "language_loss": 0.93170857, + "learning_rate": 0.0009847805330428943, + "loss": 0.94276392, + "num_input_tokens_seen": 45631152, + "router_z_loss_mlp": 0.31811523, + "step": 553, + "time_per_iteration": 2.8858227729797363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105116, + "balance_loss_mlp": 1.07080746, + "epoch": 0.10657945363601386, + "flos": 487811925504.0, + "grad_norm": 0.07365688544553677, + "language_loss": 0.94454086, + "learning_rate": 0.0009847041579256481, + "loss": 0.95559192, + "num_input_tokens_seen": 45698208, + "router_z_loss_mlp": 0.34326172, + "step": 554, + "time_per_iteration": 2.5912039279937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114154, + "balance_loss_mlp": 1.08158636, + "epoch": 0.10677183532127742, + "flos": 482706376704.0, + "grad_norm": 0.06731486395760358, + "language_loss": 0.95310724, + "learning_rate": 0.0009846275946295592, + "loss": 0.96424878, + "num_input_tokens_seen": 45766640, + "router_z_loss_mlp": 0.32568359, + "step": 555, + "time_per_iteration": 2.6071619987487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120557, + "balance_loss_mlp": 1.08755958, + "epoch": 0.10696421700654098, + "flos": 655917668352.0, + "grad_norm": 0.06239681935918944, + "language_loss": 0.88169777, + "learning_rate": 0.0009845508431843518, + "loss": 0.89290333, + "num_input_tokens_seen": 45851408, + "router_z_loss_mlp": 0.32983398, + "step": 556, + "time_per_iteration": 2.9906973838806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122479, + "balance_loss_mlp": 1.08986306, + "epoch": 0.10715659869180454, + "flos": 567483881472.0, + "grad_norm": 0.06803394611182671, + "language_loss": 0.89010829, + "learning_rate": 0.0009844739036198233, + "loss": 0.90133309, + "num_input_tokens_seen": 45919824, + "router_z_loss_mlp": 0.32592773, + "step": 557, + "time_per_iteration": 2.6462793350219727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113246, + "balance_loss_mlp": 1.09927225, + "epoch": 0.10734898037706811, + "flos": 540432829440.0, + "grad_norm": 0.0683091886411484, + "language_loss": 0.96000761, + "learning_rate": 0.0009843967759658448, + "loss": 0.97133219, + "num_input_tokens_seen": 45991024, + "router_z_loss_mlp": 0.33203125, + "step": 558, + "time_per_iteration": 2.664320707321167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01369087, + "balance_loss_mlp": 1.3546865, + "epoch": 0.10754136206233167, + "flos": 1475870008320.0, + "grad_norm": 0.12144998025248735, + "language_loss": 0.72767758, + "learning_rate": 0.0009843194602523592, + "loss": 0.74136841, + "num_input_tokens_seen": 46212736, + "router_z_loss_mlp": 0.14355469, + "step": 559, + "time_per_iteration": 4.836310148239136 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124853, + "balance_loss_mlp": 1.0925231, + "epoch": 0.10773374374759523, + "flos": 512155243008.0, + "grad_norm": 0.06725764235558847, + "language_loss": 0.96045369, + "learning_rate": 0.000984241956509384, + "loss": 0.97170222, + "num_input_tokens_seen": 46283920, + "router_z_loss_mlp": 0.32324219, + "step": 560, + "time_per_iteration": 2.7409372329711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134795, + "balance_loss_mlp": 1.10005689, + "epoch": 0.10792612543285879, + "flos": 496261016064.0, + "grad_norm": 0.08502468521942065, + "language_loss": 0.91520619, + "learning_rate": 0.0009841642647670078, + "loss": 0.92655414, + "num_input_tokens_seen": 46349664, + "router_z_loss_mlp": 0.34741211, + "step": 561, + "time_per_iteration": 2.5360167026519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134435, + "balance_loss_mlp": 1.10050821, + "epoch": 0.10811850711812235, + "flos": 735131317248.0, + "grad_norm": 0.08550854990342285, + "language_loss": 0.86122006, + "learning_rate": 0.0009840863850553944, + "loss": 0.87256444, + "num_input_tokens_seen": 46432688, + "router_z_loss_mlp": 0.33911133, + "step": 562, + "time_per_iteration": 3.0013930797576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118751, + "balance_loss_mlp": 1.08604038, + "epoch": 0.10831088880338592, + "flos": 611257024512.0, + "grad_norm": 0.07414056330929218, + "language_loss": 0.92513216, + "learning_rate": 0.0009840083174047782, + "loss": 0.93631971, + "num_input_tokens_seen": 46507216, + "router_z_loss_mlp": 0.3269043, + "step": 563, + "time_per_iteration": 2.761746883392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125103, + "balance_loss_mlp": 1.09353685, + "epoch": 0.10850327048864948, + "flos": 556022928384.0, + "grad_norm": 0.06849160846851732, + "language_loss": 0.86520386, + "learning_rate": 0.0009839300618454685, + "loss": 0.87645483, + "num_input_tokens_seen": 46590464, + "router_z_loss_mlp": 0.31518555, + "step": 564, + "time_per_iteration": 2.833545684814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124691, + "balance_loss_mlp": 1.09291005, + "epoch": 0.10869565217391304, + "flos": 602902476288.0, + "grad_norm": 0.06688991061359367, + "language_loss": 0.92471159, + "learning_rate": 0.0009838516184078466, + "loss": 0.9359585, + "num_input_tokens_seen": 46666240, + "router_z_loss_mlp": 0.31762695, + "step": 565, + "time_per_iteration": 2.838482618331909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112559, + "balance_loss_mlp": 1.09345102, + "epoch": 0.1088880338591766, + "flos": 525922288128.0, + "grad_norm": 0.08266802783800845, + "language_loss": 0.89073956, + "learning_rate": 0.0009837729871223669, + "loss": 0.90199542, + "num_input_tokens_seen": 46734288, + "router_z_loss_mlp": 0.3215332, + "step": 566, + "time_per_iteration": 2.6670589447021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134729, + "balance_loss_mlp": 1.10073042, + "epoch": 0.10908041554444017, + "flos": 619986921984.0, + "grad_norm": 0.06816497946354988, + "language_loss": 0.89503658, + "learning_rate": 0.0009836941680195568, + "loss": 0.90638387, + "num_input_tokens_seen": 46809920, + "router_z_loss_mlp": 0.34033203, + "step": 567, + "time_per_iteration": 2.7894582748413086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131677, + "balance_loss_mlp": 1.09691525, + "epoch": 0.10927279722970373, + "flos": 897740195328.0, + "grad_norm": 0.07371226629870802, + "language_loss": 0.8534497, + "learning_rate": 0.0009836151611300166, + "loss": 0.86476642, + "num_input_tokens_seen": 46889984, + "router_z_loss_mlp": 0.34765625, + "step": 568, + "time_per_iteration": 3.204500913619995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116051, + "balance_loss_mlp": 1.08467555, + "epoch": 0.10946517891496729, + "flos": 528408852480.0, + "grad_norm": 0.061952855977424344, + "language_loss": 0.96103537, + "learning_rate": 0.0009835359664844194, + "loss": 0.97219586, + "num_input_tokens_seen": 46959536, + "router_z_loss_mlp": 0.3137207, + "step": 569, + "time_per_iteration": 2.6154720783233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124163, + "balance_loss_mlp": 1.11014414, + "epoch": 0.10965756060023085, + "flos": 1559944714752.0, + "grad_norm": 0.03358522647050957, + "language_loss": 0.81036806, + "learning_rate": 0.0009834565841135114, + "loss": 0.82160974, + "num_input_tokens_seen": 47196960, + "router_z_loss_mlp": 0.140625, + "step": 570, + "time_per_iteration": 4.907090187072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112487, + "balance_loss_mlp": 1.09406638, + "epoch": 0.10984994228549443, + "flos": 512820163584.0, + "grad_norm": 0.08674533322611513, + "language_loss": 0.9339065, + "learning_rate": 0.0009833770140481118, + "loss": 0.9451552, + "num_input_tokens_seen": 47266560, + "router_z_loss_mlp": 0.30786133, + "step": 571, + "time_per_iteration": 2.694821357727051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121358, + "balance_loss_mlp": 1.09072113, + "epoch": 0.11004232397075799, + "flos": 954314307072.0, + "grad_norm": 0.07582699316256973, + "language_loss": 0.84126109, + "learning_rate": 0.000983297256319112, + "loss": 0.85247469, + "num_input_tokens_seen": 47348512, + "router_z_loss_mlp": 0.30664062, + "step": 572, + "time_per_iteration": 3.208728313446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144326, + "balance_loss_mlp": 1.11097169, + "epoch": 0.11023470565602154, + "flos": 487921024512.0, + "grad_norm": 0.07530566153242002, + "language_loss": 0.8789041, + "learning_rate": 0.000983217310957477, + "loss": 0.89034736, + "num_input_tokens_seen": 47425392, + "router_z_loss_mlp": 0.33349609, + "step": 573, + "time_per_iteration": 2.7521331310272217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113474, + "balance_loss_mlp": 1.1014812, + "epoch": 0.1104270873412851, + "flos": 655521970176.0, + "grad_norm": 0.08427122985019045, + "language_loss": 0.91161472, + "learning_rate": 0.000983137177994244, + "loss": 0.92296207, + "num_input_tokens_seen": 47502336, + "router_z_loss_mlp": 0.33300781, + "step": 574, + "time_per_iteration": 2.869795083999634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105984, + "balance_loss_mlp": 1.0752039, + "epoch": 0.11061946902654868, + "flos": 723097165824.0, + "grad_norm": 0.0803000190442887, + "language_loss": 0.87202144, + "learning_rate": 0.0009830568574605235, + "loss": 0.88308132, + "num_input_tokens_seen": 47583552, + "router_z_loss_mlp": 0.30737305, + "step": 575, + "time_per_iteration": 2.952505111694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111674, + "balance_loss_mlp": 1.07963109, + "epoch": 0.11081185071181224, + "flos": 835113397248.0, + "grad_norm": 0.07764025760375837, + "language_loss": 0.89234924, + "learning_rate": 0.0009829763493874992, + "loss": 0.90346599, + "num_input_tokens_seen": 47663440, + "router_z_loss_mlp": 0.3203125, + "step": 576, + "time_per_iteration": 3.0367727279663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110641, + "balance_loss_mlp": 1.07508206, + "epoch": 0.1110042323970758, + "flos": 608776252416.0, + "grad_norm": 0.06795308301133055, + "language_loss": 0.94366598, + "learning_rate": 0.0009828956538064264, + "loss": 0.95473009, + "num_input_tokens_seen": 47741920, + "router_z_loss_mlp": 0.31347656, + "step": 577, + "time_per_iteration": 2.783268928527832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091394, + "balance_loss_mlp": 1.0610671, + "epoch": 0.11119661408233936, + "flos": 595643604480.0, + "grad_norm": 0.0662915232098912, + "language_loss": 0.9183138, + "learning_rate": 0.0009828147707486344, + "loss": 0.92922771, + "num_input_tokens_seen": 47815136, + "router_z_loss_mlp": 0.30297852, + "step": 578, + "time_per_iteration": 2.6628670692443848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092993, + "balance_loss_mlp": 1.06109214, + "epoch": 0.11138899576760293, + "flos": 555573385728.0, + "grad_norm": 0.07355059798421615, + "language_loss": 0.87444091, + "learning_rate": 0.0009827337002455245, + "loss": 0.88537085, + "num_input_tokens_seen": 47881360, + "router_z_loss_mlp": 0.31884766, + "step": 579, + "time_per_iteration": 2.616842031478882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087398, + "balance_loss_mlp": 1.05857313, + "epoch": 0.11158137745286649, + "flos": 689418667008.0, + "grad_norm": 0.05531737995895799, + "language_loss": 0.89474124, + "learning_rate": 0.0009826524423285712, + "loss": 0.90561521, + "num_input_tokens_seen": 47962720, + "router_z_loss_mlp": 0.28808594, + "step": 580, + "time_per_iteration": 2.896409749984741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093471, + "balance_loss_mlp": 1.06393051, + "epoch": 0.11177375913813005, + "flos": 762688728576.0, + "grad_norm": 0.06807232662928764, + "language_loss": 0.9046967, + "learning_rate": 0.0009825709970293218, + "loss": 0.91563141, + "num_input_tokens_seen": 48035472, + "router_z_loss_mlp": 0.2956543, + "step": 581, + "time_per_iteration": 2.8843319416046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096208, + "balance_loss_mlp": 1.0669775, + "epoch": 0.11196614082339361, + "flos": 806211588096.0, + "grad_norm": 0.07053725402235117, + "language_loss": 0.96166003, + "learning_rate": 0.0009824893643793956, + "loss": 0.9726221, + "num_input_tokens_seen": 48116944, + "router_z_loss_mlp": 0.29248047, + "step": 582, + "time_per_iteration": 3.04577898979187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104715, + "balance_loss_mlp": 1.07305288, + "epoch": 0.11215852250865718, + "flos": 558350931456.0, + "grad_norm": 0.10752491555358674, + "language_loss": 0.89033759, + "learning_rate": 0.0009824075444104857, + "loss": 0.90138471, + "num_input_tokens_seen": 48187808, + "router_z_loss_mlp": 0.31689453, + "step": 583, + "time_per_iteration": 2.682020902633667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125233, + "balance_loss_mlp": 1.09497714, + "epoch": 0.11235090419392074, + "flos": 513322140672.0, + "grad_norm": 0.06606619546840543, + "language_loss": 0.94941097, + "learning_rate": 0.000982325537154357, + "loss": 0.9606632, + "num_input_tokens_seen": 48254464, + "router_z_loss_mlp": 0.30224609, + "step": 584, + "time_per_iteration": 2.577632427215576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122311, + "balance_loss_mlp": 1.09045827, + "epoch": 0.1125432858791843, + "flos": 491209311744.0, + "grad_norm": 0.07452844115700766, + "language_loss": 0.95190644, + "learning_rate": 0.0009822433426428484, + "loss": 0.96312958, + "num_input_tokens_seen": 48318784, + "router_z_loss_mlp": 0.31860352, + "step": 585, + "time_per_iteration": 2.560591220855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126565, + "balance_loss_mlp": 1.09280539, + "epoch": 0.11273566756444786, + "flos": 510476193792.0, + "grad_norm": 0.11434848401200806, + "language_loss": 0.87964213, + "learning_rate": 0.0009821609609078697, + "loss": 0.89090776, + "num_input_tokens_seen": 48389248, + "router_z_loss_mlp": 0.3371582, + "step": 586, + "time_per_iteration": 2.633925437927246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118464, + "balance_loss_mlp": 1.08785152, + "epoch": 0.11292804924971142, + "flos": 622149009408.0, + "grad_norm": 0.08000190427267627, + "language_loss": 0.905334, + "learning_rate": 0.0009820783919814045, + "loss": 0.91651857, + "num_input_tokens_seen": 48463312, + "router_z_loss_mlp": 0.3059082, + "step": 587, + "time_per_iteration": 2.806704044342041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111289, + "balance_loss_mlp": 1.07857847, + "epoch": 0.113120430934975, + "flos": 477811823616.0, + "grad_norm": 0.09357252991594707, + "language_loss": 0.83955467, + "learning_rate": 0.0009819956358955095, + "loss": 0.8506676, + "num_input_tokens_seen": 48531856, + "router_z_loss_mlp": 0.32714844, + "step": 588, + "time_per_iteration": 2.5903711318969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109097, + "balance_loss_mlp": 1.07455039, + "epoch": 0.11331281262023855, + "flos": 466801975296.0, + "grad_norm": 0.06610764616840299, + "language_loss": 0.85348701, + "learning_rate": 0.0009819126926823127, + "loss": 0.86457801, + "num_input_tokens_seen": 48596640, + "router_z_loss_mlp": 0.34570312, + "step": 589, + "time_per_iteration": 2.5726494789123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108183, + "balance_loss_mlp": 1.07535291, + "epoch": 0.11350519430550211, + "flos": 650164727808.0, + "grad_norm": 0.06035980490561805, + "language_loss": 0.87806922, + "learning_rate": 0.000981829562374016, + "loss": 0.8891511, + "num_input_tokens_seen": 48669648, + "router_z_loss_mlp": 0.328125, + "step": 590, + "time_per_iteration": 2.7960643768310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112987, + "balance_loss_mlp": 1.08041859, + "epoch": 0.11369757599076567, + "flos": 557547798528.0, + "grad_norm": 0.08830164474684658, + "language_loss": 0.98550045, + "learning_rate": 0.0009817462450028933, + "loss": 0.99663031, + "num_input_tokens_seen": 48737392, + "router_z_loss_mlp": 0.32568359, + "step": 591, + "time_per_iteration": 2.654860734939575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107596, + "balance_loss_mlp": 1.07526684, + "epoch": 0.11388995767602925, + "flos": 570774988800.0, + "grad_norm": 0.06245390963608315, + "language_loss": 0.86587834, + "learning_rate": 0.0009816627406012916, + "loss": 0.87695432, + "num_input_tokens_seen": 48817136, + "router_z_loss_mlp": 0.32348633, + "step": 592, + "time_per_iteration": 2.8017733097076416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101808, + "balance_loss_mlp": 1.07074225, + "epoch": 0.1140823393612928, + "flos": 740069540352.0, + "grad_norm": 0.06581053360364857, + "language_loss": 0.8595314, + "learning_rate": 0.0009815790492016295, + "loss": 0.87054944, + "num_input_tokens_seen": 48895808, + "router_z_loss_mlp": 0.31030273, + "step": 593, + "time_per_iteration": 2.9602174758911133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097875, + "balance_loss_mlp": 1.06666636, + "epoch": 0.11427472104655637, + "flos": 698694211584.0, + "grad_norm": 0.07124053574400792, + "language_loss": 0.87982339, + "learning_rate": 0.0009814951708363993, + "loss": 0.89080215, + "num_input_tokens_seen": 48967456, + "router_z_loss_mlp": 0.31201172, + "step": 594, + "time_per_iteration": 2.818460702896118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167391, + "balance_loss_mlp": 1.15413451, + "epoch": 0.11446710273181993, + "flos": 1476387952128.0, + "grad_norm": 0.04038129773095179, + "language_loss": 0.77990985, + "learning_rate": 0.0009814111055381654, + "loss": 0.79158378, + "num_input_tokens_seen": 49193152, + "router_z_loss_mlp": 0.1328125, + "step": 595, + "time_per_iteration": 4.776912450790405 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110386, + "balance_loss_mlp": 1.07250798, + "epoch": 0.1146594844170835, + "flos": 494641603584.0, + "grad_norm": 0.1404346857169784, + "language_loss": 0.89489102, + "learning_rate": 0.0009813268533395648, + "loss": 0.90592968, + "num_input_tokens_seen": 49260960, + "router_z_loss_mlp": 0.3137207, + "step": 596, + "time_per_iteration": 2.562816858291626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115324, + "balance_loss_mlp": 1.08344746, + "epoch": 0.11485186610234706, + "flos": 474596319744.0, + "grad_norm": 0.07456374098915484, + "language_loss": 0.89145029, + "learning_rate": 0.0009812424142733073, + "loss": 0.90260351, + "num_input_tokens_seen": 49327616, + "router_z_loss_mlp": 0.31884766, + "step": 597, + "time_per_iteration": 2.5198655128479004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123971, + "balance_loss_mlp": 1.0946219, + "epoch": 0.11504424778761062, + "flos": 730858014720.0, + "grad_norm": 0.05033183127205697, + "language_loss": 0.86898923, + "learning_rate": 0.000981157788372175, + "loss": 0.88022888, + "num_input_tokens_seen": 49412864, + "router_z_loss_mlp": 0.29345703, + "step": 598, + "time_per_iteration": 3.004558563232422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155063, + "balance_loss_mlp": 1.12290049, + "epoch": 0.11523662947287418, + "flos": 545539788288.0, + "grad_norm": 0.07554757352201513, + "language_loss": 0.90216064, + "learning_rate": 0.0009810729756690223, + "loss": 0.91371131, + "num_input_tokens_seen": 49483584, + "router_z_loss_mlp": 0.3215332, + "step": 599, + "time_per_iteration": 2.7165520191192627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149643, + "balance_loss_mlp": 1.11790919, + "epoch": 0.11542901115813775, + "flos": 774737436672.0, + "grad_norm": 0.08801397326806587, + "language_loss": 0.92855275, + "learning_rate": 0.0009809879761967766, + "loss": 0.94004917, + "num_input_tokens_seen": 49563568, + "router_z_loss_mlp": 0.31738281, + "step": 600, + "time_per_iteration": 2.9548492431640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115619, + "balance_loss_mlp": 1.12004542, + "epoch": 0.11562139284340131, + "flos": 730585972224.0, + "grad_norm": 0.08285308963026158, + "language_loss": 0.87716347, + "learning_rate": 0.0009809027899884378, + "loss": 0.8887254, + "num_input_tokens_seen": 49640800, + "router_z_loss_mlp": 0.36157227, + "step": 601, + "time_per_iteration": 2.9107346534729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131924, + "balance_loss_mlp": 1.10085821, + "epoch": 0.11581377452866487, + "flos": 535589148672.0, + "grad_norm": 0.07059046613839054, + "language_loss": 0.89834028, + "learning_rate": 0.0009808174170770779, + "loss": 0.90965956, + "num_input_tokens_seen": 49721872, + "router_z_loss_mlp": 0.31079102, + "step": 602, + "time_per_iteration": 2.79127836227417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217718, + "balance_loss_mlp": 1.20541608, + "epoch": 0.11600615621392843, + "flos": 1554968613888.0, + "grad_norm": 0.07528653751738872, + "language_loss": 0.84898245, + "learning_rate": 0.0009807318574958418, + "loss": 0.86115962, + "num_input_tokens_seen": 49951472, + "router_z_loss_mlp": 0.12304688, + "step": 603, + "time_per_iteration": 4.862261772155762 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103115, + "balance_loss_mlp": 1.07238269, + "epoch": 0.116198537899192, + "flos": 537178037760.0, + "grad_norm": 0.08106568577848162, + "language_loss": 0.94434869, + "learning_rate": 0.0009806461112779462, + "loss": 0.95537978, + "num_input_tokens_seen": 50021136, + "router_z_loss_mlp": 0.30737305, + "step": 604, + "time_per_iteration": 2.600008249282837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097342, + "balance_loss_mlp": 1.06427336, + "epoch": 0.11639091958445556, + "flos": 453970483200.0, + "grad_norm": 0.09761910402267754, + "language_loss": 0.89590895, + "learning_rate": 0.0009805601784566814, + "loss": 0.90688241, + "num_input_tokens_seen": 50083888, + "router_z_loss_mlp": 0.33056641, + "step": 605, + "time_per_iteration": 2.4687013626098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097807, + "balance_loss_mlp": 1.06635928, + "epoch": 0.11658330126971912, + "flos": 554815332864.0, + "grad_norm": 0.0628453025897625, + "language_loss": 0.96235836, + "learning_rate": 0.0009804740590654089, + "loss": 0.97333646, + "num_input_tokens_seen": 50151744, + "router_z_loss_mlp": 0.31469727, + "step": 606, + "time_per_iteration": 2.654134750366211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109435, + "balance_loss_mlp": 1.0789417, + "epoch": 0.11677568295498268, + "flos": 716025968640.0, + "grad_norm": 0.07837472156111998, + "language_loss": 0.90884066, + "learning_rate": 0.0009803877531375635, + "loss": 0.91993499, + "num_input_tokens_seen": 50221248, + "router_z_loss_mlp": 0.30493164, + "step": 607, + "time_per_iteration": 2.825778007507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112529, + "balance_loss_mlp": 1.08074808, + "epoch": 0.11696806464024626, + "flos": 609474668544.0, + "grad_norm": 0.07263848878870109, + "language_loss": 0.91923869, + "learning_rate": 0.0009803012607066523, + "loss": 0.93036401, + "num_input_tokens_seen": 50293792, + "router_z_loss_mlp": 0.31787109, + "step": 608, + "time_per_iteration": 2.721005916595459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101062, + "balance_loss_mlp": 1.06980491, + "epoch": 0.11716044632550981, + "flos": 520127087616.0, + "grad_norm": 0.06980646294906427, + "language_loss": 0.9077962, + "learning_rate": 0.0009802145818062543, + "loss": 0.91880679, + "num_input_tokens_seen": 50367760, + "router_z_loss_mlp": 0.31225586, + "step": 609, + "time_per_iteration": 2.707643985748291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102401, + "balance_loss_mlp": 1.07035792, + "epoch": 0.11735282801077337, + "flos": 507246133248.0, + "grad_norm": 0.07162886221417876, + "language_loss": 0.9293434, + "learning_rate": 0.0009801277164700212, + "loss": 0.9403674, + "num_input_tokens_seen": 50435664, + "router_z_loss_mlp": 0.3203125, + "step": 610, + "time_per_iteration": 2.6389639377593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094537, + "balance_loss_mlp": 1.06323278, + "epoch": 0.11754520969603693, + "flos": 686339965440.0, + "grad_norm": 0.07220465483683103, + "language_loss": 0.90727574, + "learning_rate": 0.0009800406647316776, + "loss": 0.91822106, + "num_input_tokens_seen": 50514144, + "router_z_loss_mlp": 0.31274414, + "step": 611, + "time_per_iteration": 2.8033382892608643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066854, + "balance_loss_mlp": 1.05369329, + "epoch": 0.1177375913813005, + "flos": 1541673022464.0, + "grad_norm": 0.030783707978337852, + "language_loss": 0.76914459, + "learning_rate": 0.0009799534266250196, + "loss": 0.77981311, + "num_input_tokens_seen": 50738448, + "router_z_loss_mlp": 0.13183594, + "step": 612, + "time_per_iteration": 4.777275562286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116404, + "balance_loss_mlp": 1.08307314, + "epoch": 0.11792997306656407, + "flos": 520269682176.0, + "grad_norm": 0.07589987368124408, + "language_loss": 0.8961159, + "learning_rate": 0.000979866002183916, + "loss": 0.90727997, + "num_input_tokens_seen": 50809328, + "router_z_loss_mlp": 0.33325195, + "step": 613, + "time_per_iteration": 2.6848883628845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109453, + "balance_loss_mlp": 1.07719529, + "epoch": 0.11812235475182763, + "flos": 665980379136.0, + "grad_norm": 0.08667718058784188, + "language_loss": 0.91197205, + "learning_rate": 0.0009797783914423082, + "loss": 0.92306662, + "num_input_tokens_seen": 50887728, + "router_z_loss_mlp": 0.32250977, + "step": 614, + "time_per_iteration": 2.832414388656616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105189, + "balance_loss_mlp": 1.07140493, + "epoch": 0.11831473643709119, + "flos": 621021399552.0, + "grad_norm": 0.06050640051516142, + "language_loss": 0.85425436, + "learning_rate": 0.0009796905944342094, + "loss": 0.86530626, + "num_input_tokens_seen": 50966160, + "router_z_loss_mlp": 0.33813477, + "step": 615, + "time_per_iteration": 2.8220455646514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112849, + "balance_loss_mlp": 1.07913685, + "epoch": 0.11850711812235475, + "flos": 456438108672.0, + "grad_norm": 0.0714748534502384, + "language_loss": 0.893188, + "learning_rate": 0.0009796026111937057, + "loss": 0.90431643, + "num_input_tokens_seen": 51035712, + "router_z_loss_mlp": 0.3371582, + "step": 616, + "time_per_iteration": 2.590566873550415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102524, + "balance_loss_mlp": 1.07005119, + "epoch": 0.11869949980761832, + "flos": 513598565376.0, + "grad_norm": 0.06492309219220607, + "language_loss": 0.89778733, + "learning_rate": 0.0009795144417549552, + "loss": 0.90881252, + "num_input_tokens_seen": 51108656, + "router_z_loss_mlp": 0.32470703, + "step": 617, + "time_per_iteration": 2.672914505004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109626, + "balance_loss_mlp": 1.0773685, + "epoch": 0.11889188149288188, + "flos": 534732171264.0, + "grad_norm": 0.057544425945024125, + "language_loss": 0.90660846, + "learning_rate": 0.0009794260861521883, + "loss": 0.9177047, + "num_input_tokens_seen": 51185552, + "router_z_loss_mlp": 0.32250977, + "step": 618, + "time_per_iteration": 2.817354202270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102516, + "balance_loss_mlp": 1.07009149, + "epoch": 0.11908426317814544, + "flos": 498344527872.0, + "grad_norm": 0.0773697745436404, + "language_loss": 0.87738883, + "learning_rate": 0.0009793375444197075, + "loss": 0.88841403, + "num_input_tokens_seen": 51255808, + "router_z_loss_mlp": 0.32397461, + "step": 619, + "time_per_iteration": 2.607475996017456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011109, + "balance_loss_mlp": 1.07697332, + "epoch": 0.119276644863409, + "flos": 659598833664.0, + "grad_norm": 0.06767977381214116, + "language_loss": 0.86337721, + "learning_rate": 0.000979248816591888, + "loss": 0.87448615, + "num_input_tokens_seen": 51329408, + "router_z_loss_mlp": 0.33935547, + "step": 620, + "time_per_iteration": 2.758866548538208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098794, + "balance_loss_mlp": 1.06667948, + "epoch": 0.11946902654867257, + "flos": 758396487168.0, + "grad_norm": 0.06819106164994826, + "language_loss": 0.87032986, + "learning_rate": 0.0009791599027031766, + "loss": 0.88131785, + "num_input_tokens_seen": 51408784, + "router_z_loss_mlp": 0.32128906, + "step": 621, + "time_per_iteration": 3.029431104660034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088156, + "balance_loss_mlp": 1.05611241, + "epoch": 0.11966140823393613, + "flos": 680697533952.0, + "grad_norm": 0.0732554324646167, + "language_loss": 0.87112588, + "learning_rate": 0.0009790708027880932, + "loss": 0.88200748, + "num_input_tokens_seen": 51482592, + "router_z_loss_mlp": 0.32055664, + "step": 622, + "time_per_iteration": 2.855576992034912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056461, + "balance_loss_mlp": 1.04444504, + "epoch": 0.11985378991919969, + "flos": 1450268070912.0, + "grad_norm": 0.03732324883573809, + "language_loss": 0.77427292, + "learning_rate": 0.0009789815168812293, + "loss": 0.78483754, + "num_input_tokens_seen": 51712240, + "router_z_loss_mlp": 0.12011719, + "step": 623, + "time_per_iteration": 4.840993165969849 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108671, + "balance_loss_mlp": 1.0551914, + "epoch": 0.12004617160446325, + "flos": 527586780672.0, + "grad_norm": 0.07309096746678648, + "language_loss": 0.94236648, + "learning_rate": 0.0009788920450172487, + "loss": 0.9532336, + "num_input_tokens_seen": 51781440, + "router_z_loss_mlp": 0.31518555, + "step": 624, + "time_per_iteration": 2.6301677227020264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102663, + "balance_loss_mlp": 1.07023823, + "epoch": 0.12023855328972682, + "flos": 473980861440.0, + "grad_norm": 0.15739190650861204, + "language_loss": 0.91515559, + "learning_rate": 0.0009788023872308875, + "loss": 0.92618221, + "num_input_tokens_seen": 51845424, + "router_z_loss_mlp": 0.32421875, + "step": 625, + "time_per_iteration": 2.506446361541748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014454, + "balance_loss_mlp": 1.0033915, + "epoch": 0.12043093497499038, + "flos": 1530954155520.0, + "grad_norm": 0.02216054665264375, + "language_loss": 0.75428998, + "learning_rate": 0.0009787125435569539, + "loss": 0.76443458, + "num_input_tokens_seen": 52076496, + "router_z_loss_mlp": 0.11083984, + "step": 626, + "time_per_iteration": 4.713289260864258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114644, + "balance_loss_mlp": 1.08391225, + "epoch": 0.12062331666025394, + "flos": 539571469824.0, + "grad_norm": 0.0672242080300053, + "language_loss": 0.94766486, + "learning_rate": 0.0009786225140303285, + "loss": 0.95881128, + "num_input_tokens_seen": 52143072, + "router_z_loss_mlp": 0.30761719, + "step": 627, + "time_per_iteration": 2.61875057220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011267, + "balance_loss_mlp": 1.09503818, + "epoch": 0.1208156983455175, + "flos": 511634327040.0, + "grad_norm": 0.06510849521455, + "language_loss": 0.925771, + "learning_rate": 0.0009785322986859634, + "loss": 0.93703806, + "num_input_tokens_seen": 52211888, + "router_z_loss_mlp": 0.31640625, + "step": 628, + "time_per_iteration": 2.6567625999450684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141777, + "balance_loss_mlp": 1.11059177, + "epoch": 0.12100808003078108, + "flos": 596195043840.0, + "grad_norm": 0.06735600063735754, + "language_loss": 0.93719506, + "learning_rate": 0.0009784418975588838, + "loss": 0.94861281, + "num_input_tokens_seen": 52283696, + "router_z_loss_mlp": 0.31152344, + "step": 629, + "time_per_iteration": 2.697376012802124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122983, + "balance_loss_mlp": 1.09222674, + "epoch": 0.12120046171604464, + "flos": 522698019840.0, + "grad_norm": 0.47103484407124013, + "language_loss": 0.93927598, + "learning_rate": 0.0009783513106841862, + "loss": 0.95050573, + "num_input_tokens_seen": 52358624, + "router_z_loss_mlp": 0.30761719, + "step": 630, + "time_per_iteration": 2.7226808071136475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143332, + "balance_loss_mlp": 1.13179243, + "epoch": 0.1213928434013082, + "flos": 1553605277184.0, + "grad_norm": 0.056788624646596834, + "language_loss": 0.76732707, + "learning_rate": 0.00097826053809704, + "loss": 0.77876031, + "num_input_tokens_seen": 52591248, + "router_z_loss_mlp": 0.11523438, + "step": 631, + "time_per_iteration": 4.948111295700073 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228128, + "balance_loss_mlp": 1.19219875, + "epoch": 0.12158522508657175, + "flos": 495143580672.0, + "grad_norm": 0.06834333100250278, + "language_loss": 0.88515621, + "learning_rate": 0.0009781695798326854, + "loss": 0.89743745, + "num_input_tokens_seen": 52659920, + "router_z_loss_mlp": 0.35961914, + "step": 632, + "time_per_iteration": 2.5616555213928223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267845, + "balance_loss_mlp": 1.23050833, + "epoch": 0.12177760677183531, + "flos": 475335433728.0, + "grad_norm": 0.1009303482431908, + "language_loss": 0.88543177, + "learning_rate": 0.0009780784359264365, + "loss": 0.89811015, + "num_input_tokens_seen": 52728832, + "router_z_loss_mlp": 0.37329102, + "step": 633, + "time_per_iteration": 2.597935438156128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01265484, + "balance_loss_mlp": 1.25370574, + "epoch": 0.12196998845709889, + "flos": 1467630351360.0, + "grad_norm": 0.08843071113371018, + "language_loss": 0.74188697, + "learning_rate": 0.0009779871064136778, + "loss": 0.75454181, + "num_input_tokens_seen": 52949776, + "router_z_loss_mlp": 0.11767578, + "step": 634, + "time_per_iteration": 4.768415451049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235432, + "balance_loss_mlp": 1.19976473, + "epoch": 0.12216237014236245, + "flos": 586279309824.0, + "grad_norm": 0.0829698455775257, + "language_loss": 0.88074899, + "learning_rate": 0.000977895591329867, + "loss": 0.89310336, + "num_input_tokens_seen": 53027184, + "router_z_loss_mlp": 0.35668945, + "step": 635, + "time_per_iteration": 2.7918457984924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214994, + "balance_loss_mlp": 1.17720437, + "epoch": 0.12235475182762601, + "flos": 597721324032.0, + "grad_norm": 0.0916527997361875, + "language_loss": 0.87791145, + "learning_rate": 0.000977803890710533, + "loss": 0.89006138, + "num_input_tokens_seen": 53101072, + "router_z_loss_mlp": 0.37792969, + "step": 636, + "time_per_iteration": 2.7248313426971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186705, + "balance_loss_mlp": 1.1509428, + "epoch": 0.12254713351288957, + "flos": 497487550464.0, + "grad_norm": 0.0702522126388857, + "language_loss": 0.93856937, + "learning_rate": 0.0009777120045912774, + "loss": 0.95043641, + "num_input_tokens_seen": 53172992, + "router_z_loss_mlp": 0.35766602, + "step": 637, + "time_per_iteration": 2.6079726219177246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180704, + "balance_loss_mlp": 1.14236617, + "epoch": 0.12273951519815314, + "flos": 605565130752.0, + "grad_norm": 0.06645311005239844, + "language_loss": 0.90599251, + "learning_rate": 0.0009776199330077736, + "loss": 0.91779959, + "num_input_tokens_seen": 53248256, + "router_z_loss_mlp": 0.38330078, + "step": 638, + "time_per_iteration": 2.7671282291412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196025, + "balance_loss_mlp": 1.15940344, + "epoch": 0.1229318968834167, + "flos": 597578729472.0, + "grad_norm": 0.09015200479441979, + "language_loss": 0.93140519, + "learning_rate": 0.0009775276759957667, + "loss": 0.94336545, + "num_input_tokens_seen": 53318960, + "router_z_loss_mlp": 0.36621094, + "step": 639, + "time_per_iteration": 2.6990442276000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179898, + "balance_loss_mlp": 1.14265716, + "epoch": 0.12312427856868026, + "flos": 678082931712.0, + "grad_norm": 0.08188642922116089, + "language_loss": 0.90714514, + "learning_rate": 0.0009774352335910745, + "loss": 0.91894412, + "num_input_tokens_seen": 53389120, + "router_z_loss_mlp": 0.37280273, + "step": 640, + "time_per_iteration": 2.7950265407562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115004, + "balance_loss_mlp": 1.11658967, + "epoch": 0.12331666025394382, + "flos": 608656978944.0, + "grad_norm": 0.07361380744806716, + "language_loss": 0.95549798, + "learning_rate": 0.000977342605829586, + "loss": 0.96699834, + "num_input_tokens_seen": 53459056, + "router_z_loss_mlp": 0.3347168, + "step": 641, + "time_per_iteration": 2.6966538429260254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140018, + "balance_loss_mlp": 1.10497069, + "epoch": 0.12350904193920739, + "flos": 762172194816.0, + "grad_norm": 0.08211004604029591, + "language_loss": 0.86708105, + "learning_rate": 0.0009772497927472623, + "loss": 0.87848121, + "num_input_tokens_seen": 53541552, + "router_z_loss_mlp": 0.35083008, + "step": 642, + "time_per_iteration": 3.050595998764038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121507, + "balance_loss_mlp": 1.0852437, + "epoch": 0.12370142362447095, + "flos": 540699079680.0, + "grad_norm": 0.0716743258864478, + "language_loss": 0.85363436, + "learning_rate": 0.0009771567943801368, + "loss": 0.86484945, + "num_input_tokens_seen": 53611520, + "router_z_loss_mlp": 0.36254883, + "step": 643, + "time_per_iteration": 2.627019166946411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112067, + "balance_loss_mlp": 1.07744884, + "epoch": 0.12389380530973451, + "flos": 547848852480.0, + "grad_norm": 0.06992166814052157, + "language_loss": 0.89936745, + "learning_rate": 0.0009770636107643152, + "loss": 0.91048813, + "num_input_tokens_seen": 53683888, + "router_z_loss_mlp": 0.34643555, + "step": 644, + "time_per_iteration": 2.696233034133911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102963, + "balance_loss_mlp": 1.06846356, + "epoch": 0.12408618699499807, + "flos": 540048715776.0, + "grad_norm": 0.06268128655507912, + "language_loss": 0.88181639, + "learning_rate": 0.0009769702419359738, + "loss": 0.89284605, + "num_input_tokens_seen": 53751888, + "router_z_loss_mlp": 0.3449707, + "step": 645, + "time_per_iteration": 2.61401104927063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116268, + "balance_loss_mlp": 1.0810535, + "epoch": 0.12427856868026164, + "flos": 745451513856.0, + "grad_norm": 0.07610574883038115, + "language_loss": 0.89730537, + "learning_rate": 0.000976876687931362, + "loss": 0.90846807, + "num_input_tokens_seen": 53827648, + "router_z_loss_mlp": 0.35229492, + "step": 646, + "time_per_iteration": 2.999408721923828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131315, + "balance_loss_mlp": 1.09622002, + "epoch": 0.1244709503655252, + "flos": 533460556800.0, + "grad_norm": 0.19449531308307466, + "language_loss": 0.85410094, + "learning_rate": 0.0009767829487868005, + "loss": 0.86541414, + "num_input_tokens_seen": 53896400, + "router_z_loss_mlp": 0.35107422, + "step": 647, + "time_per_iteration": 2.617666721343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138117, + "balance_loss_mlp": 1.10159075, + "epoch": 0.12466333205078876, + "flos": 507847034880.0, + "grad_norm": 0.07509451505155453, + "language_loss": 0.89358151, + "learning_rate": 0.000976689024538682, + "loss": 0.90496266, + "num_input_tokens_seen": 53965904, + "router_z_loss_mlp": 0.36499023, + "step": 648, + "time_per_iteration": 2.5929009914398193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138062, + "balance_loss_mlp": 1.10110736, + "epoch": 0.12485571373605232, + "flos": 681023420928.0, + "grad_norm": 0.07057439208121223, + "language_loss": 0.87662494, + "learning_rate": 0.0009765949152234716, + "loss": 0.8880055, + "num_input_tokens_seen": 54049792, + "router_z_loss_mlp": 0.36962891, + "step": 649, + "time_per_iteration": 2.874701976776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147728, + "balance_loss_mlp": 1.13504386, + "epoch": 0.1250480954213159, + "flos": 1329402668544.0, + "grad_norm": 0.04527818124304351, + "language_loss": 0.78686082, + "learning_rate": 0.0009765006208777055, + "loss": 0.79833812, + "num_input_tokens_seen": 54262432, + "router_z_loss_mlp": 0.12695312, + "step": 650, + "time_per_iteration": 4.680933713912964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138039, + "balance_loss_mlp": 1.10287213, + "epoch": 0.12524047710657946, + "flos": 938140683264.0, + "grad_norm": 0.08375968037938068, + "language_loss": 0.82443976, + "learning_rate": 0.0009764061415379919, + "loss": 0.83582014, + "num_input_tokens_seen": 54351568, + "router_z_loss_mlp": 0.35205078, + "step": 651, + "time_per_iteration": 3.2550604343414307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135369, + "balance_loss_mlp": 1.09774697, + "epoch": 0.12543285879184302, + "flos": 513642235392.0, + "grad_norm": 0.07146085627000143, + "language_loss": 0.89363486, + "learning_rate": 0.0009763114772410109, + "loss": 0.90498853, + "num_input_tokens_seen": 54418944, + "router_z_loss_mlp": 0.3762207, + "step": 652, + "time_per_iteration": 2.5937142372131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139745, + "balance_loss_mlp": 1.10419679, + "epoch": 0.12562524047710658, + "flos": 717991617024.0, + "grad_norm": 0.07913079577836896, + "language_loss": 0.87230957, + "learning_rate": 0.0009762166280235146, + "loss": 0.88370705, + "num_input_tokens_seen": 54495312, + "router_z_loss_mlp": 0.35571289, + "step": 653, + "time_per_iteration": 2.96162748336792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147653, + "balance_loss_mlp": 1.10974443, + "epoch": 0.12581762216237014, + "flos": 563441923584.0, + "grad_norm": 0.06492259826928527, + "language_loss": 0.87890899, + "learning_rate": 0.0009761215939223267, + "loss": 0.89038551, + "num_input_tokens_seen": 54566832, + "router_z_loss_mlp": 0.37890625, + "step": 654, + "time_per_iteration": 2.714641809463501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145182, + "balance_loss_mlp": 1.1077261, + "epoch": 0.1260100038476337, + "flos": 481642785792.0, + "grad_norm": 0.07920721431290144, + "language_loss": 0.86875665, + "learning_rate": 0.0009760263749743428, + "loss": 0.88020849, + "num_input_tokens_seen": 54632128, + "router_z_loss_mlp": 0.37426758, + "step": 655, + "time_per_iteration": 2.547499179840088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145343, + "balance_loss_mlp": 1.11074805, + "epoch": 0.12620238553289725, + "flos": 575269461504.0, + "grad_norm": 0.06357383816141966, + "language_loss": 0.90176344, + "learning_rate": 0.0009759309712165299, + "loss": 0.91321695, + "num_input_tokens_seen": 54707600, + "router_z_loss_mlp": 0.34570312, + "step": 656, + "time_per_iteration": 2.693922996520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137509, + "balance_loss_mlp": 1.103248, + "epoch": 0.12639476721816084, + "flos": 530909973504.0, + "grad_norm": 0.07169490366111804, + "language_loss": 0.93258119, + "learning_rate": 0.0009758353826859272, + "loss": 0.94395626, + "num_input_tokens_seen": 54776704, + "router_z_loss_mlp": 0.34277344, + "step": 657, + "time_per_iteration": 2.5744612216949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139269, + "balance_loss_mlp": 1.10314822, + "epoch": 0.1265871489034244, + "flos": 689654393856.0, + "grad_norm": 0.06860158128637554, + "language_loss": 0.89679217, + "learning_rate": 0.0009757396094196456, + "loss": 0.90818477, + "num_input_tokens_seen": 54851744, + "router_z_loss_mlp": 0.36132812, + "step": 658, + "time_per_iteration": 2.851700782775879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143308, + "balance_loss_mlp": 1.10675859, + "epoch": 0.12677953058868796, + "flos": 536863735296.0, + "grad_norm": 0.0696485175834739, + "language_loss": 0.84555894, + "learning_rate": 0.0009756436514548673, + "loss": 0.85699201, + "num_input_tokens_seen": 54932576, + "router_z_loss_mlp": 0.36523438, + "step": 659, + "time_per_iteration": 2.7971351146698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122244, + "balance_loss_mlp": 1.08800757, + "epoch": 0.12697191227395152, + "flos": 518749194240.0, + "grad_norm": 0.05327633329409036, + "language_loss": 0.88343394, + "learning_rate": 0.0009755475088288466, + "loss": 0.89465636, + "num_input_tokens_seen": 55007296, + "router_z_loss_mlp": 0.34228516, + "step": 660, + "time_per_iteration": 2.670555353164673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103127, + "balance_loss_mlp": 1.06903291, + "epoch": 0.12716429395921508, + "flos": 566341714944.0, + "grad_norm": 0.06801254087798507, + "language_loss": 0.90210187, + "learning_rate": 0.0009754511815789095, + "loss": 0.91313314, + "num_input_tokens_seen": 55079312, + "router_z_loss_mlp": 0.34106445, + "step": 661, + "time_per_iteration": 2.748224973678589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102552, + "balance_loss_mlp": 1.06798172, + "epoch": 0.12735667564447864, + "flos": 513844466688.0, + "grad_norm": 0.06975204014846512, + "language_loss": 0.86245489, + "learning_rate": 0.0009753546697424533, + "loss": 0.87348044, + "num_input_tokens_seen": 55151824, + "router_z_loss_mlp": 0.34594727, + "step": 662, + "time_per_iteration": 2.664799213409424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092108, + "balance_loss_mlp": 1.05863369, + "epoch": 0.1275490573297422, + "flos": 541023556608.0, + "grad_norm": 0.05485824904298714, + "language_loss": 0.90572149, + "learning_rate": 0.0009752579733569475, + "loss": 0.91664255, + "num_input_tokens_seen": 55224368, + "router_z_loss_mlp": 0.3347168, + "step": 663, + "time_per_iteration": 2.679975748062134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267369, + "balance_loss_mlp": 1.2515384, + "epoch": 0.12774143901500576, + "flos": 1557872787456.0, + "grad_norm": 0.0685532780556388, + "language_loss": 0.74881387, + "learning_rate": 0.0009751610924599328, + "loss": 0.7614876, + "num_input_tokens_seen": 55453584, + "router_z_loss_mlp": 0.15820312, + "step": 664, + "time_per_iteration": 4.938101053237915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096151, + "balance_loss_mlp": 1.06177139, + "epoch": 0.12793382070026935, + "flos": 613462781952.0, + "grad_norm": 0.06920677464457729, + "language_loss": 0.90523887, + "learning_rate": 0.0009750640270890217, + "loss": 0.9162004, + "num_input_tokens_seen": 55528000, + "router_z_loss_mlp": 0.34375, + "step": 665, + "time_per_iteration": 2.6939845085144043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099083, + "balance_loss_mlp": 1.06563258, + "epoch": 0.1281262023855329, + "flos": 707386231296.0, + "grad_norm": 0.06773970450457005, + "language_loss": 0.96531481, + "learning_rate": 0.0009749667772818983, + "loss": 0.9763056, + "num_input_tokens_seen": 55612416, + "router_z_loss_mlp": 0.33447266, + "step": 666, + "time_per_iteration": 2.967853307723999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164762, + "balance_loss_mlp": 1.15131497, + "epoch": 0.12831858407079647, + "flos": 1424250086400.0, + "grad_norm": 0.045177828452490555, + "language_loss": 0.76935941, + "learning_rate": 0.0009748693430763185, + "loss": 0.78100705, + "num_input_tokens_seen": 55843664, + "router_z_loss_mlp": 0.13476562, + "step": 667, + "time_per_iteration": 4.85069465637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093582, + "balance_loss_mlp": 1.05958366, + "epoch": 0.12851096575606002, + "flos": 448869316608.0, + "grad_norm": 0.07778909975942494, + "language_loss": 0.95426726, + "learning_rate": 0.0009747717245101093, + "loss": 0.96520311, + "num_input_tokens_seen": 55909072, + "router_z_loss_mlp": 0.34008789, + "step": 668, + "time_per_iteration": 2.5234692096710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098998, + "balance_loss_mlp": 1.06519032, + "epoch": 0.12870334744132358, + "flos": 479697486336.0, + "grad_norm": 0.05465485885236262, + "language_loss": 0.84969366, + "learning_rate": 0.00097467392162117, + "loss": 0.86068368, + "num_input_tokens_seen": 55978544, + "router_z_loss_mlp": 0.33789062, + "step": 669, + "time_per_iteration": 2.601684808731079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096385, + "balance_loss_mlp": 1.06341171, + "epoch": 0.12889572912658714, + "flos": 638633963520.0, + "grad_norm": 0.05954757179165737, + "language_loss": 0.91292465, + "learning_rate": 0.0009745759344474708, + "loss": 0.92388856, + "num_input_tokens_seen": 56054144, + "router_z_loss_mlp": 0.32983398, + "step": 670, + "time_per_iteration": 2.8225347995758057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098806, + "balance_loss_mlp": 1.06411648, + "epoch": 0.1290881108118507, + "flos": 509693409792.0, + "grad_norm": 0.06976130099981656, + "language_loss": 0.89229816, + "learning_rate": 0.0009744777630270536, + "loss": 0.90328622, + "num_input_tokens_seen": 56120960, + "router_z_loss_mlp": 0.34692383, + "step": 671, + "time_per_iteration": 2.633571147918701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109875, + "balance_loss_mlp": 1.07435024, + "epoch": 0.12928049249711426, + "flos": 670746894336.0, + "grad_norm": 0.08011077975608555, + "language_loss": 0.93749923, + "learning_rate": 0.000974379407398032, + "loss": 0.94859791, + "num_input_tokens_seen": 56202560, + "router_z_loss_mlp": 0.35546875, + "step": 672, + "time_per_iteration": 2.875609874725342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093721, + "balance_loss_mlp": 1.06065273, + "epoch": 0.12947287418237785, + "flos": 793158925824.0, + "grad_norm": 0.05850057523774312, + "language_loss": 0.82016242, + "learning_rate": 0.0009742808675985913, + "loss": 0.83109969, + "num_input_tokens_seen": 56289456, + "router_z_loss_mlp": 0.33056641, + "step": 673, + "time_per_iteration": 3.087738275527954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101029, + "balance_loss_mlp": 1.0646224, + "epoch": 0.1296652558676414, + "flos": 485222054400.0, + "grad_norm": 0.08954381825883409, + "language_loss": 0.9153564, + "learning_rate": 0.0009741821436669876, + "loss": 0.92636657, + "num_input_tokens_seen": 56354480, + "router_z_loss_mlp": 0.36450195, + "step": 674, + "time_per_iteration": 2.539849281311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101326, + "balance_loss_mlp": 1.06673169, + "epoch": 0.12985763755290497, + "flos": 453226987008.0, + "grad_norm": 0.0648114016490977, + "language_loss": 0.9288274, + "learning_rate": 0.0009740832356415492, + "loss": 0.93984067, + "num_input_tokens_seen": 56418944, + "router_z_loss_mlp": 0.34619141, + "step": 675, + "time_per_iteration": 2.467801094055176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097673, + "balance_loss_mlp": 1.06315041, + "epoch": 0.13005001923816853, + "flos": 824719007232.0, + "grad_norm": 0.0735546441878898, + "language_loss": 0.8857609, + "learning_rate": 0.0009739841435606756, + "loss": 0.89673769, + "num_input_tokens_seen": 56492368, + "router_z_loss_mlp": 0.34545898, + "step": 676, + "time_per_iteration": 3.008781909942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109741, + "balance_loss_mlp": 1.06457949, + "epoch": 0.1302424009234321, + "flos": 531107822592.0, + "grad_norm": 0.07312926894822828, + "language_loss": 0.90675485, + "learning_rate": 0.0009738848674628377, + "loss": 0.9177289, + "num_input_tokens_seen": 56568128, + "router_z_loss_mlp": 0.328125, + "step": 677, + "time_per_iteration": 2.695338010787964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104607, + "balance_loss_mlp": 1.06955981, + "epoch": 0.13043478260869565, + "flos": 525626924544.0, + "grad_norm": 0.06033597827839572, + "language_loss": 0.89643902, + "learning_rate": 0.000973785407386578, + "loss": 0.90748513, + "num_input_tokens_seen": 56646448, + "router_z_loss_mlp": 0.35058594, + "step": 678, + "time_per_iteration": 2.7727599143981934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101976, + "balance_loss_mlp": 1.06714272, + "epoch": 0.1306271642939592, + "flos": 625862108160.0, + "grad_norm": 0.05570081952525763, + "language_loss": 0.87361526, + "learning_rate": 0.0009736857633705103, + "loss": 0.88463503, + "num_input_tokens_seen": 56732080, + "router_z_loss_mlp": 0.34814453, + "step": 679, + "time_per_iteration": 2.843129873275757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110176, + "balance_loss_mlp": 1.06630766, + "epoch": 0.13081954597922277, + "flos": 550438723584.0, + "grad_norm": 0.06405817655948583, + "language_loss": 0.93204647, + "learning_rate": 0.0009735859354533196, + "loss": 0.94306409, + "num_input_tokens_seen": 56804432, + "router_z_loss_mlp": 0.35473633, + "step": 680, + "time_per_iteration": 2.7122464179992676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093118, + "balance_loss_mlp": 1.05914354, + "epoch": 0.13101192766448633, + "flos": 536651329536.0, + "grad_norm": 0.06779912020183775, + "language_loss": 0.91948998, + "learning_rate": 0.0009734859236737628, + "loss": 0.93042123, + "num_input_tokens_seen": 56872512, + "router_z_loss_mlp": 0.33984375, + "step": 681, + "time_per_iteration": 2.594881296157837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093326, + "balance_loss_mlp": 1.0593034, + "epoch": 0.13120430934974991, + "flos": 503258019840.0, + "grad_norm": 0.06413082246497326, + "language_loss": 0.93904501, + "learning_rate": 0.0009733857280706678, + "loss": 0.94997829, + "num_input_tokens_seen": 56940928, + "router_z_loss_mlp": 0.34033203, + "step": 682, + "time_per_iteration": 2.5831425189971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010929, + "balance_loss_mlp": 1.05992687, + "epoch": 0.13139669103501347, + "flos": 614014221312.0, + "grad_norm": 0.06246118190021366, + "language_loss": 0.85051745, + "learning_rate": 0.000973285348682934, + "loss": 0.86144638, + "num_input_tokens_seen": 57012736, + "router_z_loss_mlp": 0.33007812, + "step": 683, + "time_per_iteration": 2.7236225605010986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226892, + "balance_loss_mlp": 1.21096563, + "epoch": 0.13158907272027703, + "flos": 1484163357696.0, + "grad_norm": 0.08359566880013784, + "language_loss": 0.77898371, + "learning_rate": 0.0009731847855495323, + "loss": 0.79125261, + "num_input_tokens_seen": 57243136, + "router_z_loss_mlp": 0.15917969, + "step": 684, + "time_per_iteration": 4.87854790687561 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087932, + "balance_loss_mlp": 1.05488706, + "epoch": 0.1317814544055406, + "flos": 985049344512.0, + "grad_norm": 0.07039095593234826, + "language_loss": 0.85449159, + "learning_rate": 0.0009730840387095046, + "loss": 0.86537099, + "num_input_tokens_seen": 57336160, + "router_z_loss_mlp": 0.33056641, + "step": 685, + "time_per_iteration": 3.30759596824646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096943, + "balance_loss_mlp": 1.06156158, + "epoch": 0.13197383609080415, + "flos": 611163892224.0, + "grad_norm": 0.05759402546544749, + "language_loss": 0.912597, + "learning_rate": 0.0009729831082019642, + "loss": 0.92356646, + "num_input_tokens_seen": 57418976, + "router_z_loss_mlp": 0.35351562, + "step": 686, + "time_per_iteration": 2.7965087890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093388, + "balance_loss_mlp": 1.0589608, + "epoch": 0.1321662177760677, + "flos": 494116305408.0, + "grad_norm": 0.058033147986452156, + "language_loss": 0.89668858, + "learning_rate": 0.0009728819940660958, + "loss": 0.90762246, + "num_input_tokens_seen": 57490288, + "router_z_loss_mlp": 0.34399414, + "step": 687, + "time_per_iteration": 2.7347469329833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110102, + "balance_loss_mlp": 1.0653528, + "epoch": 0.13235859946133127, + "flos": 495591713280.0, + "grad_norm": 0.07548862234195632, + "language_loss": 0.86088693, + "learning_rate": 0.0009727806963411557, + "loss": 0.87189722, + "num_input_tokens_seen": 57556064, + "router_z_loss_mlp": 0.35668945, + "step": 688, + "time_per_iteration": 2.621638774871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098222, + "balance_loss_mlp": 1.06279302, + "epoch": 0.13255098114659483, + "flos": 511417539072.0, + "grad_norm": 0.08656773393569435, + "language_loss": 0.88000298, + "learning_rate": 0.000972679215066471, + "loss": 0.89098513, + "num_input_tokens_seen": 57627248, + "router_z_loss_mlp": 0.35449219, + "step": 689, + "time_per_iteration": 2.6806418895721436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103388, + "balance_loss_mlp": 1.06900764, + "epoch": 0.13274336283185842, + "flos": 547114120704.0, + "grad_norm": 0.07064056682134613, + "language_loss": 0.99675226, + "learning_rate": 0.0009725775502814401, + "loss": 1.00778604, + "num_input_tokens_seen": 57694832, + "router_z_loss_mlp": 0.34350586, + "step": 690, + "time_per_iteration": 2.607179641723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121046, + "balance_loss_mlp": 1.08397222, + "epoch": 0.13293574451712198, + "flos": 640465781760.0, + "grad_norm": 0.08777481913975324, + "language_loss": 0.85673726, + "learning_rate": 0.0009724757020255327, + "loss": 0.86794776, + "num_input_tokens_seen": 57771776, + "router_z_loss_mlp": 0.37084961, + "step": 691, + "time_per_iteration": 2.81113338470459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111244, + "balance_loss_mlp": 1.07726967, + "epoch": 0.13312812620238554, + "flos": 491234042880.0, + "grad_norm": 0.09165524457583717, + "language_loss": 0.87811983, + "learning_rate": 0.0009723736703382902, + "loss": 0.88923222, + "num_input_tokens_seen": 57836272, + "router_z_loss_mlp": 0.33984375, + "step": 692, + "time_per_iteration": 2.548689603805542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110186, + "balance_loss_mlp": 1.0692203, + "epoch": 0.1333205078876491, + "flos": 508693837824.0, + "grad_norm": 0.061462060991887495, + "language_loss": 0.83746743, + "learning_rate": 0.0009722714552593244, + "loss": 0.84848601, + "num_input_tokens_seen": 57907232, + "router_z_loss_mlp": 0.32641602, + "step": 693, + "time_per_iteration": 2.6584513187408447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099112, + "balance_loss_mlp": 1.06358743, + "epoch": 0.13351288957291266, + "flos": 418474722816.0, + "grad_norm": 0.07144638741394425, + "language_loss": 0.94810003, + "learning_rate": 0.000972169056828319, + "loss": 0.95909119, + "num_input_tokens_seen": 57969808, + "router_z_loss_mlp": 0.35522461, + "step": 694, + "time_per_iteration": 2.461437702178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100772, + "balance_loss_mlp": 1.06751275, + "epoch": 0.13370527125817622, + "flos": 615614694912.0, + "grad_norm": 0.05672506947017021, + "language_loss": 0.87834966, + "learning_rate": 0.0009720664750850283, + "loss": 0.88935745, + "num_input_tokens_seen": 58042944, + "router_z_loss_mlp": 0.33251953, + "step": 695, + "time_per_iteration": 2.7716193199157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103519, + "balance_loss_mlp": 1.07085609, + "epoch": 0.13389765294343978, + "flos": 625757391360.0, + "grad_norm": 0.07304651625724701, + "language_loss": 0.93482703, + "learning_rate": 0.0009719637100692784, + "loss": 0.94586229, + "num_input_tokens_seen": 58116080, + "router_z_loss_mlp": 0.32666016, + "step": 696, + "time_per_iteration": 2.7741310596466064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111476, + "balance_loss_mlp": 1.08090401, + "epoch": 0.13409003462870334, + "flos": 609391710720.0, + "grad_norm": 0.06235589965882817, + "language_loss": 0.83759153, + "learning_rate": 0.0009718607618209661, + "loss": 0.84873915, + "num_input_tokens_seen": 58197616, + "router_z_loss_mlp": 0.33862305, + "step": 697, + "time_per_iteration": 2.869180202484131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128671, + "balance_loss_mlp": 1.09488726, + "epoch": 0.13428241631396692, + "flos": 683499810816.0, + "grad_norm": 0.0709058406100417, + "language_loss": 0.88053036, + "learning_rate": 0.0009717576303800595, + "loss": 0.89181709, + "num_input_tokens_seen": 58280480, + "router_z_loss_mlp": 0.33789062, + "step": 698, + "time_per_iteration": 3.007253408432007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122193, + "balance_loss_mlp": 1.08716917, + "epoch": 0.13447479799923048, + "flos": 508565799936.0, + "grad_norm": 0.07060238478807088, + "language_loss": 0.86057615, + "learning_rate": 0.0009716543157865975, + "loss": 0.87179804, + "num_input_tokens_seen": 58352464, + "router_z_loss_mlp": 0.35083008, + "step": 699, + "time_per_iteration": 2.6622114181518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112811, + "balance_loss_mlp": 1.07812154, + "epoch": 0.13466717968449404, + "flos": 897124737024.0, + "grad_norm": 0.06896685381510245, + "language_loss": 0.84149206, + "learning_rate": 0.0009715508180806907, + "loss": 0.85262012, + "num_input_tokens_seen": 58437216, + "router_z_loss_mlp": 0.34716797, + "step": 700, + "time_per_iteration": 3.175494909286499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106112, + "balance_loss_mlp": 1.07054055, + "epoch": 0.1348595613697576, + "flos": 989501557248.0, + "grad_norm": 0.07388845252403331, + "language_loss": 0.90260321, + "learning_rate": 0.0009714471373025202, + "loss": 0.91366434, + "num_input_tokens_seen": 58533152, + "router_z_loss_mlp": 0.35546875, + "step": 701, + "time_per_iteration": 3.3912835121154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090254, + "balance_loss_mlp": 1.05499172, + "epoch": 0.13505194305502116, + "flos": 487580580864.0, + "grad_norm": 0.07959074518459132, + "language_loss": 0.89355272, + "learning_rate": 0.0009713432734923386, + "loss": 0.9044553, + "num_input_tokens_seen": 58601376, + "router_z_loss_mlp": 0.35253906, + "step": 702, + "time_per_iteration": 2.6718733310699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090728, + "balance_loss_mlp": 1.05572796, + "epoch": 0.13524432474028472, + "flos": 613103399424.0, + "grad_norm": 0.06387437846302528, + "language_loss": 0.875036, + "learning_rate": 0.0009712392266904696, + "loss": 0.88594317, + "num_input_tokens_seen": 58676608, + "router_z_loss_mlp": 0.34985352, + "step": 703, + "time_per_iteration": 2.6985831260681152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010863, + "balance_loss_mlp": 1.0524683, + "epoch": 0.13543670642554828, + "flos": 904425868800.0, + "grad_norm": 0.06666466963859687, + "language_loss": 0.86250496, + "learning_rate": 0.0009711349969373076, + "loss": 0.87336791, + "num_input_tokens_seen": 58759264, + "router_z_loss_mlp": 0.33862305, + "step": 704, + "time_per_iteration": 3.1328465938568115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095762, + "balance_loss_mlp": 1.0610956, + "epoch": 0.13562908811081184, + "flos": 550335416832.0, + "grad_norm": 0.0628446006314887, + "language_loss": 0.80944061, + "learning_rate": 0.0009710305842733178, + "loss": 0.82039821, + "num_input_tokens_seen": 58834800, + "router_z_loss_mlp": 0.34667969, + "step": 705, + "time_per_iteration": 2.7668187618255615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093976, + "balance_loss_mlp": 1.06147909, + "epoch": 0.1358214697960754, + "flos": 507797572608.0, + "grad_norm": 0.06635154625105166, + "language_loss": 0.90133065, + "learning_rate": 0.0009709259887390373, + "loss": 0.91227043, + "num_input_tokens_seen": 58901712, + "router_z_loss_mlp": 0.32519531, + "step": 706, + "time_per_iteration": 2.656233072280884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096924, + "balance_loss_mlp": 1.06390333, + "epoch": 0.136013851481339, + "flos": 528640197120.0, + "grad_norm": 0.09290535615143355, + "language_loss": 0.91425377, + "learning_rate": 0.0009708212103750737, + "loss": 0.92522299, + "num_input_tokens_seen": 58967824, + "router_z_loss_mlp": 0.33007812, + "step": 707, + "time_per_iteration": 2.569655656814575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101147, + "balance_loss_mlp": 1.06812644, + "epoch": 0.13620623316660255, + "flos": 658772379648.0, + "grad_norm": 0.06731423560591156, + "language_loss": 0.87756282, + "learning_rate": 0.0009707162492221051, + "loss": 0.88857424, + "num_input_tokens_seen": 59045040, + "router_z_loss_mlp": 0.33007812, + "step": 708, + "time_per_iteration": 2.880669593811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103707, + "balance_loss_mlp": 1.07009029, + "epoch": 0.1363986148518661, + "flos": 671583522816.0, + "grad_norm": 0.07312175328849302, + "language_loss": 0.88322687, + "learning_rate": 0.0009706111053208815, + "loss": 0.89426386, + "num_input_tokens_seen": 59117216, + "router_z_loss_mlp": 0.33642578, + "step": 709, + "time_per_iteration": 2.7878787517547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097257, + "balance_loss_mlp": 1.06342554, + "epoch": 0.13659099653712967, + "flos": 472828520448.0, + "grad_norm": 0.06741688104713542, + "language_loss": 0.86067665, + "learning_rate": 0.0009705057787122232, + "loss": 0.87164921, + "num_input_tokens_seen": 59183056, + "router_z_loss_mlp": 0.33862305, + "step": 710, + "time_per_iteration": 2.528298854827881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105446, + "balance_loss_mlp": 1.07190061, + "epoch": 0.13678337822239323, + "flos": 452483490816.0, + "grad_norm": 0.05706590332145298, + "language_loss": 0.91653168, + "learning_rate": 0.0009704002694370216, + "loss": 0.92758614, + "num_input_tokens_seen": 59247312, + "router_z_loss_mlp": 0.33569336, + "step": 711, + "time_per_iteration": 2.5201761722564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114394, + "balance_loss_mlp": 1.0794661, + "epoch": 0.13697575990765679, + "flos": 519373416960.0, + "grad_norm": 0.06387130477766731, + "language_loss": 0.86892813, + "learning_rate": 0.0009702945775362388, + "loss": 0.88007212, + "num_input_tokens_seen": 59317968, + "router_z_loss_mlp": 0.34960938, + "step": 712, + "time_per_iteration": 2.661848783493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130562, + "balance_loss_mlp": 1.0947994, + "epoch": 0.13716814159292035, + "flos": 480145618944.0, + "grad_norm": 0.06038249383316015, + "language_loss": 0.87339497, + "learning_rate": 0.0009701887030509086, + "loss": 0.8847006, + "num_input_tokens_seen": 59387936, + "router_z_loss_mlp": 0.35766602, + "step": 713, + "time_per_iteration": 2.6068434715270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125148, + "balance_loss_mlp": 1.0908401, + "epoch": 0.1373605232781839, + "flos": 545376844800.0, + "grad_norm": 0.06924339631343991, + "language_loss": 0.92127877, + "learning_rate": 0.0009700826460221346, + "loss": 0.93253028, + "num_input_tokens_seen": 59460624, + "router_z_loss_mlp": 0.34301758, + "step": 714, + "time_per_iteration": 2.653224468231201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145818, + "balance_loss_mlp": 1.11050797, + "epoch": 0.1375529049634475, + "flos": 708473143296.0, + "grad_norm": 0.0682346884445605, + "language_loss": 0.93435562, + "learning_rate": 0.0009699764064910921, + "loss": 0.94581378, + "num_input_tokens_seen": 59536752, + "router_z_loss_mlp": 0.35302734, + "step": 715, + "time_per_iteration": 2.878445625305176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130305, + "balance_loss_mlp": 1.09542441, + "epoch": 0.13774528664871105, + "flos": 486452971008.0, + "grad_norm": 0.07091873756636237, + "language_loss": 0.87931371, + "learning_rate": 0.0009698699844990268, + "loss": 0.89061677, + "num_input_tokens_seen": 59608128, + "router_z_loss_mlp": 0.34863281, + "step": 716, + "time_per_iteration": 2.6278092861175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124133, + "balance_loss_mlp": 1.09070659, + "epoch": 0.1379376683339746, + "flos": 679885636608.0, + "grad_norm": 0.0686032560828043, + "language_loss": 0.88731855, + "learning_rate": 0.0009697633800872555, + "loss": 0.89855987, + "num_input_tokens_seen": 59685120, + "router_z_loss_mlp": 0.33422852, + "step": 717, + "time_per_iteration": 2.888576030731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112997, + "balance_loss_mlp": 1.07825947, + "epoch": 0.13813005001923817, + "flos": 610628419584.0, + "grad_norm": 0.07907714555147631, + "language_loss": 0.9128629, + "learning_rate": 0.0009696565932971655, + "loss": 0.92399287, + "num_input_tokens_seen": 59763376, + "router_z_loss_mlp": 0.34741211, + "step": 718, + "time_per_iteration": 2.8937225341796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110249, + "balance_loss_mlp": 1.06837237, + "epoch": 0.13832243170450173, + "flos": 588431222784.0, + "grad_norm": 0.05947825646897862, + "language_loss": 0.9001984, + "learning_rate": 0.0009695496241702153, + "loss": 0.91122329, + "num_input_tokens_seen": 59836800, + "router_z_loss_mlp": 0.34155273, + "step": 719, + "time_per_iteration": 2.791111469268799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094313, + "balance_loss_mlp": 1.06093454, + "epoch": 0.1385148133897653, + "flos": 699674844672.0, + "grad_norm": 0.07440757355955382, + "language_loss": 0.86308432, + "learning_rate": 0.0009694424727479339, + "loss": 0.87402749, + "num_input_tokens_seen": 59914720, + "router_z_loss_mlp": 0.33398438, + "step": 720, + "time_per_iteration": 2.8781325817108154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088445, + "balance_loss_mlp": 1.05475688, + "epoch": 0.13870719507502885, + "flos": 597977399808.0, + "grad_norm": 0.059872525751604476, + "language_loss": 0.90073895, + "learning_rate": 0.0009693351390719213, + "loss": 0.91162348, + "num_input_tokens_seen": 59984544, + "router_z_loss_mlp": 0.3371582, + "step": 721, + "time_per_iteration": 2.691493272781372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095999, + "balance_loss_mlp": 1.06242967, + "epoch": 0.1388995767602924, + "flos": 586279309824.0, + "grad_norm": 0.07792099406652078, + "language_loss": 0.91640067, + "learning_rate": 0.000969227623183848, + "loss": 0.92736065, + "num_input_tokens_seen": 60057056, + "router_z_loss_mlp": 0.33569336, + "step": 722, + "time_per_iteration": 2.768209218978882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086676, + "balance_loss_mlp": 1.05475235, + "epoch": 0.139091958445556, + "flos": 650810709504.0, + "grad_norm": 0.07717859695455091, + "language_loss": 0.91485119, + "learning_rate": 0.0009691199251254554, + "loss": 0.92571795, + "num_input_tokens_seen": 60133232, + "router_z_loss_mlp": 0.3190918, + "step": 723, + "time_per_iteration": 2.813594102859497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093708, + "balance_loss_mlp": 1.06159282, + "epoch": 0.13928434013081956, + "flos": 575446961664.0, + "grad_norm": 0.06414169604653322, + "language_loss": 0.8718468, + "learning_rate": 0.0009690120449385555, + "loss": 0.88278389, + "num_input_tokens_seen": 60207104, + "router_z_loss_mlp": 0.32104492, + "step": 724, + "time_per_iteration": 2.732372999191284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110008, + "balance_loss_mlp": 1.06574821, + "epoch": 0.13947672181608312, + "flos": 562954503168.0, + "grad_norm": 0.07538454681544235, + "language_loss": 0.93399024, + "learning_rate": 0.0009689039826650312, + "loss": 0.94499099, + "num_input_tokens_seen": 60277920, + "router_z_loss_mlp": 0.34375, + "step": 725, + "time_per_iteration": 2.769481658935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111743, + "balance_loss_mlp": 1.09967864, + "epoch": 0.13966910350134668, + "flos": 1520699387904.0, + "grad_norm": 0.042030956775344956, + "language_loss": 0.76523066, + "learning_rate": 0.000968795738346836, + "loss": 0.77634799, + "num_input_tokens_seen": 60494224, + "router_z_loss_mlp": 0.12060547, + "step": 726, + "time_per_iteration": 4.903716802597046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101674, + "balance_loss_mlp": 1.06619751, + "epoch": 0.13986148518661023, + "flos": 499604557824.0, + "grad_norm": 0.07361028590256702, + "language_loss": 0.88265646, + "learning_rate": 0.0009686873120259941, + "loss": 0.89367324, + "num_input_tokens_seen": 60562176, + "router_z_loss_mlp": 0.35522461, + "step": 727, + "time_per_iteration": 2.639673948287964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099007, + "balance_loss_mlp": 1.06612897, + "epoch": 0.1400538668718738, + "flos": 598381862400.0, + "grad_norm": 0.053177263225715844, + "language_loss": 0.87612498, + "learning_rate": 0.0009685787037446004, + "loss": 0.88711506, + "num_input_tokens_seen": 60631472, + "router_z_loss_mlp": 0.32885742, + "step": 728, + "time_per_iteration": 2.7457332611083984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095755, + "balance_loss_mlp": 1.06135106, + "epoch": 0.14024624855713735, + "flos": 593757941760.0, + "grad_norm": 0.0730266030670127, + "language_loss": 0.88032103, + "learning_rate": 0.0009684699135448201, + "loss": 0.89127851, + "num_input_tokens_seen": 60703488, + "router_z_loss_mlp": 0.34423828, + "step": 729, + "time_per_iteration": 2.6995558738708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091636, + "balance_loss_mlp": 1.05940139, + "epoch": 0.1404386302424009, + "flos": 506335311360.0, + "grad_norm": 0.06378774069808751, + "language_loss": 0.93033969, + "learning_rate": 0.0009683609414688895, + "loss": 0.94125605, + "num_input_tokens_seen": 60773936, + "router_z_loss_mlp": 0.32226562, + "step": 730, + "time_per_iteration": 2.6648926734924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097348, + "balance_loss_mlp": 1.06175184, + "epoch": 0.14063101192766447, + "flos": 573132105216.0, + "grad_norm": 0.05452232030629634, + "language_loss": 0.86945236, + "learning_rate": 0.0009682517875591154, + "loss": 0.88042581, + "num_input_tokens_seen": 60851120, + "router_z_loss_mlp": 0.35620117, + "step": 731, + "time_per_iteration": 2.7333967685699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099629, + "balance_loss_mlp": 1.06656027, + "epoch": 0.14082339361292806, + "flos": 564333806592.0, + "grad_norm": 0.06482276791137384, + "language_loss": 0.87207299, + "learning_rate": 0.0009681424518578749, + "loss": 0.88306928, + "num_input_tokens_seen": 60924896, + "router_z_loss_mlp": 0.33081055, + "step": 732, + "time_per_iteration": 2.706704616546631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100389, + "balance_loss_mlp": 1.06734443, + "epoch": 0.14101577529819162, + "flos": 463336187904.0, + "grad_norm": 0.05411989278901109, + "language_loss": 0.88122302, + "learning_rate": 0.000968032934407616, + "loss": 0.89222693, + "num_input_tokens_seen": 60996016, + "router_z_loss_mlp": 0.33056641, + "step": 733, + "time_per_iteration": 2.5904436111450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100997, + "balance_loss_mlp": 1.06766593, + "epoch": 0.14120815698345518, + "flos": 595791991296.0, + "grad_norm": 0.06321555834593343, + "language_loss": 0.82077157, + "learning_rate": 0.0009679232352508571, + "loss": 0.83178151, + "num_input_tokens_seen": 61072016, + "router_z_loss_mlp": 0.33349609, + "step": 734, + "time_per_iteration": 2.758493423461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102299, + "balance_loss_mlp": 1.06992185, + "epoch": 0.14140053866871874, + "flos": 534864591360.0, + "grad_norm": 0.05697576898708014, + "language_loss": 0.81442666, + "learning_rate": 0.0009678133544301871, + "loss": 0.82544965, + "num_input_tokens_seen": 61144528, + "router_z_loss_mlp": 0.32373047, + "step": 735, + "time_per_iteration": 2.6508195400238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092208, + "balance_loss_mlp": 1.06006956, + "epoch": 0.1415929203539823, + "flos": 520013606400.0, + "grad_norm": 0.0400187761209974, + "language_loss": 0.91843486, + "learning_rate": 0.0009677032919882658, + "loss": 0.92935699, + "num_input_tokens_seen": 61216960, + "router_z_loss_mlp": 0.32128906, + "step": 736, + "time_per_iteration": 2.705019474029541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100492, + "balance_loss_mlp": 1.06975937, + "epoch": 0.14178530203924586, + "flos": 482095300608.0, + "grad_norm": 0.07179339183341249, + "language_loss": 0.92199683, + "learning_rate": 0.000967593047967823, + "loss": 0.93300164, + "num_input_tokens_seen": 61281312, + "router_z_loss_mlp": 0.30712891, + "step": 737, + "time_per_iteration": 2.55415415763855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109678, + "balance_loss_mlp": 1.07577443, + "epoch": 0.14197768372450942, + "flos": 676339863552.0, + "grad_norm": 0.08640894081958116, + "language_loss": 0.87084705, + "learning_rate": 0.0009674826224116593, + "loss": 0.88194382, + "num_input_tokens_seen": 61355888, + "router_z_loss_mlp": 0.33911133, + "step": 738, + "time_per_iteration": 2.819878101348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097633, + "balance_loss_mlp": 1.06544614, + "epoch": 0.14217006540977298, + "flos": 445802199552.0, + "grad_norm": 0.06953952980021996, + "language_loss": 0.8713401, + "learning_rate": 0.0009673720153626455, + "loss": 0.88231641, + "num_input_tokens_seen": 61424288, + "router_z_loss_mlp": 0.32177734, + "step": 739, + "time_per_iteration": 2.5987422466278076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096281, + "balance_loss_mlp": 1.06385565, + "epoch": 0.14236244709503657, + "flos": 496261016064.0, + "grad_norm": 0.08400230511878481, + "language_loss": 0.87465405, + "learning_rate": 0.0009672612268637235, + "loss": 0.88561684, + "num_input_tokens_seen": 61493344, + "router_z_loss_mlp": 0.32421875, + "step": 740, + "time_per_iteration": 2.6148736476898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098472, + "balance_loss_mlp": 1.06669128, + "epoch": 0.14255482878030012, + "flos": 648022989312.0, + "grad_norm": 0.0806935070673247, + "language_loss": 0.846753, + "learning_rate": 0.0009671502569579048, + "loss": 0.85773772, + "num_input_tokens_seen": 61565216, + "router_z_loss_mlp": 0.31762695, + "step": 741, + "time_per_iteration": 2.7533769607543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089599, + "balance_loss_mlp": 1.05774641, + "epoch": 0.14274721046556368, + "flos": 535888894464.0, + "grad_norm": 0.06572551706098649, + "language_loss": 0.90748239, + "learning_rate": 0.0009670391056882719, + "loss": 0.91837835, + "num_input_tokens_seen": 61640928, + "router_z_loss_mlp": 0.31835938, + "step": 742, + "time_per_iteration": 2.698690176010132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088511, + "balance_loss_mlp": 1.0565629, + "epoch": 0.14293959215082724, + "flos": 956677215744.0, + "grad_norm": 0.07291469749344824, + "language_loss": 0.89417249, + "learning_rate": 0.0009669277730979776, + "loss": 0.90505755, + "num_input_tokens_seen": 61717552, + "router_z_loss_mlp": 0.31958008, + "step": 743, + "time_per_iteration": 3.1728732585906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108649, + "balance_loss_mlp": 1.05408931, + "epoch": 0.1431319738360908, + "flos": 692766590976.0, + "grad_norm": 0.06693583917292938, + "language_loss": 0.85588205, + "learning_rate": 0.0009668162592302449, + "loss": 0.86674696, + "num_input_tokens_seen": 61800016, + "router_z_loss_mlp": 0.32397461, + "step": 744, + "time_per_iteration": 2.896467685699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099426, + "balance_loss_mlp": 1.06673896, + "epoch": 0.14332435552135436, + "flos": 565174817280.0, + "grad_norm": 0.0717564206721674, + "language_loss": 0.86683381, + "learning_rate": 0.0009667045641283676, + "loss": 0.877828, + "num_input_tokens_seen": 61865904, + "router_z_loss_mlp": 0.3269043, + "step": 745, + "time_per_iteration": 2.6326427459716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095955, + "balance_loss_mlp": 1.06336319, + "epoch": 0.14351673720661792, + "flos": 738045665280.0, + "grad_norm": 0.07083856064802352, + "language_loss": 0.95545924, + "learning_rate": 0.0009665926878357092, + "loss": 0.96641874, + "num_input_tokens_seen": 61945728, + "router_z_loss_mlp": 0.32592773, + "step": 746, + "time_per_iteration": 2.902628183364868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108393, + "balance_loss_mlp": 1.07565856, + "epoch": 0.14370911889188148, + "flos": 548951731200.0, + "grad_norm": 0.08672542857876225, + "language_loss": 0.91510898, + "learning_rate": 0.0009664806303957043, + "loss": 0.92619288, + "num_input_tokens_seen": 62016288, + "router_z_loss_mlp": 0.32714844, + "step": 747, + "time_per_iteration": 2.678656578063965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107271, + "balance_loss_mlp": 1.07448816, + "epoch": 0.14390150057714507, + "flos": 589973469696.0, + "grad_norm": 0.06575006445724518, + "language_loss": 0.87633115, + "learning_rate": 0.0009663683918518571, + "loss": 0.88740385, + "num_input_tokens_seen": 62097904, + "router_z_loss_mlp": 0.32788086, + "step": 748, + "time_per_iteration": 2.894339084625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116744, + "balance_loss_mlp": 1.08226848, + "epoch": 0.14409388226240863, + "flos": 590773782528.0, + "grad_norm": 0.06412555003569581, + "language_loss": 0.86334193, + "learning_rate": 0.0009662559722477428, + "loss": 0.87450933, + "num_input_tokens_seen": 62166736, + "router_z_loss_mlp": 0.3449707, + "step": 749, + "time_per_iteration": 2.6673357486724854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116866, + "balance_loss_mlp": 1.15397346, + "epoch": 0.1442862639476722, + "flos": 1510418479104.0, + "grad_norm": 0.05654081816866197, + "language_loss": 0.7616297, + "learning_rate": 0.0009661433716270062, + "loss": 0.77331638, + "num_input_tokens_seen": 62402512, + "router_z_loss_mlp": 0.14648438, + "step": 750, + "time_per_iteration": 4.97744607925415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111461, + "balance_loss_mlp": 1.0782733, + "epoch": 0.14447864563293575, + "flos": 496493770752.0, + "grad_norm": 0.05840496998451829, + "language_loss": 0.89989787, + "learning_rate": 0.0009660305900333632, + "loss": 0.91101241, + "num_input_tokens_seen": 62473408, + "router_z_loss_mlp": 0.33203125, + "step": 751, + "time_per_iteration": 2.6919631958007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108129, + "balance_loss_mlp": 1.07513142, + "epoch": 0.1446710273181993, + "flos": 589400271360.0, + "grad_norm": 0.0663289310880325, + "language_loss": 0.83084202, + "learning_rate": 0.0009659176275105992, + "loss": 0.8419233, + "num_input_tokens_seen": 62547440, + "router_z_loss_mlp": 0.33007812, + "step": 752, + "time_per_iteration": 2.702003240585327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097403, + "balance_loss_mlp": 1.0634284, + "epoch": 0.14486340900346287, + "flos": 585521256960.0, + "grad_norm": 0.05748666507804042, + "language_loss": 0.86628646, + "learning_rate": 0.0009658044841025701, + "loss": 0.87726045, + "num_input_tokens_seen": 62620224, + "router_z_loss_mlp": 0.34008789, + "step": 753, + "time_per_iteration": 2.7666702270507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106626, + "balance_loss_mlp": 1.07114923, + "epoch": 0.14505579068872643, + "flos": 504405978624.0, + "grad_norm": 0.07320865998852653, + "language_loss": 0.81996346, + "learning_rate": 0.0009656911598532021, + "loss": 0.83102977, + "num_input_tokens_seen": 62690464, + "router_z_loss_mlp": 0.35498047, + "step": 754, + "time_per_iteration": 2.6273839473724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094053, + "balance_loss_mlp": 1.05936301, + "epoch": 0.14524817237399, + "flos": 486566452224.0, + "grad_norm": 0.05776902712696923, + "language_loss": 0.90229332, + "learning_rate": 0.0009655776548064917, + "loss": 0.91323388, + "num_input_tokens_seen": 62762240, + "router_z_loss_mlp": 0.34667969, + "step": 755, + "time_per_iteration": 2.6639461517333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092368, + "balance_loss_mlp": 1.05867922, + "epoch": 0.14544055405925355, + "flos": 727857888768.0, + "grad_norm": 0.059694446461720084, + "language_loss": 0.88762641, + "learning_rate": 0.0009654639690065054, + "loss": 0.89855003, + "num_input_tokens_seen": 62839760, + "router_z_loss_mlp": 0.33691406, + "step": 756, + "time_per_iteration": 2.881164789199829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092737, + "balance_loss_mlp": 1.05981112, + "epoch": 0.14563293574451713, + "flos": 593359271424.0, + "grad_norm": 0.0719411984245977, + "language_loss": 0.88362074, + "learning_rate": 0.00096535010249738, + "loss": 0.89454818, + "num_input_tokens_seen": 62910336, + "router_z_loss_mlp": 0.3293457, + "step": 757, + "time_per_iteration": 2.703355312347412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092325, + "balance_loss_mlp": 1.05925632, + "epoch": 0.1458253174297807, + "flos": 560192924160.0, + "grad_norm": 0.09095988428785044, + "language_loss": 0.8300786, + "learning_rate": 0.0009652360553233224, + "loss": 0.84100187, + "num_input_tokens_seen": 62988160, + "router_z_loss_mlp": 0.33081055, + "step": 758, + "time_per_iteration": 2.7321062088012695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062711, + "balance_loss_mlp": 1.04821551, + "epoch": 0.14601769911504425, + "flos": 1557025984512.0, + "grad_norm": 0.03493248396843453, + "language_loss": 0.73773748, + "learning_rate": 0.0009651218275286093, + "loss": 0.74836457, + "num_input_tokens_seen": 63224704, + "router_z_loss_mlp": 0.14453125, + "step": 759, + "time_per_iteration": 4.917184591293335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099151, + "balance_loss_mlp": 1.06605887, + "epoch": 0.1462100808003078, + "flos": 865922628096.0, + "grad_norm": 0.05465610046720203, + "language_loss": 0.8166393, + "learning_rate": 0.0009650074191575883, + "loss": 0.82763088, + "num_input_tokens_seen": 63312400, + "router_z_loss_mlp": 0.33105469, + "step": 760, + "time_per_iteration": 3.2009472846984863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097005, + "balance_loss_mlp": 1.06341171, + "epoch": 0.14640246248557137, + "flos": 522673288704.0, + "grad_norm": 0.07890258703475667, + "language_loss": 0.86329532, + "learning_rate": 0.0009648928302546766, + "loss": 0.87426543, + "num_input_tokens_seen": 63387792, + "router_z_loss_mlp": 0.3359375, + "step": 761, + "time_per_iteration": 2.6858482360839844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087335, + "balance_loss_mlp": 1.05340791, + "epoch": 0.14659484417083493, + "flos": 1030121805312.0, + "grad_norm": 0.05505233607608704, + "language_loss": 0.8584463, + "learning_rate": 0.0009647780608643613, + "loss": 0.86931968, + "num_input_tokens_seen": 63475632, + "router_z_loss_mlp": 0.33935547, + "step": 762, + "time_per_iteration": 3.3784618377685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087006, + "balance_loss_mlp": 1.05365133, + "epoch": 0.1467872258560985, + "flos": 500426629632.0, + "grad_norm": 0.083565321416964, + "language_loss": 0.88299912, + "learning_rate": 0.0009646631110312001, + "loss": 0.89386916, + "num_input_tokens_seen": 63546080, + "router_z_loss_mlp": 0.33349609, + "step": 763, + "time_per_iteration": 2.642038345336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096574, + "balance_loss_mlp": 1.06465006, + "epoch": 0.14697960754136205, + "flos": 547514201088.0, + "grad_norm": 0.05646167170610495, + "language_loss": 0.88908124, + "learning_rate": 0.0009645479807998203, + "loss": 0.900047, + "num_input_tokens_seen": 63622464, + "router_z_loss_mlp": 0.3190918, + "step": 764, + "time_per_iteration": 2.7709102630615234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093321, + "balance_loss_mlp": 1.0614922, + "epoch": 0.14717198922662564, + "flos": 517586678784.0, + "grad_norm": 0.06731397985108602, + "language_loss": 0.93233657, + "learning_rate": 0.0009644326702149196, + "loss": 0.94326979, + "num_input_tokens_seen": 63694736, + "router_z_loss_mlp": 0.31811523, + "step": 765, + "time_per_iteration": 2.691761016845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098194, + "balance_loss_mlp": 1.06472015, + "epoch": 0.1473643709118892, + "flos": 731661147648.0, + "grad_norm": 0.08664060064789567, + "language_loss": 0.85604531, + "learning_rate": 0.0009643171793212653, + "loss": 0.86702728, + "num_input_tokens_seen": 63779072, + "router_z_loss_mlp": 0.33496094, + "step": 766, + "time_per_iteration": 3.0578510761260986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095374, + "balance_loss_mlp": 1.06190002, + "epoch": 0.14755675259715276, + "flos": 620257554432.0, + "grad_norm": 0.06875066800131625, + "language_loss": 0.90379435, + "learning_rate": 0.0009642015081636952, + "loss": 0.91474807, + "num_input_tokens_seen": 63847472, + "router_z_loss_mlp": 0.33496094, + "step": 767, + "time_per_iteration": 2.690892219543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091039, + "balance_loss_mlp": 1.05830407, + "epoch": 0.14774913428241632, + "flos": 451981513728.0, + "grad_norm": 0.06617868208271054, + "language_loss": 0.88812423, + "learning_rate": 0.0009640856567871166, + "loss": 0.89903462, + "num_input_tokens_seen": 63912496, + "router_z_loss_mlp": 0.32714844, + "step": 768, + "time_per_iteration": 2.5108768939971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086849, + "balance_loss_mlp": 1.05316067, + "epoch": 0.14794151596767988, + "flos": 836881196544.0, + "grad_norm": 0.06813910901976611, + "language_loss": 0.89643073, + "learning_rate": 0.0009639696252365072, + "loss": 0.90729922, + "num_input_tokens_seen": 63990832, + "router_z_loss_mlp": 0.33691406, + "step": 769, + "time_per_iteration": 3.036872386932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087914, + "balance_loss_mlp": 1.05546558, + "epoch": 0.14813389765294344, + "flos": 685765204992.0, + "grad_norm": 0.06952898718112278, + "language_loss": 0.82433641, + "learning_rate": 0.0009638534135569144, + "loss": 0.83521557, + "num_input_tokens_seen": 64067552, + "router_z_loss_mlp": 0.32446289, + "step": 770, + "time_per_iteration": 2.920228958129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096521, + "balance_loss_mlp": 1.06395316, + "epoch": 0.148326279338207, + "flos": 509625008640.0, + "grad_norm": 0.05850145176667806, + "language_loss": 0.90417981, + "learning_rate": 0.0009637370217934554, + "loss": 0.91514498, + "num_input_tokens_seen": 64140336, + "router_z_loss_mlp": 0.32568359, + "step": 771, + "time_per_iteration": 2.6692943572998047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094476, + "balance_loss_mlp": 1.0624088, + "epoch": 0.14851866102347056, + "flos": 587869608960.0, + "grad_norm": 0.06374792966079154, + "language_loss": 0.83362675, + "learning_rate": 0.0009636204499913175, + "loss": 0.84457153, + "num_input_tokens_seen": 64223472, + "router_z_loss_mlp": 0.32055664, + "step": 772, + "time_per_iteration": 2.9103784561157227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101595, + "balance_loss_mlp": 1.07129157, + "epoch": 0.14871104270873411, + "flos": 690722366976.0, + "grad_norm": 0.05784692032564958, + "language_loss": 0.8891257, + "learning_rate": 0.0009635036981957581, + "loss": 0.90014172, + "num_input_tokens_seen": 64299872, + "router_z_loss_mlp": 0.30273438, + "step": 773, + "time_per_iteration": 2.840233087539673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109331, + "balance_loss_mlp": 1.06112361, + "epoch": 0.1489034243939977, + "flos": 654803205120.0, + "grad_norm": 0.06091674471201955, + "language_loss": 0.9126395, + "learning_rate": 0.0009633867664521043, + "loss": 0.9235726, + "num_input_tokens_seen": 64377152, + "router_z_loss_mlp": 0.32202148, + "step": 774, + "time_per_iteration": 2.8467912673950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098219, + "balance_loss_mlp": 1.0643878, + "epoch": 0.14909580607926126, + "flos": 475595891712.0, + "grad_norm": 0.06395321005815084, + "language_loss": 0.87366414, + "learning_rate": 0.0009632696548057527, + "loss": 0.8846463, + "num_input_tokens_seen": 64443008, + "router_z_loss_mlp": 0.33862305, + "step": 775, + "time_per_iteration": 2.55267596244812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090534, + "balance_loss_mlp": 1.05729866, + "epoch": 0.14928818776452482, + "flos": 610789953024.0, + "grad_norm": 0.07257335679926562, + "language_loss": 0.85489643, + "learning_rate": 0.0009631523633021704, + "loss": 0.86580181, + "num_input_tokens_seen": 64519776, + "router_z_loss_mlp": 0.33251953, + "step": 776, + "time_per_iteration": 2.800656795501709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090063, + "balance_loss_mlp": 1.05694628, + "epoch": 0.14948056944978838, + "flos": 561487859712.0, + "grad_norm": 0.058446141184189525, + "language_loss": 0.88943005, + "learning_rate": 0.0009630348919868936, + "loss": 0.90033066, + "num_input_tokens_seen": 64593712, + "router_z_loss_mlp": 0.33129883, + "step": 777, + "time_per_iteration": 2.7306644916534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088502, + "balance_loss_mlp": 1.05397916, + "epoch": 0.14967295113505194, + "flos": 448972623360.0, + "grad_norm": 0.08136314957760014, + "language_loss": 0.81536144, + "learning_rate": 0.0009629172409055293, + "loss": 0.82624644, + "num_input_tokens_seen": 64658448, + "router_z_loss_mlp": 0.34545898, + "step": 778, + "time_per_iteration": 2.532480239868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091534, + "balance_loss_mlp": 1.05937171, + "epoch": 0.1498653328203155, + "flos": 571000541184.0, + "grad_norm": 0.06865521140792329, + "language_loss": 0.88039231, + "learning_rate": 0.0009627994101037531, + "loss": 0.89130771, + "num_input_tokens_seen": 64734144, + "router_z_loss_mlp": 0.3215332, + "step": 779, + "time_per_iteration": 2.7336056232452393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091586, + "balance_loss_mlp": 1.05811191, + "epoch": 0.15005771450557906, + "flos": 630918194688.0, + "grad_norm": 0.06277485509918372, + "language_loss": 0.8981787, + "learning_rate": 0.0009626813996273114, + "loss": 0.90909451, + "num_input_tokens_seen": 64813456, + "router_z_loss_mlp": 0.3347168, + "step": 780, + "time_per_iteration": 2.8651859760284424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092654, + "balance_loss_mlp": 1.06018162, + "epoch": 0.15025009619084262, + "flos": 577633780224.0, + "grad_norm": 0.06737111741199381, + "language_loss": 0.89359641, + "learning_rate": 0.0009625632095220198, + "loss": 0.90452296, + "num_input_tokens_seen": 64896816, + "router_z_loss_mlp": 0.32470703, + "step": 781, + "time_per_iteration": 2.910163640975952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093984, + "balance_loss_mlp": 1.06041455, + "epoch": 0.1504424778761062, + "flos": 483646311936.0, + "grad_norm": 0.06188715182302237, + "language_loss": 0.87568116, + "learning_rate": 0.0009624448398337637, + "loss": 0.88662094, + "num_input_tokens_seen": 64964176, + "router_z_loss_mlp": 0.3359375, + "step": 782, + "time_per_iteration": 2.532055616378784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100397, + "balance_loss_mlp": 1.06751907, + "epoch": 0.15063485956136977, + "flos": 762167812608.0, + "grad_norm": 0.06229794960735175, + "language_loss": 0.89905757, + "learning_rate": 0.0009623262906084984, + "loss": 0.91006154, + "num_input_tokens_seen": 65042592, + "router_z_loss_mlp": 0.32861328, + "step": 783, + "time_per_iteration": 2.9851605892181396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104855, + "balance_loss_mlp": 1.0712378, + "epoch": 0.15082724124663333, + "flos": 497369687040.0, + "grad_norm": 0.060596744514248076, + "language_loss": 0.90796679, + "learning_rate": 0.0009622075618922486, + "loss": 0.91901541, + "num_input_tokens_seen": 65114576, + "router_z_loss_mlp": 0.33642578, + "step": 784, + "time_per_iteration": 2.6796786785125732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102349, + "balance_loss_mlp": 1.06928015, + "epoch": 0.15101962293189689, + "flos": 509476621824.0, + "grad_norm": 0.06389342174673626, + "language_loss": 0.87423813, + "learning_rate": 0.0009620886537311091, + "loss": 0.8852616, + "num_input_tokens_seen": 65186640, + "router_z_loss_mlp": 0.33081055, + "step": 785, + "time_per_iteration": 2.6153056621551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113163, + "balance_loss_mlp": 1.07685184, + "epoch": 0.15121200461716044, + "flos": 457520638464.0, + "grad_norm": 0.06793935281312648, + "language_loss": 0.85492945, + "learning_rate": 0.000961969566171244, + "loss": 0.86606109, + "num_input_tokens_seen": 65252112, + "router_z_loss_mlp": 0.36303711, + "step": 786, + "time_per_iteration": 2.506267786026001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122272, + "balance_loss_mlp": 1.08703363, + "epoch": 0.151404386302424, + "flos": 537729477120.0, + "grad_norm": 0.0670602351843582, + "language_loss": 0.90370345, + "learning_rate": 0.0009618502992588873, + "loss": 0.91492617, + "num_input_tokens_seen": 65318912, + "router_z_loss_mlp": 0.35253906, + "step": 787, + "time_per_iteration": 2.623457670211792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141844, + "balance_loss_mlp": 1.10658193, + "epoch": 0.15159676798768756, + "flos": 687858891264.0, + "grad_norm": 0.06543467559167064, + "language_loss": 0.88581872, + "learning_rate": 0.0009617308530403424, + "loss": 0.89723718, + "num_input_tokens_seen": 65395424, + "router_z_loss_mlp": 0.35302734, + "step": 788, + "time_per_iteration": 2.975861072540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149381, + "balance_loss_mlp": 1.11371326, + "epoch": 0.15178914967295112, + "flos": 545042193408.0, + "grad_norm": 0.059566397417978756, + "language_loss": 0.87806541, + "learning_rate": 0.0009616112275619825, + "loss": 0.88955921, + "num_input_tokens_seen": 65470480, + "router_z_loss_mlp": 0.35668945, + "step": 789, + "time_per_iteration": 2.683262348175049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152452, + "balance_loss_mlp": 1.1169517, + "epoch": 0.1519815313582147, + "flos": 511510671360.0, + "grad_norm": 0.05728483560240697, + "language_loss": 0.84466863, + "learning_rate": 0.0009614914228702503, + "loss": 0.85619313, + "num_input_tokens_seen": 65544720, + "router_z_loss_mlp": 0.35498047, + "step": 790, + "time_per_iteration": 2.6616339683532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142719, + "balance_loss_mlp": 1.10850596, + "epoch": 0.15217391304347827, + "flos": 683747122176.0, + "grad_norm": 0.057799273493116435, + "language_loss": 0.89279461, + "learning_rate": 0.0009613714390116581, + "loss": 0.90422177, + "num_input_tokens_seen": 65627872, + "router_z_loss_mlp": 0.34204102, + "step": 791, + "time_per_iteration": 2.947608470916748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133841, + "balance_loss_mlp": 1.0997231, + "epoch": 0.15236629472874183, + "flos": 643873342464.0, + "grad_norm": 0.06413295296627212, + "language_loss": 0.86589968, + "learning_rate": 0.0009612512760327879, + "loss": 0.87723809, + "num_input_tokens_seen": 65705264, + "router_z_loss_mlp": 0.34155273, + "step": 792, + "time_per_iteration": 2.8261189460754395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124449, + "balance_loss_mlp": 1.08727932, + "epoch": 0.1525586764140054, + "flos": 412654791168.0, + "grad_norm": 0.06095846853214657, + "language_loss": 0.85749042, + "learning_rate": 0.0009611309339802909, + "loss": 0.86873484, + "num_input_tokens_seen": 65768592, + "router_z_loss_mlp": 0.37182617, + "step": 793, + "time_per_iteration": 2.438474178314209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113041, + "balance_loss_mlp": 1.07811236, + "epoch": 0.15275105809926895, + "flos": 802444644864.0, + "grad_norm": 0.04691390558901254, + "language_loss": 0.84620011, + "learning_rate": 0.0009610104129008881, + "loss": 0.85733056, + "num_input_tokens_seen": 65852432, + "router_z_loss_mlp": 0.34985352, + "step": 794, + "time_per_iteration": 3.1149892807006836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092099, + "balance_loss_mlp": 1.05786228, + "epoch": 0.1529434397845325, + "flos": 612143115264.0, + "grad_norm": 0.06446455819394356, + "language_loss": 0.88995111, + "learning_rate": 0.0009608897128413701, + "loss": 0.90087205, + "num_input_tokens_seen": 65927904, + "router_z_loss_mlp": 0.3425293, + "step": 795, + "time_per_iteration": 2.7310965061187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085841, + "balance_loss_mlp": 1.05160367, + "epoch": 0.15313582146979607, + "flos": 614941009920.0, + "grad_norm": 0.04580320827636504, + "language_loss": 0.8595438, + "learning_rate": 0.0009607688338485965, + "loss": 0.87040222, + "num_input_tokens_seen": 66006800, + "router_z_loss_mlp": 0.3425293, + "step": 796, + "time_per_iteration": 2.8534584045410156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088916, + "balance_loss_mlp": 1.05427384, + "epoch": 0.15332820315505963, + "flos": 793256440320.0, + "grad_norm": 0.053101967265095064, + "language_loss": 0.91128695, + "learning_rate": 0.0009606477759694969, + "loss": 0.92217612, + "num_input_tokens_seen": 66088608, + "router_z_loss_mlp": 0.34643555, + "step": 797, + "time_per_iteration": 3.0544466972351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098724, + "balance_loss_mlp": 1.06441545, + "epoch": 0.1535205848403232, + "flos": 549945510912.0, + "grad_norm": 0.0662794157411924, + "language_loss": 0.87591946, + "learning_rate": 0.0009605265392510703, + "loss": 0.88690674, + "num_input_tokens_seen": 66153616, + "router_z_loss_mlp": 0.34350586, + "step": 798, + "time_per_iteration": 2.6120660305023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011091, + "balance_loss_mlp": 1.07417202, + "epoch": 0.15371296652558677, + "flos": 535691045376.0, + "grad_norm": 0.07220239734969772, + "language_loss": 0.92342889, + "learning_rate": 0.0009604051237403846, + "loss": 0.93451989, + "num_input_tokens_seen": 66219472, + "router_z_loss_mlp": 0.34960938, + "step": 799, + "time_per_iteration": 2.640749216079712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111728, + "balance_loss_mlp": 1.07808757, + "epoch": 0.15390534821085033, + "flos": 395002939392.0, + "grad_norm": 0.06314402273456009, + "language_loss": 0.86126584, + "learning_rate": 0.0009602835294845776, + "loss": 0.87238312, + "num_input_tokens_seen": 66281456, + "router_z_loss_mlp": 0.33666992, + "step": 800, + "time_per_iteration": 2.44914174079895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117351, + "balance_loss_mlp": 1.08254242, + "epoch": 0.1540977298961139, + "flos": 535587738624.0, + "grad_norm": 0.057636094576239, + "language_loss": 0.91100746, + "learning_rate": 0.0009601617565308565, + "loss": 0.92218101, + "num_input_tokens_seen": 66348160, + "router_z_loss_mlp": 0.34790039, + "step": 801, + "time_per_iteration": 2.599679470062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119062, + "balance_loss_mlp": 1.08511138, + "epoch": 0.15429011158137745, + "flos": 723388147200.0, + "grad_norm": 0.05961266019354579, + "language_loss": 0.86783326, + "learning_rate": 0.0009600398049264977, + "loss": 0.87902391, + "num_input_tokens_seen": 66430576, + "router_z_loss_mlp": 0.33935547, + "step": 802, + "time_per_iteration": 2.9514007568359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121787, + "balance_loss_mlp": 1.08735943, + "epoch": 0.154482493266641, + "flos": 620209502208.0, + "grad_norm": 0.06366105456569557, + "language_loss": 0.92098475, + "learning_rate": 0.0009599176747188469, + "loss": 0.9322027, + "num_input_tokens_seen": 66506480, + "router_z_loss_mlp": 0.34448242, + "step": 803, + "time_per_iteration": 2.8068411350250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114024, + "balance_loss_mlp": 1.08012128, + "epoch": 0.15467487495190457, + "flos": 525351909888.0, + "grad_norm": 0.08101366702111423, + "language_loss": 0.83651662, + "learning_rate": 0.0009597953659553196, + "loss": 0.84765685, + "num_input_tokens_seen": 66577680, + "router_z_loss_mlp": 0.33911133, + "step": 804, + "time_per_iteration": 2.7075448036193848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098377, + "balance_loss_mlp": 1.06616712, + "epoch": 0.15486725663716813, + "flos": 527473299456.0, + "grad_norm": 0.07377431927286832, + "language_loss": 0.89624304, + "learning_rate": 0.0009596728786833997, + "loss": 0.90722686, + "num_input_tokens_seen": 66648496, + "router_z_loss_mlp": 0.32202148, + "step": 805, + "time_per_iteration": 2.6376051902770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088085, + "balance_loss_mlp": 1.05420554, + "epoch": 0.1550596383224317, + "flos": 1048118482944.0, + "grad_norm": 0.06708822771662253, + "language_loss": 0.90018022, + "learning_rate": 0.0009595502129506415, + "loss": 0.91106105, + "num_input_tokens_seen": 66735216, + "router_z_loss_mlp": 0.33911133, + "step": 806, + "time_per_iteration": 3.3391284942626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092582, + "balance_loss_mlp": 1.05903625, + "epoch": 0.15525202000769528, + "flos": 613438050816.0, + "grad_norm": 0.06052700763637142, + "language_loss": 0.83084035, + "learning_rate": 0.0009594273688046678, + "loss": 0.84176612, + "num_input_tokens_seen": 66810672, + "router_z_loss_mlp": 0.33544922, + "step": 807, + "time_per_iteration": 2.7136006355285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088184, + "balance_loss_mlp": 1.05273128, + "epoch": 0.15544440169295884, + "flos": 532805810688.0, + "grad_norm": 0.07048562468234597, + "language_loss": 0.86048424, + "learning_rate": 0.000959304346293171, + "loss": 0.87136608, + "num_input_tokens_seen": 66879824, + "router_z_loss_mlp": 0.35473633, + "step": 808, + "time_per_iteration": 2.6744906902313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097573, + "balance_loss_mlp": 1.06254935, + "epoch": 0.1556367833782224, + "flos": 644433546240.0, + "grad_norm": 0.06803397985071584, + "language_loss": 0.88331544, + "learning_rate": 0.0009591811454639125, + "loss": 0.89429116, + "num_input_tokens_seen": 66949424, + "router_z_loss_mlp": 0.3503418, + "step": 809, + "time_per_iteration": 2.730431079864502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094476, + "balance_loss_mlp": 1.0610261, + "epoch": 0.15582916506348596, + "flos": 543540644352.0, + "grad_norm": 0.06204685505428811, + "language_loss": 0.88227659, + "learning_rate": 0.0009590577663647234, + "loss": 0.89322132, + "num_input_tokens_seen": 67024000, + "router_z_loss_mlp": 0.3347168, + "step": 810, + "time_per_iteration": 2.71469783782959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104684, + "balance_loss_mlp": 1.07078123, + "epoch": 0.15602154674874952, + "flos": 579740613120.0, + "grad_norm": 0.05672341894910533, + "language_loss": 0.86610442, + "learning_rate": 0.0009589342090435036, + "loss": 0.87715125, + "num_input_tokens_seen": 67100672, + "router_z_loss_mlp": 0.33935547, + "step": 811, + "time_per_iteration": 2.799246072769165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110387, + "balance_loss_mlp": 1.06918025, + "epoch": 0.15621392843401308, + "flos": 534982454784.0, + "grad_norm": 0.0647852675732537, + "language_loss": 0.87778354, + "learning_rate": 0.0009588104735482223, + "loss": 0.8888222, + "num_input_tokens_seen": 67171584, + "router_z_loss_mlp": 0.34692383, + "step": 812, + "time_per_iteration": 2.6684510707855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126921, + "balance_loss_mlp": 1.09106326, + "epoch": 0.15640631011927664, + "flos": 550635162624.0, + "grad_norm": 0.08222618986335321, + "language_loss": 0.84280443, + "learning_rate": 0.0009586865599269177, + "loss": 0.85407358, + "num_input_tokens_seen": 67240640, + "router_z_loss_mlp": 0.35864258, + "step": 813, + "time_per_iteration": 2.6293816566467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131277, + "balance_loss_mlp": 1.09651566, + "epoch": 0.1565986918045402, + "flos": 637190641152.0, + "grad_norm": 0.05945515562529824, + "language_loss": 0.88725412, + "learning_rate": 0.0009585624682276977, + "loss": 0.89856684, + "num_input_tokens_seen": 67312976, + "router_z_loss_mlp": 0.34814453, + "step": 814, + "time_per_iteration": 2.744253158569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137563, + "balance_loss_mlp": 1.10113239, + "epoch": 0.15679107348980378, + "flos": 490569122304.0, + "grad_norm": 0.09591637295165127, + "language_loss": 0.87945771, + "learning_rate": 0.0009584381984987386, + "loss": 0.89083332, + "num_input_tokens_seen": 67378528, + "router_z_loss_mlp": 0.36474609, + "step": 815, + "time_per_iteration": 2.5264036655426025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124613, + "balance_loss_mlp": 1.0911386, + "epoch": 0.15698345517506734, + "flos": 529689231360.0, + "grad_norm": 0.05838460881618622, + "language_loss": 0.90277314, + "learning_rate": 0.0009583137507882864, + "loss": 0.91401929, + "num_input_tokens_seen": 67449728, + "router_z_loss_mlp": 0.3347168, + "step": 816, + "time_per_iteration": 2.6488330364227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109898, + "balance_loss_mlp": 1.07418323, + "epoch": 0.1571758368603309, + "flos": 545779897344.0, + "grad_norm": 0.07313796537718548, + "language_loss": 0.81262791, + "learning_rate": 0.000958189125144656, + "loss": 0.82372689, + "num_input_tokens_seen": 67520512, + "router_z_loss_mlp": 0.35766602, + "step": 817, + "time_per_iteration": 2.7040657997131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101746, + "balance_loss_mlp": 1.06672239, + "epoch": 0.15736821854559446, + "flos": 565377048576.0, + "grad_norm": 0.067694528538076, + "language_loss": 0.88558215, + "learning_rate": 0.0009580643216162313, + "loss": 0.89659959, + "num_input_tokens_seen": 67592464, + "router_z_loss_mlp": 0.3503418, + "step": 818, + "time_per_iteration": 2.6538634300231934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096156, + "balance_loss_mlp": 1.06110835, + "epoch": 0.15756060023085802, + "flos": 500707436544.0, + "grad_norm": 0.05957146674366314, + "language_loss": 0.79884583, + "learning_rate": 0.0009579393402514652, + "loss": 0.80980736, + "num_input_tokens_seen": 67658928, + "router_z_loss_mlp": 0.35107422, + "step": 819, + "time_per_iteration": 2.5606625080108643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082975, + "balance_loss_mlp": 1.048738, + "epoch": 0.15775298191612158, + "flos": 519014034432.0, + "grad_norm": 0.06194437160070725, + "language_loss": 0.91126758, + "learning_rate": 0.0009578141810988801, + "loss": 0.92209733, + "num_input_tokens_seen": 67727936, + "router_z_loss_mlp": 0.34228516, + "step": 820, + "time_per_iteration": 2.55538010597229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082194, + "balance_loss_mlp": 1.04712272, + "epoch": 0.15794536360138514, + "flos": 465891153408.0, + "grad_norm": 0.060184436438788555, + "language_loss": 0.91010749, + "learning_rate": 0.0009576888442070668, + "loss": 0.92092943, + "num_input_tokens_seen": 67795488, + "router_z_loss_mlp": 0.35083008, + "step": 821, + "time_per_iteration": 2.6139276027679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094225, + "balance_loss_mlp": 1.05982161, + "epoch": 0.1581377452866487, + "flos": 516911583744.0, + "grad_norm": 0.06832586535724347, + "language_loss": 0.92820144, + "learning_rate": 0.0009575633296246854, + "loss": 0.93914366, + "num_input_tokens_seen": 67858896, + "router_z_loss_mlp": 0.34423828, + "step": 822, + "time_per_iteration": 2.557404041290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096972, + "balance_loss_mlp": 1.06242526, + "epoch": 0.15833012697191226, + "flos": 549522109440.0, + "grad_norm": 0.06257557491721027, + "language_loss": 0.83520567, + "learning_rate": 0.0009574376374004652, + "loss": 0.84617537, + "num_input_tokens_seen": 67924864, + "router_z_loss_mlp": 0.34570312, + "step": 823, + "time_per_iteration": 2.673220157623291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100077, + "balance_loss_mlp": 1.06395626, + "epoch": 0.15852250865717585, + "flos": 487206641664.0, + "grad_norm": 0.07116075590187526, + "language_loss": 0.81073487, + "learning_rate": 0.000957311767583204, + "loss": 0.82173562, + "num_input_tokens_seen": 67992912, + "router_z_loss_mlp": 0.36132812, + "step": 824, + "time_per_iteration": 2.605074882507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159484, + "balance_loss_mlp": 1.14126849, + "epoch": 0.1587148903424394, + "flos": 1309041672192.0, + "grad_norm": 0.051809649393169656, + "language_loss": 0.8207159, + "learning_rate": 0.0009571857202217691, + "loss": 0.83231074, + "num_input_tokens_seen": 68207408, + "router_z_loss_mlp": 0.18261719, + "step": 825, + "time_per_iteration": 4.726073265075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111019, + "balance_loss_mlp": 1.07349157, + "epoch": 0.15890727202770297, + "flos": 466634649600.0, + "grad_norm": 0.07947222616221912, + "language_loss": 0.92132723, + "learning_rate": 0.0009570594953650961, + "loss": 0.93243748, + "num_input_tokens_seen": 68270864, + "router_z_loss_mlp": 0.37524414, + "step": 826, + "time_per_iteration": 2.5146830081939697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109067, + "balance_loss_mlp": 1.07208848, + "epoch": 0.15909965371296653, + "flos": 776733608448.0, + "grad_norm": 0.06013225990958685, + "language_loss": 0.80852252, + "learning_rate": 0.00095693309306219, + "loss": 0.81961316, + "num_input_tokens_seen": 68355408, + "router_z_loss_mlp": 0.36962891, + "step": 827, + "time_per_iteration": 3.095632553100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117102, + "balance_loss_mlp": 1.07945621, + "epoch": 0.1592920353982301, + "flos": 1077852538368.0, + "grad_norm": 0.05984978885312211, + "language_loss": 0.88600951, + "learning_rate": 0.0009568065133621244, + "loss": 0.89718056, + "num_input_tokens_seen": 68437072, + "router_z_loss_mlp": 0.37646484, + "step": 828, + "time_per_iteration": 3.3153574466705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111269, + "balance_loss_mlp": 1.07584, + "epoch": 0.15948441708349365, + "flos": 725307305472.0, + "grad_norm": 0.0632864692280333, + "language_loss": 0.85493571, + "learning_rate": 0.0009566797563140422, + "loss": 0.86604846, + "num_input_tokens_seen": 68511696, + "router_z_loss_mlp": 0.35449219, + "step": 829, + "time_per_iteration": 2.8705785274505615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121543, + "balance_loss_mlp": 1.08470702, + "epoch": 0.1596767987687572, + "flos": 578447087616.0, + "grad_norm": 0.06433687205870958, + "language_loss": 0.88630873, + "learning_rate": 0.0009565528219671547, + "loss": 0.89752412, + "num_input_tokens_seen": 68587488, + "router_z_loss_mlp": 0.36816406, + "step": 830, + "time_per_iteration": 2.8890771865844727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137333, + "balance_loss_mlp": 1.10049748, + "epoch": 0.15986918045402077, + "flos": 528728947200.0, + "grad_norm": 0.04994246668943954, + "language_loss": 0.85232639, + "learning_rate": 0.0009564257103707418, + "loss": 0.86369967, + "num_input_tokens_seen": 68655760, + "router_z_loss_mlp": 0.36816406, + "step": 831, + "time_per_iteration": 2.5870308876037598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133852, + "balance_loss_mlp": 1.09632492, + "epoch": 0.16006156213928435, + "flos": 574313559552.0, + "grad_norm": 0.0648316290803925, + "language_loss": 0.91675746, + "learning_rate": 0.0009562984215741533, + "loss": 0.92809594, + "num_input_tokens_seen": 68724560, + "router_z_loss_mlp": 0.37524414, + "step": 832, + "time_per_iteration": 2.655066967010498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117496, + "balance_loss_mlp": 1.08170903, + "epoch": 0.1602539438245479, + "flos": 515258675712.0, + "grad_norm": 0.14271195523272245, + "language_loss": 0.82911491, + "learning_rate": 0.0009561709556268065, + "loss": 0.84028995, + "num_input_tokens_seen": 68795440, + "router_z_loss_mlp": 0.35839844, + "step": 833, + "time_per_iteration": 2.69999098777771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119914, + "balance_loss_mlp": 1.08419931, + "epoch": 0.16044632550981147, + "flos": 620730418176.0, + "grad_norm": 0.05962773238435596, + "language_loss": 0.95060706, + "learning_rate": 0.0009560433125781884, + "loss": 0.96180618, + "num_input_tokens_seen": 68868176, + "router_z_loss_mlp": 0.35693359, + "step": 834, + "time_per_iteration": 2.711109161376953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126421, + "balance_loss_mlp": 1.08977628, + "epoch": 0.16063870719507503, + "flos": 560817146880.0, + "grad_norm": 0.06388697234939344, + "language_loss": 0.92829657, + "learning_rate": 0.0009559154924778544, + "loss": 0.93956077, + "num_input_tokens_seen": 68939616, + "router_z_loss_mlp": 0.36621094, + "step": 835, + "time_per_iteration": 2.695260763168335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121916, + "balance_loss_mlp": 1.08789361, + "epoch": 0.1608310888803386, + "flos": 804778440192.0, + "grad_norm": 0.05750453212643973, + "language_loss": 0.85217482, + "learning_rate": 0.0009557874953754284, + "loss": 0.86339402, + "num_input_tokens_seen": 69016192, + "router_z_loss_mlp": 0.34057617, + "step": 836, + "time_per_iteration": 3.002013921737671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126166, + "balance_loss_mlp": 1.09204817, + "epoch": 0.16102347056560215, + "flos": 600311195136.0, + "grad_norm": 0.06332628409766573, + "language_loss": 0.84060842, + "learning_rate": 0.0009556593213206038, + "loss": 0.85187006, + "num_input_tokens_seen": 69089360, + "router_z_loss_mlp": 0.34130859, + "step": 837, + "time_per_iteration": 2.698716163635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125003, + "balance_loss_mlp": 1.09102869, + "epoch": 0.1612158522508657, + "flos": 553235208192.0, + "grad_norm": 0.07524747482874264, + "language_loss": 0.87844718, + "learning_rate": 0.0009555309703631414, + "loss": 0.88969719, + "num_input_tokens_seen": 69161952, + "router_z_loss_mlp": 0.33984375, + "step": 838, + "time_per_iteration": 2.669588327407837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133813, + "balance_loss_mlp": 1.09752607, + "epoch": 0.16140823393612927, + "flos": 555701423616.0, + "grad_norm": 0.07144746672945328, + "language_loss": 0.87685311, + "learning_rate": 0.0009554024425528722, + "loss": 0.88819122, + "num_input_tokens_seen": 69232432, + "router_z_loss_mlp": 0.36279297, + "step": 839, + "time_per_iteration": 2.6809709072113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112028, + "balance_loss_mlp": 1.08737814, + "epoch": 0.16160061562139286, + "flos": 543613427712.0, + "grad_norm": 0.06970106087394082, + "language_loss": 0.8929134, + "learning_rate": 0.0009552737379396948, + "loss": 0.90411627, + "num_input_tokens_seen": 69297696, + "router_z_loss_mlp": 0.32885742, + "step": 840, + "time_per_iteration": 2.6100995540618896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102587, + "balance_loss_mlp": 1.06920815, + "epoch": 0.16179299730665642, + "flos": 603590717952.0, + "grad_norm": 0.06131687325166246, + "language_loss": 0.87945604, + "learning_rate": 0.0009551448565735767, + "loss": 0.89048195, + "num_input_tokens_seen": 69373888, + "router_z_loss_mlp": 0.33398438, + "step": 841, + "time_per_iteration": 2.7360360622406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095736, + "balance_loss_mlp": 1.06168985, + "epoch": 0.16198537899191998, + "flos": 786821050368.0, + "grad_norm": 0.07162496841720159, + "language_loss": 0.8519845, + "learning_rate": 0.0009550157985045543, + "loss": 0.86294186, + "num_input_tokens_seen": 69449984, + "router_z_loss_mlp": 0.34082031, + "step": 842, + "time_per_iteration": 3.0436456203460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087632, + "balance_loss_mlp": 1.05413389, + "epoch": 0.16217776067718354, + "flos": 519550917120.0, + "grad_norm": 0.060562390499230526, + "language_loss": 0.89622426, + "learning_rate": 0.0009548865637827321, + "loss": 0.90710062, + "num_input_tokens_seen": 69522736, + "router_z_loss_mlp": 0.33496094, + "step": 843, + "time_per_iteration": 2.6422221660614014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086448, + "balance_loss_mlp": 1.05342698, + "epoch": 0.1623701423624471, + "flos": 505015644672.0, + "grad_norm": 0.07097995853224412, + "language_loss": 0.90216166, + "learning_rate": 0.0009547571524582838, + "loss": 0.91302609, + "num_input_tokens_seen": 69587184, + "router_z_loss_mlp": 0.33032227, + "step": 844, + "time_per_iteration": 2.6082894802093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095057, + "balance_loss_mlp": 1.06031966, + "epoch": 0.16256252404771065, + "flos": 496940493312.0, + "grad_norm": 0.06932052947515681, + "language_loss": 0.92511153, + "learning_rate": 0.0009546275645814512, + "loss": 0.9360621, + "num_input_tokens_seen": 69656560, + "router_z_loss_mlp": 0.34765625, + "step": 845, + "time_per_iteration": 2.5985872745513916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100013, + "balance_loss_mlp": 1.065418, + "epoch": 0.16275490573297421, + "flos": 502110061056.0, + "grad_norm": 0.07540183512891604, + "language_loss": 0.90294898, + "learning_rate": 0.0009544978002025446, + "loss": 0.91394913, + "num_input_tokens_seen": 69723872, + "router_z_loss_mlp": 0.34619141, + "step": 846, + "time_per_iteration": 2.5778391361236572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096289, + "balance_loss_mlp": 1.06174231, + "epoch": 0.16294728741823777, + "flos": 506952179712.0, + "grad_norm": 0.06018935314915502, + "language_loss": 0.87532055, + "learning_rate": 0.0009543678593719434, + "loss": 0.8862834, + "num_input_tokens_seen": 69795504, + "router_z_loss_mlp": 0.34570312, + "step": 847, + "time_per_iteration": 2.697566270828247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102624, + "balance_loss_mlp": 1.06824434, + "epoch": 0.16313966910350133, + "flos": 509418395136.0, + "grad_norm": 0.054217985504269955, + "language_loss": 0.8754853, + "learning_rate": 0.0009542377421400945, + "loss": 0.88651162, + "num_input_tokens_seen": 69873408, + "router_z_loss_mlp": 0.34375, + "step": 848, + "time_per_iteration": 2.786766290664673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104457, + "balance_loss_mlp": 1.06847942, + "epoch": 0.16333205078876492, + "flos": 543712352256.0, + "grad_norm": 0.06122856356214084, + "language_loss": 0.83524954, + "learning_rate": 0.0009541074485575145, + "loss": 0.84629411, + "num_input_tokens_seen": 69944112, + "router_z_loss_mlp": 0.35986328, + "step": 849, + "time_per_iteration": 2.713759183883667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098701, + "balance_loss_mlp": 1.06346297, + "epoch": 0.16352443247402848, + "flos": 507477477888.0, + "grad_norm": 0.06331477383231503, + "language_loss": 0.92240757, + "learning_rate": 0.0009539769786747874, + "loss": 0.93339461, + "num_input_tokens_seen": 70012288, + "router_z_loss_mlp": 0.35253906, + "step": 850, + "time_per_iteration": 2.589945077896118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100894, + "balance_loss_mlp": 1.06584692, + "epoch": 0.16371681415929204, + "flos": 541851420672.0, + "grad_norm": 0.06704648725492578, + "language_loss": 0.81567919, + "learning_rate": 0.0009538463325425665, + "loss": 0.82668811, + "num_input_tokens_seen": 70086560, + "router_z_loss_mlp": 0.35083008, + "step": 851, + "time_per_iteration": 2.6779844760894775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105544, + "balance_loss_mlp": 1.07042515, + "epoch": 0.1639091958445556, + "flos": 520501026816.0, + "grad_norm": 0.058426853420895056, + "language_loss": 0.8673842, + "learning_rate": 0.0009537155102115728, + "loss": 0.87843966, + "num_input_tokens_seen": 70153968, + "router_z_loss_mlp": 0.35131836, + "step": 852, + "time_per_iteration": 2.5614206790924072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106136, + "balance_loss_mlp": 1.07175565, + "epoch": 0.16410157752981916, + "flos": 547149026304.0, + "grad_norm": 0.06460558975646845, + "language_loss": 0.83482397, + "learning_rate": 0.0009535845117325961, + "loss": 0.84588534, + "num_input_tokens_seen": 70222496, + "router_z_loss_mlp": 0.34423828, + "step": 853, + "time_per_iteration": 2.6453073024749756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098632, + "balance_loss_mlp": 1.06470513, + "epoch": 0.16429395921508272, + "flos": 582561828864.0, + "grad_norm": 0.052152281018199936, + "language_loss": 0.93584174, + "learning_rate": 0.0009534533371564946, + "loss": 0.94682807, + "num_input_tokens_seen": 70301680, + "router_z_loss_mlp": 0.33959961, + "step": 854, + "time_per_iteration": 2.75186824798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111206, + "balance_loss_mlp": 1.07670665, + "epoch": 0.16448634090034628, + "flos": 530678628864.0, + "grad_norm": 0.06475772966833339, + "language_loss": 0.8907218, + "learning_rate": 0.0009533219865341949, + "loss": 0.90183383, + "num_input_tokens_seen": 70371152, + "router_z_loss_mlp": 0.3449707, + "step": 855, + "time_per_iteration": 2.581479787826538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108852, + "balance_loss_mlp": 1.07285094, + "epoch": 0.16467872258560984, + "flos": 491623948800.0, + "grad_norm": 0.06378602693040462, + "language_loss": 0.87287533, + "learning_rate": 0.0009531904599166916, + "loss": 0.88396388, + "num_input_tokens_seen": 70440832, + "router_z_loss_mlp": 0.36035156, + "step": 856, + "time_per_iteration": 2.6429831981658936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107008, + "balance_loss_mlp": 1.07141232, + "epoch": 0.16487110427087343, + "flos": 506015216640.0, + "grad_norm": 0.07162133431974482, + "language_loss": 0.85139728, + "learning_rate": 0.0009530587573550478, + "loss": 0.86246729, + "num_input_tokens_seen": 70507424, + "router_z_loss_mlp": 0.35620117, + "step": 857, + "time_per_iteration": 2.5667338371276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071319, + "balance_loss_mlp": 1.05434394, + "epoch": 0.16506348595613698, + "flos": 1432006553088.0, + "grad_norm": 0.02717136097410494, + "language_loss": 0.74319386, + "learning_rate": 0.0009529268789003953, + "loss": 0.75390708, + "num_input_tokens_seen": 70742320, + "router_z_loss_mlp": 0.16992188, + "step": 858, + "time_per_iteration": 5.02930474281311 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107487, + "balance_loss_mlp": 1.0740366, + "epoch": 0.16525586764140054, + "flos": 476890827264.0, + "grad_norm": 0.06438670275364486, + "language_loss": 0.90481895, + "learning_rate": 0.0009527948246039337, + "loss": 0.91589379, + "num_input_tokens_seen": 70808400, + "router_z_loss_mlp": 0.33447266, + "step": 859, + "time_per_iteration": 2.5222055912017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109782, + "balance_loss_mlp": 1.07618856, + "epoch": 0.1654482493266641, + "flos": 880737297408.0, + "grad_norm": 0.058857893791213665, + "language_loss": 0.88361865, + "learning_rate": 0.000952662594516931, + "loss": 0.8947165, + "num_input_tokens_seen": 70886192, + "router_z_loss_mlp": 0.33618164, + "step": 860, + "time_per_iteration": 3.065053701400757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109979, + "balance_loss_mlp": 1.07497942, + "epoch": 0.16564063101192766, + "flos": 626527028736.0, + "grad_norm": 0.058557043780191484, + "language_loss": 0.86803752, + "learning_rate": 0.0009525301886907234, + "loss": 0.87913728, + "num_input_tokens_seen": 70964816, + "router_z_loss_mlp": 0.34985352, + "step": 861, + "time_per_iteration": 2.873415470123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121945, + "balance_loss_mlp": 1.08537149, + "epoch": 0.16583301269719122, + "flos": 561250722816.0, + "grad_norm": 0.0761086770239273, + "language_loss": 0.8825953, + "learning_rate": 0.0009523976071767155, + "loss": 0.8938148, + "num_input_tokens_seen": 71037456, + "router_z_loss_mlp": 0.36572266, + "step": 862, + "time_per_iteration": 2.71508526802063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115561, + "balance_loss_mlp": 1.07994115, + "epoch": 0.16602539438245478, + "flos": 567510022656.0, + "grad_norm": 0.05388299317844869, + "language_loss": 0.88433009, + "learning_rate": 0.00095226485002638, + "loss": 0.8954857, + "num_input_tokens_seen": 71111872, + "router_z_loss_mlp": 0.35620117, + "step": 863, + "time_per_iteration": 2.7524497509002686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111173, + "balance_loss_mlp": 1.07617354, + "epoch": 0.16621777606771834, + "flos": 574589984256.0, + "grad_norm": 0.05833582522103205, + "language_loss": 0.89311475, + "learning_rate": 0.0009521319172912576, + "loss": 0.90422642, + "num_input_tokens_seen": 71187808, + "router_z_loss_mlp": 0.35009766, + "step": 864, + "time_per_iteration": 2.717493772506714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134798, + "balance_loss_mlp": 1.09846306, + "epoch": 0.16641015775298193, + "flos": 514292599296.0, + "grad_norm": 0.05644176285984134, + "language_loss": 0.94990546, + "learning_rate": 0.0009519988090229579, + "loss": 0.96125346, + "num_input_tokens_seen": 71261728, + "router_z_loss_mlp": 0.36352539, + "step": 865, + "time_per_iteration": 2.6850624084472656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133116, + "balance_loss_mlp": 1.09668565, + "epoch": 0.1666025394382455, + "flos": 621395338752.0, + "grad_norm": 0.05816643645022503, + "language_loss": 0.88684535, + "learning_rate": 0.0009518655252731576, + "loss": 0.89817655, + "num_input_tokens_seen": 71338352, + "router_z_loss_mlp": 0.36450195, + "step": 866, + "time_per_iteration": 2.7240021228790283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124082, + "balance_loss_mlp": 1.0882715, + "epoch": 0.16679492112350905, + "flos": 548528329728.0, + "grad_norm": 0.06128727898968579, + "language_loss": 0.9070124, + "learning_rate": 0.0009517320660936022, + "loss": 0.91825324, + "num_input_tokens_seen": 71416544, + "router_z_loss_mlp": 0.35839844, + "step": 867, + "time_per_iteration": 2.6959731578826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118134, + "balance_loss_mlp": 1.08260965, + "epoch": 0.1669873028087726, + "flos": 665379477504.0, + "grad_norm": 0.05857722537468161, + "language_loss": 0.83557463, + "learning_rate": 0.0009515984315361051, + "loss": 0.84675598, + "num_input_tokens_seen": 71494080, + "router_z_loss_mlp": 0.35546875, + "step": 868, + "time_per_iteration": 2.813674211502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122458, + "balance_loss_mlp": 1.08638549, + "epoch": 0.16717968449403617, + "flos": 538305647616.0, + "grad_norm": 0.06553445455365839, + "language_loss": 0.87103701, + "learning_rate": 0.000951464621652548, + "loss": 0.88226151, + "num_input_tokens_seen": 71562672, + "router_z_loss_mlp": 0.36083984, + "step": 869, + "time_per_iteration": 2.674333333969116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111253, + "balance_loss_mlp": 1.07757819, + "epoch": 0.16737206617929973, + "flos": 529833235968.0, + "grad_norm": 0.059309523866322815, + "language_loss": 0.78951609, + "learning_rate": 0.0009513306364948804, + "loss": 0.80064136, + "num_input_tokens_seen": 71641904, + "router_z_loss_mlp": 0.34985352, + "step": 870, + "time_per_iteration": 2.7519431114196777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011143, + "balance_loss_mlp": 1.07953846, + "epoch": 0.1675644478645633, + "flos": 480529732608.0, + "grad_norm": 0.06711563999134491, + "language_loss": 0.89559376, + "learning_rate": 0.0009511964761151197, + "loss": 0.90673673, + "num_input_tokens_seen": 71709616, + "router_z_loss_mlp": 0.34814453, + "step": 871, + "time_per_iteration": 2.544520854949951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113298, + "balance_loss_mlp": 1.07820272, + "epoch": 0.16775682954982685, + "flos": 494311334400.0, + "grad_norm": 0.06484202096701225, + "language_loss": 0.9050945, + "learning_rate": 0.0009510621405653521, + "loss": 0.91622752, + "num_input_tokens_seen": 71776592, + "router_z_loss_mlp": 0.35131836, + "step": 872, + "time_per_iteration": 2.594224452972412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106918, + "balance_loss_mlp": 1.07265687, + "epoch": 0.1679492112350904, + "flos": 751694846976.0, + "grad_norm": 0.060317450015561574, + "language_loss": 0.847211, + "learning_rate": 0.0009509276298977309, + "loss": 0.85828018, + "num_input_tokens_seen": 71856352, + "router_z_loss_mlp": 0.34277344, + "step": 873, + "time_per_iteration": 2.9428915977478027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110568, + "balance_loss_mlp": 1.07261181, + "epoch": 0.168141592920354, + "flos": 1135413075456.0, + "grad_norm": 0.05441785661992682, + "language_loss": 0.81867516, + "learning_rate": 0.0009507929441644778, + "loss": 0.82978088, + "num_input_tokens_seen": 71948480, + "router_z_loss_mlp": 0.37939453, + "step": 874, + "time_per_iteration": 3.52008318901062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101336, + "balance_loss_mlp": 1.06640816, + "epoch": 0.16833397460561755, + "flos": 632114205696.0, + "grad_norm": 0.06557720885733571, + "language_loss": 0.86201179, + "learning_rate": 0.0009506580834178826, + "loss": 0.87302518, + "num_input_tokens_seen": 72019200, + "router_z_loss_mlp": 0.34936523, + "step": 875, + "time_per_iteration": 2.7744014263153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110472, + "balance_loss_mlp": 1.06817079, + "epoch": 0.1685263562908811, + "flos": 541171943424.0, + "grad_norm": 0.06007828909359903, + "language_loss": 0.91612709, + "learning_rate": 0.0009505230477103028, + "loss": 0.92717427, + "num_input_tokens_seen": 72088672, + "router_z_loss_mlp": 0.36547852, + "step": 876, + "time_per_iteration": 2.6593635082244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103279, + "balance_loss_mlp": 1.06703997, + "epoch": 0.16871873797614467, + "flos": 619036812288.0, + "grad_norm": 0.08702038824672748, + "language_loss": 0.81312418, + "learning_rate": 0.0009503878370941641, + "loss": 0.824157, + "num_input_tokens_seen": 72159952, + "router_z_loss_mlp": 0.36206055, + "step": 877, + "time_per_iteration": 2.7511024475097656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094199, + "balance_loss_mlp": 1.05986643, + "epoch": 0.16891111966140823, + "flos": 606067107840.0, + "grad_norm": 0.06953183101172467, + "language_loss": 0.88841844, + "learning_rate": 0.0009502524516219595, + "loss": 0.89936042, + "num_input_tokens_seen": 72231648, + "router_z_loss_mlp": 0.34375, + "step": 878, + "time_per_iteration": 2.697455406188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091575, + "balance_loss_mlp": 1.05757689, + "epoch": 0.1691035013466718, + "flos": 552058136064.0, + "grad_norm": 0.0721678347454753, + "language_loss": 0.89980447, + "learning_rate": 0.0009501168913462506, + "loss": 0.91072023, + "num_input_tokens_seen": 72298608, + "router_z_loss_mlp": 0.34008789, + "step": 879, + "time_per_iteration": 2.6825287342071533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080465, + "balance_loss_mlp": 1.06263125, + "epoch": 0.16929588303193535, + "flos": 1475544121344.0, + "grad_norm": 0.044515803528062385, + "language_loss": 0.79121923, + "learning_rate": 0.0009499811563196665, + "loss": 0.80202389, + "num_input_tokens_seen": 72525312, + "router_z_loss_mlp": 0.17871094, + "step": 880, + "time_per_iteration": 4.825777769088745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081319, + "balance_loss_mlp": 1.0464623, + "epoch": 0.1694882647171989, + "flos": 925850456064.0, + "grad_norm": 0.06491790696384477, + "language_loss": 0.85360616, + "learning_rate": 0.0009498452465949042, + "loss": 0.86441934, + "num_input_tokens_seen": 72612976, + "router_z_loss_mlp": 0.34887695, + "step": 881, + "time_per_iteration": 3.2700376510620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086319, + "balance_loss_mlp": 1.05227244, + "epoch": 0.1696806464024625, + "flos": 545829359616.0, + "grad_norm": 0.057533624801199786, + "language_loss": 0.916857, + "learning_rate": 0.0009497091622247285, + "loss": 0.92772019, + "num_input_tokens_seen": 72686800, + "router_z_loss_mlp": 0.34082031, + "step": 882, + "time_per_iteration": 2.711721181869507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085559, + "balance_loss_mlp": 1.05184698, + "epoch": 0.16987302808772606, + "flos": 528970466304.0, + "grad_norm": 0.08384615451013337, + "language_loss": 0.93744707, + "learning_rate": 0.0009495729032619723, + "loss": 0.94830269, + "num_input_tokens_seen": 72759360, + "router_z_loss_mlp": 0.33740234, + "step": 883, + "time_per_iteration": 2.688525438308716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084621, + "balance_loss_mlp": 1.05062199, + "epoch": 0.17006540977298962, + "flos": 754855096320.0, + "grad_norm": 0.06073677328113264, + "language_loss": 0.84419179, + "learning_rate": 0.0009494364697595354, + "loss": 0.85503805, + "num_input_tokens_seen": 72831424, + "router_z_loss_mlp": 0.34033203, + "step": 884, + "time_per_iteration": 2.9112000465393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092434, + "balance_loss_mlp": 1.05750597, + "epoch": 0.17025779145825318, + "flos": 558532813824.0, + "grad_norm": 0.06728326387015754, + "language_loss": 0.89818925, + "learning_rate": 0.0009492998617703867, + "loss": 0.90911365, + "num_input_tokens_seen": 72901536, + "router_z_loss_mlp": 0.34936523, + "step": 885, + "time_per_iteration": 2.6760926246643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093981, + "balance_loss_mlp": 1.06045985, + "epoch": 0.17045017314351674, + "flos": 511963186176.0, + "grad_norm": 0.0687386743468794, + "language_loss": 0.87971282, + "learning_rate": 0.0009491630793475619, + "loss": 0.89065266, + "num_input_tokens_seen": 72970480, + "router_z_loss_mlp": 0.33520508, + "step": 886, + "time_per_iteration": 2.59726619720459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094476, + "balance_loss_mlp": 1.06011951, + "epoch": 0.1706425548287803, + "flos": 508674898944.0, + "grad_norm": 0.058204707286146434, + "language_loss": 0.85722017, + "learning_rate": 0.0009490261225441643, + "loss": 0.8681649, + "num_input_tokens_seen": 73053376, + "router_z_loss_mlp": 0.34350586, + "step": 887, + "time_per_iteration": 2.900501012802124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087689, + "balance_loss_mlp": 1.0545013, + "epoch": 0.17083493651404386, + "flos": 717016776192.0, + "grad_norm": 0.05310353290702558, + "language_loss": 0.90775931, + "learning_rate": 0.0009488889914133656, + "loss": 0.9186362, + "num_input_tokens_seen": 73136032, + "router_z_loss_mlp": 0.33203125, + "step": 888, + "time_per_iteration": 2.992532968521118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089684, + "balance_loss_mlp": 1.05520868, + "epoch": 0.17102731819930742, + "flos": 558852908544.0, + "grad_norm": 0.047287767355612194, + "language_loss": 0.88680297, + "learning_rate": 0.0009487516860084047, + "loss": 0.89769983, + "num_input_tokens_seen": 73208544, + "router_z_loss_mlp": 0.34472656, + "step": 889, + "time_per_iteration": 2.7029643058776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082798, + "balance_loss_mlp": 1.04858518, + "epoch": 0.17121969988457098, + "flos": 494542679040.0, + "grad_norm": 0.0765590367769256, + "language_loss": 0.88680983, + "learning_rate": 0.0009486142063825884, + "loss": 0.89763772, + "num_input_tokens_seen": 73274336, + "router_z_loss_mlp": 0.34228516, + "step": 890, + "time_per_iteration": 2.5640931129455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038716, + "balance_loss_mlp": 1.02402985, + "epoch": 0.17141208156983456, + "flos": 1548088063488.0, + "grad_norm": 0.02832238153451814, + "language_loss": 0.72426212, + "learning_rate": 0.0009484765525892909, + "loss": 0.7346493, + "num_input_tokens_seen": 73506320, + "router_z_loss_mlp": 0.14648438, + "step": 891, + "time_per_iteration": 4.979609251022339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108867, + "balance_loss_mlp": 1.0540278, + "epoch": 0.17160446325509812, + "flos": 619282713600.0, + "grad_norm": 0.06449268303648867, + "language_loss": 0.90758598, + "learning_rate": 0.0009483387246819542, + "loss": 0.91847265, + "num_input_tokens_seen": 73578048, + "router_z_loss_mlp": 0.34667969, + "step": 892, + "time_per_iteration": 2.731332540512085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032556, + "balance_loss_mlp": 1.01767898, + "epoch": 0.17179684494036168, + "flos": 1381026972672.0, + "grad_norm": 0.016720826063608682, + "language_loss": 0.82285583, + "learning_rate": 0.0009482007227140877, + "loss": 0.83318138, + "num_input_tokens_seen": 73798640, + "router_z_loss_mlp": 0.1484375, + "step": 893, + "time_per_iteration": 4.641844987869263 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097484, + "balance_loss_mlp": 1.06386662, + "epoch": 0.17198922662562524, + "flos": 492386383872.0, + "grad_norm": 0.05711411270468228, + "language_loss": 0.89587665, + "learning_rate": 0.0009480625467392688, + "loss": 0.90685147, + "num_input_tokens_seen": 73867328, + "router_z_loss_mlp": 0.33642578, + "step": 894, + "time_per_iteration": 2.6310250759124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033123, + "balance_loss_mlp": 1.01795936, + "epoch": 0.1721816083108888, + "flos": 1457529914880.0, + "grad_norm": 0.013728573618451478, + "language_loss": 0.77994668, + "learning_rate": 0.0009479241968111421, + "loss": 0.79027796, + "num_input_tokens_seen": 74093376, + "router_z_loss_mlp": 0.15136719, + "step": 895, + "time_per_iteration": 4.7525646686553955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132534, + "balance_loss_mlp": 1.09834456, + "epoch": 0.17237398999615236, + "flos": 527853030912.0, + "grad_norm": 0.05821127752563967, + "language_loss": 0.87793648, + "learning_rate": 0.0009477856729834196, + "loss": 0.88926184, + "num_input_tokens_seen": 74169136, + "router_z_loss_mlp": 0.34228516, + "step": 896, + "time_per_iteration": 2.7015438079833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132108, + "balance_loss_mlp": 1.09901524, + "epoch": 0.17256637168141592, + "flos": 603644562432.0, + "grad_norm": 0.08337200045302615, + "language_loss": 0.9056648, + "learning_rate": 0.0009476469753098809, + "loss": 0.91698587, + "num_input_tokens_seen": 74236912, + "router_z_loss_mlp": 0.33105469, + "step": 897, + "time_per_iteration": 2.7035813331604004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125108, + "balance_loss_mlp": 1.08922589, + "epoch": 0.17275875336667948, + "flos": 509437334016.0, + "grad_norm": 0.05742024530278536, + "language_loss": 0.874506, + "learning_rate": 0.0009475081038443738, + "loss": 0.88575709, + "num_input_tokens_seen": 74305968, + "router_z_loss_mlp": 0.35913086, + "step": 898, + "time_per_iteration": 2.584437370300293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115669, + "balance_loss_mlp": 1.07971573, + "epoch": 0.17295113505194307, + "flos": 664951693824.0, + "grad_norm": 0.06535241228499304, + "language_loss": 0.85809892, + "learning_rate": 0.0009473690586408124, + "loss": 0.86925566, + "num_input_tokens_seen": 74384144, + "router_z_loss_mlp": 0.35986328, + "step": 899, + "time_per_iteration": 2.83156418800354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116393, + "balance_loss_mlp": 1.08084452, + "epoch": 0.17314351673720663, + "flos": 555125253120.0, + "grad_norm": 0.0683413775827569, + "language_loss": 0.86639923, + "learning_rate": 0.0009472298397531792, + "loss": 0.87756318, + "num_input_tokens_seen": 74455040, + "router_z_loss_mlp": 0.35546875, + "step": 900, + "time_per_iteration": 2.6944193840026855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123337, + "balance_loss_mlp": 1.08635855, + "epoch": 0.17333589842247019, + "flos": 503361326592.0, + "grad_norm": 0.09670394547256775, + "language_loss": 0.87118709, + "learning_rate": 0.0009470904472355235, + "loss": 0.88242042, + "num_input_tokens_seen": 74525248, + "router_z_loss_mlp": 0.36987305, + "step": 901, + "time_per_iteration": 2.637882709503174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114178, + "balance_loss_mlp": 1.07982159, + "epoch": 0.17352828010773375, + "flos": 555924003840.0, + "grad_norm": 0.06358596699153923, + "language_loss": 0.79912066, + "learning_rate": 0.0009469508811419626, + "loss": 0.81026244, + "num_input_tokens_seen": 74597328, + "router_z_loss_mlp": 0.34399414, + "step": 902, + "time_per_iteration": 2.726072311401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077408, + "balance_loss_mlp": 1.06453359, + "epoch": 0.1737206617929973, + "flos": 1553711556096.0, + "grad_norm": 0.030950293127884103, + "language_loss": 0.7161383, + "learning_rate": 0.0009468111415266806, + "loss": 0.72691238, + "num_input_tokens_seen": 74819664, + "router_z_loss_mlp": 0.12890625, + "step": 903, + "time_per_iteration": 4.791790723800659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109546, + "balance_loss_mlp": 1.07445073, + "epoch": 0.17391304347826086, + "flos": 516390667776.0, + "grad_norm": 0.06883251868009001, + "language_loss": 0.84220147, + "learning_rate": 0.0009466712284439292, + "loss": 0.85329694, + "num_input_tokens_seen": 74896224, + "router_z_loss_mlp": 0.35131836, + "step": 904, + "time_per_iteration": 2.7648017406463623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104504, + "balance_loss_mlp": 1.06995738, + "epoch": 0.17410542516352442, + "flos": 540773273088.0, + "grad_norm": 0.06988851938988141, + "language_loss": 0.8903957, + "learning_rate": 0.0009465311419480276, + "loss": 0.90144074, + "num_input_tokens_seen": 74966560, + "router_z_loss_mlp": 0.34545898, + "step": 905, + "time_per_iteration": 2.725829601287842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098399, + "balance_loss_mlp": 1.0637325, + "epoch": 0.17429780684878798, + "flos": 623542869504.0, + "grad_norm": 0.06312030659776342, + "language_loss": 0.88624233, + "learning_rate": 0.0009463908820933622, + "loss": 0.89722633, + "num_input_tokens_seen": 75045248, + "router_z_loss_mlp": 0.34692383, + "step": 906, + "time_per_iteration": 2.8389482498168945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093859, + "balance_loss_mlp": 1.05900264, + "epoch": 0.17449018853405157, + "flos": 575368386048.0, + "grad_norm": 0.056066721215551084, + "language_loss": 0.83138871, + "learning_rate": 0.0009462504489343868, + "loss": 0.84232736, + "num_input_tokens_seen": 75123952, + "router_z_loss_mlp": 0.34863281, + "step": 907, + "time_per_iteration": 2.863349437713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086469, + "balance_loss_mlp": 1.05199337, + "epoch": 0.17468257021931513, + "flos": 533499844608.0, + "grad_norm": 0.07604190500703253, + "language_loss": 0.894853, + "learning_rate": 0.0009461098425256222, + "loss": 0.90571761, + "num_input_tokens_seen": 75191728, + "router_z_loss_mlp": 0.3449707, + "step": 908, + "time_per_iteration": 2.5941011905670166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108834, + "balance_loss_mlp": 1.05345941, + "epoch": 0.1748749519045787, + "flos": 540496848384.0, + "grad_norm": 0.050694136543679796, + "language_loss": 0.85873353, + "learning_rate": 0.0009459690629216567, + "loss": 0.86961693, + "num_input_tokens_seen": 75262224, + "router_z_loss_mlp": 0.34887695, + "step": 909, + "time_per_iteration": 2.6097571849823 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109032, + "balance_loss_mlp": 1.0570612, + "epoch": 0.17506733358984225, + "flos": 498373641216.0, + "grad_norm": 0.0569262349138849, + "language_loss": 0.88138729, + "learning_rate": 0.0009458281101771457, + "loss": 0.89229047, + "num_input_tokens_seen": 75329760, + "router_z_loss_mlp": 0.33276367, + "step": 910, + "time_per_iteration": 2.5904784202575684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091588, + "balance_loss_mlp": 1.05744696, + "epoch": 0.1752597152751058, + "flos": 622621873152.0, + "grad_norm": 0.06350455217589325, + "language_loss": 0.8266046, + "learning_rate": 0.0009456869843468122, + "loss": 0.83752048, + "num_input_tokens_seen": 75407920, + "router_z_loss_mlp": 0.34179688, + "step": 911, + "time_per_iteration": 2.8930556774139404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090023, + "balance_loss_mlp": 1.05476046, + "epoch": 0.17545209696036937, + "flos": 520717814784.0, + "grad_norm": 0.07844481886152296, + "language_loss": 0.79097009, + "learning_rate": 0.0009455456854854459, + "loss": 0.80187035, + "num_input_tokens_seen": 75476752, + "router_z_loss_mlp": 0.35302734, + "step": 912, + "time_per_iteration": 2.5984511375427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096028, + "balance_loss_mlp": 1.0631026, + "epoch": 0.17564447864563293, + "flos": 461750270976.0, + "grad_norm": 0.05516798292623818, + "language_loss": 0.84505737, + "learning_rate": 0.0009454042136479039, + "loss": 0.85601771, + "num_input_tokens_seen": 75542944, + "router_z_loss_mlp": 0.3293457, + "step": 913, + "time_per_iteration": 2.586195945739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085981, + "balance_loss_mlp": 1.05286503, + "epoch": 0.1758368603308965, + "flos": 480416251392.0, + "grad_norm": 0.05301404729603274, + "language_loss": 0.83308446, + "learning_rate": 0.0009452625688891103, + "loss": 0.84394431, + "num_input_tokens_seen": 75609840, + "router_z_loss_mlp": 0.33129883, + "step": 914, + "time_per_iteration": 2.5374929904937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052517, + "balance_loss_mlp": 1.038975, + "epoch": 0.17602924201616005, + "flos": 1478160133632.0, + "grad_norm": 0.03507986977902886, + "language_loss": 0.78734738, + "learning_rate": 0.0009451207512640567, + "loss": 0.79787254, + "num_input_tokens_seen": 75819312, + "router_z_loss_mlp": 0.13574219, + "step": 915, + "time_per_iteration": 4.5561583042144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097389, + "balance_loss_mlp": 1.06226993, + "epoch": 0.17622162370142364, + "flos": 602010593280.0, + "grad_norm": 0.06815502965849334, + "language_loss": 0.93451321, + "learning_rate": 0.0009449787608278015, + "loss": 0.94548714, + "num_input_tokens_seen": 75893984, + "router_z_loss_mlp": 0.35131836, + "step": 916, + "time_per_iteration": 2.7807908058166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092918, + "balance_loss_mlp": 1.0588007, + "epoch": 0.1764140053866872, + "flos": 442473214464.0, + "grad_norm": 0.0637680644109211, + "language_loss": 0.92700857, + "learning_rate": 0.0009448365976354704, + "loss": 0.93793774, + "num_input_tokens_seen": 75958944, + "router_z_loss_mlp": 0.34130859, + "step": 917, + "time_per_iteration": 2.4800124168395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105643, + "balance_loss_mlp": 1.06909323, + "epoch": 0.17660638707195075, + "flos": 500362610688.0, + "grad_norm": 0.07080486598597346, + "language_loss": 0.90158784, + "learning_rate": 0.0009446942617422558, + "loss": 0.91264427, + "num_input_tokens_seen": 76024240, + "router_z_loss_mlp": 0.36547852, + "step": 918, + "time_per_iteration": 2.5415430068969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101469, + "balance_loss_mlp": 1.06766129, + "epoch": 0.17679876875721431, + "flos": 538621360128.0, + "grad_norm": 0.060000223973742446, + "language_loss": 0.86201262, + "learning_rate": 0.0009445517532034176, + "loss": 0.87302732, + "num_input_tokens_seen": 76095264, + "router_z_loss_mlp": 0.33789062, + "step": 919, + "time_per_iteration": 2.6849868297576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121669, + "balance_loss_mlp": 1.08569145, + "epoch": 0.17699115044247787, + "flos": 497477376000.0, + "grad_norm": 0.08221632690768264, + "language_loss": 0.89522099, + "learning_rate": 0.0009444090720742824, + "loss": 0.9064377, + "num_input_tokens_seen": 76163520, + "router_z_loss_mlp": 0.35986328, + "step": 920, + "time_per_iteration": 2.6034600734710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113572, + "balance_loss_mlp": 1.07883418, + "epoch": 0.17718353212774143, + "flos": 662444780544.0, + "grad_norm": 0.08029288241638204, + "language_loss": 0.88040781, + "learning_rate": 0.0009442662184102439, + "loss": 0.89154357, + "num_input_tokens_seen": 76233760, + "router_z_loss_mlp": 0.34741211, + "step": 921, + "time_per_iteration": 2.767352342605591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105947, + "balance_loss_mlp": 1.07309294, + "epoch": 0.177375913813005, + "flos": 582340658688.0, + "grad_norm": 0.0705507668945597, + "language_loss": 0.87951338, + "learning_rate": 0.000944123192266763, + "loss": 0.89057279, + "num_input_tokens_seen": 76310704, + "router_z_loss_mlp": 0.32836914, + "step": 922, + "time_per_iteration": 2.789315700531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108289, + "balance_loss_mlp": 1.0727644, + "epoch": 0.17756829549826855, + "flos": 552285098496.0, + "grad_norm": 0.06115562628552814, + "language_loss": 0.83835006, + "learning_rate": 0.0009439799936993671, + "loss": 0.84943295, + "num_input_tokens_seen": 76386992, + "router_z_loss_mlp": 0.35546875, + "step": 923, + "time_per_iteration": 2.7160987854003906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090669, + "balance_loss_mlp": 1.05733824, + "epoch": 0.17776067718353214, + "flos": 556060806144.0, + "grad_norm": 0.07059184324253498, + "language_loss": 0.88508618, + "learning_rate": 0.0009438366227636511, + "loss": 0.89599288, + "num_input_tokens_seen": 76453328, + "router_z_loss_mlp": 0.33349609, + "step": 924, + "time_per_iteration": 2.6319191455841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084897, + "balance_loss_mlp": 1.05163789, + "epoch": 0.1779530588687957, + "flos": 658161303552.0, + "grad_norm": 0.06263940487075517, + "language_loss": 0.86677843, + "learning_rate": 0.0009436930795152763, + "loss": 0.87762737, + "num_input_tokens_seen": 76529040, + "router_z_loss_mlp": 0.33276367, + "step": 925, + "time_per_iteration": 2.8063783645629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084326, + "balance_loss_mlp": 1.05159163, + "epoch": 0.17814544055405926, + "flos": 644187644928.0, + "grad_norm": 0.06448697412821461, + "language_loss": 0.8710525, + "learning_rate": 0.0009435493640099713, + "loss": 0.88189578, + "num_input_tokens_seen": 76604080, + "router_z_loss_mlp": 0.32739258, + "step": 926, + "time_per_iteration": 2.7599081993103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080787, + "balance_loss_mlp": 1.04664516, + "epoch": 0.17833782223932282, + "flos": 460672123392.0, + "grad_norm": 0.06497730431564504, + "language_loss": 0.84328961, + "learning_rate": 0.0009434054763035314, + "loss": 0.85409749, + "num_input_tokens_seen": 76674096, + "router_z_loss_mlp": 0.34155273, + "step": 927, + "time_per_iteration": 2.612910032272339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081381, + "balance_loss_mlp": 1.04740596, + "epoch": 0.17853020392458638, + "flos": 759212766720.0, + "grad_norm": 0.04594292129212818, + "language_loss": 0.85898727, + "learning_rate": 0.0009432614164518185, + "loss": 0.8698011, + "num_input_tokens_seen": 76752144, + "router_z_loss_mlp": 0.33984375, + "step": 928, + "time_per_iteration": 2.926981210708618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086846, + "balance_loss_mlp": 1.05153632, + "epoch": 0.17872258560984994, + "flos": 782320785408.0, + "grad_norm": 0.055185850673896385, + "language_loss": 0.84792197, + "learning_rate": 0.000943117184510762, + "loss": 0.85879046, + "num_input_tokens_seen": 76830240, + "router_z_loss_mlp": 0.35327148, + "step": 929, + "time_per_iteration": 2.995514154434204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039861, + "balance_loss_mlp": 1.02660513, + "epoch": 0.1789149672951135, + "flos": 1459095482880.0, + "grad_norm": 0.021362691821678215, + "language_loss": 0.78789961, + "learning_rate": 0.0009429727805363575, + "loss": 0.79829824, + "num_input_tokens_seen": 77062464, + "router_z_loss_mlp": 0.1328125, + "step": 930, + "time_per_iteration": 4.99839448928833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091678, + "balance_loss_mlp": 1.05739331, + "epoch": 0.17910734898037706, + "flos": 503598463488.0, + "grad_norm": 0.05761618473313655, + "language_loss": 0.88773429, + "learning_rate": 0.0009428282045846674, + "loss": 0.89865112, + "num_input_tokens_seen": 77136672, + "router_z_loss_mlp": 0.34301758, + "step": 931, + "time_per_iteration": 2.6966652870178223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087828, + "balance_loss_mlp": 1.05452061, + "epoch": 0.17929973066564064, + "flos": 745895264256.0, + "grad_norm": 0.05798282919409206, + "language_loss": 0.89983928, + "learning_rate": 0.0009426834567118214, + "loss": 0.91071755, + "num_input_tokens_seen": 77227040, + "router_z_loss_mlp": 0.33300781, + "step": 932, + "time_per_iteration": 3.072160482406616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092765, + "balance_loss_mlp": 1.05907631, + "epoch": 0.1794921123509042, + "flos": 712875893760.0, + "grad_norm": 0.055390897890994044, + "language_loss": 0.80879378, + "learning_rate": 0.0009425385369740155, + "loss": 0.81972146, + "num_input_tokens_seen": 77319392, + "router_z_loss_mlp": 0.3371582, + "step": 933, + "time_per_iteration": 3.0337042808532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092731, + "balance_loss_mlp": 1.05825567, + "epoch": 0.17968449403616776, + "flos": 632838763008.0, + "grad_norm": 0.0687685702394307, + "language_loss": 0.87443584, + "learning_rate": 0.0009423934454275125, + "loss": 0.8853631, + "num_input_tokens_seen": 77394688, + "router_z_loss_mlp": 0.3449707, + "step": 934, + "time_per_iteration": 2.7970879077911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089195, + "balance_loss_mlp": 1.05526757, + "epoch": 0.17987687572143132, + "flos": 536060602368.0, + "grad_norm": 0.08214865293258214, + "language_loss": 0.92215371, + "learning_rate": 0.0009422481821286418, + "loss": 0.93304563, + "num_input_tokens_seen": 77468288, + "router_z_loss_mlp": 0.33959961, + "step": 935, + "time_per_iteration": 2.7134642601013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091708, + "balance_loss_mlp": 1.05914021, + "epoch": 0.18006925740669488, + "flos": 537818227200.0, + "grad_norm": 0.0718764173736199, + "language_loss": 0.87967253, + "learning_rate": 0.0009421027471337998, + "loss": 0.89058959, + "num_input_tokens_seen": 77535840, + "router_z_loss_mlp": 0.32568359, + "step": 936, + "time_per_iteration": 2.608764171600342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098474, + "balance_loss_mlp": 1.06333113, + "epoch": 0.18026163909195844, + "flos": 539255757312.0, + "grad_norm": 0.06697051800305152, + "language_loss": 0.82882118, + "learning_rate": 0.0009419571404994493, + "loss": 0.83980596, + "num_input_tokens_seen": 77604000, + "router_z_loss_mlp": 0.3515625, + "step": 937, + "time_per_iteration": 2.620296001434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096306, + "balance_loss_mlp": 1.06240284, + "epoch": 0.180454020777222, + "flos": 500382959616.0, + "grad_norm": 0.08555714620461663, + "language_loss": 0.90948844, + "learning_rate": 0.00094181136228212, + "loss": 0.92045152, + "num_input_tokens_seen": 77671488, + "router_z_loss_mlp": 0.33935547, + "step": 938, + "time_per_iteration": 2.62837290763855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109732, + "balance_loss_mlp": 1.06415629, + "epoch": 0.18064640246248556, + "flos": 498689353728.0, + "grad_norm": 0.06983123921060745, + "language_loss": 0.86323059, + "learning_rate": 0.0009416654125384077, + "loss": 0.8742038, + "num_input_tokens_seen": 77746240, + "router_z_loss_mlp": 0.33154297, + "step": 939, + "time_per_iteration": 2.715686321258545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054723, + "balance_loss_mlp": 1.04242051, + "epoch": 0.18083878414774912, + "flos": 1518572358144.0, + "grad_norm": 0.027679884047562747, + "language_loss": 0.79772377, + "learning_rate": 0.0009415192913249752, + "loss": 0.80827093, + "num_input_tokens_seen": 77966080, + "router_z_loss_mlp": 0.12304688, + "step": 940, + "time_per_iteration": 4.941875219345093 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090642, + "balance_loss_mlp": 1.05728722, + "epoch": 0.1810311658330127, + "flos": 727006703616.0, + "grad_norm": 0.07011009980003599, + "language_loss": 0.84326053, + "learning_rate": 0.000941372998698552, + "loss": 0.85416698, + "num_input_tokens_seen": 78049200, + "router_z_loss_mlp": 0.33374023, + "step": 941, + "time_per_iteration": 2.931520938873291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094264, + "balance_loss_mlp": 1.0597409, + "epoch": 0.18122354751827627, + "flos": 564643726848.0, + "grad_norm": 0.08254502738164117, + "language_loss": 0.8207435, + "learning_rate": 0.0009412265347159336, + "loss": 0.8316862, + "num_input_tokens_seen": 78122752, + "router_z_loss_mlp": 0.34570312, + "step": 942, + "time_per_iteration": 2.696354627609253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091238, + "balance_loss_mlp": 1.05869377, + "epoch": 0.18141592920353983, + "flos": 519024208896.0, + "grad_norm": 0.05729066672306875, + "language_loss": 0.85217965, + "learning_rate": 0.0009410798994339829, + "loss": 0.86309201, + "num_input_tokens_seen": 78194064, + "router_z_loss_mlp": 0.32543945, + "step": 943, + "time_per_iteration": 2.6009600162506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088013, + "balance_loss_mlp": 1.0545156, + "epoch": 0.1816083108888034, + "flos": 512219261952.0, + "grad_norm": 0.05342615519744699, + "language_loss": 0.88234782, + "learning_rate": 0.000940933092909628, + "loss": 0.89322793, + "num_input_tokens_seen": 78262048, + "router_z_loss_mlp": 0.33520508, + "step": 944, + "time_per_iteration": 2.618419647216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095526, + "balance_loss_mlp": 1.06286263, + "epoch": 0.18180069257406695, + "flos": 492144864768.0, + "grad_norm": 0.053227732023653135, + "language_loss": 0.8393383, + "learning_rate": 0.0009407861151998649, + "loss": 0.85029352, + "num_input_tokens_seen": 78330624, + "router_z_loss_mlp": 0.32666016, + "step": 945, + "time_per_iteration": 2.5718705654144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097774, + "balance_loss_mlp": 1.06406188, + "epoch": 0.1819930742593305, + "flos": 569891870208.0, + "grad_norm": 0.05775782434029923, + "language_loss": 0.86156505, + "learning_rate": 0.0009406389663617552, + "loss": 0.87254274, + "num_input_tokens_seen": 78400672, + "router_z_loss_mlp": 0.33740234, + "step": 946, + "time_per_iteration": 2.66513729095459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097522, + "balance_loss_mlp": 1.06433463, + "epoch": 0.18218545594459407, + "flos": 605693168640.0, + "grad_norm": 0.06350431386522506, + "language_loss": 0.85736459, + "learning_rate": 0.000940491646452427, + "loss": 0.86833978, + "num_input_tokens_seen": 78467952, + "router_z_loss_mlp": 0.33203125, + "step": 947, + "time_per_iteration": 2.715071201324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103916, + "balance_loss_mlp": 1.07010818, + "epoch": 0.18237783762985763, + "flos": 548419230720.0, + "grad_norm": 0.06277969821047595, + "language_loss": 0.91195452, + "learning_rate": 0.000940344155529075, + "loss": 0.92299366, + "num_input_tokens_seen": 78538928, + "router_z_loss_mlp": 0.33837891, + "step": 948, + "time_per_iteration": 2.6502938270568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099574, + "balance_loss_mlp": 1.06550407, + "epoch": 0.1825702193151212, + "flos": 450509078016.0, + "grad_norm": 0.06933176029299125, + "language_loss": 0.87683523, + "learning_rate": 0.0009401964936489605, + "loss": 0.88783091, + "num_input_tokens_seen": 78602144, + "router_z_loss_mlp": 0.34106445, + "step": 949, + "time_per_iteration": 2.5181798934936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084135, + "balance_loss_mlp": 1.05247355, + "epoch": 0.18276260100038477, + "flos": 588962313216.0, + "grad_norm": 0.07980064544074586, + "language_loss": 0.85422635, + "learning_rate": 0.0009400486608694108, + "loss": 0.86506772, + "num_input_tokens_seen": 78673152, + "router_z_loss_mlp": 0.31640625, + "step": 950, + "time_per_iteration": 2.7189955711364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087871, + "balance_loss_mlp": 1.05384839, + "epoch": 0.18295498268564833, + "flos": 786988376064.0, + "grad_norm": 0.05265351460276348, + "language_loss": 0.87225658, + "learning_rate": 0.0009399006572478195, + "loss": 0.88313532, + "num_input_tokens_seen": 78753872, + "router_z_loss_mlp": 0.34033203, + "step": 951, + "time_per_iteration": 3.0805773735046387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086089, + "balance_loss_mlp": 1.05218577, + "epoch": 0.1831473643709119, + "flos": 577878271488.0, + "grad_norm": 0.059447924131550096, + "language_loss": 0.91242015, + "learning_rate": 0.0009397524828416468, + "loss": 0.92328107, + "num_input_tokens_seen": 78822640, + "router_z_loss_mlp": 0.33935547, + "step": 952, + "time_per_iteration": 2.6567108631134033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082155, + "balance_loss_mlp": 1.04801321, + "epoch": 0.18333974605617545, + "flos": 566622521856.0, + "grad_norm": 0.05513512337372911, + "language_loss": 0.96184212, + "learning_rate": 0.0009396041377084192, + "loss": 0.97266364, + "num_input_tokens_seen": 78893792, + "router_z_loss_mlp": 0.34179688, + "step": 953, + "time_per_iteration": 2.6937921047210693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079403, + "balance_loss_mlp": 1.04478431, + "epoch": 0.183532127741439, + "flos": 526725421056.0, + "grad_norm": 0.07204875194033089, + "language_loss": 0.87840325, + "learning_rate": 0.0009394556219057295, + "loss": 0.88919723, + "num_input_tokens_seen": 78964752, + "router_z_loss_mlp": 0.34667969, + "step": 954, + "time_per_iteration": 2.6962215900421143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107777, + "balance_loss_mlp": 1.04272258, + "epoch": 0.18372450942670257, + "flos": 594259918848.0, + "grad_norm": 0.07227161235955501, + "language_loss": 0.83883446, + "learning_rate": 0.0009393069354912362, + "loss": 0.84961212, + "num_input_tokens_seen": 79034400, + "router_z_loss_mlp": 0.35058594, + "step": 955, + "time_per_iteration": 2.718308925628662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081248, + "balance_loss_mlp": 1.04677236, + "epoch": 0.18391689111196613, + "flos": 644720145408.0, + "grad_norm": 0.07091738302891186, + "language_loss": 0.82511717, + "learning_rate": 0.0009391580785226649, + "loss": 0.83592963, + "num_input_tokens_seen": 79109488, + "router_z_loss_mlp": 0.34521484, + "step": 956, + "time_per_iteration": 2.907367467880249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077991, + "balance_loss_mlp": 1.06216049, + "epoch": 0.18410927279722972, + "flos": 1456246563840.0, + "grad_norm": 0.048423099914415325, + "language_loss": 0.79340446, + "learning_rate": 0.0009390090510578067, + "loss": 0.80418444, + "num_input_tokens_seen": 79327712, + "router_z_loss_mlp": 0.15820312, + "step": 957, + "time_per_iteration": 4.78663969039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091525, + "balance_loss_mlp": 1.05702567, + "epoch": 0.18430165448249328, + "flos": 658437728256.0, + "grad_norm": 0.09319397884021513, + "language_loss": 0.86484683, + "learning_rate": 0.0009388598531545196, + "loss": 0.8757621, + "num_input_tokens_seen": 79401504, + "router_z_loss_mlp": 0.34545898, + "step": 958, + "time_per_iteration": 2.8470118045806885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087326, + "balance_loss_mlp": 1.05285025, + "epoch": 0.18449403616775684, + "flos": 517679811072.0, + "grad_norm": 0.07377492103556435, + "language_loss": 0.86076611, + "learning_rate": 0.000938710484870727, + "loss": 0.87163937, + "num_input_tokens_seen": 79466688, + "router_z_loss_mlp": 0.3449707, + "step": 959, + "time_per_iteration": 2.5930731296539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090986, + "balance_loss_mlp": 1.05672574, + "epoch": 0.1846864178530204, + "flos": 552481537536.0, + "grad_norm": 0.06589557505977534, + "language_loss": 0.86379164, + "learning_rate": 0.0009385609462644189, + "loss": 0.8747015, + "num_input_tokens_seen": 79540288, + "router_z_loss_mlp": 0.34277344, + "step": 960, + "time_per_iteration": 2.688706636428833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096456, + "balance_loss_mlp": 1.06212378, + "epoch": 0.18487879953828396, + "flos": 465930441216.0, + "grad_norm": 0.0643439417949763, + "language_loss": 0.86035949, + "learning_rate": 0.0009384112373936514, + "loss": 0.871324, + "num_input_tokens_seen": 79611872, + "router_z_loss_mlp": 0.34326172, + "step": 961, + "time_per_iteration": 4.0801496505737305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095895, + "balance_loss_mlp": 1.06132412, + "epoch": 0.18507118122354752, + "flos": 648200489472.0, + "grad_norm": 0.0614591664996872, + "language_loss": 0.91820455, + "learning_rate": 0.0009382613583165467, + "loss": 0.92916346, + "num_input_tokens_seen": 79689504, + "router_z_loss_mlp": 0.34594727, + "step": 962, + "time_per_iteration": 2.790069341659546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113098, + "balance_loss_mlp": 1.07921863, + "epoch": 0.18526356290881107, + "flos": 626486330880.0, + "grad_norm": 0.06374556186760763, + "language_loss": 0.89594233, + "learning_rate": 0.0009381113090912928, + "loss": 0.90707326, + "num_input_tokens_seen": 79759264, + "router_z_loss_mlp": 0.33886719, + "step": 963, + "time_per_iteration": 2.6891098022460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117951, + "balance_loss_mlp": 1.08559799, + "epoch": 0.18545594459407463, + "flos": 432497843712.0, + "grad_norm": 0.06491910119233056, + "language_loss": 0.90103394, + "learning_rate": 0.000937961089776144, + "loss": 0.91221344, + "num_input_tokens_seen": 79824464, + "router_z_loss_mlp": 0.32348633, + "step": 964, + "time_per_iteration": 2.5821962356567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124554, + "balance_loss_mlp": 1.08926833, + "epoch": 0.1856483262793382, + "flos": 748720862208.0, + "grad_norm": 0.06849062336391444, + "language_loss": 0.829036, + "learning_rate": 0.0009378107004294208, + "loss": 0.84028149, + "num_input_tokens_seen": 79907152, + "router_z_loss_mlp": 0.35302734, + "step": 965, + "time_per_iteration": 2.9898061752319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115255, + "balance_loss_mlp": 1.081972, + "epoch": 0.18584070796460178, + "flos": 530058788352.0, + "grad_norm": 0.08647217477609576, + "language_loss": 0.91352308, + "learning_rate": 0.0009376601411095096, + "loss": 0.92467564, + "num_input_tokens_seen": 79976944, + "router_z_loss_mlp": 0.33300781, + "step": 966, + "time_per_iteration": 2.6415059566497803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093436, + "balance_loss_mlp": 1.06196475, + "epoch": 0.18603308964986534, + "flos": 482863527936.0, + "grad_norm": 0.05783783242438048, + "language_loss": 0.8708145, + "learning_rate": 0.0009375094118748622, + "loss": 0.88174886, + "num_input_tokens_seen": 80042112, + "router_z_loss_mlp": 0.31445312, + "step": 967, + "time_per_iteration": 2.5149550437927246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089094, + "balance_loss_mlp": 1.05650234, + "epoch": 0.1862254713351289, + "flos": 800976591360.0, + "grad_norm": 0.0756042683078202, + "language_loss": 0.9083451, + "learning_rate": 0.0009373585127839976, + "loss": 0.91923606, + "num_input_tokens_seen": 80118896, + "router_z_loss_mlp": 0.32592773, + "step": 968, + "time_per_iteration": 2.9569485187530518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084434, + "balance_loss_mlp": 1.05250978, + "epoch": 0.18641785302039246, + "flos": 478082456064.0, + "grad_norm": 0.06160067145414361, + "language_loss": 0.91074634, + "learning_rate": 0.0009372074438954994, + "loss": 0.92159069, + "num_input_tokens_seen": 80183360, + "router_z_loss_mlp": 0.3190918, + "step": 969, + "time_per_iteration": 2.508530378341675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083542, + "balance_loss_mlp": 1.05040169, + "epoch": 0.18661023470565602, + "flos": 388695587328.0, + "grad_norm": 0.07517959095695621, + "language_loss": 0.91676056, + "learning_rate": 0.0009370562052680181, + "loss": 0.92759597, + "num_input_tokens_seen": 80247024, + "router_z_loss_mlp": 0.33154297, + "step": 970, + "time_per_iteration": 2.4572672843933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087332, + "balance_loss_mlp": 1.05400109, + "epoch": 0.18680261639091958, + "flos": 564402207744.0, + "grad_norm": 0.052448577146131624, + "language_loss": 0.89610398, + "learning_rate": 0.0009369047969602695, + "loss": 0.90697736, + "num_input_tokens_seen": 80318256, + "router_z_loss_mlp": 0.33349609, + "step": 971, + "time_per_iteration": 2.714704751968384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101064, + "balance_loss_mlp": 1.06556404, + "epoch": 0.18699499807618314, + "flos": 479018009088.0, + "grad_norm": 0.06595213007116614, + "language_loss": 0.8674072, + "learning_rate": 0.0009367532190310357, + "loss": 0.87841785, + "num_input_tokens_seen": 80384848, + "router_z_loss_mlp": 0.35498047, + "step": 972, + "time_per_iteration": 2.589667558670044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111914, + "balance_loss_mlp": 1.07660413, + "epoch": 0.1871873797614467, + "flos": 553022802432.0, + "grad_norm": 0.0720295199384638, + "language_loss": 0.88701892, + "learning_rate": 0.0009366014715391644, + "loss": 0.89813805, + "num_input_tokens_seen": 80453088, + "router_z_loss_mlp": 0.35327148, + "step": 973, + "time_per_iteration": 2.634023904800415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107724, + "balance_loss_mlp": 1.07389259, + "epoch": 0.18737976144671029, + "flos": 552526617600.0, + "grad_norm": 0.05153911900793568, + "language_loss": 0.8432554, + "learning_rate": 0.0009364495545435693, + "loss": 0.85433269, + "num_input_tokens_seen": 80528608, + "router_z_loss_mlp": 0.33837891, + "step": 974, + "time_per_iteration": 2.7729458808898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107099, + "balance_loss_mlp": 1.07281494, + "epoch": 0.18757214313197385, + "flos": 502002372096.0, + "grad_norm": 0.05815108638233015, + "language_loss": 0.88620323, + "learning_rate": 0.0009362974681032297, + "loss": 0.8972742, + "num_input_tokens_seen": 80599600, + "router_z_loss_mlp": 0.34326172, + "step": 975, + "time_per_iteration": 2.631744623184204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105498, + "balance_loss_mlp": 1.07130909, + "epoch": 0.1877645248172374, + "flos": 674691337728.0, + "grad_norm": 0.06841603134690444, + "language_loss": 0.88265896, + "learning_rate": 0.0009361452122771907, + "loss": 0.89371395, + "num_input_tokens_seen": 80677264, + "router_z_loss_mlp": 0.34204102, + "step": 976, + "time_per_iteration": 2.8427281379699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094167, + "balance_loss_mlp": 1.06012082, + "epoch": 0.18795690650250096, + "flos": 404771696640.0, + "grad_norm": 0.07319435948671522, + "language_loss": 0.8377496, + "learning_rate": 0.0009359927871245635, + "loss": 0.84869128, + "num_input_tokens_seen": 80739776, + "router_z_loss_mlp": 0.34057617, + "step": 977, + "time_per_iteration": 2.4665186405181885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090091, + "balance_loss_mlp": 1.0565697, + "epoch": 0.18814928818776452, + "flos": 637599485952.0, + "grad_norm": 0.05986452276683665, + "language_loss": 0.86337954, + "learning_rate": 0.0009358401927045246, + "loss": 0.87428045, + "num_input_tokens_seen": 80815200, + "router_z_loss_mlp": 0.33520508, + "step": 978, + "time_per_iteration": 2.8037781715393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090672, + "balance_loss_mlp": 1.05707908, + "epoch": 0.18834166987302808, + "flos": 1137825446400.0, + "grad_norm": 0.054509582003230646, + "language_loss": 0.88314402, + "learning_rate": 0.0009356874290763166, + "loss": 0.89405078, + "num_input_tokens_seen": 80905024, + "router_z_loss_mlp": 0.33618164, + "step": 979, + "time_per_iteration": 3.456723213195801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097981, + "balance_loss_mlp": 1.06481671, + "epoch": 0.18853405155829164, + "flos": 504538398720.0, + "grad_norm": 0.06366920756378494, + "language_loss": 0.8866874, + "learning_rate": 0.0009355344962992474, + "loss": 0.89766723, + "num_input_tokens_seen": 80976704, + "router_z_loss_mlp": 0.33154297, + "step": 980, + "time_per_iteration": 2.6105339527130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109825, + "balance_loss_mlp": 1.06494308, + "epoch": 0.1887264332435552, + "flos": 607879987200.0, + "grad_norm": 0.05130215804193928, + "language_loss": 0.88147485, + "learning_rate": 0.0009353813944326908, + "loss": 0.89245737, + "num_input_tokens_seen": 81057152, + "router_z_loss_mlp": 0.33325195, + "step": 981, + "time_per_iteration": 2.882836103439331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109809, + "balance_loss_mlp": 1.0758822, + "epoch": 0.1889188149288188, + "flos": 552264749568.0, + "grad_norm": 0.07032712681879846, + "language_loss": 0.83146608, + "learning_rate": 0.0009352281235360863, + "loss": 0.84256417, + "num_input_tokens_seen": 81131520, + "router_z_loss_mlp": 0.33959961, + "step": 982, + "time_per_iteration": 2.695748805999756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120418, + "balance_loss_mlp": 1.08775461, + "epoch": 0.18911119661408235, + "flos": 418332128256.0, + "grad_norm": 0.06033753714629359, + "language_loss": 0.84987485, + "learning_rate": 0.0009350746836689389, + "loss": 0.86107904, + "num_input_tokens_seen": 81195952, + "router_z_loss_mlp": 0.32666016, + "step": 983, + "time_per_iteration": 2.5073440074920654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260435, + "balance_loss_mlp": 1.23916793, + "epoch": 0.1893035782993459, + "flos": 1481141320704.0, + "grad_norm": 0.0731593378732656, + "language_loss": 0.81439221, + "learning_rate": 0.0009349210748908193, + "loss": 0.82699656, + "num_input_tokens_seen": 81427312, + "router_z_loss_mlp": 0.21289062, + "step": 984, + "time_per_iteration": 5.065609931945801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133244, + "balance_loss_mlp": 1.09831583, + "epoch": 0.18949595998460947, + "flos": 508220974080.0, + "grad_norm": 0.09166419018528392, + "language_loss": 0.83211792, + "learning_rate": 0.0009347672972613634, + "loss": 0.84345031, + "num_input_tokens_seen": 81494256, + "router_z_loss_mlp": 0.34936523, + "step": 985, + "time_per_iteration": 2.580009937286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115864, + "balance_loss_mlp": 1.08270001, + "epoch": 0.18968834166987303, + "flos": 530812459008.0, + "grad_norm": 0.0668772854373454, + "language_loss": 0.85875785, + "learning_rate": 0.0009346133508402735, + "loss": 0.8699165, + "num_input_tokens_seen": 81569312, + "router_z_loss_mlp": 0.33178711, + "step": 986, + "time_per_iteration": 2.6872711181640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111031, + "balance_loss_mlp": 1.07724667, + "epoch": 0.1898807233551366, + "flos": 499515807744.0, + "grad_norm": 0.11088649382938841, + "language_loss": 0.8420769, + "learning_rate": 0.0009344592356873166, + "loss": 0.8531872, + "num_input_tokens_seen": 81637024, + "router_z_loss_mlp": 0.33813477, + "step": 987, + "time_per_iteration": 2.6347994804382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098007, + "balance_loss_mlp": 1.06462848, + "epoch": 0.19007310504040015, + "flos": 601936399872.0, + "grad_norm": 0.05765681888892058, + "language_loss": 0.78527796, + "learning_rate": 0.0009343049518623255, + "loss": 0.79625803, + "num_input_tokens_seen": 81709488, + "router_z_loss_mlp": 0.33398438, + "step": 988, + "time_per_iteration": 2.696929693222046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082914, + "balance_loss_mlp": 1.05029869, + "epoch": 0.1902654867256637, + "flos": 601374786048.0, + "grad_norm": 0.05732720380572914, + "language_loss": 0.83250153, + "learning_rate": 0.0009341504994251985, + "loss": 0.84333068, + "num_input_tokens_seen": 81787152, + "router_z_loss_mlp": 0.32617188, + "step": 989, + "time_per_iteration": 2.8399016857147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095582, + "balance_loss_mlp": 1.07841623, + "epoch": 0.19045786841092727, + "flos": 1574925147648.0, + "grad_norm": 0.03888561388969961, + "language_loss": 0.73520499, + "learning_rate": 0.0009339958784358994, + "loss": 0.74616081, + "num_input_tokens_seen": 82030608, + "router_z_loss_mlp": 0.171875, + "step": 990, + "time_per_iteration": 5.072636842727661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109747, + "balance_loss_mlp": 1.06394839, + "epoch": 0.19065025009619085, + "flos": 681280906752.0, + "grad_norm": 0.135211113906906, + "language_loss": 0.818295, + "learning_rate": 0.0009338410889544574, + "loss": 0.82926977, + "num_input_tokens_seen": 82119872, + "router_z_loss_mlp": 0.33544922, + "step": 991, + "time_per_iteration": 3.050665855407715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101867, + "balance_loss_mlp": 1.06786811, + "epoch": 0.1908426317814544, + "flos": 601971305472.0, + "grad_norm": 0.06286082016671143, + "language_loss": 0.87738663, + "learning_rate": 0.000933686131040967, + "loss": 0.88840532, + "num_input_tokens_seen": 82195552, + "router_z_loss_mlp": 0.34033203, + "step": 992, + "time_per_iteration": 2.7589659690856934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089273, + "balance_loss_mlp": 1.05672884, + "epoch": 0.19103501346671797, + "flos": 586027616256.0, + "grad_norm": 0.0561482479745879, + "language_loss": 0.90427077, + "learning_rate": 0.0009335310047555883, + "loss": 0.91516346, + "num_input_tokens_seen": 82267040, + "router_z_loss_mlp": 0.32543945, + "step": 993, + "time_per_iteration": 2.7133467197418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108709, + "balance_loss_mlp": 1.0532825, + "epoch": 0.19122739515198153, + "flos": 545494708224.0, + "grad_norm": 0.06221036652136981, + "language_loss": 0.88114065, + "learning_rate": 0.0009333757101585467, + "loss": 0.89201152, + "num_input_tokens_seen": 82337680, + "router_z_loss_mlp": 0.33837891, + "step": 994, + "time_per_iteration": 2.6733241081237793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083527, + "balance_loss_mlp": 1.05105424, + "epoch": 0.1914197768372451, + "flos": 521171739648.0, + "grad_norm": 0.05606370206634765, + "language_loss": 0.93617988, + "learning_rate": 0.0009332202473101329, + "loss": 0.94701517, + "num_input_tokens_seen": 82409600, + "router_z_loss_mlp": 0.32470703, + "step": 995, + "time_per_iteration": 2.6689558029174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079312, + "balance_loss_mlp": 1.04536152, + "epoch": 0.19161215852250865, + "flos": 610961660928.0, + "grad_norm": 0.05986652691328414, + "language_loss": 0.83121806, + "learning_rate": 0.0009330646162707028, + "loss": 0.84201121, + "num_input_tokens_seen": 82480288, + "router_z_loss_mlp": 0.33984375, + "step": 996, + "time_per_iteration": 2.7264511585235596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081823, + "balance_loss_mlp": 1.04849207, + "epoch": 0.1918045402077722, + "flos": 846281806848.0, + "grad_norm": 0.05485586532204223, + "language_loss": 0.84800065, + "learning_rate": 0.0009329088171006779, + "loss": 0.85881883, + "num_input_tokens_seen": 82568960, + "router_z_loss_mlp": 0.33349609, + "step": 997, + "time_per_iteration": 3.1486315727233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097705, + "balance_loss_mlp": 1.06220424, + "epoch": 0.19199692189303577, + "flos": 465699096576.0, + "grad_norm": 0.06540772430376247, + "language_loss": 0.84963006, + "learning_rate": 0.0009327528498605446, + "loss": 0.86060709, + "num_input_tokens_seen": 82634128, + "router_z_loss_mlp": 0.35522461, + "step": 998, + "time_per_iteration": 2.532460927963257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087828, + "balance_loss_mlp": 1.0542109, + "epoch": 0.19218930357829936, + "flos": 531318818304.0, + "grad_norm": 0.06065225266474605, + "language_loss": 0.89716202, + "learning_rate": 0.0009325967146108548, + "loss": 0.90804029, + "num_input_tokens_seen": 82707472, + "router_z_loss_mlp": 0.33642578, + "step": 999, + "time_per_iteration": 2.6381072998046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108792, + "balance_loss_mlp": 1.05334902, + "epoch": 0.19238168526356292, + "flos": 601350054912.0, + "grad_norm": 0.06318510310852068, + "language_loss": 0.87984866, + "learning_rate": 0.0009324404114122258, + "loss": 0.89072788, + "num_input_tokens_seen": 82775232, + "router_z_loss_mlp": 0.34594727, + "step": 1000, + "time_per_iteration": 2.7017252445220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088105, + "balance_loss_mlp": 1.0544883, + "epoch": 0.19257406694882648, + "flos": 571690192896.0, + "grad_norm": 0.05361295189234855, + "language_loss": 0.87132722, + "learning_rate": 0.0009322839403253397, + "loss": 0.88220823, + "num_input_tokens_seen": 82850032, + "router_z_loss_mlp": 0.33642578, + "step": 1001, + "time_per_iteration": 2.7725350856781006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091645, + "balance_loss_mlp": 1.05759907, + "epoch": 0.19276644863409004, + "flos": 801478568448.0, + "grad_norm": 0.0661765462165054, + "language_loss": 0.84038174, + "learning_rate": 0.0009321273014109439, + "loss": 0.85129815, + "num_input_tokens_seen": 82926080, + "router_z_loss_mlp": 0.34082031, + "step": 1002, + "time_per_iteration": 2.9275383949279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089997, + "balance_loss_mlp": 1.05676103, + "epoch": 0.1929588303193536, + "flos": 563024314368.0, + "grad_norm": 0.05133430998282463, + "language_loss": 0.85232604, + "learning_rate": 0.0009319704947298513, + "loss": 0.863226, + "num_input_tokens_seen": 83005200, + "router_z_loss_mlp": 0.33251953, + "step": 1003, + "time_per_iteration": 2.9198272228240967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083204, + "balance_loss_mlp": 1.05120838, + "epoch": 0.19315121200461716, + "flos": 626550349824.0, + "grad_norm": 0.04652496586479965, + "language_loss": 0.88737059, + "learning_rate": 0.0009318135203429393, + "loss": 0.8982026, + "num_input_tokens_seen": 83077280, + "router_z_loss_mlp": 0.31982422, + "step": 1004, + "time_per_iteration": 2.7145965099334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094807, + "balance_loss_mlp": 1.06116605, + "epoch": 0.19334359368988072, + "flos": 517169069568.0, + "grad_norm": 0.06711221272981459, + "language_loss": 0.88228458, + "learning_rate": 0.0009316563783111511, + "loss": 0.8932327, + "num_input_tokens_seen": 83145456, + "router_z_loss_mlp": 0.33642578, + "step": 1005, + "time_per_iteration": 2.68135404586792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095243, + "balance_loss_mlp": 1.06050563, + "epoch": 0.19353597537514428, + "flos": 693751606272.0, + "grad_norm": 0.04947727679523619, + "language_loss": 0.82323831, + "learning_rate": 0.0009314990686954943, + "loss": 0.83419079, + "num_input_tokens_seen": 83225392, + "router_z_loss_mlp": 0.34765625, + "step": 1006, + "time_per_iteration": 2.9068872928619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098932, + "balance_loss_mlp": 1.06495738, + "epoch": 0.19372835706040784, + "flos": 1209665180160.0, + "grad_norm": 0.05336104081377929, + "language_loss": 0.80917025, + "learning_rate": 0.000931341591557042, + "loss": 0.82015955, + "num_input_tokens_seen": 83331296, + "router_z_loss_mlp": 0.34008789, + "step": 1007, + "time_per_iteration": 3.759119749069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098415, + "balance_loss_mlp": 1.06291509, + "epoch": 0.19392073874567142, + "flos": 520368606720.0, + "grad_norm": 0.06549831272650784, + "language_loss": 0.87757689, + "learning_rate": 0.0009311839469569325, + "loss": 0.88856107, + "num_input_tokens_seen": 83399952, + "router_z_loss_mlp": 0.35522461, + "step": 1008, + "time_per_iteration": 2.6298930644989014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100893, + "balance_loss_mlp": 1.06620264, + "epoch": 0.19411312043093498, + "flos": 588543293952.0, + "grad_norm": 0.06763315162421418, + "language_loss": 0.8732397, + "learning_rate": 0.0009310261349563687, + "loss": 0.88424855, + "num_input_tokens_seen": 83468384, + "router_z_loss_mlp": 0.34692383, + "step": 1009, + "time_per_iteration": 2.6843061447143555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110066, + "balance_loss_mlp": 1.06718588, + "epoch": 0.19430550211619854, + "flos": 579085867008.0, + "grad_norm": 0.05371296475785438, + "language_loss": 0.8534441, + "learning_rate": 0.0009308681556166186, + "loss": 0.86445075, + "num_input_tokens_seen": 83547952, + "router_z_loss_mlp": 0.33496094, + "step": 1010, + "time_per_iteration": 2.8197336196899414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107606, + "balance_loss_mlp": 1.07291579, + "epoch": 0.1944978838014621, + "flos": 620848281600.0, + "grad_norm": 0.08312668477716535, + "language_loss": 0.87206143, + "learning_rate": 0.0009307100089990152, + "loss": 0.88313752, + "num_input_tokens_seen": 83615712, + "router_z_loss_mlp": 0.34716797, + "step": 1011, + "time_per_iteration": 2.7118990421295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101924, + "balance_loss_mlp": 1.0672822, + "epoch": 0.19469026548672566, + "flos": 598440089088.0, + "grad_norm": 0.061832865854500894, + "language_loss": 0.83946323, + "learning_rate": 0.0009305516951649568, + "loss": 0.85048252, + "num_input_tokens_seen": 83687296, + "router_z_loss_mlp": 0.34667969, + "step": 1012, + "time_per_iteration": 2.667672872543335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096764, + "balance_loss_mlp": 1.06314659, + "epoch": 0.19488264717198922, + "flos": 551890810368.0, + "grad_norm": 0.04827143175142062, + "language_loss": 0.87187612, + "learning_rate": 0.0009303932141759057, + "loss": 0.88284373, + "num_input_tokens_seen": 83763168, + "router_z_loss_mlp": 0.33642578, + "step": 1013, + "time_per_iteration": 2.7321088314056396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101502, + "balance_loss_mlp": 1.06705046, + "epoch": 0.19507502885725278, + "flos": 665842166784.0, + "grad_norm": 0.05715794205563071, + "language_loss": 0.84201366, + "learning_rate": 0.0009302345660933902, + "loss": 0.85302866, + "num_input_tokens_seen": 83837312, + "router_z_loss_mlp": 0.3449707, + "step": 1014, + "time_per_iteration": 2.7699263095855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109904, + "balance_loss_mlp": 1.07616735, + "epoch": 0.19526741054251634, + "flos": 670771625472.0, + "grad_norm": 0.05949834877265084, + "language_loss": 0.84866655, + "learning_rate": 0.0009300757509790026, + "loss": 0.85976553, + "num_input_tokens_seen": 83917120, + "router_z_loss_mlp": 0.33764648, + "step": 1015, + "time_per_iteration": 2.8250515460968018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110813, + "balance_loss_mlp": 1.0766474, + "epoch": 0.19545979222777993, + "flos": 446983653888.0, + "grad_norm": 0.0671511226198219, + "language_loss": 0.90974069, + "learning_rate": 0.0009299167688944005, + "loss": 0.92084885, + "num_input_tokens_seen": 83982992, + "router_z_loss_mlp": 0.34204102, + "step": 1016, + "time_per_iteration": 2.545133590698242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111157, + "balance_loss_mlp": 1.07778645, + "epoch": 0.1956521739130435, + "flos": 568813722624.0, + "grad_norm": 0.06338586690579641, + "language_loss": 0.85958129, + "learning_rate": 0.0009297576199013063, + "loss": 0.87069696, + "num_input_tokens_seen": 84057296, + "router_z_loss_mlp": 0.33813477, + "step": 1017, + "time_per_iteration": 2.668503761291504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148218, + "balance_loss_mlp": 1.13295972, + "epoch": 0.19584455559830705, + "flos": 1454969157120.0, + "grad_norm": 0.047651466398381144, + "language_loss": 0.73002136, + "learning_rate": 0.0009295983040615071, + "loss": 0.74150348, + "num_input_tokens_seen": 84292640, + "router_z_loss_mlp": 0.15234375, + "step": 1018, + "time_per_iteration": 4.920944929122925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104842, + "balance_loss_mlp": 1.09015501, + "epoch": 0.1960369372835706, + "flos": 1590320369664.0, + "grad_norm": 0.036993279908541045, + "language_loss": 0.79426301, + "learning_rate": 0.0009294388214368547, + "loss": 0.80531144, + "num_input_tokens_seen": 84524448, + "router_z_loss_mlp": 0.14648438, + "step": 1019, + "time_per_iteration": 6.0059425830841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118502, + "balance_loss_mlp": 1.08505166, + "epoch": 0.19622931896883417, + "flos": 615709237248.0, + "grad_norm": 0.05240041234704895, + "language_loss": 0.86600977, + "learning_rate": 0.0009292791720892659, + "loss": 0.87719476, + "num_input_tokens_seen": 84600208, + "router_z_loss_mlp": 0.3347168, + "step": 1020, + "time_per_iteration": 2.995192527770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113873, + "balance_loss_mlp": 1.07930255, + "epoch": 0.19642170065409773, + "flos": 465950790144.0, + "grad_norm": 0.0657036282835547, + "language_loss": 0.88724279, + "learning_rate": 0.0009291193560807218, + "loss": 0.89838147, + "num_input_tokens_seen": 84668032, + "router_z_loss_mlp": 0.34594727, + "step": 1021, + "time_per_iteration": 2.633256196975708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114293, + "balance_loss_mlp": 1.07962656, + "epoch": 0.19661408233936128, + "flos": 515040477696.0, + "grad_norm": 0.054836200403870924, + "language_loss": 0.87439638, + "learning_rate": 0.0009289593734732688, + "loss": 0.88553929, + "num_input_tokens_seen": 84738176, + "router_z_loss_mlp": 0.34716797, + "step": 1022, + "time_per_iteration": 2.622284173965454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107262, + "balance_loss_mlp": 1.0736922, + "epoch": 0.19680646402462484, + "flos": 392427624960.0, + "grad_norm": 0.053036961045345866, + "language_loss": 0.94139373, + "learning_rate": 0.0009287992243290175, + "loss": 0.95246631, + "num_input_tokens_seen": 84799936, + "router_z_loss_mlp": 0.3359375, + "step": 1023, + "time_per_iteration": 2.4402668476104736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108975, + "balance_loss_mlp": 1.07247353, + "epoch": 0.19699884570988843, + "flos": 626122566144.0, + "grad_norm": 0.056904835680118435, + "language_loss": 0.90850759, + "learning_rate": 0.0009286389087101435, + "loss": 0.91959733, + "num_input_tokens_seen": 84877216, + "router_z_loss_mlp": 0.36523438, + "step": 1024, + "time_per_iteration": 2.762068271636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110519, + "balance_loss_mlp": 1.06957078, + "epoch": 0.197191227395152, + "flos": 557710742016.0, + "grad_norm": 0.05298833269370499, + "language_loss": 0.88575542, + "learning_rate": 0.0009284784266788864, + "loss": 0.89680731, + "num_input_tokens_seen": 84952464, + "router_z_loss_mlp": 0.35668945, + "step": 1025, + "time_per_iteration": 4.087035417556763 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109606, + "balance_loss_mlp": 1.07565546, + "epoch": 0.19738360908041555, + "flos": 664681061376.0, + "grad_norm": 0.0565537913278748, + "language_loss": 0.92494339, + "learning_rate": 0.0009283177782975512, + "loss": 0.93603945, + "num_input_tokens_seen": 85031488, + "router_z_loss_mlp": 0.33984375, + "step": 1026, + "time_per_iteration": 2.948167562484741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095626, + "balance_loss_mlp": 1.06117415, + "epoch": 0.1975759907656791, + "flos": 522244094976.0, + "grad_norm": 0.06218898027866582, + "language_loss": 0.88052273, + "learning_rate": 0.000928156963628507, + "loss": 0.89147896, + "num_input_tokens_seen": 85098384, + "router_z_loss_mlp": 0.3449707, + "step": 1027, + "time_per_iteration": 2.564019203186035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091019, + "balance_loss_mlp": 1.05694866, + "epoch": 0.19776837245094267, + "flos": 462233309184.0, + "grad_norm": 0.056114928823487176, + "language_loss": 0.8826099, + "learning_rate": 0.0009279959827341877, + "loss": 0.89352006, + "num_input_tokens_seen": 85172944, + "router_z_loss_mlp": 0.34082031, + "step": 1028, + "time_per_iteration": 2.7226340770721436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090946, + "balance_loss_mlp": 1.05699515, + "epoch": 0.19796075413620623, + "flos": 502809887232.0, + "grad_norm": 0.05507551359640612, + "language_loss": 0.88204837, + "learning_rate": 0.0009278348356770915, + "loss": 0.89295781, + "num_input_tokens_seen": 85241632, + "router_z_loss_mlp": 0.33984375, + "step": 1029, + "time_per_iteration": 2.592756748199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085455, + "balance_loss_mlp": 1.05093157, + "epoch": 0.1981531358214698, + "flos": 507281038848.0, + "grad_norm": 0.061172366255401664, + "language_loss": 0.85939109, + "learning_rate": 0.0009276735225197814, + "loss": 0.87024558, + "num_input_tokens_seen": 85308992, + "router_z_loss_mlp": 0.34570312, + "step": 1030, + "time_per_iteration": 2.598607063293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088832, + "balance_loss_mlp": 1.05495238, + "epoch": 0.19834551750673335, + "flos": 531275148288.0, + "grad_norm": 0.0802549423316463, + "language_loss": 0.86293721, + "learning_rate": 0.0009275120433248847, + "loss": 0.87382561, + "num_input_tokens_seen": 85381936, + "router_z_loss_mlp": 0.33886719, + "step": 1031, + "time_per_iteration": 2.7143311500549316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090216, + "balance_loss_mlp": 1.05726683, + "epoch": 0.1985378991919969, + "flos": 775147691520.0, + "grad_norm": 0.05308511447166053, + "language_loss": 0.86272347, + "learning_rate": 0.0009273503981550931, + "loss": 0.87362564, + "num_input_tokens_seen": 85474352, + "router_z_loss_mlp": 0.32958984, + "step": 1032, + "time_per_iteration": 3.0616648197174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087082, + "balance_loss_mlp": 1.05351269, + "epoch": 0.1987302808772605, + "flos": 434063411712.0, + "grad_norm": 0.059916166081832097, + "language_loss": 0.8703599, + "learning_rate": 0.0009271885870731626, + "loss": 0.88123071, + "num_input_tokens_seen": 85538416, + "router_z_loss_mlp": 0.3359375, + "step": 1033, + "time_per_iteration": 2.487316131591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092715, + "balance_loss_mlp": 1.05921745, + "epoch": 0.19892266256252406, + "flos": 553342897152.0, + "grad_norm": 0.06168947094446192, + "language_loss": 0.88599998, + "learning_rate": 0.0009270266101419143, + "loss": 0.89692712, + "num_input_tokens_seen": 85604416, + "router_z_loss_mlp": 0.33520508, + "step": 1034, + "time_per_iteration": 2.5978119373321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085912, + "balance_loss_mlp": 1.05403578, + "epoch": 0.19911504424778761, + "flos": 549596302848.0, + "grad_norm": 0.06019117447906982, + "language_loss": 0.85564321, + "learning_rate": 0.0009268644674242328, + "loss": 0.86650234, + "num_input_tokens_seen": 85677008, + "router_z_loss_mlp": 0.31860352, + "step": 1035, + "time_per_iteration": 2.7259163856506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097892, + "balance_loss_mlp": 1.0645138, + "epoch": 0.19930742593305117, + "flos": 518024636928.0, + "grad_norm": 0.05869793462101787, + "language_loss": 0.81141233, + "learning_rate": 0.0009267021589830678, + "loss": 0.82239127, + "num_input_tokens_seen": 85745200, + "router_z_loss_mlp": 0.33398438, + "step": 1036, + "time_per_iteration": 2.597724199295044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161292, + "balance_loss_mlp": 1.14507985, + "epoch": 0.19949980761831473, + "flos": 1508516849664.0, + "grad_norm": 0.04621309141147155, + "language_loss": 0.77627081, + "learning_rate": 0.0009265396848814328, + "loss": 0.78788376, + "num_input_tokens_seen": 85980608, + "router_z_loss_mlp": 0.16210938, + "step": 1037, + "time_per_iteration": 4.918612241744995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093993, + "balance_loss_mlp": 1.06044722, + "epoch": 0.1996921893035783, + "flos": 697803738624.0, + "grad_norm": 0.061892224045152405, + "language_loss": 0.93283784, + "learning_rate": 0.000926377045182406, + "loss": 0.94377768, + "num_input_tokens_seen": 86055952, + "router_z_loss_mlp": 0.33569336, + "step": 1038, + "time_per_iteration": 2.8800160884857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096412, + "balance_loss_mlp": 1.06334293, + "epoch": 0.19988457098884185, + "flos": 726682226688.0, + "grad_norm": 0.0613562398808313, + "language_loss": 0.87972045, + "learning_rate": 0.0009262142399491296, + "loss": 0.89068449, + "num_input_tokens_seen": 86145536, + "router_z_loss_mlp": 0.33081055, + "step": 1039, + "time_per_iteration": 3.0561435222625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097284, + "balance_loss_mlp": 1.06345224, + "epoch": 0.2000769526741054, + "flos": 560275881984.0, + "grad_norm": 0.06364175085873486, + "language_loss": 0.87837642, + "learning_rate": 0.0009260512692448105, + "loss": 0.88934934, + "num_input_tokens_seen": 86214480, + "router_z_loss_mlp": 0.33862305, + "step": 1040, + "time_per_iteration": 2.7037088871002197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090351, + "balance_loss_mlp": 1.05697203, + "epoch": 0.200269334359369, + "flos": 571758594048.0, + "grad_norm": 0.05851279903795688, + "language_loss": 0.84325236, + "learning_rate": 0.000925888133132719, + "loss": 0.85415584, + "num_input_tokens_seen": 86289824, + "router_z_loss_mlp": 0.33398438, + "step": 1041, + "time_per_iteration": 2.836177110671997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082089, + "balance_loss_mlp": 1.06730711, + "epoch": 0.20046171604463256, + "flos": 1485362340864.0, + "grad_norm": 0.029405325300647274, + "language_loss": 0.79610431, + "learning_rate": 0.0009257248316761906, + "loss": 0.80692518, + "num_input_tokens_seen": 86516384, + "router_z_loss_mlp": 0.14746094, + "step": 1042, + "time_per_iteration": 4.901337146759033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094947, + "balance_loss_mlp": 1.06140149, + "epoch": 0.20065409772989612, + "flos": 496266808320.0, + "grad_norm": 0.07205728359427886, + "language_loss": 0.81256473, + "learning_rate": 0.0009255613649386244, + "loss": 0.82351422, + "num_input_tokens_seen": 86587296, + "router_z_loss_mlp": 0.33544922, + "step": 1043, + "time_per_iteration": 2.6198885440826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089381, + "balance_loss_mlp": 1.05686069, + "epoch": 0.20084647941515968, + "flos": 579094631424.0, + "grad_norm": 0.06625931968059934, + "language_loss": 0.79017001, + "learning_rate": 0.0009253977329834838, + "loss": 0.80106384, + "num_input_tokens_seen": 86662656, + "router_z_loss_mlp": 0.32519531, + "step": 1044, + "time_per_iteration": 2.6872498989105225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111234, + "balance_loss_mlp": 1.07745981, + "epoch": 0.20103886110042324, + "flos": 641775273984.0, + "grad_norm": 0.06628294367657735, + "language_loss": 0.86666185, + "learning_rate": 0.0009252339358742965, + "loss": 0.87778521, + "num_input_tokens_seen": 86734704, + "router_z_loss_mlp": 0.34912109, + "step": 1045, + "time_per_iteration": 2.7749996185302734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116952, + "balance_loss_mlp": 1.08219087, + "epoch": 0.2012312427856868, + "flos": 441720953856.0, + "grad_norm": 0.05401214919341486, + "language_loss": 0.83449644, + "learning_rate": 0.000925069973674654, + "loss": 0.84566593, + "num_input_tokens_seen": 86806512, + "router_z_loss_mlp": 0.34814453, + "step": 1046, + "time_per_iteration": 2.662992477416992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114258, + "balance_loss_mlp": 1.08116508, + "epoch": 0.20142362447095036, + "flos": 554135855616.0, + "grad_norm": 0.049297877184233195, + "language_loss": 0.88960069, + "learning_rate": 0.000924905846448212, + "loss": 0.90074325, + "num_input_tokens_seen": 86883440, + "router_z_loss_mlp": 0.33105469, + "step": 1047, + "time_per_iteration": 2.8164925575256348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127137, + "balance_loss_mlp": 1.09306693, + "epoch": 0.20161600615621392, + "flos": 669988841472.0, + "grad_norm": 0.07365230282100185, + "language_loss": 0.85615861, + "learning_rate": 0.0009247415542586906, + "loss": 0.86742997, + "num_input_tokens_seen": 86960208, + "router_z_loss_mlp": 0.34106445, + "step": 1048, + "time_per_iteration": 2.858611822128296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115399, + "balance_loss_mlp": 1.08130527, + "epoch": 0.2018083878414775, + "flos": 572788689408.0, + "grad_norm": 0.05223287600750505, + "language_loss": 0.83514655, + "learning_rate": 0.0009245770971698735, + "loss": 0.84630048, + "num_input_tokens_seen": 87044144, + "router_z_loss_mlp": 0.34106445, + "step": 1049, + "time_per_iteration": 2.8758163452148438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103626, + "balance_loss_mlp": 1.07036686, + "epoch": 0.20200076952674106, + "flos": 425624495616.0, + "grad_norm": 0.061140118103518055, + "language_loss": 0.88792473, + "learning_rate": 0.0009244124752456087, + "loss": 0.89896095, + "num_input_tokens_seen": 87109136, + "router_z_loss_mlp": 0.33276367, + "step": 1050, + "time_per_iteration": 2.501565456390381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097765, + "balance_loss_mlp": 1.06457758, + "epoch": 0.20219315121200462, + "flos": 536326852608.0, + "grad_norm": 0.049507299183714965, + "language_loss": 0.85344577, + "learning_rate": 0.0009242476885498081, + "loss": 0.86442339, + "num_input_tokens_seen": 87184320, + "router_z_loss_mlp": 0.33203125, + "step": 1051, + "time_per_iteration": 2.698791027069092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095962, + "balance_loss_mlp": 1.06222594, + "epoch": 0.20238553289726818, + "flos": 477634323456.0, + "grad_norm": 0.07140169421024865, + "language_loss": 0.8134433, + "learning_rate": 0.0009240827371464474, + "loss": 0.82440293, + "num_input_tokens_seen": 87248224, + "router_z_loss_mlp": 0.33764648, + "step": 1052, + "time_per_iteration": 2.5603079795837402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082157, + "balance_loss_mlp": 1.04958868, + "epoch": 0.20257791458253174, + "flos": 1151611430400.0, + "grad_norm": 0.06069279327125781, + "language_loss": 0.84372044, + "learning_rate": 0.0009239176210995666, + "loss": 0.85454196, + "num_input_tokens_seen": 87333088, + "router_z_loss_mlp": 0.32568359, + "step": 1053, + "time_per_iteration": 3.4549684524536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088056, + "balance_loss_mlp": 1.05393791, + "epoch": 0.2027702962677953, + "flos": 666606011904.0, + "grad_norm": 0.06066867592012189, + "language_loss": 0.93657684, + "learning_rate": 0.0009237523404732695, + "loss": 0.94745743, + "num_input_tokens_seen": 87413840, + "router_z_loss_mlp": 0.34130859, + "step": 1054, + "time_per_iteration": 4.344247817993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078384, + "balance_loss_mlp": 1.04567289, + "epoch": 0.20296267795305886, + "flos": 641011428864.0, + "grad_norm": 0.0678922331878557, + "language_loss": 0.84289086, + "learning_rate": 0.0009235868953317235, + "loss": 0.85367465, + "num_input_tokens_seen": 87487168, + "router_z_loss_mlp": 0.32714844, + "step": 1055, + "time_per_iteration": 2.7755184173583984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086857, + "balance_loss_mlp": 1.05321646, + "epoch": 0.20315505963832242, + "flos": 930187777536.0, + "grad_norm": 0.06816541670806936, + "language_loss": 0.85603452, + "learning_rate": 0.0009234212857391602, + "loss": 0.86690307, + "num_input_tokens_seen": 87573184, + "router_z_loss_mlp": 0.33642578, + "step": 1056, + "time_per_iteration": 3.1736087799072266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089034, + "balance_loss_mlp": 1.05477369, + "epoch": 0.20334744132358598, + "flos": 561818128896.0, + "grad_norm": 0.05209348313890264, + "language_loss": 0.88978589, + "learning_rate": 0.000923255511759875, + "loss": 0.90067613, + "num_input_tokens_seen": 87651968, + "router_z_loss_mlp": 0.34301758, + "step": 1057, + "time_per_iteration": 2.7823617458343506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093097, + "balance_loss_mlp": 1.05945587, + "epoch": 0.20353982300884957, + "flos": 643902455808.0, + "grad_norm": 0.061337083912670884, + "language_loss": 0.85219932, + "learning_rate": 0.000923089573458227, + "loss": 0.86313027, + "num_input_tokens_seen": 87727792, + "router_z_loss_mlp": 0.33666992, + "step": 1058, + "time_per_iteration": 2.8398988246917725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092738, + "balance_loss_mlp": 1.05952644, + "epoch": 0.20373220469411313, + "flos": 651101690880.0, + "grad_norm": 0.0713114334987562, + "language_loss": 0.84425724, + "learning_rate": 0.0009229234708986392, + "loss": 0.85518456, + "num_input_tokens_seen": 87806048, + "router_z_loss_mlp": 0.33203125, + "step": 1059, + "time_per_iteration": 2.891934394836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069957, + "balance_loss_mlp": 1.05603302, + "epoch": 0.2039245863793767, + "flos": 1436939136000.0, + "grad_norm": 0.037855568460977755, + "language_loss": 0.81666899, + "learning_rate": 0.0009227572041455982, + "loss": 0.8273685, + "num_input_tokens_seen": 88018160, + "router_z_loss_mlp": 0.13964844, + "step": 1060, + "time_per_iteration": 4.673142194747925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092929, + "balance_loss_mlp": 1.05985999, + "epoch": 0.20411696806464025, + "flos": 596678082048.0, + "grad_norm": 0.07190006801568614, + "language_loss": 0.85404283, + "learning_rate": 0.0009225907732636548, + "loss": 0.86497211, + "num_input_tokens_seen": 88090864, + "router_z_loss_mlp": 0.33081055, + "step": 1061, + "time_per_iteration": 2.74110746383667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091714, + "balance_loss_mlp": 1.0585742, + "epoch": 0.2043093497499038, + "flos": 573530775552.0, + "grad_norm": 0.06271161302412134, + "language_loss": 0.86991799, + "learning_rate": 0.0009224241783174227, + "loss": 0.88083506, + "num_input_tokens_seen": 88161360, + "router_z_loss_mlp": 0.33154297, + "step": 1062, + "time_per_iteration": 2.6885697841644287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084619, + "balance_loss_mlp": 1.05233693, + "epoch": 0.20450173143516737, + "flos": 630061217280.0, + "grad_norm": 0.055816021094363524, + "language_loss": 0.85842204, + "learning_rate": 0.0009222574193715802, + "loss": 0.86926818, + "num_input_tokens_seen": 88234960, + "router_z_loss_mlp": 0.32275391, + "step": 1063, + "time_per_iteration": 2.7569899559020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087796, + "balance_loss_mlp": 1.05522823, + "epoch": 0.20469411312043093, + "flos": 573718450176.0, + "grad_norm": 0.051897822989382614, + "language_loss": 0.8621105, + "learning_rate": 0.000922090496490869, + "loss": 0.87298846, + "num_input_tokens_seen": 88308176, + "router_z_loss_mlp": 0.32568359, + "step": 1064, + "time_per_iteration": 2.7099597454071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082485, + "balance_loss_mlp": 1.05025065, + "epoch": 0.20488649480569449, + "flos": 636748300800.0, + "grad_norm": 0.04962250787968228, + "language_loss": 0.90165728, + "learning_rate": 0.0009219234097400937, + "loss": 0.91248214, + "num_input_tokens_seen": 88386768, + "router_z_loss_mlp": 0.32226562, + "step": 1065, + "time_per_iteration": 2.8492319583892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108042, + "balance_loss_mlp": 1.04806709, + "epoch": 0.20507887649095807, + "flos": 975383894016.0, + "grad_norm": 0.051536979552593745, + "language_loss": 0.83029723, + "learning_rate": 0.0009217561591841237, + "loss": 0.84110147, + "num_input_tokens_seen": 88476576, + "router_z_loss_mlp": 0.32348633, + "step": 1066, + "time_per_iteration": 3.267207145690918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082085, + "balance_loss_mlp": 1.04951739, + "epoch": 0.20527125817622163, + "flos": 485940819456.0, + "grad_norm": 0.09661652793466288, + "language_loss": 0.81334901, + "learning_rate": 0.0009215887448878913, + "loss": 0.82416987, + "num_input_tokens_seen": 88541968, + "router_z_loss_mlp": 0.32568359, + "step": 1067, + "time_per_iteration": 2.5429391860961914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088998, + "balance_loss_mlp": 1.05552411, + "epoch": 0.2054636398614852, + "flos": 526921860096.0, + "grad_norm": 0.09953641970782799, + "language_loss": 0.85144234, + "learning_rate": 0.0009214211669163922, + "loss": 0.86233234, + "num_input_tokens_seen": 88615296, + "router_z_loss_mlp": 0.33496094, + "step": 1068, + "time_per_iteration": 2.7006540298461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085875, + "balance_loss_mlp": 1.05428481, + "epoch": 0.20565602154674875, + "flos": 557898416640.0, + "grad_norm": 0.048729379907622286, + "language_loss": 0.93896133, + "learning_rate": 0.0009212534253346862, + "loss": 0.94982004, + "num_input_tokens_seen": 88691584, + "router_z_loss_mlp": 0.31567383, + "step": 1069, + "time_per_iteration": 2.7544496059417725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098993, + "balance_loss_mlp": 1.06713986, + "epoch": 0.2058484032320123, + "flos": 503976784896.0, + "grad_norm": 0.06649355865978995, + "language_loss": 0.8497259, + "learning_rate": 0.0009210855202078964, + "loss": 0.86071587, + "num_input_tokens_seen": 88756592, + "router_z_loss_mlp": 0.31835938, + "step": 1070, + "time_per_iteration": 2.59660005569458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113348, + "balance_loss_mlp": 1.07975471, + "epoch": 0.20604078491727587, + "flos": 432950358528.0, + "grad_norm": 0.06315152856471482, + "language_loss": 0.87476587, + "learning_rate": 0.0009209174516012091, + "loss": 0.88589936, + "num_input_tokens_seen": 88820928, + "router_z_loss_mlp": 0.3359375, + "step": 1071, + "time_per_iteration": 2.498087167739868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110052, + "balance_loss_mlp": 1.07624412, + "epoch": 0.20623316660253943, + "flos": 608421252096.0, + "grad_norm": 0.06211591839104366, + "language_loss": 0.89244497, + "learning_rate": 0.0009207492195798747, + "loss": 0.9035455, + "num_input_tokens_seen": 88895440, + "router_z_loss_mlp": 0.33837891, + "step": 1072, + "time_per_iteration": 2.760019063949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116237, + "balance_loss_mlp": 1.08142757, + "epoch": 0.206425548287803, + "flos": 480184906752.0, + "grad_norm": 0.07379229384440758, + "language_loss": 0.84887302, + "learning_rate": 0.0009205808242092061, + "loss": 0.86003542, + "num_input_tokens_seen": 88964400, + "router_z_loss_mlp": 0.34838867, + "step": 1073, + "time_per_iteration": 2.6034529209136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118683, + "balance_loss_mlp": 1.08423102, + "epoch": 0.20661792997306658, + "flos": 949007937024.0, + "grad_norm": 0.0763588165275792, + "language_loss": 0.82845032, + "learning_rate": 0.0009204122655545808, + "loss": 0.83963716, + "num_input_tokens_seen": 89049600, + "router_z_loss_mlp": 0.34472656, + "step": 1074, + "time_per_iteration": 3.3222029209136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111187, + "balance_loss_mlp": 1.07759392, + "epoch": 0.20681031165833014, + "flos": 603206604288.0, + "grad_norm": 0.05592396046249817, + "language_loss": 0.80705297, + "learning_rate": 0.0009202435436814388, + "loss": 0.81816483, + "num_input_tokens_seen": 89119024, + "router_z_loss_mlp": 0.33618164, + "step": 1075, + "time_per_iteration": 2.721888780593872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114903, + "balance_loss_mlp": 1.08121455, + "epoch": 0.2070026933435937, + "flos": 708665200128.0, + "grad_norm": 0.07630450069092473, + "language_loss": 0.89700603, + "learning_rate": 0.0009200746586552836, + "loss": 0.90815508, + "num_input_tokens_seen": 89197344, + "router_z_loss_mlp": 0.3371582, + "step": 1076, + "time_per_iteration": 2.8797900676727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107662, + "balance_loss_mlp": 1.07416463, + "epoch": 0.20719507502885726, + "flos": 829456409088.0, + "grad_norm": 0.06176881640488279, + "language_loss": 0.84210765, + "learning_rate": 0.0009199056105416825, + "loss": 0.85318428, + "num_input_tokens_seen": 89280464, + "router_z_loss_mlp": 0.33520508, + "step": 1077, + "time_per_iteration": 3.120950698852539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107166, + "balance_loss_mlp": 1.07312012, + "epoch": 0.20738745671412082, + "flos": 637993774080.0, + "grad_norm": 0.055893649084458805, + "language_loss": 0.86594802, + "learning_rate": 0.0009197363994062654, + "loss": 0.87701964, + "num_input_tokens_seen": 89353344, + "router_z_loss_mlp": 0.34057617, + "step": 1078, + "time_per_iteration": 2.8197755813598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086727, + "balance_loss_mlp": 1.05480289, + "epoch": 0.20757983839938438, + "flos": 685258845696.0, + "grad_norm": 0.054433441748304986, + "language_loss": 0.84861732, + "learning_rate": 0.0009195670253147262, + "loss": 0.85948461, + "num_input_tokens_seen": 89439328, + "router_z_loss_mlp": 0.3190918, + "step": 1079, + "time_per_iteration": 2.966987133026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096354, + "balance_loss_mlp": 1.06340468, + "epoch": 0.20777222008464794, + "flos": 519024208896.0, + "grad_norm": 0.07868801214896702, + "language_loss": 0.82301188, + "learning_rate": 0.0009193974883328216, + "loss": 0.83397532, + "num_input_tokens_seen": 89510160, + "router_z_loss_mlp": 0.32958984, + "step": 1080, + "time_per_iteration": 2.620704174041748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091209, + "balance_loss_mlp": 1.05725837, + "epoch": 0.2079646017699115, + "flos": 511136732160.0, + "grad_norm": 0.09961486538628272, + "language_loss": 0.87482947, + "learning_rate": 0.0009192277885263718, + "loss": 0.88574153, + "num_input_tokens_seen": 89582960, + "router_z_loss_mlp": 0.33984375, + "step": 1081, + "time_per_iteration": 2.6247479915618896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087114, + "balance_loss_mlp": 1.05352044, + "epoch": 0.20815698345517505, + "flos": 931409929728.0, + "grad_norm": 0.05448561879445608, + "language_loss": 0.86255312, + "learning_rate": 0.0009190579259612602, + "loss": 0.87342417, + "num_input_tokens_seen": 89675488, + "router_z_loss_mlp": 0.33618164, + "step": 1082, + "time_per_iteration": 3.2661428451538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085916, + "balance_loss_mlp": 1.05187023, + "epoch": 0.20834936514043864, + "flos": 632114205696.0, + "grad_norm": 0.059638645169798186, + "language_loss": 0.86669636, + "learning_rate": 0.000918887900703433, + "loss": 0.87755549, + "num_input_tokens_seen": 89747872, + "router_z_loss_mlp": 0.34082031, + "step": 1083, + "time_per_iteration": 2.7930080890655518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083976, + "balance_loss_mlp": 1.05038285, + "epoch": 0.2085417468257022, + "flos": 394170693120.0, + "grad_norm": 0.06326041775418027, + "language_loss": 0.90427065, + "learning_rate": 0.0009187177128188999, + "loss": 0.91511047, + "num_input_tokens_seen": 89810176, + "router_z_loss_mlp": 0.33618164, + "step": 1084, + "time_per_iteration": 2.431358814239502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054848, + "balance_loss_mlp": 1.04159176, + "epoch": 0.20873412851096576, + "flos": 1401387969024.0, + "grad_norm": 0.04127554175786628, + "language_loss": 0.77156538, + "learning_rate": 0.0009185473623737339, + "loss": 0.78211385, + "num_input_tokens_seen": 90038432, + "router_z_loss_mlp": 0.1328125, + "step": 1085, + "time_per_iteration": 6.352816343307495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080772, + "balance_loss_mlp": 1.04686832, + "epoch": 0.20892651019622932, + "flos": 447599112192.0, + "grad_norm": 0.06234370040412467, + "language_loss": 0.8612783, + "learning_rate": 0.000918376849434071, + "loss": 0.87208605, + "num_input_tokens_seen": 90101568, + "router_z_loss_mlp": 0.33935547, + "step": 1086, + "time_per_iteration": 2.5168843269348145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084999, + "balance_loss_mlp": 1.05040443, + "epoch": 0.20911889188149288, + "flos": 492863629824.0, + "grad_norm": 0.07820142019527274, + "language_loss": 0.90828383, + "learning_rate": 0.0009182061740661098, + "loss": 0.91913384, + "num_input_tokens_seen": 90169344, + "router_z_loss_mlp": 0.34643555, + "step": 1087, + "time_per_iteration": 2.5461525917053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083648, + "balance_loss_mlp": 1.0494113, + "epoch": 0.20931127356675644, + "flos": 840928946688.0, + "grad_norm": 0.05821627614551514, + "language_loss": 0.85034752, + "learning_rate": 0.0009180353363361127, + "loss": 0.86118406, + "num_input_tokens_seen": 90252416, + "router_z_loss_mlp": 0.3425293, + "step": 1088, + "time_per_iteration": 3.11942982673645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084605, + "balance_loss_mlp": 1.05036855, + "epoch": 0.20950365525202, + "flos": 756796013568.0, + "grad_norm": 0.06471498550944753, + "language_loss": 0.82101512, + "learning_rate": 0.0009178643363104044, + "loss": 0.83186114, + "num_input_tokens_seen": 90337952, + "router_z_loss_mlp": 0.34277344, + "step": 1089, + "time_per_iteration": 3.0986390113830566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107905, + "balance_loss_mlp": 1.04543328, + "epoch": 0.20969603693728356, + "flos": 472301812224.0, + "grad_norm": 0.07091461504319575, + "language_loss": 0.91050649, + "learning_rate": 0.0009176931740553735, + "loss": 0.92129695, + "num_input_tokens_seen": 90401488, + "router_z_loss_mlp": 0.33642578, + "step": 1090, + "time_per_iteration": 2.4965460300445557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108219, + "balance_loss_mlp": 1.04845381, + "epoch": 0.20988841862254715, + "flos": 976507121664.0, + "grad_norm": 0.05967441428812083, + "language_loss": 0.82829833, + "learning_rate": 0.0009175218496374708, + "loss": 0.83912027, + "num_input_tokens_seen": 90486144, + "router_z_loss_mlp": 0.33740234, + "step": 1091, + "time_per_iteration": 3.325467348098755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082454, + "balance_loss_mlp": 1.04917121, + "epoch": 0.2100808003078107, + "flos": 1092697731072.0, + "grad_norm": 0.06552872916111846, + "language_loss": 0.85816884, + "learning_rate": 0.0009173503631232103, + "loss": 0.86899334, + "num_input_tokens_seen": 90571504, + "router_z_loss_mlp": 0.33300781, + "step": 1092, + "time_per_iteration": 3.3492543697357178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080972, + "balance_loss_mlp": 1.04761767, + "epoch": 0.21027318199307427, + "flos": 1012567468032.0, + "grad_norm": 0.06864870254184631, + "language_loss": 0.8205356, + "learning_rate": 0.0009171787145791691, + "loss": 0.83134532, + "num_input_tokens_seen": 90646016, + "router_z_loss_mlp": 0.33374023, + "step": 1093, + "time_per_iteration": 3.229302167892456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083111, + "balance_loss_mlp": 1.04975629, + "epoch": 0.21046556367833782, + "flos": 521141216256.0, + "grad_norm": 0.08362122797221107, + "language_loss": 0.80208671, + "learning_rate": 0.000917006904071987, + "loss": 0.81291783, + "num_input_tokens_seen": 90713440, + "router_z_loss_mlp": 0.33374023, + "step": 1094, + "time_per_iteration": 2.5901217460632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091484, + "balance_loss_mlp": 1.05843902, + "epoch": 0.21065794536360138, + "flos": 603437948928.0, + "grad_norm": 0.05523679641811596, + "language_loss": 0.87209588, + "learning_rate": 0.0009168349316683669, + "loss": 0.88301063, + "num_input_tokens_seen": 90788208, + "router_z_loss_mlp": 0.33056641, + "step": 1095, + "time_per_iteration": 2.67250919342041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093141, + "balance_loss_mlp": 1.06081104, + "epoch": 0.21085032704886494, + "flos": 603045070848.0, + "grad_norm": 0.05347318487829757, + "language_loss": 0.82685143, + "learning_rate": 0.0009166627974350741, + "loss": 0.83778286, + "num_input_tokens_seen": 90873776, + "router_z_loss_mlp": 0.32324219, + "step": 1096, + "time_per_iteration": 2.9291882514953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097603, + "balance_loss_mlp": 1.06362867, + "epoch": 0.2110427087341285, + "flos": 637382697984.0, + "grad_norm": 0.059512513015867, + "language_loss": 0.89716321, + "learning_rate": 0.0009164905014389373, + "loss": 0.90813923, + "num_input_tokens_seen": 90945872, + "router_z_loss_mlp": 0.34008789, + "step": 1097, + "time_per_iteration": 2.7384700775146484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100621, + "balance_loss_mlp": 1.06798196, + "epoch": 0.21123509041939206, + "flos": 522667496448.0, + "grad_norm": 0.08051519151754843, + "language_loss": 0.87020361, + "learning_rate": 0.0009163180437468476, + "loss": 0.88120985, + "num_input_tokens_seen": 91016224, + "router_z_loss_mlp": 0.32641602, + "step": 1098, + "time_per_iteration": 2.584890365600586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109632, + "balance_loss_mlp": 1.0635848, + "epoch": 0.21142747210465565, + "flos": 450938271744.0, + "grad_norm": 0.05811835985780437, + "language_loss": 0.86184567, + "learning_rate": 0.000916145424425759, + "loss": 0.87280893, + "num_input_tokens_seen": 91086752, + "router_z_loss_mlp": 0.32739258, + "step": 1099, + "time_per_iteration": 2.6362791061401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104023, + "balance_loss_mlp": 1.07059634, + "epoch": 0.2116198537899192, + "flos": 875813630976.0, + "grad_norm": 0.07623729144092387, + "language_loss": 0.9082064, + "learning_rate": 0.0009159726435426885, + "loss": 0.91924655, + "num_input_tokens_seen": 91162960, + "router_z_loss_mlp": 0.33447266, + "step": 1100, + "time_per_iteration": 3.0668158531188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108924, + "balance_loss_mlp": 1.0554564, + "epoch": 0.21181223547518277, + "flos": 523410992640.0, + "grad_norm": 0.06029059005678133, + "language_loss": 0.90809137, + "learning_rate": 0.0009157997011647154, + "loss": 0.91898382, + "num_input_tokens_seen": 91229840, + "router_z_loss_mlp": 0.33813477, + "step": 1101, + "time_per_iteration": 2.5932393074035645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110663, + "balance_loss_mlp": 1.07327497, + "epoch": 0.21200461716044633, + "flos": 572014669824.0, + "grad_norm": 0.05812758027328986, + "language_loss": 0.86378956, + "learning_rate": 0.0009156265973589817, + "loss": 0.87485588, + "num_input_tokens_seen": 91307936, + "router_z_loss_mlp": 0.33374023, + "step": 1102, + "time_per_iteration": 2.79496431350708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110266, + "balance_loss_mlp": 1.07672012, + "epoch": 0.2121969988457099, + "flos": 544869075456.0, + "grad_norm": 0.0704183859149776, + "language_loss": 0.89789248, + "learning_rate": 0.0009154533321926926, + "loss": 0.90899515, + "num_input_tokens_seen": 91372848, + "router_z_loss_mlp": 0.33569336, + "step": 1103, + "time_per_iteration": 2.5982048511505127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107149, + "balance_loss_mlp": 1.07393694, + "epoch": 0.21238938053097345, + "flos": 843489704448.0, + "grad_norm": 0.06399101868010165, + "language_loss": 0.87705767, + "learning_rate": 0.0009152799057331156, + "loss": 0.88812917, + "num_input_tokens_seen": 91452768, + "router_z_loss_mlp": 0.33227539, + "step": 1104, + "time_per_iteration": 3.088672637939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100871, + "balance_loss_mlp": 1.06768262, + "epoch": 0.212581762216237, + "flos": 445984081920.0, + "grad_norm": 0.064105004549741, + "language_loss": 0.91047186, + "learning_rate": 0.0009151063180475805, + "loss": 0.9214806, + "num_input_tokens_seen": 91519888, + "router_z_loss_mlp": 0.33203125, + "step": 1105, + "time_per_iteration": 2.569998025894165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090698, + "balance_loss_mlp": 1.05798697, + "epoch": 0.21277414390150057, + "flos": 514129655808.0, + "grad_norm": 0.05967324045126681, + "language_loss": 0.84732658, + "learning_rate": 0.0009149325692034803, + "loss": 0.85823357, + "num_input_tokens_seen": 91585744, + "router_z_loss_mlp": 0.32714844, + "step": 1106, + "time_per_iteration": 2.6009016036987305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149918, + "balance_loss_mlp": 1.13380063, + "epoch": 0.21296652558676413, + "flos": 1484790552576.0, + "grad_norm": 0.04210654195905191, + "language_loss": 0.79203427, + "learning_rate": 0.0009147586592682702, + "loss": 0.80353343, + "num_input_tokens_seen": 91805840, + "router_z_loss_mlp": 0.16113281, + "step": 1107, + "time_per_iteration": 4.820629596710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082056, + "balance_loss_mlp": 1.04994082, + "epoch": 0.21315890727202771, + "flos": 845689669632.0, + "grad_norm": 0.07454945953507684, + "language_loss": 0.87513995, + "learning_rate": 0.0009145845883094678, + "loss": 0.88596046, + "num_input_tokens_seen": 91885936, + "router_z_loss_mlp": 0.32080078, + "step": 1108, + "time_per_iteration": 3.0311591625213623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083334, + "balance_loss_mlp": 1.04971695, + "epoch": 0.21335128895729127, + "flos": 629086376448.0, + "grad_norm": 0.07212897946446892, + "language_loss": 0.85387337, + "learning_rate": 0.000914410356394654, + "loss": 0.86470675, + "num_input_tokens_seen": 91959888, + "router_z_loss_mlp": 0.33642578, + "step": 1109, + "time_per_iteration": 2.7746968269348145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085071, + "balance_loss_mlp": 1.05102468, + "epoch": 0.21354367064255483, + "flos": 710649787392.0, + "grad_norm": 0.053148069764317206, + "language_loss": 0.85104829, + "learning_rate": 0.0009142359635914709, + "loss": 0.86189902, + "num_input_tokens_seen": 92043728, + "router_z_loss_mlp": 0.34057617, + "step": 1110, + "time_per_iteration": 3.109018564224243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083884, + "balance_loss_mlp": 1.05067194, + "epoch": 0.2137360523278184, + "flos": 455950688256.0, + "grad_norm": 0.07113647076789116, + "language_loss": 0.84692943, + "learning_rate": 0.0009140614099676245, + "loss": 0.8577683, + "num_input_tokens_seen": 92114096, + "router_z_loss_mlp": 0.33203125, + "step": 1111, + "time_per_iteration": 2.5607409477233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083673, + "balance_loss_mlp": 1.05072355, + "epoch": 0.21392843401308195, + "flos": 665749034496.0, + "grad_norm": 0.059219994241997045, + "language_loss": 0.82997137, + "learning_rate": 0.0009138866955908821, + "loss": 0.84080815, + "num_input_tokens_seen": 92193552, + "router_z_loss_mlp": 0.32958984, + "step": 1112, + "time_per_iteration": 2.901376724243164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086583, + "balance_loss_mlp": 1.05327559, + "epoch": 0.2141208156983455, + "flos": 748656843264.0, + "grad_norm": 0.06302145936378449, + "language_loss": 0.80617785, + "learning_rate": 0.0009137118205290738, + "loss": 0.81704366, + "num_input_tokens_seen": 92279248, + "router_z_loss_mlp": 0.33325195, + "step": 1113, + "time_per_iteration": 2.9629132747650146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085777, + "balance_loss_mlp": 1.05142069, + "epoch": 0.21431319738360907, + "flos": 418898124288.0, + "grad_norm": 0.06913372638273338, + "language_loss": 0.90778732, + "learning_rate": 0.0009135367848500924, + "loss": 0.91864502, + "num_input_tokens_seen": 92344064, + "router_z_loss_mlp": 0.34399414, + "step": 1114, + "time_per_iteration": 2.5860419273376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087087, + "balance_loss_mlp": 1.05406582, + "epoch": 0.21450557906887263, + "flos": 608849035776.0, + "grad_norm": 0.07370492115341919, + "language_loss": 0.86567605, + "learning_rate": 0.0009133615886218927, + "loss": 0.87654686, + "num_input_tokens_seen": 92410544, + "router_z_loss_mlp": 0.33032227, + "step": 1115, + "time_per_iteration": 2.6986265182495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095845, + "balance_loss_mlp": 1.06082106, + "epoch": 0.21469796075413622, + "flos": 561649393152.0, + "grad_norm": 0.0682239504380638, + "language_loss": 0.88444531, + "learning_rate": 0.0009131862319124917, + "loss": 0.89540386, + "num_input_tokens_seen": 92480272, + "router_z_loss_mlp": 0.3503418, + "step": 1116, + "time_per_iteration": 2.644977569580078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086331, + "balance_loss_mlp": 1.0540725, + "epoch": 0.21489034243939978, + "flos": 594363225600.0, + "grad_norm": 0.06937847326766512, + "language_loss": 0.8429122, + "learning_rate": 0.0009130107147899691, + "loss": 0.85377544, + "num_input_tokens_seen": 92555584, + "router_z_loss_mlp": 0.32250977, + "step": 1117, + "time_per_iteration": 2.768064498901367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084886, + "balance_loss_mlp": 1.05148315, + "epoch": 0.21508272412466334, + "flos": 441661317120.0, + "grad_norm": 0.09911577685113587, + "language_loss": 0.85504615, + "learning_rate": 0.0009128350373224665, + "loss": 0.86589503, + "num_input_tokens_seen": 92623136, + "router_z_loss_mlp": 0.33422852, + "step": 1118, + "time_per_iteration": 2.5369865894317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033841, + "balance_loss_mlp": 1.02206326, + "epoch": 0.2152751058099269, + "flos": 1495397348352.0, + "grad_norm": 0.028624916140058014, + "language_loss": 0.81456429, + "learning_rate": 0.0009126591995781883, + "loss": 0.82490271, + "num_input_tokens_seen": 92842608, + "router_z_loss_mlp": 0.11767578, + "step": 1119, + "time_per_iteration": 4.634536266326904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109091, + "balance_loss_mlp": 1.05741262, + "epoch": 0.21546748749519046, + "flos": 493759895040.0, + "grad_norm": 0.057336284262766976, + "language_loss": 0.85470641, + "learning_rate": 0.0009124832016254005, + "loss": 0.86561549, + "num_input_tokens_seen": 92912960, + "router_z_loss_mlp": 0.33520508, + "step": 1120, + "time_per_iteration": 2.57099986076355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097048, + "balance_loss_mlp": 1.06245303, + "epoch": 0.21565986918045402, + "flos": 634241387520.0, + "grad_norm": 0.0556622286599547, + "language_loss": 0.8842063, + "learning_rate": 0.0009123070435324316, + "loss": 0.89517677, + "num_input_tokens_seen": 92982272, + "router_z_loss_mlp": 0.34619141, + "step": 1121, + "time_per_iteration": 2.73698091506958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010392, + "balance_loss_mlp": 1.02780366, + "epoch": 0.21585225086571758, + "flos": 1582502704128.0, + "grad_norm": 0.024824935431588098, + "language_loss": 0.77875781, + "learning_rate": 0.0009121307253676722, + "loss": 0.78914982, + "num_input_tokens_seen": 93218752, + "router_z_loss_mlp": 0.11376953, + "step": 1122, + "time_per_iteration": 4.960963010787964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093163, + "balance_loss_mlp": 1.05897415, + "epoch": 0.21604463255098114, + "flos": 683799556608.0, + "grad_norm": 0.06637115500638362, + "language_loss": 0.86772728, + "learning_rate": 0.0009119542471995752, + "loss": 0.87865889, + "num_input_tokens_seen": 93293968, + "router_z_loss_mlp": 0.34204102, + "step": 1123, + "time_per_iteration": 2.819042205810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109659, + "balance_loss_mlp": 1.06228209, + "epoch": 0.2162370142362447, + "flos": 780660675072.0, + "grad_norm": 0.06221084946299637, + "language_loss": 0.81623554, + "learning_rate": 0.0009117776090966554, + "loss": 0.82720149, + "num_input_tokens_seen": 93367088, + "router_z_loss_mlp": 0.34326172, + "step": 1124, + "time_per_iteration": 2.9435975551605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090546, + "balance_loss_mlp": 1.05473578, + "epoch": 0.21642939592150828, + "flos": 1001745294336.0, + "grad_norm": 0.06219513600405685, + "language_loss": 0.86821365, + "learning_rate": 0.0009116008111274899, + "loss": 0.8791191, + "num_input_tokens_seen": 93452944, + "router_z_loss_mlp": 0.35839844, + "step": 1125, + "time_per_iteration": 3.250828504562378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01023544, + "balance_loss_mlp": 1.01271951, + "epoch": 0.21662177760677184, + "flos": 1481867440128.0, + "grad_norm": 0.013492425774453086, + "language_loss": 0.79106927, + "learning_rate": 0.0009114238533607176, + "loss": 0.8013047, + "num_input_tokens_seen": 93677328, + "router_z_loss_mlp": 0.10839844, + "step": 1126, + "time_per_iteration": 4.836662530899048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078043, + "balance_loss_mlp": 1.0431627, + "epoch": 0.2168141592920354, + "flos": 887030092800.0, + "grad_norm": 0.06408405180788145, + "language_loss": 0.84878719, + "learning_rate": 0.0009112467358650396, + "loss": 0.85956764, + "num_input_tokens_seen": 93756848, + "router_z_loss_mlp": 0.34912109, + "step": 1127, + "time_per_iteration": 3.118460178375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087154, + "balance_loss_mlp": 1.05205846, + "epoch": 0.21700654097729896, + "flos": 545682382848.0, + "grad_norm": 0.06014422622436645, + "language_loss": 0.86521864, + "learning_rate": 0.0009110694587092192, + "loss": 0.87609017, + "num_input_tokens_seen": 93834704, + "router_z_loss_mlp": 0.35131836, + "step": 1128, + "time_per_iteration": 2.736814022064209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080165, + "balance_loss_mlp": 1.0446167, + "epoch": 0.21719892266256252, + "flos": 509270008320.0, + "grad_norm": 0.06606219668196793, + "language_loss": 0.81429344, + "learning_rate": 0.0009108920219620815, + "loss": 0.82509506, + "num_input_tokens_seen": 93904448, + "router_z_loss_mlp": 0.35571289, + "step": 1129, + "time_per_iteration": 2.6214489936828613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083138, + "balance_loss_mlp": 1.04782772, + "epoch": 0.21739130434782608, + "flos": 543150738432.0, + "grad_norm": 0.060577581075914995, + "language_loss": 0.89903116, + "learning_rate": 0.0009107144256925133, + "loss": 0.90986252, + "num_input_tokens_seen": 93979312, + "router_z_loss_mlp": 0.35302734, + "step": 1130, + "time_per_iteration": 2.6337971687316895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079633, + "balance_loss_mlp": 1.04489541, + "epoch": 0.21758368603308964, + "flos": 616564804608.0, + "grad_norm": 0.0674499307688184, + "language_loss": 0.82610142, + "learning_rate": 0.0009105366699694638, + "loss": 0.83689773, + "num_input_tokens_seen": 94052032, + "router_z_loss_mlp": 0.34790039, + "step": 1131, + "time_per_iteration": 2.6984267234802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085371, + "balance_loss_mlp": 1.04979873, + "epoch": 0.2177760677183532, + "flos": 634813175808.0, + "grad_norm": 0.051829013054278075, + "language_loss": 0.8159321, + "learning_rate": 0.0009103587548619439, + "loss": 0.8267858, + "num_input_tokens_seen": 94124944, + "router_z_loss_mlp": 0.35571289, + "step": 1132, + "time_per_iteration": 2.8308732509613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083319, + "balance_loss_mlp": 1.04850996, + "epoch": 0.2179684494036168, + "flos": 532181587968.0, + "grad_norm": 0.06772780520844247, + "language_loss": 0.86115086, + "learning_rate": 0.0009101806804390261, + "loss": 0.87198412, + "num_input_tokens_seen": 94200384, + "router_z_loss_mlp": 0.34863281, + "step": 1133, + "time_per_iteration": 2.7745282649993896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081903, + "balance_loss_mlp": 1.04671264, + "epoch": 0.21816083108888035, + "flos": 474980433408.0, + "grad_norm": 0.05911376481567057, + "language_loss": 0.90451765, + "learning_rate": 0.0009100024467698453, + "loss": 0.91533667, + "num_input_tokens_seen": 94266992, + "router_z_loss_mlp": 0.35205078, + "step": 1134, + "time_per_iteration": 2.551278829574585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086865, + "balance_loss_mlp": 1.051054, + "epoch": 0.2183532127741439, + "flos": 577198794240.0, + "grad_norm": 0.07962415284192025, + "language_loss": 0.83050048, + "learning_rate": 0.0009098240539235981, + "loss": 0.84136909, + "num_input_tokens_seen": 94334304, + "router_z_loss_mlp": 0.3581543, + "step": 1135, + "time_per_iteration": 2.660019636154175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086508, + "balance_loss_mlp": 1.05172312, + "epoch": 0.21854559445940747, + "flos": 593832135168.0, + "grad_norm": 0.05867668726509775, + "language_loss": 0.87679315, + "learning_rate": 0.0009096455019695423, + "loss": 0.88765824, + "num_input_tokens_seen": 94413296, + "router_z_loss_mlp": 0.34838867, + "step": 1136, + "time_per_iteration": 2.756463050842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090029, + "balance_loss_mlp": 1.05426645, + "epoch": 0.21873797614467103, + "flos": 408464446464.0, + "grad_norm": 0.06290439978907646, + "language_loss": 0.90092266, + "learning_rate": 0.000909466790976998, + "loss": 0.91182297, + "num_input_tokens_seen": 94475840, + "router_z_loss_mlp": 0.35791016, + "step": 1137, + "time_per_iteration": 2.5046186447143555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089401, + "balance_loss_mlp": 1.05373359, + "epoch": 0.21893035782993459, + "flos": 893824865280.0, + "grad_norm": 0.05253297698454947, + "language_loss": 0.83030021, + "learning_rate": 0.0009092879210153473, + "loss": 0.84119421, + "num_input_tokens_seen": 94555184, + "router_z_loss_mlp": 0.35668945, + "step": 1138, + "time_per_iteration": 3.1294023990631104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092735, + "balance_loss_mlp": 1.05835557, + "epoch": 0.21912273951519814, + "flos": 467392702464.0, + "grad_norm": 0.05516730570048504, + "language_loss": 0.88930631, + "learning_rate": 0.0009091088921540333, + "loss": 0.90023363, + "num_input_tokens_seen": 94622656, + "router_z_loss_mlp": 0.34399414, + "step": 1139, + "time_per_iteration": 2.5161380767822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081896, + "balance_loss_mlp": 1.06921172, + "epoch": 0.2193151212004617, + "flos": 1531262665728.0, + "grad_norm": 0.036356034107047845, + "language_loss": 0.75508678, + "learning_rate": 0.0009089297044625615, + "loss": 0.76590574, + "num_input_tokens_seen": 94856496, + "router_z_loss_mlp": 0.12695312, + "step": 1140, + "time_per_iteration": 4.929131984710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087847, + "balance_loss_mlp": 1.05306172, + "epoch": 0.2195075028857253, + "flos": 590901820416.0, + "grad_norm": 0.07364984820319191, + "language_loss": 0.8488574, + "learning_rate": 0.0009087503580104985, + "loss": 0.85973585, + "num_input_tokens_seen": 94926880, + "router_z_loss_mlp": 0.34814453, + "step": 1141, + "time_per_iteration": 2.676321029663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087752, + "balance_loss_mlp": 1.05287158, + "epoch": 0.21969988457098885, + "flos": 636033917952.0, + "grad_norm": 0.0662048159418312, + "language_loss": 0.79610777, + "learning_rate": 0.0009085708528674728, + "loss": 0.80698538, + "num_input_tokens_seen": 95000528, + "router_z_loss_mlp": 0.34912109, + "step": 1142, + "time_per_iteration": 2.7667393684387207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088746, + "balance_loss_mlp": 1.05202913, + "epoch": 0.2198922662562524, + "flos": 911974311936.0, + "grad_norm": 0.07907290305355467, + "language_loss": 0.86086833, + "learning_rate": 0.0009083911891031745, + "loss": 0.87175578, + "num_input_tokens_seen": 95081040, + "router_z_loss_mlp": 0.36743164, + "step": 1143, + "time_per_iteration": 3.1026079654693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093334, + "balance_loss_mlp": 1.05809617, + "epoch": 0.22008464794151597, + "flos": 822603409920.0, + "grad_norm": 0.06284406217527433, + "language_loss": 0.91362917, + "learning_rate": 0.0009082113667873553, + "loss": 0.92456251, + "num_input_tokens_seen": 95167328, + "router_z_loss_mlp": 0.3527832, + "step": 1144, + "time_per_iteration": 3.098741292953491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107482, + "balance_loss_mlp": 1.07188582, + "epoch": 0.22027702962677953, + "flos": 459416475648.0, + "grad_norm": 0.06625631151069579, + "language_loss": 0.90562177, + "learning_rate": 0.0009080313859898283, + "loss": 0.91669661, + "num_input_tokens_seen": 95230304, + "router_z_loss_mlp": 0.35620117, + "step": 1145, + "time_per_iteration": 2.4998207092285156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111259, + "balance_loss_mlp": 1.07606888, + "epoch": 0.2204694113120431, + "flos": 530998723584.0, + "grad_norm": 0.05051092763003013, + "language_loss": 0.91815794, + "learning_rate": 0.0009078512467804684, + "loss": 0.92927051, + "num_input_tokens_seen": 95299520, + "router_z_loss_mlp": 0.35180664, + "step": 1146, + "time_per_iteration": 2.569073438644409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117194, + "balance_loss_mlp": 1.08162224, + "epoch": 0.22066179299730665, + "flos": 522382307328.0, + "grad_norm": 0.06837547739928014, + "language_loss": 0.90610331, + "learning_rate": 0.0009076709492292119, + "loss": 0.91727525, + "num_input_tokens_seen": 95368912, + "router_z_loss_mlp": 0.35571289, + "step": 1147, + "time_per_iteration": 2.614039659500122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114459, + "balance_loss_mlp": 1.07969809, + "epoch": 0.2208541746825702, + "flos": 546188742144.0, + "grad_norm": 0.06837160959472317, + "language_loss": 0.89193797, + "learning_rate": 0.0009074904934060562, + "loss": 0.90308249, + "num_input_tokens_seen": 95440800, + "router_z_loss_mlp": 0.34790039, + "step": 1148, + "time_per_iteration": 2.6419012546539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112121, + "balance_loss_mlp": 1.08578134, + "epoch": 0.22104655636783377, + "flos": 708404742144.0, + "grad_norm": 0.07108081727062696, + "language_loss": 0.84988266, + "learning_rate": 0.0009073098793810607, + "loss": 0.86109483, + "num_input_tokens_seen": 95519904, + "router_z_loss_mlp": 0.35473633, + "step": 1149, + "time_per_iteration": 2.909515142440796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116065, + "balance_loss_mlp": 1.08061242, + "epoch": 0.22123893805309736, + "flos": 584594468352.0, + "grad_norm": 0.07695680382665727, + "language_loss": 0.88374794, + "learning_rate": 0.000907129107224346, + "loss": 0.89490861, + "num_input_tokens_seen": 95591568, + "router_z_loss_mlp": 0.35522461, + "step": 1150, + "time_per_iteration": 2.7029008865356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099143, + "balance_loss_mlp": 1.06369042, + "epoch": 0.22143131973836092, + "flos": 492002270208.0, + "grad_norm": 0.049049579749502144, + "language_loss": 0.88305712, + "learning_rate": 0.0009069481770060939, + "loss": 0.89404863, + "num_input_tokens_seen": 95664480, + "router_z_loss_mlp": 0.35498047, + "step": 1151, + "time_per_iteration": 2.65167236328125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097606, + "balance_loss_mlp": 1.06248736, + "epoch": 0.22162370142362448, + "flos": 1079227459584.0, + "grad_norm": 0.054063738490033035, + "language_loss": 0.84240663, + "learning_rate": 0.000906767088796548, + "loss": 0.85338271, + "num_input_tokens_seen": 95754400, + "router_z_loss_mlp": 0.35180664, + "step": 1152, + "time_per_iteration": 3.423985004425049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110736, + "balance_loss_mlp": 1.07300401, + "epoch": 0.22181608310888803, + "flos": 492258345984.0, + "grad_norm": 0.057939830998464815, + "language_loss": 0.87012136, + "learning_rate": 0.0009065858426660127, + "loss": 0.88119501, + "num_input_tokens_seen": 95826944, + "router_z_loss_mlp": 0.34399414, + "step": 1153, + "time_per_iteration": 2.5987319946289062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108178, + "balance_loss_mlp": 1.07355952, + "epoch": 0.2220084647941516, + "flos": 723687892992.0, + "grad_norm": 0.0653796708212952, + "language_loss": 0.84926325, + "learning_rate": 0.0009064044386848543, + "loss": 0.86034507, + "num_input_tokens_seen": 95902688, + "router_z_loss_mlp": 0.34667969, + "step": 1154, + "time_per_iteration": 2.9024224281311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112372, + "balance_loss_mlp": 1.0753932, + "epoch": 0.22220084647941515, + "flos": 488988997632.0, + "grad_norm": 0.06606878403176955, + "language_loss": 0.88905716, + "learning_rate": 0.0009062228769234997, + "loss": 0.90018088, + "num_input_tokens_seen": 95969952, + "router_z_loss_mlp": 0.36987305, + "step": 1155, + "time_per_iteration": 2.5483920574188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104305, + "balance_loss_mlp": 1.06868565, + "epoch": 0.2223932281646787, + "flos": 536025696768.0, + "grad_norm": 0.06185680569649912, + "language_loss": 0.81360811, + "learning_rate": 0.0009060411574524376, + "loss": 0.82465118, + "num_input_tokens_seen": 96037344, + "router_z_loss_mlp": 0.35644531, + "step": 1156, + "time_per_iteration": 2.629166841506958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114341, + "balance_loss_mlp": 1.0794363, + "epoch": 0.22258560984994227, + "flos": 931034580480.0, + "grad_norm": 0.06288530021121215, + "language_loss": 0.88191485, + "learning_rate": 0.0009058592803422178, + "loss": 0.8930583, + "num_input_tokens_seen": 96115616, + "router_z_loss_mlp": 0.34936523, + "step": 1157, + "time_per_iteration": 3.133453845977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219449, + "balance_loss_mlp": 1.20495331, + "epoch": 0.22277799153520586, + "flos": 1198998443520.0, + "grad_norm": 0.06392494715081258, + "language_loss": 0.78710288, + "learning_rate": 0.0009056772456634512, + "loss": 0.79929739, + "num_input_tokens_seen": 96333600, + "router_z_loss_mlp": 0.14453125, + "step": 1158, + "time_per_iteration": 4.838433027267456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095231, + "balance_loss_mlp": 1.0620904, + "epoch": 0.22297037322046942, + "flos": 501052262400.0, + "grad_norm": 0.059439708082357066, + "language_loss": 0.90095651, + "learning_rate": 0.00090549505348681, + "loss": 0.91190875, + "num_input_tokens_seen": 96402544, + "router_z_loss_mlp": 0.33154297, + "step": 1159, + "time_per_iteration": 2.561887264251709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083603, + "balance_loss_mlp": 1.04977143, + "epoch": 0.22316275490573298, + "flos": 752413612032.0, + "grad_norm": 0.05610915875378834, + "language_loss": 0.84254742, + "learning_rate": 0.0009053127038830275, + "loss": 0.85338354, + "num_input_tokens_seen": 96487600, + "router_z_loss_mlp": 0.33862305, + "step": 1160, + "time_per_iteration": 2.9465925693511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082896, + "balance_loss_mlp": 1.04844403, + "epoch": 0.22335513659099654, + "flos": 514553057280.0, + "grad_norm": 0.06657410760601727, + "language_loss": 0.87182009, + "learning_rate": 0.000905130196922898, + "loss": 0.88264906, + "num_input_tokens_seen": 96554912, + "router_z_loss_mlp": 0.3449707, + "step": 1161, + "time_per_iteration": 2.597325325012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076013, + "balance_loss_mlp": 1.04213357, + "epoch": 0.2235475182762601, + "flos": 484286501376.0, + "grad_norm": 0.057467913173926514, + "language_loss": 0.87228084, + "learning_rate": 0.0009049475326772769, + "loss": 0.88304096, + "num_input_tokens_seen": 96624192, + "router_z_loss_mlp": 0.33911133, + "step": 1162, + "time_per_iteration": 2.591592788696289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107232, + "balance_loss_mlp": 1.0379163, + "epoch": 0.22373989996152366, + "flos": 469698794496.0, + "grad_norm": 0.05481831884816676, + "language_loss": 0.83362567, + "learning_rate": 0.0009047647112170811, + "loss": 0.84434885, + "num_input_tokens_seen": 96701040, + "router_z_loss_mlp": 0.34448242, + "step": 1163, + "time_per_iteration": 2.7466936111450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080512, + "balance_loss_mlp": 1.04503536, + "epoch": 0.22393228164678722, + "flos": 1270512594432.0, + "grad_norm": 0.0775991801606853, + "language_loss": 0.87402856, + "learning_rate": 0.0009045817326132876, + "loss": 0.88483369, + "num_input_tokens_seen": 96791200, + "router_z_loss_mlp": 0.35498047, + "step": 1164, + "time_per_iteration": 3.6615524291992188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082563, + "balance_loss_mlp": 1.04665732, + "epoch": 0.22412466333205078, + "flos": 596052449280.0, + "grad_norm": 0.05603114612800397, + "language_loss": 0.83484542, + "learning_rate": 0.0009043985969369357, + "loss": 0.84567106, + "num_input_tokens_seen": 96869360, + "router_z_loss_mlp": 0.35913086, + "step": 1165, + "time_per_iteration": 2.800389528274536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084793, + "balance_loss_mlp": 1.047647, + "epoch": 0.22431704501731436, + "flos": 608136062976.0, + "grad_norm": 0.052919924442321326, + "language_loss": 0.84423298, + "learning_rate": 0.0009042153042591245, + "loss": 0.85508084, + "num_input_tokens_seen": 96945840, + "router_z_loss_mlp": 0.37158203, + "step": 1166, + "time_per_iteration": 2.7848384380340576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080872, + "balance_loss_mlp": 1.04622972, + "epoch": 0.22450942670257792, + "flos": 906203842560.0, + "grad_norm": 0.054053491984114646, + "language_loss": 0.85318398, + "learning_rate": 0.0009040318546510146, + "loss": 0.86399269, + "num_input_tokens_seen": 97029296, + "router_z_loss_mlp": 0.34667969, + "step": 1167, + "time_per_iteration": 3.1406538486480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080871, + "balance_loss_mlp": 1.04529881, + "epoch": 0.22470180838784148, + "flos": 565032222720.0, + "grad_norm": 0.06590224184570584, + "language_loss": 0.85490131, + "learning_rate": 0.0009038482481838275, + "loss": 0.86571002, + "num_input_tokens_seen": 97097776, + "router_z_loss_mlp": 0.35620117, + "step": 1168, + "time_per_iteration": 2.6623363494873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107832, + "balance_loss_mlp": 1.04265296, + "epoch": 0.22489419007310504, + "flos": 834109443072.0, + "grad_norm": 0.05295244004415107, + "language_loss": 0.87364161, + "learning_rate": 0.0009036644849288455, + "loss": 0.88442481, + "num_input_tokens_seen": 97181424, + "router_z_loss_mlp": 0.35668945, + "step": 1169, + "time_per_iteration": 3.096397638320923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082872, + "balance_loss_mlp": 1.04567838, + "epoch": 0.2250865717583686, + "flos": 580788237312.0, + "grad_norm": 0.06189616009675637, + "language_loss": 0.85257494, + "learning_rate": 0.0009034805649574118, + "loss": 0.86340362, + "num_input_tokens_seen": 97252128, + "router_z_loss_mlp": 0.37207031, + "step": 1170, + "time_per_iteration": 2.655629873275757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081931, + "balance_loss_mlp": 1.04669285, + "epoch": 0.22527895344363216, + "flos": 600091435008.0, + "grad_norm": 0.0574349936504533, + "language_loss": 0.85081124, + "learning_rate": 0.0009032964883409308, + "loss": 0.86163056, + "num_input_tokens_seen": 97326640, + "router_z_loss_mlp": 0.35253906, + "step": 1171, + "time_per_iteration": 2.9228479862213135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096453, + "balance_loss_mlp": 1.08109915, + "epoch": 0.22547133512889572, + "flos": 1440009073152.0, + "grad_norm": 0.03435009764288223, + "language_loss": 0.73050535, + "learning_rate": 0.000903112255150867, + "loss": 0.74146986, + "num_input_tokens_seen": 97553952, + "router_z_loss_mlp": 0.15332031, + "step": 1172, + "time_per_iteration": 4.9870195388793945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099751, + "balance_loss_mlp": 1.06365418, + "epoch": 0.22566371681415928, + "flos": 490377065472.0, + "grad_norm": 0.06750207251725504, + "language_loss": 0.87597418, + "learning_rate": 0.0009029278654587462, + "loss": 0.88697171, + "num_input_tokens_seen": 97623584, + "router_z_loss_mlp": 0.36108398, + "step": 1173, + "time_per_iteration": 2.545078754425049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098309, + "balance_loss_mlp": 1.06156898, + "epoch": 0.22585609849942284, + "flos": 604334214144.0, + "grad_norm": 0.06244795934891309, + "language_loss": 0.82517409, + "learning_rate": 0.0009027433193361548, + "loss": 0.8361572, + "num_input_tokens_seen": 97695952, + "router_z_loss_mlp": 0.36767578, + "step": 1174, + "time_per_iteration": 2.69753098487854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100346, + "balance_loss_mlp": 1.06305695, + "epoch": 0.22604848018468643, + "flos": 635280247296.0, + "grad_norm": 0.06854123529633785, + "language_loss": 0.87138826, + "learning_rate": 0.00090255861685474, + "loss": 0.88239175, + "num_input_tokens_seen": 97764544, + "router_z_loss_mlp": 0.37280273, + "step": 1175, + "time_per_iteration": 2.7199149131774902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094451, + "balance_loss_mlp": 1.05744886, + "epoch": 0.22624086186995, + "flos": 479633467392.0, + "grad_norm": 0.06836538183258173, + "language_loss": 0.91474092, + "learning_rate": 0.0009023737580862095, + "loss": 0.92568541, + "num_input_tokens_seen": 97830976, + "router_z_loss_mlp": 0.36962891, + "step": 1176, + "time_per_iteration": 2.51255464553833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092025, + "balance_loss_mlp": 1.0570724, + "epoch": 0.22643324355521355, + "flos": 495566982144.0, + "grad_norm": 0.05906016973995859, + "language_loss": 0.83066601, + "learning_rate": 0.0009021887431023321, + "loss": 0.84158623, + "num_input_tokens_seen": 97898800, + "router_z_loss_mlp": 0.34985352, + "step": 1177, + "time_per_iteration": 2.5783960819244385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084144, + "balance_loss_mlp": 1.04842877, + "epoch": 0.2266256252404771, + "flos": 561271071744.0, + "grad_norm": 0.05542928649781209, + "language_loss": 0.87720597, + "learning_rate": 0.0009020035719749369, + "loss": 0.8880474, + "num_input_tokens_seen": 97974112, + "router_z_loss_mlp": 0.35742188, + "step": 1178, + "time_per_iteration": 2.7076900005340576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088743, + "balance_loss_mlp": 1.05259871, + "epoch": 0.22681800692574067, + "flos": 579353527296.0, + "grad_norm": 0.05892405405909356, + "language_loss": 0.77506709, + "learning_rate": 0.0009018182447759136, + "loss": 0.78595448, + "num_input_tokens_seen": 98056640, + "router_z_loss_mlp": 0.36157227, + "step": 1179, + "time_per_iteration": 2.974362373352051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083275, + "balance_loss_mlp": 1.04798961, + "epoch": 0.22701038861100423, + "flos": 739842577920.0, + "grad_norm": 0.0555118465290956, + "language_loss": 0.80168724, + "learning_rate": 0.0009016327615772126, + "loss": 0.81252003, + "num_input_tokens_seen": 98135952, + "router_z_loss_mlp": 0.35327148, + "step": 1180, + "time_per_iteration": 2.9207658767700195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082522, + "balance_loss_mlp": 1.04776096, + "epoch": 0.2272027702962678, + "flos": 576996562944.0, + "grad_norm": 0.06857059729731818, + "language_loss": 0.88146389, + "learning_rate": 0.0009014471224508451, + "loss": 0.8922891, + "num_input_tokens_seen": 98204288, + "router_z_loss_mlp": 0.34790039, + "step": 1181, + "time_per_iteration": 2.6884429454803467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080826, + "balance_loss_mlp": 1.0466603, + "epoch": 0.22739515198153135, + "flos": 544012098048.0, + "grad_norm": 0.07386093909180869, + "language_loss": 0.83020878, + "learning_rate": 0.0009012613274688823, + "loss": 0.84101701, + "num_input_tokens_seen": 98269856, + "router_z_loss_mlp": 0.34204102, + "step": 1182, + "time_per_iteration": 2.625973701477051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082925, + "balance_loss_mlp": 1.04735291, + "epoch": 0.22758753366679493, + "flos": 439932805632.0, + "grad_norm": 0.06621157637783351, + "language_loss": 0.87839937, + "learning_rate": 0.0009010753767034565, + "loss": 0.88922858, + "num_input_tokens_seen": 98335632, + "router_z_loss_mlp": 0.35571289, + "step": 1183, + "time_per_iteration": 2.545569658279419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086712, + "balance_loss_mlp": 1.05030489, + "epoch": 0.2277799153520585, + "flos": 729104772096.0, + "grad_norm": 0.07242159959797279, + "language_loss": 0.79501748, + "learning_rate": 0.0009008892702267599, + "loss": 0.80588454, + "num_input_tokens_seen": 98420592, + "router_z_loss_mlp": 0.36425781, + "step": 1184, + "time_per_iteration": 2.9862120151519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089174, + "balance_loss_mlp": 1.05322075, + "epoch": 0.22797229703732205, + "flos": 526641053184.0, + "grad_norm": 0.0740207336504876, + "language_loss": 0.89059424, + "learning_rate": 0.0009007030081110457, + "loss": 0.90148592, + "num_input_tokens_seen": 98488096, + "router_z_loss_mlp": 0.35961914, + "step": 1185, + "time_per_iteration": 2.6184284687042236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083236, + "balance_loss_mlp": 1.04783034, + "epoch": 0.2281646787225856, + "flos": 535159954944.0, + "grad_norm": 0.06479663876551665, + "language_loss": 0.84969211, + "learning_rate": 0.000900516590428627, + "loss": 0.86052454, + "num_input_tokens_seen": 98561664, + "router_z_loss_mlp": 0.35449219, + "step": 1186, + "time_per_iteration": 2.724161386489868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083897, + "balance_loss_mlp": 1.049088, + "epoch": 0.22835706040784917, + "flos": 541107924480.0, + "grad_norm": 0.052728830082858405, + "language_loss": 0.89177948, + "learning_rate": 0.0009003300172518778, + "loss": 0.90261841, + "num_input_tokens_seen": 98634336, + "router_z_loss_mlp": 0.34790039, + "step": 1187, + "time_per_iteration": 2.6810121536254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108576, + "balance_loss_mlp": 1.05018783, + "epoch": 0.22854944209311273, + "flos": 790297012224.0, + "grad_norm": 0.05376177869775473, + "language_loss": 0.84676045, + "learning_rate": 0.0009001432886532321, + "loss": 0.85761803, + "num_input_tokens_seen": 98709600, + "router_z_loss_mlp": 0.35571289, + "step": 1188, + "time_per_iteration": 2.977433919906616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109735, + "balance_loss_mlp": 1.0614686, + "epoch": 0.2287418237783763, + "flos": 469047020544.0, + "grad_norm": 0.06589135500726684, + "language_loss": 0.86752445, + "learning_rate": 0.0008999564047051843, + "loss": 0.87849802, + "num_input_tokens_seen": 98775024, + "router_z_loss_mlp": 0.35913086, + "step": 1189, + "time_per_iteration": 2.5126237869262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094774, + "balance_loss_mlp": 1.06017935, + "epoch": 0.22893420546363985, + "flos": 467786990592.0, + "grad_norm": 0.061551577223012334, + "language_loss": 0.84713042, + "learning_rate": 0.0008997693654802894, + "loss": 0.85807812, + "num_input_tokens_seen": 98845248, + "router_z_loss_mlp": 0.34643555, + "step": 1190, + "time_per_iteration": 2.6570322513580322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088276, + "balance_loss_mlp": 1.05318046, + "epoch": 0.22912658714890344, + "flos": 625974179328.0, + "grad_norm": 0.05326512300588333, + "language_loss": 0.86549705, + "learning_rate": 0.0008995821710511625, + "loss": 0.87637979, + "num_input_tokens_seen": 98913584, + "router_z_loss_mlp": 0.35107422, + "step": 1191, + "time_per_iteration": 2.8115806579589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108036, + "balance_loss_mlp": 1.04731488, + "epoch": 0.229318968834167, + "flos": 502785156096.0, + "grad_norm": 0.06330680163661413, + "language_loss": 0.8511278, + "learning_rate": 0.0008993948214904786, + "loss": 0.86193144, + "num_input_tokens_seen": 98978608, + "router_z_loss_mlp": 0.33056641, + "step": 1192, + "time_per_iteration": 2.546410083770752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125589, + "balance_loss_mlp": 1.11023474, + "epoch": 0.22951135051943056, + "flos": 1374108544512.0, + "grad_norm": 0.06153086464986019, + "language_loss": 0.78422213, + "learning_rate": 0.0008992073168709733, + "loss": 0.79547799, + "num_input_tokens_seen": 99207424, + "router_z_loss_mlp": 0.15332031, + "step": 1193, + "time_per_iteration": 4.891269207000732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099997, + "balance_loss_mlp": 1.06306624, + "epoch": 0.22970373220469412, + "flos": 644045050368.0, + "grad_norm": 0.06658536787009234, + "language_loss": 0.79028845, + "learning_rate": 0.0008990196572654427, + "loss": 0.80128849, + "num_input_tokens_seen": 99290592, + "router_z_loss_mlp": 0.36914062, + "step": 1194, + "time_per_iteration": 2.8504366874694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100467, + "balance_loss_mlp": 1.06582475, + "epoch": 0.22989611388995768, + "flos": 499945001472.0, + "grad_norm": 0.048025217626556156, + "language_loss": 0.87748766, + "learning_rate": 0.0008988318427467426, + "loss": 0.88849235, + "num_input_tokens_seen": 99366096, + "router_z_loss_mlp": 0.34667969, + "step": 1195, + "time_per_iteration": 2.735084056854248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099859, + "balance_loss_mlp": 1.06524014, + "epoch": 0.23008849557522124, + "flos": 1096071796224.0, + "grad_norm": 0.06731751876810108, + "language_loss": 0.86263168, + "learning_rate": 0.0008986438733877887, + "loss": 0.87363023, + "num_input_tokens_seen": 99456768, + "router_z_loss_mlp": 0.34667969, + "step": 1196, + "time_per_iteration": 3.435035228729248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100924, + "balance_loss_mlp": 1.06606746, + "epoch": 0.2302808772604848, + "flos": 683313546240.0, + "grad_norm": 0.04733445604251135, + "language_loss": 0.84099567, + "learning_rate": 0.0008984557492615576, + "loss": 0.85200489, + "num_input_tokens_seen": 99539616, + "router_z_loss_mlp": 0.34887695, + "step": 1197, + "time_per_iteration": 2.927668809890747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107534, + "balance_loss_mlp": 1.07317793, + "epoch": 0.23047325894574835, + "flos": 528664928256.0, + "grad_norm": 0.0630370564804667, + "language_loss": 0.89949608, + "learning_rate": 0.0008982674704410854, + "loss": 0.91057146, + "num_input_tokens_seen": 99612064, + "router_z_loss_mlp": 0.34399414, + "step": 1198, + "time_per_iteration": 2.691016435623169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107479, + "balance_loss_mlp": 1.07228875, + "epoch": 0.23066564063101191, + "flos": 682427455488.0, + "grad_norm": 0.06209648084563375, + "language_loss": 0.77829844, + "learning_rate": 0.0008980790369994682, + "loss": 0.78937328, + "num_input_tokens_seen": 99691040, + "router_z_loss_mlp": 0.35205078, + "step": 1199, + "time_per_iteration": 2.9320883750915527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100492, + "balance_loss_mlp": 1.06525421, + "epoch": 0.2308580223162755, + "flos": 558247624704.0, + "grad_norm": 0.09180159748966574, + "language_loss": 0.87396461, + "learning_rate": 0.000897890449009863, + "loss": 0.88496947, + "num_input_tokens_seen": 99762016, + "router_z_loss_mlp": 0.3527832, + "step": 1200, + "time_per_iteration": 2.6804869174957275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096306, + "balance_loss_mlp": 1.06183052, + "epoch": 0.23105040400153906, + "flos": 555406060032.0, + "grad_norm": 0.05982856494430897, + "language_loss": 0.90313268, + "learning_rate": 0.0008977017065454853, + "loss": 0.9140957, + "num_input_tokens_seen": 99835552, + "router_z_loss_mlp": 0.3449707, + "step": 1201, + "time_per_iteration": 2.639636754989624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090282, + "balance_loss_mlp": 1.05556786, + "epoch": 0.23124278568680262, + "flos": 704474855424.0, + "grad_norm": 0.06077351963181601, + "language_loss": 0.80804175, + "learning_rate": 0.0008975128096796121, + "loss": 0.81894457, + "num_input_tokens_seen": 99910784, + "router_z_loss_mlp": 0.34765625, + "step": 1202, + "time_per_iteration": 2.8410260677337646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078695, + "balance_loss_mlp": 1.04431474, + "epoch": 0.23143516737206618, + "flos": 612469002240.0, + "grad_norm": 0.07413481943536562, + "language_loss": 0.85940087, + "learning_rate": 0.0008973237584855794, + "loss": 0.87018776, + "num_input_tokens_seen": 99991120, + "router_z_loss_mlp": 0.34423828, + "step": 1203, + "time_per_iteration": 2.898423671722412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077022, + "balance_loss_mlp": 1.04233205, + "epoch": 0.23162754905732974, + "flos": 389030238720.0, + "grad_norm": 0.06038618944932519, + "language_loss": 0.8201915, + "learning_rate": 0.0008971345530367832, + "loss": 0.83096182, + "num_input_tokens_seen": 100053888, + "router_z_loss_mlp": 0.34716797, + "step": 1204, + "time_per_iteration": 2.486668586730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082339, + "balance_loss_mlp": 1.04738712, + "epoch": 0.2318199307425933, + "flos": 667481928192.0, + "grad_norm": 0.05427081728260985, + "language_loss": 0.85029405, + "learning_rate": 0.0008969451934066799, + "loss": 0.86111748, + "num_input_tokens_seen": 100124176, + "router_z_loss_mlp": 0.34960938, + "step": 1205, + "time_per_iteration": 2.771306276321411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091653, + "balance_loss_mlp": 1.05662966, + "epoch": 0.23201231242785686, + "flos": 666093860352.0, + "grad_norm": 0.0707913589572404, + "language_loss": 0.80143172, + "learning_rate": 0.0008967556796687854, + "loss": 0.81234825, + "num_input_tokens_seen": 100205296, + "router_z_loss_mlp": 0.35058594, + "step": 1206, + "time_per_iteration": 2.8904309272766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087698, + "balance_loss_mlp": 1.05353224, + "epoch": 0.23220469411312042, + "flos": 748498281984.0, + "grad_norm": 0.05559113870944949, + "language_loss": 0.83954245, + "learning_rate": 0.0008965660118966752, + "loss": 0.8504194, + "num_input_tokens_seen": 100279440, + "router_z_loss_mlp": 0.34204102, + "step": 1207, + "time_per_iteration": 2.9140615463256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108846, + "balance_loss_mlp": 1.05529559, + "epoch": 0.232397075798384, + "flos": 666763163136.0, + "grad_norm": 0.04975334384076733, + "language_loss": 0.90441763, + "learning_rate": 0.0008963761901639851, + "loss": 0.91530222, + "num_input_tokens_seen": 100354512, + "router_z_loss_mlp": 0.33154297, + "step": 1208, + "time_per_iteration": 2.8032286167144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094486, + "balance_loss_mlp": 1.06008244, + "epoch": 0.23258945748364757, + "flos": 609937357824.0, + "grad_norm": 0.05840728669351643, + "language_loss": 0.83201033, + "learning_rate": 0.0008961862145444103, + "loss": 0.84295517, + "num_input_tokens_seen": 100426848, + "router_z_loss_mlp": 0.34399414, + "step": 1209, + "time_per_iteration": 2.7161943912506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094305, + "balance_loss_mlp": 1.05954397, + "epoch": 0.23278183916891113, + "flos": 489397842432.0, + "grad_norm": 0.06743904317466738, + "language_loss": 0.85216832, + "learning_rate": 0.0008959960851117059, + "loss": 0.86311138, + "num_input_tokens_seen": 100496176, + "router_z_loss_mlp": 0.34790039, + "step": 1210, + "time_per_iteration": 2.5817031860351562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095047, + "balance_loss_mlp": 1.06014228, + "epoch": 0.23297422085417469, + "flos": 511314232320.0, + "grad_norm": 0.057575168534165826, + "language_loss": 0.84338427, + "learning_rate": 0.0008958058019396868, + "loss": 0.85433477, + "num_input_tokens_seen": 100575072, + "router_z_loss_mlp": 0.34936523, + "step": 1211, + "time_per_iteration": 2.7788164615631104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091213, + "balance_loss_mlp": 1.05730987, + "epoch": 0.23316660253943824, + "flos": 546145072128.0, + "grad_norm": 0.057082370400879795, + "language_loss": 0.86897939, + "learning_rate": 0.0008956153651022274, + "loss": 0.87989151, + "num_input_tokens_seen": 100648304, + "router_z_loss_mlp": 0.33935547, + "step": 1212, + "time_per_iteration": 2.7309062480926514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090204, + "balance_loss_mlp": 1.05608642, + "epoch": 0.2333589842247018, + "flos": 509998947840.0, + "grad_norm": 0.06317396982696966, + "language_loss": 0.84641176, + "learning_rate": 0.0008954247746732618, + "loss": 0.85731381, + "num_input_tokens_seen": 100717616, + "router_z_loss_mlp": 0.34155273, + "step": 1213, + "time_per_iteration": 2.619058609008789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084199, + "balance_loss_mlp": 1.05072522, + "epoch": 0.23355136590996536, + "flos": 662834686464.0, + "grad_norm": 0.09780220222501788, + "language_loss": 0.90954423, + "learning_rate": 0.0008952340307267837, + "loss": 0.9203862, + "num_input_tokens_seen": 100797056, + "router_z_loss_mlp": 0.33496094, + "step": 1214, + "time_per_iteration": 2.869351387023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108593, + "balance_loss_mlp": 1.05128753, + "epoch": 0.23374374759522892, + "flos": 508206417408.0, + "grad_norm": 0.061496426320555984, + "language_loss": 0.83373952, + "learning_rate": 0.0008950431333368468, + "loss": 0.84459883, + "num_input_tokens_seen": 100863632, + "router_z_loss_mlp": 0.34667969, + "step": 1215, + "time_per_iteration": 2.5557806491851807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093645, + "balance_loss_mlp": 1.05928898, + "epoch": 0.2339361292804925, + "flos": 1293964028928.0, + "grad_norm": 0.062331860667319446, + "language_loss": 0.84730738, + "learning_rate": 0.0008948520825775634, + "loss": 0.85824382, + "num_input_tokens_seen": 100950272, + "router_z_loss_mlp": 0.34399414, + "step": 1216, + "time_per_iteration": 3.6050164699554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098293, + "balance_loss_mlp": 1.06343639, + "epoch": 0.23412851096575607, + "flos": 705617021952.0, + "grad_norm": 0.06500023378725601, + "language_loss": 0.84162283, + "learning_rate": 0.0008946608785231067, + "loss": 0.8526057, + "num_input_tokens_seen": 101031008, + "router_z_loss_mlp": 0.34863281, + "step": 1217, + "time_per_iteration": 2.858696699142456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109995, + "balance_loss_mlp": 1.065045, + "epoch": 0.23432089265101963, + "flos": 438036968448.0, + "grad_norm": 0.06356573317347913, + "language_loss": 0.84325957, + "learning_rate": 0.0008944695212477084, + "loss": 0.85425907, + "num_input_tokens_seen": 101094688, + "router_z_loss_mlp": 0.34912109, + "step": 1218, + "time_per_iteration": 2.4787168502807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107192, + "balance_loss_mlp": 1.07190585, + "epoch": 0.2345132743362832, + "flos": 480697058304.0, + "grad_norm": 0.05460931090439532, + "language_loss": 0.86098325, + "learning_rate": 0.0008942780108256599, + "loss": 0.87205517, + "num_input_tokens_seen": 101163744, + "router_z_loss_mlp": 0.35327148, + "step": 1219, + "time_per_iteration": 2.574692726135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105521, + "balance_loss_mlp": 1.06966305, + "epoch": 0.23470565602154675, + "flos": 411231817728.0, + "grad_norm": 0.05853057396081394, + "language_loss": 0.86360055, + "learning_rate": 0.0008940863473313121, + "loss": 0.87465572, + "num_input_tokens_seen": 101226480, + "router_z_loss_mlp": 0.35839844, + "step": 1220, + "time_per_iteration": 2.4849462509155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115361, + "balance_loss_mlp": 1.08024168, + "epoch": 0.2348980377068103, + "flos": 545189170176.0, + "grad_norm": 0.0745659618807548, + "language_loss": 0.87691534, + "learning_rate": 0.0008938945308390756, + "loss": 0.88806891, + "num_input_tokens_seen": 101291824, + "router_z_loss_mlp": 0.3515625, + "step": 1221, + "time_per_iteration": 2.6100285053253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107677, + "balance_loss_mlp": 1.07248664, + "epoch": 0.23509041939207387, + "flos": 575465900544.0, + "grad_norm": 0.055913245264753976, + "language_loss": 0.87316763, + "learning_rate": 0.00089370256142342, + "loss": 0.88424438, + "num_input_tokens_seen": 101367216, + "router_z_loss_mlp": 0.35205078, + "step": 1222, + "time_per_iteration": 2.726897716522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109541, + "balance_loss_mlp": 1.06007659, + "epoch": 0.23528280107733743, + "flos": 588568025088.0, + "grad_norm": 0.04976165943815558, + "language_loss": 0.85095507, + "learning_rate": 0.0008935104391588746, + "loss": 0.86190915, + "num_input_tokens_seen": 101438992, + "router_z_loss_mlp": 0.35351562, + "step": 1223, + "time_per_iteration": 2.7249879837036133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108966, + "balance_loss_mlp": 1.07313156, + "epoch": 0.235475182762601, + "flos": 822948235776.0, + "grad_norm": 0.05651852634602403, + "language_loss": 0.82749176, + "learning_rate": 0.0008933181641200276, + "loss": 0.83858138, + "num_input_tokens_seen": 101534464, + "router_z_loss_mlp": 0.35839844, + "step": 1224, + "time_per_iteration": 3.1651737689971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094197, + "balance_loss_mlp": 1.06017447, + "epoch": 0.23566756444786457, + "flos": 679865287680.0, + "grad_norm": 0.06356150049585653, + "language_loss": 0.8609674, + "learning_rate": 0.0008931257363815271, + "loss": 0.87190938, + "num_input_tokens_seen": 101616496, + "router_z_loss_mlp": 0.34033203, + "step": 1225, + "time_per_iteration": 2.891789674758911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091395, + "balance_loss_mlp": 1.05711007, + "epoch": 0.23585994613312813, + "flos": 701481931776.0, + "grad_norm": 0.04853721262867189, + "language_loss": 0.89892405, + "learning_rate": 0.0008929331560180798, + "loss": 0.90983796, + "num_input_tokens_seen": 101694496, + "router_z_loss_mlp": 0.34277344, + "step": 1226, + "time_per_iteration": 2.934101104736328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093973, + "balance_loss_mlp": 1.06002271, + "epoch": 0.2360523278183917, + "flos": 523923144192.0, + "grad_norm": 0.06491881814379113, + "language_loss": 0.91129786, + "learning_rate": 0.0008927404231044525, + "loss": 0.92223763, + "num_input_tokens_seen": 101766160, + "router_z_loss_mlp": 0.33984375, + "step": 1227, + "time_per_iteration": 2.682377815246582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083665, + "balance_loss_mlp": 1.0493325, + "epoch": 0.23624470950365525, + "flos": 524027860992.0, + "grad_norm": 0.053423388326064705, + "language_loss": 0.81944436, + "learning_rate": 0.0008925475377154703, + "loss": 0.83028102, + "num_input_tokens_seen": 101844160, + "router_z_loss_mlp": 0.34375, + "step": 1228, + "time_per_iteration": 2.7511117458343506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077291, + "balance_loss_mlp": 1.04169512, + "epoch": 0.2364370911889188, + "flos": 596525313024.0, + "grad_norm": 0.05836717970983508, + "language_loss": 0.8241868, + "learning_rate": 0.0008923544999260183, + "loss": 0.83495975, + "num_input_tokens_seen": 101917968, + "router_z_loss_mlp": 0.35644531, + "step": 1229, + "time_per_iteration": 2.7915079593658447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087923, + "balance_loss_mlp": 1.05194569, + "epoch": 0.23662947287418237, + "flos": 756519588864.0, + "grad_norm": 0.08156392485297027, + "language_loss": 0.91757852, + "learning_rate": 0.00089216130981104, + "loss": 0.92845774, + "num_input_tokens_seen": 101996880, + "router_z_loss_mlp": 0.35986328, + "step": 1230, + "time_per_iteration": 3.037900924682617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089884, + "balance_loss_mlp": 1.0531199, + "epoch": 0.23682185455944593, + "flos": 545907935232.0, + "grad_norm": 0.05473268619072285, + "language_loss": 0.82659578, + "learning_rate": 0.000891967967445539, + "loss": 0.83749461, + "num_input_tokens_seen": 102067936, + "router_z_loss_mlp": 0.36743164, + "step": 1231, + "time_per_iteration": 2.6595497131347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088497, + "balance_loss_mlp": 1.05201924, + "epoch": 0.2370142362447095, + "flos": 661977709056.0, + "grad_norm": 0.04604146434030928, + "language_loss": 0.88502473, + "learning_rate": 0.0008917744729045772, + "loss": 0.89590967, + "num_input_tokens_seen": 102147552, + "router_z_loss_mlp": 0.36499023, + "step": 1232, + "time_per_iteration": 2.851651668548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095833, + "balance_loss_mlp": 1.05868709, + "epoch": 0.23720661792997308, + "flos": 683361598464.0, + "grad_norm": 0.06835104069372223, + "language_loss": 0.84165156, + "learning_rate": 0.0008915808262632757, + "loss": 0.85260987, + "num_input_tokens_seen": 102224480, + "router_z_loss_mlp": 0.37133789, + "step": 1233, + "time_per_iteration": 2.8114235401153564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095976, + "balance_loss_mlp": 1.05892599, + "epoch": 0.23739899961523664, + "flos": 558631738368.0, + "grad_norm": 0.055258261409357204, + "language_loss": 0.92769438, + "learning_rate": 0.0008913870275968148, + "loss": 0.93865418, + "num_input_tokens_seen": 102297392, + "router_z_loss_mlp": 0.37036133, + "step": 1234, + "time_per_iteration": 2.705349922180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092916, + "balance_loss_mlp": 1.05629516, + "epoch": 0.2375913813005002, + "flos": 889144128000.0, + "grad_norm": 0.12876854850300654, + "language_loss": 0.87540263, + "learning_rate": 0.0008911930769804342, + "loss": 0.8863318, + "num_input_tokens_seen": 102386032, + "router_z_loss_mlp": 0.36621094, + "step": 1235, + "time_per_iteration": 3.2342941761016846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091192, + "balance_loss_mlp": 1.05492854, + "epoch": 0.23778376298576376, + "flos": 640810607616.0, + "grad_norm": 0.044375832072417805, + "language_loss": 0.91481459, + "learning_rate": 0.0008909989744894318, + "loss": 0.92572653, + "num_input_tokens_seen": 102463504, + "router_z_loss_mlp": 0.36303711, + "step": 1236, + "time_per_iteration": 2.8858232498168945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089597, + "balance_loss_mlp": 1.05333364, + "epoch": 0.23797614467102732, + "flos": 616540073472.0, + "grad_norm": 0.05892762197337364, + "language_loss": 0.81707233, + "learning_rate": 0.0008908047201991649, + "loss": 0.82796836, + "num_input_tokens_seen": 102529632, + "router_z_loss_mlp": 0.36279297, + "step": 1237, + "time_per_iteration": 2.785226583480835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085624, + "balance_loss_mlp": 1.05071974, + "epoch": 0.23816852635629088, + "flos": 623941539840.0, + "grad_norm": 0.051487502947417364, + "language_loss": 0.86561942, + "learning_rate": 0.0008906103141850502, + "loss": 0.87647569, + "num_input_tokens_seen": 102610192, + "router_z_loss_mlp": 0.34960938, + "step": 1238, + "time_per_iteration": 2.868241310119629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095513, + "balance_loss_mlp": 1.05901158, + "epoch": 0.23836090804155444, + "flos": 521180504064.0, + "grad_norm": 0.07170300234131513, + "language_loss": 0.88119614, + "learning_rate": 0.0008904157565225621, + "loss": 0.89215136, + "num_input_tokens_seen": 102681216, + "router_z_loss_mlp": 0.36499023, + "step": 1239, + "time_per_iteration": 2.610048294067383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092952, + "balance_loss_mlp": 1.05716562, + "epoch": 0.238553289726818, + "flos": 1153527616512.0, + "grad_norm": 0.07764557472008667, + "language_loss": 0.82042629, + "learning_rate": 0.000890221047287235, + "loss": 0.83135581, + "num_input_tokens_seen": 102777184, + "router_z_loss_mlp": 0.3581543, + "step": 1240, + "time_per_iteration": 3.547147274017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102634, + "balance_loss_mlp": 1.06772995, + "epoch": 0.23874567141208156, + "flos": 499600175616.0, + "grad_norm": 0.07123563443936186, + "language_loss": 0.91052604, + "learning_rate": 0.0008900261865546615, + "loss": 0.92155242, + "num_input_tokens_seen": 102845744, + "router_z_loss_mlp": 0.34936523, + "step": 1241, + "time_per_iteration": 2.6277406215667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110411, + "balance_loss_mlp": 1.06768012, + "epoch": 0.23893805309734514, + "flos": 556657325568.0, + "grad_norm": 0.08027126565183675, + "language_loss": 0.84991688, + "learning_rate": 0.0008898311744004936, + "loss": 0.86095798, + "num_input_tokens_seen": 102918064, + "router_z_loss_mlp": 0.36425781, + "step": 1242, + "time_per_iteration": 2.687009811401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112655, + "balance_loss_mlp": 1.07708287, + "epoch": 0.2391304347826087, + "flos": 549009957888.0, + "grad_norm": 0.05686617926086787, + "language_loss": 0.86918116, + "learning_rate": 0.0008896360109004414, + "loss": 0.88030773, + "num_input_tokens_seen": 102983920, + "router_z_loss_mlp": 0.35595703, + "step": 1243, + "time_per_iteration": 2.6292564868927 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111871, + "balance_loss_mlp": 1.07629931, + "epoch": 0.23932281646787226, + "flos": 515794148352.0, + "grad_norm": 0.05075175877282041, + "language_loss": 0.84481502, + "learning_rate": 0.0008894406961302742, + "loss": 0.85593379, + "num_input_tokens_seen": 103053328, + "router_z_loss_mlp": 0.35595703, + "step": 1244, + "time_per_iteration": 2.5960640907287598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122594, + "balance_loss_mlp": 1.08737969, + "epoch": 0.23951519815313582, + "flos": 743353445376.0, + "grad_norm": 0.06488001286924965, + "language_loss": 0.84053004, + "learning_rate": 0.0008892452301658201, + "loss": 0.85175598, + "num_input_tokens_seen": 103128208, + "router_z_loss_mlp": 0.35253906, + "step": 1245, + "time_per_iteration": 2.9320998191833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123528, + "balance_loss_mlp": 1.08728814, + "epoch": 0.23970757983839938, + "flos": 553855048704.0, + "grad_norm": 0.05553543969160018, + "language_loss": 0.83631629, + "learning_rate": 0.0008890496130829653, + "loss": 0.84755158, + "num_input_tokens_seen": 103197392, + "router_z_loss_mlp": 0.36230469, + "step": 1246, + "time_per_iteration": 2.6420071125030518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123934, + "balance_loss_mlp": 1.08802795, + "epoch": 0.23989996152366294, + "flos": 480416251392.0, + "grad_norm": 0.0595921721906752, + "language_loss": 0.85551775, + "learning_rate": 0.0008888538449576555, + "loss": 0.86675715, + "num_input_tokens_seen": 103265328, + "router_z_loss_mlp": 0.359375, + "step": 1247, + "time_per_iteration": 2.544706344604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123543, + "balance_loss_mlp": 1.08687472, + "epoch": 0.2400923432089265, + "flos": 485069285376.0, + "grad_norm": 0.06973867138143126, + "language_loss": 0.82958472, + "learning_rate": 0.0008886579258658944, + "loss": 0.84082007, + "num_input_tokens_seen": 103331632, + "router_z_loss_mlp": 0.36669922, + "step": 1248, + "time_per_iteration": 2.5460424423217773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108744, + "balance_loss_mlp": 1.0724808, + "epoch": 0.24028472489419006, + "flos": 623247505920.0, + "grad_norm": 0.04817062293972818, + "language_loss": 0.85353303, + "learning_rate": 0.0008884618558837446, + "loss": 0.86462045, + "num_input_tokens_seen": 103405408, + "router_z_loss_mlp": 0.36279297, + "step": 1249, + "time_per_iteration": 2.80222487449646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125107, + "balance_loss_mlp": 1.08765173, + "epoch": 0.24047710657945365, + "flos": 601302002688.0, + "grad_norm": 0.052194699096834656, + "language_loss": 0.86387813, + "learning_rate": 0.0008882656350873273, + "loss": 0.87512922, + "num_input_tokens_seen": 103487216, + "router_z_loss_mlp": 0.37426758, + "step": 1250, + "time_per_iteration": 2.830887794494629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136265, + "balance_loss_mlp": 1.09911942, + "epoch": 0.2406694882647172, + "flos": 841199579136.0, + "grad_norm": 0.07156482775024936, + "language_loss": 0.86951184, + "learning_rate": 0.0008880692635528219, + "loss": 0.88087451, + "num_input_tokens_seen": 103568640, + "router_z_loss_mlp": 0.37109375, + "step": 1251, + "time_per_iteration": 3.0439021587371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140454, + "balance_loss_mlp": 1.10450029, + "epoch": 0.24086186994998077, + "flos": 526789440000.0, + "grad_norm": 0.062254670736574515, + "language_loss": 0.89187038, + "learning_rate": 0.0008878727413564669, + "loss": 0.90327489, + "num_input_tokens_seen": 103640784, + "router_z_loss_mlp": 0.36010742, + "step": 1252, + "time_per_iteration": 2.7363240718841553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094248, + "balance_loss_mlp": 1.07717752, + "epoch": 0.24105425163524433, + "flos": 1337464673280.0, + "grad_norm": 0.032126183170312766, + "language_loss": 0.80135596, + "learning_rate": 0.0008876760685745588, + "loss": 0.81229842, + "num_input_tokens_seen": 103865824, + "router_z_loss_mlp": 0.17089844, + "step": 1253, + "time_per_iteration": 4.8370680809021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175937, + "balance_loss_mlp": 1.13755202, + "epoch": 0.24124663332050789, + "flos": 613822164480.0, + "grad_norm": 0.06436318886622608, + "language_loss": 0.78452635, + "learning_rate": 0.0008874792452834528, + "loss": 0.79628575, + "num_input_tokens_seen": 103939872, + "router_z_loss_mlp": 0.38354492, + "step": 1254, + "time_per_iteration": 2.724947452545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151033, + "balance_loss_mlp": 1.11381602, + "epoch": 0.24143901500577145, + "flos": 575278225920.0, + "grad_norm": 0.08996846201845816, + "language_loss": 0.87516546, + "learning_rate": 0.0008872822715595626, + "loss": 0.88667583, + "num_input_tokens_seen": 104011120, + "router_z_loss_mlp": 0.37207031, + "step": 1255, + "time_per_iteration": 2.676539659500122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118396, + "balance_loss_mlp": 1.08275259, + "epoch": 0.241631396691035, + "flos": 494941349376.0, + "grad_norm": 0.06475486920780314, + "language_loss": 0.87080252, + "learning_rate": 0.0008870851474793598, + "loss": 0.88198644, + "num_input_tokens_seen": 104077040, + "router_z_loss_mlp": 0.35668945, + "step": 1256, + "time_per_iteration": 2.5523862838745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108714, + "balance_loss_mlp": 1.07287991, + "epoch": 0.24182377837629856, + "flos": 635891323392.0, + "grad_norm": 0.0627898868455093, + "language_loss": 0.89724898, + "learning_rate": 0.0008868878731193752, + "loss": 0.90833616, + "num_input_tokens_seen": 104150880, + "router_z_loss_mlp": 0.35888672, + "step": 1257, + "time_per_iteration": 2.8152451515197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094242, + "balance_loss_mlp": 1.05933762, + "epoch": 0.24201616006156215, + "flos": 514938580992.0, + "grad_norm": 0.06450256361572139, + "language_loss": 0.89708877, + "learning_rate": 0.0008866904485561973, + "loss": 0.90803117, + "num_input_tokens_seen": 104223696, + "router_z_loss_mlp": 0.34936523, + "step": 1258, + "time_per_iteration": 2.7304298877716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095529, + "balance_loss_mlp": 1.05945659, + "epoch": 0.2422085417468257, + "flos": 614837703168.0, + "grad_norm": 0.05809143078881904, + "language_loss": 0.83024096, + "learning_rate": 0.000886492873866473, + "loss": 0.8411963, + "num_input_tokens_seen": 104301728, + "router_z_loss_mlp": 0.36108398, + "step": 1259, + "time_per_iteration": 2.828904628753662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090865, + "balance_loss_mlp": 1.05576968, + "epoch": 0.24240092343208927, + "flos": 585515464704.0, + "grad_norm": 0.07568124142212555, + "language_loss": 0.84760439, + "learning_rate": 0.000886295149126908, + "loss": 0.85851306, + "num_input_tokens_seen": 104374480, + "router_z_loss_mlp": 0.35131836, + "step": 1260, + "time_per_iteration": 2.7011313438415527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109068, + "balance_loss_mlp": 1.05537009, + "epoch": 0.24259330511735283, + "flos": 761930675712.0, + "grad_norm": 0.05459059834864095, + "language_loss": 0.85652769, + "learning_rate": 0.0008860972744142655, + "loss": 0.8674345, + "num_input_tokens_seen": 104452384, + "router_z_loss_mlp": 0.35327148, + "step": 1261, + "time_per_iteration": 2.9039082527160645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084953, + "balance_loss_mlp": 1.05028725, + "epoch": 0.2427856868026164, + "flos": 626566316544.0, + "grad_norm": 0.06267274795834049, + "language_loss": 0.8183161, + "learning_rate": 0.0008858992498053671, + "loss": 0.82916564, + "num_input_tokens_seen": 104532576, + "router_z_loss_mlp": 0.34692383, + "step": 1262, + "time_per_iteration": 2.8293697834014893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080455, + "balance_loss_mlp": 1.06586385, + "epoch": 0.24297806848787995, + "flos": 1510840470528.0, + "grad_norm": 0.02756761643082338, + "language_loss": 0.7658875, + "learning_rate": 0.0008857010753770934, + "loss": 0.77669203, + "num_input_tokens_seen": 104765216, + "router_z_loss_mlp": 0.14550781, + "step": 1263, + "time_per_iteration": 4.8116748332977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087756, + "balance_loss_mlp": 1.05328119, + "epoch": 0.2431704501731435, + "flos": 541669538304.0, + "grad_norm": 0.05501719814044903, + "language_loss": 0.83684969, + "learning_rate": 0.0008855027512063817, + "loss": 0.8477273, + "num_input_tokens_seen": 104836912, + "router_z_loss_mlp": 0.3449707, + "step": 1264, + "time_per_iteration": 2.6995394229888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084053, + "balance_loss_mlp": 1.0493865, + "epoch": 0.24336283185840707, + "flos": 523588492800.0, + "grad_norm": 0.06804757776515359, + "language_loss": 0.85974693, + "learning_rate": 0.0008853042773702292, + "loss": 0.87058747, + "num_input_tokens_seen": 104909280, + "router_z_loss_mlp": 0.34692383, + "step": 1265, + "time_per_iteration": 2.683969497680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086337, + "balance_loss_mlp": 1.05002618, + "epoch": 0.24355521354367063, + "flos": 536839004160.0, + "grad_norm": 0.05444938358074035, + "language_loss": 0.87678754, + "learning_rate": 0.0008851056539456896, + "loss": 0.88765097, + "num_input_tokens_seen": 104982560, + "router_z_loss_mlp": 0.36303711, + "step": 1266, + "time_per_iteration": 2.674891471862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010835, + "balance_loss_mlp": 1.04830909, + "epoch": 0.24374759522893422, + "flos": 930050975232.0, + "grad_norm": 0.04940136823280911, + "language_loss": 0.82195789, + "learning_rate": 0.0008849068810098755, + "loss": 0.83279288, + "num_input_tokens_seen": 105075056, + "router_z_loss_mlp": 0.35229492, + "step": 1267, + "time_per_iteration": 3.27172589302063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083691, + "balance_loss_mlp": 1.04859626, + "epoch": 0.24393997691419778, + "flos": 427564002816.0, + "grad_norm": 0.07591960175092535, + "language_loss": 0.83287823, + "learning_rate": 0.0008847079586399575, + "loss": 0.84371519, + "num_input_tokens_seen": 105137536, + "router_z_loss_mlp": 0.35131836, + "step": 1268, + "time_per_iteration": 2.4539763927459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010876, + "balance_loss_mlp": 1.05281472, + "epoch": 0.24413235859946134, + "flos": 578582479872.0, + "grad_norm": 0.059755639557228325, + "language_loss": 0.86095846, + "learning_rate": 0.0008845088869131641, + "loss": 0.87183452, + "num_input_tokens_seen": 105204848, + "router_z_loss_mlp": 0.34790039, + "step": 1269, + "time_per_iteration": 2.651010274887085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094661, + "balance_loss_mlp": 1.058851, + "epoch": 0.2443247402847249, + "flos": 529600481280.0, + "grad_norm": 0.07776240560166553, + "language_loss": 0.89366186, + "learning_rate": 0.0008843096659067818, + "loss": 0.90460849, + "num_input_tokens_seen": 105273456, + "router_z_loss_mlp": 0.35839844, + "step": 1270, + "time_per_iteration": 2.61082124710083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108773, + "balance_loss_mlp": 1.05292046, + "epoch": 0.24451712196998845, + "flos": 695996651520.0, + "grad_norm": 0.05083497592617014, + "language_loss": 0.86395383, + "learning_rate": 0.000884110295698155, + "loss": 0.87483108, + "num_input_tokens_seen": 105355488, + "router_z_loss_mlp": 0.34863281, + "step": 1271, + "time_per_iteration": 2.930372476577759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085133, + "balance_loss_mlp": 1.05089653, + "epoch": 0.24470950365525201, + "flos": 529575750144.0, + "grad_norm": 0.05520811698213447, + "language_loss": 0.86009014, + "learning_rate": 0.0008839107763646861, + "loss": 0.87094152, + "num_input_tokens_seen": 105421568, + "router_z_loss_mlp": 0.34277344, + "step": 1272, + "time_per_iteration": 2.576322078704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088964, + "balance_loss_mlp": 1.05324888, + "epoch": 0.24490188534051557, + "flos": 491091448320.0, + "grad_norm": 0.0616556586287024, + "language_loss": 0.9024111, + "learning_rate": 0.0008837111079838353, + "loss": 0.91330075, + "num_input_tokens_seen": 105493072, + "router_z_loss_mlp": 0.35742188, + "step": 1273, + "time_per_iteration": 2.6859118938446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109117, + "balance_loss_mlp": 1.05631351, + "epoch": 0.24509426702577913, + "flos": 473916842496.0, + "grad_norm": 0.05704478566457949, + "language_loss": 0.89869869, + "learning_rate": 0.000883511290633121, + "loss": 0.90961039, + "num_input_tokens_seen": 105559840, + "router_z_loss_mlp": 0.34887695, + "step": 1274, + "time_per_iteration": 2.5262861251831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096408, + "balance_loss_mlp": 1.06152773, + "epoch": 0.24528664871104272, + "flos": 550329624576.0, + "grad_norm": 0.04914382449005864, + "language_loss": 0.92288065, + "learning_rate": 0.000883311324390119, + "loss": 0.93384475, + "num_input_tokens_seen": 105634448, + "router_z_loss_mlp": 0.34887695, + "step": 1275, + "time_per_iteration": 2.6791441440582275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100279, + "balance_loss_mlp": 1.0631578, + "epoch": 0.24547903039630628, + "flos": 825546871296.0, + "grad_norm": 0.0705624444694786, + "language_loss": 0.81542301, + "learning_rate": 0.0008831112093324629, + "loss": 0.82642579, + "num_input_tokens_seen": 105711936, + "router_z_loss_mlp": 0.37060547, + "step": 1276, + "time_per_iteration": 3.0612823963165283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100002, + "balance_loss_mlp": 1.06419206, + "epoch": 0.24567141208156984, + "flos": 591325221888.0, + "grad_norm": 0.0822852621967946, + "language_loss": 0.89184481, + "learning_rate": 0.0008829109455378444, + "loss": 0.90284485, + "num_input_tokens_seen": 105780240, + "router_z_loss_mlp": 0.35839844, + "step": 1277, + "time_per_iteration": 2.6601858139038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091196, + "balance_loss_mlp": 1.05567181, + "epoch": 0.2458637937668334, + "flos": 547611715584.0, + "grad_norm": 0.05101212903881184, + "language_loss": 0.86474031, + "learning_rate": 0.000882710533084013, + "loss": 0.87565225, + "num_input_tokens_seen": 105849840, + "router_z_loss_mlp": 0.35546875, + "step": 1278, + "time_per_iteration": 2.6333553791046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087707, + "balance_loss_mlp": 1.05158627, + "epoch": 0.24605617545209696, + "flos": 515641379328.0, + "grad_norm": 0.04855931692812416, + "language_loss": 0.89387107, + "learning_rate": 0.0008825099720487755, + "loss": 0.9047482, + "num_input_tokens_seen": 105921488, + "router_z_loss_mlp": 0.36108398, + "step": 1279, + "time_per_iteration": 2.6388816833496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071735, + "balance_loss_mlp": 1.05943298, + "epoch": 0.24624855713736052, + "flos": 1510953951744.0, + "grad_norm": 0.03612446278815301, + "language_loss": 0.7526114, + "learning_rate": 0.0008823092625099967, + "loss": 0.76332873, + "num_input_tokens_seen": 106146816, + "router_z_loss_mlp": 0.12304688, + "step": 1280, + "time_per_iteration": 4.837193727493286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044143, + "balance_loss_mlp": 1.03145874, + "epoch": 0.24644093882262408, + "flos": 1526826419712.0, + "grad_norm": 0.020354868078157083, + "language_loss": 0.77944112, + "learning_rate": 0.0008821084045455987, + "loss": 0.78988254, + "num_input_tokens_seen": 106361568, + "router_z_loss_mlp": 0.12695312, + "step": 1281, + "time_per_iteration": 4.7473485469818115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078971, + "balance_loss_mlp": 1.04418564, + "epoch": 0.24663332050788764, + "flos": 658811667456.0, + "grad_norm": 0.060866999123497585, + "language_loss": 0.89327228, + "learning_rate": 0.0008819073982335619, + "loss": 0.90406203, + "num_input_tokens_seen": 106435296, + "router_z_loss_mlp": 0.34838867, + "step": 1282, + "time_per_iteration": 2.839691162109375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107951, + "balance_loss_mlp": 1.0446527, + "epoch": 0.24682570219315123, + "flos": 541510977024.0, + "grad_norm": 0.05752783194209404, + "language_loss": 0.84339237, + "learning_rate": 0.0008817062436519235, + "loss": 0.85418749, + "num_input_tokens_seen": 106507184, + "router_z_loss_mlp": 0.34887695, + "step": 1283, + "time_per_iteration": 2.6106019020080566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080546, + "balance_loss_mlp": 1.04459274, + "epoch": 0.24701808387841478, + "flos": 440455131648.0, + "grad_norm": 0.05999718389674832, + "language_loss": 0.89926815, + "learning_rate": 0.0008815049408787788, + "loss": 0.91007358, + "num_input_tokens_seen": 106571472, + "router_z_loss_mlp": 0.36010742, + "step": 1284, + "time_per_iteration": 2.5186686515808105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079282, + "balance_loss_mlp": 1.04518795, + "epoch": 0.24721046556367834, + "flos": 467826278400.0, + "grad_norm": 0.054777388157378364, + "language_loss": 0.8565737, + "learning_rate": 0.0008813034899922805, + "loss": 0.86736655, + "num_input_tokens_seen": 106638368, + "router_z_loss_mlp": 0.34106445, + "step": 1285, + "time_per_iteration": 2.5217878818511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082636, + "balance_loss_mlp": 1.04730225, + "epoch": 0.2474028472489419, + "flos": 504183398400.0, + "grad_norm": 0.06351521025868076, + "language_loss": 0.90182853, + "learning_rate": 0.0008811018910706387, + "loss": 0.91265488, + "num_input_tokens_seen": 106705312, + "router_z_loss_mlp": 0.35375977, + "step": 1286, + "time_per_iteration": 2.549523115158081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010823, + "balance_loss_mlp": 1.04787278, + "epoch": 0.24759522893420546, + "flos": 479707660800.0, + "grad_norm": 0.06857789842871208, + "language_loss": 0.81978023, + "learning_rate": 0.0008809001441921211, + "loss": 0.83060318, + "num_input_tokens_seen": 106778624, + "router_z_loss_mlp": 0.34448242, + "step": 1287, + "time_per_iteration": 2.7147598266601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083728, + "balance_loss_mlp": 1.04984844, + "epoch": 0.24778761061946902, + "flos": 533446000128.0, + "grad_norm": 0.05733880184845353, + "language_loss": 0.85523212, + "learning_rate": 0.0008806982494350528, + "loss": 0.86606944, + "num_input_tokens_seen": 106847744, + "router_z_loss_mlp": 0.33911133, + "step": 1288, + "time_per_iteration": 2.6304967403411865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086884, + "balance_loss_mlp": 1.05197978, + "epoch": 0.24797999230473258, + "flos": 559513446912.0, + "grad_norm": 0.04849910181782432, + "language_loss": 0.90370154, + "learning_rate": 0.0008804962068778161, + "loss": 0.91457039, + "num_input_tokens_seen": 106927584, + "router_z_loss_mlp": 0.34936523, + "step": 1289, + "time_per_iteration": 2.8194985389709473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087315, + "balance_loss_mlp": 1.05291104, + "epoch": 0.24817237398999614, + "flos": 623912426496.0, + "grad_norm": 0.05410640942937228, + "language_loss": 0.80728722, + "learning_rate": 0.0008802940165988511, + "loss": 0.81816041, + "num_input_tokens_seen": 107006656, + "router_z_loss_mlp": 0.34423828, + "step": 1290, + "time_per_iteration": 2.8703298568725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096846, + "balance_loss_mlp": 1.06225193, + "epoch": 0.2483647556752597, + "flos": 611981581824.0, + "grad_norm": 0.06277561181530684, + "language_loss": 0.88376027, + "learning_rate": 0.000880091678676655, + "loss": 0.89472872, + "num_input_tokens_seen": 107084352, + "router_z_loss_mlp": 0.34619141, + "step": 1291, + "time_per_iteration": 2.7943451404571533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088363, + "balance_loss_mlp": 1.05496097, + "epoch": 0.2485571373605233, + "flos": 583270419456.0, + "grad_norm": 0.061640996967182685, + "language_loss": 0.89207399, + "learning_rate": 0.0008798891931897821, + "loss": 0.90295762, + "num_input_tokens_seen": 107158368, + "router_z_loss_mlp": 0.33422852, + "step": 1292, + "time_per_iteration": 2.7013609409332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088304, + "balance_loss_mlp": 1.05463946, + "epoch": 0.24874951904578685, + "flos": 494503391232.0, + "grad_norm": 0.0568342609101268, + "language_loss": 0.84605837, + "learning_rate": 0.0008796865602168447, + "loss": 0.8569414, + "num_input_tokens_seen": 107224256, + "router_z_loss_mlp": 0.33691406, + "step": 1293, + "time_per_iteration": 2.517571210861206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108751, + "balance_loss_mlp": 1.05448937, + "epoch": 0.2489419007310504, + "flos": 455925957120.0, + "grad_norm": 0.05011975537228715, + "language_loss": 0.88745099, + "learning_rate": 0.0008794837798365115, + "loss": 0.8983261, + "num_input_tokens_seen": 107292720, + "router_z_loss_mlp": 0.33032227, + "step": 1294, + "time_per_iteration": 2.6243135929107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093031, + "balance_loss_mlp": 1.05958056, + "epoch": 0.24913428241631397, + "flos": 485198733312.0, + "grad_norm": 0.05031013210073455, + "language_loss": 0.88537574, + "learning_rate": 0.0008792808521275089, + "loss": 0.89630604, + "num_input_tokens_seen": 107368576, + "router_z_loss_mlp": 0.3347168, + "step": 1295, + "time_per_iteration": 2.743821144104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090644, + "balance_loss_mlp": 1.05664551, + "epoch": 0.24932666410157753, + "flos": 518654651904.0, + "grad_norm": 0.0628198177294759, + "language_loss": 0.87554896, + "learning_rate": 0.0008790777771686206, + "loss": 0.8864553, + "num_input_tokens_seen": 107433856, + "router_z_loss_mlp": 0.34033203, + "step": 1296, + "time_per_iteration": 2.55996036529541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091195, + "balance_loss_mlp": 1.05819798, + "epoch": 0.2495190457868411, + "flos": 472365831168.0, + "grad_norm": 0.05367084005526609, + "language_loss": 0.85479438, + "learning_rate": 0.0008788745550386872, + "loss": 0.86570632, + "num_input_tokens_seen": 107500944, + "router_z_loss_mlp": 0.33007812, + "step": 1297, + "time_per_iteration": 2.555238723754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099543, + "balance_loss_mlp": 1.06494844, + "epoch": 0.24971142747210465, + "flos": 745559202816.0, + "grad_norm": 0.05557204977607519, + "language_loss": 0.80045742, + "learning_rate": 0.0008786711858166063, + "loss": 0.81145287, + "num_input_tokens_seen": 107580000, + "router_z_loss_mlp": 0.34643555, + "step": 1298, + "time_per_iteration": 2.940908670425415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090601, + "balance_loss_mlp": 1.05757999, + "epoch": 0.2499038091573682, + "flos": 749222839296.0, + "grad_norm": 0.08262860681241094, + "language_loss": 0.83490336, + "learning_rate": 0.0008784676695813332, + "loss": 0.84580934, + "num_input_tokens_seen": 107660384, + "router_z_loss_mlp": 0.33032227, + "step": 1299, + "time_per_iteration": 2.966646432876587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092772, + "balance_loss_mlp": 1.05870187, + "epoch": 0.2500961908426318, + "flos": 744741513216.0, + "grad_norm": 0.04756275395178792, + "language_loss": 0.84761405, + "learning_rate": 0.0008782640064118796, + "loss": 0.85854173, + "num_input_tokens_seen": 107736320, + "router_z_loss_mlp": 0.34082031, + "step": 1300, + "time_per_iteration": 2.8889827728271484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078115, + "balance_loss_mlp": 1.06447709, + "epoch": 0.2502885725278953, + "flos": 1416652180992.0, + "grad_norm": 0.036683670441934005, + "language_loss": 0.7618475, + "learning_rate": 0.0008780601963873149, + "loss": 0.77262866, + "num_input_tokens_seen": 107972608, + "router_z_loss_mlp": 0.13671875, + "step": 1301, + "time_per_iteration": 4.988169431686401 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094633, + "balance_loss_mlp": 1.06196928, + "epoch": 0.2504809542131589, + "flos": 514961902080.0, + "grad_norm": 0.05923567857946263, + "language_loss": 0.86476314, + "learning_rate": 0.0008778562395867648, + "loss": 0.87570941, + "num_input_tokens_seen": 108043312, + "router_z_loss_mlp": 0.32666016, + "step": 1302, + "time_per_iteration": 2.5900919437408447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087269, + "balance_loss_mlp": 1.05436766, + "epoch": 0.25067333589842244, + "flos": 525562905600.0, + "grad_norm": 0.06049595368492962, + "language_loss": 0.83774143, + "learning_rate": 0.0008776521360894127, + "loss": 0.8486141, + "num_input_tokens_seen": 108114144, + "router_z_loss_mlp": 0.32910156, + "step": 1303, + "time_per_iteration": 2.6029298305511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035824, + "balance_loss_mlp": 1.02275884, + "epoch": 0.25086571758368603, + "flos": 1473085108224.0, + "grad_norm": 0.024331867442101186, + "language_loss": 0.78962064, + "learning_rate": 0.0008774478859744984, + "loss": 0.79997885, + "num_input_tokens_seen": 108338720, + "router_z_loss_mlp": 0.13085938, + "step": 1304, + "time_per_iteration": 4.800757646560669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096353, + "balance_loss_mlp": 1.063833, + "epoch": 0.2510580992689496, + "flos": 528128045568.0, + "grad_norm": 0.053799887970574674, + "language_loss": 0.90341735, + "learning_rate": 0.0008772434893213186, + "loss": 0.91438091, + "num_input_tokens_seen": 108405456, + "router_z_loss_mlp": 0.32519531, + "step": 1305, + "time_per_iteration": 2.5816421508789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104465, + "balance_loss_mlp": 1.07123005, + "epoch": 0.25125048095421315, + "flos": 517192390656.0, + "grad_norm": 0.058690449713219205, + "language_loss": 0.84433925, + "learning_rate": 0.0008770389462092276, + "loss": 0.85538393, + "num_input_tokens_seen": 108474368, + "router_z_loss_mlp": 0.33251953, + "step": 1306, + "time_per_iteration": 2.6747090816497803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106931, + "balance_loss_mlp": 1.07309926, + "epoch": 0.25144286263947674, + "flos": 620160039936.0, + "grad_norm": 0.16660488736040688, + "language_loss": 0.86719346, + "learning_rate": 0.0008768342567176357, + "loss": 0.87826276, + "num_input_tokens_seen": 108548864, + "router_z_loss_mlp": 0.33862305, + "step": 1307, + "time_per_iteration": 2.7788002490997314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098824, + "balance_loss_mlp": 1.06425333, + "epoch": 0.25163524432474027, + "flos": 503534444544.0, + "grad_norm": 0.04824933548887647, + "language_loss": 0.90589297, + "learning_rate": 0.0008766294209260107, + "loss": 0.91688126, + "num_input_tokens_seen": 108623072, + "router_z_loss_mlp": 0.34619141, + "step": 1308, + "time_per_iteration": 2.6300241947174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010929, + "balance_loss_mlp": 1.05852032, + "epoch": 0.25182762601000386, + "flos": 508821875712.0, + "grad_norm": 0.0633327884934456, + "language_loss": 0.91549027, + "learning_rate": 0.0008764244389138767, + "loss": 0.92641926, + "num_input_tokens_seen": 108690128, + "router_z_loss_mlp": 0.34399414, + "step": 1309, + "time_per_iteration": 2.574056625366211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088516, + "balance_loss_mlp": 1.05306351, + "epoch": 0.2520200076952674, + "flos": 633596815872.0, + "grad_norm": 0.05898934519456769, + "language_loss": 0.8269434, + "learning_rate": 0.000876219310760815, + "loss": 0.83782852, + "num_input_tokens_seen": 108770272, + "router_z_loss_mlp": 0.35449219, + "step": 1310, + "time_per_iteration": 2.87404465675354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010989, + "balance_loss_mlp": 1.06299448, + "epoch": 0.252212389380531, + "flos": 494385527808.0, + "grad_norm": 0.05968729718727878, + "language_loss": 0.8144334, + "learning_rate": 0.0008760140365464631, + "loss": 0.82542241, + "num_input_tokens_seen": 108840592, + "router_z_loss_mlp": 0.35913086, + "step": 1311, + "time_per_iteration": 2.599480390548706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105508, + "balance_loss_mlp": 1.07062793, + "epoch": 0.2524047710657945, + "flos": 490298489856.0, + "grad_norm": 0.06557576312810307, + "language_loss": 0.87226975, + "learning_rate": 0.0008758086163505156, + "loss": 0.88332486, + "num_input_tokens_seen": 108910064, + "router_z_loss_mlp": 0.34912109, + "step": 1312, + "time_per_iteration": 2.6165666580200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112941, + "balance_loss_mlp": 1.07698762, + "epoch": 0.2525971527510581, + "flos": 647136898560.0, + "grad_norm": 0.06425852435188892, + "language_loss": 0.89039612, + "learning_rate": 0.0008756030502527239, + "loss": 0.90152562, + "num_input_tokens_seen": 108986336, + "router_z_loss_mlp": 0.36010742, + "step": 1313, + "time_per_iteration": 2.794595956802368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112418, + "balance_loss_mlp": 1.07636952, + "epoch": 0.2527895344363217, + "flos": 568991222784.0, + "grad_norm": 0.05792474282671988, + "language_loss": 0.90396988, + "learning_rate": 0.0008753973383328954, + "loss": 0.91509414, + "num_input_tokens_seen": 109059712, + "router_z_loss_mlp": 0.36108398, + "step": 1314, + "time_per_iteration": 2.66343355178833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110344, + "balance_loss_mlp": 1.07491553, + "epoch": 0.2529819161215852, + "flos": 513795004416.0, + "grad_norm": 0.10488361484557306, + "language_loss": 0.84231019, + "learning_rate": 0.0008751914806708952, + "loss": 0.8534137, + "num_input_tokens_seen": 109127504, + "router_z_loss_mlp": 0.35449219, + "step": 1315, + "time_per_iteration": 2.5714006423950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099121, + "balance_loss_mlp": 1.06357241, + "epoch": 0.2531742978068488, + "flos": 530979784704.0, + "grad_norm": 0.0646255116041034, + "language_loss": 0.81763697, + "learning_rate": 0.0008749854773466439, + "loss": 0.82862812, + "num_input_tokens_seen": 109198080, + "router_z_loss_mlp": 0.35571289, + "step": 1316, + "time_per_iteration": 2.6507568359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093995, + "balance_loss_mlp": 1.05892396, + "epoch": 0.25336667949211233, + "flos": 596362369536.0, + "grad_norm": 0.11519177634747009, + "language_loss": 0.84297431, + "learning_rate": 0.0008747793284401192, + "loss": 0.8539142, + "num_input_tokens_seen": 109268368, + "router_z_loss_mlp": 0.35107422, + "step": 1317, + "time_per_iteration": 2.6708261966705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109538, + "balance_loss_mlp": 1.05966473, + "epoch": 0.2535590611773759, + "flos": 601764691968.0, + "grad_norm": 0.05376009268762157, + "language_loss": 0.86145389, + "learning_rate": 0.0008745730340313551, + "loss": 0.87240773, + "num_input_tokens_seen": 109344112, + "router_z_loss_mlp": 0.35742188, + "step": 1318, + "time_per_iteration": 2.7465810775756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100043, + "balance_loss_mlp": 1.06504369, + "epoch": 0.25375144286263945, + "flos": 495079561728.0, + "grad_norm": 0.053440140598651036, + "language_loss": 0.8468703, + "learning_rate": 0.0008743665942004422, + "loss": 0.85787076, + "num_input_tokens_seen": 109414112, + "router_z_loss_mlp": 0.35009766, + "step": 1319, + "time_per_iteration": 2.632645606994629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109509, + "balance_loss_mlp": 1.05908918, + "epoch": 0.25394382454790304, + "flos": 512219261952.0, + "grad_norm": 0.050076364746318706, + "language_loss": 0.92529714, + "learning_rate": 0.0008741600090275277, + "loss": 0.93624806, + "num_input_tokens_seen": 109484336, + "router_z_loss_mlp": 0.35986328, + "step": 1320, + "time_per_iteration": 2.564730405807495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097426, + "balance_loss_mlp": 1.06092453, + "epoch": 0.25413620623316663, + "flos": 958586047488.0, + "grad_norm": 0.058049172943507095, + "language_loss": 0.83939385, + "learning_rate": 0.0008739532785928151, + "loss": 0.85036814, + "num_input_tokens_seen": 109590128, + "router_z_loss_mlp": 0.36474609, + "step": 1321, + "time_per_iteration": 3.4496617317199707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056798, + "balance_loss_mlp": 1.04439986, + "epoch": 0.25432858791843016, + "flos": 1576445635584.0, + "grad_norm": 0.03297734471592195, + "language_loss": 0.74893582, + "learning_rate": 0.0008737464029765639, + "loss": 0.75950378, + "num_input_tokens_seen": 109816592, + "router_z_loss_mlp": 0.12353516, + "step": 1322, + "time_per_iteration": 4.803644418716431 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102711, + "balance_loss_mlp": 1.06706691, + "epoch": 0.25452096960369375, + "flos": 583530877440.0, + "grad_norm": 0.056711392027496164, + "language_loss": 0.83213425, + "learning_rate": 0.0008735393822590908, + "loss": 0.84316134, + "num_input_tokens_seen": 109890464, + "router_z_loss_mlp": 0.35668945, + "step": 1323, + "time_per_iteration": 2.6643528938293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099364, + "balance_loss_mlp": 1.06434083, + "epoch": 0.2547133512889573, + "flos": 508344629760.0, + "grad_norm": 0.06006943476027706, + "language_loss": 0.87018919, + "learning_rate": 0.0008733322165207681, + "loss": 0.88118285, + "num_input_tokens_seen": 109963408, + "router_z_loss_mlp": 0.35083008, + "step": 1324, + "time_per_iteration": 2.627495765686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110409, + "balance_loss_mlp": 1.06863689, + "epoch": 0.25490573297422087, + "flos": 782266940928.0, + "grad_norm": 0.05604709920606865, + "language_loss": 0.83055937, + "learning_rate": 0.0008731249058420247, + "loss": 0.8416003, + "num_input_tokens_seen": 110048800, + "router_z_loss_mlp": 0.35498047, + "step": 1325, + "time_per_iteration": 3.0361831188201904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100095, + "balance_loss_mlp": 1.06468964, + "epoch": 0.2550981146594844, + "flos": 509610451968.0, + "grad_norm": 0.06314633870869373, + "language_loss": 0.90780556, + "learning_rate": 0.0008729174503033459, + "loss": 0.91880649, + "num_input_tokens_seen": 110118096, + "router_z_loss_mlp": 0.35424805, + "step": 1326, + "time_per_iteration": 2.639625072479248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109522, + "balance_loss_mlp": 1.06007695, + "epoch": 0.255290496344748, + "flos": 676360212480.0, + "grad_norm": 0.06489195741671011, + "language_loss": 0.82650065, + "learning_rate": 0.0008727098499852728, + "loss": 0.83745289, + "num_input_tokens_seen": 110190160, + "router_z_loss_mlp": 0.35180664, + "step": 1327, + "time_per_iteration": 2.830500602722168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109753, + "balance_loss_mlp": 1.06231546, + "epoch": 0.2554828780300115, + "flos": 537524273664.0, + "grad_norm": 0.06666455638552511, + "language_loss": 0.89945138, + "learning_rate": 0.0008725021049684034, + "loss": 0.91042662, + "num_input_tokens_seen": 110268000, + "router_z_loss_mlp": 0.35253906, + "step": 1328, + "time_per_iteration": 2.747800350189209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097741, + "balance_loss_mlp": 1.06240726, + "epoch": 0.2556752597152751, + "flos": 823828534272.0, + "grad_norm": 0.052131047599379726, + "language_loss": 0.82919741, + "learning_rate": 0.000872294215333391, + "loss": 0.84017479, + "num_input_tokens_seen": 110354816, + "router_z_loss_mlp": 0.35400391, + "step": 1329, + "time_per_iteration": 3.1658926010131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096694, + "balance_loss_mlp": 1.06176591, + "epoch": 0.2558676414005387, + "flos": 570517502976.0, + "grad_norm": 0.05425014800623288, + "language_loss": 0.82993001, + "learning_rate": 0.0008720861811609457, + "loss": 0.84089696, + "num_input_tokens_seen": 110427968, + "router_z_loss_mlp": 0.34985352, + "step": 1330, + "time_per_iteration": 2.709085702896118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101009, + "balance_loss_mlp": 1.06448317, + "epoch": 0.2560600230858022, + "flos": 486419475456.0, + "grad_norm": 0.05425594622111712, + "language_loss": 0.83756936, + "learning_rate": 0.0008718780025318338, + "loss": 0.84857947, + "num_input_tokens_seen": 110501184, + "router_z_loss_mlp": 0.36523438, + "step": 1331, + "time_per_iteration": 2.7126388549804688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097427, + "balance_loss_mlp": 1.06280875, + "epoch": 0.2562524047710658, + "flos": 512874008064.0, + "grad_norm": 0.06594145834934585, + "language_loss": 0.8406449, + "learning_rate": 0.0008716696795268771, + "loss": 0.85161918, + "num_input_tokens_seen": 110573008, + "router_z_loss_mlp": 0.34667969, + "step": 1332, + "time_per_iteration": 2.6650350093841553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089329, + "balance_loss_mlp": 1.05308938, + "epoch": 0.25644478645632934, + "flos": 634498873344.0, + "grad_norm": 0.051413439896644035, + "language_loss": 0.85076845, + "learning_rate": 0.0008714612122269538, + "loss": 0.86166173, + "num_input_tokens_seen": 110646704, + "router_z_loss_mlp": 0.36279297, + "step": 1333, + "time_per_iteration": 2.8611392974853516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109443, + "balance_loss_mlp": 1.05754697, + "epoch": 0.25663716814159293, + "flos": 436353537024.0, + "grad_norm": 0.0705935369031189, + "language_loss": 0.89120972, + "learning_rate": 0.0008712526007129982, + "loss": 0.90215403, + "num_input_tokens_seen": 110712208, + "router_z_loss_mlp": 0.36889648, + "step": 1334, + "time_per_iteration": 2.5217065811157227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109442, + "balance_loss_mlp": 1.05813241, + "epoch": 0.25682954982685646, + "flos": 497892013056.0, + "grad_norm": 0.06578019441075163, + "language_loss": 0.90784955, + "learning_rate": 0.0008710438450660003, + "loss": 0.91879368, + "num_input_tokens_seen": 110783936, + "router_z_loss_mlp": 0.36303711, + "step": 1335, + "time_per_iteration": 2.651367425918579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093579, + "balance_loss_mlp": 1.05643392, + "epoch": 0.25702193151212005, + "flos": 457471176192.0, + "grad_norm": 0.07087944464696884, + "language_loss": 0.8744905, + "learning_rate": 0.0008708349453670064, + "loss": 0.88542628, + "num_input_tokens_seen": 110848560, + "router_z_loss_mlp": 0.37133789, + "step": 1336, + "time_per_iteration": 2.51411771774292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090754, + "balance_loss_mlp": 1.0543952, + "epoch": 0.2572143131973836, + "flos": 598002130944.0, + "grad_norm": 0.06329524505646734, + "language_loss": 0.91480416, + "learning_rate": 0.0008706259016971185, + "loss": 0.92571175, + "num_input_tokens_seen": 110922672, + "router_z_loss_mlp": 0.36401367, + "step": 1337, + "time_per_iteration": 2.754173517227173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096487, + "balance_loss_mlp": 1.0589596, + "epoch": 0.25740669488264717, + "flos": 698004559872.0, + "grad_norm": 0.06697174190166053, + "language_loss": 0.83331275, + "learning_rate": 0.0008704167141374944, + "loss": 0.84427762, + "num_input_tokens_seen": 110995456, + "router_z_loss_mlp": 0.375, + "step": 1338, + "time_per_iteration": 2.795552968978882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011015, + "balance_loss_mlp": 1.06385398, + "epoch": 0.25759907656791076, + "flos": 502130409984.0, + "grad_norm": 0.06639008708045263, + "language_loss": 0.88657552, + "learning_rate": 0.0008702073827693482, + "loss": 0.89759052, + "num_input_tokens_seen": 111069568, + "router_z_loss_mlp": 0.3762207, + "step": 1339, + "time_per_iteration": 2.6935572624206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103743, + "balance_loss_mlp": 1.065763, + "epoch": 0.2577914582531743, + "flos": 773541425664.0, + "grad_norm": 0.06917089880544881, + "language_loss": 0.88938046, + "learning_rate": 0.0008699979076739494, + "loss": 0.90041792, + "num_input_tokens_seen": 111142608, + "router_z_loss_mlp": 0.37963867, + "step": 1340, + "time_per_iteration": 2.951148509979248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102218, + "balance_loss_mlp": 1.06552505, + "epoch": 0.2579838399384379, + "flos": 459431032320.0, + "grad_norm": 0.07085954822691051, + "language_loss": 0.88831556, + "learning_rate": 0.0008697882889326234, + "loss": 0.89933777, + "num_input_tokens_seen": 111206336, + "router_z_loss_mlp": 0.36669922, + "step": 1341, + "time_per_iteration": 2.492182731628418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105931, + "balance_loss_mlp": 1.06916654, + "epoch": 0.2581762216237014, + "flos": 568917029376.0, + "grad_norm": 0.060702491086151805, + "language_loss": 0.86630756, + "learning_rate": 0.0008695785266267515, + "loss": 0.8773669, + "num_input_tokens_seen": 111276736, + "router_z_loss_mlp": 0.36816406, + "step": 1342, + "time_per_iteration": 2.6635031700134277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111038, + "balance_loss_mlp": 1.07448828, + "epoch": 0.258368603308965, + "flos": 603906430464.0, + "grad_norm": 0.06467765584173796, + "language_loss": 0.83112109, + "learning_rate": 0.0008693686208377704, + "loss": 0.84223145, + "num_input_tokens_seen": 111353856, + "router_z_loss_mlp": 0.36547852, + "step": 1343, + "time_per_iteration": 2.7769596576690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105659, + "balance_loss_mlp": 1.06975329, + "epoch": 0.2585609849942285, + "flos": 491204929536.0, + "grad_norm": 0.06376456739082713, + "language_loss": 0.88889539, + "learning_rate": 0.0008691585716471733, + "loss": 0.89995199, + "num_input_tokens_seen": 111424960, + "router_z_loss_mlp": 0.35913086, + "step": 1344, + "time_per_iteration": 2.6467716693878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104044, + "balance_loss_mlp": 1.06809044, + "epoch": 0.2587533666794921, + "flos": 640455607296.0, + "grad_norm": 0.057733681270749564, + "language_loss": 0.85255873, + "learning_rate": 0.0008689483791365079, + "loss": 0.86359918, + "num_input_tokens_seen": 111505248, + "router_z_loss_mlp": 0.35961914, + "step": 1345, + "time_per_iteration": 2.8041999340057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099237, + "balance_loss_mlp": 1.06380773, + "epoch": 0.2589457483647557, + "flos": 576564397056.0, + "grad_norm": 0.05015471530609978, + "language_loss": 0.89365089, + "learning_rate": 0.0008687380433873786, + "loss": 0.9046433, + "num_input_tokens_seen": 111581936, + "router_z_loss_mlp": 0.35473633, + "step": 1346, + "time_per_iteration": 2.7955591678619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101447, + "balance_loss_mlp": 1.06630445, + "epoch": 0.25913813005001923, + "flos": 535164337152.0, + "grad_norm": 0.06074647569776127, + "language_loss": 0.82164252, + "learning_rate": 0.0008685275644814448, + "loss": 0.83265698, + "num_input_tokens_seen": 111651456, + "router_z_loss_mlp": 0.3515625, + "step": 1347, + "time_per_iteration": 2.6922154426574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100414, + "balance_loss_mlp": 1.06419861, + "epoch": 0.2593305117352828, + "flos": 720713908224.0, + "grad_norm": 0.05981927153656866, + "language_loss": 0.8445859, + "learning_rate": 0.0008683169425004216, + "loss": 0.85558999, + "num_input_tokens_seen": 111731712, + "router_z_loss_mlp": 0.36230469, + "step": 1348, + "time_per_iteration": 2.8701395988464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108751, + "balance_loss_mlp": 1.05186677, + "epoch": 0.25952289342054635, + "flos": 709782635520.0, + "grad_norm": 0.06994851779161643, + "language_loss": 0.83445579, + "learning_rate": 0.0008681061775260799, + "loss": 0.84533083, + "num_input_tokens_seen": 111800752, + "router_z_loss_mlp": 0.35644531, + "step": 1349, + "time_per_iteration": 2.8206968307495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096473, + "balance_loss_mlp": 1.06032848, + "epoch": 0.25971527510580994, + "flos": 455688820224.0, + "grad_norm": 0.06118298275127208, + "language_loss": 0.91987318, + "learning_rate": 0.0008678952696402458, + "loss": 0.93083793, + "num_input_tokens_seen": 111866752, + "router_z_loss_mlp": 0.36132812, + "step": 1350, + "time_per_iteration": 2.5547540187835693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108511, + "balance_loss_mlp": 1.04932308, + "epoch": 0.25990765679107347, + "flos": 612223100928.0, + "grad_norm": 0.04808004024566397, + "language_loss": 0.86496055, + "learning_rate": 0.000867684218924801, + "loss": 0.8758117, + "num_input_tokens_seen": 111951328, + "router_z_loss_mlp": 0.35791016, + "step": 1351, + "time_per_iteration": 2.8406949043273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110301, + "balance_loss_mlp": 1.09857082, + "epoch": 0.26010003847633706, + "flos": 1537105766400.0, + "grad_norm": 0.059206679514604114, + "language_loss": 0.78947091, + "learning_rate": 0.0008674730254616827, + "loss": 0.80057395, + "num_input_tokens_seen": 112182272, + "router_z_loss_mlp": 0.1171875, + "step": 1352, + "time_per_iteration": 4.8775153160095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082749, + "balance_loss_mlp": 1.04753494, + "epoch": 0.2602924201616006, + "flos": 715947393024.0, + "grad_norm": 0.046134849134736367, + "language_loss": 0.85103661, + "learning_rate": 0.0008672616893328834, + "loss": 0.86186409, + "num_input_tokens_seen": 112261760, + "router_z_loss_mlp": 0.35253906, + "step": 1353, + "time_per_iteration": 2.98063588142395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108322, + "balance_loss_mlp": 1.04764819, + "epoch": 0.2604848018468642, + "flos": 643241917440.0, + "grad_norm": 0.060512322591449175, + "language_loss": 0.9000203, + "learning_rate": 0.0008670502106204512, + "loss": 0.91085243, + "num_input_tokens_seen": 112339136, + "router_z_loss_mlp": 0.35595703, + "step": 1354, + "time_per_iteration": 2.832679271697998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087805, + "balance_loss_mlp": 1.05073047, + "epoch": 0.26067718353212777, + "flos": 516783545856.0, + "grad_norm": 0.05860289542603218, + "language_loss": 0.8165139, + "learning_rate": 0.0008668385894064892, + "loss": 0.82739192, + "num_input_tokens_seen": 112409872, + "router_z_loss_mlp": 0.37084961, + "step": 1355, + "time_per_iteration": 2.6204822063446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086317, + "balance_loss_mlp": 1.05143666, + "epoch": 0.2608695652173913, + "flos": 822361890816.0, + "grad_norm": 0.0623840657908754, + "language_loss": 0.88803548, + "learning_rate": 0.0008666268257731562, + "loss": 0.8988986, + "num_input_tokens_seen": 112495616, + "router_z_loss_mlp": 0.34912109, + "step": 1356, + "time_per_iteration": 3.113147735595703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109295, + "balance_loss_mlp": 1.0566628, + "epoch": 0.2610619469026549, + "flos": 1007451744768.0, + "grad_norm": 0.056693012024963345, + "language_loss": 0.85794425, + "learning_rate": 0.0008664149198026662, + "loss": 0.86887372, + "num_input_tokens_seen": 112575168, + "router_z_loss_mlp": 0.36279297, + "step": 1357, + "time_per_iteration": 3.2569541931152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095714, + "balance_loss_mlp": 1.06040418, + "epoch": 0.2612543285879184, + "flos": 536523291648.0, + "grad_norm": 0.061594313952015485, + "language_loss": 0.88599586, + "learning_rate": 0.0008662028715772883, + "loss": 0.89695299, + "num_input_tokens_seen": 112648480, + "router_z_loss_mlp": 0.35351562, + "step": 1358, + "time_per_iteration": 2.6102256774902344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100258, + "balance_loss_mlp": 1.06475711, + "epoch": 0.261446710273182, + "flos": 519166803456.0, + "grad_norm": 0.04975036534081278, + "language_loss": 0.85662109, + "learning_rate": 0.0008659906811793467, + "loss": 0.86762363, + "num_input_tokens_seen": 112719856, + "router_z_loss_mlp": 0.35546875, + "step": 1359, + "time_per_iteration": 2.6921935081481934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101082, + "balance_loss_mlp": 1.06543839, + "epoch": 0.26163909195844554, + "flos": 582975055872.0, + "grad_norm": 0.06646109128582675, + "language_loss": 0.89397144, + "learning_rate": 0.0008657783486912215, + "loss": 0.90498233, + "num_input_tokens_seen": 112795088, + "router_z_loss_mlp": 0.35693359, + "step": 1360, + "time_per_iteration": 2.7003283500671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110494, + "balance_loss_mlp": 1.06920147, + "epoch": 0.2618314736437091, + "flos": 958362057216.0, + "grad_norm": 0.06344844215605515, + "language_loss": 0.89840877, + "learning_rate": 0.0008655658741953472, + "loss": 0.90945816, + "num_input_tokens_seen": 112879888, + "router_z_loss_mlp": 0.35742188, + "step": 1361, + "time_per_iteration": 3.207960844039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101357, + "balance_loss_mlp": 1.0664053, + "epoch": 0.26202385532897265, + "flos": 574530347520.0, + "grad_norm": 0.04606923720206454, + "language_loss": 0.88105857, + "learning_rate": 0.0008653532577742136, + "loss": 0.89207214, + "num_input_tokens_seen": 112952208, + "router_z_loss_mlp": 0.34960938, + "step": 1362, + "time_per_iteration": 2.69209885597229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091565, + "balance_loss_mlp": 1.05744767, + "epoch": 0.26221623701423624, + "flos": 445240585728.0, + "grad_norm": 0.05480512848555835, + "language_loss": 0.87200153, + "learning_rate": 0.0008651404995103659, + "loss": 0.88291717, + "num_input_tokens_seen": 113017472, + "router_z_loss_mlp": 0.34155273, + "step": 1363, + "time_per_iteration": 2.5325255393981934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095927, + "balance_loss_mlp": 1.06164205, + "epoch": 0.26240861869949983, + "flos": 535459700736.0, + "grad_norm": 0.04992660146640532, + "language_loss": 0.870713, + "learning_rate": 0.0008649275994864041, + "loss": 0.8816722, + "num_input_tokens_seen": 113090000, + "router_z_loss_mlp": 0.34301758, + "step": 1364, + "time_per_iteration": 2.682365894317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098846, + "balance_loss_mlp": 1.06267846, + "epoch": 0.26260100038476336, + "flos": 564940500480.0, + "grad_norm": 0.05369640644722127, + "language_loss": 0.83917898, + "learning_rate": 0.0008647145577849834, + "loss": 0.85016745, + "num_input_tokens_seen": 113169424, + "router_z_loss_mlp": 0.36157227, + "step": 1365, + "time_per_iteration": 2.8129918575286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102304, + "balance_loss_mlp": 1.06701851, + "epoch": 0.26279338207002695, + "flos": 612745426944.0, + "grad_norm": 0.045782565775991005, + "language_loss": 0.82809973, + "learning_rate": 0.0008645013744888139, + "loss": 0.83912277, + "num_input_tokens_seen": 113256752, + "router_z_loss_mlp": 0.35327148, + "step": 1366, + "time_per_iteration": 2.8523411750793457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101615, + "balance_loss_mlp": 1.06730664, + "epoch": 0.2629857637552905, + "flos": 522555425280.0, + "grad_norm": 0.0597350257589219, + "language_loss": 0.87579656, + "learning_rate": 0.0008642880496806607, + "loss": 0.88681269, + "num_input_tokens_seen": 113330512, + "router_z_loss_mlp": 0.34350586, + "step": 1367, + "time_per_iteration": 2.766350507736206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105811, + "balance_loss_mlp": 1.0706687, + "epoch": 0.26317814544055407, + "flos": 534273864192.0, + "grad_norm": 0.05812227598952832, + "language_loss": 0.84219468, + "learning_rate": 0.0008640745834433437, + "loss": 0.85325277, + "num_input_tokens_seen": 113409088, + "router_z_loss_mlp": 0.35205078, + "step": 1368, + "time_per_iteration": 2.7220964431762695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100459, + "balance_loss_mlp": 1.06553102, + "epoch": 0.2633705271258176, + "flos": 555235762176.0, + "grad_norm": 0.06954601812969684, + "language_loss": 0.86862296, + "learning_rate": 0.000863860975859738, + "loss": 0.87962759, + "num_input_tokens_seen": 113486624, + "router_z_loss_mlp": 0.34960938, + "step": 1369, + "time_per_iteration": 2.8985280990600586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094199, + "balance_loss_mlp": 1.05931866, + "epoch": 0.2635629088110812, + "flos": 552136711680.0, + "grad_norm": 0.06493737783890446, + "language_loss": 0.88711715, + "learning_rate": 0.0008636472270127733, + "loss": 0.89805913, + "num_input_tokens_seen": 113555776, + "router_z_loss_mlp": 0.34936523, + "step": 1370, + "time_per_iteration": 2.634615182876587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090389, + "balance_loss_mlp": 1.05498338, + "epoch": 0.2637552904963448, + "flos": 455752839168.0, + "grad_norm": 0.062476231294863314, + "language_loss": 0.89913595, + "learning_rate": 0.0008634333369854345, + "loss": 0.91003978, + "num_input_tokens_seen": 113624208, + "router_z_loss_mlp": 0.35424805, + "step": 1371, + "time_per_iteration": 2.5908331871032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082759, + "balance_loss_mlp": 1.04818797, + "epoch": 0.2639476721816083, + "flos": 612847323648.0, + "grad_norm": 0.05509554660574217, + "language_loss": 0.87495965, + "learning_rate": 0.0008632193058607608, + "loss": 0.88578725, + "num_input_tokens_seen": 113698544, + "router_z_loss_mlp": 0.34594727, + "step": 1372, + "time_per_iteration": 2.6963188648223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108436, + "balance_loss_mlp": 1.04878759, + "epoch": 0.2641400538668719, + "flos": 571645112832.0, + "grad_norm": 0.05982264925210271, + "language_loss": 0.81028771, + "learning_rate": 0.0008630051337218466, + "loss": 0.82113135, + "num_input_tokens_seen": 113769024, + "router_z_loss_mlp": 0.35595703, + "step": 1373, + "time_per_iteration": 2.644540786743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079985, + "balance_loss_mlp": 1.04582, + "epoch": 0.2643324355521354, + "flos": 581979866112.0, + "grad_norm": 0.08561984623812412, + "language_loss": 0.82428128, + "learning_rate": 0.0008627908206518409, + "loss": 0.8350811, + "num_input_tokens_seen": 113836320, + "router_z_loss_mlp": 0.34179688, + "step": 1374, + "time_per_iteration": 2.660578966140747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110354, + "balance_loss_mlp": 1.08904421, + "epoch": 0.264524817237399, + "flos": 1543845284352.0, + "grad_norm": 0.03725698642258328, + "language_loss": 0.75151253, + "learning_rate": 0.0008625763667339472, + "loss": 0.76254791, + "num_input_tokens_seen": 114065040, + "router_z_loss_mlp": 0.14453125, + "step": 1375, + "time_per_iteration": 4.9595324993133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079461, + "balance_loss_mlp": 1.04474711, + "epoch": 0.26471719892266254, + "flos": 517783117824.0, + "grad_norm": 0.05493851972821551, + "language_loss": 0.91330564, + "learning_rate": 0.0008623617720514241, + "loss": 0.92410028, + "num_input_tokens_seen": 114133488, + "router_z_loss_mlp": 0.34741211, + "step": 1376, + "time_per_iteration": 2.5929205417633057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090244, + "balance_loss_mlp": 1.05498242, + "epoch": 0.26490958060792613, + "flos": 516936314880.0, + "grad_norm": 0.08106601153347975, + "language_loss": 0.84946424, + "learning_rate": 0.0008621470366875848, + "loss": 0.8603667, + "num_input_tokens_seen": 114200704, + "router_z_loss_mlp": 0.3527832, + "step": 1377, + "time_per_iteration": 2.5729684829711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084439, + "balance_loss_mlp": 1.0497725, + "epoch": 0.26510196229318966, + "flos": 596298350592.0, + "grad_norm": 0.05588669268878349, + "language_loss": 0.87771004, + "learning_rate": 0.0008619321607257966, + "loss": 0.88855445, + "num_input_tokens_seen": 114272160, + "router_z_loss_mlp": 0.34692383, + "step": 1378, + "time_per_iteration": 2.6708004474639893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082921, + "balance_loss_mlp": 1.0483501, + "epoch": 0.26529434397845325, + "flos": 685488780288.0, + "grad_norm": 0.051774701706919043, + "language_loss": 0.82311988, + "learning_rate": 0.000861717144249482, + "loss": 0.83394915, + "num_input_tokens_seen": 114347904, + "router_z_loss_mlp": 0.34594727, + "step": 1379, + "time_per_iteration": 2.8249831199645996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081979, + "balance_loss_mlp": 1.0468595, + "epoch": 0.26548672566371684, + "flos": 424127328768.0, + "grad_norm": 0.06288210815556809, + "language_loss": 0.90205348, + "learning_rate": 0.0008615019873421175, + "loss": 0.91287327, + "num_input_tokens_seen": 114409952, + "router_z_loss_mlp": 0.35131836, + "step": 1380, + "time_per_iteration": 2.455320358276367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108606, + "balance_loss_mlp": 1.05108428, + "epoch": 0.26567910734898037, + "flos": 489619012608.0, + "grad_norm": 0.05393715583789803, + "language_loss": 0.8609767, + "learning_rate": 0.0008612866900872349, + "loss": 0.87183726, + "num_input_tokens_seen": 114474832, + "router_z_loss_mlp": 0.35009766, + "step": 1381, + "time_per_iteration": 2.54070782661438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083002, + "balance_loss_mlp": 1.04895568, + "epoch": 0.26587148903424396, + "flos": 533947977216.0, + "grad_norm": 0.05290962754614328, + "language_loss": 0.88052452, + "learning_rate": 0.0008610712525684197, + "loss": 0.8913545, + "num_input_tokens_seen": 114545152, + "router_z_loss_mlp": 0.34082031, + "step": 1382, + "time_per_iteration": 2.6350595951080322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084878, + "balance_loss_mlp": 1.05049801, + "epoch": 0.2660638707195075, + "flos": 1017067732992.0, + "grad_norm": 0.06267977315545337, + "language_loss": 0.84534729, + "learning_rate": 0.0008608556748693121, + "loss": 0.85619605, + "num_input_tokens_seen": 114626512, + "router_z_loss_mlp": 0.34423828, + "step": 1383, + "time_per_iteration": 3.231172561645508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086499, + "balance_loss_mlp": 1.05216646, + "epoch": 0.2662562524047711, + "flos": 523712148480.0, + "grad_norm": 0.0585640776606728, + "language_loss": 0.86247015, + "learning_rate": 0.000860639957073607, + "loss": 0.87333512, + "num_input_tokens_seen": 114701008, + "router_z_loss_mlp": 0.34375, + "step": 1384, + "time_per_iteration": 2.72265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108718, + "balance_loss_mlp": 1.05280018, + "epoch": 0.2664486340900346, + "flos": 552107598336.0, + "grad_norm": 0.07312693577598182, + "language_loss": 0.87888551, + "learning_rate": 0.0008604240992650534, + "loss": 0.88975734, + "num_input_tokens_seen": 114771984, + "router_z_loss_mlp": 0.34423828, + "step": 1385, + "time_per_iteration": 2.6524593830108643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079729, + "balance_loss_mlp": 1.0455637, + "epoch": 0.2666410157752982, + "flos": 469895233536.0, + "grad_norm": 0.058731941016447735, + "language_loss": 0.89070451, + "learning_rate": 0.0008602081015274545, + "loss": 0.90150183, + "num_input_tokens_seen": 114844800, + "router_z_loss_mlp": 0.34179688, + "step": 1386, + "time_per_iteration": 2.7026963233947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083917, + "balance_loss_mlp": 1.04953694, + "epoch": 0.2668333974605617, + "flos": 569645968896.0, + "grad_norm": 0.04572049987167494, + "language_loss": 0.83031899, + "learning_rate": 0.0008599919639446684, + "loss": 0.84115815, + "num_input_tokens_seen": 114918544, + "router_z_loss_mlp": 0.34423828, + "step": 1387, + "time_per_iteration": 2.6891515254974365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083945, + "balance_loss_mlp": 1.04920745, + "epoch": 0.2670257791458253, + "flos": 398755325952.0, + "grad_norm": 0.06113709644372323, + "language_loss": 0.80263156, + "learning_rate": 0.000859775686600607, + "loss": 0.81347102, + "num_input_tokens_seen": 114984272, + "router_z_loss_mlp": 0.34765625, + "step": 1388, + "time_per_iteration": 2.5367043018341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088195, + "balance_loss_mlp": 1.05400586, + "epoch": 0.2672181608310889, + "flos": 515587534848.0, + "grad_norm": 0.07715457421599592, + "language_loss": 0.85045016, + "learning_rate": 0.0008595592695792367, + "loss": 0.86133218, + "num_input_tokens_seen": 115054800, + "router_z_loss_mlp": 0.34228516, + "step": 1389, + "time_per_iteration": 2.653684139251709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097788, + "balance_loss_mlp": 1.06348014, + "epoch": 0.26741054251635243, + "flos": 507270864384.0, + "grad_norm": 0.05276083290405683, + "language_loss": 0.9085412, + "learning_rate": 0.0008593427129645778, + "loss": 0.91951907, + "num_input_tokens_seen": 115120928, + "router_z_loss_mlp": 0.34350586, + "step": 1390, + "time_per_iteration": 2.5497426986694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100169, + "balance_loss_mlp": 1.06512117, + "epoch": 0.267602924201616, + "flos": 576357783552.0, + "grad_norm": 0.0907689109524766, + "language_loss": 0.85371816, + "learning_rate": 0.0008591260168407052, + "loss": 0.86471987, + "num_input_tokens_seen": 115196688, + "router_z_loss_mlp": 0.35083008, + "step": 1391, + "time_per_iteration": 2.752777576446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099505, + "balance_loss_mlp": 1.06598353, + "epoch": 0.26779530588687955, + "flos": 523731087360.0, + "grad_norm": 0.05201595058269412, + "language_loss": 0.83216429, + "learning_rate": 0.0008589091812917479, + "loss": 0.84315932, + "num_input_tokens_seen": 115264912, + "router_z_loss_mlp": 0.33544922, + "step": 1392, + "time_per_iteration": 2.602858781814575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092875, + "balance_loss_mlp": 1.05897164, + "epoch": 0.26798768757214314, + "flos": 556508938752.0, + "grad_norm": 0.054199587407170555, + "language_loss": 0.85476619, + "learning_rate": 0.0008586922064018887, + "loss": 0.86569488, + "num_input_tokens_seen": 115334672, + "router_z_loss_mlp": 0.33911133, + "step": 1393, + "time_per_iteration": 2.663135528564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110003, + "balance_loss_mlp": 1.0641005, + "epoch": 0.2681800692574067, + "flos": 930246004224.0, + "grad_norm": 0.05606615550920643, + "language_loss": 0.89228028, + "learning_rate": 0.0008584750922553651, + "loss": 0.90328062, + "num_input_tokens_seen": 115420032, + "router_z_loss_mlp": 0.35961914, + "step": 1394, + "time_per_iteration": 3.126030206680298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094414, + "balance_loss_mlp": 1.06020141, + "epoch": 0.26837245094267026, + "flos": 700771931136.0, + "grad_norm": 0.055333821001128054, + "language_loss": 0.83724457, + "learning_rate": 0.0008582578389364677, + "loss": 0.84818876, + "num_input_tokens_seen": 115492576, + "router_z_loss_mlp": 0.3425293, + "step": 1395, + "time_per_iteration": 2.858774423599243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096924, + "balance_loss_mlp": 1.06187642, + "epoch": 0.26856483262793385, + "flos": 592892199936.0, + "grad_norm": 0.04773968262798697, + "language_loss": 0.9195987, + "learning_rate": 0.0008580404465295422, + "loss": 0.93056792, + "num_input_tokens_seen": 115568368, + "router_z_loss_mlp": 0.35058594, + "step": 1396, + "time_per_iteration": 2.7737646102905273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096258, + "balance_loss_mlp": 1.06190252, + "epoch": 0.2687572143131974, + "flos": 713943866880.0, + "grad_norm": 0.07288281155022573, + "language_loss": 0.88208908, + "learning_rate": 0.0008578229151189876, + "loss": 0.89305162, + "num_input_tokens_seen": 115651536, + "router_z_loss_mlp": 0.34375, + "step": 1397, + "time_per_iteration": 2.9974029064178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087674, + "balance_loss_mlp": 1.05441451, + "epoch": 0.26894959599846097, + "flos": 467481452544.0, + "grad_norm": 0.0581153622766974, + "language_loss": 0.81433654, + "learning_rate": 0.0008576052447892573, + "loss": 0.82521319, + "num_input_tokens_seen": 115715696, + "router_z_loss_mlp": 0.33276367, + "step": 1398, + "time_per_iteration": 2.586427688598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090085, + "balance_loss_mlp": 1.05551457, + "epoch": 0.2691419776837245, + "flos": 468470850048.0, + "grad_norm": 0.08083264737918114, + "language_loss": 0.86589479, + "learning_rate": 0.000857387435624858, + "loss": 0.87679559, + "num_input_tokens_seen": 115780928, + "router_z_loss_mlp": 0.34619141, + "step": 1399, + "time_per_iteration": 2.5227789878845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096966, + "balance_loss_mlp": 1.06239533, + "epoch": 0.2693343593689881, + "flos": 937244418048.0, + "grad_norm": 0.0443934808912178, + "language_loss": 0.88525635, + "learning_rate": 0.0008571694877103513, + "loss": 0.89622605, + "num_input_tokens_seen": 115874432, + "router_z_loss_mlp": 0.34594727, + "step": 1400, + "time_per_iteration": 3.252573251724243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095335, + "balance_loss_mlp": 1.06064546, + "epoch": 0.2695267410542516, + "flos": 577303511040.0, + "grad_norm": 0.05297583192015558, + "language_loss": 0.87603962, + "learning_rate": 0.0008569514011303515, + "loss": 0.88699305, + "num_input_tokens_seen": 115956608, + "router_z_loss_mlp": 0.34716797, + "step": 1401, + "time_per_iteration": 2.7824223041534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092716, + "balance_loss_mlp": 1.0577879, + "epoch": 0.2697191227395152, + "flos": 556539462144.0, + "grad_norm": 0.06414718170709632, + "language_loss": 0.87859815, + "learning_rate": 0.0008567331759695277, + "loss": 0.88952529, + "num_input_tokens_seen": 116031728, + "router_z_loss_mlp": 0.34985352, + "step": 1402, + "time_per_iteration": 2.7109498977661133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098142, + "balance_loss_mlp": 1.06183052, + "epoch": 0.26991150442477874, + "flos": 529024310784.0, + "grad_norm": 0.05462837975359106, + "language_loss": 0.86148876, + "learning_rate": 0.0008565148123126023, + "loss": 0.87247014, + "num_input_tokens_seen": 116104288, + "router_z_loss_mlp": 0.36328125, + "step": 1403, + "time_per_iteration": 2.6526310443878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088301, + "balance_loss_mlp": 1.05289555, + "epoch": 0.2701038861100423, + "flos": 531737837568.0, + "grad_norm": 0.12276519374226595, + "language_loss": 0.86177158, + "learning_rate": 0.0008562963102443516, + "loss": 0.87265456, + "num_input_tokens_seen": 116177920, + "router_z_loss_mlp": 0.35400391, + "step": 1404, + "time_per_iteration": 2.6809849739074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092814, + "balance_loss_mlp": 1.05757618, + "epoch": 0.2702962677953059, + "flos": 734908737024.0, + "grad_norm": 0.05743337882617235, + "language_loss": 0.85265231, + "learning_rate": 0.0008560776698496056, + "loss": 0.86358047, + "num_input_tokens_seen": 116251680, + "router_z_loss_mlp": 0.3527832, + "step": 1405, + "time_per_iteration": 2.9008774757385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099947, + "balance_loss_mlp": 1.06420779, + "epoch": 0.27048864948056944, + "flos": 574453181952.0, + "grad_norm": 0.06281004106283315, + "language_loss": 0.85864103, + "learning_rate": 0.0008558588912132481, + "loss": 0.86964047, + "num_input_tokens_seen": 116327664, + "router_z_loss_mlp": 0.35742188, + "step": 1406, + "time_per_iteration": 2.8967840671539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057025, + "balance_loss_mlp": 1.04519963, + "epoch": 0.27068103116583303, + "flos": 1423091953152.0, + "grad_norm": 0.03126478371356873, + "language_loss": 0.76458991, + "learning_rate": 0.0008556399744202163, + "loss": 0.77516007, + "num_input_tokens_seen": 116555152, + "router_z_loss_mlp": 0.11816406, + "step": 1407, + "time_per_iteration": 4.933698892593384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109874, + "balance_loss_mlp": 1.06400251, + "epoch": 0.27087341285109656, + "flos": 531742219776.0, + "grad_norm": 0.050597666424933845, + "language_loss": 0.82942683, + "learning_rate": 0.0008554209195555016, + "loss": 0.84041423, + "num_input_tokens_seen": 116626016, + "router_z_loss_mlp": 0.34765625, + "step": 1408, + "time_per_iteration": 2.6599888801574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093614, + "balance_loss_mlp": 1.0582329, + "epoch": 0.27106579453636015, + "flos": 581108332032.0, + "grad_norm": 0.058412744436649705, + "language_loss": 0.88199335, + "learning_rate": 0.0008552017267041483, + "loss": 0.89292949, + "num_input_tokens_seen": 116699152, + "router_z_loss_mlp": 0.35375977, + "step": 1409, + "time_per_iteration": 2.673828363418579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091257, + "balance_loss_mlp": 1.05563736, + "epoch": 0.2712581762216237, + "flos": 506533160448.0, + "grad_norm": 0.05246666988179206, + "language_loss": 0.83264577, + "learning_rate": 0.0008549823959512549, + "loss": 0.84355831, + "num_input_tokens_seen": 116770912, + "router_z_loss_mlp": 0.35644531, + "step": 1410, + "time_per_iteration": 2.634523868560791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091517, + "balance_loss_mlp": 1.05451441, + "epoch": 0.27145055790688727, + "flos": 997019476992.0, + "grad_norm": 0.050905982394943275, + "language_loss": 0.86668658, + "learning_rate": 0.0008547629273819728, + "loss": 0.87760168, + "num_input_tokens_seen": 116863088, + "router_z_loss_mlp": 0.36987305, + "step": 1411, + "time_per_iteration": 3.3559322357177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094566, + "balance_loss_mlp": 1.05875564, + "epoch": 0.2716429395921508, + "flos": 546420086784.0, + "grad_norm": 0.06363965087638479, + "language_loss": 0.83773881, + "learning_rate": 0.0008545433210815074, + "loss": 0.84868449, + "num_input_tokens_seen": 116929504, + "router_z_loss_mlp": 0.35839844, + "step": 1412, + "time_per_iteration": 2.607379913330078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109226, + "balance_loss_mlp": 1.05606771, + "epoch": 0.2718353212774144, + "flos": 572954605056.0, + "grad_norm": 0.05881941163427475, + "language_loss": 0.87753916, + "learning_rate": 0.0008543235771351176, + "loss": 0.88846171, + "num_input_tokens_seen": 117004064, + "router_z_loss_mlp": 0.36230469, + "step": 1413, + "time_per_iteration": 2.722318649291992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096323, + "balance_loss_mlp": 1.06118035, + "epoch": 0.272027702962678, + "flos": 643986823680.0, + "grad_norm": 0.044269909609048815, + "language_loss": 0.84649068, + "learning_rate": 0.0008541036956281154, + "loss": 0.85745388, + "num_input_tokens_seen": 117081328, + "router_z_loss_mlp": 0.3515625, + "step": 1414, + "time_per_iteration": 2.8785104751586914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100326, + "balance_loss_mlp": 1.0645628, + "epoch": 0.2722200846479415, + "flos": 653410755072.0, + "grad_norm": 0.0658433573318433, + "language_loss": 0.82281864, + "learning_rate": 0.0008538836766458665, + "loss": 0.83382189, + "num_input_tokens_seen": 117156544, + "router_z_loss_mlp": 0.35791016, + "step": 1415, + "time_per_iteration": 2.834148645401001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098566, + "balance_loss_mlp": 1.06311321, + "epoch": 0.2724124663332051, + "flos": 579346324992.0, + "grad_norm": 0.07330345680392343, + "language_loss": 0.85275221, + "learning_rate": 0.0008536635202737897, + "loss": 0.86373788, + "num_input_tokens_seen": 117230208, + "router_z_loss_mlp": 0.35473633, + "step": 1416, + "time_per_iteration": 2.7886626720428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099993, + "balance_loss_mlp": 1.06513667, + "epoch": 0.2726048480184686, + "flos": 537178037760.0, + "grad_norm": 0.06667202152625065, + "language_loss": 0.82212454, + "learning_rate": 0.0008534432265973573, + "loss": 0.8331244, + "num_input_tokens_seen": 117298080, + "router_z_loss_mlp": 0.34912109, + "step": 1417, + "time_per_iteration": 2.604626417160034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095149, + "balance_loss_mlp": 1.05931497, + "epoch": 0.2727972297037322, + "flos": 995360776704.0, + "grad_norm": 0.08172035912068172, + "language_loss": 0.88052338, + "learning_rate": 0.000853222795702095, + "loss": 0.8914749, + "num_input_tokens_seen": 117396256, + "router_z_loss_mlp": 0.35839844, + "step": 1418, + "time_per_iteration": 3.391664505004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095275, + "balance_loss_mlp": 1.06125307, + "epoch": 0.27298961138899575, + "flos": 605924513280.0, + "grad_norm": 0.05480231780963067, + "language_loss": 0.83608377, + "learning_rate": 0.0008530022276735813, + "loss": 0.84703648, + "num_input_tokens_seen": 117467936, + "router_z_loss_mlp": 0.34057617, + "step": 1419, + "time_per_iteration": 2.705235004425049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107985, + "balance_loss_mlp": 1.04666257, + "epoch": 0.27318199307425933, + "flos": 529059216384.0, + "grad_norm": 0.054542785174291425, + "language_loss": 0.85957551, + "learning_rate": 0.0008527815225974489, + "loss": 0.87037402, + "num_input_tokens_seen": 117538256, + "router_z_loss_mlp": 0.33203125, + "step": 1420, + "time_per_iteration": 2.654003620147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087931, + "balance_loss_mlp": 1.05395651, + "epoch": 0.2733743747595229, + "flos": 408809272320.0, + "grad_norm": 0.06460584893454492, + "language_loss": 0.88538897, + "learning_rate": 0.0008525606805593829, + "loss": 0.89626825, + "num_input_tokens_seen": 117599488, + "router_z_loss_mlp": 0.33984375, + "step": 1421, + "time_per_iteration": 2.4287912845611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087097, + "balance_loss_mlp": 1.05233574, + "epoch": 0.27356675644478645, + "flos": 515976030720.0, + "grad_norm": 0.055753761808712644, + "language_loss": 0.82379127, + "learning_rate": 0.0008523397016451213, + "loss": 0.8346622, + "num_input_tokens_seen": 117664240, + "router_z_loss_mlp": 0.34814453, + "step": 1422, + "time_per_iteration": 2.5620808601379395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096714, + "balance_loss_mlp": 1.0617379, + "epoch": 0.27375913813005004, + "flos": 1051914539520.0, + "grad_norm": 0.0481984630724129, + "language_loss": 0.87272507, + "learning_rate": 0.0008521185859404564, + "loss": 0.88369215, + "num_input_tokens_seen": 117754768, + "router_z_loss_mlp": 0.34985352, + "step": 1423, + "time_per_iteration": 3.361171245574951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085067, + "balance_loss_mlp": 1.05037737, + "epoch": 0.27395151981531357, + "flos": 624507535872.0, + "grad_norm": 0.05502068897485729, + "language_loss": 0.89717311, + "learning_rate": 0.0008518973335312326, + "loss": 0.90802383, + "num_input_tokens_seen": 117832816, + "router_z_loss_mlp": 0.34716797, + "step": 1424, + "time_per_iteration": 2.7964961528778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088932, + "balance_loss_mlp": 1.05390823, + "epoch": 0.27414390150057716, + "flos": 550112836608.0, + "grad_norm": 0.056708357312241935, + "language_loss": 0.83878243, + "learning_rate": 0.0008516759445033477, + "loss": 0.84967172, + "num_input_tokens_seen": 117899168, + "router_z_loss_mlp": 0.35058594, + "step": 1425, + "time_per_iteration": 2.6100170612335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090578, + "balance_loss_mlp": 1.05603087, + "epoch": 0.2743362831858407, + "flos": 539596200960.0, + "grad_norm": 0.061048707375716146, + "language_loss": 0.84983361, + "learning_rate": 0.0008514544189427526, + "loss": 0.86073935, + "num_input_tokens_seen": 117972384, + "router_z_loss_mlp": 0.34594727, + "step": 1426, + "time_per_iteration": 2.6465015411376953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088002, + "balance_loss_mlp": 1.05347919, + "epoch": 0.2745286648711043, + "flos": 468352986624.0, + "grad_norm": 0.061383055639382046, + "language_loss": 0.8704657, + "learning_rate": 0.0008512327569354511, + "loss": 0.88134569, + "num_input_tokens_seen": 118039584, + "router_z_loss_mlp": 0.34570312, + "step": 1427, + "time_per_iteration": 2.5696229934692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087297, + "balance_loss_mlp": 1.05212998, + "epoch": 0.2747210465563678, + "flos": 472617524736.0, + "grad_norm": 0.05941983459852813, + "language_loss": 0.8349936, + "learning_rate": 0.0008510109585675001, + "loss": 0.84586656, + "num_input_tokens_seen": 118108352, + "router_z_loss_mlp": 0.35180664, + "step": 1428, + "time_per_iteration": 2.6195123195648193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086947, + "balance_loss_mlp": 1.07245111, + "epoch": 0.2749134282416314, + "flos": 1314345070080.0, + "grad_norm": 0.037284634304165044, + "language_loss": 0.81153345, + "learning_rate": 0.0008507890239250093, + "loss": 0.82240289, + "num_input_tokens_seen": 118331120, + "router_z_loss_mlp": 0.14453125, + "step": 1429, + "time_per_iteration": 4.714681625366211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083808, + "balance_loss_mlp": 1.04938078, + "epoch": 0.275105809926895, + "flos": 970445670912.0, + "grad_norm": 0.07686972857934649, + "language_loss": 0.80942416, + "learning_rate": 0.0008505669530941415, + "loss": 0.82026225, + "num_input_tokens_seen": 118415872, + "router_z_loss_mlp": 0.34472656, + "step": 1430, + "time_per_iteration": 3.2975006103515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080869, + "balance_loss_mlp": 1.04715657, + "epoch": 0.2752981916121585, + "flos": 527089185792.0, + "grad_norm": 0.061626933195079385, + "language_loss": 0.84357536, + "learning_rate": 0.000850344746161112, + "loss": 0.85438406, + "num_input_tokens_seen": 118483008, + "router_z_loss_mlp": 0.33740234, + "step": 1431, + "time_per_iteration": 2.596623182296753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079512, + "balance_loss_mlp": 1.04487014, + "epoch": 0.2754905732974221, + "flos": 453487444992.0, + "grad_norm": 0.05883177646218185, + "language_loss": 0.87880194, + "learning_rate": 0.0008501224032121894, + "loss": 0.88959706, + "num_input_tokens_seen": 118545840, + "router_z_loss_mlp": 0.34692383, + "step": 1432, + "time_per_iteration": 2.5134201049804688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082699, + "balance_loss_mlp": 1.04817569, + "epoch": 0.27568295498268564, + "flos": 497216918016.0, + "grad_norm": 0.05235854639463291, + "language_loss": 0.82002538, + "learning_rate": 0.0008498999243336946, + "loss": 0.83085239, + "num_input_tokens_seen": 118615168, + "router_z_loss_mlp": 0.34570312, + "step": 1433, + "time_per_iteration": 2.601771593093872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095225, + "balance_loss_mlp": 1.06060696, + "epoch": 0.2758753366679492, + "flos": 607890161664.0, + "grad_norm": 0.05891633941102979, + "language_loss": 0.87444806, + "learning_rate": 0.0008496773096120021, + "loss": 0.8854003, + "num_input_tokens_seen": 118690384, + "router_z_loss_mlp": 0.34643555, + "step": 1434, + "time_per_iteration": 2.788516044616699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096763, + "balance_loss_mlp": 1.06212115, + "epoch": 0.27606771835321275, + "flos": 739803290112.0, + "grad_norm": 0.06770297286276174, + "language_loss": 0.84185004, + "learning_rate": 0.0008494545591335381, + "loss": 0.85281765, + "num_input_tokens_seen": 118763024, + "router_z_loss_mlp": 0.34667969, + "step": 1435, + "time_per_iteration": 2.8759751319885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110127, + "balance_loss_mlp": 1.06736696, + "epoch": 0.27626010003847634, + "flos": 554279860224.0, + "grad_norm": 0.04450223786838935, + "language_loss": 0.87244952, + "learning_rate": 0.0008492316729847823, + "loss": 0.88346225, + "num_input_tokens_seen": 118845536, + "router_z_loss_mlp": 0.33935547, + "step": 1436, + "time_per_iteration": 2.781244993209839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094118, + "balance_loss_mlp": 1.06023908, + "epoch": 0.2764524817237399, + "flos": 542270439936.0, + "grad_norm": 0.055325808882979444, + "language_loss": 0.79874223, + "learning_rate": 0.0008490086512522664, + "loss": 0.80968338, + "num_input_tokens_seen": 118919008, + "router_z_loss_mlp": 0.33911133, + "step": 1437, + "time_per_iteration": 2.7197165489196777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093138, + "balance_loss_mlp": 1.05913925, + "epoch": 0.27664486340900346, + "flos": 406027344384.0, + "grad_norm": 0.0539948754920925, + "language_loss": 0.90713239, + "learning_rate": 0.0008487854940225755, + "loss": 0.91806382, + "num_input_tokens_seen": 118981376, + "router_z_loss_mlp": 0.34008789, + "step": 1438, + "time_per_iteration": 2.438218593597412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094602, + "balance_loss_mlp": 1.06017423, + "epoch": 0.27683724509426705, + "flos": 521884712448.0, + "grad_norm": 0.06140365718889793, + "language_loss": 0.90140885, + "learning_rate": 0.0008485622013823466, + "loss": 0.91235483, + "num_input_tokens_seen": 119050560, + "router_z_loss_mlp": 0.34448242, + "step": 1439, + "time_per_iteration": 2.653393268585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102393, + "balance_loss_mlp": 1.06770289, + "epoch": 0.2770296267795306, + "flos": 535085761536.0, + "grad_norm": 0.07554461134761571, + "language_loss": 0.8283006, + "learning_rate": 0.00084833877341827, + "loss": 0.83932453, + "num_input_tokens_seen": 119121104, + "router_z_loss_mlp": 0.34692383, + "step": 1440, + "time_per_iteration": 2.6312928199768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109947, + "balance_loss_mlp": 1.06587648, + "epoch": 0.27722200846479417, + "flos": 487747906560.0, + "grad_norm": 0.12145939933268801, + "language_loss": 0.8064183, + "learning_rate": 0.000848115210217088, + "loss": 0.81741297, + "num_input_tokens_seen": 119187712, + "router_z_loss_mlp": 0.33618164, + "step": 1441, + "time_per_iteration": 2.5490710735321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086774, + "balance_loss_mlp": 1.05167925, + "epoch": 0.2774143901500577, + "flos": 618012509184.0, + "grad_norm": 0.05766268366580332, + "language_loss": 0.82057106, + "learning_rate": 0.0008478915118655952, + "loss": 0.83143878, + "num_input_tokens_seen": 119259264, + "router_z_loss_mlp": 0.35131836, + "step": 1442, + "time_per_iteration": 2.710240602493286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086605, + "balance_loss_mlp": 1.05160558, + "epoch": 0.2776067718353213, + "flos": 513563659776.0, + "grad_norm": 0.05774564569051742, + "language_loss": 0.86505657, + "learning_rate": 0.0008476676784506393, + "loss": 0.87592262, + "num_input_tokens_seen": 119328304, + "router_z_loss_mlp": 0.35009766, + "step": 1443, + "time_per_iteration": 2.636622667312622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108686, + "balance_loss_mlp": 1.0526228, + "epoch": 0.2777991535205848, + "flos": 1003985957376.0, + "grad_norm": 0.10311001825576924, + "language_loss": 0.82024419, + "learning_rate": 0.0008474437100591201, + "loss": 0.8311128, + "num_input_tokens_seen": 119412352, + "router_z_loss_mlp": 0.34277344, + "step": 1444, + "time_per_iteration": 3.282383918762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091274, + "balance_loss_mlp": 1.05517697, + "epoch": 0.2779915352058484, + "flos": 550005147648.0, + "grad_norm": 0.05151300271624721, + "language_loss": 0.85496646, + "learning_rate": 0.0008472196067779898, + "loss": 0.86587918, + "num_input_tokens_seen": 119484464, + "router_z_loss_mlp": 0.36108398, + "step": 1445, + "time_per_iteration": 2.703263998031616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092087, + "balance_loss_mlp": 1.05591917, + "epoch": 0.278183916891112, + "flos": 873444930048.0, + "grad_norm": 0.06736388569990436, + "language_loss": 0.85432607, + "learning_rate": 0.0008469953686942531, + "loss": 0.86524689, + "num_input_tokens_seen": 119557280, + "router_z_loss_mlp": 0.36181641, + "step": 1446, + "time_per_iteration": 3.0834743976593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087224, + "balance_loss_mlp": 1.05305839, + "epoch": 0.2783762985763755, + "flos": 623782978560.0, + "grad_norm": 0.06536474240751361, + "language_loss": 0.83167183, + "learning_rate": 0.0008467709958949668, + "loss": 0.84254414, + "num_input_tokens_seen": 119631232, + "router_z_loss_mlp": 0.34204102, + "step": 1447, + "time_per_iteration": 2.737135887145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087199, + "balance_loss_mlp": 1.05362952, + "epoch": 0.2785686802616391, + "flos": 581571021312.0, + "grad_norm": 0.057056565872365954, + "language_loss": 0.85917461, + "learning_rate": 0.0008465464884672403, + "loss": 0.87004662, + "num_input_tokens_seen": 119700224, + "router_z_loss_mlp": 0.33569336, + "step": 1448, + "time_per_iteration": 2.6771810054779053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087224, + "balance_loss_mlp": 1.05346394, + "epoch": 0.27876106194690264, + "flos": 587032980480.0, + "grad_norm": 0.06237565084734976, + "language_loss": 0.85356677, + "learning_rate": 0.0008463218464982348, + "loss": 0.86443901, + "num_input_tokens_seen": 119781376, + "router_z_loss_mlp": 0.33789062, + "step": 1449, + "time_per_iteration": 2.799407720565796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086336, + "balance_loss_mlp": 1.05228984, + "epoch": 0.27895344363216623, + "flos": 875621574144.0, + "grad_norm": 0.06450477685794259, + "language_loss": 0.87800258, + "learning_rate": 0.0008460970700751645, + "loss": 0.88886595, + "num_input_tokens_seen": 119856672, + "router_z_loss_mlp": 0.34057617, + "step": 1450, + "time_per_iteration": 3.0517759323120117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086942, + "balance_loss_mlp": 1.05322921, + "epoch": 0.27914582531742976, + "flos": 603630005760.0, + "grad_norm": 0.06893143963997089, + "language_loss": 0.87761652, + "learning_rate": 0.000845872159285295, + "loss": 0.88848597, + "num_input_tokens_seen": 119929008, + "router_z_loss_mlp": 0.33740234, + "step": 1451, + "time_per_iteration": 2.6964316368103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042481, + "balance_loss_mlp": 1.0276041, + "epoch": 0.27933820700269335, + "flos": 1496892953088.0, + "grad_norm": 0.0242162718076618, + "language_loss": 0.77766848, + "learning_rate": 0.0008456471142159447, + "loss": 0.78809333, + "num_input_tokens_seen": 120164032, + "router_z_loss_mlp": 0.1484375, + "step": 1452, + "time_per_iteration": 4.936378717422485 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085289, + "balance_loss_mlp": 1.05162406, + "epoch": 0.2795305886879569, + "flos": 1031445854208.0, + "grad_norm": 0.05721806240601363, + "language_loss": 0.86067116, + "learning_rate": 0.0008454219349544836, + "loss": 0.87152404, + "num_input_tokens_seen": 120246784, + "router_z_loss_mlp": 0.33691406, + "step": 1453, + "time_per_iteration": 3.336336135864258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086797, + "balance_loss_mlp": 1.053442, + "epoch": 0.27972297037322047, + "flos": 606766934016.0, + "grad_norm": 0.056433536115445035, + "language_loss": 0.81829166, + "learning_rate": 0.000845196621588334, + "loss": 0.82915968, + "num_input_tokens_seen": 120318208, + "router_z_loss_mlp": 0.33374023, + "step": 1454, + "time_per_iteration": 2.7415192127227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088727, + "balance_loss_mlp": 1.05475271, + "epoch": 0.27991535205848406, + "flos": 630085948416.0, + "grad_norm": 0.05700257056170363, + "language_loss": 0.76605666, + "learning_rate": 0.0008449711742049706, + "loss": 0.77694392, + "num_input_tokens_seen": 120393248, + "router_z_loss_mlp": 0.33984375, + "step": 1455, + "time_per_iteration": 2.755082130432129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093567, + "balance_loss_mlp": 1.06035542, + "epoch": 0.2801077337437476, + "flos": 549034689024.0, + "grad_norm": 0.056412270826162, + "language_loss": 0.83750427, + "learning_rate": 0.0008447455928919196, + "loss": 0.84843993, + "num_input_tokens_seen": 120461040, + "router_z_loss_mlp": 0.33227539, + "step": 1456, + "time_per_iteration": 2.601306438446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097544, + "balance_loss_mlp": 1.06423688, + "epoch": 0.2803001154290112, + "flos": 486516989952.0, + "grad_norm": 0.08664389404831466, + "language_loss": 0.86875856, + "learning_rate": 0.0008445198777367595, + "loss": 0.87973404, + "num_input_tokens_seen": 120530400, + "router_z_loss_mlp": 0.33325195, + "step": 1457, + "time_per_iteration": 2.5534915924072266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106361, + "balance_loss_mlp": 1.07267249, + "epoch": 0.2804924971142747, + "flos": 521820693504.0, + "grad_norm": 0.060155581105879694, + "language_loss": 0.8096568, + "learning_rate": 0.0008442940288271208, + "loss": 0.82072043, + "num_input_tokens_seen": 120598304, + "router_z_loss_mlp": 0.33691406, + "step": 1458, + "time_per_iteration": 2.6646370887756348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108311, + "balance_loss_mlp": 1.07459903, + "epoch": 0.2806848787995383, + "flos": 527410690560.0, + "grad_norm": 0.05492641307724046, + "language_loss": 0.86995763, + "learning_rate": 0.0008440680462506856, + "loss": 0.88104069, + "num_input_tokens_seen": 120675712, + "router_z_loss_mlp": 0.33740234, + "step": 1459, + "time_per_iteration": 2.793306589126587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103222, + "balance_loss_mlp": 1.06927133, + "epoch": 0.2808772604848018, + "flos": 485246785536.0, + "grad_norm": 0.053370474172872176, + "language_loss": 0.86799729, + "learning_rate": 0.0008438419300951883, + "loss": 0.87902945, + "num_input_tokens_seen": 120746544, + "router_z_loss_mlp": 0.33984375, + "step": 1460, + "time_per_iteration": 2.6732945442199707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098403, + "balance_loss_mlp": 1.06488156, + "epoch": 0.2810696421700654, + "flos": 617840801280.0, + "grad_norm": 0.06081455520295947, + "language_loss": 0.86599934, + "learning_rate": 0.0008436156804484148, + "loss": 0.87698334, + "num_input_tokens_seen": 120823520, + "router_z_loss_mlp": 0.33544922, + "step": 1461, + "time_per_iteration": 2.768385410308838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087899, + "balance_loss_mlp": 1.05397177, + "epoch": 0.28126202385532895, + "flos": 454521922560.0, + "grad_norm": 0.061036272851527865, + "language_loss": 0.88221931, + "learning_rate": 0.0008433892973982031, + "loss": 0.89309829, + "num_input_tokens_seen": 120889568, + "router_z_loss_mlp": 0.33959961, + "step": 1462, + "time_per_iteration": 2.502269983291626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108859, + "balance_loss_mlp": 1.0546149, + "epoch": 0.28145440554059253, + "flos": 530447284224.0, + "grad_norm": 0.06533100110399645, + "language_loss": 0.85006338, + "learning_rate": 0.0008431627810324431, + "loss": 0.86094928, + "num_input_tokens_seen": 120958480, + "router_z_loss_mlp": 0.34008789, + "step": 1463, + "time_per_iteration": 2.6481637954711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097854, + "balance_loss_mlp": 1.06344974, + "epoch": 0.2816467872258561, + "flos": 451996070400.0, + "grad_norm": 0.053948569536927254, + "language_loss": 0.81259125, + "learning_rate": 0.000842936131439076, + "loss": 0.82356977, + "num_input_tokens_seen": 121028032, + "router_z_loss_mlp": 0.34423828, + "step": 1464, + "time_per_iteration": 2.598619222640991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100723, + "balance_loss_mlp": 1.06665313, + "epoch": 0.28183916891111965, + "flos": 472464755712.0, + "grad_norm": 0.06117554261618067, + "language_loss": 0.88043475, + "learning_rate": 0.0008427093487060951, + "loss": 0.89144206, + "num_input_tokens_seen": 121099280, + "router_z_loss_mlp": 0.34106445, + "step": 1465, + "time_per_iteration": 2.611067533493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092883, + "balance_loss_mlp": 1.05917072, + "epoch": 0.28203155059638324, + "flos": 556770806784.0, + "grad_norm": 0.05001896034653533, + "language_loss": 0.84742111, + "learning_rate": 0.000842482432921545, + "loss": 0.85834992, + "num_input_tokens_seen": 121180240, + "router_z_loss_mlp": 0.33740234, + "step": 1466, + "time_per_iteration": 2.8155059814453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109283, + "balance_loss_mlp": 1.05990458, + "epoch": 0.28222393228164677, + "flos": 416756385792.0, + "grad_norm": 0.06017010781955974, + "language_loss": 0.87132335, + "learning_rate": 0.0008422553841735225, + "loss": 0.88225162, + "num_input_tokens_seen": 121242736, + "router_z_loss_mlp": 0.3293457, + "step": 1467, + "time_per_iteration": 2.459348201751709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091932, + "balance_loss_mlp": 1.05731392, + "epoch": 0.28241631396691036, + "flos": 604629577728.0, + "grad_norm": 0.060074700521020694, + "language_loss": 0.84810078, + "learning_rate": 0.0008420282025501757, + "loss": 0.85902011, + "num_input_tokens_seen": 121319248, + "router_z_loss_mlp": 0.34643555, + "step": 1468, + "time_per_iteration": 2.7499678134918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086806, + "balance_loss_mlp": 1.05275989, + "epoch": 0.2826086956521739, + "flos": 572698529280.0, + "grad_norm": 0.05717030328031113, + "language_loss": 0.854882, + "learning_rate": 0.0008418008881397043, + "loss": 0.86575013, + "num_input_tokens_seen": 121392064, + "router_z_loss_mlp": 0.34082031, + "step": 1469, + "time_per_iteration": 2.6512861251831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080943, + "balance_loss_mlp": 1.04677796, + "epoch": 0.2828010773374375, + "flos": 842367886848.0, + "grad_norm": 0.05184982716140645, + "language_loss": 0.82590878, + "learning_rate": 0.0008415734410303595, + "loss": 0.8367182, + "num_input_tokens_seen": 121475984, + "router_z_loss_mlp": 0.34204102, + "step": 1470, + "time_per_iteration": 3.1787467002868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085547, + "balance_loss_mlp": 1.05214453, + "epoch": 0.28299345902270107, + "flos": 542402860032.0, + "grad_norm": 0.04590644458835405, + "language_loss": 0.90709066, + "learning_rate": 0.0008413458613104444, + "loss": 0.9179461, + "num_input_tokens_seen": 121551024, + "router_z_loss_mlp": 0.33447266, + "step": 1471, + "time_per_iteration": 2.6650707721710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092285, + "balance_loss_mlp": 1.05780995, + "epoch": 0.2831858407079646, + "flos": 571320635904.0, + "grad_norm": 0.05367648266979066, + "language_loss": 0.82737631, + "learning_rate": 0.0008411181490683129, + "loss": 0.83829916, + "num_input_tokens_seen": 121624528, + "router_z_loss_mlp": 0.34472656, + "step": 1472, + "time_per_iteration": 2.7423322200775146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104233, + "balance_loss_mlp": 1.06909025, + "epoch": 0.2833782223932282, + "flos": 763491861504.0, + "grad_norm": 0.05498194123694656, + "language_loss": 0.82467097, + "learning_rate": 0.0008408903043923707, + "loss": 0.83571333, + "num_input_tokens_seen": 121706736, + "router_z_loss_mlp": 0.35180664, + "step": 1473, + "time_per_iteration": 2.991787910461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114011, + "balance_loss_mlp": 1.07810485, + "epoch": 0.2835706040784917, + "flos": 538793068032.0, + "grad_norm": 0.05681509946110844, + "language_loss": 0.81401253, + "learning_rate": 0.0008406623273710754, + "loss": 0.82515264, + "num_input_tokens_seen": 121773008, + "router_z_loss_mlp": 0.35913086, + "step": 1474, + "time_per_iteration": 2.58866286277771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112011, + "balance_loss_mlp": 1.07732141, + "epoch": 0.2837629857637553, + "flos": 530329420800.0, + "grad_norm": 0.06008709036576614, + "language_loss": 0.82883334, + "learning_rate": 0.0008404342180929351, + "loss": 0.83995342, + "num_input_tokens_seen": 121840016, + "router_z_loss_mlp": 0.34716797, + "step": 1475, + "time_per_iteration": 2.636383295059204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112842, + "balance_loss_mlp": 1.07834268, + "epoch": 0.28395536744901884, + "flos": 539763526656.0, + "grad_norm": 0.06514959519071023, + "language_loss": 0.81725156, + "learning_rate": 0.00084020597664651, + "loss": 0.82837999, + "num_input_tokens_seen": 121915008, + "router_z_loss_mlp": 0.34521484, + "step": 1476, + "time_per_iteration": 2.7587718963623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106069, + "balance_loss_mlp": 1.0697813, + "epoch": 0.2841477491342824, + "flos": 573344510976.0, + "grad_norm": 0.0608139165355994, + "language_loss": 0.84204602, + "learning_rate": 0.0008399776031204111, + "loss": 0.85310674, + "num_input_tokens_seen": 121987456, + "router_z_loss_mlp": 0.36303711, + "step": 1477, + "time_per_iteration": 2.7376203536987305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097711, + "balance_loss_mlp": 1.06263912, + "epoch": 0.28434013081954596, + "flos": 571802264064.0, + "grad_norm": 0.06275845169868324, + "language_loss": 0.8026123, + "learning_rate": 0.0008397490976033009, + "loss": 0.81358939, + "num_input_tokens_seen": 122058720, + "router_z_loss_mlp": 0.35083008, + "step": 1478, + "time_per_iteration": 2.6391618251800537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0103776, + "balance_loss_mlp": 1.02412283, + "epoch": 0.28453251250480954, + "flos": 1552554832896.0, + "grad_norm": 0.016614249421738093, + "language_loss": 0.77879643, + "learning_rate": 0.000839520460183893, + "loss": 0.78917408, + "num_input_tokens_seen": 122285792, + "router_z_loss_mlp": 0.13671875, + "step": 1479, + "time_per_iteration": 4.730362176895142 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079858, + "balance_loss_mlp": 1.04550207, + "epoch": 0.28472489419007313, + "flos": 748720862208.0, + "grad_norm": 0.04873312803653651, + "language_loss": 0.85529596, + "learning_rate": 0.0008392916909509525, + "loss": 0.86609453, + "num_input_tokens_seen": 122366608, + "router_z_loss_mlp": 0.34399414, + "step": 1480, + "time_per_iteration": 3.0429892539978027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083924, + "balance_loss_mlp": 1.0495913, + "epoch": 0.28491727587533666, + "flos": 489914376192.0, + "grad_norm": 0.056617404906403615, + "language_loss": 0.85149431, + "learning_rate": 0.0008390627899932954, + "loss": 0.86233348, + "num_input_tokens_seen": 122435536, + "router_z_loss_mlp": 0.34375, + "step": 1481, + "time_per_iteration": 2.6355843544006348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083301, + "balance_loss_mlp": 1.04818201, + "epoch": 0.28510965756060025, + "flos": 728671196160.0, + "grad_norm": 0.06013951169928809, + "language_loss": 0.88358414, + "learning_rate": 0.000838833757399789, + "loss": 0.89441717, + "num_input_tokens_seen": 122515584, + "router_z_loss_mlp": 0.3515625, + "step": 1482, + "time_per_iteration": 2.9198856353759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088409, + "balance_loss_mlp": 1.05357623, + "epoch": 0.2853020392458638, + "flos": 551300083200.0, + "grad_norm": 0.06378715850004578, + "language_loss": 0.80512154, + "learning_rate": 0.0008386045932593515, + "loss": 0.81600565, + "num_input_tokens_seen": 122585552, + "router_z_loss_mlp": 0.34887695, + "step": 1483, + "time_per_iteration": 2.665919065475464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087283, + "balance_loss_mlp": 1.05233121, + "epoch": 0.28549442093112737, + "flos": 754456425984.0, + "grad_norm": 0.06049898751226662, + "language_loss": 0.86304945, + "learning_rate": 0.0008383752976609525, + "loss": 0.87392229, + "num_input_tokens_seen": 122658928, + "router_z_loss_mlp": 0.34960938, + "step": 1484, + "time_per_iteration": 2.9113876819610596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081674, + "balance_loss_mlp": 1.04586315, + "epoch": 0.2856868026163909, + "flos": 538311439872.0, + "grad_norm": 0.05363431349597561, + "language_loss": 0.79897112, + "learning_rate": 0.0008381458706936123, + "loss": 0.80978787, + "num_input_tokens_seen": 122729056, + "router_z_loss_mlp": 0.3581543, + "step": 1485, + "time_per_iteration": 2.715182065963745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082792, + "balance_loss_mlp": 1.0470289, + "epoch": 0.2858791843016545, + "flos": 583487207424.0, + "grad_norm": 0.055785658857036256, + "language_loss": 0.8776381, + "learning_rate": 0.0008379163124464025, + "loss": 0.888466, + "num_input_tokens_seen": 122802832, + "router_z_loss_mlp": 0.35766602, + "step": 1486, + "time_per_iteration": 2.713412284851074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083174, + "balance_loss_mlp": 1.04881775, + "epoch": 0.286071565986918, + "flos": 644503357440.0, + "grad_norm": 0.05967072286491994, + "language_loss": 0.76593089, + "learning_rate": 0.0008376866230084452, + "loss": 0.7767626, + "num_input_tokens_seen": 122881328, + "router_z_loss_mlp": 0.34399414, + "step": 1487, + "time_per_iteration": 2.812953472137451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091632, + "balance_loss_mlp": 1.05522537, + "epoch": 0.2862639476721816, + "flos": 491120561664.0, + "grad_norm": 0.06413337589788286, + "language_loss": 0.85965335, + "learning_rate": 0.000837456802468914, + "loss": 0.87056965, + "num_input_tokens_seen": 122949680, + "router_z_loss_mlp": 0.36401367, + "step": 1488, + "time_per_iteration": 2.5974318981170654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096408, + "balance_loss_mlp": 1.06050217, + "epoch": 0.2864563293574452, + "flos": 521363796480.0, + "grad_norm": 0.06049840128310572, + "language_loss": 0.85439187, + "learning_rate": 0.0008372268509170331, + "loss": 0.86535597, + "num_input_tokens_seen": 123024736, + "router_z_loss_mlp": 0.35888672, + "step": 1489, + "time_per_iteration": 2.6646056175231934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100143, + "balance_loss_mlp": 1.06478548, + "epoch": 0.2866487110427087, + "flos": 546834723840.0, + "grad_norm": 0.05582745965585505, + "language_loss": 0.84845203, + "learning_rate": 0.0008369967684420779, + "loss": 0.85945344, + "num_input_tokens_seen": 123097344, + "router_z_loss_mlp": 0.35424805, + "step": 1490, + "time_per_iteration": 2.737180471420288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094742, + "balance_loss_mlp": 1.05902719, + "epoch": 0.2868410927279723, + "flos": 481977437184.0, + "grad_norm": 0.0702351654670911, + "language_loss": 0.84684229, + "learning_rate": 0.0008367665551333736, + "loss": 0.85778964, + "num_input_tokens_seen": 123166240, + "router_z_loss_mlp": 0.35717773, + "step": 1491, + "time_per_iteration": 2.591179847717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095437, + "balance_loss_mlp": 1.05807662, + "epoch": 0.28703347441323585, + "flos": 724578365952.0, + "grad_norm": 0.0690733570245185, + "language_loss": 0.85732669, + "learning_rate": 0.0008365362110802977, + "loss": 0.86828107, + "num_input_tokens_seen": 123238160, + "router_z_loss_mlp": 0.37329102, + "step": 1492, + "time_per_iteration": 2.8586251735687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083578, + "balance_loss_mlp": 1.04733849, + "epoch": 0.28722585609849943, + "flos": 634670581248.0, + "grad_norm": 0.059898504183233336, + "language_loss": 0.82604659, + "learning_rate": 0.0008363057363722773, + "loss": 0.83688229, + "num_input_tokens_seen": 123319504, + "router_z_loss_mlp": 0.36254883, + "step": 1493, + "time_per_iteration": 2.8491427898406982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076927, + "balance_loss_mlp": 1.04264212, + "epoch": 0.28741823778376296, + "flos": 509974216704.0, + "grad_norm": 0.05796804627405179, + "language_loss": 0.84095198, + "learning_rate": 0.0008360751310987906, + "loss": 0.85172129, + "num_input_tokens_seen": 123387008, + "router_z_loss_mlp": 0.34301758, + "step": 1494, + "time_per_iteration": 2.5735158920288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077369, + "balance_loss_mlp": 1.04294157, + "epoch": 0.28761061946902655, + "flos": 603458297856.0, + "grad_norm": 0.07368534281083228, + "language_loss": 0.85552645, + "learning_rate": 0.0008358443953493666, + "loss": 0.86630011, + "num_input_tokens_seen": 123471056, + "router_z_loss_mlp": 0.34472656, + "step": 1495, + "time_per_iteration": 2.8492400646209717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078466, + "balance_loss_mlp": 1.04260767, + "epoch": 0.28780300115429014, + "flos": 406977454080.0, + "grad_norm": 0.061458136593000166, + "language_loss": 0.88553399, + "learning_rate": 0.0008356135292135851, + "loss": 0.89631861, + "num_input_tokens_seen": 123535024, + "router_z_loss_mlp": 0.35864258, + "step": 1496, + "time_per_iteration": 2.499234676361084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109055, + "balance_loss_mlp": 1.05428672, + "epoch": 0.28799538283955367, + "flos": 374726310912.0, + "grad_norm": 0.06023187099093886, + "language_loss": 0.92244387, + "learning_rate": 0.0008353825327810758, + "loss": 0.93334937, + "num_input_tokens_seen": 123596224, + "router_z_loss_mlp": 0.36230469, + "step": 1497, + "time_per_iteration": 2.4068801403045654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083025, + "balance_loss_mlp": 1.04761958, + "epoch": 0.28818776452481726, + "flos": 591645316608.0, + "grad_norm": 0.050935971597675156, + "language_loss": 0.81914794, + "learning_rate": 0.00083515140614152, + "loss": 0.82997811, + "num_input_tokens_seen": 123668640, + "router_z_loss_mlp": 0.35473633, + "step": 1498, + "time_per_iteration": 2.7172293663024902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085079, + "balance_loss_mlp": 1.05012727, + "epoch": 0.2883801462100808, + "flos": 534819511296.0, + "grad_norm": 0.055380500747477406, + "language_loss": 0.86671853, + "learning_rate": 0.0008349201493846485, + "loss": 0.87756932, + "num_input_tokens_seen": 123740816, + "router_z_loss_mlp": 0.34985352, + "step": 1499, + "time_per_iteration": 2.666877508163452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088163, + "balance_loss_mlp": 1.05268669, + "epoch": 0.2885725278953444, + "flos": 479850255360.0, + "grad_norm": 0.06392675802491345, + "language_loss": 0.89344347, + "learning_rate": 0.0008346887626002432, + "loss": 0.90432513, + "num_input_tokens_seen": 123805968, + "router_z_loss_mlp": 0.35473633, + "step": 1500, + "time_per_iteration": 2.547353744506836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081111, + "balance_loss_mlp": 1.04613519, + "epoch": 0.2887649095806079, + "flos": 463798877184.0, + "grad_norm": 0.050375470508879826, + "language_loss": 0.86195928, + "learning_rate": 0.000834457245878137, + "loss": 0.87277037, + "num_input_tokens_seen": 123876576, + "router_z_loss_mlp": 0.35009766, + "step": 1501, + "time_per_iteration": 2.6108102798461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076981, + "balance_loss_mlp": 1.04317355, + "epoch": 0.2889572912658715, + "flos": 930631527936.0, + "grad_norm": 0.05668037017333152, + "language_loss": 0.81365681, + "learning_rate": 0.000834225599308212, + "loss": 0.82442665, + "num_input_tokens_seen": 123967664, + "router_z_loss_mlp": 0.33837891, + "step": 1502, + "time_per_iteration": 3.2222447395324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085232, + "balance_loss_mlp": 1.04994583, + "epoch": 0.28914967295113503, + "flos": 569848200192.0, + "grad_norm": 0.05132223508893719, + "language_loss": 0.85018057, + "learning_rate": 0.0008339938229804016, + "loss": 0.8610329, + "num_input_tokens_seen": 124039680, + "router_z_loss_mlp": 0.35327148, + "step": 1503, + "time_per_iteration": 2.698528289794922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032992, + "balance_loss_mlp": 1.01945031, + "epoch": 0.2893420546363986, + "flos": 1485803119104.0, + "grad_norm": 0.02573157997511775, + "language_loss": 0.75434822, + "learning_rate": 0.0008337619169846895, + "loss": 0.76467812, + "num_input_tokens_seen": 124278848, + "router_z_loss_mlp": 0.13574219, + "step": 1504, + "time_per_iteration": 4.950274467468262 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083767, + "balance_loss_mlp": 1.04793239, + "epoch": 0.2895344363216622, + "flos": 469938903552.0, + "grad_norm": 0.06568085425348943, + "language_loss": 0.84119928, + "learning_rate": 0.0008335298814111094, + "loss": 0.85203701, + "num_input_tokens_seen": 124346736, + "router_z_loss_mlp": 0.35864258, + "step": 1505, + "time_per_iteration": 2.542043924331665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085418, + "balance_loss_mlp": 1.05089498, + "epoch": 0.28972681800692573, + "flos": 647909508096.0, + "grad_norm": 0.06591449016003405, + "language_loss": 0.87860626, + "learning_rate": 0.0008332977163497455, + "loss": 0.88946044, + "num_input_tokens_seen": 124420816, + "router_z_loss_mlp": 0.34570312, + "step": 1506, + "time_per_iteration": 2.810399293899536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087351, + "balance_loss_mlp": 1.05163622, + "epoch": 0.2899191996921893, + "flos": 571955033088.0, + "grad_norm": 0.054529888005095714, + "language_loss": 0.83185649, + "learning_rate": 0.0008330654218907325, + "loss": 0.84272999, + "num_input_tokens_seen": 124490480, + "router_z_loss_mlp": 0.35742188, + "step": 1507, + "time_per_iteration": 2.6968414783477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084817, + "balance_loss_mlp": 1.04981744, + "epoch": 0.29011158137745285, + "flos": 661037773824.0, + "grad_norm": 0.1280653735040032, + "language_loss": 0.81777966, + "learning_rate": 0.0008328329981242548, + "loss": 0.82862782, + "num_input_tokens_seen": 124564960, + "router_z_loss_mlp": 0.3503418, + "step": 1508, + "time_per_iteration": 2.886396884918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089679, + "balance_loss_mlp": 1.05441689, + "epoch": 0.29030396306271644, + "flos": 535933974528.0, + "grad_norm": 0.060234790533374126, + "language_loss": 0.87937206, + "learning_rate": 0.0008326004451405475, + "loss": 0.89026886, + "num_input_tokens_seen": 124637424, + "router_z_loss_mlp": 0.3527832, + "step": 1509, + "time_per_iteration": 2.7797772884368896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096033, + "balance_loss_mlp": 1.06148589, + "epoch": 0.29049634474798, + "flos": 511707110400.0, + "grad_norm": 0.05385470227261208, + "language_loss": 0.82548428, + "learning_rate": 0.0008323677630298957, + "loss": 0.83644462, + "num_input_tokens_seen": 124704832, + "router_z_loss_mlp": 0.34594727, + "step": 1510, + "time_per_iteration": 2.5542855262756348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093546, + "balance_loss_mlp": 1.05892766, + "epoch": 0.29068872643324356, + "flos": 613454017536.0, + "grad_norm": 0.05556182475666109, + "language_loss": 0.85001689, + "learning_rate": 0.0008321349518826345, + "loss": 0.86095232, + "num_input_tokens_seen": 124779600, + "router_z_loss_mlp": 0.34643555, + "step": 1511, + "time_per_iteration": 2.7849388122558594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093876, + "balance_loss_mlp": 1.05878115, + "epoch": 0.2908811081185071, + "flos": 546164011008.0, + "grad_norm": 0.07046084545113683, + "language_loss": 0.94823933, + "learning_rate": 0.0008319020117891491, + "loss": 0.95917809, + "num_input_tokens_seen": 124844128, + "router_z_loss_mlp": 0.35131836, + "step": 1512, + "time_per_iteration": 2.5936031341552734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090975, + "balance_loss_mlp": 1.05485463, + "epoch": 0.2910734898037707, + "flos": 604516096512.0, + "grad_norm": 0.06307487928884016, + "language_loss": 0.87063539, + "learning_rate": 0.0008316689428398751, + "loss": 0.88154513, + "num_input_tokens_seen": 124915376, + "router_z_loss_mlp": 0.36108398, + "step": 1513, + "time_per_iteration": 2.6774067878723145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082761, + "balance_loss_mlp": 1.04792798, + "epoch": 0.29126587148903427, + "flos": 574383370752.0, + "grad_norm": 0.043578668947666564, + "language_loss": 0.88254529, + "learning_rate": 0.0008314357451252979, + "loss": 0.89337289, + "num_input_tokens_seen": 124995504, + "router_z_loss_mlp": 0.34887695, + "step": 1514, + "time_per_iteration": 2.7561941146850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086548, + "balance_loss_mlp": 1.05092812, + "epoch": 0.2914582531742978, + "flos": 570802692096.0, + "grad_norm": 0.06240160889449628, + "language_loss": 0.87564558, + "learning_rate": 0.0008312024187359527, + "loss": 0.88651109, + "num_input_tokens_seen": 125064192, + "router_z_loss_mlp": 0.35644531, + "step": 1515, + "time_per_iteration": 2.636056900024414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084088, + "balance_loss_mlp": 1.04942179, + "epoch": 0.2916506348595614, + "flos": 730523363328.0, + "grad_norm": 0.06185972429295104, + "language_loss": 0.87361014, + "learning_rate": 0.000830968963762425, + "loss": 0.88445103, + "num_input_tokens_seen": 125150560, + "router_z_loss_mlp": 0.34716797, + "step": 1516, + "time_per_iteration": 3.021732807159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091474, + "balance_loss_mlp": 1.05528235, + "epoch": 0.2918430165448249, + "flos": 510220118016.0, + "grad_norm": 0.05583453201751925, + "language_loss": 0.83947027, + "learning_rate": 0.0008307353802953497, + "loss": 0.85038507, + "num_input_tokens_seen": 125219264, + "router_z_loss_mlp": 0.36206055, + "step": 1517, + "time_per_iteration": 2.6659955978393555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100836, + "balance_loss_mlp": 1.06318951, + "epoch": 0.2920353982300885, + "flos": 630096122880.0, + "grad_norm": 0.04472517729516854, + "language_loss": 0.86110896, + "learning_rate": 0.0008305016684254125, + "loss": 0.87211728, + "num_input_tokens_seen": 125301904, + "router_z_loss_mlp": 0.37646484, + "step": 1518, + "time_per_iteration": 2.7896409034729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098986, + "balance_loss_mlp": 1.06153059, + "epoch": 0.29222777991535204, + "flos": 501411644928.0, + "grad_norm": 0.055409034097420505, + "language_loss": 0.86932153, + "learning_rate": 0.0008302678282433479, + "loss": 0.88031137, + "num_input_tokens_seen": 125367712, + "router_z_loss_mlp": 0.37451172, + "step": 1519, + "time_per_iteration": 2.585256814956665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095676, + "balance_loss_mlp": 1.05891216, + "epoch": 0.2924201616006156, + "flos": 486522782208.0, + "grad_norm": 0.057505891705300044, + "language_loss": 0.85011005, + "learning_rate": 0.0008300338598399411, + "loss": 0.86106682, + "num_input_tokens_seen": 125437648, + "router_z_loss_mlp": 0.36791992, + "step": 1520, + "time_per_iteration": 2.6471352577209473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109825, + "balance_loss_mlp": 1.07210708, + "epoch": 0.2926125432858792, + "flos": 476211350016.0, + "grad_norm": 0.05302442020038178, + "language_loss": 0.9456166, + "learning_rate": 0.0008297997633060263, + "loss": 0.95671487, + "num_input_tokens_seen": 125502432, + "router_z_loss_mlp": 0.37719727, + "step": 1521, + "time_per_iteration": 2.547457695007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106499, + "balance_loss_mlp": 1.06987774, + "epoch": 0.29280492497114274, + "flos": 676379151360.0, + "grad_norm": 0.054888704647412474, + "language_loss": 0.85310549, + "learning_rate": 0.0008295655387324883, + "loss": 0.86417043, + "num_input_tokens_seen": 125575424, + "router_z_loss_mlp": 0.36645508, + "step": 1522, + "time_per_iteration": 2.822557210922241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105716, + "balance_loss_mlp": 1.0686183, + "epoch": 0.29299730665640633, + "flos": 458175384576.0, + "grad_norm": 0.055715580232585875, + "language_loss": 0.85025144, + "learning_rate": 0.0008293311862102609, + "loss": 0.86130863, + "num_input_tokens_seen": 125639040, + "router_z_loss_mlp": 0.37084961, + "step": 1523, + "time_per_iteration": 2.5033791065216064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103076, + "balance_loss_mlp": 1.06795669, + "epoch": 0.29318968834166986, + "flos": 446343464448.0, + "grad_norm": 0.0584953499596263, + "language_loss": 0.88722956, + "learning_rate": 0.0008290967058303275, + "loss": 0.89826035, + "num_input_tokens_seen": 125701712, + "router_z_loss_mlp": 0.3515625, + "step": 1524, + "time_per_iteration": 2.5981156826019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102477, + "balance_loss_mlp": 1.06630898, + "epoch": 0.29338207002693345, + "flos": 450085676544.0, + "grad_norm": 0.05072610752657829, + "language_loss": 0.86932707, + "learning_rate": 0.0008288620976837219, + "loss": 0.88035178, + "num_input_tokens_seen": 125765088, + "router_z_loss_mlp": 0.36181641, + "step": 1525, + "time_per_iteration": 2.522019863128662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092716, + "balance_loss_mlp": 1.05623853, + "epoch": 0.293574451712197, + "flos": 502027103232.0, + "grad_norm": 0.05230718040210392, + "language_loss": 0.83001733, + "learning_rate": 0.000828627361861527, + "loss": 0.84094453, + "num_input_tokens_seen": 125831328, + "router_z_loss_mlp": 0.36474609, + "step": 1526, + "time_per_iteration": 2.559201955795288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085779, + "balance_loss_mlp": 1.05051661, + "epoch": 0.29376683339746057, + "flos": 696158184960.0, + "grad_norm": 0.071892180297548, + "language_loss": 0.8465147, + "learning_rate": 0.0008283924984548752, + "loss": 0.85737246, + "num_input_tokens_seen": 125903664, + "router_z_loss_mlp": 0.3527832, + "step": 1527, + "time_per_iteration": 2.8108816146850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079347, + "balance_loss_mlp": 1.04494321, + "epoch": 0.2939592150827241, + "flos": 478353088512.0, + "grad_norm": 0.05128355551395112, + "language_loss": 0.85087478, + "learning_rate": 0.0008281575075549485, + "loss": 0.86166823, + "num_input_tokens_seen": 125971856, + "router_z_loss_mlp": 0.34399414, + "step": 1528, + "time_per_iteration": 2.576814889907837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051452, + "balance_loss_mlp": 1.04057992, + "epoch": 0.2941515967679877, + "flos": 1484482042368.0, + "grad_norm": 0.031732851839211505, + "language_loss": 0.77352691, + "learning_rate": 0.000827922389252979, + "loss": 0.7840414, + "num_input_tokens_seen": 126183968, + "router_z_loss_mlp": 0.10888672, + "step": 1529, + "time_per_iteration": 4.662023067474365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089321, + "balance_loss_mlp": 1.0547502, + "epoch": 0.2943439784532513, + "flos": 673848916992.0, + "grad_norm": 0.06453398347295829, + "language_loss": 0.90086716, + "learning_rate": 0.0008276871436402469, + "loss": 0.91176039, + "num_input_tokens_seen": 126254448, + "router_z_loss_mlp": 0.34619141, + "step": 1530, + "time_per_iteration": 2.795783758163452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097742, + "balance_loss_mlp": 1.06460166, + "epoch": 0.2945363601385148, + "flos": 576031896576.0, + "grad_norm": 0.05195467848041957, + "language_loss": 0.87790835, + "learning_rate": 0.000827451770808083, + "loss": 0.88888574, + "num_input_tokens_seen": 126328208, + "router_z_loss_mlp": 0.33154297, + "step": 1531, + "time_per_iteration": 2.6522414684295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100953, + "balance_loss_mlp": 1.06628692, + "epoch": 0.2947287418237784, + "flos": 480416251392.0, + "grad_norm": 0.05572078055736918, + "language_loss": 0.83276248, + "learning_rate": 0.0008272162708478674, + "loss": 0.84377199, + "num_input_tokens_seen": 126396464, + "router_z_loss_mlp": 0.34692383, + "step": 1532, + "time_per_iteration": 2.5960874557495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098518, + "balance_loss_mlp": 1.06459141, + "epoch": 0.2949211235090419, + "flos": 557917355520.0, + "grad_norm": 0.05232404820193651, + "language_loss": 0.86136103, + "learning_rate": 0.000826980643851029, + "loss": 0.87234622, + "num_input_tokens_seen": 126468960, + "router_z_loss_mlp": 0.33959961, + "step": 1533, + "time_per_iteration": 2.671867609024048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106353, + "balance_loss_mlp": 1.07147205, + "epoch": 0.2951135051943055, + "flos": 483646311936.0, + "grad_norm": 0.06650262295584625, + "language_loss": 0.84864676, + "learning_rate": 0.0008267448899090464, + "loss": 0.85971034, + "num_input_tokens_seen": 126536496, + "router_z_loss_mlp": 0.34887695, + "step": 1534, + "time_per_iteration": 2.5133543014526367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095921, + "balance_loss_mlp": 1.0604682, + "epoch": 0.29530588687956905, + "flos": 550015322112.0, + "grad_norm": 0.05711998360561463, + "language_loss": 0.80980158, + "learning_rate": 0.0008265090091134473, + "loss": 0.82076073, + "num_input_tokens_seen": 126614048, + "router_z_loss_mlp": 0.35473633, + "step": 1535, + "time_per_iteration": 2.8528778553009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085228, + "balance_loss_mlp": 1.05072904, + "epoch": 0.29549826856483263, + "flos": 672731481600.0, + "grad_norm": 0.047870597747086484, + "language_loss": 0.80150926, + "learning_rate": 0.0008262730015558088, + "loss": 0.8123616, + "num_input_tokens_seen": 126697248, + "router_z_loss_mlp": 0.34521484, + "step": 1536, + "time_per_iteration": 2.8849382400512695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076376, + "balance_loss_mlp": 1.04192495, + "epoch": 0.29569065025009617, + "flos": 764300786688.0, + "grad_norm": 0.06331525049863725, + "language_loss": 0.82269859, + "learning_rate": 0.0008260368673277574, + "loss": 0.8334623, + "num_input_tokens_seen": 126782496, + "router_z_loss_mlp": 0.34472656, + "step": 1537, + "time_per_iteration": 3.12172269821167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078873, + "balance_loss_mlp": 1.04480314, + "epoch": 0.29588303193535975, + "flos": 543398049792.0, + "grad_norm": 0.05107262607685598, + "language_loss": 0.84019077, + "learning_rate": 0.0008258006065209682, + "loss": 0.85097957, + "num_input_tokens_seen": 126857328, + "router_z_loss_mlp": 0.34106445, + "step": 1538, + "time_per_iteration": 2.7388381958007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082306, + "balance_loss_mlp": 1.04744971, + "epoch": 0.29607541362062334, + "flos": 596648968704.0, + "grad_norm": 0.06469434822608365, + "language_loss": 0.80634302, + "learning_rate": 0.0008255642192271657, + "loss": 0.81716609, + "num_input_tokens_seen": 126932608, + "router_z_loss_mlp": 0.34863281, + "step": 1539, + "time_per_iteration": 2.7957324981689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082434, + "balance_loss_mlp": 1.04774427, + "epoch": 0.29626779530588687, + "flos": 609588149760.0, + "grad_norm": 0.06097977692176942, + "language_loss": 0.83830953, + "learning_rate": 0.0008253277055381241, + "loss": 0.84913385, + "num_input_tokens_seen": 127008928, + "router_z_loss_mlp": 0.34741211, + "step": 1540, + "time_per_iteration": 2.8428521156311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085866, + "balance_loss_mlp": 1.05146217, + "epoch": 0.29646017699115046, + "flos": 867050237952.0, + "grad_norm": 0.06407432486539091, + "language_loss": 0.8580029, + "learning_rate": 0.0008250910655456658, + "loss": 0.8688615, + "num_input_tokens_seen": 127097104, + "router_z_loss_mlp": 0.34448242, + "step": 1541, + "time_per_iteration": 3.1741185188293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081587, + "balance_loss_mlp": 1.04696846, + "epoch": 0.296652558676414, + "flos": 495616444416.0, + "grad_norm": 0.06683547404256097, + "language_loss": 0.83703458, + "learning_rate": 0.0008248542993416625, + "loss": 0.84785044, + "num_input_tokens_seen": 127165264, + "router_z_loss_mlp": 0.34643555, + "step": 1542, + "time_per_iteration": 2.5634429454803467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083482, + "balance_loss_mlp": 1.04914963, + "epoch": 0.2968449403616776, + "flos": 571275555840.0, + "grad_norm": 0.054805025189504364, + "language_loss": 0.83645159, + "learning_rate": 0.0008246174070180352, + "loss": 0.84728634, + "num_input_tokens_seen": 127238992, + "router_z_loss_mlp": 0.34375, + "step": 1543, + "time_per_iteration": 2.664029121398926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108698, + "balance_loss_mlp": 1.05226684, + "epoch": 0.2970373220469411, + "flos": 793799115264.0, + "grad_norm": 0.06369286414713611, + "language_loss": 0.84087443, + "learning_rate": 0.0008243803886667537, + "loss": 0.85174423, + "num_input_tokens_seen": 127328160, + "router_z_loss_mlp": 0.34765625, + "step": 1544, + "time_per_iteration": 3.129185199737549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082817, + "balance_loss_mlp": 1.04793644, + "epoch": 0.2972297037322047, + "flos": 660736617984.0, + "grad_norm": 0.0569938777400986, + "language_loss": 0.79051471, + "learning_rate": 0.0008241432443798364, + "loss": 0.80134284, + "num_input_tokens_seen": 127407328, + "router_z_loss_mlp": 0.34936523, + "step": 1545, + "time_per_iteration": 2.7968478202819824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076425, + "balance_loss_mlp": 1.04225969, + "epoch": 0.29742208541746823, + "flos": 596849789952.0, + "grad_norm": 0.05185676674228935, + "language_loss": 0.85634965, + "learning_rate": 0.0008239059742493512, + "loss": 0.86711389, + "num_input_tokens_seen": 127477136, + "router_z_loss_mlp": 0.34204102, + "step": 1546, + "time_per_iteration": 2.730803966522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079044, + "balance_loss_mlp": 1.0448308, + "epoch": 0.2976144671027318, + "flos": 769519816704.0, + "grad_norm": 0.049935350225070424, + "language_loss": 0.87424839, + "learning_rate": 0.0008236685783674142, + "loss": 0.88503873, + "num_input_tokens_seen": 127565680, + "router_z_loss_mlp": 0.34228516, + "step": 1547, + "time_per_iteration": 3.0735998153686523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060573, + "balance_loss_mlp": 1.04903388, + "epoch": 0.2978068487879954, + "flos": 1483980065280.0, + "grad_norm": 0.022808758650826922, + "language_loss": 0.76221192, + "learning_rate": 0.0008234310568261911, + "loss": 0.77281767, + "num_input_tokens_seen": 127791584, + "router_z_loss_mlp": 0.11523438, + "step": 1548, + "time_per_iteration": 4.902673959732056 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088892, + "balance_loss_mlp": 1.05460715, + "epoch": 0.29799923047325894, + "flos": 475079357952.0, + "grad_norm": 0.07696298762455249, + "language_loss": 0.82568306, + "learning_rate": 0.0008231934097178955, + "loss": 0.83657193, + "num_input_tokens_seen": 127860112, + "router_z_loss_mlp": 0.34326172, + "step": 1549, + "time_per_iteration": 2.59600567817688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087503, + "balance_loss_mlp": 1.05173981, + "epoch": 0.2981916121585225, + "flos": 759464460288.0, + "grad_norm": 0.05308820200633048, + "language_loss": 0.8525809, + "learning_rate": 0.0008229556371347903, + "loss": 0.86345589, + "num_input_tokens_seen": 127938752, + "router_z_loss_mlp": 0.35791016, + "step": 1550, + "time_per_iteration": 2.955467939376831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080247, + "balance_loss_mlp": 1.04593909, + "epoch": 0.29838399384378606, + "flos": 874642351104.0, + "grad_norm": 0.058723621398699785, + "language_loss": 0.79088616, + "learning_rate": 0.0008227177391691874, + "loss": 0.80168855, + "num_input_tokens_seen": 128022192, + "router_z_loss_mlp": 0.34350586, + "step": 1551, + "time_per_iteration": 3.1204521656036377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084967, + "balance_loss_mlp": 1.05001473, + "epoch": 0.29857637552904964, + "flos": 579389995008.0, + "grad_norm": 0.060980844602782615, + "language_loss": 0.89576113, + "learning_rate": 0.0008224797159134463, + "loss": 0.90661073, + "num_input_tokens_seen": 128097776, + "router_z_loss_mlp": 0.34985352, + "step": 1552, + "time_per_iteration": 2.7535347938537598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075057, + "balance_loss_mlp": 1.04132128, + "epoch": 0.2987687572143132, + "flos": 836048950272.0, + "grad_norm": 0.05791718796165568, + "language_loss": 0.83571118, + "learning_rate": 0.0008222415674599765, + "loss": 0.84646177, + "num_input_tokens_seen": 128179888, + "router_z_loss_mlp": 0.33764648, + "step": 1553, + "time_per_iteration": 3.0609707832336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077849, + "balance_loss_mlp": 1.0417521, + "epoch": 0.29896113889957676, + "flos": 566800022016.0, + "grad_norm": 0.05477323920870417, + "language_loss": 0.83255476, + "learning_rate": 0.0008220032939012349, + "loss": 0.84333324, + "num_input_tokens_seen": 128251152, + "router_z_loss_mlp": 0.36108398, + "step": 1554, + "time_per_iteration": 2.6683521270751953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077555, + "balance_loss_mlp": 1.04310393, + "epoch": 0.29915352058484035, + "flos": 498370669056.0, + "grad_norm": 0.049159177960894807, + "language_loss": 0.87956095, + "learning_rate": 0.0008217648953297277, + "loss": 0.89033645, + "num_input_tokens_seen": 128327600, + "router_z_loss_mlp": 0.34472656, + "step": 1555, + "time_per_iteration": 2.82114315032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109245, + "balance_loss_mlp": 1.05711639, + "epoch": 0.2993459022701039, + "flos": 591837373440.0, + "grad_norm": 0.06210935096260163, + "language_loss": 0.7799179, + "learning_rate": 0.0008215263718380095, + "loss": 0.79084241, + "num_input_tokens_seen": 128398432, + "router_z_loss_mlp": 0.35327148, + "step": 1556, + "time_per_iteration": 2.6806485652923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104132, + "balance_loss_mlp": 1.06815481, + "epoch": 0.29953828395536747, + "flos": 572107802112.0, + "grad_norm": 0.051501670996139066, + "language_loss": 0.8437115, + "learning_rate": 0.0008212877235186833, + "loss": 0.8547529, + "num_input_tokens_seen": 128469696, + "router_z_loss_mlp": 0.36010742, + "step": 1557, + "time_per_iteration": 2.706531286239624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075275, + "balance_loss_mlp": 1.06321061, + "epoch": 0.299730665640631, + "flos": 1503855051264.0, + "grad_norm": 0.03618665962020262, + "language_loss": 0.77737558, + "learning_rate": 0.0008210489504644005, + "loss": 0.78812838, + "num_input_tokens_seen": 128698560, + "router_z_loss_mlp": 0.12060547, + "step": 1558, + "time_per_iteration": 4.914030313491821 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102225, + "balance_loss_mlp": 1.06641483, + "epoch": 0.2999230473258946, + "flos": 513538928640.0, + "grad_norm": 0.06717328469529676, + "language_loss": 0.80777293, + "learning_rate": 0.0008208100527678611, + "loss": 0.8187952, + "num_input_tokens_seen": 128765952, + "router_z_loss_mlp": 0.3581543, + "step": 1559, + "time_per_iteration": 2.5862650871276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097511, + "balance_loss_mlp": 1.06294012, + "epoch": 0.3001154290111581, + "flos": 834128381952.0, + "grad_norm": 0.05731533213860712, + "language_loss": 0.78337204, + "learning_rate": 0.0008205710305218135, + "loss": 0.79434717, + "num_input_tokens_seen": 128840048, + "router_z_loss_mlp": 0.34594727, + "step": 1560, + "time_per_iteration": 3.00581693649292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094947, + "balance_loss_mlp": 1.06149733, + "epoch": 0.3003078106964217, + "flos": 556485617664.0, + "grad_norm": 0.051151635719759364, + "language_loss": 0.89917201, + "learning_rate": 0.0008203318838190541, + "loss": 0.9101215, + "num_input_tokens_seen": 128912496, + "router_z_loss_mlp": 0.3347168, + "step": 1561, + "time_per_iteration": 2.730187177658081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087932, + "balance_loss_mlp": 1.05345702, + "epoch": 0.30050019238168524, + "flos": 525897556992.0, + "grad_norm": 0.07455466191279551, + "language_loss": 0.85053575, + "learning_rate": 0.0008200926127524281, + "loss": 0.86141509, + "num_input_tokens_seen": 128980624, + "router_z_loss_mlp": 0.34521484, + "step": 1562, + "time_per_iteration": 2.6252634525299072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082824, + "balance_loss_mlp": 1.04837239, + "epoch": 0.3006925740669488, + "flos": 577582907904.0, + "grad_norm": 0.08578868432126639, + "language_loss": 0.83193934, + "learning_rate": 0.0008198532174148289, + "loss": 0.84276754, + "num_input_tokens_seen": 129050576, + "router_z_loss_mlp": 0.3449707, + "step": 1563, + "time_per_iteration": 2.784132957458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010668, + "balance_loss_mlp": 0.99912882, + "epoch": 0.3008849557522124, + "flos": 1489408528896.0, + "grad_norm": 0.006418694176289122, + "language_loss": 0.8068617, + "learning_rate": 0.0008196136978991977, + "loss": 0.81696838, + "num_input_tokens_seen": 129278880, + "router_z_loss_mlp": 0.11523438, + "step": 1564, + "time_per_iteration": 4.826026678085327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079569, + "balance_loss_mlp": 1.04607105, + "epoch": 0.30107733743747594, + "flos": 509565371904.0, + "grad_norm": 0.057361266050022765, + "language_loss": 0.88701093, + "learning_rate": 0.0008193740542985244, + "loss": 0.89780664, + "num_input_tokens_seen": 129346560, + "router_z_loss_mlp": 0.33520508, + "step": 1565, + "time_per_iteration": 2.5685722827911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082334, + "balance_loss_mlp": 1.04881263, + "epoch": 0.30126971912273953, + "flos": 587425858560.0, + "grad_norm": 0.055549771382925904, + "language_loss": 0.86598676, + "learning_rate": 0.0008191342867058467, + "loss": 0.87681007, + "num_input_tokens_seen": 129420448, + "router_z_loss_mlp": 0.33520508, + "step": 1566, + "time_per_iteration": 2.7413527965545654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088116, + "balance_loss_mlp": 1.05423677, + "epoch": 0.30146210080800306, + "flos": 601822918656.0, + "grad_norm": 0.054174391750340056, + "language_loss": 0.83411789, + "learning_rate": 0.0008188943952142509, + "loss": 0.84499902, + "num_input_tokens_seen": 129494032, + "router_z_loss_mlp": 0.33911133, + "step": 1567, + "time_per_iteration": 2.816777229309082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098385, + "balance_loss_mlp": 1.06376624, + "epoch": 0.30165448249326665, + "flos": 917424686592.0, + "grad_norm": 0.057308899380469513, + "language_loss": 0.81973398, + "learning_rate": 0.0008186543799168711, + "loss": 0.8307178, + "num_input_tokens_seen": 129569088, + "router_z_loss_mlp": 0.34643555, + "step": 1568, + "time_per_iteration": 3.138439655303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094575, + "balance_loss_mlp": 1.060076, + "epoch": 0.3018468641785302, + "flos": 776953368576.0, + "grad_norm": 0.06314470525088989, + "language_loss": 0.88671768, + "learning_rate": 0.0008184142409068892, + "loss": 0.89766341, + "num_input_tokens_seen": 129647968, + "router_z_loss_mlp": 0.34545898, + "step": 1569, + "time_per_iteration": 3.0061678886413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104914, + "balance_loss_mlp": 1.07134473, + "epoch": 0.30203924586379377, + "flos": 522101500416.0, + "grad_norm": 0.05000282823150535, + "language_loss": 0.86630476, + "learning_rate": 0.000818173978277536, + "loss": 0.87735385, + "num_input_tokens_seen": 129718928, + "router_z_loss_mlp": 0.3359375, + "step": 1570, + "time_per_iteration": 2.7171432971954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101367, + "balance_loss_mlp": 1.06779718, + "epoch": 0.3022316275490573, + "flos": 524288318976.0, + "grad_norm": 0.052630401262377564, + "language_loss": 0.83781934, + "learning_rate": 0.000817933592122089, + "loss": 0.84883296, + "num_input_tokens_seen": 129790128, + "router_z_loss_mlp": 0.3359375, + "step": 1571, + "time_per_iteration": 2.7346580028533936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105149, + "balance_loss_mlp": 1.0710789, + "epoch": 0.3024240092343209, + "flos": 479672755200.0, + "grad_norm": 0.05357670269103591, + "language_loss": 0.83451247, + "learning_rate": 0.0008176930825338749, + "loss": 0.84556395, + "num_input_tokens_seen": 129857536, + "router_z_loss_mlp": 0.34106445, + "step": 1572, + "time_per_iteration": 2.5449459552764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095582, + "balance_loss_mlp": 1.06110692, + "epoch": 0.3026163909195845, + "flos": 686901579264.0, + "grad_norm": 0.06283664606524127, + "language_loss": 0.8873198, + "learning_rate": 0.0008174524496062679, + "loss": 0.89827561, + "num_input_tokens_seen": 129931440, + "router_z_loss_mlp": 0.3449707, + "step": 1573, + "time_per_iteration": 2.8826043605804443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108869, + "balance_loss_mlp": 1.05380964, + "epoch": 0.302808772604848, + "flos": 542654553600.0, + "grad_norm": 0.05929060654319276, + "language_loss": 0.85444844, + "learning_rate": 0.0008172116934326894, + "loss": 0.86533535, + "num_input_tokens_seen": 130005200, + "router_z_loss_mlp": 0.34912109, + "step": 1574, + "time_per_iteration": 2.7539572715759277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088334, + "balance_loss_mlp": 1.05435979, + "epoch": 0.3030011542901116, + "flos": 474852395520.0, + "grad_norm": 0.051325587648683973, + "language_loss": 0.87683225, + "learning_rate": 0.0008169708141066097, + "loss": 0.88771558, + "num_input_tokens_seen": 130069136, + "router_z_loss_mlp": 0.34008789, + "step": 1575, + "time_per_iteration": 2.5635225772857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086472, + "balance_loss_mlp": 1.05199683, + "epoch": 0.30319353597537513, + "flos": 481233940992.0, + "grad_norm": 0.06106098638193731, + "language_loss": 0.90820259, + "learning_rate": 0.0008167298117215465, + "loss": 0.91906732, + "num_input_tokens_seen": 130135456, + "router_z_loss_mlp": 0.3449707, + "step": 1576, + "time_per_iteration": 2.5388035774230957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087257, + "balance_loss_mlp": 1.05242443, + "epoch": 0.3033859176606387, + "flos": 704455916544.0, + "grad_norm": 0.06728579874610481, + "language_loss": 0.88300574, + "learning_rate": 0.0008164886863710649, + "loss": 0.89387834, + "num_input_tokens_seen": 130213712, + "router_z_loss_mlp": 0.34887695, + "step": 1577, + "time_per_iteration": 2.8935675621032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088541, + "balance_loss_mlp": 1.05554342, + "epoch": 0.30357829934590225, + "flos": 764344456704.0, + "grad_norm": 0.04642698643554312, + "language_loss": 0.86113924, + "learning_rate": 0.0008162474381487783, + "loss": 0.87202466, + "num_input_tokens_seen": 130290928, + "router_z_loss_mlp": 0.33007812, + "step": 1578, + "time_per_iteration": 3.0151257514953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088208, + "balance_loss_mlp": 1.05401909, + "epoch": 0.30377068103116583, + "flos": 532082663424.0, + "grad_norm": 0.05691489249418783, + "language_loss": 0.84894794, + "learning_rate": 0.0008160060671483475, + "loss": 0.85983002, + "num_input_tokens_seen": 130362672, + "router_z_loss_mlp": 0.34228516, + "step": 1579, + "time_per_iteration": 2.6575984954833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097191, + "balance_loss_mlp": 1.06338358, + "epoch": 0.3039630627164294, + "flos": 509934928896.0, + "grad_norm": 0.07240450604386858, + "language_loss": 0.83520651, + "learning_rate": 0.0008157645734634809, + "loss": 0.84617841, + "num_input_tokens_seen": 130428848, + "router_z_loss_mlp": 0.33837891, + "step": 1580, + "time_per_iteration": 2.5869438648223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061274, + "balance_loss_mlp": 1.04992568, + "epoch": 0.30415544440169295, + "flos": 1505206803456.0, + "grad_norm": 0.030998653754179664, + "language_loss": 0.76896489, + "learning_rate": 0.000815522957187935, + "loss": 0.77957761, + "num_input_tokens_seen": 130665440, + "router_z_loss_mlp": 0.11328125, + "step": 1581, + "time_per_iteration": 4.907174348831177 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01027749, + "balance_loss_mlp": 1.01606703, + "epoch": 0.30434782608695654, + "flos": 1457976637440.0, + "grad_norm": 0.012214555664928241, + "language_loss": 0.73214495, + "learning_rate": 0.0008152812184155132, + "loss": 0.74242246, + "num_input_tokens_seen": 130895248, + "router_z_loss_mlp": 0.11669922, + "step": 1582, + "time_per_iteration": 4.8717710971832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102415, + "balance_loss_mlp": 1.0685122, + "epoch": 0.3045402077722201, + "flos": 482312088576.0, + "grad_norm": 0.05813255519406619, + "language_loss": 0.83851862, + "learning_rate": 0.000815039357240067, + "loss": 0.84954274, + "num_input_tokens_seen": 130964544, + "router_z_loss_mlp": 0.33935547, + "step": 1583, + "time_per_iteration": 2.640148401260376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102551, + "balance_loss_mlp": 1.06879056, + "epoch": 0.30473258945748366, + "flos": 543220549632.0, + "grad_norm": 0.06312099992380371, + "language_loss": 0.85312426, + "learning_rate": 0.0008147973737554952, + "loss": 0.86414981, + "num_input_tokens_seen": 131041744, + "router_z_loss_mlp": 0.33789062, + "step": 1584, + "time_per_iteration": 2.772367000579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098103, + "balance_loss_mlp": 1.06443787, + "epoch": 0.3049249711427472, + "flos": 566789847552.0, + "grad_norm": 0.054268296030043885, + "language_loss": 0.85613728, + "learning_rate": 0.000814555268055744, + "loss": 0.86711836, + "num_input_tokens_seen": 131108864, + "router_z_loss_mlp": 0.33691406, + "step": 1585, + "time_per_iteration": 2.616687536239624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109794, + "balance_loss_mlp": 1.06441879, + "epoch": 0.3051173528280108, + "flos": 527970894336.0, + "grad_norm": 0.05527556644311566, + "language_loss": 0.87648201, + "learning_rate": 0.0008143130402348073, + "loss": 0.88746148, + "num_input_tokens_seen": 131181104, + "router_z_loss_mlp": 0.33544922, + "step": 1586, + "time_per_iteration": 2.635103940963745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094771, + "balance_loss_mlp": 1.06141627, + "epoch": 0.3053097345132743, + "flos": 586097427456.0, + "grad_norm": 0.052385807505719764, + "language_loss": 0.79520649, + "learning_rate": 0.0008140706903867265, + "loss": 0.80615419, + "num_input_tokens_seen": 131258704, + "router_z_loss_mlp": 0.33349609, + "step": 1587, + "time_per_iteration": 2.7922940254211426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087687, + "balance_loss_mlp": 1.05263984, + "epoch": 0.3055021161985379, + "flos": 606810604032.0, + "grad_norm": 0.054380951058352583, + "language_loss": 0.90043247, + "learning_rate": 0.0008138282186055897, + "loss": 0.9113093, + "num_input_tokens_seen": 131325712, + "router_z_loss_mlp": 0.35058594, + "step": 1588, + "time_per_iteration": 2.683783769607544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085545, + "balance_loss_mlp": 1.05290556, + "epoch": 0.3056944978838015, + "flos": 573594794496.0, + "grad_norm": 0.05235756550943364, + "language_loss": 0.8193745, + "learning_rate": 0.0008135856249855331, + "loss": 0.83023, + "num_input_tokens_seen": 131397568, + "router_z_loss_mlp": 0.32641602, + "step": 1589, + "time_per_iteration": 2.6717309951782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081523, + "balance_loss_mlp": 1.04900289, + "epoch": 0.305886879569065, + "flos": 633640485888.0, + "grad_norm": 0.06284243799371535, + "language_loss": 0.89691997, + "learning_rate": 0.0008133429096207398, + "loss": 0.90773523, + "num_input_tokens_seen": 131467632, + "router_z_loss_mlp": 0.32519531, + "step": 1590, + "time_per_iteration": 2.757962465286255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01037916, + "balance_loss_mlp": 1.02561319, + "epoch": 0.3060792612543286, + "flos": 1368227414016.0, + "grad_norm": 0.023218914608202516, + "language_loss": 0.75312257, + "learning_rate": 0.0008131000726054403, + "loss": 0.76350176, + "num_input_tokens_seen": 131702224, + "router_z_loss_mlp": 0.12304688, + "step": 1591, + "time_per_iteration": 4.927033185958862 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078048, + "balance_loss_mlp": 1.0450511, + "epoch": 0.30627164293959214, + "flos": 518290887168.0, + "grad_norm": 0.05132667013942606, + "language_loss": 0.86601979, + "learning_rate": 0.0008128571140339123, + "loss": 0.87680024, + "num_input_tokens_seen": 131774608, + "router_z_loss_mlp": 0.33007812, + "step": 1592, + "time_per_iteration": 2.627272367477417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075862, + "balance_loss_mlp": 1.0423162, + "epoch": 0.3064640246248557, + "flos": 455354168832.0, + "grad_norm": 0.054345541641725725, + "language_loss": 0.87405455, + "learning_rate": 0.0008126140340004805, + "loss": 0.88481319, + "num_input_tokens_seen": 131841216, + "router_z_loss_mlp": 0.33569336, + "step": 1593, + "time_per_iteration": 2.5047686100006104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076578, + "balance_loss_mlp": 1.04355717, + "epoch": 0.30665640631011926, + "flos": 849718480896.0, + "grad_norm": 0.04925367115496714, + "language_loss": 0.82262254, + "learning_rate": 0.0008123708325995172, + "loss": 0.83338827, + "num_input_tokens_seen": 131937584, + "router_z_loss_mlp": 0.33032227, + "step": 1594, + "time_per_iteration": 3.1693196296691895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107923, + "balance_loss_mlp": 1.04582715, + "epoch": 0.30684878799538284, + "flos": 757996406784.0, + "grad_norm": 0.04977841797679214, + "language_loss": 0.79901791, + "learning_rate": 0.0008121275099254414, + "loss": 0.80981016, + "num_input_tokens_seen": 132012656, + "router_z_loss_mlp": 0.33422852, + "step": 1595, + "time_per_iteration": 2.9197185039520264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108089, + "balance_loss_mlp": 1.04758275, + "epoch": 0.3070411696806464, + "flos": 517320428544.0, + "grad_norm": 0.05488318824662342, + "language_loss": 0.88300943, + "learning_rate": 0.0008118840660727194, + "loss": 0.89381832, + "num_input_tokens_seen": 132083728, + "router_z_loss_mlp": 0.33325195, + "step": 1596, + "time_per_iteration": 2.6452150344848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079954, + "balance_loss_mlp": 1.04788685, + "epoch": 0.30723355136590996, + "flos": 843883992576.0, + "grad_norm": 0.05612425557740203, + "language_loss": 0.87403214, + "learning_rate": 0.0008116405011358644, + "loss": 0.88483167, + "num_input_tokens_seen": 132170896, + "router_z_loss_mlp": 0.32055664, + "step": 1597, + "time_per_iteration": 3.135666608810425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084185, + "balance_loss_mlp": 1.05092525, + "epoch": 0.30742593305117355, + "flos": 465905710080.0, + "grad_norm": 0.05391343675647517, + "language_loss": 0.80005342, + "learning_rate": 0.0008113968152094369, + "loss": 0.81089526, + "num_input_tokens_seen": 132234592, + "router_z_loss_mlp": 0.33276367, + "step": 1598, + "time_per_iteration": 2.5122313499450684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076054, + "balance_loss_mlp": 1.04331923, + "epoch": 0.3076183147364371, + "flos": 686286120960.0, + "grad_norm": 0.04979397496165333, + "language_loss": 0.82305032, + "learning_rate": 0.0008111530083880438, + "loss": 0.83381081, + "num_input_tokens_seen": 132314720, + "router_z_loss_mlp": 0.32739258, + "step": 1599, + "time_per_iteration": 2.8883755207061768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072089, + "balance_loss_mlp": 1.03949702, + "epoch": 0.30781069642170067, + "flos": 613729032192.0, + "grad_norm": 0.059000164712882774, + "language_loss": 0.86657357, + "learning_rate": 0.0008109090807663399, + "loss": 0.87729448, + "num_input_tokens_seen": 132388768, + "router_z_loss_mlp": 0.32592773, + "step": 1600, + "time_per_iteration": 2.7799928188323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075122, + "balance_loss_mlp": 1.04260147, + "epoch": 0.3080030781069642, + "flos": 590021521920.0, + "grad_norm": 0.046450828420206536, + "language_loss": 0.88735926, + "learning_rate": 0.0008106650324390257, + "loss": 0.89811045, + "num_input_tokens_seen": 132472544, + "router_z_loss_mlp": 0.32519531, + "step": 1601, + "time_per_iteration": 2.7887444496154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071489, + "balance_loss_mlp": 1.03913534, + "epoch": 0.3081954597922278, + "flos": 562353601536.0, + "grad_norm": 0.06865077604181559, + "language_loss": 0.81335884, + "learning_rate": 0.0008104208635008493, + "loss": 0.82407373, + "num_input_tokens_seen": 132541968, + "router_z_loss_mlp": 0.32348633, + "step": 1602, + "time_per_iteration": 2.6526358127593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077173, + "balance_loss_mlp": 1.04448628, + "epoch": 0.3083878414774913, + "flos": 447599112192.0, + "grad_norm": 0.053973671264543166, + "language_loss": 0.81925714, + "learning_rate": 0.0008101765740466058, + "loss": 0.83002889, + "num_input_tokens_seen": 132606976, + "router_z_loss_mlp": 0.3269043, + "step": 1603, + "time_per_iteration": 2.5337142944335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073512, + "balance_loss_mlp": 1.04020488, + "epoch": 0.3085802231627549, + "flos": 493297205760.0, + "grad_norm": 0.05670542728571842, + "language_loss": 0.84135199, + "learning_rate": 0.0008099321641711364, + "loss": 0.85208714, + "num_input_tokens_seen": 132677984, + "router_z_loss_mlp": 0.33325195, + "step": 1604, + "time_per_iteration": 2.6616737842559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071402, + "balance_loss_mlp": 1.03804755, + "epoch": 0.3087726048480185, + "flos": 487437986304.0, + "grad_norm": 0.0517354770696361, + "language_loss": 0.8343811, + "learning_rate": 0.0008096876339693295, + "loss": 0.8450951, + "num_input_tokens_seen": 132749136, + "router_z_loss_mlp": 0.33374023, + "step": 1605, + "time_per_iteration": 2.6034042835235596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078765, + "balance_loss_mlp": 1.04412317, + "epoch": 0.308964986533282, + "flos": 730265877504.0, + "grad_norm": 0.0630488444124333, + "language_loss": 0.8123467, + "learning_rate": 0.0008094429835361206, + "loss": 0.8231343, + "num_input_tokens_seen": 132823824, + "router_z_loss_mlp": 0.34667969, + "step": 1606, + "time_per_iteration": 2.9442811012268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107155, + "balance_loss_mlp": 1.03788495, + "epoch": 0.3091573682185456, + "flos": 605131554816.0, + "grad_norm": 0.0490228515497239, + "language_loss": 0.85833865, + "learning_rate": 0.0008091982129664908, + "loss": 0.8690542, + "num_input_tokens_seen": 132895936, + "router_z_loss_mlp": 0.33691406, + "step": 1607, + "time_per_iteration": 2.734976053237915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077455, + "balance_loss_mlp": 1.04290783, + "epoch": 0.30934974990380915, + "flos": 460081396224.0, + "grad_norm": 0.04772079658934369, + "language_loss": 0.82646394, + "learning_rate": 0.0008089533223554687, + "loss": 0.83723843, + "num_input_tokens_seen": 132968960, + "router_z_loss_mlp": 0.34594727, + "step": 1608, + "time_per_iteration": 2.756741762161255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080172, + "balance_loss_mlp": 1.04669785, + "epoch": 0.30954213158907273, + "flos": 553142075904.0, + "grad_norm": 0.05499274022240881, + "language_loss": 0.8525604, + "learning_rate": 0.0008087083117981294, + "loss": 0.86336207, + "num_input_tokens_seen": 133048448, + "router_z_loss_mlp": 0.33496094, + "step": 1609, + "time_per_iteration": 2.9062788486480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081142, + "balance_loss_mlp": 1.04676199, + "epoch": 0.30973451327433627, + "flos": 552776901120.0, + "grad_norm": 0.0512798930400947, + "language_loss": 0.87996054, + "learning_rate": 0.0008084631813895943, + "loss": 0.89077199, + "num_input_tokens_seen": 133121680, + "router_z_loss_mlp": 0.34375, + "step": 1610, + "time_per_iteration": 2.789893627166748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079299, + "balance_loss_mlp": 1.04575384, + "epoch": 0.30992689495959985, + "flos": 565430893056.0, + "grad_norm": 0.07403274033744815, + "language_loss": 0.83632123, + "learning_rate": 0.0008082179312250315, + "loss": 0.84711421, + "num_input_tokens_seen": 133190176, + "router_z_loss_mlp": 0.33544922, + "step": 1611, + "time_per_iteration": 2.713533878326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040085, + "balance_loss_mlp": 1.02864099, + "epoch": 0.3101192766448634, + "flos": 1441621131264.0, + "grad_norm": 0.023040649208851512, + "language_loss": 0.79855847, + "learning_rate": 0.0008079725613996555, + "loss": 0.80895925, + "num_input_tokens_seen": 133420512, + "router_z_loss_mlp": 0.11425781, + "step": 1612, + "time_per_iteration": 4.866560459136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035595, + "balance_loss_mlp": 1.02448523, + "epoch": 0.31031165833012697, + "flos": 1531086575616.0, + "grad_norm": 0.021256554447441355, + "language_loss": 0.76629329, + "learning_rate": 0.0008077270720087273, + "loss": 0.77664924, + "num_input_tokens_seen": 133651984, + "router_z_loss_mlp": 0.11132812, + "step": 1613, + "time_per_iteration": 5.01593279838562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010886, + "balance_loss_mlp": 1.05483997, + "epoch": 0.31050404001539056, + "flos": 991534196736.0, + "grad_norm": 0.06253960188626659, + "language_loss": 0.81937206, + "learning_rate": 0.0008074814631475545, + "loss": 0.83025801, + "num_input_tokens_seen": 133741648, + "router_z_loss_mlp": 0.33789062, + "step": 1614, + "time_per_iteration": 3.3154871463775635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092644, + "balance_loss_mlp": 1.05940843, + "epoch": 0.3106964217006541, + "flos": 445748355072.0, + "grad_norm": 0.0719929788966035, + "language_loss": 0.78655052, + "learning_rate": 0.0008072357349114907, + "loss": 0.79747701, + "num_input_tokens_seen": 133813344, + "router_z_loss_mlp": 0.33251953, + "step": 1615, + "time_per_iteration": 2.6783502101898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084302, + "balance_loss_mlp": 1.05063736, + "epoch": 0.3108888033859177, + "flos": 510259405824.0, + "grad_norm": 0.06269338504314155, + "language_loss": 0.88523185, + "learning_rate": 0.0008069898873959363, + "loss": 0.89607489, + "num_input_tokens_seen": 133884192, + "router_z_loss_mlp": 0.33691406, + "step": 1616, + "time_per_iteration": 2.6805779933929443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092165, + "balance_loss_mlp": 1.05952573, + "epoch": 0.3110811850711812, + "flos": 520471913472.0, + "grad_norm": 0.06669389997650658, + "language_loss": 0.85964763, + "learning_rate": 0.0008067439206963375, + "loss": 0.87056935, + "num_input_tokens_seen": 133954848, + "router_z_loss_mlp": 0.32641602, + "step": 1617, + "time_per_iteration": 2.6084542274475098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091143, + "balance_loss_mlp": 1.05797851, + "epoch": 0.3112735667564448, + "flos": 686085299712.0, + "grad_norm": 0.06020913179087489, + "language_loss": 0.86049557, + "learning_rate": 0.0008064978349081873, + "loss": 0.87140703, + "num_input_tokens_seen": 134031824, + "router_z_loss_mlp": 0.33178711, + "step": 1618, + "time_per_iteration": 2.9622116088867188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089777, + "balance_loss_mlp": 1.05554032, + "epoch": 0.31146594844170833, + "flos": 532786871808.0, + "grad_norm": 0.04562356821057988, + "language_loss": 0.86218596, + "learning_rate": 0.0008062516301270245, + "loss": 0.87308377, + "num_input_tokens_seen": 134104480, + "router_z_loss_mlp": 0.3425293, + "step": 1619, + "time_per_iteration": 2.691589593887329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091127, + "balance_loss_mlp": 1.0575099, + "epoch": 0.3116583301269719, + "flos": 679187220480.0, + "grad_norm": 0.05429224886242875, + "language_loss": 0.88343138, + "learning_rate": 0.0008060053064484343, + "loss": 0.89434266, + "num_input_tokens_seen": 134185632, + "router_z_loss_mlp": 0.33642578, + "step": 1620, + "time_per_iteration": 2.936244487762451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096127, + "balance_loss_mlp": 1.06277251, + "epoch": 0.31185071181223545, + "flos": 585855908352.0, + "grad_norm": 0.05040245512912965, + "language_loss": 0.85009742, + "learning_rate": 0.0008057588639680482, + "loss": 0.86105865, + "num_input_tokens_seen": 134261600, + "router_z_loss_mlp": 0.33374023, + "step": 1621, + "time_per_iteration": 2.7633163928985596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107807, + "balance_loss_mlp": 1.07309282, + "epoch": 0.31204309349749904, + "flos": 725090517504.0, + "grad_norm": 0.06801147163116106, + "language_loss": 0.82624507, + "learning_rate": 0.0008055123027815434, + "loss": 0.83732307, + "num_input_tokens_seen": 134334368, + "router_z_loss_mlp": 0.34741211, + "step": 1622, + "time_per_iteration": 2.946943521499634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105084, + "balance_loss_mlp": 1.07156253, + "epoch": 0.3122354751827626, + "flos": 576558604800.0, + "grad_norm": 0.0611005921730787, + "language_loss": 0.85109818, + "learning_rate": 0.0008052656229846436, + "loss": 0.862149, + "num_input_tokens_seen": 134403824, + "router_z_loss_mlp": 0.33544922, + "step": 1623, + "time_per_iteration": 2.6431145668029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106208, + "balance_loss_mlp": 1.07106483, + "epoch": 0.31242785686802615, + "flos": 575672514048.0, + "grad_norm": 0.055122717603047884, + "language_loss": 0.90674621, + "learning_rate": 0.0008050188246731182, + "loss": 0.91780829, + "num_input_tokens_seen": 134471296, + "router_z_loss_mlp": 0.35180664, + "step": 1624, + "time_per_iteration": 2.6666738986968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101645, + "balance_loss_mlp": 1.06745625, + "epoch": 0.31262023855328974, + "flos": 736490271744.0, + "grad_norm": 0.05430344032768667, + "language_loss": 0.81962687, + "learning_rate": 0.0008047719079427834, + "loss": 0.8306433, + "num_input_tokens_seen": 134551360, + "router_z_loss_mlp": 0.34204102, + "step": 1625, + "time_per_iteration": 2.978775978088379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095181, + "balance_loss_mlp": 1.07791936, + "epoch": 0.3128126202385533, + "flos": 1558395113472.0, + "grad_norm": 0.034550759669135796, + "language_loss": 0.74351704, + "learning_rate": 0.0008045248728895, + "loss": 0.75446886, + "num_input_tokens_seen": 134761328, + "router_z_loss_mlp": 0.17285156, + "step": 1626, + "time_per_iteration": 4.800370931625366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109181, + "balance_loss_mlp": 1.05695319, + "epoch": 0.31300500192381686, + "flos": 514666538496.0, + "grad_norm": 0.04752817769408696, + "language_loss": 0.86259782, + "learning_rate": 0.0008042777196091757, + "loss": 0.87351596, + "num_input_tokens_seen": 134833136, + "router_z_loss_mlp": 0.34863281, + "step": 1627, + "time_per_iteration": 2.695350408554077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088728, + "balance_loss_mlp": 1.05301261, + "epoch": 0.3131973836090804, + "flos": 526370420736.0, + "grad_norm": 0.06407391520506579, + "language_loss": 0.8214981, + "learning_rate": 0.0008040304481977643, + "loss": 0.83238542, + "num_input_tokens_seen": 134904352, + "router_z_loss_mlp": 0.35742188, + "step": 1628, + "time_per_iteration": 2.6652393341064453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108342, + "balance_loss_mlp": 1.05030346, + "epoch": 0.313389765294344, + "flos": 822473961984.0, + "grad_norm": 0.08346342139557943, + "language_loss": 0.86950874, + "learning_rate": 0.0008037830587512649, + "loss": 0.88034296, + "num_input_tokens_seen": 134984880, + "router_z_loss_mlp": 0.33129883, + "step": 1629, + "time_per_iteration": 3.0668327808380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090413, + "balance_loss_mlp": 1.05651021, + "epoch": 0.31358214697960757, + "flos": 393604697088.0, + "grad_norm": 0.061409761762948115, + "language_loss": 0.78720629, + "learning_rate": 0.0008035355513657224, + "loss": 0.79811049, + "num_input_tokens_seen": 135047456, + "router_z_loss_mlp": 0.33935547, + "step": 1630, + "time_per_iteration": 2.5013740062713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087918, + "balance_loss_mlp": 1.05449188, + "epoch": 0.3137745286648711, + "flos": 571611617280.0, + "grad_norm": 0.049199842100191564, + "language_loss": 0.93020999, + "learning_rate": 0.0008032879261372279, + "loss": 0.94108921, + "num_input_tokens_seen": 135124256, + "router_z_loss_mlp": 0.33447266, + "step": 1631, + "time_per_iteration": 2.8559622764587402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088241, + "balance_loss_mlp": 1.07612944, + "epoch": 0.3139669103501347, + "flos": 1497614690304.0, + "grad_norm": 0.04267228885339989, + "language_loss": 0.79635841, + "learning_rate": 0.0008030401831619178, + "loss": 0.80724084, + "num_input_tokens_seen": 135353024, + "router_z_loss_mlp": 0.12109375, + "step": 1632, + "time_per_iteration": 5.726024627685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081932, + "balance_loss_mlp": 1.04886341, + "epoch": 0.3141592920353982, + "flos": 525090041856.0, + "grad_norm": 0.04986838794009694, + "language_loss": 0.87459773, + "learning_rate": 0.0008027923225359748, + "loss": 0.88541704, + "num_input_tokens_seen": 135422464, + "router_z_loss_mlp": 0.33081055, + "step": 1633, + "time_per_iteration": 2.599775791168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078913, + "balance_loss_mlp": 1.04465246, + "epoch": 0.3143516737206618, + "flos": 592989714432.0, + "grad_norm": 0.05680374588643473, + "language_loss": 0.8835839, + "learning_rate": 0.0008025443443556267, + "loss": 0.89437306, + "num_input_tokens_seen": 135490928, + "router_z_loss_mlp": 0.34301758, + "step": 1634, + "time_per_iteration": 2.7439024448394775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010804, + "balance_loss_mlp": 1.04776073, + "epoch": 0.31454405540592534, + "flos": 648034573824.0, + "grad_norm": 0.04764849369773053, + "language_loss": 0.88161099, + "learning_rate": 0.000802296248717147, + "loss": 0.89241499, + "num_input_tokens_seen": 135576288, + "router_z_loss_mlp": 0.32641602, + "step": 1635, + "time_per_iteration": 2.902290105819702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082457, + "balance_loss_mlp": 1.04850602, + "epoch": 0.3147364370911889, + "flos": 642543501312.0, + "grad_norm": 0.05380775409858787, + "language_loss": 0.79150212, + "learning_rate": 0.0008020480357168554, + "loss": 0.80232668, + "num_input_tokens_seen": 135652320, + "router_z_loss_mlp": 0.33984375, + "step": 1636, + "time_per_iteration": 2.7940564155578613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107486, + "balance_loss_mlp": 1.04176795, + "epoch": 0.31492881877645246, + "flos": 471607778304.0, + "grad_norm": 0.05509564816324918, + "language_loss": 0.88341308, + "learning_rate": 0.0008017997054511165, + "loss": 0.89416164, + "num_input_tokens_seen": 135719632, + "router_z_loss_mlp": 0.33105469, + "step": 1637, + "time_per_iteration": 2.596212148666382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075188, + "balance_loss_mlp": 1.04157114, + "epoch": 0.31512120046171604, + "flos": 629135838720.0, + "grad_norm": 0.0536589952194777, + "language_loss": 0.8549943, + "learning_rate": 0.0008015512580163407, + "loss": 0.86574614, + "num_input_tokens_seen": 135796544, + "router_z_loss_mlp": 0.33642578, + "step": 1638, + "time_per_iteration": 2.763343334197998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107569, + "balance_loss_mlp": 1.0416913, + "epoch": 0.31531358214697963, + "flos": 703460726784.0, + "grad_norm": 0.0636877441873346, + "language_loss": 0.80888116, + "learning_rate": 0.0008013026935089838, + "loss": 0.81963813, + "num_input_tokens_seen": 135871344, + "router_z_loss_mlp": 0.34033203, + "step": 1639, + "time_per_iteration": 2.9786083698272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070933, + "balance_loss_mlp": 1.03798366, + "epoch": 0.31550596383224316, + "flos": 572275127808.0, + "grad_norm": 0.055086353977466425, + "language_loss": 0.83909047, + "learning_rate": 0.0008010540120255472, + "loss": 0.84979975, + "num_input_tokens_seen": 135944320, + "router_z_loss_mlp": 0.32958984, + "step": 1640, + "time_per_iteration": 2.666520357131958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075633, + "balance_loss_mlp": 1.04196858, + "epoch": 0.31569834551750675, + "flos": 658047822336.0, + "grad_norm": 0.06483249406864507, + "language_loss": 0.86052895, + "learning_rate": 0.0008008052136625774, + "loss": 0.8712852, + "num_input_tokens_seen": 136019456, + "router_z_loss_mlp": 0.33691406, + "step": 1641, + "time_per_iteration": 2.8062589168548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078165, + "balance_loss_mlp": 1.04407096, + "epoch": 0.3158907272027703, + "flos": 566002681344.0, + "grad_norm": 0.05792040128516231, + "language_loss": 0.86837387, + "learning_rate": 0.0008005562985166666, + "loss": 0.87915552, + "num_input_tokens_seen": 136091232, + "router_z_loss_mlp": 0.34130859, + "step": 1642, + "time_per_iteration": 2.6996512413024902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081029, + "balance_loss_mlp": 1.04576707, + "epoch": 0.31608310888803387, + "flos": 536622216192.0, + "grad_norm": 0.04642534602938139, + "language_loss": 0.84936821, + "learning_rate": 0.0008003072666844524, + "loss": 0.86017853, + "num_input_tokens_seen": 136165088, + "router_z_loss_mlp": 0.35302734, + "step": 1643, + "time_per_iteration": 2.6999776363372803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078239, + "balance_loss_mlp": 1.04292917, + "epoch": 0.3162754905732974, + "flos": 486428239872.0, + "grad_norm": 0.08271259063406261, + "language_loss": 0.82613683, + "learning_rate": 0.0008000581182626173, + "loss": 0.83691919, + "num_input_tokens_seen": 136230368, + "router_z_loss_mlp": 0.35302734, + "step": 1644, + "time_per_iteration": 2.541093111038208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082389, + "balance_loss_mlp": 1.04893875, + "epoch": 0.316467872258561, + "flos": 529792538112.0, + "grad_norm": 0.058359985905672214, + "language_loss": 0.86275887, + "learning_rate": 0.0007998088533478894, + "loss": 0.87358278, + "num_input_tokens_seen": 136302512, + "router_z_loss_mlp": 0.33447266, + "step": 1645, + "time_per_iteration": 2.641402006149292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077931, + "balance_loss_mlp": 1.04309845, + "epoch": 0.3166602539438245, + "flos": 443197771776.0, + "grad_norm": 0.07387441321187599, + "language_loss": 0.84062803, + "learning_rate": 0.000799559472037042, + "loss": 0.85140741, + "num_input_tokens_seen": 136368064, + "router_z_loss_mlp": 0.34887695, + "step": 1646, + "time_per_iteration": 2.5274438858032227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076892, + "balance_loss_mlp": 1.04222584, + "epoch": 0.3168526356290881, + "flos": 645513103872.0, + "grad_norm": 0.053861363144643716, + "language_loss": 0.87875295, + "learning_rate": 0.0007993099744268932, + "loss": 0.8895219, + "num_input_tokens_seen": 136451520, + "router_z_loss_mlp": 0.34716797, + "step": 1647, + "time_per_iteration": 2.8893649578094482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070585, + "balance_loss_mlp": 1.03646684, + "epoch": 0.3170450173143517, + "flos": 585889403904.0, + "grad_norm": 0.05841982976759713, + "language_loss": 0.87792766, + "learning_rate": 0.000799060360614307, + "loss": 0.88863349, + "num_input_tokens_seen": 136521184, + "router_z_loss_mlp": 0.34155273, + "step": 1648, + "time_per_iteration": 2.6867480278015137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076826, + "balance_loss_mlp": 1.04273248, + "epoch": 0.3172373989996152, + "flos": 826763231232.0, + "grad_norm": 0.05654214693871822, + "language_loss": 0.83848637, + "learning_rate": 0.0007988106306961917, + "loss": 0.84925467, + "num_input_tokens_seen": 136612592, + "router_z_loss_mlp": 0.34130859, + "step": 1649, + "time_per_iteration": 3.1321003437042236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080036, + "balance_loss_mlp": 1.04577541, + "epoch": 0.3174297806848788, + "flos": 527153204736.0, + "grad_norm": 0.060794493166337976, + "language_loss": 0.84529203, + "learning_rate": 0.0007985607847695014, + "loss": 0.85609239, + "num_input_tokens_seen": 136684336, + "router_z_loss_mlp": 0.34301758, + "step": 1650, + "time_per_iteration": 2.6306049823760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081772, + "balance_loss_mlp": 1.04851258, + "epoch": 0.31762216237014235, + "flos": 712855544832.0, + "grad_norm": 0.05325998456044798, + "language_loss": 0.82638443, + "learning_rate": 0.0007983108229312345, + "loss": 0.83720207, + "num_input_tokens_seen": 136766400, + "router_z_loss_mlp": 0.33276367, + "step": 1651, + "time_per_iteration": 2.909571647644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108965, + "balance_loss_mlp": 1.05567503, + "epoch": 0.31781454405540593, + "flos": 483567736320.0, + "grad_norm": 0.0653784528473409, + "language_loss": 0.86672306, + "learning_rate": 0.0007980607452784351, + "loss": 0.87761962, + "num_input_tokens_seen": 136834016, + "router_z_loss_mlp": 0.33984375, + "step": 1652, + "time_per_iteration": 2.5339765548706055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100389, + "balance_loss_mlp": 1.06639075, + "epoch": 0.31800692574066947, + "flos": 548483249664.0, + "grad_norm": 0.0685029555550019, + "language_loss": 0.90562367, + "learning_rate": 0.0007978105519081919, + "loss": 0.91662765, + "num_input_tokens_seen": 136906288, + "router_z_loss_mlp": 0.34008789, + "step": 1653, + "time_per_iteration": 2.6916213035583496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096542, + "balance_loss_mlp": 1.06213784, + "epoch": 0.31819930742593305, + "flos": 516640951296.0, + "grad_norm": 0.07941193091019123, + "language_loss": 0.87969935, + "learning_rate": 0.0007975602429176385, + "loss": 0.89066482, + "num_input_tokens_seen": 136972416, + "router_z_loss_mlp": 0.34423828, + "step": 1654, + "time_per_iteration": 2.5997297763824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100835, + "balance_loss_mlp": 1.06695616, + "epoch": 0.31839168911119664, + "flos": 455748456960.0, + "grad_norm": 0.07129171690745044, + "language_loss": 0.81582803, + "learning_rate": 0.0007973098184039536, + "loss": 0.82683635, + "num_input_tokens_seen": 137044576, + "router_z_loss_mlp": 0.33911133, + "step": 1655, + "time_per_iteration": 2.654914140701294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096724, + "balance_loss_mlp": 1.06284511, + "epoch": 0.3185840707964602, + "flos": 625719513600.0, + "grad_norm": 0.05658637496419385, + "language_loss": 0.86710656, + "learning_rate": 0.0007970592784643602, + "loss": 0.87807381, + "num_input_tokens_seen": 137125120, + "router_z_loss_mlp": 0.33911133, + "step": 1656, + "time_per_iteration": 2.846390962600708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090807, + "balance_loss_mlp": 1.05719042, + "epoch": 0.31877645248172376, + "flos": 567213249024.0, + "grad_norm": 0.058346793379709355, + "language_loss": 0.85032123, + "learning_rate": 0.0007968086231961272, + "loss": 0.8612293, + "num_input_tokens_seen": 137195344, + "router_z_loss_mlp": 0.33642578, + "step": 1657, + "time_per_iteration": 2.652986526489258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094073, + "balance_loss_mlp": 1.0593828, + "epoch": 0.3189688341669873, + "flos": 489338205696.0, + "grad_norm": 0.08644842740903268, + "language_loss": 0.836254, + "learning_rate": 0.0007965578526965671, + "loss": 0.84719473, + "num_input_tokens_seen": 137261040, + "router_z_loss_mlp": 0.34741211, + "step": 1658, + "time_per_iteration": 2.5607872009277344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092531, + "balance_loss_mlp": 1.05860353, + "epoch": 0.3191612158522509, + "flos": 575948938752.0, + "grad_norm": 0.04707712809776705, + "language_loss": 0.86020696, + "learning_rate": 0.0007963069670630377, + "loss": 0.87113225, + "num_input_tokens_seen": 137334400, + "router_z_loss_mlp": 0.33959961, + "step": 1659, + "time_per_iteration": 2.7435247898101807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097687, + "balance_loss_mlp": 1.06447566, + "epoch": 0.3193535975375144, + "flos": 537867689472.0, + "grad_norm": 0.062321727217464123, + "language_loss": 0.87956834, + "learning_rate": 0.0007960559663929416, + "loss": 0.89054519, + "num_input_tokens_seen": 137405344, + "router_z_loss_mlp": 0.33227539, + "step": 1660, + "time_per_iteration": 2.6282846927642822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096682, + "balance_loss_mlp": 1.06265998, + "epoch": 0.319545979222778, + "flos": 733954245120.0, + "grad_norm": 0.07201541894751945, + "language_loss": 0.87465358, + "learning_rate": 0.0007958048507837259, + "loss": 0.88562042, + "num_input_tokens_seen": 137486016, + "router_z_loss_mlp": 0.34057617, + "step": 1661, + "time_per_iteration": 2.9250974655151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099132, + "balance_loss_mlp": 1.0647999, + "epoch": 0.31973836090804153, + "flos": 764136433152.0, + "grad_norm": 0.0721917610669121, + "language_loss": 0.87230003, + "learning_rate": 0.0007955536203328822, + "loss": 0.8832913, + "num_input_tokens_seen": 137562304, + "router_z_loss_mlp": 0.34375, + "step": 1662, + "time_per_iteration": 2.899735450744629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109485, + "balance_loss_mlp": 1.06109047, + "epoch": 0.3199307425933051, + "flos": 560252560896.0, + "grad_norm": 0.06666532975578916, + "language_loss": 0.83308822, + "learning_rate": 0.0007953022751379469, + "loss": 0.84403676, + "num_input_tokens_seen": 137639248, + "router_z_loss_mlp": 0.33789062, + "step": 1663, + "time_per_iteration": 2.7743418216705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093375, + "balance_loss_mlp": 1.05899549, + "epoch": 0.3201231242785687, + "flos": 751019751936.0, + "grad_norm": 0.058114271957014456, + "language_loss": 0.81677037, + "learning_rate": 0.000795050815296501, + "loss": 0.82770407, + "num_input_tokens_seen": 137718256, + "router_z_loss_mlp": 0.34399414, + "step": 1664, + "time_per_iteration": 2.9620323181152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091229, + "balance_loss_mlp": 1.05768323, + "epoch": 0.32031550596383224, + "flos": 496157709312.0, + "grad_norm": 0.061791342299560625, + "language_loss": 0.93274921, + "learning_rate": 0.0007947992409061695, + "loss": 0.94366151, + "num_input_tokens_seen": 137785216, + "router_z_loss_mlp": 0.33569336, + "step": 1665, + "time_per_iteration": 2.6097099781036377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083934, + "balance_loss_mlp": 1.05022132, + "epoch": 0.3205078876490958, + "flos": 731294562816.0, + "grad_norm": 0.05133774923717053, + "language_loss": 0.86471802, + "learning_rate": 0.0007945475520646226, + "loss": 0.8755573, + "num_input_tokens_seen": 137863424, + "router_z_loss_mlp": 0.33740234, + "step": 1666, + "time_per_iteration": 2.9224231243133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088823, + "balance_loss_mlp": 1.05487204, + "epoch": 0.32070026933435936, + "flos": 549177283584.0, + "grad_norm": 0.1345109768982335, + "language_loss": 0.8496111, + "learning_rate": 0.0007942957488695743, + "loss": 0.86049932, + "num_input_tokens_seen": 137930384, + "router_z_loss_mlp": 0.33959961, + "step": 1667, + "time_per_iteration": 2.6267666816711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086537, + "balance_loss_mlp": 1.05265749, + "epoch": 0.32089265101962294, + "flos": 744949536768.0, + "grad_norm": 0.061316479944915916, + "language_loss": 0.80963373, + "learning_rate": 0.0007940438314187833, + "loss": 0.82049918, + "num_input_tokens_seen": 138017200, + "router_z_loss_mlp": 0.33886719, + "step": 1668, + "time_per_iteration": 3.00421142578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089646, + "balance_loss_mlp": 1.05638647, + "epoch": 0.3210850327048865, + "flos": 493937395200.0, + "grad_norm": 0.05864654089818211, + "language_loss": 0.80047274, + "learning_rate": 0.0007937917998100529, + "loss": 0.81136918, + "num_input_tokens_seen": 138084048, + "router_z_loss_mlp": 0.33276367, + "step": 1669, + "time_per_iteration": 2.607917070388794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100772, + "balance_loss_mlp": 1.06610548, + "epoch": 0.32127741439015006, + "flos": 530383265280.0, + "grad_norm": 0.060159342011431034, + "language_loss": 0.78680766, + "learning_rate": 0.0007935396541412302, + "loss": 0.79781532, + "num_input_tokens_seen": 138153280, + "router_z_loss_mlp": 0.34692383, + "step": 1670, + "time_per_iteration": 2.6022346019744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108678, + "balance_loss_mlp": 1.07458389, + "epoch": 0.3214697960754136, + "flos": 500948955648.0, + "grad_norm": 0.07085567852213893, + "language_loss": 0.85879421, + "learning_rate": 0.0007932873945102068, + "loss": 0.86988097, + "num_input_tokens_seen": 138222320, + "router_z_loss_mlp": 0.34130859, + "step": 1671, + "time_per_iteration": 2.581815719604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120776, + "balance_loss_mlp": 1.10942781, + "epoch": 0.3216621777606772, + "flos": 1382579394048.0, + "grad_norm": 0.04969555951860313, + "language_loss": 0.75761777, + "learning_rate": 0.0007930350210149188, + "loss": 0.76882553, + "num_input_tokens_seen": 138449488, + "router_z_loss_mlp": 0.11328125, + "step": 1672, + "time_per_iteration": 4.821724891662598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106649, + "balance_loss_mlp": 1.07193518, + "epoch": 0.32185455944594077, + "flos": 571260999168.0, + "grad_norm": 0.05896773993357689, + "language_loss": 0.86527109, + "learning_rate": 0.0007927825337533461, + "loss": 0.87633765, + "num_input_tokens_seen": 138522496, + "router_z_loss_mlp": 0.34765625, + "step": 1673, + "time_per_iteration": 2.6640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105888, + "balance_loss_mlp": 1.07184219, + "epoch": 0.3220469411312043, + "flos": 543652715520.0, + "grad_norm": 0.06618360944756078, + "language_loss": 0.84761298, + "learning_rate": 0.0007925299328235131, + "loss": 0.85867184, + "num_input_tokens_seen": 138590096, + "router_z_loss_mlp": 0.34057617, + "step": 1674, + "time_per_iteration": 2.6524405479431152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102416, + "balance_loss_mlp": 1.06705832, + "epoch": 0.3222393228164679, + "flos": 490884834816.0, + "grad_norm": 0.05872681692102293, + "language_loss": 0.85148364, + "learning_rate": 0.000792277218323488, + "loss": 0.86250782, + "num_input_tokens_seen": 138658224, + "router_z_loss_mlp": 0.35424805, + "step": 1675, + "time_per_iteration": 2.557460069656372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100116, + "balance_loss_mlp": 1.06523526, + "epoch": 0.3224317045017314, + "flos": 490145720832.0, + "grad_norm": 0.05188137415598196, + "language_loss": 0.84647608, + "learning_rate": 0.0007920243903513833, + "loss": 0.85747719, + "num_input_tokens_seen": 138722864, + "router_z_loss_mlp": 0.34912109, + "step": 1676, + "time_per_iteration": 2.5208208560943604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092752, + "balance_loss_mlp": 1.05813313, + "epoch": 0.322624086186995, + "flos": 575505188352.0, + "grad_norm": 0.06429192544800656, + "language_loss": 0.83992624, + "learning_rate": 0.0007917714490053556, + "loss": 0.8508538, + "num_input_tokens_seen": 138791472, + "router_z_loss_mlp": 0.34667969, + "step": 1677, + "time_per_iteration": 2.653686761856079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109123, + "balance_loss_mlp": 1.05668271, + "epoch": 0.32281646787225854, + "flos": 628974305280.0, + "grad_norm": 0.048890211607645416, + "language_loss": 0.86094737, + "learning_rate": 0.0007915183943836055, + "loss": 0.87185967, + "num_input_tokens_seen": 138873424, + "router_z_loss_mlp": 0.34594727, + "step": 1678, + "time_per_iteration": 2.852612018585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083818, + "balance_loss_mlp": 1.04950905, + "epoch": 0.3230088495575221, + "flos": 781036024320.0, + "grad_norm": 0.05620908679364121, + "language_loss": 0.83880055, + "learning_rate": 0.0007912652265843773, + "loss": 0.8496387, + "num_input_tokens_seen": 138956880, + "router_z_loss_mlp": 0.34350586, + "step": 1679, + "time_per_iteration": 3.006805419921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077767, + "balance_loss_mlp": 1.0433867, + "epoch": 0.3232012312427857, + "flos": 535839432192.0, + "grad_norm": 0.04762836982551939, + "language_loss": 0.81587136, + "learning_rate": 0.0007910119457059597, + "loss": 0.82664907, + "num_input_tokens_seen": 139031296, + "router_z_loss_mlp": 0.34423828, + "step": 1680, + "time_per_iteration": 2.6930737495422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076411, + "balance_loss_mlp": 1.04129148, + "epoch": 0.32339361292804925, + "flos": 704515553280.0, + "grad_norm": 0.06110281418881611, + "language_loss": 0.80031025, + "learning_rate": 0.0007907585518466849, + "loss": 0.81107438, + "num_input_tokens_seen": 139109776, + "router_z_loss_mlp": 0.35180664, + "step": 1681, + "time_per_iteration": 2.940950870513916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081204, + "balance_loss_mlp": 1.04603744, + "epoch": 0.32358599461331283, + "flos": 452099377152.0, + "grad_norm": 0.0474614445796137, + "language_loss": 0.90124965, + "learning_rate": 0.000790505045104929, + "loss": 0.91206169, + "num_input_tokens_seen": 139174736, + "router_z_loss_mlp": 0.35205078, + "step": 1682, + "time_per_iteration": 2.4919469356536865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078264, + "balance_loss_mlp": 1.0435977, + "epoch": 0.32377837629857636, + "flos": 600597794304.0, + "grad_norm": 0.057051782898604, + "language_loss": 0.86989701, + "learning_rate": 0.0007902514255791125, + "loss": 0.88067961, + "num_input_tokens_seen": 139252064, + "router_z_loss_mlp": 0.34692383, + "step": 1683, + "time_per_iteration": 2.7545859813690186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108269, + "balance_loss_mlp": 1.04721308, + "epoch": 0.32397075798383995, + "flos": 807180636672.0, + "grad_norm": 0.05145240385219177, + "language_loss": 0.87981123, + "learning_rate": 0.0007899976933676986, + "loss": 0.89063811, + "num_input_tokens_seen": 139333328, + "router_z_loss_mlp": 0.35498047, + "step": 1684, + "time_per_iteration": 2.97807240486145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081365, + "balance_loss_mlp": 1.04638934, + "epoch": 0.3241631396691035, + "flos": 601414073856.0, + "grad_norm": 0.06429290680378846, + "language_loss": 0.8767072, + "learning_rate": 0.0007897438485691955, + "loss": 0.88752091, + "num_input_tokens_seen": 139400976, + "router_z_loss_mlp": 0.3503418, + "step": 1685, + "time_per_iteration": 2.704326868057251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083548, + "balance_loss_mlp": 1.04826176, + "epoch": 0.32435552135436707, + "flos": 473980861440.0, + "grad_norm": 0.058364951070402814, + "language_loss": 0.82023847, + "learning_rate": 0.0007894898912821542, + "loss": 0.831074, + "num_input_tokens_seen": 139465664, + "router_z_loss_mlp": 0.3527832, + "step": 1686, + "time_per_iteration": 2.5206680297851562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076257, + "balance_loss_mlp": 1.04178166, + "epoch": 0.3245479030396306, + "flos": 537824019456.0, + "grad_norm": 0.04476181031616706, + "language_loss": 0.86661267, + "learning_rate": 0.0007892358216051695, + "loss": 0.87737525, + "num_input_tokens_seen": 139541984, + "router_z_loss_mlp": 0.3449707, + "step": 1687, + "time_per_iteration": 2.7332026958465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075747, + "balance_loss_mlp": 1.04108071, + "epoch": 0.3247402847248942, + "flos": 547394927616.0, + "grad_norm": 0.05643246072623682, + "language_loss": 0.92275292, + "learning_rate": 0.0007889816396368803, + "loss": 0.93351042, + "num_input_tokens_seen": 139607408, + "router_z_loss_mlp": 0.34692383, + "step": 1688, + "time_per_iteration": 2.6158432960510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077832, + "balance_loss_mlp": 1.04388082, + "epoch": 0.3249326664101578, + "flos": 377941814784.0, + "grad_norm": 0.04953960067471088, + "language_loss": 0.85575634, + "learning_rate": 0.0007887273454759687, + "loss": 0.86653465, + "num_input_tokens_seen": 139670000, + "router_z_loss_mlp": 0.33984375, + "step": 1689, + "time_per_iteration": 2.4958317279815674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075156, + "balance_loss_mlp": 1.04051399, + "epoch": 0.3251250480954213, + "flos": 527818125312.0, + "grad_norm": 0.050587956688220255, + "language_loss": 0.82717729, + "learning_rate": 0.0007884729392211603, + "loss": 0.83792883, + "num_input_tokens_seen": 139739872, + "router_z_loss_mlp": 0.34692383, + "step": 1690, + "time_per_iteration": 2.6325736045837402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075634, + "balance_loss_mlp": 1.04113472, + "epoch": 0.3253174297806849, + "flos": 449435312640.0, + "grad_norm": 0.06211432544239721, + "language_loss": 0.85214412, + "learning_rate": 0.0007882184209712245, + "loss": 0.8629005, + "num_input_tokens_seen": 139802032, + "router_z_loss_mlp": 0.34545898, + "step": 1691, + "time_per_iteration": 2.5199012756347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076936, + "balance_loss_mlp": 1.04303288, + "epoch": 0.32550981146594843, + "flos": 703855014912.0, + "grad_norm": 0.0444021152083115, + "language_loss": 0.85646939, + "learning_rate": 0.000787963790824974, + "loss": 0.86723876, + "num_input_tokens_seen": 139885648, + "router_z_loss_mlp": 0.33935547, + "step": 1692, + "time_per_iteration": 2.9585483074188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076864, + "balance_loss_mlp": 1.04217362, + "epoch": 0.325702193151212, + "flos": 392491643904.0, + "grad_norm": 0.06035071191190156, + "language_loss": 0.89588344, + "learning_rate": 0.0007877090488812651, + "loss": 0.90665203, + "num_input_tokens_seen": 139947920, + "router_z_loss_mlp": 0.34716797, + "step": 1693, + "time_per_iteration": 2.4247992038726807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073167, + "balance_loss_mlp": 1.0379529, + "epoch": 0.32589457483647555, + "flos": 577223525376.0, + "grad_norm": 0.051335929306222446, + "language_loss": 0.83377099, + "learning_rate": 0.0007874541952389973, + "loss": 0.84450269, + "num_input_tokens_seen": 140020048, + "router_z_loss_mlp": 0.35253906, + "step": 1694, + "time_per_iteration": 2.679868459701538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074307, + "balance_loss_mlp": 1.03947401, + "epoch": 0.32608695652173914, + "flos": 498092834304.0, + "grad_norm": 0.051580366849716015, + "language_loss": 0.86795568, + "learning_rate": 0.0007871992299971136, + "loss": 0.87869877, + "num_input_tokens_seen": 140085600, + "router_z_loss_mlp": 0.34887695, + "step": 1695, + "time_per_iteration": 2.6005072593688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079265, + "balance_loss_mlp": 1.04431272, + "epoch": 0.32627933820700267, + "flos": 590858150400.0, + "grad_norm": 0.054409905067417906, + "language_loss": 0.84529006, + "learning_rate": 0.0007869441532546001, + "loss": 0.85608268, + "num_input_tokens_seen": 140155152, + "router_z_loss_mlp": 0.34985352, + "step": 1696, + "time_per_iteration": 2.7373292446136475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071229, + "balance_loss_mlp": 1.03749299, + "epoch": 0.32647171989226625, + "flos": 608790809088.0, + "grad_norm": 0.05196776598603691, + "language_loss": 0.79551816, + "learning_rate": 0.0007866889651104867, + "loss": 0.80623043, + "num_input_tokens_seen": 140228560, + "router_z_loss_mlp": 0.33764648, + "step": 1697, + "time_per_iteration": 2.768869638442993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069598, + "balance_loss_mlp": 1.03464603, + "epoch": 0.32666410157752984, + "flos": 476896619520.0, + "grad_norm": 0.05699082390473629, + "language_loss": 0.83313, + "learning_rate": 0.000786433665663846, + "loss": 0.84382606, + "num_input_tokens_seen": 140297952, + "router_z_loss_mlp": 0.34985352, + "step": 1698, + "time_per_iteration": 2.6574184894561768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070595, + "balance_loss_mlp": 1.03664398, + "epoch": 0.3268564832627934, + "flos": 718060018176.0, + "grad_norm": 0.0499104315286651, + "language_loss": 0.86441195, + "learning_rate": 0.0007861782550137942, + "loss": 0.8751179, + "num_input_tokens_seen": 140373408, + "router_z_loss_mlp": 0.33984375, + "step": 1699, + "time_per_iteration": 2.8897016048431396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071541, + "balance_loss_mlp": 1.0379957, + "epoch": 0.32704886494805696, + "flos": 768469372416.0, + "grad_norm": 0.05892131453680714, + "language_loss": 0.85990739, + "learning_rate": 0.0007859227332594901, + "loss": 0.87062275, + "num_input_tokens_seen": 140451840, + "router_z_loss_mlp": 0.33569336, + "step": 1700, + "time_per_iteration": 2.8941755294799805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080492, + "balance_loss_mlp": 1.046803, + "epoch": 0.3272412466333205, + "flos": 849540980736.0, + "grad_norm": 0.0647173620985618, + "language_loss": 0.84537613, + "learning_rate": 0.0007856671005001365, + "loss": 0.85618103, + "num_input_tokens_seen": 140537696, + "router_z_loss_mlp": 0.3371582, + "step": 1701, + "time_per_iteration": 3.1362555027008057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107778, + "balance_loss_mlp": 1.04373336, + "epoch": 0.3274336283185841, + "flos": 831224208384.0, + "grad_norm": 0.055785838120560656, + "language_loss": 0.81608075, + "learning_rate": 0.0007854113568349787, + "loss": 0.82685852, + "num_input_tokens_seen": 140623536, + "router_z_loss_mlp": 0.34082031, + "step": 1702, + "time_per_iteration": 3.0684425830841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108348, + "balance_loss_mlp": 1.04900455, + "epoch": 0.3276260100038476, + "flos": 691721938944.0, + "grad_norm": 0.059478075679183354, + "language_loss": 0.80008304, + "learning_rate": 0.0007851555023633052, + "loss": 0.81091785, + "num_input_tokens_seen": 140700688, + "router_z_loss_mlp": 0.34521484, + "step": 1703, + "time_per_iteration": 2.829838991165161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083551, + "balance_loss_mlp": 1.04974365, + "epoch": 0.3278183916891112, + "flos": 435831211008.0, + "grad_norm": 0.05938301584715095, + "language_loss": 0.82290888, + "learning_rate": 0.0007848995371844474, + "loss": 0.83374435, + "num_input_tokens_seen": 140765808, + "router_z_loss_mlp": 0.33837891, + "step": 1704, + "time_per_iteration": 2.498462200164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108751, + "balance_loss_mlp": 1.0532254, + "epoch": 0.3280107733743748, + "flos": 460883119104.0, + "grad_norm": 0.06015932024064871, + "language_loss": 0.80622214, + "learning_rate": 0.0007846434613977801, + "loss": 0.81709725, + "num_input_tokens_seen": 140830512, + "router_z_loss_mlp": 0.34326172, + "step": 1705, + "time_per_iteration": 2.4933369159698486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087045, + "balance_loss_mlp": 1.05330932, + "epoch": 0.3282031550596383, + "flos": 679018484736.0, + "grad_norm": 0.05558890685700398, + "language_loss": 0.78265798, + "learning_rate": 0.0007843872751027203, + "loss": 0.7935285, + "num_input_tokens_seen": 140902816, + "router_z_loss_mlp": 0.33764648, + "step": 1706, + "time_per_iteration": 2.81091046333313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090191, + "balance_loss_mlp": 1.05621648, + "epoch": 0.3283955367449019, + "flos": 544821023232.0, + "grad_norm": 0.10097233050810657, + "language_loss": 0.87312186, + "learning_rate": 0.0007841309783987287, + "loss": 0.88402379, + "num_input_tokens_seen": 140975488, + "router_z_loss_mlp": 0.34008789, + "step": 1707, + "time_per_iteration": 2.7456212043762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083118, + "balance_loss_mlp": 1.0490005, + "epoch": 0.32858791843016544, + "flos": 481017153024.0, + "grad_norm": 0.06288690811568091, + "language_loss": 0.89185357, + "learning_rate": 0.0007838745713853084, + "loss": 0.90268475, + "num_input_tokens_seen": 141043248, + "router_z_loss_mlp": 0.34155273, + "step": 1708, + "time_per_iteration": 2.5734565258026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086944, + "balance_loss_mlp": 1.0529933, + "epoch": 0.328780300115429, + "flos": 566529389568.0, + "grad_norm": 0.059735917623485235, + "language_loss": 0.84101981, + "learning_rate": 0.0007836180541620053, + "loss": 0.85188925, + "num_input_tokens_seen": 141119408, + "router_z_loss_mlp": 0.33984375, + "step": 1709, + "time_per_iteration": 2.6734848022460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082239, + "balance_loss_mlp": 1.04843152, + "epoch": 0.32897268180069256, + "flos": 475787948544.0, + "grad_norm": 0.06557165815913592, + "language_loss": 0.86666405, + "learning_rate": 0.0007833614268284082, + "loss": 0.87748647, + "num_input_tokens_seen": 141184112, + "router_z_loss_mlp": 0.33813477, + "step": 1710, + "time_per_iteration": 2.5004236698150635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109204, + "balance_loss_mlp": 1.07611382, + "epoch": 0.32916506348595614, + "flos": 1576517008896.0, + "grad_norm": 0.028486921929439343, + "language_loss": 0.74109769, + "learning_rate": 0.0007831046894841489, + "loss": 0.75201809, + "num_input_tokens_seen": 141414960, + "router_z_loss_mlp": 0.15917969, + "step": 1711, + "time_per_iteration": 4.857421159744263 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084065, + "balance_loss_mlp": 1.05011439, + "epoch": 0.3293574451712197, + "flos": 482646739968.0, + "grad_norm": 0.05383776069577274, + "language_loss": 0.78376174, + "learning_rate": 0.0007828478422289016, + "loss": 0.79460239, + "num_input_tokens_seen": 141485744, + "router_z_loss_mlp": 0.33984375, + "step": 1712, + "time_per_iteration": 2.5763661861419678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089524, + "balance_loss_mlp": 1.05438161, + "epoch": 0.32954982685648326, + "flos": 622266872832.0, + "grad_norm": 0.05220026625301518, + "language_loss": 0.89185119, + "learning_rate": 0.0007825908851623833, + "loss": 0.90274644, + "num_input_tokens_seen": 141560592, + "router_z_loss_mlp": 0.35205078, + "step": 1713, + "time_per_iteration": 2.7262768745422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081531, + "balance_loss_mlp": 1.04648352, + "epoch": 0.32974220854174685, + "flos": 544697367552.0, + "grad_norm": 0.06806070360888057, + "language_loss": 0.85360491, + "learning_rate": 0.0007823338183843533, + "loss": 0.86442018, + "num_input_tokens_seen": 141630400, + "router_z_loss_mlp": 0.35083008, + "step": 1714, + "time_per_iteration": 2.652278184890747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078166, + "balance_loss_mlp": 1.0447638, + "epoch": 0.3299345902270104, + "flos": 981740708352.0, + "grad_norm": 0.05603975865081876, + "language_loss": 0.8075726, + "learning_rate": 0.0007820766419946141, + "loss": 0.81835425, + "num_input_tokens_seen": 141721552, + "router_z_loss_mlp": 0.33422852, + "step": 1715, + "time_per_iteration": 3.282278537750244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087722, + "balance_loss_mlp": 1.07227242, + "epoch": 0.33012697191227397, + "flos": 1402857432576.0, + "grad_norm": 0.02753251532821737, + "language_loss": 0.7967248, + "learning_rate": 0.0007818193560930102, + "loss": 0.80760199, + "num_input_tokens_seen": 141956464, + "router_z_loss_mlp": 0.15429688, + "step": 1716, + "time_per_iteration": 4.925649881362915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081747, + "balance_loss_mlp": 1.04789162, + "epoch": 0.3303193535975375, + "flos": 504897781248.0, + "grad_norm": 0.09582479469105179, + "language_loss": 0.75968826, + "learning_rate": 0.0007815619607794288, + "loss": 0.77050573, + "num_input_tokens_seen": 142029552, + "router_z_loss_mlp": 0.33886719, + "step": 1717, + "time_per_iteration": 2.653355598449707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077451, + "balance_loss_mlp": 1.04423952, + "epoch": 0.3305117352828011, + "flos": 937602390528.0, + "grad_norm": 0.059316474336830904, + "language_loss": 0.8254683, + "learning_rate": 0.0007813044561538001, + "loss": 0.83624279, + "num_input_tokens_seen": 142117344, + "router_z_loss_mlp": 0.33227539, + "step": 1718, + "time_per_iteration": 3.1251535415649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084283, + "balance_loss_mlp": 1.05035567, + "epoch": 0.3307041169680646, + "flos": 721176597504.0, + "grad_norm": 0.08429030846847434, + "language_loss": 0.88411027, + "learning_rate": 0.0007810468423160958, + "loss": 0.89495313, + "num_input_tokens_seen": 142190096, + "router_z_loss_mlp": 0.33959961, + "step": 1719, + "time_per_iteration": 2.8598783016204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090007, + "balance_loss_mlp": 1.05760598, + "epoch": 0.3308964986533282, + "flos": 583315499520.0, + "grad_norm": 0.04547421634197757, + "language_loss": 0.81920642, + "learning_rate": 0.0007807891193663306, + "loss": 0.8301065, + "num_input_tokens_seen": 142265584, + "router_z_loss_mlp": 0.32397461, + "step": 1720, + "time_per_iteration": 2.7775802612304688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092448, + "balance_loss_mlp": 1.0582583, + "epoch": 0.33108888033859174, + "flos": 473340672000.0, + "grad_norm": 0.07591254280459368, + "language_loss": 0.82440275, + "learning_rate": 0.0007805312874045614, + "loss": 0.83532727, + "num_input_tokens_seen": 142330352, + "router_z_loss_mlp": 0.34228516, + "step": 1721, + "time_per_iteration": 2.5138351917266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100808, + "balance_loss_mlp": 1.06657076, + "epoch": 0.3312812620238553, + "flos": 385913659392.0, + "grad_norm": 0.08101052667778896, + "language_loss": 0.86919391, + "learning_rate": 0.0007802733465308874, + "loss": 0.88020205, + "num_input_tokens_seen": 142392208, + "router_z_loss_mlp": 0.34277344, + "step": 1722, + "time_per_iteration": 2.440809726715088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106056, + "balance_loss_mlp": 1.07074606, + "epoch": 0.3314736437091189, + "flos": 494292395520.0, + "grad_norm": 0.0567806329299034, + "language_loss": 0.8509872, + "learning_rate": 0.0007800152968454501, + "loss": 0.86204773, + "num_input_tokens_seen": 142462112, + "router_z_loss_mlp": 0.35375977, + "step": 1723, + "time_per_iteration": 2.61602520942688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090428, + "balance_loss_mlp": 1.056072, + "epoch": 0.33166602539438245, + "flos": 653346736128.0, + "grad_norm": 0.038882210578800376, + "language_loss": 0.90476918, + "learning_rate": 0.0007797571384484334, + "loss": 0.91567349, + "num_input_tokens_seen": 142539120, + "router_z_loss_mlp": 0.34399414, + "step": 1724, + "time_per_iteration": 2.857562780380249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090291, + "balance_loss_mlp": 1.05586314, + "epoch": 0.33185840707964603, + "flos": 520550489088.0, + "grad_norm": 0.04870849772261114, + "language_loss": 0.91599178, + "learning_rate": 0.0007794988714400633, + "loss": 0.92689478, + "num_input_tokens_seen": 142611520, + "router_z_loss_mlp": 0.34448242, + "step": 1725, + "time_per_iteration": 2.5884361267089844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097227, + "balance_loss_mlp": 1.06077266, + "epoch": 0.33205078876490957, + "flos": 436712919552.0, + "grad_norm": 0.05260760436426434, + "language_loss": 0.85199809, + "learning_rate": 0.0007792404959206079, + "loss": 0.86297035, + "num_input_tokens_seen": 142676064, + "router_z_loss_mlp": 0.36474609, + "step": 1726, + "time_per_iteration": 2.4855122566223145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095942, + "balance_loss_mlp": 1.05965447, + "epoch": 0.33224317045017315, + "flos": 768400971264.0, + "grad_norm": 0.052329818141719754, + "language_loss": 0.81527805, + "learning_rate": 0.0007789820119903774, + "loss": 0.82623744, + "num_input_tokens_seen": 142750944, + "router_z_loss_mlp": 0.36279297, + "step": 1727, + "time_per_iteration": 2.945405960083008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105991, + "balance_loss_mlp": 1.04579556, + "epoch": 0.3324355521354367, + "flos": 1465656090624.0, + "grad_norm": 0.02932642968329903, + "language_loss": 0.78492665, + "learning_rate": 0.0007787234197497242, + "loss": 0.79552573, + "num_input_tokens_seen": 142974032, + "router_z_loss_mlp": 0.14160156, + "step": 1728, + "time_per_iteration": 4.810296297073364 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084477, + "balance_loss_mlp": 1.04880977, + "epoch": 0.3326279338207003, + "flos": 496415195136.0, + "grad_norm": 0.05339720943334808, + "language_loss": 0.83919221, + "learning_rate": 0.0007784647192990428, + "loss": 0.85003698, + "num_input_tokens_seen": 143047280, + "router_z_loss_mlp": 0.35693359, + "step": 1729, + "time_per_iteration": 2.6848132610321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083753, + "balance_loss_mlp": 1.04844344, + "epoch": 0.33282031550596386, + "flos": 635600342016.0, + "grad_norm": 0.05212570885713578, + "language_loss": 0.80661625, + "learning_rate": 0.0007782059107387696, + "loss": 0.81745386, + "num_input_tokens_seen": 143124224, + "router_z_loss_mlp": 0.35351562, + "step": 1730, + "time_per_iteration": 2.874356269836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078294, + "balance_loss_mlp": 1.04329371, + "epoch": 0.3330126971912274, + "flos": 689210643456.0, + "grad_norm": 0.05636936917064103, + "language_loss": 0.88407743, + "learning_rate": 0.0007779469941693826, + "loss": 0.89486033, + "num_input_tokens_seen": 143194048, + "router_z_loss_mlp": 0.3503418, + "step": 1731, + "time_per_iteration": 2.7914862632751465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079351, + "balance_loss_mlp": 1.04511368, + "epoch": 0.333205078876491, + "flos": 566184563712.0, + "grad_norm": 0.05730145040609657, + "language_loss": 0.77017218, + "learning_rate": 0.0007776879696914029, + "loss": 0.78096569, + "num_input_tokens_seen": 143272976, + "router_z_loss_mlp": 0.34277344, + "step": 1732, + "time_per_iteration": 2.8158769607543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081, + "balance_loss_mlp": 1.04666734, + "epoch": 0.3333974605617545, + "flos": 640618550784.0, + "grad_norm": 0.044212495165629015, + "language_loss": 0.8903594, + "learning_rate": 0.000777428837405392, + "loss": 0.90116942, + "num_input_tokens_seen": 143346496, + "router_z_loss_mlp": 0.34375, + "step": 1733, + "time_per_iteration": 2.8417906761169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079717, + "balance_loss_mlp": 1.04536092, + "epoch": 0.3335898422470181, + "flos": 461597501952.0, + "grad_norm": 0.05109390766697835, + "language_loss": 0.87070495, + "learning_rate": 0.0007771695974119544, + "loss": 0.88150203, + "num_input_tokens_seen": 143410448, + "router_z_loss_mlp": 0.34399414, + "step": 1734, + "time_per_iteration": 2.4995057582855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078764, + "balance_loss_mlp": 1.04514742, + "epoch": 0.33378222393228163, + "flos": 852504791040.0, + "grad_norm": 0.05825672376588237, + "language_loss": 0.75576115, + "learning_rate": 0.0007769102498117359, + "loss": 0.76654887, + "num_input_tokens_seen": 143492416, + "router_z_loss_mlp": 0.33642578, + "step": 1735, + "time_per_iteration": 3.0868663787841797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083295, + "balance_loss_mlp": 1.04991651, + "epoch": 0.3339746056175452, + "flos": 954256080384.0, + "grad_norm": 0.05069255593645712, + "language_loss": 0.79858601, + "learning_rate": 0.000776650794705424, + "loss": 0.80941892, + "num_input_tokens_seen": 143590096, + "router_z_loss_mlp": 0.33398438, + "step": 1736, + "time_per_iteration": 3.2665328979492188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108809, + "balance_loss_mlp": 1.05387688, + "epoch": 0.33416698730280875, + "flos": 544559155200.0, + "grad_norm": 0.045819605067785145, + "language_loss": 0.82160866, + "learning_rate": 0.0007763912321937483, + "loss": 0.83248949, + "num_input_tokens_seen": 143663344, + "router_z_loss_mlp": 0.34228516, + "step": 1737, + "time_per_iteration": 2.677316665649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081816, + "balance_loss_mlp": 1.04798412, + "epoch": 0.33435936898807234, + "flos": 1013652817920.0, + "grad_norm": 0.053421386657792044, + "language_loss": 0.82471478, + "learning_rate": 0.0007761315623774799, + "loss": 0.8355329, + "num_input_tokens_seen": 143753072, + "router_z_loss_mlp": 0.33862305, + "step": 1738, + "time_per_iteration": 3.4182283878326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089571, + "balance_loss_mlp": 1.05554879, + "epoch": 0.3345517506733359, + "flos": 614935217664.0, + "grad_norm": 0.051536505858366714, + "language_loss": 0.87671852, + "learning_rate": 0.0007758717853574313, + "loss": 0.88761419, + "num_input_tokens_seen": 143827280, + "router_z_loss_mlp": 0.34057617, + "step": 1739, + "time_per_iteration": 2.7348380088806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084561, + "balance_loss_mlp": 1.05125391, + "epoch": 0.33474413235859946, + "flos": 494350622208.0, + "grad_norm": 0.06141180611747274, + "language_loss": 0.9002257, + "learning_rate": 0.0007756119012344571, + "loss": 0.91107136, + "num_input_tokens_seen": 143895072, + "router_z_loss_mlp": 0.33325195, + "step": 1740, + "time_per_iteration": 2.536121129989624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091135, + "balance_loss_mlp": 1.05754209, + "epoch": 0.33493651404386304, + "flos": 628105743360.0, + "grad_norm": 0.06662069566578578, + "language_loss": 0.84404671, + "learning_rate": 0.0007753519101094535, + "loss": 0.85495806, + "num_input_tokens_seen": 143965728, + "router_z_loss_mlp": 0.33618164, + "step": 1741, + "time_per_iteration": 2.753371238708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082833, + "balance_loss_mlp": 1.04945421, + "epoch": 0.3351288957291266, + "flos": 513474909696.0, + "grad_norm": 0.05750412427252262, + "language_loss": 0.86366677, + "learning_rate": 0.0007750918120833575, + "loss": 0.87449515, + "num_input_tokens_seen": 144030272, + "router_z_loss_mlp": 0.33398438, + "step": 1742, + "time_per_iteration": 2.56093168258667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082276, + "balance_loss_mlp": 1.05037546, + "epoch": 0.33532127741439016, + "flos": 647008860672.0, + "grad_norm": 0.0676260973392943, + "language_loss": 0.87342101, + "learning_rate": 0.0007748316072571485, + "loss": 0.88424373, + "num_input_tokens_seen": 144104048, + "router_z_loss_mlp": 0.31884766, + "step": 1743, + "time_per_iteration": 2.7759556770324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086123, + "balance_loss_mlp": 1.05193388, + "epoch": 0.3355136590996537, + "flos": 768134721024.0, + "grad_norm": 0.047185436483198326, + "language_loss": 0.79306734, + "learning_rate": 0.0007745712957318467, + "loss": 0.80392861, + "num_input_tokens_seen": 144180432, + "router_z_loss_mlp": 0.34228516, + "step": 1744, + "time_per_iteration": 2.959686756134033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085284, + "balance_loss_mlp": 1.05119014, + "epoch": 0.3357060407849173, + "flos": 595259490816.0, + "grad_norm": 0.046948111550021425, + "language_loss": 0.86506951, + "learning_rate": 0.0007743108776085141, + "loss": 0.87592232, + "num_input_tokens_seen": 144258704, + "router_z_loss_mlp": 0.34106445, + "step": 1745, + "time_per_iteration": 2.7391204833984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089169, + "balance_loss_mlp": 1.05462217, + "epoch": 0.3358984224701808, + "flos": 598288730112.0, + "grad_norm": 0.04983419543630797, + "language_loss": 0.82728243, + "learning_rate": 0.0007740503529882543, + "loss": 0.8381741, + "num_input_tokens_seen": 144335104, + "router_z_loss_mlp": 0.34594727, + "step": 1746, + "time_per_iteration": 2.788041114807129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108953, + "balance_loss_mlp": 1.05474496, + "epoch": 0.3360908041554444, + "flos": 578055771648.0, + "grad_norm": 0.05677254755827829, + "language_loss": 0.91252941, + "learning_rate": 0.0007737897219722114, + "loss": 0.92342472, + "num_input_tokens_seen": 144402912, + "router_z_loss_mlp": 0.34790039, + "step": 1747, + "time_per_iteration": 2.6752376556396484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083715, + "balance_loss_mlp": 1.04945374, + "epoch": 0.336283185840708, + "flos": 513332315136.0, + "grad_norm": 0.05427874766165502, + "language_loss": 0.81146061, + "learning_rate": 0.0007735289846615716, + "loss": 0.82229781, + "num_input_tokens_seen": 144475328, + "router_z_loss_mlp": 0.34301758, + "step": 1748, + "time_per_iteration": 2.670315742492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083181, + "balance_loss_mlp": 1.04984999, + "epoch": 0.3364755675259715, + "flos": 524716102656.0, + "grad_norm": 0.05445380235157479, + "language_loss": 0.81899059, + "learning_rate": 0.0007732681411575621, + "loss": 0.82982242, + "num_input_tokens_seen": 144548288, + "router_z_loss_mlp": 0.33349609, + "step": 1749, + "time_per_iteration": 2.644740104675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079502, + "balance_loss_mlp": 1.04567027, + "epoch": 0.3366679492112351, + "flos": 554594162688.0, + "grad_norm": 0.05291201013717534, + "language_loss": 0.87517959, + "learning_rate": 0.0007730071915614514, + "loss": 0.88597459, + "num_input_tokens_seen": 144619488, + "router_z_loss_mlp": 0.33862305, + "step": 1750, + "time_per_iteration": 2.6605777740478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082694, + "balance_loss_mlp": 1.04874277, + "epoch": 0.33686033089649864, + "flos": 427051851264.0, + "grad_norm": 0.07867660779661921, + "language_loss": 0.88976741, + "learning_rate": 0.0007727461359745489, + "loss": 0.90059435, + "num_input_tokens_seen": 144682560, + "router_z_loss_mlp": 0.33984375, + "step": 1751, + "time_per_iteration": 2.4562768936157227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082471, + "balance_loss_mlp": 1.04987907, + "epoch": 0.3370527125817622, + "flos": 541452750336.0, + "grad_norm": 0.05472309390748721, + "language_loss": 0.86156446, + "learning_rate": 0.0007724849744982056, + "loss": 0.87238914, + "num_input_tokens_seen": 144753328, + "router_z_loss_mlp": 0.32592773, + "step": 1752, + "time_per_iteration": 2.683575391769409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086284, + "balance_loss_mlp": 1.05295336, + "epoch": 0.33724509426702576, + "flos": 541836864000.0, + "grad_norm": 0.052181206472060114, + "language_loss": 0.81578106, + "learning_rate": 0.0007722237072338131, + "loss": 0.82664388, + "num_input_tokens_seen": 144827312, + "router_z_loss_mlp": 0.33349609, + "step": 1753, + "time_per_iteration": 2.7059788703918457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108949, + "balance_loss_mlp": 1.05563486, + "epoch": 0.33743747595228935, + "flos": 472557888000.0, + "grad_norm": 0.063588606701447, + "language_loss": 0.85402888, + "learning_rate": 0.0007719623342828046, + "loss": 0.86492383, + "num_input_tokens_seen": 144893488, + "router_z_loss_mlp": 0.33886719, + "step": 1754, + "time_per_iteration": 2.5117459297180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090728, + "balance_loss_mlp": 1.05708706, + "epoch": 0.33762985763755293, + "flos": 469564964352.0, + "grad_norm": 0.05602573096673387, + "language_loss": 0.84115714, + "learning_rate": 0.000771700855746654, + "loss": 0.85206437, + "num_input_tokens_seen": 144961152, + "router_z_loss_mlp": 0.33666992, + "step": 1755, + "time_per_iteration": 2.5685064792633057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085423, + "balance_loss_mlp": 1.05214, + "epoch": 0.33782223932281646, + "flos": 492002270208.0, + "grad_norm": 0.05352941428578995, + "language_loss": 0.88329422, + "learning_rate": 0.0007714392717268763, + "loss": 0.89414847, + "num_input_tokens_seen": 145030576, + "router_z_loss_mlp": 0.33300781, + "step": 1756, + "time_per_iteration": 2.568432569503784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084365, + "balance_loss_mlp": 1.04981852, + "epoch": 0.33801462100808005, + "flos": 464827562496.0, + "grad_norm": 0.051807056092833426, + "language_loss": 0.86368155, + "learning_rate": 0.0007711775823250273, + "loss": 0.87452519, + "num_input_tokens_seen": 145095648, + "router_z_loss_mlp": 0.34594727, + "step": 1757, + "time_per_iteration": 2.5542263984680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010902, + "balance_loss_mlp": 1.05660665, + "epoch": 0.3382070026933436, + "flos": 795319603200.0, + "grad_norm": 0.05510084593487172, + "language_loss": 0.83019066, + "learning_rate": 0.0007709157876427039, + "loss": 0.84109271, + "num_input_tokens_seen": 145181248, + "router_z_loss_mlp": 0.33618164, + "step": 1758, + "time_per_iteration": 3.07852840423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082149, + "balance_loss_mlp": 1.04903245, + "epoch": 0.33839938437860717, + "flos": 508181686272.0, + "grad_norm": 0.0524958838474987, + "language_loss": 0.85602981, + "learning_rate": 0.0007706538877815439, + "loss": 0.86685127, + "num_input_tokens_seen": 145252944, + "router_z_loss_mlp": 0.33129883, + "step": 1759, + "time_per_iteration": 2.6002085208892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082716, + "balance_loss_mlp": 1.05021918, + "epoch": 0.3385917660638707, + "flos": 483986755584.0, + "grad_norm": 0.05079207863068971, + "language_loss": 0.83150595, + "learning_rate": 0.0007703918828432259, + "loss": 0.84233308, + "num_input_tokens_seen": 145323168, + "router_z_loss_mlp": 0.32495117, + "step": 1760, + "time_per_iteration": 2.5961215496063232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086286, + "balance_loss_mlp": 1.05297899, + "epoch": 0.3387841477491343, + "flos": 545071306752.0, + "grad_norm": 0.0542286668270813, + "language_loss": 0.89021361, + "learning_rate": 0.000770129772929469, + "loss": 0.9010765, + "num_input_tokens_seen": 145395776, + "router_z_loss_mlp": 0.33325195, + "step": 1761, + "time_per_iteration": 2.6393394470214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076289, + "balance_loss_mlp": 1.04264784, + "epoch": 0.3389765294343978, + "flos": 719487373824.0, + "grad_norm": 0.057381721603975526, + "language_loss": 0.88803625, + "learning_rate": 0.0007698675581420334, + "loss": 0.89879912, + "num_input_tokens_seen": 145470576, + "router_z_loss_mlp": 0.33666992, + "step": 1762, + "time_per_iteration": 2.8959014415740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073065, + "balance_loss_mlp": 1.03968656, + "epoch": 0.3391689111196614, + "flos": 699596269056.0, + "grad_norm": 0.05381480837735757, + "language_loss": 0.78922743, + "learning_rate": 0.0007696052385827199, + "loss": 0.79995811, + "num_input_tokens_seen": 145548896, + "router_z_loss_mlp": 0.33398438, + "step": 1763, + "time_per_iteration": 2.9110584259033203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068767, + "balance_loss_mlp": 1.03519773, + "epoch": 0.339361292804925, + "flos": 626806425600.0, + "grad_norm": 0.05521588721088573, + "language_loss": 0.78407156, + "learning_rate": 0.00076934281435337, + "loss": 0.79475927, + "num_input_tokens_seen": 145617136, + "router_z_loss_mlp": 0.33569336, + "step": 1764, + "time_per_iteration": 2.7673115730285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073935, + "balance_loss_mlp": 1.04043674, + "epoch": 0.33955367449018853, + "flos": 609302960640.0, + "grad_norm": 0.0615155635578628, + "language_loss": 0.85995364, + "learning_rate": 0.0007690802855558658, + "loss": 0.87069303, + "num_input_tokens_seen": 145696416, + "router_z_loss_mlp": 0.33520508, + "step": 1765, + "time_per_iteration": 2.871255397796631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083714, + "balance_loss_mlp": 1.07131636, + "epoch": 0.3397460561754521, + "flos": 1452494177280.0, + "grad_norm": 0.03113174858532202, + "language_loss": 0.76374954, + "learning_rate": 0.0007688176522921302, + "loss": 0.77458668, + "num_input_tokens_seen": 145919680, + "router_z_loss_mlp": 0.12402344, + "step": 1766, + "time_per_iteration": 4.906975746154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089831, + "balance_loss_mlp": 1.05485463, + "epoch": 0.33993843786071565, + "flos": 487068429312.0, + "grad_norm": 0.059784397062932884, + "language_loss": 0.89060128, + "learning_rate": 0.0007685549146641262, + "loss": 0.90149957, + "num_input_tokens_seen": 145984272, + "router_z_loss_mlp": 0.35009766, + "step": 1767, + "time_per_iteration": 2.5172948837280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081834, + "balance_loss_mlp": 1.04683375, + "epoch": 0.34013081954597923, + "flos": 417115768320.0, + "grad_norm": 0.05470212710373979, + "language_loss": 0.88085568, + "learning_rate": 0.0007682920727738579, + "loss": 0.89167398, + "num_input_tokens_seen": 146047248, + "router_z_loss_mlp": 0.35058594, + "step": 1768, + "time_per_iteration": 2.4423539638519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084324, + "balance_loss_mlp": 1.04939604, + "epoch": 0.34032320123124277, + "flos": 437293472256.0, + "grad_norm": 0.06228189549734304, + "language_loss": 0.84428132, + "learning_rate": 0.000768029126723369, + "loss": 0.85512453, + "num_input_tokens_seen": 146111872, + "router_z_loss_mlp": 0.34960938, + "step": 1769, + "time_per_iteration": 2.5054280757904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082733, + "balance_loss_mlp": 1.04811513, + "epoch": 0.34051558291650635, + "flos": 457353312768.0, + "grad_norm": 0.058774755629116764, + "language_loss": 0.81489038, + "learning_rate": 0.0007677660766147447, + "loss": 0.82571769, + "num_input_tokens_seen": 146172608, + "router_z_loss_mlp": 0.34667969, + "step": 1770, + "time_per_iteration": 2.524327039718628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01030223, + "balance_loss_mlp": 1.01844561, + "epoch": 0.3407079646017699, + "flos": 1558029938688.0, + "grad_norm": 0.017684799672329513, + "language_loss": 0.72470945, + "learning_rate": 0.0007675029225501102, + "loss": 0.7350117, + "num_input_tokens_seen": 146413584, + "router_z_loss_mlp": 0.11767578, + "step": 1771, + "time_per_iteration": 4.924427032470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081114, + "balance_loss_mlp": 1.04666233, + "epoch": 0.3409003462870335, + "flos": 492312190464.0, + "grad_norm": 0.06375677891517043, + "language_loss": 0.79619604, + "learning_rate": 0.0007672396646316306, + "loss": 0.80700719, + "num_input_tokens_seen": 146476992, + "router_z_loss_mlp": 0.3449707, + "step": 1772, + "time_per_iteration": 2.5239012241363525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081305, + "balance_loss_mlp": 1.04589987, + "epoch": 0.34109272797229706, + "flos": 808145303040.0, + "grad_norm": 0.06003817873980187, + "language_loss": 0.80518734, + "learning_rate": 0.000766976302961512, + "loss": 0.81600046, + "num_input_tokens_seen": 146552848, + "router_z_loss_mlp": 0.35424805, + "step": 1773, + "time_per_iteration": 2.9730074405670166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083157, + "balance_loss_mlp": 1.04834807, + "epoch": 0.3412851096575606, + "flos": 469903997952.0, + "grad_norm": 0.05958263274361502, + "language_loss": 0.81420594, + "learning_rate": 0.0007667128376420003, + "loss": 0.82503754, + "num_input_tokens_seen": 146617504, + "router_z_loss_mlp": 0.34863281, + "step": 1774, + "time_per_iteration": 2.521329879760742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073378, + "balance_loss_mlp": 1.03842556, + "epoch": 0.3414774913428242, + "flos": 595402085376.0, + "grad_norm": 0.09709010294240925, + "language_loss": 0.84563607, + "learning_rate": 0.0007664492687753817, + "loss": 0.85636985, + "num_input_tokens_seen": 146691568, + "router_z_loss_mlp": 0.34985352, + "step": 1775, + "time_per_iteration": 2.6744766235351562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072901, + "balance_loss_mlp": 1.03976059, + "epoch": 0.3416698730280877, + "flos": 527202667008.0, + "grad_norm": 0.05030413358353647, + "language_loss": 0.81513566, + "learning_rate": 0.000766185596463983, + "loss": 0.82586467, + "num_input_tokens_seen": 146764208, + "router_z_loss_mlp": 0.33154297, + "step": 1776, + "time_per_iteration": 2.6050221920013428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073672, + "balance_loss_mlp": 1.03962612, + "epoch": 0.3418622547133513, + "flos": 874272794112.0, + "grad_norm": 0.05039515754698922, + "language_loss": 0.76683038, + "learning_rate": 0.0007659218208101706, + "loss": 0.77756709, + "num_input_tokens_seen": 146847744, + "router_z_loss_mlp": 0.34082031, + "step": 1777, + "time_per_iteration": 3.0900516510009766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079189, + "balance_loss_mlp": 1.04504752, + "epoch": 0.34205463639861483, + "flos": 603462680064.0, + "grad_norm": 0.04915159817243754, + "language_loss": 0.84680861, + "learning_rate": 0.0007656579419163515, + "loss": 0.85760045, + "num_input_tokens_seen": 146918336, + "router_z_loss_mlp": 0.34179688, + "step": 1778, + "time_per_iteration": 2.7328884601593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081812, + "balance_loss_mlp": 1.04774237, + "epoch": 0.3422470180838784, + "flos": 463547183616.0, + "grad_norm": 0.05230649511498847, + "language_loss": 0.76939148, + "learning_rate": 0.0007653939598849724, + "loss": 0.7802096, + "num_input_tokens_seen": 146982496, + "router_z_loss_mlp": 0.34106445, + "step": 1779, + "time_per_iteration": 2.5020573139190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102719, + "balance_loss_mlp": 1.01441097, + "epoch": 0.34243939976914195, + "flos": 1585584377856.0, + "grad_norm": 0.019842751190116498, + "language_loss": 0.82880205, + "learning_rate": 0.0007651298748185204, + "loss": 0.83907396, + "num_input_tokens_seen": 147213600, + "router_z_loss_mlp": 0.12792969, + "step": 1780, + "time_per_iteration": 4.919352054595947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107992, + "balance_loss_mlp": 1.04656482, + "epoch": 0.34263178145440554, + "flos": 872662146048.0, + "grad_norm": 0.0514393238831889, + "language_loss": 0.80344206, + "learning_rate": 0.000764865686819522, + "loss": 0.81424129, + "num_input_tokens_seen": 147287664, + "router_z_loss_mlp": 0.33374023, + "step": 1781, + "time_per_iteration": 3.059682846069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089318, + "balance_loss_mlp": 1.05546236, + "epoch": 0.3428241631396691, + "flos": 506630674944.0, + "grad_norm": 0.04318417455303755, + "language_loss": 0.85701579, + "learning_rate": 0.0007646013959905449, + "loss": 0.86790895, + "num_input_tokens_seen": 147356800, + "router_z_loss_mlp": 0.33886719, + "step": 1782, + "time_per_iteration": 2.572772741317749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085097, + "balance_loss_mlp": 1.05162311, + "epoch": 0.34301654482493266, + "flos": 879669324288.0, + "grad_norm": 0.05640606275212692, + "language_loss": 0.80626374, + "learning_rate": 0.0007643370024341949, + "loss": 0.81711471, + "num_input_tokens_seen": 147432496, + "router_z_loss_mlp": 0.33496094, + "step": 1783, + "time_per_iteration": 3.0805578231811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089012, + "balance_loss_mlp": 1.05472708, + "epoch": 0.34320892651019624, + "flos": 431537559552.0, + "grad_norm": 0.05116039291223259, + "language_loss": 0.82947731, + "learning_rate": 0.0007640725062531195, + "loss": 0.84036732, + "num_input_tokens_seen": 147495856, + "router_z_loss_mlp": 0.34326172, + "step": 1784, + "time_per_iteration": 2.497859239578247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083577, + "balance_loss_mlp": 1.05010295, + "epoch": 0.3434013081954598, + "flos": 463404589056.0, + "grad_norm": 0.06763804466989645, + "language_loss": 0.86272931, + "learning_rate": 0.0007638079075500047, + "loss": 0.87356508, + "num_input_tokens_seen": 147559632, + "router_z_loss_mlp": 0.33496094, + "step": 1785, + "time_per_iteration": 2.516842842102051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017655, + "balance_loss_mlp": 1.0058769, + "epoch": 0.34359368988072336, + "flos": 1556499276288.0, + "grad_norm": 0.01279941843938601, + "language_loss": 0.75180668, + "learning_rate": 0.0007635432064275772, + "loss": 0.76198322, + "num_input_tokens_seen": 147794576, + "router_z_loss_mlp": 0.11767578, + "step": 1786, + "time_per_iteration": 4.979317665100098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077073, + "balance_loss_mlp": 1.04417157, + "epoch": 0.3437860715659869, + "flos": 495267236352.0, + "grad_norm": 0.04590480874587016, + "language_loss": 0.83035767, + "learning_rate": 0.0007632784029886026, + "loss": 0.84112841, + "num_input_tokens_seen": 147866960, + "router_z_loss_mlp": 0.32910156, + "step": 1787, + "time_per_iteration": 2.6075103282928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079505, + "balance_loss_mlp": 1.04617453, + "epoch": 0.3439784532512505, + "flos": 717942154752.0, + "grad_norm": 0.04559278353066439, + "language_loss": 0.85611933, + "learning_rate": 0.0007630134973358873, + "loss": 0.86691439, + "num_input_tokens_seen": 147947808, + "router_z_loss_mlp": 0.33349609, + "step": 1788, + "time_per_iteration": 2.917405366897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086657, + "balance_loss_mlp": 1.05327868, + "epoch": 0.34417083493651407, + "flos": 565598218752.0, + "grad_norm": 0.05301353071730806, + "language_loss": 0.86864436, + "learning_rate": 0.0007627484895722763, + "loss": 0.879511, + "num_input_tokens_seen": 148015936, + "router_z_loss_mlp": 0.33398438, + "step": 1789, + "time_per_iteration": 2.6353273391723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081939, + "balance_loss_mlp": 1.04834569, + "epoch": 0.3443632166217776, + "flos": 795988905984.0, + "grad_norm": 0.057022653970397005, + "language_loss": 0.80155563, + "learning_rate": 0.0007624833798006552, + "loss": 0.81237495, + "num_input_tokens_seen": 148099776, + "router_z_loss_mlp": 0.3359375, + "step": 1790, + "time_per_iteration": 3.039126396179199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090848, + "balance_loss_mlp": 1.05692101, + "epoch": 0.3445555983070412, + "flos": 569045067264.0, + "grad_norm": 0.05940117534987587, + "language_loss": 0.84113955, + "learning_rate": 0.0007622181681239483, + "loss": 0.85204804, + "num_input_tokens_seen": 148169616, + "router_z_loss_mlp": 0.33935547, + "step": 1791, + "time_per_iteration": 2.6392083168029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083434, + "balance_loss_mlp": 1.04903054, + "epoch": 0.3447479799923047, + "flos": 568524151296.0, + "grad_norm": 0.04492792711883196, + "language_loss": 0.84501636, + "learning_rate": 0.0007619528546451202, + "loss": 0.8558507, + "num_input_tokens_seen": 148247824, + "router_z_loss_mlp": 0.34448242, + "step": 1792, + "time_per_iteration": 2.776982069015503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080197, + "balance_loss_mlp": 1.04708052, + "epoch": 0.3449403616775683, + "flos": 967323299328.0, + "grad_norm": 0.05878857203246004, + "language_loss": 0.8358798, + "learning_rate": 0.0007616874394671745, + "loss": 0.84668171, + "num_input_tokens_seen": 148333040, + "router_z_loss_mlp": 0.33129883, + "step": 1793, + "time_per_iteration": 3.3427693843841553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074615, + "balance_loss_mlp": 1.04128361, + "epoch": 0.34513274336283184, + "flos": 568340858880.0, + "grad_norm": 0.05893035372227358, + "language_loss": 0.84961653, + "learning_rate": 0.0007614219226931547, + "loss": 0.86036265, + "num_input_tokens_seen": 148401840, + "router_z_loss_mlp": 0.33349609, + "step": 1794, + "time_per_iteration": 2.6591315269470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070169, + "balance_loss_mlp": 1.03783977, + "epoch": 0.3453251250480954, + "flos": 460715793408.0, + "grad_norm": 0.06432823181520617, + "language_loss": 0.84724808, + "learning_rate": 0.0007611563044261435, + "loss": 0.85794979, + "num_input_tokens_seen": 148466576, + "router_z_loss_mlp": 0.32324219, + "step": 1795, + "time_per_iteration": 2.51755690574646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078106, + "balance_loss_mlp": 1.0443697, + "epoch": 0.34551750673335896, + "flos": 415397431296.0, + "grad_norm": 0.0640589434438139, + "language_loss": 0.87120652, + "learning_rate": 0.0007608905847692631, + "loss": 0.88198757, + "num_input_tokens_seen": 148530016, + "router_z_loss_mlp": 0.33764648, + "step": 1796, + "time_per_iteration": 2.47190260887146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074227, + "balance_loss_mlp": 1.04103947, + "epoch": 0.34570988841862255, + "flos": 587540749824.0, + "grad_norm": 0.04642061059617041, + "language_loss": 0.86689866, + "learning_rate": 0.0007606247638256749, + "loss": 0.8776409, + "num_input_tokens_seen": 148610064, + "router_z_loss_mlp": 0.33203125, + "step": 1797, + "time_per_iteration": 2.956444025039673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094984, + "balance_loss_mlp": 1.08373046, + "epoch": 0.34590227010388613, + "flos": 1566835439616.0, + "grad_norm": 0.041839887126655914, + "language_loss": 0.78170294, + "learning_rate": 0.0007603588416985798, + "loss": 0.79265279, + "num_input_tokens_seen": 148835872, + "router_z_loss_mlp": 0.11230469, + "step": 1798, + "time_per_iteration": 4.9134039878845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064584, + "balance_loss_mlp": 1.05352104, + "epoch": 0.34609465178914967, + "flos": 1536950177280.0, + "grad_norm": 0.029939636480755576, + "language_loss": 0.79327202, + "learning_rate": 0.0007600928184912179, + "loss": 0.80391788, + "num_input_tokens_seen": 149066864, + "router_z_loss_mlp": 0.11083984, + "step": 1799, + "time_per_iteration": 4.743322849273682 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083924, + "balance_loss_mlp": 1.05054486, + "epoch": 0.34628703347441325, + "flos": 609075998208.0, + "grad_norm": 0.0564087129204809, + "language_loss": 0.85731971, + "learning_rate": 0.0007598266943068686, + "loss": 0.86815894, + "num_input_tokens_seen": 149141600, + "router_z_loss_mlp": 0.33398438, + "step": 1800, + "time_per_iteration": 2.7374043464660645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077797, + "balance_loss_mlp": 1.04603946, + "epoch": 0.3464794151596768, + "flos": 473084596224.0, + "grad_norm": 0.06346922489791823, + "language_loss": 0.83911705, + "learning_rate": 0.0007595604692488507, + "loss": 0.849895, + "num_input_tokens_seen": 149205888, + "router_z_loss_mlp": 0.31738281, + "step": 1801, + "time_per_iteration": 2.5296249389648438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074116, + "balance_loss_mlp": 1.04147625, + "epoch": 0.34667179684494037, + "flos": 605397805056.0, + "grad_norm": 0.05750507090521113, + "language_loss": 0.83014846, + "learning_rate": 0.0007592941434205215, + "loss": 0.84088963, + "num_input_tokens_seen": 149281280, + "router_z_loss_mlp": 0.32641602, + "step": 1802, + "time_per_iteration": 2.758260488510132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017015, + "balance_loss_mlp": 1.00628662, + "epoch": 0.3468641785302039, + "flos": 1564053511680.0, + "grad_norm": 0.014489769518708178, + "language_loss": 0.73571062, + "learning_rate": 0.0007590277169252782, + "loss": 0.74588072, + "num_input_tokens_seen": 149525008, + "router_z_loss_mlp": 0.10742188, + "step": 1803, + "time_per_iteration": 5.078529119491577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069541, + "balance_loss_mlp": 1.0363059, + "epoch": 0.3470565602154675, + "flos": 906902258688.0, + "grad_norm": 0.0666829693597375, + "language_loss": 0.79937375, + "learning_rate": 0.0007587611898665566, + "loss": 0.81006914, + "num_input_tokens_seen": 149600624, + "router_z_loss_mlp": 0.33251953, + "step": 1804, + "time_per_iteration": 3.05087947845459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078664, + "balance_loss_mlp": 1.04621565, + "epoch": 0.347248941900731, + "flos": 638613614592.0, + "grad_norm": 0.050247612363814816, + "language_loss": 0.8218019, + "learning_rate": 0.0007584945623478315, + "loss": 0.83258855, + "num_input_tokens_seen": 149674224, + "router_z_loss_mlp": 0.32446289, + "step": 1805, + "time_per_iteration": 2.8188822269439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071735, + "balance_loss_mlp": 1.03940511, + "epoch": 0.3474413235859946, + "flos": 847009336320.0, + "grad_norm": 0.06830759319763476, + "language_loss": 0.81376302, + "learning_rate": 0.000758227834472617, + "loss": 0.82448041, + "num_input_tokens_seen": 149758688, + "router_z_loss_mlp": 0.32324219, + "step": 1806, + "time_per_iteration": 3.049736976623535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080854, + "balance_loss_mlp": 1.04771423, + "epoch": 0.3476337052712582, + "flos": 515395478016.0, + "grad_norm": 0.0580200838122141, + "language_loss": 0.77365351, + "learning_rate": 0.0007579610063444664, + "loss": 0.78446203, + "num_input_tokens_seen": 149831648, + "router_z_loss_mlp": 0.33154297, + "step": 1807, + "time_per_iteration": 2.768986701965332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072697, + "balance_loss_mlp": 1.03993857, + "epoch": 0.34782608695652173, + "flos": 913161558528.0, + "grad_norm": 0.05804810611861273, + "language_loss": 0.8735044, + "learning_rate": 0.0007576940780669712, + "loss": 0.88423139, + "num_input_tokens_seen": 149919440, + "router_z_loss_mlp": 0.32763672, + "step": 1808, + "time_per_iteration": 3.200984477996826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073414, + "balance_loss_mlp": 1.04041636, + "epoch": 0.3480184686417853, + "flos": 773374099968.0, + "grad_norm": 0.05336970886803796, + "language_loss": 0.84611619, + "learning_rate": 0.0007574270497437624, + "loss": 0.85685027, + "num_input_tokens_seen": 150001632, + "router_z_loss_mlp": 0.33007812, + "step": 1809, + "time_per_iteration": 2.9432260990142822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069124, + "balance_loss_mlp": 1.03619814, + "epoch": 0.34821085032704885, + "flos": 576549840384.0, + "grad_norm": 0.04930975616190813, + "language_loss": 0.87883413, + "learning_rate": 0.000757159921478509, + "loss": 0.88952535, + "num_input_tokens_seen": 150077552, + "router_z_loss_mlp": 0.3293457, + "step": 1810, + "time_per_iteration": 2.769669771194458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079936, + "balance_loss_mlp": 1.06887364, + "epoch": 0.34840323201231244, + "flos": 1524176911872.0, + "grad_norm": 0.03214902088053901, + "language_loss": 0.74450636, + "learning_rate": 0.0007568926933749201, + "loss": 0.75530577, + "num_input_tokens_seen": 150295328, + "router_z_loss_mlp": 0.11083984, + "step": 1811, + "time_per_iteration": 4.764174222946167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107769, + "balance_loss_mlp": 1.04469275, + "epoch": 0.34859561369757597, + "flos": 508910625792.0, + "grad_norm": 0.059132347701423886, + "language_loss": 0.87255216, + "learning_rate": 0.0007566253655367423, + "loss": 0.88332909, + "num_input_tokens_seen": 150360496, + "router_z_loss_mlp": 0.33007812, + "step": 1812, + "time_per_iteration": 2.578930616378784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073116, + "balance_loss_mlp": 1.04014218, + "epoch": 0.34878799538283956, + "flos": 548390117376.0, + "grad_norm": 0.051501554075800156, + "language_loss": 0.89574003, + "learning_rate": 0.000756357938067762, + "loss": 0.90647119, + "num_input_tokens_seen": 150432064, + "router_z_loss_mlp": 0.32983398, + "step": 1813, + "time_per_iteration": 2.6508982181549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079394, + "balance_loss_mlp": 1.04673076, + "epoch": 0.34898037706810314, + "flos": 983251021824.0, + "grad_norm": 0.051360492330316726, + "language_loss": 0.82609868, + "learning_rate": 0.0007560904110718033, + "loss": 0.8368926, + "num_input_tokens_seen": 150512176, + "router_z_loss_mlp": 0.32666016, + "step": 1814, + "time_per_iteration": 3.236894130706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075398, + "balance_loss_mlp": 1.04185271, + "epoch": 0.3491727587533667, + "flos": 681298435584.0, + "grad_norm": 0.05446392192761228, + "language_loss": 0.83478653, + "learning_rate": 0.0007558227846527297, + "loss": 0.84554052, + "num_input_tokens_seen": 150586416, + "router_z_loss_mlp": 0.33569336, + "step": 1815, + "time_per_iteration": 2.8674044609069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074987, + "balance_loss_mlp": 1.04175162, + "epoch": 0.34936514043863026, + "flos": 393811310592.0, + "grad_norm": 0.0691488486506015, + "language_loss": 0.83195454, + "learning_rate": 0.0007555550589144429, + "loss": 0.84270442, + "num_input_tokens_seen": 150648944, + "router_z_loss_mlp": 0.33251953, + "step": 1816, + "time_per_iteration": 2.421494722366333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071292, + "balance_loss_mlp": 1.03917694, + "epoch": 0.3495575221238938, + "flos": 461120256000.0, + "grad_norm": 0.07868701205222765, + "language_loss": 0.8463372, + "learning_rate": 0.000755287233960883, + "loss": 0.85705012, + "num_input_tokens_seen": 150717200, + "router_z_loss_mlp": 0.32104492, + "step": 1817, + "time_per_iteration": 2.54315185546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072544, + "balance_loss_mlp": 1.04023862, + "epoch": 0.3497499038091574, + "flos": 723859600896.0, + "grad_norm": 0.06602653795060065, + "language_loss": 0.77636009, + "learning_rate": 0.0007550193098960292, + "loss": 0.78708553, + "num_input_tokens_seen": 150790368, + "router_z_loss_mlp": 0.32299805, + "step": 1818, + "time_per_iteration": 2.848236560821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076763, + "balance_loss_mlp": 1.04452837, + "epoch": 0.3499422854944209, + "flos": 827364132864.0, + "grad_norm": 0.049816715297611704, + "language_loss": 0.86387616, + "learning_rate": 0.0007547512868238988, + "loss": 0.8746438, + "num_input_tokens_seen": 150879872, + "router_z_loss_mlp": 0.32226562, + "step": 1819, + "time_per_iteration": 3.140552043914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076553, + "balance_loss_mlp": 1.0441277, + "epoch": 0.3501346671796845, + "flos": 493214247936.0, + "grad_norm": 0.049810070694169546, + "language_loss": 0.83196282, + "learning_rate": 0.0007544831648485473, + "loss": 0.84272838, + "num_input_tokens_seen": 150953712, + "router_z_loss_mlp": 0.32421875, + "step": 1820, + "time_per_iteration": 2.7085797786712646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074414, + "balance_loss_mlp": 1.04179859, + "epoch": 0.35032704886494803, + "flos": 578479173120.0, + "grad_norm": 0.05987447994889705, + "language_loss": 0.81237, + "learning_rate": 0.0007542149440740694, + "loss": 0.82311416, + "num_input_tokens_seen": 151026192, + "router_z_loss_mlp": 0.32617188, + "step": 1821, + "time_per_iteration": 2.6648108959198 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075377, + "balance_loss_mlp": 1.0426898, + "epoch": 0.3505194305502116, + "flos": 584383472640.0, + "grad_norm": 0.06285767185927299, + "language_loss": 0.85454488, + "learning_rate": 0.000753946624604597, + "loss": 0.86529863, + "num_input_tokens_seen": 151100720, + "router_z_loss_mlp": 0.3269043, + "step": 1822, + "time_per_iteration": 2.7114102840423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080366, + "balance_loss_mlp": 1.04722571, + "epoch": 0.3507118122354752, + "flos": 526705072128.0, + "grad_norm": 0.056571758739544044, + "language_loss": 0.88259315, + "learning_rate": 0.0007536782065443015, + "loss": 0.89339685, + "num_input_tokens_seen": 151166032, + "router_z_loss_mlp": 0.33154297, + "step": 1823, + "time_per_iteration": 2.6336190700531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077518, + "balance_loss_mlp": 1.04576099, + "epoch": 0.35090419392073874, + "flos": 511269152256.0, + "grad_norm": 0.06612506998948281, + "language_loss": 0.74917412, + "learning_rate": 0.0007534096899973919, + "loss": 0.75994933, + "num_input_tokens_seen": 151232208, + "router_z_loss_mlp": 0.31738281, + "step": 1824, + "time_per_iteration": 2.5683584213256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108279, + "balance_loss_mlp": 1.05069852, + "epoch": 0.3510965756060023, + "flos": 563728522752.0, + "grad_norm": 0.05207355522992398, + "language_loss": 0.82511663, + "learning_rate": 0.0007531410750681154, + "loss": 0.83594453, + "num_input_tokens_seen": 151308128, + "router_z_loss_mlp": 0.32080078, + "step": 1825, + "time_per_iteration": 2.7370071411132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108428, + "balance_loss_mlp": 1.05207014, + "epoch": 0.35128895729126586, + "flos": 1020107146752.0, + "grad_norm": 0.05855996344544413, + "language_loss": 0.86223209, + "learning_rate": 0.0007528723618607575, + "loss": 0.87307489, + "num_input_tokens_seen": 151402560, + "router_z_loss_mlp": 0.32202148, + "step": 1826, + "time_per_iteration": 3.4230828285217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080919, + "balance_loss_mlp": 1.04889941, + "epoch": 0.35148133897652944, + "flos": 587972915712.0, + "grad_norm": 0.06514472806491804, + "language_loss": 0.82370871, + "learning_rate": 0.0007526035504796422, + "loss": 0.8345179, + "num_input_tokens_seen": 151478816, + "router_z_loss_mlp": 0.32006836, + "step": 1827, + "time_per_iteration": 2.7472023963928223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083394, + "balance_loss_mlp": 1.05011046, + "epoch": 0.351673720661793, + "flos": 495054830592.0, + "grad_norm": 0.13631276803870807, + "language_loss": 0.86120903, + "learning_rate": 0.0007523346410291312, + "loss": 0.87204289, + "num_input_tokens_seen": 151554528, + "router_z_loss_mlp": 0.33300781, + "step": 1828, + "time_per_iteration": 2.7665555477142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080213, + "balance_loss_mlp": 1.04757404, + "epoch": 0.35186610234705656, + "flos": 762339520512.0, + "grad_norm": 0.04983334941453678, + "language_loss": 0.85021639, + "learning_rate": 0.0007520656336136245, + "loss": 0.86101854, + "num_input_tokens_seen": 151629440, + "router_z_loss_mlp": 0.32641602, + "step": 1829, + "time_per_iteration": 2.9405102729797363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080442, + "balance_loss_mlp": 1.04847038, + "epoch": 0.3520584840323201, + "flos": 625822820352.0, + "grad_norm": 0.049266285647792965, + "language_loss": 0.87560928, + "learning_rate": 0.0007517965283375599, + "loss": 0.88641369, + "num_input_tokens_seen": 151708544, + "router_z_loss_mlp": 0.31958008, + "step": 1830, + "time_per_iteration": 2.8742456436157227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076907, + "balance_loss_mlp": 1.04429162, + "epoch": 0.3522508657175837, + "flos": 537124193280.0, + "grad_norm": 0.05152278098600794, + "language_loss": 0.8913554, + "learning_rate": 0.0007515273253054132, + "loss": 0.90212452, + "num_input_tokens_seen": 151779152, + "router_z_loss_mlp": 0.32617188, + "step": 1831, + "time_per_iteration": 2.647270917892456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085251, + "balance_loss_mlp": 1.0506562, + "epoch": 0.35244324740284727, + "flos": 567105560064.0, + "grad_norm": 0.052396269804254075, + "language_loss": 0.82697165, + "learning_rate": 0.0007512580246216988, + "loss": 0.83782411, + "num_input_tokens_seen": 151853216, + "router_z_loss_mlp": 0.34643555, + "step": 1832, + "time_per_iteration": 2.6887552738189697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079591, + "balance_loss_mlp": 1.04673672, + "epoch": 0.3526356290881108, + "flos": 512809989120.0, + "grad_norm": 0.05749675796225481, + "language_loss": 0.85263908, + "learning_rate": 0.000750988626390968, + "loss": 0.86343497, + "num_input_tokens_seen": 151920416, + "router_z_loss_mlp": 0.32861328, + "step": 1833, + "time_per_iteration": 2.6013457775115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080641, + "balance_loss_mlp": 1.04781032, + "epoch": 0.3528280107733744, + "flos": 595496627712.0, + "grad_norm": 0.05344239959905588, + "language_loss": 0.84880912, + "learning_rate": 0.0007507191307178108, + "loss": 0.8596155, + "num_input_tokens_seen": 151990848, + "router_z_loss_mlp": 0.32836914, + "step": 1834, + "time_per_iteration": 2.7490363121032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080747, + "balance_loss_mlp": 1.04682052, + "epoch": 0.3530203924586379, + "flos": 550969814016.0, + "grad_norm": 0.06515988244691072, + "language_loss": 0.74431223, + "learning_rate": 0.0007504495377068543, + "loss": 0.75511968, + "num_input_tokens_seen": 152064864, + "router_z_loss_mlp": 0.33959961, + "step": 1835, + "time_per_iteration": 2.729029417037964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084115, + "balance_loss_mlp": 1.04925871, + "epoch": 0.3532127741439015, + "flos": 652662876672.0, + "grad_norm": 0.06759605529963146, + "language_loss": 0.81589389, + "learning_rate": 0.0007501798474627642, + "loss": 0.82673502, + "num_input_tokens_seen": 152150096, + "router_z_loss_mlp": 0.34912109, + "step": 1836, + "time_per_iteration": 2.9048030376434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080775, + "balance_loss_mlp": 1.04708695, + "epoch": 0.35340515582916504, + "flos": 722452594176.0, + "grad_norm": 0.055893281392717674, + "language_loss": 0.83221173, + "learning_rate": 0.0007499100600902433, + "loss": 0.84301955, + "num_input_tokens_seen": 152232528, + "router_z_loss_mlp": 0.3371582, + "step": 1837, + "time_per_iteration": 2.9900574684143066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080937, + "balance_loss_mlp": 1.0464375, + "epoch": 0.35359753751442863, + "flos": 594619301376.0, + "grad_norm": 0.06113982905710786, + "language_loss": 0.84191763, + "learning_rate": 0.0007496401756940324, + "loss": 0.852727, + "num_input_tokens_seen": 152299584, + "router_z_loss_mlp": 0.34545898, + "step": 1838, + "time_per_iteration": 2.6746203899383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079632, + "balance_loss_mlp": 1.04575253, + "epoch": 0.3537899191996922, + "flos": 632384838144.0, + "grad_norm": 0.05956961248716192, + "language_loss": 0.82392603, + "learning_rate": 0.0007493701943789098, + "loss": 0.8347224, + "num_input_tokens_seen": 152370368, + "router_z_loss_mlp": 0.33886719, + "step": 1839, + "time_per_iteration": 2.773550510406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089423, + "balance_loss_mlp": 1.05590141, + "epoch": 0.35398230088495575, + "flos": 506118523392.0, + "grad_norm": 0.05410374630174333, + "language_loss": 0.82311571, + "learning_rate": 0.000749100116249692, + "loss": 0.83400989, + "num_input_tokens_seen": 152436928, + "router_z_loss_mlp": 0.33544922, + "step": 1840, + "time_per_iteration": 2.6255862712860107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089923, + "balance_loss_mlp": 1.05649722, + "epoch": 0.35417468257021933, + "flos": 507783015936.0, + "grad_norm": 0.06109989504264522, + "language_loss": 0.86315167, + "learning_rate": 0.0007488299414112321, + "loss": 0.87405092, + "num_input_tokens_seen": 152505952, + "router_z_loss_mlp": 0.33447266, + "step": 1841, + "time_per_iteration": 2.5875229835510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088685, + "balance_loss_mlp": 1.05552149, + "epoch": 0.35436706425548287, + "flos": 656133046272.0, + "grad_norm": 0.05742985112465967, + "language_loss": 0.77833533, + "learning_rate": 0.0007485596699684215, + "loss": 0.78922212, + "num_input_tokens_seen": 152577408, + "router_z_loss_mlp": 0.33178711, + "step": 1842, + "time_per_iteration": 2.819591760635376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092886, + "balance_loss_mlp": 1.05903101, + "epoch": 0.35455944594074645, + "flos": 652322433024.0, + "grad_norm": 0.047878329403948795, + "language_loss": 0.85455877, + "learning_rate": 0.000748289302026189, + "loss": 0.86548758, + "num_input_tokens_seen": 152654480, + "router_z_loss_mlp": 0.33886719, + "step": 1843, + "time_per_iteration": 2.829897880554199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088118, + "balance_loss_mlp": 1.05569351, + "epoch": 0.35475182762601, + "flos": 848240252928.0, + "grad_norm": 0.06279452498251797, + "language_loss": 0.85658133, + "learning_rate": 0.0007480188376895004, + "loss": 0.86746252, + "num_input_tokens_seen": 152732304, + "router_z_loss_mlp": 0.32421875, + "step": 1844, + "time_per_iteration": 3.067828893661499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085552, + "balance_loss_mlp": 1.07358336, + "epoch": 0.3549442093112736, + "flos": 1520644133376.0, + "grad_norm": 0.027370210450034033, + "language_loss": 0.7381134, + "learning_rate": 0.0007477482770633596, + "loss": 0.7489689, + "num_input_tokens_seen": 152965952, + "router_z_loss_mlp": 0.11962891, + "step": 1845, + "time_per_iteration": 4.860119342803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087227, + "balance_loss_mlp": 1.05401564, + "epoch": 0.3551365909965371, + "flos": 651087134208.0, + "grad_norm": 0.057022057365061586, + "language_loss": 0.7840451, + "learning_rate": 0.0007474776202528074, + "loss": 0.79491735, + "num_input_tokens_seen": 153053088, + "router_z_loss_mlp": 0.33227539, + "step": 1846, + "time_per_iteration": 2.924600601196289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081626, + "balance_loss_mlp": 1.04877198, + "epoch": 0.3553289726818007, + "flos": 897094213632.0, + "grad_norm": 0.05655103479540665, + "language_loss": 0.81245291, + "learning_rate": 0.000747206867362922, + "loss": 0.82326913, + "num_input_tokens_seen": 153129216, + "router_z_loss_mlp": 0.32861328, + "step": 1847, + "time_per_iteration": 3.0635437965393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078587, + "balance_loss_mlp": 1.0454942, + "epoch": 0.3555213543670643, + "flos": 688181958144.0, + "grad_norm": 0.057996459019562165, + "language_loss": 0.83748043, + "learning_rate": 0.0007469360184988194, + "loss": 0.84826624, + "num_input_tokens_seen": 153199360, + "router_z_loss_mlp": 0.33105469, + "step": 1848, + "time_per_iteration": 2.816774606704712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072989, + "balance_loss_mlp": 1.03977752, + "epoch": 0.3557137360523278, + "flos": 538305647616.0, + "grad_norm": 0.0578078380794177, + "language_loss": 0.87284935, + "learning_rate": 0.0007466650737656518, + "loss": 0.88357925, + "num_input_tokens_seen": 153269168, + "router_z_loss_mlp": 0.33203125, + "step": 1849, + "time_per_iteration": 2.611743927001953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075842, + "balance_loss_mlp": 1.04208231, + "epoch": 0.3559061177375914, + "flos": 402039230976.0, + "grad_norm": 0.05251231214094578, + "language_loss": 0.90093362, + "learning_rate": 0.0007463940332686098, + "loss": 0.91169202, + "num_input_tokens_seen": 153333120, + "router_z_loss_mlp": 0.33789062, + "step": 1850, + "time_per_iteration": 2.4692726135253906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073866, + "balance_loss_mlp": 1.04017735, + "epoch": 0.35609849942285493, + "flos": 696238170624.0, + "grad_norm": 0.04795835093932571, + "language_loss": 0.84167922, + "learning_rate": 0.0007461228971129205, + "loss": 0.85241795, + "num_input_tokens_seen": 153407600, + "router_z_loss_mlp": 0.33691406, + "step": 1851, + "time_per_iteration": 2.894505023956299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106997, + "balance_loss_mlp": 1.0372591, + "epoch": 0.3562908811081185, + "flos": 568660953600.0, + "grad_norm": 0.055081415669052246, + "language_loss": 0.85513294, + "learning_rate": 0.0007458516654038483, + "loss": 0.86583257, + "num_input_tokens_seen": 153477408, + "router_z_loss_mlp": 0.32714844, + "step": 1852, + "time_per_iteration": 2.678018569946289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076636, + "balance_loss_mlp": 1.0421133, + "epoch": 0.35648326279338205, + "flos": 682081219584.0, + "grad_norm": 0.04842584798560518, + "language_loss": 0.8668319, + "learning_rate": 0.0007455803382466946, + "loss": 0.87759829, + "num_input_tokens_seen": 153551888, + "router_z_loss_mlp": 0.34545898, + "step": 1853, + "time_per_iteration": 2.795799493789673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081318, + "balance_loss_mlp": 1.04674757, + "epoch": 0.35667564447864564, + "flos": 628840475136.0, + "grad_norm": 0.04891463031827082, + "language_loss": 0.87319207, + "learning_rate": 0.0007453089157467979, + "loss": 0.88400525, + "num_input_tokens_seen": 153626912, + "router_z_loss_mlp": 0.34594727, + "step": 1854, + "time_per_iteration": 2.7683348655700684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084525, + "balance_loss_mlp": 1.05000162, + "epoch": 0.35686802616390917, + "flos": 813685837824.0, + "grad_norm": 0.04901692214928195, + "language_loss": 0.81941634, + "learning_rate": 0.0007450373980095341, + "loss": 0.83026159, + "num_input_tokens_seen": 153711312, + "router_z_loss_mlp": 0.34545898, + "step": 1855, + "time_per_iteration": 3.069664716720581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088761, + "balance_loss_mlp": 1.0541904, + "epoch": 0.35706040784917276, + "flos": 525922288128.0, + "grad_norm": 0.06393454459125486, + "language_loss": 0.86792582, + "learning_rate": 0.0007447657851403155, + "loss": 0.87881339, + "num_input_tokens_seen": 153780208, + "router_z_loss_mlp": 0.34619141, + "step": 1856, + "time_per_iteration": 2.6120662689208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081268, + "balance_loss_mlp": 1.04793692, + "epoch": 0.35725278953443634, + "flos": 511698345984.0, + "grad_norm": 0.060959809088696394, + "language_loss": 0.78963649, + "learning_rate": 0.0007444940772445915, + "loss": 0.80044913, + "num_input_tokens_seen": 153853152, + "router_z_loss_mlp": 0.33349609, + "step": 1857, + "time_per_iteration": 2.802053689956665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079861, + "balance_loss_mlp": 1.04653037, + "epoch": 0.3574451712196999, + "flos": 487162971648.0, + "grad_norm": 0.06448223618511208, + "language_loss": 0.80338144, + "learning_rate": 0.0007442222744278484, + "loss": 0.81418002, + "num_input_tokens_seen": 153924160, + "router_z_loss_mlp": 0.33349609, + "step": 1858, + "time_per_iteration": 2.660689353942871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080529, + "balance_loss_mlp": 1.04765141, + "epoch": 0.35763755290496346, + "flos": 550384879104.0, + "grad_norm": 0.061962253699798436, + "language_loss": 0.84126002, + "learning_rate": 0.0007439503767956099, + "loss": 0.85206527, + "num_input_tokens_seen": 153998688, + "router_z_loss_mlp": 0.32885742, + "step": 1859, + "time_per_iteration": 2.7479875087738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080547, + "balance_loss_mlp": 1.06767237, + "epoch": 0.357829934590227, + "flos": 1503300791808.0, + "grad_norm": 0.035903025748828234, + "language_loss": 0.79671603, + "learning_rate": 0.0007436783844534352, + "loss": 0.80752152, + "num_input_tokens_seen": 154230960, + "router_z_loss_mlp": 0.12890625, + "step": 1860, + "time_per_iteration": 4.900041580200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077433, + "balance_loss_mlp": 1.04479325, + "epoch": 0.3580223162754906, + "flos": 568410670080.0, + "grad_norm": 0.040558802150678905, + "language_loss": 0.85799539, + "learning_rate": 0.000743406297506922, + "loss": 0.86876976, + "num_input_tokens_seen": 154309104, + "router_z_loss_mlp": 0.32641602, + "step": 1861, + "time_per_iteration": 2.701162576675415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084428, + "balance_loss_mlp": 1.05107355, + "epoch": 0.3582146979607541, + "flos": 626153089536.0, + "grad_norm": 0.04686630584337546, + "language_loss": 0.8419295, + "learning_rate": 0.0007431341160617031, + "loss": 0.85277379, + "num_input_tokens_seen": 154387424, + "router_z_loss_mlp": 0.33374023, + "step": 1862, + "time_per_iteration": 2.860173463821411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085309, + "balance_loss_mlp": 1.05266929, + "epoch": 0.3584070796460177, + "flos": 507010406400.0, + "grad_norm": 0.04939599291948986, + "language_loss": 0.88143289, + "learning_rate": 0.0007428618402234491, + "loss": 0.89228594, + "num_input_tokens_seen": 154459952, + "router_z_loss_mlp": 0.32641602, + "step": 1863, + "time_per_iteration": 2.62233567237854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083488, + "balance_loss_mlp": 1.05030036, + "epoch": 0.3585994613312813, + "flos": 606190763520.0, + "grad_norm": 0.051497717495276533, + "language_loss": 0.80248374, + "learning_rate": 0.0007425894700978668, + "loss": 0.81331861, + "num_input_tokens_seen": 154535456, + "router_z_loss_mlp": 0.33203125, + "step": 1864, + "time_per_iteration": 2.711484670639038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087216, + "balance_loss_mlp": 1.05424261, + "epoch": 0.3587918430165448, + "flos": 1412338484736.0, + "grad_norm": 0.047877863134497434, + "language_loss": 0.79232943, + "learning_rate": 0.0007423170057906996, + "loss": 0.80320162, + "num_input_tokens_seen": 154627568, + "router_z_loss_mlp": 0.32983398, + "step": 1865, + "time_per_iteration": 3.852776527404785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091453, + "balance_loss_mlp": 1.05821717, + "epoch": 0.3589842247018084, + "flos": 478313800704.0, + "grad_norm": 0.06431447428318769, + "language_loss": 0.85827845, + "learning_rate": 0.0007420444474077275, + "loss": 0.86919296, + "num_input_tokens_seen": 154694640, + "router_z_loss_mlp": 0.33251953, + "step": 1866, + "time_per_iteration": 2.6104037761688232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101796, + "balance_loss_mlp": 1.06829846, + "epoch": 0.35917660638707194, + "flos": 504464205312.0, + "grad_norm": 0.06438653143979521, + "language_loss": 0.89830631, + "learning_rate": 0.0007417717950547671, + "loss": 0.90932429, + "num_input_tokens_seen": 154762048, + "router_z_loss_mlp": 0.33520508, + "step": 1867, + "time_per_iteration": 2.5619330406188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108848, + "balance_loss_mlp": 1.09687901, + "epoch": 0.3593689880723355, + "flos": 1491294191616.0, + "grad_norm": 0.037524520389889536, + "language_loss": 0.75996608, + "learning_rate": 0.0007414990488376713, + "loss": 0.77105457, + "num_input_tokens_seen": 154989952, + "router_z_loss_mlp": 0.11962891, + "step": 1868, + "time_per_iteration": 4.971943378448486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096427, + "balance_loss_mlp": 1.06388319, + "epoch": 0.35956136975759906, + "flos": 528369564672.0, + "grad_norm": 0.050983088733796166, + "language_loss": 0.84612024, + "learning_rate": 0.0007412262088623299, + "loss": 0.85708451, + "num_input_tokens_seen": 155066992, + "router_z_loss_mlp": 0.32543945, + "step": 1869, + "time_per_iteration": 2.7295072078704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104656, + "balance_loss_mlp": 1.07142007, + "epoch": 0.35975375144286265, + "flos": 534647803392.0, + "grad_norm": 0.057848782745497714, + "language_loss": 0.79012549, + "learning_rate": 0.0007409532752346684, + "loss": 0.80117208, + "num_input_tokens_seen": 155137616, + "router_z_loss_mlp": 0.33251953, + "step": 1870, + "time_per_iteration": 2.74664568901062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107858, + "balance_loss_mlp": 1.07464623, + "epoch": 0.3599461331281262, + "flos": 504695549952.0, + "grad_norm": 0.054621035664709404, + "language_loss": 0.88661271, + "learning_rate": 0.0007406802480606491, + "loss": 0.89769125, + "num_input_tokens_seen": 155209248, + "router_z_loss_mlp": 0.33227539, + "step": 1871, + "time_per_iteration": 2.636101722717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102516, + "balance_loss_mlp": 1.06923246, + "epoch": 0.36013851481338977, + "flos": 511283708928.0, + "grad_norm": 0.05849515281409536, + "language_loss": 0.90592384, + "learning_rate": 0.0007404071274462707, + "loss": 0.91694903, + "num_input_tokens_seen": 155274176, + "router_z_loss_mlp": 0.33300781, + "step": 1872, + "time_per_iteration": 2.559588670730591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098641, + "balance_loss_mlp": 1.06533384, + "epoch": 0.36033089649865335, + "flos": 547330908672.0, + "grad_norm": 0.06237198940644659, + "language_loss": 0.8363173, + "learning_rate": 0.0007401339134975682, + "loss": 0.84730369, + "num_input_tokens_seen": 155343232, + "router_z_loss_mlp": 0.33325195, + "step": 1873, + "time_per_iteration": 2.6156845092773438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100197, + "balance_loss_mlp": 1.06617522, + "epoch": 0.3605232781839169, + "flos": 458416903680.0, + "grad_norm": 0.05108892475659159, + "language_loss": 0.84187275, + "learning_rate": 0.0007398606063206122, + "loss": 0.85287476, + "num_input_tokens_seen": 155410080, + "router_z_loss_mlp": 0.34033203, + "step": 1874, + "time_per_iteration": 2.6152756214141846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090341, + "balance_loss_mlp": 1.05729628, + "epoch": 0.36071565986918047, + "flos": 509309296128.0, + "grad_norm": 0.05589329807105905, + "language_loss": 0.7857852, + "learning_rate": 0.0007395872060215101, + "loss": 0.79668868, + "num_input_tokens_seen": 155476240, + "router_z_loss_mlp": 0.33056641, + "step": 1875, + "time_per_iteration": 2.592906951904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095107, + "balance_loss_mlp": 1.06230044, + "epoch": 0.360908041554444, + "flos": 558931484160.0, + "grad_norm": 0.12468103825296885, + "language_loss": 0.88329792, + "learning_rate": 0.0007393137127064056, + "loss": 0.89424896, + "num_input_tokens_seen": 155543392, + "router_z_loss_mlp": 0.328125, + "step": 1876, + "time_per_iteration": 2.629368782043457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109653, + "balance_loss_mlp": 1.06300879, + "epoch": 0.3611004232397076, + "flos": 523588492800.0, + "grad_norm": 0.05189881397754868, + "language_loss": 0.84167802, + "learning_rate": 0.0007390401264814779, + "loss": 0.85264337, + "num_input_tokens_seen": 155613264, + "router_z_loss_mlp": 0.33544922, + "step": 1877, + "time_per_iteration": 2.644322156906128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084049, + "balance_loss_mlp": 1.05179131, + "epoch": 0.3612928049249711, + "flos": 540728193024.0, + "grad_norm": 0.07312313725982984, + "language_loss": 0.84472072, + "learning_rate": 0.0007387664474529427, + "loss": 0.8555612, + "num_input_tokens_seen": 155683712, + "router_z_loss_mlp": 0.32250977, + "step": 1878, + "time_per_iteration": 2.6105034351348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094149, + "balance_loss_mlp": 1.06131935, + "epoch": 0.3614851866102347, + "flos": 552289480704.0, + "grad_norm": 0.06338398309504269, + "language_loss": 0.9129535, + "learning_rate": 0.0007384926757270518, + "loss": 0.923895, + "num_input_tokens_seen": 155751760, + "router_z_loss_mlp": 0.32836914, + "step": 1879, + "time_per_iteration": 2.6268200874328613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082708, + "balance_loss_mlp": 1.05023539, + "epoch": 0.36167756829549824, + "flos": 771734338560.0, + "grad_norm": 0.048672507925477976, + "language_loss": 0.79680419, + "learning_rate": 0.0007382188114100924, + "loss": 0.80763125, + "num_input_tokens_seen": 155830464, + "router_z_loss_mlp": 0.32470703, + "step": 1880, + "time_per_iteration": 2.9548373222351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078144, + "balance_loss_mlp": 1.04509938, + "epoch": 0.36186994998076183, + "flos": 711560609280.0, + "grad_norm": 0.04804943389379678, + "language_loss": 0.81544787, + "learning_rate": 0.0007379448546083884, + "loss": 0.82622933, + "num_input_tokens_seen": 155906208, + "router_z_loss_mlp": 0.33056641, + "step": 1881, + "time_per_iteration": 2.900480031967163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082413, + "balance_loss_mlp": 1.04884315, + "epoch": 0.3620623316660254, + "flos": 747209138688.0, + "grad_norm": 0.049920719635936736, + "language_loss": 0.88019323, + "learning_rate": 0.0007376708054282992, + "loss": 0.89101738, + "num_input_tokens_seen": 155983584, + "router_z_loss_mlp": 0.3359375, + "step": 1882, + "time_per_iteration": 2.9482829570770264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075385, + "balance_loss_mlp": 1.04286492, + "epoch": 0.36225471335128895, + "flos": 482312088576.0, + "grad_norm": 0.04692483307288239, + "language_loss": 0.83908749, + "learning_rate": 0.0007373966639762201, + "loss": 0.84984136, + "num_input_tokens_seen": 156052464, + "router_z_loss_mlp": 0.32519531, + "step": 1883, + "time_per_iteration": 2.597809076309204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078049, + "balance_loss_mlp": 1.0448606, + "epoch": 0.36244709503655254, + "flos": 506655406080.0, + "grad_norm": 0.0703007209611724, + "language_loss": 0.8835175, + "learning_rate": 0.0007371224303585822, + "loss": 0.89429802, + "num_input_tokens_seen": 156121424, + "router_z_loss_mlp": 0.33203125, + "step": 1884, + "time_per_iteration": 2.5686471462249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046189, + "balance_loss_mlp": 1.03360081, + "epoch": 0.36263947672181607, + "flos": 1393302643200.0, + "grad_norm": 0.020620128786032376, + "language_loss": 0.80357069, + "learning_rate": 0.0007368481046818524, + "loss": 0.81403255, + "num_input_tokens_seen": 156346144, + "router_z_loss_mlp": 0.12597656, + "step": 1885, + "time_per_iteration": 4.68831205368042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073343, + "balance_loss_mlp": 1.03939199, + "epoch": 0.36283185840707965, + "flos": 652991735808.0, + "grad_norm": 0.05943029236677907, + "language_loss": 0.82845902, + "learning_rate": 0.0007365736870525335, + "loss": 0.83919251, + "num_input_tokens_seen": 156420880, + "router_z_loss_mlp": 0.33959961, + "step": 1886, + "time_per_iteration": 2.8566346168518066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070323, + "balance_loss_mlp": 1.0373739, + "epoch": 0.3630242400923432, + "flos": 488619440640.0, + "grad_norm": 0.06703223685064427, + "language_loss": 0.82574463, + "learning_rate": 0.000736299177577164, + "loss": 0.83644783, + "num_input_tokens_seen": 156485616, + "router_z_loss_mlp": 0.32958984, + "step": 1887, + "time_per_iteration": 2.5848894119262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074233, + "balance_loss_mlp": 1.04130709, + "epoch": 0.3632166217776068, + "flos": 516892644864.0, + "grad_norm": 0.0626455482667494, + "language_loss": 0.83844066, + "learning_rate": 0.0007360245763623174, + "loss": 0.84918302, + "num_input_tokens_seen": 156557840, + "router_z_loss_mlp": 0.3293457, + "step": 1888, + "time_per_iteration": 2.6179397106170654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067525, + "balance_loss_mlp": 1.03564882, + "epoch": 0.36340900346287036, + "flos": 645881250816.0, + "grad_norm": 0.06111369810549259, + "language_loss": 0.89794236, + "learning_rate": 0.0007357498835146039, + "loss": 0.90861762, + "num_input_tokens_seen": 156632496, + "router_z_loss_mlp": 0.31860352, + "step": 1889, + "time_per_iteration": 2.8311662673950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070281, + "balance_loss_mlp": 1.03854752, + "epoch": 0.3636013851481339, + "flos": 553057708032.0, + "grad_norm": 0.0568549422608731, + "language_loss": 0.86402494, + "learning_rate": 0.0007354750991406684, + "loss": 0.87472773, + "num_input_tokens_seen": 156705296, + "router_z_loss_mlp": 0.31713867, + "step": 1890, + "time_per_iteration": 2.6922197341918945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072041, + "balance_loss_mlp": 1.03952074, + "epoch": 0.3637937668333975, + "flos": 546395355648.0, + "grad_norm": 0.053455628499382915, + "language_loss": 0.80957252, + "learning_rate": 0.0007352002233471919, + "loss": 0.82029295, + "num_input_tokens_seen": 156773376, + "router_z_loss_mlp": 0.32519531, + "step": 1891, + "time_per_iteration": 2.621241569519043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066281, + "balance_loss_mlp": 1.03371286, + "epoch": 0.363986148518661, + "flos": 537838576128.0, + "grad_norm": 0.07508945751401845, + "language_loss": 0.79549944, + "learning_rate": 0.0007349252562408906, + "loss": 0.80616224, + "num_input_tokens_seen": 156844336, + "router_z_loss_mlp": 0.32543945, + "step": 1892, + "time_per_iteration": 2.674318552017212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069175, + "balance_loss_mlp": 1.0372504, + "epoch": 0.3641785302039246, + "flos": 659895607296.0, + "grad_norm": 0.04761623500947703, + "language_loss": 0.81258041, + "learning_rate": 0.0007346501979285158, + "loss": 0.82327211, + "num_input_tokens_seen": 156918848, + "router_z_loss_mlp": 0.3190918, + "step": 1893, + "time_per_iteration": 2.8671226501464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035364, + "balance_loss_mlp": 1.0238719, + "epoch": 0.36437091188918813, + "flos": 1467911158272.0, + "grad_norm": 0.02143179240630706, + "language_loss": 0.80539101, + "learning_rate": 0.0007343750485168551, + "loss": 0.81574464, + "num_input_tokens_seen": 157134736, + "router_z_loss_mlp": 0.11474609, + "step": 1894, + "time_per_iteration": 4.7720019817352295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075531, + "balance_loss_mlp": 1.04248571, + "epoch": 0.3645632935744517, + "flos": 597012733440.0, + "grad_norm": 0.049808711864839754, + "language_loss": 0.85850054, + "learning_rate": 0.0007340998081127308, + "loss": 0.8692559, + "num_input_tokens_seen": 157211920, + "router_z_loss_mlp": 0.33056641, + "step": 1895, + "time_per_iteration": 2.753730058670044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081037, + "balance_loss_mlp": 1.0486834, + "epoch": 0.36475567525971525, + "flos": 599214108672.0, + "grad_norm": 0.05384470863640996, + "language_loss": 0.9063257, + "learning_rate": 0.0007338244768230007, + "loss": 0.91713607, + "num_input_tokens_seen": 157284224, + "router_z_loss_mlp": 0.32348633, + "step": 1896, + "time_per_iteration": 2.749844551086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084345, + "balance_loss_mlp": 1.05189669, + "epoch": 0.36494805694497884, + "flos": 798047686656.0, + "grad_norm": 0.12041633701688108, + "language_loss": 0.88843018, + "learning_rate": 0.0007335490547545578, + "loss": 0.89927363, + "num_input_tokens_seen": 157367920, + "router_z_loss_mlp": 0.32446289, + "step": 1897, + "time_per_iteration": 3.0181519985198975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089996, + "balance_loss_mlp": 1.05795288, + "epoch": 0.3651404386302424, + "flos": 637023315456.0, + "grad_norm": 0.06340749789439089, + "language_loss": 0.82377589, + "learning_rate": 0.0007332735420143308, + "loss": 0.83467579, + "num_input_tokens_seen": 157438672, + "router_z_loss_mlp": 0.3203125, + "step": 1898, + "time_per_iteration": 2.7370855808258057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077681, + "balance_loss_mlp": 1.04442132, + "epoch": 0.36533282031550596, + "flos": 491337349632.0, + "grad_norm": 0.05751458989837244, + "language_loss": 0.8663426, + "learning_rate": 0.0007329979387092826, + "loss": 0.87711942, + "num_input_tokens_seen": 157505888, + "router_z_loss_mlp": 0.33276367, + "step": 1899, + "time_per_iteration": 2.552072048187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076766, + "balance_loss_mlp": 1.0440551, + "epoch": 0.36552520200076954, + "flos": 855587874816.0, + "grad_norm": 0.050366197091212025, + "language_loss": 0.83863711, + "learning_rate": 0.0007327222449464124, + "loss": 0.84940481, + "num_input_tokens_seen": 157601568, + "router_z_loss_mlp": 0.32714844, + "step": 1900, + "time_per_iteration": 3.2450594902038574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072697, + "balance_loss_mlp": 1.03936601, + "epoch": 0.3657175836860331, + "flos": 483449872896.0, + "grad_norm": 0.053278478248789174, + "language_loss": 0.88864619, + "learning_rate": 0.0007324464608327538, + "loss": 0.89937317, + "num_input_tokens_seen": 157670992, + "router_z_loss_mlp": 0.33349609, + "step": 1901, + "time_per_iteration": 2.6027348041534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072333, + "balance_loss_mlp": 1.03957391, + "epoch": 0.36590996537129666, + "flos": 434561006592.0, + "grad_norm": 0.058664113400220264, + "language_loss": 0.88440275, + "learning_rate": 0.0007321705864753758, + "loss": 0.8951261, + "num_input_tokens_seen": 157743616, + "router_z_loss_mlp": 0.32763672, + "step": 1902, + "time_per_iteration": 2.668935537338257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073172, + "balance_loss_mlp": 1.03950715, + "epoch": 0.3661023470565602, + "flos": 711880704000.0, + "grad_norm": 0.047699393186438684, + "language_loss": 0.84307706, + "learning_rate": 0.0007318946219813823, + "loss": 0.85380876, + "num_input_tokens_seen": 157823520, + "router_z_loss_mlp": 0.33691406, + "step": 1903, + "time_per_iteration": 3.025866985321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074477, + "balance_loss_mlp": 1.04100275, + "epoch": 0.3662947287418238, + "flos": 564495340032.0, + "grad_norm": 0.05797091317965262, + "language_loss": 0.90078342, + "learning_rate": 0.000731618567457912, + "loss": 0.91152817, + "num_input_tokens_seen": 157893248, + "router_z_loss_mlp": 0.3347168, + "step": 1904, + "time_per_iteration": 2.6391115188598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073585, + "balance_loss_mlp": 1.03937197, + "epoch": 0.3664871104270873, + "flos": 789391982592.0, + "grad_norm": 0.05925410463566973, + "language_loss": 0.87083924, + "learning_rate": 0.000731342423012139, + "loss": 0.88157511, + "num_input_tokens_seen": 157973216, + "router_z_loss_mlp": 0.3425293, + "step": 1905, + "time_per_iteration": 3.020660400390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070848, + "balance_loss_mlp": 1.03682566, + "epoch": 0.3666794921123509, + "flos": 752202616320.0, + "grad_norm": 0.06601024748857935, + "language_loss": 0.82244205, + "learning_rate": 0.0007310661887512722, + "loss": 0.83315057, + "num_input_tokens_seen": 158051088, + "router_z_loss_mlp": 0.34033203, + "step": 1906, + "time_per_iteration": 3.0128185749053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068568, + "balance_loss_mlp": 1.03571391, + "epoch": 0.3668718737976145, + "flos": 523264015872.0, + "grad_norm": 0.04853340441162438, + "language_loss": 0.82115662, + "learning_rate": 0.0007307898647825549, + "loss": 0.8318423, + "num_input_tokens_seen": 158124368, + "router_z_loss_mlp": 0.32861328, + "step": 1907, + "time_per_iteration": 2.6610257625579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075142, + "balance_loss_mlp": 1.04126275, + "epoch": 0.367064255482878, + "flos": 571698957312.0, + "grad_norm": 0.05773956677348378, + "language_loss": 0.89470363, + "learning_rate": 0.0007305134512132659, + "loss": 0.90545505, + "num_input_tokens_seen": 158191472, + "router_z_loss_mlp": 0.33886719, + "step": 1908, + "time_per_iteration": 2.706658124923706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076976, + "balance_loss_mlp": 1.04309678, + "epoch": 0.3672566371681416, + "flos": 446880347136.0, + "grad_norm": 0.0894454707503668, + "language_loss": 0.83388865, + "learning_rate": 0.0007302369481507183, + "loss": 0.84465849, + "num_input_tokens_seen": 158254384, + "router_z_loss_mlp": 0.33911133, + "step": 1909, + "time_per_iteration": 2.499483346939087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049259, + "balance_loss_mlp": 1.03838694, + "epoch": 0.36744901885340514, + "flos": 1539275208192.0, + "grad_norm": 0.032302576214162944, + "language_loss": 0.79961759, + "learning_rate": 0.00072996035570226, + "loss": 0.81011015, + "num_input_tokens_seen": 158486160, + "router_z_loss_mlp": 0.10888672, + "step": 1910, + "time_per_iteration": 4.86514949798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088721, + "balance_loss_mlp": 1.05476999, + "epoch": 0.36764140053866873, + "flos": 563417192448.0, + "grad_norm": 0.061805914783829616, + "language_loss": 0.85575247, + "learning_rate": 0.000729683673975274, + "loss": 0.86663967, + "num_input_tokens_seen": 158555616, + "router_z_loss_mlp": 0.33984375, + "step": 1911, + "time_per_iteration": 2.6907522678375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091619, + "balance_loss_mlp": 1.05709589, + "epoch": 0.36783378222393226, + "flos": 1216168971264.0, + "grad_norm": 0.04498413319979697, + "language_loss": 0.82746279, + "learning_rate": 0.0007294069030771774, + "loss": 0.83837891, + "num_input_tokens_seen": 158653984, + "router_z_loss_mlp": 0.34570312, + "step": 1912, + "time_per_iteration": 3.6445353031158447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109623, + "balance_loss_mlp": 1.06196952, + "epoch": 0.36802616390919585, + "flos": 498476947968.0, + "grad_norm": 0.055898807174015214, + "language_loss": 0.90671504, + "learning_rate": 0.0007291300431154224, + "loss": 0.9176774, + "num_input_tokens_seen": 158719728, + "router_z_loss_mlp": 0.34301758, + "step": 1913, + "time_per_iteration": 2.5600366592407227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01025736, + "balance_loss_mlp": 1.0155319, + "epoch": 0.36821854559445943, + "flos": 1581281961984.0, + "grad_norm": 0.015307788275572325, + "language_loss": 0.70389736, + "learning_rate": 0.0007288530941974955, + "loss": 0.71415472, + "num_input_tokens_seen": 158952544, + "router_z_loss_mlp": 0.10205078, + "step": 1914, + "time_per_iteration": 4.9577555656433105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110633, + "balance_loss_mlp": 1.07287991, + "epoch": 0.36841092727972297, + "flos": 835261784064.0, + "grad_norm": 0.06223209716338702, + "language_loss": 0.79735458, + "learning_rate": 0.0007285760564309179, + "loss": 0.80841786, + "num_input_tokens_seen": 159039680, + "router_z_loss_mlp": 0.3347168, + "step": 1915, + "time_per_iteration": 3.1251590251922607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101251, + "balance_loss_mlp": 1.06811082, + "epoch": 0.36860330896498655, + "flos": 689517591552.0, + "grad_norm": 0.05672479428696366, + "language_loss": 0.85010201, + "learning_rate": 0.0007282989299232448, + "loss": 0.8611145, + "num_input_tokens_seen": 159128128, + "router_z_loss_mlp": 0.33154297, + "step": 1916, + "time_per_iteration": 3.062971353530884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096553, + "balance_loss_mlp": 1.06381774, + "epoch": 0.3687956906502501, + "flos": 553919067648.0, + "grad_norm": 0.05955658020637064, + "language_loss": 0.83600092, + "learning_rate": 0.0007280217147820668, + "loss": 0.84696645, + "num_input_tokens_seen": 159193248, + "router_z_loss_mlp": 0.32739258, + "step": 1917, + "time_per_iteration": 2.6169495582580566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097845, + "balance_loss_mlp": 1.06465673, + "epoch": 0.3689880723355137, + "flos": 576426184704.0, + "grad_norm": 0.05443515430960571, + "language_loss": 0.79137111, + "learning_rate": 0.0007277444111150079, + "loss": 0.80234957, + "num_input_tokens_seen": 159265824, + "router_z_loss_mlp": 0.33203125, + "step": 1918, + "time_per_iteration": 2.672696828842163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101204, + "balance_loss_mlp": 1.06820679, + "epoch": 0.3691804540207772, + "flos": 528615465984.0, + "grad_norm": 0.06564490716140688, + "language_loss": 0.84340626, + "learning_rate": 0.0007274670190297272, + "loss": 0.85441828, + "num_input_tokens_seen": 159332992, + "router_z_loss_mlp": 0.33007812, + "step": 1919, + "time_per_iteration": 2.569920539855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091279, + "balance_loss_mlp": 1.0575906, + "epoch": 0.3693728357060408, + "flos": 560729806848.0, + "grad_norm": 0.06475948680742319, + "language_loss": 0.81988895, + "learning_rate": 0.0007271895386339179, + "loss": 0.83080173, + "num_input_tokens_seen": 159409808, + "router_z_loss_mlp": 0.3371582, + "step": 1920, + "time_per_iteration": 2.765470027923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086642, + "balance_loss_mlp": 1.05350161, + "epoch": 0.3695652173913043, + "flos": 579488919552.0, + "grad_norm": 0.0536525739451854, + "language_loss": 0.82950377, + "learning_rate": 0.0007269119700353073, + "loss": 0.84037018, + "num_input_tokens_seen": 159486128, + "router_z_loss_mlp": 0.33154297, + "step": 1921, + "time_per_iteration": 2.703117847442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089729, + "balance_loss_mlp": 1.05756688, + "epoch": 0.3697575990765679, + "flos": 512629516800.0, + "grad_norm": 0.04104943724396866, + "language_loss": 0.84983069, + "learning_rate": 0.0007266343133416571, + "loss": 0.86072791, + "num_input_tokens_seen": 159562224, + "router_z_loss_mlp": 0.3215332, + "step": 1922, + "time_per_iteration": 2.7371909618377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060973, + "balance_loss_mlp": 1.04967153, + "epoch": 0.3699499807618315, + "flos": 1569826953216.0, + "grad_norm": 0.023907464900796205, + "language_loss": 0.77116919, + "learning_rate": 0.0007263565686607632, + "loss": 0.78177893, + "num_input_tokens_seen": 159784768, + "router_z_loss_mlp": 0.11279297, + "step": 1923, + "time_per_iteration": 4.827981233596802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084577, + "balance_loss_mlp": 1.05136538, + "epoch": 0.37014236244709503, + "flos": 497093262336.0, + "grad_norm": 0.06739223877154035, + "language_loss": 0.84575641, + "learning_rate": 0.0007260787361004556, + "loss": 0.85660219, + "num_input_tokens_seen": 159848608, + "router_z_loss_mlp": 0.33227539, + "step": 1924, + "time_per_iteration": 2.6221601963043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048708, + "balance_loss_mlp": 1.03764546, + "epoch": 0.3703347441323586, + "flos": 1443562048512.0, + "grad_norm": 0.02017040526472397, + "language_loss": 0.73761505, + "learning_rate": 0.0007258008157685987, + "loss": 0.74810213, + "num_input_tokens_seen": 160080928, + "router_z_loss_mlp": 0.11083984, + "step": 1925, + "time_per_iteration": 4.909639120101929 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083689, + "balance_loss_mlp": 1.04997683, + "epoch": 0.37052712581762215, + "flos": 563324060160.0, + "grad_norm": 0.19489786122972683, + "language_loss": 0.87265027, + "learning_rate": 0.0007255228077730903, + "loss": 0.88348716, + "num_input_tokens_seen": 160148976, + "router_z_loss_mlp": 0.33740234, + "step": 1926, + "time_per_iteration": 2.726412773132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092339, + "balance_loss_mlp": 1.05850744, + "epoch": 0.37071950750288574, + "flos": 925706451456.0, + "grad_norm": 0.06639539702607969, + "language_loss": 0.81730163, + "learning_rate": 0.0007252447122218632, + "loss": 0.82822502, + "num_input_tokens_seen": 160233504, + "router_z_loss_mlp": 0.33862305, + "step": 1927, + "time_per_iteration": 3.157439708709717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090678, + "balance_loss_mlp": 1.05710912, + "epoch": 0.37091188918814927, + "flos": 418090609152.0, + "grad_norm": 0.06586667444600991, + "language_loss": 0.87736213, + "learning_rate": 0.0007249665292228834, + "loss": 0.88826889, + "num_input_tokens_seen": 160299696, + "router_z_loss_mlp": 0.3359375, + "step": 1928, + "time_per_iteration": 2.569284677505493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086968, + "balance_loss_mlp": 1.05208778, + "epoch": 0.37110427087341286, + "flos": 462941899776.0, + "grad_norm": 0.056849308308669105, + "language_loss": 0.83676869, + "learning_rate": 0.000724688258884151, + "loss": 0.84763837, + "num_input_tokens_seen": 160367904, + "router_z_loss_mlp": 0.34912109, + "step": 1929, + "time_per_iteration": 2.522596597671509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085634, + "balance_loss_mlp": 1.05204105, + "epoch": 0.3712966525586764, + "flos": 849303843840.0, + "grad_norm": 0.0484214736208702, + "language_loss": 0.86208755, + "learning_rate": 0.0007244099013137002, + "loss": 0.87294388, + "num_input_tokens_seen": 160453600, + "router_z_loss_mlp": 0.33618164, + "step": 1930, + "time_per_iteration": 3.055302619934082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089345, + "balance_loss_mlp": 1.05370176, + "epoch": 0.37148903424394, + "flos": 925555092480.0, + "grad_norm": 0.05147814185741214, + "language_loss": 0.88918859, + "learning_rate": 0.0007241314566195993, + "loss": 0.90008199, + "num_input_tokens_seen": 160543472, + "router_z_loss_mlp": 0.35693359, + "step": 1931, + "time_per_iteration": 3.249950408935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108542, + "balance_loss_mlp": 1.05020559, + "epoch": 0.37168141592920356, + "flos": 519565473792.0, + "grad_norm": 0.061459583473066896, + "language_loss": 0.85347825, + "learning_rate": 0.0007238529249099496, + "loss": 0.86433244, + "num_input_tokens_seen": 160614016, + "router_z_loss_mlp": 0.35253906, + "step": 1932, + "time_per_iteration": 2.603287696838379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068675, + "balance_loss_mlp": 1.05599129, + "epoch": 0.3718737976144671, + "flos": 1445107267584.0, + "grad_norm": 0.02721294021284605, + "language_loss": 0.77856874, + "learning_rate": 0.0007235743062928872, + "loss": 0.7892555, + "num_input_tokens_seen": 160828640, + "router_z_loss_mlp": 0.12695312, + "step": 1933, + "time_per_iteration": 4.850685358047485 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089729, + "balance_loss_mlp": 1.05346537, + "epoch": 0.3720661792997307, + "flos": 759218558976.0, + "grad_norm": 0.08735029164116491, + "language_loss": 0.80658156, + "learning_rate": 0.000723295600876581, + "loss": 0.81747884, + "num_input_tokens_seen": 160913088, + "router_z_loss_mlp": 0.36279297, + "step": 1934, + "time_per_iteration": 3.0082099437713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092646, + "balance_loss_mlp": 1.05690694, + "epoch": 0.3722585609849942, + "flos": 516686031360.0, + "grad_norm": 0.1760204301041219, + "language_loss": 0.8798061, + "learning_rate": 0.0007230168087692344, + "loss": 0.89073259, + "num_input_tokens_seen": 160982960, + "router_z_loss_mlp": 0.35791016, + "step": 1935, + "time_per_iteration": 2.7076756954193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095918, + "balance_loss_mlp": 1.06070328, + "epoch": 0.3724509426702578, + "flos": 782114171904.0, + "grad_norm": 0.058450977170247324, + "language_loss": 0.82290804, + "learning_rate": 0.0007227379300790839, + "loss": 0.83386725, + "num_input_tokens_seen": 161066000, + "router_z_loss_mlp": 0.35205078, + "step": 1936, + "time_per_iteration": 3.0381107330322266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108913, + "balance_loss_mlp": 1.07262528, + "epoch": 0.37264332435552133, + "flos": 391502246400.0, + "grad_norm": 0.062314619417064634, + "language_loss": 0.85779369, + "learning_rate": 0.0007224589649143997, + "loss": 0.86888283, + "num_input_tokens_seen": 161131040, + "router_z_loss_mlp": 0.36328125, + "step": 1937, + "time_per_iteration": 2.5413918495178223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118094, + "balance_loss_mlp": 1.08223581, + "epoch": 0.3728357060407849, + "flos": 542599299072.0, + "grad_norm": 0.08241458585549921, + "language_loss": 0.80921531, + "learning_rate": 0.0007221799133834861, + "loss": 0.82039624, + "num_input_tokens_seen": 161201248, + "router_z_loss_mlp": 0.35839844, + "step": 1938, + "time_per_iteration": 2.620593309402466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122902, + "balance_loss_mlp": 1.08682895, + "epoch": 0.3730280877260485, + "flos": 433344646656.0, + "grad_norm": 0.05702640818290307, + "language_loss": 0.81373966, + "learning_rate": 0.00072190077559468, + "loss": 0.8249687, + "num_input_tokens_seen": 161266288, + "router_z_loss_mlp": 0.36083984, + "step": 1939, + "time_per_iteration": 2.512871026992798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133095, + "balance_loss_mlp": 1.09587836, + "epoch": 0.37322046941131204, + "flos": 531230068224.0, + "grad_norm": 0.0616329871980105, + "language_loss": 0.89228082, + "learning_rate": 0.0007216215516563527, + "loss": 0.90361178, + "num_input_tokens_seen": 161335648, + "router_z_loss_mlp": 0.37207031, + "step": 1940, + "time_per_iteration": 2.6655144691467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113311, + "balance_loss_mlp": 1.09534478, + "epoch": 0.3734128510965756, + "flos": 531294087168.0, + "grad_norm": 0.05659412158312536, + "language_loss": 0.83479297, + "learning_rate": 0.0007213422416769083, + "loss": 0.84612405, + "num_input_tokens_seen": 161403440, + "router_z_loss_mlp": 0.37744141, + "step": 1941, + "time_per_iteration": 2.6088144779205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127109, + "balance_loss_mlp": 1.09022546, + "epoch": 0.37360523278183916, + "flos": 500195284992.0, + "grad_norm": 0.05910558413712496, + "language_loss": 0.74991721, + "learning_rate": 0.0007210628457647849, + "loss": 0.76118833, + "num_input_tokens_seen": 161472864, + "router_z_loss_mlp": 0.36889648, + "step": 1942, + "time_per_iteration": 2.57867169380188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131479, + "balance_loss_mlp": 1.09366596, + "epoch": 0.37379761446710275, + "flos": 547652413440.0, + "grad_norm": 0.06364761456819781, + "language_loss": 0.79148316, + "learning_rate": 0.000720783364028453, + "loss": 0.80279785, + "num_input_tokens_seen": 161548096, + "router_z_loss_mlp": 0.37768555, + "step": 1943, + "time_per_iteration": 2.744575023651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121556, + "balance_loss_mlp": 1.0834806, + "epoch": 0.3739899961523663, + "flos": 475517316096.0, + "grad_norm": 0.05406366318559307, + "language_loss": 0.87411249, + "learning_rate": 0.0007205037965764177, + "loss": 0.88532799, + "num_input_tokens_seen": 161615600, + "router_z_loss_mlp": 0.38061523, + "step": 1944, + "time_per_iteration": 2.5253238677978516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122588, + "balance_loss_mlp": 1.0851568, + "epoch": 0.37418237783762986, + "flos": 611626581504.0, + "grad_norm": 0.05571778703090581, + "language_loss": 0.85614675, + "learning_rate": 0.0007202241435172161, + "loss": 0.86737263, + "num_input_tokens_seen": 161687408, + "router_z_loss_mlp": 0.37426758, + "step": 1945, + "time_per_iteration": 2.7462716102600098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117651, + "balance_loss_mlp": 1.07931328, + "epoch": 0.3743747595228934, + "flos": 765953694720.0, + "grad_norm": 0.05225192391609906, + "language_loss": 0.88148731, + "learning_rate": 0.0007199444049594198, + "loss": 0.89266384, + "num_input_tokens_seen": 161764224, + "router_z_loss_mlp": 0.38330078, + "step": 1946, + "time_per_iteration": 2.9533469676971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116752, + "balance_loss_mlp": 1.07798529, + "epoch": 0.374567141208157, + "flos": 524120993280.0, + "grad_norm": 0.06150523549490838, + "language_loss": 0.83402771, + "learning_rate": 0.0007196645810116322, + "loss": 0.84519523, + "num_input_tokens_seen": 161835520, + "router_z_loss_mlp": 0.38720703, + "step": 1947, + "time_per_iteration": 2.709965705871582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106001, + "balance_loss_mlp": 1.06735349, + "epoch": 0.37475952289342057, + "flos": 681067090944.0, + "grad_norm": 0.05833074938802531, + "language_loss": 0.83909506, + "learning_rate": 0.0007193846717824912, + "loss": 0.850155, + "num_input_tokens_seen": 161912000, + "router_z_loss_mlp": 0.38598633, + "step": 1948, + "time_per_iteration": 2.854522705078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094783, + "balance_loss_mlp": 1.05682671, + "epoch": 0.3749519045786841, + "flos": 460061047296.0, + "grad_norm": 0.06673844071937801, + "language_loss": 0.88263041, + "learning_rate": 0.0007191046773806669, + "loss": 0.89357823, + "num_input_tokens_seen": 161977296, + "router_z_loss_mlp": 0.37915039, + "step": 1949, + "time_per_iteration": 2.575989007949829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085738, + "balance_loss_mlp": 1.04682803, + "epoch": 0.3751442862639477, + "flos": 954471458304.0, + "grad_norm": 0.06638817682476543, + "language_loss": 0.83010924, + "learning_rate": 0.0007188245979148631, + "loss": 0.84096658, + "num_input_tokens_seen": 162051888, + "router_z_loss_mlp": 0.38867188, + "step": 1950, + "time_per_iteration": 3.1386518478393555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082888, + "balance_loss_mlp": 1.04462171, + "epoch": 0.3753366679492112, + "flos": 527483473920.0, + "grad_norm": 0.05996025340147905, + "language_loss": 0.8766306, + "learning_rate": 0.0007185444334938157, + "loss": 0.88745946, + "num_input_tokens_seen": 162124384, + "router_z_loss_mlp": 0.38232422, + "step": 1951, + "time_per_iteration": 2.644848585128784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082635, + "balance_loss_mlp": 1.04501283, + "epoch": 0.3755290496344748, + "flos": 521535504384.0, + "grad_norm": 0.05938829335869994, + "language_loss": 0.84891546, + "learning_rate": 0.0007182641842262947, + "loss": 0.85974181, + "num_input_tokens_seen": 162191440, + "router_z_loss_mlp": 0.3762207, + "step": 1952, + "time_per_iteration": 2.6239395141601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081192, + "balance_loss_mlp": 1.04361689, + "epoch": 0.37572143131973834, + "flos": 620810403840.0, + "grad_norm": 0.06544951097265184, + "language_loss": 0.77827752, + "learning_rate": 0.0007179838502211022, + "loss": 0.78908944, + "num_input_tokens_seen": 162268480, + "router_z_loss_mlp": 0.37524414, + "step": 1953, + "time_per_iteration": 2.8444712162017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076992, + "balance_loss_mlp": 1.03958416, + "epoch": 0.37591381300500193, + "flos": 770635842048.0, + "grad_norm": 0.05616797515781331, + "language_loss": 0.86183697, + "learning_rate": 0.0007177034315870738, + "loss": 0.87260687, + "num_input_tokens_seen": 162346752, + "router_z_loss_mlp": 0.37402344, + "step": 1954, + "time_per_iteration": 2.9628727436065674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078673, + "balance_loss_mlp": 1.0411936, + "epoch": 0.37610619469026546, + "flos": 520191106560.0, + "grad_norm": 0.05872311076525267, + "language_loss": 0.9098376, + "learning_rate": 0.0007174229284330773, + "loss": 0.92062426, + "num_input_tokens_seen": 162415120, + "router_z_loss_mlp": 0.37402344, + "step": 1955, + "time_per_iteration": 2.579792022705078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010784, + "balance_loss_mlp": 1.0412302, + "epoch": 0.37629857637552905, + "flos": 598524456960.0, + "grad_norm": 0.050284285498010506, + "language_loss": 0.86896843, + "learning_rate": 0.0007171423408680141, + "loss": 0.8797524, + "num_input_tokens_seen": 162493280, + "router_z_loss_mlp": 0.37133789, + "step": 1956, + "time_per_iteration": 2.7764384746551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079026, + "balance_loss_mlp": 1.04211903, + "epoch": 0.37649095806079264, + "flos": 564687396864.0, + "grad_norm": 0.058102078307858664, + "language_loss": 0.89614129, + "learning_rate": 0.0007168616690008176, + "loss": 0.90693152, + "num_input_tokens_seen": 162560736, + "router_z_loss_mlp": 0.36889648, + "step": 1957, + "time_per_iteration": 2.646986246109009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083688, + "balance_loss_mlp": 1.04606569, + "epoch": 0.37668333974605617, + "flos": 592196755968.0, + "grad_norm": 0.10223927136981294, + "language_loss": 0.86142451, + "learning_rate": 0.0007165809129404545, + "loss": 0.8722614, + "num_input_tokens_seen": 162630688, + "router_z_loss_mlp": 0.37573242, + "step": 1958, + "time_per_iteration": 2.756287097930908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081219, + "balance_loss_mlp": 1.04440713, + "epoch": 0.37687572143131975, + "flos": 419257506816.0, + "grad_norm": 0.0560584683493853, + "language_loss": 0.85760534, + "learning_rate": 0.0007163000727959239, + "loss": 0.8684175, + "num_input_tokens_seen": 162694304, + "router_z_loss_mlp": 0.36791992, + "step": 1959, + "time_per_iteration": 2.5415151119232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105237, + "balance_loss_mlp": 1.03587127, + "epoch": 0.3770681031165833, + "flos": 1356484243968.0, + "grad_norm": 0.028402472736143748, + "language_loss": 0.77959073, + "learning_rate": 0.0007160191486762575, + "loss": 0.7901144, + "num_input_tokens_seen": 162920336, + "router_z_loss_mlp": 0.16503906, + "step": 1960, + "time_per_iteration": 4.892657518386841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086261, + "balance_loss_mlp": 1.04892445, + "epoch": 0.3772604848018469, + "flos": 644592107520.0, + "grad_norm": 0.04530874218926827, + "language_loss": 0.84377986, + "learning_rate": 0.00071573814069052, + "loss": 0.85464251, + "num_input_tokens_seen": 163000720, + "router_z_loss_mlp": 0.37329102, + "step": 1961, + "time_per_iteration": 2.898301839828491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088156, + "balance_loss_mlp": 1.05098641, + "epoch": 0.3774528664871104, + "flos": 901265619456.0, + "grad_norm": 0.052585227845940184, + "language_loss": 0.87987518, + "learning_rate": 0.0007154570489478081, + "loss": 0.89075673, + "num_input_tokens_seen": 163085680, + "router_z_loss_mlp": 0.37158203, + "step": 1962, + "time_per_iteration": 3.1638717651367188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088198, + "balance_loss_mlp": 1.05048013, + "epoch": 0.377645248172374, + "flos": 787717315584.0, + "grad_norm": 0.047624248218528294, + "language_loss": 0.864995, + "learning_rate": 0.0007151758735572514, + "loss": 0.87587702, + "num_input_tokens_seen": 163162224, + "router_z_loss_mlp": 0.37695312, + "step": 1963, + "time_per_iteration": 2.985558271408081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090796, + "balance_loss_mlp": 1.05236292, + "epoch": 0.3778376298576376, + "flos": 586417522176.0, + "grad_norm": 0.06598015027050642, + "language_loss": 0.80448836, + "learning_rate": 0.0007148946146280119, + "loss": 0.81539631, + "num_input_tokens_seen": 163237920, + "router_z_loss_mlp": 0.38427734, + "step": 1964, + "time_per_iteration": 2.784947395324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053975, + "balance_loss_mlp": 1.03938425, + "epoch": 0.3780300115429011, + "flos": 1396014759936.0, + "grad_norm": 0.018109037433210438, + "language_loss": 0.72192144, + "learning_rate": 0.000714613272269284, + "loss": 0.73246121, + "num_input_tokens_seen": 163455760, + "router_z_loss_mlp": 0.14550781, + "step": 1965, + "time_per_iteration": 4.85606050491333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052214, + "balance_loss_mlp": 1.03838599, + "epoch": 0.3782223932281647, + "flos": 1356935348736.0, + "grad_norm": 0.019183634636191996, + "language_loss": 0.75341946, + "learning_rate": 0.0007143318465902943, + "loss": 0.76394159, + "num_input_tokens_seen": 163678064, + "router_z_loss_mlp": 0.13867188, + "step": 1966, + "time_per_iteration": 4.946903467178345 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082797, + "balance_loss_mlp": 1.04517484, + "epoch": 0.37841477491342823, + "flos": 703811344896.0, + "grad_norm": 0.05921890511558738, + "language_loss": 0.83782387, + "learning_rate": 0.0007140503377003022, + "loss": 0.84865183, + "num_input_tokens_seen": 163764320, + "router_z_loss_mlp": 0.3762207, + "step": 1967, + "time_per_iteration": 2.984163761138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089165, + "balance_loss_mlp": 1.0504458, + "epoch": 0.3786071565986918, + "flos": 528856985088.0, + "grad_norm": 0.047303083725180994, + "language_loss": 0.84754062, + "learning_rate": 0.000713768745708599, + "loss": 0.85843223, + "num_input_tokens_seen": 163831808, + "router_z_loss_mlp": 0.38696289, + "step": 1968, + "time_per_iteration": 2.6251039505004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086627, + "balance_loss_mlp": 1.04881418, + "epoch": 0.37879953828395535, + "flos": 992872802304.0, + "grad_norm": 0.053091209869219315, + "language_loss": 0.76740122, + "learning_rate": 0.0007134870707245085, + "loss": 0.7782675, + "num_input_tokens_seen": 163918128, + "router_z_loss_mlp": 0.37792969, + "step": 1969, + "time_per_iteration": 3.252840995788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082313, + "balance_loss_mlp": 1.04435682, + "epoch": 0.37899191996921894, + "flos": 626358292992.0, + "grad_norm": 0.06088981396741891, + "language_loss": 0.8454808, + "learning_rate": 0.0007132053128573864, + "loss": 0.85630393, + "num_input_tokens_seen": 163987552, + "router_z_loss_mlp": 0.37915039, + "step": 1970, + "time_per_iteration": 2.739210844039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078955, + "balance_loss_mlp": 1.0417614, + "epoch": 0.37918430165448247, + "flos": 686005314048.0, + "grad_norm": 0.05972304110224919, + "language_loss": 0.83631253, + "learning_rate": 0.0007129234722166211, + "loss": 0.84710205, + "num_input_tokens_seen": 164063248, + "router_z_loss_mlp": 0.37182617, + "step": 1971, + "time_per_iteration": 2.814235210418701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086171, + "balance_loss_mlp": 1.05012178, + "epoch": 0.37937668333974606, + "flos": 475374721536.0, + "grad_norm": 0.05230765101952506, + "language_loss": 0.91063309, + "learning_rate": 0.0007126415489116328, + "loss": 0.92149478, + "num_input_tokens_seen": 164133776, + "router_z_loss_mlp": 0.3605957, + "step": 1972, + "time_per_iteration": 2.657435178756714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091568, + "balance_loss_mlp": 1.05413604, + "epoch": 0.37956906502500964, + "flos": 707271340032.0, + "grad_norm": 0.05210329015751025, + "language_loss": 0.81174934, + "learning_rate": 0.0007123595430518736, + "loss": 0.82266498, + "num_input_tokens_seen": 164206672, + "router_z_loss_mlp": 0.37402344, + "step": 1973, + "time_per_iteration": 2.832801103591919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108628, + "balance_loss_mlp": 1.04856205, + "epoch": 0.3797614467102732, + "flos": 426421836288.0, + "grad_norm": 0.07403044475037865, + "language_loss": 0.8602494, + "learning_rate": 0.0007120774547468282, + "loss": 0.87111217, + "num_input_tokens_seen": 164271968, + "router_z_loss_mlp": 0.37695312, + "step": 1974, + "time_per_iteration": 2.523059844970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087165, + "balance_loss_mlp": 1.05016232, + "epoch": 0.37995382839553676, + "flos": 481588941312.0, + "grad_norm": 0.05250431859571283, + "language_loss": 0.81228226, + "learning_rate": 0.0007117952841060128, + "loss": 0.82315391, + "num_input_tokens_seen": 164342800, + "router_z_loss_mlp": 0.36962891, + "step": 1975, + "time_per_iteration": 2.648947238922119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080927, + "balance_loss_mlp": 1.04409158, + "epoch": 0.3801462100808003, + "flos": 560286056448.0, + "grad_norm": 0.0511194255012935, + "language_loss": 0.83466387, + "learning_rate": 0.0007115130312389756, + "loss": 0.84547317, + "num_input_tokens_seen": 164414928, + "router_z_loss_mlp": 0.3684082, + "step": 1976, + "time_per_iteration": 2.6648154258728027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086722, + "balance_loss_mlp": 1.0505538, + "epoch": 0.3803385917660639, + "flos": 464699524608.0, + "grad_norm": 0.06028169205400359, + "language_loss": 0.79143679, + "learning_rate": 0.0007112306962552973, + "loss": 0.80230403, + "num_input_tokens_seen": 164483312, + "router_z_loss_mlp": 0.36181641, + "step": 1977, + "time_per_iteration": 2.5715434551239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086468, + "balance_loss_mlp": 1.04934657, + "epoch": 0.3805309734513274, + "flos": 521614080000.0, + "grad_norm": 0.055719330197324175, + "language_loss": 0.8517288, + "learning_rate": 0.0007109482792645896, + "loss": 0.86259341, + "num_input_tokens_seen": 164555760, + "router_z_loss_mlp": 0.37084961, + "step": 1978, + "time_per_iteration": 2.6932663917541504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088226, + "balance_loss_mlp": 1.05222487, + "epoch": 0.380723355136591, + "flos": 591128782848.0, + "grad_norm": 0.06665517257748008, + "language_loss": 0.83491528, + "learning_rate": 0.0007106657803764969, + "loss": 0.84579754, + "num_input_tokens_seen": 164626768, + "router_z_loss_mlp": 0.36010742, + "step": 1979, + "time_per_iteration": 2.710658311843872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079049, + "balance_loss_mlp": 1.04204643, + "epoch": 0.38091573682185453, + "flos": 622394910720.0, + "grad_norm": 0.05735071115872701, + "language_loss": 0.81648314, + "learning_rate": 0.0007103831997006948, + "loss": 0.82727367, + "num_input_tokens_seen": 164698016, + "router_z_loss_mlp": 0.36987305, + "step": 1980, + "time_per_iteration": 2.7589223384857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107835, + "balance_loss_mlp": 1.04168153, + "epoch": 0.3811081185071181, + "flos": 568716208128.0, + "grad_norm": 0.047165366669346453, + "language_loss": 0.85731214, + "learning_rate": 0.0007101005373468908, + "loss": 0.86809564, + "num_input_tokens_seen": 164780320, + "router_z_loss_mlp": 0.36669922, + "step": 1981, + "time_per_iteration": 2.85588002204895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077478, + "balance_loss_mlp": 1.04161954, + "epoch": 0.3813005001923817, + "flos": 584550798336.0, + "grad_norm": 0.055048826019740454, + "language_loss": 0.86394024, + "learning_rate": 0.0007098177934248242, + "loss": 0.87471503, + "num_input_tokens_seen": 164854400, + "router_z_loss_mlp": 0.35888672, + "step": 1982, + "time_per_iteration": 2.7226805686950684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077944, + "balance_loss_mlp": 1.04160953, + "epoch": 0.38149288187764524, + "flos": 621287649792.0, + "grad_norm": 0.056689743602043985, + "language_loss": 0.85823661, + "learning_rate": 0.0007095349680442661, + "loss": 0.86901605, + "num_input_tokens_seen": 164932896, + "router_z_loss_mlp": 0.36352539, + "step": 1983, + "time_per_iteration": 2.8454253673553467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075326, + "balance_loss_mlp": 1.03829932, + "epoch": 0.3816852635629088, + "flos": 570414196224.0, + "grad_norm": 0.07971741755252446, + "language_loss": 0.78927159, + "learning_rate": 0.0007092520613150188, + "loss": 0.80002487, + "num_input_tokens_seen": 165002896, + "router_z_loss_mlp": 0.37036133, + "step": 1984, + "time_per_iteration": 2.7279529571533203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081451, + "balance_loss_mlp": 1.04368556, + "epoch": 0.38187764524817236, + "flos": 565313029632.0, + "grad_norm": 0.06238598748372814, + "language_loss": 0.81304747, + "learning_rate": 0.0007089690733469165, + "loss": 0.82386196, + "num_input_tokens_seen": 165074704, + "router_z_loss_mlp": 0.37719727, + "step": 1985, + "time_per_iteration": 2.7343544960021973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077201, + "balance_loss_mlp": 1.04115212, + "epoch": 0.38207002693343595, + "flos": 630932751360.0, + "grad_norm": 0.07832972313002672, + "language_loss": 0.82561398, + "learning_rate": 0.000708686004249825, + "loss": 0.83638602, + "num_input_tokens_seen": 165149136, + "router_z_loss_mlp": 0.36035156, + "step": 1986, + "time_per_iteration": 2.7691054344177246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082228, + "balance_loss_mlp": 1.04572582, + "epoch": 0.3822624086186995, + "flos": 548507980800.0, + "grad_norm": 0.053849318526496194, + "language_loss": 0.9147824, + "learning_rate": 0.0007084028541336413, + "loss": 0.9256047, + "num_input_tokens_seen": 165220864, + "router_z_loss_mlp": 0.36499023, + "step": 1987, + "time_per_iteration": 2.7131056785583496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083994, + "balance_loss_mlp": 1.04753971, + "epoch": 0.38245479030396307, + "flos": 613571880960.0, + "grad_norm": 0.06787860515410171, + "language_loss": 0.86709088, + "learning_rate": 0.0007081196231082942, + "loss": 0.87793082, + "num_input_tokens_seen": 165301568, + "router_z_loss_mlp": 0.36450195, + "step": 1988, + "time_per_iteration": 2.7983548641204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083168, + "balance_loss_mlp": 1.04621339, + "epoch": 0.38264717198922665, + "flos": 667787466240.0, + "grad_norm": 0.05230973877590939, + "language_loss": 0.80107033, + "learning_rate": 0.0007078363112837436, + "loss": 0.81190205, + "num_input_tokens_seen": 165373152, + "router_z_loss_mlp": 0.36938477, + "step": 1989, + "time_per_iteration": 2.8133270740509033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087048, + "balance_loss_mlp": 1.04935408, + "epoch": 0.3828395536744902, + "flos": 454521922560.0, + "grad_norm": 0.077904410907181, + "language_loss": 0.84988701, + "learning_rate": 0.000707552918769981, + "loss": 0.86075753, + "num_input_tokens_seen": 165439136, + "router_z_loss_mlp": 0.37646484, + "step": 1990, + "time_per_iteration": 2.5100817680358887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089992, + "balance_loss_mlp": 1.05213106, + "epoch": 0.3830319353597538, + "flos": 499191330816.0, + "grad_norm": 0.06242573245055077, + "language_loss": 0.83457661, + "learning_rate": 0.000707269445677029, + "loss": 0.84547657, + "num_input_tokens_seen": 165514624, + "router_z_loss_mlp": 0.37817383, + "step": 1991, + "time_per_iteration": 2.7737526893615723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099946, + "balance_loss_mlp": 1.06327772, + "epoch": 0.3832243170450173, + "flos": 743787021312.0, + "grad_norm": 0.05437985066129539, + "language_loss": 0.84858984, + "learning_rate": 0.0007069858921149416, + "loss": 0.85958934, + "num_input_tokens_seen": 165594512, + "router_z_loss_mlp": 0.36694336, + "step": 1992, + "time_per_iteration": 2.9642581939697266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095684, + "balance_loss_mlp": 1.05868101, + "epoch": 0.3834166987302809, + "flos": 577937908224.0, + "grad_norm": 0.10762195872615073, + "language_loss": 0.85869837, + "learning_rate": 0.0007067022581938043, + "loss": 0.86965525, + "num_input_tokens_seen": 165673968, + "router_z_loss_mlp": 0.36987305, + "step": 1993, + "time_per_iteration": 2.805201292037964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101221, + "balance_loss_mlp": 1.06531525, + "epoch": 0.3836090804155444, + "flos": 536194432512.0, + "grad_norm": 0.06280477504596697, + "language_loss": 0.831635, + "learning_rate": 0.0007064185440237334, + "loss": 0.84264719, + "num_input_tokens_seen": 165747664, + "router_z_loss_mlp": 0.359375, + "step": 1994, + "time_per_iteration": 2.7297706604003906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101916, + "balance_loss_mlp": 1.06527066, + "epoch": 0.383801462100808, + "flos": 601587191808.0, + "grad_norm": 0.05513764490979663, + "language_loss": 0.84278905, + "learning_rate": 0.0007061347497148764, + "loss": 0.85380822, + "num_input_tokens_seen": 165824624, + "router_z_loss_mlp": 0.36621094, + "step": 1995, + "time_per_iteration": 2.725632429122925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094031, + "balance_loss_mlp": 1.05876923, + "epoch": 0.38399384378607154, + "flos": 572427896832.0, + "grad_norm": 0.06604776765413087, + "language_loss": 0.86282277, + "learning_rate": 0.0007058508753774122, + "loss": 0.87376308, + "num_input_tokens_seen": 165896304, + "router_z_loss_mlp": 0.35302734, + "step": 1996, + "time_per_iteration": 2.760045051574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092829, + "balance_loss_mlp": 1.05773425, + "epoch": 0.38418622547133513, + "flos": 536513117184.0, + "grad_norm": 0.058737109015633, + "language_loss": 0.86788458, + "learning_rate": 0.0007055669211215505, + "loss": 0.87881291, + "num_input_tokens_seen": 165961312, + "router_z_loss_mlp": 0.35131836, + "step": 1997, + "time_per_iteration": 2.6091408729553223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088772, + "balance_loss_mlp": 1.05238962, + "epoch": 0.3843786071565987, + "flos": 572673798144.0, + "grad_norm": 0.06483433205315106, + "language_loss": 0.77687544, + "learning_rate": 0.0007052828870575322, + "loss": 0.78776312, + "num_input_tokens_seen": 166028064, + "router_z_loss_mlp": 0.36376953, + "step": 1998, + "time_per_iteration": 2.671349048614502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109131, + "balance_loss_mlp": 1.0558331, + "epoch": 0.38457098884186225, + "flos": 728361275904.0, + "grad_norm": 0.05010154832824161, + "language_loss": 0.86955881, + "learning_rate": 0.0007049987732956291, + "loss": 0.88047194, + "num_input_tokens_seen": 166110272, + "router_z_loss_mlp": 0.35498047, + "step": 1999, + "time_per_iteration": 2.9918439388275146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108718, + "balance_loss_mlp": 1.05189395, + "epoch": 0.38476337052712584, + "flos": 583123442688.0, + "grad_norm": 0.047224279388360366, + "language_loss": 0.82623643, + "learning_rate": 0.0007047145799461439, + "loss": 0.83710825, + "num_input_tokens_seen": 166193088, + "router_z_loss_mlp": 0.35327148, + "step": 2000, + "time_per_iteration": 2.84328293800354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092374, + "balance_loss_mlp": 1.05656374, + "epoch": 0.38495575221238937, + "flos": 552787075584.0, + "grad_norm": 0.05254385134155795, + "language_loss": 0.82269979, + "learning_rate": 0.00070443030711941, + "loss": 0.83362353, + "num_input_tokens_seen": 166271776, + "router_z_loss_mlp": 0.3581543, + "step": 2001, + "time_per_iteration": 2.753903865814209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092226, + "balance_loss_mlp": 1.0559628, + "epoch": 0.38514813389765296, + "flos": 654173190144.0, + "grad_norm": 0.05896823149323879, + "language_loss": 0.8241961, + "learning_rate": 0.0007041459549257924, + "loss": 0.83511841, + "num_input_tokens_seen": 166350000, + "router_z_loss_mlp": 0.36254883, + "step": 2002, + "time_per_iteration": 2.85963773727417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086715, + "balance_loss_mlp": 1.05030835, + "epoch": 0.3853405155829165, + "flos": 867715158528.0, + "grad_norm": 0.0671523306708724, + "language_loss": 0.78569824, + "learning_rate": 0.0007038615234756859, + "loss": 0.79656541, + "num_input_tokens_seen": 166434336, + "router_z_loss_mlp": 0.36425781, + "step": 2003, + "time_per_iteration": 3.1452226638793945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080594, + "balance_loss_mlp": 1.04440188, + "epoch": 0.3855328972681801, + "flos": 546164011008.0, + "grad_norm": 0.05736478292188374, + "language_loss": 0.83675313, + "learning_rate": 0.000703577012879517, + "loss": 0.84755898, + "num_input_tokens_seen": 166503952, + "router_z_loss_mlp": 0.36230469, + "step": 2004, + "time_per_iteration": 2.6308705806732178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075931, + "balance_loss_mlp": 1.04040706, + "epoch": 0.3857252789534436, + "flos": 533819939328.0, + "grad_norm": 0.0602394573591363, + "language_loss": 0.8843599, + "learning_rate": 0.0007032924232477423, + "loss": 0.89511919, + "num_input_tokens_seen": 166575168, + "router_z_loss_mlp": 0.35595703, + "step": 2005, + "time_per_iteration": 2.6188220977783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079495, + "balance_loss_mlp": 1.04337406, + "epoch": 0.3859176606387072, + "flos": 491514849792.0, + "grad_norm": 0.055511202055775664, + "language_loss": 0.80448711, + "learning_rate": 0.0007030077546908493, + "loss": 0.81528199, + "num_input_tokens_seen": 166647552, + "router_z_loss_mlp": 0.36132812, + "step": 2006, + "time_per_iteration": 2.6309516429901123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088067, + "balance_loss_mlp": 1.07609844, + "epoch": 0.3861100423239708, + "flos": 1486278955008.0, + "grad_norm": 0.032522163132150485, + "language_loss": 0.83064663, + "learning_rate": 0.0007027230073193561, + "loss": 0.84152722, + "num_input_tokens_seen": 166875088, + "router_z_loss_mlp": 0.11962891, + "step": 2007, + "time_per_iteration": 4.736604452133179 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083286, + "balance_loss_mlp": 1.04816651, + "epoch": 0.3863024240092343, + "flos": 473493441024.0, + "grad_norm": 0.05514866045126494, + "language_loss": 0.79152983, + "learning_rate": 0.0007024381812438117, + "loss": 0.80236268, + "num_input_tokens_seen": 166939344, + "router_z_loss_mlp": 0.3515625, + "step": 2008, + "time_per_iteration": 2.5392396450042725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108868, + "balance_loss_mlp": 1.05306053, + "epoch": 0.3864948056944979, + "flos": 716258723328.0, + "grad_norm": 0.059806412844581394, + "language_loss": 0.83199877, + "learning_rate": 0.0007021532765747951, + "loss": 0.84288561, + "num_input_tokens_seen": 167014992, + "router_z_loss_mlp": 0.35668945, + "step": 2009, + "time_per_iteration": 2.9926528930664062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087876, + "balance_loss_mlp": 1.0511353, + "epoch": 0.38668718737976143, + "flos": 727302067200.0, + "grad_norm": 0.05631620148302912, + "language_loss": 0.7933259, + "learning_rate": 0.0007018682934229162, + "loss": 0.8042047, + "num_input_tokens_seen": 167092096, + "router_z_loss_mlp": 0.36743164, + "step": 2010, + "time_per_iteration": 2.924781322479248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087301, + "balance_loss_mlp": 1.05103779, + "epoch": 0.386879569065025, + "flos": 525218079744.0, + "grad_norm": 0.05794664731816873, + "language_loss": 0.82387936, + "learning_rate": 0.0007015832318988152, + "loss": 0.83475244, + "num_input_tokens_seen": 167162144, + "router_z_loss_mlp": 0.36328125, + "step": 2011, + "time_per_iteration": 2.668565511703491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032799, + "balance_loss_mlp": 1.02173615, + "epoch": 0.38707195075028855, + "flos": 1527036005376.0, + "grad_norm": 0.019732384975687786, + "language_loss": 0.73890078, + "learning_rate": 0.000701298092113163, + "loss": 0.74922872, + "num_input_tokens_seen": 167391536, + "router_z_loss_mlp": 0.11083984, + "step": 2012, + "time_per_iteration": 4.948081970214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085105, + "balance_loss_mlp": 1.04886508, + "epoch": 0.38726433243555214, + "flos": 557045821440.0, + "grad_norm": 0.049164425244227684, + "language_loss": 0.84333575, + "learning_rate": 0.0007010128741766604, + "loss": 0.85418677, + "num_input_tokens_seen": 167466000, + "router_z_loss_mlp": 0.36230469, + "step": 2013, + "time_per_iteration": 2.7263684272766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073123, + "balance_loss_mlp": 1.03695476, + "epoch": 0.38745671412081567, + "flos": 553431647232.0, + "grad_norm": 0.06787190277242791, + "language_loss": 0.84107876, + "learning_rate": 0.0007007275782000391, + "loss": 0.85181004, + "num_input_tokens_seen": 167536144, + "router_z_loss_mlp": 0.36181641, + "step": 2014, + "time_per_iteration": 2.6458756923675537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082824, + "balance_loss_mlp": 1.04679942, + "epoch": 0.38764909580607926, + "flos": 458175384576.0, + "grad_norm": 0.05583745089019265, + "language_loss": 0.85148585, + "learning_rate": 0.0007004422042940605, + "loss": 0.86231411, + "num_input_tokens_seen": 167600064, + "router_z_loss_mlp": 0.3605957, + "step": 2015, + "time_per_iteration": 2.5374679565429688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074881, + "balance_loss_mlp": 1.03847444, + "epoch": 0.38784147749134285, + "flos": 521973462528.0, + "grad_norm": 0.056017537147394686, + "language_loss": 0.89528251, + "learning_rate": 0.0007001567525695169, + "loss": 0.90603131, + "num_input_tokens_seen": 167666576, + "router_z_loss_mlp": 0.36425781, + "step": 2016, + "time_per_iteration": 2.5863571166992188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081449, + "balance_loss_mlp": 1.04504275, + "epoch": 0.3880338591766064, + "flos": 665696600064.0, + "grad_norm": 0.05583938490392423, + "language_loss": 0.839926, + "learning_rate": 0.0006998712231372303, + "loss": 0.85074055, + "num_input_tokens_seen": 167753296, + "router_z_loss_mlp": 0.36425781, + "step": 2017, + "time_per_iteration": 2.998652219772339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076964, + "balance_loss_mlp": 1.04103458, + "epoch": 0.38822624086186996, + "flos": 593660427264.0, + "grad_norm": 0.044278068469259586, + "language_loss": 0.86088806, + "learning_rate": 0.0006995856161080532, + "loss": 0.87165773, + "num_input_tokens_seen": 167834080, + "router_z_loss_mlp": 0.35961914, + "step": 2018, + "time_per_iteration": 2.870619297027588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077822, + "balance_loss_mlp": 1.03972268, + "epoch": 0.3884186225471335, + "flos": 612256596480.0, + "grad_norm": 0.10653426783792587, + "language_loss": 0.8221643, + "learning_rate": 0.0006992999315928679, + "loss": 0.83294249, + "num_input_tokens_seen": 167912368, + "router_z_loss_mlp": 0.38061523, + "step": 2019, + "time_per_iteration": 2.7867438793182373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074258, + "balance_loss_mlp": 1.03782773, + "epoch": 0.3886110042323971, + "flos": 606737820672.0, + "grad_norm": 0.05830260104080337, + "language_loss": 0.85476196, + "learning_rate": 0.0006990141697025871, + "loss": 0.8655045, + "num_input_tokens_seen": 167991968, + "router_z_loss_mlp": 0.36401367, + "step": 2020, + "time_per_iteration": 2.7722346782684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008633, + "balance_loss_mlp": 0.99642587, + "epoch": 0.3888033859176606, + "flos": 1527289108992.0, + "grad_norm": 0.011259985776032525, + "language_loss": 0.76359642, + "learning_rate": 0.0006987283305481533, + "loss": 0.77368271, + "num_input_tokens_seen": 168212128, + "router_z_loss_mlp": 0.12207031, + "step": 2021, + "time_per_iteration": 4.71975302696228 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069739, + "balance_loss_mlp": 1.03368998, + "epoch": 0.3889957676029242, + "flos": 692145340416.0, + "grad_norm": 0.08577921290538253, + "language_loss": 0.82040119, + "learning_rate": 0.0006984424142405392, + "loss": 0.83109856, + "num_input_tokens_seen": 168287440, + "router_z_loss_mlp": 0.36035156, + "step": 2022, + "time_per_iteration": 2.8003616333007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070636, + "balance_loss_mlp": 1.03413415, + "epoch": 0.3891881492881878, + "flos": 514937170944.0, + "grad_norm": 0.06357614890279897, + "language_loss": 0.81860286, + "learning_rate": 0.0006981564208907474, + "loss": 0.82930923, + "num_input_tokens_seen": 168354704, + "router_z_loss_mlp": 0.36499023, + "step": 2023, + "time_per_iteration": 2.581556558609009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074791, + "balance_loss_mlp": 1.03933799, + "epoch": 0.3893805309734513, + "flos": 628770663936.0, + "grad_norm": 0.04985256691663517, + "language_loss": 0.90055227, + "learning_rate": 0.0006978703506098102, + "loss": 0.91130018, + "num_input_tokens_seen": 168424272, + "router_z_loss_mlp": 0.35498047, + "step": 2024, + "time_per_iteration": 2.7220654487609863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080786, + "balance_loss_mlp": 1.04497564, + "epoch": 0.3895729126587149, + "flos": 543894234624.0, + "grad_norm": 0.06500254639996711, + "language_loss": 0.88078821, + "learning_rate": 0.00069758420350879, + "loss": 0.89159608, + "num_input_tokens_seen": 168488912, + "router_z_loss_mlp": 0.35839844, + "step": 2025, + "time_per_iteration": 2.6044023036956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086131, + "balance_loss_mlp": 1.05012965, + "epoch": 0.38976529434397844, + "flos": 617987778048.0, + "grad_norm": 0.06153368317516065, + "language_loss": 0.86008936, + "learning_rate": 0.000697297979698779, + "loss": 0.87095064, + "num_input_tokens_seen": 168563248, + "router_z_loss_mlp": 0.36010742, + "step": 2026, + "time_per_iteration": 2.7051053047180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091671, + "balance_loss_mlp": 1.05628932, + "epoch": 0.38995767602924203, + "flos": 834518287872.0, + "grad_norm": 0.05732441037152358, + "language_loss": 0.83766049, + "learning_rate": 0.0006970116792908992, + "loss": 0.84857726, + "num_input_tokens_seen": 168648272, + "router_z_loss_mlp": 0.35400391, + "step": 2027, + "time_per_iteration": 3.086228847503662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096114, + "balance_loss_mlp": 1.06032705, + "epoch": 0.39015005771450556, + "flos": 541343651328.0, + "grad_norm": 0.060477391230123065, + "language_loss": 0.8159399, + "learning_rate": 0.000696725302396302, + "loss": 0.82690096, + "num_input_tokens_seen": 168721760, + "router_z_loss_mlp": 0.3581543, + "step": 2028, + "time_per_iteration": 2.6521902084350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093769, + "balance_loss_mlp": 1.05867422, + "epoch": 0.39034243939976915, + "flos": 1007102536704.0, + "grad_norm": 0.04866281229524116, + "language_loss": 0.85781944, + "learning_rate": 0.0006964388491261692, + "loss": 0.86875713, + "num_input_tokens_seen": 168803664, + "router_z_loss_mlp": 0.35131836, + "step": 2029, + "time_per_iteration": 3.2338647842407227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099777, + "balance_loss_mlp": 1.06401408, + "epoch": 0.3905348210850327, + "flos": 678723121152.0, + "grad_norm": 0.05278281932643199, + "language_loss": 0.87335277, + "learning_rate": 0.0006961523195917114, + "loss": 0.88435054, + "num_input_tokens_seen": 168879184, + "router_z_loss_mlp": 0.35791016, + "step": 2030, + "time_per_iteration": 2.8414504528045654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098277, + "balance_loss_mlp": 1.06256151, + "epoch": 0.39072720277029627, + "flos": 548606905344.0, + "grad_norm": 0.05643291477722073, + "language_loss": 0.77850938, + "learning_rate": 0.0006958657139041696, + "loss": 0.78949213, + "num_input_tokens_seen": 168957808, + "router_z_loss_mlp": 0.35742188, + "step": 2031, + "time_per_iteration": 2.7278060913085938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091715, + "balance_loss_mlp": 1.07807708, + "epoch": 0.39091958445555985, + "flos": 1546912401408.0, + "grad_norm": 0.03426807627657635, + "language_loss": 0.76712966, + "learning_rate": 0.0006955790321748136, + "loss": 0.77804685, + "num_input_tokens_seen": 169194416, + "router_z_loss_mlp": 0.13671875, + "step": 2032, + "time_per_iteration": 4.927444696426392 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099209, + "balance_loss_mlp": 1.06399429, + "epoch": 0.3911119661408234, + "flos": 503741058048.0, + "grad_norm": 0.050822615130563034, + "language_loss": 0.78371578, + "learning_rate": 0.0006952922745149434, + "loss": 0.79470789, + "num_input_tokens_seen": 169263552, + "router_z_loss_mlp": 0.35253906, + "step": 2033, + "time_per_iteration": 2.6730306148529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095244, + "balance_loss_mlp": 1.05933857, + "epoch": 0.391304347826087, + "flos": 556967245824.0, + "grad_norm": 0.05118619150019999, + "language_loss": 0.87770367, + "learning_rate": 0.000695005441035888, + "loss": 0.88865614, + "num_input_tokens_seen": 169333696, + "router_z_loss_mlp": 0.359375, + "step": 2034, + "time_per_iteration": 2.6585283279418945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038632, + "balance_loss_mlp": 1.02461255, + "epoch": 0.3914967295113505, + "flos": 1499309858304.0, + "grad_norm": 0.00946210886057752, + "language_loss": 0.73723435, + "learning_rate": 0.0006947185318490064, + "loss": 0.74762058, + "num_input_tokens_seen": 169556416, + "router_z_loss_mlp": 0.140625, + "step": 2035, + "time_per_iteration": 4.8464648723602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087203, + "balance_loss_mlp": 1.05182171, + "epoch": 0.3916891111966141, + "flos": 706715518464.0, + "grad_norm": 0.07007748344060821, + "language_loss": 0.81416976, + "learning_rate": 0.0006944315470656863, + "loss": 0.82504177, + "num_input_tokens_seen": 169643312, + "router_z_loss_mlp": 0.35424805, + "step": 2036, + "time_per_iteration": 2.9289584159851074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010868, + "balance_loss_mlp": 1.05051255, + "epoch": 0.3918814928818776, + "flos": 556085537280.0, + "grad_norm": 0.05570869743183256, + "language_loss": 0.91007531, + "learning_rate": 0.000694144486797345, + "loss": 0.92094326, + "num_input_tokens_seen": 169712560, + "router_z_loss_mlp": 0.36303711, + "step": 2037, + "time_per_iteration": 0.0013861656188964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043662, + "balance_loss_mlp": 1.02954793, + "epoch": 0.3920738745671412, + "flos": 1537845032448.0, + "grad_norm": 0.018656318140729232, + "language_loss": 0.79520434, + "learning_rate": 0.0006938573511554296, + "loss": 0.80564094, + "num_input_tokens_seen": 169914912, + "router_z_loss_mlp": 0.14160156, + "step": 2038, + "time_per_iteration": 4.6249003410339355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091153, + "balance_loss_mlp": 1.05586696, + "epoch": 0.39226625625240474, + "flos": 498594811392.0, + "grad_norm": 0.06764177247916761, + "language_loss": 0.89479941, + "learning_rate": 0.0006935701402514156, + "loss": 0.90571094, + "num_input_tokens_seen": 169978848, + "router_z_loss_mlp": 0.35327148, + "step": 2039, + "time_per_iteration": 2.535269260406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033365, + "balance_loss_mlp": 1.01963174, + "epoch": 0.39245863793766833, + "flos": 1346465203200.0, + "grad_norm": 0.017448285120256823, + "language_loss": 0.73034894, + "learning_rate": 0.0006932828541968083, + "loss": 0.7406826, + "num_input_tokens_seen": 170211488, + "router_z_loss_mlp": 0.13769531, + "step": 2040, + "time_per_iteration": 4.913180589675903 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086444, + "balance_loss_mlp": 1.05072939, + "epoch": 0.3926510196229319, + "flos": 1345614474240.0, + "grad_norm": 0.055350347752650936, + "language_loss": 0.8453002, + "learning_rate": 0.0006929954931031422, + "loss": 0.85616457, + "num_input_tokens_seen": 170298528, + "router_z_loss_mlp": 0.35742188, + "step": 2041, + "time_per_iteration": 3.6796491146087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081967, + "balance_loss_mlp": 1.04722965, + "epoch": 0.39284340130819545, + "flos": 499333925376.0, + "grad_norm": 0.05434437059814268, + "language_loss": 0.88856936, + "learning_rate": 0.0006927080570819805, + "loss": 0.89938903, + "num_input_tokens_seen": 170365680, + "router_z_loss_mlp": 0.34790039, + "step": 2042, + "time_per_iteration": 2.634052038192749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087578, + "balance_loss_mlp": 1.05217266, + "epoch": 0.39303578299345904, + "flos": 520077625344.0, + "grad_norm": 0.06468620716873735, + "language_loss": 0.80649555, + "learning_rate": 0.0006924205462449161, + "loss": 0.81737131, + "num_input_tokens_seen": 170432224, + "router_z_loss_mlp": 0.35473633, + "step": 2043, + "time_per_iteration": 2.6021780967712402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078315, + "balance_loss_mlp": 1.04288566, + "epoch": 0.39322816467872257, + "flos": 907529301504.0, + "grad_norm": 0.05516365318311268, + "language_loss": 0.82013571, + "learning_rate": 0.0006921329607035702, + "loss": 0.83091891, + "num_input_tokens_seen": 170517920, + "router_z_loss_mlp": 0.35473633, + "step": 2044, + "time_per_iteration": 3.2195992469787598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078665, + "balance_loss_mlp": 1.04473805, + "epoch": 0.39342054636398616, + "flos": 517330603008.0, + "grad_norm": 0.046703626280748714, + "language_loss": 0.88374329, + "learning_rate": 0.0006918453005695938, + "loss": 0.89452994, + "num_input_tokens_seen": 170589072, + "router_z_loss_mlp": 0.33959961, + "step": 2045, + "time_per_iteration": 2.6319282054901123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080714, + "balance_loss_mlp": 1.04435515, + "epoch": 0.3936129280492497, + "flos": 547646621184.0, + "grad_norm": 0.04434497339872072, + "language_loss": 0.8422206, + "learning_rate": 0.0006915575659546662, + "loss": 0.8530277, + "num_input_tokens_seen": 170657856, + "router_z_loss_mlp": 0.36401367, + "step": 2046, + "time_per_iteration": 2.648895263671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081267, + "balance_loss_mlp": 1.04519427, + "epoch": 0.3938053097345133, + "flos": 525858269184.0, + "grad_norm": 0.0524234289418272, + "language_loss": 0.80648899, + "learning_rate": 0.0006912697569704959, + "loss": 0.81730163, + "num_input_tokens_seen": 170723696, + "router_z_loss_mlp": 0.36083984, + "step": 2047, + "time_per_iteration": 2.6330111026763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085798, + "balance_loss_mlp": 1.04934382, + "epoch": 0.39399769141977686, + "flos": 471390990336.0, + "grad_norm": 0.0542278175669412, + "language_loss": 0.86721706, + "learning_rate": 0.0006909818737288205, + "loss": 0.87807506, + "num_input_tokens_seen": 170789536, + "router_z_loss_mlp": 0.36450195, + "step": 2048, + "time_per_iteration": 2.537581205368042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090734, + "balance_loss_mlp": 1.05468488, + "epoch": 0.3941900731050404, + "flos": 501490220544.0, + "grad_norm": 0.056383256559611315, + "language_loss": 0.80660325, + "learning_rate": 0.000690693916341406, + "loss": 0.8175106, + "num_input_tokens_seen": 170859232, + "router_z_loss_mlp": 0.3605957, + "step": 2049, + "time_per_iteration": 2.622183084487915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096224, + "balance_loss_mlp": 1.05898309, + "epoch": 0.394382454790304, + "flos": 580577241600.0, + "grad_norm": 0.11468284139168772, + "language_loss": 0.82465422, + "learning_rate": 0.0006904058849200475, + "loss": 0.83561641, + "num_input_tokens_seen": 170931568, + "router_z_loss_mlp": 0.37255859, + "step": 2050, + "time_per_iteration": 2.7216436862945557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087551, + "balance_loss_mlp": 1.05088186, + "epoch": 0.3945748364755675, + "flos": 513563659776.0, + "grad_norm": 0.05187056217607278, + "language_loss": 0.84988606, + "learning_rate": 0.0006901177795765683, + "loss": 0.86076152, + "num_input_tokens_seen": 170999856, + "router_z_loss_mlp": 0.36694336, + "step": 2051, + "time_per_iteration": 2.5725293159484863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079371, + "balance_loss_mlp": 1.04291642, + "epoch": 0.3947672181608311, + "flos": 593683748352.0, + "grad_norm": 0.0518129521666432, + "language_loss": 0.8131091, + "learning_rate": 0.0006898296004228213, + "loss": 0.82390279, + "num_input_tokens_seen": 171072320, + "router_z_loss_mlp": 0.36450195, + "step": 2052, + "time_per_iteration": 2.6969447135925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050959, + "balance_loss_mlp": 1.0379895, + "epoch": 0.39495959984609463, + "flos": 1546829443584.0, + "grad_norm": 0.029989620736742544, + "language_loss": 0.7812674, + "learning_rate": 0.0006895413475706873, + "loss": 0.79177701, + "num_input_tokens_seen": 171304128, + "router_z_loss_mlp": 0.12988281, + "step": 2053, + "time_per_iteration": 4.8486199378967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078173, + "balance_loss_mlp": 1.04233825, + "epoch": 0.3951519815313582, + "flos": 496271190528.0, + "grad_norm": 0.06693197740080077, + "language_loss": 0.80026031, + "learning_rate": 0.0006892530211320763, + "loss": 0.81104207, + "num_input_tokens_seen": 171377392, + "router_z_loss_mlp": 0.35864258, + "step": 2054, + "time_per_iteration": 2.6731534004211426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082597, + "balance_loss_mlp": 1.04657161, + "epoch": 0.39534436321662175, + "flos": 530934704640.0, + "grad_norm": 0.06400198340094926, + "language_loss": 0.8367995, + "learning_rate": 0.000688964621218926, + "loss": 0.84762549, + "num_input_tokens_seen": 171447424, + "router_z_loss_mlp": 0.36010742, + "step": 2055, + "time_per_iteration": 2.623870611190796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107818, + "balance_loss_mlp": 1.0422982, + "epoch": 0.39553674490188534, + "flos": 702224017920.0, + "grad_norm": 0.05929287076568038, + "language_loss": 0.80154717, + "learning_rate": 0.0006886761479432037, + "loss": 0.81232893, + "num_input_tokens_seen": 171519920, + "router_z_loss_mlp": 0.35864258, + "step": 2056, + "time_per_iteration": 2.810593366622925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088441, + "balance_loss_mlp": 1.05263042, + "epoch": 0.3957291265871489, + "flos": 409552768512.0, + "grad_norm": 0.05784227470554994, + "language_loss": 0.84645867, + "learning_rate": 0.0006883876014169045, + "loss": 0.85734308, + "num_input_tokens_seen": 171583856, + "router_z_loss_mlp": 0.3581543, + "step": 2057, + "time_per_iteration": 2.464358329772949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088007, + "balance_loss_mlp": 1.05121863, + "epoch": 0.39592150827241246, + "flos": 618204566016.0, + "grad_norm": 0.05454135250908964, + "language_loss": 0.90161431, + "learning_rate": 0.000688098981752052, + "loss": 0.91249436, + "num_input_tokens_seen": 171656064, + "router_z_loss_mlp": 0.36791992, + "step": 2058, + "time_per_iteration": 2.742589235305786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094817, + "balance_loss_mlp": 1.05819607, + "epoch": 0.39611388995767605, + "flos": 820986969600.0, + "grad_norm": 0.05656267147709111, + "language_loss": 0.80105305, + "learning_rate": 0.0006878102890606982, + "loss": 0.81200117, + "num_input_tokens_seen": 171738800, + "router_z_loss_mlp": 0.36621094, + "step": 2059, + "time_per_iteration": 3.0725343227386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096787, + "balance_loss_mlp": 1.06018949, + "epoch": 0.3963062716429396, + "flos": 491977539072.0, + "grad_norm": 0.0710153527902746, + "language_loss": 0.81321216, + "learning_rate": 0.0006875215234549239, + "loss": 0.82418001, + "num_input_tokens_seen": 171803664, + "router_z_loss_mlp": 0.3659668, + "step": 2060, + "time_per_iteration": 2.542090654373169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097398, + "balance_loss_mlp": 1.06015694, + "epoch": 0.39649865332820317, + "flos": 584466430464.0, + "grad_norm": 0.08966956269211096, + "language_loss": 0.8554219, + "learning_rate": 0.0006872326850468376, + "loss": 0.86639589, + "num_input_tokens_seen": 171871968, + "router_z_loss_mlp": 0.37231445, + "step": 2061, + "time_per_iteration": 2.7194440364837646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010999, + "balance_loss_mlp": 1.06139588, + "epoch": 0.3966910350134667, + "flos": 458328153600.0, + "grad_norm": 0.05276871533818733, + "language_loss": 0.78609985, + "learning_rate": 0.0006869437739485762, + "loss": 0.79709888, + "num_input_tokens_seen": 171942368, + "router_z_loss_mlp": 0.38476562, + "step": 2062, + "time_per_iteration": 2.6032114028930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089079, + "balance_loss_mlp": 1.05281568, + "epoch": 0.3968834166987303, + "flos": 508388299776.0, + "grad_norm": 0.05735909750828215, + "language_loss": 0.93035084, + "learning_rate": 0.0006866547902723053, + "loss": 0.94124162, + "num_input_tokens_seen": 172012336, + "router_z_loss_mlp": 0.36279297, + "step": 2063, + "time_per_iteration": 2.666294813156128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097295, + "balance_loss_mlp": 1.06110287, + "epoch": 0.3970757983839938, + "flos": 572349321216.0, + "grad_norm": 0.05819495660266105, + "language_loss": 0.79961425, + "learning_rate": 0.000686365734130218, + "loss": 0.81058717, + "num_input_tokens_seen": 172084640, + "router_z_loss_mlp": 0.36206055, + "step": 2064, + "time_per_iteration": 2.667443037033081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093878, + "balance_loss_mlp": 1.05639839, + "epoch": 0.3972681800692574, + "flos": 481391092224.0, + "grad_norm": 0.051061390118103664, + "language_loss": 0.84029245, + "learning_rate": 0.000686076605634536, + "loss": 0.85123128, + "num_input_tokens_seen": 172152992, + "router_z_loss_mlp": 0.37475586, + "step": 2065, + "time_per_iteration": 2.605252981185913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091284, + "balance_loss_mlp": 1.05406737, + "epoch": 0.397460561754521, + "flos": 487683887616.0, + "grad_norm": 0.060621892107923396, + "language_loss": 0.84327424, + "learning_rate": 0.0006857874048975088, + "loss": 0.85418713, + "num_input_tokens_seen": 172219312, + "router_z_loss_mlp": 0.37207031, + "step": 2066, + "time_per_iteration": 2.5779786109924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090658, + "balance_loss_mlp": 1.05329823, + "epoch": 0.3976529434397845, + "flos": 421768802304.0, + "grad_norm": 0.05929679602335689, + "language_loss": 0.86975348, + "learning_rate": 0.0006854981320314142, + "loss": 0.88066006, + "num_input_tokens_seen": 172282112, + "router_z_loss_mlp": 0.37353516, + "step": 2067, + "time_per_iteration": 2.4723403453826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082754, + "balance_loss_mlp": 1.04591799, + "epoch": 0.3978453251250481, + "flos": 545331764736.0, + "grad_norm": 0.058605050617464606, + "language_loss": 0.86758339, + "learning_rate": 0.0006852087871485579, + "loss": 0.87841094, + "num_input_tokens_seen": 172347872, + "router_z_loss_mlp": 0.36816406, + "step": 2068, + "time_per_iteration": 2.6007602214813232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082239, + "balance_loss_mlp": 1.04602289, + "epoch": 0.39803770681031164, + "flos": 650548841472.0, + "grad_norm": 0.06821675645960798, + "language_loss": 0.81689966, + "learning_rate": 0.0006849193703612735, + "loss": 0.82772201, + "num_input_tokens_seen": 172418560, + "router_z_loss_mlp": 0.36206055, + "step": 2069, + "time_per_iteration": 2.7661337852478027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083569, + "balance_loss_mlp": 1.04661417, + "epoch": 0.39823008849557523, + "flos": 739734888960.0, + "grad_norm": 0.059947122372600754, + "language_loss": 0.77649361, + "learning_rate": 0.0006846298817819225, + "loss": 0.78732932, + "num_input_tokens_seen": 172497984, + "router_z_loss_mlp": 0.36987305, + "step": 2070, + "time_per_iteration": 2.9364843368530273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091151, + "balance_loss_mlp": 1.05436325, + "epoch": 0.39842247018083876, + "flos": 384825337344.0, + "grad_norm": 0.0736862776590319, + "language_loss": 0.80732799, + "learning_rate": 0.0006843403215228945, + "loss": 0.81823957, + "num_input_tokens_seen": 172560112, + "router_z_loss_mlp": 0.36767578, + "step": 2071, + "time_per_iteration": 2.4597537517547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078334, + "balance_loss_mlp": 1.04133189, + "epoch": 0.39861485186610235, + "flos": 533431443456.0, + "grad_norm": 0.052578385162892496, + "language_loss": 0.80366135, + "learning_rate": 0.0006840506896966065, + "loss": 0.81444472, + "num_input_tokens_seen": 172636192, + "router_z_loss_mlp": 0.36962891, + "step": 2072, + "time_per_iteration": 2.6826841831207275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081243, + "balance_loss_mlp": 1.04397774, + "epoch": 0.39880723355136594, + "flos": 642834482688.0, + "grad_norm": 0.055481383737447196, + "language_loss": 0.82090193, + "learning_rate": 0.0006837609864155038, + "loss": 0.83171439, + "num_input_tokens_seen": 172715264, + "router_z_loss_mlp": 0.37255859, + "step": 2073, + "time_per_iteration": 2.8541154861450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075844, + "balance_loss_mlp": 1.0408442, + "epoch": 0.39899961523662947, + "flos": 515587534848.0, + "grad_norm": 0.07686004257588779, + "language_loss": 0.83464944, + "learning_rate": 0.0006834712117920592, + "loss": 0.84540784, + "num_input_tokens_seen": 172783456, + "router_z_loss_mlp": 0.3503418, + "step": 2074, + "time_per_iteration": 2.6023800373077393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107731, + "balance_loss_mlp": 1.04025948, + "epoch": 0.39919199692189306, + "flos": 464148085248.0, + "grad_norm": 0.0625246810856132, + "language_loss": 0.85794407, + "learning_rate": 0.0006831813659387729, + "loss": 0.86871719, + "num_input_tokens_seen": 172848928, + "router_z_loss_mlp": 0.37036133, + "step": 2075, + "time_per_iteration": 2.5916242599487305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071695, + "balance_loss_mlp": 1.0353837, + "epoch": 0.3993843786071566, + "flos": 531382837248.0, + "grad_norm": 0.05588277312371317, + "language_loss": 0.84130096, + "learning_rate": 0.0006828914489681733, + "loss": 0.852018, + "num_input_tokens_seen": 172921152, + "router_z_loss_mlp": 0.36303711, + "step": 2076, + "time_per_iteration": 2.7014079093933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078543, + "balance_loss_mlp": 1.04168355, + "epoch": 0.3995767602924202, + "flos": 503701770240.0, + "grad_norm": 0.05616101270921505, + "language_loss": 0.85284638, + "learning_rate": 0.0006826014609928162, + "loss": 0.86363184, + "num_input_tokens_seen": 172998864, + "router_z_loss_mlp": 0.36816406, + "step": 2077, + "time_per_iteration": 2.6714975833892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089101, + "balance_loss_mlp": 1.07622623, + "epoch": 0.3997691419776837, + "flos": 1453780500480.0, + "grad_norm": 0.03492718818999835, + "language_loss": 0.83199388, + "learning_rate": 0.0006823114021252846, + "loss": 0.8428849, + "num_input_tokens_seen": 173219216, + "router_z_loss_mlp": 0.12890625, + "step": 2078, + "time_per_iteration": 4.8198041915893555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075675, + "balance_loss_mlp": 1.03943551, + "epoch": 0.3999615236629473, + "flos": 530418170880.0, + "grad_norm": 0.06060184252075253, + "language_loss": 0.79984158, + "learning_rate": 0.0006820212724781896, + "loss": 0.81059831, + "num_input_tokens_seen": 173292000, + "router_z_loss_mlp": 0.36279297, + "step": 2079, + "time_per_iteration": 2.725576400756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077383, + "balance_loss_mlp": 1.04209709, + "epoch": 0.4001539053482108, + "flos": 694823961600.0, + "grad_norm": 0.12722864956638674, + "language_loss": 0.83843565, + "learning_rate": 0.0006817310721641694, + "loss": 0.84920955, + "num_input_tokens_seen": 173365568, + "router_z_loss_mlp": 0.35302734, + "step": 2080, + "time_per_iteration": 2.808981418609619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083492, + "balance_loss_mlp": 1.04732358, + "epoch": 0.4003462870334744, + "flos": 520102356480.0, + "grad_norm": 0.09864886938158902, + "language_loss": 0.84025592, + "learning_rate": 0.00068144080129589, + "loss": 0.85109079, + "num_input_tokens_seen": 173430144, + "router_z_loss_mlp": 0.36181641, + "step": 2081, + "time_per_iteration": 2.61795973777771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090466, + "balance_loss_mlp": 1.05403543, + "epoch": 0.400538668718738, + "flos": 492272902656.0, + "grad_norm": 0.05814134634807872, + "language_loss": 0.83103502, + "learning_rate": 0.0006811504599860441, + "loss": 0.84193969, + "num_input_tokens_seen": 173494464, + "router_z_loss_mlp": 0.36450195, + "step": 2082, + "time_per_iteration": 2.5586163997650146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109661, + "balance_loss_mlp": 1.06161022, + "epoch": 0.40073105040400153, + "flos": 490083111936.0, + "grad_norm": 0.05292967428813452, + "language_loss": 0.85547149, + "learning_rate": 0.0006808600483473526, + "loss": 0.86643761, + "num_input_tokens_seen": 173577168, + "router_z_loss_mlp": 0.35058594, + "step": 2083, + "time_per_iteration": 2.8549985885620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110158, + "balance_loss_mlp": 1.06584144, + "epoch": 0.4009234320892651, + "flos": 562088761344.0, + "grad_norm": 0.051341860757237005, + "language_loss": 0.85926497, + "learning_rate": 0.0006805695664925629, + "loss": 0.87028074, + "num_input_tokens_seen": 173655632, + "router_z_loss_mlp": 0.35791016, + "step": 2084, + "time_per_iteration": 2.7807514667510986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111507, + "balance_loss_mlp": 1.07619727, + "epoch": 0.40111581377452865, + "flos": 425786029056.0, + "grad_norm": 0.07139972521672847, + "language_loss": 0.84098327, + "learning_rate": 0.0006802790145344506, + "loss": 0.85209835, + "num_input_tokens_seen": 173719040, + "router_z_loss_mlp": 0.35327148, + "step": 2085, + "time_per_iteration": 2.4653491973876953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106176, + "balance_loss_mlp": 1.07024658, + "epoch": 0.40130819545979224, + "flos": 612148907520.0, + "grad_norm": 0.09859033966702202, + "language_loss": 0.87080699, + "learning_rate": 0.0006799883925858176, + "loss": 0.88186872, + "num_input_tokens_seen": 173796704, + "router_z_loss_mlp": 0.35961914, + "step": 2086, + "time_per_iteration": 2.8432652950286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101686, + "balance_loss_mlp": 1.06580365, + "epoch": 0.40150057714505577, + "flos": 523179648000.0, + "grad_norm": 0.06735788816740666, + "language_loss": 0.85303611, + "learning_rate": 0.0006796977007594933, + "loss": 0.86405295, + "num_input_tokens_seen": 173862352, + "router_z_loss_mlp": 0.35913086, + "step": 2087, + "time_per_iteration": 2.597883701324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109903, + "balance_loss_mlp": 1.06240904, + "epoch": 0.40169295883031936, + "flos": 561143033856.0, + "grad_norm": 0.0524220318715257, + "language_loss": 0.86402881, + "learning_rate": 0.0006794069391683345, + "loss": 0.87501919, + "num_input_tokens_seen": 173935408, + "router_z_loss_mlp": 0.36621094, + "step": 2088, + "time_per_iteration": 2.7313365936279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101072, + "balance_loss_mlp": 1.06414104, + "epoch": 0.4018853405155829, + "flos": 518743401984.0, + "grad_norm": 0.056795041649419745, + "language_loss": 0.80919069, + "learning_rate": 0.0006791161079252248, + "loss": 0.8202014, + "num_input_tokens_seen": 174007152, + "router_z_loss_mlp": 0.36914062, + "step": 2089, + "time_per_iteration": 2.57450532913208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109207, + "balance_loss_mlp": 1.05652201, + "epoch": 0.4020777222008465, + "flos": 525957193728.0, + "grad_norm": 0.05166370887572794, + "language_loss": 0.82473212, + "learning_rate": 0.0006788252071430747, + "loss": 0.83565277, + "num_input_tokens_seen": 174074976, + "router_z_loss_mlp": 0.35546875, + "step": 2090, + "time_per_iteration": 2.6603012084960938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097817, + "balance_loss_mlp": 1.06100535, + "epoch": 0.40227010388611006, + "flos": 525494504448.0, + "grad_norm": 0.056931817338158205, + "language_loss": 0.86595076, + "learning_rate": 0.0006785342369348222, + "loss": 0.87692893, + "num_input_tokens_seen": 174149392, + "router_z_loss_mlp": 0.3684082, + "step": 2091, + "time_per_iteration": 2.807980537414551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091263, + "balance_loss_mlp": 1.05566692, + "epoch": 0.4024624855713736, + "flos": 432074442240.0, + "grad_norm": 0.0736357586886409, + "language_loss": 0.79799104, + "learning_rate": 0.0006782431974134316, + "loss": 0.80890369, + "num_input_tokens_seen": 174214656, + "router_z_loss_mlp": 0.35668945, + "step": 2092, + "time_per_iteration": 2.5331132411956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097919, + "balance_loss_mlp": 1.06044006, + "epoch": 0.4026548672566372, + "flos": 766304312832.0, + "grad_norm": 0.05288336614740697, + "language_loss": 0.89230573, + "learning_rate": 0.0006779520886918949, + "loss": 0.90328491, + "num_input_tokens_seen": 174296064, + "router_z_loss_mlp": 0.375, + "step": 2093, + "time_per_iteration": 3.014895439147949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093032, + "balance_loss_mlp": 1.0560298, + "epoch": 0.4028472489419007, + "flos": 642636633600.0, + "grad_norm": 0.05102527643704043, + "language_loss": 0.8125242, + "learning_rate": 0.0006776609108832301, + "loss": 0.8234545, + "num_input_tokens_seen": 174370896, + "router_z_loss_mlp": 0.36987305, + "step": 2094, + "time_per_iteration": 2.7778923511505127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089446, + "balance_loss_mlp": 1.05311072, + "epoch": 0.4030396306271643, + "flos": 491593425408.0, + "grad_norm": 0.053262929353227066, + "language_loss": 0.84942901, + "learning_rate": 0.0006773696641004828, + "loss": 0.86032349, + "num_input_tokens_seen": 174438448, + "router_z_loss_mlp": 0.36352539, + "step": 2095, + "time_per_iteration": 2.580313205718994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089479, + "balance_loss_mlp": 1.05238152, + "epoch": 0.40323201231242783, + "flos": 901363133952.0, + "grad_norm": 0.05931554649921985, + "language_loss": 0.77618563, + "learning_rate": 0.0006770783484567247, + "loss": 0.78708041, + "num_input_tokens_seen": 174525952, + "router_z_loss_mlp": 0.37109375, + "step": 2096, + "time_per_iteration": 3.0955684185028076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089916, + "balance_loss_mlp": 1.0536046, + "epoch": 0.4034243939976914, + "flos": 570267219456.0, + "grad_norm": 0.07944545156942663, + "language_loss": 0.8587091, + "learning_rate": 0.000676786964065055, + "loss": 0.86960828, + "num_input_tokens_seen": 174607200, + "router_z_loss_mlp": 0.36303711, + "step": 2097, + "time_per_iteration": 2.742293119430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084372, + "balance_loss_mlp": 1.04829895, + "epoch": 0.403616775682955, + "flos": 507206845440.0, + "grad_norm": 0.04869402927646331, + "language_loss": 0.78305566, + "learning_rate": 0.0006764955110385986, + "loss": 0.79389936, + "num_input_tokens_seen": 174680976, + "router_z_loss_mlp": 0.3605957, + "step": 2098, + "time_per_iteration": 2.708390235900879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086865, + "balance_loss_mlp": 1.05055428, + "epoch": 0.40380915736821854, + "flos": 519127515648.0, + "grad_norm": 0.06727344126892942, + "language_loss": 0.80247992, + "learning_rate": 0.0006762039894905083, + "loss": 0.81334853, + "num_input_tokens_seen": 174753152, + "router_z_loss_mlp": 0.36328125, + "step": 2099, + "time_per_iteration": 2.6428377628326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095654, + "balance_loss_mlp": 1.05812716, + "epoch": 0.40400153905348213, + "flos": 441686048256.0, + "grad_norm": 0.06575852305434472, + "language_loss": 0.80233693, + "learning_rate": 0.000675912399533962, + "loss": 0.81329346, + "num_input_tokens_seen": 174817184, + "router_z_loss_mlp": 0.375, + "step": 2100, + "time_per_iteration": 2.5560812950134277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088947, + "balance_loss_mlp": 1.05249298, + "epoch": 0.40419392073874566, + "flos": 771961300992.0, + "grad_norm": 0.1036114098840327, + "language_loss": 0.85183066, + "learning_rate": 0.0006756207412821656, + "loss": 0.86272013, + "num_input_tokens_seen": 174898128, + "router_z_loss_mlp": 0.36450195, + "step": 2101, + "time_per_iteration": 2.986583709716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086168, + "balance_loss_mlp": 1.05021429, + "epoch": 0.40438630242400925, + "flos": 766215562752.0, + "grad_norm": 0.06055449439143942, + "language_loss": 0.80025709, + "learning_rate": 0.0006753290148483505, + "loss": 0.81111872, + "num_input_tokens_seen": 174981872, + "router_z_loss_mlp": 0.36010742, + "step": 2102, + "time_per_iteration": 3.0076749324798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080415, + "balance_loss_mlp": 1.04491425, + "epoch": 0.4045786841092728, + "flos": 415013317632.0, + "grad_norm": 0.052033945118291625, + "language_loss": 0.7866869, + "learning_rate": 0.0006750372203457752, + "loss": 0.79749095, + "num_input_tokens_seen": 175044976, + "router_z_loss_mlp": 0.35546875, + "step": 2103, + "time_per_iteration": 2.490941286087036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083553, + "balance_loss_mlp": 1.04767144, + "epoch": 0.40477106579453637, + "flos": 538941454848.0, + "grad_norm": 0.07087529891902919, + "language_loss": 0.86455047, + "learning_rate": 0.0006747453578877242, + "loss": 0.875386, + "num_input_tokens_seen": 175121104, + "router_z_loss_mlp": 0.35864258, + "step": 2104, + "time_per_iteration": 2.6906399726867676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083688, + "balance_loss_mlp": 1.04766345, + "epoch": 0.4049634474797999, + "flos": 826358768640.0, + "grad_norm": 0.07644078595746046, + "language_loss": 0.82677126, + "learning_rate": 0.0006744534275875085, + "loss": 0.83760816, + "num_input_tokens_seen": 175194512, + "router_z_loss_mlp": 0.3605957, + "step": 2105, + "time_per_iteration": 2.9925642013549805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081927, + "balance_loss_mlp": 1.0459255, + "epoch": 0.4051558291650635, + "flos": 572417722368.0, + "grad_norm": 0.07127110995979934, + "language_loss": 0.8562066, + "learning_rate": 0.0006741614295584657, + "loss": 0.86702585, + "num_input_tokens_seen": 175264176, + "router_z_loss_mlp": 0.36010742, + "step": 2106, + "time_per_iteration": 2.6289658546447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078264, + "balance_loss_mlp": 1.04321659, + "epoch": 0.4053482108503271, + "flos": 731541874176.0, + "grad_norm": 0.07814638610947379, + "language_loss": 0.78334522, + "learning_rate": 0.0006738693639139595, + "loss": 0.79412782, + "num_input_tokens_seen": 175347488, + "router_z_loss_mlp": 0.35083008, + "step": 2107, + "time_per_iteration": 3.0381481647491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078707, + "balance_loss_mlp": 1.04234815, + "epoch": 0.4055405925355906, + "flos": 1212588292608.0, + "grad_norm": 0.05182127384415646, + "language_loss": 0.77652568, + "learning_rate": 0.0006735772307673796, + "loss": 0.78731275, + "num_input_tokens_seen": 175438336, + "router_z_loss_mlp": 0.36376953, + "step": 2108, + "time_per_iteration": 3.5424931049346924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075462, + "balance_loss_mlp": 1.03998494, + "epoch": 0.4057329742208542, + "flos": 715553104896.0, + "grad_norm": 0.0496802449600099, + "language_loss": 0.83129466, + "learning_rate": 0.0006732850302321421, + "loss": 0.84204924, + "num_input_tokens_seen": 175510912, + "router_z_loss_mlp": 0.35498047, + "step": 2109, + "time_per_iteration": 2.902758836746216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081445, + "balance_loss_mlp": 1.04506207, + "epoch": 0.4059253559061177, + "flos": 564623377920.0, + "grad_norm": 0.054690107844022846, + "language_loss": 0.84019876, + "learning_rate": 0.00067299276242169, + "loss": 0.85101312, + "num_input_tokens_seen": 175583040, + "router_z_loss_mlp": 0.36376953, + "step": 2110, + "time_per_iteration": 2.6453192234039307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108684, + "balance_loss_mlp": 1.07272601, + "epoch": 0.4061177375913813, + "flos": 1592886919680.0, + "grad_norm": 0.03852995701507201, + "language_loss": 0.74382168, + "learning_rate": 0.0006727004274494908, + "loss": 0.75469011, + "num_input_tokens_seen": 175817952, + "router_z_loss_mlp": 0.14160156, + "step": 2111, + "time_per_iteration": 4.936276197433472 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092328, + "balance_loss_mlp": 1.05587411, + "epoch": 0.40631011927664484, + "flos": 615122892288.0, + "grad_norm": 0.05227822307204106, + "language_loss": 0.77911901, + "learning_rate": 0.0006724080254290395, + "loss": 0.79004228, + "num_input_tokens_seen": 175896352, + "router_z_loss_mlp": 0.36425781, + "step": 2112, + "time_per_iteration": 2.804931402206421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085219, + "balance_loss_mlp": 1.04893136, + "epoch": 0.40650250096190843, + "flos": 557390647296.0, + "grad_norm": 0.056265148252134925, + "language_loss": 0.89716649, + "learning_rate": 0.0006721155564738566, + "loss": 0.90801871, + "num_input_tokens_seen": 175967152, + "router_z_loss_mlp": 0.36303711, + "step": 2113, + "time_per_iteration": 2.756901502609253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050781, + "balance_loss_mlp": 1.03676188, + "epoch": 0.40669488264717196, + "flos": 1579301756928.0, + "grad_norm": 0.015026311101099392, + "language_loss": 0.78622639, + "learning_rate": 0.0006718230206974884, + "loss": 0.79673421, + "num_input_tokens_seen": 176205248, + "router_z_loss_mlp": 0.140625, + "step": 2114, + "time_per_iteration": 4.975963354110718 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109673, + "balance_loss_mlp": 1.0599184, + "epoch": 0.40688726433243555, + "flos": 507398902272.0, + "grad_norm": 0.07464761746525102, + "language_loss": 0.85648221, + "learning_rate": 0.0006715304182135078, + "loss": 0.86744952, + "num_input_tokens_seen": 176276208, + "router_z_loss_mlp": 0.36816406, + "step": 2115, + "time_per_iteration": 2.5924360752105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104151, + "balance_loss_mlp": 1.06726742, + "epoch": 0.40707964601769914, + "flos": 588757109760.0, + "grad_norm": 0.06427267203463374, + "language_loss": 0.88647795, + "learning_rate": 0.0006712377491355127, + "loss": 0.89751947, + "num_input_tokens_seen": 176355072, + "router_z_loss_mlp": 0.36889648, + "step": 2116, + "time_per_iteration": 2.887439489364624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097518, + "balance_loss_mlp": 1.06135035, + "epoch": 0.40727202770296267, + "flos": 580134901248.0, + "grad_norm": 0.10612280790481599, + "language_loss": 0.81211627, + "learning_rate": 0.0006709450135771274, + "loss": 0.82309151, + "num_input_tokens_seen": 176444592, + "router_z_loss_mlp": 0.36206055, + "step": 2117, + "time_per_iteration": 2.9730725288391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101548, + "balance_loss_mlp": 1.06523705, + "epoch": 0.40746440938822626, + "flos": 503819633664.0, + "grad_norm": 0.05032701187252936, + "language_loss": 0.86683893, + "learning_rate": 0.0006706522116520023, + "loss": 0.87785447, + "num_input_tokens_seen": 176516144, + "router_z_loss_mlp": 0.36328125, + "step": 2118, + "time_per_iteration": 2.6400580406188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096769, + "balance_loss_mlp": 1.06122053, + "epoch": 0.4076567910734898, + "flos": 605323611648.0, + "grad_norm": 0.05658204986861598, + "language_loss": 0.82839441, + "learning_rate": 0.0006703593434738127, + "loss": 0.83936214, + "num_input_tokens_seen": 176585712, + "router_z_loss_mlp": 0.35571289, + "step": 2119, + "time_per_iteration": 2.77944016456604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091629, + "balance_loss_mlp": 1.05622339, + "epoch": 0.4078491727587534, + "flos": 479313372672.0, + "grad_norm": 0.0532477275953574, + "language_loss": 0.78150344, + "learning_rate": 0.0006700664091562604, + "loss": 0.79241967, + "num_input_tokens_seen": 176654736, + "router_z_loss_mlp": 0.35449219, + "step": 2120, + "time_per_iteration": 2.580658435821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093922, + "balance_loss_mlp": 1.05780149, + "epoch": 0.4080415544440169, + "flos": 510126985728.0, + "grad_norm": 0.045251762284626275, + "language_loss": 0.85188484, + "learning_rate": 0.0006697734088130725, + "loss": 0.86282408, + "num_input_tokens_seen": 176722800, + "router_z_loss_mlp": 0.36157227, + "step": 2121, + "time_per_iteration": 2.5990941524505615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108927, + "balance_loss_mlp": 1.05329287, + "epoch": 0.4082339361292805, + "flos": 734318009856.0, + "grad_norm": 0.06207508790269206, + "language_loss": 0.85326135, + "learning_rate": 0.0006694803425580018, + "loss": 0.86415404, + "num_input_tokens_seen": 176800320, + "router_z_loss_mlp": 0.36010742, + "step": 2122, + "time_per_iteration": 2.9514336585998535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093256, + "balance_loss_mlp": 1.05687356, + "epoch": 0.4084263178145441, + "flos": 457239831552.0, + "grad_norm": 0.08260422277145335, + "language_loss": 0.84467387, + "learning_rate": 0.0006691872105048268, + "loss": 0.85560644, + "num_input_tokens_seen": 176867440, + "router_z_loss_mlp": 0.36401367, + "step": 2123, + "time_per_iteration": 2.584765672683716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093552, + "balance_loss_mlp": 1.05762231, + "epoch": 0.4086186994998076, + "flos": 562659139584.0, + "grad_norm": 0.056985949085160005, + "language_loss": 0.84641832, + "learning_rate": 0.0006688940127673513, + "loss": 0.85735387, + "num_input_tokens_seen": 176942048, + "router_z_loss_mlp": 0.35961914, + "step": 2124, + "time_per_iteration": 2.698777675628662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100727, + "balance_loss_mlp": 1.06446397, + "epoch": 0.4088110811850712, + "flos": 573364859904.0, + "grad_norm": 0.04747345440626025, + "language_loss": 0.85754699, + "learning_rate": 0.0006686007494594049, + "loss": 0.86855423, + "num_input_tokens_seen": 177025104, + "router_z_loss_mlp": 0.36279297, + "step": 2125, + "time_per_iteration": 2.8035151958465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101938, + "balance_loss_mlp": 1.06538868, + "epoch": 0.40900346287033473, + "flos": 456702948864.0, + "grad_norm": 0.06322616011827766, + "language_loss": 0.80074888, + "learning_rate": 0.0006683074206948425, + "loss": 0.81176829, + "num_input_tokens_seen": 177089296, + "router_z_loss_mlp": 0.36547852, + "step": 2126, + "time_per_iteration": 2.4856953620910645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103432, + "balance_loss_mlp": 1.06697774, + "epoch": 0.4091958445555983, + "flos": 617097305088.0, + "grad_norm": 0.05684118517242104, + "language_loss": 0.8146261, + "learning_rate": 0.0006680140265875443, + "loss": 0.82566047, + "num_input_tokens_seen": 177163648, + "router_z_loss_mlp": 0.36474609, + "step": 2127, + "time_per_iteration": 2.772571325302124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111548, + "balance_loss_mlp": 1.07564259, + "epoch": 0.40938822624086185, + "flos": 472159217664.0, + "grad_norm": 0.051537767424008556, + "language_loss": 0.95483583, + "learning_rate": 0.0006677205672514162, + "loss": 0.96595132, + "num_input_tokens_seen": 177233856, + "router_z_loss_mlp": 0.35888672, + "step": 2128, + "time_per_iteration": 2.6006312370300293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114142, + "balance_loss_mlp": 1.07642448, + "epoch": 0.40958060792612544, + "flos": 569734718976.0, + "grad_norm": 0.04853999942998699, + "language_loss": 0.88646978, + "learning_rate": 0.000667427042800389, + "loss": 0.8976112, + "num_input_tokens_seen": 177309824, + "router_z_loss_mlp": 0.37670898, + "step": 2129, + "time_per_iteration": 2.742804765701294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107096, + "balance_loss_mlp": 1.07030797, + "epoch": 0.40977298961138897, + "flos": 609065823744.0, + "grad_norm": 0.053374560930054, + "language_loss": 0.8288517, + "learning_rate": 0.0006671334533484192, + "loss": 0.83992267, + "num_input_tokens_seen": 177380592, + "router_z_loss_mlp": 0.36767578, + "step": 2130, + "time_per_iteration": 2.7175474166870117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105432, + "balance_loss_mlp": 1.06854916, + "epoch": 0.40996537129665256, + "flos": 581463332352.0, + "grad_norm": 0.10187828374301312, + "language_loss": 0.83427989, + "learning_rate": 0.0006668397990094881, + "loss": 0.84533429, + "num_input_tokens_seen": 177454720, + "router_z_loss_mlp": 0.36889648, + "step": 2131, + "time_per_iteration": 2.718189239501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102513, + "balance_loss_mlp": 1.06438994, + "epoch": 0.41015775298191615, + "flos": 516296125440.0, + "grad_norm": 0.05088305967580112, + "language_loss": 0.84777439, + "learning_rate": 0.0006665460798976027, + "loss": 0.85879958, + "num_input_tokens_seen": 177528224, + "router_z_loss_mlp": 0.38134766, + "step": 2132, + "time_per_iteration": 2.754838228225708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103514, + "balance_loss_mlp": 1.06448531, + "epoch": 0.4103501346671797, + "flos": 510083315712.0, + "grad_norm": 0.04980971333778078, + "language_loss": 0.81075269, + "learning_rate": 0.0006662522961267947, + "loss": 0.82178783, + "num_input_tokens_seen": 177598176, + "router_z_loss_mlp": 0.38989258, + "step": 2133, + "time_per_iteration": 2.630645513534546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105465, + "balance_loss_mlp": 1.06514883, + "epoch": 0.41054251635244327, + "flos": 549459500544.0, + "grad_norm": 0.047627275091831754, + "language_loss": 0.87016159, + "learning_rate": 0.0006659584478111211, + "loss": 0.88121629, + "num_input_tokens_seen": 177675840, + "router_z_loss_mlp": 0.40307617, + "step": 2134, + "time_per_iteration": 2.7775702476501465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114637, + "balance_loss_mlp": 1.07408166, + "epoch": 0.4107348980377068, + "flos": 839549643264.0, + "grad_norm": 0.06581962625194586, + "language_loss": 0.82464856, + "learning_rate": 0.000665664535064664, + "loss": 0.83579493, + "num_input_tokens_seen": 177751376, + "router_z_loss_mlp": 0.40551758, + "step": 2135, + "time_per_iteration": 3.0234854221343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011149, + "balance_loss_mlp": 1.07501245, + "epoch": 0.4109272797229704, + "flos": 503445694464.0, + "grad_norm": 0.05498766410062668, + "language_loss": 0.82554698, + "learning_rate": 0.0006653705580015303, + "loss": 0.83669591, + "num_input_tokens_seen": 177825264, + "router_z_loss_mlp": 0.39892578, + "step": 2136, + "time_per_iteration": 2.740478992462158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110871, + "balance_loss_mlp": 1.06786942, + "epoch": 0.4111196614082339, + "flos": 610533877248.0, + "grad_norm": 0.1069583069182241, + "language_loss": 0.86098707, + "learning_rate": 0.0006650765167358523, + "loss": 0.87207425, + "num_input_tokens_seen": 177901680, + "router_z_loss_mlp": 0.40844727, + "step": 2137, + "time_per_iteration": 2.7766735553741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112768, + "balance_loss_mlp": 1.07264185, + "epoch": 0.4113120430934975, + "flos": 452931623424.0, + "grad_norm": 0.06240188984530218, + "language_loss": 0.8998509, + "learning_rate": 0.0006647824113817864, + "loss": 0.91097856, + "num_input_tokens_seen": 177965264, + "router_z_loss_mlp": 0.40112305, + "step": 2138, + "time_per_iteration": 2.558088779449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109501, + "balance_loss_mlp": 1.06992376, + "epoch": 0.41150442477876104, + "flos": 541324712448.0, + "grad_norm": 0.06351755199965968, + "language_loss": 0.81488299, + "learning_rate": 0.000664488242053515, + "loss": 0.82597804, + "num_input_tokens_seen": 178039712, + "router_z_loss_mlp": 0.39550781, + "step": 2139, + "time_per_iteration": 2.7064287662506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102585, + "balance_loss_mlp": 1.06405628, + "epoch": 0.4116968064640246, + "flos": 576017339904.0, + "grad_norm": 0.052717271070364294, + "language_loss": 0.8372525, + "learning_rate": 0.0006641940088652445, + "loss": 0.8482784, + "num_input_tokens_seen": 178114080, + "router_z_loss_mlp": 0.38500977, + "step": 2140, + "time_per_iteration": 2.8360941410064697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107136, + "balance_loss_mlp": 1.0685842, + "epoch": 0.4118891881492882, + "flos": 495857963520.0, + "grad_norm": 0.05632128251923113, + "language_loss": 0.82241237, + "learning_rate": 0.0006638997119312065, + "loss": 0.83348376, + "num_input_tokens_seen": 178188032, + "router_z_loss_mlp": 0.38500977, + "step": 2141, + "time_per_iteration": 2.695482015609741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01432807, + "balance_loss_mlp": 1.41773903, + "epoch": 0.41208156983455174, + "flos": 1537604923392.0, + "grad_norm": 0.12335560313674339, + "language_loss": 0.75063306, + "learning_rate": 0.0006636053513656568, + "loss": 0.76496112, + "num_input_tokens_seen": 178395328, + "router_z_loss_mlp": 0.15039062, + "step": 2142, + "time_per_iteration": 4.938086032867432 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096248, + "balance_loss_mlp": 1.05800605, + "epoch": 0.41227395151981533, + "flos": 584697775104.0, + "grad_norm": 0.06073263389064812, + "language_loss": 0.84852999, + "learning_rate": 0.000663310927282877, + "loss": 0.85949242, + "num_input_tokens_seen": 178471952, + "router_z_loss_mlp": 0.38208008, + "step": 2143, + "time_per_iteration": 2.776041269302368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098979, + "balance_loss_mlp": 1.06183362, + "epoch": 0.41246633320507886, + "flos": 442685620224.0, + "grad_norm": 0.05843533128868507, + "language_loss": 0.85999441, + "learning_rate": 0.000663016439797172, + "loss": 0.8709842, + "num_input_tokens_seen": 178542192, + "router_z_loss_mlp": 0.37109375, + "step": 2144, + "time_per_iteration": 2.6550843715667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099188, + "balance_loss_mlp": 1.06280541, + "epoch": 0.41265871489034245, + "flos": 579680976384.0, + "grad_norm": 0.05476235673703619, + "language_loss": 0.80718118, + "learning_rate": 0.0006627218890228724, + "loss": 0.81817305, + "num_input_tokens_seen": 178622736, + "router_z_loss_mlp": 0.36376953, + "step": 2145, + "time_per_iteration": 2.748966693878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098226, + "balance_loss_mlp": 1.06139088, + "epoch": 0.412851096575606, + "flos": 760906372608.0, + "grad_norm": 0.06511227414480983, + "language_loss": 0.83519912, + "learning_rate": 0.0006624272750743326, + "loss": 0.84618139, + "num_input_tokens_seen": 178705808, + "router_z_loss_mlp": 0.3684082, + "step": 2146, + "time_per_iteration": 2.987541913986206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098071, + "balance_loss_mlp": 1.05994785, + "epoch": 0.41304347826086957, + "flos": 555062644224.0, + "grad_norm": 0.04596756157996359, + "language_loss": 0.82878035, + "learning_rate": 0.0006621325980659322, + "loss": 0.83976108, + "num_input_tokens_seen": 178781200, + "router_z_loss_mlp": 0.38061523, + "step": 2147, + "time_per_iteration": 2.821556568145752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104625, + "balance_loss_mlp": 1.0655247, + "epoch": 0.41323585994613315, + "flos": 665418765312.0, + "grad_norm": 0.06740751064613239, + "language_loss": 0.8204211, + "learning_rate": 0.000661837858112075, + "loss": 0.83146733, + "num_input_tokens_seen": 178855072, + "router_z_loss_mlp": 0.390625, + "step": 2148, + "time_per_iteration": 2.7922754287719727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089584, + "balance_loss_mlp": 1.05136561, + "epoch": 0.4134282416313967, + "flos": 548429405184.0, + "grad_norm": 0.050771109286751076, + "language_loss": 0.88476944, + "learning_rate": 0.0006615430553271888, + "loss": 0.89566529, + "num_input_tokens_seen": 178927936, + "router_z_loss_mlp": 0.38208008, + "step": 2149, + "time_per_iteration": 2.7367136478424072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091169, + "balance_loss_mlp": 1.05326056, + "epoch": 0.4136206233166603, + "flos": 645951062016.0, + "grad_norm": 0.056682848656222896, + "language_loss": 0.85300201, + "learning_rate": 0.0006612481898257264, + "loss": 0.86391366, + "num_input_tokens_seen": 179007792, + "router_z_loss_mlp": 0.37866211, + "step": 2150, + "time_per_iteration": 2.862969160079956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082558, + "balance_loss_mlp": 1.04398179, + "epoch": 0.4138130050019238, + "flos": 517103640576.0, + "grad_norm": 0.07190872816549171, + "language_loss": 0.85216105, + "learning_rate": 0.000660953261722165, + "loss": 0.86298662, + "num_input_tokens_seen": 179075200, + "router_z_loss_mlp": 0.38549805, + "step": 2151, + "time_per_iteration": 2.608966588973999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072414, + "balance_loss_mlp": 1.03379023, + "epoch": 0.4140053866871874, + "flos": 608977073664.0, + "grad_norm": 0.05213877076699988, + "language_loss": 0.82764488, + "learning_rate": 0.0006606582711310055, + "loss": 0.83836901, + "num_input_tokens_seen": 179144448, + "router_z_loss_mlp": 0.38574219, + "step": 2152, + "time_per_iteration": 2.704941511154175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081545, + "balance_loss_mlp": 1.04287302, + "epoch": 0.4141977683724509, + "flos": 579493301760.0, + "grad_norm": 0.0573275470165796, + "language_loss": 0.83345616, + "learning_rate": 0.0006603632181667736, + "loss": 0.8442716, + "num_input_tokens_seen": 179215776, + "router_z_loss_mlp": 0.38671875, + "step": 2153, + "time_per_iteration": 2.670036792755127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157558, + "balance_loss_mlp": 1.14086878, + "epoch": 0.4143901500577145, + "flos": 1306598777856.0, + "grad_norm": 0.04466441147089705, + "language_loss": 0.78943324, + "learning_rate": 0.0006600681029440187, + "loss": 0.80100882, + "num_input_tokens_seen": 179436688, + "router_z_loss_mlp": 0.16699219, + "step": 2154, + "time_per_iteration": 4.936178684234619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088281, + "balance_loss_mlp": 1.04989576, + "epoch": 0.41458253174297804, + "flos": 459957740544.0, + "grad_norm": 0.05825483779723247, + "language_loss": 0.81504506, + "learning_rate": 0.0006597729255773153, + "loss": 0.82592785, + "num_input_tokens_seen": 179503264, + "router_z_loss_mlp": 0.38354492, + "step": 2155, + "time_per_iteration": 2.5436675548553467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095127, + "balance_loss_mlp": 1.056885, + "epoch": 0.41477491342824163, + "flos": 553096995840.0, + "grad_norm": 0.14369101348323118, + "language_loss": 0.82126498, + "learning_rate": 0.0006594776861812608, + "loss": 0.83221632, + "num_input_tokens_seen": 179574864, + "router_z_loss_mlp": 0.38183594, + "step": 2156, + "time_per_iteration": 2.6603870391845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102341, + "balance_loss_mlp": 1.06414664, + "epoch": 0.4149672951135052, + "flos": 697444356096.0, + "grad_norm": 0.09619651786969989, + "language_loss": 0.86957002, + "learning_rate": 0.0006591823848704776, + "loss": 0.88059342, + "num_input_tokens_seen": 179658208, + "router_z_loss_mlp": 0.38183594, + "step": 2157, + "time_per_iteration": 2.888523578643799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112154, + "balance_loss_mlp": 1.07362556, + "epoch": 0.41515967679876875, + "flos": 565480355328.0, + "grad_norm": 0.06180894820080996, + "language_loss": 0.81514823, + "learning_rate": 0.0006588870217596117, + "loss": 0.82626975, + "num_input_tokens_seen": 179732320, + "router_z_loss_mlp": 0.38500977, + "step": 2158, + "time_per_iteration": 2.7872376441955566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124674, + "balance_loss_mlp": 1.08497691, + "epoch": 0.41535205848403234, + "flos": 500938781184.0, + "grad_norm": 0.08519942481898463, + "language_loss": 0.85712391, + "learning_rate": 0.0006585915969633334, + "loss": 0.86837065, + "num_input_tokens_seen": 179801616, + "router_z_loss_mlp": 0.39672852, + "step": 2159, + "time_per_iteration": 2.5857338905334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135799, + "balance_loss_mlp": 1.09703159, + "epoch": 0.41554444016929587, + "flos": 607268911104.0, + "grad_norm": 0.06479316283343547, + "language_loss": 0.89294302, + "learning_rate": 0.0006582961105963366, + "loss": 0.90430105, + "num_input_tokens_seen": 179876112, + "router_z_loss_mlp": 0.38720703, + "step": 2160, + "time_per_iteration": 2.7831602096557617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153796, + "balance_loss_mlp": 1.11493373, + "epoch": 0.41573682185455946, + "flos": 528856985088.0, + "grad_norm": 0.06215124272048543, + "language_loss": 0.77626073, + "learning_rate": 0.0006580005627733395, + "loss": 0.7877987, + "num_input_tokens_seen": 179949936, + "router_z_loss_mlp": 0.38818359, + "step": 2161, + "time_per_iteration": 2.6620304584503174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152884, + "balance_loss_mlp": 1.11349678, + "epoch": 0.415929203539823, + "flos": 504686785536.0, + "grad_norm": 0.0577168801928891, + "language_loss": 0.81587994, + "learning_rate": 0.0006577049536090838, + "loss": 0.82740879, + "num_input_tokens_seen": 180023184, + "router_z_loss_mlp": 0.39355469, + "step": 2162, + "time_per_iteration": 2.6678874492645264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144655, + "balance_loss_mlp": 1.10693753, + "epoch": 0.4161215852250866, + "flos": 582467286528.0, + "grad_norm": 0.07160302952697103, + "language_loss": 0.85415941, + "learning_rate": 0.000657409283218335, + "loss": 0.86560595, + "num_input_tokens_seen": 180091728, + "router_z_loss_mlp": 0.37695312, + "step": 2163, + "time_per_iteration": 2.6405746936798096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134301, + "balance_loss_mlp": 1.09570062, + "epoch": 0.4163139669103501, + "flos": 490432320000.0, + "grad_norm": 0.051386242205519156, + "language_loss": 0.80774486, + "learning_rate": 0.0006571135517158829, + "loss": 0.81908786, + "num_input_tokens_seen": 180162096, + "router_z_loss_mlp": 0.38549805, + "step": 2164, + "time_per_iteration": 2.6496996879577637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218225, + "balance_loss_mlp": 1.20143986, + "epoch": 0.4165063485956137, + "flos": 1287445377024.0, + "grad_norm": 0.06520745435981959, + "language_loss": 0.76764059, + "learning_rate": 0.0006568177592165404, + "loss": 0.77982283, + "num_input_tokens_seen": 180380912, + "router_z_loss_mlp": 0.16796875, + "step": 2165, + "time_per_iteration": 4.76560640335083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127203, + "balance_loss_mlp": 1.09003401, + "epoch": 0.4166987302808773, + "flos": 495015542784.0, + "grad_norm": 0.07154886739030113, + "language_loss": 0.83213758, + "learning_rate": 0.0006565219058351444, + "loss": 0.8434096, + "num_input_tokens_seen": 180447424, + "router_z_loss_mlp": 0.37133789, + "step": 2166, + "time_per_iteration": 2.539856433868408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112149, + "balance_loss_mlp": 1.07397866, + "epoch": 0.4168911119661408, + "flos": 463823608320.0, + "grad_norm": 0.0764039854303378, + "language_loss": 0.83196324, + "learning_rate": 0.0006562259916865553, + "loss": 0.84308469, + "num_input_tokens_seen": 180516336, + "router_z_loss_mlp": 0.38110352, + "step": 2167, + "time_per_iteration": 2.5938220024108887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106062, + "balance_loss_mlp": 1.06939304, + "epoch": 0.4170834936514044, + "flos": 536499970560.0, + "grad_norm": 0.052882286550722295, + "language_loss": 0.7941224, + "learning_rate": 0.0006559300168856573, + "loss": 0.80518305, + "num_input_tokens_seen": 180589824, + "router_z_loss_mlp": 0.36694336, + "step": 2168, + "time_per_iteration": 2.7382309436798096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100629, + "balance_loss_mlp": 1.0633167, + "epoch": 0.41727587533666793, + "flos": 550418374656.0, + "grad_norm": 0.05257418188896324, + "language_loss": 0.85768378, + "learning_rate": 0.0006556339815473577, + "loss": 0.86869007, + "num_input_tokens_seen": 180661296, + "router_z_loss_mlp": 0.37280273, + "step": 2169, + "time_per_iteration": 2.6762564182281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110352, + "balance_loss_mlp": 1.06501567, + "epoch": 0.4174682570219315, + "flos": 630795949056.0, + "grad_norm": 0.0440641640787593, + "language_loss": 0.85913342, + "learning_rate": 0.000655337885786588, + "loss": 0.87016863, + "num_input_tokens_seen": 180744896, + "router_z_loss_mlp": 0.38452148, + "step": 2170, + "time_per_iteration": 2.8669848442077637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098716, + "balance_loss_mlp": 1.06068778, + "epoch": 0.41766063870719505, + "flos": 519501454848.0, + "grad_norm": 0.07103396575336611, + "language_loss": 0.84732234, + "learning_rate": 0.0006550417297183025, + "loss": 0.85830951, + "num_input_tokens_seen": 180813008, + "router_z_loss_mlp": 0.37988281, + "step": 2171, + "time_per_iteration": 2.6471290588378906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110284, + "balance_loss_mlp": 1.0640254, + "epoch": 0.41785302039245864, + "flos": 557656897536.0, + "grad_norm": 0.051327988161677204, + "language_loss": 0.8175863, + "learning_rate": 0.0006547455134574793, + "loss": 0.82861477, + "num_input_tokens_seen": 180886480, + "router_z_loss_mlp": 0.38793945, + "step": 2172, + "time_per_iteration": 2.71508526802063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100888, + "balance_loss_mlp": 1.06338453, + "epoch": 0.41804540207772223, + "flos": 788156683776.0, + "grad_norm": 0.052280747851499734, + "language_loss": 0.84377366, + "learning_rate": 0.0006544492371191198, + "loss": 0.85478258, + "num_input_tokens_seen": 180973776, + "router_z_loss_mlp": 0.37475586, + "step": 2173, + "time_per_iteration": 3.114607810974121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096475, + "balance_loss_mlp": 1.05775642, + "epoch": 0.41823778376298576, + "flos": 903944240640.0, + "grad_norm": 0.04972167781175626, + "language_loss": 0.83103442, + "learning_rate": 0.0006541529008182485, + "loss": 0.84199917, + "num_input_tokens_seen": 181062768, + "router_z_loss_mlp": 0.38696289, + "step": 2174, + "time_per_iteration": 3.165484666824341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094456, + "balance_loss_mlp": 1.0563333, + "epoch": 0.41843016544824935, + "flos": 511308440064.0, + "grad_norm": 0.05116159603840096, + "language_loss": 0.8702668, + "learning_rate": 0.0006538565046699136, + "loss": 0.88121128, + "num_input_tokens_seen": 181129872, + "router_z_loss_mlp": 0.38085938, + "step": 2175, + "time_per_iteration": 2.5701253414154053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101136, + "balance_loss_mlp": 1.06389487, + "epoch": 0.4186225471335129, + "flos": 652774947840.0, + "grad_norm": 0.05537675869017034, + "language_loss": 0.81610411, + "learning_rate": 0.0006535600487891862, + "loss": 0.82711548, + "num_input_tokens_seen": 181208112, + "router_z_loss_mlp": 0.37231445, + "step": 2176, + "time_per_iteration": 2.7980031967163086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096315, + "balance_loss_mlp": 1.05900216, + "epoch": 0.41881492881877647, + "flos": 568892298240.0, + "grad_norm": 0.05573219506936483, + "language_loss": 0.89184308, + "learning_rate": 0.0006532635332911603, + "loss": 0.90280616, + "num_input_tokens_seen": 181278736, + "router_z_loss_mlp": 0.37304688, + "step": 2177, + "time_per_iteration": 2.64104962348938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092004, + "balance_loss_mlp": 1.05495393, + "epoch": 0.41900731050404, + "flos": 911478127104.0, + "grad_norm": 0.05325324025552218, + "language_loss": 0.80538237, + "learning_rate": 0.0006529669582909541, + "loss": 0.81630242, + "num_input_tokens_seen": 181362512, + "router_z_loss_mlp": 0.37011719, + "step": 2178, + "time_per_iteration": 3.21323299407959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108647, + "balance_loss_mlp": 1.04896641, + "epoch": 0.4191996921893036, + "flos": 535498988544.0, + "grad_norm": 0.06510625194491998, + "language_loss": 0.85975909, + "learning_rate": 0.0006526703239037077, + "loss": 0.87062377, + "num_input_tokens_seen": 181432080, + "router_z_loss_mlp": 0.375, + "step": 2179, + "time_per_iteration": 2.630338430404663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086501, + "balance_loss_mlp": 1.0496887, + "epoch": 0.4193920738745671, + "flos": 582363979776.0, + "grad_norm": 0.04783092813648227, + "language_loss": 0.86411011, + "learning_rate": 0.0006523736302445851, + "loss": 0.8749752, + "num_input_tokens_seen": 181507296, + "router_z_loss_mlp": 0.36816406, + "step": 2180, + "time_per_iteration": 2.7710120677948 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083583, + "balance_loss_mlp": 1.04681921, + "epoch": 0.4195844555598307, + "flos": 1335279720960.0, + "grad_norm": 0.05415818779113344, + "language_loss": 0.77215266, + "learning_rate": 0.0006520768774287728, + "loss": 0.78298849, + "num_input_tokens_seen": 181599408, + "router_z_loss_mlp": 0.36743164, + "step": 2181, + "time_per_iteration": 3.738273859024048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082946, + "balance_loss_mlp": 1.04642057, + "epoch": 0.4197768372450943, + "flos": 598480786944.0, + "grad_norm": 0.04672312513315136, + "language_loss": 0.85467362, + "learning_rate": 0.0006517800655714806, + "loss": 0.86550307, + "num_input_tokens_seen": 181674944, + "router_z_loss_mlp": 0.36547852, + "step": 2182, + "time_per_iteration": 2.796132802963257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076263, + "balance_loss_mlp": 1.04016638, + "epoch": 0.4199692189303578, + "flos": 734929085952.0, + "grad_norm": 0.05966366646918548, + "language_loss": 0.84806752, + "learning_rate": 0.0006514831947879407, + "loss": 0.85883021, + "num_input_tokens_seen": 181756704, + "router_z_loss_mlp": 0.36132812, + "step": 2183, + "time_per_iteration": 2.9417624473571777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077956, + "balance_loss_mlp": 1.04243183, + "epoch": 0.4201616006156214, + "flos": 749854264320.0, + "grad_norm": 0.05811307518141115, + "language_loss": 0.78259802, + "learning_rate": 0.0006511862651934091, + "loss": 0.79337758, + "num_input_tokens_seen": 181837952, + "router_z_loss_mlp": 0.35522461, + "step": 2184, + "time_per_iteration": 3.0546512603759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082116, + "balance_loss_mlp": 1.04601932, + "epoch": 0.42035398230088494, + "flos": 546764912640.0, + "grad_norm": 0.041926600273946305, + "language_loss": 0.82459891, + "learning_rate": 0.0006508892769031638, + "loss": 0.83542007, + "num_input_tokens_seen": 181906896, + "router_z_loss_mlp": 0.36083984, + "step": 2185, + "time_per_iteration": 2.7021775245666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085419, + "balance_loss_mlp": 1.04972804, + "epoch": 0.42054636398614853, + "flos": 616628823552.0, + "grad_norm": 0.31605549573939495, + "language_loss": 0.86902821, + "learning_rate": 0.000650592230032506, + "loss": 0.87988245, + "num_input_tokens_seen": 181974976, + "router_z_loss_mlp": 0.35742188, + "step": 2186, + "time_per_iteration": 2.725625514984131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090024, + "balance_loss_mlp": 1.05175829, + "epoch": 0.42073874567141206, + "flos": 640077285888.0, + "grad_norm": 0.04878826269588872, + "language_loss": 0.84995645, + "learning_rate": 0.0006502951246967595, + "loss": 0.86085677, + "num_input_tokens_seen": 182054704, + "router_z_loss_mlp": 0.38256836, + "step": 2187, + "time_per_iteration": 2.8762335777282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092344, + "balance_loss_mlp": 1.05517459, + "epoch": 0.42093112735667565, + "flos": 493524168192.0, + "grad_norm": 0.05435264660880543, + "language_loss": 0.86905056, + "learning_rate": 0.0006499979610112706, + "loss": 0.87997395, + "num_input_tokens_seen": 182129696, + "router_z_loss_mlp": 0.37158203, + "step": 2188, + "time_per_iteration": 2.7210283279418945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105519, + "balance_loss_mlp": 1.06615603, + "epoch": 0.4211235090419392, + "flos": 542097321984.0, + "grad_norm": 0.05832158753777823, + "language_loss": 0.84076196, + "learning_rate": 0.000649700739091409, + "loss": 0.85181713, + "num_input_tokens_seen": 182203792, + "router_z_loss_mlp": 0.39331055, + "step": 2189, + "time_per_iteration": 2.70627498626709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109273, + "balance_loss_mlp": 1.09582591, + "epoch": 0.42131589072720277, + "flos": 1531342651392.0, + "grad_norm": 0.0317680876714807, + "language_loss": 0.73836273, + "learning_rate": 0.0006494034590525657, + "loss": 0.74945545, + "num_input_tokens_seen": 182432080, + "router_z_loss_mlp": 0.13476562, + "step": 2190, + "time_per_iteration": 4.8291919231414795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103729, + "balance_loss_mlp": 1.0656538, + "epoch": 0.42150827241246636, + "flos": 566583234048.0, + "grad_norm": 0.055290985630161965, + "language_loss": 0.85335857, + "learning_rate": 0.0006491061210101557, + "loss": 0.86439586, + "num_input_tokens_seen": 182500256, + "router_z_loss_mlp": 0.38037109, + "step": 2191, + "time_per_iteration": 2.669895887374878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096378, + "balance_loss_mlp": 1.05770612, + "epoch": 0.4217006540977299, + "flos": 707242226688.0, + "grad_norm": 0.050091435221191714, + "language_loss": 0.83998156, + "learning_rate": 0.0006488087250796157, + "loss": 0.85094529, + "num_input_tokens_seen": 182582912, + "router_z_loss_mlp": 0.38623047, + "step": 2192, + "time_per_iteration": 2.951594352722168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098297, + "balance_loss_mlp": 1.05864835, + "epoch": 0.4218930357829935, + "flos": 626975161344.0, + "grad_norm": 0.047618767001194696, + "language_loss": 0.81377089, + "learning_rate": 0.0006485112713764049, + "loss": 0.82475388, + "num_input_tokens_seen": 182670304, + "router_z_loss_mlp": 0.39624023, + "step": 2193, + "time_per_iteration": 2.943021535873413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095527, + "balance_loss_mlp": 1.05592585, + "epoch": 0.422085417468257, + "flos": 460110509568.0, + "grad_norm": 0.051159508672241207, + "language_loss": 0.83686495, + "learning_rate": 0.0006482137600160051, + "loss": 0.84782028, + "num_input_tokens_seen": 182735024, + "router_z_loss_mlp": 0.39575195, + "step": 2194, + "time_per_iteration": 2.5134236812591553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095144, + "balance_loss_mlp": 1.05590069, + "epoch": 0.4222777991535206, + "flos": 473788804608.0, + "grad_norm": 0.10490890222415104, + "language_loss": 0.84473735, + "learning_rate": 0.0006479161911139206, + "loss": 0.85568881, + "num_input_tokens_seen": 182805024, + "router_z_loss_mlp": 0.39208984, + "step": 2195, + "time_per_iteration": 2.577578544616699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096098, + "balance_loss_mlp": 1.05754566, + "epoch": 0.4224701808387841, + "flos": 470647494144.0, + "grad_norm": 0.0782943385788455, + "language_loss": 0.85684174, + "learning_rate": 0.0006476185647856778, + "loss": 0.86780274, + "num_input_tokens_seen": 182871360, + "router_z_loss_mlp": 0.38500977, + "step": 2196, + "time_per_iteration": 2.578495740890503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102208, + "balance_loss_mlp": 1.06286871, + "epoch": 0.4226625625240477, + "flos": 677202633216.0, + "grad_norm": 0.22187176821456261, + "language_loss": 0.81400013, + "learning_rate": 0.0006473208811468255, + "loss": 0.82502222, + "num_input_tokens_seen": 182952912, + "router_z_loss_mlp": 0.39306641, + "step": 2197, + "time_per_iteration": 2.870922088623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099017, + "balance_loss_mlp": 1.05984497, + "epoch": 0.4228549442093113, + "flos": 503268194304.0, + "grad_norm": 0.05214229642018916, + "language_loss": 0.8430717, + "learning_rate": 0.0006470231403129347, + "loss": 0.85406196, + "num_input_tokens_seen": 183022016, + "router_z_loss_mlp": 0.39135742, + "step": 2198, + "time_per_iteration": 2.5834295749664307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098125, + "balance_loss_mlp": 1.05959654, + "epoch": 0.42304732589457483, + "flos": 611543623680.0, + "grad_norm": 0.055955286861533095, + "language_loss": 0.81645906, + "learning_rate": 0.0006467253423995988, + "loss": 0.82744032, + "num_input_tokens_seen": 183101776, + "router_z_loss_mlp": 0.38500977, + "step": 2199, + "time_per_iteration": 2.8634603023529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097002, + "balance_loss_mlp": 1.05854511, + "epoch": 0.4232397075798384, + "flos": 515302345728.0, + "grad_norm": 0.05326479811347408, + "language_loss": 0.79026473, + "learning_rate": 0.000646427487522433, + "loss": 0.80123472, + "num_input_tokens_seen": 183171392, + "router_z_loss_mlp": 0.38452148, + "step": 2200, + "time_per_iteration": 2.649003744125366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102129, + "balance_loss_mlp": 1.063815, + "epoch": 0.42343208926510195, + "flos": 589513752576.0, + "grad_norm": 0.053706873495154336, + "language_loss": 0.83035368, + "learning_rate": 0.0006461295757970749, + "loss": 0.84137499, + "num_input_tokens_seen": 183253936, + "router_z_loss_mlp": 0.3828125, + "step": 2201, + "time_per_iteration": 2.8269903659820557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103583, + "balance_loss_mlp": 1.06379044, + "epoch": 0.42362447095036554, + "flos": 640342126080.0, + "grad_norm": 0.05615670023579285, + "language_loss": 0.8144629, + "learning_rate": 0.0006458316073391839, + "loss": 0.8254987, + "num_input_tokens_seen": 183333744, + "router_z_loss_mlp": 0.39770508, + "step": 2202, + "time_per_iteration": 2.9145257472991943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094508, + "balance_loss_mlp": 1.05595589, + "epoch": 0.42381685263562907, + "flos": 512421493248.0, + "grad_norm": 0.05176927409450969, + "language_loss": 0.87622833, + "learning_rate": 0.0006455335822644422, + "loss": 0.88717341, + "num_input_tokens_seen": 183401904, + "router_z_loss_mlp": 0.38525391, + "step": 2203, + "time_per_iteration": 2.596822500228882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099235, + "balance_loss_mlp": 1.06032515, + "epoch": 0.42400923432089266, + "flos": 546523393536.0, + "grad_norm": 0.08269999762480702, + "language_loss": 0.77441901, + "learning_rate": 0.0006452355006885527, + "loss": 0.78541136, + "num_input_tokens_seen": 183471312, + "router_z_loss_mlp": 0.38867188, + "step": 2204, + "time_per_iteration": 2.6238672733306885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105054, + "balance_loss_mlp": 1.06533396, + "epoch": 0.4242016160061562, + "flos": 621872584704.0, + "grad_norm": 0.06279334467905663, + "language_loss": 0.86963212, + "learning_rate": 0.0006449373627272412, + "loss": 0.88068271, + "num_input_tokens_seen": 183539184, + "router_z_loss_mlp": 0.39697266, + "step": 2205, + "time_per_iteration": 2.715792417526245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094037, + "balance_loss_mlp": 1.05515122, + "epoch": 0.4243939976914198, + "flos": 571649495040.0, + "grad_norm": 0.055815664393925046, + "language_loss": 0.82368463, + "learning_rate": 0.0006446391684962553, + "loss": 0.83462495, + "num_input_tokens_seen": 183607504, + "router_z_loss_mlp": 0.38867188, + "step": 2206, + "time_per_iteration": 2.642230987548828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096131, + "balance_loss_mlp": 1.05822253, + "epoch": 0.42458637937668336, + "flos": 448509934080.0, + "grad_norm": 0.05868479731789126, + "language_loss": 0.83175069, + "learning_rate": 0.000644340918111364, + "loss": 0.84271193, + "num_input_tokens_seen": 183674720, + "router_z_loss_mlp": 0.37841797, + "step": 2207, + "time_per_iteration": 2.5489144325256348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096536, + "balance_loss_mlp": 1.0566721, + "epoch": 0.4247787610619469, + "flos": 435176464896.0, + "grad_norm": 0.05469710752121124, + "language_loss": 0.84862429, + "learning_rate": 0.0006440426116883585, + "loss": 0.8595897, + "num_input_tokens_seen": 183740448, + "router_z_loss_mlp": 0.3984375, + "step": 2208, + "time_per_iteration": 2.5027823448181152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106001, + "balance_loss_mlp": 1.06563711, + "epoch": 0.4249711427472105, + "flos": 495818675712.0, + "grad_norm": 0.04694631121992161, + "language_loss": 0.86197406, + "learning_rate": 0.0006437442493430519, + "loss": 0.87303412, + "num_input_tokens_seen": 183812640, + "router_z_loss_mlp": 0.40356445, + "step": 2209, + "time_per_iteration": 2.624462127685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111613, + "balance_loss_mlp": 1.0711534, + "epoch": 0.425163524432474, + "flos": 655498649088.0, + "grad_norm": 0.06243114219893557, + "language_loss": 0.86437929, + "learning_rate": 0.000643445831191278, + "loss": 0.87549543, + "num_input_tokens_seen": 183895312, + "router_z_loss_mlp": 0.40454102, + "step": 2210, + "time_per_iteration": 2.883671760559082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110528, + "balance_loss_mlp": 1.06544065, + "epoch": 0.4253559061177376, + "flos": 650317496832.0, + "grad_norm": 0.059150918853506505, + "language_loss": 0.81800103, + "learning_rate": 0.0006431473573488937, + "loss": 0.82905388, + "num_input_tokens_seen": 183966384, + "router_z_loss_mlp": 0.39819336, + "step": 2211, + "time_per_iteration": 2.723308563232422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098753, + "balance_loss_mlp": 1.05807877, + "epoch": 0.42554828780300114, + "flos": 553894336512.0, + "grad_norm": 0.05841858860857517, + "language_loss": 0.84883767, + "learning_rate": 0.0006428488279317765, + "loss": 0.85982525, + "num_input_tokens_seen": 184031728, + "router_z_loss_mlp": 0.40673828, + "step": 2212, + "time_per_iteration": 2.628831148147583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098786, + "balance_loss_mlp": 1.05904126, + "epoch": 0.4257406694882647, + "flos": 514154386944.0, + "grad_norm": 0.056764121975701104, + "language_loss": 0.87647104, + "learning_rate": 0.0006425502430558259, + "loss": 0.88745892, + "num_input_tokens_seen": 184096160, + "router_z_loss_mlp": 0.39746094, + "step": 2213, + "time_per_iteration": 2.604146718978882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094618, + "balance_loss_mlp": 1.0550406, + "epoch": 0.42593305117352825, + "flos": 515380921344.0, + "grad_norm": 0.05046529876809897, + "language_loss": 0.84638417, + "learning_rate": 0.0006422516028369628, + "loss": 0.85733032, + "num_input_tokens_seen": 184169664, + "router_z_loss_mlp": 0.39550781, + "step": 2214, + "time_per_iteration": 2.6178741455078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088889, + "balance_loss_mlp": 1.04864407, + "epoch": 0.42612543285879184, + "flos": 587766302208.0, + "grad_norm": 0.04660283784017015, + "language_loss": 0.83496028, + "learning_rate": 0.0006419529073911296, + "loss": 0.84584916, + "num_input_tokens_seen": 184249152, + "router_z_loss_mlp": 0.40234375, + "step": 2215, + "time_per_iteration": 2.8105666637420654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085879, + "balance_loss_mlp": 1.04515672, + "epoch": 0.42631781454405543, + "flos": 635153619456.0, + "grad_norm": 0.05277435964401644, + "language_loss": 0.85660267, + "learning_rate": 0.0006416541568342901, + "loss": 0.86746144, + "num_input_tokens_seen": 184326816, + "router_z_loss_mlp": 0.40722656, + "step": 2216, + "time_per_iteration": 2.880662441253662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080832, + "balance_loss_mlp": 1.040277, + "epoch": 0.42651019622931896, + "flos": 540891136512.0, + "grad_norm": 0.04969535335028593, + "language_loss": 0.84409285, + "learning_rate": 0.0006413553512824297, + "loss": 0.85490113, + "num_input_tokens_seen": 184404336, + "router_z_loss_mlp": 0.40551758, + "step": 2217, + "time_per_iteration": 2.7169618606567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108871, + "balance_loss_mlp": 1.0485599, + "epoch": 0.42670257791458255, + "flos": 557892624384.0, + "grad_norm": 0.052410461022671016, + "language_loss": 0.84532559, + "learning_rate": 0.0006410564908515549, + "loss": 0.85621268, + "num_input_tokens_seen": 184472320, + "router_z_loss_mlp": 0.40136719, + "step": 2218, + "time_per_iteration": 2.657231092453003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077064, + "balance_loss_mlp": 1.03710461, + "epoch": 0.4268949595998461, + "flos": 621025781760.0, + "grad_norm": 0.054635208049088675, + "language_loss": 0.8539567, + "learning_rate": 0.0006407575756576935, + "loss": 0.86472738, + "num_input_tokens_seen": 184544704, + "router_z_loss_mlp": 0.39941406, + "step": 2219, + "time_per_iteration": 2.7336490154266357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089202, + "balance_loss_mlp": 1.04921913, + "epoch": 0.42708734128510967, + "flos": 537646519296.0, + "grad_norm": 0.04674173481591379, + "language_loss": 0.8770538, + "learning_rate": 0.0006404586058168951, + "loss": 0.88794577, + "num_input_tokens_seen": 184622544, + "router_z_loss_mlp": 0.3996582, + "step": 2220, + "time_per_iteration": 2.757380723953247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080306, + "balance_loss_mlp": 1.0401566, + "epoch": 0.4272797229703732, + "flos": 502617830400.0, + "grad_norm": 0.05080694298179496, + "language_loss": 0.86598134, + "learning_rate": 0.0006401595814452296, + "loss": 0.87678444, + "num_input_tokens_seen": 184692544, + "router_z_loss_mlp": 0.40136719, + "step": 2221, + "time_per_iteration": 2.583448886871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082627, + "balance_loss_mlp": 1.04252505, + "epoch": 0.4274721046556368, + "flos": 492208883712.0, + "grad_norm": 0.05244104927134987, + "language_loss": 0.80640519, + "learning_rate": 0.000639860502658789, + "loss": 0.81723142, + "num_input_tokens_seen": 184760480, + "router_z_loss_mlp": 0.40087891, + "step": 2222, + "time_per_iteration": 2.6454262733459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080551, + "balance_loss_mlp": 1.04149842, + "epoch": 0.4276644863409004, + "flos": 568094957568.0, + "grad_norm": 0.049852493850949496, + "language_loss": 0.84906983, + "learning_rate": 0.0006395613695736853, + "loss": 0.85987538, + "num_input_tokens_seen": 184834080, + "router_z_loss_mlp": 0.39038086, + "step": 2223, + "time_per_iteration": 2.6607768535614014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108883, + "balance_loss_mlp": 1.04841852, + "epoch": 0.4278568680261639, + "flos": 607155429888.0, + "grad_norm": 0.052366739862963044, + "language_loss": 0.8181783, + "learning_rate": 0.0006392621823060529, + "loss": 0.82906657, + "num_input_tokens_seen": 184905872, + "router_z_loss_mlp": 0.40405273, + "step": 2224, + "time_per_iteration": 2.7084245681762695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085727, + "balance_loss_mlp": 1.045434, + "epoch": 0.4280492497114275, + "flos": 560265707520.0, + "grad_norm": 0.062247479017330604, + "language_loss": 0.85044312, + "learning_rate": 0.0006389629409720465, + "loss": 0.86130041, + "num_input_tokens_seen": 184972320, + "router_z_loss_mlp": 0.40307617, + "step": 2225, + "time_per_iteration": 2.6494481563568115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083023, + "balance_loss_mlp": 1.04451835, + "epoch": 0.428241631396691, + "flos": 720334176768.0, + "grad_norm": 0.05784613309553924, + "language_loss": 0.88236213, + "learning_rate": 0.0006386636456878417, + "loss": 0.89319241, + "num_input_tokens_seen": 185051040, + "router_z_loss_mlp": 0.38452148, + "step": 2226, + "time_per_iteration": 2.8575398921966553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086633, + "balance_loss_mlp": 1.04643595, + "epoch": 0.4284340130819546, + "flos": 429243052032.0, + "grad_norm": 0.05660062263134159, + "language_loss": 0.9185167, + "learning_rate": 0.0006383642965696353, + "loss": 0.92938304, + "num_input_tokens_seen": 185113552, + "router_z_loss_mlp": 0.40185547, + "step": 2227, + "time_per_iteration": 2.436495065689087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093242, + "balance_loss_mlp": 1.05240059, + "epoch": 0.42862639476721814, + "flos": 524732069376.0, + "grad_norm": 0.06503204597883332, + "language_loss": 0.82736492, + "learning_rate": 0.000638064893733645, + "loss": 0.83829737, + "num_input_tokens_seen": 185185056, + "router_z_loss_mlp": 0.40844727, + "step": 2228, + "time_per_iteration": 2.737835645675659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097834, + "balance_loss_mlp": 1.05937719, + "epoch": 0.42881877645248173, + "flos": 465089430528.0, + "grad_norm": 0.05835798065495767, + "language_loss": 0.90023828, + "learning_rate": 0.000637765437296109, + "loss": 0.91121662, + "num_input_tokens_seen": 185257248, + "router_z_loss_mlp": 0.38427734, + "step": 2229, + "time_per_iteration": 2.6694185733795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102575, + "balance_loss_mlp": 1.06383204, + "epoch": 0.42901115813774526, + "flos": 560034362880.0, + "grad_norm": 0.048777417646368525, + "language_loss": 0.85443366, + "learning_rate": 0.000637465927373287, + "loss": 0.86545944, + "num_input_tokens_seen": 185324800, + "router_z_loss_mlp": 0.38720703, + "step": 2230, + "time_per_iteration": 2.608868360519409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097095, + "balance_loss_mlp": 1.05942452, + "epoch": 0.42920353982300885, + "flos": 561186703872.0, + "grad_norm": 0.058529600310023314, + "language_loss": 0.78994036, + "learning_rate": 0.000637166364081459, + "loss": 0.80091131, + "num_input_tokens_seen": 185393408, + "router_z_loss_mlp": 0.37670898, + "step": 2231, + "time_per_iteration": 2.6343741416931152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109752, + "balance_loss_mlp": 1.06089842, + "epoch": 0.42939592150827244, + "flos": 555982230528.0, + "grad_norm": 0.06635954042372831, + "language_loss": 0.84122705, + "learning_rate": 0.0006368667475369256, + "loss": 0.8522023, + "num_input_tokens_seen": 185467968, + "router_z_loss_mlp": 0.36621094, + "step": 2232, + "time_per_iteration": 2.719153881072998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01385097, + "balance_loss_mlp": 1.36373484, + "epoch": 0.42958830319353597, + "flos": 1520796902400.0, + "grad_norm": 0.10507214536659652, + "language_loss": 0.78527778, + "learning_rate": 0.0006365670778560084, + "loss": 0.79912877, + "num_input_tokens_seen": 185705232, + "router_z_loss_mlp": 0.21386719, + "step": 2233, + "time_per_iteration": 4.869459390640259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222541, + "balance_loss_mlp": 1.20547056, + "epoch": 0.42978068487879956, + "flos": 1495052522496.0, + "grad_norm": 0.06278147410173565, + "language_loss": 0.78895426, + "learning_rate": 0.0006362673551550494, + "loss": 0.80117965, + "num_input_tokens_seen": 185932672, + "router_z_loss_mlp": 0.17089844, + "step": 2234, + "time_per_iteration": 4.809493780136108 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101654, + "balance_loss_mlp": 1.06386471, + "epoch": 0.4299730665640631, + "flos": 546725624832.0, + "grad_norm": 0.047028007384334866, + "language_loss": 0.86220634, + "learning_rate": 0.0006359675795504112, + "loss": 0.87322283, + "num_input_tokens_seen": 186006288, + "router_z_loss_mlp": 0.37744141, + "step": 2235, + "time_per_iteration": 2.644548177719116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104888, + "balance_loss_mlp": 1.06671751, + "epoch": 0.4301654482493267, + "flos": 1128839473152.0, + "grad_norm": 0.053864842268977364, + "language_loss": 0.7475214, + "learning_rate": 0.0006356677511584775, + "loss": 0.75857025, + "num_input_tokens_seen": 186097168, + "router_z_loss_mlp": 0.38134766, + "step": 2236, + "time_per_iteration": 3.473637580871582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104941, + "balance_loss_mlp": 1.06784356, + "epoch": 0.4303578299345902, + "flos": 495502963200.0, + "grad_norm": 0.07035023985335077, + "language_loss": 0.8582648, + "learning_rate": 0.0006353678700956511, + "loss": 0.86931419, + "num_input_tokens_seen": 186163904, + "router_z_loss_mlp": 0.37084961, + "step": 2237, + "time_per_iteration": 2.5412683486938477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110161, + "balance_loss_mlp": 1.0728724, + "epoch": 0.4305502116198538, + "flos": 615472100352.0, + "grad_norm": 0.048926528615743585, + "language_loss": 0.83597398, + "learning_rate": 0.0006350679364783569, + "loss": 0.84707558, + "num_input_tokens_seen": 186233888, + "router_z_loss_mlp": 0.37255859, + "step": 2238, + "time_per_iteration": 2.7351441383361816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108038, + "balance_loss_mlp": 1.0704397, + "epoch": 0.4307425933051173, + "flos": 558995503104.0, + "grad_norm": 0.05635941331688695, + "language_loss": 0.85586011, + "learning_rate": 0.0006347679504230393, + "loss": 0.8669405, + "num_input_tokens_seen": 186301168, + "router_z_loss_mlp": 0.37573242, + "step": 2239, + "time_per_iteration": 2.628014326095581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108584, + "balance_loss_mlp": 1.06981754, + "epoch": 0.4309349749903809, + "flos": 971755163136.0, + "grad_norm": 0.06390031403556296, + "language_loss": 0.75844669, + "learning_rate": 0.0006344679120461632, + "loss": 0.76953256, + "num_input_tokens_seen": 186392096, + "router_z_loss_mlp": 0.38745117, + "step": 2240, + "time_per_iteration": 3.325970411300659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099219, + "balance_loss_mlp": 1.06123924, + "epoch": 0.4311273566756445, + "flos": 541663746048.0, + "grad_norm": 0.07957466882071795, + "language_loss": 0.79994094, + "learning_rate": 0.0006341678214642134, + "loss": 0.81093317, + "num_input_tokens_seen": 186458000, + "router_z_loss_mlp": 0.37963867, + "step": 2241, + "time_per_iteration": 2.598954916000366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098329, + "balance_loss_mlp": 1.06118321, + "epoch": 0.43131973836090803, + "flos": 761316627456.0, + "grad_norm": 0.06316124390987561, + "language_loss": 0.82909411, + "learning_rate": 0.0006338676787936963, + "loss": 0.8400774, + "num_input_tokens_seen": 186544992, + "router_z_loss_mlp": 0.37133789, + "step": 2242, + "time_per_iteration": 3.057990074157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092454, + "balance_loss_mlp": 1.0547359, + "epoch": 0.4315121200461716, + "flos": 554263893504.0, + "grad_norm": 0.058630582948494374, + "language_loss": 0.83799654, + "learning_rate": 0.0006335674841511367, + "loss": 0.84892106, + "num_input_tokens_seen": 186614960, + "router_z_loss_mlp": 0.37670898, + "step": 2243, + "time_per_iteration": 2.667917490005493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152602, + "balance_loss_mlp": 1.1380111, + "epoch": 0.43170450173143515, + "flos": 1484499419136.0, + "grad_norm": 0.03105866471095203, + "language_loss": 0.7918117, + "learning_rate": 0.000633267237653081, + "loss": 0.80333769, + "num_input_tokens_seen": 186854288, + "router_z_loss_mlp": 0.14550781, + "step": 2244, + "time_per_iteration": 4.996346473693848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147416, + "balance_loss_mlp": 1.13225269, + "epoch": 0.43189688341669874, + "flos": 1472897433600.0, + "grad_norm": 0.02634625536346193, + "language_loss": 0.77365553, + "learning_rate": 0.0006329669394160953, + "loss": 0.78512967, + "num_input_tokens_seen": 187090272, + "router_z_loss_mlp": 0.15136719, + "step": 2245, + "time_per_iteration": 4.925641775131226 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090243, + "balance_loss_mlp": 1.05293071, + "epoch": 0.43208926510196227, + "flos": 492677365248.0, + "grad_norm": 0.04832922480589342, + "language_loss": 0.82476389, + "learning_rate": 0.0006326665895567652, + "loss": 0.83566636, + "num_input_tokens_seen": 187157584, + "router_z_loss_mlp": 0.37304688, + "step": 2246, + "time_per_iteration": 2.6338651180267334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108663, + "balance_loss_mlp": 1.04876888, + "epoch": 0.43228164678722586, + "flos": 519969936384.0, + "grad_norm": 0.06353903654252775, + "language_loss": 0.86891162, + "learning_rate": 0.0006323661881916976, + "loss": 0.87977791, + "num_input_tokens_seen": 187229408, + "router_z_loss_mlp": 0.37841797, + "step": 2247, + "time_per_iteration": 2.7270143032073975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088946, + "balance_loss_mlp": 1.05082273, + "epoch": 0.4324740284724894, + "flos": 795722655744.0, + "grad_norm": 0.06655581665723238, + "language_loss": 0.81039822, + "learning_rate": 0.0006320657354375179, + "loss": 0.82128775, + "num_input_tokens_seen": 187304384, + "router_z_loss_mlp": 0.38134766, + "step": 2248, + "time_per_iteration": 2.9334113597869873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090387, + "balance_loss_mlp": 1.05183434, + "epoch": 0.432666410157753, + "flos": 481917800448.0, + "grad_norm": 0.05858711608638651, + "language_loss": 0.87308645, + "learning_rate": 0.0006317652314108726, + "loss": 0.88399029, + "num_input_tokens_seen": 187368064, + "router_z_loss_mlp": 0.38500977, + "step": 2249, + "time_per_iteration": 2.5155436992645264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083427, + "balance_loss_mlp": 1.04508948, + "epoch": 0.43285879184301657, + "flos": 499963940352.0, + "grad_norm": 0.06176153995331203, + "language_loss": 0.91197717, + "learning_rate": 0.0006314646762284277, + "loss": 0.92281145, + "num_input_tokens_seen": 187436320, + "router_z_loss_mlp": 0.38305664, + "step": 2250, + "time_per_iteration": 2.5938589572906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151081, + "balance_loss_mlp": 1.13324702, + "epoch": 0.4330511735282801, + "flos": 1509615346176.0, + "grad_norm": 0.03602865793169688, + "language_loss": 0.75425828, + "learning_rate": 0.0006311640700068691, + "loss": 0.76576912, + "num_input_tokens_seen": 187670912, + "router_z_loss_mlp": 0.17871094, + "step": 2251, + "time_per_iteration": 4.858763217926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082209, + "balance_loss_mlp": 1.04322791, + "epoch": 0.4332435552135437, + "flos": 699270382080.0, + "grad_norm": 0.07106828010915285, + "language_loss": 0.77364099, + "learning_rate": 0.0006308634128629022, + "loss": 0.78446311, + "num_input_tokens_seen": 187746432, + "router_z_loss_mlp": 0.3894043, + "step": 2252, + "time_per_iteration": 2.857311487197876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081116, + "balance_loss_mlp": 1.04163396, + "epoch": 0.4334359368988072, + "flos": 591995934720.0, + "grad_norm": 0.05494240381392999, + "language_loss": 0.87411273, + "learning_rate": 0.0006305627049132531, + "loss": 0.88492393, + "num_input_tokens_seen": 187820032, + "router_z_loss_mlp": 0.39453125, + "step": 2253, + "time_per_iteration": 2.7931392192840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074672, + "balance_loss_mlp": 1.03628647, + "epoch": 0.4336283185840708, + "flos": 842440670208.0, + "grad_norm": 0.045544810523015906, + "language_loss": 0.85602796, + "learning_rate": 0.0006302619462746662, + "loss": 0.86677468, + "num_input_tokens_seen": 187904400, + "router_z_loss_mlp": 0.38330078, + "step": 2254, + "time_per_iteration": 3.137031078338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072053, + "balance_loss_mlp": 1.03521752, + "epoch": 0.43382070026933434, + "flos": 625974179328.0, + "grad_norm": 0.05597321467051534, + "language_loss": 0.90273923, + "learning_rate": 0.0006299611370639069, + "loss": 0.91345972, + "num_input_tokens_seen": 187973264, + "router_z_loss_mlp": 0.36816406, + "step": 2255, + "time_per_iteration": 2.7370500564575195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078678, + "balance_loss_mlp": 1.04029226, + "epoch": 0.4340130819545979, + "flos": 590837801472.0, + "grad_norm": 0.05249156720482198, + "language_loss": 0.7960273, + "learning_rate": 0.0006296602773977593, + "loss": 0.80681407, + "num_input_tokens_seen": 188039984, + "router_z_loss_mlp": 0.38354492, + "step": 2256, + "time_per_iteration": 2.671543836593628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082288, + "balance_loss_mlp": 1.04387856, + "epoch": 0.4342054636398615, + "flos": 490624376832.0, + "grad_norm": 0.047941706130753194, + "language_loss": 0.87283635, + "learning_rate": 0.0006293593673930277, + "loss": 0.88365924, + "num_input_tokens_seen": 188113456, + "router_z_loss_mlp": 0.3840332, + "step": 2257, + "time_per_iteration": 2.622807741165161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084566, + "balance_loss_mlp": 1.04694366, + "epoch": 0.43439784532512504, + "flos": 698679654912.0, + "grad_norm": 0.05256563639723818, + "language_loss": 0.78625226, + "learning_rate": 0.0006290584071665358, + "loss": 0.79709792, + "num_input_tokens_seen": 188192480, + "router_z_loss_mlp": 0.3762207, + "step": 2258, + "time_per_iteration": 2.8814268112182617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084583, + "balance_loss_mlp": 1.0463171, + "epoch": 0.43459022701038863, + "flos": 485581436928.0, + "grad_norm": 0.05582719483060078, + "language_loss": 0.82315511, + "learning_rate": 0.0006287573968351266, + "loss": 0.83400095, + "num_input_tokens_seen": 188258784, + "router_z_loss_mlp": 0.38256836, + "step": 2259, + "time_per_iteration": 2.530107259750366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093075, + "balance_loss_mlp": 1.05585814, + "epoch": 0.43478260869565216, + "flos": 642818515968.0, + "grad_norm": 0.06362082652150813, + "language_loss": 0.82416236, + "learning_rate": 0.0006284563365156626, + "loss": 0.83509314, + "num_input_tokens_seen": 188331312, + "router_z_loss_mlp": 0.37182617, + "step": 2260, + "time_per_iteration": 2.798595905303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088803, + "balance_loss_mlp": 1.05103791, + "epoch": 0.43497499038091575, + "flos": 425870396928.0, + "grad_norm": 0.05655312611086985, + "language_loss": 0.87709838, + "learning_rate": 0.0006281552263250261, + "loss": 0.88798642, + "num_input_tokens_seen": 188393712, + "router_z_loss_mlp": 0.37719727, + "step": 2261, + "time_per_iteration": 2.452665090560913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160927, + "balance_loss_mlp": 1.14223516, + "epoch": 0.4351673720661793, + "flos": 1537594748928.0, + "grad_norm": 0.04176446008295971, + "language_loss": 0.80691534, + "learning_rate": 0.000627854066380118, + "loss": 0.8185246, + "num_input_tokens_seen": 188621152, + "router_z_loss_mlp": 0.18652344, + "step": 2262, + "time_per_iteration": 4.821255207061768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101716, + "balance_loss_mlp": 1.0650475, + "epoch": 0.43535975375144287, + "flos": 748828551168.0, + "grad_norm": 0.06957692587484587, + "language_loss": 0.81302369, + "learning_rate": 0.0006275528567978593, + "loss": 0.82404089, + "num_input_tokens_seen": 188697120, + "router_z_loss_mlp": 0.36669922, + "step": 2263, + "time_per_iteration": 2.9021594524383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105224, + "balance_loss_mlp": 1.06710052, + "epoch": 0.4355521354367064, + "flos": 860914593792.0, + "grad_norm": 0.05359116837259303, + "language_loss": 0.8251968, + "learning_rate": 0.0006272515976951898, + "loss": 0.83624899, + "num_input_tokens_seen": 188778480, + "router_z_loss_mlp": 0.38134766, + "step": 2264, + "time_per_iteration": 3.051140546798706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100567, + "balance_loss_mlp": 1.06160915, + "epoch": 0.43574451712197, + "flos": 734200146432.0, + "grad_norm": 0.04085362180640218, + "language_loss": 0.79003727, + "learning_rate": 0.0006269502891890687, + "loss": 0.80104291, + "num_input_tokens_seen": 188863616, + "router_z_loss_mlp": 0.38916016, + "step": 2265, + "time_per_iteration": 2.987435817718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109612, + "balance_loss_mlp": 1.05899858, + "epoch": 0.4359368988072336, + "flos": 570296332800.0, + "grad_norm": 0.04646658934269887, + "language_loss": 0.88059056, + "learning_rate": 0.0006266489313964743, + "loss": 0.89155173, + "num_input_tokens_seen": 188933984, + "router_z_loss_mlp": 0.37109375, + "step": 2266, + "time_per_iteration": 2.718259572982788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098621, + "balance_loss_mlp": 1.06040287, + "epoch": 0.4361292804924971, + "flos": 555244526592.0, + "grad_norm": 0.06168340797293566, + "language_loss": 0.85241735, + "learning_rate": 0.0006263475244344041, + "loss": 0.86340356, + "num_input_tokens_seen": 189012976, + "router_z_loss_mlp": 0.38183594, + "step": 2267, + "time_per_iteration": 2.822174072265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099998, + "balance_loss_mlp": 1.06232774, + "epoch": 0.4363216621777607, + "flos": 557021090304.0, + "grad_norm": 0.06545155195827496, + "language_loss": 0.84663981, + "learning_rate": 0.0006260460684198746, + "loss": 0.85763973, + "num_input_tokens_seen": 189079664, + "router_z_loss_mlp": 0.37646484, + "step": 2268, + "time_per_iteration": 2.652629852294922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092951, + "balance_loss_mlp": 1.05556679, + "epoch": 0.4365140438630242, + "flos": 477979149312.0, + "grad_norm": 0.06144025960698331, + "language_loss": 0.84485406, + "learning_rate": 0.0006257445634699213, + "loss": 0.85578358, + "num_input_tokens_seen": 189144688, + "router_z_loss_mlp": 0.3737793, + "step": 2269, + "time_per_iteration": 2.526547431945801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091306, + "balance_loss_mlp": 1.05506659, + "epoch": 0.4367064255482878, + "flos": 578646498816.0, + "grad_norm": 0.047950904811088546, + "language_loss": 0.82840669, + "learning_rate": 0.0006254430097015993, + "loss": 0.83931977, + "num_input_tokens_seen": 189213984, + "router_z_loss_mlp": 0.36279297, + "step": 2270, + "time_per_iteration": 2.6397740840911865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121077, + "balance_loss_mlp": 1.1094898, + "epoch": 0.43689880723355135, + "flos": 1458117669888.0, + "grad_norm": 0.029995875979849037, + "language_loss": 0.76479089, + "learning_rate": 0.0006251414072319815, + "loss": 0.77600169, + "num_input_tokens_seen": 189434416, + "router_z_loss_mlp": 0.11572266, + "step": 2271, + "time_per_iteration": 4.781012535095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093451, + "balance_loss_mlp": 1.0559721, + "epoch": 0.43709118891881493, + "flos": 667295663616.0, + "grad_norm": 0.05579821190743498, + "language_loss": 0.85169244, + "learning_rate": 0.0006248397561781609, + "loss": 0.86262697, + "num_input_tokens_seen": 189513248, + "router_z_loss_mlp": 0.37426758, + "step": 2272, + "time_per_iteration": 2.8750343322753906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109427, + "balance_loss_mlp": 1.05617118, + "epoch": 0.43728357060407846, + "flos": 544612999680.0, + "grad_norm": 0.06638881020832643, + "language_loss": 0.86299849, + "learning_rate": 0.0006245380566572482, + "loss": 0.87394118, + "num_input_tokens_seen": 189585392, + "router_z_loss_mlp": 0.38085938, + "step": 2273, + "time_per_iteration": 2.667287826538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095571, + "balance_loss_mlp": 1.05873561, + "epoch": 0.43747595228934205, + "flos": 746504930304.0, + "grad_norm": 0.06509502789500103, + "language_loss": 0.75652242, + "learning_rate": 0.0006242363087863744, + "loss": 0.76747811, + "num_input_tokens_seen": 189667552, + "router_z_loss_mlp": 0.36816406, + "step": 2274, + "time_per_iteration": 2.948168992996216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088988, + "balance_loss_mlp": 1.05060267, + "epoch": 0.43766833397460564, + "flos": 631060789248.0, + "grad_norm": 0.0773983629565932, + "language_loss": 0.85681164, + "learning_rate": 0.0006239345126826878, + "loss": 0.86770147, + "num_input_tokens_seen": 189742048, + "router_z_loss_mlp": 0.38354492, + "step": 2275, + "time_per_iteration": 2.7522637844085693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084857, + "balance_loss_mlp": 1.04682946, + "epoch": 0.43786071565986917, + "flos": 530709152256.0, + "grad_norm": 0.05397848209837344, + "language_loss": 0.84028137, + "learning_rate": 0.0006236326684633561, + "loss": 0.85112989, + "num_input_tokens_seen": 189817968, + "router_z_loss_mlp": 0.37988281, + "step": 2276, + "time_per_iteration": 2.8013172149658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083155, + "balance_loss_mlp": 1.04479384, + "epoch": 0.43805309734513276, + "flos": 538295473152.0, + "grad_norm": 0.057720697432170794, + "language_loss": 0.74613291, + "learning_rate": 0.0006233307762455658, + "loss": 0.75696445, + "num_input_tokens_seen": 189882608, + "router_z_loss_mlp": 0.38354492, + "step": 2277, + "time_per_iteration": 4.090092658996582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088607, + "balance_loss_mlp": 1.05057979, + "epoch": 0.4382454790303963, + "flos": 864188324352.0, + "grad_norm": 0.052083504639934525, + "language_loss": 0.83232701, + "learning_rate": 0.0006230288361465216, + "loss": 0.84321308, + "num_input_tokens_seen": 189960608, + "router_z_loss_mlp": 0.37988281, + "step": 2278, + "time_per_iteration": 3.0360679626464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092026, + "balance_loss_mlp": 1.05368817, + "epoch": 0.4384378607156599, + "flos": 765175292928.0, + "grad_norm": 0.0765632057362916, + "language_loss": 0.85051048, + "learning_rate": 0.0006227268482834473, + "loss": 0.86143076, + "num_input_tokens_seen": 190035472, + "router_z_loss_mlp": 0.38305664, + "step": 2279, + "time_per_iteration": 2.875603437423706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092125, + "balance_loss_mlp": 1.05369186, + "epoch": 0.4386302424009234, + "flos": 668260329984.0, + "grad_norm": 0.06746087226793605, + "language_loss": 0.87309432, + "learning_rate": 0.000622424812773585, + "loss": 0.88401562, + "num_input_tokens_seen": 190109312, + "router_z_loss_mlp": 0.3840332, + "step": 2280, + "time_per_iteration": 2.815737724304199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091003, + "balance_loss_mlp": 1.05335641, + "epoch": 0.438822624086187, + "flos": 484941247488.0, + "grad_norm": 0.06660247150401381, + "language_loss": 0.7952022, + "learning_rate": 0.000622122729734195, + "loss": 0.80611223, + "num_input_tokens_seen": 190174176, + "router_z_loss_mlp": 0.3762207, + "step": 2281, + "time_per_iteration": 2.528907060623169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010937, + "balance_loss_mlp": 1.05653024, + "epoch": 0.4390150057714506, + "flos": 498959986176.0, + "grad_norm": 0.07198447175498815, + "language_loss": 0.87400854, + "learning_rate": 0.0006218205992825566, + "loss": 0.88494551, + "num_input_tokens_seen": 190243888, + "router_z_loss_mlp": 0.37158203, + "step": 2282, + "time_per_iteration": 2.6437437534332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086909, + "balance_loss_mlp": 1.04895234, + "epoch": 0.4392073874567141, + "flos": 557937704448.0, + "grad_norm": 0.0537918663445124, + "language_loss": 0.81690598, + "learning_rate": 0.0006215184215359671, + "loss": 0.82777506, + "num_input_tokens_seen": 190317504, + "router_z_loss_mlp": 0.37939453, + "step": 2283, + "time_per_iteration": 2.7374680042266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082056, + "balance_loss_mlp": 1.04531598, + "epoch": 0.4393997691419777, + "flos": 605028248064.0, + "grad_norm": 0.053438963610997155, + "language_loss": 0.86718416, + "learning_rate": 0.0006212161966117425, + "loss": 0.87800473, + "num_input_tokens_seen": 190390160, + "router_z_loss_mlp": 0.36743164, + "step": 2284, + "time_per_iteration": 2.7031607627868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082719, + "balance_loss_mlp": 1.04476333, + "epoch": 0.43959215082724123, + "flos": 803812363776.0, + "grad_norm": 0.05414488390239245, + "language_loss": 0.81261152, + "learning_rate": 0.0006209139246272164, + "loss": 0.8234387, + "num_input_tokens_seen": 190467600, + "router_z_loss_mlp": 0.37915039, + "step": 2285, + "time_per_iteration": 2.942938804626465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081504, + "balance_loss_mlp": 1.04354775, + "epoch": 0.4397845325125048, + "flos": 487403080704.0, + "grad_norm": 0.06213580776851028, + "language_loss": 0.8193686, + "learning_rate": 0.0006206116056997421, + "loss": 0.83018363, + "num_input_tokens_seen": 190534192, + "router_z_loss_mlp": 0.37939453, + "step": 2286, + "time_per_iteration": 2.549246072769165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083431, + "balance_loss_mlp": 1.04671431, + "epoch": 0.43997691419776835, + "flos": 480569020416.0, + "grad_norm": 0.047189645190622125, + "language_loss": 0.82737786, + "learning_rate": 0.0006203092399466892, + "loss": 0.83821213, + "num_input_tokens_seen": 190601440, + "router_z_loss_mlp": 0.36694336, + "step": 2287, + "time_per_iteration": 2.533667802810669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079141, + "balance_loss_mlp": 1.04259157, + "epoch": 0.44016929588303194, + "flos": 482873702400.0, + "grad_norm": 0.04521232958061075, + "language_loss": 0.85280973, + "learning_rate": 0.0006200068274854473, + "loss": 0.86360115, + "num_input_tokens_seen": 190672528, + "router_z_loss_mlp": 0.36523438, + "step": 2288, + "time_per_iteration": 2.6336212158203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087714, + "balance_loss_mlp": 1.05013943, + "epoch": 0.4403616775682955, + "flos": 571562155008.0, + "grad_norm": 0.04238785738832165, + "language_loss": 0.85822582, + "learning_rate": 0.0006197043684334229, + "loss": 0.86910295, + "num_input_tokens_seen": 190750704, + "router_z_loss_mlp": 0.37548828, + "step": 2289, + "time_per_iteration": 2.7420616149902344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108734, + "balance_loss_mlp": 1.05028939, + "epoch": 0.44055405925355906, + "flos": 630563194368.0, + "grad_norm": 0.0573866619632787, + "language_loss": 0.79627317, + "learning_rate": 0.0006194018629080411, + "loss": 0.80714655, + "num_input_tokens_seen": 190821664, + "router_z_loss_mlp": 0.37036133, + "step": 2290, + "time_per_iteration": 2.7804791927337646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108993, + "balance_loss_mlp": 1.0514729, + "epoch": 0.44074644093882265, + "flos": 536523291648.0, + "grad_norm": 0.052070709818396434, + "language_loss": 0.81445479, + "learning_rate": 0.0006190993110267451, + "loss": 0.82535404, + "num_input_tokens_seen": 190893888, + "router_z_loss_mlp": 0.38427734, + "step": 2291, + "time_per_iteration": 2.6991255283355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079676, + "balance_loss_mlp": 1.04317451, + "epoch": 0.4409388226240862, + "flos": 462995744256.0, + "grad_norm": 0.05365602748785357, + "language_loss": 0.84155387, + "learning_rate": 0.0006187967129069958, + "loss": 0.85235059, + "num_input_tokens_seen": 190956800, + "router_z_loss_mlp": 0.36523438, + "step": 2292, + "time_per_iteration": 2.558609962463379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082437, + "balance_loss_mlp": 1.04569674, + "epoch": 0.44113120430934977, + "flos": 565717492224.0, + "grad_norm": 0.05065606510830679, + "language_loss": 0.87013716, + "learning_rate": 0.0006184940686662722, + "loss": 0.88096148, + "num_input_tokens_seen": 191032048, + "router_z_loss_mlp": 0.36743164, + "step": 2293, + "time_per_iteration": 2.753314733505249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078141, + "balance_loss_mlp": 1.04125786, + "epoch": 0.4413235859946133, + "flos": 543313681920.0, + "grad_norm": 0.05240936044313176, + "language_loss": 0.89929485, + "learning_rate": 0.0006181913784220714, + "loss": 0.91007626, + "num_input_tokens_seen": 191099952, + "router_z_loss_mlp": 0.36865234, + "step": 2294, + "time_per_iteration": 2.6420986652374268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111961, + "balance_loss_mlp": 1.09889555, + "epoch": 0.4415159676798769, + "flos": 1569016465920.0, + "grad_norm": 0.03544098021349555, + "language_loss": 0.80553782, + "learning_rate": 0.0006178886422919078, + "loss": 0.81665742, + "num_input_tokens_seen": 191335968, + "router_z_loss_mlp": 0.13085938, + "step": 2295, + "time_per_iteration": 4.864506483078003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085965, + "balance_loss_mlp": 1.04831886, + "epoch": 0.4417083493651404, + "flos": 658423171584.0, + "grad_norm": 0.06256258413724265, + "language_loss": 0.79847091, + "learning_rate": 0.0006175858603933146, + "loss": 0.80933058, + "num_input_tokens_seen": 191410112, + "router_z_loss_mlp": 0.3762207, + "step": 2296, + "time_per_iteration": 2.8739333152770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079624, + "balance_loss_mlp": 1.04328871, + "epoch": 0.441900731050404, + "flos": 740119002624.0, + "grad_norm": 0.05454759239937102, + "language_loss": 0.80644178, + "learning_rate": 0.0006172830328438416, + "loss": 0.81723803, + "num_input_tokens_seen": 191491552, + "router_z_loss_mlp": 0.36352539, + "step": 2297, + "time_per_iteration": 2.9661777019500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082208, + "balance_loss_mlp": 1.0437274, + "epoch": 0.44209311273566754, + "flos": 539153860608.0, + "grad_norm": 0.05386131456834753, + "language_loss": 0.87081188, + "learning_rate": 0.0006169801597610572, + "loss": 0.88163394, + "num_input_tokens_seen": 191567872, + "router_z_loss_mlp": 0.38452148, + "step": 2298, + "time_per_iteration": 2.732304573059082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107238, + "balance_loss_mlp": 1.03604531, + "epoch": 0.4422854944209311, + "flos": 621335702016.0, + "grad_norm": 0.07013675434202182, + "language_loss": 0.89663231, + "learning_rate": 0.0006166772412625469, + "loss": 0.90735614, + "num_input_tokens_seen": 191638032, + "router_z_loss_mlp": 0.36328125, + "step": 2299, + "time_per_iteration": 2.70890736579895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075195, + "balance_loss_mlp": 1.03793061, + "epoch": 0.4424778761061947, + "flos": 658516303872.0, + "grad_norm": 0.06419018913135732, + "language_loss": 0.81816357, + "learning_rate": 0.0006163742774659141, + "loss": 0.8289156, + "num_input_tokens_seen": 191709104, + "router_z_loss_mlp": 0.37255859, + "step": 2300, + "time_per_iteration": 2.830306053161621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081661, + "balance_loss_mlp": 1.0454216, + "epoch": 0.44267025779145824, + "flos": 568297188864.0, + "grad_norm": 0.05261241955347018, + "language_loss": 0.85695601, + "learning_rate": 0.0006160712684887801, + "loss": 0.86777264, + "num_input_tokens_seen": 191787072, + "router_z_loss_mlp": 0.36279297, + "step": 2301, + "time_per_iteration": 2.7931785583496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010826, + "balance_loss_mlp": 1.04600239, + "epoch": 0.44286263947672183, + "flos": 496469039616.0, + "grad_norm": 0.05340137710748247, + "language_loss": 0.81907189, + "learning_rate": 0.0006157682144487832, + "loss": 0.82989788, + "num_input_tokens_seen": 191863040, + "router_z_loss_mlp": 0.36572266, + "step": 2302, + "time_per_iteration": 2.7355551719665527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108501, + "balance_loss_mlp": 1.04793596, + "epoch": 0.44305502116198536, + "flos": 609096347136.0, + "grad_norm": 0.060309070663334345, + "language_loss": 0.82788789, + "learning_rate": 0.0006154651154635793, + "loss": 0.83873796, + "num_input_tokens_seen": 191940352, + "router_z_loss_mlp": 0.37084961, + "step": 2303, + "time_per_iteration": 2.8048007488250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088624, + "balance_loss_mlp": 1.05150199, + "epoch": 0.44324740284724895, + "flos": 470558744064.0, + "grad_norm": 0.05169590776144269, + "language_loss": 0.84867418, + "learning_rate": 0.0006151619716508421, + "loss": 0.85956049, + "num_input_tokens_seen": 192006896, + "router_z_loss_mlp": 0.37084961, + "step": 2304, + "time_per_iteration": 2.5419833660125732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087828, + "balance_loss_mlp": 1.05046785, + "epoch": 0.4434397845325125, + "flos": 578454441984.0, + "grad_norm": 0.05720417651641939, + "language_loss": 0.86974978, + "learning_rate": 0.0006148587831282625, + "loss": 0.88062799, + "num_input_tokens_seen": 192075312, + "router_z_loss_mlp": 0.37353516, + "step": 2305, + "time_per_iteration": 2.689751386642456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055473, + "balance_loss_mlp": 1.04326594, + "epoch": 0.44363216621777607, + "flos": 1495765343232.0, + "grad_norm": 0.012762307031937271, + "language_loss": 0.79176068, + "learning_rate": 0.0006145555500135483, + "loss": 0.80231541, + "num_input_tokens_seen": 192304816, + "router_z_loss_mlp": 0.12207031, + "step": 2306, + "time_per_iteration": 4.886535406112671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092659, + "balance_loss_mlp": 1.05699158, + "epoch": 0.44382454790303966, + "flos": 477082884096.0, + "grad_norm": 0.06286570611305137, + "language_loss": 0.86913157, + "learning_rate": 0.0006142522724244255, + "loss": 0.88005817, + "num_input_tokens_seen": 192369232, + "router_z_loss_mlp": 0.35693359, + "step": 2307, + "time_per_iteration": 2.499870777130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054077, + "balance_loss_mlp": 1.04177487, + "epoch": 0.4440169295883032, + "flos": 1543321548288.0, + "grad_norm": 0.013017387525484581, + "language_loss": 0.76484716, + "learning_rate": 0.0006139489504786368, + "loss": 0.775388, + "num_input_tokens_seen": 192600176, + "router_z_loss_mlp": 0.12255859, + "step": 2308, + "time_per_iteration": 4.8646886348724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087414, + "balance_loss_mlp": 1.05115092, + "epoch": 0.4442093112735668, + "flos": 590789749248.0, + "grad_norm": 0.050195382328210664, + "language_loss": 0.77274799, + "learning_rate": 0.000613645584293942, + "loss": 0.78362215, + "num_input_tokens_seen": 192675424, + "router_z_loss_mlp": 0.36279297, + "step": 2309, + "time_per_iteration": 2.877244472503662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087256, + "balance_loss_mlp": 1.05056334, + "epoch": 0.4444016929588303, + "flos": 530009326080.0, + "grad_norm": 0.047114011401622066, + "language_loss": 0.83068305, + "learning_rate": 0.0006133421739881185, + "loss": 0.8415556, + "num_input_tokens_seen": 192747552, + "router_z_loss_mlp": 0.36694336, + "step": 2310, + "time_per_iteration": 2.667240858078003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081979, + "balance_loss_mlp": 1.04557252, + "epoch": 0.4445940746440939, + "flos": 619947634176.0, + "grad_norm": 0.055208144480819774, + "language_loss": 0.82587862, + "learning_rate": 0.0006130387196789605, + "loss": 0.83669835, + "num_input_tokens_seen": 192819984, + "router_z_loss_mlp": 0.36425781, + "step": 2311, + "time_per_iteration": 2.7925667762756348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082597, + "balance_loss_mlp": 1.04704881, + "epoch": 0.4447864563293574, + "flos": 628782248448.0, + "grad_norm": 0.049856185775691036, + "language_loss": 0.83914995, + "learning_rate": 0.0006127352214842795, + "loss": 0.84997582, + "num_input_tokens_seen": 192906080, + "router_z_loss_mlp": 0.35571289, + "step": 2312, + "time_per_iteration": 2.9495813846588135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079657, + "balance_loss_mlp": 1.04236865, + "epoch": 0.444978838014621, + "flos": 650548841472.0, + "grad_norm": 0.0527905378587152, + "language_loss": 0.85049295, + "learning_rate": 0.0006124316795219041, + "loss": 0.8612895, + "num_input_tokens_seen": 192972336, + "router_z_loss_mlp": 0.37255859, + "step": 2313, + "time_per_iteration": 2.760117769241333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077597, + "balance_loss_mlp": 1.04119062, + "epoch": 0.44517121969988455, + "flos": 612153289728.0, + "grad_norm": 0.047764928605774304, + "language_loss": 0.82297838, + "learning_rate": 0.0006121280939096794, + "loss": 0.8337543, + "num_input_tokens_seen": 193045744, + "router_z_loss_mlp": 0.36401367, + "step": 2314, + "time_per_iteration": 2.737471580505371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075674, + "balance_loss_mlp": 1.0385046, + "epoch": 0.44536360138514813, + "flos": 488491402752.0, + "grad_norm": 0.07620217918322614, + "language_loss": 0.87685931, + "learning_rate": 0.000611824464765468, + "loss": 0.88761604, + "num_input_tokens_seen": 193115248, + "router_z_loss_mlp": 0.37133789, + "step": 2315, + "time_per_iteration": 2.5991926193237305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0103119, + "balance_loss_mlp": 1.01922143, + "epoch": 0.4455559830704117, + "flos": 1515425255424.0, + "grad_norm": 0.013293348061684912, + "language_loss": 0.78594941, + "learning_rate": 0.0006115207922071492, + "loss": 0.79626131, + "num_input_tokens_seen": 193330816, + "router_z_loss_mlp": 0.11962891, + "step": 2316, + "time_per_iteration": 4.652711391448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079239, + "balance_loss_mlp": 1.04335713, + "epoch": 0.44574836475567525, + "flos": 615314949120.0, + "grad_norm": 0.04747333782009751, + "language_loss": 0.85680878, + "learning_rate": 0.000611217076352619, + "loss": 0.86760116, + "num_input_tokens_seen": 193407616, + "router_z_loss_mlp": 0.35913086, + "step": 2317, + "time_per_iteration": 2.7729227542877197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077005, + "balance_loss_mlp": 1.04140949, + "epoch": 0.44594074644093884, + "flos": 506070471168.0, + "grad_norm": 0.2761075259266177, + "language_loss": 0.82980591, + "learning_rate": 0.0006109133173197905, + "loss": 0.84057599, + "num_input_tokens_seen": 193482624, + "router_z_loss_mlp": 0.35620117, + "step": 2318, + "time_per_iteration": 2.6684277057647705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087623, + "balance_loss_mlp": 1.05243218, + "epoch": 0.44613312812620237, + "flos": 726647321088.0, + "grad_norm": 0.057083346058123784, + "language_loss": 0.85251284, + "learning_rate": 0.0006106095152265935, + "loss": 0.86338907, + "num_input_tokens_seen": 193555952, + "router_z_loss_mlp": 0.35229492, + "step": 2319, + "time_per_iteration": 2.9197404384613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092234, + "balance_loss_mlp": 1.05604196, + "epoch": 0.44632550981146596, + "flos": 635419869696.0, + "grad_norm": 0.048967973341694476, + "language_loss": 0.8448627, + "learning_rate": 0.0006103056701909739, + "loss": 0.85578501, + "num_input_tokens_seen": 193636672, + "router_z_loss_mlp": 0.36230469, + "step": 2320, + "time_per_iteration": 2.885965347290039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101974, + "balance_loss_mlp": 1.06604421, + "epoch": 0.4465178914967295, + "flos": 826690447872.0, + "grad_norm": 0.04429440839494469, + "language_loss": 0.82779431, + "learning_rate": 0.0006100017823308956, + "loss": 0.83881408, + "num_input_tokens_seen": 193721728, + "router_z_loss_mlp": 0.35961914, + "step": 2321, + "time_per_iteration": 3.1523914337158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110877, + "balance_loss_mlp": 1.0737319, + "epoch": 0.4467102731819931, + "flos": 665532246528.0, + "grad_norm": 0.05773147459468349, + "language_loss": 0.79802787, + "learning_rate": 0.0006096978517643377, + "loss": 0.80913663, + "num_input_tokens_seen": 193795456, + "router_z_loss_mlp": 0.37158203, + "step": 2322, + "time_per_iteration": 2.8030614852905273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123327, + "balance_loss_mlp": 1.08668184, + "epoch": 0.4469026548672566, + "flos": 512692125696.0, + "grad_norm": 0.052696901781691036, + "language_loss": 0.83731532, + "learning_rate": 0.0006093938786092968, + "loss": 0.84854853, + "num_input_tokens_seen": 193865520, + "router_z_loss_mlp": 0.3659668, + "step": 2323, + "time_per_iteration": 2.6108593940734863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112741, + "balance_loss_mlp": 1.0761435, + "epoch": 0.4470950365525202, + "flos": 683774825472.0, + "grad_norm": 0.0683875942547517, + "language_loss": 0.89724207, + "learning_rate": 0.0006090898629837857, + "loss": 0.90836942, + "num_input_tokens_seen": 193935040, + "router_z_loss_mlp": 0.3659668, + "step": 2324, + "time_per_iteration": 2.8141510486602783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121866, + "balance_loss_mlp": 1.08515, + "epoch": 0.4472874182377838, + "flos": 627018831360.0, + "grad_norm": 0.05799026068482576, + "language_loss": 0.87375617, + "learning_rate": 0.0006087858050058337, + "loss": 0.88497484, + "num_input_tokens_seen": 194009120, + "router_z_loss_mlp": 0.3671875, + "step": 2325, + "time_per_iteration": 2.8174242973327637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106391, + "balance_loss_mlp": 1.07053268, + "epoch": 0.4474797999230473, + "flos": 546946795008.0, + "grad_norm": 0.06107345330372946, + "language_loss": 0.81985253, + "learning_rate": 0.0006084817047934866, + "loss": 0.8309164, + "num_input_tokens_seen": 194076672, + "router_z_loss_mlp": 0.35888672, + "step": 2326, + "time_per_iteration": 2.627870798110962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111871, + "balance_loss_mlp": 1.08211279, + "epoch": 0.4476721816083109, + "flos": 455585513472.0, + "grad_norm": 0.09021260210248909, + "language_loss": 0.89277744, + "learning_rate": 0.0006081775624648066, + "loss": 0.90396452, + "num_input_tokens_seen": 194142320, + "router_z_loss_mlp": 0.3659668, + "step": 2327, + "time_per_iteration": 2.517587900161743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107902, + "balance_loss_mlp": 1.07154357, + "epoch": 0.44786456329357444, + "flos": 481273228800.0, + "grad_norm": 0.05788938613905733, + "language_loss": 0.8277235, + "learning_rate": 0.0006078733781378721, + "loss": 0.83880252, + "num_input_tokens_seen": 194208560, + "router_z_loss_mlp": 0.36401367, + "step": 2328, + "time_per_iteration": 2.5216193199157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102085, + "balance_loss_mlp": 1.06579816, + "epoch": 0.448056944978838, + "flos": 551822409216.0, + "grad_norm": 0.05774471450654044, + "language_loss": 0.82095438, + "learning_rate": 0.0006075691519307781, + "loss": 0.83197522, + "num_input_tokens_seen": 194288080, + "router_z_loss_mlp": 0.36303711, + "step": 2329, + "time_per_iteration": 2.8394477367401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093614, + "balance_loss_mlp": 1.05551517, + "epoch": 0.44824932666410156, + "flos": 550571143680.0, + "grad_norm": 0.05485541452922095, + "language_loss": 0.82042563, + "learning_rate": 0.0006072648839616356, + "loss": 0.83136177, + "num_input_tokens_seen": 194358464, + "router_z_loss_mlp": 0.38061523, + "step": 2330, + "time_per_iteration": 2.650087594985962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089159, + "balance_loss_mlp": 1.05229926, + "epoch": 0.44844170834936514, + "flos": 988161541632.0, + "grad_norm": 0.0454185508799419, + "language_loss": 0.82814097, + "learning_rate": 0.0006069605743485718, + "loss": 0.83903253, + "num_input_tokens_seen": 194456112, + "router_z_loss_mlp": 0.3684082, + "step": 2331, + "time_per_iteration": 3.345179319381714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085878, + "balance_loss_mlp": 1.0494473, + "epoch": 0.44863409003462873, + "flos": 591040032768.0, + "grad_norm": 0.057018102026312835, + "language_loss": 0.83470714, + "learning_rate": 0.0006066562232097303, + "loss": 0.84556592, + "num_input_tokens_seen": 194526880, + "router_z_loss_mlp": 0.36425781, + "step": 2332, + "time_per_iteration": 2.7025153636932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089148, + "balance_loss_mlp": 1.0525744, + "epoch": 0.44882647171989226, + "flos": 724313525760.0, + "grad_norm": 0.055435808375502424, + "language_loss": 0.86104345, + "learning_rate": 0.0006063518306632708, + "loss": 0.87193495, + "num_input_tokens_seen": 194606800, + "router_z_loss_mlp": 0.36572266, + "step": 2333, + "time_per_iteration": 2.934469699859619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082178, + "balance_loss_mlp": 1.04465127, + "epoch": 0.44901885340515585, + "flos": 534662360064.0, + "grad_norm": 0.061394686563490536, + "language_loss": 0.82313985, + "learning_rate": 0.0006060473968273688, + "loss": 0.83396161, + "num_input_tokens_seen": 194679856, + "router_z_loss_mlp": 0.375, + "step": 2334, + "time_per_iteration": 2.6561286449432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139417, + "balance_loss_mlp": 1.12782979, + "epoch": 0.4492112350904194, + "flos": 1554456462336.0, + "grad_norm": 0.048192148717983975, + "language_loss": 0.77879542, + "learning_rate": 0.000605742921820216, + "loss": 0.79018956, + "num_input_tokens_seen": 194906320, + "router_z_loss_mlp": 0.11572266, + "step": 2335, + "time_per_iteration": 4.895314693450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092745, + "balance_loss_mlp": 1.08144426, + "epoch": 0.44940361677568297, + "flos": 1522525413888.0, + "grad_norm": 0.0355581806637232, + "language_loss": 0.81005216, + "learning_rate": 0.0006054384057600202, + "loss": 0.8209796, + "num_input_tokens_seen": 195129152, + "router_z_loss_mlp": 0.11279297, + "step": 2336, + "time_per_iteration": 4.86665940284729 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088054, + "balance_loss_mlp": 1.05064595, + "epoch": 0.4495959984609465, + "flos": 382289310720.0, + "grad_norm": 0.06064477802371089, + "language_loss": 0.88117951, + "learning_rate": 0.0006051338487650047, + "loss": 0.89206004, + "num_input_tokens_seen": 195189792, + "router_z_loss_mlp": 0.3737793, + "step": 2337, + "time_per_iteration": 2.4159162044525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086135, + "balance_loss_mlp": 1.04777336, + "epoch": 0.4497883801462101, + "flos": 497630145024.0, + "grad_norm": 0.058257925131248826, + "language_loss": 0.82456082, + "learning_rate": 0.0006048292509534095, + "loss": 0.83542222, + "num_input_tokens_seen": 195258640, + "router_z_loss_mlp": 0.38354492, + "step": 2338, + "time_per_iteration": 2.5835769176483154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081051, + "balance_loss_mlp": 1.04392958, + "epoch": 0.4499807618314736, + "flos": 614166990336.0, + "grad_norm": 0.053787147945734054, + "language_loss": 0.77580249, + "learning_rate": 0.0006045246124434895, + "loss": 0.78661299, + "num_input_tokens_seen": 195327984, + "router_z_loss_mlp": 0.37109375, + "step": 2339, + "time_per_iteration": 2.7258870601654053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080837, + "balance_loss_mlp": 1.04311895, + "epoch": 0.4501731435167372, + "flos": 1005122331648.0, + "grad_norm": 0.06446556175990359, + "language_loss": 0.86143219, + "learning_rate": 0.0006042199333535162, + "loss": 0.87224054, + "num_input_tokens_seen": 195409504, + "router_z_loss_mlp": 0.37695312, + "step": 2340, + "time_per_iteration": 3.2644054889678955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089021, + "balance_loss_mlp": 1.05132723, + "epoch": 0.4503655252020008, + "flos": 820519898112.0, + "grad_norm": 0.05440597484835576, + "language_loss": 0.8378191, + "learning_rate": 0.0006039152138017763, + "loss": 0.84870934, + "num_input_tokens_seen": 195489424, + "router_z_loss_mlp": 0.37695312, + "step": 2341, + "time_per_iteration": 3.0747756958007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082643, + "balance_loss_mlp": 1.04566467, + "epoch": 0.4505579068872643, + "flos": 486113937408.0, + "grad_norm": 0.06051531382505287, + "language_loss": 0.83470345, + "learning_rate": 0.0006036104539065726, + "loss": 0.84552985, + "num_input_tokens_seen": 195562128, + "router_z_loss_mlp": 0.36962891, + "step": 2342, + "time_per_iteration": 2.6581151485443115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076125, + "balance_loss_mlp": 1.03812099, + "epoch": 0.4507502885725279, + "flos": 884421282816.0, + "grad_norm": 0.05288539322407846, + "language_loss": 0.845487, + "learning_rate": 0.000603305653786223, + "loss": 0.85624826, + "num_input_tokens_seen": 195646800, + "router_z_loss_mlp": 0.37963867, + "step": 2343, + "time_per_iteration": 3.1298844814300537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079349, + "balance_loss_mlp": 1.04208446, + "epoch": 0.45094267025779144, + "flos": 578070328320.0, + "grad_norm": 0.04730162576611683, + "language_loss": 0.83859873, + "learning_rate": 0.0006030008135590622, + "loss": 0.84939224, + "num_input_tokens_seen": 195719648, + "router_z_loss_mlp": 0.37255859, + "step": 2344, + "time_per_iteration": 2.685067892074585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107533, + "balance_loss_mlp": 1.03799331, + "epoch": 0.45113505194305503, + "flos": 525124947456.0, + "grad_norm": 0.051192045733620226, + "language_loss": 0.80228901, + "learning_rate": 0.0006026959333434387, + "loss": 0.81304228, + "num_input_tokens_seen": 195794800, + "router_z_loss_mlp": 0.37353516, + "step": 2345, + "time_per_iteration": 2.783407688140869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107739, + "balance_loss_mlp": 1.04014897, + "epoch": 0.45132743362831856, + "flos": 501791376384.0, + "grad_norm": 0.05199160611628431, + "language_loss": 0.77699506, + "learning_rate": 0.0006023910132577181, + "loss": 0.78776896, + "num_input_tokens_seen": 195866848, + "router_z_loss_mlp": 0.37207031, + "step": 2346, + "time_per_iteration": 2.646801233291626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077398, + "balance_loss_mlp": 1.03968024, + "epoch": 0.45151981531358215, + "flos": 431690328576.0, + "grad_norm": 0.04922592508563583, + "language_loss": 0.84707314, + "learning_rate": 0.0006020860534202806, + "loss": 0.85784709, + "num_input_tokens_seen": 195930640, + "router_z_loss_mlp": 0.37670898, + "step": 2347, + "time_per_iteration": 2.4788920879364014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078489, + "balance_loss_mlp": 1.04036641, + "epoch": 0.4517121969988457, + "flos": 711826859520.0, + "grad_norm": 0.07725824631471088, + "language_loss": 0.80951411, + "learning_rate": 0.0006017810539495224, + "loss": 0.82029903, + "num_input_tokens_seen": 196014240, + "router_z_loss_mlp": 0.38110352, + "step": 2348, + "time_per_iteration": 3.013258934020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071848, + "balance_loss_mlp": 1.03587079, + "epoch": 0.45190457868410927, + "flos": 579197938176.0, + "grad_norm": 0.052394100693581906, + "language_loss": 0.82200068, + "learning_rate": 0.0006014760149638547, + "loss": 0.83271921, + "num_input_tokens_seen": 196083296, + "router_z_loss_mlp": 0.35986328, + "step": 2349, + "time_per_iteration": 2.6988728046417236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073469, + "balance_loss_mlp": 1.03823042, + "epoch": 0.45209696036937286, + "flos": 482415395328.0, + "grad_norm": 0.04812495303687425, + "language_loss": 0.88394493, + "learning_rate": 0.000601170936581704, + "loss": 0.89467961, + "num_input_tokens_seen": 196147840, + "router_z_loss_mlp": 0.35253906, + "step": 2350, + "time_per_iteration": 2.5537099838256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108294, + "balance_loss_mlp": 1.04617548, + "epoch": 0.4522893420546364, + "flos": 539945409024.0, + "grad_norm": 0.059990427154632556, + "language_loss": 0.84346575, + "learning_rate": 0.0006008658189215121, + "loss": 0.85429513, + "num_input_tokens_seen": 196219008, + "router_z_loss_mlp": 0.36767578, + "step": 2351, + "time_per_iteration": 2.649442434310913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084325, + "balance_loss_mlp": 1.04803789, + "epoch": 0.4524817237399, + "flos": 496423959552.0, + "grad_norm": 0.09153462549619036, + "language_loss": 0.7966159, + "learning_rate": 0.0006005606621017366, + "loss": 0.80745912, + "num_input_tokens_seen": 196287792, + "router_z_loss_mlp": 0.36328125, + "step": 2352, + "time_per_iteration": 2.55026912689209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086533, + "balance_loss_mlp": 1.04891062, + "epoch": 0.4526741054251635, + "flos": 652229300736.0, + "grad_norm": 0.05116414037173521, + "language_loss": 0.80266565, + "learning_rate": 0.0006002554662408496, + "loss": 0.81353092, + "num_input_tokens_seen": 196371776, + "router_z_loss_mlp": 0.3762207, + "step": 2353, + "time_per_iteration": 2.8708717823028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089479, + "balance_loss_mlp": 1.05259538, + "epoch": 0.4528664871104271, + "flos": 570674654208.0, + "grad_norm": 0.05934636879993742, + "language_loss": 0.91137719, + "learning_rate": 0.0005999502314573388, + "loss": 0.92227197, + "num_input_tokens_seen": 196441840, + "router_z_loss_mlp": 0.36865234, + "step": 2354, + "time_per_iteration": 2.636732339859009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091866, + "balance_loss_mlp": 1.05424321, + "epoch": 0.45305886879569063, + "flos": 458480922624.0, + "grad_norm": 0.06511026561582739, + "language_loss": 0.85993183, + "learning_rate": 0.0005996449578697066, + "loss": 0.87085044, + "num_input_tokens_seen": 196510464, + "router_z_loss_mlp": 0.3762207, + "step": 2355, + "time_per_iteration": 2.6497340202331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095767, + "balance_loss_mlp": 1.05916929, + "epoch": 0.4532512504809542, + "flos": 504922512384.0, + "grad_norm": 0.05408585590104452, + "language_loss": 0.81462455, + "learning_rate": 0.0005993396455964709, + "loss": 0.82558227, + "num_input_tokens_seen": 196583888, + "router_z_loss_mlp": 0.36621094, + "step": 2356, + "time_per_iteration": 2.67404842376709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090921, + "balance_loss_mlp": 1.05360866, + "epoch": 0.4534436321662178, + "flos": 581940578304.0, + "grad_norm": 0.046652791791384825, + "language_loss": 0.81415474, + "learning_rate": 0.0005990342947561647, + "loss": 0.82506394, + "num_input_tokens_seen": 196652816, + "router_z_loss_mlp": 0.37304688, + "step": 2357, + "time_per_iteration": 2.694093942642212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092731, + "balance_loss_mlp": 1.05577612, + "epoch": 0.45363601385148133, + "flos": 549458090496.0, + "grad_norm": 0.05811050095266086, + "language_loss": 0.77914369, + "learning_rate": 0.0005987289054673351, + "loss": 0.79007101, + "num_input_tokens_seen": 196720208, + "router_z_loss_mlp": 0.36987305, + "step": 2358, + "time_per_iteration": 2.6171157360076904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187917, + "balance_loss_mlp": 1.16912949, + "epoch": 0.4538283955367449, + "flos": 1473754411008.0, + "grad_norm": 0.03301673104438644, + "language_loss": 0.76575738, + "learning_rate": 0.0005984234778485451, + "loss": 0.77763653, + "num_input_tokens_seen": 196947696, + "router_z_loss_mlp": 0.1875, + "step": 2359, + "time_per_iteration": 4.821492910385132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096443, + "balance_loss_mlp": 1.05986929, + "epoch": 0.45402077722200845, + "flos": 584441699328.0, + "grad_norm": 0.059282629275687046, + "language_loss": 0.91217041, + "learning_rate": 0.0005981180120183722, + "loss": 0.92313486, + "num_input_tokens_seen": 197015712, + "router_z_loss_mlp": 0.36572266, + "step": 2360, + "time_per_iteration": 2.6678080558776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109692, + "balance_loss_mlp": 1.05901098, + "epoch": 0.45421315890727204, + "flos": 531462822912.0, + "grad_norm": 0.0444268091974553, + "language_loss": 0.85307455, + "learning_rate": 0.0005978125080954089, + "loss": 0.86404377, + "num_input_tokens_seen": 197094880, + "router_z_loss_mlp": 0.37915039, + "step": 2361, + "time_per_iteration": 2.7723591327667236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093514, + "balance_loss_mlp": 1.05651164, + "epoch": 0.4544055405925356, + "flos": 784890307584.0, + "grad_norm": 0.08031817047800895, + "language_loss": 0.7639026, + "learning_rate": 0.000597506966198262, + "loss": 0.77483773, + "num_input_tokens_seen": 197176448, + "router_z_loss_mlp": 0.36987305, + "step": 2362, + "time_per_iteration": 2.9897196292877197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109078, + "balance_loss_mlp": 1.05389667, + "epoch": 0.45459792227779916, + "flos": 517950443520.0, + "grad_norm": 0.07752194494873299, + "language_loss": 0.84128416, + "learning_rate": 0.0005972013864455536, + "loss": 0.85219198, + "num_input_tokens_seen": 197243520, + "router_z_loss_mlp": 0.36914062, + "step": 2363, + "time_per_iteration": 2.580357074737549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091271, + "balance_loss_mlp": 1.05515027, + "epoch": 0.4547903039630627, + "flos": 537306075648.0, + "grad_norm": 0.05808697989569881, + "language_loss": 0.85570788, + "learning_rate": 0.0005968957689559203, + "loss": 0.8666206, + "num_input_tokens_seen": 197311536, + "router_z_loss_mlp": 0.36132812, + "step": 2364, + "time_per_iteration": 2.64911150932312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095782, + "balance_loss_mlp": 1.05997205, + "epoch": 0.4549826856483263, + "flos": 528423409152.0, + "grad_norm": 0.05494979115149378, + "language_loss": 0.88544732, + "learning_rate": 0.0005965901138480131, + "loss": 0.8964051, + "num_input_tokens_seen": 197382752, + "router_z_loss_mlp": 0.35839844, + "step": 2365, + "time_per_iteration": 2.61967396736145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100343, + "balance_loss_mlp": 1.06379294, + "epoch": 0.45517506733358987, + "flos": 520649413632.0, + "grad_norm": 0.0583285525672419, + "language_loss": 0.87046576, + "learning_rate": 0.0005962844212404982, + "loss": 0.88146913, + "num_input_tokens_seen": 197456592, + "router_z_loss_mlp": 0.36547852, + "step": 2366, + "time_per_iteration": 2.663799524307251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108056, + "balance_loss_mlp": 1.07067156, + "epoch": 0.4553674490188534, + "flos": 450814616064.0, + "grad_norm": 0.06095483853323617, + "language_loss": 0.86969483, + "learning_rate": 0.0005959786912520558, + "loss": 0.88077545, + "num_input_tokens_seen": 197525408, + "router_z_loss_mlp": 0.37353516, + "step": 2367, + "time_per_iteration": 2.604011058807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104168, + "balance_loss_mlp": 1.06740427, + "epoch": 0.455559830704117, + "flos": 546308015616.0, + "grad_norm": 0.04613637765687707, + "language_loss": 0.83717126, + "learning_rate": 0.0005956729240013806, + "loss": 0.84821296, + "num_input_tokens_seen": 197608480, + "router_z_loss_mlp": 0.36743164, + "step": 2368, + "time_per_iteration": 2.7852706909179688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110558, + "balance_loss_mlp": 1.06917334, + "epoch": 0.4557522123893805, + "flos": 583491589632.0, + "grad_norm": 0.05161395773765414, + "language_loss": 0.91501808, + "learning_rate": 0.0005953671196071824, + "loss": 0.92607391, + "num_input_tokens_seen": 197678416, + "router_z_loss_mlp": 0.36401367, + "step": 2369, + "time_per_iteration": 2.7515223026275635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110279, + "balance_loss_mlp": 1.06681311, + "epoch": 0.4559445940746441, + "flos": 526149250560.0, + "grad_norm": 0.05240938085212211, + "language_loss": 0.80084532, + "learning_rate": 0.0005950612781881846, + "loss": 0.8118732, + "num_input_tokens_seen": 197753424, + "router_z_loss_mlp": 0.35986328, + "step": 2370, + "time_per_iteration": 2.6867175102233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096163, + "balance_loss_mlp": 1.05873156, + "epoch": 0.45613697575990764, + "flos": 651810281472.0, + "grad_norm": 0.06280114629685846, + "language_loss": 0.7594825, + "learning_rate": 0.0005947553998631259, + "loss": 0.77044415, + "num_input_tokens_seen": 197832080, + "router_z_loss_mlp": 0.37451172, + "step": 2371, + "time_per_iteration": 2.8399033546447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096425, + "balance_loss_mlp": 1.05985141, + "epoch": 0.4563293574451712, + "flos": 866744699904.0, + "grad_norm": 0.04396235367342953, + "language_loss": 0.78598678, + "learning_rate": 0.000594449484750758, + "loss": 0.79695106, + "num_input_tokens_seen": 197919536, + "router_z_loss_mlp": 0.36572266, + "step": 2372, + "time_per_iteration": 3.140890121459961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088429, + "balance_loss_mlp": 1.05140269, + "epoch": 0.45652173913043476, + "flos": 497817819648.0, + "grad_norm": 0.06709411136792778, + "language_loss": 0.82665753, + "learning_rate": 0.0005941435329698484, + "loss": 0.83754182, + "num_input_tokens_seen": 197991872, + "router_z_loss_mlp": 0.36987305, + "step": 2373, + "time_per_iteration": 2.6316027641296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089126, + "balance_loss_mlp": 1.05238533, + "epoch": 0.45671412081569834, + "flos": 560581420032.0, + "grad_norm": 0.05173954705628188, + "language_loss": 0.82881534, + "learning_rate": 0.0005938375446391778, + "loss": 0.83970654, + "num_input_tokens_seen": 198063392, + "router_z_loss_mlp": 0.36743164, + "step": 2374, + "time_per_iteration": 2.6999659538269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096506, + "balance_loss_mlp": 1.05823994, + "epoch": 0.45690650250096193, + "flos": 502873906176.0, + "grad_norm": 0.06488189122368912, + "language_loss": 0.88693655, + "learning_rate": 0.0005935315198775415, + "loss": 0.89790159, + "num_input_tokens_seen": 198131232, + "router_z_loss_mlp": 0.38232422, + "step": 2375, + "time_per_iteration": 2.584855556488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084023, + "balance_loss_mlp": 1.04675794, + "epoch": 0.45709888418622546, + "flos": 430473968640.0, + "grad_norm": 0.054054227258136585, + "language_loss": 0.86900407, + "learning_rate": 0.0005932254588037486, + "loss": 0.87984431, + "num_input_tokens_seen": 198194944, + "router_z_loss_mlp": 0.37207031, + "step": 2376, + "time_per_iteration": 2.4713377952575684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087126, + "balance_loss_mlp": 1.04907441, + "epoch": 0.45729126587148905, + "flos": 525395579904.0, + "grad_norm": 0.22673198102288197, + "language_loss": 0.86219609, + "learning_rate": 0.000592919361536623, + "loss": 0.87306732, + "num_input_tokens_seen": 198265728, + "router_z_loss_mlp": 0.38037109, + "step": 2377, + "time_per_iteration": 2.6324362754821777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074984, + "balance_loss_mlp": 1.03821993, + "epoch": 0.4574836475567526, + "flos": 637717349376.0, + "grad_norm": 0.06562895013351942, + "language_loss": 0.88980031, + "learning_rate": 0.0005926132281950017, + "loss": 0.90055019, + "num_input_tokens_seen": 198336640, + "router_z_loss_mlp": 0.36767578, + "step": 2378, + "time_per_iteration": 2.7336690425872803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079258, + "balance_loss_mlp": 1.04194546, + "epoch": 0.45767602924201617, + "flos": 649288811520.0, + "grad_norm": 0.05221471992659685, + "language_loss": 0.84916019, + "learning_rate": 0.0005923070588977367, + "loss": 0.85995281, + "num_input_tokens_seen": 198413552, + "router_z_loss_mlp": 0.37280273, + "step": 2379, + "time_per_iteration": 2.796694755554199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073672, + "balance_loss_mlp": 1.03745568, + "epoch": 0.4578684109272797, + "flos": 746356543488.0, + "grad_norm": 0.05948192069014845, + "language_loss": 0.86265379, + "learning_rate": 0.0005920008537636931, + "loss": 0.8733905, + "num_input_tokens_seen": 198490864, + "router_z_loss_mlp": 0.36230469, + "step": 2380, + "time_per_iteration": 2.919175863265991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073893, + "balance_loss_mlp": 1.03734303, + "epoch": 0.4580607926125433, + "flos": 641155433472.0, + "grad_norm": 0.07082348059879481, + "language_loss": 0.86767799, + "learning_rate": 0.0005916946129117504, + "loss": 0.8784169, + "num_input_tokens_seen": 198571200, + "router_z_loss_mlp": 0.36523438, + "step": 2381, + "time_per_iteration": 2.8834073543548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076603, + "balance_loss_mlp": 1.03983903, + "epoch": 0.4582531742978069, + "flos": 801513474048.0, + "grad_norm": 0.06015762492268947, + "language_loss": 0.80385733, + "learning_rate": 0.0005913883364608017, + "loss": 0.81462336, + "num_input_tokens_seen": 198658624, + "router_z_loss_mlp": 0.36791992, + "step": 2382, + "time_per_iteration": 3.05711030960083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077489, + "balance_loss_mlp": 1.03984237, + "epoch": 0.4584455559830704, + "flos": 683991613440.0, + "grad_norm": 0.05122280126715116, + "language_loss": 0.88575673, + "learning_rate": 0.0005910820245297542, + "loss": 0.89653164, + "num_input_tokens_seen": 198731312, + "router_z_loss_mlp": 0.37646484, + "step": 2383, + "time_per_iteration": 2.8739712238311768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107409, + "balance_loss_mlp": 1.03682566, + "epoch": 0.458637937668334, + "flos": 517902391296.0, + "grad_norm": 0.06830932289634356, + "language_loss": 0.80442882, + "learning_rate": 0.000590775677237529, + "loss": 0.81516975, + "num_input_tokens_seen": 198805296, + "router_z_loss_mlp": 0.37231445, + "step": 2384, + "time_per_iteration": 2.7162787914276123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083903, + "balance_loss_mlp": 1.04585159, + "epoch": 0.4588303193535975, + "flos": 505242607104.0, + "grad_norm": 0.06045305543182838, + "language_loss": 0.80110037, + "learning_rate": 0.0005904692947030601, + "loss": 0.81193942, + "num_input_tokens_seen": 198872112, + "router_z_loss_mlp": 0.38012695, + "step": 2385, + "time_per_iteration": 2.615645408630371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077094, + "balance_loss_mlp": 1.04054475, + "epoch": 0.4590227010388611, + "flos": 495655732224.0, + "grad_norm": 0.07817461665700527, + "language_loss": 0.89474368, + "learning_rate": 0.0005901628770452963, + "loss": 0.90551466, + "num_input_tokens_seen": 198938480, + "router_z_loss_mlp": 0.36572266, + "step": 2386, + "time_per_iteration": 2.545145273208618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076838, + "balance_loss_mlp": 1.03952503, + "epoch": 0.45921508272412465, + "flos": 493375781376.0, + "grad_norm": 0.05719900676000999, + "language_loss": 0.87518173, + "learning_rate": 0.000589856424383199, + "loss": 0.88595015, + "num_input_tokens_seen": 199008608, + "router_z_loss_mlp": 0.37280273, + "step": 2387, + "time_per_iteration": 2.5866873264312744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078911, + "balance_loss_mlp": 1.04000092, + "epoch": 0.45940746440938823, + "flos": 691096306176.0, + "grad_norm": 0.05272732350360167, + "language_loss": 0.82854474, + "learning_rate": 0.000589549936835744, + "loss": 0.83933389, + "num_input_tokens_seen": 199084592, + "router_z_loss_mlp": 0.38867188, + "step": 2388, + "time_per_iteration": 2.886815309524536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082474, + "balance_loss_mlp": 1.04485154, + "epoch": 0.45959984609465176, + "flos": 503489364480.0, + "grad_norm": 0.061476086167368736, + "language_loss": 0.79490817, + "learning_rate": 0.0005892434145219202, + "loss": 0.80573285, + "num_input_tokens_seen": 199151504, + "router_z_loss_mlp": 0.37597656, + "step": 2389, + "time_per_iteration": 2.669055461883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078287, + "balance_loss_mlp": 1.04035497, + "epoch": 0.45979222777991535, + "flos": 676339863552.0, + "grad_norm": 0.13998924312013794, + "language_loss": 0.82966721, + "learning_rate": 0.0005889368575607303, + "loss": 0.84045005, + "num_input_tokens_seen": 199224528, + "router_z_loss_mlp": 0.37890625, + "step": 2390, + "time_per_iteration": 2.8364429473876953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075252, + "balance_loss_mlp": 1.03941786, + "epoch": 0.45998460946517894, + "flos": 777308368896.0, + "grad_norm": 0.05472501976139028, + "language_loss": 0.78496212, + "learning_rate": 0.00058863026607119, + "loss": 0.79571462, + "num_input_tokens_seen": 199312512, + "router_z_loss_mlp": 0.35864258, + "step": 2391, + "time_per_iteration": 3.104703664779663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078215, + "balance_loss_mlp": 1.04059267, + "epoch": 0.46017699115044247, + "flos": 851073053184.0, + "grad_norm": 0.06149888926191146, + "language_loss": 0.79584855, + "learning_rate": 0.0005883236401723287, + "loss": 0.80663073, + "num_input_tokens_seen": 199397216, + "router_z_loss_mlp": 0.37597656, + "step": 2392, + "time_per_iteration": 3.1967198848724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074328, + "balance_loss_mlp": 1.03603745, + "epoch": 0.46036937283570606, + "flos": 575608495104.0, + "grad_norm": 0.05401888737183198, + "language_loss": 0.84525239, + "learning_rate": 0.0005880169799831893, + "loss": 0.85599566, + "num_input_tokens_seen": 199464288, + "router_z_loss_mlp": 0.3828125, + "step": 2393, + "time_per_iteration": 2.6700267791748047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078621, + "balance_loss_mlp": 1.04049826, + "epoch": 0.4605617545209696, + "flos": 611553798144.0, + "grad_norm": 0.04760801272162673, + "language_loss": 0.81405449, + "learning_rate": 0.0005877102856228278, + "loss": 0.82484066, + "num_input_tokens_seen": 199538096, + "router_z_loss_mlp": 0.38110352, + "step": 2394, + "time_per_iteration": 2.8472628593444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079988, + "balance_loss_mlp": 1.04100633, + "epoch": 0.4607541362062332, + "flos": 532884386304.0, + "grad_norm": 0.0583897063043048, + "language_loss": 0.84685498, + "learning_rate": 0.0005874035572103133, + "loss": 0.85765481, + "num_input_tokens_seen": 199609504, + "router_z_loss_mlp": 0.38964844, + "step": 2395, + "time_per_iteration": 2.6390676498413086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081925, + "balance_loss_mlp": 1.04437459, + "epoch": 0.4609465178914967, + "flos": 647023417344.0, + "grad_norm": 0.07571396195119524, + "language_loss": 0.82582867, + "learning_rate": 0.0005870967948647288, + "loss": 0.83664787, + "num_input_tokens_seen": 199678960, + "router_z_loss_mlp": 0.37573242, + "step": 2396, + "time_per_iteration": 2.7459003925323486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115963, + "balance_loss_mlp": 1.09889209, + "epoch": 0.4611388995767603, + "flos": 1465487202816.0, + "grad_norm": 0.025541481833947964, + "language_loss": 0.743083, + "learning_rate": 0.0005867899987051693, + "loss": 0.75424266, + "num_input_tokens_seen": 199903568, + "router_z_loss_mlp": 0.17089844, + "step": 2397, + "time_per_iteration": 5.318708896636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083182, + "balance_loss_mlp": 1.04446316, + "epoch": 0.46133128126202383, + "flos": 722772688896.0, + "grad_norm": 0.0770893227760576, + "language_loss": 0.8586902, + "learning_rate": 0.0005864831688507443, + "loss": 0.86952198, + "num_input_tokens_seen": 199988672, + "router_z_loss_mlp": 0.38696289, + "step": 2398, + "time_per_iteration": 3.0177690982818604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092841, + "balance_loss_mlp": 1.05266774, + "epoch": 0.4615236629472874, + "flos": 547735371264.0, + "grad_norm": 0.05577558539065206, + "language_loss": 0.74877977, + "learning_rate": 0.0005861763054205754, + "loss": 0.75970817, + "num_input_tokens_seen": 200062304, + "router_z_loss_mlp": 0.40161133, + "step": 2399, + "time_per_iteration": 4.235994815826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089549, + "balance_loss_mlp": 1.04885101, + "epoch": 0.461716044632551, + "flos": 601942192128.0, + "grad_norm": 0.04983292023279428, + "language_loss": 0.80479169, + "learning_rate": 0.0005858694085337976, + "loss": 0.81568718, + "num_input_tokens_seen": 200138464, + "router_z_loss_mlp": 0.40698242, + "step": 2400, + "time_per_iteration": 2.807819366455078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095586, + "balance_loss_mlp": 1.0549593, + "epoch": 0.46190842631781454, + "flos": 474236937216.0, + "grad_norm": 0.0664642499777789, + "language_loss": 0.8348912, + "learning_rate": 0.0005855624783095589, + "loss": 0.84584707, + "num_input_tokens_seen": 200205728, + "router_z_loss_mlp": 0.40625, + "step": 2401, + "time_per_iteration": 2.572861909866333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088871, + "balance_loss_mlp": 1.04848242, + "epoch": 0.4621008080030781, + "flos": 437254184448.0, + "grad_norm": 0.05436683283363487, + "language_loss": 0.85176182, + "learning_rate": 0.00058525551486702, + "loss": 0.86265051, + "num_input_tokens_seen": 200269824, + "router_z_loss_mlp": 0.40405273, + "step": 2402, + "time_per_iteration": 2.5116658210754395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091528, + "balance_loss_mlp": 1.05056739, + "epoch": 0.46229318968834165, + "flos": 525203523072.0, + "grad_norm": 0.06054832474170735, + "language_loss": 0.81057394, + "learning_rate": 0.0005849485183253548, + "loss": 0.82148921, + "num_input_tokens_seen": 200341264, + "router_z_loss_mlp": 0.40942383, + "step": 2403, + "time_per_iteration": 2.6135447025299072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109357, + "balance_loss_mlp": 1.05446947, + "epoch": 0.46248557137360524, + "flos": 439395922944.0, + "grad_norm": 0.05271308957386849, + "language_loss": 0.87085575, + "learning_rate": 0.0005846414888037501, + "loss": 0.88179141, + "num_input_tokens_seen": 200405632, + "router_z_loss_mlp": 0.39086914, + "step": 2404, + "time_per_iteration": 2.479233503341675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094001, + "balance_loss_mlp": 1.05513883, + "epoch": 0.4626779530588688, + "flos": 617318475264.0, + "grad_norm": 0.05681624365321511, + "language_loss": 0.82982111, + "learning_rate": 0.0005843344264214049, + "loss": 0.84076107, + "num_input_tokens_seen": 200479312, + "router_z_loss_mlp": 0.38818359, + "step": 2405, + "time_per_iteration": 2.8025927543640137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094528, + "balance_loss_mlp": 1.05478346, + "epoch": 0.46287033474413236, + "flos": 669796784640.0, + "grad_norm": 0.07573173665893672, + "language_loss": 0.84474289, + "learning_rate": 0.0005840273312975317, + "loss": 0.8556881, + "num_input_tokens_seen": 200552976, + "router_z_loss_mlp": 0.39746094, + "step": 2406, + "time_per_iteration": 2.880143642425537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096681, + "balance_loss_mlp": 1.05705631, + "epoch": 0.46306271642939595, + "flos": 479992849920.0, + "grad_norm": 0.09801123732991168, + "language_loss": 0.90446943, + "learning_rate": 0.0005837202035513555, + "loss": 0.91543621, + "num_input_tokens_seen": 200621088, + "router_z_loss_mlp": 0.39599609, + "step": 2407, + "time_per_iteration": 2.5880489349365234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109455, + "balance_loss_mlp": 1.05583048, + "epoch": 0.4632550981146595, + "flos": 580395359232.0, + "grad_norm": 0.057934056350582984, + "language_loss": 0.81573331, + "learning_rate": 0.0005834130433021136, + "loss": 0.82667881, + "num_input_tokens_seen": 200698400, + "router_z_loss_mlp": 0.38671875, + "step": 2408, + "time_per_iteration": 2.739018201828003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100791, + "balance_loss_mlp": 1.06121325, + "epoch": 0.46344747979992307, + "flos": 523701974016.0, + "grad_norm": 0.11568384778980019, + "language_loss": 0.73278892, + "learning_rate": 0.0005831058506690563, + "loss": 0.74379677, + "num_input_tokens_seen": 200767264, + "router_z_loss_mlp": 0.39550781, + "step": 2409, + "time_per_iteration": 2.6164803504943848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109281, + "balance_loss_mlp": 1.05513954, + "epoch": 0.4636398614851866, + "flos": 746174661120.0, + "grad_norm": 0.10585491609730635, + "language_loss": 0.85966945, + "learning_rate": 0.0005827986257714464, + "loss": 0.87059754, + "num_input_tokens_seen": 200841440, + "router_z_loss_mlp": 0.3762207, + "step": 2410, + "time_per_iteration": 2.9002575874328613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108768, + "balance_loss_mlp": 1.05008137, + "epoch": 0.4638322431704502, + "flos": 596273619456.0, + "grad_norm": 0.054458395819511424, + "language_loss": 0.88645154, + "learning_rate": 0.0005824913687285591, + "loss": 0.89732838, + "num_input_tokens_seen": 200911296, + "router_z_loss_mlp": 0.37597656, + "step": 2411, + "time_per_iteration": 2.65468168258667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108455, + "balance_loss_mlp": 1.046808, + "epoch": 0.4640246248557137, + "flos": 539172799488.0, + "grad_norm": 0.10537111148670983, + "language_loss": 0.81237781, + "learning_rate": 0.0005821840796596821, + "loss": 0.82322335, + "num_input_tokens_seen": 200981920, + "router_z_loss_mlp": 0.37744141, + "step": 2412, + "time_per_iteration": 2.64800763130188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086752, + "balance_loss_mlp": 1.04979706, + "epoch": 0.4642170065409773, + "flos": 562330280448.0, + "grad_norm": 0.05022524173963101, + "language_loss": 0.80493259, + "learning_rate": 0.0005818767586841158, + "loss": 0.81580019, + "num_input_tokens_seen": 201059392, + "router_z_loss_mlp": 0.36962891, + "step": 2413, + "time_per_iteration": 2.755119800567627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081928, + "balance_loss_mlp": 1.04657054, + "epoch": 0.46440938822624084, + "flos": 530684421120.0, + "grad_norm": 0.05374997972366647, + "language_loss": 0.86088538, + "learning_rate": 0.0005815694059211726, + "loss": 0.87170464, + "num_input_tokens_seen": 201130192, + "router_z_loss_mlp": 0.35400391, + "step": 2414, + "time_per_iteration": 2.6568868160247803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112565, + "balance_loss_mlp": 1.09606647, + "epoch": 0.4646017699115044, + "flos": 1525503780864.0, + "grad_norm": 0.029698276976430914, + "language_loss": 0.80873632, + "learning_rate": 0.0005812620214901778, + "loss": 0.81986189, + "num_input_tokens_seen": 201354720, + "router_z_loss_mlp": 0.16503906, + "step": 2415, + "time_per_iteration": 4.772961378097534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103885, + "balance_loss_mlp": 1.08795917, + "epoch": 0.464794151596768, + "flos": 1539999765504.0, + "grad_norm": 0.029205098078145548, + "language_loss": 0.7694506, + "learning_rate": 0.000580954605510468, + "loss": 0.78048944, + "num_input_tokens_seen": 201592096, + "router_z_loss_mlp": 0.15917969, + "step": 2416, + "time_per_iteration": 4.972976446151733 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085394, + "balance_loss_mlp": 1.04908264, + "epoch": 0.46498653328203154, + "flos": 501200649216.0, + "grad_norm": 0.04510206741076235, + "language_loss": 0.86396641, + "learning_rate": 0.0005806471581013931, + "loss": 0.87482029, + "num_input_tokens_seen": 201666160, + "router_z_loss_mlp": 0.36328125, + "step": 2417, + "time_per_iteration": 2.6620965003967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084664, + "balance_loss_mlp": 1.04806709, + "epoch": 0.46517891496729513, + "flos": 675856825344.0, + "grad_norm": 0.06302462590955567, + "language_loss": 0.78826416, + "learning_rate": 0.0005803396793823146, + "loss": 0.79911077, + "num_input_tokens_seen": 201733552, + "router_z_loss_mlp": 0.36572266, + "step": 2418, + "time_per_iteration": 2.7901804447174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108772, + "balance_loss_mlp": 1.05190992, + "epoch": 0.46537129665255866, + "flos": 585062949888.0, + "grad_norm": 0.06339234247272847, + "language_loss": 0.85623956, + "learning_rate": 0.0005800321694726065, + "loss": 0.86711681, + "num_input_tokens_seen": 201806128, + "router_z_loss_mlp": 0.35839844, + "step": 2419, + "time_per_iteration": 2.728811740875244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085796, + "balance_loss_mlp": 1.04836476, + "epoch": 0.46556367833782225, + "flos": 587425858560.0, + "grad_norm": 0.05222204092555794, + "language_loss": 0.8708874, + "learning_rate": 0.0005797246284916545, + "loss": 0.88174534, + "num_input_tokens_seen": 201874224, + "router_z_loss_mlp": 0.37402344, + "step": 2420, + "time_per_iteration": 2.6684653759002686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045596, + "balance_loss_mlp": 1.03043234, + "epoch": 0.4657560600230858, + "flos": 1484674099200.0, + "grad_norm": 0.011675297447767578, + "language_loss": 0.77505189, + "learning_rate": 0.0005794170565588569, + "loss": 0.78550786, + "num_input_tokens_seen": 202111648, + "router_z_loss_mlp": 0.15136719, + "step": 2421, + "time_per_iteration": 4.958959102630615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109154, + "balance_loss_mlp": 1.05506182, + "epoch": 0.46594844170834937, + "flos": 579961783296.0, + "grad_norm": 0.06275032464162542, + "language_loss": 0.88184166, + "learning_rate": 0.0005791094537936233, + "loss": 0.89275706, + "num_input_tokens_seen": 202183344, + "router_z_loss_mlp": 0.36499023, + "step": 2422, + "time_per_iteration": 2.682985782623291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085598, + "balance_loss_mlp": 1.04761815, + "epoch": 0.4661408233936129, + "flos": 512322568704.0, + "grad_norm": 0.05420418194823272, + "language_loss": 0.8170498, + "learning_rate": 0.0005788018203153762, + "loss": 0.82790577, + "num_input_tokens_seen": 202252512, + "router_z_loss_mlp": 0.37988281, + "step": 2423, + "time_per_iteration": 2.5706470012664795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085407, + "balance_loss_mlp": 1.04883409, + "epoch": 0.4663332050788765, + "flos": 490839754752.0, + "grad_norm": 0.06546291293651209, + "language_loss": 0.85642946, + "learning_rate": 0.000578494156243549, + "loss": 0.86728358, + "num_input_tokens_seen": 202320096, + "router_z_loss_mlp": 0.36572266, + "step": 2424, + "time_per_iteration": 2.578847646713257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085751, + "balance_loss_mlp": 1.04746079, + "epoch": 0.4665255867641401, + "flos": 512353092096.0, + "grad_norm": 0.059152702804089866, + "language_loss": 0.89097798, + "learning_rate": 0.0005781864616975878, + "loss": 0.90183544, + "num_input_tokens_seen": 202391552, + "router_z_loss_mlp": 0.38256836, + "step": 2425, + "time_per_iteration": 2.6408798694610596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085338, + "balance_loss_mlp": 1.04585552, + "epoch": 0.4667179684494036, + "flos": 424590018048.0, + "grad_norm": 0.07480086545967683, + "language_loss": 0.84123272, + "learning_rate": 0.0005778787367969502, + "loss": 0.85208613, + "num_input_tokens_seen": 202457328, + "router_z_loss_mlp": 0.39477539, + "step": 2426, + "time_per_iteration": 2.5963637828826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077247, + "balance_loss_mlp": 1.03910041, + "epoch": 0.4669103501346672, + "flos": 707640897024.0, + "grad_norm": 0.07167303988395164, + "language_loss": 0.80844486, + "learning_rate": 0.0005775709816611053, + "loss": 0.81921738, + "num_input_tokens_seen": 202535888, + "router_z_loss_mlp": 0.38134766, + "step": 2427, + "time_per_iteration": 2.971285581588745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079771, + "balance_loss_mlp": 1.04138589, + "epoch": 0.4671027318199307, + "flos": 554554874880.0, + "grad_norm": 0.05405801443852106, + "language_loss": 0.83748919, + "learning_rate": 0.0005772631964095346, + "loss": 0.84828693, + "num_input_tokens_seen": 202608400, + "router_z_loss_mlp": 0.38354492, + "step": 2428, + "time_per_iteration": 2.709364175796509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080886, + "balance_loss_mlp": 1.04271483, + "epoch": 0.4672951135051943, + "flos": 566839309824.0, + "grad_norm": 0.060777782070244445, + "language_loss": 0.8565498, + "learning_rate": 0.000576955381161731, + "loss": 0.86735862, + "num_input_tokens_seen": 202677712, + "router_z_loss_mlp": 0.38183594, + "step": 2429, + "time_per_iteration": 2.708270311355591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083121, + "balance_loss_mlp": 1.04452121, + "epoch": 0.46748749519045785, + "flos": 424294654464.0, + "grad_norm": 0.05633631430335825, + "language_loss": 0.85906339, + "learning_rate": 0.0005766475360371985, + "loss": 0.86989462, + "num_input_tokens_seen": 202743824, + "router_z_loss_mlp": 0.38574219, + "step": 2430, + "time_per_iteration": 2.617856740951538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089079, + "balance_loss_mlp": 1.05055118, + "epoch": 0.46767987687572143, + "flos": 538088859648.0, + "grad_norm": 0.05568735360450276, + "language_loss": 0.84486759, + "learning_rate": 0.0005763396611554536, + "loss": 0.85575831, + "num_input_tokens_seen": 202813072, + "router_z_loss_mlp": 0.38476562, + "step": 2431, + "time_per_iteration": 2.6460912227630615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093376, + "balance_loss_mlp": 1.0557059, + "epoch": 0.467872258560985, + "flos": 823360052736.0, + "grad_norm": 0.05823580457003032, + "language_loss": 0.80262822, + "learning_rate": 0.0005760317566360237, + "loss": 0.81356204, + "num_input_tokens_seen": 202886576, + "router_z_loss_mlp": 0.37646484, + "step": 2432, + "time_per_iteration": 3.010744094848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104411, + "balance_loss_mlp": 1.066836, + "epoch": 0.46806464024624855, + "flos": 661366632960.0, + "grad_norm": 0.07453415962543286, + "language_loss": 0.85120392, + "learning_rate": 0.000575723822598448, + "loss": 0.86224806, + "num_input_tokens_seen": 202956736, + "router_z_loss_mlp": 0.37573242, + "step": 2433, + "time_per_iteration": 2.7999444007873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100188, + "balance_loss_mlp": 1.06232667, + "epoch": 0.46825702193151214, + "flos": 755362865664.0, + "grad_norm": 0.08922556949000433, + "language_loss": 0.81824166, + "learning_rate": 0.0005754158591622773, + "loss": 0.82924354, + "num_input_tokens_seen": 203036432, + "router_z_loss_mlp": 0.37866211, + "step": 2434, + "time_per_iteration": 3.016101837158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089201, + "balance_loss_mlp": 1.05250812, + "epoch": 0.4684494036167757, + "flos": 439164578304.0, + "grad_norm": 0.06367410837717138, + "language_loss": 0.82359827, + "learning_rate": 0.0005751078664470732, + "loss": 0.8344903, + "num_input_tokens_seen": 203101904, + "router_z_loss_mlp": 0.36694336, + "step": 2435, + "time_per_iteration": 2.5870590209960938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095131, + "balance_loss_mlp": 1.05762815, + "epoch": 0.46864178530203926, + "flos": 532446428160.0, + "grad_norm": 0.059213993455869605, + "language_loss": 0.85874772, + "learning_rate": 0.0005747998445724094, + "loss": 0.86969906, + "num_input_tokens_seen": 203170272, + "router_z_loss_mlp": 0.375, + "step": 2436, + "time_per_iteration": 2.606999397277832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088052, + "balance_loss_mlp": 1.05135953, + "epoch": 0.4688341669873028, + "flos": 576328670208.0, + "grad_norm": 0.05282393784178956, + "language_loss": 0.89627349, + "learning_rate": 0.0005744917936578707, + "loss": 0.90715402, + "num_input_tokens_seen": 203243920, + "router_z_loss_mlp": 0.3671875, + "step": 2437, + "time_per_iteration": 2.7902729511260986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075719, + "balance_loss_mlp": 1.03978968, + "epoch": 0.4690265486725664, + "flos": 539296455168.0, + "grad_norm": 0.04430533887369339, + "language_loss": 0.84245884, + "learning_rate": 0.0005741837138230526, + "loss": 0.85321605, + "num_input_tokens_seen": 203321760, + "router_z_loss_mlp": 0.35913086, + "step": 2438, + "time_per_iteration": 2.726710319519043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082309, + "balance_loss_mlp": 1.04580677, + "epoch": 0.4692189303578299, + "flos": 770168770560.0, + "grad_norm": 0.06182369714878754, + "language_loss": 0.86213875, + "learning_rate": 0.0005738756051875627, + "loss": 0.87296176, + "num_input_tokens_seen": 203409088, + "router_z_loss_mlp": 0.36547852, + "step": 2439, + "time_per_iteration": 3.07755708694458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077719, + "balance_loss_mlp": 1.04178953, + "epoch": 0.4694113120430935, + "flos": 571118404608.0, + "grad_norm": 0.047772699497207846, + "language_loss": 0.82990217, + "learning_rate": 0.0005735674678710192, + "loss": 0.84067929, + "num_input_tokens_seen": 203481680, + "router_z_loss_mlp": 0.359375, + "step": 2440, + "time_per_iteration": 2.6625607013702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080403, + "balance_loss_mlp": 1.04423499, + "epoch": 0.4696036937283571, + "flos": 748498281984.0, + "grad_norm": 0.07690297936976162, + "language_loss": 0.81414962, + "learning_rate": 0.0005732593019930517, + "loss": 0.82495368, + "num_input_tokens_seen": 203554848, + "router_z_loss_mlp": 0.36181641, + "step": 2441, + "time_per_iteration": 2.918219566345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082294, + "balance_loss_mlp": 1.04669785, + "epoch": 0.4697960754136206, + "flos": 493208455680.0, + "grad_norm": 0.061105529929901724, + "language_loss": 0.87989414, + "learning_rate": 0.0005729511076733008, + "loss": 0.89071703, + "num_input_tokens_seen": 203624816, + "router_z_loss_mlp": 0.35620117, + "step": 2442, + "time_per_iteration": 2.6301560401916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085737, + "balance_loss_mlp": 1.04909194, + "epoch": 0.4699884570988842, + "flos": 724809710592.0, + "grad_norm": 0.0773152930313349, + "language_loss": 0.84905529, + "learning_rate": 0.000572642885031418, + "loss": 0.85991269, + "num_input_tokens_seen": 203698256, + "router_z_loss_mlp": 0.36645508, + "step": 2443, + "time_per_iteration": 2.8638129234313965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081528, + "balance_loss_mlp": 1.04619479, + "epoch": 0.47018083878414774, + "flos": 555141219840.0, + "grad_norm": 0.0470926044275737, + "language_loss": 0.80651355, + "learning_rate": 0.0005723346341870662, + "loss": 0.81732887, + "num_input_tokens_seen": 203772672, + "router_z_loss_mlp": 0.35351562, + "step": 2444, + "time_per_iteration": 2.7571544647216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093224, + "balance_loss_mlp": 1.05767596, + "epoch": 0.4703732204694113, + "flos": 423846521856.0, + "grad_norm": 0.060426187781859556, + "language_loss": 0.8612802, + "learning_rate": 0.0005720263552599188, + "loss": 0.87221241, + "num_input_tokens_seen": 203835904, + "router_z_loss_mlp": 0.35595703, + "step": 2445, + "time_per_iteration": 2.457702398300171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087133, + "balance_loss_mlp": 1.05003476, + "epoch": 0.47056560215467486, + "flos": 703179919872.0, + "grad_norm": 0.05103700331104036, + "language_loss": 0.79627156, + "learning_rate": 0.0005717180483696604, + "loss": 0.80714285, + "num_input_tokens_seen": 203914704, + "router_z_loss_mlp": 0.37084961, + "step": 2446, + "time_per_iteration": 2.851597785949707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096579, + "balance_loss_mlp": 1.05981517, + "epoch": 0.47075798383993844, + "flos": 554701851648.0, + "grad_norm": 0.05942499594418206, + "language_loss": 0.82931131, + "learning_rate": 0.0005714097136359862, + "loss": 0.84027708, + "num_input_tokens_seen": 203985072, + "router_z_loss_mlp": 0.36791992, + "step": 2447, + "time_per_iteration": 2.6262872219085693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088323, + "balance_loss_mlp": 1.05203617, + "epoch": 0.470950365525202, + "flos": 564009329664.0, + "grad_norm": 0.04849265524269106, + "language_loss": 0.86289024, + "learning_rate": 0.0005711013511786027, + "loss": 0.87377352, + "num_input_tokens_seen": 204061904, + "router_z_loss_mlp": 0.36303711, + "step": 2448, + "time_per_iteration": 2.7698192596435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087117, + "balance_loss_mlp": 1.05066276, + "epoch": 0.47114274721046556, + "flos": 534189496320.0, + "grad_norm": 0.0564117191668664, + "language_loss": 0.83740294, + "learning_rate": 0.0005707929611172263, + "loss": 0.84827411, + "num_input_tokens_seen": 204137392, + "router_z_loss_mlp": 0.36450195, + "step": 2449, + "time_per_iteration": 2.679288864135742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091785, + "balance_loss_mlp": 1.0557121, + "epoch": 0.47133512889572915, + "flos": 472877982720.0, + "grad_norm": 0.05809255973733416, + "language_loss": 0.83857393, + "learning_rate": 0.000570484543571585, + "loss": 0.84949178, + "num_input_tokens_seen": 204202752, + "router_z_loss_mlp": 0.3605957, + "step": 2450, + "time_per_iteration": 2.53946852684021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085264, + "balance_loss_mlp": 1.04914355, + "epoch": 0.4715275105809927, + "flos": 458776286208.0, + "grad_norm": 0.05957003441240347, + "language_loss": 0.83003706, + "learning_rate": 0.0005701760986614171, + "loss": 0.84088969, + "num_input_tokens_seen": 204266960, + "router_z_loss_mlp": 0.36132812, + "step": 2451, + "time_per_iteration": 2.578679323196411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108432, + "balance_loss_mlp": 1.04784179, + "epoch": 0.47171989226625627, + "flos": 421783358976.0, + "grad_norm": 0.04971859173266034, + "language_loss": 0.86998093, + "learning_rate": 0.0005698676265064714, + "loss": 0.88082415, + "num_input_tokens_seen": 204331216, + "router_z_loss_mlp": 0.36499023, + "step": 2452, + "time_per_iteration": 2.5178701877593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108693, + "balance_loss_mlp": 1.04887831, + "epoch": 0.4719122739515198, + "flos": 457200543744.0, + "grad_norm": 0.06455625952921856, + "language_loss": 0.89101571, + "learning_rate": 0.0005695591272265074, + "loss": 0.90188503, + "num_input_tokens_seen": 204397216, + "router_z_loss_mlp": 0.38037109, + "step": 2453, + "time_per_iteration": 2.527940511703491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094235, + "balance_loss_mlp": 1.05601645, + "epoch": 0.4721046556367834, + "flos": 514716000768.0, + "grad_norm": 0.05921175255811472, + "language_loss": 0.81955969, + "learning_rate": 0.0005692506009412954, + "loss": 0.83050203, + "num_input_tokens_seen": 204469952, + "router_z_loss_mlp": 0.3815918, + "step": 2454, + "time_per_iteration": 2.6692135334014893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152126, + "balance_loss_mlp": 1.13209891, + "epoch": 0.4722970373220469, + "flos": 1571399723520.0, + "grad_norm": 0.04281653423243919, + "language_loss": 0.7755127, + "learning_rate": 0.0005689420477706156, + "loss": 0.78703392, + "num_input_tokens_seen": 204701152, + "router_z_loss_mlp": 0.20019531, + "step": 2455, + "time_per_iteration": 4.940452337265015 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085506, + "balance_loss_mlp": 1.04731131, + "epoch": 0.4724894190073105, + "flos": 585919927296.0, + "grad_norm": 0.06574328103666784, + "language_loss": 0.89537692, + "learning_rate": 0.0005686334678342593, + "loss": 0.906232, + "num_input_tokens_seen": 204778144, + "router_z_loss_mlp": 0.38183594, + "step": 2456, + "time_per_iteration": 2.8626763820648193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085091, + "balance_loss_mlp": 1.04816043, + "epoch": 0.4726818006925741, + "flos": 867290347008.0, + "grad_norm": 0.053689359601525224, + "language_loss": 0.81760311, + "learning_rate": 0.0005683248612520274, + "loss": 0.82845408, + "num_input_tokens_seen": 204853376, + "router_z_loss_mlp": 0.36914062, + "step": 2457, + "time_per_iteration": 3.062195301055908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079889, + "balance_loss_mlp": 1.04300618, + "epoch": 0.4728741823778376, + "flos": 752653721088.0, + "grad_norm": 0.06424431420602757, + "language_loss": 0.83881927, + "learning_rate": 0.0005680162281437321, + "loss": 0.84961808, + "num_input_tokens_seen": 204925280, + "router_z_loss_mlp": 0.36865234, + "step": 2458, + "time_per_iteration": 4.24756932258606 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081915, + "balance_loss_mlp": 1.04474509, + "epoch": 0.4730665640631012, + "flos": 538301265408.0, + "grad_norm": 0.04398827684533395, + "language_loss": 0.84583557, + "learning_rate": 0.000567707568629195, + "loss": 0.8566547, + "num_input_tokens_seen": 205000592, + "router_z_loss_mlp": 0.37158203, + "step": 2459, + "time_per_iteration": 2.678410530090332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077333, + "balance_loss_mlp": 1.04104519, + "epoch": 0.47325894574836475, + "flos": 491396986368.0, + "grad_norm": 0.04729381274413396, + "language_loss": 0.82117784, + "learning_rate": 0.0005673988828282486, + "loss": 0.83195114, + "num_input_tokens_seen": 205073968, + "router_z_loss_mlp": 0.36303711, + "step": 2460, + "time_per_iteration": 2.6379287242889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080146, + "balance_loss_mlp": 1.04397774, + "epoch": 0.47345132743362833, + "flos": 764117494272.0, + "grad_norm": 0.048508898725252214, + "language_loss": 0.80703068, + "learning_rate": 0.0005670901708607352, + "loss": 0.81783217, + "num_input_tokens_seen": 205153536, + "router_z_loss_mlp": 0.36206055, + "step": 2461, + "time_per_iteration": 2.9682881832122803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079185, + "balance_loss_mlp": 1.04366088, + "epoch": 0.47364370911889186, + "flos": 539925060096.0, + "grad_norm": 0.06522156043574484, + "language_loss": 0.84211236, + "learning_rate": 0.0005667814328465076, + "loss": 0.8529042, + "num_input_tokens_seen": 205220944, + "router_z_loss_mlp": 0.35546875, + "step": 2462, + "time_per_iteration": 2.6927719116210938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074953, + "balance_loss_mlp": 1.04031122, + "epoch": 0.47383609080415545, + "flos": 406002613248.0, + "grad_norm": 0.06749328280555515, + "language_loss": 0.81615329, + "learning_rate": 0.0005664726689054285, + "loss": 0.82690287, + "num_input_tokens_seen": 205282688, + "router_z_loss_mlp": 0.34692383, + "step": 2463, + "time_per_iteration": 2.4384853839874268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078599, + "balance_loss_mlp": 1.04345584, + "epoch": 0.474028472489419, + "flos": 453237161472.0, + "grad_norm": 0.0467114590315811, + "language_loss": 0.81182402, + "learning_rate": 0.0005661638791573704, + "loss": 0.82261002, + "num_input_tokens_seen": 205357360, + "router_z_loss_mlp": 0.35180664, + "step": 2464, + "time_per_iteration": 2.695479154586792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108113, + "balance_loss_mlp": 1.04582047, + "epoch": 0.47422085417468257, + "flos": 491923694592.0, + "grad_norm": 0.04732653708909472, + "language_loss": 0.86637986, + "learning_rate": 0.0005658550637222164, + "loss": 0.87719119, + "num_input_tokens_seen": 205424352, + "router_z_loss_mlp": 0.35327148, + "step": 2465, + "time_per_iteration": 2.6167092323303223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079374, + "balance_loss_mlp": 1.04365873, + "epoch": 0.47441323585994616, + "flos": 738537467904.0, + "grad_norm": 0.057300064889236176, + "language_loss": 0.82372761, + "learning_rate": 0.0005655462227198592, + "loss": 0.83452135, + "num_input_tokens_seen": 205502912, + "router_z_loss_mlp": 0.35742188, + "step": 2466, + "time_per_iteration": 2.9023492336273193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080505, + "balance_loss_mlp": 1.04509962, + "epoch": 0.4746056175452097, + "flos": 484439270400.0, + "grad_norm": 0.05227273448390526, + "language_loss": 0.83720088, + "learning_rate": 0.0005652373562702016, + "loss": 0.84800589, + "num_input_tokens_seen": 205571168, + "router_z_loss_mlp": 0.35449219, + "step": 2467, + "time_per_iteration": 2.5808918476104736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082419, + "balance_loss_mlp": 1.04715681, + "epoch": 0.4747979992304733, + "flos": 460814717952.0, + "grad_norm": 0.05382206625072039, + "language_loss": 0.88037241, + "learning_rate": 0.000564928464493156, + "loss": 0.89119661, + "num_input_tokens_seen": 205639648, + "router_z_loss_mlp": 0.35302734, + "step": 2468, + "time_per_iteration": 2.5377156734466553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087106, + "balance_loss_mlp": 1.05198669, + "epoch": 0.4749903809157368, + "flos": 864070460928.0, + "grad_norm": 0.0577962749951369, + "language_loss": 0.81768191, + "learning_rate": 0.000564619547508645, + "loss": 0.82855296, + "num_input_tokens_seen": 205721536, + "router_z_loss_mlp": 0.3515625, + "step": 2469, + "time_per_iteration": 3.043691396713257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086191, + "balance_loss_mlp": 1.05042827, + "epoch": 0.4751827626010004, + "flos": 505296451584.0, + "grad_norm": 0.1751373121791138, + "language_loss": 0.83049238, + "learning_rate": 0.0005643106054366008, + "loss": 0.84135431, + "num_input_tokens_seen": 205788512, + "router_z_loss_mlp": 0.3581543, + "step": 2470, + "time_per_iteration": 2.6487743854522705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085754, + "balance_loss_mlp": 1.05118382, + "epoch": 0.47537514428626393, + "flos": 559123540992.0, + "grad_norm": 0.05689297252919276, + "language_loss": 0.79414684, + "learning_rate": 0.000564001638396965, + "loss": 0.80500442, + "num_input_tokens_seen": 205863104, + "router_z_loss_mlp": 0.34594727, + "step": 2471, + "time_per_iteration": 2.749767780303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087834, + "balance_loss_mlp": 1.05228639, + "epoch": 0.4755675259715275, + "flos": 833907211776.0, + "grad_norm": 0.05462179859190678, + "language_loss": 0.81897652, + "learning_rate": 0.0005636926465096897, + "loss": 0.82985491, + "num_input_tokens_seen": 205940688, + "router_z_loss_mlp": 0.35546875, + "step": 2472, + "time_per_iteration": 3.043703556060791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091887, + "balance_loss_mlp": 1.05569541, + "epoch": 0.47575990765679105, + "flos": 507989629440.0, + "grad_norm": 0.050841736172577985, + "language_loss": 0.87258822, + "learning_rate": 0.0005633836298947363, + "loss": 0.88350713, + "num_input_tokens_seen": 206008352, + "router_z_loss_mlp": 0.36206055, + "step": 2473, + "time_per_iteration": 2.564831018447876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098688, + "balance_loss_mlp": 1.06206715, + "epoch": 0.47595228934205464, + "flos": 591566740992.0, + "grad_norm": 0.05674114123782856, + "language_loss": 0.70767033, + "learning_rate": 0.000563074588672075, + "loss": 0.7186572, + "num_input_tokens_seen": 206078240, + "router_z_loss_mlp": 0.3659668, + "step": 2474, + "time_per_iteration": 2.6735401153564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095847, + "balance_loss_mlp": 1.05960727, + "epoch": 0.4761446710273182, + "flos": 580340104704.0, + "grad_norm": 0.055780063244739476, + "language_loss": 0.84891874, + "learning_rate": 0.0005627655229616868, + "loss": 0.85987723, + "num_input_tokens_seen": 206148896, + "router_z_loss_mlp": 0.36230469, + "step": 2475, + "time_per_iteration": 2.672621488571167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096569, + "balance_loss_mlp": 1.05899405, + "epoch": 0.47633705271258175, + "flos": 672597651456.0, + "grad_norm": 0.05102987049441457, + "language_loss": 0.90229654, + "learning_rate": 0.0005624564328835616, + "loss": 0.91326219, + "num_input_tokens_seen": 206223792, + "router_z_loss_mlp": 0.37524414, + "step": 2476, + "time_per_iteration": 2.8432443141937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102069, + "balance_loss_mlp": 1.0635407, + "epoch": 0.47652943439784534, + "flos": 541580788224.0, + "grad_norm": 0.0471064217807047, + "language_loss": 0.84254396, + "learning_rate": 0.0005621473185576986, + "loss": 0.85356462, + "num_input_tokens_seen": 206299376, + "router_z_loss_mlp": 0.38525391, + "step": 2477, + "time_per_iteration": 2.702977180480957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095957, + "balance_loss_mlp": 1.05826259, + "epoch": 0.4767218160831089, + "flos": 524563333632.0, + "grad_norm": 0.057656530584244435, + "language_loss": 0.87137967, + "learning_rate": 0.0005618381801041068, + "loss": 0.88233924, + "num_input_tokens_seen": 206367936, + "router_z_loss_mlp": 0.37670898, + "step": 2478, + "time_per_iteration": 2.603593111038208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098137, + "balance_loss_mlp": 1.05953729, + "epoch": 0.47691419776837246, + "flos": 567789419520.0, + "grad_norm": 0.11168904607405869, + "language_loss": 0.82855433, + "learning_rate": 0.0005615290176428044, + "loss": 0.83953571, + "num_input_tokens_seen": 206438864, + "router_z_loss_mlp": 0.38574219, + "step": 2479, + "time_per_iteration": 2.6339292526245117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109593, + "balance_loss_mlp": 1.05959523, + "epoch": 0.477106579453636, + "flos": 530659689984.0, + "grad_norm": 0.06204032147038535, + "language_loss": 0.85517442, + "learning_rate": 0.0005612198312938187, + "loss": 0.86613369, + "num_input_tokens_seen": 206516656, + "router_z_loss_mlp": 0.36328125, + "step": 2480, + "time_per_iteration": 2.727931261062622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096505, + "balance_loss_mlp": 1.05912077, + "epoch": 0.4772989611388996, + "flos": 593980521984.0, + "grad_norm": 0.07113059060466843, + "language_loss": 0.79093325, + "learning_rate": 0.0005609106211771868, + "loss": 0.80189824, + "num_input_tokens_seen": 206595040, + "router_z_loss_mlp": 0.37402344, + "step": 2481, + "time_per_iteration": 2.8239502906799316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091646, + "balance_loss_mlp": 1.05471444, + "epoch": 0.4774913428241631, + "flos": 544352541696.0, + "grad_norm": 0.07337307686737661, + "language_loss": 0.89208174, + "learning_rate": 0.0005606013874129543, + "loss": 0.90299821, + "num_input_tokens_seen": 206670192, + "router_z_loss_mlp": 0.36914062, + "step": 2482, + "time_per_iteration": 2.7480216026306152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088167, + "balance_loss_mlp": 1.05187941, + "epoch": 0.4776837245094267, + "flos": 539817371136.0, + "grad_norm": 0.16520730257770824, + "language_loss": 0.80029452, + "learning_rate": 0.0005602921301211768, + "loss": 0.81117618, + "num_input_tokens_seen": 206746992, + "router_z_loss_mlp": 0.36303711, + "step": 2483, + "time_per_iteration": 2.6802146434783936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096429, + "balance_loss_mlp": 1.06021321, + "epoch": 0.4778761061946903, + "flos": 471543759360.0, + "grad_norm": 0.07816325562851568, + "language_loss": 0.81835008, + "learning_rate": 0.0005599828494219185, + "loss": 0.82931435, + "num_input_tokens_seen": 206813584, + "router_z_loss_mlp": 0.36206055, + "step": 2484, + "time_per_iteration": 2.546365976333618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094667, + "balance_loss_mlp": 1.05923831, + "epoch": 0.4780684878799538, + "flos": 725769994752.0, + "grad_norm": 0.05627448694129284, + "language_loss": 0.88551247, + "learning_rate": 0.0005596735454352527, + "loss": 0.89645922, + "num_input_tokens_seen": 206885840, + "router_z_loss_mlp": 0.35498047, + "step": 2485, + "time_per_iteration": 2.862647771835327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106974, + "balance_loss_mlp": 1.07054353, + "epoch": 0.4782608695652174, + "flos": 548665132032.0, + "grad_norm": 0.07015146645765026, + "language_loss": 0.85657477, + "learning_rate": 0.0005593642182812619, + "loss": 0.86764455, + "num_input_tokens_seen": 206955104, + "router_z_loss_mlp": 0.36425781, + "step": 2486, + "time_per_iteration": 2.609184741973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101968, + "balance_loss_mlp": 1.06558526, + "epoch": 0.47845325125048094, + "flos": 829555333632.0, + "grad_norm": 0.061922125379274766, + "language_loss": 0.83543551, + "learning_rate": 0.0005590548680800378, + "loss": 0.84645522, + "num_input_tokens_seen": 207039792, + "router_z_loss_mlp": 0.36401367, + "step": 2487, + "time_per_iteration": 3.089769124984741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110265, + "balance_loss_mlp": 1.0746448, + "epoch": 0.4786456329357445, + "flos": 513889546752.0, + "grad_norm": 0.2189409026834594, + "language_loss": 0.76099992, + "learning_rate": 0.0005587454949516804, + "loss": 0.77210259, + "num_input_tokens_seen": 207115632, + "router_z_loss_mlp": 0.35644531, + "step": 2488, + "time_per_iteration": 2.751055955886841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110821, + "balance_loss_mlp": 1.07187533, + "epoch": 0.47883801462100806, + "flos": 564392033280.0, + "grad_norm": 0.10409544878795325, + "language_loss": 0.87659556, + "learning_rate": 0.0005584360990162993, + "loss": 0.88767767, + "num_input_tokens_seen": 207184336, + "router_z_loss_mlp": 0.36376953, + "step": 2489, + "time_per_iteration": 2.6652133464813232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113142, + "balance_loss_mlp": 1.07563877, + "epoch": 0.47903039630627164, + "flos": 579296862720.0, + "grad_norm": 0.09667813376582209, + "language_loss": 0.8484993, + "learning_rate": 0.0005581266803940124, + "loss": 0.8596307, + "num_input_tokens_seen": 207258720, + "router_z_loss_mlp": 0.375, + "step": 2490, + "time_per_iteration": 2.736374616622925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119685, + "balance_loss_mlp": 1.08206201, + "epoch": 0.47922277799153523, + "flos": 618667255296.0, + "grad_norm": 0.050098276566308, + "language_loss": 0.87162292, + "learning_rate": 0.0005578172392049471, + "loss": 0.88281971, + "num_input_tokens_seen": 207329216, + "router_z_loss_mlp": 0.37573242, + "step": 2491, + "time_per_iteration": 2.7753453254699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011097, + "balance_loss_mlp": 1.07307923, + "epoch": 0.47941515967679876, + "flos": 639352728576.0, + "grad_norm": 0.06461059150776577, + "language_loss": 0.83998954, + "learning_rate": 0.0005575077755692386, + "loss": 0.85108656, + "num_input_tokens_seen": 207403712, + "router_z_loss_mlp": 0.3659668, + "step": 2492, + "time_per_iteration": 2.788609266281128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113104, + "balance_loss_mlp": 1.07595801, + "epoch": 0.47960754136206235, + "flos": 519561091584.0, + "grad_norm": 0.0557937811773086, + "language_loss": 0.86232179, + "learning_rate": 0.0005571982896070316, + "loss": 0.87345278, + "num_input_tokens_seen": 207477120, + "router_z_loss_mlp": 0.37158203, + "step": 2493, + "time_per_iteration": 2.6394574642181396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107707, + "balance_loss_mlp": 1.07111025, + "epoch": 0.4797999230473259, + "flos": 474798551040.0, + "grad_norm": 0.0598408121702559, + "language_loss": 0.90174985, + "learning_rate": 0.0005568887814384792, + "loss": 0.9128269, + "num_input_tokens_seen": 207544592, + "router_z_loss_mlp": 0.3659668, + "step": 2494, + "time_per_iteration": 2.534224033355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111594, + "balance_loss_mlp": 1.0754025, + "epoch": 0.47999230473258947, + "flos": 531766950912.0, + "grad_norm": 0.07246176028888049, + "language_loss": 0.87038457, + "learning_rate": 0.000556579251183743, + "loss": 0.88150048, + "num_input_tokens_seen": 207613808, + "router_z_loss_mlp": 0.36230469, + "step": 2495, + "time_per_iteration": 2.6398251056671143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094859, + "balance_loss_mlp": 1.05802298, + "epoch": 0.480184686417853, + "flos": 601207460352.0, + "grad_norm": 0.06271692106547645, + "language_loss": 0.79938626, + "learning_rate": 0.0005562696989629936, + "loss": 0.8103348, + "num_input_tokens_seen": 207684464, + "router_z_loss_mlp": 0.3684082, + "step": 2496, + "time_per_iteration": 2.6642816066741943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093996, + "balance_loss_mlp": 1.05766106, + "epoch": 0.4803770681031166, + "flos": 527931606528.0, + "grad_norm": 0.05594777531112506, + "language_loss": 0.82110333, + "learning_rate": 0.0005559601248964095, + "loss": 0.83204329, + "num_input_tokens_seen": 207754016, + "router_z_loss_mlp": 0.36352539, + "step": 2497, + "time_per_iteration": 2.636070966720581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093639, + "balance_loss_mlp": 1.05739903, + "epoch": 0.4805694497883801, + "flos": 510934500864.0, + "grad_norm": 0.054324508936697755, + "language_loss": 0.85873795, + "learning_rate": 0.0005556505291041783, + "loss": 0.86967432, + "num_input_tokens_seen": 207827104, + "router_z_loss_mlp": 0.36254883, + "step": 2498, + "time_per_iteration": 2.7246336936950684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094125, + "balance_loss_mlp": 1.05757546, + "epoch": 0.4807618314736437, + "flos": 600027416064.0, + "grad_norm": 0.37566577491106196, + "language_loss": 0.84318507, + "learning_rate": 0.0005553409117064954, + "loss": 0.85412627, + "num_input_tokens_seen": 207907824, + "router_z_loss_mlp": 0.36547852, + "step": 2499, + "time_per_iteration": 2.8535146713256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105189, + "balance_loss_mlp": 1.06770992, + "epoch": 0.4809542131589073, + "flos": 568700241408.0, + "grad_norm": 0.05235544022747109, + "language_loss": 0.84675509, + "learning_rate": 0.0005550312728235654, + "loss": 0.85780698, + "num_input_tokens_seen": 207975632, + "router_z_loss_mlp": 0.37475586, + "step": 2500, + "time_per_iteration": 2.691314697265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118964, + "balance_loss_mlp": 1.08138871, + "epoch": 0.4811465948441708, + "flos": 575703037440.0, + "grad_norm": 0.0667425977867665, + "language_loss": 0.83709896, + "learning_rate": 0.0005547216125756003, + "loss": 0.84828854, + "num_input_tokens_seen": 208048000, + "router_z_loss_mlp": 0.37573242, + "step": 2501, + "time_per_iteration": 2.7381327152252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126097, + "balance_loss_mlp": 1.08754468, + "epoch": 0.4813389765294344, + "flos": 823508439552.0, + "grad_norm": 0.052606522983796165, + "language_loss": 0.82174253, + "learning_rate": 0.0005544119310828211, + "loss": 0.83300352, + "num_input_tokens_seen": 208132592, + "router_z_loss_mlp": 0.38549805, + "step": 2502, + "time_per_iteration": 3.072216272354126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134856, + "balance_loss_mlp": 1.09632754, + "epoch": 0.48153135821469795, + "flos": 635240959488.0, + "grad_norm": 0.048230358167368766, + "language_loss": 0.84706873, + "learning_rate": 0.0005541022284654568, + "loss": 0.85841727, + "num_input_tokens_seen": 208215824, + "router_z_loss_mlp": 0.38525391, + "step": 2503, + "time_per_iteration": 2.916139602661133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128897, + "balance_loss_mlp": 1.09051132, + "epoch": 0.48172373989996153, + "flos": 503450076672.0, + "grad_norm": 0.07897645884633452, + "language_loss": 0.84086657, + "learning_rate": 0.0005537925048437446, + "loss": 0.85215557, + "num_input_tokens_seen": 208284304, + "router_z_loss_mlp": 0.38354492, + "step": 2504, + "time_per_iteration": 2.5921871662139893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110906, + "balance_loss_mlp": 1.09278584, + "epoch": 0.48191612158522507, + "flos": 1531563821568.0, + "grad_norm": 0.0372588251023387, + "language_loss": 0.75751472, + "learning_rate": 0.00055348276033793, + "loss": 0.76862371, + "num_input_tokens_seen": 208510224, + "router_z_loss_mlp": 0.18164062, + "step": 2505, + "time_per_iteration": 4.9559855461120605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141303, + "balance_loss_mlp": 1.10132027, + "epoch": 0.48210850327048865, + "flos": 702078451200.0, + "grad_norm": 0.058816464552035166, + "language_loss": 0.88463128, + "learning_rate": 0.0005531729950682664, + "loss": 0.89604431, + "num_input_tokens_seen": 208596816, + "router_z_loss_mlp": 0.3996582, + "step": 2506, + "time_per_iteration": 3.0114240646362305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132181, + "balance_loss_mlp": 1.09353316, + "epoch": 0.4823008849557522, + "flos": 439548691968.0, + "grad_norm": 0.06626147096234755, + "language_loss": 0.84781104, + "learning_rate": 0.000552863209155015, + "loss": 0.85913289, + "num_input_tokens_seen": 208659616, + "router_z_loss_mlp": 0.38598633, + "step": 2507, + "time_per_iteration": 2.5784101486206055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113394, + "balance_loss_mlp": 1.09390914, + "epoch": 0.48249326664101577, + "flos": 471622334976.0, + "grad_norm": 0.05712589242287889, + "language_loss": 0.82110274, + "learning_rate": 0.0005525534027184461, + "loss": 0.83244216, + "num_input_tokens_seen": 208728080, + "router_z_loss_mlp": 0.40014648, + "step": 2508, + "time_per_iteration": 2.552065372467041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132678, + "balance_loss_mlp": 1.09395885, + "epoch": 0.48268564832627936, + "flos": 562954503168.0, + "grad_norm": 0.04979156125943264, + "language_loss": 0.82958996, + "learning_rate": 0.0005522435758788365, + "loss": 0.84091675, + "num_input_tokens_seen": 208803376, + "router_z_loss_mlp": 0.38696289, + "step": 2509, + "time_per_iteration": 2.727841854095459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121662, + "balance_loss_mlp": 1.08210802, + "epoch": 0.4828780300115429, + "flos": 629298782208.0, + "grad_norm": 0.054057791094232886, + "language_loss": 0.79695261, + "learning_rate": 0.0005519337287564721, + "loss": 0.80816925, + "num_input_tokens_seen": 208876656, + "router_z_loss_mlp": 0.39526367, + "step": 2510, + "time_per_iteration": 2.841032028198242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111392, + "balance_loss_mlp": 1.07582068, + "epoch": 0.4830704116968065, + "flos": 631562766336.0, + "grad_norm": 0.0770242195625866, + "language_loss": 0.83640802, + "learning_rate": 0.000551623861471646, + "loss": 0.84754717, + "num_input_tokens_seen": 208950224, + "router_z_loss_mlp": 0.38061523, + "step": 2511, + "time_per_iteration": 2.7330808639526367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051582, + "balance_loss_mlp": 1.03489304, + "epoch": 0.48326279338207, + "flos": 1568434503168.0, + "grad_norm": 0.02207943535017646, + "language_loss": 0.78818834, + "learning_rate": 0.0005513139741446594, + "loss": 0.79870415, + "num_input_tokens_seen": 209173984, + "router_z_loss_mlp": 0.16699219, + "step": 2512, + "time_per_iteration": 4.847305536270142 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119635, + "balance_loss_mlp": 1.08015239, + "epoch": 0.4834551750673336, + "flos": 508989201408.0, + "grad_norm": 0.07604353740704149, + "language_loss": 0.86230296, + "learning_rate": 0.0005510040668958211, + "loss": 0.87349927, + "num_input_tokens_seen": 209242832, + "router_z_loss_mlp": 0.39453125, + "step": 2513, + "time_per_iteration": 2.6358695030212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039681, + "balance_loss_mlp": 1.02423155, + "epoch": 0.48364755675259713, + "flos": 1527875453952.0, + "grad_norm": 0.016719139942629795, + "language_loss": 0.77760583, + "learning_rate": 0.0005506941398454483, + "loss": 0.78800267, + "num_input_tokens_seen": 209473520, + "router_z_loss_mlp": 0.15429688, + "step": 2514, + "time_per_iteration": 4.8266448974609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108341, + "balance_loss_mlp": 1.06895423, + "epoch": 0.4838399384378607, + "flos": 564726684672.0, + "grad_norm": 0.05692617769518991, + "language_loss": 0.8306818, + "learning_rate": 0.0005503841931138645, + "loss": 0.84176517, + "num_input_tokens_seen": 209544208, + "router_z_loss_mlp": 0.39355469, + "step": 2515, + "time_per_iteration": 4.18599271774292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108623, + "balance_loss_mlp": 1.07073843, + "epoch": 0.4840323201231243, + "flos": 387479227392.0, + "grad_norm": 0.0681425082817114, + "language_loss": 0.81703341, + "learning_rate": 0.0005500742268214025, + "loss": 0.82811964, + "num_input_tokens_seen": 209607408, + "router_z_loss_mlp": 0.37841797, + "step": 2516, + "time_per_iteration": 2.4660089015960693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109531, + "balance_loss_mlp": 1.07116938, + "epoch": 0.48422470180838784, + "flos": 630701406720.0, + "grad_norm": 0.09015941461472031, + "language_loss": 0.85304928, + "learning_rate": 0.0005497642410884014, + "loss": 0.86414456, + "num_input_tokens_seen": 209683392, + "router_z_loss_mlp": 0.38305664, + "step": 2517, + "time_per_iteration": 2.8147974014282227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108486, + "balance_loss_mlp": 1.06845522, + "epoch": 0.4844170834936514, + "flos": 498955603968.0, + "grad_norm": 0.05998889999991439, + "language_loss": 0.8499558, + "learning_rate": 0.0005494542360352085, + "loss": 0.86104071, + "num_input_tokens_seen": 209753184, + "router_z_loss_mlp": 0.40014648, + "step": 2518, + "time_per_iteration": 2.639248847961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101242, + "balance_loss_mlp": 1.06335747, + "epoch": 0.48460946517891496, + "flos": 550798106112.0, + "grad_norm": 0.04916831458391579, + "language_loss": 0.85637897, + "learning_rate": 0.0005491442117821783, + "loss": 0.86739141, + "num_input_tokens_seen": 209829568, + "router_z_loss_mlp": 0.37866211, + "step": 2519, + "time_per_iteration": 2.7024173736572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101572, + "balance_loss_mlp": 1.06275773, + "epoch": 0.48480184686417854, + "flos": 529123235328.0, + "grad_norm": 0.05557918275255021, + "language_loss": 0.87415975, + "learning_rate": 0.0005488341684496732, + "loss": 0.88517547, + "num_input_tokens_seen": 209902176, + "router_z_loss_mlp": 0.38793945, + "step": 2520, + "time_per_iteration": 2.6733944416046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094952, + "balance_loss_mlp": 1.05732954, + "epoch": 0.4849942285494421, + "flos": 531630148608.0, + "grad_norm": 0.049677430441928086, + "language_loss": 0.91897535, + "learning_rate": 0.0005485241061580624, + "loss": 0.92992491, + "num_input_tokens_seen": 209969168, + "router_z_loss_mlp": 0.37646484, + "step": 2521, + "time_per_iteration": 2.7186949253082275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087464, + "balance_loss_mlp": 1.04802954, + "epoch": 0.48518661023470566, + "flos": 722231424000.0, + "grad_norm": 0.05969395587297076, + "language_loss": 0.84698212, + "learning_rate": 0.0005482140250277228, + "loss": 0.85785675, + "num_input_tokens_seen": 210049616, + "router_z_loss_mlp": 0.39404297, + "step": 2522, + "time_per_iteration": 3.0005805492401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084269, + "balance_loss_mlp": 1.04664636, + "epoch": 0.4853789919199692, + "flos": 505843508736.0, + "grad_norm": 0.0576168536354582, + "language_loss": 0.87382847, + "learning_rate": 0.0005479039251790387, + "loss": 0.88467115, + "num_input_tokens_seen": 210118512, + "router_z_loss_mlp": 0.37597656, + "step": 2523, + "time_per_iteration": 2.612565517425537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083753, + "balance_loss_mlp": 1.04508114, + "epoch": 0.4855713736052328, + "flos": 660185178624.0, + "grad_norm": 0.05213001441745639, + "language_loss": 0.84754556, + "learning_rate": 0.0005475938067324014, + "loss": 0.85838306, + "num_input_tokens_seen": 210193728, + "router_z_loss_mlp": 0.38647461, + "step": 2524, + "time_per_iteration": 2.7874755859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084646, + "balance_loss_mlp": 1.04556894, + "epoch": 0.48576375529049637, + "flos": 436727476224.0, + "grad_norm": 0.04741211423020534, + "language_loss": 0.83422267, + "learning_rate": 0.0005472836698082098, + "loss": 0.84506917, + "num_input_tokens_seen": 210258832, + "router_z_loss_mlp": 0.39086914, + "step": 2525, + "time_per_iteration": 2.50516676902771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076506, + "balance_loss_mlp": 1.03764343, + "epoch": 0.4859561369757599, + "flos": 581424044544.0, + "grad_norm": 0.04357292691167825, + "language_loss": 0.84170592, + "learning_rate": 0.0005469735145268694, + "loss": 0.85247099, + "num_input_tokens_seen": 210335280, + "router_z_loss_mlp": 0.38818359, + "step": 2526, + "time_per_iteration": 2.7474558353424072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076384, + "balance_loss_mlp": 1.03723574, + "epoch": 0.4861485186610235, + "flos": 487723175424.0, + "grad_norm": 0.056946126423794464, + "language_loss": 0.80818385, + "learning_rate": 0.0005466633410087933, + "loss": 0.81894767, + "num_input_tokens_seen": 210407072, + "router_z_loss_mlp": 0.39111328, + "step": 2527, + "time_per_iteration": 2.690655469894409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080703, + "balance_loss_mlp": 1.06363261, + "epoch": 0.486340900346287, + "flos": 1556893564416.0, + "grad_norm": 0.03973044492620415, + "language_loss": 0.77260822, + "learning_rate": 0.0005463531493744017, + "loss": 0.78341526, + "num_input_tokens_seen": 210644544, + "router_z_loss_mlp": 0.17089844, + "step": 2528, + "time_per_iteration": 4.852689981460571 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076, + "balance_loss_mlp": 1.03723347, + "epoch": 0.4865332820315506, + "flos": 482760221184.0, + "grad_norm": 0.04657742417719492, + "language_loss": 0.88156307, + "learning_rate": 0.0005460429397441214, + "loss": 0.89232314, + "num_input_tokens_seen": 210711760, + "router_z_loss_mlp": 0.38720703, + "step": 2529, + "time_per_iteration": 2.55281662940979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079921, + "balance_loss_mlp": 1.04053402, + "epoch": 0.48672566371681414, + "flos": 535548450816.0, + "grad_norm": 0.06549810250084472, + "language_loss": 0.86653185, + "learning_rate": 0.0005457327122383866, + "loss": 0.87733108, + "num_input_tokens_seen": 210783040, + "router_z_loss_mlp": 0.39379883, + "step": 2530, + "time_per_iteration": 2.671656847000122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035467, + "balance_loss_mlp": 1.01963639, + "epoch": 0.4869180454020777, + "flos": 1411876901376.0, + "grad_norm": 0.025637836045087663, + "language_loss": 0.74636483, + "learning_rate": 0.0005454224669776385, + "loss": 0.75671959, + "num_input_tokens_seen": 211002128, + "router_z_loss_mlp": 0.15820312, + "step": 2531, + "time_per_iteration": 4.814793348312378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081396, + "balance_loss_mlp": 1.04322505, + "epoch": 0.48711042708734126, + "flos": 572836741632.0, + "grad_norm": 0.048652424424379774, + "language_loss": 0.7607469, + "learning_rate": 0.0005451122040823244, + "loss": 0.77156091, + "num_input_tokens_seen": 211080080, + "router_z_loss_mlp": 0.38134766, + "step": 2532, + "time_per_iteration": 2.7569382190704346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081141, + "balance_loss_mlp": 1.04246926, + "epoch": 0.48730280877260485, + "flos": 626231665152.0, + "grad_norm": 0.05261384345268123, + "language_loss": 0.76949328, + "learning_rate": 0.0005448019236728997, + "loss": 0.78030467, + "num_input_tokens_seen": 211162944, + "router_z_loss_mlp": 0.38647461, + "step": 2533, + "time_per_iteration": 2.8791191577911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082063, + "balance_loss_mlp": 1.04439306, + "epoch": 0.48749519045786843, + "flos": 512233818624.0, + "grad_norm": 0.05361284003065004, + "language_loss": 0.84639871, + "learning_rate": 0.0005444916258698255, + "loss": 0.85721934, + "num_input_tokens_seen": 211230448, + "router_z_loss_mlp": 0.37670898, + "step": 2534, + "time_per_iteration": 2.584188938140869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108354, + "balance_loss_mlp": 1.04548812, + "epoch": 0.48768757214313196, + "flos": 525149678592.0, + "grad_norm": 0.044479444876285516, + "language_loss": 0.85999918, + "learning_rate": 0.0005441813107935704, + "loss": 0.87083459, + "num_input_tokens_seen": 211301248, + "router_z_loss_mlp": 0.38037109, + "step": 2535, + "time_per_iteration": 2.63484787940979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089581, + "balance_loss_mlp": 1.05141044, + "epoch": 0.48787995382839555, + "flos": 504784300032.0, + "grad_norm": 0.05225590764746468, + "language_loss": 0.85801542, + "learning_rate": 0.0005438709785646091, + "loss": 0.86891127, + "num_input_tokens_seen": 211369888, + "router_z_loss_mlp": 0.38110352, + "step": 2536, + "time_per_iteration": 2.5857274532318115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087898, + "balance_loss_mlp": 1.0496794, + "epoch": 0.4880723355136591, + "flos": 574904286720.0, + "grad_norm": 0.05427082704851873, + "language_loss": 0.8654719, + "learning_rate": 0.0005435606293034234, + "loss": 0.87635088, + "num_input_tokens_seen": 211441808, + "router_z_loss_mlp": 0.3815918, + "step": 2537, + "time_per_iteration": 2.6441421508789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082535, + "balance_loss_mlp": 1.04498374, + "epoch": 0.48826471719892267, + "flos": 561172147200.0, + "grad_norm": 0.0666705066547564, + "language_loss": 0.84424317, + "learning_rate": 0.0005432502631305016, + "loss": 0.8550685, + "num_input_tokens_seen": 211511216, + "router_z_loss_mlp": 0.37548828, + "step": 2538, + "time_per_iteration": 2.657888174057007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081573, + "balance_loss_mlp": 1.04383135, + "epoch": 0.4884570988841862, + "flos": 725849980416.0, + "grad_norm": 0.04200092081923836, + "language_loss": 0.83068514, + "learning_rate": 0.0005429398801663386, + "loss": 0.84150088, + "num_input_tokens_seen": 211589264, + "router_z_loss_mlp": 0.37744141, + "step": 2539, + "time_per_iteration": 2.926213264465332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085607, + "balance_loss_mlp": 1.04726946, + "epoch": 0.4886494805694498, + "flos": 430794063360.0, + "grad_norm": 0.05775520457519848, + "language_loss": 0.82975113, + "learning_rate": 0.0005426294805314355, + "loss": 0.84060717, + "num_input_tokens_seen": 211652928, + "router_z_loss_mlp": 0.38305664, + "step": 2540, + "time_per_iteration": 2.476100444793701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087326, + "balance_loss_mlp": 1.0497514, + "epoch": 0.4888418622547134, + "flos": 672673254912.0, + "grad_norm": 0.050739997063638825, + "language_loss": 0.79934752, + "learning_rate": 0.0005423190643463003, + "loss": 0.81022084, + "num_input_tokens_seen": 211741664, + "router_z_loss_mlp": 0.37573242, + "step": 2541, + "time_per_iteration": 2.983567953109741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108794, + "balance_loss_mlp": 1.05005538, + "epoch": 0.4890342439399769, + "flos": 541639014912.0, + "grad_norm": 0.05834464255250002, + "language_loss": 0.82589471, + "learning_rate": 0.0005420086317314473, + "loss": 0.83677411, + "num_input_tokens_seen": 211809136, + "router_z_loss_mlp": 0.37841797, + "step": 2542, + "time_per_iteration": 2.6762986183166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088795, + "balance_loss_mlp": 1.04957485, + "epoch": 0.4892266256252405, + "flos": 590380904448.0, + "grad_norm": 0.056502349447813176, + "language_loss": 0.8105309, + "learning_rate": 0.0005416981828073971, + "loss": 0.82141888, + "num_input_tokens_seen": 211883136, + "router_z_loss_mlp": 0.39208984, + "step": 2543, + "time_per_iteration": 2.798063039779663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111053, + "balance_loss_mlp": 1.0975107, + "epoch": 0.48941900731050403, + "flos": 1515460008960.0, + "grad_norm": 0.049245887260565786, + "language_loss": 0.77115011, + "learning_rate": 0.0005413877176946765, + "loss": 0.78226066, + "num_input_tokens_seen": 212117488, + "router_z_loss_mlp": 0.13574219, + "step": 2544, + "time_per_iteration": 4.86514949798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084158, + "balance_loss_mlp": 1.04632151, + "epoch": 0.4896113889957676, + "flos": 470327399424.0, + "grad_norm": 0.0633775200016376, + "language_loss": 0.84418309, + "learning_rate": 0.000541077236513819, + "loss": 0.85502464, + "num_input_tokens_seen": 212181952, + "router_z_loss_mlp": 0.37792969, + "step": 2545, + "time_per_iteration": 2.590907335281372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085016, + "balance_loss_mlp": 1.04698849, + "epoch": 0.48980377068103115, + "flos": 496310478336.0, + "grad_norm": 0.05034497234802515, + "language_loss": 0.82352334, + "learning_rate": 0.0005407667393853638, + "loss": 0.83437347, + "num_input_tokens_seen": 212252608, + "router_z_loss_mlp": 0.37988281, + "step": 2546, + "time_per_iteration": 2.6386098861694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079303, + "balance_loss_mlp": 1.04187095, + "epoch": 0.48999615236629473, + "flos": 692539628544.0, + "grad_norm": 0.05625529240804266, + "language_loss": 0.83240199, + "learning_rate": 0.0005404562264298569, + "loss": 0.84319508, + "num_input_tokens_seen": 212328560, + "router_z_loss_mlp": 0.37426758, + "step": 2547, + "time_per_iteration": 2.8305716514587402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083931, + "balance_loss_mlp": 1.04459167, + "epoch": 0.49018853405155827, + "flos": 541432401408.0, + "grad_norm": 0.05508159523705553, + "language_loss": 0.83712828, + "learning_rate": 0.0005401456977678498, + "loss": 0.84796757, + "num_input_tokens_seen": 212399616, + "router_z_loss_mlp": 0.39306641, + "step": 2548, + "time_per_iteration": 2.647726058959961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079917, + "balance_loss_mlp": 1.0415554, + "epoch": 0.49038091573682185, + "flos": 695304027648.0, + "grad_norm": 0.06449580544702971, + "language_loss": 0.77341408, + "learning_rate": 0.0005398351535199008, + "loss": 0.7842133, + "num_input_tokens_seen": 212482352, + "router_z_loss_mlp": 0.38330078, + "step": 2549, + "time_per_iteration": 3.0876851081848145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087981, + "balance_loss_mlp": 1.04976225, + "epoch": 0.49057329742208544, + "flos": 596614063104.0, + "grad_norm": 0.053976289964032184, + "language_loss": 0.83800292, + "learning_rate": 0.0005395245938065735, + "loss": 0.84888279, + "num_input_tokens_seen": 212559504, + "router_z_loss_mlp": 0.38183594, + "step": 2550, + "time_per_iteration": 2.804429769515991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082681, + "balance_loss_mlp": 1.04372382, + "epoch": 0.490765679107349, + "flos": 513154814976.0, + "grad_norm": 0.06066311696873723, + "language_loss": 0.8244735, + "learning_rate": 0.0005392140187484379, + "loss": 0.83530027, + "num_input_tokens_seen": 212625664, + "router_z_loss_mlp": 0.38916016, + "step": 2551, + "time_per_iteration": 2.597642421722412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078468, + "balance_loss_mlp": 1.04001141, + "epoch": 0.49095806079261256, + "flos": 629298782208.0, + "grad_norm": 0.0491826620467597, + "language_loss": 0.89348012, + "learning_rate": 0.0005389034284660701, + "loss": 0.90426481, + "num_input_tokens_seen": 212702000, + "router_z_loss_mlp": 0.3840332, + "step": 2552, + "time_per_iteration": 2.7942707538604736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081847, + "balance_loss_mlp": 1.04231691, + "epoch": 0.4911504424778761, + "flos": 914938122240.0, + "grad_norm": 0.07682264853807555, + "language_loss": 0.82114685, + "learning_rate": 0.000538592823080052, + "loss": 0.83196527, + "num_input_tokens_seen": 212785376, + "router_z_loss_mlp": 0.39501953, + "step": 2553, + "time_per_iteration": 3.1190438270568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079314, + "balance_loss_mlp": 1.04154849, + "epoch": 0.4913428241631397, + "flos": 438716445696.0, + "grad_norm": 0.05210768805810414, + "language_loss": 0.85049736, + "learning_rate": 0.000538282202710971, + "loss": 0.86129045, + "num_input_tokens_seen": 212848176, + "router_z_loss_mlp": 0.37768555, + "step": 2554, + "time_per_iteration": 2.5379602909088135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073496, + "balance_loss_mlp": 1.03613555, + "epoch": 0.4915352058484032, + "flos": 635806955520.0, + "grad_norm": 0.06005848629390598, + "language_loss": 0.81770831, + "learning_rate": 0.000537971567479421, + "loss": 0.82844329, + "num_input_tokens_seen": 212917888, + "router_z_loss_mlp": 0.37329102, + "step": 2555, + "time_per_iteration": 2.7403476238250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107527, + "balance_loss_mlp": 1.0371232, + "epoch": 0.4917275875336668, + "flos": 504272148480.0, + "grad_norm": 0.05941814666543565, + "language_loss": 0.87821388, + "learning_rate": 0.0005376609175060011, + "loss": 0.88896656, + "num_input_tokens_seen": 212986288, + "router_z_loss_mlp": 0.38110352, + "step": 2556, + "time_per_iteration": 2.5817511081695557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069861, + "balance_loss_mlp": 1.03192806, + "epoch": 0.49191996921893033, + "flos": 654251765760.0, + "grad_norm": 0.06032782721564886, + "language_loss": 0.80381918, + "learning_rate": 0.0005373502529113162, + "loss": 0.81451786, + "num_input_tokens_seen": 213059504, + "router_z_loss_mlp": 0.37915039, + "step": 2557, + "time_per_iteration": 2.7871665954589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077426, + "balance_loss_mlp": 1.03939795, + "epoch": 0.4921123509041939, + "flos": 492101194752.0, + "grad_norm": 0.054204772274654804, + "language_loss": 0.81538296, + "learning_rate": 0.0005370395738159773, + "loss": 0.82615721, + "num_input_tokens_seen": 213129984, + "router_z_loss_mlp": 0.38012695, + "step": 2558, + "time_per_iteration": 2.667402744293213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071328, + "balance_loss_mlp": 1.03368151, + "epoch": 0.4923047325894575, + "flos": 545907935232.0, + "grad_norm": 0.05883600684350466, + "language_loss": 0.82952267, + "learning_rate": 0.0005367288803406003, + "loss": 0.84023595, + "num_input_tokens_seen": 213199184, + "router_z_loss_mlp": 0.3762207, + "step": 2559, + "time_per_iteration": 2.626527786254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078144, + "balance_loss_mlp": 1.03937757, + "epoch": 0.49249711427472104, + "flos": 596195043840.0, + "grad_norm": 0.05079842806629368, + "language_loss": 0.8133688, + "learning_rate": 0.0005364181726058073, + "loss": 0.82415026, + "num_input_tokens_seen": 213272480, + "router_z_loss_mlp": 0.38720703, + "step": 2560, + "time_per_iteration": 2.6742072105407715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079994, + "balance_loss_mlp": 1.0413698, + "epoch": 0.4926894959599846, + "flos": 497580682752.0, + "grad_norm": 0.07402195837362009, + "language_loss": 0.8230688, + "learning_rate": 0.0005361074507322261, + "loss": 0.83386874, + "num_input_tokens_seen": 213338704, + "router_z_loss_mlp": 0.38574219, + "step": 2561, + "time_per_iteration": 2.5911788940429688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079985, + "balance_loss_mlp": 1.04226756, + "epoch": 0.49288187764524816, + "flos": 535868545536.0, + "grad_norm": 0.051530448614758514, + "language_loss": 0.81235635, + "learning_rate": 0.000535796714840489, + "loss": 0.82315624, + "num_input_tokens_seen": 213406016, + "router_z_loss_mlp": 0.37695312, + "step": 2562, + "time_per_iteration": 2.607124090194702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108504, + "balance_loss_mlp": 1.04694033, + "epoch": 0.49307425933051174, + "flos": 641267504640.0, + "grad_norm": 0.0614534794373117, + "language_loss": 0.83895457, + "learning_rate": 0.0005354859650512348, + "loss": 0.84980506, + "num_input_tokens_seen": 213474016, + "router_z_loss_mlp": 0.38037109, + "step": 2563, + "time_per_iteration": 2.757147789001465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087423, + "balance_loss_mlp": 1.04889464, + "epoch": 0.4932666410157753, + "flos": 516000761856.0, + "grad_norm": 0.06049941260890761, + "language_loss": 0.87262708, + "learning_rate": 0.0005351752014851074, + "loss": 0.88350135, + "num_input_tokens_seen": 213539696, + "router_z_loss_mlp": 0.38500977, + "step": 2564, + "time_per_iteration": 2.546381711959839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090812, + "balance_loss_mlp": 1.05190217, + "epoch": 0.49345902270103886, + "flos": 601217634816.0, + "grad_norm": 0.06075916964602771, + "language_loss": 0.83327425, + "learning_rate": 0.0005348644242627553, + "loss": 0.84418237, + "num_input_tokens_seen": 213609504, + "router_z_loss_mlp": 0.38867188, + "step": 2565, + "time_per_iteration": 2.737234592437744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080753, + "balance_loss_mlp": 1.06368184, + "epoch": 0.49365140438630245, + "flos": 1492849585152.0, + "grad_norm": 0.03629255242441858, + "language_loss": 0.75286627, + "learning_rate": 0.0005345536335048336, + "loss": 0.76367378, + "num_input_tokens_seen": 213846064, + "router_z_loss_mlp": 0.17089844, + "step": 2566, + "time_per_iteration": 4.96724271774292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093722, + "balance_loss_mlp": 1.05462122, + "epoch": 0.493843786071566, + "flos": 629303164416.0, + "grad_norm": 0.05641611710897844, + "language_loss": 0.81215966, + "learning_rate": 0.0005342428293320013, + "loss": 0.82309687, + "num_input_tokens_seen": 213923216, + "router_z_loss_mlp": 0.390625, + "step": 2567, + "time_per_iteration": 2.75099778175354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085401, + "balance_loss_mlp": 1.04722989, + "epoch": 0.49403616775682957, + "flos": 617283569664.0, + "grad_norm": 0.05682733114828458, + "language_loss": 0.83676398, + "learning_rate": 0.0005339320118649238, + "loss": 0.84761798, + "num_input_tokens_seen": 213994096, + "router_z_loss_mlp": 0.3815918, + "step": 2568, + "time_per_iteration": 2.6829991340637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087145, + "balance_loss_mlp": 1.04945099, + "epoch": 0.4942285494420931, + "flos": 577357355520.0, + "grad_norm": 0.053270861905881636, + "language_loss": 0.86332101, + "learning_rate": 0.000533621181224271, + "loss": 0.87419248, + "num_input_tokens_seen": 214069104, + "router_z_loss_mlp": 0.37646484, + "step": 2569, + "time_per_iteration": 2.777698278427124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092012, + "balance_loss_mlp": 1.0536983, + "epoch": 0.4944209311273567, + "flos": 629899683840.0, + "grad_norm": 0.059449335887268515, + "language_loss": 0.81470358, + "learning_rate": 0.0005333103375307182, + "loss": 0.82562375, + "num_input_tokens_seen": 214150368, + "router_z_loss_mlp": 0.3828125, + "step": 2570, + "time_per_iteration": 2.866680145263672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087838, + "balance_loss_mlp": 1.0502398, + "epoch": 0.4946133128126202, + "flos": 587337108480.0, + "grad_norm": 0.04632852912872097, + "language_loss": 0.86004198, + "learning_rate": 0.0005329994809049451, + "loss": 0.8709203, + "num_input_tokens_seen": 214220112, + "router_z_loss_mlp": 0.37548828, + "step": 2571, + "time_per_iteration": 2.719249963760376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089339, + "balance_loss_mlp": 1.05147839, + "epoch": 0.4948056944978838, + "flos": 583437745152.0, + "grad_norm": 0.05131083950778726, + "language_loss": 0.87596244, + "learning_rate": 0.0005326886114676375, + "loss": 0.88685584, + "num_input_tokens_seen": 214294480, + "router_z_loss_mlp": 0.37866211, + "step": 2572, + "time_per_iteration": 2.7392373085021973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083396, + "balance_loss_mlp": 1.04524934, + "epoch": 0.49499807618314734, + "flos": 481583149056.0, + "grad_norm": 0.0472919496744071, + "language_loss": 0.87958217, + "learning_rate": 0.0005323777293394854, + "loss": 0.89041615, + "num_input_tokens_seen": 214359568, + "router_z_loss_mlp": 0.38110352, + "step": 2573, + "time_per_iteration": 2.531196355819702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078942, + "balance_loss_mlp": 1.04072404, + "epoch": 0.4951904578684109, + "flos": 518714288640.0, + "grad_norm": 0.0452048253819277, + "language_loss": 0.82375443, + "learning_rate": 0.000532066834641184, + "loss": 0.83454382, + "num_input_tokens_seen": 214432032, + "router_z_loss_mlp": 0.38183594, + "step": 2574, + "time_per_iteration": 2.6414644718170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076991, + "balance_loss_mlp": 1.03939271, + "epoch": 0.4953828395536745, + "flos": 535238530560.0, + "grad_norm": 0.0513606490930485, + "language_loss": 0.84946954, + "learning_rate": 0.0005317559274934334, + "loss": 0.86023939, + "num_input_tokens_seen": 214504096, + "router_z_loss_mlp": 0.37573242, + "step": 2575, + "time_per_iteration": 2.764742374420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075902, + "balance_loss_mlp": 1.03904271, + "epoch": 0.49557522123893805, + "flos": 528305545728.0, + "grad_norm": 0.0624025017343203, + "language_loss": 0.80560994, + "learning_rate": 0.0005314450080169382, + "loss": 0.816369, + "num_input_tokens_seen": 214575920, + "router_z_loss_mlp": 0.3684082, + "step": 2576, + "time_per_iteration": 2.594782590866089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078443, + "balance_loss_mlp": 1.04017663, + "epoch": 0.49576760292420163, + "flos": 427780790784.0, + "grad_norm": 0.059991931078834576, + "language_loss": 0.80652928, + "learning_rate": 0.0005311340763324083, + "loss": 0.81731379, + "num_input_tokens_seen": 214641664, + "router_z_loss_mlp": 0.38232422, + "step": 2577, + "time_per_iteration": 2.5488879680633545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107968, + "balance_loss_mlp": 1.04232025, + "epoch": 0.49595998460946517, + "flos": 564968203776.0, + "grad_norm": 0.04956045110382575, + "language_loss": 0.81899893, + "learning_rate": 0.0005308231325605578, + "loss": 0.82979578, + "num_input_tokens_seen": 214711744, + "router_z_loss_mlp": 0.37329102, + "step": 2578, + "time_per_iteration": 2.6677722930908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077103, + "balance_loss_mlp": 1.03905153, + "epoch": 0.49615236629472875, + "flos": 702161409024.0, + "grad_norm": 0.04106026216453222, + "language_loss": 0.76928478, + "learning_rate": 0.0005305121768221061, + "loss": 0.78005582, + "num_input_tokens_seen": 214802256, + "router_z_loss_mlp": 0.38012695, + "step": 2579, + "time_per_iteration": 3.070509672164917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024013, + "balance_loss_mlp": 1.00970817, + "epoch": 0.4963447479799923, + "flos": 1440896573952.0, + "grad_norm": 0.02117966265403326, + "language_loss": 0.75038326, + "learning_rate": 0.000530201209237777, + "loss": 0.76062334, + "num_input_tokens_seen": 215023648, + "router_z_loss_mlp": 0.14257812, + "step": 2580, + "time_per_iteration": 4.802190780639648 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084007, + "balance_loss_mlp": 1.04669428, + "epoch": 0.49653712966525587, + "flos": 537370094592.0, + "grad_norm": 0.04967277918174837, + "language_loss": 0.91594803, + "learning_rate": 0.0005298902299282984, + "loss": 0.92678809, + "num_input_tokens_seen": 215094080, + "router_z_loss_mlp": 0.37304688, + "step": 2581, + "time_per_iteration": 2.5916941165924072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075168, + "balance_loss_mlp": 1.03823721, + "epoch": 0.4967295113505194, + "flos": 607002660864.0, + "grad_norm": 0.058889996692992934, + "language_loss": 0.84090436, + "learning_rate": 0.0005295792390144033, + "loss": 0.85165608, + "num_input_tokens_seen": 215165456, + "router_z_loss_mlp": 0.36889648, + "step": 2582, + "time_per_iteration": 2.731971502304077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077994, + "balance_loss_mlp": 1.04065764, + "epoch": 0.496921893035783, + "flos": 474340243968.0, + "grad_norm": 0.06551304805839393, + "language_loss": 0.83421808, + "learning_rate": 0.0005292682366168294, + "loss": 0.844998, + "num_input_tokens_seen": 215229344, + "router_z_loss_mlp": 0.37304688, + "step": 2583, + "time_per_iteration": 2.575511932373047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071469, + "balance_loss_mlp": 1.03437066, + "epoch": 0.4971142747210466, + "flos": 597180059136.0, + "grad_norm": 0.09149919184070833, + "language_loss": 0.79965729, + "learning_rate": 0.0005289572228563181, + "loss": 0.81037199, + "num_input_tokens_seen": 215305616, + "router_z_loss_mlp": 0.37084961, + "step": 2584, + "time_per_iteration": 2.7206363677978516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107391, + "balance_loss_mlp": 1.03533435, + "epoch": 0.4973066564063101, + "flos": 599321797632.0, + "grad_norm": 0.052533233614156426, + "language_loss": 0.82869196, + "learning_rate": 0.000528646197853616, + "loss": 0.83943105, + "num_input_tokens_seen": 215378128, + "router_z_loss_mlp": 0.38549805, + "step": 2585, + "time_per_iteration": 2.6923370361328125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078928, + "balance_loss_mlp": 1.04097223, + "epoch": 0.4974990380915737, + "flos": 649152009216.0, + "grad_norm": 0.05229001766272028, + "language_loss": 0.85541296, + "learning_rate": 0.0005283351617294735, + "loss": 0.86620224, + "num_input_tokens_seen": 215453536, + "router_z_loss_mlp": 0.37939453, + "step": 2586, + "time_per_iteration": 2.929431915283203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021123, + "balance_loss_mlp": 1.00719905, + "epoch": 0.49769141977683723, + "flos": 1528490912256.0, + "grad_norm": 0.01235864360091676, + "language_loss": 0.7663666, + "learning_rate": 0.0005280241146046456, + "loss": 0.77657783, + "num_input_tokens_seen": 215689440, + "router_z_loss_mlp": 0.13964844, + "step": 2587, + "time_per_iteration": 5.021655082702637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077541, + "balance_loss_mlp": 1.03977549, + "epoch": 0.4978838014621008, + "flos": 536114446848.0, + "grad_norm": 0.05582319417935397, + "language_loss": 0.866669, + "learning_rate": 0.0005277130565998916, + "loss": 0.87744439, + "num_input_tokens_seen": 215759600, + "router_z_loss_mlp": 0.37719727, + "step": 2588, + "time_per_iteration": 2.729919195175171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079282, + "balance_loss_mlp": 1.04163599, + "epoch": 0.49807618314736435, + "flos": 539335742976.0, + "grad_norm": 0.05154521563335112, + "language_loss": 0.81850547, + "learning_rate": 0.0005274019878359748, + "loss": 0.82929826, + "num_input_tokens_seen": 215833920, + "router_z_loss_mlp": 0.3762207, + "step": 2589, + "time_per_iteration": 2.692312240600586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080128, + "balance_loss_mlp": 1.04243433, + "epoch": 0.49826856483262794, + "flos": 542215185408.0, + "grad_norm": 0.0590106194524904, + "language_loss": 0.87004912, + "learning_rate": 0.0005270909084336628, + "loss": 0.88085043, + "num_input_tokens_seen": 215903616, + "router_z_loss_mlp": 0.37695312, + "step": 2590, + "time_per_iteration": 2.684134006500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085832, + "balance_loss_mlp": 1.04637384, + "epoch": 0.4984609465178915, + "flos": 522062212608.0, + "grad_norm": 0.056922673879229405, + "language_loss": 0.89000517, + "learning_rate": 0.0005267798185137276, + "loss": 0.90086353, + "num_input_tokens_seen": 215974832, + "router_z_loss_mlp": 0.39428711, + "step": 2591, + "time_per_iteration": 2.6129040718078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087547, + "balance_loss_mlp": 1.04942417, + "epoch": 0.49865332820315506, + "flos": 574255332864.0, + "grad_norm": 0.05087809825508884, + "language_loss": 0.89274907, + "learning_rate": 0.0005264687181969444, + "loss": 0.90362453, + "num_input_tokens_seen": 216045024, + "router_z_loss_mlp": 0.38085938, + "step": 2592, + "time_per_iteration": 2.7253634929656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087493, + "balance_loss_mlp": 1.04891706, + "epoch": 0.49884570988841864, + "flos": 1013207657472.0, + "grad_norm": 0.06815052907107509, + "language_loss": 0.75056839, + "learning_rate": 0.0005261576076040937, + "loss": 0.76144326, + "num_input_tokens_seen": 216129024, + "router_z_loss_mlp": 0.38525391, + "step": 2593, + "time_per_iteration": 3.2982125282287598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086885, + "balance_loss_mlp": 1.04790401, + "epoch": 0.4990380915736822, + "flos": 559315597824.0, + "grad_norm": 0.05997761702509101, + "language_loss": 0.84464318, + "learning_rate": 0.0005258464868559591, + "loss": 0.85551196, + "num_input_tokens_seen": 216197648, + "router_z_loss_mlp": 0.38964844, + "step": 2594, + "time_per_iteration": 2.650743007659912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087007, + "balance_loss_mlp": 1.04819274, + "epoch": 0.49923047325894576, + "flos": 498708292608.0, + "grad_norm": 0.060987476024219604, + "language_loss": 0.88568228, + "learning_rate": 0.0005255353560733284, + "loss": 0.89655238, + "num_input_tokens_seen": 216263904, + "router_z_loss_mlp": 0.38793945, + "step": 2595, + "time_per_iteration": 2.5599913597106934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041145, + "balance_loss_mlp": 1.02760279, + "epoch": 0.4994228549442093, + "flos": 1495851273216.0, + "grad_norm": 0.01946244961408958, + "language_loss": 0.75578642, + "learning_rate": 0.0005252242153769931, + "loss": 0.76619792, + "num_input_tokens_seen": 216493152, + "router_z_loss_mlp": 0.13574219, + "step": 2596, + "time_per_iteration": 4.769503593444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108652, + "balance_loss_mlp": 1.0481348, + "epoch": 0.4996152366294729, + "flos": 557090901504.0, + "grad_norm": 0.052826274831603945, + "language_loss": 0.83429873, + "learning_rate": 0.0005249130648877492, + "loss": 0.84516394, + "num_input_tokens_seen": 216567216, + "router_z_loss_mlp": 0.38354492, + "step": 2597, + "time_per_iteration": 2.724168300628662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085761, + "balance_loss_mlp": 1.04785287, + "epoch": 0.4998076183147364, + "flos": 415372700160.0, + "grad_norm": 0.05706521232724688, + "language_loss": 0.84317046, + "learning_rate": 0.0005246019047263953, + "loss": 0.85402811, + "num_input_tokens_seen": 216630624, + "router_z_loss_mlp": 0.37841797, + "step": 2598, + "time_per_iteration": 2.4463517665863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081855, + "balance_loss_mlp": 1.04475701, + "epoch": 0.5, + "flos": 467107513344.0, + "grad_norm": 0.6792645039501298, + "language_loss": 0.82562613, + "learning_rate": 0.0005242907350137353, + "loss": 0.83644474, + "num_input_tokens_seen": 216696576, + "router_z_loss_mlp": 0.37060547, + "step": 2599, + "time_per_iteration": 2.560786008834839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099746, + "balance_loss_mlp": 1.06193328, + "epoch": 0.5001923816852636, + "flos": 482460475392.0, + "grad_norm": 0.06436348420044716, + "language_loss": 0.78717571, + "learning_rate": 0.0005239795558705754, + "loss": 0.79817319, + "num_input_tokens_seen": 216767584, + "router_z_loss_mlp": 0.37817383, + "step": 2600, + "time_per_iteration": 2.691749095916748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103949, + "balance_loss_mlp": 1.06613564, + "epoch": 0.5003847633705272, + "flos": 533534750208.0, + "grad_norm": 0.05701005713359991, + "language_loss": 0.89229304, + "learning_rate": 0.0005236683674177264, + "loss": 0.90333253, + "num_input_tokens_seen": 216834320, + "router_z_loss_mlp": 0.37744141, + "step": 2601, + "time_per_iteration": 2.6216700077056885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118846, + "balance_loss_mlp": 1.08053231, + "epoch": 0.5005771450557907, + "flos": 737473876992.0, + "grad_norm": 0.059257141019647214, + "language_loss": 0.82444715, + "learning_rate": 0.0005233571697760021, + "loss": 0.83563566, + "num_input_tokens_seen": 216907312, + "router_z_loss_mlp": 0.3828125, + "step": 2602, + "time_per_iteration": 2.856107473373413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127913, + "balance_loss_mlp": 1.08902669, + "epoch": 0.5007695267410542, + "flos": 778646974464.0, + "grad_norm": 0.08832305121279985, + "language_loss": 0.83020616, + "learning_rate": 0.0005230459630662203, + "loss": 0.84148532, + "num_input_tokens_seen": 216979872, + "router_z_loss_mlp": 0.38842773, + "step": 2603, + "time_per_iteration": 2.954914093017578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133998, + "balance_loss_mlp": 1.09563613, + "epoch": 0.5009619084263178, + "flos": 623192251392.0, + "grad_norm": 0.09845505678723535, + "language_loss": 0.81501806, + "learning_rate": 0.0005227347474092022, + "loss": 0.82635808, + "num_input_tokens_seen": 217054000, + "router_z_loss_mlp": 0.38354492, + "step": 2604, + "time_per_iteration": 2.7330713272094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132886, + "balance_loss_mlp": 1.09223533, + "epoch": 0.5011542901115814, + "flos": 530812459008.0, + "grad_norm": 0.044602380755084235, + "language_loss": 0.83597159, + "learning_rate": 0.0005224235229257724, + "loss": 0.84730041, + "num_input_tokens_seen": 217126784, + "router_z_loss_mlp": 0.40649414, + "step": 2605, + "time_per_iteration": 2.682590961456299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134983, + "balance_loss_mlp": 1.09485674, + "epoch": 0.5013466717968449, + "flos": 527262303744.0, + "grad_norm": 0.06172408458695075, + "language_loss": 0.86453664, + "learning_rate": 0.0005221122897367589, + "loss": 0.87588644, + "num_input_tokens_seen": 217203056, + "router_z_loss_mlp": 0.40136719, + "step": 2606, + "time_per_iteration": 2.7657558917999268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130017, + "balance_loss_mlp": 1.08970046, + "epoch": 0.5015390534821085, + "flos": 565750987776.0, + "grad_norm": 0.060573415362282904, + "language_loss": 0.80914944, + "learning_rate": 0.0005218010479629932, + "loss": 0.82044959, + "num_input_tokens_seen": 217273280, + "router_z_loss_mlp": 0.40332031, + "step": 2607, + "time_per_iteration": 2.650521755218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137201, + "balance_loss_mlp": 1.09564483, + "epoch": 0.5017314351673721, + "flos": 566430465024.0, + "grad_norm": 0.062462394429491495, + "language_loss": 0.82171839, + "learning_rate": 0.0005214897977253102, + "loss": 0.83309042, + "num_input_tokens_seen": 217345568, + "router_z_loss_mlp": 0.41552734, + "step": 2608, + "time_per_iteration": 2.679605484008789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135192, + "balance_loss_mlp": 1.09222913, + "epoch": 0.5019238168526357, + "flos": 522018542592.0, + "grad_norm": 0.04524020883908707, + "language_loss": 0.84520149, + "learning_rate": 0.0005211785391445473, + "loss": 0.85655344, + "num_input_tokens_seen": 217422848, + "router_z_loss_mlp": 0.4296875, + "step": 2609, + "time_per_iteration": 2.727029323577881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133676, + "balance_loss_mlp": 1.09128523, + "epoch": 0.5021161985378992, + "flos": 641135084544.0, + "grad_norm": 0.0754859849582408, + "language_loss": 0.79190326, + "learning_rate": 0.0005208672723415467, + "loss": 0.80324006, + "num_input_tokens_seen": 217502896, + "router_z_loss_mlp": 0.42358398, + "step": 2610, + "time_per_iteration": 2.7925145626068115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132046, + "balance_loss_mlp": 1.09058475, + "epoch": 0.5023085802231627, + "flos": 591000744960.0, + "grad_norm": 0.05557553185326306, + "language_loss": 0.78870118, + "learning_rate": 0.0005205559974371525, + "loss": 0.80002165, + "num_input_tokens_seen": 217575072, + "router_z_loss_mlp": 0.41455078, + "step": 2611, + "time_per_iteration": 2.7993710041046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129577, + "balance_loss_mlp": 1.08747184, + "epoch": 0.5025009619084263, + "flos": 472134486528.0, + "grad_norm": 0.05627981978612443, + "language_loss": 0.81993866, + "learning_rate": 0.0005202447145522123, + "loss": 0.83123446, + "num_input_tokens_seen": 217644976, + "router_z_loss_mlp": 0.42089844, + "step": 2612, + "time_per_iteration": 2.6950342655181885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120122, + "balance_loss_mlp": 1.0788281, + "epoch": 0.5026933435936899, + "flos": 454906036224.0, + "grad_norm": 0.05146182880646494, + "language_loss": 0.79119051, + "learning_rate": 0.0005199334238075769, + "loss": 0.80239171, + "num_input_tokens_seen": 217712816, + "router_z_loss_mlp": 0.4128418, + "step": 2613, + "time_per_iteration": 2.533280372619629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121533, + "balance_loss_mlp": 1.08064461, + "epoch": 0.5028857252789535, + "flos": 491504675328.0, + "grad_norm": 0.049706042989329166, + "language_loss": 0.91481262, + "learning_rate": 0.0005196221253241, + "loss": 0.92602801, + "num_input_tokens_seen": 217780256, + "router_z_loss_mlp": 0.40869141, + "step": 2614, + "time_per_iteration": 2.562459707260132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125484, + "balance_loss_mlp": 1.08271146, + "epoch": 0.503078106964217, + "flos": 625280145408.0, + "grad_norm": 0.05688830610190983, + "language_loss": 0.82597703, + "learning_rate": 0.0005193108192226383, + "loss": 0.83723187, + "num_input_tokens_seen": 217848496, + "router_z_loss_mlp": 0.42797852, + "step": 2615, + "time_per_iteration": 2.7700836658477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124223, + "balance_loss_mlp": 1.08054483, + "epoch": 0.5032704886494805, + "flos": 578774536704.0, + "grad_norm": 0.07123141067873749, + "language_loss": 0.87046134, + "learning_rate": 0.000518999505624052, + "loss": 0.88170362, + "num_input_tokens_seen": 217919216, + "router_z_loss_mlp": 0.43701172, + "step": 2616, + "time_per_iteration": 2.6920361518859863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110793, + "balance_loss_mlp": 1.06897473, + "epoch": 0.5034628703347441, + "flos": 471481150464.0, + "grad_norm": 0.07512500822512953, + "language_loss": 0.83250809, + "learning_rate": 0.000518688184649203, + "loss": 0.84361595, + "num_input_tokens_seen": 217996096, + "router_z_loss_mlp": 0.41845703, + "step": 2617, + "time_per_iteration": 2.8107755184173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109828, + "balance_loss_mlp": 1.06786621, + "epoch": 0.5036552520200077, + "flos": 489594281472.0, + "grad_norm": 0.05241889370213675, + "language_loss": 0.83636624, + "learning_rate": 0.0005183768564189577, + "loss": 0.84746444, + "num_input_tokens_seen": 218063072, + "router_z_loss_mlp": 0.41967773, + "step": 2618, + "time_per_iteration": 2.5401604175567627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117501, + "balance_loss_mlp": 1.07649279, + "epoch": 0.5038476337052713, + "flos": 493991239680.0, + "grad_norm": 0.05660213632560354, + "language_loss": 0.8184489, + "learning_rate": 0.0005180655210541838, + "loss": 0.82962382, + "num_input_tokens_seen": 218131056, + "router_z_loss_mlp": 0.40991211, + "step": 2619, + "time_per_iteration": 2.603214979171753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111785, + "balance_loss_mlp": 1.06829762, + "epoch": 0.5040400153905348, + "flos": 600321369600.0, + "grad_norm": 0.06441755274122189, + "language_loss": 0.83548617, + "learning_rate": 0.0005177541786757527, + "loss": 0.84660405, + "num_input_tokens_seen": 218203536, + "router_z_loss_mlp": 0.43481445, + "step": 2620, + "time_per_iteration": 2.760035276412964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122898, + "balance_loss_mlp": 1.07759881, + "epoch": 0.5042323970757984, + "flos": 811178924544.0, + "grad_norm": 0.05307882661131351, + "language_loss": 0.82779682, + "learning_rate": 0.000517442829404538, + "loss": 0.8390258, + "num_input_tokens_seen": 218283008, + "router_z_loss_mlp": 0.453125, + "step": 2621, + "time_per_iteration": 2.9839560985565186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110947, + "balance_loss_mlp": 1.06581521, + "epoch": 0.504424778761062, + "flos": 626985335808.0, + "grad_norm": 0.08823829105457728, + "language_loss": 0.87315869, + "learning_rate": 0.0005171314733614166, + "loss": 0.88425338, + "num_input_tokens_seen": 218362096, + "router_z_loss_mlp": 0.43676758, + "step": 2622, + "time_per_iteration": 2.901881456375122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101867, + "balance_loss_mlp": 1.05961967, + "epoch": 0.5046171604463255, + "flos": 515651553792.0, + "grad_norm": 0.052612789537889, + "language_loss": 0.78039354, + "learning_rate": 0.0005168201106672671, + "loss": 0.79141223, + "num_input_tokens_seen": 218439440, + "router_z_loss_mlp": 0.42236328, + "step": 2623, + "time_per_iteration": 2.7674055099487305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111898, + "balance_loss_mlp": 1.07046056, + "epoch": 0.504809542131589, + "flos": 527576606208.0, + "grad_norm": 0.08464756430959838, + "language_loss": 0.8495788, + "learning_rate": 0.0005165087414429717, + "loss": 0.86069775, + "num_input_tokens_seen": 218505936, + "router_z_loss_mlp": 0.41430664, + "step": 2624, + "time_per_iteration": 2.602158546447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117075, + "balance_loss_mlp": 1.07261038, + "epoch": 0.5050019238168526, + "flos": 553855048704.0, + "grad_norm": 0.23140620797494316, + "language_loss": 0.83667731, + "learning_rate": 0.0005161973658094144, + "loss": 0.84784812, + "num_input_tokens_seen": 218573824, + "router_z_loss_mlp": 0.44458008, + "step": 2625, + "time_per_iteration": 2.6992454528808594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108998, + "balance_loss_mlp": 1.06834817, + "epoch": 0.5051943055021162, + "flos": 574486677504.0, + "grad_norm": 0.05317382862924398, + "language_loss": 0.82239455, + "learning_rate": 0.000515885983887482, + "loss": 0.83348453, + "num_input_tokens_seen": 218648016, + "router_z_loss_mlp": 0.40649414, + "step": 2626, + "time_per_iteration": 2.7204251289367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110206, + "balance_loss_mlp": 1.06781507, + "epoch": 0.5053866871873798, + "flos": 496438516224.0, + "grad_norm": 0.08071327634258786, + "language_loss": 0.84119672, + "learning_rate": 0.0005155745957980636, + "loss": 0.85229874, + "num_input_tokens_seen": 218714128, + "router_z_loss_mlp": 0.42382812, + "step": 2627, + "time_per_iteration": 2.5813376903533936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118048, + "balance_loss_mlp": 1.0760628, + "epoch": 0.5055790688726434, + "flos": 501963084288.0, + "grad_norm": 0.04526623404133713, + "language_loss": 0.88577604, + "learning_rate": 0.000515263201662051, + "loss": 0.89695656, + "num_input_tokens_seen": 218784800, + "router_z_loss_mlp": 0.41992188, + "step": 2628, + "time_per_iteration": 2.6876380443573 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111719, + "balance_loss_mlp": 1.07625389, + "epoch": 0.5057714505579068, + "flos": 844844276736.0, + "grad_norm": 0.05588400488715087, + "language_loss": 0.82233381, + "learning_rate": 0.0005149518016003378, + "loss": 0.83350569, + "num_input_tokens_seen": 218868256, + "router_z_loss_mlp": 0.40942383, + "step": 2629, + "time_per_iteration": 3.1858632564544678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124651, + "balance_loss_mlp": 1.0810678, + "epoch": 0.5059638322431704, + "flos": 497580682752.0, + "grad_norm": 0.0555737706891176, + "language_loss": 0.82261145, + "learning_rate": 0.0005146403957338206, + "loss": 0.83385789, + "num_input_tokens_seen": 218932496, + "router_z_loss_mlp": 0.43603516, + "step": 2630, + "time_per_iteration": 2.548497438430786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118356, + "balance_loss_mlp": 1.07703853, + "epoch": 0.506156213928434, + "flos": 617526498816.0, + "grad_norm": 0.05055767229530262, + "language_loss": 0.82073247, + "learning_rate": 0.0005143289841833975, + "loss": 0.83191609, + "num_input_tokens_seen": 219010672, + "router_z_loss_mlp": 0.41308594, + "step": 2631, + "time_per_iteration": 2.847142457962036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116463, + "balance_loss_mlp": 1.07500172, + "epoch": 0.5063485956136976, + "flos": 424624923648.0, + "grad_norm": 0.06986911289391046, + "language_loss": 0.81789684, + "learning_rate": 0.0005140175670699696, + "loss": 0.82906151, + "num_input_tokens_seen": 219077104, + "router_z_loss_mlp": 0.41455078, + "step": 2632, + "time_per_iteration": 2.6268298625946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116361, + "balance_loss_mlp": 1.0729686, + "epoch": 0.5065409772989612, + "flos": 569641586688.0, + "grad_norm": 0.04802770333155415, + "language_loss": 0.8255887, + "learning_rate": 0.0005137061445144395, + "loss": 0.8367523, + "num_input_tokens_seen": 219164880, + "router_z_loss_mlp": 0.43383789, + "step": 2633, + "time_per_iteration": 2.93361759185791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106091, + "balance_loss_mlp": 1.06458259, + "epoch": 0.5067333589842247, + "flos": 628510205952.0, + "grad_norm": 0.0826873370301202, + "language_loss": 0.86646289, + "learning_rate": 0.000513394716637712, + "loss": 0.87752378, + "num_input_tokens_seen": 219237376, + "router_z_loss_mlp": 0.4152832, + "step": 2634, + "time_per_iteration": 2.8372714519500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083943, + "balance_loss_mlp": 1.06868434, + "epoch": 0.5069257406694883, + "flos": 1447062741504.0, + "grad_norm": 0.03147096823206272, + "language_loss": 0.79191709, + "learning_rate": 0.0005130832835606946, + "loss": 0.80275649, + "num_input_tokens_seen": 219467632, + "router_z_loss_mlp": 0.15234375, + "step": 2635, + "time_per_iteration": 4.893187046051025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109252, + "balance_loss_mlp": 1.06812489, + "epoch": 0.5071181223547518, + "flos": 638530656768.0, + "grad_norm": 0.046825638192595165, + "language_loss": 0.80415404, + "learning_rate": 0.0005127718454042958, + "loss": 0.81524646, + "num_input_tokens_seen": 219545392, + "router_z_loss_mlp": 0.41113281, + "step": 2636, + "time_per_iteration": 2.8583669662475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104227, + "balance_loss_mlp": 1.06250417, + "epoch": 0.5073105040400154, + "flos": 713239658496.0, + "grad_norm": 0.061804914120772665, + "language_loss": 0.84210312, + "learning_rate": 0.0005124604022894269, + "loss": 0.85314542, + "num_input_tokens_seen": 219623104, + "router_z_loss_mlp": 0.41723633, + "step": 2637, + "time_per_iteration": 2.924973726272583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047259, + "balance_loss_mlp": 1.03228605, + "epoch": 0.5075028857252789, + "flos": 1435658605056.0, + "grad_norm": 0.01918715016894911, + "language_loss": 0.77188224, + "learning_rate": 0.000512148954337001, + "loss": 0.78235483, + "num_input_tokens_seen": 219853328, + "router_z_loss_mlp": 0.14941406, + "step": 2638, + "time_per_iteration": 4.856257915496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103908, + "balance_loss_mlp": 1.06115913, + "epoch": 0.5076952674105425, + "flos": 570857946624.0, + "grad_norm": 0.0603044028086303, + "language_loss": 0.83185166, + "learning_rate": 0.0005118375016679325, + "loss": 0.84289074, + "num_input_tokens_seen": 219925024, + "router_z_loss_mlp": 0.42749023, + "step": 2639, + "time_per_iteration": 2.788266897201538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108523, + "balance_loss_mlp": 1.06651402, + "epoch": 0.5078876490958061, + "flos": 516463451136.0, + "grad_norm": 0.06423032366665075, + "language_loss": 0.8059274, + "learning_rate": 0.0005115260444031382, + "loss": 0.81701261, + "num_input_tokens_seen": 219992752, + "router_z_loss_mlp": 0.42016602, + "step": 2640, + "time_per_iteration": 2.5973188877105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036794, + "balance_loss_mlp": 1.02191687, + "epoch": 0.5080800307810697, + "flos": 1583378620416.0, + "grad_norm": 0.017407415587129545, + "language_loss": 0.78731823, + "learning_rate": 0.000511214582663537, + "loss": 0.7976861, + "num_input_tokens_seen": 220224160, + "router_z_loss_mlp": 0.1484375, + "step": 2641, + "time_per_iteration": 4.9824395179748535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107231, + "balance_loss_mlp": 1.06340933, + "epoch": 0.5082724124663333, + "flos": 484965978624.0, + "grad_norm": 0.05963770496992207, + "language_loss": 0.8711704, + "learning_rate": 0.0005109031165700483, + "loss": 0.88224268, + "num_input_tokens_seen": 220289504, + "router_z_loss_mlp": 0.43823242, + "step": 2642, + "time_per_iteration": 2.5530447959899902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103344, + "balance_loss_mlp": 1.05997539, + "epoch": 0.5084647941515967, + "flos": 681928450560.0, + "grad_norm": 0.05207490997788611, + "language_loss": 0.8334229, + "learning_rate": 0.0005105916462435945, + "loss": 0.84445643, + "num_input_tokens_seen": 220361376, + "router_z_loss_mlp": 0.43359375, + "step": 2643, + "time_per_iteration": 2.8092200756073 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101863, + "balance_loss_mlp": 1.05863762, + "epoch": 0.5086571758368603, + "flos": 548468692992.0, + "grad_norm": 0.0494294374601552, + "language_loss": 0.85464209, + "learning_rate": 0.0005102801718050989, + "loss": 0.86566073, + "num_input_tokens_seen": 220434720, + "router_z_loss_mlp": 0.43261719, + "step": 2644, + "time_per_iteration": 2.6660444736480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111917, + "balance_loss_mlp": 1.06735659, + "epoch": 0.5088495575221239, + "flos": 563751843840.0, + "grad_norm": 0.0695979688507087, + "language_loss": 0.88942361, + "learning_rate": 0.0005099686933754867, + "loss": 0.9005428, + "num_input_tokens_seen": 220506208, + "router_z_loss_mlp": 0.44580078, + "step": 2645, + "time_per_iteration": 2.673337697982788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108704, + "balance_loss_mlp": 1.06283236, + "epoch": 0.5090419392073875, + "flos": 551132757504.0, + "grad_norm": 0.05355859457172443, + "language_loss": 0.84209561, + "learning_rate": 0.0005096572110756845, + "loss": 0.85318267, + "num_input_tokens_seen": 220577456, + "router_z_loss_mlp": 0.45874023, + "step": 2646, + "time_per_iteration": 2.6638782024383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112209, + "balance_loss_mlp": 1.06686139, + "epoch": 0.509234320892651, + "flos": 567504230400.0, + "grad_norm": 0.04874041351849401, + "language_loss": 0.85460532, + "learning_rate": 0.0005093457250266205, + "loss": 0.86572737, + "num_input_tokens_seen": 220649648, + "router_z_loss_mlp": 0.45361328, + "step": 2647, + "time_per_iteration": 2.6637892723083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107252, + "balance_loss_mlp": 1.0633595, + "epoch": 0.5094267025779146, + "flos": 582339248640.0, + "grad_norm": 0.05998717956466229, + "language_loss": 0.8317883, + "learning_rate": 0.000509034235349224, + "loss": 0.84286082, + "num_input_tokens_seen": 220721168, + "router_z_loss_mlp": 0.43920898, + "step": 2648, + "time_per_iteration": 2.6878888607025146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100829, + "balance_loss_mlp": 1.05846214, + "epoch": 0.5096190842631781, + "flos": 591704953344.0, + "grad_norm": 0.05244355272630434, + "language_loss": 0.812711, + "learning_rate": 0.0005087227421644266, + "loss": 0.82371926, + "num_input_tokens_seen": 220796464, + "router_z_loss_mlp": 0.42407227, + "step": 2649, + "time_per_iteration": 2.7117576599121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106927, + "balance_loss_mlp": 1.06346333, + "epoch": 0.5098114659484417, + "flos": 513307584000.0, + "grad_norm": 0.052249476616985355, + "language_loss": 0.8603372, + "learning_rate": 0.0005084112455931602, + "loss": 0.87140644, + "num_input_tokens_seen": 220862976, + "router_z_loss_mlp": 0.43457031, + "step": 2650, + "time_per_iteration": 2.6070332527160645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106986, + "balance_loss_mlp": 1.06578696, + "epoch": 0.5100038476337053, + "flos": 484389808128.0, + "grad_norm": 0.053750245063259934, + "language_loss": 0.85138631, + "learning_rate": 0.0005080997457563586, + "loss": 0.8624562, + "num_input_tokens_seen": 220926432, + "router_z_loss_mlp": 0.41210938, + "step": 2651, + "time_per_iteration": 2.53045654296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105107, + "balance_loss_mlp": 1.06374109, + "epoch": 0.5101962293189688, + "flos": 461366157312.0, + "grad_norm": 0.06332454651149101, + "language_loss": 0.79166603, + "learning_rate": 0.0005077882427749569, + "loss": 0.80271709, + "num_input_tokens_seen": 220993008, + "router_z_loss_mlp": 0.41381836, + "step": 2652, + "time_per_iteration": 2.4946300983428955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113144, + "balance_loss_mlp": 1.07084906, + "epoch": 0.5103886110042324, + "flos": 586760937984.0, + "grad_norm": 0.06191877346451425, + "language_loss": 0.8487432, + "learning_rate": 0.0005074767367698913, + "loss": 0.85987473, + "num_input_tokens_seen": 221059248, + "router_z_loss_mlp": 0.42285156, + "step": 2653, + "time_per_iteration": 2.6763722896575928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105422, + "balance_loss_mlp": 1.06455684, + "epoch": 0.510580992689496, + "flos": 844906885632.0, + "grad_norm": 0.056937070163659766, + "language_loss": 0.83570945, + "learning_rate": 0.0005071652278620988, + "loss": 0.84676373, + "num_input_tokens_seen": 221133712, + "router_z_loss_mlp": 0.40869141, + "step": 2654, + "time_per_iteration": 3.0378835201263428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109149, + "balance_loss_mlp": 1.06706858, + "epoch": 0.5107733743747596, + "flos": 658328629248.0, + "grad_norm": 0.057649397656864075, + "language_loss": 0.83013982, + "learning_rate": 0.0005068537161725186, + "loss": 0.84123135, + "num_input_tokens_seen": 221202192, + "router_z_loss_mlp": 0.42041016, + "step": 2655, + "time_per_iteration": 2.7623610496520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105478, + "balance_loss_mlp": 1.06385016, + "epoch": 0.510965756060023, + "flos": 701426677248.0, + "grad_norm": 0.05708536741035134, + "language_loss": 0.8435111, + "learning_rate": 0.0005065422018220893, + "loss": 0.85456586, + "num_input_tokens_seen": 221277104, + "router_z_loss_mlp": 0.41601562, + "step": 2656, + "time_per_iteration": 2.823542833328247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102091, + "balance_loss_mlp": 1.06096351, + "epoch": 0.5111581377452866, + "flos": 559430489088.0, + "grad_norm": 0.05217113074905386, + "language_loss": 0.80225503, + "learning_rate": 0.0005062306849317521, + "loss": 0.81327593, + "num_input_tokens_seen": 221352320, + "router_z_loss_mlp": 0.41113281, + "step": 2657, + "time_per_iteration": 2.8275818824768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084461, + "balance_loss_mlp": 1.04314327, + "epoch": 0.5113505194305502, + "flos": 608745729024.0, + "grad_norm": 0.05701327198704139, + "language_loss": 0.83469534, + "learning_rate": 0.0005059191656224487, + "loss": 0.84553993, + "num_input_tokens_seen": 221421056, + "router_z_loss_mlp": 0.41308594, + "step": 2658, + "time_per_iteration": 2.7243552207946777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094832, + "balance_loss_mlp": 1.05158317, + "epoch": 0.5115429011158138, + "flos": 534214227456.0, + "grad_norm": 0.0458707137929394, + "language_loss": 0.89186656, + "learning_rate": 0.0005056076440151212, + "loss": 0.90281487, + "num_input_tokens_seen": 221492064, + "router_z_loss_mlp": 0.43237305, + "step": 2659, + "time_per_iteration": 2.663668632507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047323, + "balance_loss_mlp": 1.0349257, + "epoch": 0.5117352828010774, + "flos": 1361451580416.0, + "grad_norm": 0.020991592608455897, + "language_loss": 0.76288116, + "learning_rate": 0.0005052961202307133, + "loss": 0.77335441, + "num_input_tokens_seen": 221724672, + "router_z_loss_mlp": 0.12402344, + "step": 2660, + "time_per_iteration": 4.851064205169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095712, + "balance_loss_mlp": 1.05420339, + "epoch": 0.5119276644863409, + "flos": 633444046848.0, + "grad_norm": 0.05508509945890564, + "language_loss": 0.87153888, + "learning_rate": 0.0005049845943901691, + "loss": 0.882496, + "num_input_tokens_seen": 221800144, + "router_z_loss_mlp": 0.41479492, + "step": 2661, + "time_per_iteration": 2.827824831008911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085148, + "balance_loss_mlp": 1.04459286, + "epoch": 0.5121200461716044, + "flos": 585304468992.0, + "grad_norm": 0.05132624096148621, + "language_loss": 0.86219436, + "learning_rate": 0.0005046730666144338, + "loss": 0.8730458, + "num_input_tokens_seen": 221877168, + "router_z_loss_mlp": 0.40527344, + "step": 2662, + "time_per_iteration": 2.75281023979187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096524, + "balance_loss_mlp": 1.05542088, + "epoch": 0.512312427856868, + "flos": 1032081661440.0, + "grad_norm": 0.048177160037868025, + "language_loss": 0.87700105, + "learning_rate": 0.0005043615370244532, + "loss": 0.88796628, + "num_input_tokens_seen": 221964208, + "router_z_loss_mlp": 0.41113281, + "step": 2663, + "time_per_iteration": 3.3618671894073486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01028213, + "balance_loss_mlp": 1.01524341, + "epoch": 0.5125048095421316, + "flos": 1537257277440.0, + "grad_norm": 0.012858425268609664, + "language_loss": 0.78244388, + "learning_rate": 0.0005040500057411736, + "loss": 0.79272604, + "num_input_tokens_seen": 222179264, + "router_z_loss_mlp": 0.12988281, + "step": 2664, + "time_per_iteration": 4.658047914505005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093765, + "balance_loss_mlp": 1.05292368, + "epoch": 0.5126971912273951, + "flos": 590814480384.0, + "grad_norm": 0.04944817886166227, + "language_loss": 0.85279715, + "learning_rate": 0.0005037384728855425, + "loss": 0.86373478, + "num_input_tokens_seen": 222259504, + "router_z_loss_mlp": 0.40820312, + "step": 2665, + "time_per_iteration": 2.8461544513702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098547, + "balance_loss_mlp": 1.05620384, + "epoch": 0.5128895729126587, + "flos": 551393215488.0, + "grad_norm": 0.158979293172939, + "language_loss": 0.84343994, + "learning_rate": 0.0005034269385785075, + "loss": 0.85442543, + "num_input_tokens_seen": 222330512, + "router_z_loss_mlp": 0.42333984, + "step": 2666, + "time_per_iteration": 2.651714563369751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092703, + "balance_loss_mlp": 1.05222011, + "epoch": 0.5130819545979223, + "flos": 481031709696.0, + "grad_norm": 0.06506731950678159, + "language_loss": 0.84809029, + "learning_rate": 0.0005031154029410168, + "loss": 0.85901731, + "num_input_tokens_seen": 222394000, + "router_z_loss_mlp": 0.40478516, + "step": 2667, + "time_per_iteration": 2.5316364765167236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095809, + "balance_loss_mlp": 1.05368042, + "epoch": 0.5132743362831859, + "flos": 475556603904.0, + "grad_norm": 0.06903413954772, + "language_loss": 0.86695576, + "learning_rate": 0.0005028038660940197, + "loss": 0.87791383, + "num_input_tokens_seen": 222459344, + "router_z_loss_mlp": 0.42138672, + "step": 2668, + "time_per_iteration": 2.521328926086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090159, + "balance_loss_mlp": 1.04962766, + "epoch": 0.5134667179684494, + "flos": 503559175680.0, + "grad_norm": 0.047102953885103854, + "language_loss": 0.84545898, + "learning_rate": 0.0005024923281584648, + "loss": 0.85636055, + "num_input_tokens_seen": 222528912, + "router_z_loss_mlp": 0.4050293, + "step": 2669, + "time_per_iteration": 2.6462371349334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092047, + "balance_loss_mlp": 1.05330372, + "epoch": 0.5136590996537129, + "flos": 503647925760.0, + "grad_norm": 0.04719667862832961, + "language_loss": 0.82488692, + "learning_rate": 0.0005021807892553026, + "loss": 0.83580744, + "num_input_tokens_seen": 222604704, + "router_z_loss_mlp": 0.38696289, + "step": 2670, + "time_per_iteration": 2.732416868209839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094857, + "balance_loss_mlp": 1.05370605, + "epoch": 0.5138514813389765, + "flos": 624330035712.0, + "grad_norm": 0.05149766622145395, + "language_loss": 0.84497285, + "learning_rate": 0.0005018692495054828, + "loss": 0.85592139, + "num_input_tokens_seen": 222677888, + "router_z_loss_mlp": 0.41137695, + "step": 2671, + "time_per_iteration": 2.760014533996582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092038, + "balance_loss_mlp": 1.05174494, + "epoch": 0.5140438630242401, + "flos": 583274801664.0, + "grad_norm": 0.05511271146100304, + "language_loss": 0.80692601, + "learning_rate": 0.0005015577090299561, + "loss": 0.81784636, + "num_input_tokens_seen": 222751936, + "router_z_loss_mlp": 0.40283203, + "step": 2672, + "time_per_iteration": 2.6871819496154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102168, + "balance_loss_mlp": 1.06046844, + "epoch": 0.5142362447095037, + "flos": 487683887616.0, + "grad_norm": 0.05906966789334332, + "language_loss": 0.86718851, + "learning_rate": 0.0005012461679496729, + "loss": 0.87821019, + "num_input_tokens_seen": 222819616, + "router_z_loss_mlp": 0.41674805, + "step": 2673, + "time_per_iteration": 2.573075771331787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111889, + "balance_loss_mlp": 1.06968939, + "epoch": 0.5144286263947672, + "flos": 526601765376.0, + "grad_norm": 0.050226260736663565, + "language_loss": 0.87357539, + "learning_rate": 0.0005009346263855848, + "loss": 0.88469428, + "num_input_tokens_seen": 222888448, + "router_z_loss_mlp": 0.42211914, + "step": 2674, + "time_per_iteration": 2.6014504432678223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100642, + "balance_loss_mlp": 1.06106424, + "epoch": 0.5146210080800308, + "flos": 486252149760.0, + "grad_norm": 0.047502810841318265, + "language_loss": 0.8393209, + "learning_rate": 0.0005006230844586422, + "loss": 0.85032737, + "num_input_tokens_seen": 222964736, + "router_z_loss_mlp": 0.39599609, + "step": 2675, + "time_per_iteration": 2.7817234992980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102843, + "balance_loss_mlp": 1.06152487, + "epoch": 0.5148133897652943, + "flos": 515622440448.0, + "grad_norm": 0.04472754928085029, + "language_loss": 0.79101396, + "learning_rate": 0.0005003115422897968, + "loss": 0.80204242, + "num_input_tokens_seen": 223040944, + "router_z_loss_mlp": 0.4128418, + "step": 2676, + "time_per_iteration": 2.72664213180542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102609, + "balance_loss_mlp": 1.06243563, + "epoch": 0.5150057714505579, + "flos": 510963614208.0, + "grad_norm": 0.061230997357755966, + "language_loss": 0.86760038, + "learning_rate": 0.0005, + "loss": 0.87862647, + "num_input_tokens_seen": 223109632, + "router_z_loss_mlp": 0.40161133, + "step": 2677, + "time_per_iteration": 2.6518850326538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095319, + "balance_loss_mlp": 1.05648041, + "epoch": 0.5151981531358215, + "flos": 910541164032.0, + "grad_norm": 0.056847893934042666, + "language_loss": 0.79409456, + "learning_rate": 0.0004996884577102033, + "loss": 0.80504775, + "num_input_tokens_seen": 223191648, + "router_z_loss_mlp": 0.38818359, + "step": 2678, + "time_per_iteration": 3.0679850578308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096526, + "balance_loss_mlp": 1.05623293, + "epoch": 0.515390534821085, + "flos": 471599013888.0, + "grad_norm": 0.047432465044858714, + "language_loss": 0.8447082, + "learning_rate": 0.000499376915541358, + "loss": 0.85567349, + "num_input_tokens_seen": 223265920, + "router_z_loss_mlp": 0.40283203, + "step": 2679, + "time_per_iteration": 2.6785037517547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099943, + "balance_loss_mlp": 1.06086659, + "epoch": 0.5155829165063486, + "flos": 649811137536.0, + "grad_norm": 0.04795230358992159, + "language_loss": 0.81296241, + "learning_rate": 0.0004990653736144155, + "loss": 0.82396191, + "num_input_tokens_seen": 223340688, + "router_z_loss_mlp": 0.390625, + "step": 2680, + "time_per_iteration": 2.840188980102539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100494, + "balance_loss_mlp": 1.06072533, + "epoch": 0.5157752981916122, + "flos": 414038476800.0, + "grad_norm": 0.062126395708719404, + "language_loss": 0.86077356, + "learning_rate": 0.0004987538320503271, + "loss": 0.87177849, + "num_input_tokens_seen": 223404064, + "router_z_loss_mlp": 0.3972168, + "step": 2681, + "time_per_iteration": 2.4594664573669434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100598, + "balance_loss_mlp": 1.06054354, + "epoch": 0.5159676798768758, + "flos": 553569859584.0, + "grad_norm": 0.05537703124714055, + "language_loss": 0.82735646, + "learning_rate": 0.0004984422909700442, + "loss": 0.83836246, + "num_input_tokens_seen": 223476784, + "router_z_loss_mlp": 0.39990234, + "step": 2682, + "time_per_iteration": 2.66052508354187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091816, + "balance_loss_mlp": 1.05292952, + "epoch": 0.5161600615621393, + "flos": 586234229760.0, + "grad_norm": 0.051780542585777085, + "language_loss": 0.83951235, + "learning_rate": 0.0004981307504945173, + "loss": 0.85043043, + "num_input_tokens_seen": 223542832, + "router_z_loss_mlp": 0.38867188, + "step": 2683, + "time_per_iteration": 2.6698381900787354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109715, + "balance_loss_mlp": 1.05766809, + "epoch": 0.5163524432474028, + "flos": 588568025088.0, + "grad_norm": 0.05164690349476628, + "language_loss": 0.8939817, + "learning_rate": 0.0004978192107446976, + "loss": 0.90495312, + "num_input_tokens_seen": 223617968, + "router_z_loss_mlp": 0.39428711, + "step": 2684, + "time_per_iteration": 2.7249348163604736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095053, + "balance_loss_mlp": 1.05325842, + "epoch": 0.5165448249326664, + "flos": 503642133504.0, + "grad_norm": 0.05677264338484585, + "language_loss": 0.87172639, + "learning_rate": 0.0004975076718415353, + "loss": 0.8826769, + "num_input_tokens_seen": 223689504, + "router_z_loss_mlp": 0.41796875, + "step": 2685, + "time_per_iteration": 2.599235773086548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086892, + "balance_loss_mlp": 1.04676652, + "epoch": 0.51673720661793, + "flos": 416539597824.0, + "grad_norm": 0.05087662124677675, + "language_loss": 0.90954995, + "learning_rate": 0.0004971961339059806, + "loss": 0.92041892, + "num_input_tokens_seen": 223752288, + "router_z_loss_mlp": 0.40112305, + "step": 2686, + "time_per_iteration": 2.4647631645202637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091735, + "balance_loss_mlp": 1.04986906, + "epoch": 0.5169295883031936, + "flos": 598696164864.0, + "grad_norm": 0.1190187036629449, + "language_loss": 0.83923638, + "learning_rate": 0.0004968845970589832, + "loss": 0.85015374, + "num_input_tokens_seen": 223822304, + "router_z_loss_mlp": 0.41870117, + "step": 2687, + "time_per_iteration": 2.6631908416748047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087993, + "balance_loss_mlp": 1.04793859, + "epoch": 0.517121969988457, + "flos": 556543844352.0, + "grad_norm": 0.06869038553700607, + "language_loss": 0.8455354, + "learning_rate": 0.0004965730614214926, + "loss": 0.85641533, + "num_input_tokens_seen": 223888592, + "router_z_loss_mlp": 0.40039062, + "step": 2688, + "time_per_iteration": 2.628286361694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098374, + "balance_loss_mlp": 1.05576849, + "epoch": 0.5173143516737206, + "flos": 469214346240.0, + "grad_norm": 0.05001993876024353, + "language_loss": 0.85256827, + "learning_rate": 0.0004962615271144576, + "loss": 0.86355197, + "num_input_tokens_seen": 223952880, + "router_z_loss_mlp": 0.42602539, + "step": 2689, + "time_per_iteration": 2.5224428176879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091114, + "balance_loss_mlp": 1.05017805, + "epoch": 0.5175067333589842, + "flos": 719739067392.0, + "grad_norm": 0.0600896413832987, + "language_loss": 0.82435369, + "learning_rate": 0.0004959499942588264, + "loss": 0.8352648, + "num_input_tokens_seen": 224030000, + "router_z_loss_mlp": 0.40917969, + "step": 2690, + "time_per_iteration": 2.923792600631714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043722, + "balance_loss_mlp": 1.02932107, + "epoch": 0.5176991150442478, + "flos": 1465402834944.0, + "grad_norm": 0.02659438930583784, + "language_loss": 0.78200024, + "learning_rate": 0.0004956384629755469, + "loss": 0.79243743, + "num_input_tokens_seen": 224252384, + "router_z_loss_mlp": 0.14355469, + "step": 2691, + "time_per_iteration": 4.779648542404175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089552, + "balance_loss_mlp": 1.04863954, + "epoch": 0.5178914967295114, + "flos": 612345346560.0, + "grad_norm": 0.05374555179207371, + "language_loss": 0.85215712, + "learning_rate": 0.0004953269333855661, + "loss": 0.86305267, + "num_input_tokens_seen": 224324640, + "router_z_loss_mlp": 0.40917969, + "step": 2692, + "time_per_iteration": 2.7646090984344482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086121, + "balance_loss_mlp": 1.04604328, + "epoch": 0.5180838784147749, + "flos": 500663766528.0, + "grad_norm": 0.05670677168127033, + "language_loss": 0.84148359, + "learning_rate": 0.0004950154056098309, + "loss": 0.85234475, + "num_input_tokens_seen": 224398368, + "router_z_loss_mlp": 0.40039062, + "step": 2693, + "time_per_iteration": 2.7038145065307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088689, + "balance_loss_mlp": 1.0469892, + "epoch": 0.5182762601000385, + "flos": 688531166208.0, + "grad_norm": 0.05599909013755839, + "language_loss": 0.84343493, + "learning_rate": 0.0004947038797692867, + "loss": 0.85432184, + "num_input_tokens_seen": 224465456, + "router_z_loss_mlp": 0.41699219, + "step": 2694, + "time_per_iteration": 2.8155903816223145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092198, + "balance_loss_mlp": 1.05147612, + "epoch": 0.518468641785302, + "flos": 665315458560.0, + "grad_norm": 0.046372715162849826, + "language_loss": 0.77593923, + "learning_rate": 0.0004943923559848789, + "loss": 0.7868613, + "num_input_tokens_seen": 224540960, + "router_z_loss_mlp": 0.40698242, + "step": 2695, + "time_per_iteration": 2.787229061126709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086627, + "balance_loss_mlp": 1.04714453, + "epoch": 0.5186610234705656, + "flos": 566440639488.0, + "grad_norm": 0.05332286724917534, + "language_loss": 0.89972508, + "learning_rate": 0.0004940808343775515, + "loss": 0.9105913, + "num_input_tokens_seen": 224613200, + "router_z_loss_mlp": 0.39453125, + "step": 2696, + "time_per_iteration": 2.6648201942443848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083742, + "balance_loss_mlp": 1.04292464, + "epoch": 0.5188534051558291, + "flos": 428652324864.0, + "grad_norm": 0.055572994373314345, + "language_loss": 0.82251114, + "learning_rate": 0.0004937693150682479, + "loss": 0.83334857, + "num_input_tokens_seen": 224677456, + "router_z_loss_mlp": 0.40820312, + "step": 2697, + "time_per_iteration": 2.5013554096221924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089603, + "balance_loss_mlp": 1.04804635, + "epoch": 0.5190457868410927, + "flos": 546085435392.0, + "grad_norm": 0.05634548635888483, + "language_loss": 0.7652837, + "learning_rate": 0.0004934577981779107, + "loss": 0.77617967, + "num_input_tokens_seen": 224745600, + "router_z_loss_mlp": 0.41552734, + "step": 2698, + "time_per_iteration": 2.7512943744659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092838, + "balance_loss_mlp": 1.04958856, + "epoch": 0.5192381685263563, + "flos": 548321716224.0, + "grad_norm": 0.04670174030259061, + "language_loss": 0.81419832, + "learning_rate": 0.0004931462838274817, + "loss": 0.82512677, + "num_input_tokens_seen": 224826944, + "router_z_loss_mlp": 0.43237305, + "step": 2699, + "time_per_iteration": 2.8294084072113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082299, + "balance_loss_mlp": 1.04296041, + "epoch": 0.5194305502116199, + "flos": 574993036800.0, + "grad_norm": 0.05440575131052059, + "language_loss": 0.83835357, + "learning_rate": 0.0004928347721379011, + "loss": 0.84917653, + "num_input_tokens_seen": 224895280, + "router_z_loss_mlp": 0.39331055, + "step": 2700, + "time_per_iteration": 2.643941879272461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084407, + "balance_loss_mlp": 1.04485357, + "epoch": 0.5196229318968835, + "flos": 434019741696.0, + "grad_norm": 0.054958496552239416, + "language_loss": 0.81611145, + "learning_rate": 0.0004925232632301089, + "loss": 0.8269555, + "num_input_tokens_seen": 224961632, + "router_z_loss_mlp": 0.39526367, + "step": 2701, + "time_per_iteration": 2.5408122539520264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084434, + "balance_loss_mlp": 1.04638255, + "epoch": 0.5198153135821469, + "flos": 558607007232.0, + "grad_norm": 0.05193596738822722, + "language_loss": 0.79534626, + "learning_rate": 0.0004922117572250431, + "loss": 0.80619061, + "num_input_tokens_seen": 225032816, + "router_z_loss_mlp": 0.38037109, + "step": 2702, + "time_per_iteration": 2.6687467098236084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078961, + "balance_loss_mlp": 1.04152906, + "epoch": 0.5200076952674105, + "flos": 565397397504.0, + "grad_norm": 0.04814908286006495, + "language_loss": 0.80652344, + "learning_rate": 0.0004919002542436414, + "loss": 0.81731308, + "num_input_tokens_seen": 225112736, + "router_z_loss_mlp": 0.37451172, + "step": 2703, + "time_per_iteration": 2.811460256576538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085168, + "balance_loss_mlp": 1.04644859, + "epoch": 0.5202000769526741, + "flos": 570916173312.0, + "grad_norm": 0.05555982935463854, + "language_loss": 0.81149572, + "learning_rate": 0.0004915887544068399, + "loss": 0.8223474, + "num_input_tokens_seen": 225182672, + "router_z_loss_mlp": 0.38720703, + "step": 2704, + "time_per_iteration": 2.6499714851379395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093505, + "balance_loss_mlp": 1.05199671, + "epoch": 0.5203924586379377, + "flos": 693898583040.0, + "grad_norm": 0.050837486186397586, + "language_loss": 0.77994883, + "learning_rate": 0.0004912772578355736, + "loss": 0.7908839, + "num_input_tokens_seen": 225260272, + "router_z_loss_mlp": 0.41503906, + "step": 2705, + "time_per_iteration": 2.8637514114379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094973, + "balance_loss_mlp": 1.0555619, + "epoch": 0.5205848403232012, + "flos": 566215087104.0, + "grad_norm": 0.054100857686445215, + "language_loss": 0.8301729, + "learning_rate": 0.000490965764650776, + "loss": 0.84112263, + "num_input_tokens_seen": 225337120, + "router_z_loss_mlp": 0.39404297, + "step": 2706, + "time_per_iteration": 2.8644323348999023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085669, + "balance_loss_mlp": 1.04661632, + "epoch": 0.5207772220084648, + "flos": 1213775539200.0, + "grad_norm": 0.05228956126941533, + "language_loss": 0.82813179, + "learning_rate": 0.0004906542749733798, + "loss": 0.83898848, + "num_input_tokens_seen": 225433984, + "router_z_loss_mlp": 0.39013672, + "step": 2707, + "time_per_iteration": 3.6128242015838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084528, + "balance_loss_mlp": 1.04635715, + "epoch": 0.5209696036937284, + "flos": 592547374080.0, + "grad_norm": 0.12708447176708407, + "language_loss": 0.84871459, + "learning_rate": 0.0004903427889243156, + "loss": 0.85955989, + "num_input_tokens_seen": 225512112, + "router_z_loss_mlp": 0.38134766, + "step": 2708, + "time_per_iteration": 2.86226487159729 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109093, + "balance_loss_mlp": 1.05211544, + "epoch": 0.5211619853789919, + "flos": 522623826432.0, + "grad_norm": 0.05348625186790992, + "language_loss": 0.85548282, + "learning_rate": 0.0004900313066245134, + "loss": 0.86639208, + "num_input_tokens_seen": 225586944, + "router_z_loss_mlp": 0.38818359, + "step": 2709, + "time_per_iteration": 2.662485122680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081893, + "balance_loss_mlp": 1.0432452, + "epoch": 0.5213543670642555, + "flos": 502534872576.0, + "grad_norm": 0.050688452880556414, + "language_loss": 0.80490649, + "learning_rate": 0.0004897198281949012, + "loss": 0.81572545, + "num_input_tokens_seen": 225657184, + "router_z_loss_mlp": 0.38647461, + "step": 2710, + "time_per_iteration": 2.6449263095855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085994, + "balance_loss_mlp": 1.04636908, + "epoch": 0.521546748749519, + "flos": 585682790400.0, + "grad_norm": 0.05860885905894002, + "language_loss": 0.77534401, + "learning_rate": 0.0004894083537564057, + "loss": 0.78620392, + "num_input_tokens_seen": 225729968, + "router_z_loss_mlp": 0.39599609, + "step": 2711, + "time_per_iteration": 2.7473373413085938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083493, + "balance_loss_mlp": 1.04458284, + "epoch": 0.5217391304347826, + "flos": 569833643520.0, + "grad_norm": 0.04954385524753536, + "language_loss": 0.80801934, + "learning_rate": 0.0004890968834299519, + "loss": 0.81885427, + "num_input_tokens_seen": 225801808, + "router_z_loss_mlp": 0.38867188, + "step": 2712, + "time_per_iteration": 2.7709779739379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084865, + "balance_loss_mlp": 1.04621696, + "epoch": 0.5219315121200462, + "flos": 542501784576.0, + "grad_norm": 0.06807472429400872, + "language_loss": 0.78801876, + "learning_rate": 0.0004887854173364633, + "loss": 0.7988674, + "num_input_tokens_seen": 225878576, + "router_z_loss_mlp": 0.38623047, + "step": 2713, + "time_per_iteration": 2.710489273071289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084971, + "balance_loss_mlp": 1.04713416, + "epoch": 0.5221238938053098, + "flos": 550006557696.0, + "grad_norm": 0.048000843690728094, + "language_loss": 0.81816071, + "learning_rate": 0.0004884739555968617, + "loss": 0.82901043, + "num_input_tokens_seen": 225960096, + "router_z_loss_mlp": 0.37866211, + "step": 2714, + "time_per_iteration": 2.8097493648529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01030446, + "balance_loss_mlp": 1.01785719, + "epoch": 0.5223162754905732, + "flos": 1354373028864.0, + "grad_norm": 0.016208306264550634, + "language_loss": 0.78977054, + "learning_rate": 0.0004881624983320676, + "loss": 0.80007499, + "num_input_tokens_seen": 226184960, + "router_z_loss_mlp": 0.12597656, + "step": 2715, + "time_per_iteration": 4.9789557456970215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083164, + "balance_loss_mlp": 1.04444456, + "epoch": 0.5225086571758368, + "flos": 567441621504.0, + "grad_norm": 0.04806245104826077, + "language_loss": 0.86670554, + "learning_rate": 0.0004878510456629992, + "loss": 0.87753725, + "num_input_tokens_seen": 226271328, + "router_z_loss_mlp": 0.38696289, + "step": 2716, + "time_per_iteration": 3.015443801879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084237, + "balance_loss_mlp": 1.0459466, + "epoch": 0.5227010388611004, + "flos": 499914478080.0, + "grad_norm": 0.051081355886524536, + "language_loss": 0.85046101, + "learning_rate": 0.00048753959771057314, + "loss": 0.86130333, + "num_input_tokens_seen": 226340080, + "router_z_loss_mlp": 0.3828125, + "step": 2717, + "time_per_iteration": 2.623352289199829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084285, + "balance_loss_mlp": 1.04539871, + "epoch": 0.522893420546364, + "flos": 597372115968.0, + "grad_norm": 0.0531417340924391, + "language_loss": 0.82181746, + "learning_rate": 0.0004872281545957044, + "loss": 0.83266038, + "num_input_tokens_seen": 226415120, + "router_z_loss_mlp": 0.38842773, + "step": 2718, + "time_per_iteration": 2.7300612926483154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080864, + "balance_loss_mlp": 1.04154897, + "epoch": 0.5230858022316276, + "flos": 664278008832.0, + "grad_norm": 0.05093940259468129, + "language_loss": 0.85964847, + "learning_rate": 0.0004869167164393055, + "loss": 0.87045711, + "num_input_tokens_seen": 226501200, + "router_z_loss_mlp": 0.39306641, + "step": 2719, + "time_per_iteration": 2.9219412803649902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108043, + "balance_loss_mlp": 1.04206884, + "epoch": 0.5232781839168911, + "flos": 603547047936.0, + "grad_norm": 0.04294663688852852, + "language_loss": 0.89195794, + "learning_rate": 0.00048660528336228793, + "loss": 0.90276217, + "num_input_tokens_seen": 226582064, + "router_z_loss_mlp": 0.38330078, + "step": 2720, + "time_per_iteration": 2.7792000770568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076128, + "balance_loss_mlp": 1.03781438, + "epoch": 0.5234705656021547, + "flos": 550438723584.0, + "grad_norm": 0.04780199229625597, + "language_loss": 0.90052795, + "learning_rate": 0.0004862938554855606, + "loss": 0.91128922, + "num_input_tokens_seen": 226656448, + "router_z_loss_mlp": 0.3828125, + "step": 2721, + "time_per_iteration": 2.781075954437256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083526, + "balance_loss_mlp": 1.04509294, + "epoch": 0.5236629472874182, + "flos": 504026247168.0, + "grad_norm": 0.06026541291367098, + "language_loss": 0.85920995, + "learning_rate": 0.0004859824329300304, + "loss": 0.87004519, + "num_input_tokens_seen": 226725568, + "router_z_loss_mlp": 0.3840332, + "step": 2722, + "time_per_iteration": 2.5523464679718018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078682, + "balance_loss_mlp": 1.04043949, + "epoch": 0.5238553289726818, + "flos": 547394927616.0, + "grad_norm": 0.04759572809953804, + "language_loss": 0.83678633, + "learning_rate": 0.00048567101581660244, + "loss": 0.84757316, + "num_input_tokens_seen": 226795728, + "router_z_loss_mlp": 0.38208008, + "step": 2723, + "time_per_iteration": 2.62168288230896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081139, + "balance_loss_mlp": 1.04208636, + "epoch": 0.5240477106579453, + "flos": 531702931968.0, + "grad_norm": 0.060086559712579084, + "language_loss": 0.87061596, + "learning_rate": 0.00048535960426617956, + "loss": 0.88142729, + "num_input_tokens_seen": 226865344, + "router_z_loss_mlp": 0.39038086, + "step": 2724, + "time_per_iteration": 2.5913078784942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081015, + "balance_loss_mlp": 1.04208124, + "epoch": 0.5242400923432089, + "flos": 617653126656.0, + "grad_norm": 0.05554996608046291, + "language_loss": 0.81582165, + "learning_rate": 0.0004850481983996621, + "loss": 0.82663178, + "num_input_tokens_seen": 226936800, + "router_z_loss_mlp": 0.3894043, + "step": 2725, + "time_per_iteration": 2.744001865386963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083025, + "balance_loss_mlp": 1.04366207, + "epoch": 0.5244324740284725, + "flos": 416461022208.0, + "grad_norm": 0.051041166575027594, + "language_loss": 0.87690443, + "learning_rate": 0.0004847367983379492, + "loss": 0.88773465, + "num_input_tokens_seen": 226998448, + "router_z_loss_mlp": 0.39331055, + "step": 2726, + "time_per_iteration": 2.452622652053833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081299, + "balance_loss_mlp": 1.04327154, + "epoch": 0.5246248557137361, + "flos": 626113801728.0, + "grad_norm": 0.0465947896589182, + "language_loss": 0.7866348, + "learning_rate": 0.00048442540420193643, + "loss": 0.7974478, + "num_input_tokens_seen": 227081872, + "router_z_loss_mlp": 0.38012695, + "step": 2727, + "time_per_iteration": 2.8958897590637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085515, + "balance_loss_mlp": 1.04524565, + "epoch": 0.5248172373989997, + "flos": 1247980746240.0, + "grad_norm": 0.0639927904505779, + "language_loss": 0.79006433, + "learning_rate": 0.0004841140161125182, + "loss": 0.80091947, + "num_input_tokens_seen": 227167744, + "router_z_loss_mlp": 0.40234375, + "step": 2728, + "time_per_iteration": 3.5769736766815186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109207, + "balance_loss_mlp": 1.05370796, + "epoch": 0.5250096190842631, + "flos": 506616118272.0, + "grad_norm": 0.05909227072060698, + "language_loss": 0.84801137, + "learning_rate": 0.0004838026341905857, + "loss": 0.85893214, + "num_input_tokens_seen": 227239136, + "router_z_loss_mlp": 0.38354492, + "step": 2729, + "time_per_iteration": 2.6979076862335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082745, + "balance_loss_mlp": 1.04476523, + "epoch": 0.5252020007695267, + "flos": 611021297664.0, + "grad_norm": 0.0531469423300266, + "language_loss": 0.85391581, + "learning_rate": 0.00048349125855702844, + "loss": 0.86474323, + "num_input_tokens_seen": 227311968, + "router_z_loss_mlp": 0.37915039, + "step": 2730, + "time_per_iteration": 2.7757534980773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084625, + "balance_loss_mlp": 1.04669309, + "epoch": 0.5253943824547903, + "flos": 538970568192.0, + "grad_norm": 0.04649712268604906, + "language_loss": 0.81255782, + "learning_rate": 0.00048317988933273287, + "loss": 0.82340407, + "num_input_tokens_seen": 227385248, + "router_z_loss_mlp": 0.37939453, + "step": 2731, + "time_per_iteration": 2.7401769161224365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094943, + "balance_loss_mlp": 1.05476904, + "epoch": 0.5255867641400539, + "flos": 697714988544.0, + "grad_norm": 0.05136039584795155, + "language_loss": 0.82178587, + "learning_rate": 0.00048286852663858367, + "loss": 0.8327353, + "num_input_tokens_seen": 227464640, + "router_z_loss_mlp": 0.40161133, + "step": 2732, + "time_per_iteration": 2.9572720527648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088204, + "balance_loss_mlp": 1.05084419, + "epoch": 0.5257791458253175, + "flos": 666975568896.0, + "grad_norm": 0.08443038207797475, + "language_loss": 0.83823925, + "learning_rate": 0.000482557170595462, + "loss": 0.84912133, + "num_input_tokens_seen": 227542192, + "router_z_loss_mlp": 0.37304688, + "step": 2733, + "time_per_iteration": 2.881659746170044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092705, + "balance_loss_mlp": 1.05443931, + "epoch": 0.525971527510581, + "flos": 483375679488.0, + "grad_norm": 0.04826672636793544, + "language_loss": 0.87744856, + "learning_rate": 0.0004822458213242475, + "loss": 0.88837564, + "num_input_tokens_seen": 227606096, + "router_z_loss_mlp": 0.38232422, + "step": 2734, + "time_per_iteration": 2.5599043369293213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090689, + "balance_loss_mlp": 1.05270863, + "epoch": 0.5261639091958445, + "flos": 829559715840.0, + "grad_norm": 0.055467035242162094, + "language_loss": 0.85945731, + "learning_rate": 0.00048193447894581627, + "loss": 0.87036419, + "num_input_tokens_seen": 227689552, + "router_z_loss_mlp": 0.37988281, + "step": 2735, + "time_per_iteration": 3.1253552436828613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102448, + "balance_loss_mlp": 1.06258464, + "epoch": 0.5263562908811081, + "flos": 520461739008.0, + "grad_norm": 0.05936611315903256, + "language_loss": 0.87591684, + "learning_rate": 0.00048162314358104243, + "loss": 0.88694137, + "num_input_tokens_seen": 227760784, + "router_z_loss_mlp": 0.39868164, + "step": 2736, + "time_per_iteration": 2.5996334552764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094957, + "balance_loss_mlp": 1.05704832, + "epoch": 0.5265486725663717, + "flos": 574722404352.0, + "grad_norm": 0.047689297469847035, + "language_loss": 0.82871807, + "learning_rate": 0.0004813118153507969, + "loss": 0.83966762, + "num_input_tokens_seen": 227834304, + "router_z_loss_mlp": 0.37890625, + "step": 2737, + "time_per_iteration": 2.7455976009368896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058466, + "balance_loss_mlp": 1.04540098, + "epoch": 0.5267410542516352, + "flos": 1546439537664.0, + "grad_norm": 0.021507379855054985, + "language_loss": 0.82447124, + "learning_rate": 0.0004810004943759482, + "loss": 0.83505595, + "num_input_tokens_seen": 228057232, + "router_z_loss_mlp": 0.13085938, + "step": 2738, + "time_per_iteration": 4.774937629699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110016, + "balance_loss_mlp": 1.06184578, + "epoch": 0.5269334359368988, + "flos": 929576701440.0, + "grad_norm": 0.045277698895202834, + "language_loss": 0.83199632, + "learning_rate": 0.00048068918077736163, + "loss": 0.84299791, + "num_input_tokens_seen": 228140816, + "router_z_loss_mlp": 0.38305664, + "step": 2739, + "time_per_iteration": 3.253458261489868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102121, + "balance_loss_mlp": 1.06256771, + "epoch": 0.5271258176221624, + "flos": 655079629824.0, + "grad_norm": 0.05720476143842487, + "language_loss": 0.81167477, + "learning_rate": 0.0004803778746759001, + "loss": 0.82269597, + "num_input_tokens_seen": 228216208, + "router_z_loss_mlp": 0.39526367, + "step": 2740, + "time_per_iteration": 2.890253782272339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095422, + "balance_loss_mlp": 1.05777621, + "epoch": 0.527318199307426, + "flos": 542781181440.0, + "grad_norm": 0.064499445698322, + "language_loss": 0.81573081, + "learning_rate": 0.00048006657619242317, + "loss": 0.82668501, + "num_input_tokens_seen": 228283184, + "router_z_loss_mlp": 0.37646484, + "step": 2741, + "time_per_iteration": 2.696274518966675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104347, + "balance_loss_mlp": 1.06419694, + "epoch": 0.5275105809926895, + "flos": 447629635584.0, + "grad_norm": 0.05845576302131632, + "language_loss": 0.78272831, + "learning_rate": 0.00047975528544778775, + "loss": 0.79377174, + "num_input_tokens_seen": 228351328, + "router_z_loss_mlp": 0.40112305, + "step": 2742, + "time_per_iteration": 2.6140294075012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094189, + "balance_loss_mlp": 1.05508804, + "epoch": 0.527702962677953, + "flos": 578656673280.0, + "grad_norm": 0.058395918180573554, + "language_loss": 0.88265073, + "learning_rate": 0.00047944400256284754, + "loss": 0.89359266, + "num_input_tokens_seen": 228423632, + "router_z_loss_mlp": 0.39086914, + "step": 2743, + "time_per_iteration": 2.7063393592834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097827, + "balance_loss_mlp": 1.0614922, + "epoch": 0.5278953443632166, + "flos": 652465027584.0, + "grad_norm": 0.07282412653967131, + "language_loss": 0.79796684, + "learning_rate": 0.0004791327276584532, + "loss": 0.80894512, + "num_input_tokens_seen": 228498736, + "router_z_loss_mlp": 0.36352539, + "step": 2744, + "time_per_iteration": 2.8260412216186523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109844, + "balance_loss_mlp": 1.06031692, + "epoch": 0.5280877260484802, + "flos": 513741159936.0, + "grad_norm": 0.04991281876590649, + "language_loss": 0.80703586, + "learning_rate": 0.00047882146085545264, + "loss": 0.81802028, + "num_input_tokens_seen": 228569056, + "router_z_loss_mlp": 0.38061523, + "step": 2745, + "time_per_iteration": 2.6051464080810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018989, + "balance_loss_mlp": 1.00611436, + "epoch": 0.5282801077337438, + "flos": 1444650370560.0, + "grad_norm": 0.010819489631099216, + "language_loss": 0.75402379, + "learning_rate": 0.00047851020227469, + "loss": 0.76421368, + "num_input_tokens_seen": 228800560, + "router_z_loss_mlp": 0.12890625, + "step": 2746, + "time_per_iteration": 4.9944517612457275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084488, + "balance_loss_mlp": 1.0470562, + "epoch": 0.5284724894190073, + "flos": 604580115456.0, + "grad_norm": 0.058273426421755106, + "language_loss": 0.79290295, + "learning_rate": 0.00047819895203700684, + "loss": 0.80374789, + "num_input_tokens_seen": 228869216, + "router_z_loss_mlp": 0.37451172, + "step": 2747, + "time_per_iteration": 2.728018045425415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016603, + "balance_loss_mlp": 1.00410998, + "epoch": 0.5286648711042709, + "flos": 1494172224000.0, + "grad_norm": 0.012264329558562137, + "language_loss": 0.75512433, + "learning_rate": 0.0004778877102632412, + "loss": 0.76529038, + "num_input_tokens_seen": 229085520, + "router_z_loss_mlp": 0.125, + "step": 2748, + "time_per_iteration": 4.659038782119751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077352, + "balance_loss_mlp": 1.03860867, + "epoch": 0.5288572527895344, + "flos": 597313889280.0, + "grad_norm": 0.056212558578819974, + "language_loss": 0.88259304, + "learning_rate": 0.0004775764770742277, + "loss": 0.89336658, + "num_input_tokens_seen": 229160912, + "router_z_loss_mlp": 0.38720703, + "step": 2749, + "time_per_iteration": 2.845102548599243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086383, + "balance_loss_mlp": 1.04699659, + "epoch": 0.529049634474798, + "flos": 557041439232.0, + "grad_norm": 0.05924821658857843, + "language_loss": 0.86565638, + "learning_rate": 0.00047726525259079777, + "loss": 0.87652022, + "num_input_tokens_seen": 229235792, + "router_z_loss_mlp": 0.39404297, + "step": 2750, + "time_per_iteration": 2.773296356201172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085746, + "balance_loss_mlp": 1.04793251, + "epoch": 0.5292420161600616, + "flos": 580986086400.0, + "grad_norm": 0.05670035904014211, + "language_loss": 0.885436, + "learning_rate": 0.0004769540369337798, + "loss": 0.89629346, + "num_input_tokens_seen": 229309984, + "router_z_loss_mlp": 0.37792969, + "step": 2751, + "time_per_iteration": 2.715921401977539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084379, + "balance_loss_mlp": 1.04563594, + "epoch": 0.5294343978453251, + "flos": 607989086208.0, + "grad_norm": 0.05448198338431079, + "language_loss": 0.86051679, + "learning_rate": 0.00047664283022399794, + "loss": 0.87136054, + "num_input_tokens_seen": 229394000, + "router_z_loss_mlp": 0.38720703, + "step": 2752, + "time_per_iteration": 2.8683502674102783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078933, + "balance_loss_mlp": 1.04078627, + "epoch": 0.5296267795305887, + "flos": 646226076672.0, + "grad_norm": 0.05827570747642561, + "language_loss": 0.81129229, + "learning_rate": 0.00047633163258227376, + "loss": 0.82208163, + "num_input_tokens_seen": 229474320, + "router_z_loss_mlp": 0.38110352, + "step": 2753, + "time_per_iteration": 2.8427987098693848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084727, + "balance_loss_mlp": 1.04595971, + "epoch": 0.5298191612158523, + "flos": 559482923520.0, + "grad_norm": 0.14342502720880523, + "language_loss": 0.85232151, + "learning_rate": 0.0004760204441294247, + "loss": 0.86316884, + "num_input_tokens_seen": 229543072, + "router_z_loss_mlp": 0.38745117, + "step": 2754, + "time_per_iteration": 2.644049882888794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088138, + "balance_loss_mlp": 1.05096865, + "epoch": 0.5300115429011159, + "flos": 513776065536.0, + "grad_norm": 0.052931776937271004, + "language_loss": 0.86139393, + "learning_rate": 0.00047570926498626486, + "loss": 0.87227535, + "num_input_tokens_seen": 229615296, + "router_z_loss_mlp": 0.37133789, + "step": 2755, + "time_per_iteration": 2.6872901916503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092723, + "balance_loss_mlp": 1.05402756, + "epoch": 0.5302039245863793, + "flos": 672475405824.0, + "grad_norm": 0.0470441247054563, + "language_loss": 0.81654894, + "learning_rate": 0.00047539809527360474, + "loss": 0.82747614, + "num_input_tokens_seen": 229693728, + "router_z_loss_mlp": 0.38696289, + "step": 2756, + "time_per_iteration": 2.865553379058838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093267, + "balance_loss_mlp": 1.05488133, + "epoch": 0.5303963062716429, + "flos": 730507396608.0, + "grad_norm": 0.04188022637432273, + "language_loss": 0.82037127, + "learning_rate": 0.0004750869351122511, + "loss": 0.83130395, + "num_input_tokens_seen": 229772144, + "router_z_loss_mlp": 0.38330078, + "step": 2757, + "time_per_iteration": 2.9792187213897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093906, + "balance_loss_mlp": 1.0563792, + "epoch": 0.5305886879569065, + "flos": 573156836352.0, + "grad_norm": 0.0631181134246054, + "language_loss": 0.81604397, + "learning_rate": 0.00047477578462300685, + "loss": 0.82698298, + "num_input_tokens_seen": 229847024, + "router_z_loss_mlp": 0.37524414, + "step": 2758, + "time_per_iteration": 2.6986684799194336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093632, + "balance_loss_mlp": 1.05553293, + "epoch": 0.5307810696421701, + "flos": 694988315136.0, + "grad_norm": 0.050985358767642326, + "language_loss": 0.79166949, + "learning_rate": 0.0004744646439266718, + "loss": 0.80260581, + "num_input_tokens_seen": 229932416, + "router_z_loss_mlp": 0.38085938, + "step": 2759, + "time_per_iteration": 2.978621006011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091884, + "balance_loss_mlp": 1.05342746, + "epoch": 0.5309734513274337, + "flos": 648629683200.0, + "grad_norm": 0.042424952199748935, + "language_loss": 0.92400765, + "learning_rate": 0.000474153513144041, + "loss": 0.93492657, + "num_input_tokens_seen": 230010976, + "router_z_loss_mlp": 0.38427734, + "step": 2760, + "time_per_iteration": 2.8996803760528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109538, + "balance_loss_mlp": 1.05534935, + "epoch": 0.5311658330126972, + "flos": 604517506560.0, + "grad_norm": 0.048779343359875056, + "language_loss": 0.86932075, + "learning_rate": 0.00047384239239590633, + "loss": 0.88027459, + "num_input_tokens_seen": 230093344, + "router_z_loss_mlp": 0.39990234, + "step": 2761, + "time_per_iteration": 2.8649730682373047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090925, + "balance_loss_mlp": 1.05342138, + "epoch": 0.5313582146979607, + "flos": 557995931136.0, + "grad_norm": 0.062125162710189655, + "language_loss": 0.88300002, + "learning_rate": 0.0004735312818030556, + "loss": 0.89390922, + "num_input_tokens_seen": 230165520, + "router_z_loss_mlp": 0.37475586, + "step": 2762, + "time_per_iteration": 2.664534091949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108596, + "balance_loss_mlp": 1.04776537, + "epoch": 0.5315505963832243, + "flos": 508152572928.0, + "grad_norm": 0.04725442501000759, + "language_loss": 0.82514352, + "learning_rate": 0.0004732201814862727, + "loss": 0.83600307, + "num_input_tokens_seen": 230237808, + "router_z_loss_mlp": 0.38183594, + "step": 2763, + "time_per_iteration": 2.7150583267211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100901, + "balance_loss_mlp": 1.06113279, + "epoch": 0.5317429780684879, + "flos": 626132740608.0, + "grad_norm": 0.050347986684343975, + "language_loss": 0.81810606, + "learning_rate": 0.0004729090915663373, + "loss": 0.82911509, + "num_input_tokens_seen": 230321568, + "router_z_loss_mlp": 0.39746094, + "step": 2764, + "time_per_iteration": 2.837186336517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093533, + "balance_loss_mlp": 1.05509973, + "epoch": 0.5319353597537514, + "flos": 476506713600.0, + "grad_norm": 0.06358705333883939, + "language_loss": 0.85396516, + "learning_rate": 0.00047259801216402534, + "loss": 0.86490047, + "num_input_tokens_seen": 230385376, + "router_z_loss_mlp": 0.38427734, + "step": 2765, + "time_per_iteration": 2.5005743503570557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094119, + "balance_loss_mlp": 1.05592442, + "epoch": 0.532127741439015, + "flos": 501386913792.0, + "grad_norm": 0.06543180937467778, + "language_loss": 0.8612839, + "learning_rate": 0.00047228694340010845, + "loss": 0.87222505, + "num_input_tokens_seen": 230449760, + "router_z_loss_mlp": 0.38183594, + "step": 2766, + "time_per_iteration": 2.549018144607544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096891, + "balance_loss_mlp": 1.0578146, + "epoch": 0.5323201231242786, + "flos": 1164114063360.0, + "grad_norm": 0.04837235133211893, + "language_loss": 0.85614288, + "learning_rate": 0.0004719758853953544, + "loss": 0.8671118, + "num_input_tokens_seen": 230536592, + "router_z_loss_mlp": 0.390625, + "step": 2767, + "time_per_iteration": 3.568779468536377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095476, + "balance_loss_mlp": 1.05709052, + "epoch": 0.5325125048095422, + "flos": 378493254144.0, + "grad_norm": 0.06740098585195309, + "language_loss": 0.84098738, + "learning_rate": 0.00047166483827052645, + "loss": 0.85194218, + "num_input_tokens_seen": 230596688, + "router_z_loss_mlp": 0.38354492, + "step": 2768, + "time_per_iteration": 2.4389522075653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01031004, + "balance_loss_mlp": 1.01784337, + "epoch": 0.5327048864948057, + "flos": 1540507534848.0, + "grad_norm": 0.01937833439113787, + "language_loss": 0.77078491, + "learning_rate": 0.00047135380214638413, + "loss": 0.78109497, + "num_input_tokens_seen": 230829408, + "router_z_loss_mlp": 0.13183594, + "step": 2769, + "time_per_iteration": 4.967049837112427 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093166, + "balance_loss_mlp": 1.05320704, + "epoch": 0.5328972681800692, + "flos": 910877225472.0, + "grad_norm": 0.052506511923680964, + "language_loss": 0.83564013, + "learning_rate": 0.000471042777143682, + "loss": 0.8465718, + "num_input_tokens_seen": 230912528, + "router_z_loss_mlp": 0.3996582, + "step": 2770, + "time_per_iteration": 3.2065277099609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083074, + "balance_loss_mlp": 1.04530883, + "epoch": 0.5330896498653328, + "flos": 473660766720.0, + "grad_norm": 0.0519747156636442, + "language_loss": 0.79680347, + "learning_rate": 0.0004707317633831707, + "loss": 0.80763417, + "num_input_tokens_seen": 230979424, + "router_z_loss_mlp": 0.37744141, + "step": 2771, + "time_per_iteration": 2.5498273372650146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091325, + "balance_loss_mlp": 1.05408382, + "epoch": 0.5332820315505964, + "flos": 501386913792.0, + "grad_norm": 0.05598064533442757, + "language_loss": 0.77608013, + "learning_rate": 0.00047042076098559673, + "loss": 0.78699338, + "num_input_tokens_seen": 231046416, + "router_z_loss_mlp": 0.37231445, + "step": 2772, + "time_per_iteration": 2.5759775638580322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091384, + "balance_loss_mlp": 1.05323732, + "epoch": 0.53347441323586, + "flos": 924043368960.0, + "grad_norm": 0.060675625301583505, + "language_loss": 0.73884845, + "learning_rate": 0.00047010977007170174, + "loss": 0.7497623, + "num_input_tokens_seen": 231136064, + "router_z_loss_mlp": 0.38110352, + "step": 2773, + "time_per_iteration": 3.257833957672119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089952, + "balance_loss_mlp": 1.05099463, + "epoch": 0.5336667949211235, + "flos": 574185521664.0, + "grad_norm": 0.06246333407972351, + "language_loss": 0.82451814, + "learning_rate": 0.00046979879076222334, + "loss": 0.83541769, + "num_input_tokens_seen": 231203616, + "router_z_loss_mlp": 0.38916016, + "step": 2774, + "time_per_iteration": 2.6394476890563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091431, + "balance_loss_mlp": 1.05306923, + "epoch": 0.533859176606387, + "flos": 1064233880064.0, + "grad_norm": 0.044878758318980805, + "language_loss": 0.85063684, + "learning_rate": 0.0004694878231778939, + "loss": 0.86155117, + "num_input_tokens_seen": 231287008, + "router_z_loss_mlp": 0.38330078, + "step": 2775, + "time_per_iteration": 3.3668456077575684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085244, + "balance_loss_mlp": 1.04695392, + "epoch": 0.5340515582916506, + "flos": 746277967872.0, + "grad_norm": 0.04760082973405309, + "language_loss": 0.84270054, + "learning_rate": 0.0004691768674394423, + "loss": 0.85355294, + "num_input_tokens_seen": 231365296, + "router_z_loss_mlp": 0.38305664, + "step": 2776, + "time_per_iteration": 2.9580860137939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038557, + "balance_loss_mlp": 1.02644587, + "epoch": 0.5342439399769142, + "flos": 1444905036288.0, + "grad_norm": 0.01780260433895519, + "language_loss": 0.84484011, + "learning_rate": 0.0004688659236675918, + "loss": 0.85522568, + "num_input_tokens_seen": 231579040, + "router_z_loss_mlp": 0.12109375, + "step": 2777, + "time_per_iteration": 4.798782825469971 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036301, + "balance_loss_mlp": 1.02423704, + "epoch": 0.5344363216621778, + "flos": 1426790495232.0, + "grad_norm": 0.016806659478265918, + "language_loss": 0.76653534, + "learning_rate": 0.00046855499198306187, + "loss": 0.77689832, + "num_input_tokens_seen": 231812736, + "router_z_loss_mlp": 0.12060547, + "step": 2778, + "time_per_iteration": 4.971946477890015 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083696, + "balance_loss_mlp": 1.04650259, + "epoch": 0.5346287033474413, + "flos": 527355436032.0, + "grad_norm": 0.27028176168378437, + "language_loss": 0.79060376, + "learning_rate": 0.00046824407250656676, + "loss": 0.80144072, + "num_input_tokens_seen": 231883840, + "router_z_loss_mlp": 0.37158203, + "step": 2779, + "time_per_iteration": 2.639554738998413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083286, + "balance_loss_mlp": 1.04528189, + "epoch": 0.5348210850327049, + "flos": 510515481600.0, + "grad_norm": 0.04912000707376091, + "language_loss": 0.83288354, + "learning_rate": 0.0004679331653588161, + "loss": 0.84371638, + "num_input_tokens_seen": 231955360, + "router_z_loss_mlp": 0.37988281, + "step": 2780, + "time_per_iteration": 2.590897560119629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082747, + "balance_loss_mlp": 1.04388487, + "epoch": 0.5350134667179685, + "flos": 462429748224.0, + "grad_norm": 0.07636572739089499, + "language_loss": 0.8547262, + "learning_rate": 0.0004676222706605147, + "loss": 0.86555368, + "num_input_tokens_seen": 232027088, + "router_z_loss_mlp": 0.38867188, + "step": 2781, + "time_per_iteration": 2.606795310974121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088005, + "balance_loss_mlp": 1.04647303, + "epoch": 0.535205848403232, + "flos": 708566275584.0, + "grad_norm": 0.05667741573580048, + "language_loss": 0.84751678, + "learning_rate": 0.0004673113885323626, + "loss": 0.85839683, + "num_input_tokens_seen": 232099472, + "router_z_loss_mlp": 0.4152832, + "step": 2782, + "time_per_iteration": 2.813957691192627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084509, + "balance_loss_mlp": 1.04507411, + "epoch": 0.5353982300884956, + "flos": 893855388672.0, + "grad_norm": 0.04933634097838137, + "language_loss": 0.78395712, + "learning_rate": 0.00046700051909505494, + "loss": 0.79480219, + "num_input_tokens_seen": 232182528, + "router_z_loss_mlp": 0.39404297, + "step": 2783, + "time_per_iteration": 3.151244878768921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089347, + "balance_loss_mlp": 1.0476948, + "epoch": 0.5355906117737591, + "flos": 535701219840.0, + "grad_norm": 0.06378381527079717, + "language_loss": 0.83984947, + "learning_rate": 0.000466689662469282, + "loss": 0.85074294, + "num_input_tokens_seen": 232253344, + "router_z_loss_mlp": 0.41650391, + "step": 2784, + "time_per_iteration": 2.6275248527526855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081581, + "balance_loss_mlp": 1.04159856, + "epoch": 0.5357829934590227, + "flos": 868477593600.0, + "grad_norm": 0.05202541270375375, + "language_loss": 0.83895493, + "learning_rate": 0.00046637881877572917, + "loss": 0.84977078, + "num_input_tokens_seen": 232337232, + "router_z_loss_mlp": 0.3996582, + "step": 2785, + "time_per_iteration": 3.069645404815674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085059, + "balance_loss_mlp": 1.04481411, + "epoch": 0.5359753751442863, + "flos": 552999481344.0, + "grad_norm": 0.08844651025983005, + "language_loss": 0.8452431, + "learning_rate": 0.0004660679881350764, + "loss": 0.85609365, + "num_input_tokens_seen": 232412864, + "router_z_loss_mlp": 0.40234375, + "step": 2786, + "time_per_iteration": 2.7307839393615723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059616, + "balance_loss_mlp": 1.04531133, + "epoch": 0.5361677568295499, + "flos": 1479687823872.0, + "grad_norm": 0.02226240505672553, + "language_loss": 0.75608146, + "learning_rate": 0.0004657571706679988, + "loss": 0.76667762, + "num_input_tokens_seen": 232639888, + "router_z_loss_mlp": 0.14257812, + "step": 2787, + "time_per_iteration": 5.010236740112305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083366, + "balance_loss_mlp": 1.04352605, + "epoch": 0.5363601385148133, + "flos": 805910432256.0, + "grad_norm": 0.0562451411020875, + "language_loss": 0.78052628, + "learning_rate": 0.0004654463664951667, + "loss": 0.79135996, + "num_input_tokens_seen": 232719248, + "router_z_loss_mlp": 0.3984375, + "step": 2788, + "time_per_iteration": 2.9822394847869873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090776, + "balance_loss_mlp": 1.05076993, + "epoch": 0.5365525202000769, + "flos": 507630246912.0, + "grad_norm": 0.05204597911301594, + "language_loss": 0.82849109, + "learning_rate": 0.0004651355757372447, + "loss": 0.83939886, + "num_input_tokens_seen": 232788464, + "router_z_loss_mlp": 0.39990234, + "step": 2789, + "time_per_iteration": 2.615691900253296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089332, + "balance_loss_mlp": 1.04937315, + "epoch": 0.5367449018853405, + "flos": 528660546048.0, + "grad_norm": 0.0871364316310779, + "language_loss": 0.854258, + "learning_rate": 0.00046482479851489274, + "loss": 0.86515129, + "num_input_tokens_seen": 232859792, + "router_z_loss_mlp": 0.39941406, + "step": 2790, + "time_per_iteration": 2.7088706493377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089329, + "balance_loss_mlp": 1.04853582, + "epoch": 0.5369372835706041, + "flos": 649614698496.0, + "grad_norm": 0.059769288934836705, + "language_loss": 0.78002077, + "learning_rate": 0.00046451403494876525, + "loss": 0.79091412, + "num_input_tokens_seen": 232941472, + "router_z_loss_mlp": 0.40795898, + "step": 2791, + "time_per_iteration": 2.8624680042266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082264, + "balance_loss_mlp": 1.04254341, + "epoch": 0.5371296652558677, + "flos": 584205972480.0, + "grad_norm": 0.05423678017273499, + "language_loss": 0.84187895, + "learning_rate": 0.0004642032851595111, + "loss": 0.8527016, + "num_input_tokens_seen": 233017120, + "router_z_loss_mlp": 0.3972168, + "step": 2792, + "time_per_iteration": 2.7222046852111816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090422, + "balance_loss_mlp": 1.04877055, + "epoch": 0.5373220469411312, + "flos": 595570821120.0, + "grad_norm": 0.05596231110481221, + "language_loss": 0.84764576, + "learning_rate": 0.00046389254926777404, + "loss": 0.85855001, + "num_input_tokens_seen": 233095408, + "router_z_loss_mlp": 0.41674805, + "step": 2793, + "time_per_iteration": 2.8049495220184326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083638, + "balance_loss_mlp": 1.04286838, + "epoch": 0.5375144286263948, + "flos": 1113965167104.0, + "grad_norm": 0.05603938595076487, + "language_loss": 0.78227508, + "learning_rate": 0.0004635818273941926, + "loss": 0.79311144, + "num_input_tokens_seen": 233191056, + "router_z_loss_mlp": 0.4074707, + "step": 2794, + "time_per_iteration": 3.506617307662964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085514, + "balance_loss_mlp": 1.04495919, + "epoch": 0.5377068103116583, + "flos": 595319127552.0, + "grad_norm": 0.07610950885477011, + "language_loss": 0.81443048, + "learning_rate": 0.0004632711196593997, + "loss": 0.82528561, + "num_input_tokens_seen": 233265536, + "router_z_loss_mlp": 0.40527344, + "step": 2795, + "time_per_iteration": 2.7142324447631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083447, + "balance_loss_mlp": 1.04377437, + "epoch": 0.5378991919969219, + "flos": 883839320064.0, + "grad_norm": 0.061986224183990205, + "language_loss": 0.85229117, + "learning_rate": 0.00046296042618402297, + "loss": 0.86312562, + "num_input_tokens_seen": 233348224, + "router_z_loss_mlp": 0.39697266, + "step": 2796, + "time_per_iteration": 3.0699656009674072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077763, + "balance_loss_mlp": 1.03801823, + "epoch": 0.5380915736821854, + "flos": 710344249344.0, + "grad_norm": 0.04828732184108336, + "language_loss": 0.792054, + "learning_rate": 0.0004626497470886839, + "loss": 0.80283165, + "num_input_tokens_seen": 233429344, + "router_z_loss_mlp": 0.39746094, + "step": 2797, + "time_per_iteration": 2.9337801933288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085503, + "balance_loss_mlp": 1.04444742, + "epoch": 0.538283955367449, + "flos": 556721344512.0, + "grad_norm": 0.04667541599746409, + "language_loss": 0.8208226, + "learning_rate": 0.00046233908249399897, + "loss": 0.83167768, + "num_input_tokens_seen": 233504944, + "router_z_loss_mlp": 0.41040039, + "step": 2798, + "time_per_iteration": 2.736253023147583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086245, + "balance_loss_mlp": 1.04585731, + "epoch": 0.5384763370527126, + "flos": 513218833920.0, + "grad_norm": 0.05904964511977083, + "language_loss": 0.78162259, + "learning_rate": 0.00046202843252057905, + "loss": 0.79248506, + "num_input_tokens_seen": 233573072, + "router_z_loss_mlp": 0.40380859, + "step": 2799, + "time_per_iteration": 2.5839316844940186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085466, + "balance_loss_mlp": 1.04503012, + "epoch": 0.5386687187379762, + "flos": 489490974720.0, + "grad_norm": 0.06428119470797507, + "language_loss": 0.83220208, + "learning_rate": 0.00046171779728902896, + "loss": 0.8430568, + "num_input_tokens_seen": 233640896, + "router_z_loss_mlp": 0.40405273, + "step": 2800, + "time_per_iteration": 2.6141908168792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087168, + "balance_loss_mlp": 1.04801977, + "epoch": 0.5388611004232398, + "flos": 482415395328.0, + "grad_norm": 0.12344174959648258, + "language_loss": 0.86207569, + "learning_rate": 0.000461407176919948, + "loss": 0.87294734, + "num_input_tokens_seen": 233703904, + "router_z_loss_mlp": 0.39111328, + "step": 2801, + "time_per_iteration": 2.503673791885376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108035, + "balance_loss_mlp": 1.04158366, + "epoch": 0.5390534821085032, + "flos": 560709457920.0, + "grad_norm": 0.05013064620145656, + "language_loss": 0.85174656, + "learning_rate": 0.00046109657153392997, + "loss": 0.86255008, + "num_input_tokens_seen": 233779248, + "router_z_loss_mlp": 0.38720703, + "step": 2802, + "time_per_iteration": 2.6549510955810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086897, + "balance_loss_mlp": 1.04624677, + "epoch": 0.5392458637937668, + "flos": 488132020224.0, + "grad_norm": 0.05351248634305854, + "language_loss": 0.82771289, + "learning_rate": 0.0004607859812515622, + "loss": 0.8385818, + "num_input_tokens_seen": 233847520, + "router_z_loss_mlp": 0.40649414, + "step": 2803, + "time_per_iteration": 2.592742681503296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085335, + "balance_loss_mlp": 1.0456624, + "epoch": 0.5394382454790304, + "flos": 511810417152.0, + "grad_norm": 0.06156300752407298, + "language_loss": 0.87926197, + "learning_rate": 0.00046047540619342667, + "loss": 0.89011538, + "num_input_tokens_seen": 233911328, + "router_z_loss_mlp": 0.39648438, + "step": 2804, + "time_per_iteration": 2.566542863845825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108144, + "balance_loss_mlp": 1.04343605, + "epoch": 0.539630627164294, + "flos": 567312173568.0, + "grad_norm": 0.04852529488921132, + "language_loss": 0.7995888, + "learning_rate": 0.00046016484648009933, + "loss": 0.81040317, + "num_input_tokens_seen": 233987104, + "router_z_loss_mlp": 0.38012695, + "step": 2805, + "time_per_iteration": 2.693988561630249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108233, + "balance_loss_mlp": 1.04415882, + "epoch": 0.5398230088495575, + "flos": 526203095040.0, + "grad_norm": 0.058780411040176145, + "language_loss": 0.8077246, + "learning_rate": 0.0004598543022321501, + "loss": 0.81854796, + "num_input_tokens_seen": 234057216, + "router_z_loss_mlp": 0.38134766, + "step": 2806, + "time_per_iteration": 2.635873317718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079563, + "balance_loss_mlp": 1.0410111, + "epoch": 0.5400153905348211, + "flos": 538493322240.0, + "grad_norm": 0.05389643439716648, + "language_loss": 0.7979452, + "learning_rate": 0.0004595437735701433, + "loss": 0.80874085, + "num_input_tokens_seen": 234129984, + "router_z_loss_mlp": 0.38500977, + "step": 2807, + "time_per_iteration": 2.671004056930542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082107, + "balance_loss_mlp": 1.04252934, + "epoch": 0.5402077722200846, + "flos": 513259531776.0, + "grad_norm": 0.056977099557855106, + "language_loss": 0.83333278, + "learning_rate": 0.00045923326061463623, + "loss": 0.84415388, + "num_input_tokens_seen": 234203920, + "router_z_loss_mlp": 0.39575195, + "step": 2808, + "time_per_iteration": 2.748844861984253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108444, + "balance_loss_mlp": 1.04519629, + "epoch": 0.5404001539053482, + "flos": 675932428800.0, + "grad_norm": 0.053531678156081904, + "language_loss": 0.81448805, + "learning_rate": 0.00045892276348618113, + "loss": 0.82533252, + "num_input_tokens_seen": 234285440, + "router_z_loss_mlp": 0.39208984, + "step": 2809, + "time_per_iteration": 2.9712717533111572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01034069, + "balance_loss_mlp": 1.02195704, + "epoch": 0.5405925355906118, + "flos": 1553998155264.0, + "grad_norm": 0.02221665300745606, + "language_loss": 0.78260827, + "learning_rate": 0.0004586122823053235, + "loss": 0.79294896, + "num_input_tokens_seen": 234521424, + "router_z_loss_mlp": 0.12109375, + "step": 2810, + "time_per_iteration": 4.987140893936157 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085482, + "balance_loss_mlp": 1.04697728, + "epoch": 0.5407849172758753, + "flos": 647004478464.0, + "grad_norm": 0.050822756134718025, + "language_loss": 0.80942833, + "learning_rate": 0.000458301817192603, + "loss": 0.82028317, + "num_input_tokens_seen": 234601632, + "router_z_loss_mlp": 0.38500977, + "step": 2811, + "time_per_iteration": 2.826511859893799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01028161, + "balance_loss_mlp": 1.01576281, + "epoch": 0.5409772989611389, + "flos": 1406641904640.0, + "grad_norm": 0.017319914930323605, + "language_loss": 0.8084178, + "learning_rate": 0.00045799136826855263, + "loss": 0.81869948, + "num_input_tokens_seen": 234825776, + "router_z_loss_mlp": 0.12353516, + "step": 2812, + "time_per_iteration": 4.797938346862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083094, + "balance_loss_mlp": 1.04525733, + "epoch": 0.5411696806464025, + "flos": 554102360064.0, + "grad_norm": 0.08517188397837483, + "language_loss": 0.87214613, + "learning_rate": 0.00045768093565369983, + "loss": 0.88297707, + "num_input_tokens_seen": 234901504, + "router_z_loss_mlp": 0.37817383, + "step": 2813, + "time_per_iteration": 2.716890811920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082803, + "balance_loss_mlp": 1.04441762, + "epoch": 0.5413620623316661, + "flos": 527853030912.0, + "grad_norm": 0.05234072905155942, + "language_loss": 0.81825578, + "learning_rate": 0.0004573705194685646, + "loss": 0.8290838, + "num_input_tokens_seen": 234970288, + "router_z_loss_mlp": 0.38330078, + "step": 2814, + "time_per_iteration": 2.6517584323883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082477, + "balance_loss_mlp": 1.04380536, + "epoch": 0.5415544440169295, + "flos": 598464820224.0, + "grad_norm": 0.054888895455983605, + "language_loss": 0.84797984, + "learning_rate": 0.00045706011983366157, + "loss": 0.85880458, + "num_input_tokens_seen": 235039984, + "router_z_loss_mlp": 0.38623047, + "step": 2815, + "time_per_iteration": 2.670135974884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088358, + "balance_loss_mlp": 1.050807, + "epoch": 0.5417468257021931, + "flos": 470519456256.0, + "grad_norm": 0.06349065912195655, + "language_loss": 0.82603323, + "learning_rate": 0.00045674973686949847, + "loss": 0.8369168, + "num_input_tokens_seen": 235105232, + "router_z_loss_mlp": 0.37524414, + "step": 2816, + "time_per_iteration": 2.51487398147583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085537, + "balance_loss_mlp": 1.04710388, + "epoch": 0.5419392073874567, + "flos": 680477773824.0, + "grad_norm": 0.04802331030108417, + "language_loss": 0.85519576, + "learning_rate": 0.0004564393706965766, + "loss": 0.86605108, + "num_input_tokens_seen": 235192560, + "router_z_loss_mlp": 0.3840332, + "step": 2817, + "time_per_iteration": 2.9650819301605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088505, + "balance_loss_mlp": 1.05031061, + "epoch": 0.5421315890727203, + "flos": 462134384640.0, + "grad_norm": 0.11431790588446349, + "language_loss": 0.81361973, + "learning_rate": 0.00045612902143539116, + "loss": 0.82450485, + "num_input_tokens_seen": 235258448, + "router_z_loss_mlp": 0.3815918, + "step": 2818, + "time_per_iteration": 2.5874366760253906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083753, + "balance_loss_mlp": 1.04620242, + "epoch": 0.5423239707579839, + "flos": 436727476224.0, + "grad_norm": 0.06287409893753121, + "language_loss": 0.81734043, + "learning_rate": 0.00045581868920642986, + "loss": 0.82817793, + "num_input_tokens_seen": 235322176, + "router_z_loss_mlp": 0.375, + "step": 2819, + "time_per_iteration": 2.4778597354888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085621, + "balance_loss_mlp": 1.04818964, + "epoch": 0.5425163524432474, + "flos": 458067695616.0, + "grad_norm": 0.0556653381868651, + "language_loss": 0.79541689, + "learning_rate": 0.00045550837413017457, + "loss": 0.8062731, + "num_input_tokens_seen": 235390960, + "router_z_loss_mlp": 0.37402344, + "step": 2820, + "time_per_iteration": 2.653878688812256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087265, + "balance_loss_mlp": 1.04873669, + "epoch": 0.542708734128511, + "flos": 419267681280.0, + "grad_norm": 0.047652791336190936, + "language_loss": 0.85203838, + "learning_rate": 0.0004551980763271005, + "loss": 0.86291105, + "num_input_tokens_seen": 235460976, + "router_z_loss_mlp": 0.38500977, + "step": 2821, + "time_per_iteration": 2.6410272121429443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088978, + "balance_loss_mlp": 1.04942417, + "epoch": 0.5429011158137745, + "flos": 678142568448.0, + "grad_norm": 0.047512644994480734, + "language_loss": 0.83545935, + "learning_rate": 0.0004548877959176756, + "loss": 0.84634912, + "num_input_tokens_seen": 235540912, + "router_z_loss_mlp": 0.39550781, + "step": 2822, + "time_per_iteration": 2.8824410438537598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083878, + "balance_loss_mlp": 1.04542077, + "epoch": 0.5430934974990381, + "flos": 540664174080.0, + "grad_norm": 0.05440283794038225, + "language_loss": 0.8588357, + "learning_rate": 0.00045457753302236166, + "loss": 0.86967444, + "num_input_tokens_seen": 235608736, + "router_z_loss_mlp": 0.3840332, + "step": 2823, + "time_per_iteration": 2.665828227996826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078196, + "balance_loss_mlp": 1.04069233, + "epoch": 0.5432858791843016, + "flos": 658175860224.0, + "grad_norm": 0.053164692369765, + "language_loss": 0.86939847, + "learning_rate": 0.00045426728776161353, + "loss": 0.88018048, + "num_input_tokens_seen": 235678720, + "router_z_loss_mlp": 0.37475586, + "step": 2824, + "time_per_iteration": 2.79662823677063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082032, + "balance_loss_mlp": 1.04367089, + "epoch": 0.5434782608695652, + "flos": 531678200832.0, + "grad_norm": 0.051257131946256196, + "language_loss": 0.81339788, + "learning_rate": 0.00045395706025587863, + "loss": 0.82421821, + "num_input_tokens_seen": 235748704, + "router_z_loss_mlp": 0.38330078, + "step": 2825, + "time_per_iteration": 2.612839698791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083107, + "balance_loss_mlp": 1.04298067, + "epoch": 0.5436706425548288, + "flos": 608219020800.0, + "grad_norm": 0.0654215378261843, + "language_loss": 0.8246271, + "learning_rate": 0.00045364685062559843, + "loss": 0.83545816, + "num_input_tokens_seen": 235828224, + "router_z_loss_mlp": 0.40112305, + "step": 2826, + "time_per_iteration": 2.8304717540740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077545, + "balance_loss_mlp": 1.03863502, + "epoch": 0.5438630242400924, + "flos": 705081549312.0, + "grad_norm": 0.05153461088450525, + "language_loss": 0.91323566, + "learning_rate": 0.0004533366589912067, + "loss": 0.92401117, + "num_input_tokens_seen": 235909392, + "router_z_loss_mlp": 0.38891602, + "step": 2827, + "time_per_iteration": 2.9909794330596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083973, + "balance_loss_mlp": 1.04399014, + "epoch": 0.544055405925356, + "flos": 856073885184.0, + "grad_norm": 0.06162926864421369, + "language_loss": 0.77631354, + "learning_rate": 0.0004530264854731306, + "loss": 0.78715324, + "num_input_tokens_seen": 235983888, + "router_z_loss_mlp": 0.3996582, + "step": 2828, + "time_per_iteration": 3.0477852821350098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079886, + "balance_loss_mlp": 1.0402137, + "epoch": 0.5442477876106194, + "flos": 571483579392.0, + "grad_norm": 0.04880017685382554, + "language_loss": 0.83835936, + "learning_rate": 0.00045271633019179034, + "loss": 0.84915829, + "num_input_tokens_seen": 236063056, + "router_z_loss_mlp": 0.39648438, + "step": 2829, + "time_per_iteration": 2.8048830032348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108649, + "balance_loss_mlp": 1.04684114, + "epoch": 0.544440169295883, + "flos": 625246649856.0, + "grad_norm": 0.05731672371216008, + "language_loss": 0.87693858, + "learning_rate": 0.0004524061932675986, + "loss": 0.88780355, + "num_input_tokens_seen": 236141104, + "router_z_loss_mlp": 0.39624023, + "step": 2830, + "time_per_iteration": 2.880328893661499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081175, + "balance_loss_mlp": 1.0420748, + "epoch": 0.5446325509811466, + "flos": 835896181248.0, + "grad_norm": 0.061736377466748704, + "language_loss": 0.8659271, + "learning_rate": 0.00045209607482096125, + "loss": 0.87673885, + "num_input_tokens_seen": 236220320, + "router_z_loss_mlp": 0.390625, + "step": 2831, + "time_per_iteration": 2.9996933937072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080099, + "balance_loss_mlp": 1.04016387, + "epoch": 0.5448249326664102, + "flos": 483129778176.0, + "grad_norm": 0.057163759026562816, + "language_loss": 0.8399148, + "learning_rate": 0.0004517859749722772, + "loss": 0.85071582, + "num_input_tokens_seen": 236288208, + "router_z_loss_mlp": 0.39892578, + "step": 2832, + "time_per_iteration": 2.6431195735931396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085662, + "balance_loss_mlp": 1.04606068, + "epoch": 0.5450173143516738, + "flos": 560799618048.0, + "grad_norm": 0.061436781325619555, + "language_loss": 0.78688192, + "learning_rate": 0.0004514758938419376, + "loss": 0.79773855, + "num_input_tokens_seen": 236366864, + "router_z_loss_mlp": 0.39575195, + "step": 2833, + "time_per_iteration": 2.811894655227661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058831, + "balance_loss_mlp": 1.04280972, + "epoch": 0.5452096960369373, + "flos": 1469632467456.0, + "grad_norm": 0.020133642361800857, + "language_loss": 0.76920587, + "learning_rate": 0.0004511658315503268, + "loss": 0.77979416, + "num_input_tokens_seen": 236597120, + "router_z_loss_mlp": 0.16015625, + "step": 2834, + "time_per_iteration": 4.920469760894775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077415, + "balance_loss_mlp": 1.03798103, + "epoch": 0.5454020777222008, + "flos": 464827562496.0, + "grad_norm": 0.051503170745990534, + "language_loss": 0.83848447, + "learning_rate": 0.00045085578821782175, + "loss": 0.84925866, + "num_input_tokens_seen": 236664192, + "router_z_loss_mlp": 0.39404297, + "step": 2835, + "time_per_iteration": 2.523089647293091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048396, + "balance_loss_mlp": 1.03246999, + "epoch": 0.5455944594074644, + "flos": 1468921056768.0, + "grad_norm": 0.01613355837810212, + "language_loss": 0.76134741, + "learning_rate": 0.0004505457639647917, + "loss": 0.77183139, + "num_input_tokens_seen": 236888784, + "router_z_loss_mlp": 0.15917969, + "step": 2836, + "time_per_iteration": 4.865030288696289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082059, + "balance_loss_mlp": 1.0422194, + "epoch": 0.545786841092728, + "flos": 532900353024.0, + "grad_norm": 0.04532447535161293, + "language_loss": 0.81224561, + "learning_rate": 0.00045023575891159866, + "loss": 0.82306617, + "num_input_tokens_seen": 236962528, + "router_z_loss_mlp": 0.3984375, + "step": 2837, + "time_per_iteration": 2.7024872303009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038306, + "balance_loss_mlp": 1.02285683, + "epoch": 0.5459792227779915, + "flos": 1351633360896.0, + "grad_norm": 0.01633471064412587, + "language_loss": 0.74763811, + "learning_rate": 0.00044992577317859764, + "loss": 0.75802112, + "num_input_tokens_seen": 237179360, + "router_z_loss_mlp": 0.15429688, + "step": 2838, + "time_per_iteration": 4.88713812828064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072439, + "balance_loss_mlp": 1.03436387, + "epoch": 0.5461716044632551, + "flos": 637584929280.0, + "grad_norm": 0.044187924464620755, + "language_loss": 0.77777064, + "learning_rate": 0.0004496158068861354, + "loss": 0.788495, + "num_input_tokens_seen": 237256240, + "router_z_loss_mlp": 0.38037109, + "step": 2839, + "time_per_iteration": 2.7734854221343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083666, + "balance_loss_mlp": 1.04451799, + "epoch": 0.5463639861485187, + "flos": 602458725888.0, + "grad_norm": 0.04916115853202861, + "language_loss": 0.80780178, + "learning_rate": 0.00044930586015455207, + "loss": 0.81863844, + "num_input_tokens_seen": 237334272, + "router_z_loss_mlp": 0.39111328, + "step": 2840, + "time_per_iteration": 2.776756525039673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079059, + "balance_loss_mlp": 1.04105484, + "epoch": 0.5465563678337823, + "flos": 642208849920.0, + "grad_norm": 0.047638532734035705, + "language_loss": 0.89027333, + "learning_rate": 0.000448995933104179, + "loss": 0.90106392, + "num_input_tokens_seen": 237415408, + "router_z_loss_mlp": 0.37939453, + "step": 2841, + "time_per_iteration": 2.835770606994629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084722, + "balance_loss_mlp": 1.04526389, + "epoch": 0.5467487495190458, + "flos": 613852687872.0, + "grad_norm": 0.05241434980763647, + "language_loss": 0.79585081, + "learning_rate": 0.00044868602585534077, + "loss": 0.80669802, + "num_input_tokens_seen": 237493232, + "router_z_loss_mlp": 0.39428711, + "step": 2842, + "time_per_iteration": 2.8165202140808105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081214, + "balance_loss_mlp": 1.04297209, + "epoch": 0.5469411312043093, + "flos": 460957312512.0, + "grad_norm": 0.05377375824052972, + "language_loss": 0.88703167, + "learning_rate": 0.0004483761385283541, + "loss": 0.89784384, + "num_input_tokens_seen": 237556624, + "router_z_loss_mlp": 0.38183594, + "step": 2843, + "time_per_iteration": 2.5191187858581543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085655, + "balance_loss_mlp": 1.04705536, + "epoch": 0.5471335128895729, + "flos": 560930628096.0, + "grad_norm": 0.05339183941738246, + "language_loss": 0.82029176, + "learning_rate": 0.0004480662712435281, + "loss": 0.83114827, + "num_input_tokens_seen": 237632048, + "router_z_loss_mlp": 0.38549805, + "step": 2844, + "time_per_iteration": 2.7347452640533447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084571, + "balance_loss_mlp": 1.046996, + "epoch": 0.5473258945748365, + "flos": 518437863936.0, + "grad_norm": 0.05481278216627967, + "language_loss": 0.88263971, + "learning_rate": 0.0004477564241211635, + "loss": 0.89348543, + "num_input_tokens_seen": 237699840, + "router_z_loss_mlp": 0.37548828, + "step": 2845, + "time_per_iteration": 2.566675901412964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085991, + "balance_loss_mlp": 1.0476774, + "epoch": 0.5475182762601001, + "flos": 433600722432.0, + "grad_norm": 0.05360762168993706, + "language_loss": 0.87165999, + "learning_rate": 0.0004474465972815541, + "loss": 0.88251984, + "num_input_tokens_seen": 237762560, + "router_z_loss_mlp": 0.38256836, + "step": 2846, + "time_per_iteration": 2.458261489868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085193, + "balance_loss_mlp": 1.04754686, + "epoch": 0.5477106579453636, + "flos": 511308440064.0, + "grad_norm": 0.04786363547278841, + "language_loss": 0.87439841, + "learning_rate": 0.000447136790844985, + "loss": 0.88525033, + "num_input_tokens_seen": 237837152, + "router_z_loss_mlp": 0.37646484, + "step": 2847, + "time_per_iteration": 2.667609214782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108262, + "balance_loss_mlp": 1.04547465, + "epoch": 0.5479030396306271, + "flos": 675606541824.0, + "grad_norm": 0.050829406458998395, + "language_loss": 0.80589354, + "learning_rate": 0.00044682700493173385, + "loss": 0.81671977, + "num_input_tokens_seen": 237909488, + "router_z_loss_mlp": 0.37133789, + "step": 2848, + "time_per_iteration": 2.83048677444458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088336, + "balance_loss_mlp": 1.04978406, + "epoch": 0.5480954213158907, + "flos": 875720498688.0, + "grad_norm": 0.057674115143319986, + "language_loss": 0.80473161, + "learning_rate": 0.00044651723966207004, + "loss": 0.81561506, + "num_input_tokens_seen": 237991056, + "router_z_loss_mlp": 0.38500977, + "step": 2849, + "time_per_iteration": 3.1320085525512695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084454, + "balance_loss_mlp": 1.04780865, + "epoch": 0.5482878030011543, + "flos": 621715433472.0, + "grad_norm": 0.04900831188074684, + "language_loss": 0.78059959, + "learning_rate": 0.00044620749515625536, + "loss": 0.79144412, + "num_input_tokens_seen": 238064576, + "router_z_loss_mlp": 0.36669922, + "step": 2850, + "time_per_iteration": 2.784318447113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091667, + "balance_loss_mlp": 1.05404472, + "epoch": 0.5484801846864179, + "flos": 496946285568.0, + "grad_norm": 0.05697086220906577, + "language_loss": 0.84891641, + "learning_rate": 0.00044589777153454334, + "loss": 0.85983306, + "num_input_tokens_seen": 238136464, + "router_z_loss_mlp": 0.37597656, + "step": 2851, + "time_per_iteration": 2.7432825565338135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087741, + "balance_loss_mlp": 1.04973722, + "epoch": 0.5486725663716814, + "flos": 442202582016.0, + "grad_norm": 0.05425914558119235, + "language_loss": 0.83565009, + "learning_rate": 0.00044558806891717895, + "loss": 0.84652746, + "num_input_tokens_seen": 238198912, + "router_z_loss_mlp": 0.37963867, + "step": 2852, + "time_per_iteration": 2.486581563949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093078, + "balance_loss_mlp": 1.05528831, + "epoch": 0.548864948056945, + "flos": 654867224064.0, + "grad_norm": 0.04695408394518552, + "language_loss": 0.79779923, + "learning_rate": 0.0004452783874243998, + "loss": 0.80873001, + "num_input_tokens_seen": 238275184, + "router_z_loss_mlp": 0.37817383, + "step": 2853, + "time_per_iteration": 2.823004722595215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088751, + "balance_loss_mlp": 1.05246305, + "epoch": 0.5490573297422086, + "flos": 545760958464.0, + "grad_norm": 0.06406980317061135, + "language_loss": 0.84579176, + "learning_rate": 0.00044496872717643475, + "loss": 0.85667926, + "num_input_tokens_seen": 238348496, + "router_z_loss_mlp": 0.36279297, + "step": 2854, + "time_per_iteration": 2.6582207679748535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104165, + "balance_loss_mlp": 1.02906144, + "epoch": 0.5492497114274721, + "flos": 1589450245632.0, + "grad_norm": 0.019738925867794382, + "language_loss": 0.77089292, + "learning_rate": 0.00044465908829350453, + "loss": 0.78130943, + "num_input_tokens_seen": 238578464, + "router_z_loss_mlp": 0.12597656, + "step": 2855, + "time_per_iteration": 4.917479991912842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086161, + "balance_loss_mlp": 1.0507319, + "epoch": 0.5494420931127356, + "flos": 750567237120.0, + "grad_norm": 0.05097157568088764, + "language_loss": 0.82032043, + "learning_rate": 0.0004443494708958217, + "loss": 0.83118206, + "num_input_tokens_seen": 238660256, + "router_z_loss_mlp": 0.35473633, + "step": 2856, + "time_per_iteration": 2.944794178009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087153, + "balance_loss_mlp": 1.04860103, + "epoch": 0.5496344747979992, + "flos": 625704956928.0, + "grad_norm": 0.05077616299787212, + "language_loss": 0.80950212, + "learning_rate": 0.0004440398751035906, + "loss": 0.82037365, + "num_input_tokens_seen": 238745856, + "router_z_loss_mlp": 0.38549805, + "step": 2857, + "time_per_iteration": 2.8557775020599365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083742, + "balance_loss_mlp": 1.04707289, + "epoch": 0.5498268564832628, + "flos": 522859553280.0, + "grad_norm": 0.07234504005195413, + "language_loss": 0.83526963, + "learning_rate": 0.00044373030103700645, + "loss": 0.84610707, + "num_input_tokens_seen": 238813888, + "router_z_loss_mlp": 0.3671875, + "step": 2858, + "time_per_iteration": 2.5718507766723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079618, + "balance_loss_mlp": 1.04337823, + "epoch": 0.5500192381685264, + "flos": 604290544128.0, + "grad_norm": 0.05047837894946753, + "language_loss": 0.79457223, + "learning_rate": 0.000443420748816257, + "loss": 0.80536836, + "num_input_tokens_seen": 238885440, + "router_z_loss_mlp": 0.36279297, + "step": 2859, + "time_per_iteration": 2.791083335876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084468, + "balance_loss_mlp": 1.0475843, + "epoch": 0.55021161985379, + "flos": 520246361088.0, + "grad_norm": 0.05245161408681963, + "language_loss": 0.78267741, + "learning_rate": 0.0004431112185615208, + "loss": 0.79352212, + "num_input_tokens_seen": 238960944, + "router_z_loss_mlp": 0.36914062, + "step": 2860, + "time_per_iteration": 2.755300760269165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084785, + "balance_loss_mlp": 1.04873633, + "epoch": 0.5504040015390534, + "flos": 489426955776.0, + "grad_norm": 0.05433061967205067, + "language_loss": 0.79769695, + "learning_rate": 0.00044280171039296845, + "loss": 0.80854475, + "num_input_tokens_seen": 239030592, + "router_z_loss_mlp": 0.3605957, + "step": 2861, + "time_per_iteration": 2.611142873764038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086738, + "balance_loss_mlp": 1.04925907, + "epoch": 0.550596383224317, + "flos": 575519745024.0, + "grad_norm": 0.06168485457456991, + "language_loss": 0.88482428, + "learning_rate": 0.0004424922244307616, + "loss": 0.89569169, + "num_input_tokens_seen": 239097440, + "router_z_loss_mlp": 0.375, + "step": 2862, + "time_per_iteration": 2.673872470855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085076, + "balance_loss_mlp": 1.04750168, + "epoch": 0.5507887649095806, + "flos": 642149213184.0, + "grad_norm": 0.06448144785997337, + "language_loss": 0.82166171, + "learning_rate": 0.00044218276079505315, + "loss": 0.83251244, + "num_input_tokens_seen": 239179872, + "router_z_loss_mlp": 0.37524414, + "step": 2863, + "time_per_iteration": 2.8468000888824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088216, + "balance_loss_mlp": 1.05126143, + "epoch": 0.5509811465948442, + "flos": 531589450752.0, + "grad_norm": 0.050966073807123834, + "language_loss": 0.7469635, + "learning_rate": 0.0004418733196059876, + "loss": 0.7578457, + "num_input_tokens_seen": 239251264, + "router_z_loss_mlp": 0.36938477, + "step": 2864, + "time_per_iteration": 2.662949323654175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088219, + "balance_loss_mlp": 1.05174112, + "epoch": 0.5511735282801077, + "flos": 654439440384.0, + "grad_norm": 0.054186590964919915, + "language_loss": 0.79709429, + "learning_rate": 0.0004415639009837008, + "loss": 0.80797648, + "num_input_tokens_seen": 239326688, + "router_z_loss_mlp": 0.36474609, + "step": 2865, + "time_per_iteration": 2.8164796829223633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080612, + "balance_loss_mlp": 1.04503989, + "epoch": 0.5513659099653713, + "flos": 529222159872.0, + "grad_norm": 0.05095499883513892, + "language_loss": 0.81590974, + "learning_rate": 0.00044125450504831955, + "loss": 0.82671583, + "num_input_tokens_seen": 239401248, + "router_z_loss_mlp": 0.35620117, + "step": 2866, + "time_per_iteration": 2.7417778968811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088604, + "balance_loss_mlp": 1.05162513, + "epoch": 0.5515582916506349, + "flos": 554594162688.0, + "grad_norm": 0.05682958193324047, + "language_loss": 0.82243145, + "learning_rate": 0.0004409451319199622, + "loss": 0.83331752, + "num_input_tokens_seen": 239471600, + "router_z_loss_mlp": 0.36987305, + "step": 2867, + "time_per_iteration": 2.6530325412750244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082608, + "balance_loss_mlp": 1.04608202, + "epoch": 0.5517506733358984, + "flos": 735067298304.0, + "grad_norm": 0.04759427919913488, + "language_loss": 0.84027618, + "learning_rate": 0.0004406357817187381, + "loss": 0.85110223, + "num_input_tokens_seen": 239548592, + "router_z_loss_mlp": 0.36572266, + "step": 2868, + "time_per_iteration": 2.9475574493408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083501, + "balance_loss_mlp": 1.04590225, + "epoch": 0.551943055021162, + "flos": 1114861432320.0, + "grad_norm": 0.043872910920917114, + "language_loss": 0.80878294, + "learning_rate": 0.0004403264545647474, + "loss": 0.81961799, + "num_input_tokens_seen": 239644432, + "router_z_loss_mlp": 0.37597656, + "step": 2869, + "time_per_iteration": 3.5124435424804688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080703, + "balance_loss_mlp": 1.04422534, + "epoch": 0.5521354367064255, + "flos": 544092083712.0, + "grad_norm": 0.0550168733336382, + "language_loss": 0.84926724, + "learning_rate": 0.00044001715057808154, + "loss": 0.86007428, + "num_input_tokens_seen": 239723392, + "router_z_loss_mlp": 0.36499023, + "step": 2870, + "time_per_iteration": 2.7501060962677 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085896, + "balance_loss_mlp": 1.04855943, + "epoch": 0.5523278183916891, + "flos": 935889845760.0, + "grad_norm": 0.05461062340152541, + "language_loss": 0.81539249, + "learning_rate": 0.0004397078698788232, + "loss": 0.82625151, + "num_input_tokens_seen": 239806896, + "router_z_loss_mlp": 0.3737793, + "step": 2871, + "time_per_iteration": 3.2084577083587646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026675, + "balance_loss_mlp": 1.01427722, + "epoch": 0.5525202000769527, + "flos": 1465117645824.0, + "grad_norm": 0.012296141252344654, + "language_loss": 0.80442369, + "learning_rate": 0.0004393986125870456, + "loss": 0.81469035, + "num_input_tokens_seen": 240037824, + "router_z_loss_mlp": 0.12353516, + "step": 2872, + "time_per_iteration": 4.909080266952515 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087659, + "balance_loss_mlp": 1.05115747, + "epoch": 0.5527125817622163, + "flos": 489554993664.0, + "grad_norm": 0.06201182150044637, + "language_loss": 0.78260124, + "learning_rate": 0.00043908937882281343, + "loss": 0.79347777, + "num_input_tokens_seen": 240107952, + "router_z_loss_mlp": 0.36523438, + "step": 2873, + "time_per_iteration": 2.5999958515167236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078657, + "balance_loss_mlp": 1.0410111, + "epoch": 0.5529049634474797, + "flos": 634606562304.0, + "grad_norm": 0.05626101072807578, + "language_loss": 0.82624078, + "learning_rate": 0.0004387801687061814, + "loss": 0.83702731, + "num_input_tokens_seen": 240183824, + "router_z_loss_mlp": 0.37573242, + "step": 2874, + "time_per_iteration": 2.816607713699341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082085, + "balance_loss_mlp": 1.04310322, + "epoch": 0.5530973451327433, + "flos": 580986086400.0, + "grad_norm": 0.04886656520386433, + "language_loss": 0.80143493, + "learning_rate": 0.0004384709823571958, + "loss": 0.8122558, + "num_input_tokens_seen": 240259296, + "router_z_loss_mlp": 0.38964844, + "step": 2875, + "time_per_iteration": 2.7270736694335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079953, + "balance_loss_mlp": 1.04113841, + "epoch": 0.5532897268180069, + "flos": 1122030144000.0, + "grad_norm": 0.06103557908182598, + "language_loss": 0.83129716, + "learning_rate": 0.0004381618198958932, + "loss": 0.84209669, + "num_input_tokens_seen": 240346768, + "router_z_loss_mlp": 0.38793945, + "step": 2876, + "time_per_iteration": 3.4826347827911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085381, + "balance_loss_mlp": 1.04721045, + "epoch": 0.5534821085032705, + "flos": 636965088768.0, + "grad_norm": 0.05070554688334561, + "language_loss": 0.83524168, + "learning_rate": 0.00043785268144230137, + "loss": 0.84609544, + "num_input_tokens_seen": 240429344, + "router_z_loss_mlp": 0.38183594, + "step": 2877, + "time_per_iteration": 2.8850836753845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080685, + "balance_loss_mlp": 1.04332519, + "epoch": 0.5536744901885341, + "flos": 570837597696.0, + "grad_norm": 0.056027333180870484, + "language_loss": 0.82300985, + "learning_rate": 0.00043754356711643837, + "loss": 0.83381677, + "num_input_tokens_seen": 240497008, + "router_z_loss_mlp": 0.37353516, + "step": 2878, + "time_per_iteration": 2.6629955768585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079329, + "balance_loss_mlp": 1.04180145, + "epoch": 0.5538668718737976, + "flos": 595418052096.0, + "grad_norm": 0.051053801448504514, + "language_loss": 0.84143484, + "learning_rate": 0.0004372344770383132, + "loss": 0.85222816, + "num_input_tokens_seen": 240578432, + "router_z_loss_mlp": 0.37475586, + "step": 2879, + "time_per_iteration": 2.809924364089966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080427, + "balance_loss_mlp": 1.04220867, + "epoch": 0.5540592535590612, + "flos": 532324182528.0, + "grad_norm": 0.054354704442993965, + "language_loss": 0.83048761, + "learning_rate": 0.00043692541132792507, + "loss": 0.8412919, + "num_input_tokens_seen": 240649136, + "router_z_loss_mlp": 0.38183594, + "step": 2880, + "time_per_iteration": 2.6826112270355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076517, + "balance_loss_mlp": 1.03915703, + "epoch": 0.5542516352443247, + "flos": 412398715392.0, + "grad_norm": 0.060842521075957015, + "language_loss": 0.83359361, + "learning_rate": 0.00043661637010526384, + "loss": 0.84435874, + "num_input_tokens_seen": 240714240, + "router_z_loss_mlp": 0.37329102, + "step": 2881, + "time_per_iteration": 2.5412843227386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077156, + "balance_loss_mlp": 1.03946209, + "epoch": 0.5544440169295883, + "flos": 547341083136.0, + "grad_norm": 0.06506612292228302, + "language_loss": 0.82828653, + "learning_rate": 0.00043630735349031025, + "loss": 0.83905804, + "num_input_tokens_seen": 240786928, + "router_z_loss_mlp": 0.37646484, + "step": 2882, + "time_per_iteration": 2.6428792476654053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079595, + "balance_loss_mlp": 1.04132843, + "epoch": 0.5546363986148518, + "flos": 621518994432.0, + "grad_norm": 0.04746548389090053, + "language_loss": 0.8146224, + "learning_rate": 0.00043599836160303495, + "loss": 0.82541835, + "num_input_tokens_seen": 240865328, + "router_z_loss_mlp": 0.38232422, + "step": 2883, + "time_per_iteration": 2.836928367614746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076033, + "balance_loss_mlp": 1.03833902, + "epoch": 0.5548287803001154, + "flos": 704972450304.0, + "grad_norm": 0.05191443424956408, + "language_loss": 0.77216405, + "learning_rate": 0.0004356893945633995, + "loss": 0.78292441, + "num_input_tokens_seen": 240945680, + "router_z_loss_mlp": 0.37719727, + "step": 2884, + "time_per_iteration": 2.959998846054077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077354, + "balance_loss_mlp": 1.03877735, + "epoch": 0.555021161985379, + "flos": 503952053760.0, + "grad_norm": 0.04795057861891694, + "language_loss": 0.8143183, + "learning_rate": 0.0004353804524913551, + "loss": 0.82509184, + "num_input_tokens_seen": 241010800, + "router_z_loss_mlp": 0.38549805, + "step": 2885, + "time_per_iteration": 2.587458848953247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076676, + "balance_loss_mlp": 1.03960204, + "epoch": 0.5552135436706426, + "flos": 615782020608.0, + "grad_norm": 0.060100634137020215, + "language_loss": 0.81801999, + "learning_rate": 0.0004350715355068441, + "loss": 0.82878673, + "num_input_tokens_seen": 241085328, + "router_z_loss_mlp": 0.37109375, + "step": 2886, + "time_per_iteration": 2.739311933517456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080638, + "balance_loss_mlp": 1.04227662, + "epoch": 0.5554059253559062, + "flos": 463635933696.0, + "grad_norm": 0.06732751663430354, + "language_loss": 0.79759407, + "learning_rate": 0.00043476264372979847, + "loss": 0.80840045, + "num_input_tokens_seen": 241149600, + "router_z_loss_mlp": 0.38305664, + "step": 2887, + "time_per_iteration": 2.5322625637054443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081996, + "balance_loss_mlp": 1.04425478, + "epoch": 0.5555983070411696, + "flos": 1561923813888.0, + "grad_norm": 0.05205208802168105, + "language_loss": 0.78767329, + "learning_rate": 0.0004344537772801408, + "loss": 0.79849327, + "num_input_tokens_seen": 241244832, + "router_z_loss_mlp": 0.37744141, + "step": 2888, + "time_per_iteration": 3.8099794387817383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01022363, + "balance_loss_mlp": 1.00986981, + "epoch": 0.5557906887264332, + "flos": 1467093468672.0, + "grad_norm": 0.012872465654446894, + "language_loss": 0.73422456, + "learning_rate": 0.0004341449362777836, + "loss": 0.74444818, + "num_input_tokens_seen": 241479728, + "router_z_loss_mlp": 0.12451172, + "step": 2889, + "time_per_iteration": 4.8980872631073 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080645, + "balance_loss_mlp": 1.04373789, + "epoch": 0.5559830704116968, + "flos": 529575750144.0, + "grad_norm": 0.056518477254008576, + "language_loss": 0.83232135, + "learning_rate": 0.0004338361208426298, + "loss": 0.84312785, + "num_input_tokens_seen": 241545616, + "router_z_loss_mlp": 0.36889648, + "step": 2890, + "time_per_iteration": 2.596644163131714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108101, + "balance_loss_mlp": 1.04312527, + "epoch": 0.5561754520969604, + "flos": 650895077376.0, + "grad_norm": 0.04719414959796351, + "language_loss": 0.81189138, + "learning_rate": 0.00043352733109457164, + "loss": 0.82270145, + "num_input_tokens_seen": 241629040, + "router_z_loss_mlp": 0.37841797, + "step": 2891, + "time_per_iteration": 2.8776957988739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079084, + "balance_loss_mlp": 1.04158103, + "epoch": 0.556367833782224, + "flos": 733968801792.0, + "grad_norm": 0.04510399892940866, + "language_loss": 0.84577823, + "learning_rate": 0.00043321856715349244, + "loss": 0.85656911, + "num_input_tokens_seen": 241706272, + "router_z_loss_mlp": 0.37451172, + "step": 2892, + "time_per_iteration": 2.9247210025787354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107932, + "balance_loss_mlp": 1.04243708, + "epoch": 0.5565602154674875, + "flos": 672120405504.0, + "grad_norm": 0.04457708587394983, + "language_loss": 0.80344868, + "learning_rate": 0.00043290982913926466, + "loss": 0.81424183, + "num_input_tokens_seen": 241782304, + "router_z_loss_mlp": 0.36889648, + "step": 2893, + "time_per_iteration": 2.791151285171509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087078, + "balance_loss_mlp": 1.04919362, + "epoch": 0.556752597152751, + "flos": 585911162880.0, + "grad_norm": 0.05091942660655845, + "language_loss": 0.84425044, + "learning_rate": 0.0004326011171717514, + "loss": 0.8551212, + "num_input_tokens_seen": 241868576, + "router_z_loss_mlp": 0.37866211, + "step": 2894, + "time_per_iteration": 2.8832085132598877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085909, + "balance_loss_mlp": 1.04788101, + "epoch": 0.5569449788380146, + "flos": 437549548032.0, + "grad_norm": 0.04808991967010034, + "language_loss": 0.81074953, + "learning_rate": 0.0004322924313708051, + "loss": 0.82160866, + "num_input_tokens_seen": 241933696, + "router_z_loss_mlp": 0.38012695, + "step": 2895, + "time_per_iteration": 2.5033986568450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079423, + "balance_loss_mlp": 1.04315972, + "epoch": 0.5571373605232782, + "flos": 502002372096.0, + "grad_norm": 0.057289668121921454, + "language_loss": 0.84257507, + "learning_rate": 0.0004319837718562681, + "loss": 0.85336924, + "num_input_tokens_seen": 242003056, + "router_z_loss_mlp": 0.36254883, + "step": 2896, + "time_per_iteration": 2.55461049079895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086447, + "balance_loss_mlp": 1.04856229, + "epoch": 0.5573297422085417, + "flos": 577126010880.0, + "grad_norm": 0.05427319641394577, + "language_loss": 0.83001935, + "learning_rate": 0.0004316751387479726, + "loss": 0.84088391, + "num_input_tokens_seen": 242076368, + "router_z_loss_mlp": 0.37841797, + "step": 2897, + "time_per_iteration": 2.726621150970459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010828, + "balance_loss_mlp": 1.04622626, + "epoch": 0.5575221238938053, + "flos": 1343536754688.0, + "grad_norm": 0.07147882998338702, + "language_loss": 0.82389295, + "learning_rate": 0.0004313665321657409, + "loss": 0.83472097, + "num_input_tokens_seen": 242161600, + "router_z_loss_mlp": 0.36572266, + "step": 2898, + "time_per_iteration": 3.705557107925415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084994, + "balance_loss_mlp": 1.04756212, + "epoch": 0.5577145055790689, + "flos": 601680324096.0, + "grad_norm": 0.06263472170874507, + "language_loss": 0.80018216, + "learning_rate": 0.00043105795222938436, + "loss": 0.81103212, + "num_input_tokens_seen": 242237904, + "router_z_loss_mlp": 0.37451172, + "step": 2899, + "time_per_iteration": 2.7069091796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082839, + "balance_loss_mlp": 1.04500163, + "epoch": 0.5579068872643325, + "flos": 562353601536.0, + "grad_norm": 0.0921941925102754, + "language_loss": 0.78331131, + "learning_rate": 0.00043074939905870467, + "loss": 0.79413968, + "num_input_tokens_seen": 242306736, + "router_z_loss_mlp": 0.37817383, + "step": 2900, + "time_per_iteration": 2.6597537994384766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108264, + "balance_loss_mlp": 1.04468393, + "epoch": 0.558099268949596, + "flos": 544292904960.0, + "grad_norm": 0.05487003421557055, + "language_loss": 0.80032802, + "learning_rate": 0.0004304408727734927, + "loss": 0.81115448, + "num_input_tokens_seen": 242376000, + "router_z_loss_mlp": 0.37939453, + "step": 2901, + "time_per_iteration": 2.61590838432312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077829, + "balance_loss_mlp": 1.04120803, + "epoch": 0.5582916506348595, + "flos": 552520825344.0, + "grad_norm": 0.05406538300276566, + "language_loss": 0.88821226, + "learning_rate": 0.0004301323734935288, + "loss": 0.89899063, + "num_input_tokens_seen": 242447056, + "router_z_loss_mlp": 0.36645508, + "step": 2902, + "time_per_iteration": 2.6357102394104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082783, + "balance_loss_mlp": 1.04573286, + "epoch": 0.5584840323201231, + "flos": 543126007296.0, + "grad_norm": 0.054631389421551546, + "language_loss": 0.87217975, + "learning_rate": 0.000429823901338583, + "loss": 0.88300759, + "num_input_tokens_seen": 242514400, + "router_z_loss_mlp": 0.37011719, + "step": 2903, + "time_per_iteration": 2.6050922870635986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073691, + "balance_loss_mlp": 1.03678417, + "epoch": 0.5586764140053867, + "flos": 815212118016.0, + "grad_norm": 0.05529085617610277, + "language_loss": 0.86446041, + "learning_rate": 0.00042951545642841513, + "loss": 0.87519729, + "num_input_tokens_seen": 242601616, + "router_z_loss_mlp": 0.36914062, + "step": 2904, + "time_per_iteration": 3.0609569549560547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076695, + "balance_loss_mlp": 1.03981209, + "epoch": 0.5588687956906503, + "flos": 486196895232.0, + "grad_norm": 0.04557850009306157, + "language_loss": 0.86361349, + "learning_rate": 0.0004292070388827737, + "loss": 0.87438047, + "num_input_tokens_seen": 242669648, + "router_z_loss_mlp": 0.3684082, + "step": 2905, + "time_per_iteration": 2.5549428462982178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107784, + "balance_loss_mlp": 1.04017019, + "epoch": 0.5590611773759138, + "flos": 451809805824.0, + "grad_norm": 0.04842795237529701, + "language_loss": 0.8078168, + "learning_rate": 0.00042889864882139753, + "loss": 0.81859523, + "num_input_tokens_seen": 242737456, + "router_z_loss_mlp": 0.37646484, + "step": 2906, + "time_per_iteration": 2.6019363403320312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072955, + "balance_loss_mlp": 1.03662026, + "epoch": 0.5592535590611774, + "flos": 520693083648.0, + "grad_norm": 0.04884179046821603, + "language_loss": 0.81762469, + "learning_rate": 0.0004285902863640139, + "loss": 0.8283543, + "num_input_tokens_seen": 242807008, + "router_z_loss_mlp": 0.36352539, + "step": 2907, + "time_per_iteration": 2.5899524688720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072848, + "balance_loss_mlp": 1.03622651, + "epoch": 0.5594459407464409, + "flos": 552250192896.0, + "grad_norm": 0.048074009249812255, + "language_loss": 0.8615104, + "learning_rate": 0.00042828195163033966, + "loss": 0.87223887, + "num_input_tokens_seen": 242877328, + "router_z_loss_mlp": 0.36645508, + "step": 2908, + "time_per_iteration": 2.676518440246582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072859, + "balance_loss_mlp": 1.03585625, + "epoch": 0.5596383224317045, + "flos": 484596421632.0, + "grad_norm": 0.0512741694464887, + "language_loss": 0.79307508, + "learning_rate": 0.0004279736447400812, + "loss": 0.80380368, + "num_input_tokens_seen": 242943152, + "router_z_loss_mlp": 0.36987305, + "step": 2909, + "time_per_iteration": 2.590859889984131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074509, + "balance_loss_mlp": 1.03676748, + "epoch": 0.5598307041169681, + "flos": 610976217600.0, + "grad_norm": 0.05469922136848912, + "language_loss": 0.78325337, + "learning_rate": 0.00042766536581293385, + "loss": 0.79399848, + "num_input_tokens_seen": 243014656, + "router_z_loss_mlp": 0.37695312, + "step": 2910, + "time_per_iteration": 2.7034008502960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074194, + "balance_loss_mlp": 1.03654802, + "epoch": 0.5600230858022316, + "flos": 488585945088.0, + "grad_norm": 0.05207227245540468, + "language_loss": 0.79564762, + "learning_rate": 0.0004273571149685819, + "loss": 0.80638957, + "num_input_tokens_seen": 243089040, + "router_z_loss_mlp": 0.37597656, + "step": 2911, + "time_per_iteration": 2.7075796127319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074918, + "balance_loss_mlp": 1.03650868, + "epoch": 0.5602154674874952, + "flos": 598592858112.0, + "grad_norm": 0.04994756976596268, + "language_loss": 0.84006047, + "learning_rate": 0.00042704889232669937, + "loss": 0.85080969, + "num_input_tokens_seen": 243162480, + "router_z_loss_mlp": 0.38354492, + "step": 2912, + "time_per_iteration": 2.6922175884246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071101, + "balance_loss_mlp": 1.03431344, + "epoch": 0.5604078491727588, + "flos": 585697347072.0, + "grad_norm": 0.05437848146357707, + "language_loss": 0.85302234, + "learning_rate": 0.0004267406980069484, + "loss": 0.86373341, + "num_input_tokens_seen": 243232880, + "router_z_loss_mlp": 0.36791992, + "step": 2913, + "time_per_iteration": 2.70796537399292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067512, + "balance_loss_mlp": 1.03077149, + "epoch": 0.5606002308580224, + "flos": 540926042112.0, + "grad_norm": 0.045341959008097614, + "language_loss": 0.79753983, + "learning_rate": 0.0004264325321289808, + "loss": 0.80821496, + "num_input_tokens_seen": 243309168, + "router_z_loss_mlp": 0.3671875, + "step": 2914, + "time_per_iteration": 2.761362314224243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107061, + "balance_loss_mlp": 1.03241491, + "epoch": 0.5607926125432858, + "flos": 583654533120.0, + "grad_norm": 0.0532534560102953, + "language_loss": 0.85864502, + "learning_rate": 0.00042612439481243736, + "loss": 0.86935115, + "num_input_tokens_seen": 243382064, + "router_z_loss_mlp": 0.38183594, + "step": 2915, + "time_per_iteration": 2.745008945465088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073585, + "balance_loss_mlp": 1.03655863, + "epoch": 0.5609849942285494, + "flos": 627205095936.0, + "grad_norm": 0.06454697115510677, + "language_loss": 0.90024638, + "learning_rate": 0.00042581628617694735, + "loss": 0.91098225, + "num_input_tokens_seen": 243452064, + "router_z_loss_mlp": 0.37036133, + "step": 2916, + "time_per_iteration": 2.7654495239257812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072441, + "balance_loss_mlp": 1.0346992, + "epoch": 0.561177375913813, + "flos": 588095161344.0, + "grad_norm": 0.05235254168005436, + "language_loss": 0.81651318, + "learning_rate": 0.0004255082063421296, + "loss": 0.82723755, + "num_input_tokens_seen": 243525600, + "router_z_loss_mlp": 0.37719727, + "step": 2917, + "time_per_iteration": 2.674204111099243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107424, + "balance_loss_mlp": 1.03726149, + "epoch": 0.5613697575990766, + "flos": 526774883328.0, + "grad_norm": 0.05687183599046208, + "language_loss": 0.8481921, + "learning_rate": 0.00042520015542759065, + "loss": 0.85893452, + "num_input_tokens_seen": 243605536, + "router_z_loss_mlp": 0.36987305, + "step": 2918, + "time_per_iteration": 2.8309459686279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079966, + "balance_loss_mlp": 1.04134226, + "epoch": 0.5615621392843402, + "flos": 642351444480.0, + "grad_norm": 0.05024796403090353, + "language_loss": 0.88020825, + "learning_rate": 0.00042489213355292687, + "loss": 0.89100802, + "num_input_tokens_seen": 243684208, + "router_z_loss_mlp": 0.38598633, + "step": 2919, + "time_per_iteration": 2.8605942726135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083327, + "balance_loss_mlp": 1.04444087, + "epoch": 0.5617545209696037, + "flos": 427524715008.0, + "grad_norm": 0.05130722807003229, + "language_loss": 0.8097831, + "learning_rate": 0.00042458414083772276, + "loss": 0.82061636, + "num_input_tokens_seen": 243749376, + "router_z_loss_mlp": 0.38842773, + "step": 2920, + "time_per_iteration": 2.5186893939971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076823, + "balance_loss_mlp": 1.03920078, + "epoch": 0.5619469026548672, + "flos": 568140037632.0, + "grad_norm": 0.04280127072200588, + "language_loss": 0.84787017, + "learning_rate": 0.000424276177401552, + "loss": 0.85863835, + "num_input_tokens_seen": 243828096, + "router_z_loss_mlp": 0.37597656, + "step": 2921, + "time_per_iteration": 2.773881435394287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079464, + "balance_loss_mlp": 1.04203272, + "epoch": 0.5621392843401308, + "flos": 504947243520.0, + "grad_norm": 0.056711430924252765, + "language_loss": 0.85714108, + "learning_rate": 0.0004239682433639763, + "loss": 0.86793578, + "num_input_tokens_seen": 243896752, + "router_z_loss_mlp": 0.37426758, + "step": 2922, + "time_per_iteration": 2.714646816253662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081153, + "balance_loss_mlp": 1.04477036, + "epoch": 0.5623316660253944, + "flos": 516744258048.0, + "grad_norm": 0.060505090734525195, + "language_loss": 0.85348099, + "learning_rate": 0.0004236603388445467, + "loss": 0.8642925, + "num_input_tokens_seen": 243964592, + "router_z_loss_mlp": 0.36425781, + "step": 2923, + "time_per_iteration": 2.6141107082366943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075818, + "balance_loss_mlp": 1.03905368, + "epoch": 0.5625240477106579, + "flos": 605732456448.0, + "grad_norm": 0.05369747698254185, + "language_loss": 0.81871819, + "learning_rate": 0.00042335246396280166, + "loss": 0.82947636, + "num_input_tokens_seen": 244036656, + "router_z_loss_mlp": 0.3671875, + "step": 2924, + "time_per_iteration": 2.7129671573638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081443, + "balance_loss_mlp": 1.0438447, + "epoch": 0.5627164293959215, + "flos": 450203539968.0, + "grad_norm": 0.06323509209264203, + "language_loss": 0.89955974, + "learning_rate": 0.0004230446188382693, + "loss": 0.9103741, + "num_input_tokens_seen": 244102704, + "router_z_loss_mlp": 0.3762207, + "step": 2925, + "time_per_iteration": 2.5567660331726074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077212, + "balance_loss_mlp": 1.04101968, + "epoch": 0.5629088110811851, + "flos": 741734032896.0, + "grad_norm": 0.055420573846539395, + "language_loss": 0.80082184, + "learning_rate": 0.0004227368035904654, + "loss": 0.81159395, + "num_input_tokens_seen": 244186640, + "router_z_loss_mlp": 0.36181641, + "step": 2926, + "time_per_iteration": 2.947251319885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108299, + "balance_loss_mlp": 1.04610705, + "epoch": 0.5631011927664487, + "flos": 496719323136.0, + "grad_norm": 0.04719463019166682, + "language_loss": 0.82913107, + "learning_rate": 0.00042242901833889474, + "loss": 0.83996093, + "num_input_tokens_seen": 244257680, + "router_z_loss_mlp": 0.36889648, + "step": 2927, + "time_per_iteration": 2.6429412364959717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085596, + "balance_loss_mlp": 1.0498333, + "epoch": 0.5632935744517122, + "flos": 885774445056.0, + "grad_norm": 0.055780235249339845, + "language_loss": 0.85862845, + "learning_rate": 0.0004221212632030501, + "loss": 0.86948442, + "num_input_tokens_seen": 244331248, + "router_z_loss_mlp": 0.35791016, + "step": 2928, + "time_per_iteration": 3.0935142040252686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085595, + "balance_loss_mlp": 1.04897451, + "epoch": 0.5634859561369757, + "flos": 604516096512.0, + "grad_norm": 0.08179321361553939, + "language_loss": 0.80431306, + "learning_rate": 0.0004218135383024124, + "loss": 0.81516898, + "num_input_tokens_seen": 244403920, + "router_z_loss_mlp": 0.3659668, + "step": 2929, + "time_per_iteration": 2.688404083251953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079905, + "balance_loss_mlp": 1.04359436, + "epoch": 0.5636783378222393, + "flos": 453670737408.0, + "grad_norm": 0.05341288147748167, + "language_loss": 0.85107243, + "learning_rate": 0.0004215058437564511, + "loss": 0.86187148, + "num_input_tokens_seen": 244470464, + "router_z_loss_mlp": 0.36352539, + "step": 2930, + "time_per_iteration": 2.5591979026794434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083948, + "balance_loss_mlp": 1.04725528, + "epoch": 0.5638707195075029, + "flos": 518206519296.0, + "grad_norm": 0.06241038231461263, + "language_loss": 0.82415265, + "learning_rate": 0.00042119817968462397, + "loss": 0.83499211, + "num_input_tokens_seen": 244536864, + "router_z_loss_mlp": 0.36694336, + "step": 2931, + "time_per_iteration": 2.5755324363708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075664, + "balance_loss_mlp": 1.03916192, + "epoch": 0.5640631011927665, + "flos": 564632142336.0, + "grad_norm": 0.06755883510394861, + "language_loss": 0.87004125, + "learning_rate": 0.0004208905462063766, + "loss": 0.88079786, + "num_input_tokens_seen": 244603344, + "router_z_loss_mlp": 0.36499023, + "step": 2932, + "time_per_iteration": 2.6330130100250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077211, + "balance_loss_mlp": 1.04097116, + "epoch": 0.56425548287803, + "flos": 516783545856.0, + "grad_norm": 0.04875434703648171, + "language_loss": 0.84473455, + "learning_rate": 0.00042058294344114315, + "loss": 0.85550666, + "num_input_tokens_seen": 244671984, + "router_z_loss_mlp": 0.36254883, + "step": 2933, + "time_per_iteration": 2.60188627243042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081111, + "balance_loss_mlp": 1.04477572, + "epoch": 0.5644478645632935, + "flos": 853907415552.0, + "grad_norm": 0.05278955631679875, + "language_loss": 0.77495515, + "learning_rate": 0.0004202753715083456, + "loss": 0.78576624, + "num_input_tokens_seen": 244754000, + "router_z_loss_mlp": 0.36352539, + "step": 2934, + "time_per_iteration": 3.0625100135803223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084889, + "balance_loss_mlp": 1.04860175, + "epoch": 0.5646402462485571, + "flos": 553175571456.0, + "grad_norm": 0.05717629686508025, + "language_loss": 0.81433523, + "learning_rate": 0.0004199678305273936, + "loss": 0.82518411, + "num_input_tokens_seen": 244820896, + "router_z_loss_mlp": 0.36279297, + "step": 2935, + "time_per_iteration": 2.6390254497528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082496, + "balance_loss_mlp": 1.04587531, + "epoch": 0.5648326279338207, + "flos": 685661898240.0, + "grad_norm": 0.05411523361189988, + "language_loss": 0.81180829, + "learning_rate": 0.0004196603206176854, + "loss": 0.82263327, + "num_input_tokens_seen": 244904464, + "router_z_loss_mlp": 0.36669922, + "step": 2936, + "time_per_iteration": 2.9184954166412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079161, + "balance_loss_mlp": 1.04354107, + "epoch": 0.5650250096190843, + "flos": 802990291968.0, + "grad_norm": 0.04902014595353554, + "language_loss": 0.83833814, + "learning_rate": 0.000419352841898607, + "loss": 0.84912974, + "num_input_tokens_seen": 244983760, + "router_z_loss_mlp": 0.35644531, + "step": 2937, + "time_per_iteration": 2.963693618774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078443, + "balance_loss_mlp": 1.04248953, + "epoch": 0.5652173913043478, + "flos": 581787809280.0, + "grad_norm": 0.05926519799053672, + "language_loss": 0.77107543, + "learning_rate": 0.000419045394489532, + "loss": 0.78185987, + "num_input_tokens_seen": 245053184, + "router_z_loss_mlp": 0.359375, + "step": 2938, + "time_per_iteration": 2.727398633956909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076353, + "balance_loss_mlp": 1.03975606, + "epoch": 0.5654097729896114, + "flos": 820269614592.0, + "grad_norm": 0.053889258634032246, + "language_loss": 0.76768535, + "learning_rate": 0.0004187379785098224, + "loss": 0.77844894, + "num_input_tokens_seen": 245137408, + "router_z_loss_mlp": 0.3659668, + "step": 2939, + "time_per_iteration": 3.1188313961029053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079563, + "balance_loss_mlp": 1.04339492, + "epoch": 0.565602154674875, + "flos": 783826716672.0, + "grad_norm": 0.05512056097545077, + "language_loss": 0.83633238, + "learning_rate": 0.00041843059407882744, + "loss": 0.84712803, + "num_input_tokens_seen": 245215504, + "router_z_loss_mlp": 0.36206055, + "step": 2940, + "time_per_iteration": 2.983302116394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076269, + "balance_loss_mlp": 1.04072082, + "epoch": 0.5657945363601385, + "flos": 549418802688.0, + "grad_norm": 0.05159052201649483, + "language_loss": 0.82491434, + "learning_rate": 0.0004181232413158842, + "loss": 0.83567703, + "num_input_tokens_seen": 245286032, + "router_z_loss_mlp": 0.35571289, + "step": 2941, + "time_per_iteration": 2.6737120151519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076838, + "balance_loss_mlp": 1.04028893, + "epoch": 0.5659869180454021, + "flos": 667826754048.0, + "grad_norm": 0.06466569325042074, + "language_loss": 0.82093412, + "learning_rate": 0.0004178159203403179, + "loss": 0.83170253, + "num_input_tokens_seen": 245359040, + "router_z_loss_mlp": 0.36547852, + "step": 2942, + "time_per_iteration": 2.8263752460479736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077242, + "balance_loss_mlp": 1.0423857, + "epoch": 0.5661792997306656, + "flos": 499707864576.0, + "grad_norm": 0.05486974364690197, + "language_loss": 0.81532693, + "learning_rate": 0.0004175086312714409, + "loss": 0.82609934, + "num_input_tokens_seen": 245426384, + "router_z_loss_mlp": 0.34912109, + "step": 2943, + "time_per_iteration": 2.5581164360046387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084461, + "balance_loss_mlp": 1.04848337, + "epoch": 0.5663716814159292, + "flos": 600922271232.0, + "grad_norm": 0.04881995286740945, + "language_loss": 0.83686805, + "learning_rate": 0.00041720137422855366, + "loss": 0.84771264, + "num_input_tokens_seen": 245501216, + "router_z_loss_mlp": 0.35961914, + "step": 2944, + "time_per_iteration": 2.7574734687805176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080125, + "balance_loss_mlp": 1.04390931, + "epoch": 0.5665640631011928, + "flos": 540728193024.0, + "grad_norm": 0.05214507443979086, + "language_loss": 0.79004753, + "learning_rate": 0.00041689414933094383, + "loss": 0.80084872, + "num_input_tokens_seen": 245571600, + "router_z_loss_mlp": 0.36230469, + "step": 2945, + "time_per_iteration": 2.6470541954040527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080942, + "balance_loss_mlp": 1.0463953, + "epoch": 0.5667564447864564, + "flos": 601655592960.0, + "grad_norm": 0.06146311821637782, + "language_loss": 0.80673099, + "learning_rate": 0.00041658695669788653, + "loss": 0.81754035, + "num_input_tokens_seen": 245645632, + "router_z_loss_mlp": 0.34594727, + "step": 2946, + "time_per_iteration": 2.721078872680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083586, + "balance_loss_mlp": 1.04791868, + "epoch": 0.5669488264717198, + "flos": 659224894464.0, + "grad_norm": 0.05891401598443517, + "language_loss": 0.80939281, + "learning_rate": 0.00041627979644864453, + "loss": 0.82022864, + "num_input_tokens_seen": 245715776, + "router_z_loss_mlp": 0.35717773, + "step": 2947, + "time_per_iteration": 2.877037286758423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085181, + "balance_loss_mlp": 1.04941845, + "epoch": 0.5671412081569834, + "flos": 485158035456.0, + "grad_norm": 0.042998309327625356, + "language_loss": 0.809735, + "learning_rate": 0.0004159726687024683, + "loss": 0.8205868, + "num_input_tokens_seen": 245785328, + "router_z_loss_mlp": 0.35791016, + "step": 2948, + "time_per_iteration": 2.617147207260132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084185, + "balance_loss_mlp": 1.04832673, + "epoch": 0.567333589842247, + "flos": 729487475712.0, + "grad_norm": 0.049875608566737006, + "language_loss": 0.79203111, + "learning_rate": 0.00041566557357859506, + "loss": 0.80287302, + "num_input_tokens_seen": 245858000, + "router_z_loss_mlp": 0.35888672, + "step": 2949, + "time_per_iteration": 2.859217882156372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080493, + "balance_loss_mlp": 1.04494464, + "epoch": 0.5675259715275106, + "flos": 968471258112.0, + "grad_norm": 0.06410563873068757, + "language_loss": 0.79063594, + "learning_rate": 0.0004153585111962502, + "loss": 0.80144083, + "num_input_tokens_seen": 245950640, + "router_z_loss_mlp": 0.35571289, + "step": 2950, + "time_per_iteration": 3.3080387115478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084603, + "balance_loss_mlp": 1.04767203, + "epoch": 0.5677183532127742, + "flos": 564879453696.0, + "grad_norm": 0.058242755990822084, + "language_loss": 0.84030402, + "learning_rate": 0.0004150514816746453, + "loss": 0.85115004, + "num_input_tokens_seen": 246019568, + "router_z_loss_mlp": 0.36938477, + "step": 2951, + "time_per_iteration": 2.66630220413208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080246, + "balance_loss_mlp": 1.04517412, + "epoch": 0.5679107348980377, + "flos": 551432503296.0, + "grad_norm": 0.05117838990465897, + "language_loss": 0.85669959, + "learning_rate": 0.0004147444851329802, + "loss": 0.86750209, + "num_input_tokens_seen": 246089520, + "router_z_loss_mlp": 0.35107422, + "step": 2952, + "time_per_iteration": 2.645735502243042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108119, + "balance_loss_mlp": 1.04585648, + "epoch": 0.5681031165833013, + "flos": 819115863552.0, + "grad_norm": 0.04931619960622222, + "language_loss": 0.85395974, + "learning_rate": 0.00041443752169044126, + "loss": 0.8647716, + "num_input_tokens_seen": 246165920, + "router_z_loss_mlp": 0.35351562, + "step": 2953, + "time_per_iteration": 3.025468349456787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087019, + "balance_loss_mlp": 1.05116129, + "epoch": 0.5682954982685648, + "flos": 617731702272.0, + "grad_norm": 0.05138113495872943, + "language_loss": 0.84811544, + "learning_rate": 0.0004141305914662025, + "loss": 0.85898566, + "num_input_tokens_seen": 246238672, + "router_z_loss_mlp": 0.35888672, + "step": 2954, + "time_per_iteration": 2.7767860889434814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108515, + "balance_loss_mlp": 1.04848099, + "epoch": 0.5684878799538284, + "flos": 647625729024.0, + "grad_norm": 0.04880277930525614, + "language_loss": 0.80257368, + "learning_rate": 0.0004138236945794246, + "loss": 0.81342518, + "num_input_tokens_seen": 246320208, + "router_z_loss_mlp": 0.36645508, + "step": 2955, + "time_per_iteration": 2.9492557048797607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079722, + "balance_loss_mlp": 1.04434061, + "epoch": 0.5686802616390919, + "flos": 805615068672.0, + "grad_norm": 0.060523381383535066, + "language_loss": 0.83239132, + "learning_rate": 0.00041351683114925576, + "loss": 0.84318852, + "num_input_tokens_seen": 246406464, + "router_z_loss_mlp": 0.35424805, + "step": 2956, + "time_per_iteration": 3.0558693408966064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080356, + "balance_loss_mlp": 1.0441637, + "epoch": 0.5688726433243555, + "flos": 546882776064.0, + "grad_norm": 0.06102379875806974, + "language_loss": 0.86688364, + "learning_rate": 0.0004132100012948308, + "loss": 0.87768722, + "num_input_tokens_seen": 246477456, + "router_z_loss_mlp": 0.36230469, + "step": 2957, + "time_per_iteration": 2.6131510734558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084018, + "balance_loss_mlp": 1.04689598, + "epoch": 0.5690650250096191, + "flos": 486324933120.0, + "grad_norm": 0.05856765821562534, + "language_loss": 0.84111595, + "learning_rate": 0.00041290320513527145, + "loss": 0.85195613, + "num_input_tokens_seen": 246541744, + "router_z_loss_mlp": 0.37133789, + "step": 2958, + "time_per_iteration": 2.584434986114502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077095, + "balance_loss_mlp": 1.04154706, + "epoch": 0.5692574066948827, + "flos": 577184237568.0, + "grad_norm": 0.04674501738886335, + "language_loss": 0.85154927, + "learning_rate": 0.0004125964427896867, + "loss": 0.86232018, + "num_input_tokens_seen": 246611440, + "router_z_loss_mlp": 0.35571289, + "step": 2959, + "time_per_iteration": 2.6582295894622803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071399, + "balance_loss_mlp": 1.03551733, + "epoch": 0.5694497883801463, + "flos": 454005388800.0, + "grad_norm": 0.055082869163009494, + "language_loss": 0.79042369, + "learning_rate": 0.0004122897143771723, + "loss": 0.80113769, + "num_input_tokens_seen": 246676496, + "router_z_loss_mlp": 0.35888672, + "step": 2960, + "time_per_iteration": 2.555941104888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075778, + "balance_loss_mlp": 1.0394429, + "epoch": 0.5696421700654097, + "flos": 559251578880.0, + "grad_norm": 0.0498118595632428, + "language_loss": 0.81253064, + "learning_rate": 0.0004119830200168109, + "loss": 0.82328844, + "num_input_tokens_seen": 246746464, + "router_z_loss_mlp": 0.36376953, + "step": 2961, + "time_per_iteration": 2.6521012783050537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073119, + "balance_loss_mlp": 1.03780937, + "epoch": 0.5698345517506733, + "flos": 465314982912.0, + "grad_norm": 0.05616905034177488, + "language_loss": 0.8830415, + "learning_rate": 0.0004116763598276714, + "loss": 0.89377272, + "num_input_tokens_seen": 246811808, + "router_z_loss_mlp": 0.35327148, + "step": 2962, + "time_per_iteration": 2.5006790161132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073408, + "balance_loss_mlp": 1.03702545, + "epoch": 0.5700269334359369, + "flos": 605645116416.0, + "grad_norm": 0.05368070912324084, + "language_loss": 0.8055867, + "learning_rate": 0.00041136973392881017, + "loss": 0.81632078, + "num_input_tokens_seen": 246890432, + "router_z_loss_mlp": 0.36376953, + "step": 2963, + "time_per_iteration": 2.8011715412139893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073783, + "balance_loss_mlp": 1.03852105, + "epoch": 0.5702193151212005, + "flos": 562423412736.0, + "grad_norm": 0.05977105557008513, + "language_loss": 0.81818962, + "learning_rate": 0.00041106314243926983, + "loss": 0.82892752, + "num_input_tokens_seen": 246959616, + "router_z_loss_mlp": 0.35302734, + "step": 2964, + "time_per_iteration": 2.7296242713928223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070445, + "balance_loss_mlp": 1.03558779, + "epoch": 0.570411696806464, + "flos": 522983208960.0, + "grad_norm": 0.05693204807949615, + "language_loss": 0.87045705, + "learning_rate": 0.0004107565854780798, + "loss": 0.88116145, + "num_input_tokens_seen": 247030656, + "router_z_loss_mlp": 0.34887695, + "step": 2965, + "time_per_iteration": 2.5964605808258057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075214, + "balance_loss_mlp": 1.04002357, + "epoch": 0.5706040784917276, + "flos": 717911631360.0, + "grad_norm": 0.05031367362382368, + "language_loss": 0.80980343, + "learning_rate": 0.000410450063164256, + "loss": 0.82055557, + "num_input_tokens_seen": 247105872, + "router_z_loss_mlp": 0.35229492, + "step": 2966, + "time_per_iteration": 2.8248300552368164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076636, + "balance_loss_mlp": 1.04127812, + "epoch": 0.5707964601769911, + "flos": 476467425792.0, + "grad_norm": 0.059966750204006415, + "language_loss": 0.8167066, + "learning_rate": 0.00041014357561680115, + "loss": 0.82747293, + "num_input_tokens_seen": 247170448, + "router_z_loss_mlp": 0.35351562, + "step": 2967, + "time_per_iteration": 2.4996910095214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077241, + "balance_loss_mlp": 1.04278946, + "epoch": 0.5709888418622547, + "flos": 579823570944.0, + "grad_norm": 0.05891056148222195, + "language_loss": 0.85875672, + "learning_rate": 0.0004098371229547039, + "loss": 0.86952913, + "num_input_tokens_seen": 247240400, + "router_z_loss_mlp": 0.3449707, + "step": 2968, + "time_per_iteration": 2.6908459663391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131838, + "balance_loss_mlp": 1.11677039, + "epoch": 0.5711812235475183, + "flos": 1579108290048.0, + "grad_norm": 0.050443633584492734, + "language_loss": 0.80010808, + "learning_rate": 0.0004095307052969399, + "loss": 0.81142646, + "num_input_tokens_seen": 247469136, + "router_z_loss_mlp": 0.15039062, + "step": 2969, + "time_per_iteration": 4.709675550460815 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107846, + "balance_loss_mlp": 1.04233932, + "epoch": 0.5713736052327818, + "flos": 468259854336.0, + "grad_norm": 0.04864564090032181, + "language_loss": 0.80513656, + "learning_rate": 0.00040922432276247107, + "loss": 0.81592119, + "num_input_tokens_seen": 247537712, + "router_z_loss_mlp": 0.36132812, + "step": 2970, + "time_per_iteration": 2.554276466369629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078647, + "balance_loss_mlp": 1.04412448, + "epoch": 0.5715659869180454, + "flos": 537390443520.0, + "grad_norm": 0.06858717783230618, + "language_loss": 0.84265316, + "learning_rate": 0.0004089179754702457, + "loss": 0.85343957, + "num_input_tokens_seen": 247613872, + "router_z_loss_mlp": 0.34570312, + "step": 2971, + "time_per_iteration": 2.7972512245178223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072054, + "balance_loss_mlp": 1.0365299, + "epoch": 0.571758368603309, + "flos": 655778045952.0, + "grad_norm": 0.0710461233457747, + "language_loss": 0.79649973, + "learning_rate": 0.00040861166353919843, + "loss": 0.80722028, + "num_input_tokens_seen": 247686064, + "router_z_loss_mlp": 0.35546875, + "step": 2972, + "time_per_iteration": 2.7805516719818115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076554, + "balance_loss_mlp": 1.04076695, + "epoch": 0.5719507502885726, + "flos": 667609966080.0, + "grad_norm": 0.05192257726698222, + "language_loss": 0.81693333, + "learning_rate": 0.00040830538708824983, + "loss": 0.82769883, + "num_input_tokens_seen": 247760384, + "router_z_loss_mlp": 0.35839844, + "step": 2973, + "time_per_iteration": 2.8635294437408447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070547, + "balance_loss_mlp": 1.03507066, + "epoch": 0.572143131973836, + "flos": 476083312128.0, + "grad_norm": 0.060626408017241236, + "language_loss": 0.81790257, + "learning_rate": 0.000407999146236307, + "loss": 0.82860804, + "num_input_tokens_seen": 247824768, + "router_z_loss_mlp": 0.35498047, + "step": 2974, + "time_per_iteration": 2.5645899772644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074889, + "balance_loss_mlp": 1.03943634, + "epoch": 0.5723355136590996, + "flos": 539255757312.0, + "grad_norm": 0.06009071322865027, + "language_loss": 0.83246768, + "learning_rate": 0.0004076929411022634, + "loss": 0.84321654, + "num_input_tokens_seen": 247894448, + "router_z_loss_mlp": 0.35449219, + "step": 2975, + "time_per_iteration": 2.655545234680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075121, + "balance_loss_mlp": 1.0383811, + "epoch": 0.5725278953443632, + "flos": 823784864256.0, + "grad_norm": 0.053970809123607175, + "language_loss": 0.79314309, + "learning_rate": 0.0004073867718049982, + "loss": 0.80389434, + "num_input_tokens_seen": 247976432, + "router_z_loss_mlp": 0.36743164, + "step": 2976, + "time_per_iteration": 3.0664896965026855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078369, + "balance_loss_mlp": 1.0429157, + "epoch": 0.5727202770296268, + "flos": 587155226112.0, + "grad_norm": 0.05912475797179562, + "language_loss": 0.82244706, + "learning_rate": 0.00040708063846337704, + "loss": 0.83323073, + "num_input_tokens_seen": 248048800, + "router_z_loss_mlp": 0.35522461, + "step": 2977, + "time_per_iteration": 2.7131478786468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083864, + "balance_loss_mlp": 1.04800642, + "epoch": 0.5729126587148904, + "flos": 446723195904.0, + "grad_norm": 0.048537452765021645, + "language_loss": 0.80846637, + "learning_rate": 0.00040677454119625143, + "loss": 0.81930506, + "num_input_tokens_seen": 248116496, + "router_z_loss_mlp": 0.35864258, + "step": 2978, + "time_per_iteration": 2.6209888458251953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078154, + "balance_loss_mlp": 1.0418427, + "epoch": 0.5731050404001539, + "flos": 519206091264.0, + "grad_norm": 0.05702144714813726, + "language_loss": 0.82471335, + "learning_rate": 0.0004064684801224587, + "loss": 0.83549494, + "num_input_tokens_seen": 248184960, + "router_z_loss_mlp": 0.36328125, + "step": 2979, + "time_per_iteration": 2.5915722846984863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077909, + "balance_loss_mlp": 1.04197955, + "epoch": 0.5732974220854175, + "flos": 504528224256.0, + "grad_norm": 0.05171310351774622, + "language_loss": 0.80115962, + "learning_rate": 0.00040616245536082224, + "loss": 0.8119387, + "num_input_tokens_seen": 248252208, + "router_z_loss_mlp": 0.35961914, + "step": 2980, + "time_per_iteration": 2.6032769680023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076097, + "balance_loss_mlp": 1.04057276, + "epoch": 0.573489803770681, + "flos": 592187991552.0, + "grad_norm": 0.049753074122949235, + "language_loss": 0.80894011, + "learning_rate": 0.00040585646703015165, + "loss": 0.81970108, + "num_input_tokens_seen": 248333312, + "router_z_loss_mlp": 0.35522461, + "step": 2981, + "time_per_iteration": 2.79546856880188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074296, + "balance_loss_mlp": 1.03891444, + "epoch": 0.5736821854559446, + "flos": 489672857088.0, + "grad_norm": 0.06088968225358262, + "language_loss": 0.78612393, + "learning_rate": 0.0004055505152492419, + "loss": 0.79686689, + "num_input_tokens_seen": 248403808, + "router_z_loss_mlp": 0.35449219, + "step": 2982, + "time_per_iteration": 2.6494040489196777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078742, + "balance_loss_mlp": 1.04283655, + "epoch": 0.5738745671412081, + "flos": 457895987712.0, + "grad_norm": 0.05054468303814383, + "language_loss": 0.74372864, + "learning_rate": 0.00040524460013687425, + "loss": 0.75451601, + "num_input_tokens_seen": 248477184, + "router_z_loss_mlp": 0.359375, + "step": 2983, + "time_per_iteration": 2.7171366214752197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078136, + "balance_loss_mlp": 1.04294515, + "epoch": 0.5740669488264717, + "flos": 580012655616.0, + "grad_norm": 0.044553783792680594, + "language_loss": 0.80828458, + "learning_rate": 0.0004049387218118155, + "loss": 0.81906593, + "num_input_tokens_seen": 248565552, + "router_z_loss_mlp": 0.35229492, + "step": 2984, + "time_per_iteration": 2.995347738265991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073612, + "balance_loss_mlp": 1.03725314, + "epoch": 0.5742593305117353, + "flos": 524155898880.0, + "grad_norm": 0.05730874981758524, + "language_loss": 0.8475495, + "learning_rate": 0.00040463288039281777, + "loss": 0.85828567, + "num_input_tokens_seen": 248635456, + "router_z_loss_mlp": 0.36328125, + "step": 2985, + "time_per_iteration": 2.715092182159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102117, + "balance_loss_mlp": 1.0106324, + "epoch": 0.5744517121969989, + "flos": 1553033488896.0, + "grad_norm": 0.021440825644231668, + "language_loss": 0.77876419, + "learning_rate": 0.0004043270759986194, + "loss": 0.78897589, + "num_input_tokens_seen": 248870160, + "router_z_loss_mlp": 0.10546875, + "step": 2986, + "time_per_iteration": 4.936111211776733 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071312, + "balance_loss_mlp": 1.03588247, + "epoch": 0.5746440938822625, + "flos": 751600304640.0, + "grad_norm": 0.05668637583843988, + "language_loss": 0.81840217, + "learning_rate": 0.0004040213087479444, + "loss": 0.82911527, + "num_input_tokens_seen": 248946960, + "router_z_loss_mlp": 0.35449219, + "step": 2987, + "time_per_iteration": 2.949164628982544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074412, + "balance_loss_mlp": 1.03955531, + "epoch": 0.5748364755675259, + "flos": 501618258432.0, + "grad_norm": 0.05762088821448085, + "language_loss": 0.84999508, + "learning_rate": 0.0004037155787595018, + "loss": 0.86073923, + "num_input_tokens_seen": 249014128, + "router_z_loss_mlp": 0.34887695, + "step": 2988, + "time_per_iteration": 2.6570816040039062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010738, + "balance_loss_mlp": 1.03863311, + "epoch": 0.5750288572527895, + "flos": 503757024768.0, + "grad_norm": 0.17757642281187902, + "language_loss": 0.80609345, + "learning_rate": 0.000403409886151987, + "loss": 0.81683147, + "num_input_tokens_seen": 249090016, + "router_z_loss_mlp": 0.35205078, + "step": 2989, + "time_per_iteration": 2.913994073867798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014651, + "balance_loss_mlp": 1.00430369, + "epoch": 0.5752212389380531, + "flos": 1540541030400.0, + "grad_norm": 0.007550989320398048, + "language_loss": 0.81999105, + "learning_rate": 0.0004031042310440799, + "loss": 0.83013755, + "num_input_tokens_seen": 249305552, + "router_z_loss_mlp": 0.10351562, + "step": 2990, + "time_per_iteration": 4.7991979122161865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020765, + "balance_loss_mlp": 1.01027453, + "epoch": 0.5754136206233167, + "flos": 1566499378176.0, + "grad_norm": 0.009415259483784648, + "language_loss": 0.781986, + "learning_rate": 0.00040279861355444656, + "loss": 0.79219365, + "num_input_tokens_seen": 249523408, + "router_z_loss_mlp": 0.10498047, + "step": 2991, + "time_per_iteration": 4.760354280471802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076847, + "balance_loss_mlp": 1.04282451, + "epoch": 0.5756060023085803, + "flos": 797806167552.0, + "grad_norm": 0.05030181344669937, + "language_loss": 0.76800382, + "learning_rate": 0.00040249303380173807, + "loss": 0.77877235, + "num_input_tokens_seen": 249616624, + "router_z_loss_mlp": 0.34057617, + "step": 2992, + "time_per_iteration": 3.083129644393921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080903, + "balance_loss_mlp": 1.04573631, + "epoch": 0.5757983839938438, + "flos": 587588802048.0, + "grad_norm": 0.05896593059815975, + "language_loss": 0.78794599, + "learning_rate": 0.00040218749190459126, + "loss": 0.79875505, + "num_input_tokens_seen": 249689936, + "router_z_loss_mlp": 0.35229492, + "step": 2993, + "time_per_iteration": 2.763256788253784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083869, + "balance_loss_mlp": 1.04884517, + "epoch": 0.5759907656791073, + "flos": 516576932352.0, + "grad_norm": 0.05409710441005256, + "language_loss": 0.82655573, + "learning_rate": 0.00040188198798162775, + "loss": 0.83739436, + "num_input_tokens_seen": 249759984, + "router_z_loss_mlp": 0.35058594, + "step": 2994, + "time_per_iteration": 2.6000871658325195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078452, + "balance_loss_mlp": 1.04333293, + "epoch": 0.5761831473643709, + "flos": 586845305856.0, + "grad_norm": 0.05831918093224265, + "language_loss": 0.85334295, + "learning_rate": 0.000401576522151455, + "loss": 0.8641274, + "num_input_tokens_seen": 249837888, + "router_z_loss_mlp": 0.3515625, + "step": 2995, + "time_per_iteration": 2.808647871017456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081176, + "balance_loss_mlp": 1.04672456, + "epoch": 0.5763755290496345, + "flos": 543619219968.0, + "grad_norm": 0.04257335582462403, + "language_loss": 0.82291412, + "learning_rate": 0.0004012710945326651, + "loss": 0.83372593, + "num_input_tokens_seen": 249913584, + "router_z_loss_mlp": 0.34472656, + "step": 2996, + "time_per_iteration": 2.7611968517303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082283, + "balance_loss_mlp": 1.04749799, + "epoch": 0.576567910734898, + "flos": 625930509312.0, + "grad_norm": 0.050767561493079726, + "language_loss": 0.80952752, + "learning_rate": 0.0004009657052438355, + "loss": 0.82035035, + "num_input_tokens_seen": 249992144, + "router_z_loss_mlp": 0.34814453, + "step": 2997, + "time_per_iteration": 2.788496971130371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107931, + "balance_loss_mlp": 1.04392815, + "epoch": 0.5767602924201616, + "flos": 537985552896.0, + "grad_norm": 0.053276481047857226, + "language_loss": 0.85359365, + "learning_rate": 0.00040066035440352904, + "loss": 0.86438668, + "num_input_tokens_seen": 250060736, + "router_z_loss_mlp": 0.35400391, + "step": 2998, + "time_per_iteration": 2.6187028884887695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010656, + "balance_loss_mlp": 1.05358338, + "epoch": 0.5769526741054252, + "flos": 1558969873920.0, + "grad_norm": 0.027624435835290975, + "language_loss": 0.79293132, + "learning_rate": 0.0004003550421302934, + "loss": 0.80358732, + "num_input_tokens_seen": 250296864, + "router_z_loss_mlp": 0.12011719, + "step": 2999, + "time_per_iteration": 4.880754470825195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085333, + "balance_loss_mlp": 1.05071473, + "epoch": 0.5771450557906888, + "flos": 467939759616.0, + "grad_norm": 0.056203987299685475, + "language_loss": 0.7605744, + "learning_rate": 0.00040004976854266145, + "loss": 0.77142775, + "num_input_tokens_seen": 250362528, + "router_z_loss_mlp": 0.34667969, + "step": 3000, + "time_per_iteration": 2.537555694580078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079139, + "balance_loss_mlp": 1.043329, + "epoch": 0.5773374374759523, + "flos": 574288828416.0, + "grad_norm": 0.059637526980377456, + "language_loss": 0.81006908, + "learning_rate": 0.0003997445337591505, + "loss": 0.82086051, + "num_input_tokens_seen": 250432768, + "router_z_loss_mlp": 0.35839844, + "step": 3001, + "time_per_iteration": 2.637199878692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072855, + "balance_loss_mlp": 1.03756905, + "epoch": 0.5775298191612158, + "flos": 528216795648.0, + "grad_norm": 0.054057225734739034, + "language_loss": 0.73747128, + "learning_rate": 0.0003994393378982635, + "loss": 0.74819982, + "num_input_tokens_seen": 250501504, + "router_z_loss_mlp": 0.35327148, + "step": 3002, + "time_per_iteration": 2.605628490447998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042054, + "balance_loss_mlp": 1.03013277, + "epoch": 0.5777222008464794, + "flos": 1303178070528.0, + "grad_norm": 0.01828159888171313, + "language_loss": 0.79538, + "learning_rate": 0.00039913418107848786, + "loss": 0.80580056, + "num_input_tokens_seen": 250733632, + "router_z_loss_mlp": 0.11914062, + "step": 3003, + "time_per_iteration": 4.791952848434448 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072518, + "balance_loss_mlp": 1.03708899, + "epoch": 0.577914582531743, + "flos": 603344816640.0, + "grad_norm": 0.05129820562397971, + "language_loss": 0.88025165, + "learning_rate": 0.0003988290634182961, + "loss": 0.89097679, + "num_input_tokens_seen": 250809152, + "router_z_loss_mlp": 0.35449219, + "step": 3004, + "time_per_iteration": 2.7482082843780518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107598, + "balance_loss_mlp": 1.04162431, + "epoch": 0.5781069642170066, + "flos": 486537338880.0, + "grad_norm": 0.060845290060135546, + "language_loss": 0.80967325, + "learning_rate": 0.0003985239850361453, + "loss": 0.82043308, + "num_input_tokens_seen": 250879152, + "router_z_loss_mlp": 0.34399414, + "step": 3005, + "time_per_iteration": 2.577929735183716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074673, + "balance_loss_mlp": 1.03933978, + "epoch": 0.5782993459022701, + "flos": 506016626688.0, + "grad_norm": 0.06787324566679709, + "language_loss": 0.84799004, + "learning_rate": 0.0003982189460504777, + "loss": 0.85873681, + "num_input_tokens_seen": 250949904, + "router_z_loss_mlp": 0.35375977, + "step": 3006, + "time_per_iteration": 2.6993815898895264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077227, + "balance_loss_mlp": 1.04179859, + "epoch": 0.5784917275875336, + "flos": 601872380928.0, + "grad_norm": 0.06968716045875477, + "language_loss": 0.79860866, + "learning_rate": 0.00039791394657971935, + "loss": 0.80938095, + "num_input_tokens_seen": 251020976, + "router_z_loss_mlp": 0.35449219, + "step": 3007, + "time_per_iteration": 2.6929664611816406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070454, + "balance_loss_mlp": 1.03616893, + "epoch": 0.5786841092727972, + "flos": 521279428608.0, + "grad_norm": 0.07090711844515878, + "language_loss": 0.84396511, + "learning_rate": 0.00039760898674228205, + "loss": 0.85466969, + "num_input_tokens_seen": 251093280, + "router_z_loss_mlp": 0.34301758, + "step": 3008, + "time_per_iteration": 2.674983501434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074007, + "balance_loss_mlp": 1.03941262, + "epoch": 0.5788764909580608, + "flos": 767047809024.0, + "grad_norm": 0.04405411396785794, + "language_loss": 0.80589879, + "learning_rate": 0.0003973040666565613, + "loss": 0.81663889, + "num_input_tokens_seen": 251181376, + "router_z_loss_mlp": 0.34619141, + "step": 3009, + "time_per_iteration": 3.0445330142974854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068256, + "balance_loss_mlp": 1.03347063, + "epoch": 0.5790688726433244, + "flos": 598786324992.0, + "grad_norm": 0.0464228238066257, + "language_loss": 0.81778955, + "learning_rate": 0.000396999186440938, + "loss": 0.82847214, + "num_input_tokens_seen": 251256176, + "router_z_loss_mlp": 0.34814453, + "step": 3010, + "time_per_iteration": 2.837510585784912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107199, + "balance_loss_mlp": 1.03594089, + "epoch": 0.5792612543285879, + "flos": 522805708800.0, + "grad_norm": 0.06076952990047212, + "language_loss": 0.8482464, + "learning_rate": 0.000396694346213777, + "loss": 0.85896629, + "num_input_tokens_seen": 251325344, + "router_z_loss_mlp": 0.36083984, + "step": 3011, + "time_per_iteration": 2.630096197128296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071847, + "balance_loss_mlp": 1.03498721, + "epoch": 0.5794536360138515, + "flos": 876178805760.0, + "grad_norm": 0.045866643068031475, + "language_loss": 0.83350897, + "learning_rate": 0.0003963895460934276, + "loss": 0.84422737, + "num_input_tokens_seen": 251406656, + "router_z_loss_mlp": 0.3684082, + "step": 3012, + "time_per_iteration": 3.144862174987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107149, + "balance_loss_mlp": 1.03555989, + "epoch": 0.5796460176991151, + "flos": 401221541376.0, + "grad_norm": 0.0681769397078292, + "language_loss": 0.84421676, + "learning_rate": 0.00039608478619822376, + "loss": 0.85493165, + "num_input_tokens_seen": 251467760, + "router_z_loss_mlp": 0.35961914, + "step": 3013, + "time_per_iteration": 2.459653854370117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071668, + "balance_loss_mlp": 1.03545213, + "epoch": 0.5798383993843786, + "flos": 618229297152.0, + "grad_norm": 0.04312849012034037, + "language_loss": 0.82395273, + "learning_rate": 0.00039578006664648394, + "loss": 0.83466941, + "num_input_tokens_seen": 251542272, + "router_z_loss_mlp": 0.36206055, + "step": 3014, + "time_per_iteration": 2.759540557861328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068998, + "balance_loss_mlp": 1.0336163, + "epoch": 0.5800307810696421, + "flos": 843966950400.0, + "grad_norm": 0.05059644865737796, + "language_loss": 0.80954117, + "learning_rate": 0.0003954753875565105, + "loss": 0.82023108, + "num_input_tokens_seen": 251625584, + "router_z_loss_mlp": 0.35424805, + "step": 3015, + "time_per_iteration": 3.102818727493286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065155, + "balance_loss_mlp": 1.02970195, + "epoch": 0.5802231627549057, + "flos": 569005779456.0, + "grad_norm": 0.049284538826036076, + "language_loss": 0.82072717, + "learning_rate": 0.00039517074904659057, + "loss": 0.83137876, + "num_input_tokens_seen": 251696704, + "router_z_loss_mlp": 0.35498047, + "step": 3016, + "time_per_iteration": 2.6733109951019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074461, + "balance_loss_mlp": 1.03884125, + "epoch": 0.5804155444401693, + "flos": 660160447488.0, + "grad_norm": 0.0506827974734746, + "language_loss": 0.84573597, + "learning_rate": 0.00039486615123499535, + "loss": 0.8564806, + "num_input_tokens_seen": 251774784, + "router_z_loss_mlp": 0.35668945, + "step": 3017, + "time_per_iteration": 2.8088088035583496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071818, + "balance_loss_mlp": 1.0354352, + "epoch": 0.5806079261254329, + "flos": 513726603264.0, + "grad_norm": 0.053399367847764105, + "language_loss": 0.84808505, + "learning_rate": 0.00039456159423997996, + "loss": 0.85880327, + "num_input_tokens_seen": 251844768, + "router_z_loss_mlp": 0.36401367, + "step": 3018, + "time_per_iteration": 2.6254379749298096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074265, + "balance_loss_mlp": 1.03747678, + "epoch": 0.5808003078106965, + "flos": 528379739136.0, + "grad_norm": 0.059071353461068586, + "language_loss": 0.89337808, + "learning_rate": 0.00039425707817978406, + "loss": 0.90412068, + "num_input_tokens_seen": 251912736, + "router_z_loss_mlp": 0.36767578, + "step": 3019, + "time_per_iteration": 2.65867280960083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071634, + "balance_loss_mlp": 1.0357995, + "epoch": 0.58099268949596, + "flos": 476787520512.0, + "grad_norm": 0.06353889490099716, + "language_loss": 0.83356857, + "learning_rate": 0.00039395260317263124, + "loss": 0.84428501, + "num_input_tokens_seen": 251979328, + "router_z_loss_mlp": 0.35839844, + "step": 3020, + "time_per_iteration": 2.554124116897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074167, + "balance_loss_mlp": 1.03666329, + "epoch": 0.5811850711812235, + "flos": 517340777472.0, + "grad_norm": 0.05166922362438639, + "language_loss": 0.84975517, + "learning_rate": 0.0003936481693367291, + "loss": 0.86049688, + "num_input_tokens_seen": 252050928, + "router_z_loss_mlp": 0.37475586, + "step": 3021, + "time_per_iteration": 2.6460227966308594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075928, + "balance_loss_mlp": 1.03976023, + "epoch": 0.5813774528664871, + "flos": 616122464256.0, + "grad_norm": 0.06649500378390247, + "language_loss": 0.876212, + "learning_rate": 0.0003933437767902697, + "loss": 0.88697129, + "num_input_tokens_seen": 252126496, + "router_z_loss_mlp": 0.36206055, + "step": 3022, + "time_per_iteration": 2.8114941120147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071818, + "balance_loss_mlp": 1.03588879, + "epoch": 0.5815698345517507, + "flos": 567194310144.0, + "grad_norm": 0.06503214921944889, + "language_loss": 0.78287327, + "learning_rate": 0.00039303942565142825, + "loss": 0.7935915, + "num_input_tokens_seen": 252203008, + "router_z_loss_mlp": 0.35961914, + "step": 3023, + "time_per_iteration": 2.7259762287139893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071639, + "balance_loss_mlp": 1.03563786, + "epoch": 0.5817622162370142, + "flos": 562886102016.0, + "grad_norm": 0.05350887168996553, + "language_loss": 0.76429439, + "learning_rate": 0.0003927351160383644, + "loss": 0.77501082, + "num_input_tokens_seen": 252283440, + "router_z_loss_mlp": 0.36035156, + "step": 3024, + "time_per_iteration": 2.8155934810638428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071996, + "balance_loss_mlp": 1.03730595, + "epoch": 0.5819545979222778, + "flos": 458982899712.0, + "grad_norm": 0.05396860990467202, + "language_loss": 0.77624023, + "learning_rate": 0.000392430848069222, + "loss": 0.78696012, + "num_input_tokens_seen": 252351760, + "router_z_loss_mlp": 0.34741211, + "step": 3025, + "time_per_iteration": 2.5123956203460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069758, + "balance_loss_mlp": 1.03387606, + "epoch": 0.5821469796075414, + "flos": 541215613440.0, + "grad_norm": 0.05894861582094883, + "language_loss": 0.82395303, + "learning_rate": 0.00039212662186212795, + "loss": 0.83465064, + "num_input_tokens_seen": 252418480, + "router_z_loss_mlp": 0.35913086, + "step": 3026, + "time_per_iteration": 2.6423861980438232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075924, + "balance_loss_mlp": 1.03930306, + "epoch": 0.582339361292805, + "flos": 551994117120.0, + "grad_norm": 0.060293393109458415, + "language_loss": 0.77264106, + "learning_rate": 0.0003918224375351934, + "loss": 0.7834003, + "num_input_tokens_seen": 252493712, + "router_z_loss_mlp": 0.36621094, + "step": 3027, + "time_per_iteration": 2.691378593444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075199, + "balance_loss_mlp": 1.04029393, + "epoch": 0.5825317429780685, + "flos": 496138770432.0, + "grad_norm": 0.05191318265313257, + "language_loss": 0.78248543, + "learning_rate": 0.0003915182952065135, + "loss": 0.79323745, + "num_input_tokens_seen": 252566096, + "router_z_loss_mlp": 0.34936523, + "step": 3028, + "time_per_iteration": 2.718275308609009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073019, + "balance_loss_mlp": 1.03732777, + "epoch": 0.582724124663332, + "flos": 563890056192.0, + "grad_norm": 0.0482119369127772, + "language_loss": 0.87499475, + "learning_rate": 0.0003912141949941664, + "loss": 0.8857249, + "num_input_tokens_seen": 252639424, + "router_z_loss_mlp": 0.35766602, + "step": 3029, + "time_per_iteration": 2.6762070655822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075023, + "balance_loss_mlp": 1.03852117, + "epoch": 0.5829165063485956, + "flos": 491888788992.0, + "grad_norm": 0.06336756881053687, + "language_loss": 0.82355005, + "learning_rate": 0.0003909101370162143, + "loss": 0.83430028, + "num_input_tokens_seen": 252706672, + "router_z_loss_mlp": 0.36499023, + "step": 3030, + "time_per_iteration": 2.6055908203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035193, + "balance_loss_mlp": 1.02432156, + "epoch": 0.5831088880338592, + "flos": 1528134501888.0, + "grad_norm": 0.025423566517204055, + "language_loss": 0.72433889, + "learning_rate": 0.00039060612139070326, + "loss": 0.7346909, + "num_input_tokens_seen": 252932464, + "router_z_loss_mlp": 0.10888672, + "step": 3031, + "time_per_iteration": 4.88014817237854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071232, + "balance_loss_mlp": 1.03558815, + "epoch": 0.5833012697191228, + "flos": 617712763392.0, + "grad_norm": 0.04799878735573131, + "language_loss": 0.82774729, + "learning_rate": 0.0003903021482356622, + "loss": 0.83845961, + "num_input_tokens_seen": 253011920, + "router_z_loss_mlp": 0.35693359, + "step": 3032, + "time_per_iteration": 2.7778074741363525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071187, + "balance_loss_mlp": 1.03542447, + "epoch": 0.5834936514043862, + "flos": 767578899456.0, + "grad_norm": 0.04830091888101656, + "language_loss": 0.82788891, + "learning_rate": 0.00038999821766910465, + "loss": 0.83860075, + "num_input_tokens_seen": 253091552, + "router_z_loss_mlp": 0.35791016, + "step": 3033, + "time_per_iteration": 2.9640953540802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070566, + "balance_loss_mlp": 1.03496981, + "epoch": 0.5836860330896498, + "flos": 458136096768.0, + "grad_norm": 0.045708981442043065, + "language_loss": 0.85570675, + "learning_rate": 0.00038969432980902606, + "loss": 0.8664124, + "num_input_tokens_seen": 253158608, + "router_z_loss_mlp": 0.35620117, + "step": 3034, + "time_per_iteration": 2.520869255065918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01029447, + "balance_loss_mlp": 1.01819336, + "epoch": 0.5838784147749134, + "flos": 1360485504000.0, + "grad_norm": 0.023110513117977256, + "language_loss": 0.79784501, + "learning_rate": 0.0003893904847734068, + "loss": 0.80813944, + "num_input_tokens_seen": 253381184, + "router_z_loss_mlp": 0.11230469, + "step": 3035, + "time_per_iteration": 4.791047811508179 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076539, + "balance_loss_mlp": 1.04125297, + "epoch": 0.584070796460177, + "flos": 566942616576.0, + "grad_norm": 0.048603623386797364, + "language_loss": 0.82340151, + "learning_rate": 0.00038908668268020953, + "loss": 0.83416688, + "num_input_tokens_seen": 253452880, + "router_z_loss_mlp": 0.35302734, + "step": 3036, + "time_per_iteration": 2.6480767726898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073606, + "balance_loss_mlp": 1.03781927, + "epoch": 0.5842631781454406, + "flos": 611188623360.0, + "grad_norm": 0.04937423588772942, + "language_loss": 0.84850454, + "learning_rate": 0.00038878292364738097, + "loss": 0.85924065, + "num_input_tokens_seen": 253530000, + "router_z_loss_mlp": 0.3581543, + "step": 3037, + "time_per_iteration": 2.7739527225494385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070708, + "balance_loss_mlp": 1.03418183, + "epoch": 0.5844555598307041, + "flos": 463148513280.0, + "grad_norm": 0.05602443207387838, + "language_loss": 0.86980963, + "learning_rate": 0.0003884792077928508, + "loss": 0.88051671, + "num_input_tokens_seen": 253593504, + "router_z_loss_mlp": 0.36523438, + "step": 3038, + "time_per_iteration": 2.488044500350952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076481, + "balance_loss_mlp": 1.04083705, + "epoch": 0.5846479415159677, + "flos": 410005283328.0, + "grad_norm": 0.06107663121836191, + "language_loss": 0.76691568, + "learning_rate": 0.0003881755352345322, + "loss": 0.77768052, + "num_input_tokens_seen": 253657904, + "router_z_loss_mlp": 0.35644531, + "step": 3039, + "time_per_iteration": 2.4996848106384277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076016, + "balance_loss_mlp": 1.03944278, + "epoch": 0.5848403232012312, + "flos": 491056542720.0, + "grad_norm": 0.04475599589029588, + "language_loss": 0.86940634, + "learning_rate": 0.0003878719060903207, + "loss": 0.88016653, + "num_input_tokens_seen": 253725280, + "router_z_loss_mlp": 0.36572266, + "step": 3040, + "time_per_iteration": 2.5631661415100098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107371, + "balance_loss_mlp": 1.03823376, + "epoch": 0.5850327048864948, + "flos": 584146335744.0, + "grad_norm": 0.06623374989281658, + "language_loss": 0.82883763, + "learning_rate": 0.0003875683204780961, + "loss": 0.83957475, + "num_input_tokens_seen": 253795040, + "router_z_loss_mlp": 0.35522461, + "step": 3041, + "time_per_iteration": 2.7194101810455322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074225, + "balance_loss_mlp": 1.03765166, + "epoch": 0.5852250865717584, + "flos": 651253049856.0, + "grad_norm": 0.05546398592496706, + "language_loss": 0.84983653, + "learning_rate": 0.00038726477851572043, + "loss": 0.86057878, + "num_input_tokens_seen": 253866384, + "router_z_loss_mlp": 0.36572266, + "step": 3042, + "time_per_iteration": 2.809687376022339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072999, + "balance_loss_mlp": 1.03659296, + "epoch": 0.5854174682570219, + "flos": 534332090880.0, + "grad_norm": 0.07237686853447298, + "language_loss": 0.80418718, + "learning_rate": 0.0003869612803210395, + "loss": 0.81491715, + "num_input_tokens_seen": 253935712, + "router_z_loss_mlp": 0.36401367, + "step": 3043, + "time_per_iteration": 2.6141133308410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074965, + "balance_loss_mlp": 1.03872585, + "epoch": 0.5856098499422855, + "flos": 509501352960.0, + "grad_norm": 0.08321780378599658, + "language_loss": 0.83029413, + "learning_rate": 0.0003866578260118817, + "loss": 0.84104383, + "num_input_tokens_seen": 254003152, + "router_z_loss_mlp": 0.36254883, + "step": 3044, + "time_per_iteration": 2.5739400386810303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070627, + "balance_loss_mlp": 1.03438699, + "epoch": 0.5858022316275491, + "flos": 593619729408.0, + "grad_norm": 0.061750802810204855, + "language_loss": 0.83199847, + "learning_rate": 0.0003863544157060581, + "loss": 0.84270471, + "num_input_tokens_seen": 254072816, + "router_z_loss_mlp": 0.36254883, + "step": 3045, + "time_per_iteration": 2.662442207336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077309, + "balance_loss_mlp": 1.04083109, + "epoch": 0.5859946133128127, + "flos": 558829587456.0, + "grad_norm": 0.0566139046566934, + "language_loss": 0.82210046, + "learning_rate": 0.0003860510495213634, + "loss": 0.83287358, + "num_input_tokens_seen": 254152800, + "router_z_loss_mlp": 0.36499023, + "step": 3046, + "time_per_iteration": 2.817676305770874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086311, + "balance_loss_mlp": 1.04885542, + "epoch": 0.5861869949980761, + "flos": 553431647232.0, + "grad_norm": 0.06969052760403557, + "language_loss": 0.77781415, + "learning_rate": 0.0003857477275755746, + "loss": 0.78867728, + "num_input_tokens_seen": 254224384, + "router_z_loss_mlp": 0.37451172, + "step": 3047, + "time_per_iteration": 2.645547389984131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076852, + "balance_loss_mlp": 1.03994477, + "epoch": 0.5863793766833397, + "flos": 718321886208.0, + "grad_norm": 0.060152245737565335, + "language_loss": 0.83672923, + "learning_rate": 0.00038544444998645167, + "loss": 0.84749776, + "num_input_tokens_seen": 254310960, + "router_z_loss_mlp": 0.36914062, + "step": 3048, + "time_per_iteration": 2.995572090148926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080654, + "balance_loss_mlp": 1.04410434, + "epoch": 0.5865717583686033, + "flos": 472041354240.0, + "grad_norm": 0.05877541838315078, + "language_loss": 0.81869525, + "learning_rate": 0.00038514121687173767, + "loss": 0.82950181, + "num_input_tokens_seen": 254378336, + "router_z_loss_mlp": 0.36572266, + "step": 3049, + "time_per_iteration": 2.5653092861175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085484, + "balance_loss_mlp": 1.04819572, + "epoch": 0.5867641400538669, + "flos": 813143162880.0, + "grad_norm": 0.060327128014073625, + "language_loss": 0.82117838, + "learning_rate": 0.00038483802834915807, + "loss": 0.83203322, + "num_input_tokens_seen": 254454352, + "router_z_loss_mlp": 0.37280273, + "step": 3050, + "time_per_iteration": 2.9661922454833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074132, + "balance_loss_mlp": 1.03755879, + "epoch": 0.5869565217391305, + "flos": 486285645312.0, + "grad_norm": 0.05442603126978945, + "language_loss": 0.78767669, + "learning_rate": 0.00038453488453642074, + "loss": 0.79841799, + "num_input_tokens_seen": 254526352, + "router_z_loss_mlp": 0.36547852, + "step": 3051, + "time_per_iteration": 2.6421701908111572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076937, + "balance_loss_mlp": 1.0401963, + "epoch": 0.587148903424394, + "flos": 569104704000.0, + "grad_norm": 0.050403805084847125, + "language_loss": 0.86714828, + "learning_rate": 0.00038423178555121697, + "loss": 0.87791765, + "num_input_tokens_seen": 254598720, + "router_z_loss_mlp": 0.36743164, + "step": 3052, + "time_per_iteration": 2.689039945602417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079498, + "balance_loss_mlp": 1.04239988, + "epoch": 0.5873412851096576, + "flos": 746948680704.0, + "grad_norm": 0.04537735372020953, + "language_loss": 0.85335124, + "learning_rate": 0.00038392873151121994, + "loss": 0.86414617, + "num_input_tokens_seen": 254683664, + "router_z_loss_mlp": 0.37084961, + "step": 3053, + "time_per_iteration": 3.0252749919891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071586, + "balance_loss_mlp": 1.03510821, + "epoch": 0.5875336667949211, + "flos": 527882144256.0, + "grad_norm": 0.0531573443466337, + "language_loss": 0.82837141, + "learning_rate": 0.0003836257225340859, + "loss": 0.83908725, + "num_input_tokens_seen": 254754688, + "router_z_loss_mlp": 0.36474609, + "step": 3054, + "time_per_iteration": 2.6028475761413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074191, + "balance_loss_mlp": 1.03728426, + "epoch": 0.5877260484801847, + "flos": 823799420928.0, + "grad_norm": 0.057535155706969474, + "language_loss": 0.81870168, + "learning_rate": 0.00038332275873745336, + "loss": 0.82944363, + "num_input_tokens_seen": 254838976, + "router_z_loss_mlp": 0.36889648, + "step": 3055, + "time_per_iteration": 3.1007511615753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074496, + "balance_loss_mlp": 1.03682637, + "epoch": 0.5879184301654482, + "flos": 591325221888.0, + "grad_norm": 0.0460079349498171, + "language_loss": 0.82943761, + "learning_rate": 0.0003830198402389431, + "loss": 0.84018254, + "num_input_tokens_seen": 254912912, + "router_z_loss_mlp": 0.37646484, + "step": 3056, + "time_per_iteration": 2.6919126510620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010412, + "balance_loss_mlp": 1.02975643, + "epoch": 0.5881108118507118, + "flos": 1544953955328.0, + "grad_norm": 0.021887470100806234, + "language_loss": 0.77348936, + "learning_rate": 0.0003827169671561585, + "loss": 0.78390133, + "num_input_tokens_seen": 255151488, + "router_z_loss_mlp": 0.11425781, + "step": 3057, + "time_per_iteration": 4.971444368362427 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072291, + "balance_loss_mlp": 1.03576517, + "epoch": 0.5883031935359754, + "flos": 489348380160.0, + "grad_norm": 0.055950804718103285, + "language_loss": 0.82692897, + "learning_rate": 0.0003824141396066855, + "loss": 0.83765185, + "num_input_tokens_seen": 255218896, + "router_z_loss_mlp": 0.36572266, + "step": 3058, + "time_per_iteration": 2.5848963260650635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074876, + "balance_loss_mlp": 1.03842139, + "epoch": 0.588495575221239, + "flos": 582551654400.0, + "grad_norm": 0.05305150563857962, + "language_loss": 0.82647693, + "learning_rate": 0.000382111357708092, + "loss": 0.83722568, + "num_input_tokens_seen": 255287408, + "router_z_loss_mlp": 0.36499023, + "step": 3059, + "time_per_iteration": 2.750030279159546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071916, + "balance_loss_mlp": 1.03558111, + "epoch": 0.5886879569065026, + "flos": 660751174656.0, + "grad_norm": 0.05165433097502605, + "language_loss": 0.83451211, + "learning_rate": 0.00038180862157792864, + "loss": 0.84523129, + "num_input_tokens_seen": 255358432, + "router_z_loss_mlp": 0.36303711, + "step": 3060, + "time_per_iteration": 2.7654812335968018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070431, + "balance_loss_mlp": 1.03414369, + "epoch": 0.588880338591766, + "flos": 562392889344.0, + "grad_norm": 0.05703427459216956, + "language_loss": 0.82004499, + "learning_rate": 0.0003815059313337279, + "loss": 0.83074933, + "num_input_tokens_seen": 255425744, + "router_z_loss_mlp": 0.36279297, + "step": 3061, + "time_per_iteration": 2.659722089767456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072147, + "balance_loss_mlp": 1.03585935, + "epoch": 0.5890727202770296, + "flos": 554451568128.0, + "grad_norm": 0.04901881896382658, + "language_loss": 0.77886307, + "learning_rate": 0.00038120328709300436, + "loss": 0.78958452, + "num_input_tokens_seen": 255505808, + "router_z_loss_mlp": 0.36279297, + "step": 3062, + "time_per_iteration": 2.8264663219451904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076904, + "balance_loss_mlp": 1.04114151, + "epoch": 0.5892651019622932, + "flos": 655226606592.0, + "grad_norm": 0.057794453116502664, + "language_loss": 0.83449113, + "learning_rate": 0.0003809006889732549, + "loss": 0.84526014, + "num_input_tokens_seen": 255580160, + "router_z_loss_mlp": 0.35766602, + "step": 3063, + "time_per_iteration": 2.780714511871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073572, + "balance_loss_mlp": 1.03680801, + "epoch": 0.5894574836475568, + "flos": 452970911232.0, + "grad_norm": 0.048397381644471126, + "language_loss": 0.87604314, + "learning_rate": 0.0003805981370919589, + "loss": 0.88677883, + "num_input_tokens_seen": 255644016, + "router_z_loss_mlp": 0.36743164, + "step": 3064, + "time_per_iteration": 2.497511386871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077136, + "balance_loss_mlp": 1.03965652, + "epoch": 0.5896498653328203, + "flos": 518763750912.0, + "grad_norm": 0.05535483461806511, + "language_loss": 0.83910584, + "learning_rate": 0.0003802956315665771, + "loss": 0.84987724, + "num_input_tokens_seen": 255718192, + "router_z_loss_mlp": 0.37475586, + "step": 3065, + "time_per_iteration": 2.6540539264678955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075706, + "balance_loss_mlp": 1.03965688, + "epoch": 0.5898422470180839, + "flos": 548793169920.0, + "grad_norm": 0.06978967624296899, + "language_loss": 0.81621277, + "learning_rate": 0.0003799931725145529, + "loss": 0.82696986, + "num_input_tokens_seen": 255787696, + "router_z_loss_mlp": 0.3605957, + "step": 3066, + "time_per_iteration": 2.5999929904937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075756, + "balance_loss_mlp": 1.04015982, + "epoch": 0.5900346287033474, + "flos": 524046799872.0, + "grad_norm": 0.06178961053063138, + "language_loss": 0.85556895, + "learning_rate": 0.00037969076005331083, + "loss": 0.86632651, + "num_input_tokens_seen": 255862992, + "router_z_loss_mlp": 0.35571289, + "step": 3067, + "time_per_iteration": 2.7505955696105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080728, + "balance_loss_mlp": 1.04372525, + "epoch": 0.590227010388611, + "flos": 566893154304.0, + "grad_norm": 0.059517883137225745, + "language_loss": 0.88041914, + "learning_rate": 0.00037938839430025817, + "loss": 0.89122641, + "num_input_tokens_seen": 255931872, + "router_z_loss_mlp": 0.36962891, + "step": 3068, + "time_per_iteration": 2.6254634857177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072667, + "balance_loss_mlp": 1.03714228, + "epoch": 0.5904193920738746, + "flos": 583053631488.0, + "grad_norm": 0.05094647187222568, + "language_loss": 0.85285151, + "learning_rate": 0.0003790860753727835, + "loss": 0.8635782, + "num_input_tokens_seen": 256004656, + "router_z_loss_mlp": 0.35546875, + "step": 3069, + "time_per_iteration": 2.790996551513672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076132, + "balance_loss_mlp": 1.04056025, + "epoch": 0.5906117737591381, + "flos": 529428773376.0, + "grad_norm": 0.06487433034023032, + "language_loss": 0.82915914, + "learning_rate": 0.00037878380338825766, + "loss": 0.83992046, + "num_input_tokens_seen": 256076944, + "router_z_loss_mlp": 0.35644531, + "step": 3070, + "time_per_iteration": 2.6697611808776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078223, + "balance_loss_mlp": 1.04276967, + "epoch": 0.5908041554444017, + "flos": 683908655616.0, + "grad_norm": 0.053205750192721994, + "language_loss": 0.81560326, + "learning_rate": 0.00037848157846403287, + "loss": 0.8263855, + "num_input_tokens_seen": 256154768, + "router_z_loss_mlp": 0.35473633, + "step": 3071, + "time_per_iteration": 2.92523193359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077534, + "balance_loss_mlp": 1.04246306, + "epoch": 0.5909965371296653, + "flos": 549719958528.0, + "grad_norm": 0.04683417834560967, + "language_loss": 0.83405554, + "learning_rate": 0.0003781794007174435, + "loss": 0.84483093, + "num_input_tokens_seen": 256230896, + "router_z_loss_mlp": 0.35107422, + "step": 3072, + "time_per_iteration": 2.7881455421447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01022638, + "balance_loss_mlp": 1.01200461, + "epoch": 0.5911889188149289, + "flos": 1491544475136.0, + "grad_norm": 0.008695883247199268, + "language_loss": 0.74074531, + "learning_rate": 0.0003778772702658051, + "loss": 0.75097167, + "num_input_tokens_seen": 256462336, + "router_z_loss_mlp": 0.10644531, + "step": 3073, + "time_per_iteration": 4.864701509475708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078671, + "balance_loss_mlp": 1.04293227, + "epoch": 0.5913813005001923, + "flos": 487630043136.0, + "grad_norm": 0.053099165858615995, + "language_loss": 0.80592149, + "learning_rate": 0.0003775751872264152, + "loss": 0.81670815, + "num_input_tokens_seen": 256539376, + "router_z_loss_mlp": 0.35766602, + "step": 3074, + "time_per_iteration": 2.7932956218719482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079498, + "balance_loss_mlp": 1.04409289, + "epoch": 0.5915736821854559, + "flos": 573034590720.0, + "grad_norm": 0.04575078918426429, + "language_loss": 0.86981148, + "learning_rate": 0.0003772731517165527, + "loss": 0.88060653, + "num_input_tokens_seen": 256617728, + "router_z_loss_mlp": 0.35449219, + "step": 3075, + "time_per_iteration": 2.7613656520843506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075792, + "balance_loss_mlp": 1.04060149, + "epoch": 0.5917660638707195, + "flos": 789183959040.0, + "grad_norm": 0.06797753963070947, + "language_loss": 0.84194851, + "learning_rate": 0.0003769711638534784, + "loss": 0.85270643, + "num_input_tokens_seen": 256696032, + "router_z_loss_mlp": 0.35205078, + "step": 3076, + "time_per_iteration": 2.991854190826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076527, + "balance_loss_mlp": 1.04181361, + "epoch": 0.5919584455559831, + "flos": 528487428096.0, + "grad_norm": 0.06227325112589354, + "language_loss": 0.78677326, + "learning_rate": 0.00037666922375443446, + "loss": 0.79753852, + "num_input_tokens_seen": 256767360, + "router_z_loss_mlp": 0.34765625, + "step": 3077, + "time_per_iteration": 2.591597557067871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072268, + "balance_loss_mlp": 1.03757811, + "epoch": 0.5921508272412467, + "flos": 560320962048.0, + "grad_norm": 0.056716138151229355, + "language_loss": 0.81505013, + "learning_rate": 0.00037636733153664396, + "loss": 0.82577276, + "num_input_tokens_seen": 256844848, + "router_z_loss_mlp": 0.34716797, + "step": 3078, + "time_per_iteration": 2.854278802871704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075513, + "balance_loss_mlp": 1.04144311, + "epoch": 0.5923432089265102, + "flos": 563008347648.0, + "grad_norm": 0.061835614307010005, + "language_loss": 0.79824865, + "learning_rate": 0.0003760654873173124, + "loss": 0.80900383, + "num_input_tokens_seen": 256916688, + "router_z_loss_mlp": 0.34082031, + "step": 3079, + "time_per_iteration": 2.66091251373291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079327, + "balance_loss_mlp": 1.04387426, + "epoch": 0.5925355906117737, + "flos": 495488406528.0, + "grad_norm": 0.052514491856325576, + "language_loss": 0.81763887, + "learning_rate": 0.00037576369121362566, + "loss": 0.8284322, + "num_input_tokens_seen": 256985520, + "router_z_loss_mlp": 0.35498047, + "step": 3080, + "time_per_iteration": 2.5847787857055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079309, + "balance_loss_mlp": 1.04473865, + "epoch": 0.5927279722970373, + "flos": 565940072448.0, + "grad_norm": 0.05276703199883553, + "language_loss": 0.81885982, + "learning_rate": 0.0003754619433427516, + "loss": 0.82965291, + "num_input_tokens_seen": 257067552, + "router_z_loss_mlp": 0.34570312, + "step": 3081, + "time_per_iteration": 2.898594856262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108187, + "balance_loss_mlp": 1.04682267, + "epoch": 0.5929203539823009, + "flos": 666674413056.0, + "grad_norm": 0.06717854488830324, + "language_loss": 0.77682364, + "learning_rate": 0.0003751602438218392, + "loss": 0.78764236, + "num_input_tokens_seen": 257138896, + "router_z_loss_mlp": 0.35083008, + "step": 3082, + "time_per_iteration": 2.7553367614746094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083555, + "balance_loss_mlp": 1.0486505, + "epoch": 0.5931127356675644, + "flos": 555484635648.0, + "grad_norm": 0.05625551140275949, + "language_loss": 0.83254004, + "learning_rate": 0.0003748585927680186, + "loss": 0.84337556, + "num_input_tokens_seen": 257210592, + "router_z_loss_mlp": 0.34912109, + "step": 3083, + "time_per_iteration": 2.6493966579437256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089532, + "balance_loss_mlp": 1.0530777, + "epoch": 0.593305117352828, + "flos": 534932992512.0, + "grad_norm": 0.07512877248395429, + "language_loss": 0.82828176, + "learning_rate": 0.00037455699029840086, + "loss": 0.83917707, + "num_input_tokens_seen": 257276208, + "router_z_loss_mlp": 0.36450195, + "step": 3084, + "time_per_iteration": 2.674532890319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079674, + "balance_loss_mlp": 1.04488921, + "epoch": 0.5934974990380916, + "flos": 593683748352.0, + "grad_norm": 0.05984158390569505, + "language_loss": 0.84177965, + "learning_rate": 0.0003742554365300787, + "loss": 0.85257638, + "num_input_tokens_seen": 257351920, + "router_z_loss_mlp": 0.34838867, + "step": 3085, + "time_per_iteration": 2.712371587753296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085917, + "balance_loss_mlp": 1.05044067, + "epoch": 0.5936898807233552, + "flos": 712339011072.0, + "grad_norm": 0.05068184961629974, + "language_loss": 0.78978491, + "learning_rate": 0.0003739539315801255, + "loss": 0.80064404, + "num_input_tokens_seen": 257430016, + "router_z_loss_mlp": 0.35473633, + "step": 3086, + "time_per_iteration": 2.916006565093994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088952, + "balance_loss_mlp": 1.05345142, + "epoch": 0.5938822624086187, + "flos": 391684128768.0, + "grad_norm": 0.05263578767135529, + "language_loss": 0.9165324, + "learning_rate": 0.000373652475565596, + "loss": 0.92742193, + "num_input_tokens_seen": 257492224, + "router_z_loss_mlp": 0.35522461, + "step": 3087, + "time_per_iteration": 2.470960855484009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094024, + "balance_loss_mlp": 1.05900025, + "epoch": 0.5940746440938822, + "flos": 480023373312.0, + "grad_norm": 0.060850763929597464, + "language_loss": 0.81550741, + "learning_rate": 0.00037335106860352587, + "loss": 0.82644761, + "num_input_tokens_seen": 257567824, + "router_z_loss_mlp": 0.35083008, + "step": 3088, + "time_per_iteration": 2.666472911834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097243, + "balance_loss_mlp": 1.06100357, + "epoch": 0.5942670257791458, + "flos": 483094872576.0, + "grad_norm": 0.049324641114684424, + "language_loss": 0.83196813, + "learning_rate": 0.00037304971081093146, + "loss": 0.84294057, + "num_input_tokens_seen": 257635488, + "router_z_loss_mlp": 0.36230469, + "step": 3089, + "time_per_iteration": 2.521000862121582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093812, + "balance_loss_mlp": 1.05967069, + "epoch": 0.5944594074644094, + "flos": 547656795648.0, + "grad_norm": 0.0533670066305608, + "language_loss": 0.81061506, + "learning_rate": 0.00037274840230481024, + "loss": 0.82155317, + "num_input_tokens_seen": 257709552, + "router_z_loss_mlp": 0.34179688, + "step": 3090, + "time_per_iteration": 2.7134556770324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092625, + "balance_loss_mlp": 1.05700517, + "epoch": 0.594651789149673, + "flos": 448943510016.0, + "grad_norm": 0.055393993008082114, + "language_loss": 0.78753984, + "learning_rate": 0.00037244714320214077, + "loss": 0.79846609, + "num_input_tokens_seen": 257775520, + "router_z_loss_mlp": 0.35620117, + "step": 3091, + "time_per_iteration": 2.5576789379119873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092048, + "balance_loss_mlp": 1.05640459, + "epoch": 0.5948441708349365, + "flos": 595969491456.0, + "grad_norm": 0.050698130573270175, + "language_loss": 0.83444929, + "learning_rate": 0.000372145933619882, + "loss": 0.84536982, + "num_input_tokens_seen": 257858560, + "router_z_loss_mlp": 0.35668945, + "step": 3092, + "time_per_iteration": 2.8742141723632812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091606, + "balance_loss_mlp": 1.05636811, + "epoch": 0.5950365525202, + "flos": 548251905024.0, + "grad_norm": 0.05419961551348069, + "language_loss": 0.82168603, + "learning_rate": 0.000371844773674974, + "loss": 0.83260214, + "num_input_tokens_seen": 257928048, + "router_z_loss_mlp": 0.3527832, + "step": 3093, + "time_per_iteration": 2.6228530406951904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094358, + "balance_loss_mlp": 1.05890489, + "epoch": 0.5952289342054636, + "flos": 654385595904.0, + "grad_norm": 0.05844341434318606, + "language_loss": 0.81673229, + "learning_rate": 0.0003715436634843375, + "loss": 0.82767594, + "num_input_tokens_seen": 258003088, + "router_z_loss_mlp": 0.35498047, + "step": 3094, + "time_per_iteration": 2.8496577739715576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084873, + "balance_loss_mlp": 1.04951525, + "epoch": 0.5954213158907272, + "flos": 603055245312.0, + "grad_norm": 0.0455107572696148, + "language_loss": 0.80728281, + "learning_rate": 0.00037124260316487355, + "loss": 0.81813157, + "num_input_tokens_seen": 258084880, + "router_z_loss_mlp": 0.35375977, + "step": 3095, + "time_per_iteration": 2.83181095123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084995, + "balance_loss_mlp": 1.05044806, + "epoch": 0.5956136975759908, + "flos": 486097970688.0, + "grad_norm": 0.0493360128544523, + "language_loss": 0.89028478, + "learning_rate": 0.0003709415928334643, + "loss": 0.90113473, + "num_input_tokens_seen": 258152032, + "router_z_loss_mlp": 0.34570312, + "step": 3096, + "time_per_iteration": 2.5334527492523193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081969, + "balance_loss_mlp": 1.0465641, + "epoch": 0.5958060792612543, + "flos": 658462459392.0, + "grad_norm": 0.05334894182240255, + "language_loss": 0.80644953, + "learning_rate": 0.00037064063260697233, + "loss": 0.81726921, + "num_input_tokens_seen": 258228896, + "router_z_loss_mlp": 0.35424805, + "step": 3097, + "time_per_iteration": 2.868948221206665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085643, + "balance_loss_mlp": 1.05004668, + "epoch": 0.5959984609465179, + "flos": 723201882624.0, + "grad_norm": 0.05441892470065276, + "language_loss": 0.78413296, + "learning_rate": 0.0003703397226022407, + "loss": 0.79498935, + "num_input_tokens_seen": 258311152, + "router_z_loss_mlp": 0.35595703, + "step": 3098, + "time_per_iteration": 3.0486435890197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054277, + "balance_loss_mlp": 1.04254675, + "epoch": 0.5961908426317815, + "flos": 1519010164224.0, + "grad_norm": 0.031936086773479797, + "language_loss": 0.75499874, + "learning_rate": 0.00037003886293609335, + "loss": 0.76554149, + "num_input_tokens_seen": 258540656, + "router_z_loss_mlp": 0.1171875, + "step": 3099, + "time_per_iteration": 4.9141762256622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082532, + "balance_loss_mlp": 1.04822397, + "epoch": 0.596383224317045, + "flos": 532357678080.0, + "grad_norm": 0.04537931846822051, + "language_loss": 0.83096731, + "learning_rate": 0.0003697380537253339, + "loss": 0.84179258, + "num_input_tokens_seen": 258608960, + "router_z_loss_mlp": 0.34350586, + "step": 3100, + "time_per_iteration": 2.6156232357025146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082884, + "balance_loss_mlp": 1.04766929, + "epoch": 0.5965756060023086, + "flos": 590922169344.0, + "grad_norm": 0.060003355935897486, + "language_loss": 0.81679451, + "learning_rate": 0.0003694372950867471, + "loss": 0.82762337, + "num_input_tokens_seen": 258684304, + "router_z_loss_mlp": 0.3527832, + "step": 3101, + "time_per_iteration": 2.746100902557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084672, + "balance_loss_mlp": 1.04967189, + "epoch": 0.5967679876875721, + "flos": 861701760000.0, + "grad_norm": 0.05796500812003716, + "language_loss": 0.77373374, + "learning_rate": 0.0003691365871370976, + "loss": 0.78458047, + "num_input_tokens_seen": 258769472, + "router_z_loss_mlp": 0.3503418, + "step": 3102, + "time_per_iteration": 3.0448250770568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082291, + "balance_loss_mlp": 1.04710054, + "epoch": 0.5969603693728357, + "flos": 553574241792.0, + "grad_norm": 0.05791620467430745, + "language_loss": 0.854276, + "learning_rate": 0.00036883592999313093, + "loss": 0.86509889, + "num_input_tokens_seen": 258841696, + "router_z_loss_mlp": 0.35229492, + "step": 3103, + "time_per_iteration": 2.650810718536377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082187, + "balance_loss_mlp": 1.04666269, + "epoch": 0.5971527510580993, + "flos": 718345207296.0, + "grad_norm": 0.05277795957282848, + "language_loss": 0.79037023, + "learning_rate": 0.0003685353237715722, + "loss": 0.80119205, + "num_input_tokens_seen": 258915616, + "router_z_loss_mlp": 0.35546875, + "step": 3104, + "time_per_iteration": 2.87162184715271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082645, + "balance_loss_mlp": 1.04812241, + "epoch": 0.5973451327433629, + "flos": 647324573184.0, + "grad_norm": 0.05039525348103138, + "language_loss": 0.81437027, + "learning_rate": 0.0003682347685891274, + "loss": 0.82519674, + "num_input_tokens_seen": 258994080, + "router_z_loss_mlp": 0.34570312, + "step": 3105, + "time_per_iteration": 2.844632863998413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078322, + "balance_loss_mlp": 1.04284513, + "epoch": 0.5975375144286263, + "flos": 721374446592.0, + "grad_norm": 0.053848168408106474, + "language_loss": 0.80436707, + "learning_rate": 0.0003679342645624822, + "loss": 0.81515038, + "num_input_tokens_seen": 259075968, + "router_z_loss_mlp": 0.35498047, + "step": 3106, + "time_per_iteration": 2.961121082305908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079962, + "balance_loss_mlp": 1.04374671, + "epoch": 0.5977298961138899, + "flos": 750616699392.0, + "grad_norm": 0.04889819009677852, + "language_loss": 0.8164891, + "learning_rate": 0.0003676338118083025, + "loss": 0.82728875, + "num_input_tokens_seen": 259162512, + "router_z_loss_mlp": 0.36230469, + "step": 3107, + "time_per_iteration": 2.997671127319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076541, + "balance_loss_mlp": 1.04161251, + "epoch": 0.5979222777991535, + "flos": 530703360000.0, + "grad_norm": 0.05034919609110883, + "language_loss": 0.79592144, + "learning_rate": 0.0003673334104432347, + "loss": 0.80668688, + "num_input_tokens_seen": 259228752, + "router_z_loss_mlp": 0.34960938, + "step": 3108, + "time_per_iteration": 2.5946898460388184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079709, + "balance_loss_mlp": 1.04461432, + "epoch": 0.5981146594844171, + "flos": 621459357696.0, + "grad_norm": 0.04952863942356172, + "language_loss": 0.83337331, + "learning_rate": 0.0003670330605839048, + "loss": 0.84417045, + "num_input_tokens_seen": 259303440, + "router_z_loss_mlp": 0.35131836, + "step": 3109, + "time_per_iteration": 2.7955031394958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080275, + "balance_loss_mlp": 1.04470301, + "epoch": 0.5983070411696807, + "flos": 603309911040.0, + "grad_norm": 0.05233505638894281, + "language_loss": 0.76384044, + "learning_rate": 0.0003667327623469191, + "loss": 0.77464318, + "num_input_tokens_seen": 259378752, + "router_z_loss_mlp": 0.35571289, + "step": 3110, + "time_per_iteration": 2.7939095497131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080046, + "balance_loss_mlp": 1.04516506, + "epoch": 0.5984994228549442, + "flos": 633187971072.0, + "grad_norm": 0.05191698416970628, + "language_loss": 0.7765972, + "learning_rate": 0.00036643251584886333, + "loss": 0.78739762, + "num_input_tokens_seen": 259454336, + "router_z_loss_mlp": 0.34912109, + "step": 3111, + "time_per_iteration": 2.821956157684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076682, + "balance_loss_mlp": 1.0426122, + "epoch": 0.5986918045402078, + "flos": 525026022912.0, + "grad_norm": 0.05255438672232182, + "language_loss": 0.81679058, + "learning_rate": 0.00036613232120630393, + "loss": 0.82755744, + "num_input_tokens_seen": 259518960, + "router_z_loss_mlp": 0.34106445, + "step": 3112, + "time_per_iteration": 2.61639142036438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072562, + "balance_loss_mlp": 1.03751469, + "epoch": 0.5988841862254713, + "flos": 482942103552.0, + "grad_norm": 0.06309856820969045, + "language_loss": 0.8010537, + "learning_rate": 0.00036583217853578643, + "loss": 0.81177926, + "num_input_tokens_seen": 259584352, + "router_z_loss_mlp": 0.35083008, + "step": 3113, + "time_per_iteration": 2.544152021408081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076935, + "balance_loss_mlp": 1.04241252, + "epoch": 0.5990765679107349, + "flos": 1139658674688.0, + "grad_norm": 0.05746596179478014, + "language_loss": 0.7739538, + "learning_rate": 0.000365532087953837, + "loss": 0.78472316, + "num_input_tokens_seen": 259693152, + "router_z_loss_mlp": 0.34545898, + "step": 3114, + "time_per_iteration": 3.6210074424743652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074738, + "balance_loss_mlp": 1.04104948, + "epoch": 0.5992689495959984, + "flos": 516729701376.0, + "grad_norm": 0.0590793434639382, + "language_loss": 0.89283043, + "learning_rate": 0.00036523204957696065, + "loss": 0.9035778, + "num_input_tokens_seen": 259762048, + "router_z_loss_mlp": 0.3371582, + "step": 3115, + "time_per_iteration": 2.5835559368133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079472, + "balance_loss_mlp": 1.0447346, + "epoch": 0.599461331281262, + "flos": 744288998400.0, + "grad_norm": 0.05148674480480004, + "language_loss": 0.80590332, + "learning_rate": 0.00036493206352164324, + "loss": 0.81669807, + "num_input_tokens_seen": 259843184, + "router_z_loss_mlp": 0.34790039, + "step": 3116, + "time_per_iteration": 2.9135849475860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073646, + "balance_loss_mlp": 1.03960013, + "epoch": 0.5996537129665256, + "flos": 592078892544.0, + "grad_norm": 0.05828379622393402, + "language_loss": 0.85252976, + "learning_rate": 0.000364632129904349, + "loss": 0.86326623, + "num_input_tokens_seen": 259912720, + "router_z_loss_mlp": 0.34082031, + "step": 3117, + "time_per_iteration": 2.7019104957580566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107531, + "balance_loss_mlp": 1.03997648, + "epoch": 0.5998460946517892, + "flos": 558735045120.0, + "grad_norm": 0.05080253376139345, + "language_loss": 0.77507442, + "learning_rate": 0.00036433224884152283, + "loss": 0.78582752, + "num_input_tokens_seen": 259985472, + "router_z_loss_mlp": 0.35375977, + "step": 3118, + "time_per_iteration": 2.698032855987549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082073, + "balance_loss_mlp": 1.04814649, + "epoch": 0.6000384763370528, + "flos": 484325789184.0, + "grad_norm": 0.058104830427354655, + "language_loss": 0.77595496, + "learning_rate": 0.00036403242044958875, + "loss": 0.78677565, + "num_input_tokens_seen": 260050336, + "router_z_loss_mlp": 0.33959961, + "step": 3119, + "time_per_iteration": 2.5694661140441895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082708, + "balance_loss_mlp": 1.04763699, + "epoch": 0.6002308580223162, + "flos": 596490407424.0, + "grad_norm": 0.05350136271967441, + "language_loss": 0.91317761, + "learning_rate": 0.0003637326448449507, + "loss": 0.92400473, + "num_input_tokens_seen": 260120304, + "router_z_loss_mlp": 0.35083008, + "step": 3120, + "time_per_iteration": 2.7095799446105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083063, + "balance_loss_mlp": 1.04808724, + "epoch": 0.6004232397075798, + "flos": 544879249920.0, + "grad_norm": 0.044412764387293725, + "language_loss": 0.86037177, + "learning_rate": 0.00036343292214399177, + "loss": 0.87120235, + "num_input_tokens_seen": 260198304, + "router_z_loss_mlp": 0.34985352, + "step": 3121, + "time_per_iteration": 2.760568618774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074765, + "balance_loss_mlp": 1.04112399, + "epoch": 0.6006156213928434, + "flos": 629647990272.0, + "grad_norm": 0.05788035172914192, + "language_loss": 0.770136, + "learning_rate": 0.00036313325246307456, + "loss": 0.78088361, + "num_input_tokens_seen": 260277664, + "router_z_loss_mlp": 0.33666992, + "step": 3122, + "time_per_iteration": 2.7645843029022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081915, + "balance_loss_mlp": 1.0479641, + "epoch": 0.600808003078107, + "flos": 582043885056.0, + "grad_norm": 0.05339440368403648, + "language_loss": 0.8713336, + "learning_rate": 0.0003628336359185411, + "loss": 0.8821528, + "num_input_tokens_seen": 260350096, + "router_z_loss_mlp": 0.33984375, + "step": 3123, + "time_per_iteration": 2.704559803009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084761, + "balance_loss_mlp": 1.04961848, + "epoch": 0.6010003847633705, + "flos": 634984883712.0, + "grad_norm": 0.051464767664237604, + "language_loss": 0.7543686, + "learning_rate": 0.000362534072626713, + "loss": 0.76521623, + "num_input_tokens_seen": 260421888, + "router_z_loss_mlp": 0.35180664, + "step": 3124, + "time_per_iteration": 2.767263174057007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082659, + "balance_loss_mlp": 1.04837453, + "epoch": 0.6011927664486341, + "flos": 718448514048.0, + "grad_norm": 0.05118450522862765, + "language_loss": 0.80810112, + "learning_rate": 0.00036223456270389093, + "loss": 0.81892776, + "num_input_tokens_seen": 260499616, + "router_z_loss_mlp": 0.34326172, + "step": 3125, + "time_per_iteration": 2.972226858139038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077124, + "balance_loss_mlp": 1.04272032, + "epoch": 0.6013851481338977, + "flos": 498782486016.0, + "grad_norm": 0.0486392008074567, + "language_loss": 0.81048089, + "learning_rate": 0.00036193510626635517, + "loss": 0.82125211, + "num_input_tokens_seen": 260572048, + "router_z_loss_mlp": 0.34423828, + "step": 3126, + "time_per_iteration": 2.6381988525390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080205, + "balance_loss_mlp": 1.04620612, + "epoch": 0.6015775298191612, + "flos": 749266509312.0, + "grad_norm": 0.057928922724073975, + "language_loss": 0.81419915, + "learning_rate": 0.0003616357034303649, + "loss": 0.82500118, + "num_input_tokens_seen": 260644720, + "router_z_loss_mlp": 0.34033203, + "step": 3127, + "time_per_iteration": 2.910590887069702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077974, + "balance_loss_mlp": 1.04380846, + "epoch": 0.6017699115044248, + "flos": 592764162048.0, + "grad_norm": 0.06444067726606947, + "language_loss": 0.7886622, + "learning_rate": 0.0003613363543121584, + "loss": 0.79944193, + "num_input_tokens_seen": 260724864, + "router_z_loss_mlp": 0.34204102, + "step": 3128, + "time_per_iteration": 2.8243367671966553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081251, + "balance_loss_mlp": 1.04627466, + "epoch": 0.6019622931896883, + "flos": 514839656448.0, + "grad_norm": 0.05655060163799935, + "language_loss": 0.8488009, + "learning_rate": 0.00036103705902795357, + "loss": 0.85961336, + "num_input_tokens_seen": 260800896, + "router_z_loss_mlp": 0.35009766, + "step": 3129, + "time_per_iteration": 2.691652297973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078555, + "balance_loss_mlp": 1.0440799, + "epoch": 0.6021546748749519, + "flos": 490219914240.0, + "grad_norm": 0.11187816626328603, + "language_loss": 0.79397345, + "learning_rate": 0.0003607378176939471, + "loss": 0.80475903, + "num_input_tokens_seen": 260872736, + "router_z_loss_mlp": 0.3449707, + "step": 3130, + "time_per_iteration": 2.59126353263855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080272, + "balance_loss_mlp": 1.0459156, + "epoch": 0.6023470565602155, + "flos": 540763098624.0, + "grad_norm": 0.584663234761047, + "language_loss": 0.81865788, + "learning_rate": 0.00036043863042631465, + "loss": 0.82946062, + "num_input_tokens_seen": 260943264, + "router_z_loss_mlp": 0.34399414, + "step": 3131, + "time_per_iteration": 2.631758689880371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082979, + "balance_loss_mlp": 1.04716837, + "epoch": 0.6025394382454791, + "flos": 844660984320.0, + "grad_norm": 0.054894708667503185, + "language_loss": 0.76558393, + "learning_rate": 0.00036013949734121133, + "loss": 0.77641368, + "num_input_tokens_seen": 261030064, + "router_z_loss_mlp": 0.3581543, + "step": 3132, + "time_per_iteration": 3.091432809829712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077376, + "balance_loss_mlp": 1.04249549, + "epoch": 0.6027318199307425, + "flos": 576903430656.0, + "grad_norm": 0.05648602970445555, + "language_loss": 0.82430494, + "learning_rate": 0.00035984041855477043, + "loss": 0.83507866, + "num_input_tokens_seen": 261106496, + "router_z_loss_mlp": 0.34912109, + "step": 3133, + "time_per_iteration": 2.707841396331787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045948, + "balance_loss_mlp": 1.03345478, + "epoch": 0.6029242016160061, + "flos": 1470160585728.0, + "grad_norm": 0.017118275971869903, + "language_loss": 0.78709894, + "learning_rate": 0.00035954139418310495, + "loss": 0.79755843, + "num_input_tokens_seen": 261343248, + "router_z_loss_mlp": 0.125, + "step": 3134, + "time_per_iteration": 4.929067373275757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077811, + "balance_loss_mlp": 1.0416429, + "epoch": 0.6031165833012697, + "flos": 480486062592.0, + "grad_norm": 0.057341971523643794, + "language_loss": 0.79656577, + "learning_rate": 0.00035924242434230637, + "loss": 0.80734384, + "num_input_tokens_seen": 261416704, + "router_z_loss_mlp": 0.36181641, + "step": 3135, + "time_per_iteration": 2.6362884044647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078775, + "balance_loss_mlp": 1.04294014, + "epoch": 0.6033089649865333, + "flos": 499220444160.0, + "grad_norm": 0.48805573037273664, + "language_loss": 0.78477532, + "learning_rate": 0.00035894350914844516, + "loss": 0.79556304, + "num_input_tokens_seen": 261486688, + "router_z_loss_mlp": 0.35864258, + "step": 3136, + "time_per_iteration": 2.5889768600463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095737, + "balance_loss_mlp": 1.05961668, + "epoch": 0.6035013466717969, + "flos": 556337230848.0, + "grad_norm": 0.06198645185938339, + "language_loss": 0.828888, + "learning_rate": 0.0003586446487175703, + "loss": 0.83984536, + "num_input_tokens_seen": 261557344, + "router_z_loss_mlp": 0.36132812, + "step": 3137, + "time_per_iteration": 2.6805853843688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105357, + "balance_loss_mlp": 1.06690025, + "epoch": 0.6036937283570604, + "flos": 594536343552.0, + "grad_norm": 0.04857529981882101, + "language_loss": 0.85242814, + "learning_rate": 0.0003583458431657099, + "loss": 0.86348164, + "num_input_tokens_seen": 261626240, + "router_z_loss_mlp": 0.3840332, + "step": 3138, + "time_per_iteration": 2.8694372177124023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107329, + "balance_loss_mlp": 1.0691824, + "epoch": 0.603886110042324, + "flos": 540684523008.0, + "grad_norm": 0.0686265379907432, + "language_loss": 0.82493383, + "learning_rate": 0.00035804709260887056, + "loss": 0.83600712, + "num_input_tokens_seen": 261696368, + "router_z_loss_mlp": 0.38110352, + "step": 3139, + "time_per_iteration": 2.6613197326660156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111658, + "balance_loss_mlp": 1.07664514, + "epoch": 0.6040784917275875, + "flos": 518315618304.0, + "grad_norm": 0.04727969625034485, + "language_loss": 0.89413351, + "learning_rate": 0.0003577483971630373, + "loss": 0.90529931, + "num_input_tokens_seen": 261769104, + "router_z_loss_mlp": 0.39916992, + "step": 3140, + "time_per_iteration": 2.6468544006347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112858, + "balance_loss_mlp": 1.08752418, + "epoch": 0.6042708734128511, + "flos": 660436872192.0, + "grad_norm": 0.0491739702694389, + "language_loss": 0.84699506, + "learning_rate": 0.00035744975694417414, + "loss": 0.8582809, + "num_input_tokens_seen": 261844880, + "router_z_loss_mlp": 0.41064453, + "step": 3141, + "time_per_iteration": 2.8567256927490234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128353, + "balance_loss_mlp": 1.0867728, + "epoch": 0.6044632550981146, + "flos": 572035018752.0, + "grad_norm": 0.05704066286420323, + "language_loss": 0.82333231, + "learning_rate": 0.00035715117206822344, + "loss": 0.83461583, + "num_input_tokens_seen": 261923280, + "router_z_loss_mlp": 0.41577148, + "step": 3142, + "time_per_iteration": 2.7504515647888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141414, + "balance_loss_mlp": 1.09892821, + "epoch": 0.6046556367833782, + "flos": 546420086784.0, + "grad_norm": 0.06612582666460322, + "language_loss": 0.80943495, + "learning_rate": 0.0003568526426511065, + "loss": 0.82084912, + "num_input_tokens_seen": 261990832, + "router_z_loss_mlp": 0.42456055, + "step": 3143, + "time_per_iteration": 2.6085774898529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140491, + "balance_loss_mlp": 1.09817219, + "epoch": 0.6048480184686418, + "flos": 776505235968.0, + "grad_norm": 0.064383973380027, + "language_loss": 0.82750165, + "learning_rate": 0.000356554168808722, + "loss": 0.83890665, + "num_input_tokens_seen": 262063760, + "router_z_loss_mlp": 0.42358398, + "step": 3144, + "time_per_iteration": 2.9655168056488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140449, + "balance_loss_mlp": 1.09834385, + "epoch": 0.6050404001539054, + "flos": 656837254656.0, + "grad_norm": 0.05900200764303625, + "language_loss": 0.85025299, + "learning_rate": 0.00035625575065694837, + "loss": 0.8616575, + "num_input_tokens_seen": 262137968, + "router_z_loss_mlp": 0.42114258, + "step": 3145, + "time_per_iteration": 2.826193332672119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134203, + "balance_loss_mlp": 1.09159803, + "epoch": 0.605232781839169, + "flos": 548710212096.0, + "grad_norm": 0.05530707742448767, + "language_loss": 0.77449524, + "learning_rate": 0.0003559573883116415, + "loss": 0.78583729, + "num_input_tokens_seen": 262211264, + "router_z_loss_mlp": 0.42626953, + "step": 3146, + "time_per_iteration": 2.6936702728271484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114118, + "balance_loss_mlp": 1.0976212, + "epoch": 0.6054251635244324, + "flos": 605093677056.0, + "grad_norm": 0.08058095897808437, + "language_loss": 0.85587645, + "learning_rate": 0.00035565908188863604, + "loss": 0.86728823, + "num_input_tokens_seen": 262289648, + "router_z_loss_mlp": 0.43579102, + "step": 3147, + "time_per_iteration": 2.8229072093963623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113968, + "balance_loss_mlp": 1.09488153, + "epoch": 0.605617545209696, + "flos": 613398763008.0, + "grad_norm": 0.05127524075744011, + "language_loss": 0.79730809, + "learning_rate": 0.00035536083150374464, + "loss": 0.80870491, + "num_input_tokens_seen": 262362704, + "router_z_loss_mlp": 0.44799805, + "step": 3148, + "time_per_iteration": 2.782287836074829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139029, + "balance_loss_mlp": 1.12310266, + "epoch": 0.6058099268949596, + "flos": 1497477888000.0, + "grad_norm": 0.03498965475006418, + "language_loss": 0.74747956, + "learning_rate": 0.00035506263727275893, + "loss": 0.75886977, + "num_input_tokens_seen": 262596864, + "router_z_loss_mlp": 0.15917969, + "step": 3149, + "time_per_iteration": 4.813022613525391 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128616, + "balance_loss_mlp": 1.08696485, + "epoch": 0.6060023085802232, + "flos": 670170723840.0, + "grad_norm": 0.053702261720826414, + "language_loss": 0.85731369, + "learning_rate": 0.0003547644993114475, + "loss": 0.86859989, + "num_input_tokens_seen": 262671088, + "router_z_loss_mlp": 0.41650391, + "step": 3150, + "time_per_iteration": 2.7940874099731445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118224, + "balance_loss_mlp": 1.07688236, + "epoch": 0.6061946902654868, + "flos": 605885225472.0, + "grad_norm": 0.05286284770127293, + "language_loss": 0.79495448, + "learning_rate": 0.00035446641773555806, + "loss": 0.80613673, + "num_input_tokens_seen": 262743888, + "router_z_loss_mlp": 0.41357422, + "step": 3151, + "time_per_iteration": 2.7147328853607178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116917, + "balance_loss_mlp": 1.07567072, + "epoch": 0.6063870719507503, + "flos": 557568147456.0, + "grad_norm": 0.052762165498596546, + "language_loss": 0.86798322, + "learning_rate": 0.000354168392660816, + "loss": 0.87915242, + "num_input_tokens_seen": 262819616, + "router_z_loss_mlp": 0.41235352, + "step": 3152, + "time_per_iteration": 2.7346954345703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115838, + "balance_loss_mlp": 1.07583165, + "epoch": 0.6065794536360138, + "flos": 556874113536.0, + "grad_norm": 0.05405599690586098, + "language_loss": 0.82799989, + "learning_rate": 0.0003538704242029252, + "loss": 0.8391583, + "num_input_tokens_seen": 262893984, + "router_z_loss_mlp": 0.39990234, + "step": 3153, + "time_per_iteration": 2.705004930496216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112343, + "balance_loss_mlp": 1.07169282, + "epoch": 0.6067718353212774, + "flos": 689836276224.0, + "grad_norm": 0.05919499383434511, + "language_loss": 0.77963281, + "learning_rate": 0.0003535725124775672, + "loss": 0.79075623, + "num_input_tokens_seen": 262969648, + "router_z_loss_mlp": 0.40649414, + "step": 3154, + "time_per_iteration": 2.8201727867126465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110719, + "balance_loss_mlp": 1.07147574, + "epoch": 0.606964217006541, + "flos": 521531122176.0, + "grad_norm": 0.06643297661580516, + "language_loss": 0.86598241, + "learning_rate": 0.00035327465760040126, + "loss": 0.87708956, + "num_input_tokens_seen": 263042048, + "router_z_loss_mlp": 0.39233398, + "step": 3155, + "time_per_iteration": 2.6584889888763428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100039, + "balance_loss_mlp": 1.06201148, + "epoch": 0.6071565986918045, + "flos": 641267504640.0, + "grad_norm": 0.0597836437175205, + "language_loss": 0.84776556, + "learning_rate": 0.00035297685968706526, + "loss": 0.85876596, + "num_input_tokens_seen": 263108032, + "router_z_loss_mlp": 0.37988281, + "step": 3156, + "time_per_iteration": 2.752196788787842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109676, + "balance_loss_mlp": 1.05708754, + "epoch": 0.6073489803770681, + "flos": 560315169792.0, + "grad_norm": 0.05609890059594196, + "language_loss": 0.8300876, + "learning_rate": 0.00035267911885317454, + "loss": 0.84105527, + "num_input_tokens_seen": 263175184, + "router_z_loss_mlp": 0.39672852, + "step": 3157, + "time_per_iteration": 2.629136562347412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109981, + "balance_loss_mlp": 1.06121039, + "epoch": 0.6075413620623317, + "flos": 585810828288.0, + "grad_norm": 0.05476186910904592, + "language_loss": 0.81797791, + "learning_rate": 0.0003523814352143222, + "loss": 0.82897604, + "num_input_tokens_seen": 263252768, + "router_z_loss_mlp": 0.38598633, + "step": 3158, + "time_per_iteration": 2.8239855766296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087443, + "balance_loss_mlp": 1.04953456, + "epoch": 0.6077337437475953, + "flos": 630523906560.0, + "grad_norm": 0.060962114442721135, + "language_loss": 0.90981984, + "learning_rate": 0.00035208380888607937, + "loss": 0.92069423, + "num_input_tokens_seen": 263328720, + "router_z_loss_mlp": 0.37866211, + "step": 3159, + "time_per_iteration": 2.754648208618164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068868, + "balance_loss_mlp": 1.05542111, + "epoch": 0.6079261254328588, + "flos": 1467726455808.0, + "grad_norm": 0.024644792756990472, + "language_loss": 0.79461986, + "learning_rate": 0.000351786239983995, + "loss": 0.80530852, + "num_input_tokens_seen": 263554656, + "router_z_loss_mlp": 0.13476562, + "step": 3160, + "time_per_iteration": 4.849771022796631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066394, + "balance_loss_mlp": 1.05323327, + "epoch": 0.6081185071181223, + "flos": 1522233022464.0, + "grad_norm": 0.022600356712689354, + "language_loss": 0.7569223, + "learning_rate": 0.00035148872862359517, + "loss": 0.76758623, + "num_input_tokens_seen": 263791600, + "router_z_loss_mlp": 0.13183594, + "step": 3161, + "time_per_iteration": 5.017123699188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081969, + "balance_loss_mlp": 1.04530025, + "epoch": 0.6083108888033859, + "flos": 556041867264.0, + "grad_norm": 0.07058889288065262, + "language_loss": 0.81635529, + "learning_rate": 0.00035119127492038446, + "loss": 0.82717502, + "num_input_tokens_seen": 263869744, + "router_z_loss_mlp": 0.3671875, + "step": 3162, + "time_per_iteration": 2.7839951515197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083576, + "balance_loss_mlp": 1.0463115, + "epoch": 0.6085032704886495, + "flos": 840819847680.0, + "grad_norm": 0.052086088834636966, + "language_loss": 0.82480276, + "learning_rate": 0.00035089387898984436, + "loss": 0.83563852, + "num_input_tokens_seen": 263946624, + "router_z_loss_mlp": 0.37207031, + "step": 3163, + "time_per_iteration": 3.0475828647613525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079298, + "balance_loss_mlp": 1.04267716, + "epoch": 0.6086956521739131, + "flos": 684493590528.0, + "grad_norm": 0.05636679470966986, + "language_loss": 0.81840444, + "learning_rate": 0.0003505965409474343, + "loss": 0.82919747, + "num_input_tokens_seen": 264022064, + "router_z_loss_mlp": 0.36621094, + "step": 3164, + "time_per_iteration": 2.8719167709350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081512, + "balance_loss_mlp": 1.04453373, + "epoch": 0.6088880338591766, + "flos": 535533894144.0, + "grad_norm": 0.05767475367988954, + "language_loss": 0.86591709, + "learning_rate": 0.0003502992609085913, + "loss": 0.87673223, + "num_input_tokens_seen": 264089520, + "router_z_loss_mlp": 0.36962891, + "step": 3165, + "time_per_iteration": 2.6596477031707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076933, + "balance_loss_mlp": 1.04007339, + "epoch": 0.6090804155444401, + "flos": 731197048320.0, + "grad_norm": 0.05479022562545965, + "language_loss": 0.82799208, + "learning_rate": 0.00035000203898872954, + "loss": 0.83876145, + "num_input_tokens_seen": 264173056, + "router_z_loss_mlp": 0.3684082, + "step": 3166, + "time_per_iteration": 2.985320568084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076201, + "balance_loss_mlp": 1.03845954, + "epoch": 0.6092727972297037, + "flos": 698708768256.0, + "grad_norm": 0.05187712745412687, + "language_loss": 0.84401566, + "learning_rate": 0.0003497048753032406, + "loss": 0.85477769, + "num_input_tokens_seen": 264250912, + "router_z_loss_mlp": 0.37695312, + "step": 3167, + "time_per_iteration": 2.876997470855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079403, + "balance_loss_mlp": 1.04213786, + "epoch": 0.6094651789149673, + "flos": 1051515869184.0, + "grad_norm": 0.16368682108793797, + "language_loss": 0.81000876, + "learning_rate": 0.000349407769967494, + "loss": 0.82080269, + "num_input_tokens_seen": 264342800, + "router_z_loss_mlp": 0.37255859, + "step": 3168, + "time_per_iteration": 3.376215696334839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074675, + "balance_loss_mlp": 1.03721976, + "epoch": 0.6096575606002309, + "flos": 502834618368.0, + "grad_norm": 0.047663268241493265, + "language_loss": 0.84680313, + "learning_rate": 0.0003491107230968361, + "loss": 0.85754991, + "num_input_tokens_seen": 264413664, + "router_z_loss_mlp": 0.37475586, + "step": 3169, + "time_per_iteration": 2.660513401031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076232, + "balance_loss_mlp": 1.03872895, + "epoch": 0.6098499422854944, + "flos": 585339374592.0, + "grad_norm": 0.13699074886281146, + "language_loss": 0.81564283, + "learning_rate": 0.00034881373480659085, + "loss": 0.82640517, + "num_input_tokens_seen": 264494944, + "router_z_loss_mlp": 0.37475586, + "step": 3170, + "time_per_iteration": 2.831681728363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081961, + "balance_loss_mlp": 1.04364741, + "epoch": 0.610042323970758, + "flos": 468968444928.0, + "grad_norm": 0.06190459758057804, + "language_loss": 0.77871358, + "learning_rate": 0.0003485168052120594, + "loss": 0.78953326, + "num_input_tokens_seen": 264561664, + "router_z_loss_mlp": 0.3828125, + "step": 3171, + "time_per_iteration": 2.5600767135620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081834, + "balance_loss_mlp": 1.04387796, + "epoch": 0.6102347056560216, + "flos": 513923042304.0, + "grad_norm": 0.0838552496522472, + "language_loss": 0.80047345, + "learning_rate": 0.00034821993442851973, + "loss": 0.81129181, + "num_input_tokens_seen": 264626256, + "router_z_loss_mlp": 0.37890625, + "step": 3172, + "time_per_iteration": 2.564009666442871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082985, + "balance_loss_mlp": 1.0452435, + "epoch": 0.6104270873412851, + "flos": 468776388096.0, + "grad_norm": 0.05938555160639068, + "language_loss": 0.82216555, + "learning_rate": 0.00034792312257122735, + "loss": 0.83299541, + "num_input_tokens_seen": 264692768, + "router_z_loss_mlp": 0.37719727, + "step": 3173, + "time_per_iteration": 2.6151862144470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078889, + "balance_loss_mlp": 1.04012203, + "epoch": 0.6106194690265486, + "flos": 549610859520.0, + "grad_norm": 0.05423157525738513, + "language_loss": 0.80451965, + "learning_rate": 0.00034762636975541506, + "loss": 0.81530857, + "num_input_tokens_seen": 264764816, + "router_z_loss_mlp": 0.38720703, + "step": 3174, + "time_per_iteration": 2.627699375152588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107833, + "balance_loss_mlp": 1.03965902, + "epoch": 0.6108118507118122, + "flos": 472602968064.0, + "grad_norm": 0.06986619017952604, + "language_loss": 0.80950004, + "learning_rate": 0.0003473296760962923, + "loss": 0.82028335, + "num_input_tokens_seen": 264837968, + "router_z_loss_mlp": 0.38647461, + "step": 3175, + "time_per_iteration": 2.6790359020233154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073231, + "balance_loss_mlp": 1.06111896, + "epoch": 0.6110042323970758, + "flos": 1444416205824.0, + "grad_norm": 0.03162499472670903, + "language_loss": 0.78533739, + "learning_rate": 0.00034703304170904617, + "loss": 0.79606968, + "num_input_tokens_seen": 265058336, + "router_z_loss_mlp": 0.12109375, + "step": 3176, + "time_per_iteration": 4.660337924957275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078339, + "balance_loss_mlp": 1.03966713, + "epoch": 0.6111966140823394, + "flos": 793807879680.0, + "grad_norm": 0.05300706067189078, + "language_loss": 0.8120122, + "learning_rate": 0.00034673646670883976, + "loss": 0.82279563, + "num_input_tokens_seen": 265135920, + "router_z_loss_mlp": 0.38623047, + "step": 3177, + "time_per_iteration": 2.9990971088409424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046334, + "balance_loss_mlp": 1.03431749, + "epoch": 0.611388995767603, + "flos": 1556800432128.0, + "grad_norm": 0.020411675518342276, + "language_loss": 0.75715023, + "learning_rate": 0.0003464399512108141, + "loss": 0.76761359, + "num_input_tokens_seen": 265374464, + "router_z_loss_mlp": 0.12011719, + "step": 3178, + "time_per_iteration": 5.060986280441284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078249, + "balance_loss_mlp": 1.03948236, + "epoch": 0.6115813774528664, + "flos": 711841416192.0, + "grad_norm": 0.052313854365800355, + "language_loss": 0.81582487, + "learning_rate": 0.0003461434953300865, + "loss": 0.82660735, + "num_input_tokens_seen": 265450112, + "router_z_loss_mlp": 0.38745117, + "step": 3179, + "time_per_iteration": 2.8902480602264404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073999, + "balance_loss_mlp": 1.03535175, + "epoch": 0.61177375913813, + "flos": 683963910144.0, + "grad_norm": 0.0432149263415984, + "language_loss": 0.81232655, + "learning_rate": 0.0003458470991817515, + "loss": 0.82306653, + "num_input_tokens_seen": 265534336, + "router_z_loss_mlp": 0.38598633, + "step": 3180, + "time_per_iteration": 2.9921305179595947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078708, + "balance_loss_mlp": 1.04068065, + "epoch": 0.6119661408233936, + "flos": 511411746816.0, + "grad_norm": 0.056171077714967085, + "language_loss": 0.84767073, + "learning_rate": 0.0003455507628808802, + "loss": 0.8584578, + "num_input_tokens_seen": 265604480, + "router_z_loss_mlp": 0.38012695, + "step": 3181, + "time_per_iteration": 2.5818896293640137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073399, + "balance_loss_mlp": 1.03527629, + "epoch": 0.6121585225086572, + "flos": 556548226560.0, + "grad_norm": 0.057403680596608046, + "language_loss": 0.8451159, + "learning_rate": 0.00034525448654252076, + "loss": 0.85584986, + "num_input_tokens_seen": 265670848, + "router_z_loss_mlp": 0.38085938, + "step": 3182, + "time_per_iteration": 2.6865382194519043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072137, + "balance_loss_mlp": 1.03384721, + "epoch": 0.6123509041939207, + "flos": 561585374208.0, + "grad_norm": 0.07466059986871497, + "language_loss": 0.82914555, + "learning_rate": 0.0003449582702816976, + "loss": 0.83986694, + "num_input_tokens_seen": 265739584, + "router_z_loss_mlp": 0.3828125, + "step": 3183, + "time_per_iteration": 2.6590259075164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079784, + "balance_loss_mlp": 1.0416131, + "epoch": 0.6125432858791843, + "flos": 557789317632.0, + "grad_norm": 0.05504997733679025, + "language_loss": 0.82930607, + "learning_rate": 0.0003446621142134122, + "loss": 0.84010386, + "num_input_tokens_seen": 265810368, + "router_z_loss_mlp": 0.3815918, + "step": 3184, + "time_per_iteration": 2.7104709148406982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075053, + "balance_loss_mlp": 1.03776431, + "epoch": 0.6127356675644479, + "flos": 414796529664.0, + "grad_norm": 0.05785245107541848, + "language_loss": 0.84189403, + "learning_rate": 0.0003443660184526424, + "loss": 0.85264462, + "num_input_tokens_seen": 265871616, + "router_z_loss_mlp": 0.37255859, + "step": 3185, + "time_per_iteration": 2.441305160522461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078818, + "balance_loss_mlp": 1.04048026, + "epoch": 0.6129280492497114, + "flos": 603547047936.0, + "grad_norm": 0.04628969176382701, + "language_loss": 0.86441582, + "learning_rate": 0.0003440699831143429, + "loss": 0.87520397, + "num_input_tokens_seen": 265946672, + "router_z_loss_mlp": 0.38305664, + "step": 3186, + "time_per_iteration": 2.81016206741333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081465, + "balance_loss_mlp": 1.04474831, + "epoch": 0.613120430934975, + "flos": 519492690432.0, + "grad_norm": 0.05115957600907009, + "language_loss": 0.82288289, + "learning_rate": 0.0003437740083134449, + "loss": 0.83369744, + "num_input_tokens_seen": 266020640, + "router_z_loss_mlp": 0.3671875, + "step": 3187, + "time_per_iteration": 2.695181369781494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075008, + "balance_loss_mlp": 1.03798163, + "epoch": 0.6133128126202385, + "flos": 510835576320.0, + "grad_norm": 0.06733229983475184, + "language_loss": 0.83452654, + "learning_rate": 0.00034347809416485574, + "loss": 0.84527659, + "num_input_tokens_seen": 266085776, + "router_z_loss_mlp": 0.37011719, + "step": 3188, + "time_per_iteration": 2.5900075435638428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081504, + "balance_loss_mlp": 1.04402518, + "epoch": 0.6135051943055021, + "flos": 607264528896.0, + "grad_norm": 0.053668382142468496, + "language_loss": 0.81688702, + "learning_rate": 0.0003431822407834597, + "loss": 0.82770205, + "num_input_tokens_seen": 266157104, + "router_z_loss_mlp": 0.37475586, + "step": 3189, + "time_per_iteration": 2.8129723072052 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107806, + "balance_loss_mlp": 1.04062855, + "epoch": 0.6136975759907657, + "flos": 1159750600704.0, + "grad_norm": 0.0555928311696248, + "language_loss": 0.84534049, + "learning_rate": 0.00034288644828411706, + "loss": 0.85612106, + "num_input_tokens_seen": 266244144, + "router_z_loss_mlp": 0.37426758, + "step": 3190, + "time_per_iteration": 3.4628307819366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076796, + "balance_loss_mlp": 1.03931642, + "epoch": 0.6138899576760293, + "flos": 706631150592.0, + "grad_norm": 0.05334960591036923, + "language_loss": 0.75148171, + "learning_rate": 0.0003425907167816649, + "loss": 0.76224971, + "num_input_tokens_seen": 266319040, + "router_z_loss_mlp": 0.37475586, + "step": 3191, + "time_per_iteration": 2.867506265640259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072461, + "balance_loss_mlp": 1.03510118, + "epoch": 0.6140823393612928, + "flos": 586151271936.0, + "grad_norm": 0.05066562210294406, + "language_loss": 0.84692401, + "learning_rate": 0.00034229504639091623, + "loss": 0.85764861, + "num_input_tokens_seen": 266390784, + "router_z_loss_mlp": 0.37329102, + "step": 3192, + "time_per_iteration": 2.757969617843628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075038, + "balance_loss_mlp": 1.03722489, + "epoch": 0.6142747210465563, + "flos": 803759929344.0, + "grad_norm": 0.052233657686543596, + "language_loss": 0.79899156, + "learning_rate": 0.0003419994372266606, + "loss": 0.80974191, + "num_input_tokens_seen": 266483216, + "router_z_loss_mlp": 0.37792969, + "step": 3193, + "time_per_iteration": 3.113477945327759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074279, + "balance_loss_mlp": 1.03651392, + "epoch": 0.6144671027318199, + "flos": 529158140928.0, + "grad_norm": 0.04106506245407052, + "language_loss": 0.81734288, + "learning_rate": 0.00034170388940366335, + "loss": 0.82808566, + "num_input_tokens_seen": 266557344, + "router_z_loss_mlp": 0.37744141, + "step": 3194, + "time_per_iteration": 2.6896331310272217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078888, + "balance_loss_mlp": 1.04190898, + "epoch": 0.6146594844170835, + "flos": 805054864896.0, + "grad_norm": 0.05108636633203802, + "language_loss": 0.80077958, + "learning_rate": 0.0003414084030366667, + "loss": 0.8115685, + "num_input_tokens_seen": 266639488, + "router_z_loss_mlp": 0.36987305, + "step": 3195, + "time_per_iteration": 3.083922863006592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078724, + "balance_loss_mlp": 1.04134059, + "epoch": 0.6148518661023471, + "flos": 501431993856.0, + "grad_norm": 0.05057450968707768, + "language_loss": 0.82827139, + "learning_rate": 0.0003411129782403883, + "loss": 0.83905864, + "num_input_tokens_seen": 266711168, + "router_z_loss_mlp": 0.3737793, + "step": 3196, + "time_per_iteration": 2.641129970550537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107445, + "balance_loss_mlp": 1.03720951, + "epoch": 0.6150442477876106, + "flos": 510436905984.0, + "grad_norm": 0.062166834979967195, + "language_loss": 0.84822834, + "learning_rate": 0.0003408176151295225, + "loss": 0.85897291, + "num_input_tokens_seen": 266777632, + "router_z_loss_mlp": 0.37207031, + "step": 3197, + "time_per_iteration": 2.5532026290893555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071235, + "balance_loss_mlp": 1.03425658, + "epoch": 0.6152366294728742, + "flos": 526758916608.0, + "grad_norm": 0.06002763695561428, + "language_loss": 0.770096, + "learning_rate": 0.00034052231381873944, + "loss": 0.78080833, + "num_input_tokens_seen": 266842880, + "router_z_loss_mlp": 0.36962891, + "step": 3198, + "time_per_iteration": 2.6175601482391357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107449, + "balance_loss_mlp": 1.03746367, + "epoch": 0.6154290111581378, + "flos": 473055482880.0, + "grad_norm": 0.053906213257321506, + "language_loss": 0.85027397, + "learning_rate": 0.00034022707442268494, + "loss": 0.86101884, + "num_input_tokens_seen": 266909504, + "router_z_loss_mlp": 0.37060547, + "step": 3199, + "time_per_iteration": 2.5418269634246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075667, + "balance_loss_mlp": 1.03985643, + "epoch": 0.6156213928434013, + "flos": 550542030336.0, + "grad_norm": 0.04138117039405258, + "language_loss": 0.81766355, + "learning_rate": 0.0003399318970559813, + "loss": 0.82842016, + "num_input_tokens_seen": 266988880, + "router_z_loss_mlp": 0.35864258, + "step": 3200, + "time_per_iteration": 2.8180348873138428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074649, + "balance_loss_mlp": 1.03795648, + "epoch": 0.6158137745286649, + "flos": 750587586048.0, + "grad_norm": 0.04925803113162635, + "language_loss": 0.84793299, + "learning_rate": 0.00033963678183322656, + "loss": 0.85867941, + "num_input_tokens_seen": 267074512, + "router_z_loss_mlp": 0.36694336, + "step": 3201, + "time_per_iteration": 3.032935857772827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107717, + "balance_loss_mlp": 1.04035842, + "epoch": 0.6160061562139284, + "flos": 555544272384.0, + "grad_norm": 0.0447157472200271, + "language_loss": 0.82589877, + "learning_rate": 0.0003393417288689945, + "loss": 0.8366704, + "num_input_tokens_seen": 267147952, + "router_z_loss_mlp": 0.36816406, + "step": 3202, + "time_per_iteration": 2.675895929336548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076788, + "balance_loss_mlp": 1.03976154, + "epoch": 0.616198537899192, + "flos": 741856278528.0, + "grad_norm": 0.0597641092397592, + "language_loss": 0.75911278, + "learning_rate": 0.00033904673827783504, + "loss": 0.76988065, + "num_input_tokens_seen": 267224368, + "router_z_loss_mlp": 0.37060547, + "step": 3203, + "time_per_iteration": 2.930006265640259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078121, + "balance_loss_mlp": 1.04111826, + "epoch": 0.6163909195844556, + "flos": 478569876480.0, + "grad_norm": 0.09425885378712065, + "language_loss": 0.8152014, + "learning_rate": 0.00033875181017427357, + "loss": 0.82598263, + "num_input_tokens_seen": 267292688, + "router_z_loss_mlp": 0.36962891, + "step": 3204, + "time_per_iteration": 2.624331474304199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071675, + "balance_loss_mlp": 1.03524435, + "epoch": 0.6165833012697192, + "flos": 531231478272.0, + "grad_norm": 0.05217722063945812, + "language_loss": 0.80865437, + "learning_rate": 0.00033845694467281133, + "loss": 0.8193711, + "num_input_tokens_seen": 267371888, + "router_z_loss_mlp": 0.36450195, + "step": 3205, + "time_per_iteration": 2.8210368156433105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107918, + "balance_loss_mlp": 1.0422256, + "epoch": 0.6167756829549826, + "flos": 807384278016.0, + "grad_norm": 0.04964273497495854, + "language_loss": 0.83231258, + "learning_rate": 0.00033816214188792516, + "loss": 0.84310448, + "num_input_tokens_seen": 267458784, + "router_z_loss_mlp": 0.36938477, + "step": 3206, + "time_per_iteration": 3.148005485534668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074637, + "balance_loss_mlp": 1.03782535, + "epoch": 0.6169680646402462, + "flos": 488683459584.0, + "grad_norm": 0.053298610353503126, + "language_loss": 0.85231054, + "learning_rate": 0.00033786740193406784, + "loss": 0.8630569, + "num_input_tokens_seen": 267528528, + "router_z_loss_mlp": 0.36791992, + "step": 3207, + "time_per_iteration": 2.576956272125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083512, + "balance_loss_mlp": 1.04693818, + "epoch": 0.6171604463255098, + "flos": 618643934208.0, + "grad_norm": 0.05970709396928862, + "language_loss": 0.81620336, + "learning_rate": 0.00033757272492566736, + "loss": 0.82703847, + "num_input_tokens_seen": 267611152, + "router_z_loss_mlp": 0.3659668, + "step": 3208, + "time_per_iteration": 2.8902554512023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077083, + "balance_loss_mlp": 1.04070079, + "epoch": 0.6173528280107734, + "flos": 528600909312.0, + "grad_norm": 0.043205070358092235, + "language_loss": 0.87206829, + "learning_rate": 0.0003372781109771278, + "loss": 0.88283914, + "num_input_tokens_seen": 267681520, + "router_z_loss_mlp": 0.36401367, + "step": 3209, + "time_per_iteration": 2.688534736633301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077515, + "balance_loss_mlp": 1.04036927, + "epoch": 0.617545209696037, + "flos": 596293968384.0, + "grad_norm": 0.05036658648833462, + "language_loss": 0.76489538, + "learning_rate": 0.0003369835602028281, + "loss": 0.77567053, + "num_input_tokens_seen": 267758768, + "router_z_loss_mlp": 0.37158203, + "step": 3210, + "time_per_iteration": 2.7890372276306152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073752, + "balance_loss_mlp": 1.03763127, + "epoch": 0.6177375913813005, + "flos": 474848013312.0, + "grad_norm": 0.06457582449248328, + "language_loss": 0.7954967, + "learning_rate": 0.0003366890727171232, + "loss": 0.80623418, + "num_input_tokens_seen": 267831056, + "router_z_loss_mlp": 0.36132812, + "step": 3211, + "time_per_iteration": 2.6358649730682373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076437, + "balance_loss_mlp": 1.03983986, + "epoch": 0.617929973066564, + "flos": 529546636800.0, + "grad_norm": 0.051638543668130914, + "language_loss": 0.78236932, + "learning_rate": 0.00033639464863434313, + "loss": 0.79313374, + "num_input_tokens_seen": 267898416, + "router_z_loss_mlp": 0.36621094, + "step": 3212, + "time_per_iteration": 2.6086573600769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104581, + "balance_loss_mlp": 1.03403246, + "epoch": 0.6181223547518276, + "flos": 1419361477632.0, + "grad_norm": 0.031029800070293646, + "language_loss": 0.78442466, + "learning_rate": 0.00033610028806879363, + "loss": 0.79488277, + "num_input_tokens_seen": 268112864, + "router_z_loss_mlp": 0.11767578, + "step": 3213, + "time_per_iteration": 4.67001748085022 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082687, + "balance_loss_mlp": 1.04608989, + "epoch": 0.6183147364370912, + "flos": 739976408064.0, + "grad_norm": 0.057199257803381136, + "language_loss": 0.79338527, + "learning_rate": 0.00033580599113475543, + "loss": 0.80421209, + "num_input_tokens_seen": 268198368, + "router_z_loss_mlp": 0.36572266, + "step": 3214, + "time_per_iteration": 2.9583098888397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084038, + "balance_loss_mlp": 1.04791784, + "epoch": 0.6185071181223547, + "flos": 381442507776.0, + "grad_norm": 0.04917291397631135, + "language_loss": 0.85787857, + "learning_rate": 0.00033551175794648507, + "loss": 0.86871898, + "num_input_tokens_seen": 268260704, + "router_z_loss_mlp": 0.36108398, + "step": 3215, + "time_per_iteration": 2.450173854827881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079107, + "balance_loss_mlp": 1.04191399, + "epoch": 0.6186994998076183, + "flos": 463109225472.0, + "grad_norm": 0.05232146419695497, + "language_loss": 0.8178426, + "learning_rate": 0.00033521758861821365, + "loss": 0.82863367, + "num_input_tokens_seen": 268328256, + "router_z_loss_mlp": 0.37158203, + "step": 3216, + "time_per_iteration": 2.566434144973755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107802, + "balance_loss_mlp": 1.04132736, + "epoch": 0.6188918814928819, + "flos": 485029997568.0, + "grad_norm": 0.044556879100730015, + "language_loss": 0.88947988, + "learning_rate": 0.0003349234832641479, + "loss": 0.90026009, + "num_input_tokens_seen": 268394016, + "router_z_loss_mlp": 0.36669922, + "step": 3217, + "time_per_iteration": 2.5626957416534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087401, + "balance_loss_mlp": 1.05027926, + "epoch": 0.6190842631781455, + "flos": 656985641472.0, + "grad_norm": 0.056610001609600974, + "language_loss": 0.81178546, + "learning_rate": 0.00033462944199846975, + "loss": 0.82265949, + "num_input_tokens_seen": 268478512, + "router_z_loss_mlp": 0.37109375, + "step": 3218, + "time_per_iteration": 3.038856267929077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091964, + "balance_loss_mlp": 1.054842, + "epoch": 0.619276644863409, + "flos": 403388011008.0, + "grad_norm": 0.051099399179052, + "language_loss": 0.86047733, + "learning_rate": 0.00033433546493533606, + "loss": 0.87139696, + "num_input_tokens_seen": 268540304, + "router_z_loss_mlp": 0.37109375, + "step": 3219, + "time_per_iteration": 2.4660589694976807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086562, + "balance_loss_mlp": 1.04913092, + "epoch": 0.6194690265486725, + "flos": 582807730176.0, + "grad_norm": 0.07929462737326079, + "language_loss": 0.84635407, + "learning_rate": 0.00033404155218887897, + "loss": 0.8572197, + "num_input_tokens_seen": 268611136, + "router_z_loss_mlp": 0.37402344, + "step": 3220, + "time_per_iteration": 2.7270491123199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087776, + "balance_loss_mlp": 1.05127466, + "epoch": 0.6196614082339361, + "flos": 503963638272.0, + "grad_norm": 0.04746710197063832, + "language_loss": 0.87405616, + "learning_rate": 0.00033374770387320534, + "loss": 0.88493389, + "num_input_tokens_seen": 268684992, + "router_z_loss_mlp": 0.36499023, + "step": 3221, + "time_per_iteration": 2.7464041709899902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086025, + "balance_loss_mlp": 1.04957032, + "epoch": 0.6198537899191997, + "flos": 575131249152.0, + "grad_norm": 0.04828799044899351, + "language_loss": 0.84905434, + "learning_rate": 0.00033345392010239737, + "loss": 0.85991454, + "num_input_tokens_seen": 268758096, + "router_z_loss_mlp": 0.36425781, + "step": 3222, + "time_per_iteration": 2.7124643325805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090712, + "balance_loss_mlp": 1.05432916, + "epoch": 0.6200461716044633, + "flos": 592871851008.0, + "grad_norm": 0.05455186914626242, + "language_loss": 0.8191222, + "learning_rate": 0.0003331602009905118, + "loss": 0.83002931, + "num_input_tokens_seen": 268834432, + "router_z_loss_mlp": 0.36376953, + "step": 3223, + "time_per_iteration": 2.7330005168914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084321, + "balance_loss_mlp": 1.04696107, + "epoch": 0.6202385532897268, + "flos": 665765001216.0, + "grad_norm": 0.046947333266423794, + "language_loss": 0.83694303, + "learning_rate": 0.00033286654665158085, + "loss": 0.84778625, + "num_input_tokens_seen": 268921168, + "router_z_loss_mlp": 0.37329102, + "step": 3224, + "time_per_iteration": 2.937727689743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087568, + "balance_loss_mlp": 1.0515908, + "epoch": 0.6204309349749904, + "flos": 484709902848.0, + "grad_norm": 0.0575064293586871, + "language_loss": 0.87672997, + "learning_rate": 0.0003325729571996109, + "loss": 0.88760567, + "num_input_tokens_seen": 268991440, + "router_z_loss_mlp": 0.36010742, + "step": 3225, + "time_per_iteration": 2.6319355964660645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085695, + "balance_loss_mlp": 1.04919314, + "epoch": 0.6206233166602539, + "flos": 583768014336.0, + "grad_norm": 0.048737024704114895, + "language_loss": 0.83402115, + "learning_rate": 0.000332279432748584, + "loss": 0.84487808, + "num_input_tokens_seen": 269061024, + "router_z_loss_mlp": 0.36523438, + "step": 3226, + "time_per_iteration": 2.733870029449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010819, + "balance_loss_mlp": 1.04656696, + "epoch": 0.6208156983455175, + "flos": 476669657088.0, + "grad_norm": 0.0460557240454385, + "language_loss": 0.87514353, + "learning_rate": 0.00033198597341245576, + "loss": 0.88596255, + "num_input_tokens_seen": 269130560, + "router_z_loss_mlp": 0.35375977, + "step": 3227, + "time_per_iteration": 2.567084789276123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081012, + "balance_loss_mlp": 1.04420066, + "epoch": 0.6210080800307811, + "flos": 788716887552.0, + "grad_norm": 0.07539791999679457, + "language_loss": 0.81657004, + "learning_rate": 0.00033169257930515763, + "loss": 0.82738018, + "num_input_tokens_seen": 269213280, + "router_z_loss_mlp": 0.36816406, + "step": 3228, + "time_per_iteration": 3.074739694595337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084245, + "balance_loss_mlp": 1.04655147, + "epoch": 0.6212004617160446, + "flos": 607514812416.0, + "grad_norm": 0.05269169473042375, + "language_loss": 0.82430172, + "learning_rate": 0.0003313992505405951, + "loss": 0.83514416, + "num_input_tokens_seen": 269286384, + "router_z_loss_mlp": 0.37695312, + "step": 3229, + "time_per_iteration": 2.711282730102539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079533, + "balance_loss_mlp": 1.04305458, + "epoch": 0.6213928434013082, + "flos": 586248786432.0, + "grad_norm": 0.05753494770574613, + "language_loss": 0.8075214, + "learning_rate": 0.0003311059872326487, + "loss": 0.81831676, + "num_input_tokens_seen": 269353296, + "router_z_loss_mlp": 0.36474609, + "step": 3230, + "time_per_iteration": 2.6755940914154053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082159, + "balance_loss_mlp": 1.04467952, + "epoch": 0.6215852250865718, + "flos": 535819083264.0, + "grad_norm": 0.04907016045640681, + "language_loss": 0.79111725, + "learning_rate": 0.0003308127894951734, + "loss": 0.80193883, + "num_input_tokens_seen": 269422304, + "router_z_loss_mlp": 0.37426758, + "step": 3231, + "time_per_iteration": 2.612122058868408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086128, + "balance_loss_mlp": 1.04893494, + "epoch": 0.6217776067718354, + "flos": 617884471296.0, + "grad_norm": 0.0640423801123885, + "language_loss": 0.86435384, + "learning_rate": 0.00033051965744199834, + "loss": 0.87521511, + "num_input_tokens_seen": 269498784, + "router_z_loss_mlp": 0.37133789, + "step": 3232, + "time_per_iteration": 2.734384059906006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107468, + "balance_loss_mlp": 1.03913224, + "epoch": 0.6219699884570988, + "flos": 545570311680.0, + "grad_norm": 0.045255868700115984, + "language_loss": 0.90312266, + "learning_rate": 0.0003302265911869276, + "loss": 0.91386944, + "num_input_tokens_seen": 269581264, + "router_z_loss_mlp": 0.35571289, + "step": 3233, + "time_per_iteration": 2.9088501930236816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079236, + "balance_loss_mlp": 1.04216146, + "epoch": 0.6221623701423624, + "flos": 480899289600.0, + "grad_norm": 0.054924545254622516, + "language_loss": 0.83717418, + "learning_rate": 0.0003299335908437397, + "loss": 0.84796649, + "num_input_tokens_seen": 269649408, + "router_z_loss_mlp": 0.37060547, + "step": 3234, + "time_per_iteration": 2.5804450511932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077942, + "balance_loss_mlp": 1.04062915, + "epoch": 0.622354751827626, + "flos": 379812920832.0, + "grad_norm": 0.0810547632839198, + "language_loss": 0.80174738, + "learning_rate": 0.0003296406565261873, + "loss": 0.81252682, + "num_input_tokens_seen": 269711648, + "router_z_loss_mlp": 0.37304688, + "step": 3235, + "time_per_iteration": 2.480074405670166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072254, + "balance_loss_mlp": 1.03610981, + "epoch": 0.6225471335128896, + "flos": 667570678272.0, + "grad_norm": 0.04590561718028109, + "language_loss": 0.84757555, + "learning_rate": 0.0003293477883479978, + "loss": 0.85829806, + "num_input_tokens_seen": 269787376, + "router_z_loss_mlp": 0.36181641, + "step": 3236, + "time_per_iteration": 2.8077552318573 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107691, + "balance_loss_mlp": 1.03909636, + "epoch": 0.6227395151981532, + "flos": 770995224576.0, + "grad_norm": 0.06134325459280444, + "language_loss": 0.79419619, + "learning_rate": 0.0003290549864228727, + "loss": 0.80496532, + "num_input_tokens_seen": 269863008, + "router_z_loss_mlp": 0.37768555, + "step": 3237, + "time_per_iteration": 2.9485511779785156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078391, + "balance_loss_mlp": 1.04084027, + "epoch": 0.6229318968834167, + "flos": 484104619008.0, + "grad_norm": 0.04787340801425507, + "language_loss": 0.86647016, + "learning_rate": 0.0003287622508644875, + "loss": 0.87725413, + "num_input_tokens_seen": 269939552, + "router_z_loss_mlp": 0.37548828, + "step": 3238, + "time_per_iteration": 2.723003387451172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072928, + "balance_loss_mlp": 1.0360688, + "epoch": 0.6231242785686802, + "flos": 462700380672.0, + "grad_norm": 0.08533340323107003, + "language_loss": 0.86471462, + "learning_rate": 0.0003284695817864923, + "loss": 0.87544394, + "num_input_tokens_seen": 270002752, + "router_z_loss_mlp": 0.36865234, + "step": 3239, + "time_per_iteration": 2.4788854122161865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079076, + "balance_loss_mlp": 1.04231155, + "epoch": 0.6233166602539438, + "flos": 608809747968.0, + "grad_norm": 0.06356990340371446, + "language_loss": 0.83732104, + "learning_rate": 0.0003281769793025116, + "loss": 0.84811181, + "num_input_tokens_seen": 270075696, + "router_z_loss_mlp": 0.36791992, + "step": 3240, + "time_per_iteration": 2.68833065032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071809, + "balance_loss_mlp": 1.03542674, + "epoch": 0.6235090419392074, + "flos": 438972521472.0, + "grad_norm": 0.05773237210342904, + "language_loss": 0.89384484, + "learning_rate": 0.00032788444352614346, + "loss": 0.90456295, + "num_input_tokens_seen": 270139872, + "router_z_loss_mlp": 0.36425781, + "step": 3241, + "time_per_iteration": 2.485630512237549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073948, + "balance_loss_mlp": 1.03706515, + "epoch": 0.6237014236244709, + "flos": 504656262144.0, + "grad_norm": 0.05916154923857777, + "language_loss": 0.80431205, + "learning_rate": 0.0003275919745709606, + "loss": 0.81505156, + "num_input_tokens_seen": 270206752, + "router_z_loss_mlp": 0.36889648, + "step": 3242, + "time_per_iteration": 2.557732582092285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073489, + "balance_loss_mlp": 1.03710628, + "epoch": 0.6238938053097345, + "flos": 512648455680.0, + "grad_norm": 0.047494752086082274, + "language_loss": 0.82139623, + "learning_rate": 0.00032729957255050936, + "loss": 0.83213103, + "num_input_tokens_seen": 270275472, + "router_z_loss_mlp": 0.36376953, + "step": 3243, + "time_per_iteration": 2.653381586074829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075548, + "balance_loss_mlp": 1.03799748, + "epoch": 0.6240861869949981, + "flos": 736435017216.0, + "grad_norm": 0.07878714918390893, + "language_loss": 0.81488502, + "learning_rate": 0.0003270072375783102, + "loss": 0.8256405, + "num_input_tokens_seen": 270348336, + "router_z_loss_mlp": 0.37524414, + "step": 3244, + "time_per_iteration": 2.893857717514038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068875, + "balance_loss_mlp": 1.03244424, + "epoch": 0.6242785686802617, + "flos": 494464103424.0, + "grad_norm": 0.05659954005953207, + "language_loss": 0.79646188, + "learning_rate": 0.00032671496976785774, + "loss": 0.8071506, + "num_input_tokens_seen": 270416496, + "router_z_loss_mlp": 0.36425781, + "step": 3245, + "time_per_iteration": 2.5836338996887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072633, + "balance_loss_mlp": 1.03536868, + "epoch": 0.6244709503655252, + "flos": 745500976128.0, + "grad_norm": 0.04683044918703509, + "language_loss": 0.75894988, + "learning_rate": 0.0003264227692326205, + "loss": 0.76967621, + "num_input_tokens_seen": 270501680, + "router_z_loss_mlp": 0.37231445, + "step": 3246, + "time_per_iteration": 3.0129404067993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071862, + "balance_loss_mlp": 1.03524101, + "epoch": 0.6246633320507887, + "flos": 492366034944.0, + "grad_norm": 0.053825278075075034, + "language_loss": 0.85644072, + "learning_rate": 0.00032613063608604055, + "loss": 0.86715937, + "num_input_tokens_seen": 270568656, + "router_z_loss_mlp": 0.36645508, + "step": 3247, + "time_per_iteration": 2.5503756999969482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078462, + "balance_loss_mlp": 1.0416261, + "epoch": 0.6248557137360523, + "flos": 517142928384.0, + "grad_norm": 0.04781520773103446, + "language_loss": 0.8331461, + "learning_rate": 0.0003258385704415343, + "loss": 0.84393072, + "num_input_tokens_seen": 270636160, + "router_z_loss_mlp": 0.36816406, + "step": 3248, + "time_per_iteration": 2.560483455657959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079309, + "balance_loss_mlp": 1.04161501, + "epoch": 0.6250480954213159, + "flos": 519098402304.0, + "grad_norm": 0.04627181605828338, + "language_loss": 0.83052945, + "learning_rate": 0.0003255465724124915, + "loss": 0.84132254, + "num_input_tokens_seen": 270708816, + "router_z_loss_mlp": 0.37670898, + "step": 3249, + "time_per_iteration": 2.7024102210998535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075461, + "balance_loss_mlp": 1.03776741, + "epoch": 0.6252404771065795, + "flos": 515808705024.0, + "grad_norm": 0.04699281003283387, + "language_loss": 0.82845968, + "learning_rate": 0.00032525464211227587, + "loss": 0.83921427, + "num_input_tokens_seen": 270778016, + "router_z_loss_mlp": 0.37646484, + "step": 3250, + "time_per_iteration": 2.5934925079345703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073962, + "balance_loss_mlp": 1.03712666, + "epoch": 0.6254328587918431, + "flos": 576647354880.0, + "grad_norm": 0.05335085924079445, + "language_loss": 0.85498369, + "learning_rate": 0.0003249627796542249, + "loss": 0.86572331, + "num_input_tokens_seen": 270847072, + "router_z_loss_mlp": 0.36816406, + "step": 3251, + "time_per_iteration": 2.6473746299743652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107209, + "balance_loss_mlp": 1.03472972, + "epoch": 0.6256252404771065, + "flos": 597638366208.0, + "grad_norm": 0.05949551618026705, + "language_loss": 0.83974731, + "learning_rate": 0.00032467098515164943, + "loss": 0.85046822, + "num_input_tokens_seen": 270926320, + "router_z_loss_mlp": 0.37353516, + "step": 3252, + "time_per_iteration": 2.8618545532226562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074504, + "balance_loss_mlp": 1.03776419, + "epoch": 0.6258176221623701, + "flos": 508034709504.0, + "grad_norm": 0.05339688957223288, + "language_loss": 0.83978283, + "learning_rate": 0.00032437925871783456, + "loss": 0.85052788, + "num_input_tokens_seen": 270997904, + "router_z_loss_mlp": 0.36767578, + "step": 3253, + "time_per_iteration": 2.6301941871643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074357, + "balance_loss_mlp": 1.03680658, + "epoch": 0.6260100038476337, + "flos": 639357110784.0, + "grad_norm": 0.06013661875979651, + "language_loss": 0.84100354, + "learning_rate": 0.00032408760046603803, + "loss": 0.85174716, + "num_input_tokens_seen": 271074256, + "router_z_loss_mlp": 0.37548828, + "step": 3254, + "time_per_iteration": 2.798520565032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076909, + "balance_loss_mlp": 1.03923869, + "epoch": 0.6262023855328973, + "flos": 840648139776.0, + "grad_norm": 0.05406777705406554, + "language_loss": 0.77436024, + "learning_rate": 0.00032379601050949193, + "loss": 0.78512931, + "num_input_tokens_seen": 271155152, + "router_z_loss_mlp": 0.3762207, + "step": 3255, + "time_per_iteration": 3.0876083374023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075377, + "balance_loss_mlp": 1.03746879, + "epoch": 0.6263947672181608, + "flos": 521884712448.0, + "grad_norm": 0.05001529336146337, + "language_loss": 0.8825866, + "learning_rate": 0.0003235044889614013, + "loss": 0.89334035, + "num_input_tokens_seen": 271224784, + "router_z_loss_mlp": 0.37866211, + "step": 3256, + "time_per_iteration": 2.616588592529297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079768, + "balance_loss_mlp": 1.04221702, + "epoch": 0.6265871489034244, + "flos": 606747995136.0, + "grad_norm": 0.049239400336598835, + "language_loss": 0.83356363, + "learning_rate": 0.0003232130359349451, + "loss": 0.84436131, + "num_input_tokens_seen": 271303584, + "router_z_loss_mlp": 0.37524414, + "step": 3257, + "time_per_iteration": 2.8224074840545654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083632, + "balance_loss_mlp": 1.04474616, + "epoch": 0.626779530588688, + "flos": 588208642560.0, + "grad_norm": 0.04846319258982293, + "language_loss": 0.81674659, + "learning_rate": 0.0003229216515432751, + "loss": 0.8275829, + "num_input_tokens_seen": 271379632, + "router_z_loss_mlp": 0.38842773, + "step": 3258, + "time_per_iteration": 2.78884220123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081861, + "balance_loss_mlp": 1.0438329, + "epoch": 0.6269719122739515, + "flos": 438381794304.0, + "grad_norm": 0.061777321686694836, + "language_loss": 0.79815853, + "learning_rate": 0.0003226303358995174, + "loss": 0.80897713, + "num_input_tokens_seen": 271447808, + "router_z_loss_mlp": 0.37988281, + "step": 3259, + "time_per_iteration": 2.625014305114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108494, + "balance_loss_mlp": 1.0462687, + "epoch": 0.6271642939592151, + "flos": 562590738432.0, + "grad_norm": 0.04793696937542698, + "language_loss": 0.8911407, + "learning_rate": 0.00032233908911677, + "loss": 0.90199006, + "num_input_tokens_seen": 271526768, + "router_z_loss_mlp": 0.38623047, + "step": 3260, + "time_per_iteration": 2.8619987964630127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081144, + "balance_loss_mlp": 1.04194832, + "epoch": 0.6273566756444786, + "flos": 514288217088.0, + "grad_norm": 0.06578723917558563, + "language_loss": 0.80680311, + "learning_rate": 0.0003220479113081053, + "loss": 0.81761456, + "num_input_tokens_seen": 271597840, + "router_z_loss_mlp": 0.3918457, + "step": 3261, + "time_per_iteration": 2.7102510929107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080352, + "balance_loss_mlp": 1.04270554, + "epoch": 0.6275490573297422, + "flos": 585195369984.0, + "grad_norm": 0.0548628003226281, + "language_loss": 0.78727174, + "learning_rate": 0.00032175680258656836, + "loss": 0.79807532, + "num_input_tokens_seen": 271668352, + "router_z_loss_mlp": 0.3762207, + "step": 3262, + "time_per_iteration": 2.696701765060425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083558, + "balance_loss_mlp": 1.04600739, + "epoch": 0.6277414390150058, + "flos": 559143889920.0, + "grad_norm": 0.044574681461427054, + "language_loss": 0.80117631, + "learning_rate": 0.00032146576306517794, + "loss": 0.81201196, + "num_input_tokens_seen": 271743936, + "router_z_loss_mlp": 0.37524414, + "step": 3263, + "time_per_iteration": 2.764273166656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077314, + "balance_loss_mlp": 1.03873789, + "epoch": 0.6279338207002694, + "flos": 612423922176.0, + "grad_norm": 0.04659103791946159, + "language_loss": 0.80601645, + "learning_rate": 0.0003211747928569255, + "loss": 0.81678957, + "num_input_tokens_seen": 271817008, + "router_z_loss_mlp": 0.38525391, + "step": 3264, + "time_per_iteration": 2.741144895553589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077333, + "balance_loss_mlp": 1.03906703, + "epoch": 0.6281262023855329, + "flos": 625374687744.0, + "grad_norm": 0.044995138284684974, + "language_loss": 0.81407869, + "learning_rate": 0.0003208838920747754, + "loss": 0.82485199, + "num_input_tokens_seen": 271896960, + "router_z_loss_mlp": 0.38208008, + "step": 3265, + "time_per_iteration": 2.8458306789398193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075806, + "balance_loss_mlp": 1.03753948, + "epoch": 0.6283185840707964, + "flos": 1123147579392.0, + "grad_norm": 0.051347706918532285, + "language_loss": 0.76555598, + "learning_rate": 0.0003205930608316656, + "loss": 0.77631402, + "num_input_tokens_seen": 271985008, + "router_z_loss_mlp": 0.38232422, + "step": 3266, + "time_per_iteration": 3.5019400119781494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074561, + "balance_loss_mlp": 1.03631854, + "epoch": 0.62851096575606, + "flos": 514967694336.0, + "grad_norm": 0.055036634994397565, + "language_loss": 0.84812629, + "learning_rate": 0.00032030229924050673, + "loss": 0.85887194, + "num_input_tokens_seen": 272056368, + "router_z_loss_mlp": 0.38183594, + "step": 3267, + "time_per_iteration": 2.6514573097229004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107346, + "balance_loss_mlp": 1.03495502, + "epoch": 0.6287033474413236, + "flos": 403949624832.0, + "grad_norm": 0.06092252438961513, + "language_loss": 0.79938138, + "learning_rate": 0.00032001160741418247, + "loss": 0.81011593, + "num_input_tokens_seen": 272123424, + "router_z_loss_mlp": 0.38452148, + "step": 3268, + "time_per_iteration": 2.6364564895629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076302, + "balance_loss_mlp": 1.03765488, + "epoch": 0.6288957291265872, + "flos": 525459598848.0, + "grad_norm": 0.06432688235517753, + "language_loss": 0.81921297, + "learning_rate": 0.0003197209854655494, + "loss": 0.82997596, + "num_input_tokens_seen": 272193008, + "router_z_loss_mlp": 0.38623047, + "step": 3269, + "time_per_iteration": 2.6190736293792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072941, + "balance_loss_mlp": 1.03531849, + "epoch": 0.6290881108118507, + "flos": 603414627840.0, + "grad_norm": 0.059396512475175293, + "language_loss": 0.74762654, + "learning_rate": 0.0003194304335074371, + "loss": 0.75835598, + "num_input_tokens_seen": 272275328, + "router_z_loss_mlp": 0.3762207, + "step": 3270, + "time_per_iteration": 2.829658031463623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069412, + "balance_loss_mlp": 1.03190899, + "epoch": 0.6292804924971143, + "flos": 437446241280.0, + "grad_norm": 0.057734053913976915, + "language_loss": 0.8848114, + "learning_rate": 0.0003191399516526475, + "loss": 0.89550555, + "num_input_tokens_seen": 272339328, + "router_z_loss_mlp": 0.37451172, + "step": 3271, + "time_per_iteration": 2.520371675491333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107074, + "balance_loss_mlp": 1.03369021, + "epoch": 0.6294728741823779, + "flos": 606368263680.0, + "grad_norm": 0.05065852355738081, + "language_loss": 0.79438859, + "learning_rate": 0.0003188495400139559, + "loss": 0.80509603, + "num_input_tokens_seen": 272416336, + "router_z_loss_mlp": 0.37060547, + "step": 3272, + "time_per_iteration": 2.771045207977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070525, + "balance_loss_mlp": 1.03354681, + "epoch": 0.6296652558676414, + "flos": 701220063744.0, + "grad_norm": 0.05978567707870047, + "language_loss": 0.84609801, + "learning_rate": 0.00031855919870411013, + "loss": 0.8568033, + "num_input_tokens_seen": 272490368, + "router_z_loss_mlp": 0.36987305, + "step": 3273, + "time_per_iteration": 2.8209264278411865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072859, + "balance_loss_mlp": 1.03516483, + "epoch": 0.6298576375529049, + "flos": 523652511744.0, + "grad_norm": 0.05543489609660157, + "language_loss": 0.85005689, + "learning_rate": 0.0003182689278358305, + "loss": 0.86078548, + "num_input_tokens_seen": 272562992, + "router_z_loss_mlp": 0.37646484, + "step": 3274, + "time_per_iteration": 2.6735117435455322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069939, + "balance_loss_mlp": 1.03360391, + "epoch": 0.6300500192381685, + "flos": 475723929600.0, + "grad_norm": 0.06241690898076668, + "language_loss": 0.79779917, + "learning_rate": 0.0003179787275218105, + "loss": 0.80849856, + "num_input_tokens_seen": 272629456, + "router_z_loss_mlp": 0.36352539, + "step": 3275, + "time_per_iteration": 2.5281076431274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071447, + "balance_loss_mlp": 1.03394365, + "epoch": 0.6302424009234321, + "flos": 520629064704.0, + "grad_norm": 0.04860664523501564, + "language_loss": 0.83985364, + "learning_rate": 0.0003176885978747155, + "loss": 0.85056806, + "num_input_tokens_seen": 272697440, + "router_z_loss_mlp": 0.375, + "step": 3276, + "time_per_iteration": 2.590137243270874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073206, + "balance_loss_mlp": 1.03594065, + "epoch": 0.6304347826086957, + "flos": 694282696704.0, + "grad_norm": 0.06745994429641342, + "language_loss": 0.82557893, + "learning_rate": 0.0003173985390071839, + "loss": 0.83631098, + "num_input_tokens_seen": 272774080, + "router_z_loss_mlp": 0.37207031, + "step": 3277, + "time_per_iteration": 2.835454225540161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026014, + "balance_loss_mlp": 1.01476038, + "epoch": 0.6306271642939593, + "flos": 1466067755520.0, + "grad_norm": 0.018393176098041853, + "language_loss": 0.77900457, + "learning_rate": 0.00031710855103182675, + "loss": 0.78926468, + "num_input_tokens_seen": 272998512, + "router_z_loss_mlp": 0.11230469, + "step": 3278, + "time_per_iteration": 4.83237099647522 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071213, + "balance_loss_mlp": 1.03440166, + "epoch": 0.6308195459792227, + "flos": 601444597248.0, + "grad_norm": 0.05391474190589518, + "language_loss": 0.8122592, + "learning_rate": 0.00031681863406122704, + "loss": 0.82297128, + "num_input_tokens_seen": 273074672, + "router_z_loss_mlp": 0.36816406, + "step": 3279, + "time_per_iteration": 2.7689826488494873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071032, + "balance_loss_mlp": 1.03381503, + "epoch": 0.6310119276644863, + "flos": 726514900992.0, + "grad_norm": 0.04523972239140451, + "language_loss": 0.85147464, + "learning_rate": 0.00031652878820794087, + "loss": 0.86218488, + "num_input_tokens_seen": 273157904, + "router_z_loss_mlp": 0.37207031, + "step": 3280, + "time_per_iteration": 2.973525047302246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074703, + "balance_loss_mlp": 1.03762913, + "epoch": 0.6312043093497499, + "flos": 519482515968.0, + "grad_norm": 0.0661931076661352, + "language_loss": 0.85199058, + "learning_rate": 0.00031623901358449627, + "loss": 0.86273754, + "num_input_tokens_seen": 273228160, + "router_z_loss_mlp": 0.37060547, + "step": 3281, + "time_per_iteration": 2.6226651668548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074897, + "balance_loss_mlp": 1.03860974, + "epoch": 0.6313966910350135, + "flos": 530934704640.0, + "grad_norm": 0.050825479700673346, + "language_loss": 0.88810539, + "learning_rate": 0.0003159493103033936, + "loss": 0.89885437, + "num_input_tokens_seen": 273295872, + "router_z_loss_mlp": 0.36303711, + "step": 3282, + "time_per_iteration": 2.601001262664795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01022599, + "balance_loss_mlp": 1.01163197, + "epoch": 0.631589072720277, + "flos": 1379113606656.0, + "grad_norm": 0.015722809882928884, + "language_loss": 0.79919052, + "learning_rate": 0.00031565967847710564, + "loss": 0.80941653, + "num_input_tokens_seen": 273524320, + "router_z_loss_mlp": 0.10986328, + "step": 3283, + "time_per_iteration": 4.848982334136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075752, + "balance_loss_mlp": 1.03774858, + "epoch": 0.6317814544055406, + "flos": 624379497984.0, + "grad_norm": 0.05495466978446473, + "language_loss": 0.82262814, + "learning_rate": 0.0003153701182180776, + "loss": 0.83338571, + "num_input_tokens_seen": 273598544, + "router_z_loss_mlp": 0.37939453, + "step": 3284, + "time_per_iteration": 2.767197608947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074338, + "balance_loss_mlp": 1.03759754, + "epoch": 0.6319738360908042, + "flos": 497876046336.0, + "grad_norm": 0.052075562898617506, + "language_loss": 0.81654066, + "learning_rate": 0.00031508062963872655, + "loss": 0.82728398, + "num_input_tokens_seen": 273666000, + "router_z_loss_mlp": 0.36743164, + "step": 3285, + "time_per_iteration": 2.6035704612731934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076725, + "balance_loss_mlp": 1.03836393, + "epoch": 0.6321662177760677, + "flos": 579474362880.0, + "grad_norm": 0.07288308638623867, + "language_loss": 0.79200375, + "learning_rate": 0.0003147912128514423, + "loss": 0.80277097, + "num_input_tokens_seen": 273742672, + "router_z_loss_mlp": 0.38330078, + "step": 3286, + "time_per_iteration": 2.716641426086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076397, + "balance_loss_mlp": 1.04046774, + "epoch": 0.6323585994613313, + "flos": 601207460352.0, + "grad_norm": 0.06971940923573844, + "language_loss": 0.8695125, + "learning_rate": 0.0003145018679685859, + "loss": 0.88027644, + "num_input_tokens_seen": 273813984, + "router_z_loss_mlp": 0.35913086, + "step": 3287, + "time_per_iteration": 2.7455978393554688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107146, + "balance_loss_mlp": 1.03579235, + "epoch": 0.6325509811465948, + "flos": 528261875712.0, + "grad_norm": 0.04384193895060619, + "language_loss": 0.8763777, + "learning_rate": 0.00031421259510249134, + "loss": 0.88709229, + "num_input_tokens_seen": 273892848, + "router_z_loss_mlp": 0.35717773, + "step": 3288, + "time_per_iteration": 2.760524034500122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070765, + "balance_loss_mlp": 1.03397667, + "epoch": 0.6327433628318584, + "flos": 573993464832.0, + "grad_norm": 0.05235334627417233, + "language_loss": 0.81302404, + "learning_rate": 0.00031392339436546414, + "loss": 0.82373166, + "num_input_tokens_seen": 273971696, + "router_z_loss_mlp": 0.36791992, + "step": 3289, + "time_per_iteration": 2.8397610187530518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075193, + "balance_loss_mlp": 1.03876281, + "epoch": 0.632935744517122, + "flos": 516833008128.0, + "grad_norm": 0.06389388194591325, + "language_loss": 0.83106172, + "learning_rate": 0.00031363426586978205, + "loss": 0.84181368, + "num_input_tokens_seen": 274048096, + "router_z_loss_mlp": 0.36450195, + "step": 3290, + "time_per_iteration": 2.7519772052764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071007, + "balance_loss_mlp": 1.03438592, + "epoch": 0.6331281262023856, + "flos": 617180262912.0, + "grad_norm": 0.051172787966305235, + "language_loss": 0.84358442, + "learning_rate": 0.0003133452097276947, + "loss": 0.85429454, + "num_input_tokens_seen": 274122848, + "router_z_loss_mlp": 0.3659668, + "step": 3291, + "time_per_iteration": 2.7666964530944824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066726, + "balance_loss_mlp": 1.03060579, + "epoch": 0.633320507887649, + "flos": 592665237504.0, + "grad_norm": 0.04649406007551123, + "language_loss": 0.84316128, + "learning_rate": 0.0003130562260514238, + "loss": 0.85382849, + "num_input_tokens_seen": 274198320, + "router_z_loss_mlp": 0.36132812, + "step": 3292, + "time_per_iteration": 2.7349252700805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107055, + "balance_loss_mlp": 1.03373802, + "epoch": 0.6335128895729126, + "flos": 582064233984.0, + "grad_norm": 0.04554083300278307, + "language_loss": 0.81461787, + "learning_rate": 0.0003127673149531626, + "loss": 0.82532346, + "num_input_tokens_seen": 274274944, + "router_z_loss_mlp": 0.36791992, + "step": 3293, + "time_per_iteration": 2.777203321456909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068218, + "balance_loss_mlp": 1.03150177, + "epoch": 0.6337052712581762, + "flos": 452803585536.0, + "grad_norm": 0.06587876286418191, + "language_loss": 0.83099329, + "learning_rate": 0.0003124784765450762, + "loss": 0.84167558, + "num_input_tokens_seen": 274342384, + "router_z_loss_mlp": 0.3671875, + "step": 3294, + "time_per_iteration": 2.5272936820983887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076823, + "balance_loss_mlp": 1.0392009, + "epoch": 0.6338976529434398, + "flos": 573132105216.0, + "grad_norm": 0.07565645338931325, + "language_loss": 0.80265319, + "learning_rate": 0.0003121897109393017, + "loss": 0.81342143, + "num_input_tokens_seen": 274417568, + "router_z_loss_mlp": 0.37597656, + "step": 3295, + "time_per_iteration": 2.7182729244232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069809, + "balance_loss_mlp": 1.03318739, + "epoch": 0.6340900346287034, + "flos": 508497398784.0, + "grad_norm": 0.45372890194936744, + "language_loss": 0.89147079, + "learning_rate": 0.0003119010182479481, + "loss": 0.90216893, + "num_input_tokens_seen": 274488960, + "router_z_loss_mlp": 0.36621094, + "step": 3296, + "time_per_iteration": 2.613863706588745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076319, + "balance_loss_mlp": 1.0396266, + "epoch": 0.6342824163139669, + "flos": 479505429504.0, + "grad_norm": 0.05534198375005729, + "language_loss": 0.82468164, + "learning_rate": 0.00031161239858309563, + "loss": 0.83544481, + "num_input_tokens_seen": 274556880, + "router_z_loss_mlp": 0.36669922, + "step": 3297, + "time_per_iteration": 2.581540822982788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107642, + "balance_loss_mlp": 1.03917897, + "epoch": 0.6344747979992305, + "flos": 571762976256.0, + "grad_norm": 0.05796983524113203, + "language_loss": 0.8309406, + "learning_rate": 0.0003113238520567964, + "loss": 0.84170485, + "num_input_tokens_seen": 274624944, + "router_z_loss_mlp": 0.37182617, + "step": 3298, + "time_per_iteration": 2.666191816329956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077442, + "balance_loss_mlp": 1.04082084, + "epoch": 0.634667179684494, + "flos": 605629149696.0, + "grad_norm": 0.056114886928888365, + "language_loss": 0.81702375, + "learning_rate": 0.00031103537878107403, + "loss": 0.82779819, + "num_input_tokens_seen": 274695152, + "router_z_loss_mlp": 0.36621094, + "step": 3299, + "time_per_iteration": 2.7362561225891113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080646, + "balance_loss_mlp": 1.04311848, + "epoch": 0.6348595613697576, + "flos": 646649478144.0, + "grad_norm": 0.06007496440704036, + "language_loss": 0.80261421, + "learning_rate": 0.0003107469788679238, + "loss": 0.81342065, + "num_input_tokens_seen": 274767840, + "router_z_loss_mlp": 0.37475586, + "step": 3300, + "time_per_iteration": 2.756533622741699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073194, + "balance_loss_mlp": 1.03597736, + "epoch": 0.6350519430550212, + "flos": 638776558080.0, + "grad_norm": 0.05358633946635087, + "language_loss": 0.86808562, + "learning_rate": 0.00031045865242931267, + "loss": 0.87881756, + "num_input_tokens_seen": 274839312, + "router_z_loss_mlp": 0.37207031, + "step": 3301, + "time_per_iteration": 2.829094171524048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080134, + "balance_loss_mlp": 1.043203, + "epoch": 0.6352443247402847, + "flos": 686091091968.0, + "grad_norm": 0.0476034793432377, + "language_loss": 0.83036846, + "learning_rate": 0.00031017039957717877, + "loss": 0.84116983, + "num_input_tokens_seen": 274922704, + "router_z_loss_mlp": 0.36938477, + "step": 3302, + "time_per_iteration": 2.9974441528320312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073106, + "balance_loss_mlp": 1.03588891, + "epoch": 0.6354367064255483, + "flos": 559173003264.0, + "grad_norm": 0.056110934582374906, + "language_loss": 0.88712031, + "learning_rate": 0.0003098822204234318, + "loss": 0.89785135, + "num_input_tokens_seen": 274992848, + "router_z_loss_mlp": 0.37207031, + "step": 3303, + "time_per_iteration": 2.6585702896118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076324, + "balance_loss_mlp": 1.03984571, + "epoch": 0.6356290881108119, + "flos": 979095582720.0, + "grad_norm": 0.062320507603927815, + "language_loss": 0.8736068, + "learning_rate": 0.00030959411507995273, + "loss": 0.88437009, + "num_input_tokens_seen": 275071456, + "router_z_loss_mlp": 0.36499023, + "step": 3304, + "time_per_iteration": 3.2019383907318115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079953, + "balance_loss_mlp": 1.04299855, + "epoch": 0.6358214697960755, + "flos": 528005799936.0, + "grad_norm": 0.05770730921560322, + "language_loss": 0.80951726, + "learning_rate": 0.00030930608365859407, + "loss": 0.82031679, + "num_input_tokens_seen": 275140512, + "router_z_loss_mlp": 0.36938477, + "step": 3305, + "time_per_iteration": 2.6649279594421387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073678, + "balance_loss_mlp": 1.03793883, + "epoch": 0.6360138514813389, + "flos": 516547819008.0, + "grad_norm": 0.050398763649548706, + "language_loss": 0.87612951, + "learning_rate": 0.00030901812627117943, + "loss": 0.88686621, + "num_input_tokens_seen": 275210896, + "router_z_loss_mlp": 0.35791016, + "step": 3306, + "time_per_iteration": 2.6524715423583984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072331, + "balance_loss_mlp": 1.0352571, + "epoch": 0.6362062331666025, + "flos": 466289823744.0, + "grad_norm": 0.06392175432949986, + "language_loss": 0.84607399, + "learning_rate": 0.000308730243029504, + "loss": 0.85679734, + "num_input_tokens_seen": 275279888, + "router_z_loss_mlp": 0.37084961, + "step": 3307, + "time_per_iteration": 2.619936943054199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080745, + "balance_loss_mlp": 1.04407644, + "epoch": 0.6363986148518661, + "flos": 549458090496.0, + "grad_norm": 0.0791847929194259, + "language_loss": 0.79674953, + "learning_rate": 0.0003084424340453339, + "loss": 0.80755699, + "num_input_tokens_seen": 275357056, + "router_z_loss_mlp": 0.36669922, + "step": 3308, + "time_per_iteration": 2.847384214401245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074674, + "balance_loss_mlp": 1.03688467, + "epoch": 0.6365909965371297, + "flos": 582772824576.0, + "grad_norm": 0.10517797210671455, + "language_loss": 0.82179588, + "learning_rate": 0.0003081546994304064, + "loss": 0.8325426, + "num_input_tokens_seen": 275428240, + "router_z_loss_mlp": 0.37744141, + "step": 3309, + "time_per_iteration": 2.745880365371704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073786, + "balance_loss_mlp": 1.03644967, + "epoch": 0.6367833782223933, + "flos": 530998723584.0, + "grad_norm": 0.05446787183102227, + "language_loss": 0.8192482, + "learning_rate": 0.0003078670392964298, + "loss": 0.8299861, + "num_input_tokens_seen": 275497568, + "router_z_loss_mlp": 0.37304688, + "step": 3310, + "time_per_iteration": 2.6298861503601074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075946, + "balance_loss_mlp": 1.03896689, + "epoch": 0.6369757599076568, + "flos": 569237124096.0, + "grad_norm": 0.05047878610686386, + "language_loss": 0.82755494, + "learning_rate": 0.00030757945375508406, + "loss": 0.83831441, + "num_input_tokens_seen": 275569616, + "router_z_loss_mlp": 0.36938477, + "step": 3311, + "time_per_iteration": 2.6519951820373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074481, + "balance_loss_mlp": 1.03652477, + "epoch": 0.6371681415929203, + "flos": 539684951040.0, + "grad_norm": 0.05551115328113397, + "language_loss": 0.81331229, + "learning_rate": 0.00030729194291801944, + "loss": 0.8240571, + "num_input_tokens_seen": 275641408, + "router_z_loss_mlp": 0.37915039, + "step": 3312, + "time_per_iteration": 2.6647114753723145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078542, + "balance_loss_mlp": 1.04089594, + "epoch": 0.6373605232781839, + "flos": 483326217216.0, + "grad_norm": 0.05317823086949404, + "language_loss": 0.76999873, + "learning_rate": 0.00030700450689685787, + "loss": 0.78078413, + "num_input_tokens_seen": 275706608, + "router_z_loss_mlp": 0.37646484, + "step": 3313, + "time_per_iteration": 2.517679452896118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072791, + "balance_loss_mlp": 1.03700447, + "epoch": 0.6375529049634475, + "flos": 578273969664.0, + "grad_norm": 0.05477509929208262, + "language_loss": 0.85436654, + "learning_rate": 0.00030671714580319186, + "loss": 0.86509454, + "num_input_tokens_seen": 275785952, + "router_z_loss_mlp": 0.35839844, + "step": 3314, + "time_per_iteration": 2.83425235748291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078643, + "balance_loss_mlp": 1.04118717, + "epoch": 0.637745286648711, + "flos": 681953181696.0, + "grad_norm": 0.05493703572973109, + "language_loss": 0.83096623, + "learning_rate": 0.0003064298597485846, + "loss": 0.84175265, + "num_input_tokens_seen": 275866240, + "router_z_loss_mlp": 0.37426758, + "step": 3315, + "time_per_iteration": 2.8374462127685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107089, + "balance_loss_mlp": 1.03472173, + "epoch": 0.6379376683339746, + "flos": 504385629696.0, + "grad_norm": 0.05328451247600945, + "language_loss": 0.83983094, + "learning_rate": 0.00030614264884457054, + "loss": 0.8505398, + "num_input_tokens_seen": 275936176, + "router_z_loss_mlp": 0.36181641, + "step": 3316, + "time_per_iteration": 2.6181318759918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076564, + "balance_loss_mlp": 1.03896546, + "epoch": 0.6381300500192382, + "flos": 501771027456.0, + "grad_norm": 0.05692902887495298, + "language_loss": 0.77128184, + "learning_rate": 0.000305855513202655, + "loss": 0.78204751, + "num_input_tokens_seen": 276004608, + "router_z_loss_mlp": 0.37573242, + "step": 3317, + "time_per_iteration": 2.570751190185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072223, + "balance_loss_mlp": 1.03574491, + "epoch": 0.6383224317045018, + "flos": 400271431680.0, + "grad_norm": 0.0603897585499684, + "language_loss": 0.77435303, + "learning_rate": 0.0003055684529343138, + "loss": 0.78507531, + "num_input_tokens_seen": 276066688, + "router_z_loss_mlp": 0.36474609, + "step": 3318, + "time_per_iteration": 2.4171056747436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068524, + "balance_loss_mlp": 1.03249943, + "epoch": 0.6385148133897653, + "flos": 499131694080.0, + "grad_norm": 0.06663651312006989, + "language_loss": 0.78354919, + "learning_rate": 0.00030528146815099374, + "loss": 0.79423445, + "num_input_tokens_seen": 276140000, + "router_z_loss_mlp": 0.36010742, + "step": 3319, + "time_per_iteration": 2.5991523265838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072343, + "balance_loss_mlp": 1.03603208, + "epoch": 0.6387071950750288, + "flos": 527409280512.0, + "grad_norm": 0.04641062645518834, + "language_loss": 0.71934807, + "learning_rate": 0.00030499455896411203, + "loss": 0.73007143, + "num_input_tokens_seen": 276209840, + "router_z_loss_mlp": 0.36376953, + "step": 3320, + "time_per_iteration": 2.601541519165039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047736, + "balance_loss_mlp": 1.03734136, + "epoch": 0.6388995767602924, + "flos": 1455200501760.0, + "grad_norm": 0.026504664974818824, + "language_loss": 0.76300812, + "learning_rate": 0.0003047077254850568, + "loss": 0.77348548, + "num_input_tokens_seen": 276444784, + "router_z_loss_mlp": 0.10400391, + "step": 3321, + "time_per_iteration": 4.919625997543335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070963, + "balance_loss_mlp": 1.03417492, + "epoch": 0.639091958445556, + "flos": 603577571328.0, + "grad_norm": 0.051172481389266875, + "language_loss": 0.76476693, + "learning_rate": 0.0003044209678251865, + "loss": 0.77547657, + "num_input_tokens_seen": 276522768, + "router_z_loss_mlp": 0.36791992, + "step": 3322, + "time_per_iteration": 2.9173965454101562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070731, + "balance_loss_mlp": 1.03406262, + "epoch": 0.6392843401308196, + "flos": 584230703616.0, + "grad_norm": 0.062017563043543965, + "language_loss": 0.84732592, + "learning_rate": 0.0003041342860958306, + "loss": 0.85803324, + "num_input_tokens_seen": 276597104, + "router_z_loss_mlp": 0.36694336, + "step": 3323, + "time_per_iteration": 2.751882791519165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069123, + "balance_loss_mlp": 1.03269315, + "epoch": 0.6394767218160831, + "flos": 514420637184.0, + "grad_norm": 0.054747759386293726, + "language_loss": 0.91800594, + "learning_rate": 0.00030384768040828857, + "loss": 0.92869711, + "num_input_tokens_seen": 276670256, + "router_z_loss_mlp": 0.36425781, + "step": 3324, + "time_per_iteration": 2.6570470333099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070533, + "balance_loss_mlp": 1.03314865, + "epoch": 0.6396691035013466, + "flos": 541471689216.0, + "grad_norm": 0.049915114464213116, + "language_loss": 0.85503262, + "learning_rate": 0.00030356115087383094, + "loss": 0.86573792, + "num_input_tokens_seen": 276737680, + "router_z_loss_mlp": 0.3737793, + "step": 3325, + "time_per_iteration": 2.620593309402466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068087, + "balance_loss_mlp": 1.03115582, + "epoch": 0.6398614851866102, + "flos": 525282098688.0, + "grad_norm": 0.05721597206599544, + "language_loss": 0.84746885, + "learning_rate": 0.00030327469760369803, + "loss": 0.85814971, + "num_input_tokens_seen": 276803808, + "router_z_loss_mlp": 0.36938477, + "step": 3326, + "time_per_iteration": 2.600210428237915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070931, + "balance_loss_mlp": 1.03342783, + "epoch": 0.6400538668718738, + "flos": 622704830976.0, + "grad_norm": 0.3477735947082266, + "language_loss": 0.85250199, + "learning_rate": 0.0003029883207091009, + "loss": 0.86321133, + "num_input_tokens_seen": 276874752, + "router_z_loss_mlp": 0.375, + "step": 3327, + "time_per_iteration": 2.7323827743530273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065435, + "balance_loss_mlp": 1.02910042, + "epoch": 0.6402462485571374, + "flos": 503096486400.0, + "grad_norm": 0.053886182744941745, + "language_loss": 0.78170431, + "learning_rate": 0.00030270202030122095, + "loss": 0.79235864, + "num_input_tokens_seen": 276947200, + "router_z_loss_mlp": 0.36328125, + "step": 3328, + "time_per_iteration": 2.6563096046447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107159, + "balance_loss_mlp": 1.03310895, + "epoch": 0.6404386302424009, + "flos": 818894693376.0, + "grad_norm": 0.06347117361698136, + "language_loss": 0.85806334, + "learning_rate": 0.00030241579649121, + "loss": 0.86877924, + "num_input_tokens_seen": 277025712, + "router_z_loss_mlp": 0.38476562, + "step": 3329, + "time_per_iteration": 2.9936435222625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066015, + "balance_loss_mlp": 1.02901256, + "epoch": 0.6406310119276645, + "flos": 471568490496.0, + "grad_norm": 0.05226197441387588, + "language_loss": 0.79091239, + "learning_rate": 0.00030212964939018994, + "loss": 0.8015725, + "num_input_tokens_seen": 277091264, + "router_z_loss_mlp": 0.37011719, + "step": 3330, + "time_per_iteration": 2.5639078617095947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107031, + "balance_loss_mlp": 1.0323776, + "epoch": 0.6408233936129281, + "flos": 425358245376.0, + "grad_norm": 0.06341229452326952, + "language_loss": 0.85196972, + "learning_rate": 0.0003018435791092527, + "loss": 0.86267287, + "num_input_tokens_seen": 277154608, + "router_z_loss_mlp": 0.37890625, + "step": 3331, + "time_per_iteration": 2.4909286499023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067837, + "balance_loss_mlp": 1.0288794, + "epoch": 0.6410157752981916, + "flos": 549522109440.0, + "grad_norm": 0.052178008313766185, + "language_loss": 0.81084096, + "learning_rate": 0.00030155758575946083, + "loss": 0.82151937, + "num_input_tokens_seen": 277222176, + "router_z_loss_mlp": 0.3894043, + "step": 3332, + "time_per_iteration": 2.64400315284729 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069681, + "balance_loss_mlp": 1.03246343, + "epoch": 0.6412081569834551, + "flos": 475659910656.0, + "grad_norm": 0.056966090936169146, + "language_loss": 0.83717507, + "learning_rate": 0.0003012716694518467, + "loss": 0.84787184, + "num_input_tokens_seen": 277289600, + "router_z_loss_mlp": 0.37231445, + "step": 3333, + "time_per_iteration": 2.5760622024536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068182, + "balance_loss_mlp": 1.02998757, + "epoch": 0.6414005386687187, + "flos": 540645235200.0, + "grad_norm": 0.0733128954911655, + "language_loss": 0.85120058, + "learning_rate": 0.000300985830297413, + "loss": 0.86188245, + "num_input_tokens_seen": 277362784, + "router_z_loss_mlp": 0.3815918, + "step": 3334, + "time_per_iteration": 2.6769511699676514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068689, + "balance_loss_mlp": 1.03187692, + "epoch": 0.6415929203539823, + "flos": 1040909073408.0, + "grad_norm": 0.0544756341035146, + "language_loss": 0.87377876, + "learning_rate": 0.00030070006840713205, + "loss": 0.88446569, + "num_input_tokens_seen": 277449728, + "router_z_loss_mlp": 0.36865234, + "step": 3335, + "time_per_iteration": 3.3541831970214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070768, + "balance_loss_mlp": 1.03398037, + "epoch": 0.6417853020392459, + "flos": 648028781568.0, + "grad_norm": 0.051565037343947635, + "language_loss": 0.73971063, + "learning_rate": 0.000300414383891947, + "loss": 0.75041831, + "num_input_tokens_seen": 277527552, + "router_z_loss_mlp": 0.36791992, + "step": 3336, + "time_per_iteration": 2.802199602127075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072657, + "balance_loss_mlp": 1.03536844, + "epoch": 0.6419776837245095, + "flos": 500639035392.0, + "grad_norm": 0.04995187191956455, + "language_loss": 0.88918942, + "learning_rate": 0.00030012877686276973, + "loss": 0.89991605, + "num_input_tokens_seen": 277603568, + "router_z_loss_mlp": 0.37280273, + "step": 3337, + "time_per_iteration": 2.69291090965271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107373, + "balance_loss_mlp": 1.03677511, + "epoch": 0.642170065409773, + "flos": 620331747840.0, + "grad_norm": 0.054761035667788324, + "language_loss": 0.86300218, + "learning_rate": 0.0002998432474304832, + "loss": 0.87373948, + "num_input_tokens_seen": 277679696, + "router_z_loss_mlp": 0.36914062, + "step": 3338, + "time_per_iteration": 2.773374319076538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015993, + "balance_loss_mlp": 1.00283277, + "epoch": 0.6423624470950365, + "flos": 1422767476224.0, + "grad_norm": 0.016749722719595034, + "language_loss": 0.79237342, + "learning_rate": 0.0002995577957059395, + "loss": 0.80253339, + "num_input_tokens_seen": 277913056, + "router_z_loss_mlp": 0.13183594, + "step": 3339, + "time_per_iteration": 4.874187231063843 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073859, + "balance_loss_mlp": 1.03788161, + "epoch": 0.6425548287803001, + "flos": 562082969088.0, + "grad_norm": 0.04482420298263986, + "language_loss": 0.88213849, + "learning_rate": 0.00029927242179996107, + "loss": 0.8928771, + "num_input_tokens_seen": 277983168, + "router_z_loss_mlp": 0.36010742, + "step": 3340, + "time_per_iteration": 2.665893077850342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068858, + "balance_loss_mlp": 1.03240371, + "epoch": 0.6427472104655637, + "flos": 585151699968.0, + "grad_norm": 0.04629279799595454, + "language_loss": 0.83241612, + "learning_rate": 0.0002989871258233398, + "loss": 0.84310472, + "num_input_tokens_seen": 278057600, + "router_z_loss_mlp": 0.36474609, + "step": 3341, + "time_per_iteration": 2.7554104328155518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076157, + "balance_loss_mlp": 1.03927386, + "epoch": 0.6429395921508272, + "flos": 404067488256.0, + "grad_norm": 0.0587599408215441, + "language_loss": 0.82722974, + "learning_rate": 0.0002987019078868373, + "loss": 0.8379913, + "num_input_tokens_seen": 278119232, + "router_z_loss_mlp": 0.36865234, + "step": 3342, + "time_per_iteration": 2.4214284420013428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074694, + "balance_loss_mlp": 1.03742945, + "epoch": 0.6431319738360908, + "flos": 548522537472.0, + "grad_norm": 0.05743775119998274, + "language_loss": 0.8159622, + "learning_rate": 0.00029841676810118484, + "loss": 0.82670915, + "num_input_tokens_seen": 278187456, + "router_z_loss_mlp": 0.37231445, + "step": 3343, + "time_per_iteration": 2.6899335384368896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070744, + "balance_loss_mlp": 1.03390789, + "epoch": 0.6433243555213544, + "flos": 793044034560.0, + "grad_norm": 0.05135608784833761, + "language_loss": 0.87229836, + "learning_rate": 0.0002981317065770839, + "loss": 0.8830058, + "num_input_tokens_seen": 278262176, + "router_z_loss_mlp": 0.36816406, + "step": 3344, + "time_per_iteration": 3.038647413253784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075006, + "balance_loss_mlp": 1.03771782, + "epoch": 0.643516737206618, + "flos": 582762650112.0, + "grad_norm": 0.05966061417455641, + "language_loss": 0.80907631, + "learning_rate": 0.00029784672342520493, + "loss": 0.81982636, + "num_input_tokens_seen": 278328816, + "router_z_loss_mlp": 0.37231445, + "step": 3345, + "time_per_iteration": 2.6487960815429688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106957, + "balance_loss_mlp": 1.03244793, + "epoch": 0.6437091188918815, + "flos": 518501882880.0, + "grad_norm": 0.05291983306106443, + "language_loss": 0.83733785, + "learning_rate": 0.00029756181875618834, + "loss": 0.84803355, + "num_input_tokens_seen": 278395824, + "router_z_loss_mlp": 0.37133789, + "step": 3346, + "time_per_iteration": 2.5655863285064697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073171, + "balance_loss_mlp": 1.03671718, + "epoch": 0.643901500577145, + "flos": 384736587264.0, + "grad_norm": 0.05666313029634666, + "language_loss": 0.83381206, + "learning_rate": 0.0002972769926806439, + "loss": 0.84454376, + "num_input_tokens_seen": 278457696, + "router_z_loss_mlp": 0.36474609, + "step": 3347, + "time_per_iteration": 2.456300735473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078252, + "balance_loss_mlp": 1.04122531, + "epoch": 0.6440938822624086, + "flos": 483478986240.0, + "grad_norm": 0.05181671155605703, + "language_loss": 0.88556045, + "learning_rate": 0.0002969922453091508, + "loss": 0.89634299, + "num_input_tokens_seen": 278526992, + "router_z_loss_mlp": 0.37036133, + "step": 3348, + "time_per_iteration": 2.5434532165527344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078332, + "balance_loss_mlp": 1.04104328, + "epoch": 0.6442862639476722, + "flos": 540178163712.0, + "grad_norm": 0.04671333484936929, + "language_loss": 0.85028982, + "learning_rate": 0.00029670757675225777, + "loss": 0.86107314, + "num_input_tokens_seen": 278601120, + "router_z_loss_mlp": 0.37255859, + "step": 3349, + "time_per_iteration": 2.7254116535186768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073632, + "balance_loss_mlp": 1.03715396, + "epoch": 0.6444786456329358, + "flos": 526651227648.0, + "grad_norm": 0.06388805390045102, + "language_loss": 0.7939328, + "learning_rate": 0.0002964229871204831, + "loss": 0.80466914, + "num_input_tokens_seen": 278668208, + "router_z_loss_mlp": 0.36474609, + "step": 3350, + "time_per_iteration": 2.623533248901367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107868, + "balance_loss_mlp": 1.04274988, + "epoch": 0.6446710273181993, + "flos": 697576776192.0, + "grad_norm": 0.05363118847235426, + "language_loss": 0.83167213, + "learning_rate": 0.00029613847652431403, + "loss": 0.84245896, + "num_input_tokens_seen": 278742832, + "router_z_loss_mlp": 0.35961914, + "step": 3351, + "time_per_iteration": 2.835373640060425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081252, + "balance_loss_mlp": 1.04536986, + "epoch": 0.6448634090034628, + "flos": 624705384960.0, + "grad_norm": 0.04827389624860956, + "language_loss": 0.79376614, + "learning_rate": 0.0002958540450742078, + "loss": 0.8045786, + "num_input_tokens_seen": 278829744, + "router_z_loss_mlp": 0.35864258, + "step": 3352, + "time_per_iteration": 2.905045986175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078994, + "balance_loss_mlp": 1.04175305, + "epoch": 0.6450557906887264, + "flos": 600647256576.0, + "grad_norm": 0.04612026708575604, + "language_loss": 0.77379197, + "learning_rate": 0.0002955696928805901, + "loss": 0.7845819, + "num_input_tokens_seen": 278908592, + "router_z_loss_mlp": 0.37231445, + "step": 3353, + "time_per_iteration": 2.899186372756958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081177, + "balance_loss_mlp": 1.04536617, + "epoch": 0.64524817237399, + "flos": 645905981952.0, + "grad_norm": 0.050963182313219675, + "language_loss": 0.86320436, + "learning_rate": 0.0002952854200538563, + "loss": 0.87401617, + "num_input_tokens_seen": 278986960, + "router_z_loss_mlp": 0.35839844, + "step": 3354, + "time_per_iteration": 2.7646782398223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107986, + "balance_loss_mlp": 1.04366803, + "epoch": 0.6454405540592536, + "flos": 473173346304.0, + "grad_norm": 0.05160537421710046, + "language_loss": 0.82000065, + "learning_rate": 0.000295001226704371, + "loss": 0.83079934, + "num_input_tokens_seen": 279054896, + "router_z_loss_mlp": 0.36206055, + "step": 3355, + "time_per_iteration": 2.5571465492248535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080502, + "balance_loss_mlp": 1.04357088, + "epoch": 0.6456329357445171, + "flos": 611548005888.0, + "grad_norm": 0.052373080936441004, + "language_loss": 0.8272965, + "learning_rate": 0.00029471711294246783, + "loss": 0.83810151, + "num_input_tokens_seen": 279126816, + "router_z_loss_mlp": 0.36914062, + "step": 3356, + "time_per_iteration": 2.829554796218872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075402, + "balance_loss_mlp": 1.03890061, + "epoch": 0.6458253174297807, + "flos": 731373138432.0, + "grad_norm": 0.05569683801855411, + "language_loss": 0.82248133, + "learning_rate": 0.0002944330788784494, + "loss": 0.83323538, + "num_input_tokens_seen": 279197552, + "router_z_loss_mlp": 0.36499023, + "step": 3357, + "time_per_iteration": 2.93203067779541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079005, + "balance_loss_mlp": 1.04276562, + "epoch": 0.6460176991150443, + "flos": 570129007104.0, + "grad_norm": 0.050517424210504216, + "language_loss": 0.84506869, + "learning_rate": 0.00029414912462258786, + "loss": 0.8558588, + "num_input_tokens_seen": 279275440, + "router_z_loss_mlp": 0.36254883, + "step": 3358, + "time_per_iteration": 2.819854259490967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077222, + "balance_loss_mlp": 1.0391469, + "epoch": 0.6462100808003078, + "flos": 582890688000.0, + "grad_norm": 0.05841825537720819, + "language_loss": 0.81327105, + "learning_rate": 0.00029386525028512366, + "loss": 0.82404327, + "num_input_tokens_seen": 279349168, + "router_z_loss_mlp": 0.38037109, + "step": 3359, + "time_per_iteration": 2.698640823364258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081177, + "balance_loss_mlp": 1.04388809, + "epoch": 0.6464024624855714, + "flos": 483647721984.0, + "grad_norm": 0.05190666328104424, + "language_loss": 0.87126404, + "learning_rate": 0.0002935814559762666, + "loss": 0.88207585, + "num_input_tokens_seen": 279427600, + "router_z_loss_mlp": 0.37329102, + "step": 3360, + "time_per_iteration": 2.768366575241089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081829, + "balance_loss_mlp": 1.0439682, + "epoch": 0.6465948441708349, + "flos": 527508205056.0, + "grad_norm": 0.050745239197886684, + "language_loss": 0.79334629, + "learning_rate": 0.0002932977418061957, + "loss": 0.80416453, + "num_input_tokens_seen": 279496608, + "router_z_loss_mlp": 0.37841797, + "step": 3361, + "time_per_iteration": 2.632948637008667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082687, + "balance_loss_mlp": 1.04582703, + "epoch": 0.6467872258560985, + "flos": 669121689600.0, + "grad_norm": 0.06228301103005666, + "language_loss": 0.80853021, + "learning_rate": 0.00029301410788505833, + "loss": 0.81935704, + "num_input_tokens_seen": 279568448, + "router_z_loss_mlp": 0.3684082, + "step": 3362, + "time_per_iteration": 2.7769224643707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084618, + "balance_loss_mlp": 1.04833102, + "epoch": 0.6469796075413621, + "flos": 431867828736.0, + "grad_norm": 0.06087250960931665, + "language_loss": 0.8065362, + "learning_rate": 0.00029273055432297126, + "loss": 0.81738234, + "num_input_tokens_seen": 279631952, + "router_z_loss_mlp": 0.36328125, + "step": 3363, + "time_per_iteration": 2.484450101852417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084285, + "balance_loss_mlp": 1.04611397, + "epoch": 0.6471719892266257, + "flos": 803413693440.0, + "grad_norm": 0.05541447784561029, + "language_loss": 0.80514741, + "learning_rate": 0.00029244708123001917, + "loss": 0.81599021, + "num_input_tokens_seen": 279706880, + "router_z_loss_mlp": 0.3815918, + "step": 3364, + "time_per_iteration": 2.9762370586395264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081353, + "balance_loss_mlp": 1.04387355, + "epoch": 0.6473643709118891, + "flos": 576923779584.0, + "grad_norm": 0.051117290397423236, + "language_loss": 0.84345543, + "learning_rate": 0.0002921636887162565, + "loss": 0.85426897, + "num_input_tokens_seen": 279778864, + "router_z_loss_mlp": 0.37451172, + "step": 3365, + "time_per_iteration": 2.72733736038208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085471, + "balance_loss_mlp": 1.04930282, + "epoch": 0.6475567525971527, + "flos": 761079490560.0, + "grad_norm": 0.06137767127044858, + "language_loss": 0.83554536, + "learning_rate": 0.00029188037689170595, + "loss": 0.84640002, + "num_input_tokens_seen": 279853328, + "router_z_loss_mlp": 0.36181641, + "step": 3366, + "time_per_iteration": 2.962611675262451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081175, + "balance_loss_mlp": 1.04474497, + "epoch": 0.6477491342824163, + "flos": 842754972672.0, + "grad_norm": 0.05371519731752011, + "language_loss": 0.83465898, + "learning_rate": 0.0002915971458663586, + "loss": 0.84547073, + "num_input_tokens_seen": 279928464, + "router_z_loss_mlp": 0.36450195, + "step": 3367, + "time_per_iteration": 3.043851137161255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082146, + "balance_loss_mlp": 1.04545331, + "epoch": 0.6479415159676799, + "flos": 884431457280.0, + "grad_norm": 0.05567471086198027, + "language_loss": 0.81976676, + "learning_rate": 0.00029131399575017494, + "loss": 0.83058822, + "num_input_tokens_seen": 280015680, + "router_z_loss_mlp": 0.36669922, + "step": 3368, + "time_per_iteration": 3.16506290435791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072939, + "balance_loss_mlp": 1.0362463, + "epoch": 0.6481338976529435, + "flos": 615211642368.0, + "grad_norm": 0.04146272732833695, + "language_loss": 0.85776877, + "learning_rate": 0.0002910309266530836, + "loss": 0.86849815, + "num_input_tokens_seen": 280093904, + "router_z_loss_mlp": 0.36694336, + "step": 3369, + "time_per_iteration": 2.810415267944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082428, + "balance_loss_mlp": 1.04485345, + "epoch": 0.648326279338207, + "flos": 509757428736.0, + "grad_norm": 0.047563398394336556, + "language_loss": 0.85364866, + "learning_rate": 0.0002907479386849814, + "loss": 0.86447287, + "num_input_tokens_seen": 280161584, + "router_z_loss_mlp": 0.37573242, + "step": 3370, + "time_per_iteration": 2.6234049797058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079087, + "balance_loss_mlp": 1.04258549, + "epoch": 0.6485186610234706, + "flos": 702157026816.0, + "grad_norm": 0.05547979254265798, + "language_loss": 0.79903388, + "learning_rate": 0.0002904650319557339, + "loss": 0.80982471, + "num_input_tokens_seen": 280248016, + "router_z_loss_mlp": 0.36523438, + "step": 3371, + "time_per_iteration": 3.052445411682129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077959, + "balance_loss_mlp": 1.04148114, + "epoch": 0.6487110427087341, + "flos": 560418476544.0, + "grad_norm": 0.10081589784895977, + "language_loss": 0.80853498, + "learning_rate": 0.0002901822065751758, + "loss": 0.81931454, + "num_input_tokens_seen": 280319024, + "router_z_loss_mlp": 0.36499023, + "step": 3372, + "time_per_iteration": 2.679738759994507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072935, + "balance_loss_mlp": 1.03614688, + "epoch": 0.6489034243939977, + "flos": 679801268736.0, + "grad_norm": 0.07558571237012199, + "language_loss": 0.85327506, + "learning_rate": 0.0002898994626531093, + "loss": 0.86400437, + "num_input_tokens_seen": 280393200, + "router_z_loss_mlp": 0.36767578, + "step": 3373, + "time_per_iteration": 2.8318021297454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078194, + "balance_loss_mlp": 1.04131091, + "epoch": 0.6490958060792612, + "flos": 474172918272.0, + "grad_norm": 0.04995126846613369, + "language_loss": 0.87709844, + "learning_rate": 0.00028961680029930526, + "loss": 0.88788044, + "num_input_tokens_seen": 280456944, + "router_z_loss_mlp": 0.36865234, + "step": 3374, + "time_per_iteration": 2.550858736038208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107378, + "balance_loss_mlp": 1.03751612, + "epoch": 0.6492881877645248, + "flos": 588563642880.0, + "grad_norm": 0.053073331698041674, + "language_loss": 0.76720631, + "learning_rate": 0.00028933421962350317, + "loss": 0.77794409, + "num_input_tokens_seen": 280534352, + "router_z_loss_mlp": 0.36279297, + "step": 3375, + "time_per_iteration": 2.7313249111175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073863, + "balance_loss_mlp": 1.0367415, + "epoch": 0.6494805694497884, + "flos": 642139038720.0, + "grad_norm": 0.0646432947435949, + "language_loss": 0.84017503, + "learning_rate": 0.0002890517207354104, + "loss": 0.8509137, + "num_input_tokens_seen": 280608912, + "router_z_loss_mlp": 0.37109375, + "step": 3376, + "time_per_iteration": 2.8168907165527344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071643, + "balance_loss_mlp": 1.0345453, + "epoch": 0.649672951135052, + "flos": 531550162944.0, + "grad_norm": 0.054117289013755926, + "language_loss": 0.81647491, + "learning_rate": 0.0002887693037447029, + "loss": 0.82719135, + "num_input_tokens_seen": 280678848, + "router_z_loss_mlp": 0.37084961, + "step": 3377, + "time_per_iteration": 2.59980845451355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068068, + "balance_loss_mlp": 1.03170967, + "epoch": 0.6498653328203156, + "flos": 547124295168.0, + "grad_norm": 0.05861346811628937, + "language_loss": 0.82201707, + "learning_rate": 0.00028848696876102443, + "loss": 0.83269775, + "num_input_tokens_seen": 280750224, + "router_z_loss_mlp": 0.36352539, + "step": 3378, + "time_per_iteration": 2.6153130531311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071004, + "balance_loss_mlp": 1.03333366, + "epoch": 0.650057714505579, + "flos": 461996172288.0, + "grad_norm": 0.0689678336471058, + "language_loss": 0.83211708, + "learning_rate": 0.00028820471589398723, + "loss": 0.84282708, + "num_input_tokens_seen": 280817488, + "router_z_loss_mlp": 0.37646484, + "step": 3379, + "time_per_iteration": 2.553159236907959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068457, + "balance_loss_mlp": 1.03100109, + "epoch": 0.6502500961908426, + "flos": 509905815552.0, + "grad_norm": 0.06047763604232794, + "language_loss": 0.77722514, + "learning_rate": 0.00028792254525317196, + "loss": 0.78790975, + "num_input_tokens_seen": 280887440, + "router_z_loss_mlp": 0.37451172, + "step": 3380, + "time_per_iteration": 2.680063009262085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072989, + "balance_loss_mlp": 1.03519976, + "epoch": 0.6504424778761062, + "flos": 579557320704.0, + "grad_norm": 0.05541331386031739, + "language_loss": 0.81432557, + "learning_rate": 0.00028764045694812645, + "loss": 0.82505548, + "num_input_tokens_seen": 280959072, + "router_z_loss_mlp": 0.37768555, + "step": 3381, + "time_per_iteration": 2.7398667335510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069777, + "balance_loss_mlp": 1.03186822, + "epoch": 0.6506348595613698, + "flos": 519206091264.0, + "grad_norm": 0.0812129580253802, + "language_loss": 0.76837122, + "learning_rate": 0.0002873584510883671, + "loss": 0.77906895, + "num_input_tokens_seen": 281025376, + "router_z_loss_mlp": 0.37915039, + "step": 3382, + "time_per_iteration": 2.565248727798462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070048, + "balance_loss_mlp": 1.03302145, + "epoch": 0.6508272412466333, + "flos": 510048410112.0, + "grad_norm": 0.048965932841550305, + "language_loss": 0.85894716, + "learning_rate": 0.0002870765277833788, + "loss": 0.86964768, + "num_input_tokens_seen": 281097616, + "router_z_loss_mlp": 0.37011719, + "step": 3383, + "time_per_iteration": 2.7287330627441406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070639, + "balance_loss_mlp": 1.03366053, + "epoch": 0.6510196229318969, + "flos": 625329607680.0, + "grad_norm": 0.07719936634316926, + "language_loss": 0.80431008, + "learning_rate": 0.00028679468714261347, + "loss": 0.81501651, + "num_input_tokens_seen": 281170192, + "router_z_loss_mlp": 0.36938477, + "step": 3384, + "time_per_iteration": 2.73777437210083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068083, + "balance_loss_mlp": 1.03141391, + "epoch": 0.6512120046171604, + "flos": 474453725184.0, + "grad_norm": 0.05390133741953619, + "language_loss": 0.77104408, + "learning_rate": 0.0002865129292754918, + "loss": 0.78172493, + "num_input_tokens_seen": 281238832, + "router_z_loss_mlp": 0.36645508, + "step": 3385, + "time_per_iteration": 2.570709228515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107061, + "balance_loss_mlp": 1.03396475, + "epoch": 0.651404386302424, + "flos": 551561951232.0, + "grad_norm": 0.04665998226112413, + "language_loss": 0.81778049, + "learning_rate": 0.00028623125429140105, + "loss": 0.82848656, + "num_input_tokens_seen": 281319472, + "router_z_loss_mlp": 0.36621094, + "step": 3386, + "time_per_iteration": 2.8083431720733643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067177, + "balance_loss_mlp": 1.02964997, + "epoch": 0.6515967679876876, + "flos": 523047227904.0, + "grad_norm": 0.06778513311562764, + "language_loss": 0.86781728, + "learning_rate": 0.00028594966229969785, + "loss": 0.87848902, + "num_input_tokens_seen": 281391168, + "router_z_loss_mlp": 0.37524414, + "step": 3387, + "time_per_iteration": 2.652562379837036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068807, + "balance_loss_mlp": 1.03237641, + "epoch": 0.6517891496729511, + "flos": 573590412288.0, + "grad_norm": 0.04915205130547935, + "language_loss": 0.81361043, + "learning_rate": 0.00028566815340970577, + "loss": 0.82429844, + "num_input_tokens_seen": 281465664, + "router_z_loss_mlp": 0.36450195, + "step": 3388, + "time_per_iteration": 2.7212326526641846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069055, + "balance_loss_mlp": 1.0323149, + "epoch": 0.6519815313582147, + "flos": 555662135808.0, + "grad_norm": 0.05372700409854334, + "language_loss": 0.80874032, + "learning_rate": 0.0002853867277307162, + "loss": 0.81943083, + "num_input_tokens_seen": 281532928, + "router_z_loss_mlp": 0.36743164, + "step": 3389, + "time_per_iteration": 2.645580291748047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072292, + "balance_loss_mlp": 1.03564715, + "epoch": 0.6521739130434783, + "flos": 480229986816.0, + "grad_norm": 0.04994212123605962, + "language_loss": 0.82347226, + "learning_rate": 0.00028510538537198824, + "loss": 0.8341952, + "num_input_tokens_seen": 281601680, + "router_z_loss_mlp": 0.36669922, + "step": 3390, + "time_per_iteration": 2.6053972244262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071186, + "balance_loss_mlp": 1.03456497, + "epoch": 0.6523662947287419, + "flos": 665380887552.0, + "grad_norm": 0.052060213121620263, + "language_loss": 0.86389101, + "learning_rate": 0.00028482412644274867, + "loss": 0.87460279, + "num_input_tokens_seen": 281679488, + "router_z_loss_mlp": 0.36621094, + "step": 3391, + "time_per_iteration": 2.9146382808685303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071108, + "balance_loss_mlp": 1.03408146, + "epoch": 0.6525586764140053, + "flos": 548394499584.0, + "grad_norm": 0.05233101091155523, + "language_loss": 0.74427474, + "learning_rate": 0.00028454295105219207, + "loss": 0.75498581, + "num_input_tokens_seen": 281751056, + "router_z_loss_mlp": 0.37011719, + "step": 3392, + "time_per_iteration": 2.653144598007202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072074, + "balance_loss_mlp": 1.03457081, + "epoch": 0.6527510580992689, + "flos": 802529012736.0, + "grad_norm": 0.044337250552145664, + "language_loss": 0.7951991, + "learning_rate": 0.0002842618593094802, + "loss": 0.80591983, + "num_input_tokens_seen": 281841008, + "router_z_loss_mlp": 0.37475586, + "step": 3393, + "time_per_iteration": 3.1016182899475098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075529, + "balance_loss_mlp": 1.0390985, + "epoch": 0.6529434397845325, + "flos": 670864757760.0, + "grad_norm": 0.06313497545988733, + "language_loss": 0.80366606, + "learning_rate": 0.00028398085132374243, + "loss": 0.81442136, + "num_input_tokens_seen": 281908016, + "router_z_loss_mlp": 0.36425781, + "step": 3394, + "time_per_iteration": 2.81162691116333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070268, + "balance_loss_mlp": 1.03338432, + "epoch": 0.6531358214697961, + "flos": 828043610112.0, + "grad_norm": 0.05205360505405607, + "language_loss": 0.84108675, + "learning_rate": 0.0002836999272040761, + "loss": 0.85178936, + "num_input_tokens_seen": 281989072, + "router_z_loss_mlp": 0.36865234, + "step": 3395, + "time_per_iteration": 3.086585283279419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073433, + "balance_loss_mlp": 1.03607285, + "epoch": 0.6533282031550597, + "flos": 487157179392.0, + "grad_norm": 0.06347573427267852, + "language_loss": 0.8364076, + "learning_rate": 0.00028341908705954575, + "loss": 0.84714192, + "num_input_tokens_seen": 282053152, + "router_z_loss_mlp": 0.37353516, + "step": 3396, + "time_per_iteration": 2.63339900970459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101777, + "balance_loss_mlp": 1.00317848, + "epoch": 0.6535205848403232, + "flos": 1556908121088.0, + "grad_norm": 0.01725431962534194, + "language_loss": 0.81761813, + "learning_rate": 0.00028313833099918265, + "loss": 0.82779574, + "num_input_tokens_seen": 282283984, + "router_z_loss_mlp": 0.14550781, + "step": 3397, + "time_per_iteration": 4.886535167694092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107024, + "balance_loss_mlp": 1.03342795, + "epoch": 0.6537129665255867, + "flos": 493464531456.0, + "grad_norm": 0.0583640657945681, + "language_loss": 0.78047717, + "learning_rate": 0.00028285765913198604, + "loss": 0.79117954, + "num_input_tokens_seen": 282353008, + "router_z_loss_mlp": 0.36816406, + "step": 3398, + "time_per_iteration": 2.5336763858795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075265, + "balance_loss_mlp": 1.03771448, + "epoch": 0.6539053482108503, + "flos": 604718327808.0, + "grad_norm": 0.10018787672366053, + "language_loss": 0.81953001, + "learning_rate": 0.0002825770715669227, + "loss": 0.83028269, + "num_input_tokens_seen": 282427648, + "router_z_loss_mlp": 0.37548828, + "step": 3399, + "time_per_iteration": 2.7225871086120605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073476, + "balance_loss_mlp": 1.03656852, + "epoch": 0.6540977298961139, + "flos": 577504332288.0, + "grad_norm": 0.054796705255158284, + "language_loss": 0.81529284, + "learning_rate": 0.00028229656841292634, + "loss": 0.82602763, + "num_input_tokens_seen": 282502128, + "router_z_loss_mlp": 0.36938477, + "step": 3400, + "time_per_iteration": 2.7136409282684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074116, + "balance_loss_mlp": 1.03675604, + "epoch": 0.6542901115813774, + "flos": 511500496896.0, + "grad_norm": 0.09810959054820141, + "language_loss": 0.76415372, + "learning_rate": 0.0002820161497788979, + "loss": 0.77489489, + "num_input_tokens_seen": 282569360, + "router_z_loss_mlp": 0.37304688, + "step": 3401, + "time_per_iteration": 2.561142921447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107247, + "balance_loss_mlp": 1.03656387, + "epoch": 0.654482493266641, + "flos": 625201569792.0, + "grad_norm": 0.05065630966567836, + "language_loss": 0.86865586, + "learning_rate": 0.00028173581577370545, + "loss": 0.87938058, + "num_input_tokens_seen": 282645472, + "router_z_loss_mlp": 0.35913086, + "step": 3402, + "time_per_iteration": 2.771660327911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074844, + "balance_loss_mlp": 1.0377934, + "epoch": 0.6546748749519046, + "flos": 523712148480.0, + "grad_norm": 0.04769798618105731, + "language_loss": 0.78826487, + "learning_rate": 0.0002814555665061844, + "loss": 0.79901326, + "num_input_tokens_seen": 282717568, + "router_z_loss_mlp": 0.37011719, + "step": 3403, + "time_per_iteration": 2.6541905403137207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070804, + "balance_loss_mlp": 1.03351498, + "epoch": 0.6548672566371682, + "flos": 478945225728.0, + "grad_norm": 0.05625408135925951, + "language_loss": 0.77440852, + "learning_rate": 0.00028117540208513715, + "loss": 0.78511655, + "num_input_tokens_seen": 282791408, + "router_z_loss_mlp": 0.37280273, + "step": 3404, + "time_per_iteration": 2.7175214290618896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070835, + "balance_loss_mlp": 1.03428507, + "epoch": 0.6550596383224317, + "flos": 615732558336.0, + "grad_norm": 0.05404961750978507, + "language_loss": 0.84969914, + "learning_rate": 0.00028089532261933313, + "loss": 0.86040747, + "num_input_tokens_seen": 282862992, + "router_z_loss_mlp": 0.36523438, + "step": 3405, + "time_per_iteration": 2.6872446537017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079163, + "balance_loss_mlp": 1.04197001, + "epoch": 0.6552520200076952, + "flos": 488594709504.0, + "grad_norm": 0.0680253030817501, + "language_loss": 0.85329425, + "learning_rate": 0.0002806153282175087, + "loss": 0.86408579, + "num_input_tokens_seen": 282930448, + "router_z_loss_mlp": 0.37182617, + "step": 3406, + "time_per_iteration": 2.573843479156494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069625, + "balance_loss_mlp": 1.0329802, + "epoch": 0.6554444016929588, + "flos": 687310424064.0, + "grad_norm": 0.0894093410202252, + "language_loss": 0.82802272, + "learning_rate": 0.0002803354189883679, + "loss": 0.83871901, + "num_input_tokens_seen": 283010864, + "router_z_loss_mlp": 0.36669922, + "step": 3407, + "time_per_iteration": 2.824995279312134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076204, + "balance_loss_mlp": 1.04017901, + "epoch": 0.6556367833782224, + "flos": 542772417024.0, + "grad_norm": 0.05173629873734528, + "language_loss": 0.85629022, + "learning_rate": 0.00028005559504058053, + "loss": 0.86705232, + "num_input_tokens_seen": 283082240, + "router_z_loss_mlp": 0.3605957, + "step": 3408, + "time_per_iteration": 2.709195852279663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074603, + "balance_loss_mlp": 1.03860188, + "epoch": 0.655829165063486, + "flos": 673237840896.0, + "grad_norm": 0.05391320536337509, + "language_loss": 0.76764786, + "learning_rate": 0.0002797758564827838, + "loss": 0.77839386, + "num_input_tokens_seen": 283156656, + "router_z_loss_mlp": 0.36010742, + "step": 3409, + "time_per_iteration": 2.7769269943237305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073177, + "balance_loss_mlp": 1.03624606, + "epoch": 0.6560215467487496, + "flos": 531550162944.0, + "grad_norm": 0.059937965776424594, + "language_loss": 0.8368215, + "learning_rate": 0.0002794962034235824, + "loss": 0.84755325, + "num_input_tokens_seen": 283223584, + "router_z_loss_mlp": 0.36889648, + "step": 3410, + "time_per_iteration": 2.599886417388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072982, + "balance_loss_mlp": 1.03588414, + "epoch": 0.656213928434013, + "flos": 591025476096.0, + "grad_norm": 0.13531884717327836, + "language_loss": 0.74423587, + "learning_rate": 0.00027921663597154695, + "loss": 0.75496566, + "num_input_tokens_seen": 283297680, + "router_z_loss_mlp": 0.37084961, + "step": 3411, + "time_per_iteration": 2.7206108570098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107222, + "balance_loss_mlp": 1.03686285, + "epoch": 0.6564063101192766, + "flos": 415564756992.0, + "grad_norm": 0.08609193384147822, + "language_loss": 0.80696797, + "learning_rate": 0.00027893715423521525, + "loss": 0.81769013, + "num_input_tokens_seen": 283359744, + "router_z_loss_mlp": 0.35375977, + "step": 3412, + "time_per_iteration": 2.4493868350982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067552, + "balance_loss_mlp": 1.03183699, + "epoch": 0.6565986918045402, + "flos": 453084392448.0, + "grad_norm": 0.05044036578156056, + "language_loss": 0.8354848, + "learning_rate": 0.00027865775832309163, + "loss": 0.84616029, + "num_input_tokens_seen": 283430688, + "router_z_loss_mlp": 0.35742188, + "step": 3413, + "time_per_iteration": 2.665999174118042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074779, + "balance_loss_mlp": 1.03899264, + "epoch": 0.6567910734898038, + "flos": 547483677696.0, + "grad_norm": 0.060493690389786, + "language_loss": 0.85984117, + "learning_rate": 0.00027837844834364733, + "loss": 0.87058896, + "num_input_tokens_seen": 283498048, + "router_z_loss_mlp": 0.35839844, + "step": 3414, + "time_per_iteration": 2.6195499897003174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072987, + "balance_loss_mlp": 1.03677094, + "epoch": 0.6569834551750673, + "flos": 655207667712.0, + "grad_norm": 0.11318049634335087, + "language_loss": 0.86511016, + "learning_rate": 0.00027809922440532, + "loss": 0.87583995, + "num_input_tokens_seen": 283573040, + "router_z_loss_mlp": 0.36254883, + "step": 3415, + "time_per_iteration": 2.823486566543579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072977, + "balance_loss_mlp": 1.03664172, + "epoch": 0.6571758368603309, + "flos": 539399761920.0, + "grad_norm": 0.08390902906870049, + "language_loss": 0.80793774, + "learning_rate": 0.00027782008661651406, + "loss": 0.81866741, + "num_input_tokens_seen": 283651696, + "router_z_loss_mlp": 0.36352539, + "step": 3416, + "time_per_iteration": 2.762639045715332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071461, + "balance_loss_mlp": 1.03588891, + "epoch": 0.6573682185455945, + "flos": 497088880128.0, + "grad_norm": 0.049698407396127284, + "language_loss": 0.87283665, + "learning_rate": 0.00027754103508560013, + "loss": 0.8835513, + "num_input_tokens_seen": 283721824, + "router_z_loss_mlp": 0.35620117, + "step": 3417, + "time_per_iteration": 2.5768332481384277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070469, + "balance_loss_mlp": 1.03389549, + "epoch": 0.657560600230858, + "flos": 447244111872.0, + "grad_norm": 0.06621650904732551, + "language_loss": 0.8256399, + "learning_rate": 0.0002772620699209163, + "loss": 0.83634454, + "num_input_tokens_seen": 283786960, + "router_z_loss_mlp": 0.36572266, + "step": 3418, + "time_per_iteration": 2.5885636806488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072181, + "balance_loss_mlp": 1.03606033, + "epoch": 0.6577529819161216, + "flos": 481696630272.0, + "grad_norm": 0.053979947748841836, + "language_loss": 0.80128914, + "learning_rate": 0.0002769831912307658, + "loss": 0.81201094, + "num_input_tokens_seen": 283853808, + "router_z_loss_mlp": 0.36157227, + "step": 3419, + "time_per_iteration": 2.51863169670105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010709, + "balance_loss_mlp": 1.0346607, + "epoch": 0.6579453636013851, + "flos": 530589878784.0, + "grad_norm": 0.061422994023147534, + "language_loss": 0.80013275, + "learning_rate": 0.00027670439912341917, + "loss": 0.81084168, + "num_input_tokens_seen": 283920960, + "router_z_loss_mlp": 0.36254883, + "step": 3420, + "time_per_iteration": 2.595789670944214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067388, + "balance_loss_mlp": 1.03117275, + "epoch": 0.6581377452866487, + "flos": 627737596416.0, + "grad_norm": 0.0471415503067176, + "language_loss": 0.8344667, + "learning_rate": 0.0002764256937071129, + "loss": 0.84514058, + "num_input_tokens_seen": 283992416, + "router_z_loss_mlp": 0.36230469, + "step": 3421, + "time_per_iteration": 2.7812321186065674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075792, + "balance_loss_mlp": 1.03886116, + "epoch": 0.6583301269719123, + "flos": 548355211776.0, + "grad_norm": 0.05116368726028845, + "language_loss": 0.86894339, + "learning_rate": 0.00027614707509005036, + "loss": 0.87970132, + "num_input_tokens_seen": 284061760, + "router_z_loss_mlp": 0.36889648, + "step": 3422, + "time_per_iteration": 2.6573753356933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069799, + "balance_loss_mlp": 1.03401232, + "epoch": 0.6585225086571759, + "flos": 427268639232.0, + "grad_norm": 0.053946906539649876, + "language_loss": 0.7900126, + "learning_rate": 0.0002758685433804008, + "loss": 0.80071056, + "num_input_tokens_seen": 284124848, + "router_z_loss_mlp": 0.35839844, + "step": 3423, + "time_per_iteration": 2.4556972980499268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075371, + "balance_loss_mlp": 1.03758192, + "epoch": 0.6587148903424394, + "flos": 859264657920.0, + "grad_norm": 0.05746906751203771, + "language_loss": 0.79022425, + "learning_rate": 0.00027559009868630005, + "loss": 0.80097795, + "num_input_tokens_seen": 284206272, + "router_z_loss_mlp": 0.37768555, + "step": 3424, + "time_per_iteration": 3.0918102264404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068275, + "balance_loss_mlp": 1.03067625, + "epoch": 0.6589072720277029, + "flos": 805280417280.0, + "grad_norm": 0.05909134726698472, + "language_loss": 0.7990104, + "learning_rate": 0.0002753117411158491, + "loss": 0.8096931, + "num_input_tokens_seen": 284293696, + "router_z_loss_mlp": 0.37573242, + "step": 3425, + "time_per_iteration": 3.0557546615600586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074885, + "balance_loss_mlp": 1.03769183, + "epoch": 0.6590996537129665, + "flos": 548355211776.0, + "grad_norm": 0.0487398796366246, + "language_loss": 0.89624393, + "learning_rate": 0.0002750334707771168, + "loss": 0.90699285, + "num_input_tokens_seen": 284360192, + "router_z_loss_mlp": 0.37158203, + "step": 3426, + "time_per_iteration": 2.6186933517456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107782, + "balance_loss_mlp": 1.03991175, + "epoch": 0.6592920353982301, + "flos": 453931195392.0, + "grad_norm": 0.09520851451243123, + "language_loss": 0.81130987, + "learning_rate": 0.0002747552877781369, + "loss": 0.82208812, + "num_input_tokens_seen": 284423680, + "router_z_loss_mlp": 0.37866211, + "step": 3427, + "time_per_iteration": 2.4979238510131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068038, + "balance_loss_mlp": 1.03086865, + "epoch": 0.6594844170834937, + "flos": 566903328768.0, + "grad_norm": 0.04689884727267459, + "language_loss": 0.81804323, + "learning_rate": 0.0002744771922269097, + "loss": 0.82872361, + "num_input_tokens_seen": 284495712, + "router_z_loss_mlp": 0.37158203, + "step": 3428, + "time_per_iteration": 2.740729808807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075113, + "balance_loss_mlp": 1.03768158, + "epoch": 0.6596767987687572, + "flos": 1187452016640.0, + "grad_norm": 0.05881296297664234, + "language_loss": 0.81886125, + "learning_rate": 0.0002741991842314015, + "loss": 0.82961237, + "num_input_tokens_seen": 284583440, + "router_z_loss_mlp": 0.37426758, + "step": 3429, + "time_per_iteration": 3.4745006561279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071953, + "balance_loss_mlp": 1.03506947, + "epoch": 0.6598691804540208, + "flos": 503247845376.0, + "grad_norm": 0.05507751278667406, + "language_loss": 0.85868287, + "learning_rate": 0.0002739212638995445, + "loss": 0.86940235, + "num_input_tokens_seen": 284649168, + "router_z_loss_mlp": 0.3684082, + "step": 3430, + "time_per_iteration": 2.532402515411377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070704, + "balance_loss_mlp": 1.033463, + "epoch": 0.6600615621392844, + "flos": 531072916992.0, + "grad_norm": 0.05565442756862113, + "language_loss": 0.83027416, + "learning_rate": 0.00027364343133923696, + "loss": 0.84098119, + "num_input_tokens_seen": 284723136, + "router_z_loss_mlp": 0.37231445, + "step": 3431, + "time_per_iteration": 2.630985736846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077517, + "balance_loss_mlp": 1.0396086, + "epoch": 0.6602539438245479, + "flos": 565170435072.0, + "grad_norm": 0.06720345334853779, + "language_loss": 0.82615936, + "learning_rate": 0.0002733656866583431, + "loss": 0.83693457, + "num_input_tokens_seen": 284792752, + "router_z_loss_mlp": 0.37890625, + "step": 3432, + "time_per_iteration": 2.6693778038024902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072775, + "balance_loss_mlp": 1.0354147, + "epoch": 0.6604463255098114, + "flos": 856802824704.0, + "grad_norm": 0.05437523875977016, + "language_loss": 0.82810867, + "learning_rate": 0.0002730880299646927, + "loss": 0.83883643, + "num_input_tokens_seen": 284871008, + "router_z_loss_mlp": 0.37329102, + "step": 3433, + "time_per_iteration": 3.047272205352783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072216, + "balance_loss_mlp": 1.03540444, + "epoch": 0.660638707195075, + "flos": 674158837248.0, + "grad_norm": 0.05169361023924996, + "language_loss": 0.85458863, + "learning_rate": 0.0002728104613660821, + "loss": 0.86531085, + "num_input_tokens_seen": 284945184, + "router_z_loss_mlp": 0.36791992, + "step": 3434, + "time_per_iteration": 2.8202831745147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010658, + "balance_loss_mlp": 1.02879786, + "epoch": 0.6608310888803386, + "flos": 888572339712.0, + "grad_norm": 0.05115304739976813, + "language_loss": 0.83194226, + "learning_rate": 0.0002725329809702729, + "loss": 0.84260029, + "num_input_tokens_seen": 285029296, + "router_z_loss_mlp": 0.36962891, + "step": 3435, + "time_per_iteration": 3.228891134262085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071505, + "balance_loss_mlp": 1.03376281, + "epoch": 0.6610234705656022, + "flos": 1135909260288.0, + "grad_norm": 0.06628416389045559, + "language_loss": 0.75631964, + "learning_rate": 0.0002722555888849921, + "loss": 0.76703465, + "num_input_tokens_seen": 285124720, + "router_z_loss_mlp": 0.37695312, + "step": 3436, + "time_per_iteration": 3.422288179397583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068961, + "balance_loss_mlp": 1.03212583, + "epoch": 0.6612158522508658, + "flos": 467776816128.0, + "grad_norm": 0.05048111401896507, + "language_loss": 0.80400562, + "learning_rate": 0.00027197828521793334, + "loss": 0.81469518, + "num_input_tokens_seen": 285191360, + "router_z_loss_mlp": 0.36816406, + "step": 3437, + "time_per_iteration": 2.4787607192993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073991, + "balance_loss_mlp": 1.03686941, + "epoch": 0.6614082339361292, + "flos": 571374480384.0, + "grad_norm": 0.05876416837727376, + "language_loss": 0.84865153, + "learning_rate": 0.0002717010700767552, + "loss": 0.85939145, + "num_input_tokens_seen": 285262624, + "router_z_loss_mlp": 0.37109375, + "step": 3438, + "time_per_iteration": 2.740835189819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071354, + "balance_loss_mlp": 1.03444707, + "epoch": 0.6616006156213928, + "flos": 498220872192.0, + "grad_norm": 0.06865546708014894, + "language_loss": 0.75838953, + "learning_rate": 0.00027142394356908226, + "loss": 0.76910305, + "num_input_tokens_seen": 285328512, + "router_z_loss_mlp": 0.36889648, + "step": 3439, + "time_per_iteration": 2.5476725101470947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067365, + "balance_loss_mlp": 1.03021967, + "epoch": 0.6617929973066564, + "flos": 602124074496.0, + "grad_norm": 0.05819778232686783, + "language_loss": 0.85115051, + "learning_rate": 0.00027114690580250456, + "loss": 0.86182415, + "num_input_tokens_seen": 285406128, + "router_z_loss_mlp": 0.37133789, + "step": 3440, + "time_per_iteration": 2.746610403060913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072245, + "balance_loss_mlp": 1.03562403, + "epoch": 0.66198537899192, + "flos": 522731515392.0, + "grad_norm": 0.053821887104205664, + "language_loss": 0.86748421, + "learning_rate": 0.0002708699568845776, + "loss": 0.87820661, + "num_input_tokens_seen": 285474704, + "router_z_loss_mlp": 0.36621094, + "step": 3441, + "time_per_iteration": 2.6001980304718018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046259, + "balance_loss_mlp": 1.0328126, + "epoch": 0.6621777606771835, + "flos": 1565421230592.0, + "grad_norm": 0.030021604030083596, + "language_loss": 0.79287779, + "learning_rate": 0.00027059309692282265, + "loss": 0.80334044, + "num_input_tokens_seen": 285698704, + "router_z_loss_mlp": 0.13476562, + "step": 3442, + "time_per_iteration": 4.909358263015747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075884, + "balance_loss_mlp": 1.03933442, + "epoch": 0.6623701423624471, + "flos": 526409708544.0, + "grad_norm": 0.050122845180299073, + "language_loss": 0.83157456, + "learning_rate": 0.0002703163260247261, + "loss": 0.84233344, + "num_input_tokens_seen": 285767936, + "router_z_loss_mlp": 0.36547852, + "step": 3443, + "time_per_iteration": 2.600733757019043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074894, + "balance_loss_mlp": 1.03853548, + "epoch": 0.6625625240477107, + "flos": 527921432064.0, + "grad_norm": 0.07644437952185021, + "language_loss": 0.81613672, + "learning_rate": 0.0002700396442977399, + "loss": 0.8268857, + "num_input_tokens_seen": 285839456, + "router_z_loss_mlp": 0.36376953, + "step": 3444, + "time_per_iteration": 2.598722457885742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080077, + "balance_loss_mlp": 1.04312193, + "epoch": 0.6627549057329742, + "flos": 472854661632.0, + "grad_norm": 0.05132438186678615, + "language_loss": 0.84284377, + "learning_rate": 0.0002697630518492817, + "loss": 0.85364461, + "num_input_tokens_seen": 285905904, + "router_z_loss_mlp": 0.36938477, + "step": 3445, + "time_per_iteration": 2.6794075965881348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078368, + "balance_loss_mlp": 1.04253387, + "epoch": 0.6629472874182378, + "flos": 527743931904.0, + "grad_norm": 0.05491144350541831, + "language_loss": 0.8564226, + "learning_rate": 0.0002694865487867343, + "loss": 0.86720634, + "num_input_tokens_seen": 285975520, + "router_z_loss_mlp": 0.35888672, + "step": 3446, + "time_per_iteration": 2.643427848815918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081911, + "balance_loss_mlp": 1.04540932, + "epoch": 0.6631396691035013, + "flos": 612906960384.0, + "grad_norm": 0.04980385474467639, + "language_loss": 0.84496373, + "learning_rate": 0.0002692101352174453, + "loss": 0.85578281, + "num_input_tokens_seen": 286050320, + "router_z_loss_mlp": 0.36499023, + "step": 3447, + "time_per_iteration": 2.750990629196167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077753, + "balance_loss_mlp": 1.04106009, + "epoch": 0.6633320507887649, + "flos": 609041092608.0, + "grad_norm": 0.05216047224803115, + "language_loss": 0.8459692, + "learning_rate": 0.00026893381124872787, + "loss": 0.85674667, + "num_input_tokens_seen": 286120672, + "router_z_loss_mlp": 0.3671875, + "step": 3448, + "time_per_iteration": 2.7701821327209473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074873, + "balance_loss_mlp": 1.03839493, + "epoch": 0.6635244324740285, + "flos": 749342112768.0, + "grad_norm": 0.05521376247242365, + "language_loss": 0.80839992, + "learning_rate": 0.00026865757698786097, + "loss": 0.81914866, + "num_input_tokens_seen": 286201152, + "router_z_loss_mlp": 0.36499023, + "step": 3449, + "time_per_iteration": 3.046751022338867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079305, + "balance_loss_mlp": 1.04382825, + "epoch": 0.6637168141592921, + "flos": 664222754304.0, + "grad_norm": 0.05057031991468663, + "language_loss": 0.8206256, + "learning_rate": 0.000268381432542088, + "loss": 0.83141863, + "num_input_tokens_seen": 286274512, + "router_z_loss_mlp": 0.35546875, + "step": 3450, + "time_per_iteration": 2.7903122901916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078156, + "balance_loss_mlp": 1.04117751, + "epoch": 0.6639091958445555, + "flos": 606500683776.0, + "grad_norm": 0.05221239612866202, + "language_loss": 0.7978282, + "learning_rate": 0.00026810537801861807, + "loss": 0.80860978, + "num_input_tokens_seen": 286349808, + "router_z_loss_mlp": 0.36938477, + "step": 3451, + "time_per_iteration": 2.7744555473327637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078626, + "balance_loss_mlp": 1.04200482, + "epoch": 0.6641015775298191, + "flos": 476452869120.0, + "grad_norm": 0.04982593193554921, + "language_loss": 0.81320304, + "learning_rate": 0.0002678294135246243, + "loss": 0.82398927, + "num_input_tokens_seen": 286422912, + "router_z_loss_mlp": 0.36621094, + "step": 3452, + "time_per_iteration": 2.748623847961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107526, + "balance_loss_mlp": 1.03902042, + "epoch": 0.6642939592150827, + "flos": 903746391552.0, + "grad_norm": 0.05075048748752087, + "language_loss": 0.86122698, + "learning_rate": 0.0002675535391672463, + "loss": 0.87197959, + "num_input_tokens_seen": 286501072, + "router_z_loss_mlp": 0.36230469, + "step": 3453, + "time_per_iteration": 3.0941269397735596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075595, + "balance_loss_mlp": 1.03995168, + "epoch": 0.6644863409003463, + "flos": 581527351296.0, + "grad_norm": 0.04705931875685086, + "language_loss": 0.85942483, + "learning_rate": 0.0002672777550535877, + "loss": 0.87018085, + "num_input_tokens_seen": 286580480, + "router_z_loss_mlp": 0.35668945, + "step": 3454, + "time_per_iteration": 2.782492160797119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077961, + "balance_loss_mlp": 1.04222202, + "epoch": 0.6646787225856099, + "flos": 478761933312.0, + "grad_norm": 0.05883776733050642, + "language_loss": 0.84943002, + "learning_rate": 0.00026700206129071747, + "loss": 0.86020958, + "num_input_tokens_seen": 286646208, + "router_z_loss_mlp": 0.35791016, + "step": 3455, + "time_per_iteration": 2.524601697921753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074329, + "balance_loss_mlp": 1.0389235, + "epoch": 0.6648711042708734, + "flos": 449676831744.0, + "grad_norm": 0.058012568255648024, + "language_loss": 0.88879943, + "learning_rate": 0.00026672645798566925, + "loss": 0.89954275, + "num_input_tokens_seen": 286710624, + "router_z_loss_mlp": 0.35449219, + "step": 3456, + "time_per_iteration": 2.532412528991699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072913, + "balance_loss_mlp": 1.03745985, + "epoch": 0.665063485956137, + "flos": 858553095168.0, + "grad_norm": 0.053261627047558845, + "language_loss": 0.79371452, + "learning_rate": 0.00026645094524544225, + "loss": 0.8044436, + "num_input_tokens_seen": 286799472, + "router_z_loss_mlp": 0.35473633, + "step": 3457, + "time_per_iteration": 3.2936151027679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068323, + "balance_loss_mlp": 1.03229845, + "epoch": 0.6652558676414005, + "flos": 604024293888.0, + "grad_norm": 0.04836928796010222, + "language_loss": 0.75254017, + "learning_rate": 0.00026617552317699945, + "loss": 0.76322341, + "num_input_tokens_seen": 286874752, + "router_z_loss_mlp": 0.36035156, + "step": 3458, + "time_per_iteration": 2.781972646713257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072016, + "balance_loss_mlp": 1.03651559, + "epoch": 0.6654482493266641, + "flos": 510141542400.0, + "grad_norm": 0.05402195072483101, + "language_loss": 0.87006921, + "learning_rate": 0.0002659001918872693, + "loss": 0.88078934, + "num_input_tokens_seen": 286943312, + "router_z_loss_mlp": 0.35546875, + "step": 3459, + "time_per_iteration": 2.586364507675171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073402, + "balance_loss_mlp": 1.03790104, + "epoch": 0.6656406310119277, + "flos": 565342142976.0, + "grad_norm": 0.06009221273725258, + "language_loss": 0.80872095, + "learning_rate": 0.0002656249514831449, + "loss": 0.81945497, + "num_input_tokens_seen": 287010000, + "router_z_loss_mlp": 0.35522461, + "step": 3460, + "time_per_iteration": 2.6385302543640137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072951, + "balance_loss_mlp": 1.03652048, + "epoch": 0.6658330126971912, + "flos": 1023859533312.0, + "grad_norm": 0.05794846268474579, + "language_loss": 0.86832029, + "learning_rate": 0.00026534980207148416, + "loss": 0.87904978, + "num_input_tokens_seen": 287101456, + "router_z_loss_mlp": 0.36425781, + "step": 3461, + "time_per_iteration": 3.388073205947876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074265, + "balance_loss_mlp": 1.03869295, + "epoch": 0.6660253943824548, + "flos": 816472147968.0, + "grad_norm": 0.06339025189442228, + "language_loss": 0.7302506, + "learning_rate": 0.0002650747437591097, + "loss": 0.74099326, + "num_input_tokens_seen": 287182848, + "router_z_loss_mlp": 0.35595703, + "step": 3462, + "time_per_iteration": 2.980158567428589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021261, + "balance_loss_mlp": 1.00810075, + "epoch": 0.6662177760677184, + "flos": 1495331767296.0, + "grad_norm": 0.02097535909927297, + "language_loss": 0.8187958, + "learning_rate": 0.00026479977665280806, + "loss": 0.82900834, + "num_input_tokens_seen": 287417920, + "router_z_loss_mlp": 0.13183594, + "step": 3463, + "time_per_iteration": 5.0071799755096436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070948, + "balance_loss_mlp": 1.0354948, + "epoch": 0.666410157752982, + "flos": 499875190272.0, + "grad_norm": 0.04521050671951116, + "language_loss": 0.86503369, + "learning_rate": 0.00026452490085933155, + "loss": 0.87574315, + "num_input_tokens_seen": 287483776, + "router_z_loss_mlp": 0.35473633, + "step": 3464, + "time_per_iteration": 2.5450592041015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067553, + "balance_loss_mlp": 1.03212357, + "epoch": 0.6666025394382454, + "flos": 480928402944.0, + "grad_norm": 0.05339724932754041, + "language_loss": 0.89435887, + "learning_rate": 0.00026425011648539614, + "loss": 0.90503436, + "num_input_tokens_seen": 287548176, + "router_z_loss_mlp": 0.35424805, + "step": 3465, + "time_per_iteration": 2.5414719581604004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070322, + "balance_loss_mlp": 1.03377271, + "epoch": 0.666794921123509, + "flos": 546395355648.0, + "grad_norm": 0.05247467659401075, + "language_loss": 0.82117605, + "learning_rate": 0.00026397542363768267, + "loss": 0.83187926, + "num_input_tokens_seen": 287618496, + "router_z_loss_mlp": 0.36547852, + "step": 3466, + "time_per_iteration": 2.659952402114868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071457, + "balance_loss_mlp": 1.03533673, + "epoch": 0.6669873028087726, + "flos": 471750372864.0, + "grad_norm": 0.052441453711620734, + "language_loss": 0.81731021, + "learning_rate": 0.0002637008224228362, + "loss": 0.82802474, + "num_input_tokens_seen": 287684032, + "router_z_loss_mlp": 0.36132812, + "step": 3467, + "time_per_iteration": 2.5569608211517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073874, + "balance_loss_mlp": 1.03875458, + "epoch": 0.6671796844940362, + "flos": 547119912960.0, + "grad_norm": 0.04638174393206939, + "language_loss": 0.84333348, + "learning_rate": 0.00026342631294746653, + "loss": 0.85407221, + "num_input_tokens_seen": 287757680, + "router_z_loss_mlp": 0.3515625, + "step": 3468, + "time_per_iteration": 2.7492995262145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068836, + "balance_loss_mlp": 1.03300142, + "epoch": 0.6673720661792998, + "flos": 1069867547136.0, + "grad_norm": 0.06886465160601114, + "language_loss": 0.80601752, + "learning_rate": 0.0002631518953181476, + "loss": 0.81670582, + "num_input_tokens_seen": 287848992, + "router_z_loss_mlp": 0.35839844, + "step": 3469, + "time_per_iteration": 3.4849367141723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017385, + "balance_loss_mlp": 1.0047015, + "epoch": 0.6675644478645633, + "flos": 1522963372032.0, + "grad_norm": 0.011284556376000376, + "language_loss": 0.76325285, + "learning_rate": 0.000262877569641418, + "loss": 0.77342671, + "num_input_tokens_seen": 288085680, + "router_z_loss_mlp": 0.12695312, + "step": 3470, + "time_per_iteration": 4.8896119594573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073777, + "balance_loss_mlp": 1.03775215, + "epoch": 0.6677568295498268, + "flos": 579410343936.0, + "grad_norm": 0.05100561036949307, + "language_loss": 0.8019954, + "learning_rate": 0.00026260333602377985, + "loss": 0.81273311, + "num_input_tokens_seen": 288161568, + "router_z_loss_mlp": 0.36035156, + "step": 3471, + "time_per_iteration": 2.7527613639831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069612, + "balance_loss_mlp": 1.03370583, + "epoch": 0.6679492112350904, + "flos": 383722458624.0, + "grad_norm": 0.06457573009444674, + "language_loss": 0.86992371, + "learning_rate": 0.0002623291945717007, + "loss": 0.88061988, + "num_input_tokens_seen": 288224032, + "router_z_loss_mlp": 0.35913086, + "step": 3472, + "time_per_iteration": 2.4496309757232666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067849, + "balance_loss_mlp": 1.03158569, + "epoch": 0.668141592920354, + "flos": 1150297555968.0, + "grad_norm": 0.0483341926082761, + "language_loss": 0.83728033, + "learning_rate": 0.00026205514539161175, + "loss": 0.84795886, + "num_input_tokens_seen": 288312912, + "router_z_loss_mlp": 0.36254883, + "step": 3473, + "time_per_iteration": 3.518329620361328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072278, + "balance_loss_mlp": 1.03682494, + "epoch": 0.6683339746056175, + "flos": 560804000256.0, + "grad_norm": 0.054398972389199884, + "language_loss": 0.84145987, + "learning_rate": 0.00026178118858990773, + "loss": 0.85218263, + "num_input_tokens_seen": 288394224, + "router_z_loss_mlp": 0.35449219, + "step": 3474, + "time_per_iteration": 2.848719596862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068236, + "balance_loss_mlp": 1.0318768, + "epoch": 0.6685263562908811, + "flos": 514051080192.0, + "grad_norm": 0.060039795644517814, + "language_loss": 0.84093618, + "learning_rate": 0.0002615073242729483, + "loss": 0.85161853, + "num_input_tokens_seen": 288462976, + "router_z_loss_mlp": 0.36352539, + "step": 3475, + "time_per_iteration": 2.648353099822998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070134, + "balance_loss_mlp": 1.03382277, + "epoch": 0.6687187379761447, + "flos": 629466107904.0, + "grad_norm": 0.05046564119076302, + "language_loss": 0.84281248, + "learning_rate": 0.0002612335525470573, + "loss": 0.85351384, + "num_input_tokens_seen": 288542032, + "router_z_loss_mlp": 0.36352539, + "step": 3476, + "time_per_iteration": 2.792809247970581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065913, + "balance_loss_mlp": 1.03096104, + "epoch": 0.6689111196614083, + "flos": 535312723968.0, + "grad_norm": 0.05473638804270082, + "language_loss": 0.78341687, + "learning_rate": 0.0002609598735185221, + "loss": 0.79407597, + "num_input_tokens_seen": 288610704, + "router_z_loss_mlp": 0.35009766, + "step": 3477, + "time_per_iteration": 2.64404559135437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070177, + "balance_loss_mlp": 1.03489089, + "epoch": 0.6691035013466718, + "flos": 602758471680.0, + "grad_norm": 0.0937067542198485, + "language_loss": 0.82979453, + "learning_rate": 0.00026068628729359445, + "loss": 0.84049624, + "num_input_tokens_seen": 288686080, + "router_z_loss_mlp": 0.35327148, + "step": 3478, + "time_per_iteration": 2.749631404876709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071211, + "balance_loss_mlp": 1.03640211, + "epoch": 0.6692958830319353, + "flos": 632539017216.0, + "grad_norm": 0.04937335272714273, + "language_loss": 0.7616291, + "learning_rate": 0.00026041279397848996, + "loss": 0.77234125, + "num_input_tokens_seen": 288764944, + "router_z_loss_mlp": 0.34838867, + "step": 3479, + "time_per_iteration": 2.839651584625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072082, + "balance_loss_mlp": 1.03693914, + "epoch": 0.6694882647171989, + "flos": 645153721344.0, + "grad_norm": 0.04802288968176994, + "language_loss": 0.8253727, + "learning_rate": 0.00026013939367938797, + "loss": 0.83609354, + "num_input_tokens_seen": 288847856, + "router_z_loss_mlp": 0.35180664, + "step": 3480, + "time_per_iteration": 2.8756163120269775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072592, + "balance_loss_mlp": 1.03861761, + "epoch": 0.6696806464024625, + "flos": 569292378624.0, + "grad_norm": 0.05111387659739007, + "language_loss": 0.81035048, + "learning_rate": 0.00025986608650243204, + "loss": 0.82107639, + "num_input_tokens_seen": 288929360, + "router_z_loss_mlp": 0.34008789, + "step": 3481, + "time_per_iteration": 2.780930757522583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107265, + "balance_loss_mlp": 1.03762627, + "epoch": 0.6698730280877261, + "flos": 622386146304.0, + "grad_norm": 0.11620710974574953, + "language_loss": 0.79299992, + "learning_rate": 0.0002595928725537293, + "loss": 0.80372643, + "num_input_tokens_seen": 289010160, + "router_z_loss_mlp": 0.35058594, + "step": 3482, + "time_per_iteration": 2.8551175594329834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071879, + "balance_loss_mlp": 1.03642654, + "epoch": 0.6700654097729896, + "flos": 502258447872.0, + "grad_norm": 0.05059450730585095, + "language_loss": 0.88189447, + "learning_rate": 0.0002593197519393509, + "loss": 0.89261329, + "num_input_tokens_seen": 289077392, + "router_z_loss_mlp": 0.35449219, + "step": 3483, + "time_per_iteration": 2.556617021560669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070709, + "balance_loss_mlp": 1.03637671, + "epoch": 0.6702577914582531, + "flos": 623567600640.0, + "grad_norm": 0.05152577773762556, + "language_loss": 0.79466176, + "learning_rate": 0.00025904672476533165, + "loss": 0.8053689, + "num_input_tokens_seen": 289157248, + "router_z_loss_mlp": 0.34375, + "step": 3484, + "time_per_iteration": 2.8806934356689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072584, + "balance_loss_mlp": 1.03794122, + "epoch": 0.6704501731435167, + "flos": 456033646080.0, + "grad_norm": 0.06330154522458538, + "language_loss": 0.82820839, + "learning_rate": 0.0002587737911376704, + "loss": 0.83893424, + "num_input_tokens_seen": 289224864, + "router_z_loss_mlp": 0.34643555, + "step": 3485, + "time_per_iteration": 2.6385717391967773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073373, + "balance_loss_mlp": 1.03789639, + "epoch": 0.6706425548287803, + "flos": 542973238272.0, + "grad_norm": 0.04882372942075566, + "language_loss": 0.83671743, + "learning_rate": 0.00025850095116232885, + "loss": 0.84745121, + "num_input_tokens_seen": 289293488, + "router_z_loss_mlp": 0.35498047, + "step": 3486, + "time_per_iteration": 2.6404170989990234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073473, + "balance_loss_mlp": 1.03873491, + "epoch": 0.6708349365140439, + "flos": 633631721472.0, + "grad_norm": 0.0500263981223685, + "language_loss": 0.77869016, + "learning_rate": 0.000258228204945233, + "loss": 0.7894249, + "num_input_tokens_seen": 289370560, + "router_z_loss_mlp": 0.34765625, + "step": 3487, + "time_per_iteration": 2.934980630874634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107374, + "balance_loss_mlp": 1.03964591, + "epoch": 0.6710273181993074, + "flos": 640459989504.0, + "grad_norm": 0.05519065712818486, + "language_loss": 0.84700072, + "learning_rate": 0.00025795555259227254, + "loss": 0.85773814, + "num_input_tokens_seen": 289440096, + "router_z_loss_mlp": 0.34130859, + "step": 3488, + "time_per_iteration": 2.7644948959350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071185, + "balance_loss_mlp": 1.03720999, + "epoch": 0.671219699884571, + "flos": 553673166336.0, + "grad_norm": 0.13608492094864486, + "language_loss": 0.8373906, + "learning_rate": 0.00025768299420930046, + "loss": 0.84810245, + "num_input_tokens_seen": 289515808, + "router_z_loss_mlp": 0.33984375, + "step": 3489, + "time_per_iteration": 2.718442916870117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072555, + "balance_loss_mlp": 1.03700686, + "epoch": 0.6714120815698346, + "flos": 731191256064.0, + "grad_norm": 0.05259417787616518, + "language_loss": 0.83743513, + "learning_rate": 0.0002574105299021332, + "loss": 0.84816062, + "num_input_tokens_seen": 289591344, + "router_z_loss_mlp": 0.35571289, + "step": 3490, + "time_per_iteration": 2.8551361560821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069241, + "balance_loss_mlp": 1.03440833, + "epoch": 0.6716044632550981, + "flos": 688344901632.0, + "grad_norm": 0.0512424310438266, + "language_loss": 0.84138238, + "learning_rate": 0.00025713815977655084, + "loss": 0.85207486, + "num_input_tokens_seen": 289672032, + "router_z_loss_mlp": 0.34863281, + "step": 3491, + "time_per_iteration": 2.8758041858673096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107046, + "balance_loss_mlp": 1.03700948, + "epoch": 0.6717968449403616, + "flos": 460391316480.0, + "grad_norm": 0.05311776823475344, + "language_loss": 0.84021199, + "learning_rate": 0.0002568658839382969, + "loss": 0.85091662, + "num_input_tokens_seen": 289738304, + "router_z_loss_mlp": 0.3347168, + "step": 3492, + "time_per_iteration": 2.5461535453796387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066732, + "balance_loss_mlp": 1.03259087, + "epoch": 0.6719892266256252, + "flos": 501362182656.0, + "grad_norm": 0.0636144820373753, + "language_loss": 0.84432656, + "learning_rate": 0.00025659370249307814, + "loss": 0.85499388, + "num_input_tokens_seen": 289804304, + "router_z_loss_mlp": 0.34179688, + "step": 3493, + "time_per_iteration": 2.5833051204681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066615, + "balance_loss_mlp": 1.03094745, + "epoch": 0.6721816083108888, + "flos": 683223386112.0, + "grad_norm": 0.056507935755291845, + "language_loss": 0.84795702, + "learning_rate": 0.00025632161554656473, + "loss": 0.85862321, + "num_input_tokens_seen": 289877696, + "router_z_loss_mlp": 0.35717773, + "step": 3494, + "time_per_iteration": 2.852865219116211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065933, + "balance_loss_mlp": 1.03067088, + "epoch": 0.6723739899961524, + "flos": 585544578048.0, + "grad_norm": 0.05119219920681276, + "language_loss": 0.82001173, + "learning_rate": 0.00025604962320439017, + "loss": 0.83067107, + "num_input_tokens_seen": 289947296, + "router_z_loss_mlp": 0.35327148, + "step": 3495, + "time_per_iteration": 2.6681125164031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068608, + "balance_loss_mlp": 1.03334618, + "epoch": 0.672566371681416, + "flos": 506336721408.0, + "grad_norm": 0.06376768707456672, + "language_loss": 0.82132721, + "learning_rate": 0.0002557777255721516, + "loss": 0.83201331, + "num_input_tokens_seen": 290020080, + "router_z_loss_mlp": 0.35302734, + "step": 3496, + "time_per_iteration": 2.688211441040039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066305, + "balance_loss_mlp": 1.03142464, + "epoch": 0.6727587533666795, + "flos": 535405856256.0, + "grad_norm": 0.061511790914054676, + "language_loss": 0.80550486, + "learning_rate": 0.0002555059227554087, + "loss": 0.81616795, + "num_input_tokens_seen": 290094544, + "router_z_loss_mlp": 0.34912109, + "step": 3497, + "time_per_iteration": 2.6755480766296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107069, + "balance_loss_mlp": 1.03588057, + "epoch": 0.672951135051943, + "flos": 602532919296.0, + "grad_norm": 0.08077616236025223, + "language_loss": 0.77663779, + "learning_rate": 0.00025523421485968453, + "loss": 0.78734469, + "num_input_tokens_seen": 290173520, + "router_z_loss_mlp": 0.34838867, + "step": 3498, + "time_per_iteration": 2.782900333404541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067117, + "balance_loss_mlp": 1.0330708, + "epoch": 0.6731435167372066, + "flos": 810976693248.0, + "grad_norm": 0.05548957560218429, + "language_loss": 0.85524929, + "learning_rate": 0.00025496260199046585, + "loss": 0.86592042, + "num_input_tokens_seen": 290248240, + "router_z_loss_mlp": 0.34082031, + "step": 3499, + "time_per_iteration": 2.9468865394592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070869, + "balance_loss_mlp": 1.0354166, + "epoch": 0.6733358984224702, + "flos": 611306486784.0, + "grad_norm": 0.05533117407316435, + "language_loss": 0.84011221, + "learning_rate": 0.000254691084253202, + "loss": 0.8508209, + "num_input_tokens_seen": 290326288, + "router_z_loss_mlp": 0.35473633, + "step": 3500, + "time_per_iteration": 2.7936129570007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107048, + "balance_loss_mlp": 1.03607607, + "epoch": 0.6735282801077337, + "flos": 558636120576.0, + "grad_norm": 0.06619060652022955, + "language_loss": 0.77001846, + "learning_rate": 0.00025441966175330567, + "loss": 0.78072333, + "num_input_tokens_seen": 290395984, + "router_z_loss_mlp": 0.34423828, + "step": 3501, + "time_per_iteration": 2.7096900939941406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073276, + "balance_loss_mlp": 1.03737032, + "epoch": 0.6737206617929973, + "flos": 672134962176.0, + "grad_norm": 0.04835122337119983, + "language_loss": 0.79766667, + "learning_rate": 0.00025414833459615183, + "loss": 0.80839938, + "num_input_tokens_seen": 290470224, + "router_z_loss_mlp": 0.35913086, + "step": 3502, + "time_per_iteration": 2.787539482116699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075889, + "balance_loss_mlp": 1.03933966, + "epoch": 0.6739130434782609, + "flos": 633148683264.0, + "grad_norm": 0.05358836017753152, + "language_loss": 0.80260807, + "learning_rate": 0.0002538771028870796, + "loss": 0.81336701, + "num_input_tokens_seen": 290542864, + "router_z_loss_mlp": 0.36547852, + "step": 3503, + "time_per_iteration": 2.7826414108276367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077652, + "balance_loss_mlp": 1.04224694, + "epoch": 0.6741054251635245, + "flos": 531171841536.0, + "grad_norm": 0.07580622934543835, + "language_loss": 0.81591624, + "learning_rate": 0.0002536059667313903, + "loss": 0.82669276, + "num_input_tokens_seen": 290617248, + "router_z_loss_mlp": 0.35424805, + "step": 3504, + "time_per_iteration": 2.7296247482299805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107121, + "balance_loss_mlp": 1.03551888, + "epoch": 0.674297806848788, + "flos": 542343223296.0, + "grad_norm": 0.056073772887399426, + "language_loss": 0.8900978, + "learning_rate": 0.0002533349262343483, + "loss": 0.90080988, + "num_input_tokens_seen": 290690112, + "router_z_loss_mlp": 0.35742188, + "step": 3505, + "time_per_iteration": 2.674409866333008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107348, + "balance_loss_mlp": 1.03828955, + "epoch": 0.6744901885340515, + "flos": 463291107840.0, + "grad_norm": 0.05947075073095298, + "language_loss": 0.81730378, + "learning_rate": 0.0002530639815011807, + "loss": 0.82803857, + "num_input_tokens_seen": 290756352, + "router_z_loss_mlp": 0.35229492, + "step": 3506, + "time_per_iteration": 2.497544765472412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068375, + "balance_loss_mlp": 1.0326128, + "epoch": 0.6746825702193151, + "flos": 631533652992.0, + "grad_norm": 0.07086052765097473, + "language_loss": 0.84639049, + "learning_rate": 0.0002527931326370781, + "loss": 0.85707426, + "num_input_tokens_seen": 290829776, + "router_z_loss_mlp": 0.35791016, + "step": 3507, + "time_per_iteration": 2.7526142597198486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069527, + "balance_loss_mlp": 1.03395462, + "epoch": 0.6748749519045787, + "flos": 670835644416.0, + "grad_norm": 0.05093445347334381, + "language_loss": 0.82660782, + "learning_rate": 0.00025252237974719276, + "loss": 0.83730316, + "num_input_tokens_seen": 290900736, + "router_z_loss_mlp": 0.35595703, + "step": 3508, + "time_per_iteration": 2.8549742698669434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107359, + "balance_loss_mlp": 1.03782725, + "epoch": 0.6750673335898423, + "flos": 766756827648.0, + "grad_norm": 0.05329285448866526, + "language_loss": 0.80265921, + "learning_rate": 0.00025225172293664056, + "loss": 0.81339508, + "num_input_tokens_seen": 290981696, + "router_z_loss_mlp": 0.3581543, + "step": 3509, + "time_per_iteration": 2.974613904953003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026572, + "balance_loss_mlp": 1.01465082, + "epoch": 0.6752597152751059, + "flos": 1511786198016.0, + "grad_norm": 0.015514835233315651, + "language_loss": 0.76933134, + "learning_rate": 0.00025198116231049954, + "loss": 0.77959704, + "num_input_tokens_seen": 291217888, + "router_z_loss_mlp": 0.11914062, + "step": 3510, + "time_per_iteration": 4.91582179069519 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072206, + "balance_loss_mlp": 1.03637218, + "epoch": 0.6754520969603693, + "flos": 686990329344.0, + "grad_norm": 0.06350153745428545, + "language_loss": 0.84804261, + "learning_rate": 0.00025171069797381106, + "loss": 0.85876471, + "num_input_tokens_seen": 291287856, + "router_z_loss_mlp": 0.35864258, + "step": 3511, + "time_per_iteration": 2.7993617057800293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066581, + "balance_loss_mlp": 1.0310328, + "epoch": 0.6756444786456329, + "flos": 500318940672.0, + "grad_norm": 0.06118900000736982, + "language_loss": 0.81987178, + "learning_rate": 0.00025144033003157864, + "loss": 0.83053756, + "num_input_tokens_seen": 291354912, + "router_z_loss_mlp": 0.35620117, + "step": 3512, + "time_per_iteration": 2.5873219966888428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069288, + "balance_loss_mlp": 1.03450298, + "epoch": 0.6758368603308965, + "flos": 492357270528.0, + "grad_norm": 0.060009957038895716, + "language_loss": 0.78680366, + "learning_rate": 0.00025117005858876806, + "loss": 0.7974965, + "num_input_tokens_seen": 291426816, + "router_z_loss_mlp": 0.34838867, + "step": 3513, + "time_per_iteration": 2.6835427284240723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069658, + "balance_loss_mlp": 1.03427649, + "epoch": 0.6760292420161601, + "flos": 555657753600.0, + "grad_norm": 0.15540830916665044, + "language_loss": 0.8478874, + "learning_rate": 0.000250899883750308, + "loss": 0.85858399, + "num_input_tokens_seen": 291497648, + "router_z_loss_mlp": 0.35400391, + "step": 3514, + "time_per_iteration": 2.650256395339966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070046, + "balance_loss_mlp": 1.03478396, + "epoch": 0.6762216237014236, + "flos": 607322755584.0, + "grad_norm": 0.06069446103583955, + "language_loss": 0.8186444, + "learning_rate": 0.00025062980562109006, + "loss": 0.82934481, + "num_input_tokens_seen": 291568080, + "router_z_loss_mlp": 0.35302734, + "step": 3515, + "time_per_iteration": 2.7015137672424316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066431, + "balance_loss_mlp": 1.0309782, + "epoch": 0.6764140053866872, + "flos": 533501254656.0, + "grad_norm": 0.06011919218972519, + "language_loss": 0.82936066, + "learning_rate": 0.0002503598243059677, + "loss": 0.84002495, + "num_input_tokens_seen": 291644896, + "router_z_loss_mlp": 0.35473633, + "step": 3516, + "time_per_iteration": 2.7936599254608154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066759, + "balance_loss_mlp": 1.03221166, + "epoch": 0.6766063870719508, + "flos": 504548573184.0, + "grad_norm": 0.0538086785967606, + "language_loss": 0.79831243, + "learning_rate": 0.0002500899399097568, + "loss": 0.80897999, + "num_input_tokens_seen": 291716864, + "router_z_loss_mlp": 0.34594727, + "step": 3517, + "time_per_iteration": 2.647766351699829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068577, + "balance_loss_mlp": 1.03340983, + "epoch": 0.6767987687572143, + "flos": 512923470336.0, + "grad_norm": 0.05682834446853688, + "language_loss": 0.85193241, + "learning_rate": 0.0002498201525372359, + "loss": 0.86261815, + "num_input_tokens_seen": 291786000, + "router_z_loss_mlp": 0.35205078, + "step": 3518, + "time_per_iteration": 2.5557949542999268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064417, + "balance_loss_mlp": 1.03029943, + "epoch": 0.6769911504424779, + "flos": 524780121600.0, + "grad_norm": 0.05092560749530118, + "language_loss": 0.83158201, + "learning_rate": 0.00024955046229314584, + "loss": 0.84222615, + "num_input_tokens_seen": 291854768, + "router_z_loss_mlp": 0.34130859, + "step": 3519, + "time_per_iteration": 2.578089475631714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069846, + "balance_loss_mlp": 1.03422618, + "epoch": 0.6771835321277414, + "flos": 449662275072.0, + "grad_norm": 0.05617502004048809, + "language_loss": 0.87603748, + "learning_rate": 0.00024928086928218947, + "loss": 0.88673592, + "num_input_tokens_seen": 291918096, + "router_z_loss_mlp": 0.35644531, + "step": 3520, + "time_per_iteration": 2.490943193435669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068484, + "balance_loss_mlp": 1.03322208, + "epoch": 0.677375913813005, + "flos": 709020200448.0, + "grad_norm": 0.051602142671676454, + "language_loss": 0.75993657, + "learning_rate": 0.00024901137360903216, + "loss": 0.77062142, + "num_input_tokens_seen": 291998752, + "router_z_loss_mlp": 0.35302734, + "step": 3521, + "time_per_iteration": 2.9075634479522705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073222, + "balance_loss_mlp": 1.03817451, + "epoch": 0.6775682954982686, + "flos": 428189635584.0, + "grad_norm": 0.10231641973637204, + "language_loss": 0.81175685, + "learning_rate": 0.00024874197537830115, + "loss": 0.82248902, + "num_input_tokens_seen": 292065056, + "router_z_loss_mlp": 0.35083008, + "step": 3522, + "time_per_iteration": 2.5057058334350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069237, + "balance_loss_mlp": 1.03478503, + "epoch": 0.6777606771835322, + "flos": 437677585920.0, + "grad_norm": 0.060253133761597404, + "language_loss": 0.83087361, + "learning_rate": 0.00024847267469458684, + "loss": 0.84156603, + "num_input_tokens_seen": 292129248, + "router_z_loss_mlp": 0.3449707, + "step": 3523, + "time_per_iteration": 2.5406739711761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067339, + "balance_loss_mlp": 1.03210068, + "epoch": 0.6779530588687956, + "flos": 775106993664.0, + "grad_norm": 0.0551254373136415, + "language_loss": 0.78231275, + "learning_rate": 0.00024820347166244034, + "loss": 0.79298615, + "num_input_tokens_seen": 292206080, + "router_z_loss_mlp": 0.35302734, + "step": 3524, + "time_per_iteration": 3.021663188934326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064562, + "balance_loss_mlp": 1.03013432, + "epoch": 0.6781454405540592, + "flos": 571502518272.0, + "grad_norm": 0.04412805225967261, + "language_loss": 0.84577274, + "learning_rate": 0.0002479343663863755, + "loss": 0.85641837, + "num_input_tokens_seen": 292280192, + "router_z_loss_mlp": 0.34448242, + "step": 3525, + "time_per_iteration": 2.760934352874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070433, + "balance_loss_mlp": 1.03395486, + "epoch": 0.6783378222393228, + "flos": 484788478464.0, + "grad_norm": 0.051123449842866715, + "language_loss": 0.76749617, + "learning_rate": 0.00024766535897086876, + "loss": 0.77820051, + "num_input_tokens_seen": 292347792, + "router_z_loss_mlp": 0.36474609, + "step": 3526, + "time_per_iteration": 2.5466532707214355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071181, + "balance_loss_mlp": 1.03584695, + "epoch": 0.6785302039245864, + "flos": 482592895488.0, + "grad_norm": 0.04922293189317912, + "language_loss": 0.78913069, + "learning_rate": 0.0002473964495203578, + "loss": 0.79984254, + "num_input_tokens_seen": 292420032, + "router_z_loss_mlp": 0.35351562, + "step": 3527, + "time_per_iteration": 2.65765118598938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072886, + "balance_loss_mlp": 1.03609788, + "epoch": 0.67872258560985, + "flos": 524451262464.0, + "grad_norm": 0.04942804135010068, + "language_loss": 0.85464156, + "learning_rate": 0.0002471276381392425, + "loss": 0.86537039, + "num_input_tokens_seen": 292497792, + "router_z_loss_mlp": 0.36791992, + "step": 3528, + "time_per_iteration": 2.75915265083313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038837, + "balance_loss_mlp": 1.02634406, + "epoch": 0.6789149672951135, + "flos": 1551786605568.0, + "grad_norm": 0.02259283228752806, + "language_loss": 0.78188634, + "learning_rate": 0.0002468589249318848, + "loss": 0.79227471, + "num_input_tokens_seen": 292726704, + "router_z_loss_mlp": 0.125, + "step": 3529, + "time_per_iteration": 4.964378356933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069996, + "balance_loss_mlp": 1.0344243, + "epoch": 0.6791073489803771, + "flos": 741088051200.0, + "grad_norm": 0.05189094051618866, + "language_loss": 0.84224343, + "learning_rate": 0.00024659031000260826, + "loss": 0.85294336, + "num_input_tokens_seen": 292802320, + "router_z_loss_mlp": 0.35595703, + "step": 3530, + "time_per_iteration": 2.8634302616119385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072509, + "balance_loss_mlp": 1.03638899, + "epoch": 0.6792997306656406, + "flos": 576095915520.0, + "grad_norm": 0.055023533803773034, + "language_loss": 0.80543637, + "learning_rate": 0.0002463217934556985, + "loss": 0.81616145, + "num_input_tokens_seen": 292870480, + "router_z_loss_mlp": 0.36132812, + "step": 3531, + "time_per_iteration": 2.632070541381836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01031238, + "balance_loss_mlp": 1.01884079, + "epoch": 0.6794921123509042, + "flos": 1502538356736.0, + "grad_norm": 0.018779116568333653, + "language_loss": 0.7653209, + "learning_rate": 0.000246053375395403, + "loss": 0.77563328, + "num_input_tokens_seen": 293100752, + "router_z_loss_mlp": 0.12402344, + "step": 3532, + "time_per_iteration": 4.7274627685546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073531, + "balance_loss_mlp": 1.03836441, + "epoch": 0.6796844940361677, + "flos": 698620018176.0, + "grad_norm": 0.05756666047667581, + "language_loss": 0.8354668, + "learning_rate": 0.0002457850559259306, + "loss": 0.84620214, + "num_input_tokens_seen": 293178192, + "router_z_loss_mlp": 0.35205078, + "step": 3533, + "time_per_iteration": 2.8860280513763428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074469, + "balance_loss_mlp": 1.03901649, + "epoch": 0.6798768757214313, + "flos": 552496094208.0, + "grad_norm": 0.05133054826538493, + "language_loss": 0.81485093, + "learning_rate": 0.00024551683515145275, + "loss": 0.82559562, + "num_input_tokens_seen": 293246368, + "router_z_loss_mlp": 0.35498047, + "step": 3534, + "time_per_iteration": 2.620476722717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072102, + "balance_loss_mlp": 1.03610086, + "epoch": 0.6800692574066949, + "flos": 522677670912.0, + "grad_norm": 0.04887500327812814, + "language_loss": 0.86479199, + "learning_rate": 0.0002452487131761014, + "loss": 0.87551308, + "num_input_tokens_seen": 293320656, + "router_z_loss_mlp": 0.35986328, + "step": 3535, + "time_per_iteration": 2.7402584552764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069949, + "balance_loss_mlp": 1.03523564, + "epoch": 0.6802616390919585, + "flos": 573747563520.0, + "grad_norm": 0.05056319210769973, + "language_loss": 0.79672563, + "learning_rate": 0.00024498069010397093, + "loss": 0.80742508, + "num_input_tokens_seen": 293388592, + "router_z_loss_mlp": 0.34741211, + "step": 3536, + "time_per_iteration": 2.6493327617645264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076551, + "balance_loss_mlp": 1.04109788, + "epoch": 0.6804540207772221, + "flos": 487915232256.0, + "grad_norm": 0.08967027587321133, + "language_loss": 0.85052317, + "learning_rate": 0.00024471276603911697, + "loss": 0.86128873, + "num_input_tokens_seen": 293453936, + "router_z_loss_mlp": 0.35449219, + "step": 3537, + "time_per_iteration": 2.5946011543273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074683, + "balance_loss_mlp": 1.03946912, + "epoch": 0.6806464024624855, + "flos": 578307465216.0, + "grad_norm": 0.050744450088680546, + "language_loss": 0.78934067, + "learning_rate": 0.0002444449410855572, + "loss": 0.80008757, + "num_input_tokens_seen": 293527664, + "router_z_loss_mlp": 0.35229492, + "step": 3538, + "time_per_iteration": 2.7160799503326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073592, + "balance_loss_mlp": 1.03778172, + "epoch": 0.6808387841477491, + "flos": 553456378368.0, + "grad_norm": 0.0415443850681439, + "language_loss": 0.84257662, + "learning_rate": 0.00024417721534727033, + "loss": 0.85331261, + "num_input_tokens_seen": 293599344, + "router_z_loss_mlp": 0.35864258, + "step": 3539, + "time_per_iteration": 2.6316590309143066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067702, + "balance_loss_mlp": 1.03220177, + "epoch": 0.6810311658330127, + "flos": 426613893120.0, + "grad_norm": 0.06268112342212401, + "language_loss": 0.82995272, + "learning_rate": 0.00024390958892819687, + "loss": 0.8406297, + "num_input_tokens_seen": 293663088, + "router_z_loss_mlp": 0.35546875, + "step": 3540, + "time_per_iteration": 2.4619975090026855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071831, + "balance_loss_mlp": 1.03518569, + "epoch": 0.6812235475182763, + "flos": 571956443136.0, + "grad_norm": 0.047330457395290515, + "language_loss": 0.80951297, + "learning_rate": 0.0002436420619322381, + "loss": 0.82023126, + "num_input_tokens_seen": 293741296, + "router_z_loss_mlp": 0.36645508, + "step": 3541, + "time_per_iteration": 2.814427614212036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070446, + "balance_loss_mlp": 1.03515983, + "epoch": 0.6814159292035398, + "flos": 501648781824.0, + "grad_norm": 0.0608425293250951, + "language_loss": 0.82551098, + "learning_rate": 0.0002433746344632577, + "loss": 0.83621544, + "num_input_tokens_seen": 293815840, + "router_z_loss_mlp": 0.35327148, + "step": 3542, + "time_per_iteration": 2.6463205814361572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069714, + "balance_loss_mlp": 1.03340268, + "epoch": 0.6816083108888034, + "flos": 765176702976.0, + "grad_norm": 0.05597669105837374, + "language_loss": 0.7998035, + "learning_rate": 0.00024310730662508006, + "loss": 0.81050068, + "num_input_tokens_seen": 293896368, + "router_z_loss_mlp": 0.36303711, + "step": 3543, + "time_per_iteration": 3.0262770652770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107287, + "balance_loss_mlp": 1.03787053, + "epoch": 0.681800692574067, + "flos": 479205683712.0, + "grad_norm": 0.05246394950285061, + "language_loss": 0.87412894, + "learning_rate": 0.0002428400785214911, + "loss": 0.88485765, + "num_input_tokens_seen": 293963344, + "router_z_loss_mlp": 0.3503418, + "step": 3544, + "time_per_iteration": 2.6026573181152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072118, + "balance_loss_mlp": 1.03547359, + "epoch": 0.6819930742593305, + "flos": 691298537472.0, + "grad_norm": 0.057535239065408805, + "language_loss": 0.8261283, + "learning_rate": 0.00024257295025623794, + "loss": 0.83684945, + "num_input_tokens_seen": 294035440, + "router_z_loss_mlp": 0.36645508, + "step": 3545, + "time_per_iteration": 2.813525915145874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066625, + "balance_loss_mlp": 1.03059971, + "epoch": 0.6821854559445941, + "flos": 677783185920.0, + "grad_norm": 0.051890775320829655, + "language_loss": 0.80731034, + "learning_rate": 0.00024230592193302892, + "loss": 0.81797659, + "num_input_tokens_seen": 294116944, + "router_z_loss_mlp": 0.3605957, + "step": 3546, + "time_per_iteration": 2.852640151977539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069168, + "balance_loss_mlp": 1.03378654, + "epoch": 0.6823778376298576, + "flos": 461956884480.0, + "grad_norm": 0.04826922291722955, + "language_loss": 0.84192979, + "learning_rate": 0.00024203899365553372, + "loss": 0.85262144, + "num_input_tokens_seen": 294178976, + "router_z_loss_mlp": 0.35424805, + "step": 3547, + "time_per_iteration": 2.51088285446167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018198, + "balance_loss_mlp": 1.00651574, + "epoch": 0.6825702193151212, + "flos": 1474582427136.0, + "grad_norm": 0.01234117563256537, + "language_loss": 0.76734358, + "learning_rate": 0.00024177216552738302, + "loss": 0.77752554, + "num_input_tokens_seen": 294384960, + "router_z_loss_mlp": 0.11669922, + "step": 3548, + "time_per_iteration": 4.512159824371338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069713, + "balance_loss_mlp": 1.03397429, + "epoch": 0.6827626010003848, + "flos": 722791627776.0, + "grad_norm": 0.05201405662428197, + "language_loss": 0.83068311, + "learning_rate": 0.00024150543765216848, + "loss": 0.84138024, + "num_input_tokens_seen": 294461408, + "router_z_loss_mlp": 0.35766602, + "step": 3549, + "time_per_iteration": 2.9022421836853027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066879, + "balance_loss_mlp": 1.03066325, + "epoch": 0.6829549826856484, + "flos": 558596832768.0, + "grad_norm": 0.050492877395882395, + "language_loss": 0.83153272, + "learning_rate": 0.00024123881013344352, + "loss": 0.84220147, + "num_input_tokens_seen": 294530624, + "router_z_loss_mlp": 0.36230469, + "step": 3550, + "time_per_iteration": 2.663245677947998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070068, + "balance_loss_mlp": 1.03509164, + "epoch": 0.6831473643709118, + "flos": 624635573760.0, + "grad_norm": 0.06049149203697264, + "language_loss": 0.79663515, + "learning_rate": 0.00024097228307472202, + "loss": 0.80733585, + "num_input_tokens_seen": 294606784, + "router_z_loss_mlp": 0.35009766, + "step": 3551, + "time_per_iteration": 2.7762739658355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070299, + "balance_loss_mlp": 1.03458428, + "epoch": 0.6833397460561754, + "flos": 713553960960.0, + "grad_norm": 0.05841581019215986, + "language_loss": 0.81410074, + "learning_rate": 0.00024070585657947846, + "loss": 0.82480371, + "num_input_tokens_seen": 294686960, + "router_z_loss_mlp": 0.35717773, + "step": 3552, + "time_per_iteration": 2.8573665618896484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070846, + "balance_loss_mlp": 1.03470206, + "epoch": 0.683532127741439, + "flos": 464449241088.0, + "grad_norm": 0.042320338748993415, + "language_loss": 0.85217428, + "learning_rate": 0.00024043953075114934, + "loss": 0.86288273, + "num_input_tokens_seen": 294759712, + "router_z_loss_mlp": 0.36157227, + "step": 3553, + "time_per_iteration": 2.6308178901672363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106716, + "balance_loss_mlp": 1.03230345, + "epoch": 0.6837245094267026, + "flos": 581979866112.0, + "grad_norm": 0.06353851780596993, + "language_loss": 0.88855463, + "learning_rate": 0.00024017330569313128, + "loss": 0.89922619, + "num_input_tokens_seen": 294830592, + "router_z_loss_mlp": 0.34912109, + "step": 3554, + "time_per_iteration": 2.691176176071167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070226, + "balance_loss_mlp": 1.03415298, + "epoch": 0.6839168911119662, + "flos": 793836993024.0, + "grad_norm": 0.05307417263054524, + "language_loss": 0.74880016, + "learning_rate": 0.0002399071815087821, + "loss": 0.75950241, + "num_input_tokens_seen": 294907504, + "router_z_loss_mlp": 0.36108398, + "step": 3555, + "time_per_iteration": 2.990910530090332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073324, + "balance_loss_mlp": 1.03803802, + "epoch": 0.6841092727972297, + "flos": 579734820864.0, + "grad_norm": 0.05505515245095852, + "language_loss": 0.83355868, + "learning_rate": 0.00023964115830142025, + "loss": 0.84429193, + "num_input_tokens_seen": 294977600, + "router_z_loss_mlp": 0.35327148, + "step": 3556, + "time_per_iteration": 2.6737208366394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070632, + "balance_loss_mlp": 1.03522646, + "epoch": 0.6843016544824932, + "flos": 383530401792.0, + "grad_norm": 0.06254442302238046, + "language_loss": 0.8747263, + "learning_rate": 0.00023937523617432522, + "loss": 0.8854326, + "num_input_tokens_seen": 295039408, + "router_z_loss_mlp": 0.35449219, + "step": 3557, + "time_per_iteration": 2.4377589225769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066745, + "balance_loss_mlp": 1.03176904, + "epoch": 0.6844940361677568, + "flos": 1438474332672.0, + "grad_norm": 0.05391810386575329, + "language_loss": 0.86953497, + "learning_rate": 0.00023910941523073705, + "loss": 0.88020241, + "num_input_tokens_seen": 295142928, + "router_z_loss_mlp": 0.34985352, + "step": 3558, + "time_per_iteration": 3.854933738708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072327, + "balance_loss_mlp": 1.03739882, + "epoch": 0.6846864178530204, + "flos": 520614508032.0, + "grad_norm": 0.05572945475530707, + "language_loss": 0.86660743, + "learning_rate": 0.0002388436955738566, + "loss": 0.87733072, + "num_input_tokens_seen": 295215504, + "router_z_loss_mlp": 0.34960938, + "step": 3559, + "time_per_iteration": 2.6673743724823 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072956, + "balance_loss_mlp": 1.03874326, + "epoch": 0.6848787995382839, + "flos": 717626442240.0, + "grad_norm": 0.051092768918582485, + "language_loss": 0.81714153, + "learning_rate": 0.00023857807730684523, + "loss": 0.82787108, + "num_input_tokens_seen": 295291024, + "router_z_loss_mlp": 0.3425293, + "step": 3560, + "time_per_iteration": 2.8930888175964355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074328, + "balance_loss_mlp": 1.03956604, + "epoch": 0.6850711812235475, + "flos": 510787524096.0, + "grad_norm": 0.06174671890156068, + "language_loss": 0.82387376, + "learning_rate": 0.00023831256053282547, + "loss": 0.83461708, + "num_input_tokens_seen": 295363248, + "router_z_loss_mlp": 0.34790039, + "step": 3561, + "time_per_iteration": 2.6872005462646484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073119, + "balance_loss_mlp": 1.03923941, + "epoch": 0.6852635629088111, + "flos": 667832546304.0, + "grad_norm": 0.051363024529254335, + "language_loss": 0.78085375, + "learning_rate": 0.00023804714535488003, + "loss": 0.79158491, + "num_input_tokens_seen": 295442032, + "router_z_loss_mlp": 0.33911133, + "step": 3562, + "time_per_iteration": 4.3489556312561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008548, + "balance_loss_mlp": 0.9979142, + "epoch": 0.6854559445940747, + "flos": 1522136918016.0, + "grad_norm": 0.005165223405227486, + "language_loss": 0.7980963, + "learning_rate": 0.0002377818318760519, + "loss": 0.80818176, + "num_input_tokens_seen": 295680560, + "router_z_loss_mlp": 0.10644531, + "step": 3563, + "time_per_iteration": 4.906137704849243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072342, + "balance_loss_mlp": 1.03812885, + "epoch": 0.6856483262793382, + "flos": 453970483200.0, + "grad_norm": 0.05119141259642537, + "language_loss": 0.80591673, + "learning_rate": 0.00023751662019934488, + "loss": 0.81664014, + "num_input_tokens_seen": 295745712, + "router_z_loss_mlp": 0.3425293, + "step": 3564, + "time_per_iteration": 2.4906551837921143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071978, + "balance_loss_mlp": 1.03745532, + "epoch": 0.6858407079646017, + "flos": 615269869056.0, + "grad_norm": 0.08282945217506828, + "language_loss": 0.79188418, + "learning_rate": 0.00023725151042772364, + "loss": 0.80260396, + "num_input_tokens_seen": 295815104, + "router_z_loss_mlp": 0.34545898, + "step": 3565, + "time_per_iteration": 2.7048499584198 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075078, + "balance_loss_mlp": 1.04065084, + "epoch": 0.6860330896498653, + "flos": 465793638912.0, + "grad_norm": 0.05470196692680893, + "language_loss": 0.82981157, + "learning_rate": 0.00023698650266411276, + "loss": 0.8405624, + "num_input_tokens_seen": 295882928, + "router_z_loss_mlp": 0.34472656, + "step": 3566, + "time_per_iteration": 2.6011905670166016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072281, + "balance_loss_mlp": 1.03909349, + "epoch": 0.6862254713351289, + "flos": 863879814144.0, + "grad_norm": 0.05579586531854514, + "language_loss": 0.82876581, + "learning_rate": 0.00023672159701139755, + "loss": 0.83948863, + "num_input_tokens_seen": 295970960, + "router_z_loss_mlp": 0.33203125, + "step": 3567, + "time_per_iteration": 3.1918952465057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078201, + "balance_loss_mlp": 1.0438447, + "epoch": 0.6864178530203925, + "flos": 446905078272.0, + "grad_norm": 0.06805670760386738, + "language_loss": 0.85873824, + "learning_rate": 0.00023645679357242296, + "loss": 0.86952031, + "num_input_tokens_seen": 296036128, + "router_z_loss_mlp": 0.34399414, + "step": 3568, + "time_per_iteration": 2.4888172149658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074879, + "balance_loss_mlp": 1.04128623, + "epoch": 0.6866102347056561, + "flos": 424034196480.0, + "grad_norm": 0.05006770232648597, + "language_loss": 0.83895862, + "learning_rate": 0.00023619209244999534, + "loss": 0.84970748, + "num_input_tokens_seen": 296101440, + "router_z_loss_mlp": 0.33618164, + "step": 3569, + "time_per_iteration": 2.502540111541748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107562, + "balance_loss_mlp": 1.04150224, + "epoch": 0.6868026163909196, + "flos": 472134486528.0, + "grad_norm": 0.060913037985659245, + "language_loss": 0.85054779, + "learning_rate": 0.0002359274937468806, + "loss": 0.86130404, + "num_input_tokens_seen": 296165504, + "router_z_loss_mlp": 0.34155273, + "step": 3570, + "time_per_iteration": 2.5016539096832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076781, + "balance_loss_mlp": 1.04263973, + "epoch": 0.6869949980761831, + "flos": 463937089536.0, + "grad_norm": 0.04774464497453654, + "language_loss": 0.778054, + "learning_rate": 0.00023566299756580512, + "loss": 0.78882182, + "num_input_tokens_seen": 296236880, + "router_z_loss_mlp": 0.34179688, + "step": 3571, + "time_per_iteration": 2.6037425994873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076706, + "balance_loss_mlp": 1.04194498, + "epoch": 0.6871873797614467, + "flos": 426012991488.0, + "grad_norm": 0.056784915958369084, + "language_loss": 0.7818104, + "learning_rate": 0.0002353986040094551, + "loss": 0.79257739, + "num_input_tokens_seen": 296299776, + "router_z_loss_mlp": 0.34765625, + "step": 3572, + "time_per_iteration": 2.4650750160217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077273, + "balance_loss_mlp": 1.04286885, + "epoch": 0.6873797614467103, + "flos": 443394210816.0, + "grad_norm": 0.05696789275443238, + "language_loss": 0.7911824, + "learning_rate": 0.00023513431318047796, + "loss": 0.8019551, + "num_input_tokens_seen": 296365408, + "router_z_loss_mlp": 0.34448242, + "step": 3573, + "time_per_iteration": 2.5429108142852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072767, + "balance_loss_mlp": 1.03912568, + "epoch": 0.6875721431319738, + "flos": 991927074816.0, + "grad_norm": 0.06588497554546605, + "language_loss": 0.76656246, + "learning_rate": 0.00023487012518147977, + "loss": 0.77729011, + "num_input_tokens_seen": 296445488, + "router_z_loss_mlp": 0.33666992, + "step": 3574, + "time_per_iteration": 3.2478342056274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074542, + "balance_loss_mlp": 1.03985214, + "epoch": 0.6877645248172374, + "flos": 1285031900160.0, + "grad_norm": 0.05648016172081939, + "language_loss": 0.84123796, + "learning_rate": 0.00023460604011502772, + "loss": 0.85198337, + "num_input_tokens_seen": 296529936, + "router_z_loss_mlp": 0.34692383, + "step": 3575, + "time_per_iteration": 3.6104493141174316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073345, + "balance_loss_mlp": 1.03946543, + "epoch": 0.687956906502501, + "flos": 876360688128.0, + "grad_norm": 0.05234067730424214, + "language_loss": 0.8542276, + "learning_rate": 0.00023434205808364845, + "loss": 0.86496103, + "num_input_tokens_seen": 296607488, + "router_z_loss_mlp": 0.33911133, + "step": 3576, + "time_per_iteration": 3.1311981678009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107559, + "balance_loss_mlp": 1.04142499, + "epoch": 0.6881492881877646, + "flos": 563038871040.0, + "grad_norm": 0.05805523475293479, + "language_loss": 0.8543247, + "learning_rate": 0.00023407817918982932, + "loss": 0.86508065, + "num_input_tokens_seen": 296678672, + "router_z_loss_mlp": 0.34204102, + "step": 3577, + "time_per_iteration": 2.76940655708313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075416, + "balance_loss_mlp": 1.04101276, + "epoch": 0.6883416698730281, + "flos": 794782720512.0, + "grad_norm": 0.05454368675547281, + "language_loss": 0.7852968, + "learning_rate": 0.00023381440353601718, + "loss": 0.79605091, + "num_input_tokens_seen": 296758896, + "router_z_loss_mlp": 0.34448242, + "step": 3578, + "time_per_iteration": 2.987713098526001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078404, + "balance_loss_mlp": 1.04295087, + "epoch": 0.6885340515582916, + "flos": 723308161536.0, + "grad_norm": 0.1550034716178633, + "language_loss": 0.8585633, + "learning_rate": 0.00023355073122461822, + "loss": 0.86934739, + "num_input_tokens_seen": 296830736, + "router_z_loss_mlp": 0.35449219, + "step": 3579, + "time_per_iteration": 2.8689723014831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073162, + "balance_loss_mlp": 1.03866315, + "epoch": 0.6887264332435552, + "flos": 1010529036288.0, + "grad_norm": 0.05073405937769219, + "language_loss": 0.82913256, + "learning_rate": 0.00023328716235799973, + "loss": 0.83986419, + "num_input_tokens_seen": 296911504, + "router_z_loss_mlp": 0.34545898, + "step": 3580, + "time_per_iteration": 3.2760398387908936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077211, + "balance_loss_mlp": 1.04292655, + "epoch": 0.6889188149288188, + "flos": 584993138688.0, + "grad_norm": 0.0642868391556551, + "language_loss": 0.83958888, + "learning_rate": 0.00023302369703848803, + "loss": 0.85036099, + "num_input_tokens_seen": 296981488, + "router_z_loss_mlp": 0.34326172, + "step": 3581, + "time_per_iteration": 2.6795780658721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075736, + "balance_loss_mlp": 1.04121315, + "epoch": 0.6891111966140824, + "flos": 635831686656.0, + "grad_norm": 0.05830003162798764, + "language_loss": 0.79951459, + "learning_rate": 0.00023276033536836937, + "loss": 0.81027198, + "num_input_tokens_seen": 297054896, + "router_z_loss_mlp": 0.34570312, + "step": 3582, + "time_per_iteration": 2.7684953212738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074456, + "balance_loss_mlp": 1.03964663, + "epoch": 0.6893035782993459, + "flos": 495011160576.0, + "grad_norm": 0.04509310145442872, + "language_loss": 0.84428883, + "learning_rate": 0.00023249707744988984, + "loss": 0.8550334, + "num_input_tokens_seen": 297128224, + "router_z_loss_mlp": 0.34838867, + "step": 3583, + "time_per_iteration": 2.6324620246887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074677, + "balance_loss_mlp": 1.04041624, + "epoch": 0.6894959599846094, + "flos": 457983327744.0, + "grad_norm": 0.06541043788965, + "language_loss": 0.81646812, + "learning_rate": 0.00023223392338525529, + "loss": 0.8272149, + "num_input_tokens_seen": 297191312, + "router_z_loss_mlp": 0.34301758, + "step": 3584, + "time_per_iteration": 2.496835231781006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070744, + "balance_loss_mlp": 1.03614986, + "epoch": 0.689688341669873, + "flos": 504740630016.0, + "grad_norm": 0.0500959825049001, + "language_loss": 0.78515136, + "learning_rate": 0.00023197087327663107, + "loss": 0.7958588, + "num_input_tokens_seen": 297261904, + "router_z_loss_mlp": 0.34643555, + "step": 3585, + "time_per_iteration": 2.6497855186462402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107737, + "balance_loss_mlp": 1.04349089, + "epoch": 0.6898807233551366, + "flos": 763584993792.0, + "grad_norm": 0.05545986059450925, + "language_loss": 0.81721687, + "learning_rate": 0.00023170792722614243, + "loss": 0.82799053, + "num_input_tokens_seen": 297338352, + "router_z_loss_mlp": 0.33911133, + "step": 3586, + "time_per_iteration": 2.8789288997650146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071576, + "balance_loss_mlp": 1.0367434, + "epoch": 0.6900731050404002, + "flos": 583030310400.0, + "grad_norm": 0.05029766249236532, + "language_loss": 0.83530807, + "learning_rate": 0.00023144508533587377, + "loss": 0.84602392, + "num_input_tokens_seen": 297416688, + "router_z_loss_mlp": 0.34863281, + "step": 3587, + "time_per_iteration": 2.8913052082061768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074935, + "balance_loss_mlp": 1.03998244, + "epoch": 0.6902654867256637, + "flos": 711531495936.0, + "grad_norm": 0.0709422421698616, + "language_loss": 0.7865144, + "learning_rate": 0.0002311823477078698, + "loss": 0.79726374, + "num_input_tokens_seen": 297499968, + "router_z_loss_mlp": 0.35009766, + "step": 3588, + "time_per_iteration": 2.923501491546631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068421, + "balance_loss_mlp": 1.03446984, + "epoch": 0.6904578684109273, + "flos": 596816294400.0, + "grad_norm": 0.26453664714217867, + "language_loss": 0.8501482, + "learning_rate": 0.00023091971444413428, + "loss": 0.86083239, + "num_input_tokens_seen": 297574480, + "router_z_loss_mlp": 0.33984375, + "step": 3589, + "time_per_iteration": 2.779235363006592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076229, + "balance_loss_mlp": 1.04056144, + "epoch": 0.6906502500961909, + "flos": 584757411840.0, + "grad_norm": 0.051361873763105706, + "language_loss": 0.82785845, + "learning_rate": 0.00023065718564663012, + "loss": 0.83862066, + "num_input_tokens_seen": 297645360, + "router_z_loss_mlp": 0.35668945, + "step": 3590, + "time_per_iteration": 2.7035253047943115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020688, + "balance_loss_mlp": 1.00957787, + "epoch": 0.6908426317814544, + "flos": 1587001559040.0, + "grad_norm": 0.009423557970014077, + "language_loss": 0.73911589, + "learning_rate": 0.00023039476141728011, + "loss": 0.74932277, + "num_input_tokens_seen": 297879472, + "router_z_loss_mlp": 0.11132812, + "step": 3591, + "time_per_iteration": 4.9744603633880615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073352, + "balance_loss_mlp": 1.03901935, + "epoch": 0.6910350134667179, + "flos": 500525554176.0, + "grad_norm": 0.048031169148873155, + "language_loss": 0.80940306, + "learning_rate": 0.0002301324418579666, + "loss": 0.82013655, + "num_input_tokens_seen": 297950672, + "router_z_loss_mlp": 0.34350586, + "step": 3592, + "time_per_iteration": 2.673436403274536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016267, + "balance_loss_mlp": 1.00534713, + "epoch": 0.6912273951519815, + "flos": 1408462138368.0, + "grad_norm": 0.006132313228220279, + "language_loss": 0.78688473, + "learning_rate": 0.00022987022707053107, + "loss": 0.79704738, + "num_input_tokens_seen": 298171728, + "router_z_loss_mlp": 0.109375, + "step": 3593, + "time_per_iteration": 4.7109363079071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074657, + "balance_loss_mlp": 1.04053962, + "epoch": 0.6914197768372451, + "flos": 634961562624.0, + "grad_norm": 0.056049498625347735, + "language_loss": 0.80705756, + "learning_rate": 0.00022960811715677415, + "loss": 0.8178041, + "num_input_tokens_seen": 298250304, + "router_z_loss_mlp": 0.34155273, + "step": 3594, + "time_per_iteration": 2.830838918685913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107036, + "balance_loss_mlp": 1.03686213, + "epoch": 0.6916121585225087, + "flos": 557755822080.0, + "grad_norm": 0.05478776736586074, + "language_loss": 0.81540507, + "learning_rate": 0.00022934611221845608, + "loss": 0.82610869, + "num_input_tokens_seen": 298328000, + "router_z_loss_mlp": 0.33520508, + "step": 3595, + "time_per_iteration": 2.800851583480835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074607, + "balance_loss_mlp": 1.04127622, + "epoch": 0.6918045402077723, + "flos": 528887508480.0, + "grad_norm": 0.051880347807473304, + "language_loss": 0.77869982, + "learning_rate": 0.00022908421235729609, + "loss": 0.78944588, + "num_input_tokens_seen": 298406832, + "router_z_loss_mlp": 0.33349609, + "step": 3596, + "time_per_iteration": 2.7151432037353516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071648, + "balance_loss_mlp": 1.03645778, + "epoch": 0.6919969218930357, + "flos": 570083927040.0, + "grad_norm": 0.044849912113491465, + "language_loss": 0.85305548, + "learning_rate": 0.0002288224176749728, + "loss": 0.86377192, + "num_input_tokens_seen": 298477584, + "router_z_loss_mlp": 0.35205078, + "step": 3597, + "time_per_iteration": 2.634561061859131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075011, + "balance_loss_mlp": 1.04005897, + "epoch": 0.6921893035782993, + "flos": 683006598144.0, + "grad_norm": 0.0536844380747242, + "language_loss": 0.78127837, + "learning_rate": 0.00022856072827312385, + "loss": 0.79202843, + "num_input_tokens_seen": 298551872, + "router_z_loss_mlp": 0.34936523, + "step": 3598, + "time_per_iteration": 2.8242592811584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072467, + "balance_loss_mlp": 1.03830183, + "epoch": 0.6923816852635629, + "flos": 546484105728.0, + "grad_norm": 0.13391006913463419, + "language_loss": 0.76835263, + "learning_rate": 0.00022829914425334598, + "loss": 0.77907735, + "num_input_tokens_seen": 298619680, + "router_z_loss_mlp": 0.34204102, + "step": 3599, + "time_per_iteration": 2.634923219680786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074851, + "balance_loss_mlp": 1.04051888, + "epoch": 0.6925740669488265, + "flos": 509782159872.0, + "grad_norm": 0.0539133277986469, + "language_loss": 0.80556238, + "learning_rate": 0.0002280376657171956, + "loss": 0.81631094, + "num_input_tokens_seen": 298690080, + "router_z_loss_mlp": 0.34350586, + "step": 3600, + "time_per_iteration": 2.6054348945617676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071386, + "balance_loss_mlp": 1.03662419, + "epoch": 0.69276644863409, + "flos": 869053764096.0, + "grad_norm": 0.05194865310511828, + "language_loss": 0.76575196, + "learning_rate": 0.00022777629276618706, + "loss": 0.77646577, + "num_input_tokens_seen": 298777712, + "router_z_loss_mlp": 0.34765625, + "step": 3601, + "time_per_iteration": 3.1115190982818604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077383, + "balance_loss_mlp": 1.04219222, + "epoch": 0.6929588303193536, + "flos": 625486758912.0, + "grad_norm": 0.05453934109077095, + "language_loss": 0.77726191, + "learning_rate": 0.0002275150255017947, + "loss": 0.78803569, + "num_input_tokens_seen": 298854368, + "router_z_loss_mlp": 0.35205078, + "step": 3602, + "time_per_iteration": 2.7954330444335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013352, + "balance_loss_mlp": 1.00333869, + "epoch": 0.6931512120046172, + "flos": 1544530553856.0, + "grad_norm": 0.00865021754788789, + "language_loss": 0.75732672, + "learning_rate": 0.0002272538640254511, + "loss": 0.76746023, + "num_input_tokens_seen": 299091664, + "router_z_loss_mlp": 0.10009766, + "step": 3603, + "time_per_iteration": 4.98169469833374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016608, + "balance_loss_mlp": 1.00664246, + "epoch": 0.6933435936898807, + "flos": 1447460001792.0, + "grad_norm": 0.007581021196043067, + "language_loss": 0.75127101, + "learning_rate": 0.0002269928084385487, + "loss": 0.76143718, + "num_input_tokens_seen": 299312656, + "router_z_loss_mlp": 0.09960938, + "step": 3604, + "time_per_iteration": 4.666281223297119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071493, + "balance_loss_mlp": 1.03739882, + "epoch": 0.6935359753751443, + "flos": 540639442944.0, + "grad_norm": 0.05365572329513203, + "language_loss": 0.84348619, + "learning_rate": 0.0002267318588424379, + "loss": 0.85420108, + "num_input_tokens_seen": 299381136, + "router_z_loss_mlp": 0.34130859, + "step": 3605, + "time_per_iteration": 2.5876171588897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071367, + "balance_loss_mlp": 1.03755951, + "epoch": 0.6937283570604078, + "flos": 719074146816.0, + "grad_norm": 0.0635324341399035, + "language_loss": 0.87573755, + "learning_rate": 0.00022647101533842845, + "loss": 0.8864513, + "num_input_tokens_seen": 299455216, + "router_z_loss_mlp": 0.33837891, + "step": 3606, + "time_per_iteration": 2.873445510864258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072563, + "balance_loss_mlp": 1.03825426, + "epoch": 0.6939207387456714, + "flos": 521909443584.0, + "grad_norm": 0.05554055490203988, + "language_loss": 0.76844239, + "learning_rate": 0.00022621027802778872, + "loss": 0.77916795, + "num_input_tokens_seen": 299524352, + "router_z_loss_mlp": 0.34350586, + "step": 3607, + "time_per_iteration": 2.607332706451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074645, + "balance_loss_mlp": 1.04086149, + "epoch": 0.694113120430935, + "flos": 535100318208.0, + "grad_norm": 0.058788257779223134, + "language_loss": 0.78766942, + "learning_rate": 0.00022594964701174586, + "loss": 0.79841584, + "num_input_tokens_seen": 299594960, + "router_z_loss_mlp": 0.33813477, + "step": 3608, + "time_per_iteration": 2.6019680500030518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074213, + "balance_loss_mlp": 1.03985715, + "epoch": 0.6943055021161986, + "flos": 523101072384.0, + "grad_norm": 0.052336959457674984, + "language_loss": 0.84605336, + "learning_rate": 0.00022568912239148586, + "loss": 0.85679555, + "num_input_tokens_seen": 299662560, + "router_z_loss_mlp": 0.34399414, + "step": 3609, + "time_per_iteration": 2.6037116050720215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073862, + "balance_loss_mlp": 1.03943467, + "epoch": 0.694497883801462, + "flos": 484637119488.0, + "grad_norm": 0.05428318108102923, + "language_loss": 0.81688815, + "learning_rate": 0.00022542870426815344, + "loss": 0.82762676, + "num_input_tokens_seen": 299734896, + "router_z_loss_mlp": 0.34472656, + "step": 3610, + "time_per_iteration": 2.723229169845581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080613, + "balance_loss_mlp": 1.04518366, + "epoch": 0.6946902654867256, + "flos": 461238119424.0, + "grad_norm": 0.06119674491487997, + "language_loss": 0.86244833, + "learning_rate": 0.00022516839274285173, + "loss": 0.87325442, + "num_input_tokens_seen": 299799424, + "router_z_loss_mlp": 0.35449219, + "step": 3611, + "time_per_iteration": 2.540647268295288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073799, + "balance_loss_mlp": 1.03832269, + "epoch": 0.6948826471719892, + "flos": 512603375616.0, + "grad_norm": 0.054515273937313154, + "language_loss": 0.74971861, + "learning_rate": 0.00022490818791664265, + "loss": 0.76045656, + "num_input_tokens_seen": 299868272, + "router_z_loss_mlp": 0.35522461, + "step": 3612, + "time_per_iteration": 2.577448844909668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074613, + "balance_loss_mlp": 1.03989887, + "epoch": 0.6950750288572528, + "flos": 556917783552.0, + "grad_norm": 0.04771365069249161, + "language_loss": 0.85378981, + "learning_rate": 0.00022464808989054676, + "loss": 0.86453593, + "num_input_tokens_seen": 299939136, + "router_z_loss_mlp": 0.34741211, + "step": 3613, + "time_per_iteration": 2.6405351161956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071033, + "balance_loss_mlp": 1.03646183, + "epoch": 0.6952674105425164, + "flos": 542215185408.0, + "grad_norm": 0.06079183455352582, + "language_loss": 0.75739813, + "learning_rate": 0.00022438809876554284, + "loss": 0.76810849, + "num_input_tokens_seen": 300009472, + "router_z_loss_mlp": 0.34594727, + "step": 3614, + "time_per_iteration": 2.613945484161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075685, + "balance_loss_mlp": 1.04128122, + "epoch": 0.6954597922277799, + "flos": 546465166848.0, + "grad_norm": 0.05561683748761922, + "language_loss": 0.80328143, + "learning_rate": 0.00022412821464256873, + "loss": 0.81403828, + "num_input_tokens_seen": 300081008, + "router_z_loss_mlp": 0.34448242, + "step": 3615, + "time_per_iteration": 2.7260682582855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073152, + "balance_loss_mlp": 1.03922486, + "epoch": 0.6956521739130435, + "flos": 519255553536.0, + "grad_norm": 0.0593468724066596, + "language_loss": 0.82113886, + "learning_rate": 0.00022386843762252023, + "loss": 0.83187044, + "num_input_tokens_seen": 300149856, + "router_z_loss_mlp": 0.33959961, + "step": 3616, + "time_per_iteration": 2.6294190883636475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070918, + "balance_loss_mlp": 1.03622794, + "epoch": 0.695844555598307, + "flos": 466029365760.0, + "grad_norm": 0.055313153128714786, + "language_loss": 0.79384601, + "learning_rate": 0.00022360876780625193, + "loss": 0.80455518, + "num_input_tokens_seen": 300217344, + "router_z_loss_mlp": 0.34741211, + "step": 3617, + "time_per_iteration": 2.590061664581299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071648, + "balance_loss_mlp": 1.03741097, + "epoch": 0.6960369372835706, + "flos": 600347510784.0, + "grad_norm": 0.044171001480645455, + "language_loss": 0.79755616, + "learning_rate": 0.00022334920529457604, + "loss": 0.8082726, + "num_input_tokens_seen": 300305584, + "router_z_loss_mlp": 0.34277344, + "step": 3618, + "time_per_iteration": 2.9306209087371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071307, + "balance_loss_mlp": 1.0369513, + "epoch": 0.6962293189688342, + "flos": 643927186944.0, + "grad_norm": 0.0535379410757751, + "language_loss": 0.87326622, + "learning_rate": 0.00022308975018826423, + "loss": 0.88397926, + "num_input_tokens_seen": 300386480, + "router_z_loss_mlp": 0.34399414, + "step": 3619, + "time_per_iteration": 2.888936758041382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074623, + "balance_loss_mlp": 1.03967083, + "epoch": 0.6964217006540977, + "flos": 638524864512.0, + "grad_norm": 0.061080983554533244, + "language_loss": 0.84665489, + "learning_rate": 0.00022283040258804564, + "loss": 0.85740113, + "num_input_tokens_seen": 300461840, + "router_z_loss_mlp": 0.34985352, + "step": 3620, + "time_per_iteration": 2.777407169342041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073748, + "balance_loss_mlp": 1.04005957, + "epoch": 0.6966140823393613, + "flos": 651864125952.0, + "grad_norm": 0.05227227103704651, + "language_loss": 0.83467555, + "learning_rate": 0.00022257116259460802, + "loss": 0.84541297, + "num_input_tokens_seen": 300540400, + "router_z_loss_mlp": 0.3371582, + "step": 3621, + "time_per_iteration": 2.8371803760528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107145, + "balance_loss_mlp": 1.03802419, + "epoch": 0.6968064640246249, + "flos": 704160552960.0, + "grad_norm": 0.054247578312955166, + "language_loss": 0.8137657, + "learning_rate": 0.00022231203030859725, + "loss": 0.82448018, + "num_input_tokens_seen": 300624240, + "router_z_loss_mlp": 0.33447266, + "step": 3622, + "time_per_iteration": 2.9509494304656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077752, + "balance_loss_mlp": 1.04361081, + "epoch": 0.6969988457098885, + "flos": 492312190464.0, + "grad_norm": 0.06806535076017864, + "language_loss": 0.83473521, + "learning_rate": 0.00022205300583061737, + "loss": 0.84551275, + "num_input_tokens_seen": 300689728, + "router_z_loss_mlp": 0.34179688, + "step": 3623, + "time_per_iteration": 2.564910888671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006317, + "balance_loss_mlp": 0.99630374, + "epoch": 0.6971912273951519, + "flos": 1351839974400.0, + "grad_norm": 0.005946878920226346, + "language_loss": 0.82838202, + "learning_rate": 0.00022179408926123063, + "loss": 0.83844519, + "num_input_tokens_seen": 300913152, + "router_z_loss_mlp": 0.10009766, + "step": 3624, + "time_per_iteration": 4.894897937774658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107407, + "balance_loss_mlp": 1.04030991, + "epoch": 0.6973836090804155, + "flos": 602182301184.0, + "grad_norm": 0.052322011442081255, + "language_loss": 0.77296048, + "learning_rate": 0.00022153528070095735, + "loss": 0.78370118, + "num_input_tokens_seen": 300985824, + "router_z_loss_mlp": 0.33789062, + "step": 3625, + "time_per_iteration": 2.6873764991760254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074827, + "balance_loss_mlp": 1.04056633, + "epoch": 0.6975759907656791, + "flos": 523805280768.0, + "grad_norm": 0.05344661809943597, + "language_loss": 0.88087487, + "learning_rate": 0.00022127658025027568, + "loss": 0.89162308, + "num_input_tokens_seen": 301058048, + "router_z_loss_mlp": 0.34301758, + "step": 3626, + "time_per_iteration": 2.6872076988220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077609, + "balance_loss_mlp": 1.04291928, + "epoch": 0.6977683724509427, + "flos": 480672327168.0, + "grad_norm": 0.05134929974551719, + "language_loss": 0.84773469, + "learning_rate": 0.00022101798800962258, + "loss": 0.85851079, + "num_input_tokens_seen": 301127472, + "router_z_loss_mlp": 0.34741211, + "step": 3627, + "time_per_iteration": 2.592256546020508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074479, + "balance_loss_mlp": 1.03933573, + "epoch": 0.6979607541362063, + "flos": 522372132864.0, + "grad_norm": 0.06417164030840651, + "language_loss": 0.78953862, + "learning_rate": 0.00022075950407939227, + "loss": 0.80028337, + "num_input_tokens_seen": 301193920, + "router_z_loss_mlp": 0.35180664, + "step": 3628, + "time_per_iteration": 2.616570234298706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107407, + "balance_loss_mlp": 1.04023814, + "epoch": 0.6981531358214698, + "flos": 547818329088.0, + "grad_norm": 0.05532420233787888, + "language_loss": 0.82282603, + "learning_rate": 0.0002205011285599367, + "loss": 0.83356667, + "num_input_tokens_seen": 301264256, + "router_z_loss_mlp": 0.33862305, + "step": 3629, + "time_per_iteration": 2.612488269805908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073299, + "balance_loss_mlp": 1.03925288, + "epoch": 0.6983455175067333, + "flos": 699747628032.0, + "grad_norm": 0.05532386422624981, + "language_loss": 0.80727249, + "learning_rate": 0.00022024286155156658, + "loss": 0.8180055, + "num_input_tokens_seen": 301337696, + "router_z_loss_mlp": 0.34082031, + "step": 3630, + "time_per_iteration": 2.8387677669525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070371, + "balance_loss_mlp": 1.03632545, + "epoch": 0.6985378991919969, + "flos": 484819001856.0, + "grad_norm": 0.047952910030837306, + "language_loss": 0.85720146, + "learning_rate": 0.00021998470315454994, + "loss": 0.8679052, + "num_input_tokens_seen": 301407776, + "router_z_loss_mlp": 0.34057617, + "step": 3631, + "time_per_iteration": 2.635730743408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071863, + "balance_loss_mlp": 1.03843713, + "epoch": 0.6987302808772605, + "flos": 558503700480.0, + "grad_norm": 0.05280665579931524, + "language_loss": 0.86521721, + "learning_rate": 0.00021972665346911275, + "loss": 0.87593591, + "num_input_tokens_seen": 301475120, + "router_z_loss_mlp": 0.33447266, + "step": 3632, + "time_per_iteration": 2.668616771697998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071923, + "balance_loss_mlp": 1.03763855, + "epoch": 0.698922662562524, + "flos": 483350948352.0, + "grad_norm": 0.05402222352143004, + "language_loss": 0.79431093, + "learning_rate": 0.00021946871259543877, + "loss": 0.80503017, + "num_input_tokens_seen": 301542416, + "router_z_loss_mlp": 0.34326172, + "step": 3633, + "time_per_iteration": 2.580191135406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068198, + "balance_loss_mlp": 1.03486705, + "epoch": 0.6991150442477876, + "flos": 718586726400.0, + "grad_norm": 0.05023014316790998, + "language_loss": 0.8304534, + "learning_rate": 0.00021921088063366957, + "loss": 0.84113538, + "num_input_tokens_seen": 301620672, + "router_z_loss_mlp": 0.33349609, + "step": 3634, + "time_per_iteration": 2.9607045650482178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067291, + "balance_loss_mlp": 1.03384113, + "epoch": 0.6993074259330512, + "flos": 488871134208.0, + "grad_norm": 0.05127346508888132, + "language_loss": 0.8176077, + "learning_rate": 0.00021895315768390435, + "loss": 0.82828063, + "num_input_tokens_seen": 301688016, + "router_z_loss_mlp": 0.3347168, + "step": 3635, + "time_per_iteration": 2.585498332977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107047, + "balance_loss_mlp": 1.03651941, + "epoch": 0.6994998076183148, + "flos": 717745715712.0, + "grad_norm": 0.04635500593717234, + "language_loss": 0.87909687, + "learning_rate": 0.00021869554384619999, + "loss": 0.88980162, + "num_input_tokens_seen": 301771184, + "router_z_loss_mlp": 0.33959961, + "step": 3636, + "time_per_iteration": 2.968268394470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074967, + "balance_loss_mlp": 1.0413022, + "epoch": 0.6996921893035783, + "flos": 578730866688.0, + "grad_norm": 0.05835542586274351, + "language_loss": 0.80754793, + "learning_rate": 0.00021843803922057115, + "loss": 0.81829762, + "num_input_tokens_seen": 301844528, + "router_z_loss_mlp": 0.33691406, + "step": 3637, + "time_per_iteration": 2.7109100818634033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068155, + "balance_loss_mlp": 1.0351578, + "epoch": 0.6998845709888418, + "flos": 518369462784.0, + "grad_norm": 0.06833550802909422, + "language_loss": 0.81533343, + "learning_rate": 0.00021818064390698977, + "loss": 0.826015, + "num_input_tokens_seen": 301914960, + "router_z_loss_mlp": 0.33007812, + "step": 3638, + "time_per_iteration": 2.5944924354553223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071173, + "balance_loss_mlp": 1.03726995, + "epoch": 0.7000769526741054, + "flos": 620666399232.0, + "grad_norm": 0.05517026065702434, + "language_loss": 0.86890268, + "learning_rate": 0.0002179233580053861, + "loss": 0.87961447, + "num_input_tokens_seen": 301986352, + "router_z_loss_mlp": 0.33935547, + "step": 3639, + "time_per_iteration": 2.7613229751586914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070142, + "balance_loss_mlp": 1.03652453, + "epoch": 0.700269334359369, + "flos": 559670598144.0, + "grad_norm": 0.13465593059658462, + "language_loss": 0.85617924, + "learning_rate": 0.00021766618161564688, + "loss": 0.86688066, + "num_input_tokens_seen": 302060544, + "router_z_loss_mlp": 0.33642578, + "step": 3640, + "time_per_iteration": 2.7400569915771484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071036, + "balance_loss_mlp": 1.0372045, + "epoch": 0.7004617160446326, + "flos": 483090490368.0, + "grad_norm": 0.051527698047250534, + "language_loss": 0.87097609, + "learning_rate": 0.00021740911483761677, + "loss": 0.88168645, + "num_input_tokens_seen": 302127232, + "router_z_loss_mlp": 0.33862305, + "step": 3641, + "time_per_iteration": 2.5464553833007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107241, + "balance_loss_mlp": 1.0389359, + "epoch": 0.7006540977298961, + "flos": 696647015424.0, + "grad_norm": 0.04496743490694548, + "language_loss": 0.91822404, + "learning_rate": 0.00021715215777109837, + "loss": 0.92894816, + "num_input_tokens_seen": 302207056, + "router_z_loss_mlp": 0.33496094, + "step": 3642, + "time_per_iteration": 2.9422945976257324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068413, + "balance_loss_mlp": 1.03477192, + "epoch": 0.7008464794151597, + "flos": 504528224256.0, + "grad_norm": 0.053490842325032185, + "language_loss": 0.84272158, + "learning_rate": 0.00021689531051585103, + "loss": 0.85340571, + "num_input_tokens_seen": 302275632, + "router_z_loss_mlp": 0.33642578, + "step": 3643, + "time_per_iteration": 2.609464406967163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069383, + "balance_loss_mlp": 1.03421593, + "epoch": 0.7010388611004232, + "flos": 536985980928.0, + "grad_norm": 0.06575198455651811, + "language_loss": 0.79940069, + "learning_rate": 0.00021663857317159196, + "loss": 0.81009454, + "num_input_tokens_seen": 302343600, + "router_z_loss_mlp": 0.35229492, + "step": 3644, + "time_per_iteration": 2.652776002883911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074301, + "balance_loss_mlp": 1.04006386, + "epoch": 0.7012312427856868, + "flos": 546996257280.0, + "grad_norm": 0.05180675245879084, + "language_loss": 0.8175106, + "learning_rate": 0.00021638194583799487, + "loss": 0.82825363, + "num_input_tokens_seen": 302414656, + "router_z_loss_mlp": 0.3425293, + "step": 3645, + "time_per_iteration": 2.647700071334839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072403, + "balance_loss_mlp": 1.03785658, + "epoch": 0.7014236244709504, + "flos": 941020125696.0, + "grad_norm": 0.0581240827613666, + "language_loss": 0.82057631, + "learning_rate": 0.00021612542861469176, + "loss": 0.83130032, + "num_input_tokens_seen": 302495120, + "router_z_loss_mlp": 0.34594727, + "step": 3646, + "time_per_iteration": 3.1926403045654297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070686, + "balance_loss_mlp": 1.03644955, + "epoch": 0.7016160061562139, + "flos": 524908159488.0, + "grad_norm": 0.05426451368259885, + "language_loss": 0.82171357, + "learning_rate": 0.00021586902160127135, + "loss": 0.83242047, + "num_input_tokens_seen": 302563024, + "router_z_loss_mlp": 0.34277344, + "step": 3647, + "time_per_iteration": 2.5836267471313477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074201, + "balance_loss_mlp": 1.03967857, + "epoch": 0.7018083878414775, + "flos": 373170917376.0, + "grad_norm": 0.07691887625237197, + "language_loss": 0.73860252, + "learning_rate": 0.00021561272489727974, + "loss": 0.74934447, + "num_input_tokens_seen": 302624544, + "router_z_loss_mlp": 0.34570312, + "step": 3648, + "time_per_iteration": 2.426370143890381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068543, + "balance_loss_mlp": 1.03518772, + "epoch": 0.7020007695267411, + "flos": 527522761728.0, + "grad_norm": 0.07653563490177187, + "language_loss": 0.80320156, + "learning_rate": 0.0002153565386022199, + "loss": 0.813887, + "num_input_tokens_seen": 302697856, + "router_z_loss_mlp": 0.33374023, + "step": 3649, + "time_per_iteration": 2.6524124145507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073058, + "balance_loss_mlp": 1.03822541, + "epoch": 0.7021931512120047, + "flos": 689850832896.0, + "grad_norm": 0.0770521311839047, + "language_loss": 0.82439005, + "learning_rate": 0.00021510046281555262, + "loss": 0.83512068, + "num_input_tokens_seen": 302771984, + "router_z_loss_mlp": 0.34887695, + "step": 3650, + "time_per_iteration": 2.796095609664917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069064, + "balance_loss_mlp": 1.03497052, + "epoch": 0.7023855328972681, + "flos": 639499705344.0, + "grad_norm": 0.07628366219259466, + "language_loss": 0.81408215, + "learning_rate": 0.0002148444976366949, + "loss": 0.82477278, + "num_input_tokens_seen": 302838832, + "router_z_loss_mlp": 0.34130859, + "step": 3651, + "time_per_iteration": 2.7908504009246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071482, + "balance_loss_mlp": 1.03760242, + "epoch": 0.7025779145825317, + "flos": 560674552320.0, + "grad_norm": 0.06297036166850548, + "language_loss": 0.82553816, + "learning_rate": 0.00021458864316502136, + "loss": 0.83625293, + "num_input_tokens_seen": 302909952, + "router_z_loss_mlp": 0.33911133, + "step": 3652, + "time_per_iteration": 2.7136270999908447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073226, + "balance_loss_mlp": 1.03927469, + "epoch": 0.7027702962677953, + "flos": 447214998528.0, + "grad_norm": 0.0549303916698645, + "language_loss": 0.87089896, + "learning_rate": 0.0002143328994998634, + "loss": 0.88163126, + "num_input_tokens_seen": 302973056, + "router_z_loss_mlp": 0.33959961, + "step": 3653, + "time_per_iteration": 2.4819934368133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071057, + "balance_loss_mlp": 1.03603339, + "epoch": 0.7029626779530589, + "flos": 622198471680.0, + "grad_norm": 0.05753095633291236, + "language_loss": 0.78409469, + "learning_rate": 0.00021407726674050982, + "loss": 0.79480523, + "num_input_tokens_seen": 303054656, + "router_z_loss_mlp": 0.35058594, + "step": 3654, + "time_per_iteration": 2.839901924133301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077723, + "balance_loss_mlp": 1.04312825, + "epoch": 0.7031550596383225, + "flos": 629307546624.0, + "grad_norm": 0.04660069709874721, + "language_loss": 0.87104034, + "learning_rate": 0.0002138217449862061, + "loss": 0.88181752, + "num_input_tokens_seen": 303124256, + "router_z_loss_mlp": 0.34619141, + "step": 3655, + "time_per_iteration": 2.729714870452881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074113, + "balance_loss_mlp": 1.04001868, + "epoch": 0.703347441323586, + "flos": 530589878784.0, + "grad_norm": 0.04994580933868796, + "language_loss": 0.78216398, + "learning_rate": 0.00021356633433615403, + "loss": 0.79290509, + "num_input_tokens_seen": 303192720, + "router_z_loss_mlp": 0.34130859, + "step": 3656, + "time_per_iteration": 2.578078031539917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074299, + "balance_loss_mlp": 1.04044342, + "epoch": 0.7035398230088495, + "flos": 693264185856.0, + "grad_norm": 0.0479106829759696, + "language_loss": 0.83245599, + "learning_rate": 0.0002133110348895133, + "loss": 0.84319901, + "num_input_tokens_seen": 303275968, + "router_z_loss_mlp": 0.33862305, + "step": 3657, + "time_per_iteration": 2.9648847579956055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068496, + "balance_loss_mlp": 1.03537953, + "epoch": 0.7037322046941131, + "flos": 967628837376.0, + "grad_norm": 0.048159657931533775, + "language_loss": 0.84623647, + "learning_rate": 0.0002130558467453999, + "loss": 0.85692137, + "num_input_tokens_seen": 303367296, + "router_z_loss_mlp": 0.33129883, + "step": 3658, + "time_per_iteration": 3.3155901432037354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069074, + "balance_loss_mlp": 1.03514767, + "epoch": 0.7039245863793767, + "flos": 502598891520.0, + "grad_norm": 0.045313539316245835, + "language_loss": 0.84409332, + "learning_rate": 0.0002128007700028865, + "loss": 0.85478401, + "num_input_tokens_seen": 303442768, + "router_z_loss_mlp": 0.33959961, + "step": 3659, + "time_per_iteration": 2.7024378776550293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072456, + "balance_loss_mlp": 1.03926849, + "epoch": 0.7041169680646402, + "flos": 465709271040.0, + "grad_norm": 0.056824645226565565, + "language_loss": 0.84162152, + "learning_rate": 0.00021254580476100276, + "loss": 0.85234612, + "num_input_tokens_seen": 303508304, + "router_z_loss_mlp": 0.33203125, + "step": 3660, + "time_per_iteration": 2.5560450553894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074809, + "balance_loss_mlp": 1.04097748, + "epoch": 0.7043093497499038, + "flos": 631897417728.0, + "grad_norm": 0.07471330414673147, + "language_loss": 0.78714609, + "learning_rate": 0.00021229095111873497, + "loss": 0.79789412, + "num_input_tokens_seen": 303579312, + "router_z_loss_mlp": 0.33862305, + "step": 3661, + "time_per_iteration": 2.7691423892974854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070997, + "balance_loss_mlp": 1.03704596, + "epoch": 0.7045017314351674, + "flos": 542639996928.0, + "grad_norm": 0.04471074658603975, + "language_loss": 0.86054224, + "learning_rate": 0.0002120362091750261, + "loss": 0.87125218, + "num_input_tokens_seen": 303658384, + "router_z_loss_mlp": 0.33984375, + "step": 3662, + "time_per_iteration": 2.7782440185546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073091, + "balance_loss_mlp": 1.03883076, + "epoch": 0.704694113120431, + "flos": 428012135424.0, + "grad_norm": 0.05523093470828303, + "language_loss": 0.86868262, + "learning_rate": 0.00021178157902877566, + "loss": 0.8794136, + "num_input_tokens_seen": 303721136, + "router_z_loss_mlp": 0.34301758, + "step": 3663, + "time_per_iteration": 2.440488815307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070446, + "balance_loss_mlp": 1.03682911, + "epoch": 0.7048864948056945, + "flos": 650253477888.0, + "grad_norm": 0.07482453920379879, + "language_loss": 0.87160063, + "learning_rate": 0.0002115270607788397, + "loss": 0.88230515, + "num_input_tokens_seen": 303792368, + "router_z_loss_mlp": 0.33642578, + "step": 3664, + "time_per_iteration": 2.760225772857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073203, + "balance_loss_mlp": 1.04015791, + "epoch": 0.705078876490958, + "flos": 412330314240.0, + "grad_norm": 0.05762286530441703, + "language_loss": 0.85702121, + "learning_rate": 0.00021127265452403133, + "loss": 0.86775321, + "num_input_tokens_seen": 303856336, + "router_z_loss_mlp": 0.33032227, + "step": 3665, + "time_per_iteration": 2.561060905456543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007528, + "balance_loss_mlp": 0.99813432, + "epoch": 0.7052712581762216, + "flos": 1419266783232.0, + "grad_norm": 0.0045947469063837235, + "language_loss": 0.84091628, + "learning_rate": 0.0002110183603631199, + "loss": 0.85099161, + "num_input_tokens_seen": 304089856, + "router_z_loss_mlp": 0.09375, + "step": 3666, + "time_per_iteration": 4.89429235458374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069693, + "balance_loss_mlp": 1.03714871, + "epoch": 0.7054636398614852, + "flos": 492795228672.0, + "grad_norm": 0.08921720435757349, + "language_loss": 0.82764697, + "learning_rate": 0.00021076417839483065, + "loss": 0.83834386, + "num_input_tokens_seen": 304164752, + "router_z_loss_mlp": 0.32543945, + "step": 3667, + "time_per_iteration": 2.768646240234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073838, + "balance_loss_mlp": 1.04010153, + "epoch": 0.7056560215467488, + "flos": 450228271104.0, + "grad_norm": 0.04427607909576538, + "language_loss": 0.85058916, + "learning_rate": 0.00021051010871784589, + "loss": 0.86132753, + "num_input_tokens_seen": 304229568, + "router_z_loss_mlp": 0.33764648, + "step": 3668, + "time_per_iteration": 2.567970037460327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068482, + "balance_loss_mlp": 1.03462708, + "epoch": 0.7058484032320124, + "flos": 565426510848.0, + "grad_norm": 0.048767729933519285, + "language_loss": 0.78747618, + "learning_rate": 0.0002102561514308045, + "loss": 0.79816097, + "num_input_tokens_seen": 304299408, + "router_z_loss_mlp": 0.33886719, + "step": 3669, + "time_per_iteration": 2.7534899711608887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069827, + "balance_loss_mlp": 1.03635263, + "epoch": 0.7060407849172758, + "flos": 566736003072.0, + "grad_norm": 0.04982032344187492, + "language_loss": 0.82456899, + "learning_rate": 0.00021000230663230135, + "loss": 0.83526719, + "num_input_tokens_seen": 304367936, + "router_z_loss_mlp": 0.33496094, + "step": 3670, + "time_per_iteration": 2.6715986728668213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070311, + "balance_loss_mlp": 1.03655052, + "epoch": 0.7062331666025394, + "flos": 468505755648.0, + "grad_norm": 0.07243344373146629, + "language_loss": 0.82818425, + "learning_rate": 0.00020974857442088762, + "loss": 0.83888733, + "num_input_tokens_seen": 304438368, + "router_z_loss_mlp": 0.33789062, + "step": 3671, + "time_per_iteration": 2.5750696659088135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072721, + "balance_loss_mlp": 1.03896141, + "epoch": 0.706425548287803, + "flos": 595042702848.0, + "grad_norm": 0.061680604914147966, + "language_loss": 0.88855779, + "learning_rate": 0.00020949495489507104, + "loss": 0.89928508, + "num_input_tokens_seen": 304508720, + "router_z_loss_mlp": 0.33789062, + "step": 3672, + "time_per_iteration": 2.6669857501983643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070135, + "balance_loss_mlp": 1.03680396, + "epoch": 0.7066179299730666, + "flos": 475566778368.0, + "grad_norm": 0.055232709126585705, + "language_loss": 0.8461234, + "learning_rate": 0.00020924144815331525, + "loss": 0.85682476, + "num_input_tokens_seen": 304576128, + "router_z_loss_mlp": 0.33349609, + "step": 3673, + "time_per_iteration": 2.5462799072265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067992, + "balance_loss_mlp": 1.03451765, + "epoch": 0.7068103116583301, + "flos": 506153428992.0, + "grad_norm": 0.061788729653189316, + "language_loss": 0.82846355, + "learning_rate": 0.00020898805429404044, + "loss": 0.83914346, + "num_input_tokens_seen": 304642416, + "router_z_loss_mlp": 0.33496094, + "step": 3674, + "time_per_iteration": 2.5948987007141113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073379, + "balance_loss_mlp": 1.03880787, + "epoch": 0.7070026933435937, + "flos": 679028659200.0, + "grad_norm": 0.053331350399745237, + "language_loss": 0.78217506, + "learning_rate": 0.0002087347734156228, + "loss": 0.79290879, + "num_input_tokens_seen": 304719312, + "router_z_loss_mlp": 0.34619141, + "step": 3675, + "time_per_iteration": 2.8384974002838135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069694, + "balance_loss_mlp": 1.0364821, + "epoch": 0.7071950750288573, + "flos": 471981717504.0, + "grad_norm": 0.04797263488188438, + "language_loss": 0.79430759, + "learning_rate": 0.00020848160561639452, + "loss": 0.8050046, + "num_input_tokens_seen": 304789296, + "router_z_loss_mlp": 0.33227539, + "step": 3676, + "time_per_iteration": 2.6169028282165527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067947, + "balance_loss_mlp": 1.03406775, + "epoch": 0.7073874567141208, + "flos": 473507997696.0, + "grad_norm": 0.04772517856798178, + "language_loss": 0.85496527, + "learning_rate": 0.0002082285509946445, + "loss": 0.86564475, + "num_input_tokens_seen": 304854320, + "router_z_loss_mlp": 0.33911133, + "step": 3677, + "time_per_iteration": 2.536482334136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070461, + "balance_loss_mlp": 1.03562784, + "epoch": 0.7075798383993844, + "flos": 545589250560.0, + "grad_norm": 0.05597865502328579, + "language_loss": 0.83377022, + "learning_rate": 0.00020797560964861683, + "loss": 0.84447479, + "num_input_tokens_seen": 304932784, + "router_z_loss_mlp": 0.34887695, + "step": 3678, + "time_per_iteration": 2.7888569831848145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070957, + "balance_loss_mlp": 1.03765035, + "epoch": 0.7077722200846479, + "flos": 661766713344.0, + "grad_norm": 0.05495651688887883, + "language_loss": 0.80313671, + "learning_rate": 0.0002077227816765122, + "loss": 0.81384623, + "num_input_tokens_seen": 305018080, + "router_z_loss_mlp": 0.33325195, + "step": 3679, + "time_per_iteration": 3.0229249000549316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009856, + "balance_loss_mlp": 1.00065279, + "epoch": 0.7079646017699115, + "flos": 1529128129536.0, + "grad_norm": 0.00795907908422284, + "language_loss": 0.76447725, + "learning_rate": 0.0002074700671764869, + "loss": 0.77457583, + "num_input_tokens_seen": 305241216, + "router_z_loss_mlp": 0.09179688, + "step": 3680, + "time_per_iteration": 4.766546249389648 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066391, + "balance_loss_mlp": 1.03317952, + "epoch": 0.7081569834551751, + "flos": 621217838592.0, + "grad_norm": 0.05324470770264926, + "language_loss": 0.78516078, + "learning_rate": 0.00020721746624665383, + "loss": 0.79582465, + "num_input_tokens_seen": 305311376, + "router_z_loss_mlp": 0.33203125, + "step": 3681, + "time_per_iteration": 2.7075722217559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065664, + "balance_loss_mlp": 1.03199935, + "epoch": 0.7083493651404387, + "flos": 794280743424.0, + "grad_norm": 0.05089131854365718, + "language_loss": 0.79764175, + "learning_rate": 0.00020696497898508114, + "loss": 0.80829841, + "num_input_tokens_seen": 305392736, + "router_z_loss_mlp": 0.33691406, + "step": 3682, + "time_per_iteration": 2.9950366020202637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066323, + "balance_loss_mlp": 1.03165746, + "epoch": 0.7085417468257021, + "flos": 813394856448.0, + "grad_norm": 0.05983793282747749, + "language_loss": 0.7766552, + "learning_rate": 0.00020671260548979316, + "loss": 0.78731841, + "num_input_tokens_seen": 305470896, + "router_z_loss_mlp": 0.34716797, + "step": 3683, + "time_per_iteration": 2.986528158187866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069107, + "balance_loss_mlp": 1.03503704, + "epoch": 0.7087341285109657, + "flos": 700259779584.0, + "grad_norm": 0.07395200120023371, + "language_loss": 0.84964406, + "learning_rate": 0.00020646034585876982, + "loss": 0.86033517, + "num_input_tokens_seen": 305547072, + "router_z_loss_mlp": 0.34106445, + "step": 3684, + "time_per_iteration": 2.801340341567993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068964, + "balance_loss_mlp": 1.03467929, + "epoch": 0.7089265101962293, + "flos": 596211010560.0, + "grad_norm": 0.047359686788279315, + "language_loss": 0.84225708, + "learning_rate": 0.00020620820018994718, + "loss": 0.85294676, + "num_input_tokens_seen": 305624512, + "router_z_loss_mlp": 0.34301758, + "step": 3685, + "time_per_iteration": 2.8521230220794678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069096, + "balance_loss_mlp": 1.03438258, + "epoch": 0.7091188918814929, + "flos": 486842876928.0, + "grad_norm": 0.05746562851929707, + "language_loss": 0.82886755, + "learning_rate": 0.00020595616858121675, + "loss": 0.8395586, + "num_input_tokens_seen": 305695088, + "router_z_loss_mlp": 0.34765625, + "step": 3686, + "time_per_iteration": 2.7113983631134033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064604, + "balance_loss_mlp": 1.03034306, + "epoch": 0.7093112735667565, + "flos": 599833949184.0, + "grad_norm": 0.05104944796705689, + "language_loss": 0.80622023, + "learning_rate": 0.00020570425113042586, + "loss": 0.81686622, + "num_input_tokens_seen": 305763680, + "router_z_loss_mlp": 0.34277344, + "step": 3687, + "time_per_iteration": 2.712451457977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066788, + "balance_loss_mlp": 1.03293276, + "epoch": 0.70950365525202, + "flos": 505577258496.0, + "grad_norm": 0.05729403369858188, + "language_loss": 0.85692352, + "learning_rate": 0.0002054524479353776, + "loss": 0.86759138, + "num_input_tokens_seen": 305835008, + "router_z_loss_mlp": 0.33886719, + "step": 3688, + "time_per_iteration": 2.6377811431884766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068463, + "balance_loss_mlp": 1.03446496, + "epoch": 0.7096960369372836, + "flos": 731846002176.0, + "grad_norm": 0.05774020478713443, + "language_loss": 0.81201112, + "learning_rate": 0.00020520075909383063, + "loss": 0.82269579, + "num_input_tokens_seen": 305909072, + "router_z_loss_mlp": 0.34033203, + "step": 3689, + "time_per_iteration": 2.8854405879974365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068474, + "balance_loss_mlp": 1.03409433, + "epoch": 0.7098884186225471, + "flos": 971685351936.0, + "grad_norm": 0.048806563033970844, + "language_loss": 0.8087877, + "learning_rate": 0.00020494918470349916, + "loss": 0.81947243, + "num_input_tokens_seen": 305994752, + "router_z_loss_mlp": 0.34399414, + "step": 3690, + "time_per_iteration": 3.2719247341156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069117, + "balance_loss_mlp": 1.03518987, + "epoch": 0.7100808003078107, + "flos": 504001516032.0, + "grad_norm": 0.0562848132432342, + "language_loss": 0.85595727, + "learning_rate": 0.00020469772486205297, + "loss": 0.86664844, + "num_input_tokens_seen": 306062960, + "router_z_loss_mlp": 0.33959961, + "step": 3691, + "time_per_iteration": 2.599254608154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064349, + "balance_loss_mlp": 1.03018332, + "epoch": 0.7102731819930742, + "flos": 540073446912.0, + "grad_norm": 0.052398389551748005, + "language_loss": 0.81299037, + "learning_rate": 0.0002044463796671177, + "loss": 0.82363379, + "num_input_tokens_seen": 306134224, + "router_z_loss_mlp": 0.34204102, + "step": 3692, + "time_per_iteration": 2.6676712036132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068474, + "balance_loss_mlp": 1.03502345, + "epoch": 0.7104655636783378, + "flos": 620066907648.0, + "grad_norm": 0.05724464606399067, + "language_loss": 0.80306011, + "learning_rate": 0.00020419514921627408, + "loss": 0.8137449, + "num_input_tokens_seen": 306214512, + "router_z_loss_mlp": 0.3347168, + "step": 3693, + "time_per_iteration": 2.906092643737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071382, + "balance_loss_mlp": 1.03707361, + "epoch": 0.7106579453636014, + "flos": 557060378112.0, + "grad_norm": 0.04981428600794461, + "language_loss": 0.77017659, + "learning_rate": 0.00020394403360705855, + "loss": 0.78089035, + "num_input_tokens_seen": 306283232, + "router_z_loss_mlp": 0.34350586, + "step": 3694, + "time_per_iteration": 2.69543719291687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107094, + "balance_loss_mlp": 1.03634608, + "epoch": 0.710850327048865, + "flos": 512795432448.0, + "grad_norm": 0.05615701524037797, + "language_loss": 0.8807683, + "learning_rate": 0.00020369303293696228, + "loss": 0.8914777, + "num_input_tokens_seen": 306351536, + "router_z_loss_mlp": 0.34619141, + "step": 3695, + "time_per_iteration": 2.613211154937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072906, + "balance_loss_mlp": 1.03850234, + "epoch": 0.7110427087341286, + "flos": 423398389248.0, + "grad_norm": 0.05344233224786611, + "language_loss": 0.78265321, + "learning_rate": 0.00020344214730343304, + "loss": 0.79338229, + "num_input_tokens_seen": 306419040, + "router_z_loss_mlp": 0.34448242, + "step": 3696, + "time_per_iteration": 2.60355544090271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070854, + "balance_loss_mlp": 1.03687966, + "epoch": 0.711235090419392, + "flos": 577107072000.0, + "grad_norm": 0.05731164613368461, + "language_loss": 0.79340208, + "learning_rate": 0.00020319137680387296, + "loss": 0.80411065, + "num_input_tokens_seen": 306503248, + "router_z_loss_mlp": 0.34008789, + "step": 3697, + "time_per_iteration": 2.9248886108398438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071143, + "balance_loss_mlp": 1.03712082, + "epoch": 0.7114274721046556, + "flos": 447830456832.0, + "grad_norm": 0.06826664171711681, + "language_loss": 0.80587053, + "learning_rate": 0.0002029407215356398, + "loss": 0.81658196, + "num_input_tokens_seen": 306566288, + "router_z_loss_mlp": 0.34057617, + "step": 3698, + "time_per_iteration": 2.5251829624176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066516, + "balance_loss_mlp": 1.03304207, + "epoch": 0.7116198537899192, + "flos": 621680527872.0, + "grad_norm": 0.05434937939483776, + "language_loss": 0.83318967, + "learning_rate": 0.00020269018159604663, + "loss": 0.84385484, + "num_input_tokens_seen": 306633344, + "router_z_loss_mlp": 0.33496094, + "step": 3699, + "time_per_iteration": 2.6997692584991455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062784, + "balance_loss_mlp": 1.02921486, + "epoch": 0.7118122354751828, + "flos": 498476947968.0, + "grad_norm": 0.04823068648652618, + "language_loss": 0.81931448, + "learning_rate": 0.00020243975708236162, + "loss": 0.82994235, + "num_input_tokens_seen": 306701328, + "router_z_loss_mlp": 0.3359375, + "step": 3700, + "time_per_iteration": 2.5654993057250977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071384, + "balance_loss_mlp": 1.03717113, + "epoch": 0.7120046171604463, + "flos": 572438071296.0, + "grad_norm": 0.09878181502627377, + "language_loss": 0.85897946, + "learning_rate": 0.00020218944809180818, + "loss": 0.86969334, + "num_input_tokens_seen": 306773168, + "router_z_loss_mlp": 0.3425293, + "step": 3701, + "time_per_iteration": 2.7016773223876953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070567, + "balance_loss_mlp": 1.03661633, + "epoch": 0.7121969988457099, + "flos": 572388609024.0, + "grad_norm": 0.07221648962243508, + "language_loss": 0.8452931, + "learning_rate": 0.00020193925472156493, + "loss": 0.85599875, + "num_input_tokens_seen": 306845312, + "router_z_loss_mlp": 0.33984375, + "step": 3702, + "time_per_iteration": 2.6914734840393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01034328, + "balance_loss_mlp": 1.02545857, + "epoch": 0.7123893805309734, + "flos": 1522585050624.0, + "grad_norm": 0.022091327023181177, + "language_loss": 0.74289167, + "learning_rate": 0.00020168917706876537, + "loss": 0.75323498, + "num_input_tokens_seen": 307079216, + "router_z_loss_mlp": 0.08886719, + "step": 3703, + "time_per_iteration": 4.884379148483276 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066468, + "balance_loss_mlp": 1.03304124, + "epoch": 0.712581762216237, + "flos": 614779476480.0, + "grad_norm": 0.06545400953207585, + "language_loss": 0.83676839, + "learning_rate": 0.00020143921523049863, + "loss": 0.84743309, + "num_input_tokens_seen": 307163568, + "router_z_loss_mlp": 0.33447266, + "step": 3704, + "time_per_iteration": 2.9219436645507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106542, + "balance_loss_mlp": 1.03185105, + "epoch": 0.7127741439015006, + "flos": 597504536064.0, + "grad_norm": 0.06577771502635076, + "language_loss": 0.835908, + "learning_rate": 0.00020118936930380837, + "loss": 0.84656215, + "num_input_tokens_seen": 307232800, + "router_z_loss_mlp": 0.3359375, + "step": 3705, + "time_per_iteration": 2.6833901405334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070499, + "balance_loss_mlp": 1.03635776, + "epoch": 0.7129665255867641, + "flos": 537138749952.0, + "grad_norm": 0.05242920734791126, + "language_loss": 0.80929446, + "learning_rate": 0.0002009396393856932, + "loss": 0.81999946, + "num_input_tokens_seen": 307307216, + "router_z_loss_mlp": 0.34179688, + "step": 3706, + "time_per_iteration": 2.6226556301116943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107105, + "balance_loss_mlp": 1.03707516, + "epoch": 0.7131589072720277, + "flos": 526173981696.0, + "grad_norm": 0.05578991259827158, + "language_loss": 0.82312477, + "learning_rate": 0.00020069002557310673, + "loss": 0.8338353, + "num_input_tokens_seen": 307377472, + "router_z_loss_mlp": 0.34008789, + "step": 3707, + "time_per_iteration": 2.6535470485687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064608, + "balance_loss_mlp": 1.0319922, + "epoch": 0.7133512889572913, + "flos": 530626194432.0, + "grad_norm": 0.0741438657284304, + "language_loss": 0.77105689, + "learning_rate": 0.00020044052796295807, + "loss": 0.78170288, + "num_input_tokens_seen": 307456880, + "router_z_loss_mlp": 0.32617188, + "step": 3708, + "time_per_iteration": 2.787355899810791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066524, + "balance_loss_mlp": 1.03226364, + "epoch": 0.7135436706425549, + "flos": 503282750976.0, + "grad_norm": 0.05095203093874289, + "language_loss": 0.82020175, + "learning_rate": 0.00020019114665211063, + "loss": 0.83086699, + "num_input_tokens_seen": 307524784, + "router_z_loss_mlp": 0.34301758, + "step": 3709, + "time_per_iteration": 2.5732407569885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070401, + "balance_loss_mlp": 1.03645074, + "epoch": 0.7137360523278183, + "flos": 515719954944.0, + "grad_norm": 0.04941715658479687, + "language_loss": 0.81220102, + "learning_rate": 0.00019994188173738276, + "loss": 0.82290506, + "num_input_tokens_seen": 307591408, + "router_z_loss_mlp": 0.33984375, + "step": 3710, + "time_per_iteration": 2.5564064979553223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068107, + "balance_loss_mlp": 1.03398967, + "epoch": 0.7139284340130819, + "flos": 510103664640.0, + "grad_norm": 0.05502854520341245, + "language_loss": 0.80873179, + "learning_rate": 0.0001996927333155477, + "loss": 0.81941289, + "num_input_tokens_seen": 307662912, + "router_z_loss_mlp": 0.34155273, + "step": 3711, + "time_per_iteration": 2.732224225997925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071876, + "balance_loss_mlp": 1.03825879, + "epoch": 0.7141208156983455, + "flos": 889896388608.0, + "grad_norm": 0.05033741502761429, + "language_loss": 0.85233271, + "learning_rate": 0.00019944370148333346, + "loss": 0.86305141, + "num_input_tokens_seen": 307752256, + "router_z_loss_mlp": 0.33642578, + "step": 3712, + "time_per_iteration": 3.213644504547119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107149, + "balance_loss_mlp": 1.03827798, + "epoch": 0.7143131973836091, + "flos": 535504780800.0, + "grad_norm": 0.05173411094558013, + "language_loss": 0.79739279, + "learning_rate": 0.00019919478633742278, + "loss": 0.80810767, + "num_input_tokens_seen": 307821504, + "router_z_loss_mlp": 0.33227539, + "step": 3713, + "time_per_iteration": 2.7310914993286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072393, + "balance_loss_mlp": 1.03884721, + "epoch": 0.7145055790688727, + "flos": 473429422080.0, + "grad_norm": 0.04797356179200618, + "language_loss": 0.85098791, + "learning_rate": 0.00019894598797445302, + "loss": 0.86171186, + "num_input_tokens_seen": 307886464, + "router_z_loss_mlp": 0.33569336, + "step": 3714, + "time_per_iteration": 2.5128626823425293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071822, + "balance_loss_mlp": 1.03796673, + "epoch": 0.7146979607541362, + "flos": 570227931648.0, + "grad_norm": 0.05105012604374378, + "language_loss": 0.81882799, + "learning_rate": 0.00019869730649101615, + "loss": 0.82954621, + "num_input_tokens_seen": 307962736, + "router_z_loss_mlp": 0.33886719, + "step": 3715, + "time_per_iteration": 2.7468035221099854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074372, + "balance_loss_mlp": 1.03965807, + "epoch": 0.7148903424393998, + "flos": 839299359744.0, + "grad_norm": 0.0561955521045174, + "language_loss": 0.72303152, + "learning_rate": 0.00019844874198365943, + "loss": 0.73377526, + "num_input_tokens_seen": 308046592, + "router_z_loss_mlp": 0.34765625, + "step": 3716, + "time_per_iteration": 3.0928800106048584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072381, + "balance_loss_mlp": 1.03807223, + "epoch": 0.7150827241246633, + "flos": 541560439296.0, + "grad_norm": 0.05538627322116671, + "language_loss": 0.83775991, + "learning_rate": 0.00019820029454888362, + "loss": 0.84848368, + "num_input_tokens_seen": 308119920, + "router_z_loss_mlp": 0.34326172, + "step": 3717, + "time_per_iteration": 2.6984283924102783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101976, + "balance_loss_mlp": 1.00993717, + "epoch": 0.7152751058099269, + "flos": 1582803859968.0, + "grad_norm": 0.008798476496045995, + "language_loss": 0.74521267, + "learning_rate": 0.00019795196428314455, + "loss": 0.75541025, + "num_input_tokens_seen": 308361024, + "router_z_loss_mlp": 0.09814453, + "step": 3718, + "time_per_iteration": 5.056431531906128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072324, + "balance_loss_mlp": 1.03775322, + "epoch": 0.7154674874951905, + "flos": 517167659520.0, + "grad_norm": 0.0523553620911167, + "language_loss": 0.80075788, + "learning_rate": 0.0001977037512828529, + "loss": 0.81148112, + "num_input_tokens_seen": 308429808, + "router_z_loss_mlp": 0.34594727, + "step": 3719, + "time_per_iteration": 2.57888126373291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068149, + "balance_loss_mlp": 1.03391242, + "epoch": 0.715659869180454, + "flos": 602246320128.0, + "grad_norm": 0.048902324655222526, + "language_loss": 0.86289543, + "learning_rate": 0.0001974556556443734, + "loss": 0.873577, + "num_input_tokens_seen": 308501888, + "router_z_loss_mlp": 0.3425293, + "step": 3720, + "time_per_iteration": 2.6931040287017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065993, + "balance_loss_mlp": 1.03206623, + "epoch": 0.7158522508657176, + "flos": 531403186176.0, + "grad_norm": 0.0436888691485468, + "language_loss": 0.88365716, + "learning_rate": 0.00019720767746402547, + "loss": 0.89431709, + "num_input_tokens_seen": 308576368, + "router_z_loss_mlp": 0.33959961, + "step": 3721, + "time_per_iteration": 2.7067127227783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072793, + "balance_loss_mlp": 1.03867531, + "epoch": 0.7160446325509812, + "flos": 557301897216.0, + "grad_norm": 0.0582274730279212, + "language_loss": 0.80045772, + "learning_rate": 0.00019695981683808222, + "loss": 0.8111856, + "num_input_tokens_seen": 308651936, + "router_z_loss_mlp": 0.34155273, + "step": 3722, + "time_per_iteration": 2.708950996398926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067549, + "balance_loss_mlp": 1.03405118, + "epoch": 0.7162370142362448, + "flos": 690664140288.0, + "grad_norm": 0.04509643904161843, + "language_loss": 0.84632957, + "learning_rate": 0.00019671207386277225, + "loss": 0.85700506, + "num_input_tokens_seen": 308737264, + "router_z_loss_mlp": 0.33520508, + "step": 3723, + "time_per_iteration": 2.9580013751983643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068909, + "balance_loss_mlp": 1.03462386, + "epoch": 0.7164293959215082, + "flos": 793772974080.0, + "grad_norm": 0.06707821988874196, + "language_loss": 0.77988201, + "learning_rate": 0.0001964644486342777, + "loss": 0.79057109, + "num_input_tokens_seen": 308811776, + "router_z_loss_mlp": 0.34326172, + "step": 3724, + "time_per_iteration": 2.937603712081909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067817, + "balance_loss_mlp": 1.03403354, + "epoch": 0.7166217776067718, + "flos": 493922838528.0, + "grad_norm": 0.05338190287132838, + "language_loss": 0.86470282, + "learning_rate": 0.00019621694124873524, + "loss": 0.87538099, + "num_input_tokens_seen": 308886704, + "router_z_loss_mlp": 0.33813477, + "step": 3725, + "time_per_iteration": 2.708923816680908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012685, + "balance_loss_mlp": 1.00305271, + "epoch": 0.7168141592920354, + "flos": 1400337524736.0, + "grad_norm": 0.004329548481597118, + "language_loss": 0.76540077, + "learning_rate": 0.00019596955180223557, + "loss": 0.7755276, + "num_input_tokens_seen": 309113456, + "router_z_loss_mlp": 0.09619141, + "step": 3726, + "time_per_iteration": 4.868973970413208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067858, + "balance_loss_mlp": 1.03323972, + "epoch": 0.717006540977299, + "flos": 792789368832.0, + "grad_norm": 0.04993242383663973, + "language_loss": 0.77399421, + "learning_rate": 0.00019572228039082428, + "loss": 0.78467286, + "num_input_tokens_seen": 309198768, + "router_z_loss_mlp": 0.34643555, + "step": 3727, + "time_per_iteration": 3.0444281101226807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063691, + "balance_loss_mlp": 1.02971661, + "epoch": 0.7171989226625626, + "flos": 554525761536.0, + "grad_norm": 0.045554501799563094, + "language_loss": 0.83411372, + "learning_rate": 0.0001954751271105002, + "loss": 0.84475064, + "num_input_tokens_seen": 309279680, + "router_z_loss_mlp": 0.34008789, + "step": 3728, + "time_per_iteration": 2.809967041015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065462, + "balance_loss_mlp": 1.03186858, + "epoch": 0.717391304347826, + "flos": 555628640256.0, + "grad_norm": 0.05755567657425633, + "language_loss": 0.80672932, + "learning_rate": 0.00019522809205721687, + "loss": 0.81738389, + "num_input_tokens_seen": 309359152, + "router_z_loss_mlp": 0.33618164, + "step": 3729, + "time_per_iteration": 2.7862703800201416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067947, + "balance_loss_mlp": 1.03459263, + "epoch": 0.7175836860330896, + "flos": 538582072320.0, + "grad_norm": 0.05354925450450462, + "language_loss": 0.82769603, + "learning_rate": 0.0001949811753268816, + "loss": 0.83837551, + "num_input_tokens_seen": 309432800, + "router_z_loss_mlp": 0.33374023, + "step": 3730, + "time_per_iteration": 2.6676440238952637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106439, + "balance_loss_mlp": 1.03046322, + "epoch": 0.7177760677183532, + "flos": 515385303552.0, + "grad_norm": 0.057530592847955, + "language_loss": 0.82664466, + "learning_rate": 0.00019473437701535634, + "loss": 0.8372885, + "num_input_tokens_seen": 309499456, + "router_z_loss_mlp": 0.33959961, + "step": 3731, + "time_per_iteration": 2.5901401042938232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061132, + "balance_loss_mlp": 1.02772939, + "epoch": 0.7179684494036168, + "flos": 674414913024.0, + "grad_norm": 0.05555536497682914, + "language_loss": 0.89367867, + "learning_rate": 0.00019448769721845677, + "loss": 0.90428996, + "num_input_tokens_seen": 309571056, + "router_z_loss_mlp": 0.33422852, + "step": 3732, + "time_per_iteration": 2.784381628036499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106645, + "balance_loss_mlp": 1.03192735, + "epoch": 0.7181608310888803, + "flos": 469672653312.0, + "grad_norm": 0.05444278495505657, + "language_loss": 0.85605729, + "learning_rate": 0.00019424113603195203, + "loss": 0.86672175, + "num_input_tokens_seen": 309635040, + "router_z_loss_mlp": 0.34570312, + "step": 3733, + "time_per_iteration": 2.5088841915130615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106796, + "balance_loss_mlp": 1.03343654, + "epoch": 0.7183532127741439, + "flos": 593645870592.0, + "grad_norm": 0.06008894294367452, + "language_loss": 0.79899514, + "learning_rate": 0.0001939946935515657, + "loss": 0.80967468, + "num_input_tokens_seen": 309713696, + "router_z_loss_mlp": 0.34570312, + "step": 3734, + "time_per_iteration": 2.8258321285247803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064654, + "balance_loss_mlp": 1.03065538, + "epoch": 0.7185455944594075, + "flos": 498669004800.0, + "grad_norm": 0.05732279387699742, + "language_loss": 0.80418706, + "learning_rate": 0.0001937483698729755, + "loss": 0.81483358, + "num_input_tokens_seen": 309782864, + "router_z_loss_mlp": 0.34008789, + "step": 3735, + "time_per_iteration": 2.5968332290649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065941, + "balance_loss_mlp": 1.03182328, + "epoch": 0.718737976144671, + "flos": 814590867456.0, + "grad_norm": 0.053801017075388924, + "language_loss": 0.82329178, + "learning_rate": 0.0001935021650918128, + "loss": 0.83395112, + "num_input_tokens_seen": 309867056, + "router_z_loss_mlp": 0.34155273, + "step": 3736, + "time_per_iteration": 2.982541084289551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063158, + "balance_loss_mlp": 1.02894521, + "epoch": 0.7189303578299346, + "flos": 438100987392.0, + "grad_norm": 0.06976823938990344, + "language_loss": 0.86880851, + "learning_rate": 0.0001932560793036625, + "loss": 0.87944007, + "num_input_tokens_seen": 309929744, + "router_z_loss_mlp": 0.3425293, + "step": 3737, + "time_per_iteration": 2.5138731002807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064404, + "balance_loss_mlp": 1.0309298, + "epoch": 0.7191227395151981, + "flos": 549137995776.0, + "grad_norm": 0.0607946285508029, + "language_loss": 0.8638792, + "learning_rate": 0.00019301011260406382, + "loss": 0.87452322, + "num_input_tokens_seen": 309998128, + "router_z_loss_mlp": 0.33496094, + "step": 3738, + "time_per_iteration": 2.628265619277954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065412, + "balance_loss_mlp": 1.03224778, + "epoch": 0.7193151212004617, + "flos": 626653656576.0, + "grad_norm": 0.05146382358147088, + "language_loss": 0.79296547, + "learning_rate": 0.00019276426508850936, + "loss": 0.80361962, + "num_input_tokens_seen": 310065472, + "router_z_loss_mlp": 0.33178711, + "step": 3739, + "time_per_iteration": 2.7006874084472656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065434, + "balance_loss_mlp": 1.03179288, + "epoch": 0.7195075028857253, + "flos": 740719904256.0, + "grad_norm": 0.046550971907091544, + "language_loss": 0.80166346, + "learning_rate": 0.00019251853685244564, + "loss": 0.81231779, + "num_input_tokens_seen": 310152960, + "router_z_loss_mlp": 0.33666992, + "step": 3740, + "time_per_iteration": 3.0175721645355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066355, + "balance_loss_mlp": 1.0327853, + "epoch": 0.7196998845709889, + "flos": 802523220480.0, + "grad_norm": 0.05930173376482813, + "language_loss": 0.80639338, + "learning_rate": 0.00019227292799127283, + "loss": 0.81705689, + "num_input_tokens_seen": 310234080, + "router_z_loss_mlp": 0.3359375, + "step": 3741, + "time_per_iteration": 3.074167251586914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069786, + "balance_loss_mlp": 1.03640747, + "epoch": 0.7198922662562524, + "flos": 924786865152.0, + "grad_norm": 0.05002690922956246, + "language_loss": 0.79003727, + "learning_rate": 0.00019202743860034454, + "loss": 0.80073518, + "num_input_tokens_seen": 310330208, + "router_z_loss_mlp": 0.33398438, + "step": 3742, + "time_per_iteration": 3.205714702606201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067652, + "balance_loss_mlp": 1.03448844, + "epoch": 0.7200846479415159, + "flos": 579838127616.0, + "grad_norm": 0.05345251644076864, + "language_loss": 0.83706784, + "learning_rate": 0.00019178206877496873, + "loss": 0.84774435, + "num_input_tokens_seen": 310402960, + "router_z_loss_mlp": 0.33178711, + "step": 3743, + "time_per_iteration": 2.6547601222991943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106189, + "balance_loss_mlp": 1.02834439, + "epoch": 0.7202770296267795, + "flos": 557410996224.0, + "grad_norm": 0.043135096200134324, + "language_loss": 0.85002279, + "learning_rate": 0.0001915368186104059, + "loss": 0.86064172, + "num_input_tokens_seen": 310479776, + "router_z_loss_mlp": 0.33569336, + "step": 3744, + "time_per_iteration": 2.740265130996704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066689, + "balance_loss_mlp": 1.03385842, + "epoch": 0.7204694113120431, + "flos": 672248443392.0, + "grad_norm": 0.0510098873102972, + "language_loss": 0.81037152, + "learning_rate": 0.0001912916882018706, + "loss": 0.82103842, + "num_input_tokens_seen": 310555952, + "router_z_loss_mlp": 0.32836914, + "step": 3745, + "time_per_iteration": 2.8475067615509033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068353, + "balance_loss_mlp": 1.03511715, + "epoch": 0.7206617929973067, + "flos": 798845027328.0, + "grad_norm": 0.058473767349389985, + "language_loss": 0.78699112, + "learning_rate": 0.00019104667764453125, + "loss": 0.79767466, + "num_input_tokens_seen": 310634784, + "router_z_loss_mlp": 0.33251953, + "step": 3746, + "time_per_iteration": 3.016134738922119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064862, + "balance_loss_mlp": 1.031793, + "epoch": 0.7208541746825702, + "flos": 531638913024.0, + "grad_norm": 0.04570203365425481, + "language_loss": 0.80496103, + "learning_rate": 0.00019080178703350926, + "loss": 0.81560969, + "num_input_tokens_seen": 310703216, + "router_z_loss_mlp": 0.33081055, + "step": 3747, + "time_per_iteration": 2.6047801971435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060751, + "balance_loss_mlp": 1.02682364, + "epoch": 0.7210465563678338, + "flos": 534883530240.0, + "grad_norm": 0.04791251301755464, + "language_loss": 0.82855403, + "learning_rate": 0.00019055701646387952, + "loss": 0.83916157, + "num_input_tokens_seen": 310776816, + "router_z_loss_mlp": 0.33959961, + "step": 3748, + "time_per_iteration": 2.6366617679595947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015113, + "balance_loss_mlp": 1.00548053, + "epoch": 0.7212389380530974, + "flos": 1533076955136.0, + "grad_norm": 0.0050303066243172915, + "language_loss": 0.80472684, + "learning_rate": 0.00019031236603067042, + "loss": 0.81487799, + "num_input_tokens_seen": 310987056, + "router_z_loss_mlp": 0.09619141, + "step": 3749, + "time_per_iteration": 4.800697326660156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067212, + "balance_loss_mlp": 1.03361845, + "epoch": 0.7214313197383609, + "flos": 461277407232.0, + "grad_norm": 0.05889548383130951, + "language_loss": 0.86542219, + "learning_rate": 0.00019006783582886368, + "loss": 0.87609434, + "num_input_tokens_seen": 311051648, + "router_z_loss_mlp": 0.33618164, + "step": 3750, + "time_per_iteration": 2.52746844291687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066015, + "balance_loss_mlp": 1.0318023, + "epoch": 0.7216237014236244, + "flos": 1036691025408.0, + "grad_norm": 0.046476584677382714, + "language_loss": 0.82800925, + "learning_rate": 0.00018982342595339437, + "loss": 0.83866942, + "num_input_tokens_seen": 311146272, + "router_z_loss_mlp": 0.3425293, + "step": 3751, + "time_per_iteration": 3.5170929431915283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067071, + "balance_loss_mlp": 1.03416932, + "epoch": 0.721816083108888, + "flos": 895578107904.0, + "grad_norm": 0.05167132755024372, + "language_loss": 0.81707644, + "learning_rate": 0.00018957913649915076, + "loss": 0.82774711, + "num_input_tokens_seen": 311223760, + "router_z_loss_mlp": 0.32910156, + "step": 3752, + "time_per_iteration": 3.1112849712371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010644, + "balance_loss_mlp": 1.03178465, + "epoch": 0.7220084647941516, + "flos": 523066166784.0, + "grad_norm": 0.05533376577602326, + "language_loss": 0.79672492, + "learning_rate": 0.00018933496756097428, + "loss": 0.80736887, + "num_input_tokens_seen": 311290336, + "router_z_loss_mlp": 0.32617188, + "step": 3753, + "time_per_iteration": 2.5987796783447266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064595, + "balance_loss_mlp": 1.03102577, + "epoch": 0.7222008464794152, + "flos": 815757765120.0, + "grad_norm": 0.05288107423325553, + "language_loss": 0.81242466, + "learning_rate": 0.0001890909192336603, + "loss": 0.82307053, + "num_input_tokens_seen": 311366240, + "router_z_loss_mlp": 0.3359375, + "step": 3754, + "time_per_iteration": 3.0019736289978027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065173, + "balance_loss_mlp": 1.03172278, + "epoch": 0.7223932281646788, + "flos": 748725244416.0, + "grad_norm": 0.049565047551570436, + "language_loss": 0.70085669, + "learning_rate": 0.00018884699161195623, + "loss": 0.71150839, + "num_input_tokens_seen": 311445184, + "router_z_loss_mlp": 0.3347168, + "step": 3755, + "time_per_iteration": 2.921433448791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106462, + "balance_loss_mlp": 1.03047848, + "epoch": 0.7225856098499422, + "flos": 745132829184.0, + "grad_norm": 0.05110029255023059, + "language_loss": 0.77537811, + "learning_rate": 0.00018860318479056327, + "loss": 0.78602433, + "num_input_tokens_seen": 311527280, + "router_z_loss_mlp": 0.34179688, + "step": 3756, + "time_per_iteration": 4.5331456661224365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064579, + "balance_loss_mlp": 1.03155816, + "epoch": 0.7227779915352058, + "flos": 547055894016.0, + "grad_norm": 0.047457603213344, + "language_loss": 0.835307, + "learning_rate": 0.00018835949886413555, + "loss": 0.84595281, + "num_input_tokens_seen": 311601552, + "router_z_loss_mlp": 0.33032227, + "step": 3757, + "time_per_iteration": 2.721592903137207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106988, + "balance_loss_mlp": 1.0362395, + "epoch": 0.7229703732204694, + "flos": 530230496256.0, + "grad_norm": 0.05570980366468543, + "language_loss": 0.78520513, + "learning_rate": 0.0001881159339272806, + "loss": 0.79590392, + "num_input_tokens_seen": 311670736, + "router_z_loss_mlp": 0.33666992, + "step": 3758, + "time_per_iteration": 2.6724090576171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066008, + "balance_loss_mlp": 1.03289187, + "epoch": 0.723162754905733, + "flos": 528103314432.0, + "grad_norm": 0.05510744793319723, + "language_loss": 0.7836262, + "learning_rate": 0.00018787249007455858, + "loss": 0.79428625, + "num_input_tokens_seen": 311736800, + "router_z_loss_mlp": 0.33129883, + "step": 3759, + "time_per_iteration": 2.608786106109619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065133, + "balance_loss_mlp": 1.03292298, + "epoch": 0.7233551365909965, + "flos": 654571860480.0, + "grad_norm": 0.051481631649939415, + "language_loss": 0.71461964, + "learning_rate": 0.00018762916740048302, + "loss": 0.72527099, + "num_input_tokens_seen": 311806064, + "router_z_loss_mlp": 0.32202148, + "step": 3760, + "time_per_iteration": 2.768165111541748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064991, + "balance_loss_mlp": 1.03201807, + "epoch": 0.7235475182762601, + "flos": 522097118208.0, + "grad_norm": 0.045655130957968595, + "language_loss": 0.85612011, + "learning_rate": 0.0001873859659995195, + "loss": 0.86677003, + "num_input_tokens_seen": 311881280, + "router_z_loss_mlp": 0.32983398, + "step": 3761, + "time_per_iteration": 2.749396800994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106691, + "balance_loss_mlp": 1.03357887, + "epoch": 0.7237398999615237, + "flos": 608883941376.0, + "grad_norm": 0.05437044634391734, + "language_loss": 0.83492088, + "learning_rate": 0.0001871428859660878, + "loss": 0.84559, + "num_input_tokens_seen": 311953696, + "router_z_loss_mlp": 0.33349609, + "step": 3762, + "time_per_iteration": 2.767180919647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107084, + "balance_loss_mlp": 1.03820074, + "epoch": 0.7239322816467872, + "flos": 658664690688.0, + "grad_norm": 0.04804139363705488, + "language_loss": 0.82056308, + "learning_rate": 0.00018689992739455975, + "loss": 0.83127153, + "num_input_tokens_seen": 312032752, + "router_z_loss_mlp": 0.32641602, + "step": 3763, + "time_per_iteration": 2.8873496055603027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071938, + "balance_loss_mlp": 1.03803444, + "epoch": 0.7241246633320508, + "flos": 968869928448.0, + "grad_norm": 0.04487268066979416, + "language_loss": 0.85964411, + "learning_rate": 0.00018665709037926027, + "loss": 0.87036347, + "num_input_tokens_seen": 312120800, + "router_z_loss_mlp": 0.33935547, + "step": 3764, + "time_per_iteration": 3.2812607288360596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067601, + "balance_loss_mlp": 1.03429401, + "epoch": 0.7243170450173143, + "flos": 514745114112.0, + "grad_norm": 0.06636395802329886, + "language_loss": 0.84182644, + "learning_rate": 0.00018641437501446694, + "loss": 0.85250252, + "num_input_tokens_seen": 312188416, + "router_z_loss_mlp": 0.33325195, + "step": 3765, + "time_per_iteration": 2.573697328567505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069298, + "balance_loss_mlp": 1.03668237, + "epoch": 0.7245094267025779, + "flos": 559482923520.0, + "grad_norm": 0.05849002982454381, + "language_loss": 0.82240844, + "learning_rate": 0.0001861717813944104, + "loss": 0.83310151, + "num_input_tokens_seen": 312257792, + "router_z_loss_mlp": 0.32617188, + "step": 3766, + "time_per_iteration": 2.630692481994629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070145, + "balance_loss_mlp": 1.03674293, + "epoch": 0.7247018083878415, + "flos": 612359903232.0, + "grad_norm": 0.059142078563837144, + "language_loss": 0.7934258, + "learning_rate": 0.00018592930961327365, + "loss": 0.80412722, + "num_input_tokens_seen": 312328544, + "router_z_loss_mlp": 0.33422852, + "step": 3767, + "time_per_iteration": 2.714850902557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069775, + "balance_loss_mlp": 1.03694439, + "epoch": 0.7248941900731051, + "flos": 634379599872.0, + "grad_norm": 0.04667094016302488, + "language_loss": 0.8795737, + "learning_rate": 0.00018568695976519273, + "loss": 0.89027148, + "num_input_tokens_seen": 312405888, + "router_z_loss_mlp": 0.32836914, + "step": 3768, + "time_per_iteration": 2.78951358795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067478, + "balance_loss_mlp": 1.03433776, + "epoch": 0.7250865717583687, + "flos": 424718055936.0, + "grad_norm": 0.05715863238838566, + "language_loss": 0.80076563, + "learning_rate": 0.00018544473194425593, + "loss": 0.81144047, + "num_input_tokens_seen": 312469552, + "router_z_loss_mlp": 0.33154297, + "step": 3769, + "time_per_iteration": 2.5101308822631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068926, + "balance_loss_mlp": 1.03542805, + "epoch": 0.7252789534436321, + "flos": 634794236928.0, + "grad_norm": 0.05221621035796038, + "language_loss": 0.78552115, + "learning_rate": 0.00018520262624450485, + "loss": 0.79621041, + "num_input_tokens_seen": 312548848, + "router_z_loss_mlp": 0.33520508, + "step": 3770, + "time_per_iteration": 2.851344347000122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065232, + "balance_loss_mlp": 1.03247309, + "epoch": 0.7254713351288957, + "flos": 616895073792.0, + "grad_norm": 0.05281322327607285, + "language_loss": 0.86844021, + "learning_rate": 0.00018496064275993324, + "loss": 0.87909257, + "num_input_tokens_seen": 312622016, + "router_z_loss_mlp": 0.32763672, + "step": 3771, + "time_per_iteration": 2.740528106689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065588, + "balance_loss_mlp": 1.03132713, + "epoch": 0.7256637168141593, + "flos": 766662285312.0, + "grad_norm": 0.053619752531576234, + "language_loss": 0.81698912, + "learning_rate": 0.00018471878158448686, + "loss": 0.82764494, + "num_input_tokens_seen": 312696960, + "router_z_loss_mlp": 0.34301758, + "step": 3772, + "time_per_iteration": 2.940927028656006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068836, + "balance_loss_mlp": 1.03538561, + "epoch": 0.7258560984994229, + "flos": 495268646400.0, + "grad_norm": 0.044202669157845896, + "language_loss": 0.8410005, + "learning_rate": 0.00018447704281206512, + "loss": 0.85168886, + "num_input_tokens_seen": 312774352, + "router_z_loss_mlp": 0.3347168, + "step": 3773, + "time_per_iteration": 2.9211905002593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010641, + "balance_loss_mlp": 1.03050709, + "epoch": 0.7260484801846864, + "flos": 529802712576.0, + "grad_norm": 0.0599389288946333, + "language_loss": 0.82910264, + "learning_rate": 0.0001842354265365191, + "loss": 0.83974361, + "num_input_tokens_seen": 312849600, + "router_z_loss_mlp": 0.33618164, + "step": 3774, + "time_per_iteration": 2.672297477722168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067714, + "balance_loss_mlp": 1.03478813, + "epoch": 0.72624086186995, + "flos": 624679243776.0, + "grad_norm": 0.055766679807351886, + "language_loss": 0.80738944, + "learning_rate": 0.0001839939328516526, + "loss": 0.81806654, + "num_input_tokens_seen": 312922688, + "router_z_loss_mlp": 0.32910156, + "step": 3775, + "time_per_iteration": 2.715765953063965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067661, + "balance_loss_mlp": 1.03475976, + "epoch": 0.7264332435552135, + "flos": 716203468800.0, + "grad_norm": 0.054689232806286694, + "language_loss": 0.80927253, + "learning_rate": 0.0001837525618512218, + "loss": 0.81994909, + "num_input_tokens_seen": 312997728, + "router_z_loss_mlp": 0.32910156, + "step": 3776, + "time_per_iteration": 2.9182283878326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067478, + "balance_loss_mlp": 1.03467178, + "epoch": 0.7266256252404771, + "flos": 680736821760.0, + "grad_norm": 0.056616455322331526, + "language_loss": 0.83123744, + "learning_rate": 0.00018351131362893519, + "loss": 0.84191227, + "num_input_tokens_seen": 313067168, + "router_z_loss_mlp": 0.328125, + "step": 3777, + "time_per_iteration": 2.8280246257781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065965, + "balance_loss_mlp": 1.03227687, + "epoch": 0.7268180069257407, + "flos": 518654651904.0, + "grad_norm": 0.0757528299469481, + "language_loss": 0.80649394, + "learning_rate": 0.00018327018827845364, + "loss": 0.81715357, + "num_input_tokens_seen": 313134688, + "router_z_loss_mlp": 0.3371582, + "step": 3778, + "time_per_iteration": 2.6342718601226807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065833, + "balance_loss_mlp": 1.03221643, + "epoch": 0.7270103886110042, + "flos": 512411318784.0, + "grad_norm": 0.05462394949163198, + "language_loss": 0.87201697, + "learning_rate": 0.00018302918589339036, + "loss": 0.88267529, + "num_input_tokens_seen": 313204816, + "router_z_loss_mlp": 0.33642578, + "step": 3779, + "time_per_iteration": 2.6401546001434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065353, + "balance_loss_mlp": 1.03166389, + "epoch": 0.7272027702962678, + "flos": 546395355648.0, + "grad_norm": 0.050485328839168696, + "language_loss": 0.90140432, + "learning_rate": 0.00018278830656731054, + "loss": 0.91205782, + "num_input_tokens_seen": 313274288, + "router_z_loss_mlp": 0.3371582, + "step": 3780, + "time_per_iteration": 2.6837782859802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060977, + "balance_loss_mlp": 1.02883863, + "epoch": 0.7273951519815314, + "flos": 592772926464.0, + "grad_norm": 0.04496338740790305, + "language_loss": 0.86495197, + "learning_rate": 0.00018254755039373222, + "loss": 0.87556171, + "num_input_tokens_seen": 313344800, + "router_z_loss_mlp": 0.32128906, + "step": 3781, + "time_per_iteration": 2.7322683334350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063791, + "balance_loss_mlp": 1.03084135, + "epoch": 0.727587533666795, + "flos": 605732456448.0, + "grad_norm": 0.056903164121683655, + "language_loss": 0.83278424, + "learning_rate": 0.0001823069174661252, + "loss": 0.84342206, + "num_input_tokens_seen": 313417840, + "router_z_loss_mlp": 0.32958984, + "step": 3782, + "time_per_iteration": 2.75710129737854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067494, + "balance_loss_mlp": 1.03380585, + "epoch": 0.7277799153520584, + "flos": 512770701312.0, + "grad_norm": 0.05370507093110541, + "language_loss": 0.78568602, + "learning_rate": 0.00018206640787791112, + "loss": 0.79636097, + "num_input_tokens_seen": 313485936, + "router_z_loss_mlp": 0.3371582, + "step": 3783, + "time_per_iteration": 2.61852765083313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062618, + "balance_loss_mlp": 1.02923894, + "epoch": 0.727972297037322, + "flos": 537498132480.0, + "grad_norm": 0.05379721469366117, + "language_loss": 0.85843956, + "learning_rate": 0.00018182602172246416, + "loss": 0.8690657, + "num_input_tokens_seen": 313553136, + "router_z_loss_mlp": 0.33398438, + "step": 3784, + "time_per_iteration": 2.593327522277832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061524, + "balance_loss_mlp": 1.02819335, + "epoch": 0.7281646787225856, + "flos": 534780223488.0, + "grad_norm": 0.06658957148496236, + "language_loss": 0.76393896, + "learning_rate": 0.00018158575909311075, + "loss": 0.77455419, + "num_input_tokens_seen": 313620128, + "router_z_loss_mlp": 0.33349609, + "step": 3785, + "time_per_iteration": 2.600620985031128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106257, + "balance_loss_mlp": 1.02935863, + "epoch": 0.7283570604078492, + "flos": 624767993856.0, + "grad_norm": 0.053054924881327924, + "language_loss": 0.79626518, + "learning_rate": 0.000181345620083129, + "loss": 0.80689085, + "num_input_tokens_seen": 313696432, + "router_z_loss_mlp": 0.33227539, + "step": 3786, + "time_per_iteration": 2.746778726577759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065243, + "balance_loss_mlp": 1.03255534, + "epoch": 0.7285494420931128, + "flos": 533904307200.0, + "grad_norm": 0.097300641099862, + "language_loss": 0.86717927, + "learning_rate": 0.00018110560478574927, + "loss": 0.8778317, + "num_input_tokens_seen": 313768416, + "router_z_loss_mlp": 0.3269043, + "step": 3787, + "time_per_iteration": 2.6793131828308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065255, + "balance_loss_mlp": 1.03147149, + "epoch": 0.7287418237783763, + "flos": 666251011584.0, + "grad_norm": 0.05707772132850956, + "language_loss": 0.80307966, + "learning_rate": 0.0001808657132941533, + "loss": 0.81373221, + "num_input_tokens_seen": 313839888, + "router_z_loss_mlp": 0.33813477, + "step": 3788, + "time_per_iteration": 2.7490005493164062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065453, + "balance_loss_mlp": 1.03147793, + "epoch": 0.7289342054636399, + "flos": 550344181248.0, + "grad_norm": 0.05691575768916977, + "language_loss": 0.82927215, + "learning_rate": 0.00018062594570147572, + "loss": 0.83992666, + "num_input_tokens_seen": 313908832, + "router_z_loss_mlp": 0.33984375, + "step": 3789, + "time_per_iteration": 2.584277391433716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063831, + "balance_loss_mlp": 1.03109622, + "epoch": 0.7291265871489034, + "flos": 687620344320.0, + "grad_norm": 0.05865206546440876, + "language_loss": 0.85141826, + "learning_rate": 0.00018038630210080243, + "loss": 0.86205661, + "num_input_tokens_seen": 313982672, + "router_z_loss_mlp": 0.32739258, + "step": 3790, + "time_per_iteration": 2.7913711071014404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010669, + "balance_loss_mlp": 1.03421283, + "epoch": 0.729318968834167, + "flos": 572388609024.0, + "grad_norm": 0.08871994753922169, + "language_loss": 0.8494693, + "learning_rate": 0.0001801467825851712, + "loss": 0.8601383, + "num_input_tokens_seen": 314057184, + "router_z_loss_mlp": 0.3269043, + "step": 3791, + "time_per_iteration": 2.7232275009155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064513, + "balance_loss_mlp": 1.03013325, + "epoch": 0.7295113505194305, + "flos": 585786097152.0, + "grad_norm": 0.05597763782774928, + "language_loss": 0.78437781, + "learning_rate": 0.00017990738724757172, + "loss": 0.79502296, + "num_input_tokens_seen": 314137344, + "router_z_loss_mlp": 0.34423828, + "step": 3792, + "time_per_iteration": 2.8646349906921387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070211, + "balance_loss_mlp": 1.03664136, + "epoch": 0.7297037322046941, + "flos": 706872669696.0, + "grad_norm": 0.0454122102846594, + "language_loss": 0.82281637, + "learning_rate": 0.00017966811618094598, + "loss": 0.83351851, + "num_input_tokens_seen": 314214464, + "router_z_loss_mlp": 0.3359375, + "step": 3793, + "time_per_iteration": 2.9363014698028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065082, + "balance_loss_mlp": 1.03148866, + "epoch": 0.7298961138899577, + "flos": 487039315968.0, + "grad_norm": 0.060918230322826325, + "language_loss": 0.84644252, + "learning_rate": 0.00017942896947818664, + "loss": 0.85709333, + "num_input_tokens_seen": 314280432, + "router_z_loss_mlp": 0.33618164, + "step": 3794, + "time_per_iteration": 2.634622097015381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014366, + "balance_loss_mlp": 1.00473428, + "epoch": 0.7300884955752213, + "flos": 1365102222336.0, + "grad_norm": 0.006306847562880891, + "language_loss": 0.74825054, + "learning_rate": 0.000179189947232139, + "loss": 0.75839418, + "num_input_tokens_seen": 314497152, + "router_z_loss_mlp": 0.09619141, + "step": 3795, + "time_per_iteration": 4.8498523235321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067916, + "balance_loss_mlp": 1.03434658, + "epoch": 0.7302808772604849, + "flos": 531550162944.0, + "grad_norm": 0.07784703337464734, + "language_loss": 0.85064995, + "learning_rate": 0.00017895104953559947, + "loss": 0.86132914, + "num_input_tokens_seen": 314565488, + "router_z_loss_mlp": 0.33569336, + "step": 3796, + "time_per_iteration": 2.578749418258667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069311, + "balance_loss_mlp": 1.03533602, + "epoch": 0.7304732589457483, + "flos": 435949074432.0, + "grad_norm": 0.06903187092561903, + "language_loss": 0.8945868, + "learning_rate": 0.00017871227648131672, + "loss": 0.90527987, + "num_input_tokens_seen": 314627392, + "router_z_loss_mlp": 0.34008789, + "step": 3797, + "time_per_iteration": 2.498368740081787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064089, + "balance_loss_mlp": 1.03071082, + "epoch": 0.7306656406310119, + "flos": 451376229888.0, + "grad_norm": 0.049186518116542115, + "language_loss": 0.82359099, + "learning_rate": 0.0001784736281619907, + "loss": 0.83423185, + "num_input_tokens_seen": 314695440, + "router_z_loss_mlp": 0.33398438, + "step": 3798, + "time_per_iteration": 2.5968668460845947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063054, + "balance_loss_mlp": 1.02898395, + "epoch": 0.7308580223162755, + "flos": 511756572672.0, + "grad_norm": 0.049616480799322744, + "language_loss": 0.74341989, + "learning_rate": 0.00017823510467027232, + "loss": 0.75405043, + "num_input_tokens_seen": 314772592, + "router_z_loss_mlp": 0.34106445, + "step": 3799, + "time_per_iteration": 2.733454465866089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063516, + "balance_loss_mlp": 1.02930331, + "epoch": 0.7310504040015391, + "flos": 375209349120.0, + "grad_norm": 0.0582146456406939, + "language_loss": 0.78020084, + "learning_rate": 0.00017799670609876516, + "loss": 0.79083604, + "num_input_tokens_seen": 314836192, + "router_z_loss_mlp": 0.3425293, + "step": 3800, + "time_per_iteration": 4.01823616027832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065254, + "balance_loss_mlp": 1.03135109, + "epoch": 0.7312427856868026, + "flos": 549073976832.0, + "grad_norm": 0.04960878758692363, + "language_loss": 0.8857708, + "learning_rate": 0.00017775843254002366, + "loss": 0.89642334, + "num_input_tokens_seen": 314908400, + "router_z_loss_mlp": 0.33935547, + "step": 3801, + "time_per_iteration": 2.6998913288116455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063333, + "balance_loss_mlp": 1.03014541, + "epoch": 0.7314351673720662, + "flos": 766880483328.0, + "grad_norm": 0.0540974976561695, + "language_loss": 0.84199798, + "learning_rate": 0.00017752028408655367, + "loss": 0.85263133, + "num_input_tokens_seen": 314995280, + "router_z_loss_mlp": 0.33203125, + "step": 3802, + "time_per_iteration": 3.058145523071289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064093, + "balance_loss_mlp": 1.03102422, + "epoch": 0.7316275490573297, + "flos": 486492258816.0, + "grad_norm": 0.051110561372661595, + "language_loss": 0.85141397, + "learning_rate": 0.00017728226083081272, + "loss": 0.86205482, + "num_input_tokens_seen": 315063056, + "router_z_loss_mlp": 0.33081055, + "step": 3803, + "time_per_iteration": 2.5310099124908447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065165, + "balance_loss_mlp": 1.03166723, + "epoch": 0.7318199307425933, + "flos": 473183520768.0, + "grad_norm": 0.05616081836254539, + "language_loss": 0.81485891, + "learning_rate": 0.00017704436286520965, + "loss": 0.8255105, + "num_input_tokens_seen": 315128896, + "router_z_loss_mlp": 0.33520508, + "step": 3804, + "time_per_iteration": 2.568283796310425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063426, + "balance_loss_mlp": 1.02952337, + "epoch": 0.7320123124278569, + "flos": 549202014720.0, + "grad_norm": 0.05320670127317765, + "language_loss": 0.84491169, + "learning_rate": 0.0001768065902821046, + "loss": 0.855546, + "num_input_tokens_seen": 315198464, + "router_z_loss_mlp": 0.33935547, + "step": 3805, + "time_per_iteration": 2.605682134628296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061947, + "balance_loss_mlp": 1.02751899, + "epoch": 0.7322046941131204, + "flos": 570502946304.0, + "grad_norm": 0.06611321477092025, + "language_loss": 0.8209759, + "learning_rate": 0.00017656894317380907, + "loss": 0.83159536, + "num_input_tokens_seen": 315270240, + "router_z_loss_mlp": 0.34472656, + "step": 3806, + "time_per_iteration": 2.7116403579711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010208, + "balance_loss_mlp": 1.00062358, + "epoch": 0.732397075798384, + "flos": 1468334559744.0, + "grad_norm": 0.00621008772312024, + "language_loss": 0.76031268, + "learning_rate": 0.00017633142163258565, + "loss": 0.77041477, + "num_input_tokens_seen": 315502448, + "router_z_loss_mlp": 0.09570312, + "step": 3807, + "time_per_iteration": 4.968751668930054 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061209, + "balance_loss_mlp": 1.0275209, + "epoch": 0.7325894574836476, + "flos": 464620948992.0, + "grad_norm": 0.05827651043720701, + "language_loss": 0.83991838, + "learning_rate": 0.00017609402575064875, + "loss": 0.85053051, + "num_input_tokens_seen": 315569472, + "router_z_loss_mlp": 0.3371582, + "step": 3808, + "time_per_iteration": 2.5385282039642334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063699, + "balance_loss_mlp": 1.03003407, + "epoch": 0.7327818391689112, + "flos": 495246887424.0, + "grad_norm": 0.05735407240941104, + "language_loss": 0.80858552, + "learning_rate": 0.00017585675562016367, + "loss": 0.81922251, + "num_input_tokens_seen": 315637632, + "router_z_loss_mlp": 0.33691406, + "step": 3809, + "time_per_iteration": 2.555299997329712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007774, + "balance_loss_mlp": 0.99823719, + "epoch": 0.7329742208541746, + "flos": 1432694794752.0, + "grad_norm": 0.0030976704675862504, + "language_loss": 0.77212846, + "learning_rate": 0.0001756196113332465, + "loss": 0.78220618, + "num_input_tokens_seen": 315863648, + "router_z_loss_mlp": 0.09521484, + "step": 3810, + "time_per_iteration": 4.790294647216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062704, + "balance_loss_mlp": 1.02894437, + "epoch": 0.7331666025394382, + "flos": 496645129728.0, + "grad_norm": 0.057652785058487796, + "language_loss": 0.84699941, + "learning_rate": 0.00017538259298196474, + "loss": 0.85762644, + "num_input_tokens_seen": 315930752, + "router_z_loss_mlp": 0.33789062, + "step": 3811, + "time_per_iteration": 2.5608150959014893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066548, + "balance_loss_mlp": 1.03271604, + "epoch": 0.7333589842247018, + "flos": 538247420928.0, + "grad_norm": 0.07102765773461414, + "language_loss": 0.81726062, + "learning_rate": 0.00017514570065833745, + "loss": 0.82792604, + "num_input_tokens_seen": 316006400, + "router_z_loss_mlp": 0.33862305, + "step": 3812, + "time_per_iteration": 2.733987808227539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063175, + "balance_loss_mlp": 1.03024936, + "epoch": 0.7335513659099654, + "flos": 490825198080.0, + "grad_norm": 0.0783727795203613, + "language_loss": 0.80580723, + "learning_rate": 0.00017490893445433426, + "loss": 0.81643891, + "num_input_tokens_seen": 316075824, + "router_z_loss_mlp": 0.3293457, + "step": 3813, + "time_per_iteration": 2.5801103115081787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062414, + "balance_loss_mlp": 1.02953637, + "epoch": 0.733743747595229, + "flos": 561876355584.0, + "grad_norm": 0.048847975772381425, + "language_loss": 0.81069362, + "learning_rate": 0.00017467229446187587, + "loss": 0.82131779, + "num_input_tokens_seen": 316148336, + "router_z_loss_mlp": 0.32885742, + "step": 3814, + "time_per_iteration": 2.683293104171753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060801, + "balance_loss_mlp": 1.02684999, + "epoch": 0.7339361292804925, + "flos": 538315822080.0, + "grad_norm": 0.047730041635456175, + "language_loss": 0.81664294, + "learning_rate": 0.00017443578077283424, + "loss": 0.82725096, + "num_input_tokens_seen": 316220960, + "router_z_loss_mlp": 0.33984375, + "step": 3815, + "time_per_iteration": 2.641364574432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064457, + "balance_loss_mlp": 1.03043437, + "epoch": 0.734128510965756, + "flos": 548198060544.0, + "grad_norm": 0.05243488536705766, + "language_loss": 0.85093778, + "learning_rate": 0.0001741993934790319, + "loss": 0.86158234, + "num_input_tokens_seen": 316295824, + "router_z_loss_mlp": 0.34057617, + "step": 3816, + "time_per_iteration": 2.7296290397644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060693, + "balance_loss_mlp": 1.027004, + "epoch": 0.7343208926510196, + "flos": 539783875584.0, + "grad_norm": 0.059294435662015, + "language_loss": 0.84253871, + "learning_rate": 0.00017396313267224273, + "loss": 0.85314572, + "num_input_tokens_seen": 316368064, + "router_z_loss_mlp": 0.3371582, + "step": 3817, + "time_per_iteration": 2.702885866165161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064529, + "balance_loss_mlp": 1.03141296, + "epoch": 0.7345132743362832, + "flos": 570827423232.0, + "grad_norm": 0.058276166249488254, + "language_loss": 0.88087535, + "learning_rate": 0.0001737269984441912, + "loss": 0.89152062, + "num_input_tokens_seen": 316437440, + "router_z_loss_mlp": 0.33129883, + "step": 3818, + "time_per_iteration": 2.6317105293273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064679, + "balance_loss_mlp": 1.03089499, + "epoch": 0.7347056560215467, + "flos": 545135325696.0, + "grad_norm": 0.04588849649553848, + "language_loss": 0.84933245, + "learning_rate": 0.00017349099088655263, + "loss": 0.85997921, + "num_input_tokens_seen": 316511936, + "router_z_loss_mlp": 0.33813477, + "step": 3819, + "time_per_iteration": 2.6894302368164062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063138, + "balance_loss_mlp": 1.03023624, + "epoch": 0.7348980377068103, + "flos": 595668335616.0, + "grad_norm": 0.04507487661925427, + "language_loss": 0.80804777, + "learning_rate": 0.00017325511009095375, + "loss": 0.81867915, + "num_input_tokens_seen": 316584304, + "router_z_loss_mlp": 0.32910156, + "step": 3820, + "time_per_iteration": 2.7293684482574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106202, + "balance_loss_mlp": 1.02833104, + "epoch": 0.7350904193920739, + "flos": 538291090944.0, + "grad_norm": 0.05281271554601035, + "language_loss": 0.83436865, + "learning_rate": 0.00017301935614897113, + "loss": 0.84498882, + "num_input_tokens_seen": 316659024, + "router_z_loss_mlp": 0.3371582, + "step": 3821, + "time_per_iteration": 2.727043390274048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065798, + "balance_loss_mlp": 1.03199053, + "epoch": 0.7352828010773375, + "flos": 512712474624.0, + "grad_norm": 0.049847760142976955, + "language_loss": 0.81776285, + "learning_rate": 0.00017278372915213274, + "loss": 0.82842088, + "num_input_tokens_seen": 316732544, + "router_z_loss_mlp": 0.33837891, + "step": 3822, + "time_per_iteration": 2.650468587875366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016098, + "balance_loss_mlp": 1.00732386, + "epoch": 0.735475182762601, + "flos": 1552965087744.0, + "grad_norm": 0.006919711828678118, + "language_loss": 0.79893845, + "learning_rate": 0.00017254822919191693, + "loss": 0.80909944, + "num_input_tokens_seen": 316967104, + "router_z_loss_mlp": 0.08789062, + "step": 3823, + "time_per_iteration": 4.953552007675171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064811, + "balance_loss_mlp": 1.03152812, + "epoch": 0.7356675644478645, + "flos": 680984133120.0, + "grad_norm": 0.05477130008948058, + "language_loss": 0.80415845, + "learning_rate": 0.00017231285635975314, + "loss": 0.81480658, + "num_input_tokens_seen": 317048304, + "router_z_loss_mlp": 0.33300781, + "step": 3824, + "time_per_iteration": 2.889289140701294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067221, + "balance_loss_mlp": 1.03334153, + "epoch": 0.7358599461331281, + "flos": 514961902080.0, + "grad_norm": 0.05024116025531215, + "language_loss": 0.83180618, + "learning_rate": 0.00017207761074702115, + "loss": 0.84247839, + "num_input_tokens_seen": 317115968, + "router_z_loss_mlp": 0.33911133, + "step": 3825, + "time_per_iteration": 2.5944931507110596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068957, + "balance_loss_mlp": 1.03491116, + "epoch": 0.7360523278183917, + "flos": 443739036672.0, + "grad_norm": 0.05416022756752086, + "language_loss": 0.83636504, + "learning_rate": 0.0001718424924450514, + "loss": 0.8470546, + "num_input_tokens_seen": 317185680, + "router_z_loss_mlp": 0.34082031, + "step": 3826, + "time_per_iteration": 2.6031198501586914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067291, + "balance_loss_mlp": 1.03441358, + "epoch": 0.7362447095036553, + "flos": 603142585344.0, + "grad_norm": 0.04455430936789472, + "language_loss": 0.85882723, + "learning_rate": 0.00017160750154512482, + "loss": 0.86950016, + "num_input_tokens_seen": 317258800, + "router_z_loss_mlp": 0.32885742, + "step": 3827, + "time_per_iteration": 2.702148914337158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067373, + "balance_loss_mlp": 1.03470922, + "epoch": 0.7364370911889189, + "flos": 552807424512.0, + "grad_norm": 0.06654318382518472, + "language_loss": 0.83394545, + "learning_rate": 0.0001713726381384731, + "loss": 0.84461915, + "num_input_tokens_seen": 317334608, + "router_z_loss_mlp": 0.32666016, + "step": 3828, + "time_per_iteration": 2.7451815605163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069365, + "balance_loss_mlp": 1.03622484, + "epoch": 0.7366294728741823, + "flos": 448830028800.0, + "grad_norm": 0.05260282371151395, + "language_loss": 0.81186259, + "learning_rate": 0.00017113790231627812, + "loss": 0.82255614, + "num_input_tokens_seen": 317397504, + "router_z_loss_mlp": 0.33154297, + "step": 3829, + "time_per_iteration": 2.537193775177002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017376, + "balance_loss_mlp": 1.00879276, + "epoch": 0.7368218545594459, + "flos": 1534705132032.0, + "grad_norm": 0.0074062815552694275, + "language_loss": 0.79258227, + "learning_rate": 0.0001709032941696726, + "loss": 0.80275595, + "num_input_tokens_seen": 317611472, + "router_z_loss_mlp": 0.0859375, + "step": 3830, + "time_per_iteration": 4.833421945571899 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069316, + "balance_loss_mlp": 1.03584218, + "epoch": 0.7370142362447095, + "flos": 515164133376.0, + "grad_norm": 0.05241835365741791, + "language_loss": 0.81748456, + "learning_rate": 0.00017066881378973936, + "loss": 0.82817769, + "num_input_tokens_seen": 317681328, + "router_z_loss_mlp": 0.33496094, + "step": 3831, + "time_per_iteration": 2.619849443435669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071106, + "balance_loss_mlp": 1.03808546, + "epoch": 0.7372066179299731, + "flos": 500531346432.0, + "grad_norm": 0.056102661804596575, + "language_loss": 0.82564443, + "learning_rate": 0.00017043446126751189, + "loss": 0.83635545, + "num_input_tokens_seen": 317752336, + "router_z_loss_mlp": 0.33032227, + "step": 3832, + "time_per_iteration": 2.689955711364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069455, + "balance_loss_mlp": 1.03605282, + "epoch": 0.7373989996152366, + "flos": 557814048768.0, + "grad_norm": 0.062254186962725604, + "language_loss": 0.76771331, + "learning_rate": 0.00017020023669397376, + "loss": 0.77840781, + "num_input_tokens_seen": 317824112, + "router_z_loss_mlp": 0.33422852, + "step": 3833, + "time_per_iteration": 2.7102112770080566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071175, + "balance_loss_mlp": 1.03722405, + "epoch": 0.7375913813005002, + "flos": 506527368192.0, + "grad_norm": 0.05138473189519923, + "language_loss": 0.81401753, + "learning_rate": 0.0001699661401600589, + "loss": 0.82472932, + "num_input_tokens_seen": 317889120, + "router_z_loss_mlp": 0.33984375, + "step": 3834, + "time_per_iteration": 2.5580482482910156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066317, + "balance_loss_mlp": 1.03386855, + "epoch": 0.7377837629857638, + "flos": 485940819456.0, + "grad_norm": 0.04817361691999996, + "language_loss": 0.78101605, + "learning_rate": 0.00016973217175665205, + "loss": 0.7916792, + "num_input_tokens_seen": 317953792, + "router_z_loss_mlp": 0.32446289, + "step": 3835, + "time_per_iteration": 2.5466511249542236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014272, + "balance_loss_mlp": 1.00540292, + "epoch": 0.7379761446710273, + "flos": 1413900776448.0, + "grad_norm": 0.004962525889406641, + "language_loss": 0.8116616, + "learning_rate": 0.00016949833157458755, + "loss": 0.8218044, + "num_input_tokens_seen": 318184848, + "router_z_loss_mlp": 0.08886719, + "step": 3836, + "time_per_iteration": 4.947209358215332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065038, + "balance_loss_mlp": 1.03173065, + "epoch": 0.7381685263562909, + "flos": 629445758976.0, + "grad_norm": 0.04309096718082386, + "language_loss": 0.83880627, + "learning_rate": 0.00016926461970465047, + "loss": 0.84945667, + "num_input_tokens_seen": 318259296, + "router_z_loss_mlp": 0.33325195, + "step": 3837, + "time_per_iteration": 2.7604105472564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064561, + "balance_loss_mlp": 1.03175426, + "epoch": 0.7383609080415544, + "flos": 738869147136.0, + "grad_norm": 0.046495404641084814, + "language_loss": 0.84092653, + "learning_rate": 0.00016903103623757516, + "loss": 0.8515721, + "num_input_tokens_seen": 318344704, + "router_z_loss_mlp": 0.328125, + "step": 3838, + "time_per_iteration": 3.0393178462982178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064697, + "balance_loss_mlp": 1.03165209, + "epoch": 0.738553289726818, + "flos": 549945510912.0, + "grad_norm": 0.05807903751309768, + "language_loss": 0.80044198, + "learning_rate": 0.00016879758126404738, + "loss": 0.81108892, + "num_input_tokens_seen": 318416128, + "router_z_loss_mlp": 0.33056641, + "step": 3839, + "time_per_iteration": 2.7287819385528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066157, + "balance_loss_mlp": 1.03296924, + "epoch": 0.7387456714120816, + "flos": 909925705728.0, + "grad_norm": 0.06505190297085839, + "language_loss": 0.7982837, + "learning_rate": 0.00016856425487470216, + "loss": 0.8089453, + "num_input_tokens_seen": 318498128, + "router_z_loss_mlp": 0.33203125, + "step": 3840, + "time_per_iteration": 3.088334321975708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070195, + "balance_loss_mlp": 1.03724539, + "epoch": 0.7389380530973452, + "flos": 852308352000.0, + "grad_norm": 0.054902923406453155, + "language_loss": 0.78921622, + "learning_rate": 0.00016833105716012486, + "loss": 0.79991817, + "num_input_tokens_seen": 318578048, + "router_z_loss_mlp": 0.32958984, + "step": 3841, + "time_per_iteration": 3.1420795917510986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067762, + "balance_loss_mlp": 1.03433585, + "epoch": 0.7391304347826086, + "flos": 816678761472.0, + "grad_norm": 0.0538484990097731, + "language_loss": 0.85046756, + "learning_rate": 0.00016809798821085088, + "loss": 0.86114514, + "num_input_tokens_seen": 318654784, + "router_z_loss_mlp": 0.33447266, + "step": 3842, + "time_per_iteration": 2.9748454093933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067117, + "balance_loss_mlp": 1.03321409, + "epoch": 0.7393228164678722, + "flos": 572541378048.0, + "grad_norm": 0.07853013477986996, + "language_loss": 0.88786352, + "learning_rate": 0.00016786504811736565, + "loss": 0.89853466, + "num_input_tokens_seen": 318727680, + "router_z_loss_mlp": 0.33935547, + "step": 3843, + "time_per_iteration": 2.697993516921997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107057, + "balance_loss_mlp": 1.0370723, + "epoch": 0.7395151981531358, + "flos": 684903845376.0, + "grad_norm": 0.054879027639850184, + "language_loss": 0.82676303, + "learning_rate": 0.00016763223697010442, + "loss": 0.83746874, + "num_input_tokens_seen": 318807568, + "router_z_loss_mlp": 0.33520508, + "step": 3844, + "time_per_iteration": 2.941396951675415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069204, + "balance_loss_mlp": 1.03680301, + "epoch": 0.7397075798383994, + "flos": 556095711744.0, + "grad_norm": 0.044630458439445526, + "language_loss": 0.84558266, + "learning_rate": 0.00016739955485945256, + "loss": 0.85627472, + "num_input_tokens_seen": 318881792, + "router_z_loss_mlp": 0.32397461, + "step": 3845, + "time_per_iteration": 2.6704368591308594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070852, + "balance_loss_mlp": 1.03692532, + "epoch": 0.739899961523663, + "flos": 546523393536.0, + "grad_norm": 0.16146348926095225, + "language_loss": 0.8579582, + "learning_rate": 0.00016716700187574513, + "loss": 0.86866671, + "num_input_tokens_seen": 318951552, + "router_z_loss_mlp": 0.33959961, + "step": 3846, + "time_per_iteration": 2.689548969268799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066553, + "balance_loss_mlp": 1.03400922, + "epoch": 0.7400923432089265, + "flos": 608913054720.0, + "grad_norm": 0.062089054691193496, + "language_loss": 0.83502501, + "learning_rate": 0.0001669345781092675, + "loss": 0.84569055, + "num_input_tokens_seen": 319022304, + "router_z_loss_mlp": 0.32543945, + "step": 3847, + "time_per_iteration": 2.7922914028167725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106553, + "balance_loss_mlp": 1.03286684, + "epoch": 0.7402847248941901, + "flos": 590715555840.0, + "grad_norm": 0.053588507044290926, + "language_loss": 0.86693704, + "learning_rate": 0.0001667022836502546, + "loss": 0.87759233, + "num_input_tokens_seen": 319093200, + "router_z_loss_mlp": 0.32666016, + "step": 3848, + "time_per_iteration": 2.7810423374176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106826, + "balance_loss_mlp": 1.0351913, + "epoch": 0.7404771065794536, + "flos": 477136728576.0, + "grad_norm": 0.05607520940274661, + "language_loss": 0.82591665, + "learning_rate": 0.00016647011858889077, + "loss": 0.83659923, + "num_input_tokens_seen": 319159712, + "router_z_loss_mlp": 0.33081055, + "step": 3849, + "time_per_iteration": 2.5447256565093994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068875, + "balance_loss_mlp": 1.03552043, + "epoch": 0.7406694882647172, + "flos": 496192614912.0, + "grad_norm": 0.05524374859668954, + "language_loss": 0.85861689, + "learning_rate": 0.00016623808301531056, + "loss": 0.86930567, + "num_input_tokens_seen": 319230544, + "router_z_loss_mlp": 0.33374023, + "step": 3850, + "time_per_iteration": 2.647326707839966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067961, + "balance_loss_mlp": 1.03455853, + "epoch": 0.7408618699499807, + "flos": 561925817856.0, + "grad_norm": 0.0770294501397313, + "language_loss": 0.79239726, + "learning_rate": 0.00016600617701959842, + "loss": 0.80307692, + "num_input_tokens_seen": 319305440, + "router_z_loss_mlp": 0.33422852, + "step": 3851, + "time_per_iteration": 2.724172830581665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011471, + "balance_loss_mlp": 1.00212514, + "epoch": 0.7410542516352443, + "flos": 1387421512704.0, + "grad_norm": 0.004619624955394922, + "language_loss": 0.78843814, + "learning_rate": 0.00016577440069178811, + "loss": 0.79855287, + "num_input_tokens_seen": 319534384, + "router_z_loss_mlp": 0.09326172, + "step": 3852, + "time_per_iteration": 4.94897198677063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069894, + "balance_loss_mlp": 1.03620529, + "epoch": 0.7412466333205079, + "flos": 669697860096.0, + "grad_norm": 0.05139846534823347, + "language_loss": 0.80732995, + "learning_rate": 0.00016554275412186315, + "loss": 0.81802887, + "num_input_tokens_seen": 319610960, + "router_z_loss_mlp": 0.3371582, + "step": 3853, + "time_per_iteration": 2.798964262008667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069502, + "balance_loss_mlp": 1.0356704, + "epoch": 0.7414390150057715, + "flos": 489038459904.0, + "grad_norm": 0.059331107298497686, + "language_loss": 0.80721259, + "learning_rate": 0.0001653112373997568, + "loss": 0.81790757, + "num_input_tokens_seen": 319683872, + "router_z_loss_mlp": 0.33862305, + "step": 3854, + "time_per_iteration": 2.6622824668884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071808, + "balance_loss_mlp": 1.03878713, + "epoch": 0.7416313966910351, + "flos": 599119566336.0, + "grad_norm": 0.060794627478568314, + "language_loss": 0.74696434, + "learning_rate": 0.0001650798506153517, + "loss": 0.7576825, + "num_input_tokens_seen": 319750032, + "router_z_loss_mlp": 0.33032227, + "step": 3855, + "time_per_iteration": 2.6897103786468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068866, + "balance_loss_mlp": 1.03558254, + "epoch": 0.7418237783762985, + "flos": 542279204352.0, + "grad_norm": 0.06401290816121721, + "language_loss": 0.83928871, + "learning_rate": 0.00016484859385848023, + "loss": 0.84997737, + "num_input_tokens_seen": 319818864, + "router_z_loss_mlp": 0.33276367, + "step": 3856, + "time_per_iteration": 2.6182141304016113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065034, + "balance_loss_mlp": 1.0325613, + "epoch": 0.7420161600615621, + "flos": 543865121280.0, + "grad_norm": 0.060824827203723085, + "language_loss": 0.77091217, + "learning_rate": 0.0001646174672189243, + "loss": 0.78156251, + "num_input_tokens_seen": 319888816, + "router_z_loss_mlp": 0.32470703, + "step": 3857, + "time_per_iteration": 2.639897584915161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072771, + "balance_loss_mlp": 1.039464, + "epoch": 0.7422085417468257, + "flos": 526921860096.0, + "grad_norm": 0.05508256135397888, + "language_loss": 0.80038357, + "learning_rate": 0.00016438647078641488, + "loss": 0.81111133, + "num_input_tokens_seen": 319956176, + "router_z_loss_mlp": 0.33325195, + "step": 3858, + "time_per_iteration": 2.583303213119507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071831, + "balance_loss_mlp": 1.0385952, + "epoch": 0.7424009234320893, + "flos": 508404266496.0, + "grad_norm": 0.05219884306446566, + "language_loss": 0.83017123, + "learning_rate": 0.00016415560465063344, + "loss": 0.84088957, + "num_input_tokens_seen": 320028560, + "router_z_loss_mlp": 0.33251953, + "step": 3859, + "time_per_iteration": 2.7442150115966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069864, + "balance_loss_mlp": 1.03670025, + "epoch": 0.7425933051173528, + "flos": 512347299840.0, + "grad_norm": 0.07638052287216905, + "language_loss": 0.78861916, + "learning_rate": 0.0001639248689012095, + "loss": 0.79931784, + "num_input_tokens_seen": 320096112, + "router_z_loss_mlp": 0.33154297, + "step": 3860, + "time_per_iteration": 2.5846545696258545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067753, + "balance_loss_mlp": 1.03487468, + "epoch": 0.7427856868026164, + "flos": 458034200064.0, + "grad_norm": 0.05020095318806213, + "language_loss": 0.87714618, + "learning_rate": 0.00016369426362772271, + "loss": 0.8878237, + "num_input_tokens_seen": 320168992, + "router_z_loss_mlp": 0.32885742, + "step": 3861, + "time_per_iteration": 2.7977116107940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106857, + "balance_loss_mlp": 1.03576398, + "epoch": 0.74297806848788, + "flos": 604728502272.0, + "grad_norm": 0.04367608298357755, + "language_loss": 0.80370325, + "learning_rate": 0.00016346378891970233, + "loss": 0.81438893, + "num_input_tokens_seen": 320247264, + "router_z_loss_mlp": 0.328125, + "step": 3862, + "time_per_iteration": 2.8144397735595703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067373, + "balance_loss_mlp": 1.03416157, + "epoch": 0.7431704501731435, + "flos": 890971564032.0, + "grad_norm": 0.052584770309724485, + "language_loss": 0.81109643, + "learning_rate": 0.00016323344486662633, + "loss": 0.82177019, + "num_input_tokens_seen": 320338992, + "router_z_loss_mlp": 0.33227539, + "step": 3863, + "time_per_iteration": 3.306062936782837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069805, + "balance_loss_mlp": 1.03566337, + "epoch": 0.7433628318584071, + "flos": 591867896832.0, + "grad_norm": 0.05409036708303953, + "language_loss": 0.78479373, + "learning_rate": 0.00016300323155792247, + "loss": 0.79549176, + "num_input_tokens_seen": 320422096, + "router_z_loss_mlp": 0.34179688, + "step": 3864, + "time_per_iteration": 2.881361961364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070231, + "balance_loss_mlp": 1.03756773, + "epoch": 0.7435552135436706, + "flos": 476896619520.0, + "grad_norm": 0.06261465074360906, + "language_loss": 0.88414448, + "learning_rate": 0.00016277314908296687, + "loss": 0.8948468, + "num_input_tokens_seen": 320492640, + "router_z_loss_mlp": 0.32666016, + "step": 3865, + "time_per_iteration": 2.6607327461242676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068826, + "balance_loss_mlp": 1.03497088, + "epoch": 0.7437475952289342, + "flos": 672874076160.0, + "grad_norm": 0.05871216754162407, + "language_loss": 0.75963724, + "learning_rate": 0.00016254319753108604, + "loss": 0.77032548, + "num_input_tokens_seen": 320565264, + "router_z_loss_mlp": 0.33862305, + "step": 3866, + "time_per_iteration": 2.8663392066955566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072261, + "balance_loss_mlp": 1.03881145, + "epoch": 0.7439399769141978, + "flos": 770094577152.0, + "grad_norm": 0.0657107928380086, + "language_loss": 0.76937765, + "learning_rate": 0.00016231337699155492, + "loss": 0.78010023, + "num_input_tokens_seen": 320647584, + "router_z_loss_mlp": 0.3347168, + "step": 3867, + "time_per_iteration": 3.0015652179718018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069051, + "balance_loss_mlp": 1.03579164, + "epoch": 0.7441323585994614, + "flos": 647462785536.0, + "grad_norm": 0.05480167763007067, + "language_loss": 0.781057, + "learning_rate": 0.0001620836875535977, + "loss": 0.79174751, + "num_input_tokens_seen": 320722752, + "router_z_loss_mlp": 0.33276367, + "step": 3868, + "time_per_iteration": 2.842230796813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067908, + "balance_loss_mlp": 1.03438592, + "epoch": 0.7443247402847248, + "flos": 565091859456.0, + "grad_norm": 0.08182292750671373, + "language_loss": 0.80810648, + "learning_rate": 0.00016185412930638766, + "loss": 0.81878555, + "num_input_tokens_seen": 320802496, + "router_z_loss_mlp": 0.33544922, + "step": 3869, + "time_per_iteration": 2.7977213859558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071043, + "balance_loss_mlp": 1.03761721, + "epoch": 0.7445171219699884, + "flos": 578243446272.0, + "grad_norm": 0.07110615471626963, + "language_loss": 0.82752168, + "learning_rate": 0.00016162470233904765, + "loss": 0.8382321, + "num_input_tokens_seen": 320872496, + "router_z_loss_mlp": 0.33447266, + "step": 3870, + "time_per_iteration": 2.707329273223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106823, + "balance_loss_mlp": 1.03456485, + "epoch": 0.744709503655252, + "flos": 618588679680.0, + "grad_norm": 0.08201563915437336, + "language_loss": 0.81978703, + "learning_rate": 0.00016139540674064856, + "loss": 0.83046937, + "num_input_tokens_seen": 320944992, + "router_z_loss_mlp": 0.33666992, + "step": 3871, + "time_per_iteration": 2.779015302658081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066328, + "balance_loss_mlp": 1.03349781, + "epoch": 0.7449018853405156, + "flos": 528355008000.0, + "grad_norm": 0.053737872907142804, + "language_loss": 0.77632427, + "learning_rate": 0.00016116624260021113, + "loss": 0.78698754, + "num_input_tokens_seen": 321020208, + "router_z_loss_mlp": 0.32836914, + "step": 3872, + "time_per_iteration": 2.748868942260742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068405, + "balance_loss_mlp": 1.03509796, + "epoch": 0.7450942670257792, + "flos": 433088570880.0, + "grad_norm": 0.050066249617561176, + "language_loss": 0.83786619, + "learning_rate": 0.0001609372100067046, + "loss": 0.84855032, + "num_input_tokens_seen": 321085984, + "router_z_loss_mlp": 0.33325195, + "step": 3873, + "time_per_iteration": 2.5261478424072266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068874, + "balance_loss_mlp": 1.03504205, + "epoch": 0.7452866487110427, + "flos": 696562647552.0, + "grad_norm": 0.062485843646331765, + "language_loss": 0.84858561, + "learning_rate": 0.0001607083090490475, + "loss": 0.85927439, + "num_input_tokens_seen": 321163200, + "router_z_loss_mlp": 0.33862305, + "step": 3874, + "time_per_iteration": 2.912550210952759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070703, + "balance_loss_mlp": 1.03718174, + "epoch": 0.7454790303963063, + "flos": 511944247296.0, + "grad_norm": 0.05620990191133866, + "language_loss": 0.80024898, + "learning_rate": 0.00016047953981610714, + "loss": 0.810956, + "num_input_tokens_seen": 321237328, + "router_z_loss_mlp": 0.33544922, + "step": 3875, + "time_per_iteration": 2.7009074687957764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024051, + "balance_loss_mlp": 1.01460981, + "epoch": 0.7456714120815698, + "flos": 1325221088256.0, + "grad_norm": 0.008467942690165917, + "language_loss": 0.7972964, + "learning_rate": 0.00016025090239669916, + "loss": 0.8075369, + "num_input_tokens_seen": 321456192, + "router_z_loss_mlp": 0.09423828, + "step": 3876, + "time_per_iteration": 4.952231168746948 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065537, + "balance_loss_mlp": 1.0320152, + "epoch": 0.7458637937668334, + "flos": 721397767680.0, + "grad_norm": 0.05688245720911951, + "language_loss": 0.8058607, + "learning_rate": 0.0001600223968795889, + "loss": 0.8165161, + "num_input_tokens_seen": 321530560, + "router_z_loss_mlp": 0.33544922, + "step": 3877, + "time_per_iteration": 2.87972092628479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024325, + "balance_loss_mlp": 1.014979, + "epoch": 0.746056175452097, + "flos": 1500761793024.0, + "grad_norm": 0.00806071633609759, + "language_loss": 0.75696075, + "learning_rate": 0.00015979402335349004, + "loss": 0.76720393, + "num_input_tokens_seen": 321760928, + "router_z_loss_mlp": 0.09326172, + "step": 3878, + "time_per_iteration": 4.914839029312134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065825, + "balance_loss_mlp": 1.03335285, + "epoch": 0.7462485571373605, + "flos": 519984493056.0, + "grad_norm": 0.05864389965433195, + "language_loss": 0.81840986, + "learning_rate": 0.00015956578190706483, + "loss": 0.82906812, + "num_input_tokens_seen": 321833248, + "router_z_loss_mlp": 0.32470703, + "step": 3879, + "time_per_iteration": 2.690336227416992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067732, + "balance_loss_mlp": 1.03492546, + "epoch": 0.7464409388226241, + "flos": 480967690752.0, + "grad_norm": 0.05296793730256709, + "language_loss": 0.75717044, + "learning_rate": 0.00015933767262892468, + "loss": 0.76784778, + "num_input_tokens_seen": 321905904, + "router_z_loss_mlp": 0.328125, + "step": 3880, + "time_per_iteration": 2.702094078063965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106723, + "balance_loss_mlp": 1.03418517, + "epoch": 0.7466333205078877, + "flos": 486516989952.0, + "grad_norm": 0.06088844142287201, + "language_loss": 0.81730115, + "learning_rate": 0.00015910969560762927, + "loss": 0.82797348, + "num_input_tokens_seen": 321971920, + "router_z_loss_mlp": 0.33056641, + "step": 3881, + "time_per_iteration": 2.5547542572021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066924, + "balance_loss_mlp": 1.03464174, + "epoch": 0.7468257021931513, + "flos": 611015505408.0, + "grad_norm": 0.05773306272323557, + "language_loss": 0.83265662, + "learning_rate": 0.00015888185093168727, + "loss": 0.84332585, + "num_input_tokens_seen": 322041904, + "router_z_loss_mlp": 0.32275391, + "step": 3882, + "time_per_iteration": 2.7600655555725098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069044, + "balance_loss_mlp": 1.03502131, + "epoch": 0.7470180838784147, + "flos": 533204481024.0, + "grad_norm": 0.06850625099692723, + "language_loss": 0.8104043, + "learning_rate": 0.00015865413868955581, + "loss": 0.82109475, + "num_input_tokens_seen": 322110816, + "router_z_loss_mlp": 0.34057617, + "step": 3883, + "time_per_iteration": 2.6018030643463135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066378, + "balance_loss_mlp": 1.03378606, + "epoch": 0.7472104655636783, + "flos": 739005949440.0, + "grad_norm": 0.05384081039558067, + "language_loss": 0.82672417, + "learning_rate": 0.00015842655896964054, + "loss": 0.83738798, + "num_input_tokens_seen": 322192704, + "router_z_loss_mlp": 0.32592773, + "step": 3884, + "time_per_iteration": 3.021933078765869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065038, + "balance_loss_mlp": 1.03223145, + "epoch": 0.7474028472489419, + "flos": 640007474688.0, + "grad_norm": 0.052763664912519236, + "language_loss": 0.73725951, + "learning_rate": 0.00015819911186029567, + "loss": 0.7479099, + "num_input_tokens_seen": 322263888, + "router_z_loss_mlp": 0.328125, + "step": 3885, + "time_per_iteration": 2.8068392276763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068676, + "balance_loss_mlp": 1.03577399, + "epoch": 0.7475952289342055, + "flos": 589980824064.0, + "grad_norm": 0.05740266756526494, + "language_loss": 0.8658216, + "learning_rate": 0.00015797179744982443, + "loss": 0.87650836, + "num_input_tokens_seen": 322331936, + "router_z_loss_mlp": 0.32910156, + "step": 3886, + "time_per_iteration": 2.7342216968536377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067874, + "balance_loss_mlp": 1.03492451, + "epoch": 0.7477876106194691, + "flos": 487935581184.0, + "grad_norm": 0.05063564499597122, + "language_loss": 0.79109228, + "learning_rate": 0.00015774461582647765, + "loss": 0.80177104, + "num_input_tokens_seen": 322402032, + "router_z_loss_mlp": 0.32958984, + "step": 3887, + "time_per_iteration": 2.617705821990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066709, + "balance_loss_mlp": 1.03390241, + "epoch": 0.7479799923047326, + "flos": 554470507008.0, + "grad_norm": 0.04778068214414316, + "language_loss": 0.81002998, + "learning_rate": 0.00015751756707845505, + "loss": 0.82069701, + "num_input_tokens_seen": 322472512, + "router_z_loss_mlp": 0.328125, + "step": 3888, + "time_per_iteration": 2.611276626586914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067147, + "balance_loss_mlp": 1.03505635, + "epoch": 0.7481723739899961, + "flos": 767037634560.0, + "grad_norm": 0.054687563688018546, + "language_loss": 0.88108873, + "learning_rate": 0.00015729065129390502, + "loss": 0.89176023, + "num_input_tokens_seen": 322555104, + "router_z_loss_mlp": 0.32080078, + "step": 3889, + "time_per_iteration": 3.022294759750366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069902, + "balance_loss_mlp": 1.03614235, + "epoch": 0.7483647556752597, + "flos": 495926364672.0, + "grad_norm": 0.07150557993865005, + "language_loss": 0.81957299, + "learning_rate": 0.0001570638685609241, + "loss": 0.83027202, + "num_input_tokens_seen": 322621904, + "router_z_loss_mlp": 0.33789062, + "step": 3890, + "time_per_iteration": 2.540038585662842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068666, + "balance_loss_mlp": 1.03588343, + "epoch": 0.7485571373605233, + "flos": 472607350272.0, + "grad_norm": 0.055161335390356114, + "language_loss": 0.8031671, + "learning_rate": 0.00015683721896755693, + "loss": 0.81385386, + "num_input_tokens_seen": 322688928, + "router_z_loss_mlp": 0.32788086, + "step": 3891, + "time_per_iteration": 2.5199973583221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015851, + "balance_loss_mlp": 1.00683892, + "epoch": 0.7487495190457868, + "flos": 1553619833856.0, + "grad_norm": 0.004937901566549453, + "language_loss": 0.82210493, + "learning_rate": 0.00015661070260179682, + "loss": 0.83226347, + "num_input_tokens_seen": 322928464, + "router_z_loss_mlp": 0.09033203, + "step": 3892, + "time_per_iteration": 4.912605047225952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068919, + "balance_loss_mlp": 1.03632677, + "epoch": 0.7489419007310504, + "flos": 581566639104.0, + "grad_norm": 0.04880798479848443, + "language_loss": 0.84992248, + "learning_rate": 0.00015638431955158528, + "loss": 0.86061168, + "num_input_tokens_seen": 323002672, + "router_z_loss_mlp": 0.32592773, + "step": 3893, + "time_per_iteration": 2.6795592308044434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066956, + "balance_loss_mlp": 1.03398299, + "epoch": 0.749134282416314, + "flos": 567297616896.0, + "grad_norm": 0.04606226658973748, + "language_loss": 0.80857748, + "learning_rate": 0.00015615806990481186, + "loss": 0.81924701, + "num_input_tokens_seen": 323076480, + "router_z_loss_mlp": 0.32983398, + "step": 3894, + "time_per_iteration": 2.7299861907958984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066927, + "balance_loss_mlp": 1.03433573, + "epoch": 0.7493266641015776, + "flos": 532786871808.0, + "grad_norm": 0.044395679249862555, + "language_loss": 0.8442167, + "learning_rate": 0.00015593195374931452, + "loss": 0.854886, + "num_input_tokens_seen": 323151840, + "router_z_loss_mlp": 0.32592773, + "step": 3895, + "time_per_iteration": 2.725260019302368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066101, + "balance_loss_mlp": 1.03346133, + "epoch": 0.7495190457868411, + "flos": 523338209280.0, + "grad_norm": 0.05913067332521358, + "language_loss": 0.79859447, + "learning_rate": 0.00015570597117287922, + "loss": 0.80925548, + "num_input_tokens_seen": 323223376, + "router_z_loss_mlp": 0.32641602, + "step": 3896, + "time_per_iteration": 2.6577799320220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070214, + "balance_loss_mlp": 1.03762269, + "epoch": 0.7497114274721046, + "flos": 513937598976.0, + "grad_norm": 0.0999283842203671, + "language_loss": 0.77427346, + "learning_rate": 0.0001554801222632406, + "loss": 0.78497565, + "num_input_tokens_seen": 323290288, + "router_z_loss_mlp": 0.32592773, + "step": 3897, + "time_per_iteration": 2.6006200313568115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065155, + "balance_loss_mlp": 1.03239596, + "epoch": 0.7499038091573682, + "flos": 494759467008.0, + "grad_norm": 0.050843654610054065, + "language_loss": 0.85019195, + "learning_rate": 0.00015525440710808052, + "loss": 0.86084348, + "num_input_tokens_seen": 323359568, + "router_z_loss_mlp": 0.32763672, + "step": 3898, + "time_per_iteration": 2.661421775817871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068075, + "balance_loss_mlp": 1.03586483, + "epoch": 0.7500961908426318, + "flos": 737326900224.0, + "grad_norm": 0.05107930467548482, + "language_loss": 0.77678949, + "learning_rate": 0.00015502882579502953, + "loss": 0.78747022, + "num_input_tokens_seen": 323436688, + "router_z_loss_mlp": 0.32202148, + "step": 3899, + "time_per_iteration": 2.9202702045440674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062076, + "balance_loss_mlp": 1.02931714, + "epoch": 0.7502885725278954, + "flos": 533117140992.0, + "grad_norm": 0.046214312949338116, + "language_loss": 0.84483492, + "learning_rate": 0.00015480337841166592, + "loss": 0.85545564, + "num_input_tokens_seen": 323510032, + "router_z_loss_mlp": 0.32763672, + "step": 3900, + "time_per_iteration": 2.704392194747925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070609, + "balance_loss_mlp": 1.03761196, + "epoch": 0.7504809542131589, + "flos": 589017567744.0, + "grad_norm": 0.05276594694020605, + "language_loss": 0.82456982, + "learning_rate": 0.00015457806504551647, + "loss": 0.83527595, + "num_input_tokens_seen": 323588896, + "router_z_loss_mlp": 0.33007812, + "step": 3901, + "time_per_iteration": 2.8369719982147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066454, + "balance_loss_mlp": 1.03376722, + "epoch": 0.7506733358984224, + "flos": 511293883392.0, + "grad_norm": 0.05412460278066938, + "language_loss": 0.78305542, + "learning_rate": 0.0001543528857840554, + "loss": 0.79372001, + "num_input_tokens_seen": 323661280, + "router_z_loss_mlp": 0.3269043, + "step": 3902, + "time_per_iteration": 2.679732084274292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064427, + "balance_loss_mlp": 1.03204942, + "epoch": 0.750865717583686, + "flos": 538990917120.0, + "grad_norm": 0.0614099012114921, + "language_loss": 0.80124992, + "learning_rate": 0.000154127840714705, + "loss": 0.81189418, + "num_input_tokens_seen": 323739200, + "router_z_loss_mlp": 0.32373047, + "step": 3903, + "time_per_iteration": 2.7384650707244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065338, + "balance_loss_mlp": 1.03265119, + "epoch": 0.7510580992689496, + "flos": 476339387904.0, + "grad_norm": 0.0665672194872541, + "language_loss": 0.81678092, + "learning_rate": 0.00015390292992483557, + "loss": 0.82743436, + "num_input_tokens_seen": 323802816, + "router_z_loss_mlp": 0.3269043, + "step": 3904, + "time_per_iteration": 2.489619731903076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106175, + "balance_loss_mlp": 1.02894402, + "epoch": 0.7512504809542132, + "flos": 578755597824.0, + "grad_norm": 0.06071277834491827, + "language_loss": 0.83697867, + "learning_rate": 0.00015367815350176523, + "loss": 0.84759617, + "num_input_tokens_seen": 323879488, + "router_z_loss_mlp": 0.328125, + "step": 3905, + "time_per_iteration": 2.716557025909424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062505, + "balance_loss_mlp": 1.02943611, + "epoch": 0.7514428626394767, + "flos": 418435435008.0, + "grad_norm": 0.05426428820628694, + "language_loss": 0.82564658, + "learning_rate": 0.00015345351153275987, + "loss": 0.83627158, + "num_input_tokens_seen": 323944512, + "router_z_loss_mlp": 0.33081055, + "step": 3906, + "time_per_iteration": 2.522923707962036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065336, + "balance_loss_mlp": 1.03262544, + "epoch": 0.7516352443247403, + "flos": 640736414208.0, + "grad_norm": 0.05433907321222643, + "language_loss": 0.80729043, + "learning_rate": 0.00015322900410503332, + "loss": 0.81794381, + "num_input_tokens_seen": 324020688, + "router_z_loss_mlp": 0.32714844, + "step": 3907, + "time_per_iteration": 2.793536424636841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064989, + "balance_loss_mlp": 1.03189635, + "epoch": 0.7518276260100039, + "flos": 580700897280.0, + "grad_norm": 0.05951130469098692, + "language_loss": 0.76875365, + "learning_rate": 0.00015300463130574703, + "loss": 0.77940357, + "num_input_tokens_seen": 324098080, + "router_z_loss_mlp": 0.33105469, + "step": 3908, + "time_per_iteration": 2.8399226665496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063261, + "balance_loss_mlp": 1.03045464, + "epoch": 0.7520200076952674, + "flos": 687025234944.0, + "grad_norm": 0.0651669879699934, + "language_loss": 0.81970477, + "learning_rate": 0.00015278039322201033, + "loss": 0.83033741, + "num_input_tokens_seen": 324183968, + "router_z_loss_mlp": 0.328125, + "step": 3909, + "time_per_iteration": 2.9373419284820557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063542, + "balance_loss_mlp": 1.02985382, + "epoch": 0.7522123893805309, + "flos": 486196895232.0, + "grad_norm": 0.06049213601321292, + "language_loss": 0.79440963, + "learning_rate": 0.00015255628994088004, + "loss": 0.80504501, + "num_input_tokens_seen": 324249568, + "router_z_loss_mlp": 0.3371582, + "step": 3910, + "time_per_iteration": 2.528364419937134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065253, + "balance_loss_mlp": 1.03175521, + "epoch": 0.7524047710657945, + "flos": 818581800960.0, + "grad_norm": 0.05892068173673864, + "language_loss": 0.75070155, + "learning_rate": 0.00015233232154936082, + "loss": 0.76135409, + "num_input_tokens_seen": 324345312, + "router_z_loss_mlp": 0.33520508, + "step": 3911, + "time_per_iteration": 3.230201244354248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062613, + "balance_loss_mlp": 1.02916312, + "epoch": 0.7525971527510581, + "flos": 699191806464.0, + "grad_norm": 0.055756434069827554, + "language_loss": 0.76463896, + "learning_rate": 0.0001521084881344048, + "loss": 0.7752651, + "num_input_tokens_seen": 324419056, + "router_z_loss_mlp": 0.3347168, + "step": 3912, + "time_per_iteration": 2.8348512649536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065733, + "balance_loss_mlp": 1.03216362, + "epoch": 0.7527895344363217, + "flos": 633497891328.0, + "grad_norm": 0.050850444756768094, + "language_loss": 0.86350536, + "learning_rate": 0.00015188478978291208, + "loss": 0.87416273, + "num_input_tokens_seen": 324490848, + "router_z_loss_mlp": 0.3359375, + "step": 3913, + "time_per_iteration": 2.744290828704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067061, + "balance_loss_mlp": 1.03404021, + "epoch": 0.7529819161215853, + "flos": 562555832832.0, + "grad_norm": 0.05433821617011464, + "language_loss": 0.8621949, + "learning_rate": 0.00015166122658173014, + "loss": 0.8728655, + "num_input_tokens_seen": 324565648, + "router_z_loss_mlp": 0.33032227, + "step": 3914, + "time_per_iteration": 2.8117570877075195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066949, + "balance_loss_mlp": 1.03368926, + "epoch": 0.7531742978068487, + "flos": 690344045568.0, + "grad_norm": 0.048975254587736855, + "language_loss": 0.88076222, + "learning_rate": 0.00015143779861765332, + "loss": 0.89143169, + "num_input_tokens_seen": 324642832, + "router_z_loss_mlp": 0.33251953, + "step": 3915, + "time_per_iteration": 2.8815720081329346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063708, + "balance_loss_mlp": 1.03140223, + "epoch": 0.7533666794921123, + "flos": 680800840704.0, + "grad_norm": 0.04986662461838111, + "language_loss": 0.81009239, + "learning_rate": 0.00015121450597742458, + "loss": 0.82072949, + "num_input_tokens_seen": 324718336, + "router_z_loss_mlp": 0.32299805, + "step": 3916, + "time_per_iteration": 2.80761456489563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010608, + "balance_loss_mlp": 1.02830386, + "epoch": 0.7535590611773759, + "flos": 623384308224.0, + "grad_norm": 0.05782496092002166, + "language_loss": 0.78096646, + "learning_rate": 0.00015099134874773369, + "loss": 0.79157448, + "num_input_tokens_seen": 324787744, + "router_z_loss_mlp": 0.32495117, + "step": 3917, + "time_per_iteration": 2.7233426570892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065187, + "balance_loss_mlp": 1.03149819, + "epoch": 0.7537514428626395, + "flos": 519162421248.0, + "grad_norm": 0.0518571632225719, + "language_loss": 0.80421233, + "learning_rate": 0.00015076832701521793, + "loss": 0.81486416, + "num_input_tokens_seen": 324863280, + "router_z_loss_mlp": 0.3371582, + "step": 3918, + "time_per_iteration": 2.6993284225463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062901, + "balance_loss_mlp": 1.02971327, + "epoch": 0.753943824547903, + "flos": 723309571584.0, + "grad_norm": 0.06554029395428207, + "language_loss": 0.82133907, + "learning_rate": 0.000150545440866462, + "loss": 0.83196807, + "num_input_tokens_seen": 324949600, + "router_z_loss_mlp": 0.33203125, + "step": 3919, + "time_per_iteration": 2.9902353286743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063028, + "balance_loss_mlp": 1.03050804, + "epoch": 0.7541362062331666, + "flos": 437318203392.0, + "grad_norm": 0.051833460096662155, + "language_loss": 0.78462708, + "learning_rate": 0.000150322690387998, + "loss": 0.79525733, + "num_input_tokens_seen": 325013808, + "router_z_loss_mlp": 0.32519531, + "step": 3920, + "time_per_iteration": 2.496290922164917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106303, + "balance_loss_mlp": 1.02941298, + "epoch": 0.7543285879184302, + "flos": 565007491584.0, + "grad_norm": 0.05213671641073607, + "language_loss": 0.75242233, + "learning_rate": 0.00015010007566630535, + "loss": 0.76305258, + "num_input_tokens_seen": 325084832, + "router_z_loss_mlp": 0.33642578, + "step": 3921, + "time_per_iteration": 2.7238450050354004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065892, + "balance_loss_mlp": 1.03210807, + "epoch": 0.7545209696036937, + "flos": 520781833728.0, + "grad_norm": 0.060725267986870404, + "language_loss": 0.8104378, + "learning_rate": 0.00014987759678781077, + "loss": 0.82109678, + "num_input_tokens_seen": 325155120, + "router_z_loss_mlp": 0.33813477, + "step": 3922, + "time_per_iteration": 2.596788167953491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066538, + "balance_loss_mlp": 1.03208637, + "epoch": 0.7547133512889573, + "flos": 615782020608.0, + "grad_norm": 0.05117423221869946, + "language_loss": 0.82205606, + "learning_rate": 0.00014965525383888795, + "loss": 0.83272147, + "num_input_tokens_seen": 325235632, + "router_z_loss_mlp": 0.3449707, + "step": 3923, + "time_per_iteration": 2.7719502449035645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106299, + "balance_loss_mlp": 1.0298022, + "epoch": 0.7549057329742208, + "flos": 750522157056.0, + "grad_norm": 0.05672347636966434, + "language_loss": 0.72166836, + "learning_rate": 0.00014943304690585851, + "loss": 0.73229825, + "num_input_tokens_seen": 325309696, + "router_z_loss_mlp": 0.33203125, + "step": 3924, + "time_per_iteration": 2.90588116645813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063807, + "balance_loss_mlp": 1.03069079, + "epoch": 0.7550981146594844, + "flos": 514193674752.0, + "grad_norm": 0.06004038441284508, + "language_loss": 0.79123962, + "learning_rate": 0.0001492109760749908, + "loss": 0.80187768, + "num_input_tokens_seen": 325375744, + "router_z_loss_mlp": 0.33129883, + "step": 3925, + "time_per_iteration": 2.573479652404785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062523, + "balance_loss_mlp": 1.02900124, + "epoch": 0.755290496344748, + "flos": 521756674560.0, + "grad_norm": 0.04754610420203459, + "language_loss": 0.79945302, + "learning_rate": 0.00014898904143250002, + "loss": 0.81007826, + "num_input_tokens_seen": 325448384, + "router_z_loss_mlp": 0.33544922, + "step": 3926, + "time_per_iteration": 2.6605517864227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011981, + "balance_loss_mlp": 1.00320745, + "epoch": 0.7554828780300116, + "flos": 1413845521920.0, + "grad_norm": 0.009243318676460378, + "language_loss": 0.75755203, + "learning_rate": 0.00014876724306454886, + "loss": 0.76767182, + "num_input_tokens_seen": 325678672, + "router_z_loss_mlp": 0.08789062, + "step": 3927, + "time_per_iteration": 4.911595106124878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066882, + "balance_loss_mlp": 1.03343201, + "epoch": 0.7556752597152752, + "flos": 556676264448.0, + "grad_norm": 0.06225847362151781, + "language_loss": 0.80114925, + "learning_rate": 0.0001485455810572474, + "loss": 0.81181806, + "num_input_tokens_seen": 325746656, + "router_z_loss_mlp": 0.3347168, + "step": 3928, + "time_per_iteration": 2.6221096515655518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061736, + "balance_loss_mlp": 1.02864373, + "epoch": 0.7558676414005386, + "flos": 563363347968.0, + "grad_norm": 0.0430287394272786, + "language_loss": 0.83688951, + "learning_rate": 0.00014832405549665236, + "loss": 0.84750688, + "num_input_tokens_seen": 325820304, + "router_z_loss_mlp": 0.33105469, + "step": 3929, + "time_per_iteration": 2.687077760696411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062194, + "balance_loss_mlp": 1.02898264, + "epoch": 0.7560600230858022, + "flos": 561089189376.0, + "grad_norm": 0.072300166117579, + "language_loss": 0.78684491, + "learning_rate": 0.00014810266646876746, + "loss": 0.79746687, + "num_input_tokens_seen": 325895584, + "router_z_loss_mlp": 0.33227539, + "step": 3930, + "time_per_iteration": 2.784480571746826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060871, + "balance_loss_mlp": 1.02703977, + "epoch": 0.7562524047710658, + "flos": 719232708096.0, + "grad_norm": 0.05835242926257929, + "language_loss": 0.7758401, + "learning_rate": 0.00014788141405954364, + "loss": 0.78644884, + "num_input_tokens_seen": 325976752, + "router_z_loss_mlp": 0.33862305, + "step": 3931, + "time_per_iteration": 2.9784233570098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066404, + "balance_loss_mlp": 1.03345442, + "epoch": 0.7564447864563294, + "flos": 543086719488.0, + "grad_norm": 0.059110171964688825, + "language_loss": 0.84827656, + "learning_rate": 0.00014766029835487865, + "loss": 0.85894054, + "num_input_tokens_seen": 326047152, + "router_z_loss_mlp": 0.32958984, + "step": 3932, + "time_per_iteration": 2.6907904148101807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062333, + "balance_loss_mlp": 1.02945542, + "epoch": 0.7566371681415929, + "flos": 725484805632.0, + "grad_norm": 0.06258669653948258, + "language_loss": 0.79361248, + "learning_rate": 0.0001474393194406173, + "loss": 0.80423582, + "num_input_tokens_seen": 326119056, + "router_z_loss_mlp": 0.32885742, + "step": 3933, + "time_per_iteration": 2.8968892097473145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062749, + "balance_loss_mlp": 1.02991855, + "epoch": 0.7568295498268565, + "flos": 576274825728.0, + "grad_norm": 0.05981896319872157, + "language_loss": 0.79737186, + "learning_rate": 0.00014721847740255112, + "loss": 0.80799937, + "num_input_tokens_seen": 326196736, + "router_z_loss_mlp": 0.32836914, + "step": 3934, + "time_per_iteration": 2.7890961170196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011573, + "balance_loss_mlp": 1.00279939, + "epoch": 0.75702193151212, + "flos": 1519273594368.0, + "grad_norm": 0.004234862497934677, + "language_loss": 0.73911923, + "learning_rate": 0.00014699777232641853, + "loss": 0.74923497, + "num_input_tokens_seen": 326404752, + "router_z_loss_mlp": 0.08789062, + "step": 3935, + "time_per_iteration": 4.601314544677734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061211, + "balance_loss_mlp": 1.02866662, + "epoch": 0.7572143131973836, + "flos": 525218079744.0, + "grad_norm": 0.08729501831475094, + "language_loss": 0.78364342, + "learning_rate": 0.00014677720429790526, + "loss": 0.7942555, + "num_input_tokens_seen": 326472832, + "router_z_loss_mlp": 0.32543945, + "step": 3936, + "time_per_iteration": 2.5926949977874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061936, + "balance_loss_mlp": 1.0290581, + "epoch": 0.7574066948826472, + "flos": 550467836928.0, + "grad_norm": 0.04449678335712254, + "language_loss": 0.84388995, + "learning_rate": 0.0001465567734026429, + "loss": 0.85450935, + "num_input_tokens_seen": 326546976, + "router_z_loss_mlp": 0.32885742, + "step": 3937, + "time_per_iteration": 2.673203706741333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064975, + "balance_loss_mlp": 1.03183448, + "epoch": 0.7575990765679107, + "flos": 395682416640.0, + "grad_norm": 0.06471305080336787, + "language_loss": 0.82730478, + "learning_rate": 0.00014633647972621034, + "loss": 0.83795452, + "num_input_tokens_seen": 326609296, + "router_z_loss_mlp": 0.33154297, + "step": 3938, + "time_per_iteration": 2.4455604553222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067639, + "balance_loss_mlp": 1.03399837, + "epoch": 0.7577914582531743, + "flos": 584742855168.0, + "grad_norm": 0.04609831927497642, + "language_loss": 0.86192119, + "learning_rate": 0.00014611632335413354, + "loss": 0.87259758, + "num_input_tokens_seen": 326687168, + "router_z_loss_mlp": 0.33666992, + "step": 3939, + "time_per_iteration": 2.7661402225494385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068002, + "balance_loss_mlp": 1.03526759, + "epoch": 0.7579838399384379, + "flos": 820604265984.0, + "grad_norm": 0.05221570879511052, + "language_loss": 0.82420516, + "learning_rate": 0.00014589630437188456, + "loss": 0.83488512, + "num_input_tokens_seen": 326777760, + "router_z_loss_mlp": 0.32739258, + "step": 3940, + "time_per_iteration": 3.1596429347991943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010684, + "balance_loss_mlp": 1.03578401, + "epoch": 0.7581762216237015, + "flos": 443664843264.0, + "grad_norm": 0.0650937472679739, + "language_loss": 0.78844047, + "learning_rate": 0.00014567642286488253, + "loss": 0.79912448, + "num_input_tokens_seen": 326843952, + "router_z_loss_mlp": 0.32617188, + "step": 3941, + "time_per_iteration": 2.515453577041626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067786, + "balance_loss_mlp": 1.03505135, + "epoch": 0.7583686033089649, + "flos": 540624886272.0, + "grad_norm": 0.060324478977950624, + "language_loss": 0.7890631, + "learning_rate": 0.00014545667891849258, + "loss": 0.79974091, + "num_input_tokens_seen": 326911296, + "router_z_loss_mlp": 0.32739258, + "step": 3942, + "time_per_iteration": 2.632852554321289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068794, + "balance_loss_mlp": 1.03648806, + "epoch": 0.7585609849942285, + "flos": 522332845056.0, + "grad_norm": 0.05155975595459647, + "language_loss": 0.8239159, + "learning_rate": 0.00014523707261802733, + "loss": 0.83460391, + "num_input_tokens_seen": 326977776, + "router_z_loss_mlp": 0.32299805, + "step": 3943, + "time_per_iteration": 2.6377763748168945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074539, + "balance_loss_mlp": 1.04170835, + "epoch": 0.7587533666794921, + "flos": 541599727104.0, + "grad_norm": 0.05698795626816005, + "language_loss": 0.81395125, + "learning_rate": 0.00014501760404874527, + "loss": 0.82469666, + "num_input_tokens_seen": 327050240, + "router_z_loss_mlp": 0.32836914, + "step": 3944, + "time_per_iteration": 2.690519332885742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073349, + "balance_loss_mlp": 1.04116213, + "epoch": 0.7589457483647557, + "flos": 606131126784.0, + "grad_norm": 0.06183156174415775, + "language_loss": 0.85775477, + "learning_rate": 0.00014479827329585176, + "loss": 0.86848831, + "num_input_tokens_seen": 327119952, + "router_z_loss_mlp": 0.32177734, + "step": 3945, + "time_per_iteration": 2.7058537006378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066895, + "balance_loss_mlp": 1.03449392, + "epoch": 0.7591381300500193, + "flos": 554821125120.0, + "grad_norm": 0.04920928189565755, + "language_loss": 0.84866571, + "learning_rate": 0.00014457908044449846, + "loss": 0.85933459, + "num_input_tokens_seen": 327192640, + "router_z_loss_mlp": 0.32397461, + "step": 3946, + "time_per_iteration": 2.785212516784668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071888, + "balance_loss_mlp": 1.03963017, + "epoch": 0.7593305117352828, + "flos": 529399660032.0, + "grad_norm": 0.05182175118482316, + "language_loss": 0.82816386, + "learning_rate": 0.00014436002557978371, + "loss": 0.8388828, + "num_input_tokens_seen": 327271008, + "router_z_loss_mlp": 0.32250977, + "step": 3947, + "time_per_iteration": 2.784555196762085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01029365, + "balance_loss_mlp": 1.02059126, + "epoch": 0.7595228934205464, + "flos": 1502020412928.0, + "grad_norm": 0.01048294354444643, + "language_loss": 0.76643145, + "learning_rate": 0.00014414110878675201, + "loss": 0.77672517, + "num_input_tokens_seen": 327505392, + "router_z_loss_mlp": 0.08789062, + "step": 3948, + "time_per_iteration": 4.8788769245147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072982, + "balance_loss_mlp": 1.0407002, + "epoch": 0.7597152751058099, + "flos": 455290149888.0, + "grad_norm": 0.0492093123378979, + "language_loss": 0.79732686, + "learning_rate": 0.0001439223301503945, + "loss": 0.80805671, + "num_input_tokens_seen": 327569392, + "router_z_loss_mlp": 0.32275391, + "step": 3949, + "time_per_iteration": 2.548963785171509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071385, + "balance_loss_mlp": 1.0404619, + "epoch": 0.7599076567910735, + "flos": 685135190016.0, + "grad_norm": 0.05900471318664728, + "language_loss": 0.76152921, + "learning_rate": 0.00014370368975564834, + "loss": 0.77224308, + "num_input_tokens_seen": 327648304, + "router_z_loss_mlp": 0.30883789, + "step": 3950, + "time_per_iteration": 2.913701295852661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107191, + "balance_loss_mlp": 1.03915179, + "epoch": 0.760100038476337, + "flos": 532092837888.0, + "grad_norm": 0.059009621355687734, + "language_loss": 0.83279252, + "learning_rate": 0.00014348518768739766, + "loss": 0.84351158, + "num_input_tokens_seen": 327725600, + "router_z_loss_mlp": 0.32763672, + "step": 3951, + "time_per_iteration": 2.7261831760406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01022819, + "balance_loss_mlp": 1.01409268, + "epoch": 0.7602924201616006, + "flos": 1470952134144.0, + "grad_norm": 0.0078103610005334605, + "language_loss": 0.7672804, + "learning_rate": 0.00014326682403047243, + "loss": 0.77750862, + "num_input_tokens_seen": 327954048, + "router_z_loss_mlp": 0.08740234, + "step": 3952, + "time_per_iteration": 4.8437769412994385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072298, + "balance_loss_mlp": 1.04094601, + "epoch": 0.7604848018468642, + "flos": 774280539648.0, + "grad_norm": 0.04997444218606865, + "language_loss": 0.86468828, + "learning_rate": 0.00014304859886964867, + "loss": 0.87541121, + "num_input_tokens_seen": 328034656, + "router_z_loss_mlp": 0.31323242, + "step": 3953, + "time_per_iteration": 3.0284688472747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074246, + "balance_loss_mlp": 1.04148698, + "epoch": 0.7606771835321278, + "flos": 557917355520.0, + "grad_norm": 0.06472890254950428, + "language_loss": 0.83519757, + "learning_rate": 0.00014283051228964878, + "loss": 0.84594011, + "num_input_tokens_seen": 328107264, + "router_z_loss_mlp": 0.32763672, + "step": 3954, + "time_per_iteration": 2.783090591430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067108, + "balance_loss_mlp": 1.03527939, + "epoch": 0.7608695652173914, + "flos": 525139504128.0, + "grad_norm": 0.05417243250507387, + "language_loss": 0.82754749, + "learning_rate": 0.00014261256437514197, + "loss": 0.83821857, + "num_input_tokens_seen": 328177168, + "router_z_loss_mlp": 0.31811523, + "step": 3955, + "time_per_iteration": 2.644597291946411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067658, + "balance_loss_mlp": 1.03497052, + "epoch": 0.7610619469026548, + "flos": 614757717504.0, + "grad_norm": 0.055555468337999576, + "language_loss": 0.82313621, + "learning_rate": 0.0001423947552107428, + "loss": 0.83381271, + "num_input_tokens_seen": 328245360, + "router_z_loss_mlp": 0.3269043, + "step": 3956, + "time_per_iteration": 2.7361013889312744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069431, + "balance_loss_mlp": 1.03648186, + "epoch": 0.7612543285879184, + "flos": 862992313344.0, + "grad_norm": 0.0569357592258459, + "language_loss": 0.77433807, + "learning_rate": 0.00014217708488101243, + "loss": 0.78503239, + "num_input_tokens_seen": 328326560, + "router_z_loss_mlp": 0.32958984, + "step": 3957, + "time_per_iteration": 3.050961494445801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074949, + "balance_loss_mlp": 1.04271495, + "epoch": 0.761446710273182, + "flos": 553392359424.0, + "grad_norm": 0.06767693941608623, + "language_loss": 0.77007008, + "learning_rate": 0.0001419595534704579, + "loss": 0.78081954, + "num_input_tokens_seen": 328395760, + "router_z_loss_mlp": 0.32226562, + "step": 3958, + "time_per_iteration": 2.660353899002075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062214, + "balance_loss_mlp": 1.03105259, + "epoch": 0.7616390919584456, + "flos": 467107513344.0, + "grad_norm": 0.049028323039667754, + "language_loss": 0.80953354, + "learning_rate": 0.00014174216106353237, + "loss": 0.82015562, + "num_input_tokens_seen": 328464560, + "router_z_loss_mlp": 0.3112793, + "step": 3959, + "time_per_iteration": 2.5838327407836914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068236, + "balance_loss_mlp": 1.03542924, + "epoch": 0.7618314736437091, + "flos": 498181584384.0, + "grad_norm": 0.05923666711399137, + "language_loss": 0.75957918, + "learning_rate": 0.00014152490774463512, + "loss": 0.77026153, + "num_input_tokens_seen": 328532640, + "router_z_loss_mlp": 0.328125, + "step": 3960, + "time_per_iteration": 2.629302978515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068987, + "balance_loss_mlp": 1.03639507, + "epoch": 0.7620238553289727, + "flos": 434319487488.0, + "grad_norm": 0.07059591088547341, + "language_loss": 0.8700611, + "learning_rate": 0.00014130779359811135, + "loss": 0.88075095, + "num_input_tokens_seen": 328595392, + "router_z_loss_mlp": 0.32592773, + "step": 3961, + "time_per_iteration": 2.485924243927002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067259, + "balance_loss_mlp": 1.03528666, + "epoch": 0.7622162370142362, + "flos": 663962296320.0, + "grad_norm": 0.05047068415952909, + "language_loss": 0.85704315, + "learning_rate": 0.0001410908187082521, + "loss": 0.86771578, + "num_input_tokens_seen": 328676368, + "router_z_loss_mlp": 0.31958008, + "step": 3962, + "time_per_iteration": 2.8265414237976074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065802, + "balance_loss_mlp": 1.03313875, + "epoch": 0.7624086186994998, + "flos": 557700567552.0, + "grad_norm": 0.05430861505422096, + "language_loss": 0.82810938, + "learning_rate": 0.0001408739831592949, + "loss": 0.83876741, + "num_input_tokens_seen": 328745136, + "router_z_loss_mlp": 0.32666016, + "step": 3963, + "time_per_iteration": 2.661726236343384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067961, + "balance_loss_mlp": 1.03529739, + "epoch": 0.7626010003847634, + "flos": 628844857344.0, + "grad_norm": 0.06042473159086171, + "language_loss": 0.77454793, + "learning_rate": 0.0001406572870354224, + "loss": 0.78522754, + "num_input_tokens_seen": 328820384, + "router_z_loss_mlp": 0.32666016, + "step": 3964, + "time_per_iteration": 2.7862119674682617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068961, + "balance_loss_mlp": 1.03706062, + "epoch": 0.7627933820700269, + "flos": 437716873728.0, + "grad_norm": 0.04673534263309711, + "language_loss": 0.86767244, + "learning_rate": 0.00014044073042076337, + "loss": 0.87836206, + "num_input_tokens_seen": 328884976, + "router_z_loss_mlp": 0.31884766, + "step": 3965, + "time_per_iteration": 2.4798128604888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069459, + "balance_loss_mlp": 1.03765345, + "epoch": 0.7629857637552905, + "flos": 532456602624.0, + "grad_norm": 0.04658863025626681, + "language_loss": 0.88987994, + "learning_rate": 0.00014022431339939302, + "loss": 0.90057456, + "num_input_tokens_seen": 328957792, + "router_z_loss_mlp": 0.31787109, + "step": 3966, + "time_per_iteration": 2.636894702911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067525, + "balance_loss_mlp": 1.03479052, + "epoch": 0.7631781454405541, + "flos": 679737249792.0, + "grad_norm": 0.08316975322842361, + "language_loss": 0.77961999, + "learning_rate": 0.00014000803605533163, + "loss": 0.79029524, + "num_input_tokens_seen": 329034960, + "router_z_loss_mlp": 0.32739258, + "step": 3967, + "time_per_iteration": 2.8040103912353516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065892, + "balance_loss_mlp": 1.03344274, + "epoch": 0.7633705271258177, + "flos": 507246133248.0, + "grad_norm": 0.05895392031680787, + "language_loss": 0.83634377, + "learning_rate": 0.00013979189847254553, + "loss": 0.84700263, + "num_input_tokens_seen": 329100848, + "router_z_loss_mlp": 0.32446289, + "step": 3968, + "time_per_iteration": 2.5431933403015137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067532, + "balance_loss_mlp": 1.03501129, + "epoch": 0.7635629088110811, + "flos": 618574123008.0, + "grad_norm": 0.055607531947043785, + "language_loss": 0.80514443, + "learning_rate": 0.00013957590073494674, + "loss": 0.81581974, + "num_input_tokens_seen": 329181120, + "router_z_loss_mlp": 0.32519531, + "step": 3969, + "time_per_iteration": 2.8017959594726562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064277, + "balance_loss_mlp": 1.03232884, + "epoch": 0.7637552904963447, + "flos": 638140750848.0, + "grad_norm": 0.26403384502939975, + "language_loss": 0.78649521, + "learning_rate": 0.0001393600429263931, + "loss": 0.79713798, + "num_input_tokens_seen": 329249888, + "router_z_loss_mlp": 0.31933594, + "step": 3970, + "time_per_iteration": 4.2505412101745605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0100666, + "balance_loss_mlp": 0.99793345, + "epoch": 0.7639476721816083, + "flos": 1562359905792.0, + "grad_norm": 0.004510519200430985, + "language_loss": 0.74744886, + "learning_rate": 0.00013914432513068792, + "loss": 0.75751543, + "num_input_tokens_seen": 329483824, + "router_z_loss_mlp": 0.08740234, + "step": 3971, + "time_per_iteration": 4.917391777038574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063864, + "balance_loss_mlp": 1.03112936, + "epoch": 0.7641400538668719, + "flos": 495729925632.0, + "grad_norm": 0.05348736526149064, + "language_loss": 0.81438577, + "learning_rate": 0.0001389287474315804, + "loss": 0.82502437, + "num_input_tokens_seen": 329553536, + "router_z_loss_mlp": 0.32739258, + "step": 3972, + "time_per_iteration": 2.611975908279419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063623, + "balance_loss_mlp": 1.03153205, + "epoch": 0.7643324355521355, + "flos": 578173635072.0, + "grad_norm": 0.05070273758495156, + "language_loss": 0.7976076, + "learning_rate": 0.00013871330991276505, + "loss": 0.80824381, + "num_input_tokens_seen": 329621856, + "router_z_loss_mlp": 0.32080078, + "step": 3973, + "time_per_iteration": 2.6702983379364014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106582, + "balance_loss_mlp": 1.03318095, + "epoch": 0.764524817237399, + "flos": 784472698368.0, + "grad_norm": 0.053475096213737486, + "language_loss": 0.80356216, + "learning_rate": 0.00013849801265788247, + "loss": 0.81422037, + "num_input_tokens_seen": 329708192, + "router_z_loss_mlp": 0.32641602, + "step": 3974, + "time_per_iteration": 3.00087571144104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066526, + "balance_loss_mlp": 1.03357661, + "epoch": 0.7647171989226625, + "flos": 526025594880.0, + "grad_norm": 0.054787050143816365, + "language_loss": 0.82488281, + "learning_rate": 0.00013828285575051818, + "loss": 0.83554804, + "num_input_tokens_seen": 329774704, + "router_z_loss_mlp": 0.32958984, + "step": 3975, + "time_per_iteration": 2.6055147647857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061125, + "balance_loss_mlp": 1.0279367, + "epoch": 0.7649095806079261, + "flos": 554589780480.0, + "grad_norm": 0.05436611510263978, + "language_loss": 0.84129888, + "learning_rate": 0.0001380678392742035, + "loss": 0.85191011, + "num_input_tokens_seen": 329846432, + "router_z_loss_mlp": 0.33203125, + "step": 3976, + "time_per_iteration": 2.6914188861846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106537, + "balance_loss_mlp": 1.03232551, + "epoch": 0.7651019622931897, + "flos": 648836296704.0, + "grad_norm": 0.051149264081770666, + "language_loss": 0.84838861, + "learning_rate": 0.00013785296331241526, + "loss": 0.85904235, + "num_input_tokens_seen": 329926336, + "router_z_loss_mlp": 0.33056641, + "step": 3977, + "time_per_iteration": 2.866154670715332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064487, + "balance_loss_mlp": 1.03089428, + "epoch": 0.7652943439784533, + "flos": 1046034971136.0, + "grad_norm": 0.05614197674370758, + "language_loss": 0.87043619, + "learning_rate": 0.00013763822794857583, + "loss": 0.8810811, + "num_input_tokens_seen": 330009536, + "router_z_loss_mlp": 0.33618164, + "step": 3978, + "time_per_iteration": 3.309242010116577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062044, + "balance_loss_mlp": 1.02947557, + "epoch": 0.7654867256637168, + "flos": 504085883904.0, + "grad_norm": 0.05878573704619195, + "language_loss": 0.89744586, + "learning_rate": 0.00013742363326605278, + "loss": 0.90806627, + "num_input_tokens_seen": 330083264, + "router_z_loss_mlp": 0.32568359, + "step": 3979, + "time_per_iteration": 2.687633991241455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060607, + "balance_loss_mlp": 1.02789593, + "epoch": 0.7656791073489804, + "flos": 574422658560.0, + "grad_norm": 0.055229141283006315, + "language_loss": 0.78390539, + "learning_rate": 0.00013720917934815935, + "loss": 0.79451144, + "num_input_tokens_seen": 330157120, + "router_z_loss_mlp": 0.32714844, + "step": 3980, + "time_per_iteration": 2.7192299365997314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106176, + "balance_loss_mlp": 1.02876329, + "epoch": 0.765871489034244, + "flos": 492568266240.0, + "grad_norm": 0.11784191582460708, + "language_loss": 0.82716662, + "learning_rate": 0.00013699486627815344, + "loss": 0.83778423, + "num_input_tokens_seen": 330224560, + "router_z_loss_mlp": 0.33007812, + "step": 3981, + "time_per_iteration": 2.5523879528045654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066983, + "balance_loss_mlp": 1.03386712, + "epoch": 0.7660638707195075, + "flos": 485769111552.0, + "grad_norm": 0.048709081947545384, + "language_loss": 0.82393169, + "learning_rate": 0.00013678069413923928, + "loss": 0.83460152, + "num_input_tokens_seen": 330292000, + "router_z_loss_mlp": 0.33129883, + "step": 3982, + "time_per_iteration": 2.5948498249053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062511, + "balance_loss_mlp": 1.03034854, + "epoch": 0.766256252404771, + "flos": 444059131392.0, + "grad_norm": 0.05195057178385164, + "language_loss": 0.81826979, + "learning_rate": 0.00013656666301456555, + "loss": 0.82889485, + "num_input_tokens_seen": 330357472, + "router_z_loss_mlp": 0.3215332, + "step": 3983, + "time_per_iteration": 2.5596601963043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063593, + "balance_loss_mlp": 1.02980876, + "epoch": 0.7664486340900346, + "flos": 484922308608.0, + "grad_norm": 0.08343651185872063, + "language_loss": 0.84138393, + "learning_rate": 0.0001363527729872267, + "loss": 0.85201979, + "num_input_tokens_seen": 330427792, + "router_z_loss_mlp": 0.33813477, + "step": 3984, + "time_per_iteration": 2.6182045936584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065831, + "balance_loss_mlp": 1.03354931, + "epoch": 0.7666410157752982, + "flos": 645905981952.0, + "grad_norm": 0.1262618740109736, + "language_loss": 0.76256335, + "learning_rate": 0.00013613902414026207, + "loss": 0.77322161, + "num_input_tokens_seen": 330500320, + "router_z_loss_mlp": 0.32275391, + "step": 3985, + "time_per_iteration": 2.7776031494140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106424, + "balance_loss_mlp": 1.03079021, + "epoch": 0.7668333974605618, + "flos": 773964827136.0, + "grad_norm": 0.050561982196081254, + "language_loss": 0.8239125, + "learning_rate": 0.00013592541655665642, + "loss": 0.83455491, + "num_input_tokens_seen": 330581696, + "router_z_loss_mlp": 0.3347168, + "step": 3986, + "time_per_iteration": 2.952242374420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064817, + "balance_loss_mlp": 1.03136706, + "epoch": 0.7670257791458254, + "flos": 613200913920.0, + "grad_norm": 0.052879642645961566, + "language_loss": 0.85094202, + "learning_rate": 0.00013571195031933947, + "loss": 0.86159021, + "num_input_tokens_seen": 330648000, + "router_z_loss_mlp": 0.33447266, + "step": 3987, + "time_per_iteration": 2.7266581058502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01005099, + "balance_loss_mlp": 0.9958964, + "epoch": 0.7672181608310888, + "flos": 1484608670208.0, + "grad_norm": 0.011043844961489012, + "language_loss": 0.80481339, + "learning_rate": 0.00013549862551118626, + "loss": 0.8148644, + "num_input_tokens_seen": 330873872, + "router_z_loss_mlp": 0.09179688, + "step": 3988, + "time_per_iteration": 4.669104814529419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063461, + "balance_loss_mlp": 1.03079784, + "epoch": 0.7674105425163524, + "flos": 610449509376.0, + "grad_norm": 0.05355294055383006, + "language_loss": 0.85597003, + "learning_rate": 0.00013528544221501655, + "loss": 0.86660457, + "num_input_tokens_seen": 330945760, + "router_z_loss_mlp": 0.32666016, + "step": 3989, + "time_per_iteration": 2.7729666233062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063434, + "balance_loss_mlp": 1.02960289, + "epoch": 0.767602924201616, + "flos": 844857423360.0, + "grad_norm": 0.05868617722535175, + "language_loss": 0.81521833, + "learning_rate": 0.00013507240051359586, + "loss": 0.82585269, + "num_input_tokens_seen": 331025584, + "router_z_loss_mlp": 0.33837891, + "step": 3990, + "time_per_iteration": 3.0997486114501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065906, + "balance_loss_mlp": 1.0340054, + "epoch": 0.7677953058868796, + "flos": 526857841152.0, + "grad_norm": 0.07003191043706981, + "language_loss": 0.8601203, + "learning_rate": 0.00013485950048963425, + "loss": 0.8707794, + "num_input_tokens_seen": 331093008, + "router_z_loss_mlp": 0.31884766, + "step": 3991, + "time_per_iteration": 2.5849506855010986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063228, + "balance_loss_mlp": 1.03039789, + "epoch": 0.7679876875721431, + "flos": 923161660416.0, + "grad_norm": 0.07243254290057845, + "language_loss": 0.82772785, + "learning_rate": 0.00013464674222578643, + "loss": 0.83836013, + "num_input_tokens_seen": 331177120, + "router_z_loss_mlp": 0.32836914, + "step": 3992, + "time_per_iteration": 3.2332818508148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106772, + "balance_loss_mlp": 1.03410292, + "epoch": 0.7681800692574067, + "flos": 457855289856.0, + "grad_norm": 0.05271812769462788, + "language_loss": 0.83249938, + "learning_rate": 0.00013443412580465292, + "loss": 0.8431766, + "num_input_tokens_seen": 331245424, + "router_z_loss_mlp": 0.33642578, + "step": 3993, + "time_per_iteration": 2.5794618129730225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062971, + "balance_loss_mlp": 1.03033197, + "epoch": 0.7683724509426703, + "flos": 658113251328.0, + "grad_norm": 0.050288127283744266, + "language_loss": 0.83906549, + "learning_rate": 0.00013422165130877857, + "loss": 0.84969521, + "num_input_tokens_seen": 331327504, + "router_z_loss_mlp": 0.32641602, + "step": 3994, + "time_per_iteration": 2.8854472637176514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060909, + "balance_loss_mlp": 1.028723, + "epoch": 0.7685648326279338, + "flos": 555021946368.0, + "grad_norm": 0.05841740887579896, + "language_loss": 0.80092537, + "learning_rate": 0.00013400931882065327, + "loss": 0.81153446, + "num_input_tokens_seen": 331398464, + "router_z_loss_mlp": 0.32177734, + "step": 3995, + "time_per_iteration": 2.6247458457946777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066631, + "balance_loss_mlp": 1.03337145, + "epoch": 0.7687572143131974, + "flos": 687070315008.0, + "grad_norm": 0.0471892049079333, + "language_loss": 0.8085227, + "learning_rate": 0.0001337971284227118, + "loss": 0.81918901, + "num_input_tokens_seen": 331484592, + "router_z_loss_mlp": 0.33276367, + "step": 3996, + "time_per_iteration": 3.0075807571411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003827, + "balance_loss_mlp": 0.99471956, + "epoch": 0.7689495959984609, + "flos": 1488653448192.0, + "grad_norm": 0.013752910811902266, + "language_loss": 0.76118422, + "learning_rate": 0.00013358508019733388, + "loss": 0.77122247, + "num_input_tokens_seen": 331721360, + "router_z_loss_mlp": 0.09130859, + "step": 3997, + "time_per_iteration": 4.915713787078857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060268, + "balance_loss_mlp": 1.02698493, + "epoch": 0.7691419776837245, + "flos": 570133389312.0, + "grad_norm": 0.05931733235007729, + "language_loss": 0.79872787, + "learning_rate": 0.0001333731742268438, + "loss": 0.80933058, + "num_input_tokens_seen": 331794240, + "router_z_loss_mlp": 0.33276367, + "step": 3998, + "time_per_iteration": 2.7005136013031006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063521, + "balance_loss_mlp": 1.03033328, + "epoch": 0.7693343593689881, + "flos": 519812785152.0, + "grad_norm": 0.05123464057208785, + "language_loss": 0.8547945, + "learning_rate": 0.0001331614105935109, + "loss": 0.8654297, + "num_input_tokens_seen": 331866496, + "router_z_loss_mlp": 0.33203125, + "step": 3999, + "time_per_iteration": 2.6618032455444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062851, + "balance_loss_mlp": 1.0290674, + "epoch": 0.7695267410542517, + "flos": 660086254080.0, + "grad_norm": 0.04349114240195965, + "language_loss": 0.84291816, + "learning_rate": 0.00013294978937954883, + "loss": 0.85354662, + "num_input_tokens_seen": 331936592, + "router_z_loss_mlp": 0.33813477, + "step": 4000, + "time_per_iteration": 2.787548780441284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106608, + "balance_loss_mlp": 1.03336918, + "epoch": 0.7697191227395151, + "flos": 546548124672.0, + "grad_norm": 0.06371806812200402, + "language_loss": 0.85203207, + "learning_rate": 0.00013273831066711655, + "loss": 0.86269283, + "num_input_tokens_seen": 332003536, + "router_z_loss_mlp": 0.32714844, + "step": 4001, + "time_per_iteration": 2.603930950164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066325, + "balance_loss_mlp": 1.03387642, + "epoch": 0.7699115044247787, + "flos": 540339697152.0, + "grad_norm": 0.04713288479352539, + "language_loss": 0.80269563, + "learning_rate": 0.00013252697453831747, + "loss": 0.8133589, + "num_input_tokens_seen": 332075248, + "router_z_loss_mlp": 0.32446289, + "step": 4002, + "time_per_iteration": 2.681474447250366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065185, + "balance_loss_mlp": 1.03230727, + "epoch": 0.7701038861100423, + "flos": 562635818496.0, + "grad_norm": 0.05017266789361132, + "language_loss": 0.82595527, + "learning_rate": 0.00013231578107519916, + "loss": 0.8366071, + "num_input_tokens_seen": 332158944, + "router_z_loss_mlp": 0.32885742, + "step": 4003, + "time_per_iteration": 2.910759210586548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106487, + "balance_loss_mlp": 1.03289843, + "epoch": 0.7702962677953059, + "flos": 481490016768.0, + "grad_norm": 0.05443168691462721, + "language_loss": 0.82779682, + "learning_rate": 0.00013210473035975422, + "loss": 0.83844554, + "num_input_tokens_seen": 332226368, + "router_z_loss_mlp": 0.31958008, + "step": 4004, + "time_per_iteration": 2.574204444885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106935, + "balance_loss_mlp": 1.03656733, + "epoch": 0.7704886494805695, + "flos": 770036350464.0, + "grad_norm": 0.05675172766442488, + "language_loss": 0.85354382, + "learning_rate": 0.0001318938224739201, + "loss": 0.86423731, + "num_input_tokens_seen": 332314784, + "router_z_loss_mlp": 0.32788086, + "step": 4005, + "time_per_iteration": 3.032860279083252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067294, + "balance_loss_mlp": 1.03417802, + "epoch": 0.770681031165833, + "flos": 600912096768.0, + "grad_norm": 0.04532626069780256, + "language_loss": 0.83667225, + "learning_rate": 0.00013168305749957843, + "loss": 0.84734517, + "num_input_tokens_seen": 332387952, + "router_z_loss_mlp": 0.33129883, + "step": 4006, + "time_per_iteration": 2.7624073028564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066913, + "balance_loss_mlp": 1.03379726, + "epoch": 0.7708734128510966, + "flos": 495862345728.0, + "grad_norm": 0.05222212765251844, + "language_loss": 0.82636768, + "learning_rate": 0.00013147243551855532, + "loss": 0.83703679, + "num_input_tokens_seen": 332456352, + "router_z_loss_mlp": 0.33129883, + "step": 4007, + "time_per_iteration": 2.5816714763641357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063599, + "balance_loss_mlp": 1.03115058, + "epoch": 0.7710657945363601, + "flos": 567012427776.0, + "grad_norm": 0.057481422314481036, + "language_loss": 0.80578291, + "learning_rate": 0.00013126195661262148, + "loss": 0.81641883, + "num_input_tokens_seen": 332534288, + "router_z_loss_mlp": 0.32446289, + "step": 4008, + "time_per_iteration": 2.7452778816223145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064326, + "balance_loss_mlp": 1.03190088, + "epoch": 0.7712581762216237, + "flos": 604251256320.0, + "grad_norm": 0.05872708876253251, + "language_loss": 0.86326575, + "learning_rate": 0.00013105162086349216, + "loss": 0.873909, + "num_input_tokens_seen": 332615440, + "router_z_loss_mlp": 0.32421875, + "step": 4009, + "time_per_iteration": 2.8586156368255615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066916, + "balance_loss_mlp": 1.03530204, + "epoch": 0.7714505579068872, + "flos": 530620402176.0, + "grad_norm": 0.047861775046014535, + "language_loss": 0.86009622, + "learning_rate": 0.00013084142835282687, + "loss": 0.87076533, + "num_input_tokens_seen": 332687360, + "router_z_loss_mlp": 0.31591797, + "step": 4010, + "time_per_iteration": 2.704119920730591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01005244, + "balance_loss_mlp": 0.99647039, + "epoch": 0.7716429395921508, + "flos": 1421414313984.0, + "grad_norm": 0.012063998338178145, + "language_loss": 0.79884362, + "learning_rate": 0.00013063137916222956, + "loss": 0.80889606, + "num_input_tokens_seen": 332919936, + "router_z_loss_mlp": 0.08789062, + "step": 4011, + "time_per_iteration": 4.7817652225494385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065556, + "balance_loss_mlp": 1.03301144, + "epoch": 0.7718353212774144, + "flos": 578140139520.0, + "grad_norm": 0.051053206649878655, + "language_loss": 0.89366746, + "learning_rate": 0.0001304214733732485, + "loss": 0.90432304, + "num_input_tokens_seen": 332990096, + "router_z_loss_mlp": 0.32543945, + "step": 4012, + "time_per_iteration": 2.7189698219299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067191, + "balance_loss_mlp": 1.0337882, + "epoch": 0.772027702962678, + "flos": 510486368256.0, + "grad_norm": 0.053964234671719305, + "language_loss": 0.82622194, + "learning_rate": 0.00013021171106737672, + "loss": 0.8368938, + "num_input_tokens_seen": 333063616, + "router_z_loss_mlp": 0.33422852, + "step": 4013, + "time_per_iteration": 2.695345401763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062166, + "balance_loss_mlp": 1.03031349, + "epoch": 0.7722200846479416, + "flos": 525391197696.0, + "grad_norm": 0.05051004242016687, + "language_loss": 0.79927659, + "learning_rate": 0.00013000209232605071, + "loss": 0.80989826, + "num_input_tokens_seen": 333136368, + "router_z_loss_mlp": 0.31835938, + "step": 4014, + "time_per_iteration": 2.6742262840270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062585, + "balance_loss_mlp": 1.03049421, + "epoch": 0.772412466333205, + "flos": 479348278272.0, + "grad_norm": 0.06883144067650042, + "language_loss": 0.79881573, + "learning_rate": 0.0001297926172306519, + "loss": 0.80944163, + "num_input_tokens_seen": 333207136, + "router_z_loss_mlp": 0.32080078, + "step": 4015, + "time_per_iteration": 2.5998587608337402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106658, + "balance_loss_mlp": 1.03420317, + "epoch": 0.7726048480184686, + "flos": 905284256256.0, + "grad_norm": 0.049021978478966305, + "language_loss": 0.7864179, + "learning_rate": 0.0001295832858625055, + "loss": 0.79708374, + "num_input_tokens_seen": 333291920, + "router_z_loss_mlp": 0.32373047, + "step": 4016, + "time_per_iteration": 3.241476535797119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064551, + "balance_loss_mlp": 1.03195906, + "epoch": 0.7727972297037322, + "flos": 631085520384.0, + "grad_norm": 0.050738578814051916, + "language_loss": 0.69703871, + "learning_rate": 0.00012937409830288154, + "loss": 0.70768428, + "num_input_tokens_seen": 333369824, + "router_z_loss_mlp": 0.32592773, + "step": 4017, + "time_per_iteration": 2.7928261756896973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060438, + "balance_loss_mlp": 1.02868032, + "epoch": 0.7729896113889958, + "flos": 414565185024.0, + "grad_norm": 0.11993476807725541, + "language_loss": 0.84959614, + "learning_rate": 0.00012916505463299362, + "loss": 0.86020052, + "num_input_tokens_seen": 333434192, + "router_z_loss_mlp": 0.31738281, + "step": 4018, + "time_per_iteration": 2.4724020957946777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061884, + "balance_loss_mlp": 1.03012657, + "epoch": 0.7731819930742593, + "flos": 668609538048.0, + "grad_norm": 0.07815379187745079, + "language_loss": 0.78152752, + "learning_rate": 0.00012895615493399972, + "loss": 0.79214638, + "num_input_tokens_seen": 333509696, + "router_z_loss_mlp": 0.31738281, + "step": 4019, + "time_per_iteration": 2.7819771766662598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105935, + "balance_loss_mlp": 1.02704406, + "epoch": 0.7733743747595229, + "flos": 489604455936.0, + "grad_norm": 0.06361322846277707, + "language_loss": 0.82174695, + "learning_rate": 0.00012874739928700192, + "loss": 0.83234048, + "num_input_tokens_seen": 333575184, + "router_z_loss_mlp": 0.32299805, + "step": 4020, + "time_per_iteration": 2.577558755874634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063745, + "balance_loss_mlp": 1.03046131, + "epoch": 0.7735667564447865, + "flos": 659294705664.0, + "grad_norm": 0.0626070053016161, + "language_loss": 0.79737717, + "learning_rate": 0.00012853878777304624, + "loss": 0.80801463, + "num_input_tokens_seen": 333651568, + "router_z_loss_mlp": 0.33300781, + "step": 4021, + "time_per_iteration": 2.868053674697876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064138, + "balance_loss_mlp": 1.03140283, + "epoch": 0.77375913813005, + "flos": 533106966528.0, + "grad_norm": 0.04737550155927703, + "language_loss": 0.84463626, + "learning_rate": 0.000128330320473123, + "loss": 0.85527766, + "num_input_tokens_seen": 333726400, + "router_z_loss_mlp": 0.32739258, + "step": 4022, + "time_per_iteration": 2.668313503265381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008425, + "balance_loss_mlp": 0.99988997, + "epoch": 0.7739515198153136, + "flos": 1519260447744.0, + "grad_norm": 0.005844569838786065, + "language_loss": 0.783319, + "learning_rate": 0.00012812199746816628, + "loss": 0.79340327, + "num_input_tokens_seen": 333960224, + "router_z_loss_mlp": 0.08544922, + "step": 4023, + "time_per_iteration": 4.965493202209473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063459, + "balance_loss_mlp": 1.03053296, + "epoch": 0.7741439015005771, + "flos": 639819800064.0, + "grad_norm": 0.08130494829641424, + "language_loss": 0.81473714, + "learning_rate": 0.0001279138188390543, + "loss": 0.82537174, + "num_input_tokens_seen": 334033904, + "router_z_loss_mlp": 0.3293457, + "step": 4024, + "time_per_iteration": 2.7925288677215576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063718, + "balance_loss_mlp": 1.03122211, + "epoch": 0.7743362831858407, + "flos": 665546803200.0, + "grad_norm": 0.05426924538048376, + "language_loss": 0.86122662, + "learning_rate": 0.00012770578466660915, + "loss": 0.87186384, + "num_input_tokens_seen": 334107904, + "router_z_loss_mlp": 0.32495117, + "step": 4025, + "time_per_iteration": 2.8743951320648193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067837, + "balance_loss_mlp": 1.0342437, + "epoch": 0.7745286648711043, + "flos": 562453936128.0, + "grad_norm": 0.050549186901469166, + "language_loss": 0.81480557, + "learning_rate": 0.0001274978950315968, + "loss": 0.82548392, + "num_input_tokens_seen": 334184048, + "router_z_loss_mlp": 0.33618164, + "step": 4026, + "time_per_iteration": 2.7961745262145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061898, + "balance_loss_mlp": 1.02923501, + "epoch": 0.7747210465563679, + "flos": 516651125760.0, + "grad_norm": 0.06240008099647138, + "language_loss": 0.82893825, + "learning_rate": 0.00012729015001472716, + "loss": 0.83955729, + "num_input_tokens_seen": 334257152, + "router_z_loss_mlp": 0.32666016, + "step": 4027, + "time_per_iteration": 2.63754940032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065965, + "balance_loss_mlp": 1.03227663, + "epoch": 0.7749134282416313, + "flos": 633921292800.0, + "grad_norm": 0.052874284120550924, + "language_loss": 0.81483364, + "learning_rate": 0.00012708254969665418, + "loss": 0.82549322, + "num_input_tokens_seen": 334331312, + "router_z_loss_mlp": 0.3371582, + "step": 4028, + "time_per_iteration": 2.7484118938446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064367, + "balance_loss_mlp": 1.03070259, + "epoch": 0.7751058099268949, + "flos": 495118849536.0, + "grad_norm": 0.06123905199819526, + "language_loss": 0.83476496, + "learning_rate": 0.00012687509415797526, + "loss": 0.84540868, + "num_input_tokens_seen": 334397344, + "router_z_loss_mlp": 0.33691406, + "step": 4029, + "time_per_iteration": 2.5675880908966064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063972, + "balance_loss_mlp": 1.03183281, + "epoch": 0.7752981916121585, + "flos": 510048410112.0, + "grad_norm": 0.09107931997699928, + "language_loss": 0.81183356, + "learning_rate": 0.00012666778347923208, + "loss": 0.82247323, + "num_input_tokens_seen": 334467872, + "router_z_loss_mlp": 0.32128906, + "step": 4030, + "time_per_iteration": 2.632314443588257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060699, + "balance_loss_mlp": 1.02813113, + "epoch": 0.7754905732974221, + "flos": 497295493632.0, + "grad_norm": 0.04486214088641844, + "language_loss": 0.83638769, + "learning_rate": 0.0001264606177409092, + "loss": 0.84699464, + "num_input_tokens_seen": 334539088, + "router_z_loss_mlp": 0.32568359, + "step": 4031, + "time_per_iteration": 2.6301512718200684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063527, + "balance_loss_mlp": 1.03081632, + "epoch": 0.7756829549826857, + "flos": 480486062592.0, + "grad_norm": 0.0481221818679906, + "language_loss": 0.86095941, + "learning_rate": 0.00012625359702343609, + "loss": 0.87159473, + "num_input_tokens_seen": 334612576, + "router_z_loss_mlp": 0.32714844, + "step": 4032, + "time_per_iteration": 2.708512544631958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063655, + "balance_loss_mlp": 1.03125429, + "epoch": 0.7758753366679492, + "flos": 552368056320.0, + "grad_norm": 0.0642979185043706, + "language_loss": 0.84532368, + "learning_rate": 0.00012604672140718504, + "loss": 0.85596019, + "num_input_tokens_seen": 334677824, + "router_z_loss_mlp": 0.32397461, + "step": 4033, + "time_per_iteration": 2.632307529449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062755, + "balance_loss_mlp": 1.03006816, + "epoch": 0.7760677183532128, + "flos": 703529127936.0, + "grad_norm": 0.05215032719242253, + "language_loss": 0.77701473, + "learning_rate": 0.00012583999097247233, + "loss": 0.78764236, + "num_input_tokens_seen": 334751456, + "router_z_loss_mlp": 0.3269043, + "step": 4034, + "time_per_iteration": 2.8174097537994385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064658, + "balance_loss_mlp": 1.03266239, + "epoch": 0.7762601000384763, + "flos": 523218935808.0, + "grad_norm": 0.06260246603028506, + "language_loss": 0.79696673, + "learning_rate": 0.0001256334057995578, + "loss": 0.80761331, + "num_input_tokens_seen": 334823008, + "router_z_loss_mlp": 0.31982422, + "step": 4035, + "time_per_iteration": 2.69726300239563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063141, + "balance_loss_mlp": 1.03159809, + "epoch": 0.7764524817237399, + "flos": 557262609408.0, + "grad_norm": 0.048886632926304276, + "language_loss": 0.84979451, + "learning_rate": 0.000125426965968645, + "loss": 0.86042595, + "num_input_tokens_seen": 334896048, + "router_z_loss_mlp": 0.31518555, + "step": 4036, + "time_per_iteration": 2.72336483001709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066748, + "balance_loss_mlp": 1.03508615, + "epoch": 0.7766448634090035, + "flos": 579454013952.0, + "grad_norm": 0.07567948550064775, + "language_loss": 0.81946111, + "learning_rate": 0.00012522067155988092, + "loss": 0.83012855, + "num_input_tokens_seen": 334964416, + "router_z_loss_mlp": 0.31640625, + "step": 4037, + "time_per_iteration": 2.6716489791870117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063745, + "balance_loss_mlp": 1.03153515, + "epoch": 0.776837245094267, + "flos": 635300596224.0, + "grad_norm": 0.05548749189645599, + "language_loss": 0.75042689, + "learning_rate": 0.00012501452265335617, + "loss": 0.76106441, + "num_input_tokens_seen": 335043360, + "router_z_loss_mlp": 0.32202148, + "step": 4038, + "time_per_iteration": 2.798152446746826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063102, + "balance_loss_mlp": 1.03115439, + "epoch": 0.7770296267795306, + "flos": 614398334976.0, + "grad_norm": 0.04733898192839437, + "language_loss": 0.83099091, + "learning_rate": 0.0001248085193291047, + "loss": 0.84162188, + "num_input_tokens_seen": 335113216, + "router_z_loss_mlp": 0.31933594, + "step": 4039, + "time_per_iteration": 2.713104009628296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064564, + "balance_loss_mlp": 1.03287828, + "epoch": 0.7772220084647942, + "flos": 878438407680.0, + "grad_norm": 0.06729067040173044, + "language_loss": 0.8247925, + "learning_rate": 0.00012460266166710443, + "loss": 0.83543813, + "num_input_tokens_seen": 335195824, + "router_z_loss_mlp": 0.31665039, + "step": 4040, + "time_per_iteration": 3.142155408859253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061758, + "balance_loss_mlp": 1.02988183, + "epoch": 0.7774143901500578, + "flos": 839293567488.0, + "grad_norm": 0.08233225163586903, + "language_loss": 0.77612185, + "learning_rate": 0.00012439694974727633, + "loss": 0.78673941, + "num_input_tokens_seen": 335269712, + "router_z_loss_mlp": 0.31860352, + "step": 4041, + "time_per_iteration": 2.9853243827819824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065759, + "balance_loss_mlp": 1.03338194, + "epoch": 0.7776067718353212, + "flos": 567878169600.0, + "grad_norm": 0.054149054361607385, + "language_loss": 0.79806697, + "learning_rate": 0.00012419138364948458, + "loss": 0.80872452, + "num_input_tokens_seen": 335343408, + "router_z_loss_mlp": 0.32373047, + "step": 4042, + "time_per_iteration": 2.7431745529174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064082, + "balance_loss_mlp": 1.03191924, + "epoch": 0.7777991535205848, + "flos": 745627603968.0, + "grad_norm": 0.05348286137005146, + "language_loss": 0.8234185, + "learning_rate": 0.00012398596345353702, + "loss": 0.83405924, + "num_input_tokens_seen": 335415360, + "router_z_loss_mlp": 0.3215332, + "step": 4043, + "time_per_iteration": 2.896669864654541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069609, + "balance_loss_mlp": 1.03785181, + "epoch": 0.7779915352058484, + "flos": 537799288320.0, + "grad_norm": 0.048601854183842386, + "language_loss": 0.83191538, + "learning_rate": 0.0001237806892391851, + "loss": 0.84261149, + "num_input_tokens_seen": 335491568, + "router_z_loss_mlp": 0.31738281, + "step": 4044, + "time_per_iteration": 2.6875576972961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067998, + "balance_loss_mlp": 1.03523958, + "epoch": 0.778183916891112, + "flos": 634497463296.0, + "grad_norm": 0.05218142456455376, + "language_loss": 0.807693, + "learning_rate": 0.0001235755610861233, + "loss": 0.81837296, + "num_input_tokens_seen": 335567200, + "router_z_loss_mlp": 0.32763672, + "step": 4045, + "time_per_iteration": 2.7440977096557617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063341, + "balance_loss_mlp": 1.03125, + "epoch": 0.7783762985763756, + "flos": 588400699392.0, + "grad_norm": 0.06119934823569683, + "language_loss": 0.85257781, + "learning_rate": 0.0001233705790739893, + "loss": 0.86321127, + "num_input_tokens_seen": 335640512, + "router_z_loss_mlp": 0.32080078, + "step": 4046, + "time_per_iteration": 2.771397829055786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066435, + "balance_loss_mlp": 1.03398585, + "epoch": 0.7785686802616391, + "flos": 930261970944.0, + "grad_norm": 0.05518335637199763, + "language_loss": 0.74865597, + "learning_rate": 0.0001231657432823643, + "loss": 0.75932032, + "num_input_tokens_seen": 335726016, + "router_z_loss_mlp": 0.32446289, + "step": 4047, + "time_per_iteration": 3.2299704551696777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068089, + "balance_loss_mlp": 1.03618836, + "epoch": 0.7787610619469026, + "flos": 497679607296.0, + "grad_norm": 0.061331476050258626, + "language_loss": 0.78644454, + "learning_rate": 0.0001229610537907725, + "loss": 0.7971254, + "num_input_tokens_seen": 335794864, + "router_z_loss_mlp": 0.31884766, + "step": 4048, + "time_per_iteration": 2.581489324569702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062949, + "balance_loss_mlp": 1.03040469, + "epoch": 0.7789534436321662, + "flos": 515385303552.0, + "grad_norm": 0.060582734060361326, + "language_loss": 0.90193808, + "learning_rate": 0.00012275651067868143, + "loss": 0.9125675, + "num_input_tokens_seen": 335860928, + "router_z_loss_mlp": 0.32543945, + "step": 4049, + "time_per_iteration": 2.5799412727355957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066, + "balance_loss_mlp": 1.03350401, + "epoch": 0.7791458253174298, + "flos": 988081555968.0, + "grad_norm": 0.06086378000483131, + "language_loss": 0.80482578, + "learning_rate": 0.00012255211402550182, + "loss": 0.81548578, + "num_input_tokens_seen": 335945728, + "router_z_loss_mlp": 0.32495117, + "step": 4050, + "time_per_iteration": 3.228003740310669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065165, + "balance_loss_mlp": 1.03283536, + "epoch": 0.7793382070026933, + "flos": 628756107264.0, + "grad_norm": 0.1203274251701162, + "language_loss": 0.76654673, + "learning_rate": 0.00012234786391058727, + "loss": 0.77719831, + "num_input_tokens_seen": 336014848, + "router_z_loss_mlp": 0.32324219, + "step": 4051, + "time_per_iteration": 2.7767224311828613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106691, + "balance_loss_mlp": 1.03405643, + "epoch": 0.7795305886879569, + "flos": 531500700672.0, + "grad_norm": 0.06608083549317771, + "language_loss": 0.85191727, + "learning_rate": 0.0001221437604132352, + "loss": 0.86258644, + "num_input_tokens_seen": 336080096, + "router_z_loss_mlp": 0.32861328, + "step": 4052, + "time_per_iteration": 2.6072323322296143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069028, + "balance_loss_mlp": 1.03703237, + "epoch": 0.7797229703732205, + "flos": 611690600448.0, + "grad_norm": 0.06701840569046753, + "language_loss": 0.80875957, + "learning_rate": 0.0001219398036126852, + "loss": 0.8194499, + "num_input_tokens_seen": 336154640, + "router_z_loss_mlp": 0.31982422, + "step": 4053, + "time_per_iteration": 2.789151668548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069667, + "balance_loss_mlp": 1.03738546, + "epoch": 0.7799153520584841, + "flos": 871758526464.0, + "grad_norm": 0.05089113411890528, + "language_loss": 0.78444964, + "learning_rate": 0.00012173599358812027, + "loss": 0.79514629, + "num_input_tokens_seen": 336244160, + "router_z_loss_mlp": 0.32275391, + "step": 4054, + "time_per_iteration": 3.282203197479248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065975, + "balance_loss_mlp": 1.03359818, + "epoch": 0.7801077337437476, + "flos": 583348995072.0, + "grad_norm": 0.06359619445711458, + "language_loss": 0.82295758, + "learning_rate": 0.0001215323304186668, + "loss": 0.83361733, + "num_input_tokens_seen": 336317936, + "router_z_loss_mlp": 0.32373047, + "step": 4055, + "time_per_iteration": 2.751826763153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062514, + "balance_loss_mlp": 1.03073275, + "epoch": 0.7803001154290111, + "flos": 600887365632.0, + "grad_norm": 0.04750930955711312, + "language_loss": 0.8780787, + "learning_rate": 0.00012132881418339364, + "loss": 0.88870382, + "num_input_tokens_seen": 336389504, + "router_z_loss_mlp": 0.31762695, + "step": 4056, + "time_per_iteration": 2.7023940086364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016356, + "balance_loss_mlp": 1.00820196, + "epoch": 0.7804924971142747, + "flos": 1478743506432.0, + "grad_norm": 0.010148524200822068, + "language_loss": 0.77517563, + "learning_rate": 0.00012112544496131306, + "loss": 0.78533918, + "num_input_tokens_seen": 336615536, + "router_z_loss_mlp": 0.08154297, + "step": 4057, + "time_per_iteration": 4.826777458190918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066059, + "balance_loss_mlp": 1.03430223, + "epoch": 0.7806848787995383, + "flos": 630075773952.0, + "grad_norm": 0.04851285793009641, + "language_loss": 0.76570946, + "learning_rate": 0.00012092222283137944, + "loss": 0.77637005, + "num_input_tokens_seen": 336686400, + "router_z_loss_mlp": 0.31738281, + "step": 4058, + "time_per_iteration": 2.7130894660949707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014581, + "balance_loss_mlp": 1.00647449, + "epoch": 0.7808772604848019, + "flos": 1416800567808.0, + "grad_norm": 0.006919063816033351, + "language_loss": 0.7890631, + "learning_rate": 0.00012071914787249111, + "loss": 0.79920888, + "num_input_tokens_seen": 336912704, + "router_z_loss_mlp": 0.08105469, + "step": 4059, + "time_per_iteration": 4.767851114273071 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068468, + "balance_loss_mlp": 1.03575706, + "epoch": 0.7810696421700654, + "flos": 731345435136.0, + "grad_norm": 0.0468820010320679, + "language_loss": 0.83492804, + "learning_rate": 0.00012051622016348856, + "loss": 0.8456127, + "num_input_tokens_seen": 336997040, + "router_z_loss_mlp": 0.32714844, + "step": 4060, + "time_per_iteration": 3.0499465465545654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065592, + "balance_loss_mlp": 1.0336442, + "epoch": 0.781262023855329, + "flos": 424718055936.0, + "grad_norm": 0.05864420891572784, + "language_loss": 0.8411994, + "learning_rate": 0.00012031343978315539, + "loss": 0.85185528, + "num_input_tokens_seen": 337059760, + "router_z_loss_mlp": 0.31933594, + "step": 4061, + "time_per_iteration": 2.448692560195923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063026, + "balance_loss_mlp": 1.0311023, + "epoch": 0.7814544055405925, + "flos": 500767073280.0, + "grad_norm": 0.10364470659774863, + "language_loss": 0.82632732, + "learning_rate": 0.00012011080681021774, + "loss": 0.83695757, + "num_input_tokens_seen": 337128528, + "router_z_loss_mlp": 0.3190918, + "step": 4062, + "time_per_iteration": 2.611121892929077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066061, + "balance_loss_mlp": 1.03373194, + "epoch": 0.7816467872258561, + "flos": 462212960256.0, + "grad_norm": 0.09614941126191437, + "language_loss": 0.86035311, + "learning_rate": 0.00011990832132334512, + "loss": 0.87101376, + "num_input_tokens_seen": 337194112, + "router_z_loss_mlp": 0.32324219, + "step": 4063, + "time_per_iteration": 2.5123276710510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066235, + "balance_loss_mlp": 1.03354836, + "epoch": 0.7818391689111197, + "flos": 740497324032.0, + "grad_norm": 0.05603872830064661, + "language_loss": 0.8259666, + "learning_rate": 0.00011970598340114897, + "loss": 0.83662897, + "num_input_tokens_seen": 337270416, + "router_z_loss_mlp": 0.3269043, + "step": 4064, + "time_per_iteration": 2.992100238800049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062887, + "balance_loss_mlp": 1.03101015, + "epoch": 0.7820315505963832, + "flos": 547386163200.0, + "grad_norm": 0.05629095926792252, + "language_loss": 0.8402884, + "learning_rate": 0.00011950379312218396, + "loss": 0.85091722, + "num_input_tokens_seen": 337343024, + "router_z_loss_mlp": 0.31860352, + "step": 4065, + "time_per_iteration": 2.7270681858062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061717, + "balance_loss_mlp": 1.02950692, + "epoch": 0.7822239322816468, + "flos": 728665403904.0, + "grad_norm": 0.045794357656988534, + "language_loss": 0.8601073, + "learning_rate": 0.00011930175056494719, + "loss": 0.87072444, + "num_input_tokens_seen": 337417232, + "router_z_loss_mlp": 0.32202148, + "step": 4066, + "time_per_iteration": 2.8730247020721436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066726, + "balance_loss_mlp": 1.03408647, + "epoch": 0.7824163139669104, + "flos": 451774900224.0, + "grad_norm": 0.04781338865883617, + "language_loss": 0.76222277, + "learning_rate": 0.00011909985580787885, + "loss": 0.77288997, + "num_input_tokens_seen": 337488224, + "router_z_loss_mlp": 0.32641602, + "step": 4067, + "time_per_iteration": 2.6421656608581543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063309, + "balance_loss_mlp": 1.03138483, + "epoch": 0.782608695652174, + "flos": 540207277056.0, + "grad_norm": 0.05261646090903281, + "language_loss": 0.81026649, + "learning_rate": 0.00011889810892936137, + "loss": 0.82089961, + "num_input_tokens_seen": 337564928, + "router_z_loss_mlp": 0.3190918, + "step": 4068, + "time_per_iteration": 2.70185923576355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071177, + "balance_loss_mlp": 1.03813219, + "epoch": 0.7828010773374374, + "flos": 500029369344.0, + "grad_norm": 0.05419048158551631, + "language_loss": 0.7722286, + "learning_rate": 0.00011869651000771959, + "loss": 0.78294039, + "num_input_tokens_seen": 337641632, + "router_z_loss_mlp": 0.33056641, + "step": 4069, + "time_per_iteration": 2.822190523147583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060879, + "balance_loss_mlp": 1.02890754, + "epoch": 0.782993459022701, + "flos": 600542539776.0, + "grad_norm": 0.05379601018960074, + "language_loss": 0.82404703, + "learning_rate": 0.00011849505912122117, + "loss": 0.83465582, + "num_input_tokens_seen": 337711968, + "router_z_loss_mlp": 0.31958008, + "step": 4070, + "time_per_iteration": 2.7197659015655518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061129, + "balance_loss_mlp": 1.02827537, + "epoch": 0.7831858407079646, + "flos": 809702106624.0, + "grad_norm": 0.06431726516643936, + "language_loss": 0.77697992, + "learning_rate": 0.00011829375634807654, + "loss": 0.78759122, + "num_input_tokens_seen": 337795792, + "router_z_loss_mlp": 0.32861328, + "step": 4071, + "time_per_iteration": 3.0201632976531982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060522, + "balance_loss_mlp": 1.027596, + "epoch": 0.7833782223932282, + "flos": 806240701440.0, + "grad_norm": 0.09019117286711203, + "language_loss": 0.80854774, + "learning_rate": 0.00011809260176643821, + "loss": 0.81915295, + "num_input_tokens_seen": 337875584, + "router_z_loss_mlp": 0.3293457, + "step": 4072, + "time_per_iteration": 3.059041738510132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062272, + "balance_loss_mlp": 1.0295614, + "epoch": 0.7835706040784918, + "flos": 520614508032.0, + "grad_norm": 0.05845304127163334, + "language_loss": 0.83590925, + "learning_rate": 0.00011789159545440131, + "loss": 0.84653199, + "num_input_tokens_seen": 337942304, + "router_z_loss_mlp": 0.32714844, + "step": 4073, + "time_per_iteration": 2.5912578105926514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064777, + "balance_loss_mlp": 1.03199446, + "epoch": 0.7837629857637552, + "flos": 505322592768.0, + "grad_norm": 0.0488968990026523, + "language_loss": 0.82248485, + "learning_rate": 0.00011769073749000348, + "loss": 0.83313262, + "num_input_tokens_seen": 338020864, + "router_z_loss_mlp": 0.32788086, + "step": 4074, + "time_per_iteration": 2.7853548526763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067546, + "balance_loss_mlp": 1.03533578, + "epoch": 0.7839553674490188, + "flos": 515872723968.0, + "grad_norm": 0.0606411027248537, + "language_loss": 0.75941336, + "learning_rate": 0.0001174900279512246, + "loss": 0.77008879, + "num_input_tokens_seen": 338089584, + "router_z_loss_mlp": 0.32202148, + "step": 4075, + "time_per_iteration": 2.5954041481018066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065014, + "balance_loss_mlp": 1.03266096, + "epoch": 0.7841477491342824, + "flos": 506399330304.0, + "grad_norm": 0.05056809711727469, + "language_loss": 0.81398273, + "learning_rate": 0.00011728946691598707, + "loss": 0.82463288, + "num_input_tokens_seen": 338159568, + "router_z_loss_mlp": 0.32348633, + "step": 4076, + "time_per_iteration": 2.618093252182007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059278, + "balance_loss_mlp": 1.02680504, + "epoch": 0.784340130819546, + "flos": 719320048128.0, + "grad_norm": 0.06832591600294699, + "language_loss": 0.76352495, + "learning_rate": 0.00011708905446215561, + "loss": 0.77411771, + "num_input_tokens_seen": 338233952, + "router_z_loss_mlp": 0.32470703, + "step": 4077, + "time_per_iteration": 2.8518495559692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064476, + "balance_loss_mlp": 1.03228974, + "epoch": 0.7845325125048095, + "flos": 514174735872.0, + "grad_norm": 0.05162512360480059, + "language_loss": 0.79919541, + "learning_rate": 0.00011688879066753711, + "loss": 0.8098402, + "num_input_tokens_seen": 338309568, + "router_z_loss_mlp": 0.32177734, + "step": 4078, + "time_per_iteration": 2.693814516067505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107088, + "balance_loss_mlp": 1.03919387, + "epoch": 0.7847248941900731, + "flos": 465866422272.0, + "grad_norm": 0.057791720647150095, + "language_loss": 0.87164676, + "learning_rate": 0.00011668867560988122, + "loss": 0.88235557, + "num_input_tokens_seen": 338375920, + "router_z_loss_mlp": 0.31665039, + "step": 4079, + "time_per_iteration": 2.544497489929199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106489, + "balance_loss_mlp": 1.03217876, + "epoch": 0.7849172758753367, + "flos": 502766217216.0, + "grad_norm": 0.06577906092431222, + "language_loss": 0.84248155, + "learning_rate": 0.00011648870936687916, + "loss": 0.85313052, + "num_input_tokens_seen": 338452208, + "router_z_loss_mlp": 0.32714844, + "step": 4080, + "time_per_iteration": 2.73219895362854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067029, + "balance_loss_mlp": 1.03465128, + "epoch": 0.7851096575606002, + "flos": 531742219776.0, + "grad_norm": 0.07071087412215145, + "language_loss": 0.77993482, + "learning_rate": 0.00011628889201616461, + "loss": 0.79060507, + "num_input_tokens_seen": 338522864, + "router_z_loss_mlp": 0.32373047, + "step": 4081, + "time_per_iteration": 2.6256251335144043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064024, + "balance_loss_mlp": 1.03145564, + "epoch": 0.7853020392458638, + "flos": 569685256704.0, + "grad_norm": 0.054581090755724565, + "language_loss": 0.81991017, + "learning_rate": 0.00011608922363531393, + "loss": 0.83055043, + "num_input_tokens_seen": 338591024, + "router_z_loss_mlp": 0.32568359, + "step": 4082, + "time_per_iteration": 2.68129825592041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066291, + "balance_loss_mlp": 1.03522539, + "epoch": 0.7854944209311273, + "flos": 832228162560.0, + "grad_norm": 0.0528540930480431, + "language_loss": 0.83166963, + "learning_rate": 0.00011588970430184504, + "loss": 0.84233254, + "num_input_tokens_seen": 338669616, + "router_z_loss_mlp": 0.31030273, + "step": 4083, + "time_per_iteration": 3.01277494430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068509, + "balance_loss_mlp": 1.03608418, + "epoch": 0.7856868026163909, + "flos": 559660423680.0, + "grad_norm": 0.04365607087588255, + "language_loss": 0.81863219, + "learning_rate": 0.00011569033409321822, + "loss": 0.82931721, + "num_input_tokens_seen": 338740416, + "router_z_loss_mlp": 0.32421875, + "step": 4084, + "time_per_iteration": 2.6665027141571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106523, + "balance_loss_mlp": 1.03290033, + "epoch": 0.7858791843016545, + "flos": 544972382208.0, + "grad_norm": 0.05673133805325975, + "language_loss": 0.72893167, + "learning_rate": 0.00011549111308683591, + "loss": 0.73958397, + "num_input_tokens_seen": 338807664, + "router_z_loss_mlp": 0.32324219, + "step": 4085, + "time_per_iteration": 2.652221918106079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062463, + "balance_loss_mlp": 1.03111076, + "epoch": 0.7860715659869181, + "flos": 380787761664.0, + "grad_norm": 0.058608703259898844, + "language_loss": 0.80785263, + "learning_rate": 0.00011529204136004251, + "loss": 0.81847727, + "num_input_tokens_seen": 338869472, + "router_z_loss_mlp": 0.31323242, + "step": 4086, + "time_per_iteration": 2.4127490520477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069783, + "balance_loss_mlp": 1.03762007, + "epoch": 0.7862639476721817, + "flos": 567173961216.0, + "grad_norm": 0.058008459467675216, + "language_loss": 0.84520507, + "learning_rate": 0.00011509311899012459, + "loss": 0.85590291, + "num_input_tokens_seen": 338941312, + "router_z_loss_mlp": 0.3215332, + "step": 4087, + "time_per_iteration": 2.6412453651428223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067278, + "balance_loss_mlp": 1.03544927, + "epoch": 0.7864563293574451, + "flos": 544968000000.0, + "grad_norm": 0.06454830776496215, + "language_loss": 0.78072417, + "learning_rate": 0.00011489434605431053, + "loss": 0.79139692, + "num_input_tokens_seen": 339010208, + "router_z_loss_mlp": 0.31811523, + "step": 4088, + "time_per_iteration": 2.637660026550293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106782, + "balance_loss_mlp": 1.03563344, + "epoch": 0.7866487110427087, + "flos": 563260041216.0, + "grad_norm": 0.058240331432363256, + "language_loss": 0.81125653, + "learning_rate": 0.0001146957226297708, + "loss": 0.82193476, + "num_input_tokens_seen": 339081232, + "router_z_loss_mlp": 0.32177734, + "step": 4089, + "time_per_iteration": 2.6684415340423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065799, + "balance_loss_mlp": 1.03323102, + "epoch": 0.7868410927279723, + "flos": 727849124352.0, + "grad_norm": 0.04414589533004489, + "language_loss": 0.76471299, + "learning_rate": 0.00011449724879361827, + "loss": 0.77537096, + "num_input_tokens_seen": 339161040, + "router_z_loss_mlp": 0.32568359, + "step": 4090, + "time_per_iteration": 2.951436758041382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106424, + "balance_loss_mlp": 1.03236377, + "epoch": 0.7870334744132359, + "flos": 521082989568.0, + "grad_norm": 0.060886300721865946, + "language_loss": 0.73346722, + "learning_rate": 0.00011429892462290687, + "loss": 0.74410957, + "num_input_tokens_seen": 339233984, + "router_z_loss_mlp": 0.31860352, + "step": 4091, + "time_per_iteration": 2.681136131286621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063333, + "balance_loss_mlp": 1.03143215, + "epoch": 0.7872258560984994, + "flos": 451173998592.0, + "grad_norm": 0.05425416710162835, + "language_loss": 0.83261812, + "learning_rate": 0.00011410075019463295, + "loss": 0.84325141, + "num_input_tokens_seen": 339303168, + "router_z_loss_mlp": 0.31884766, + "step": 4092, + "time_per_iteration": 2.596997022628784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106721, + "balance_loss_mlp": 1.03559613, + "epoch": 0.787418237783763, + "flos": 514932788736.0, + "grad_norm": 0.06041624723999286, + "language_loss": 0.80031419, + "learning_rate": 0.00011390272558573461, + "loss": 0.81098628, + "num_input_tokens_seen": 339374512, + "router_z_loss_mlp": 0.31591797, + "step": 4093, + "time_per_iteration": 2.724531412124634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066529, + "balance_loss_mlp": 1.03422308, + "epoch": 0.7876106194690266, + "flos": 484837940736.0, + "grad_norm": 0.057479971789758694, + "language_loss": 0.79717124, + "learning_rate": 0.00011370485087309202, + "loss": 0.80783653, + "num_input_tokens_seen": 339442720, + "router_z_loss_mlp": 0.32299805, + "step": 4094, + "time_per_iteration": 2.6680920124053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066683, + "balance_loss_mlp": 1.03401947, + "epoch": 0.7878030011542901, + "flos": 542570185728.0, + "grad_norm": 0.07064536799183499, + "language_loss": 0.79107904, + "learning_rate": 0.00011350712613352688, + "loss": 0.80174589, + "num_input_tokens_seen": 339508800, + "router_z_loss_mlp": 0.32666016, + "step": 4095, + "time_per_iteration": 2.6333553791046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066243, + "balance_loss_mlp": 1.03415227, + "epoch": 0.7879953828395537, + "flos": 516488182272.0, + "grad_norm": 0.06900072412934964, + "language_loss": 0.79095006, + "learning_rate": 0.00011330955144380283, + "loss": 0.8016125, + "num_input_tokens_seen": 339578048, + "router_z_loss_mlp": 0.32080078, + "step": 4096, + "time_per_iteration": 2.5925889015197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064584, + "balance_loss_mlp": 1.03246856, + "epoch": 0.7881877645248172, + "flos": 582004597248.0, + "grad_norm": 0.054709813023541755, + "language_loss": 0.8620733, + "learning_rate": 0.00011311212688062483, + "loss": 0.87271917, + "num_input_tokens_seen": 339650176, + "router_z_loss_mlp": 0.32104492, + "step": 4097, + "time_per_iteration": 2.774585485458374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065245, + "balance_loss_mlp": 1.03279638, + "epoch": 0.7883801462100808, + "flos": 588883737600.0, + "grad_norm": 0.05950523871883677, + "language_loss": 0.77641714, + "learning_rate": 0.0001129148525206402, + "loss": 0.78706962, + "num_input_tokens_seen": 339727312, + "router_z_loss_mlp": 0.32446289, + "step": 4098, + "time_per_iteration": 2.8262319564819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066849, + "balance_loss_mlp": 1.03535402, + "epoch": 0.7885725278953444, + "flos": 481475460096.0, + "grad_norm": 0.05859958093341329, + "language_loss": 0.86361545, + "learning_rate": 0.00011271772844043759, + "loss": 0.87428391, + "num_input_tokens_seen": 339801344, + "router_z_loss_mlp": 0.31469727, + "step": 4099, + "time_per_iteration": 2.6731910705566406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064893, + "balance_loss_mlp": 1.03306413, + "epoch": 0.788764909580608, + "flos": 756470126592.0, + "grad_norm": 0.05966502266655521, + "language_loss": 0.75518525, + "learning_rate": 0.00011252075471654727, + "loss": 0.76583415, + "num_input_tokens_seen": 339877840, + "router_z_loss_mlp": 0.31811523, + "step": 4100, + "time_per_iteration": 2.919638156890869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065613, + "balance_loss_mlp": 1.03294969, + "epoch": 0.7889572912658714, + "flos": 702225427968.0, + "grad_norm": 0.050441368463949324, + "language_loss": 0.77960974, + "learning_rate": 0.00011232393142544133, + "loss": 0.79026586, + "num_input_tokens_seen": 339959568, + "router_z_loss_mlp": 0.32666016, + "step": 4101, + "time_per_iteration": 2.9371449947357178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064145, + "balance_loss_mlp": 1.03188694, + "epoch": 0.789149672951135, + "flos": 736047931392.0, + "grad_norm": 0.05824722379420924, + "language_loss": 0.83012629, + "learning_rate": 0.00011212725864353323, + "loss": 0.8407678, + "num_input_tokens_seen": 340043600, + "router_z_loss_mlp": 0.32250977, + "step": 4102, + "time_per_iteration": 3.070425033569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01019214, + "balance_loss_mlp": 1.01106, + "epoch": 0.7893420546363986, + "flos": 1480626349056.0, + "grad_norm": 0.00964834437524815, + "language_loss": 0.76335925, + "learning_rate": 0.00011193073644717822, + "loss": 0.7735514, + "num_input_tokens_seen": 340270608, + "router_z_loss_mlp": 0.08154297, + "step": 4103, + "time_per_iteration": 4.87341046333313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069024, + "balance_loss_mlp": 1.03688502, + "epoch": 0.7895344363216622, + "flos": 508821875712.0, + "grad_norm": 0.06647723888078448, + "language_loss": 0.76089919, + "learning_rate": 0.00011173436491267291, + "loss": 0.77158946, + "num_input_tokens_seen": 340338784, + "router_z_loss_mlp": 0.32128906, + "step": 4104, + "time_per_iteration": 2.579040050506592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069021, + "balance_loss_mlp": 1.036906, + "epoch": 0.7897268180069258, + "flos": 541727764992.0, + "grad_norm": 0.05890584899946244, + "language_loss": 0.81946945, + "learning_rate": 0.0001115381441162554, + "loss": 0.83015972, + "num_input_tokens_seen": 340407744, + "router_z_loss_mlp": 0.32104492, + "step": 4105, + "time_per_iteration": 2.6771240234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01019188, + "balance_loss_mlp": 1.01103461, + "epoch": 0.7899191996921893, + "flos": 1411924953600.0, + "grad_norm": 0.009593800245269755, + "language_loss": 0.73583722, + "learning_rate": 0.00011134207413410557, + "loss": 0.74602914, + "num_input_tokens_seen": 340635824, + "router_z_loss_mlp": 0.08154297, + "step": 4106, + "time_per_iteration": 4.9348978996276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067573, + "balance_loss_mlp": 1.03593516, + "epoch": 0.7901115813774529, + "flos": 622547679744.0, + "grad_norm": 0.05203428042978299, + "language_loss": 0.84845543, + "learning_rate": 0.00011114615504234465, + "loss": 0.85913116, + "num_input_tokens_seen": 340710928, + "router_z_loss_mlp": 0.31616211, + "step": 4107, + "time_per_iteration": 2.78153657913208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067357, + "balance_loss_mlp": 1.03581429, + "epoch": 0.7903039630627164, + "flos": 645232296960.0, + "grad_norm": 0.05460483755610551, + "language_loss": 0.80740857, + "learning_rate": 0.00011095038691703468, + "loss": 0.81808215, + "num_input_tokens_seen": 340786128, + "router_z_loss_mlp": 0.31518555, + "step": 4108, + "time_per_iteration": 2.83954119682312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069484, + "balance_loss_mlp": 1.03829885, + "epoch": 0.79049634474798, + "flos": 594054715392.0, + "grad_norm": 0.05143854855735133, + "language_loss": 0.82689941, + "learning_rate": 0.00011075476983417998, + "loss": 0.83759421, + "num_input_tokens_seen": 340861616, + "router_z_loss_mlp": 0.31152344, + "step": 4109, + "time_per_iteration": 2.8581154346466064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069019, + "balance_loss_mlp": 1.03792906, + "epoch": 0.7906887264332435, + "flos": 715784449536.0, + "grad_norm": 0.056450839629860305, + "language_loss": 0.77744591, + "learning_rate": 0.00011055930386972579, + "loss": 0.78813612, + "num_input_tokens_seen": 340934480, + "router_z_loss_mlp": 0.31054688, + "step": 4110, + "time_per_iteration": 2.8273229598999023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071548, + "balance_loss_mlp": 1.03855133, + "epoch": 0.7908811081185071, + "flos": 789553516032.0, + "grad_norm": 0.04891253400272343, + "language_loss": 0.78669703, + "learning_rate": 0.00011036398909955863, + "loss": 0.79741246, + "num_input_tokens_seen": 341014912, + "router_z_loss_mlp": 0.33007812, + "step": 4111, + "time_per_iteration": 2.961766004562378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069451, + "balance_loss_mlp": 1.03747857, + "epoch": 0.7910734898037707, + "flos": 641612330496.0, + "grad_norm": 0.048663438809518546, + "language_loss": 0.81452119, + "learning_rate": 0.00011016882559950648, + "loss": 0.82521558, + "num_input_tokens_seen": 341090608, + "router_z_loss_mlp": 0.31958008, + "step": 4112, + "time_per_iteration": 2.8214406967163086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068413, + "balance_loss_mlp": 1.03660822, + "epoch": 0.7912658714890343, + "flos": 669057670656.0, + "grad_norm": 0.05392137662685343, + "language_loss": 0.80067742, + "learning_rate": 0.00010997381344533853, + "loss": 0.81136161, + "num_input_tokens_seen": 341160992, + "router_z_loss_mlp": 0.31787109, + "step": 4113, + "time_per_iteration": 2.811772346496582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073852, + "balance_loss_mlp": 1.04152238, + "epoch": 0.7914582531742979, + "flos": 557504128512.0, + "grad_norm": 0.0581863083981893, + "language_loss": 0.80220509, + "learning_rate": 0.00010977895271276517, + "loss": 0.81294358, + "num_input_tokens_seen": 341232032, + "router_z_loss_mlp": 0.32324219, + "step": 4114, + "time_per_iteration": 2.719431161880493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107287, + "balance_loss_mlp": 1.0409224, + "epoch": 0.7916506348595613, + "flos": 569784181248.0, + "grad_norm": 0.05018332028611806, + "language_loss": 0.7987901, + "learning_rate": 0.00010958424347743807, + "loss": 0.80951875, + "num_input_tokens_seen": 341303888, + "router_z_loss_mlp": 0.31933594, + "step": 4115, + "time_per_iteration": 2.6972670555114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106862, + "balance_loss_mlp": 1.03724396, + "epoch": 0.7918430165448249, + "flos": 717966885888.0, + "grad_norm": 0.06933669285907723, + "language_loss": 0.80126512, + "learning_rate": 0.00010938968581494991, + "loss": 0.81195128, + "num_input_tokens_seen": 341385616, + "router_z_loss_mlp": 0.31347656, + "step": 4116, + "time_per_iteration": 2.9974632263183594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069688, + "balance_loss_mlp": 1.03750205, + "epoch": 0.7920353982300885, + "flos": 553377802752.0, + "grad_norm": 0.05941447289744039, + "language_loss": 0.78879136, + "learning_rate": 0.000109195279800835, + "loss": 0.79948825, + "num_input_tokens_seen": 341460976, + "router_z_loss_mlp": 0.32177734, + "step": 4117, + "time_per_iteration": 2.710513114929199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071029, + "balance_loss_mlp": 1.03896213, + "epoch": 0.7922277799153521, + "flos": 809766125568.0, + "grad_norm": 0.05531983375516572, + "language_loss": 0.76555854, + "learning_rate": 0.00010900102551056834, + "loss": 0.77626884, + "num_input_tokens_seen": 341537328, + "router_z_loss_mlp": 0.32055664, + "step": 4118, + "time_per_iteration": 3.0103225708007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069951, + "balance_loss_mlp": 1.03766966, + "epoch": 0.7924201616006156, + "flos": 421128612864.0, + "grad_norm": 0.05482547351078549, + "language_loss": 0.84337735, + "learning_rate": 0.00010880692301956601, + "loss": 0.85407686, + "num_input_tokens_seen": 341600272, + "router_z_loss_mlp": 0.32275391, + "step": 4119, + "time_per_iteration": 2.445122003555298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069691, + "balance_loss_mlp": 1.03707528, + "epoch": 0.7926125432858792, + "flos": 617541055488.0, + "grad_norm": 0.04369868110465695, + "language_loss": 0.86072242, + "learning_rate": 0.00010861297240318518, + "loss": 0.87141925, + "num_input_tokens_seen": 341682096, + "router_z_loss_mlp": 0.32617188, + "step": 4120, + "time_per_iteration": 2.85048508644104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067992, + "balance_loss_mlp": 1.03656876, + "epoch": 0.7928049249711427, + "flos": 602207032320.0, + "grad_norm": 0.05006458241333452, + "language_loss": 0.86780667, + "learning_rate": 0.00010841917373672444, + "loss": 0.87848663, + "num_input_tokens_seen": 341754912, + "router_z_loss_mlp": 0.31396484, + "step": 4121, + "time_per_iteration": 2.704904794692993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067696, + "balance_loss_mlp": 1.03570032, + "epoch": 0.7929973066564063, + "flos": 655724201472.0, + "grad_norm": 0.05319226556655214, + "language_loss": 0.78318095, + "learning_rate": 0.00010822552709542293, + "loss": 0.79385787, + "num_input_tokens_seen": 341831152, + "router_z_loss_mlp": 0.31982422, + "step": 4122, + "time_per_iteration": 2.8160955905914307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069953, + "balance_loss_mlp": 1.03814769, + "epoch": 0.7931896883416699, + "flos": 536139177984.0, + "grad_norm": 0.04444307991995564, + "language_loss": 0.85812402, + "learning_rate": 0.0001080320325544612, + "loss": 0.86882365, + "num_input_tokens_seen": 341903552, + "router_z_loss_mlp": 0.31787109, + "step": 4123, + "time_per_iteration": 2.6734302043914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067197, + "balance_loss_mlp": 1.03594005, + "epoch": 0.7933820700269334, + "flos": 497836758528.0, + "grad_norm": 0.04986309312867086, + "language_loss": 0.82817209, + "learning_rate": 0.00010783869018895997, + "loss": 0.838844, + "num_input_tokens_seen": 341972256, + "router_z_loss_mlp": 0.31225586, + "step": 4124, + "time_per_iteration": 2.578643321990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067226, + "balance_loss_mlp": 1.03506327, + "epoch": 0.793574451712197, + "flos": 537217325568.0, + "grad_norm": 0.05142590484857824, + "language_loss": 0.84177709, + "learning_rate": 0.00010764550007398189, + "loss": 0.8524493, + "num_input_tokens_seen": 342040496, + "router_z_loss_mlp": 0.3215332, + "step": 4125, + "time_per_iteration": 2.668468475341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065806, + "balance_loss_mlp": 1.03419125, + "epoch": 0.7937668333974606, + "flos": 488043270144.0, + "grad_norm": 0.048489850781485225, + "language_loss": 0.81036043, + "learning_rate": 0.00010745246228452982, + "loss": 0.82101846, + "num_input_tokens_seen": 342108512, + "router_z_loss_mlp": 0.31591797, + "step": 4126, + "time_per_iteration": 2.5388453006744385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106876, + "balance_loss_mlp": 1.0364542, + "epoch": 0.7939592150827242, + "flos": 527163379200.0, + "grad_norm": 0.05117583653255347, + "language_loss": 0.81550407, + "learning_rate": 0.00010725957689554771, + "loss": 0.82619166, + "num_input_tokens_seen": 342183568, + "router_z_loss_mlp": 0.32299805, + "step": 4127, + "time_per_iteration": 2.7774598598480225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065815, + "balance_loss_mlp": 1.03353345, + "epoch": 0.7941515967679876, + "flos": 541428019200.0, + "grad_norm": 0.13198996647770603, + "language_loss": 0.84346122, + "learning_rate": 0.00010706684398192013, + "loss": 0.85411942, + "num_input_tokens_seen": 342259920, + "router_z_loss_mlp": 0.32275391, + "step": 4128, + "time_per_iteration": 2.6948909759521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068056, + "balance_loss_mlp": 1.03555918, + "epoch": 0.7943439784532512, + "flos": 518104622592.0, + "grad_norm": 0.05568877803614168, + "language_loss": 0.81997395, + "learning_rate": 0.00010687426361847313, + "loss": 0.8306545, + "num_input_tokens_seen": 342330192, + "router_z_loss_mlp": 0.32495117, + "step": 4129, + "time_per_iteration": 2.693753957748413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069811, + "balance_loss_mlp": 1.0384829, + "epoch": 0.7945363601385148, + "flos": 508768031232.0, + "grad_norm": 0.052703179932938445, + "language_loss": 0.85951877, + "learning_rate": 0.00010668183587997254, + "loss": 0.87021685, + "num_input_tokens_seen": 342398944, + "router_z_loss_mlp": 0.31298828, + "step": 4130, + "time_per_iteration": 2.5763041973114014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069665, + "balance_loss_mlp": 1.03731203, + "epoch": 0.7947287418237784, + "flos": 650918398464.0, + "grad_norm": 0.061493260737887565, + "language_loss": 0.77379823, + "learning_rate": 0.0001064895608411256, + "loss": 0.78449482, + "num_input_tokens_seen": 342474000, + "router_z_loss_mlp": 0.32348633, + "step": 4131, + "time_per_iteration": 2.763904333114624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068483, + "balance_loss_mlp": 1.03620124, + "epoch": 0.794921123509042, + "flos": 695726019072.0, + "grad_norm": 0.07934957130099038, + "language_loss": 0.80297732, + "learning_rate": 0.00010629743857657998, + "loss": 0.81366217, + "num_input_tokens_seen": 342549184, + "router_z_loss_mlp": 0.32275391, + "step": 4132, + "time_per_iteration": 2.933009386062622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01019333, + "balance_loss_mlp": 1.01117909, + "epoch": 0.7951135051943055, + "flos": 1402161988608.0, + "grad_norm": 0.006928845772435826, + "language_loss": 0.70598668, + "learning_rate": 0.0001061054691609244, + "loss": 0.71618003, + "num_input_tokens_seen": 342767376, + "router_z_loss_mlp": 0.08154297, + "step": 4133, + "time_per_iteration": 4.611080884933472 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067103, + "balance_loss_mlp": 1.03560841, + "epoch": 0.795305886879569, + "flos": 809745776640.0, + "grad_norm": 0.059789926396459823, + "language_loss": 0.81835663, + "learning_rate": 0.00010591365266868802, + "loss": 0.82902765, + "num_input_tokens_seen": 342845024, + "router_z_loss_mlp": 0.31469727, + "step": 4134, + "time_per_iteration": 2.9697659015655518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016843, + "balance_loss_mlp": 1.00873721, + "epoch": 0.7954982685648326, + "flos": 1425205988352.0, + "grad_norm": 0.006305006479863361, + "language_loss": 0.75511783, + "learning_rate": 0.00010572198917434018, + "loss": 0.76528627, + "num_input_tokens_seen": 343072496, + "router_z_loss_mlp": 0.08105469, + "step": 4135, + "time_per_iteration": 4.8860838413238525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068161, + "balance_loss_mlp": 1.03547359, + "epoch": 0.7956906502500962, + "flos": 389670428160.0, + "grad_norm": 0.055642824897664006, + "language_loss": 0.79057562, + "learning_rate": 0.00010553047875229166, + "loss": 0.80125725, + "num_input_tokens_seen": 343136928, + "router_z_loss_mlp": 0.3269043, + "step": 4136, + "time_per_iteration": 2.5156140327453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065548, + "balance_loss_mlp": 1.03359985, + "epoch": 0.7958830319353598, + "flos": 515321284608.0, + "grad_norm": 0.05406078670363032, + "language_loss": 0.83169937, + "learning_rate": 0.00010533912147689328, + "loss": 0.84235483, + "num_input_tokens_seen": 343207440, + "router_z_loss_mlp": 0.31933594, + "step": 4137, + "time_per_iteration": 2.613961696624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064795, + "balance_loss_mlp": 1.03296661, + "epoch": 0.7960754136206233, + "flos": 493695876096.0, + "grad_norm": 0.050232390896865514, + "language_loss": 0.82344103, + "learning_rate": 0.00010514791742243656, + "loss": 0.83408904, + "num_input_tokens_seen": 343273744, + "router_z_loss_mlp": 0.31811523, + "step": 4138, + "time_per_iteration": 2.5978379249572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106016, + "balance_loss_mlp": 1.02813983, + "epoch": 0.7962677953058869, + "flos": 655409899008.0, + "grad_norm": 0.05370274741433686, + "language_loss": 0.82677209, + "learning_rate": 0.00010495686666315341, + "loss": 0.83737361, + "num_input_tokens_seen": 343357648, + "router_z_loss_mlp": 0.32006836, + "step": 4139, + "time_per_iteration": 2.872997283935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062257, + "balance_loss_mlp": 1.03088117, + "epoch": 0.7964601769911505, + "flos": 542126435328.0, + "grad_norm": 0.05348146063522791, + "language_loss": 0.77502406, + "learning_rate": 0.00010476596927321635, + "loss": 0.78564668, + "num_input_tokens_seen": 343425344, + "router_z_loss_mlp": 0.31347656, + "step": 4140, + "time_per_iteration": 2.620577812194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064605, + "balance_loss_mlp": 1.0327282, + "epoch": 0.796652558676414, + "flos": 537356947968.0, + "grad_norm": 0.042260612329337484, + "language_loss": 0.80177677, + "learning_rate": 0.00010457522532673835, + "loss": 0.81242287, + "num_input_tokens_seen": 343504960, + "router_z_loss_mlp": 0.31860352, + "step": 4141, + "time_per_iteration": 2.7778780460357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066312, + "balance_loss_mlp": 1.03419721, + "epoch": 0.7968449403616775, + "flos": 474852395520.0, + "grad_norm": 0.061301631429393516, + "language_loss": 0.82973599, + "learning_rate": 0.00010438463489777272, + "loss": 0.84039915, + "num_input_tokens_seen": 343570832, + "router_z_loss_mlp": 0.32104492, + "step": 4142, + "time_per_iteration": 2.579953908920288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064314, + "balance_loss_mlp": 1.03157902, + "epoch": 0.7970373220469411, + "flos": 567336904704.0, + "grad_norm": 0.06081943760353449, + "language_loss": 0.77709621, + "learning_rate": 0.00010419419806031316, + "loss": 0.7877394, + "num_input_tokens_seen": 343639808, + "router_z_loss_mlp": 0.32739258, + "step": 4143, + "time_per_iteration": 2.6624910831451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066036, + "balance_loss_mlp": 1.03477979, + "epoch": 0.7972297037322047, + "flos": 555924003840.0, + "grad_norm": 0.05994376344115418, + "language_loss": 0.83806866, + "learning_rate": 0.00010400391488829403, + "loss": 0.84872901, + "num_input_tokens_seen": 343715232, + "router_z_loss_mlp": 0.31225586, + "step": 4144, + "time_per_iteration": 2.774700880050659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063404, + "balance_loss_mlp": 1.03157544, + "epoch": 0.7974220854174683, + "flos": 575899476480.0, + "grad_norm": 0.04407421907789105, + "language_loss": 0.86373734, + "learning_rate": 0.00010381378545558984, + "loss": 0.87437141, + "num_input_tokens_seen": 343787168, + "router_z_loss_mlp": 0.31811523, + "step": 4145, + "time_per_iteration": 2.686239004135132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065318, + "balance_loss_mlp": 1.03301203, + "epoch": 0.7976144671027319, + "flos": 482824240128.0, + "grad_norm": 0.047216774900369206, + "language_loss": 0.8480643, + "learning_rate": 0.00010362380983601505, + "loss": 0.85871744, + "num_input_tokens_seen": 343853600, + "router_z_loss_mlp": 0.32299805, + "step": 4146, + "time_per_iteration": 2.533198833465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062521, + "balance_loss_mlp": 1.03102612, + "epoch": 0.7978068487879953, + "flos": 1077420372480.0, + "grad_norm": 0.04375196843804429, + "language_loss": 0.78552485, + "learning_rate": 0.00010343398810332477, + "loss": 0.79615009, + "num_input_tokens_seen": 343942816, + "router_z_loss_mlp": 0.31469727, + "step": 4147, + "time_per_iteration": 3.451004981994629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106262, + "balance_loss_mlp": 1.03007627, + "epoch": 0.7979992304732589, + "flos": 733421744640.0, + "grad_norm": 0.06305718879587498, + "language_loss": 0.84127843, + "learning_rate": 0.00010324432033121467, + "loss": 0.85190463, + "num_input_tokens_seen": 344021232, + "router_z_loss_mlp": 0.32543945, + "step": 4148, + "time_per_iteration": 2.890085220336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065874, + "balance_loss_mlp": 1.03349686, + "epoch": 0.7981916121585225, + "flos": 415531261440.0, + "grad_norm": 0.050318448147633754, + "language_loss": 0.83318138, + "learning_rate": 0.00010305480659332005, + "loss": 0.84384012, + "num_input_tokens_seen": 344089616, + "router_z_loss_mlp": 0.32373047, + "step": 4149, + "time_per_iteration": 2.588676929473877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063088, + "balance_loss_mlp": 1.03133059, + "epoch": 0.7983839938437861, + "flos": 465019619328.0, + "grad_norm": 0.06596514407169883, + "language_loss": 0.83595121, + "learning_rate": 0.00010286544696321682, + "loss": 0.84658206, + "num_input_tokens_seen": 344154992, + "router_z_loss_mlp": 0.31738281, + "step": 4150, + "time_per_iteration": 2.546215772628784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064473, + "balance_loss_mlp": 1.03304911, + "epoch": 0.7985763755290496, + "flos": 510304485888.0, + "grad_norm": 0.05480976519736011, + "language_loss": 0.79303128, + "learning_rate": 0.00010267624151442073, + "loss": 0.80367601, + "num_input_tokens_seen": 344225232, + "router_z_loss_mlp": 0.31396484, + "step": 4151, + "time_per_iteration": 2.620140790939331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062507, + "balance_loss_mlp": 1.03077376, + "epoch": 0.7987687572143132, + "flos": 1010243847168.0, + "grad_norm": 0.05583504275555366, + "language_loss": 0.81259573, + "learning_rate": 0.000102487190320388, + "loss": 0.82322085, + "num_input_tokens_seen": 344309120, + "router_z_loss_mlp": 0.31713867, + "step": 4152, + "time_per_iteration": 3.3063504695892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064782, + "balance_loss_mlp": 1.03247619, + "epoch": 0.7989611388995768, + "flos": 1020662968320.0, + "grad_norm": 0.05403781232268857, + "language_loss": 0.79678059, + "learning_rate": 0.00010229829345451475, + "loss": 0.80742842, + "num_input_tokens_seen": 344394112, + "router_z_loss_mlp": 0.32299805, + "step": 4153, + "time_per_iteration": 3.301403522491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064656, + "balance_loss_mlp": 1.03237379, + "epoch": 0.7991535205848403, + "flos": 1100915476992.0, + "grad_norm": 0.05303368831267737, + "language_loss": 0.79783893, + "learning_rate": 0.00010210955099013724, + "loss": 0.80848551, + "num_input_tokens_seen": 344476512, + "router_z_loss_mlp": 0.32275391, + "step": 4154, + "time_per_iteration": 3.383039712905884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065536, + "balance_loss_mlp": 1.03301597, + "epoch": 0.7993459022701039, + "flos": 834454268928.0, + "grad_norm": 0.06456924160427363, + "language_loss": 0.76284033, + "learning_rate": 0.00010192096300053167, + "loss": 0.77349567, + "num_input_tokens_seen": 344561088, + "router_z_loss_mlp": 0.32519531, + "step": 4155, + "time_per_iteration": 3.0697450637817383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061021, + "balance_loss_mlp": 1.02928793, + "epoch": 0.7995382839553674, + "flos": 522417212928.0, + "grad_norm": 0.04699781712080769, + "language_loss": 0.851726, + "learning_rate": 0.00010173252955891477, + "loss": 0.86233628, + "num_input_tokens_seen": 344639424, + "router_z_loss_mlp": 0.31713867, + "step": 4156, + "time_per_iteration": 2.7266414165496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106594, + "balance_loss_mlp": 1.03389633, + "epoch": 0.799730665640631, + "flos": 537562151424.0, + "grad_norm": 0.059253037565978675, + "language_loss": 0.73188376, + "learning_rate": 0.00010154425073844253, + "loss": 0.7425431, + "num_input_tokens_seen": 344710048, + "router_z_loss_mlp": 0.3203125, + "step": 4157, + "time_per_iteration": 2.6836955547332764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067435, + "balance_loss_mlp": 1.0347718, + "epoch": 0.7999230473258946, + "flos": 504809031168.0, + "grad_norm": 0.050604408560098985, + "language_loss": 0.82231861, + "learning_rate": 0.00010135612661221138, + "loss": 0.83299297, + "num_input_tokens_seen": 344776832, + "router_z_loss_mlp": 0.32666016, + "step": 4158, + "time_per_iteration": 2.5858490467071533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061337, + "balance_loss_mlp": 1.02903104, + "epoch": 0.8001154290111582, + "flos": 1026935414784.0, + "grad_norm": 0.07154666191877361, + "language_loss": 0.81335956, + "learning_rate": 0.00010116815725325751, + "loss": 0.82397294, + "num_input_tokens_seen": 344864928, + "router_z_loss_mlp": 0.32299805, + "step": 4159, + "time_per_iteration": 3.30757474899292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063379, + "balance_loss_mlp": 1.03073967, + "epoch": 0.8003078106964217, + "flos": 750567237120.0, + "grad_norm": 0.05734149006242142, + "language_loss": 0.80527955, + "learning_rate": 0.00010098034273455725, + "loss": 0.81591332, + "num_input_tokens_seen": 344944048, + "router_z_loss_mlp": 0.32641602, + "step": 4160, + "time_per_iteration": 2.9547767639160156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065922, + "balance_loss_mlp": 1.03313947, + "epoch": 0.8005001923816852, + "flos": 488201831424.0, + "grad_norm": 0.05051565727224089, + "language_loss": 0.79769969, + "learning_rate": 0.00010079268312902662, + "loss": 0.80835891, + "num_input_tokens_seen": 345015392, + "router_z_loss_mlp": 0.32788086, + "step": 4161, + "time_per_iteration": 2.6677966117858887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062976, + "balance_loss_mlp": 1.03119469, + "epoch": 0.8006925740669488, + "flos": 512983107072.0, + "grad_norm": 0.05230288400742034, + "language_loss": 0.81782764, + "learning_rate": 0.0001006051785095215, + "loss": 0.82845742, + "num_input_tokens_seen": 345086640, + "router_z_loss_mlp": 0.31762695, + "step": 4162, + "time_per_iteration": 2.642228126525879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064845, + "balance_loss_mlp": 1.03172922, + "epoch": 0.8008849557522124, + "flos": 578243446272.0, + "grad_norm": 0.05393641740779556, + "language_loss": 0.79291767, + "learning_rate": 0.0001004178289488376, + "loss": 0.8035661, + "num_input_tokens_seen": 345159616, + "router_z_loss_mlp": 0.33129883, + "step": 4163, + "time_per_iteration": 2.7046382427215576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065647, + "balance_loss_mlp": 1.03360367, + "epoch": 0.801077337437476, + "flos": 478466569728.0, + "grad_norm": 0.05246916136240305, + "language_loss": 0.83748746, + "learning_rate": 0.0001002306345197106, + "loss": 0.84814394, + "num_input_tokens_seen": 345225536, + "router_z_loss_mlp": 0.3203125, + "step": 4164, + "time_per_iteration": 2.5735926628112793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064035, + "balance_loss_mlp": 1.03115666, + "epoch": 0.8012697191227395, + "flos": 676384943616.0, + "grad_norm": 0.06395934079571464, + "language_loss": 0.79516339, + "learning_rate": 0.00010004359529481571, + "loss": 0.80580378, + "num_input_tokens_seen": 345302960, + "router_z_loss_mlp": 0.32885742, + "step": 4165, + "time_per_iteration": 3.073218822479248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070917, + "balance_loss_mlp": 1.03760982, + "epoch": 0.8014621008080031, + "flos": 1294624567296.0, + "grad_norm": 0.06013073241121916, + "language_loss": 0.81983745, + "learning_rate": 9.985671134676804e-05, + "loss": 0.83054662, + "num_input_tokens_seen": 345397792, + "router_z_loss_mlp": 0.33325195, + "step": 4166, + "time_per_iteration": 3.6898140907287598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064786, + "balance_loss_mlp": 1.03259957, + "epoch": 0.8016544824932667, + "flos": 511579072512.0, + "grad_norm": 0.0757010712096974, + "language_loss": 0.83196199, + "learning_rate": 9.966998274812234e-05, + "loss": 0.84260988, + "num_input_tokens_seen": 345465440, + "router_z_loss_mlp": 0.32177734, + "step": 4167, + "time_per_iteration": 2.61006498336792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065564, + "balance_loss_mlp": 1.03368783, + "epoch": 0.8018468641785302, + "flos": 535434969600.0, + "grad_norm": 0.06034302862122322, + "language_loss": 0.81307828, + "learning_rate": 9.948340957137308e-05, + "loss": 0.82373393, + "num_input_tokens_seen": 345533072, + "router_z_loss_mlp": 0.31860352, + "step": 4168, + "time_per_iteration": 2.615943670272827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064528, + "balance_loss_mlp": 1.03160238, + "epoch": 0.8020392458637937, + "flos": 1023025876992.0, + "grad_norm": 0.07125861886522546, + "language_loss": 0.79438949, + "learning_rate": 9.929699188895447e-05, + "loss": 0.80503476, + "num_input_tokens_seen": 345622208, + "router_z_loss_mlp": 0.3293457, + "step": 4169, + "time_per_iteration": 3.2835214138031006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011972, + "balance_loss_mlp": 1.00377011, + "epoch": 0.8022316275490573, + "flos": 1560993748992.0, + "grad_norm": 0.00911725430175954, + "language_loss": 0.78054404, + "learning_rate": 9.911072977324009e-05, + "loss": 0.79066378, + "num_input_tokens_seen": 345852544, + "router_z_loss_mlp": 0.08203125, + "step": 4170, + "time_per_iteration": 4.975283861160278 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064906, + "balance_loss_mlp": 1.03236222, + "epoch": 0.8024240092343209, + "flos": 420473866752.0, + "grad_norm": 0.05669992043917717, + "language_loss": 0.82862949, + "learning_rate": 9.89246232965435e-05, + "loss": 0.83927858, + "num_input_tokens_seen": 345917328, + "router_z_loss_mlp": 0.32543945, + "step": 4171, + "time_per_iteration": 2.5107967853546143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066001, + "balance_loss_mlp": 1.03307581, + "epoch": 0.8026163909195845, + "flos": 763506418176.0, + "grad_norm": 0.06023729307779171, + "language_loss": 0.78644282, + "learning_rate": 9.873867253111762e-05, + "loss": 0.79710281, + "num_input_tokens_seen": 345995936, + "router_z_loss_mlp": 0.3293457, + "step": 4172, + "time_per_iteration": 2.9537975788116455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011767, + "balance_loss_mlp": 1.00356519, + "epoch": 0.8028087726048481, + "flos": 1518044087808.0, + "grad_norm": 0.009851202253072271, + "language_loss": 0.80264562, + "learning_rate": 9.855287754915503e-05, + "loss": 0.81276327, + "num_input_tokens_seen": 346232720, + "router_z_loss_mlp": 0.08203125, + "step": 4173, + "time_per_iteration": 4.9082865715026855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065835, + "balance_loss_mlp": 1.03310037, + "epoch": 0.8030011542901115, + "flos": 517620174336.0, + "grad_norm": 0.06108822499581813, + "language_loss": 0.88279212, + "learning_rate": 9.836723842278733e-05, + "loss": 0.89345044, + "num_input_tokens_seen": 346298208, + "router_z_loss_mlp": 0.32739258, + "step": 4174, + "time_per_iteration": 2.58488392829895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063958, + "balance_loss_mlp": 1.03177166, + "epoch": 0.8031935359753751, + "flos": 545356495872.0, + "grad_norm": 0.05400426019462201, + "language_loss": 0.77958262, + "learning_rate": 9.818175522408646e-05, + "loss": 0.79022217, + "num_input_tokens_seen": 346370080, + "router_z_loss_mlp": 0.32177734, + "step": 4175, + "time_per_iteration": 2.6506383419036865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064557, + "balance_loss_mlp": 1.03201258, + "epoch": 0.8033859176606387, + "flos": 603266241024.0, + "grad_norm": 0.12211411732469771, + "language_loss": 0.84598446, + "learning_rate": 9.79964280250632e-05, + "loss": 0.85662997, + "num_input_tokens_seen": 346442432, + "router_z_loss_mlp": 0.32543945, + "step": 4176, + "time_per_iteration": 2.7917282581329346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066053, + "balance_loss_mlp": 1.03353262, + "epoch": 0.8035782993459023, + "flos": 565579279872.0, + "grad_norm": 0.06768154425814676, + "language_loss": 0.8119694, + "learning_rate": 9.781125689766795e-05, + "loss": 0.82262993, + "num_input_tokens_seen": 346513088, + "router_z_loss_mlp": 0.32519531, + "step": 4177, + "time_per_iteration": 2.776914358139038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062732, + "balance_loss_mlp": 1.03080821, + "epoch": 0.8037706810311658, + "flos": 538177609728.0, + "grad_norm": 0.06144959220645004, + "language_loss": 0.84109223, + "learning_rate": 9.762624191379054e-05, + "loss": 0.8517195, + "num_input_tokens_seen": 346581376, + "router_z_loss_mlp": 0.3190918, + "step": 4178, + "time_per_iteration": 2.6495769023895264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062746, + "balance_loss_mlp": 1.03122723, + "epoch": 0.8039630627164294, + "flos": 514937170944.0, + "grad_norm": 0.06604335738972231, + "language_loss": 0.79559189, + "learning_rate": 9.744138314526014e-05, + "loss": 0.80621934, + "num_input_tokens_seen": 346653328, + "router_z_loss_mlp": 0.31494141, + "step": 4179, + "time_per_iteration": 2.602325201034546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011191, + "balance_loss_mlp": 1.0028466, + "epoch": 0.804155444401693, + "flos": 1478061209088.0, + "grad_norm": 0.008490114201074244, + "language_loss": 0.74733561, + "learning_rate": 9.725668066384535e-05, + "loss": 0.75744754, + "num_input_tokens_seen": 346873264, + "router_z_loss_mlp": 0.08349609, + "step": 4180, + "time_per_iteration": 4.859116077423096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065566, + "balance_loss_mlp": 1.03337991, + "epoch": 0.8043478260869565, + "flos": 520909871616.0, + "grad_norm": 0.05865304304999897, + "language_loss": 0.77159905, + "learning_rate": 9.707213454125396e-05, + "loss": 0.7822547, + "num_input_tokens_seen": 346946272, + "router_z_loss_mlp": 0.32177734, + "step": 4181, + "time_per_iteration": 2.635836362838745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064141, + "balance_loss_mlp": 1.03131068, + "epoch": 0.8045402077722201, + "flos": 545170231296.0, + "grad_norm": 0.0530956365986092, + "language_loss": 0.80491257, + "learning_rate": 9.688774484913298e-05, + "loss": 0.81555402, + "num_input_tokens_seen": 347024048, + "router_z_loss_mlp": 0.32836914, + "step": 4182, + "time_per_iteration": 2.751748561859131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106557, + "balance_loss_mlp": 1.03438473, + "epoch": 0.8047325894574836, + "flos": 678059610624.0, + "grad_norm": 0.05198093608069738, + "language_loss": 0.740538, + "learning_rate": 9.670351165906921e-05, + "loss": 0.75119376, + "num_input_tokens_seen": 347108736, + "router_z_loss_mlp": 0.31152344, + "step": 4183, + "time_per_iteration": 2.9374914169311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063909, + "balance_loss_mlp": 1.03146052, + "epoch": 0.8049249711427472, + "flos": 586952994816.0, + "grad_norm": 0.051787161978078845, + "language_loss": 0.78289056, + "learning_rate": 9.65194350425882e-05, + "loss": 0.79352963, + "num_input_tokens_seen": 347184192, + "router_z_loss_mlp": 0.32446289, + "step": 4184, + "time_per_iteration": 2.7513604164123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066837, + "balance_loss_mlp": 1.03467417, + "epoch": 0.8051173528280108, + "flos": 813824050176.0, + "grad_norm": 0.049086248297275724, + "language_loss": 0.77875173, + "learning_rate": 9.633551507115452e-05, + "loss": 0.78942013, + "num_input_tokens_seen": 347282336, + "router_z_loss_mlp": 0.3215332, + "step": 4185, + "time_per_iteration": 3.129385232925415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064139, + "balance_loss_mlp": 1.03254879, + "epoch": 0.8053097345132744, + "flos": 725371324416.0, + "grad_norm": 0.05310426447425182, + "language_loss": 0.77401984, + "learning_rate": 9.615175181617259e-05, + "loss": 0.78466123, + "num_input_tokens_seen": 347364800, + "router_z_loss_mlp": 0.31567383, + "step": 4186, + "time_per_iteration": 2.9424233436584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062352, + "balance_loss_mlp": 1.02980769, + "epoch": 0.805502116198538, + "flos": 747706733568.0, + "grad_norm": 0.12719309664505637, + "language_loss": 0.81555432, + "learning_rate": 9.596814534898552e-05, + "loss": 0.82617784, + "num_input_tokens_seen": 347443328, + "router_z_loss_mlp": 0.32543945, + "step": 4187, + "time_per_iteration": 2.9859328269958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063734, + "balance_loss_mlp": 1.03204834, + "epoch": 0.8056944978838014, + "flos": 639953630208.0, + "grad_norm": 0.05354487168987546, + "language_loss": 0.87387693, + "learning_rate": 9.578469574087561e-05, + "loss": 0.88451427, + "num_input_tokens_seen": 347522064, + "router_z_loss_mlp": 0.31665039, + "step": 4188, + "time_per_iteration": 2.7995247840881348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065054, + "balance_loss_mlp": 1.03255761, + "epoch": 0.805886879569065, + "flos": 644344796160.0, + "grad_norm": 0.056241473497873305, + "language_loss": 0.77584517, + "learning_rate": 9.560140306306436e-05, + "loss": 0.78649569, + "num_input_tokens_seen": 347597200, + "router_z_loss_mlp": 0.32495117, + "step": 4189, + "time_per_iteration": 2.7428812980651855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106136, + "balance_loss_mlp": 1.02964997, + "epoch": 0.8060792612543286, + "flos": 660928674816.0, + "grad_norm": 0.0522294077132656, + "language_loss": 0.81488943, + "learning_rate": 9.541826738671233e-05, + "loss": 0.82550299, + "num_input_tokens_seen": 347676928, + "router_z_loss_mlp": 0.31689453, + "step": 4190, + "time_per_iteration": 2.8001224994659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062885, + "balance_loss_mlp": 1.03172374, + "epoch": 0.8062716429395922, + "flos": 454842017280.0, + "grad_norm": 0.057773567854599146, + "language_loss": 0.82252741, + "learning_rate": 9.523528878291904e-05, + "loss": 0.83315623, + "num_input_tokens_seen": 347741552, + "router_z_loss_mlp": 0.3112793, + "step": 4191, + "time_per_iteration": 2.521599531173706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065048, + "balance_loss_mlp": 1.03267062, + "epoch": 0.8064640246248557, + "flos": 526153632768.0, + "grad_norm": 0.06481758336131359, + "language_loss": 0.85109866, + "learning_rate": 9.50524673227231e-05, + "loss": 0.86174917, + "num_input_tokens_seen": 347807008, + "router_z_loss_mlp": 0.32373047, + "step": 4192, + "time_per_iteration": 2.5771045684814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106484, + "balance_loss_mlp": 1.03239143, + "epoch": 0.8066564063101193, + "flos": 864726617088.0, + "grad_norm": 0.04614815892553539, + "language_loss": 0.82226491, + "learning_rate": 9.486980307710208e-05, + "loss": 0.83291328, + "num_input_tokens_seen": 347895728, + "router_z_loss_mlp": 0.32446289, + "step": 4193, + "time_per_iteration": 3.1550650596618652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106366, + "balance_loss_mlp": 1.03142595, + "epoch": 0.8068487879953828, + "flos": 530261019648.0, + "grad_norm": 0.04778244873492842, + "language_loss": 0.81720543, + "learning_rate": 9.468729611697246e-05, + "loss": 0.82784206, + "num_input_tokens_seen": 347970368, + "router_z_loss_mlp": 0.32226562, + "step": 4194, + "time_per_iteration": 2.68117618560791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063088, + "balance_loss_mlp": 1.03094923, + "epoch": 0.8070411696806464, + "flos": 565918313472.0, + "grad_norm": 0.05165640646024543, + "language_loss": 0.81600553, + "learning_rate": 9.450494651319003e-05, + "loss": 0.82663637, + "num_input_tokens_seen": 348039040, + "router_z_loss_mlp": 0.32128906, + "step": 4195, + "time_per_iteration": 2.633653402328491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068382, + "balance_loss_mlp": 1.03557551, + "epoch": 0.80723355136591, + "flos": 986176954368.0, + "grad_norm": 0.07405936380462144, + "language_loss": 0.7911948, + "learning_rate": 9.432275433654885e-05, + "loss": 0.80187857, + "num_input_tokens_seen": 348126064, + "router_z_loss_mlp": 0.328125, + "step": 4196, + "time_per_iteration": 3.312209129333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066123, + "balance_loss_mlp": 1.0333643, + "epoch": 0.8074259330511735, + "flos": 566682158592.0, + "grad_norm": 0.05744842453598568, + "language_loss": 0.82666802, + "learning_rate": 9.414071965778221e-05, + "loss": 0.83732927, + "num_input_tokens_seen": 348205888, + "router_z_loss_mlp": 0.32763672, + "step": 4197, + "time_per_iteration": 2.8450043201446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062207, + "balance_loss_mlp": 1.02997255, + "epoch": 0.8076183147364371, + "flos": 494391320064.0, + "grad_norm": 0.049534177459162165, + "language_loss": 0.79876429, + "learning_rate": 9.395884254756242e-05, + "loss": 0.80938637, + "num_input_tokens_seen": 348278608, + "router_z_loss_mlp": 0.32226562, + "step": 4198, + "time_per_iteration": 2.742640733718872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071014, + "balance_loss_mlp": 1.03858888, + "epoch": 0.8078106964217007, + "flos": 419798771712.0, + "grad_norm": 0.05248878052625419, + "language_loss": 0.79987884, + "learning_rate": 9.377712307650044e-05, + "loss": 0.81058896, + "num_input_tokens_seen": 348341312, + "router_z_loss_mlp": 0.32421875, + "step": 4199, + "time_per_iteration": 2.483083486557007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061181, + "balance_loss_mlp": 1.02877951, + "epoch": 0.8080030781069643, + "flos": 527281242624.0, + "grad_norm": 0.05810503485387347, + "language_loss": 0.82971925, + "learning_rate": 9.359556131514602e-05, + "loss": 0.84033108, + "num_input_tokens_seen": 348409184, + "router_z_loss_mlp": 0.32397461, + "step": 4200, + "time_per_iteration": 2.6200740337371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068361, + "balance_loss_mlp": 1.0357213, + "epoch": 0.8081954597922277, + "flos": 543898616832.0, + "grad_norm": 0.05004595325075464, + "language_loss": 0.81427366, + "learning_rate": 9.341415733398733e-05, + "loss": 0.82495731, + "num_input_tokens_seen": 348480832, + "router_z_loss_mlp": 0.32641602, + "step": 4201, + "time_per_iteration": 2.621709108352661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066327, + "balance_loss_mlp": 1.03383064, + "epoch": 0.8083878414774913, + "flos": 640593819648.0, + "grad_norm": 0.050042429423884156, + "language_loss": 0.75564444, + "learning_rate": 9.323291120345207e-05, + "loss": 0.76630765, + "num_input_tokens_seen": 348559232, + "router_z_loss_mlp": 0.32495117, + "step": 4202, + "time_per_iteration": 2.843702793121338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062749, + "balance_loss_mlp": 1.03072977, + "epoch": 0.8085802231627549, + "flos": 705292545024.0, + "grad_norm": 0.0932476800908623, + "language_loss": 0.72727638, + "learning_rate": 9.305182299390614e-05, + "loss": 0.73790395, + "num_input_tokens_seen": 348638960, + "router_z_loss_mlp": 0.32006836, + "step": 4203, + "time_per_iteration": 2.8963630199432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062026, + "balance_loss_mlp": 1.02960098, + "epoch": 0.8087726048480185, + "flos": 419538313728.0, + "grad_norm": 0.05589938001894908, + "language_loss": 0.88482207, + "learning_rate": 9.287089277565409e-05, + "loss": 0.89544237, + "num_input_tokens_seen": 348704816, + "router_z_loss_mlp": 0.32421875, + "step": 4204, + "time_per_iteration": 2.5090560913085938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063669, + "balance_loss_mlp": 1.03205466, + "epoch": 0.8089649865332821, + "flos": 508493016576.0, + "grad_norm": 0.05129598135232067, + "language_loss": 0.87062168, + "learning_rate": 9.269012061893922e-05, + "loss": 0.88125837, + "num_input_tokens_seen": 348783504, + "router_z_loss_mlp": 0.31591797, + "step": 4205, + "time_per_iteration": 2.7403717041015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064175, + "balance_loss_mlp": 1.03174961, + "epoch": 0.8091573682185456, + "flos": 456960434688.0, + "grad_norm": 0.051102923464018915, + "language_loss": 0.84725749, + "learning_rate": 9.250950659394386e-05, + "loss": 0.85789919, + "num_input_tokens_seen": 348858272, + "router_z_loss_mlp": 0.32421875, + "step": 4206, + "time_per_iteration": 2.657475709915161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064522, + "balance_loss_mlp": 1.03204989, + "epoch": 0.8093497499038091, + "flos": 524977970688.0, + "grad_norm": 0.05067893035392799, + "language_loss": 0.76910889, + "learning_rate": 9.232905077078824e-05, + "loss": 0.77975416, + "num_input_tokens_seen": 348934432, + "router_z_loss_mlp": 0.32470703, + "step": 4207, + "time_per_iteration": 2.7274835109710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069405, + "balance_loss_mlp": 1.03726602, + "epoch": 0.8095421315890727, + "flos": 489377493504.0, + "grad_norm": 0.07892159060292517, + "language_loss": 0.77032375, + "learning_rate": 9.214875321953164e-05, + "loss": 0.78101778, + "num_input_tokens_seen": 349003856, + "router_z_loss_mlp": 0.32128906, + "step": 4208, + "time_per_iteration": 2.5977513790130615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064316, + "balance_loss_mlp": 1.03205764, + "epoch": 0.8097345132743363, + "flos": 624817456128.0, + "grad_norm": 0.04941967434183097, + "language_loss": 0.80449629, + "learning_rate": 9.196861401017164e-05, + "loss": 0.81513947, + "num_input_tokens_seen": 349080544, + "router_z_loss_mlp": 0.32250977, + "step": 4209, + "time_per_iteration": 2.735485792160034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067095, + "balance_loss_mlp": 1.03416991, + "epoch": 0.8099268949595998, + "flos": 615393524736.0, + "grad_norm": 0.0567514842087568, + "language_loss": 0.79628795, + "learning_rate": 9.178863321264475e-05, + "loss": 0.80695891, + "num_input_tokens_seen": 349159072, + "router_z_loss_mlp": 0.3293457, + "step": 4210, + "time_per_iteration": 2.764686346054077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106462, + "balance_loss_mlp": 1.03245759, + "epoch": 0.8101192766448634, + "flos": 479383183872.0, + "grad_norm": 0.04955977006686596, + "language_loss": 0.80138111, + "learning_rate": 9.160881089682566e-05, + "loss": 0.81202734, + "num_input_tokens_seen": 349230176, + "router_z_loss_mlp": 0.3215332, + "step": 4211, + "time_per_iteration": 2.6480655670166016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066314, + "balance_loss_mlp": 1.03434241, + "epoch": 0.810311658330127, + "flos": 517078909440.0, + "grad_norm": 0.05012077281177707, + "language_loss": 0.86578828, + "learning_rate": 9.142914713252725e-05, + "loss": 0.87645137, + "num_input_tokens_seen": 349299760, + "router_z_loss_mlp": 0.31958008, + "step": 4212, + "time_per_iteration": 2.6052680015563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106724, + "balance_loss_mlp": 1.03467226, + "epoch": 0.8105040400153906, + "flos": 575481867264.0, + "grad_norm": 0.044856961843391975, + "language_loss": 0.84047955, + "learning_rate": 9.124964198950159e-05, + "loss": 0.85115194, + "num_input_tokens_seen": 349379712, + "router_z_loss_mlp": 0.32568359, + "step": 4213, + "time_per_iteration": 2.824300765991211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063471, + "balance_loss_mlp": 1.03147483, + "epoch": 0.8106964217006541, + "flos": 638658694656.0, + "grad_norm": 0.05372438091559442, + "language_loss": 0.85053265, + "learning_rate": 9.107029553743862e-05, + "loss": 0.86116743, + "num_input_tokens_seen": 349460320, + "router_z_loss_mlp": 0.31982422, + "step": 4214, + "time_per_iteration": 2.8542444705963135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061569, + "balance_loss_mlp": 1.02997828, + "epoch": 0.8108888033859176, + "flos": 579237225984.0, + "grad_norm": 0.06664809164052823, + "language_loss": 0.81298697, + "learning_rate": 9.089110784596672e-05, + "loss": 0.82360268, + "num_input_tokens_seen": 349527648, + "router_z_loss_mlp": 0.31567383, + "step": 4215, + "time_per_iteration": 2.680931329727173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062307, + "balance_loss_mlp": 1.03040659, + "epoch": 0.8110811850711812, + "flos": 559612371456.0, + "grad_norm": 0.04909557742541004, + "language_loss": 0.8349334, + "learning_rate": 9.071207898465284e-05, + "loss": 0.8455565, + "num_input_tokens_seen": 349606912, + "router_z_loss_mlp": 0.31884766, + "step": 4216, + "time_per_iteration": 2.774233102798462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012014, + "balance_loss_mlp": 1.00366914, + "epoch": 0.8112735667564448, + "flos": 1517160969216.0, + "grad_norm": 0.0037601729160873353, + "language_loss": 0.77260417, + "learning_rate": 9.053320902300205e-05, + "loss": 0.78272432, + "num_input_tokens_seen": 349827040, + "router_z_loss_mlp": 0.08349609, + "step": 4217, + "time_per_iteration": 4.66510534286499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065826, + "balance_loss_mlp": 1.03375852, + "epoch": 0.8114659484417084, + "flos": 616048270848.0, + "grad_norm": 0.06288981625786437, + "language_loss": 0.85243338, + "learning_rate": 9.035449803045792e-05, + "loss": 0.86309159, + "num_input_tokens_seen": 349900080, + "router_z_loss_mlp": 0.32055664, + "step": 4218, + "time_per_iteration": 2.800776958465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059611, + "balance_loss_mlp": 1.02835429, + "epoch": 0.8116583301269719, + "flos": 649624872960.0, + "grad_norm": 0.05605772970333741, + "language_loss": 0.78812903, + "learning_rate": 9.017594607640211e-05, + "loss": 0.79872513, + "num_input_tokens_seen": 349983568, + "router_z_loss_mlp": 0.31225586, + "step": 4219, + "time_per_iteration": 2.9232895374298096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067588, + "balance_loss_mlp": 1.0353061, + "epoch": 0.8118507118122354, + "flos": 552811806720.0, + "grad_norm": 0.055355121711081465, + "language_loss": 0.80514246, + "learning_rate": 8.999755323015463e-05, + "loss": 0.81581837, + "num_input_tokens_seen": 350054928, + "router_z_loss_mlp": 0.32275391, + "step": 4220, + "time_per_iteration": 2.668212413787842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063714, + "balance_loss_mlp": 1.03186107, + "epoch": 0.812043093497499, + "flos": 543854946816.0, + "grad_norm": 0.048800759009726725, + "language_loss": 0.87706423, + "learning_rate": 8.981931956097384e-05, + "loss": 0.88770139, + "num_input_tokens_seen": 350127872, + "router_z_loss_mlp": 0.31835938, + "step": 4221, + "time_per_iteration": 2.6155126094818115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065942, + "balance_loss_mlp": 1.03370762, + "epoch": 0.8122354751827626, + "flos": 583113268224.0, + "grad_norm": 0.04761865072569173, + "language_loss": 0.83265173, + "learning_rate": 8.964124513805628e-05, + "loss": 0.84331113, + "num_input_tokens_seen": 350206592, + "router_z_loss_mlp": 0.32226562, + "step": 4222, + "time_per_iteration": 2.774775505065918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013622, + "balance_loss_mlp": 1.00532508, + "epoch": 0.8124278568680262, + "flos": 1529747970048.0, + "grad_norm": 0.0043191347413629065, + "language_loss": 0.78250074, + "learning_rate": 8.94633300305363e-05, + "loss": 0.79263699, + "num_input_tokens_seen": 350436048, + "router_z_loss_mlp": 0.08300781, + "step": 4223, + "time_per_iteration": 4.961513519287109 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063048, + "balance_loss_mlp": 1.03012204, + "epoch": 0.8126202385532897, + "flos": 432640438272.0, + "grad_norm": 0.05869447426107839, + "language_loss": 0.80021381, + "learning_rate": 8.928557430748668e-05, + "loss": 0.8108443, + "num_input_tokens_seen": 350501376, + "router_z_loss_mlp": 0.3293457, + "step": 4224, + "time_per_iteration": 2.574615240097046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013546, + "balance_loss_mlp": 1.00524938, + "epoch": 0.8128126202385533, + "flos": 1547098665984.0, + "grad_norm": 0.004316183823014201, + "language_loss": 0.76495624, + "learning_rate": 8.910797803791854e-05, + "loss": 0.77509177, + "num_input_tokens_seen": 350735232, + "router_z_loss_mlp": 0.08300781, + "step": 4225, + "time_per_iteration": 4.941166639328003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063113, + "balance_loss_mlp": 1.03166568, + "epoch": 0.8130050019238169, + "flos": 528064026624.0, + "grad_norm": 0.05209839082843741, + "language_loss": 0.89101052, + "learning_rate": 8.893054129078077e-05, + "loss": 0.90164173, + "num_input_tokens_seen": 350805088, + "router_z_loss_mlp": 0.31420898, + "step": 4226, + "time_per_iteration": 2.6181585788726807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066672, + "balance_loss_mlp": 1.03441381, + "epoch": 0.8131973836090804, + "flos": 542850992640.0, + "grad_norm": 0.07190535116471519, + "language_loss": 0.80237639, + "learning_rate": 8.875326413496037e-05, + "loss": 0.81304312, + "num_input_tokens_seen": 350876896, + "router_z_loss_mlp": 0.32250977, + "step": 4227, + "time_per_iteration": 2.753739595413208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064339, + "balance_loss_mlp": 1.03219986, + "epoch": 0.8133897652943439, + "flos": 576223953408.0, + "grad_norm": 0.052554063935664315, + "language_loss": 0.82211399, + "learning_rate": 8.857614663928249e-05, + "loss": 0.83275741, + "num_input_tokens_seen": 350948400, + "router_z_loss_mlp": 0.32128906, + "step": 4228, + "time_per_iteration": 2.775632858276367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063809, + "balance_loss_mlp": 1.03245687, + "epoch": 0.8135821469796075, + "flos": 578937480192.0, + "grad_norm": 0.05954408838402955, + "language_loss": 0.78956014, + "learning_rate": 8.839918887251025e-05, + "loss": 0.80019832, + "num_input_tokens_seen": 351023328, + "router_z_loss_mlp": 0.31323242, + "step": 4229, + "time_per_iteration": 2.7360551357269287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063217, + "balance_loss_mlp": 1.0319128, + "epoch": 0.8137745286648711, + "flos": 650023543296.0, + "grad_norm": 0.049012494433050675, + "language_loss": 0.83987892, + "learning_rate": 8.822239090334472e-05, + "loss": 0.85051107, + "num_input_tokens_seen": 351108672, + "router_z_loss_mlp": 0.31274414, + "step": 4230, + "time_per_iteration": 2.9183671474456787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063188, + "balance_loss_mlp": 1.03076267, + "epoch": 0.8139669103501347, + "flos": 701579446272.0, + "grad_norm": 0.050773528250442714, + "language_loss": 0.7552613, + "learning_rate": 8.804575280042493e-05, + "loss": 0.7658931, + "num_input_tokens_seen": 351185056, + "router_z_loss_mlp": 0.32421875, + "step": 4231, + "time_per_iteration": 2.892979860305786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106874, + "balance_loss_mlp": 1.0362674, + "epoch": 0.8141592920353983, + "flos": 649933383168.0, + "grad_norm": 0.0634360445287057, + "language_loss": 0.83362955, + "learning_rate": 8.786927463232774e-05, + "loss": 0.84431696, + "num_input_tokens_seen": 351255856, + "router_z_loss_mlp": 0.32470703, + "step": 4232, + "time_per_iteration": 2.747666120529175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067521, + "balance_loss_mlp": 1.03521562, + "epoch": 0.8143516737206618, + "flos": 536577136128.0, + "grad_norm": 0.05551196564912533, + "language_loss": 0.81381476, + "learning_rate": 8.769295646756853e-05, + "loss": 0.82448995, + "num_input_tokens_seen": 351322336, + "router_z_loss_mlp": 0.32299805, + "step": 4233, + "time_per_iteration": 2.6436498165130615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063586, + "balance_loss_mlp": 1.03082705, + "epoch": 0.8145440554059253, + "flos": 508117667328.0, + "grad_norm": 0.050777916612580305, + "language_loss": 0.82274687, + "learning_rate": 8.751679837459963e-05, + "loss": 0.83338279, + "num_input_tokens_seen": 351387440, + "router_z_loss_mlp": 0.32763672, + "step": 4234, + "time_per_iteration": 2.5614001750946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066442, + "balance_loss_mlp": 1.03454125, + "epoch": 0.8147364370911889, + "flos": 634720043520.0, + "grad_norm": 0.05344342526685846, + "language_loss": 0.86183035, + "learning_rate": 8.734080042181181e-05, + "loss": 0.87249482, + "num_input_tokens_seen": 351464192, + "router_z_loss_mlp": 0.31884766, + "step": 4235, + "time_per_iteration": 2.8306832313537598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065616, + "balance_loss_mlp": 1.0336678, + "epoch": 0.8149288187764525, + "flos": 422576317440.0, + "grad_norm": 0.05510479621984829, + "language_loss": 0.7812373, + "learning_rate": 8.716496267753343e-05, + "loss": 0.79189348, + "num_input_tokens_seen": 351528016, + "router_z_loss_mlp": 0.31933594, + "step": 4236, + "time_per_iteration": 2.4926445484161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066427, + "balance_loss_mlp": 1.03512251, + "epoch": 0.8151212004617161, + "flos": 597150945792.0, + "grad_norm": 0.05070421466516423, + "language_loss": 0.81572199, + "learning_rate": 8.698928521003097e-05, + "loss": 0.82638621, + "num_input_tokens_seen": 351601648, + "router_z_loss_mlp": 0.31274414, + "step": 4237, + "time_per_iteration": 2.7581071853637695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015484, + "balance_loss_mlp": 1.00713933, + "epoch": 0.8153135821469796, + "flos": 1478563186176.0, + "grad_norm": 0.0058140182195042965, + "language_loss": 0.77852845, + "learning_rate": 8.681376808750835e-05, + "loss": 0.7886833, + "num_input_tokens_seen": 351826720, + "router_z_loss_mlp": 0.08349609, + "step": 4238, + "time_per_iteration": 4.994212627410889 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064499, + "balance_loss_mlp": 1.03255105, + "epoch": 0.8155059638322432, + "flos": 436870070784.0, + "grad_norm": 0.0778292766394767, + "language_loss": 0.82763624, + "learning_rate": 8.663841137810741e-05, + "loss": 0.83828127, + "num_input_tokens_seen": 351891760, + "router_z_loss_mlp": 0.31933594, + "step": 4239, + "time_per_iteration": 2.522731065750122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063565, + "balance_loss_mlp": 1.03149724, + "epoch": 0.8156983455175068, + "flos": 794034842112.0, + "grad_norm": 0.05452846456487443, + "language_loss": 0.85091472, + "learning_rate": 8.646321514990763e-05, + "loss": 0.86155033, + "num_input_tokens_seen": 351977504, + "router_z_loss_mlp": 0.32055664, + "step": 4240, + "time_per_iteration": 3.029479742050171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067898, + "balance_loss_mlp": 1.0353775, + "epoch": 0.8158907272027703, + "flos": 685685219328.0, + "grad_norm": 0.05011553315331007, + "language_loss": 0.81715024, + "learning_rate": 8.628817947092616e-05, + "loss": 0.82782924, + "num_input_tokens_seen": 352050176, + "router_z_loss_mlp": 0.32519531, + "step": 4241, + "time_per_iteration": 2.8111627101898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063687, + "balance_loss_mlp": 1.03140509, + "epoch": 0.8160831088880338, + "flos": 486812353536.0, + "grad_norm": 0.0676026015098648, + "language_loss": 0.84249878, + "learning_rate": 8.611330440911797e-05, + "loss": 0.85313565, + "num_input_tokens_seen": 352116848, + "router_z_loss_mlp": 0.32275391, + "step": 4242, + "time_per_iteration": 2.6123099327087402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066489, + "balance_loss_mlp": 1.03439796, + "epoch": 0.8162754905732974, + "flos": 464635505664.0, + "grad_norm": 0.05452366301324058, + "language_loss": 0.80174685, + "learning_rate": 8.593859003237558e-05, + "loss": 0.81241173, + "num_input_tokens_seen": 352185056, + "router_z_loss_mlp": 0.32080078, + "step": 4243, + "time_per_iteration": 2.628828287124634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014713, + "balance_loss_mlp": 1.00617814, + "epoch": 0.816467872258561, + "flos": 1238879577600.0, + "grad_norm": 0.006298566861144672, + "language_loss": 0.75285125, + "learning_rate": 8.576403640852904e-05, + "loss": 0.76299834, + "num_input_tokens_seen": 352397648, + "router_z_loss_mlp": 0.08544922, + "step": 4244, + "time_per_iteration": 4.721359014511108 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066399, + "balance_loss_mlp": 1.03497541, + "epoch": 0.8166602539438246, + "flos": 686862291456.0, + "grad_norm": 0.04683457091377271, + "language_loss": 0.86992514, + "learning_rate": 8.558964360534615e-05, + "loss": 0.88058907, + "num_input_tokens_seen": 352478272, + "router_z_loss_mlp": 0.31396484, + "step": 4245, + "time_per_iteration": 2.9027247428894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014799, + "balance_loss_mlp": 1.00626397, + "epoch": 0.8168526356290882, + "flos": 1489674779136.0, + "grad_norm": 0.006322038265945877, + "language_loss": 0.72974741, + "learning_rate": 8.541541169053219e-05, + "loss": 0.7398954, + "num_input_tokens_seen": 352707104, + "router_z_loss_mlp": 0.08544922, + "step": 4246, + "time_per_iteration": 4.944149017333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063972, + "balance_loss_mlp": 1.03269172, + "epoch": 0.8170450173143516, + "flos": 577927733760.0, + "grad_norm": 0.07058072733424971, + "language_loss": 0.84509909, + "learning_rate": 8.524134073172984e-05, + "loss": 0.85573876, + "num_input_tokens_seen": 352779248, + "router_z_loss_mlp": 0.3125, + "step": 4247, + "time_per_iteration": 2.7045445442199707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066255, + "balance_loss_mlp": 1.03476024, + "epoch": 0.8172373989996152, + "flos": 570985984512.0, + "grad_norm": 0.10797119683150772, + "language_loss": 0.84231156, + "learning_rate": 8.506743079651974e-05, + "loss": 0.85297412, + "num_input_tokens_seen": 352856784, + "router_z_loss_mlp": 0.31469727, + "step": 4248, + "time_per_iteration": 2.775876760482788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066682, + "balance_loss_mlp": 1.03528225, + "epoch": 0.8174297806848788, + "flos": 528576178176.0, + "grad_norm": 0.05516186993789668, + "language_loss": 0.80867159, + "learning_rate": 8.489368195241948e-05, + "loss": 0.81933844, + "num_input_tokens_seen": 352926496, + "router_z_loss_mlp": 0.3137207, + "step": 4249, + "time_per_iteration": 2.6222243309020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064974, + "balance_loss_mlp": 1.03300214, + "epoch": 0.8176221623701424, + "flos": 568819514880.0, + "grad_norm": 0.05332500176787013, + "language_loss": 0.78964639, + "learning_rate": 8.47200942668846e-05, + "loss": 0.80029613, + "num_input_tokens_seen": 353005312, + "router_z_loss_mlp": 0.31958008, + "step": 4250, + "time_per_iteration": 2.788813829421997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062759, + "balance_loss_mlp": 1.0308584, + "epoch": 0.8178145440554059, + "flos": 656226178560.0, + "grad_norm": 0.0759018879957778, + "language_loss": 0.80471349, + "learning_rate": 8.454666780730735e-05, + "loss": 0.81534111, + "num_input_tokens_seen": 353085120, + "router_z_loss_mlp": 0.31884766, + "step": 4251, + "time_per_iteration": 2.870534896850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064692, + "balance_loss_mlp": 1.03293502, + "epoch": 0.8180069257406695, + "flos": 545643095040.0, + "grad_norm": 0.05126338036877886, + "language_loss": 0.87462568, + "learning_rate": 8.437340264101828e-05, + "loss": 0.88527262, + "num_input_tokens_seen": 353160992, + "router_z_loss_mlp": 0.31738281, + "step": 4252, + "time_per_iteration": 2.7038302421569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067347, + "balance_loss_mlp": 1.0348742, + "epoch": 0.818199307425933, + "flos": 618987350016.0, + "grad_norm": 0.059044445551817724, + "language_loss": 0.84837854, + "learning_rate": 8.420029883528474e-05, + "loss": 0.85905206, + "num_input_tokens_seen": 353233328, + "router_z_loss_mlp": 0.32470703, + "step": 4253, + "time_per_iteration": 2.713452100753784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062773, + "balance_loss_mlp": 1.03082526, + "epoch": 0.8183916891111966, + "flos": 647291077632.0, + "grad_norm": 0.08799557625316533, + "language_loss": 0.77167904, + "learning_rate": 8.402735645731157e-05, + "loss": 0.78230679, + "num_input_tokens_seen": 353310592, + "router_z_loss_mlp": 0.31933594, + "step": 4254, + "time_per_iteration": 2.958004951477051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065309, + "balance_loss_mlp": 1.03305101, + "epoch": 0.8185840707964602, + "flos": 498875618304.0, + "grad_norm": 0.05690863085587787, + "language_loss": 0.78464049, + "learning_rate": 8.385457557424098e-05, + "loss": 0.79529357, + "num_input_tokens_seen": 353376544, + "router_z_loss_mlp": 0.32250977, + "step": 4255, + "time_per_iteration": 2.570077896118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061679, + "balance_loss_mlp": 1.0300889, + "epoch": 0.8187764524817237, + "flos": 785885497344.0, + "grad_norm": 0.049659571224279345, + "language_loss": 0.79487193, + "learning_rate": 8.368195625315251e-05, + "loss": 0.80548877, + "num_input_tokens_seen": 353461200, + "router_z_loss_mlp": 0.31567383, + "step": 4256, + "time_per_iteration": 3.077765464782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066604, + "balance_loss_mlp": 1.03422654, + "epoch": 0.8189688341669873, + "flos": 550443105792.0, + "grad_norm": 0.048188663930783165, + "language_loss": 0.80790627, + "learning_rate": 8.350949856106283e-05, + "loss": 0.81857234, + "num_input_tokens_seen": 353538608, + "router_z_loss_mlp": 0.32373047, + "step": 4257, + "time_per_iteration": 2.7946553230285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013006, + "balance_loss_mlp": 1.00451815, + "epoch": 0.8191612158522509, + "flos": 1351247837184.0, + "grad_norm": 0.004934638976943409, + "language_loss": 0.71149343, + "learning_rate": 8.333720256492599e-05, + "loss": 0.72162348, + "num_input_tokens_seen": 353766960, + "router_z_loss_mlp": 0.08496094, + "step": 4258, + "time_per_iteration": 4.824636697769165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064916, + "balance_loss_mlp": 1.03308725, + "epoch": 0.8193535975375145, + "flos": 543997541376.0, + "grad_norm": 0.061566617876948435, + "language_loss": 0.83655453, + "learning_rate": 8.316506833163318e-05, + "loss": 0.84720367, + "num_input_tokens_seen": 353833552, + "router_z_loss_mlp": 0.31811523, + "step": 4259, + "time_per_iteration": 2.652660846710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061066, + "balance_loss_mlp": 1.02978587, + "epoch": 0.8195459792227779, + "flos": 865361014272.0, + "grad_norm": 0.04568174068597017, + "language_loss": 0.85518008, + "learning_rate": 8.299309592801297e-05, + "loss": 0.86579072, + "num_input_tokens_seen": 353915520, + "router_z_loss_mlp": 0.3125, + "step": 4260, + "time_per_iteration": 3.0853793621063232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066817, + "balance_loss_mlp": 1.03479743, + "epoch": 0.8197383609080415, + "flos": 569015953920.0, + "grad_norm": 0.0626065931580156, + "language_loss": 0.81053776, + "learning_rate": 8.282128542083101e-05, + "loss": 0.82120585, + "num_input_tokens_seen": 353992048, + "router_z_loss_mlp": 0.32006836, + "step": 4261, + "time_per_iteration": 2.69217848777771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064059, + "balance_loss_mlp": 1.03232574, + "epoch": 0.8199307425933051, + "flos": 530546208768.0, + "grad_norm": 0.051394925331661186, + "language_loss": 0.84971988, + "learning_rate": 8.264963687678978e-05, + "loss": 0.86036044, + "num_input_tokens_seen": 354064848, + "router_z_loss_mlp": 0.31713867, + "step": 4262, + "time_per_iteration": 2.63204288482666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066158, + "balance_loss_mlp": 1.03342354, + "epoch": 0.8201231242785687, + "flos": 566781083136.0, + "grad_norm": 0.047344222679516414, + "language_loss": 0.85058379, + "learning_rate": 8.247815036252921e-05, + "loss": 0.86124539, + "num_input_tokens_seen": 354138848, + "router_z_loss_mlp": 0.32739258, + "step": 4263, + "time_per_iteration": 2.725421905517578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064288, + "balance_loss_mlp": 1.03281677, + "epoch": 0.8203155059638323, + "flos": 1230037913088.0, + "grad_norm": 0.057193475382645485, + "language_loss": 0.83039153, + "learning_rate": 8.230682594462652e-05, + "loss": 0.84103441, + "num_input_tokens_seen": 354227696, + "router_z_loss_mlp": 0.31445312, + "step": 4264, + "time_per_iteration": 3.5159566402435303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064005, + "balance_loss_mlp": 1.03148425, + "epoch": 0.8205078876490958, + "flos": 573929445888.0, + "grad_norm": 0.11088210201568381, + "language_loss": 0.7990979, + "learning_rate": 8.213566368959558e-05, + "loss": 0.80973792, + "num_input_tokens_seen": 354298400, + "router_z_loss_mlp": 0.32519531, + "step": 4265, + "time_per_iteration": 2.6384832859039307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064287, + "balance_loss_mlp": 1.03198123, + "epoch": 0.8207002693343594, + "flos": 931005467136.0, + "grad_norm": 0.05472798836871878, + "language_loss": 0.78268075, + "learning_rate": 8.196466366388744e-05, + "loss": 0.79332364, + "num_input_tokens_seen": 354385024, + "router_z_loss_mlp": 0.32299805, + "step": 4266, + "time_per_iteration": 3.189403533935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066434, + "balance_loss_mlp": 1.03419971, + "epoch": 0.8208926510196229, + "flos": 549300939264.0, + "grad_norm": 0.05160040567241084, + "language_loss": 0.80250126, + "learning_rate": 8.179382593389029e-05, + "loss": 0.81316555, + "num_input_tokens_seen": 354456384, + "router_z_loss_mlp": 0.32226562, + "step": 4267, + "time_per_iteration": 2.6618669033050537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067589, + "balance_loss_mlp": 1.03592706, + "epoch": 0.8210850327048865, + "flos": 647876012544.0, + "grad_norm": 0.06251068315447797, + "language_loss": 0.81684315, + "learning_rate": 8.162315056592918e-05, + "loss": 0.827519, + "num_input_tokens_seen": 354531296, + "router_z_loss_mlp": 0.31640625, + "step": 4268, + "time_per_iteration": 2.8171610832214355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063634, + "balance_loss_mlp": 1.03230619, + "epoch": 0.82127741439015, + "flos": 601227809280.0, + "grad_norm": 0.04884521994961647, + "language_loss": 0.81217563, + "learning_rate": 8.145263762626615e-05, + "loss": 0.82281196, + "num_input_tokens_seen": 354605680, + "router_z_loss_mlp": 0.31298828, + "step": 4269, + "time_per_iteration": 2.7463865280151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060593, + "balance_loss_mlp": 1.02955055, + "epoch": 0.8214697960754136, + "flos": 474577380864.0, + "grad_norm": 0.08185779566994117, + "language_loss": 0.83571124, + "learning_rate": 8.128228718110015e-05, + "loss": 0.84631717, + "num_input_tokens_seen": 354678160, + "router_z_loss_mlp": 0.31005859, + "step": 4270, + "time_per_iteration": 2.7095983028411865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062123, + "balance_loss_mlp": 1.03058004, + "epoch": 0.8216621777606772, + "flos": 903288084480.0, + "grad_norm": 0.054129185570740465, + "language_loss": 0.84336656, + "learning_rate": 8.11120992965671e-05, + "loss": 0.85398781, + "num_input_tokens_seen": 354751024, + "router_z_loss_mlp": 0.31518555, + "step": 4271, + "time_per_iteration": 3.082247257232666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064562, + "balance_loss_mlp": 1.03232741, + "epoch": 0.8218545594459408, + "flos": 514203849216.0, + "grad_norm": 0.04982374432283012, + "language_loss": 0.81935704, + "learning_rate": 8.094207403873998e-05, + "loss": 0.83000261, + "num_input_tokens_seen": 354819408, + "router_z_loss_mlp": 0.32226562, + "step": 4272, + "time_per_iteration": 2.5820279121398926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064387, + "balance_loss_mlp": 1.0330354, + "epoch": 0.8220469411312044, + "flos": 494282221056.0, + "grad_norm": 0.0504374120036925, + "language_loss": 0.85846829, + "learning_rate": 8.077221147362829e-05, + "loss": 0.86911225, + "num_input_tokens_seen": 354887376, + "router_z_loss_mlp": 0.31323242, + "step": 4273, + "time_per_iteration": 2.584005355834961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064804, + "balance_loss_mlp": 1.0325458, + "epoch": 0.8222393228164678, + "flos": 386223579648.0, + "grad_norm": 0.057889479724380286, + "language_loss": 0.89503336, + "learning_rate": 8.060251166717835e-05, + "loss": 0.90568137, + "num_input_tokens_seen": 354948288, + "router_z_loss_mlp": 0.32250977, + "step": 4274, + "time_per_iteration": 2.382969379425049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065829, + "balance_loss_mlp": 1.03388083, + "epoch": 0.8224317045017314, + "flos": 536331234816.0, + "grad_norm": 0.054333881103771485, + "language_loss": 0.86981678, + "learning_rate": 8.043297468527383e-05, + "loss": 0.88047504, + "num_input_tokens_seen": 355016912, + "router_z_loss_mlp": 0.31933594, + "step": 4275, + "time_per_iteration": 2.648383855819702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062123, + "balance_loss_mlp": 1.0305084, + "epoch": 0.822624086186995, + "flos": 554637832704.0, + "grad_norm": 0.06589028036614175, + "language_loss": 0.82127655, + "learning_rate": 8.02636005937346e-05, + "loss": 0.83189774, + "num_input_tokens_seen": 355085936, + "router_z_loss_mlp": 0.31591797, + "step": 4276, + "time_per_iteration": 2.634843587875366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060304, + "balance_loss_mlp": 1.02861845, + "epoch": 0.8228164678722586, + "flos": 539296455168.0, + "grad_norm": 0.04688779470500233, + "language_loss": 0.79762036, + "learning_rate": 8.009438945831771e-05, + "loss": 0.80822337, + "num_input_tokens_seen": 355161984, + "router_z_loss_mlp": 0.31665039, + "step": 4277, + "time_per_iteration": 2.6846394538879395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106328, + "balance_loss_mlp": 1.0314033, + "epoch": 0.8230088495575221, + "flos": 473001638400.0, + "grad_norm": 0.04881634129650232, + "language_loss": 0.79050267, + "learning_rate": 7.992534134471641e-05, + "loss": 0.80113542, + "num_input_tokens_seen": 355234544, + "router_z_loss_mlp": 0.31860352, + "step": 4278, + "time_per_iteration": 2.6381237506866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065439, + "balance_loss_mlp": 1.03332353, + "epoch": 0.8232012312427857, + "flos": 591403797504.0, + "grad_norm": 0.06782075009284445, + "language_loss": 0.82754999, + "learning_rate": 7.975645631856127e-05, + "loss": 0.83820438, + "num_input_tokens_seen": 355302896, + "router_z_loss_mlp": 0.32104492, + "step": 4279, + "time_per_iteration": 2.653944969177246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065308, + "balance_loss_mlp": 1.03345561, + "epoch": 0.8233936129280492, + "flos": 572359495680.0, + "grad_norm": 0.04992965269342151, + "language_loss": 0.7469328, + "learning_rate": 7.958773444541916e-05, + "loss": 0.75758588, + "num_input_tokens_seen": 355377040, + "router_z_loss_mlp": 0.31835938, + "step": 4280, + "time_per_iteration": 2.7473456859588623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066109, + "balance_loss_mlp": 1.03375518, + "epoch": 0.8235859946133128, + "flos": 730986052608.0, + "grad_norm": 0.041840919211567525, + "language_loss": 0.78355992, + "learning_rate": 7.941917579079383e-05, + "loss": 0.79422104, + "num_input_tokens_seen": 355461616, + "router_z_loss_mlp": 0.32348633, + "step": 4281, + "time_per_iteration": 3.0071098804473877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066616, + "balance_loss_mlp": 1.03485882, + "epoch": 0.8237783762985764, + "flos": 570044639232.0, + "grad_norm": 0.056480020127142934, + "language_loss": 0.81059593, + "learning_rate": 7.92507804201253e-05, + "loss": 0.82126206, + "num_input_tokens_seen": 355532480, + "router_z_loss_mlp": 0.31738281, + "step": 4282, + "time_per_iteration": 2.6841516494750977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004566, + "balance_loss_mlp": 0.99603093, + "epoch": 0.8239707579838399, + "flos": 1465437740544.0, + "grad_norm": 0.009020840107608945, + "language_loss": 0.75297678, + "learning_rate": 7.908254839879092e-05, + "loss": 0.76302242, + "num_input_tokens_seen": 355768752, + "router_z_loss_mlp": 0.08544922, + "step": 4283, + "time_per_iteration": 5.020637512207031 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062901, + "balance_loss_mlp": 1.03035665, + "epoch": 0.8241631396691035, + "flos": 467068225536.0, + "grad_norm": 0.08084450716262151, + "language_loss": 0.8038317, + "learning_rate": 7.89144797921037e-05, + "loss": 0.81446069, + "num_input_tokens_seen": 355838800, + "router_z_loss_mlp": 0.32543945, + "step": 4284, + "time_per_iteration": 2.624439001083374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01002031, + "balance_loss_mlp": 0.99354351, + "epoch": 0.8243555213543671, + "flos": 1538648165376.0, + "grad_norm": 0.010143921205099075, + "language_loss": 0.77934271, + "learning_rate": 7.874657466531388e-05, + "loss": 0.78936303, + "num_input_tokens_seen": 356069280, + "router_z_loss_mlp": 0.08496094, + "step": 4285, + "time_per_iteration": 4.928821086883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061224, + "balance_loss_mlp": 1.02958596, + "epoch": 0.8245479030396307, + "flos": 797072845824.0, + "grad_norm": 0.044332510336863265, + "language_loss": 0.82397342, + "learning_rate": 7.85788330836078e-05, + "loss": 0.83458561, + "num_input_tokens_seen": 356164528, + "router_z_loss_mlp": 0.31616211, + "step": 4286, + "time_per_iteration": 3.14546537399292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063052, + "balance_loss_mlp": 1.03150964, + "epoch": 0.8247402847248941, + "flos": 645793910784.0, + "grad_norm": 0.045147974337775294, + "language_loss": 0.76310241, + "learning_rate": 7.841125511210878e-05, + "loss": 0.77373296, + "num_input_tokens_seen": 356243600, + "router_z_loss_mlp": 0.31518555, + "step": 4287, + "time_per_iteration": 2.898638963699341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064179, + "balance_loss_mlp": 1.03242135, + "epoch": 0.8249326664101577, + "flos": 604123218432.0, + "grad_norm": 0.06674407920035312, + "language_loss": 0.79493982, + "learning_rate": 7.824384081587637e-05, + "loss": 0.80558157, + "num_input_tokens_seen": 356320320, + "router_z_loss_mlp": 0.31738281, + "step": 4288, + "time_per_iteration": 2.7821269035339355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064292, + "balance_loss_mlp": 1.03186679, + "epoch": 0.8251250480954213, + "flos": 824006034432.0, + "grad_norm": 0.06583021420859499, + "language_loss": 0.86280286, + "learning_rate": 7.807659025990637e-05, + "loss": 0.87344575, + "num_input_tokens_seen": 356406928, + "router_z_loss_mlp": 0.32421875, + "step": 4289, + "time_per_iteration": 3.1109914779663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060406, + "balance_loss_mlp": 1.02848136, + "epoch": 0.8253174297806849, + "flos": 757060853760.0, + "grad_norm": 0.058384222358934625, + "language_loss": 0.77839482, + "learning_rate": 7.790950350913112e-05, + "loss": 0.78899884, + "num_input_tokens_seen": 356481456, + "router_z_loss_mlp": 0.3190918, + "step": 4290, + "time_per_iteration": 2.9492805004119873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106367, + "balance_loss_mlp": 1.03117323, + "epoch": 0.8255098114659485, + "flos": 794090096640.0, + "grad_norm": 0.05143125962292425, + "language_loss": 0.87082183, + "learning_rate": 7.774258062841971e-05, + "loss": 0.88145852, + "num_input_tokens_seen": 356568736, + "router_z_loss_mlp": 0.32495117, + "step": 4291, + "time_per_iteration": 3.2069146633148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062351, + "balance_loss_mlp": 1.03111804, + "epoch": 0.825702193151212, + "flos": 710102730240.0, + "grad_norm": 0.12023426920878982, + "language_loss": 0.77597296, + "learning_rate": 7.757582168257731e-05, + "loss": 0.78659642, + "num_input_tokens_seen": 356643328, + "router_z_loss_mlp": 0.31201172, + "step": 4292, + "time_per_iteration": 2.850250244140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061486, + "balance_loss_mlp": 1.02918029, + "epoch": 0.8258945748364755, + "flos": 683076409344.0, + "grad_norm": 0.048890635163102686, + "language_loss": 0.80603421, + "learning_rate": 7.740922673634537e-05, + "loss": 0.81664902, + "num_input_tokens_seen": 356723824, + "router_z_loss_mlp": 0.32299805, + "step": 4293, + "time_per_iteration": 2.910806179046631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060982, + "balance_loss_mlp": 1.02924871, + "epoch": 0.8260869565217391, + "flos": 594284649984.0, + "grad_norm": 0.05591520410462236, + "language_loss": 0.78902394, + "learning_rate": 7.724279585440186e-05, + "loss": 0.7996338, + "num_input_tokens_seen": 356796512, + "router_z_loss_mlp": 0.31713867, + "step": 4294, + "time_per_iteration": 2.704671859741211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060187, + "balance_loss_mlp": 1.02816761, + "epoch": 0.8262793382070027, + "flos": 651189030912.0, + "grad_norm": 0.04861629765656741, + "language_loss": 0.84998047, + "learning_rate": 7.707652910136098e-05, + "loss": 0.86058241, + "num_input_tokens_seen": 356868624, + "router_z_loss_mlp": 0.32006836, + "step": 4295, + "time_per_iteration": 2.823174238204956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060095, + "balance_loss_mlp": 1.02807498, + "epoch": 0.8264717198922663, + "flos": 538665030144.0, + "grad_norm": 0.05706956652691896, + "language_loss": 0.84780651, + "learning_rate": 7.691042654177315e-05, + "loss": 0.85840744, + "num_input_tokens_seen": 356934368, + "router_z_loss_mlp": 0.32006836, + "step": 4296, + "time_per_iteration": 2.6320347785949707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061015, + "balance_loss_mlp": 1.02949572, + "epoch": 0.8266641015775298, + "flos": 538689761280.0, + "grad_norm": 0.053774403970765, + "language_loss": 0.74997044, + "learning_rate": 7.674448824012514e-05, + "loss": 0.7605806, + "num_input_tokens_seen": 357005536, + "router_z_loss_mlp": 0.31494141, + "step": 4297, + "time_per_iteration": 2.6300857067108154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064228, + "balance_loss_mlp": 1.03199387, + "epoch": 0.8268564832627934, + "flos": 585077506560.0, + "grad_norm": 0.046756565982589225, + "language_loss": 0.84059066, + "learning_rate": 7.657871426083979e-05, + "loss": 0.85123295, + "num_input_tokens_seen": 357082160, + "router_z_loss_mlp": 0.32226562, + "step": 4298, + "time_per_iteration": 2.7552220821380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056678, + "balance_loss_mlp": 1.02449179, + "epoch": 0.827048864948057, + "flos": 430434680832.0, + "grad_norm": 0.061322212544335376, + "language_loss": 0.83956921, + "learning_rate": 7.641310466827667e-05, + "loss": 0.85013604, + "num_input_tokens_seen": 357146928, + "router_z_loss_mlp": 0.32177734, + "step": 4299, + "time_per_iteration": 2.44634747505188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063761, + "balance_loss_mlp": 1.03214669, + "epoch": 0.8272412466333205, + "flos": 1387915181568.0, + "grad_norm": 0.051483866399296904, + "language_loss": 0.85022169, + "learning_rate": 7.624765952673069e-05, + "loss": 0.86085927, + "num_input_tokens_seen": 357236768, + "router_z_loss_mlp": 0.31591797, + "step": 4300, + "time_per_iteration": 3.7406394481658936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061571, + "balance_loss_mlp": 1.02978992, + "epoch": 0.827433628318584, + "flos": 537952057344.0, + "grad_norm": 0.06027744747687877, + "language_loss": 0.82495129, + "learning_rate": 7.608237890043335e-05, + "loss": 0.835567, + "num_input_tokens_seen": 357307568, + "router_z_loss_mlp": 0.31762695, + "step": 4301, + "time_per_iteration": 2.707937479019165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062303, + "balance_loss_mlp": 1.0309273, + "epoch": 0.8276260100038476, + "flos": 730404089856.0, + "grad_norm": 0.051103301822031, + "language_loss": 0.77301157, + "learning_rate": 7.59172628535526e-05, + "loss": 0.78363454, + "num_input_tokens_seen": 357387712, + "router_z_loss_mlp": 0.31347656, + "step": 4302, + "time_per_iteration": 2.938246488571167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062788, + "balance_loss_mlp": 1.0308876, + "epoch": 0.8278183916891112, + "flos": 870713874432.0, + "grad_norm": 0.042056828562468666, + "language_loss": 0.82270902, + "learning_rate": 7.575231145019196e-05, + "loss": 0.83333695, + "num_input_tokens_seen": 357473360, + "router_z_loss_mlp": 0.31884766, + "step": 4303, + "time_per_iteration": 3.2142274379730225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064009, + "balance_loss_mlp": 1.03122663, + "epoch": 0.8280107733743748, + "flos": 594255536640.0, + "grad_norm": 0.05242827706405345, + "language_loss": 0.77579129, + "learning_rate": 7.558752475439134e-05, + "loss": 0.78643137, + "num_input_tokens_seen": 357548432, + "router_z_loss_mlp": 0.32788086, + "step": 4304, + "time_per_iteration": 2.814234972000122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061162, + "balance_loss_mlp": 1.02957141, + "epoch": 0.8282031550596384, + "flos": 768253994496.0, + "grad_norm": 0.059346298032511166, + "language_loss": 0.84176564, + "learning_rate": 7.542290283012653e-05, + "loss": 0.85237724, + "num_input_tokens_seen": 357625968, + "router_z_loss_mlp": 0.31567383, + "step": 4305, + "time_per_iteration": 3.011970281600952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062221, + "balance_loss_mlp": 1.0302968, + "epoch": 0.8283955367449019, + "flos": 695775481344.0, + "grad_norm": 0.050284408342684474, + "language_loss": 0.77980834, + "learning_rate": 7.525844574130947e-05, + "loss": 0.79043055, + "num_input_tokens_seen": 357705824, + "router_z_loss_mlp": 0.3190918, + "step": 4306, + "time_per_iteration": 2.8967409133911133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062089, + "balance_loss_mlp": 1.0303793, + "epoch": 0.8285879184301654, + "flos": 660304452096.0, + "grad_norm": 0.051650533406218724, + "language_loss": 0.82582647, + "learning_rate": 7.509415355178806e-05, + "loss": 0.83644736, + "num_input_tokens_seen": 357787040, + "router_z_loss_mlp": 0.31689453, + "step": 4307, + "time_per_iteration": 2.919273853302002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060613, + "balance_loss_mlp": 1.02835536, + "epoch": 0.828780300115429, + "flos": 558444063744.0, + "grad_norm": 0.06767226386788444, + "language_loss": 0.78127337, + "learning_rate": 7.493002632534618e-05, + "loss": 0.79187953, + "num_input_tokens_seen": 357856960, + "router_z_loss_mlp": 0.32250977, + "step": 4308, + "time_per_iteration": 2.6678566932678223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063076, + "balance_loss_mlp": 1.03141367, + "epoch": 0.8289726818006926, + "flos": 830613132288.0, + "grad_norm": 0.05551844794996671, + "language_loss": 0.81721139, + "learning_rate": 7.476606412570352e-05, + "loss": 0.82784212, + "num_input_tokens_seen": 357937760, + "router_z_loss_mlp": 0.31640625, + "step": 4309, + "time_per_iteration": 3.086724042892456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063148, + "balance_loss_mlp": 1.03050852, + "epoch": 0.8291650634859561, + "flos": 731974040064.0, + "grad_norm": 0.06581932804634757, + "language_loss": 0.80861235, + "learning_rate": 7.460226701651624e-05, + "loss": 0.81924385, + "num_input_tokens_seen": 358012480, + "router_z_loss_mlp": 0.32641602, + "step": 4310, + "time_per_iteration": 2.8933186531066895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067077, + "balance_loss_mlp": 1.03384113, + "epoch": 0.8293574451712197, + "flos": 860521715712.0, + "grad_norm": 0.04893141587643439, + "language_loss": 0.81319165, + "learning_rate": 7.443863506137566e-05, + "loss": 0.82386243, + "num_input_tokens_seen": 358100720, + "router_z_loss_mlp": 0.33251953, + "step": 4311, + "time_per_iteration": 3.168560743331909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064421, + "balance_loss_mlp": 1.03168607, + "epoch": 0.8295498268564833, + "flos": 494874358272.0, + "grad_norm": 0.0449610760938366, + "language_loss": 0.81700766, + "learning_rate": 7.427516832380948e-05, + "loss": 0.82765186, + "num_input_tokens_seen": 358180496, + "router_z_loss_mlp": 0.32739258, + "step": 4312, + "time_per_iteration": 2.8094916343688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060381, + "balance_loss_mlp": 1.02867162, + "epoch": 0.8297422085417469, + "flos": 554176553472.0, + "grad_norm": 0.04659314008447996, + "language_loss": 0.777403, + "learning_rate": 7.4111866867281e-05, + "loss": 0.78800684, + "num_input_tokens_seen": 358261104, + "router_z_loss_mlp": 0.31689453, + "step": 4313, + "time_per_iteration": 2.776169538497925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060377, + "balance_loss_mlp": 1.02849996, + "epoch": 0.8299345902270104, + "flos": 1247001523200.0, + "grad_norm": 0.04777485610881539, + "language_loss": 0.77525687, + "learning_rate": 7.39487307551896e-05, + "loss": 0.78586066, + "num_input_tokens_seen": 358356368, + "router_z_loss_mlp": 0.31860352, + "step": 4314, + "time_per_iteration": 3.6645400524139404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063672, + "balance_loss_mlp": 1.03217673, + "epoch": 0.8301269719122739, + "flos": 584974199808.0, + "grad_norm": 0.056657764932407616, + "language_loss": 0.83212584, + "learning_rate": 7.378576005087034e-05, + "loss": 0.84276259, + "num_input_tokens_seen": 358429104, + "router_z_loss_mlp": 0.31469727, + "step": 4315, + "time_per_iteration": 2.713848352432251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060295, + "balance_loss_mlp": 1.02834654, + "epoch": 0.8303193535975375, + "flos": 509472239616.0, + "grad_norm": 0.057881745487426015, + "language_loss": 0.84784532, + "learning_rate": 7.362295481759412e-05, + "loss": 0.85844827, + "num_input_tokens_seen": 358501344, + "router_z_loss_mlp": 0.31933594, + "step": 4316, + "time_per_iteration": 2.6434786319732666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106228, + "balance_loss_mlp": 1.03061819, + "epoch": 0.8305117352828011, + "flos": 580375010304.0, + "grad_norm": 0.06045853162415408, + "language_loss": 0.83559334, + "learning_rate": 7.346031511856722e-05, + "loss": 0.84621614, + "num_input_tokens_seen": 358575584, + "router_z_loss_mlp": 0.31640625, + "step": 4317, + "time_per_iteration": 2.6957998275756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060074, + "balance_loss_mlp": 1.02750635, + "epoch": 0.8307041169680647, + "flos": 481372153344.0, + "grad_norm": 0.047579165766144, + "language_loss": 0.78932106, + "learning_rate": 7.329784101693232e-05, + "loss": 0.79992181, + "num_input_tokens_seen": 358644304, + "router_z_loss_mlp": 0.32568359, + "step": 4318, + "time_per_iteration": 2.626657724380493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060149, + "balance_loss_mlp": 1.0276053, + "epoch": 0.8308964986533282, + "flos": 624319861248.0, + "grad_norm": 0.06476981139443477, + "language_loss": 0.82870758, + "learning_rate": 7.313553257576727e-05, + "loss": 0.8393091, + "num_input_tokens_seen": 358712384, + "router_z_loss_mlp": 0.32543945, + "step": 4319, + "time_per_iteration": 2.7071280479431152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059715, + "balance_loss_mlp": 1.02738571, + "epoch": 0.8310888803385917, + "flos": 826974226944.0, + "grad_norm": 0.05413916081766935, + "language_loss": 0.78281611, + "learning_rate": 7.297338985808589e-05, + "loss": 0.79341328, + "num_input_tokens_seen": 358789264, + "router_z_loss_mlp": 0.32324219, + "step": 4320, + "time_per_iteration": 2.990934133529663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106001, + "balance_loss_mlp": 1.02813339, + "epoch": 0.8312812620238553, + "flos": 583443537408.0, + "grad_norm": 0.04487229329770065, + "language_loss": 0.8192122, + "learning_rate": 7.281141292683746e-05, + "loss": 0.82981229, + "num_input_tokens_seen": 358868976, + "router_z_loss_mlp": 0.31860352, + "step": 4321, + "time_per_iteration": 2.77132248878479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060374, + "balance_loss_mlp": 1.02818751, + "epoch": 0.8314736437091189, + "flos": 1115165560320.0, + "grad_norm": 0.05537156492885857, + "language_loss": 0.7462157, + "learning_rate": 7.26496018449071e-05, + "loss": 0.75681943, + "num_input_tokens_seen": 358953600, + "router_z_loss_mlp": 0.32177734, + "step": 4322, + "time_per_iteration": 3.414076328277588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060533, + "balance_loss_mlp": 1.0283463, + "epoch": 0.8316660253943825, + "flos": 517295697408.0, + "grad_norm": 0.051649264825651166, + "language_loss": 0.81687033, + "learning_rate": 7.248795667511543e-05, + "loss": 0.82747567, + "num_input_tokens_seen": 359028768, + "router_z_loss_mlp": 0.32177734, + "step": 4323, + "time_per_iteration": 2.763958692550659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061491, + "balance_loss_mlp": 1.02944708, + "epoch": 0.831858407079646, + "flos": 794989334016.0, + "grad_norm": 0.059163760563895655, + "language_loss": 0.77817202, + "learning_rate": 7.232647748021864e-05, + "loss": 0.78878695, + "num_input_tokens_seen": 359116208, + "router_z_loss_mlp": 0.3203125, + "step": 4324, + "time_per_iteration": 2.988997459411621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106194, + "balance_loss_mlp": 1.02980113, + "epoch": 0.8320507887649096, + "flos": 549699609600.0, + "grad_norm": 0.05611218346767701, + "language_loss": 0.83145595, + "learning_rate": 7.216516432290843e-05, + "loss": 0.84207541, + "num_input_tokens_seen": 359189552, + "router_z_loss_mlp": 0.32128906, + "step": 4325, + "time_per_iteration": 2.6479220390319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061886, + "balance_loss_mlp": 1.03029585, + "epoch": 0.8322431704501732, + "flos": 479160603648.0, + "grad_norm": 0.06615664891911413, + "language_loss": 0.81790996, + "learning_rate": 7.20040172658123e-05, + "loss": 0.82852876, + "num_input_tokens_seen": 359253008, + "router_z_loss_mlp": 0.31567383, + "step": 4326, + "time_per_iteration": 2.569667339324951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060893, + "balance_loss_mlp": 1.029374, + "epoch": 0.8324355521354367, + "flos": 572157264384.0, + "grad_norm": 0.05577163656635302, + "language_loss": 0.85195124, + "learning_rate": 7.184303637149308e-05, + "loss": 0.86256015, + "num_input_tokens_seen": 359326368, + "router_z_loss_mlp": 0.31494141, + "step": 4327, + "time_per_iteration": 2.7528669834136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059731, + "balance_loss_mlp": 1.02763987, + "epoch": 0.8326279338207002, + "flos": 503208557568.0, + "grad_norm": 0.044215822441669876, + "language_loss": 0.82180458, + "learning_rate": 7.168222170244888e-05, + "loss": 0.83240187, + "num_input_tokens_seen": 359394192, + "router_z_loss_mlp": 0.32080078, + "step": 4328, + "time_per_iteration": 2.6244540214538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061835, + "balance_loss_mlp": 1.02972031, + "epoch": 0.8328203155059638, + "flos": 605442885120.0, + "grad_norm": 0.054206971603661426, + "language_loss": 0.81084836, + "learning_rate": 7.152157332111364e-05, + "loss": 0.82146668, + "num_input_tokens_seen": 359476016, + "router_z_loss_mlp": 0.32104492, + "step": 4329, + "time_per_iteration": 2.9348015785217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060654, + "balance_loss_mlp": 1.02818131, + "epoch": 0.8330126971912274, + "flos": 697469087232.0, + "grad_norm": 0.04560103027765181, + "language_loss": 0.85918784, + "learning_rate": 7.136109128985663e-05, + "loss": 0.86979437, + "num_input_tokens_seen": 359554048, + "router_z_loss_mlp": 0.32470703, + "step": 4330, + "time_per_iteration": 2.9106035232543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062656, + "balance_loss_mlp": 1.03032613, + "epoch": 0.833205078876491, + "flos": 493799182848.0, + "grad_norm": 0.05705382865518944, + "language_loss": 0.86475688, + "learning_rate": 7.120077567098249e-05, + "loss": 0.8753835, + "num_input_tokens_seen": 359621440, + "router_z_loss_mlp": 0.32324219, + "step": 4331, + "time_per_iteration": 2.5757062435150146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064064, + "balance_loss_mlp": 1.03237844, + "epoch": 0.8333974605617546, + "flos": 482568164352.0, + "grad_norm": 0.06444033025960733, + "language_loss": 0.82880324, + "learning_rate": 7.104062652673115e-05, + "loss": 0.83944392, + "num_input_tokens_seen": 359690320, + "router_z_loss_mlp": 0.31665039, + "step": 4332, + "time_per_iteration": 2.6238632202148438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060115, + "balance_loss_mlp": 1.02788115, + "epoch": 0.833589842247018, + "flos": 686517465600.0, + "grad_norm": 0.059078619019291526, + "language_loss": 0.82772213, + "learning_rate": 7.088064391927818e-05, + "loss": 0.8383233, + "num_input_tokens_seen": 359759888, + "router_z_loss_mlp": 0.32226562, + "step": 4333, + "time_per_iteration": 2.832642078399658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061254, + "balance_loss_mlp": 1.02868593, + "epoch": 0.8337822239322816, + "flos": 881377486848.0, + "grad_norm": 0.05463560677088328, + "language_loss": 0.82398927, + "learning_rate": 7.072082791073419e-05, + "loss": 0.83460188, + "num_input_tokens_seen": 359836544, + "router_z_loss_mlp": 0.32568359, + "step": 4334, + "time_per_iteration": 3.058436632156372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062782, + "balance_loss_mlp": 1.03023815, + "epoch": 0.8339746056175452, + "flos": 496940493312.0, + "grad_norm": 0.05988457548558227, + "language_loss": 0.82327098, + "learning_rate": 7.056117856314531e-05, + "loss": 0.83389878, + "num_input_tokens_seen": 359903024, + "router_z_loss_mlp": 0.32543945, + "step": 4335, + "time_per_iteration": 2.6513023376464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059506, + "balance_loss_mlp": 1.02810621, + "epoch": 0.8341669873028088, + "flos": 510244849152.0, + "grad_norm": 0.06293164805467606, + "language_loss": 0.85905898, + "learning_rate": 7.040169593849289e-05, + "loss": 0.869654, + "num_input_tokens_seen": 359971200, + "router_z_loss_mlp": 0.3137207, + "step": 4336, + "time_per_iteration": 2.5953714847564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060837, + "balance_loss_mlp": 1.02924645, + "epoch": 0.8343593689880723, + "flos": 692017302528.0, + "grad_norm": 0.05109928618703078, + "language_loss": 0.84164715, + "learning_rate": 7.024238009869366e-05, + "loss": 0.85225552, + "num_input_tokens_seen": 360042560, + "router_z_loss_mlp": 0.31567383, + "step": 4337, + "time_per_iteration": 2.83786940574646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061215, + "balance_loss_mlp": 1.02909958, + "epoch": 0.8345517506733359, + "flos": 552132329472.0, + "grad_norm": 0.0511172744686772, + "language_loss": 0.78007007, + "learning_rate": 7.008323110559956e-05, + "loss": 0.79068226, + "num_input_tokens_seen": 360118048, + "router_z_loss_mlp": 0.32104492, + "step": 4338, + "time_per_iteration": 2.7188031673431396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063317, + "balance_loss_mlp": 1.03082061, + "epoch": 0.8347441323585995, + "flos": 591750033408.0, + "grad_norm": 0.060812686933994074, + "language_loss": 0.7611599, + "learning_rate": 6.992424902099754e-05, + "loss": 0.77179301, + "num_input_tokens_seen": 360192528, + "router_z_loss_mlp": 0.32495117, + "step": 4339, + "time_per_iteration": 2.7962260246276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060806, + "balance_loss_mlp": 1.02838063, + "epoch": 0.834936514043863, + "flos": 614625297408.0, + "grad_norm": 0.05105659953358199, + "language_loss": 0.84234512, + "learning_rate": 6.976543390660983e-05, + "loss": 0.85295308, + "num_input_tokens_seen": 360266880, + "router_z_loss_mlp": 0.32421875, + "step": 4340, + "time_per_iteration": 2.727919101715088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061694, + "balance_loss_mlp": 1.03050888, + "epoch": 0.8351288957291266, + "flos": 467590551552.0, + "grad_norm": 0.05876177315186617, + "language_loss": 0.79671931, + "learning_rate": 6.960678582409424e-05, + "loss": 0.80733621, + "num_input_tokens_seen": 360336336, + "router_z_loss_mlp": 0.31152344, + "step": 4341, + "time_per_iteration": 2.5811197757720947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064328, + "balance_loss_mlp": 1.03276157, + "epoch": 0.8353212774143901, + "flos": 509063394816.0, + "grad_norm": 0.0467408399393633, + "language_loss": 0.78745788, + "learning_rate": 6.944830483504328e-05, + "loss": 0.79810113, + "num_input_tokens_seen": 360409776, + "router_z_loss_mlp": 0.31542969, + "step": 4342, + "time_per_iteration": 2.6592719554901123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061639, + "balance_loss_mlp": 1.02957189, + "epoch": 0.8355136590996537, + "flos": 687477749760.0, + "grad_norm": 0.047752406673021996, + "language_loss": 0.80783325, + "learning_rate": 6.928999100098483e-05, + "loss": 0.81844962, + "num_input_tokens_seen": 360486800, + "router_z_loss_mlp": 0.32055664, + "step": 4343, + "time_per_iteration": 2.8339645862579346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060012, + "balance_loss_mlp": 1.0284456, + "epoch": 0.8357060407849173, + "flos": 984019249152.0, + "grad_norm": 0.06155280246036442, + "language_loss": 0.83912945, + "learning_rate": 6.913184438338138e-05, + "loss": 0.84972966, + "num_input_tokens_seen": 360568624, + "router_z_loss_mlp": 0.31542969, + "step": 4344, + "time_per_iteration": 3.216474771499634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063293, + "balance_loss_mlp": 1.03132069, + "epoch": 0.8358984224701809, + "flos": 842657458176.0, + "grad_norm": 0.0551356082042311, + "language_loss": 0.84887034, + "learning_rate": 6.89738650436313e-05, + "loss": 0.85950327, + "num_input_tokens_seen": 360652384, + "router_z_loss_mlp": 0.31958008, + "step": 4345, + "time_per_iteration": 3.195633888244629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060346, + "balance_loss_mlp": 1.02849305, + "epoch": 0.8360908041554445, + "flos": 625945065984.0, + "grad_norm": 0.047281454191363835, + "language_loss": 0.81882936, + "learning_rate": 6.881605304306748e-05, + "loss": 0.82943279, + "num_input_tokens_seen": 360723200, + "router_z_loss_mlp": 0.31835938, + "step": 4346, + "time_per_iteration": 2.7578022480010986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061434, + "balance_loss_mlp": 1.02884197, + "epoch": 0.8362831858407079, + "flos": 575781613056.0, + "grad_norm": 0.047125075234917546, + "language_loss": 0.84707719, + "learning_rate": 6.865840844295796e-05, + "loss": 0.85769153, + "num_input_tokens_seen": 360798240, + "router_z_loss_mlp": 0.32592773, + "step": 4347, + "time_per_iteration": 2.7195725440979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063205, + "balance_loss_mlp": 1.03120947, + "epoch": 0.8364755675259715, + "flos": 833434348032.0, + "grad_norm": 0.05954085482957289, + "language_loss": 0.80601609, + "learning_rate": 6.850093130450569e-05, + "loss": 0.81664807, + "num_input_tokens_seen": 360873552, + "router_z_loss_mlp": 0.31982422, + "step": 4348, + "time_per_iteration": 3.0620546340942383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061293, + "balance_loss_mlp": 1.02963078, + "epoch": 0.8366679492112351, + "flos": 582211210752.0, + "grad_norm": 0.0624278254364128, + "language_loss": 0.86158174, + "learning_rate": 6.834362168884912e-05, + "loss": 0.87219471, + "num_input_tokens_seen": 360940800, + "router_z_loss_mlp": 0.31665039, + "step": 4349, + "time_per_iteration": 2.67112135887146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066667, + "balance_loss_mlp": 1.03438473, + "epoch": 0.8368603308964987, + "flos": 611434524672.0, + "grad_norm": 0.06986465121037809, + "language_loss": 0.87439007, + "learning_rate": 6.818647965706076e-05, + "loss": 0.88505673, + "num_input_tokens_seen": 361014368, + "router_z_loss_mlp": 0.32275391, + "step": 4350, + "time_per_iteration": 2.751300573348999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061351, + "balance_loss_mlp": 1.03030932, + "epoch": 0.8370527125817622, + "flos": 507014788608.0, + "grad_norm": 0.04892721462190198, + "language_loss": 0.85628557, + "learning_rate": 6.802950527014884e-05, + "loss": 0.86689907, + "num_input_tokens_seen": 361087184, + "router_z_loss_mlp": 0.31005859, + "step": 4351, + "time_per_iteration": 2.7105777263641357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062361, + "balance_loss_mlp": 1.03017473, + "epoch": 0.8372450942670258, + "flos": 770621285376.0, + "grad_norm": 0.06534150847957279, + "language_loss": 0.82583368, + "learning_rate": 6.787269858905603e-05, + "loss": 0.83645725, + "num_input_tokens_seen": 361160720, + "router_z_loss_mlp": 0.32177734, + "step": 4352, + "time_per_iteration": 2.9064080715179443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065324, + "balance_loss_mlp": 1.03306603, + "epoch": 0.8374374759522893, + "flos": 579005881344.0, + "grad_norm": 0.05598661276707463, + "language_loss": 0.84989685, + "learning_rate": 6.771605967466033e-05, + "loss": 0.86055005, + "num_input_tokens_seen": 361234432, + "router_z_loss_mlp": 0.32250977, + "step": 4353, + "time_per_iteration": 2.6632039546966553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061526, + "balance_loss_mlp": 1.0297451, + "epoch": 0.8376298576375529, + "flos": 787781334528.0, + "grad_norm": 0.06243401457394196, + "language_loss": 0.82518673, + "learning_rate": 6.755958858777434e-05, + "loss": 0.83580196, + "num_input_tokens_seen": 361309376, + "router_z_loss_mlp": 0.31762695, + "step": 4354, + "time_per_iteration": 2.9677281379699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010626, + "balance_loss_mlp": 1.03024638, + "epoch": 0.8378222393228165, + "flos": 577337006592.0, + "grad_norm": 0.10666459553025696, + "language_loss": 0.80665678, + "learning_rate": 6.74032853891452e-05, + "loss": 0.8172828, + "num_input_tokens_seen": 361386768, + "router_z_loss_mlp": 0.32348633, + "step": 4355, + "time_per_iteration": 2.7135212421417236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063218, + "balance_loss_mlp": 1.03084075, + "epoch": 0.83801462100808, + "flos": 480618482688.0, + "grad_norm": 0.05787052388359443, + "language_loss": 0.81662172, + "learning_rate": 6.724715013945548e-05, + "loss": 0.82725382, + "num_input_tokens_seen": 361456704, + "router_z_loss_mlp": 0.32373047, + "step": 4356, + "time_per_iteration": 2.58859920501709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061947, + "balance_loss_mlp": 1.03021395, + "epoch": 0.8382070026933436, + "flos": 550523091456.0, + "grad_norm": 0.060545576710462894, + "language_loss": 0.89191318, + "learning_rate": 6.709118289932226e-05, + "loss": 0.90253264, + "num_input_tokens_seen": 361533648, + "router_z_loss_mlp": 0.31713867, + "step": 4357, + "time_per_iteration": 2.770278215408325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063858, + "balance_loss_mlp": 1.032125, + "epoch": 0.8383993843786072, + "flos": 624655922688.0, + "grad_norm": 0.06062312450424789, + "language_loss": 0.81920969, + "learning_rate": 6.693538372929725e-05, + "loss": 0.82984829, + "num_input_tokens_seen": 361614256, + "router_z_loss_mlp": 0.31713867, + "step": 4358, + "time_per_iteration": 2.9120824337005615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064332, + "balance_loss_mlp": 1.031955, + "epoch": 0.8385917660638708, + "flos": 490928504832.0, + "grad_norm": 0.050165446506244216, + "language_loss": 0.86263275, + "learning_rate": 6.677975268986719e-05, + "loss": 0.87327605, + "num_input_tokens_seen": 361679008, + "router_z_loss_mlp": 0.32373047, + "step": 4359, + "time_per_iteration": 2.5493242740631104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064879, + "balance_loss_mlp": 1.0325731, + "epoch": 0.8387841477491342, + "flos": 466659380736.0, + "grad_norm": 0.05193023362700978, + "language_loss": 0.87059301, + "learning_rate": 6.662428984145336e-05, + "loss": 0.8812418, + "num_input_tokens_seen": 361747600, + "router_z_loss_mlp": 0.32299805, + "step": 4360, + "time_per_iteration": 2.61664080619812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006292, + "balance_loss_mlp": 0.99775666, + "epoch": 0.8389765294343978, + "flos": 1563339128832.0, + "grad_norm": 0.00919489122759599, + "language_loss": 0.71780187, + "learning_rate": 6.646899524441175e-05, + "loss": 0.7278648, + "num_input_tokens_seen": 361983104, + "router_z_loss_mlp": 0.08544922, + "step": 4361, + "time_per_iteration": 5.009884357452393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060823, + "balance_loss_mlp": 1.02956581, + "epoch": 0.8391689111196614, + "flos": 601849059840.0, + "grad_norm": 0.04367937475787612, + "language_loss": 0.83063507, + "learning_rate": 6.631386895903308e-05, + "loss": 0.84124339, + "num_input_tokens_seen": 362065824, + "router_z_loss_mlp": 0.31225586, + "step": 4362, + "time_per_iteration": 2.8306806087493896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106322, + "balance_loss_mlp": 1.0315814, + "epoch": 0.839361292804925, + "flos": 442818040320.0, + "grad_norm": 0.052955552359322186, + "language_loss": 0.79883057, + "learning_rate": 6.615891104554261e-05, + "loss": 0.80946279, + "num_input_tokens_seen": 362128240, + "router_z_loss_mlp": 0.31616211, + "step": 4363, + "time_per_iteration": 2.479904890060425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062818, + "balance_loss_mlp": 1.02994013, + "epoch": 0.8395536744901886, + "flos": 593885979648.0, + "grad_norm": 0.04635728781901914, + "language_loss": 0.82487506, + "learning_rate": 6.600412156410057e-05, + "loss": 0.83550322, + "num_input_tokens_seen": 362198256, + "router_z_loss_mlp": 0.32885742, + "step": 4364, + "time_per_iteration": 2.7378604412078857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059223, + "balance_loss_mlp": 1.02803802, + "epoch": 0.8397460561754521, + "flos": 889462812672.0, + "grad_norm": 0.058311818484936, + "language_loss": 0.84599465, + "learning_rate": 6.58495005748016e-05, + "loss": 0.85658687, + "num_input_tokens_seen": 362279792, + "router_z_loss_mlp": 0.31152344, + "step": 4365, + "time_per_iteration": 3.1387250423431396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066412, + "balance_loss_mlp": 1.0339396, + "epoch": 0.8399384378607156, + "flos": 553239590400.0, + "grad_norm": 0.07373316547529772, + "language_loss": 0.88838422, + "learning_rate": 6.569504813767463e-05, + "loss": 0.89904833, + "num_input_tokens_seen": 362351712, + "router_z_loss_mlp": 0.32470703, + "step": 4366, + "time_per_iteration": 2.594947576522827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106208, + "balance_loss_mlp": 1.02955997, + "epoch": 0.8401308195459792, + "flos": 518664826368.0, + "grad_norm": 0.04997889714704866, + "language_loss": 0.83415538, + "learning_rate": 6.554076431268341e-05, + "loss": 0.84477615, + "num_input_tokens_seen": 362423424, + "router_z_loss_mlp": 0.32519531, + "step": 4367, + "time_per_iteration": 2.6347951889038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063314, + "balance_loss_mlp": 1.0316515, + "epoch": 0.8403232012312428, + "flos": 684593925120.0, + "grad_norm": 0.058557481945258026, + "language_loss": 0.81210721, + "learning_rate": 6.538664915972648e-05, + "loss": 0.82274044, + "num_input_tokens_seen": 362514704, + "router_z_loss_mlp": 0.31640625, + "step": 4368, + "time_per_iteration": 3.0035693645477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064984, + "balance_loss_mlp": 1.03313088, + "epoch": 0.8405155829165063, + "flos": 577424346624.0, + "grad_norm": 0.05437483826569731, + "language_loss": 0.77127528, + "learning_rate": 6.523270273863652e-05, + "loss": 0.78192508, + "num_input_tokens_seen": 362581296, + "router_z_loss_mlp": 0.31835938, + "step": 4369, + "time_per_iteration": 2.682255983352661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065622, + "balance_loss_mlp": 1.03350711, + "epoch": 0.8407079646017699, + "flos": 456393028608.0, + "grad_norm": 0.059338918722140754, + "language_loss": 0.87703532, + "learning_rate": 6.507892510918079e-05, + "loss": 0.88769156, + "num_input_tokens_seen": 362648304, + "router_z_loss_mlp": 0.32104492, + "step": 4370, + "time_per_iteration": 2.5616416931152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062128, + "balance_loss_mlp": 1.03032303, + "epoch": 0.8409003462870335, + "flos": 534647803392.0, + "grad_norm": 0.06484553532482988, + "language_loss": 0.81577432, + "learning_rate": 6.492531633106114e-05, + "loss": 0.82639563, + "num_input_tokens_seen": 362721264, + "router_z_loss_mlp": 0.31787109, + "step": 4371, + "time_per_iteration": 2.758342981338501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063458, + "balance_loss_mlp": 1.03165269, + "epoch": 0.8410927279722971, + "flos": 556475443200.0, + "grad_norm": 0.060016335878015956, + "language_loss": 0.77717382, + "learning_rate": 6.477187646391374e-05, + "loss": 0.78780836, + "num_input_tokens_seen": 362795312, + "router_z_loss_mlp": 0.31787109, + "step": 4372, + "time_per_iteration": 2.6928319931030273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006701, + "balance_loss_mlp": 0.99816585, + "epoch": 0.8412851096575606, + "flos": 1548963979776.0, + "grad_norm": 0.007825134945211695, + "language_loss": 0.77679121, + "learning_rate": 6.461860556730925e-05, + "loss": 0.7868582, + "num_input_tokens_seen": 363026272, + "router_z_loss_mlp": 0.08544922, + "step": 4373, + "time_per_iteration": 4.882466793060303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062183, + "balance_loss_mlp": 1.03075981, + "epoch": 0.8414774913428241, + "flos": 551777329152.0, + "grad_norm": 0.0528843207904054, + "language_loss": 0.78680658, + "learning_rate": 6.446550370075271e-05, + "loss": 0.79742843, + "num_input_tokens_seen": 363098384, + "router_z_loss_mlp": 0.31396484, + "step": 4374, + "time_per_iteration": 2.6880640983581543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066012, + "balance_loss_mlp": 1.03404009, + "epoch": 0.8416698730280877, + "flos": 572752373760.0, + "grad_norm": 0.0786696469695305, + "language_loss": 0.77104962, + "learning_rate": 6.431257092368336e-05, + "loss": 0.78170967, + "num_input_tokens_seen": 363170960, + "router_z_loss_mlp": 0.31958008, + "step": 4375, + "time_per_iteration": 2.669539213180542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061463, + "balance_loss_mlp": 1.02946734, + "epoch": 0.8418622547133513, + "flos": 758405251584.0, + "grad_norm": 0.05603873808243246, + "language_loss": 0.79780394, + "learning_rate": 6.415980729547543e-05, + "loss": 0.80841863, + "num_input_tokens_seen": 363242000, + "router_z_loss_mlp": 0.31982422, + "step": 4376, + "time_per_iteration": 2.902561664581299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064279, + "balance_loss_mlp": 1.03223574, + "epoch": 0.8420546363986149, + "flos": 1073717448192.0, + "grad_norm": 0.08072292873132886, + "language_loss": 0.72504145, + "learning_rate": 6.40072128754366e-05, + "loss": 0.73568422, + "num_input_tokens_seen": 363340288, + "router_z_loss_mlp": 0.3203125, + "step": 4377, + "time_per_iteration": 3.4048268795013428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064955, + "balance_loss_mlp": 1.03353119, + "epoch": 0.8422470180838784, + "flos": 525632716800.0, + "grad_norm": 0.05128837603691623, + "language_loss": 0.8274287, + "learning_rate": 6.385478772280933e-05, + "loss": 0.83807814, + "num_input_tokens_seen": 363416208, + "router_z_loss_mlp": 0.31396484, + "step": 4378, + "time_per_iteration": 2.7541863918304443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061165, + "balance_loss_mlp": 1.02928829, + "epoch": 0.842439399769142, + "flos": 600552714240.0, + "grad_norm": 0.07361287537672946, + "language_loss": 0.82019341, + "learning_rate": 6.370253189677038e-05, + "loss": 0.83080506, + "num_input_tokens_seen": 363492864, + "router_z_loss_mlp": 0.31860352, + "step": 4379, + "time_per_iteration": 2.717729330062866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062548, + "balance_loss_mlp": 1.03007579, + "epoch": 0.8426317814544055, + "flos": 551935890432.0, + "grad_norm": 0.05730022029420674, + "language_loss": 0.86337304, + "learning_rate": 6.355044545643073e-05, + "loss": 0.87399852, + "num_input_tokens_seen": 363572000, + "router_z_loss_mlp": 0.32470703, + "step": 4380, + "time_per_iteration": 2.8673453330993652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059699, + "balance_loss_mlp": 1.02810872, + "epoch": 0.8428241631396691, + "flos": 678531064320.0, + "grad_norm": 0.05645772213934178, + "language_loss": 0.77814853, + "learning_rate": 6.33985284608356e-05, + "loss": 0.78874558, + "num_input_tokens_seen": 363646480, + "router_z_loss_mlp": 0.31567383, + "step": 4381, + "time_per_iteration": 2.797013759613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066522, + "balance_loss_mlp": 1.03512263, + "epoch": 0.8430165448249327, + "flos": 753365131776.0, + "grad_norm": 0.06324404810028335, + "language_loss": 0.79739416, + "learning_rate": 6.324678096896435e-05, + "loss": 0.80805933, + "num_input_tokens_seen": 363737552, + "router_z_loss_mlp": 0.3137207, + "step": 4382, + "time_per_iteration": 3.068904399871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061548, + "balance_loss_mlp": 1.03002882, + "epoch": 0.8432089265101962, + "flos": 698817867264.0, + "grad_norm": 0.052650923658821146, + "language_loss": 0.80755234, + "learning_rate": 6.30952030397306e-05, + "loss": 0.81816781, + "num_input_tokens_seen": 363816016, + "router_z_loss_mlp": 0.31518555, + "step": 4383, + "time_per_iteration": 2.910621166229248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064072, + "balance_loss_mlp": 1.03200483, + "epoch": 0.8434013081954598, + "flos": 485513035776.0, + "grad_norm": 0.0532202077177964, + "language_loss": 0.8463192, + "learning_rate": 6.294379473198208e-05, + "loss": 0.85695994, + "num_input_tokens_seen": 363888192, + "router_z_loss_mlp": 0.32055664, + "step": 4384, + "time_per_iteration": 2.6428587436676025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061742, + "balance_loss_mlp": 1.03017521, + "epoch": 0.8435936898807234, + "flos": 520372988928.0, + "grad_norm": 0.05180994761233764, + "language_loss": 0.85464537, + "learning_rate": 6.279255610450068e-05, + "loss": 0.86526275, + "num_input_tokens_seen": 363953904, + "router_z_loss_mlp": 0.31542969, + "step": 4385, + "time_per_iteration": 2.6025028228759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062937, + "balance_loss_mlp": 1.03146529, + "epoch": 0.843786071565987, + "flos": 785604690432.0, + "grad_norm": 0.05052030046697327, + "language_loss": 0.80340213, + "learning_rate": 6.264148721600254e-05, + "loss": 0.81403148, + "num_input_tokens_seen": 364031552, + "router_z_loss_mlp": 0.31445312, + "step": 4386, + "time_per_iteration": 2.9918956756591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003602, + "balance_loss_mlp": 0.99516225, + "epoch": 0.8439784532512504, + "flos": 1445472442368.0, + "grad_norm": 0.005014150757635798, + "language_loss": 0.75836509, + "learning_rate": 6.24905881251378e-05, + "loss": 0.76840121, + "num_input_tokens_seen": 364256480, + "router_z_loss_mlp": 0.08447266, + "step": 4387, + "time_per_iteration": 4.906704664230347 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062352, + "balance_loss_mlp": 1.03026116, + "epoch": 0.844170834936514, + "flos": 708384393216.0, + "grad_norm": 0.0608021946942925, + "language_loss": 0.82278877, + "learning_rate": 6.23398588904906e-05, + "loss": 0.83341229, + "num_input_tokens_seen": 364329696, + "router_z_loss_mlp": 0.32080078, + "step": 4388, + "time_per_iteration": 2.8366615772247314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062411, + "balance_loss_mlp": 1.03039169, + "epoch": 0.8443632166217776, + "flos": 483183622656.0, + "grad_norm": 0.06339245452093885, + "language_loss": 0.79611492, + "learning_rate": 6.218929957057922e-05, + "loss": 0.80673903, + "num_input_tokens_seen": 364400944, + "router_z_loss_mlp": 0.32006836, + "step": 4389, + "time_per_iteration": 2.6750078201293945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064765, + "balance_loss_mlp": 1.03262651, + "epoch": 0.8445555983070412, + "flos": 678388469760.0, + "grad_norm": 0.10070557471224953, + "language_loss": 0.80328929, + "learning_rate": 6.2038910223856e-05, + "loss": 0.81393689, + "num_input_tokens_seen": 364475744, + "router_z_loss_mlp": 0.32128906, + "step": 4390, + "time_per_iteration": 2.855630874633789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062723, + "balance_loss_mlp": 1.03094149, + "epoch": 0.8447479799923048, + "flos": 741143305728.0, + "grad_norm": 0.055868322560849205, + "language_loss": 0.74313754, + "learning_rate": 6.18886909087073e-05, + "loss": 0.75376475, + "num_input_tokens_seen": 364557248, + "router_z_loss_mlp": 0.31762695, + "step": 4391, + "time_per_iteration": 2.9666664600372314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061502, + "balance_loss_mlp": 1.02984023, + "epoch": 0.8449403616775683, + "flos": 952897125888.0, + "grad_norm": 0.08754318552441484, + "language_loss": 0.80220729, + "learning_rate": 6.173864168345344e-05, + "loss": 0.81282234, + "num_input_tokens_seen": 364647856, + "router_z_loss_mlp": 0.31640625, + "step": 4392, + "time_per_iteration": 3.2447478771209717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106655, + "balance_loss_mlp": 1.03405356, + "epoch": 0.8451327433628318, + "flos": 657054042624.0, + "grad_norm": 0.061100470877336555, + "language_loss": 0.72091293, + "learning_rate": 6.158876260634871e-05, + "loss": 0.73157841, + "num_input_tokens_seen": 364728848, + "router_z_loss_mlp": 0.32495117, + "step": 4393, + "time_per_iteration": 2.870314598083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062012, + "balance_loss_mlp": 1.03011155, + "epoch": 0.8453251250480954, + "flos": 445880775168.0, + "grad_norm": 0.056584995707747415, + "language_loss": 0.83570069, + "learning_rate": 6.143905373558112e-05, + "loss": 0.84632081, + "num_input_tokens_seen": 364794032, + "router_z_loss_mlp": 0.31884766, + "step": 4394, + "time_per_iteration": 2.5169620513916016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065302, + "balance_loss_mlp": 1.0335449, + "epoch": 0.845517506733359, + "flos": 542491610112.0, + "grad_norm": 0.06893065847374383, + "language_loss": 0.70728701, + "learning_rate": 6.128951512927305e-05, + "loss": 0.71794009, + "num_input_tokens_seen": 364868624, + "router_z_loss_mlp": 0.31738281, + "step": 4395, + "time_per_iteration": 2.6561121940612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061554, + "balance_loss_mlp": 1.02962959, + "epoch": 0.8457098884186226, + "flos": 502175490048.0, + "grad_norm": 0.0520725454143225, + "language_loss": 0.84400153, + "learning_rate": 6.114014684548046e-05, + "loss": 0.854617, + "num_input_tokens_seen": 364938208, + "router_z_loss_mlp": 0.3190918, + "step": 4396, + "time_per_iteration": 2.607789993286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065204, + "balance_loss_mlp": 1.03335178, + "epoch": 0.8459022701038861, + "flos": 448643764224.0, + "grad_norm": 0.06588987467547251, + "language_loss": 0.79514521, + "learning_rate": 6.099094894219326e-05, + "loss": 0.80579728, + "num_input_tokens_seen": 365009440, + "router_z_loss_mlp": 0.31835938, + "step": 4397, + "time_per_iteration": 2.749403953552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105971, + "balance_loss_mlp": 1.02838171, + "epoch": 0.8460946517891497, + "flos": 742855850496.0, + "grad_norm": 0.053779165984043746, + "language_loss": 0.74806583, + "learning_rate": 6.0841921477335194e-05, + "loss": 0.75866288, + "num_input_tokens_seen": 365085904, + "router_z_loss_mlp": 0.31298828, + "step": 4398, + "time_per_iteration": 2.9234650135040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060698, + "balance_loss_mlp": 1.02929795, + "epoch": 0.8462870334744133, + "flos": 552939844608.0, + "grad_norm": 0.053512872285819295, + "language_loss": 0.79614019, + "learning_rate": 6.069306450876389e-05, + "loss": 0.80674708, + "num_input_tokens_seen": 365163600, + "router_z_loss_mlp": 0.3137207, + "step": 4399, + "time_per_iteration": 2.733107089996338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01000293, + "balance_loss_mlp": 0.99199617, + "epoch": 0.8464794151596768, + "flos": 1564033162752.0, + "grad_norm": 0.008637773298876451, + "language_loss": 0.81708568, + "learning_rate": 6.054437809427071e-05, + "loss": 0.82708859, + "num_input_tokens_seen": 365384528, + "router_z_loss_mlp": 0.08300781, + "step": 4400, + "time_per_iteration": 4.845052719116211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062975, + "balance_loss_mlp": 1.03066921, + "epoch": 0.8466717968449403, + "flos": 549930954240.0, + "grad_norm": 0.05555564096889626, + "language_loss": 0.79862118, + "learning_rate": 6.039586229158084e-05, + "loss": 0.80925095, + "num_input_tokens_seen": 365453760, + "router_z_loss_mlp": 0.32299805, + "step": 4401, + "time_per_iteration": 2.6397857666015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062133, + "balance_loss_mlp": 1.02954078, + "epoch": 0.8468641785302039, + "flos": 551625970176.0, + "grad_norm": 0.05665481648862394, + "language_loss": 0.84565353, + "learning_rate": 6.024751715835314e-05, + "loss": 0.85627484, + "num_input_tokens_seen": 365532416, + "router_z_loss_mlp": 0.32592773, + "step": 4402, + "time_per_iteration": 2.751540422439575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060788, + "balance_loss_mlp": 1.02891159, + "epoch": 0.8470565602154675, + "flos": 572384226816.0, + "grad_norm": 0.05542155513246256, + "language_loss": 0.8710115, + "learning_rate": 6.009934275218049e-05, + "loss": 0.88161939, + "num_input_tokens_seen": 365603776, + "router_z_loss_mlp": 0.31860352, + "step": 4403, + "time_per_iteration": 2.699852705001831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063, + "balance_loss_mlp": 1.0309329, + "epoch": 0.8472489419007311, + "flos": 472597175808.0, + "grad_norm": 0.06255636958013598, + "language_loss": 0.84244227, + "learning_rate": 5.995133913058936e-05, + "loss": 0.85307229, + "num_input_tokens_seen": 365670432, + "router_z_loss_mlp": 0.32055664, + "step": 4404, + "time_per_iteration": 2.5569348335266113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059261, + "balance_loss_mlp": 1.02776599, + "epoch": 0.8474413235859947, + "flos": 797682511872.0, + "grad_norm": 0.06074469603252205, + "language_loss": 0.79307783, + "learning_rate": 5.980350635103954e-05, + "loss": 0.80367047, + "num_input_tokens_seen": 365741584, + "router_z_loss_mlp": 0.31469727, + "step": 4405, + "time_per_iteration": 2.9716339111328125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066394, + "balance_loss_mlp": 1.03525686, + "epoch": 0.8476337052712581, + "flos": 502130409984.0, + "grad_norm": 0.05964249854785047, + "language_loss": 0.80296123, + "learning_rate": 5.9655844470924866e-05, + "loss": 0.81362522, + "num_input_tokens_seen": 365805344, + "router_z_loss_mlp": 0.31103516, + "step": 4406, + "time_per_iteration": 2.5619914531707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061086, + "balance_loss_mlp": 1.02990091, + "epoch": 0.8478260869565217, + "flos": 931586019840.0, + "grad_norm": 0.047034388507541325, + "language_loss": 0.82831132, + "learning_rate": 5.9508353547573e-05, + "loss": 0.83892226, + "num_input_tokens_seen": 365890976, + "router_z_loss_mlp": 0.31152344, + "step": 4407, + "time_per_iteration": 3.201432228088379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062926, + "balance_loss_mlp": 1.03207469, + "epoch": 0.8480184686417853, + "flos": 708502256640.0, + "grad_norm": 0.0520483514476875, + "language_loss": 0.80806863, + "learning_rate": 5.9361033638244855e-05, + "loss": 0.81869787, + "num_input_tokens_seen": 365968912, + "router_z_loss_mlp": 0.30810547, + "step": 4408, + "time_per_iteration": 2.8537440299987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061734, + "balance_loss_mlp": 1.03047752, + "epoch": 0.8482108503270489, + "flos": 614152433664.0, + "grad_norm": 0.048904900371612575, + "language_loss": 0.82296753, + "learning_rate": 5.9213884800135066e-05, + "loss": 0.8335849, + "num_input_tokens_seen": 366047680, + "router_z_loss_mlp": 0.31225586, + "step": 4409, + "time_per_iteration": 2.8240151405334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063069, + "balance_loss_mlp": 1.03200269, + "epoch": 0.8484032320123124, + "flos": 530752822272.0, + "grad_norm": 0.12602145996095604, + "language_loss": 0.82197714, + "learning_rate": 5.906690709037194e-05, + "loss": 0.83260781, + "num_input_tokens_seen": 366118720, + "router_z_loss_mlp": 0.31030273, + "step": 4410, + "time_per_iteration": 2.600715398788452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01001074, + "balance_loss_mlp": 0.99291962, + "epoch": 0.848595613697576, + "flos": 1541930508288.0, + "grad_norm": 0.0076180940143389065, + "language_loss": 0.76296914, + "learning_rate": 5.892010056601726e-05, + "loss": 0.77297986, + "num_input_tokens_seen": 366346928, + "router_z_loss_mlp": 0.08154297, + "step": 4411, + "time_per_iteration": 4.879023551940918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061447, + "balance_loss_mlp": 1.03061938, + "epoch": 0.8487879953828396, + "flos": 677025133056.0, + "grad_norm": 0.05602222185131278, + "language_loss": 0.7385751, + "learning_rate": 5.877346528406635e-05, + "loss": 0.7491895, + "num_input_tokens_seen": 366422848, + "router_z_loss_mlp": 0.30786133, + "step": 4412, + "time_per_iteration": 2.880748987197876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061373, + "balance_loss_mlp": 1.03035462, + "epoch": 0.8489803770681031, + "flos": 503425345536.0, + "grad_norm": 0.05707259676031019, + "language_loss": 0.79403526, + "learning_rate": 5.8627001301448105e-05, + "loss": 0.80464894, + "num_input_tokens_seen": 366492016, + "router_z_loss_mlp": 0.30981445, + "step": 4413, + "time_per_iteration": 2.5811662673950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010618, + "balance_loss_mlp": 1.03063893, + "epoch": 0.8491727587533667, + "flos": 562896276480.0, + "grad_norm": 0.051344751965668234, + "language_loss": 0.76542878, + "learning_rate": 5.84807086750247e-05, + "loss": 0.77604681, + "num_input_tokens_seen": 366566400, + "router_z_loss_mlp": 0.3112793, + "step": 4414, + "time_per_iteration": 2.7214043140411377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063085, + "balance_loss_mlp": 1.03044593, + "epoch": 0.8493651404386302, + "flos": 459544513536.0, + "grad_norm": 0.0639628244470696, + "language_loss": 0.77723747, + "learning_rate": 5.833458746159243e-05, + "loss": 0.78786838, + "num_input_tokens_seen": 366634016, + "router_z_loss_mlp": 0.32641602, + "step": 4415, + "time_per_iteration": 2.603907823562622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062645, + "balance_loss_mlp": 1.03086436, + "epoch": 0.8495575221238938, + "flos": 460928199168.0, + "grad_norm": 0.06700935131460924, + "language_loss": 0.81717062, + "learning_rate": 5.818863771788013e-05, + "loss": 0.82779706, + "num_input_tokens_seen": 366704384, + "router_z_loss_mlp": 0.31762695, + "step": 4416, + "time_per_iteration": 2.6823158264160156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061617, + "balance_loss_mlp": 1.03016961, + "epoch": 0.8497499038091574, + "flos": 870353081856.0, + "grad_norm": 0.05272559442866759, + "language_loss": 0.81311977, + "learning_rate": 5.8042859500550604e-05, + "loss": 0.82373595, + "num_input_tokens_seen": 366785456, + "router_z_loss_mlp": 0.31420898, + "step": 4417, + "time_per_iteration": 3.0989885330200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061164, + "balance_loss_mlp": 1.029598, + "epoch": 0.849942285494421, + "flos": 779258050560.0, + "grad_norm": 0.12053257380548967, + "language_loss": 0.78102922, + "learning_rate": 5.789725286620018e-05, + "loss": 0.79164088, + "num_input_tokens_seen": 366862848, + "router_z_loss_mlp": 0.31542969, + "step": 4418, + "time_per_iteration": 3.0168724060058594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061302, + "balance_loss_mlp": 1.0296638, + "epoch": 0.8501346671796844, + "flos": 513544720896.0, + "grad_norm": 0.05675897396855917, + "language_loss": 0.84916025, + "learning_rate": 5.775181787135819e-05, + "loss": 0.85977328, + "num_input_tokens_seen": 366934800, + "router_z_loss_mlp": 0.31616211, + "step": 4419, + "time_per_iteration": 2.651323080062866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062383, + "balance_loss_mlp": 1.0311023, + "epoch": 0.850327048864948, + "flos": 621149437440.0, + "grad_norm": 0.04589752872200537, + "language_loss": 0.83418536, + "learning_rate": 5.76065545724877e-05, + "loss": 0.84480917, + "num_input_tokens_seen": 367015152, + "router_z_loss_mlp": 0.3125, + "step": 4420, + "time_per_iteration": 2.8431475162506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061049, + "balance_loss_mlp": 1.02974463, + "epoch": 0.8505194305502116, + "flos": 773890633728.0, + "grad_norm": 0.05328835761082586, + "language_loss": 0.79577553, + "learning_rate": 5.746146302598454e-05, + "loss": 0.80638599, + "num_input_tokens_seen": 367092192, + "router_z_loss_mlp": 0.31274414, + "step": 4421, + "time_per_iteration": 3.0034854412078857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059109, + "balance_loss_mlp": 1.02761436, + "epoch": 0.8507118122354752, + "flos": 465019619328.0, + "grad_norm": 0.05394674589529038, + "language_loss": 0.8635028, + "learning_rate": 5.731654328817859e-05, + "loss": 0.87409389, + "num_input_tokens_seen": 367159744, + "router_z_loss_mlp": 0.31469727, + "step": 4422, + "time_per_iteration": 2.6123807430267334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061076, + "balance_loss_mlp": 1.02958083, + "epoch": 0.8509041939207388, + "flos": 534150208512.0, + "grad_norm": 0.05638570018802557, + "language_loss": 0.847974, + "learning_rate": 5.717179541533257e-05, + "loss": 0.85858476, + "num_input_tokens_seen": 367226384, + "router_z_loss_mlp": 0.31469727, + "step": 4423, + "time_per_iteration": 2.6444549560546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061706, + "balance_loss_mlp": 1.03002, + "epoch": 0.8510965756060023, + "flos": 583466858496.0, + "grad_norm": 0.05656669272742153, + "language_loss": 0.84546876, + "learning_rate": 5.702721946364264e-05, + "loss": 0.85608578, + "num_input_tokens_seen": 367294768, + "router_z_loss_mlp": 0.31665039, + "step": 4424, + "time_per_iteration": 2.6550204753875732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062538, + "balance_loss_mlp": 1.03092384, + "epoch": 0.8512889572912659, + "flos": 600548332032.0, + "grad_norm": 0.06195322404547727, + "language_loss": 0.77515125, + "learning_rate": 5.688281548923796e-05, + "loss": 0.78577662, + "num_input_tokens_seen": 367372368, + "router_z_loss_mlp": 0.31591797, + "step": 4425, + "time_per_iteration": 2.788872003555298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062864, + "balance_loss_mlp": 1.03124976, + "epoch": 0.8514813389765294, + "flos": 654474345984.0, + "grad_norm": 0.06031925061221975, + "language_loss": 0.78652239, + "learning_rate": 5.673858354818151e-05, + "loss": 0.79715109, + "num_input_tokens_seen": 367452656, + "router_z_loss_mlp": 0.31591797, + "step": 4426, + "time_per_iteration": 2.8395063877105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062488, + "balance_loss_mlp": 1.03132665, + "epoch": 0.851673720661793, + "flos": 429538415616.0, + "grad_norm": 0.06630539351517445, + "language_loss": 0.7778796, + "learning_rate": 5.6594523696468726e-05, + "loss": 0.78850448, + "num_input_tokens_seen": 367517808, + "router_z_loss_mlp": 0.3112793, + "step": 4427, + "time_per_iteration": 2.52323842048645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063991, + "balance_loss_mlp": 1.03256702, + "epoch": 0.8518661023470565, + "flos": 641277679104.0, + "grad_norm": 0.054901162952162144, + "language_loss": 0.79352564, + "learning_rate": 5.645063599002875e-05, + "loss": 0.80416554, + "num_input_tokens_seen": 367591728, + "router_z_loss_mlp": 0.31396484, + "step": 4428, + "time_per_iteration": 2.7660624980926514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066318, + "balance_loss_mlp": 1.03451312, + "epoch": 0.8520584840323201, + "flos": 561880737792.0, + "grad_norm": 0.05488389146689813, + "language_loss": 0.79418516, + "learning_rate": 5.630692048472363e-05, + "loss": 0.80484831, + "num_input_tokens_seen": 367664496, + "router_z_loss_mlp": 0.31787109, + "step": 4429, + "time_per_iteration": 2.6520426273345947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060507, + "balance_loss_mlp": 1.0296793, + "epoch": 0.8522508657175837, + "flos": 526793822208.0, + "grad_norm": 0.059198790333543354, + "language_loss": 0.78811502, + "learning_rate": 5.61633772363489e-05, + "loss": 0.79872012, + "num_input_tokens_seen": 367735584, + "router_z_loss_mlp": 0.30786133, + "step": 4430, + "time_per_iteration": 2.639409065246582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061431, + "balance_loss_mlp": 1.03053164, + "epoch": 0.8524432474028473, + "flos": 498875618304.0, + "grad_norm": 0.0498357465810583, + "language_loss": 0.80618191, + "learning_rate": 5.602000630063298e-05, + "loss": 0.81679624, + "num_input_tokens_seen": 367801136, + "router_z_loss_mlp": 0.30859375, + "step": 4431, + "time_per_iteration": 2.5802340507507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063948, + "balance_loss_mlp": 1.03176129, + "epoch": 0.8526356290881109, + "flos": 421089325056.0, + "grad_norm": 0.06741417888605215, + "language_loss": 0.79432404, + "learning_rate": 5.587680773323706e-05, + "loss": 0.80496353, + "num_input_tokens_seen": 367865312, + "router_z_loss_mlp": 0.32177734, + "step": 4432, + "time_per_iteration": 2.5548770427703857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064662, + "balance_loss_mlp": 1.03373873, + "epoch": 0.8528280107733743, + "flos": 507078807552.0, + "grad_norm": 0.06908943944415502, + "language_loss": 0.80579317, + "learning_rate": 5.5733781589756115e-05, + "loss": 0.81643981, + "num_input_tokens_seen": 367931104, + "router_z_loss_mlp": 0.30883789, + "step": 4433, + "time_per_iteration": 2.6177783012390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062039, + "balance_loss_mlp": 1.03111625, + "epoch": 0.8530203924586379, + "flos": 445663987200.0, + "grad_norm": 0.061652443542314195, + "language_loss": 0.82560652, + "learning_rate": 5.5590927925717684e-05, + "loss": 0.83622682, + "num_input_tokens_seen": 367995520, + "router_z_loss_mlp": 0.30883789, + "step": 4434, + "time_per_iteration": 2.497509479522705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063718, + "balance_loss_mlp": 1.03157926, + "epoch": 0.8532127741439015, + "flos": 657452712960.0, + "grad_norm": 0.07889599950997465, + "language_loss": 0.83384633, + "learning_rate": 5.54482467965825e-05, + "loss": 0.84448349, + "num_input_tokens_seen": 368073664, + "router_z_loss_mlp": 0.32128906, + "step": 4435, + "time_per_iteration": 2.8157849311828613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062143, + "balance_loss_mlp": 1.03076673, + "epoch": 0.8534051558291651, + "flos": 535750682112.0, + "grad_norm": 0.05205501193179772, + "language_loss": 0.8285321, + "learning_rate": 5.5305738257744264e-05, + "loss": 0.83915353, + "num_input_tokens_seen": 368147536, + "router_z_loss_mlp": 0.31347656, + "step": 4436, + "time_per_iteration": 2.7104005813598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065615, + "balance_loss_mlp": 1.03404808, + "epoch": 0.8535975375144286, + "flos": 532741791744.0, + "grad_norm": 0.06598498607456474, + "language_loss": 0.79102892, + "learning_rate": 5.5163402364529655e-05, + "loss": 0.80168509, + "num_input_tokens_seen": 368218672, + "router_z_loss_mlp": 0.31542969, + "step": 4437, + "time_per_iteration": 2.6187076568603516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059712, + "balance_loss_mlp": 1.02819347, + "epoch": 0.8537899191996922, + "flos": 573861044736.0, + "grad_norm": 0.06908451740926473, + "language_loss": 0.82492721, + "learning_rate": 5.502123917219848e-05, + "loss": 0.83552432, + "num_input_tokens_seen": 368287056, + "router_z_loss_mlp": 0.31494141, + "step": 4438, + "time_per_iteration": 2.676590919494629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067428, + "balance_loss_mlp": 1.03602839, + "epoch": 0.8539823008849557, + "flos": 464759161344.0, + "grad_norm": 0.05953176501348747, + "language_loss": 0.83260107, + "learning_rate": 5.48792487359433e-05, + "loss": 0.84327531, + "num_input_tokens_seen": 368358400, + "router_z_loss_mlp": 0.3137207, + "step": 4439, + "time_per_iteration": 2.6769707202911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059953, + "balance_loss_mlp": 1.0280292, + "epoch": 0.8541746825702193, + "flos": 554441393664.0, + "grad_norm": 0.059973770389771766, + "language_loss": 0.8156724, + "learning_rate": 5.4737431110889745e-05, + "loss": 0.82627189, + "num_input_tokens_seen": 368427168, + "router_z_loss_mlp": 0.3190918, + "step": 4440, + "time_per_iteration": 2.642137050628662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064084, + "balance_loss_mlp": 1.0323509, + "epoch": 0.8543670642554829, + "flos": 546101402112.0, + "grad_norm": 0.04886224100797358, + "language_loss": 0.7693212, + "learning_rate": 5.4595786352096165e-05, + "loss": 0.77996206, + "num_input_tokens_seen": 368503584, + "router_z_loss_mlp": 0.31713867, + "step": 4441, + "time_per_iteration": 2.737921714782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061215, + "balance_loss_mlp": 1.02988696, + "epoch": 0.8545594459407464, + "flos": 511766747136.0, + "grad_norm": 0.05039354849040429, + "language_loss": 0.81989372, + "learning_rate": 5.4454314514554236e-05, + "loss": 0.83050585, + "num_input_tokens_seen": 368576976, + "router_z_loss_mlp": 0.31298828, + "step": 4442, + "time_per_iteration": 2.7490930557250977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059972, + "balance_loss_mlp": 1.02814305, + "epoch": 0.85475182762601, + "flos": 420961287168.0, + "grad_norm": 0.05564199021069246, + "language_loss": 0.81687701, + "learning_rate": 5.431301565318786e-05, + "loss": 0.82747674, + "num_input_tokens_seen": 368641664, + "router_z_loss_mlp": 0.31811523, + "step": 4443, + "time_per_iteration": 2.5382466316223145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063197, + "balance_loss_mlp": 1.03174961, + "epoch": 0.8549442093112736, + "flos": 389222295552.0, + "grad_norm": 0.07918770769320456, + "language_loss": 0.7730273, + "learning_rate": 5.41718898228542e-05, + "loss": 0.78365928, + "num_input_tokens_seen": 368705616, + "router_z_loss_mlp": 0.31420898, + "step": 4444, + "time_per_iteration": 2.478496551513672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105818, + "balance_loss_mlp": 1.02589846, + "epoch": 0.8551365909965372, + "flos": 605620385280.0, + "grad_norm": 0.3659120344191015, + "language_loss": 0.795048, + "learning_rate": 5.403093707834334e-05, + "loss": 0.80562979, + "num_input_tokens_seen": 368779664, + "router_z_loss_mlp": 0.32275391, + "step": 4445, + "time_per_iteration": 2.8390157222747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063635, + "balance_loss_mlp": 1.03149629, + "epoch": 0.8553289726818007, + "flos": 503912765952.0, + "grad_norm": 0.1926069572015181, + "language_loss": 0.78790247, + "learning_rate": 5.3890157474377865e-05, + "loss": 0.7985388, + "num_input_tokens_seen": 368846656, + "router_z_loss_mlp": 0.32128906, + "step": 4446, + "time_per_iteration": 2.548919677734375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063127, + "balance_loss_mlp": 1.03189397, + "epoch": 0.8555213543670642, + "flos": 556735901184.0, + "grad_norm": 0.056427781498625505, + "language_loss": 0.76149607, + "learning_rate": 5.374955106561324e-05, + "loss": 0.77212739, + "num_input_tokens_seen": 368923712, + "router_z_loss_mlp": 0.31201172, + "step": 4447, + "time_per_iteration": 2.7372312545776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106394, + "balance_loss_mlp": 1.03263545, + "epoch": 0.8557137360523278, + "flos": 547843060224.0, + "grad_norm": 0.052660516430393885, + "language_loss": 0.74772626, + "learning_rate": 5.360911790663775e-05, + "loss": 0.75836563, + "num_input_tokens_seen": 368994496, + "router_z_loss_mlp": 0.31274414, + "step": 4448, + "time_per_iteration": 2.6941676139831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060031, + "balance_loss_mlp": 1.02848864, + "epoch": 0.8559061177375914, + "flos": 727853506560.0, + "grad_norm": 0.04843590517934965, + "language_loss": 0.78747225, + "learning_rate": 5.346885805197238e-05, + "loss": 0.79807258, + "num_input_tokens_seen": 369077088, + "router_z_loss_mlp": 0.31518555, + "step": 4449, + "time_per_iteration": 2.9643099308013916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061228, + "balance_loss_mlp": 1.02951789, + "epoch": 0.856098499422855, + "flos": 535608087552.0, + "grad_norm": 0.08512499863393, + "language_loss": 0.8254863, + "learning_rate": 5.332877155607085e-05, + "loss": 0.83609855, + "num_input_tokens_seen": 369147680, + "router_z_loss_mlp": 0.31689453, + "step": 4450, + "time_per_iteration": 2.645606517791748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066721, + "balance_loss_mlp": 1.03372383, + "epoch": 0.8562908811081185, + "flos": 573388180992.0, + "grad_norm": 0.05548360969534156, + "language_loss": 0.83302569, + "learning_rate": 5.3188858473319504e-05, + "loss": 0.8436929, + "num_input_tokens_seen": 369224320, + "router_z_loss_mlp": 0.33007812, + "step": 4451, + "time_per_iteration": 2.685065507888794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059945, + "balance_loss_mlp": 1.02825868, + "epoch": 0.856483262793382, + "flos": 781391024640.0, + "grad_norm": 0.057471374374553505, + "language_loss": 0.80552411, + "learning_rate": 5.3049118858037426e-05, + "loss": 0.81612355, + "num_input_tokens_seen": 369315744, + "router_z_loss_mlp": 0.31665039, + "step": 4452, + "time_per_iteration": 3.104637861251831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061008, + "balance_loss_mlp": 1.02944088, + "epoch": 0.8566756444786456, + "flos": 455585513472.0, + "grad_norm": 0.046190458281021356, + "language_loss": 0.84324169, + "learning_rate": 5.290955276447651e-05, + "loss": 0.85385174, + "num_input_tokens_seen": 369382800, + "router_z_loss_mlp": 0.31542969, + "step": 4453, + "time_per_iteration": 2.52528715133667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060297, + "balance_loss_mlp": 1.02844453, + "epoch": 0.8568680261639092, + "flos": 449150123520.0, + "grad_norm": 0.05730147141848336, + "language_loss": 0.84070498, + "learning_rate": 5.277016024682091e-05, + "loss": 0.85130793, + "num_input_tokens_seen": 369447312, + "router_z_loss_mlp": 0.31835938, + "step": 4454, + "time_per_iteration": 2.5523717403411865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061824, + "balance_loss_mlp": 1.02999473, + "epoch": 0.8570604078491728, + "flos": 479736774144.0, + "grad_norm": 0.06420834276058747, + "language_loss": 0.8262471, + "learning_rate": 5.2630941359187665e-05, + "loss": 0.83686531, + "num_input_tokens_seen": 369512800, + "router_z_loss_mlp": 0.31811523, + "step": 4455, + "time_per_iteration": 2.5419952869415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064325, + "balance_loss_mlp": 1.03206658, + "epoch": 0.8572527895344363, + "flos": 505695121920.0, + "grad_norm": 0.047691689677810546, + "language_loss": 0.84493929, + "learning_rate": 5.249189615562627e-05, + "loss": 0.85558259, + "num_input_tokens_seen": 369580720, + "router_z_loss_mlp": 0.32250977, + "step": 4456, + "time_per_iteration": 2.61643648147583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062107, + "balance_loss_mlp": 1.03077888, + "epoch": 0.8574451712196999, + "flos": 786688630272.0, + "grad_norm": 0.0541100916451422, + "language_loss": 0.82981288, + "learning_rate": 5.235302469011905e-05, + "loss": 0.8404339, + "num_input_tokens_seen": 369672544, + "router_z_loss_mlp": 0.31298828, + "step": 4457, + "time_per_iteration": 3.0279483795166016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061214, + "balance_loss_mlp": 1.02933741, + "epoch": 0.8576375529049635, + "flos": 508980436992.0, + "grad_norm": 0.050346317822273064, + "language_loss": 0.75190985, + "learning_rate": 5.2214327016580575e-05, + "loss": 0.76252198, + "num_input_tokens_seen": 369745776, + "router_z_loss_mlp": 0.31860352, + "step": 4458, + "time_per_iteration": 2.7106165885925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009578, + "balance_loss_mlp": 1.00147212, + "epoch": 0.857829934590227, + "flos": 1459996130304.0, + "grad_norm": 0.00950278300704673, + "language_loss": 0.84767288, + "learning_rate": 5.207580318885802e-05, + "loss": 0.85776865, + "num_input_tokens_seen": 369975200, + "router_z_loss_mlp": 0.08105469, + "step": 4459, + "time_per_iteration": 4.9368908405303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062088, + "balance_loss_mlp": 1.02963924, + "epoch": 0.8580223162754905, + "flos": 479057296896.0, + "grad_norm": 0.050243088547339124, + "language_loss": 0.88987887, + "learning_rate": 5.193745326073118e-05, + "loss": 0.9004997, + "num_input_tokens_seen": 370043296, + "router_z_loss_mlp": 0.32446289, + "step": 4460, + "time_per_iteration": 2.6464526653289795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106302, + "balance_loss_mlp": 1.03142953, + "epoch": 0.8582146979607541, + "flos": 705926942208.0, + "grad_norm": 0.0551820595184576, + "language_loss": 0.79153854, + "learning_rate": 5.179927728591227e-05, + "loss": 0.80216873, + "num_input_tokens_seen": 370111152, + "router_z_loss_mlp": 0.31567383, + "step": 4461, + "time_per_iteration": 2.8269202709198 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062228, + "balance_loss_mlp": 1.03023219, + "epoch": 0.8584070796460177, + "flos": 764826084864.0, + "grad_norm": 0.05216333699988601, + "language_loss": 0.82483435, + "learning_rate": 5.1661275318045874e-05, + "loss": 0.83545661, + "num_input_tokens_seen": 370190272, + "router_z_loss_mlp": 0.31982422, + "step": 4462, + "time_per_iteration": 3.035334825515747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060691, + "balance_loss_mlp": 1.02862346, + "epoch": 0.8585994613312813, + "flos": 586535385600.0, + "grad_norm": 0.04772965278083779, + "language_loss": 0.85539973, + "learning_rate": 5.152344741070919e-05, + "loss": 0.86600661, + "num_input_tokens_seen": 370267056, + "router_z_loss_mlp": 0.32055664, + "step": 4463, + "time_per_iteration": 2.7710516452789307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061667, + "balance_loss_mlp": 1.02917063, + "epoch": 0.8587918430165449, + "flos": 607993468416.0, + "grad_norm": 0.052538127660877086, + "language_loss": 0.78715736, + "learning_rate": 5.138579361741169e-05, + "loss": 0.79777402, + "num_input_tokens_seen": 370344176, + "router_z_loss_mlp": 0.32495117, + "step": 4464, + "time_per_iteration": 2.799131393432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061744, + "balance_loss_mlp": 1.02929556, + "epoch": 0.8589842247018084, + "flos": 588710619648.0, + "grad_norm": 0.052020809474610616, + "language_loss": 0.81095582, + "learning_rate": 5.124831399159535e-05, + "loss": 0.82157326, + "num_input_tokens_seen": 370414224, + "router_z_loss_mlp": 0.32446289, + "step": 4465, + "time_per_iteration": 2.6786587238311768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064242, + "balance_loss_mlp": 1.03172147, + "epoch": 0.8591766063870719, + "flos": 543609045504.0, + "grad_norm": 0.07015130917214298, + "language_loss": 0.78573036, + "learning_rate": 5.1111008586634475e-05, + "loss": 0.79637277, + "num_input_tokens_seen": 370484736, + "router_z_loss_mlp": 0.32519531, + "step": 4466, + "time_per_iteration": 2.6729421615600586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059412, + "balance_loss_mlp": 1.02724934, + "epoch": 0.8593689880723355, + "flos": 493499437056.0, + "grad_norm": 0.057052492442745496, + "language_loss": 0.80829519, + "learning_rate": 5.0973877455835816e-05, + "loss": 0.81888938, + "num_input_tokens_seen": 370556512, + "router_z_loss_mlp": 0.3215332, + "step": 4467, + "time_per_iteration": 2.665745496749878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060448, + "balance_loss_mlp": 1.02892888, + "epoch": 0.8595613697575991, + "flos": 533652613632.0, + "grad_norm": 0.08486411489909984, + "language_loss": 0.83856833, + "learning_rate": 5.083692065243822e-05, + "loss": 0.84917283, + "num_input_tokens_seen": 370622880, + "router_z_loss_mlp": 0.31494141, + "step": 4468, + "time_per_iteration": 2.605087995529175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065534, + "balance_loss_mlp": 1.03279936, + "epoch": 0.8597537514428626, + "flos": 617347588608.0, + "grad_norm": 0.05427930741428609, + "language_loss": 0.7589013, + "learning_rate": 5.070013822961328e-05, + "loss": 0.7695567, + "num_input_tokens_seen": 370691632, + "router_z_loss_mlp": 0.32739258, + "step": 4469, + "time_per_iteration": 2.726418972015381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063558, + "balance_loss_mlp": 1.03208721, + "epoch": 0.8599461331281262, + "flos": 608450365440.0, + "grad_norm": 0.050935024727546276, + "language_loss": 0.83729804, + "learning_rate": 5.056353024046462e-05, + "loss": 0.84793365, + "num_input_tokens_seen": 370764848, + "router_z_loss_mlp": 0.31445312, + "step": 4470, + "time_per_iteration": 2.757049798965454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062047, + "balance_loss_mlp": 1.02978909, + "epoch": 0.8601385148133898, + "flos": 550979988480.0, + "grad_norm": 0.05051789331606474, + "language_loss": 0.83390927, + "learning_rate": 5.042709673802786e-05, + "loss": 0.84452975, + "num_input_tokens_seen": 370832496, + "router_z_loss_mlp": 0.32250977, + "step": 4471, + "time_per_iteration": 2.6580097675323486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064584, + "balance_loss_mlp": 1.03182566, + "epoch": 0.8603308964986534, + "flos": 580907510784.0, + "grad_norm": 0.0485236684203926, + "language_loss": 0.81116891, + "learning_rate": 5.0290837775271494e-05, + "loss": 0.82181472, + "num_input_tokens_seen": 370917104, + "router_z_loss_mlp": 0.32763672, + "step": 4472, + "time_per_iteration": 2.839219808578491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060787, + "balance_loss_mlp": 1.02836204, + "epoch": 0.8605232781839169, + "flos": 628731376128.0, + "grad_norm": 0.06326689914609517, + "language_loss": 0.7511692, + "learning_rate": 5.0154753405095846e-05, + "loss": 0.76177704, + "num_input_tokens_seen": 370984512, + "router_z_loss_mlp": 0.32421875, + "step": 4473, + "time_per_iteration": 2.7530417442321777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062597, + "balance_loss_mlp": 1.0308156, + "epoch": 0.8607156598691804, + "flos": 467904854016.0, + "grad_norm": 0.05697633265814371, + "language_loss": 0.76700628, + "learning_rate": 5.0018843680333604e-05, + "loss": 0.77763224, + "num_input_tokens_seen": 371049664, + "router_z_loss_mlp": 0.31762695, + "step": 4474, + "time_per_iteration": 2.497021198272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063581, + "balance_loss_mlp": 1.03177595, + "epoch": 0.860908041554444, + "flos": 488142194688.0, + "grad_norm": 0.05343941183350629, + "language_loss": 0.8279053, + "learning_rate": 4.988310865374945e-05, + "loss": 0.83854115, + "num_input_tokens_seen": 371120704, + "router_z_loss_mlp": 0.31787109, + "step": 4475, + "time_per_iteration": 2.661044120788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062188, + "balance_loss_mlp": 1.03143167, + "epoch": 0.8611004232397076, + "flos": 591827198976.0, + "grad_norm": 0.06189407919584074, + "language_loss": 0.79964399, + "learning_rate": 4.974754837804057e-05, + "loss": 0.8102659, + "num_input_tokens_seen": 371189376, + "router_z_loss_mlp": 0.30712891, + "step": 4476, + "time_per_iteration": 2.6757631301879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059104, + "balance_loss_mlp": 1.0272038, + "epoch": 0.8612928049249712, + "flos": 773857138176.0, + "grad_norm": 0.052722096051547256, + "language_loss": 0.85997331, + "learning_rate": 4.9612162905836036e-05, + "loss": 0.87056434, + "num_input_tokens_seen": 371275184, + "router_z_loss_mlp": 0.31884766, + "step": 4477, + "time_per_iteration": 3.0340847969055176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063746, + "balance_loss_mlp": 1.03148818, + "epoch": 0.8614851866102347, + "flos": 537291518976.0, + "grad_norm": 0.06050404360645883, + "language_loss": 0.82557905, + "learning_rate": 4.947695228969718e-05, + "loss": 0.83621651, + "num_input_tokens_seen": 371347920, + "router_z_loss_mlp": 0.32250977, + "step": 4478, + "time_per_iteration": 2.700917959213257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057319, + "balance_loss_mlp": 1.02632427, + "epoch": 0.8616775682954982, + "flos": 565647681024.0, + "grad_norm": 0.04893161007491131, + "language_loss": 0.79213041, + "learning_rate": 4.934191658211729e-05, + "loss": 0.80270362, + "num_input_tokens_seen": 371419728, + "router_z_loss_mlp": 0.30957031, + "step": 4479, + "time_per_iteration": 2.638625144958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064584, + "balance_loss_mlp": 1.03211105, + "epoch": 0.8618699499807618, + "flos": 481351804416.0, + "grad_norm": 0.06177653986234914, + "language_loss": 0.81433135, + "learning_rate": 4.92070558355221e-05, + "loss": 0.82497722, + "num_input_tokens_seen": 371488768, + "router_z_loss_mlp": 0.32470703, + "step": 4480, + "time_per_iteration": 2.5831832885742188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063637, + "balance_loss_mlp": 1.03095031, + "epoch": 0.8620623316660254, + "flos": 649214618112.0, + "grad_norm": 0.06454644868462169, + "language_loss": 0.74008536, + "learning_rate": 4.9072370102269226e-05, + "loss": 0.75072169, + "num_input_tokens_seen": 371560144, + "router_z_loss_mlp": 0.3269043, + "step": 4481, + "time_per_iteration": 2.800652503967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062478, + "balance_loss_mlp": 1.03045893, + "epoch": 0.862254713351289, + "flos": 751457710080.0, + "grad_norm": 0.060736689413149525, + "language_loss": 0.85903275, + "learning_rate": 4.893785943464801e-05, + "loss": 0.86965752, + "num_input_tokens_seen": 371635920, + "router_z_loss_mlp": 0.32006836, + "step": 4482, + "time_per_iteration": 2.9607954025268555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062729, + "balance_loss_mlp": 1.03061461, + "epoch": 0.8624470950365525, + "flos": 841147144704.0, + "grad_norm": 0.053974741732672415, + "language_loss": 0.77722287, + "learning_rate": 4.880352388488024e-05, + "loss": 0.78785014, + "num_input_tokens_seen": 371727664, + "router_z_loss_mlp": 0.32104492, + "step": 4483, + "time_per_iteration": 3.226982355117798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061835, + "balance_loss_mlp": 1.0292666, + "epoch": 0.8626394767218161, + "flos": 754470982656.0, + "grad_norm": 0.062075712884062044, + "language_loss": 0.8284198, + "learning_rate": 4.866936350511969e-05, + "loss": 0.83903813, + "num_input_tokens_seen": 371800832, + "router_z_loss_mlp": 0.32568359, + "step": 4484, + "time_per_iteration": 2.9436373710632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060867, + "balance_loss_mlp": 1.02875233, + "epoch": 0.8628318584070797, + "flos": 703268669952.0, + "grad_norm": 0.06969941920153218, + "language_loss": 0.82350802, + "learning_rate": 4.853537834745203e-05, + "loss": 0.8341167, + "num_input_tokens_seen": 371871472, + "router_z_loss_mlp": 0.32104492, + "step": 4485, + "time_per_iteration": 2.8440961837768555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059193, + "balance_loss_mlp": 1.0271498, + "epoch": 0.8630242400923432, + "flos": 471006876672.0, + "grad_norm": 0.06648371861207134, + "language_loss": 0.77326876, + "learning_rate": 4.840156846389487e-05, + "loss": 0.78386068, + "num_input_tokens_seen": 371936512, + "router_z_loss_mlp": 0.3203125, + "step": 4486, + "time_per_iteration": 2.5322835445404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061159, + "balance_loss_mlp": 1.02921081, + "epoch": 0.8632166217776067, + "flos": 963965200896.0, + "grad_norm": 0.06128857742360526, + "language_loss": 0.77070493, + "learning_rate": 4.826793390639783e-05, + "loss": 0.78131652, + "num_input_tokens_seen": 372018032, + "router_z_loss_mlp": 0.31933594, + "step": 4487, + "time_per_iteration": 3.189706563949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060666, + "balance_loss_mlp": 1.02886093, + "epoch": 0.8634090034628703, + "flos": 767583281664.0, + "grad_norm": 0.07112701478219728, + "language_loss": 0.78677434, + "learning_rate": 4.813447472684246e-05, + "loss": 0.79738104, + "num_input_tokens_seen": 372092176, + "router_z_loss_mlp": 0.31787109, + "step": 4488, + "time_per_iteration": 2.954084634780884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064986, + "balance_loss_mlp": 1.03203702, + "epoch": 0.8636013851481339, + "flos": 520310380032.0, + "grad_norm": 0.0522005534966177, + "language_loss": 0.83078921, + "learning_rate": 4.800119097704214e-05, + "loss": 0.84143913, + "num_input_tokens_seen": 372166880, + "router_z_loss_mlp": 0.32958984, + "step": 4489, + "time_per_iteration": 2.7687644958496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059066, + "balance_loss_mlp": 1.02695084, + "epoch": 0.8637937668333975, + "flos": 631858129920.0, + "grad_norm": 0.055171692637559167, + "language_loss": 0.80351138, + "learning_rate": 4.7868082708742324e-05, + "loss": 0.81410205, + "num_input_tokens_seen": 372234608, + "router_z_loss_mlp": 0.32104492, + "step": 4490, + "time_per_iteration": 2.71951961517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063816, + "balance_loss_mlp": 1.03227329, + "epoch": 0.8639861485186611, + "flos": 855739233792.0, + "grad_norm": 0.04993299201791118, + "language_loss": 0.76146114, + "learning_rate": 4.773514997362e-05, + "loss": 0.77209932, + "num_input_tokens_seen": 372314704, + "router_z_loss_mlp": 0.31542969, + "step": 4491, + "time_per_iteration": 3.069485664367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060774, + "balance_loss_mlp": 1.02968383, + "epoch": 0.8641785302039245, + "flos": 481017153024.0, + "grad_norm": 0.0570401502594965, + "language_loss": 0.77674156, + "learning_rate": 4.7602392823284605e-05, + "loss": 0.78734934, + "num_input_tokens_seen": 372374848, + "router_z_loss_mlp": 0.31054688, + "step": 4492, + "time_per_iteration": 2.533755302429199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063273, + "balance_loss_mlp": 1.03168309, + "epoch": 0.8643709118891881, + "flos": 504385629696.0, + "grad_norm": 0.04924498248309733, + "language_loss": 0.80397034, + "learning_rate": 4.746981130927675e-05, + "loss": 0.81460309, + "num_input_tokens_seen": 372442432, + "router_z_loss_mlp": 0.31567383, + "step": 4493, + "time_per_iteration": 2.587989568710327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059069, + "balance_loss_mlp": 1.02762127, + "epoch": 0.8645632935744517, + "flos": 552074102784.0, + "grad_norm": 0.05394896105958079, + "language_loss": 0.82090062, + "learning_rate": 4.733740548306908e-05, + "loss": 0.83149135, + "num_input_tokens_seen": 372520048, + "router_z_loss_mlp": 0.31420898, + "step": 4494, + "time_per_iteration": 2.7614352703094482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061639, + "balance_loss_mlp": 1.02978659, + "epoch": 0.8647556752597153, + "flos": 524489140224.0, + "grad_norm": 0.055631203926486274, + "language_loss": 0.83849758, + "learning_rate": 4.7205175396066336e-05, + "loss": 0.849114, + "num_input_tokens_seen": 372587968, + "router_z_loss_mlp": 0.31835938, + "step": 4495, + "time_per_iteration": 2.5548112392425537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060323, + "balance_loss_mlp": 1.02830327, + "epoch": 0.8649480569449788, + "flos": 787403013120.0, + "grad_norm": 0.057759633350782755, + "language_loss": 0.82145321, + "learning_rate": 4.707312109960471e-05, + "loss": 0.83205652, + "num_input_tokens_seen": 372672544, + "router_z_loss_mlp": 0.32006836, + "step": 4496, + "time_per_iteration": 3.0689432621002197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058381, + "balance_loss_mlp": 1.02605116, + "epoch": 0.8651404386302424, + "flos": 763531149312.0, + "grad_norm": 0.0515665037941851, + "language_loss": 0.76591146, + "learning_rate": 4.694124264495225e-05, + "loss": 0.77649528, + "num_input_tokens_seen": 372751296, + "router_z_loss_mlp": 0.32324219, + "step": 4497, + "time_per_iteration": 3.0204567909240723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063092, + "balance_loss_mlp": 1.03135872, + "epoch": 0.865332820315506, + "flos": 539620932096.0, + "grad_norm": 0.07535101034145897, + "language_loss": 0.8228246, + "learning_rate": 4.680954008330851e-05, + "loss": 0.83345556, + "num_input_tokens_seen": 372825264, + "router_z_loss_mlp": 0.31713867, + "step": 4498, + "time_per_iteration": 2.704850196838379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010122, + "balance_loss_mlp": 1.0021112, + "epoch": 0.8655252020007695, + "flos": 1475874390528.0, + "grad_norm": 0.008064108702508102, + "language_loss": 0.79174447, + "learning_rate": 4.667801346580519e-05, + "loss": 0.80184567, + "num_input_tokens_seen": 373052000, + "router_z_loss_mlp": 0.08007812, + "step": 4499, + "time_per_iteration": 4.744141101837158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063596, + "balance_loss_mlp": 1.0319581, + "epoch": 0.8657175836860331, + "flos": 517094876160.0, + "grad_norm": 0.06467601085738069, + "language_loss": 0.82661498, + "learning_rate": 4.6546662843505396e-05, + "loss": 0.83725095, + "num_input_tokens_seen": 373124128, + "router_z_loss_mlp": 0.31616211, + "step": 4500, + "time_per_iteration": 2.6995272636413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106124, + "balance_loss_mlp": 1.02957797, + "epoch": 0.8659099653712966, + "flos": 590247074304.0, + "grad_norm": 0.0488325766448074, + "language_loss": 0.79730713, + "learning_rate": 4.641548826740394e-05, + "loss": 0.8079195, + "num_input_tokens_seen": 373195472, + "router_z_loss_mlp": 0.31640625, + "step": 4501, + "time_per_iteration": 2.756542921066284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061576, + "balance_loss_mlp": 1.02979493, + "epoch": 0.8661023470565602, + "flos": 590168498688.0, + "grad_norm": 0.04885372872328607, + "language_loss": 0.87834525, + "learning_rate": 4.628448978842731e-05, + "loss": 0.88896096, + "num_input_tokens_seen": 373273504, + "router_z_loss_mlp": 0.31762695, + "step": 4502, + "time_per_iteration": 2.8756601810455322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061181, + "balance_loss_mlp": 1.02963829, + "epoch": 0.8662947287418238, + "flos": 567405305856.0, + "grad_norm": 0.0556252853798282, + "language_loss": 0.79367119, + "learning_rate": 4.61536674574336e-05, + "loss": 0.80428302, + "num_input_tokens_seen": 373346032, + "router_z_loss_mlp": 0.31518555, + "step": 4503, + "time_per_iteration": 2.730353832244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061547, + "balance_loss_mlp": 1.02995646, + "epoch": 0.8664871104270874, + "flos": 515661728256.0, + "grad_norm": 0.06306513847558601, + "language_loss": 0.82253635, + "learning_rate": 4.6023021325212636e-05, + "loss": 0.83315182, + "num_input_tokens_seen": 373419968, + "router_z_loss_mlp": 0.31567383, + "step": 4504, + "time_per_iteration": 2.8205671310424805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062618, + "balance_loss_mlp": 1.03100419, + "epoch": 0.866679492112351, + "flos": 556973038080.0, + "grad_norm": 0.05196414276050884, + "language_loss": 0.78062767, + "learning_rate": 4.589255144248561e-05, + "loss": 0.79125381, + "num_input_tokens_seen": 373502448, + "router_z_loss_mlp": 0.31591797, + "step": 4505, + "time_per_iteration": 2.7927794456481934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061347, + "balance_loss_mlp": 1.03013778, + "epoch": 0.8668718737976144, + "flos": 722145646080.0, + "grad_norm": 0.06563340698886916, + "language_loss": 0.81826079, + "learning_rate": 4.57622578599054e-05, + "loss": 0.82887423, + "num_input_tokens_seen": 373581184, + "router_z_loss_mlp": 0.31176758, + "step": 4506, + "time_per_iteration": 2.884320020675659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060244, + "balance_loss_mlp": 1.02860546, + "epoch": 0.867064255482878, + "flos": 600424676352.0, + "grad_norm": 0.06596119953742512, + "language_loss": 0.8435837, + "learning_rate": 4.5632140628056705e-05, + "loss": 0.85418612, + "num_input_tokens_seen": 373652272, + "router_z_loss_mlp": 0.31616211, + "step": 4507, + "time_per_iteration": 2.7202742099761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060491, + "balance_loss_mlp": 1.02847147, + "epoch": 0.8672566371681416, + "flos": 803177966592.0, + "grad_norm": 0.05619429968013786, + "language_loss": 0.75715232, + "learning_rate": 4.550219979745529e-05, + "loss": 0.76775718, + "num_input_tokens_seen": 373734896, + "router_z_loss_mlp": 0.32006836, + "step": 4508, + "time_per_iteration": 3.0621252059936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059917, + "balance_loss_mlp": 1.02877963, + "epoch": 0.8674490188534052, + "flos": 627072675840.0, + "grad_norm": 0.04631178506085383, + "language_loss": 0.837807, + "learning_rate": 4.5372435418548905e-05, + "loss": 0.8484062, + "num_input_tokens_seen": 373806960, + "router_z_loss_mlp": 0.31103516, + "step": 4509, + "time_per_iteration": 2.726402759552002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062583, + "balance_loss_mlp": 1.03075373, + "epoch": 0.8676414005386687, + "flos": 727489741824.0, + "grad_norm": 0.04521100568191671, + "language_loss": 0.8632676, + "learning_rate": 4.524284754171615e-05, + "loss": 0.87389338, + "num_input_tokens_seen": 373888352, + "router_z_loss_mlp": 0.31811523, + "step": 4510, + "time_per_iteration": 2.9605391025543213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063445, + "balance_loss_mlp": 1.03199792, + "epoch": 0.8678337822239323, + "flos": 539676186624.0, + "grad_norm": 0.05450838794945064, + "language_loss": 0.80499184, + "learning_rate": 4.5113436217267765e-05, + "loss": 0.81562626, + "num_input_tokens_seen": 373962112, + "router_z_loss_mlp": 0.31420898, + "step": 4511, + "time_per_iteration": 2.7507681846618652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063839, + "balance_loss_mlp": 1.03251052, + "epoch": 0.8680261639091958, + "flos": 507270864384.0, + "grad_norm": 0.06627366254356618, + "language_loss": 0.7913667, + "learning_rate": 4.4984201495445744e-05, + "loss": 0.80200505, + "num_input_tokens_seen": 374028256, + "router_z_loss_mlp": 0.31323242, + "step": 4512, + "time_per_iteration": 2.5611917972564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062078, + "balance_loss_mlp": 1.0304879, + "epoch": 0.8682185455944594, + "flos": 486871990272.0, + "grad_norm": 0.05491483118349731, + "language_loss": 0.80959839, + "learning_rate": 4.4855143426423275e-05, + "loss": 0.82021916, + "num_input_tokens_seen": 374100080, + "router_z_loss_mlp": 0.31567383, + "step": 4513, + "time_per_iteration": 2.7118194103240967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060989, + "balance_loss_mlp": 1.03016114, + "epoch": 0.868410927279723, + "flos": 603413217792.0, + "grad_norm": 0.06031526727588095, + "language_loss": 0.80724663, + "learning_rate": 4.472626206030528e-05, + "loss": 0.81785655, + "num_input_tokens_seen": 374174368, + "router_z_loss_mlp": 0.30786133, + "step": 4514, + "time_per_iteration": 2.7290971279144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061083, + "balance_loss_mlp": 1.02884901, + "epoch": 0.8686033089649865, + "flos": 1118552772096.0, + "grad_norm": 0.057770146114941426, + "language_loss": 0.84628344, + "learning_rate": 4.4597557447127846e-05, + "loss": 0.85689425, + "num_input_tokens_seen": 374257328, + "router_z_loss_mlp": 0.32226562, + "step": 4515, + "time_per_iteration": 3.4016566276550293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063551, + "balance_loss_mlp": 1.03179383, + "epoch": 0.8687956906502501, + "flos": 567750131712.0, + "grad_norm": 0.059882180592515495, + "language_loss": 0.83618152, + "learning_rate": 4.446902963685862e-05, + "loss": 0.84681702, + "num_input_tokens_seen": 374327936, + "router_z_loss_mlp": 0.31738281, + "step": 4516, + "time_per_iteration": 2.6526734828948975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106162, + "balance_loss_mlp": 1.03033924, + "epoch": 0.8689880723355137, + "flos": 544071734784.0, + "grad_norm": 0.05519247318645579, + "language_loss": 0.84240782, + "learning_rate": 4.4340678679396454e-05, + "loss": 0.85302395, + "num_input_tokens_seen": 374400496, + "router_z_loss_mlp": 0.3125, + "step": 4517, + "time_per_iteration": 2.685844898223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062904, + "balance_loss_mlp": 1.03143275, + "epoch": 0.8691804540207773, + "flos": 457185987072.0, + "grad_norm": 0.05019728971382003, + "language_loss": 0.85942495, + "learning_rate": 4.4212504624571495e-05, + "loss": 0.87005395, + "num_input_tokens_seen": 374470528, + "router_z_loss_mlp": 0.31445312, + "step": 4518, + "time_per_iteration": 2.6083579063415527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063422, + "balance_loss_mlp": 1.03216529, + "epoch": 0.8693728357060407, + "flos": 591591472128.0, + "grad_norm": 0.05314480965772763, + "language_loss": 0.79870903, + "learning_rate": 4.40845075221456e-05, + "loss": 0.80934334, + "num_input_tokens_seen": 374542656, + "router_z_loss_mlp": 0.31225586, + "step": 4519, + "time_per_iteration": 2.686267614364624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058795, + "balance_loss_mlp": 1.02801549, + "epoch": 0.8695652173913043, + "flos": 679949655552.0, + "grad_norm": 0.06051907660879658, + "language_loss": 0.79408765, + "learning_rate": 4.395668742181164e-05, + "loss": 0.80467558, + "num_input_tokens_seen": 374617232, + "router_z_loss_mlp": 0.30737305, + "step": 4520, + "time_per_iteration": 2.954463005065918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059756, + "balance_loss_mlp": 1.02873731, + "epoch": 0.8697575990765679, + "flos": 492120133632.0, + "grad_norm": 0.12778474077428317, + "language_loss": 0.78053939, + "learning_rate": 4.38290443731934e-05, + "loss": 0.79113692, + "num_input_tokens_seen": 374681888, + "router_z_loss_mlp": 0.30981445, + "step": 4521, + "time_per_iteration": 2.5474846363067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065092, + "balance_loss_mlp": 1.03412151, + "epoch": 0.8699499807618315, + "flos": 526690515456.0, + "grad_norm": 0.0489157541753566, + "language_loss": 0.81840932, + "learning_rate": 4.370157842584671e-05, + "loss": 0.82906032, + "num_input_tokens_seen": 374750464, + "router_z_loss_mlp": 0.30932617, + "step": 4522, + "time_per_iteration": 2.6422176361083984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106357, + "balance_loss_mlp": 1.0320034, + "epoch": 0.8701423624470951, + "flos": 813981201408.0, + "grad_norm": 0.05775570775583278, + "language_loss": 0.79841703, + "learning_rate": 4.357428962925808e-05, + "loss": 0.80905277, + "num_input_tokens_seen": 374836064, + "router_z_loss_mlp": 0.31542969, + "step": 4523, + "time_per_iteration": 3.1012167930603027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059208, + "balance_loss_mlp": 1.02723622, + "epoch": 0.8703347441323586, + "flos": 556519113216.0, + "grad_norm": 0.05599933750738037, + "language_loss": 0.88216102, + "learning_rate": 4.344717803284542e-05, + "loss": 0.89275301, + "num_input_tokens_seen": 374903392, + "router_z_loss_mlp": 0.31958008, + "step": 4524, + "time_per_iteration": 2.6371071338653564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059655, + "balance_loss_mlp": 1.02835059, + "epoch": 0.8705271258176221, + "flos": 585151699968.0, + "grad_norm": 0.05141240379057952, + "language_loss": 0.84252208, + "learning_rate": 4.3320243685957825e-05, + "loss": 0.85311866, + "num_input_tokens_seen": 374985904, + "router_z_loss_mlp": 0.31274414, + "step": 4525, + "time_per_iteration": 2.8390161991119385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062694, + "balance_loss_mlp": 1.03136575, + "epoch": 0.8707195075028857, + "flos": 668896137216.0, + "grad_norm": 0.04600181508448894, + "language_loss": 0.85132861, + "learning_rate": 4.3193486637875536e-05, + "loss": 0.86195552, + "num_input_tokens_seen": 375062992, + "router_z_loss_mlp": 0.31298828, + "step": 4526, + "time_per_iteration": 2.9073646068573 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106146, + "balance_loss_mlp": 1.02955973, + "epoch": 0.8709118891881493, + "flos": 520122705408.0, + "grad_norm": 0.052761141095982234, + "language_loss": 0.83831137, + "learning_rate": 4.306690693781007e-05, + "loss": 0.84892601, + "num_input_tokens_seen": 375139296, + "router_z_loss_mlp": 0.31884766, + "step": 4527, + "time_per_iteration": 2.767987012863159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061555, + "balance_loss_mlp": 1.03008366, + "epoch": 0.8711042708734128, + "flos": 552944226816.0, + "grad_norm": 0.06573110271460979, + "language_loss": 0.8149147, + "learning_rate": 4.294050463490401e-05, + "loss": 0.82553029, + "num_input_tokens_seen": 375206576, + "router_z_loss_mlp": 0.31445312, + "step": 4528, + "time_per_iteration": 2.665611505508423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063718, + "balance_loss_mlp": 1.03305793, + "epoch": 0.8712966525586764, + "flos": 501933970944.0, + "grad_norm": 0.08156910290101471, + "language_loss": 0.82087851, + "learning_rate": 4.281427977823094e-05, + "loss": 0.83151567, + "num_input_tokens_seen": 375279008, + "router_z_loss_mlp": 0.30615234, + "step": 4529, + "time_per_iteration": 2.6962764263153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063911, + "balance_loss_mlp": 1.0326066, + "epoch": 0.87148903424394, + "flos": 803739580416.0, + "grad_norm": 0.09272960269684108, + "language_loss": 0.73765504, + "learning_rate": 4.268823241679593e-05, + "loss": 0.74829412, + "num_input_tokens_seen": 375368512, + "router_z_loss_mlp": 0.31274414, + "step": 4530, + "time_per_iteration": 3.0716986656188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060824, + "balance_loss_mlp": 1.02975786, + "epoch": 0.8716814159292036, + "flos": 773088910848.0, + "grad_norm": 0.041732057671581745, + "language_loss": 0.86070085, + "learning_rate": 4.256236259953489e-05, + "loss": 0.87130916, + "num_input_tokens_seen": 375450528, + "router_z_loss_mlp": 0.31030273, + "step": 4531, + "time_per_iteration": 3.009658098220825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063716, + "balance_loss_mlp": 1.03300774, + "epoch": 0.8718737976144671, + "flos": 486595565568.0, + "grad_norm": 0.057775826764998164, + "language_loss": 0.85169399, + "learning_rate": 4.243667037531468e-05, + "loss": 0.86233115, + "num_input_tokens_seen": 375518256, + "router_z_loss_mlp": 0.30664062, + "step": 4532, + "time_per_iteration": 2.5984034538269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062579, + "balance_loss_mlp": 1.03122652, + "epoch": 0.8720661792997306, + "flos": 583850972160.0, + "grad_norm": 0.050781364973923986, + "language_loss": 0.78413302, + "learning_rate": 4.2311155792933264e-05, + "loss": 0.7947588, + "num_input_tokens_seen": 375588112, + "router_z_loss_mlp": 0.31323242, + "step": 4533, + "time_per_iteration": 2.710092306137085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013214, + "balance_loss_mlp": 1.00510764, + "epoch": 0.8722585609849942, + "flos": 1495180560384.0, + "grad_norm": 0.005859621779777296, + "language_loss": 0.80966806, + "learning_rate": 4.2185818901119946e-05, + "loss": 0.81980014, + "num_input_tokens_seen": 375814496, + "router_z_loss_mlp": 0.08105469, + "step": 4534, + "time_per_iteration": 4.804488897323608 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059822, + "balance_loss_mlp": 1.02785015, + "epoch": 0.8724509426702578, + "flos": 595885123584.0, + "grad_norm": 0.05535231433932961, + "language_loss": 0.87492794, + "learning_rate": 4.206065974853479e-05, + "loss": 0.88552618, + "num_input_tokens_seen": 375885440, + "router_z_loss_mlp": 0.31958008, + "step": 4535, + "time_per_iteration": 2.7415826320648193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062201, + "balance_loss_mlp": 1.03077722, + "epoch": 0.8726433243555214, + "flos": 443408767488.0, + "grad_norm": 0.05376597495630459, + "language_loss": 0.80928969, + "learning_rate": 4.193567838376888e-05, + "loss": 0.81991172, + "num_input_tokens_seen": 375952640, + "router_z_loss_mlp": 0.31396484, + "step": 4536, + "time_per_iteration": 2.5765163898468018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061895, + "balance_loss_mlp": 1.03101945, + "epoch": 0.8728357060407849, + "flos": 552919495680.0, + "grad_norm": 0.06265478182979331, + "language_loss": 0.82042164, + "learning_rate": 4.181087485534402e-05, + "loss": 0.83104056, + "num_input_tokens_seen": 376021648, + "router_z_loss_mlp": 0.30834961, + "step": 4537, + "time_per_iteration": 2.6786723136901855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058304, + "balance_loss_mlp": 1.02654707, + "epoch": 0.8730280877260485, + "flos": 627506251776.0, + "grad_norm": 0.050518176985259906, + "language_loss": 0.78287077, + "learning_rate": 4.16862492117136e-05, + "loss": 0.79345381, + "num_input_tokens_seen": 376102304, + "router_z_loss_mlp": 0.31738281, + "step": 4538, + "time_per_iteration": 2.8118271827697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061457, + "balance_loss_mlp": 1.02948546, + "epoch": 0.873220469411312, + "flos": 535106110464.0, + "grad_norm": 0.05971217476802488, + "language_loss": 0.80033374, + "learning_rate": 4.156180150126143e-05, + "loss": 0.81094825, + "num_input_tokens_seen": 376177072, + "router_z_loss_mlp": 0.31958008, + "step": 4539, + "time_per_iteration": 2.6918630599975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106634, + "balance_loss_mlp": 1.0347259, + "epoch": 0.8734128510965756, + "flos": 561605723136.0, + "grad_norm": 0.05091768450849498, + "language_loss": 0.8370958, + "learning_rate": 4.143753177230242e-05, + "loss": 0.84775919, + "num_input_tokens_seen": 376251376, + "router_z_loss_mlp": 0.31591797, + "step": 4540, + "time_per_iteration": 2.6893396377563477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106051, + "balance_loss_mlp": 1.0292058, + "epoch": 0.8736052327818392, + "flos": 686134761984.0, + "grad_norm": 0.06496183714869043, + "language_loss": 0.79499495, + "learning_rate": 4.131344007308224e-05, + "loss": 0.80560005, + "num_input_tokens_seen": 376337104, + "router_z_loss_mlp": 0.31274414, + "step": 4541, + "time_per_iteration": 2.944713830947876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062058, + "balance_loss_mlp": 1.03046799, + "epoch": 0.8737976144671027, + "flos": 531384247296.0, + "grad_norm": 0.05340963644738213, + "language_loss": 0.81737614, + "learning_rate": 4.1189526451777816e-05, + "loss": 0.82799673, + "num_input_tokens_seen": 376415456, + "router_z_loss_mlp": 0.31567383, + "step": 4542, + "time_per_iteration": 2.820056676864624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061357, + "balance_loss_mlp": 1.02981448, + "epoch": 0.8739899961523663, + "flos": 575308749312.0, + "grad_norm": 0.051498816346912536, + "language_loss": 0.81901187, + "learning_rate": 4.106579095649649e-05, + "loss": 0.82962549, + "num_input_tokens_seen": 376494880, + "router_z_loss_mlp": 0.31518555, + "step": 4543, + "time_per_iteration": 2.8078243732452393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063084, + "balance_loss_mlp": 1.03137445, + "epoch": 0.8741823778376299, + "flos": 731009373696.0, + "grad_norm": 0.08412552836981564, + "language_loss": 0.76142043, + "learning_rate": 4.094223363527666e-05, + "loss": 0.77205127, + "num_input_tokens_seen": 376571760, + "router_z_loss_mlp": 0.31689453, + "step": 4544, + "time_per_iteration": 2.895599365234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062313, + "balance_loss_mlp": 1.0306983, + "epoch": 0.8743747595228935, + "flos": 566795639808.0, + "grad_norm": 0.06200041308589868, + "language_loss": 0.83552009, + "learning_rate": 4.081885453608747e-05, + "loss": 0.84614325, + "num_input_tokens_seen": 376644464, + "router_z_loss_mlp": 0.31591797, + "step": 4545, + "time_per_iteration": 2.728745222091675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062007, + "balance_loss_mlp": 1.03020191, + "epoch": 0.8745671412081569, + "flos": 493115323392.0, + "grad_norm": 0.053444102465167606, + "language_loss": 0.81948709, + "learning_rate": 4.0695653706829095e-05, + "loss": 0.83010715, + "num_input_tokens_seen": 376709584, + "router_z_loss_mlp": 0.31787109, + "step": 4546, + "time_per_iteration": 2.5782153606414795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060239, + "balance_loss_mlp": 1.02814758, + "epoch": 0.8747595228934205, + "flos": 523883856384.0, + "grad_norm": 0.04577609874107169, + "language_loss": 0.83400089, + "learning_rate": 4.057263119533233e-05, + "loss": 0.8446033, + "num_input_tokens_seen": 376779472, + "router_z_loss_mlp": 0.32080078, + "step": 4547, + "time_per_iteration": 2.6548120975494385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061892, + "balance_loss_mlp": 1.03011048, + "epoch": 0.8749519045786841, + "flos": 743999427072.0, + "grad_norm": 0.06038252152055413, + "language_loss": 0.79740083, + "learning_rate": 4.044978704935853e-05, + "loss": 0.80801976, + "num_input_tokens_seen": 376863408, + "router_z_loss_mlp": 0.31762695, + "step": 4548, + "time_per_iteration": 3.041475534439087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106142, + "balance_loss_mlp": 1.03061604, + "epoch": 0.8751442862639477, + "flos": 594003843072.0, + "grad_norm": 0.048567013140779235, + "language_loss": 0.80103874, + "learning_rate": 4.032712131660027e-05, + "loss": 0.81165296, + "num_input_tokens_seen": 376942080, + "router_z_loss_mlp": 0.30786133, + "step": 4549, + "time_per_iteration": 2.8483879566192627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063327, + "balance_loss_mlp": 1.03190303, + "epoch": 0.8753366679492113, + "flos": 496285747200.0, + "grad_norm": 0.05234931726187161, + "language_loss": 0.78447485, + "learning_rate": 4.020463404468055e-05, + "loss": 0.79510808, + "num_input_tokens_seen": 377015696, + "router_z_loss_mlp": 0.31396484, + "step": 4550, + "time_per_iteration": 2.732851982116699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060994, + "balance_loss_mlp": 1.02890277, + "epoch": 0.8755290496344748, + "flos": 489619012608.0, + "grad_norm": 0.0557326561034337, + "language_loss": 0.81853771, + "learning_rate": 4.0082325281153074e-05, + "loss": 0.8291477, + "num_input_tokens_seen": 377081424, + "router_z_loss_mlp": 0.32080078, + "step": 4551, + "time_per_iteration": 2.5726113319396973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061371, + "balance_loss_mlp": 1.02987576, + "epoch": 0.8757214313197383, + "flos": 591557976576.0, + "grad_norm": 0.04987162654706675, + "language_loss": 0.81259084, + "learning_rate": 3.9960195073502345e-05, + "loss": 0.82320452, + "num_input_tokens_seen": 377159360, + "router_z_loss_mlp": 0.31469727, + "step": 4552, + "time_per_iteration": 2.817884683609009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062068, + "balance_loss_mlp": 1.03012002, + "epoch": 0.8759138130050019, + "flos": 976456249344.0, + "grad_norm": 0.06584475086644327, + "language_loss": 0.77716434, + "learning_rate": 3.9838243469143555e-05, + "loss": 0.78778505, + "num_input_tokens_seen": 377240704, + "router_z_loss_mlp": 0.31933594, + "step": 4553, + "time_per_iteration": 3.2275702953338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059915, + "balance_loss_mlp": 1.02808642, + "epoch": 0.8761061946902655, + "flos": 802405357056.0, + "grad_norm": 0.04329284311928099, + "language_loss": 0.77679586, + "learning_rate": 3.971647051542243e-05, + "loss": 0.78739506, + "num_input_tokens_seen": 377324176, + "router_z_loss_mlp": 0.31811523, + "step": 4554, + "time_per_iteration": 3.0647592544555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058788, + "balance_loss_mlp": 1.02731705, + "epoch": 0.8762985763755291, + "flos": 698158738944.0, + "grad_norm": 0.048765571498963003, + "language_loss": 0.74434495, + "learning_rate": 3.95948762596155e-05, + "loss": 0.75493276, + "num_input_tokens_seen": 377403440, + "router_z_loss_mlp": 0.31445312, + "step": 4555, + "time_per_iteration": 2.961852550506592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106342, + "balance_loss_mlp": 1.03130519, + "epoch": 0.8764909580607926, + "flos": 629416645632.0, + "grad_norm": 0.05400398397225891, + "language_loss": 0.80043375, + "learning_rate": 3.9473460748929765e-05, + "loss": 0.81106794, + "num_input_tokens_seen": 377483440, + "router_z_loss_mlp": 0.32104492, + "step": 4556, + "time_per_iteration": 2.833293914794922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059523, + "balance_loss_mlp": 1.028934, + "epoch": 0.8766833397460562, + "flos": 481297959936.0, + "grad_norm": 0.05296826742775525, + "language_loss": 0.80469096, + "learning_rate": 3.935222403050304e-05, + "loss": 0.81528622, + "num_input_tokens_seen": 377554688, + "router_z_loss_mlp": 0.30541992, + "step": 4557, + "time_per_iteration": 2.6411917209625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059244, + "balance_loss_mlp": 1.02808261, + "epoch": 0.8768757214313198, + "flos": 407514336768.0, + "grad_norm": 0.05694908001629326, + "language_loss": 0.78118753, + "learning_rate": 3.923116615140354e-05, + "loss": 0.79177999, + "num_input_tokens_seen": 377617616, + "router_z_loss_mlp": 0.3112793, + "step": 4558, + "time_per_iteration": 2.472745180130005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061431, + "balance_loss_mlp": 1.02931571, + "epoch": 0.8770681031165833, + "flos": 582314517504.0, + "grad_norm": 0.06519133015059232, + "language_loss": 0.8193962, + "learning_rate": 3.9110287158630076e-05, + "loss": 0.83001053, + "num_input_tokens_seen": 377685888, + "router_z_loss_mlp": 0.32104492, + "step": 4559, + "time_per_iteration": 2.6806676387786865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062883, + "balance_loss_mlp": 1.03086364, + "epoch": 0.8772604848018468, + "flos": 508437762048.0, + "grad_norm": 0.06392536215328089, + "language_loss": 0.80933923, + "learning_rate": 3.8989587099111875e-05, + "loss": 0.81996804, + "num_input_tokens_seen": 377755744, + "router_z_loss_mlp": 0.32006836, + "step": 4560, + "time_per_iteration": 2.5991218090057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059632, + "balance_loss_mlp": 1.02837586, + "epoch": 0.8774528664871104, + "flos": 408617215488.0, + "grad_norm": 0.06067903743456465, + "language_loss": 0.84571135, + "learning_rate": 3.886906601970913e-05, + "loss": 0.85630763, + "num_input_tokens_seen": 377818880, + "router_z_loss_mlp": 0.31225586, + "step": 4561, + "time_per_iteration": 2.455996513366699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061512, + "balance_loss_mlp": 1.02975512, + "epoch": 0.877645248172374, + "flos": 500589573120.0, + "grad_norm": 0.05408973543487403, + "language_loss": 0.83434474, + "learning_rate": 3.8748723967212184e-05, + "loss": 0.84495986, + "num_input_tokens_seen": 377893280, + "router_z_loss_mlp": 0.31738281, + "step": 4562, + "time_per_iteration": 2.6538524627685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061289, + "balance_loss_mlp": 1.02948415, + "epoch": 0.8778376298576376, + "flos": 632857701888.0, + "grad_norm": 0.05369014995175808, + "language_loss": 0.77912921, + "learning_rate": 3.862856098834189e-05, + "loss": 0.78974211, + "num_input_tokens_seen": 377972912, + "router_z_loss_mlp": 0.31787109, + "step": 4563, + "time_per_iteration": 2.8910348415374756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063792, + "balance_loss_mlp": 1.03274965, + "epoch": 0.8780300115429012, + "flos": 533707868160.0, + "grad_norm": 0.053856474502613036, + "language_loss": 0.79521894, + "learning_rate": 3.850857712974976e-05, + "loss": 0.80585694, + "num_input_tokens_seen": 378054000, + "router_z_loss_mlp": 0.31005859, + "step": 4564, + "time_per_iteration": 2.7875571250915527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059359, + "balance_loss_mlp": 1.02855527, + "epoch": 0.8782223932281646, + "flos": 511411746816.0, + "grad_norm": 0.04753076591808214, + "language_loss": 0.7683506, + "learning_rate": 3.838877243801758e-05, + "loss": 0.77894419, + "num_input_tokens_seen": 378120336, + "router_z_loss_mlp": 0.30761719, + "step": 4565, + "time_per_iteration": 2.6198067665100098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061378, + "balance_loss_mlp": 1.02950168, + "epoch": 0.8784147749134282, + "flos": 780333225984.0, + "grad_norm": 0.05851858406426915, + "language_loss": 0.69561017, + "learning_rate": 3.826914695965766e-05, + "loss": 0.70622396, + "num_input_tokens_seen": 378216672, + "router_z_loss_mlp": 0.31860352, + "step": 4566, + "time_per_iteration": 3.172079563140869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063568, + "balance_loss_mlp": 1.03228772, + "epoch": 0.8786071565986918, + "flos": 560738571264.0, + "grad_norm": 0.06257605820389481, + "language_loss": 0.75450444, + "learning_rate": 3.814970074111279e-05, + "loss": 0.76514018, + "num_input_tokens_seen": 378287536, + "router_z_loss_mlp": 0.3125, + "step": 4567, + "time_per_iteration": 2.718393087387085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060158, + "balance_loss_mlp": 1.02818608, + "epoch": 0.8787995382839554, + "flos": 603148377600.0, + "grad_norm": 0.04978565158238451, + "language_loss": 0.77231061, + "learning_rate": 3.8030433828755926e-05, + "loss": 0.78291219, + "num_input_tokens_seen": 378362128, + "router_z_loss_mlp": 0.31958008, + "step": 4568, + "time_per_iteration": 2.783825159072876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059599, + "balance_loss_mlp": 1.02838981, + "epoch": 0.8789919199692189, + "flos": 559970343936.0, + "grad_norm": 0.04489278278007527, + "language_loss": 0.84756523, + "learning_rate": 3.7911346268890924e-05, + "loss": 0.85816121, + "num_input_tokens_seen": 378435696, + "router_z_loss_mlp": 0.31176758, + "step": 4569, + "time_per_iteration": 2.6692512035369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059281, + "balance_loss_mlp": 1.0283581, + "epoch": 0.8791843016544825, + "flos": 538857086976.0, + "grad_norm": 0.056088812281162366, + "language_loss": 0.81766403, + "learning_rate": 3.7792438107751405e-05, + "loss": 0.82825685, + "num_input_tokens_seen": 378505664, + "router_z_loss_mlp": 0.30883789, + "step": 4570, + "time_per_iteration": 2.6016881465911865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060689, + "balance_loss_mlp": 1.02938414, + "epoch": 0.8793766833397461, + "flos": 1008275226624.0, + "grad_norm": 0.05357677729025578, + "language_loss": 0.79291123, + "learning_rate": 3.767370939150167e-05, + "loss": 0.80351812, + "num_input_tokens_seen": 378598016, + "router_z_loss_mlp": 0.31274414, + "step": 4571, + "time_per_iteration": 3.325495481491089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065188, + "balance_loss_mlp": 1.03319228, + "epoch": 0.8795690650250096, + "flos": 678320068608.0, + "grad_norm": 0.04870756479019928, + "language_loss": 0.80827546, + "learning_rate": 3.755516016623628e-05, + "loss": 0.81892741, + "num_input_tokens_seen": 378676176, + "router_z_loss_mlp": 0.31982422, + "step": 4572, + "time_per_iteration": 2.8708269596099854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058214, + "balance_loss_mlp": 1.02669477, + "epoch": 0.8797614467102732, + "flos": 453202255872.0, + "grad_norm": 0.06319465276598665, + "language_loss": 0.88573319, + "learning_rate": 3.7436790477980157e-05, + "loss": 0.89631534, + "num_input_tokens_seen": 378737952, + "router_z_loss_mlp": 0.31494141, + "step": 4573, + "time_per_iteration": 2.5026752948760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059994, + "balance_loss_mlp": 1.02866578, + "epoch": 0.8799538283955367, + "flos": 550649719296.0, + "grad_norm": 0.04909208675254753, + "language_loss": 0.8408621, + "learning_rate": 3.7318600372688526e-05, + "loss": 0.85146207, + "num_input_tokens_seen": 378806704, + "router_z_loss_mlp": 0.31298828, + "step": 4574, + "time_per_iteration": 2.6635913848876953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062455, + "balance_loss_mlp": 1.03079283, + "epoch": 0.8801462100808003, + "flos": 807072947712.0, + "grad_norm": 0.06010438341663848, + "language_loss": 0.8430174, + "learning_rate": 3.720058989624681e-05, + "loss": 0.85364199, + "num_input_tokens_seen": 378887616, + "router_z_loss_mlp": 0.31640625, + "step": 4575, + "time_per_iteration": 3.0828750133514404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106221, + "balance_loss_mlp": 1.03057218, + "epoch": 0.8803385917660639, + "flos": 768366065664.0, + "grad_norm": 0.047129972836998074, + "language_loss": 0.84180987, + "learning_rate": 3.708275909447079e-05, + "loss": 0.85243201, + "num_input_tokens_seen": 378964656, + "router_z_loss_mlp": 0.31616211, + "step": 4576, + "time_per_iteration": 2.9739205837249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105957, + "balance_loss_mlp": 1.02814686, + "epoch": 0.8805309734513275, + "flos": 567070654464.0, + "grad_norm": 0.053878873022787786, + "language_loss": 0.8110702, + "learning_rate": 3.696510801310632e-05, + "loss": 0.82166588, + "num_input_tokens_seen": 379036752, + "router_z_loss_mlp": 0.31396484, + "step": 4577, + "time_per_iteration": 2.696599006652832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060244, + "balance_loss_mlp": 1.02851081, + "epoch": 0.880723355136591, + "flos": 679481174016.0, + "grad_norm": 0.05262790478746066, + "language_loss": 0.8144868, + "learning_rate": 3.6847636697829755e-05, + "loss": 0.82508922, + "num_input_tokens_seen": 379106480, + "router_z_loss_mlp": 0.31713867, + "step": 4578, + "time_per_iteration": 2.802515745162964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106303, + "balance_loss_mlp": 1.03151155, + "epoch": 0.8809157368218545, + "flos": 565347935232.0, + "grad_norm": 0.05499943484212242, + "language_loss": 0.78842521, + "learning_rate": 3.673034519424734e-05, + "loss": 0.79905552, + "num_input_tokens_seen": 379182544, + "router_z_loss_mlp": 0.31494141, + "step": 4579, + "time_per_iteration": 2.7430505752563477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060042, + "balance_loss_mlp": 1.02816534, + "epoch": 0.8811081185071181, + "flos": 515153958912.0, + "grad_norm": 0.04849663850018554, + "language_loss": 0.7603749, + "learning_rate": 3.661323354789586e-05, + "loss": 0.77097535, + "num_input_tokens_seen": 379255856, + "router_z_loss_mlp": 0.31860352, + "step": 4580, + "time_per_iteration": 2.7311344146728516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061477, + "balance_loss_mlp": 1.03117371, + "epoch": 0.8813005001923817, + "flos": 594067862016.0, + "grad_norm": 0.06983405724673822, + "language_loss": 0.8144145, + "learning_rate": 3.649630180424191e-05, + "loss": 0.82502925, + "num_input_tokens_seen": 379322704, + "router_z_loss_mlp": 0.30273438, + "step": 4581, + "time_per_iteration": 2.7526886463165283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055676, + "balance_loss_mlp": 1.02415729, + "epoch": 0.8814928818776453, + "flos": 666630743040.0, + "grad_norm": 0.055477837077192144, + "language_loss": 0.79182696, + "learning_rate": 3.637955000868254e-05, + "loss": 0.80238372, + "num_input_tokens_seen": 379395008, + "router_z_loss_mlp": 0.31494141, + "step": 4582, + "time_per_iteration": 2.8307814598083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060037, + "balance_loss_mlp": 1.02868462, + "epoch": 0.8816852635629088, + "flos": 608873766912.0, + "grad_norm": 0.048277670038562745, + "language_loss": 0.85441208, + "learning_rate": 3.626297820654467e-05, + "loss": 0.86501247, + "num_input_tokens_seen": 379465824, + "router_z_loss_mlp": 0.31323242, + "step": 4583, + "time_per_iteration": 2.7170026302337646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061128, + "balance_loss_mlp": 1.02958536, + "epoch": 0.8818776452481724, + "flos": 480131062272.0, + "grad_norm": 0.06483310109620229, + "language_loss": 0.82067692, + "learning_rate": 3.614658644308572e-05, + "loss": 0.83128822, + "num_input_tokens_seen": 379534960, + "router_z_loss_mlp": 0.31518555, + "step": 4584, + "time_per_iteration": 2.618037223815918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064716, + "balance_loss_mlp": 1.0326488, + "epoch": 0.882070026933436, + "flos": 1044985936896.0, + "grad_norm": 0.06883560573017064, + "language_loss": 0.73482269, + "learning_rate": 3.60303747634928e-05, + "loss": 0.74546981, + "num_input_tokens_seen": 379617456, + "router_z_loss_mlp": 0.32055664, + "step": 4585, + "time_per_iteration": 3.3003652095794678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060323, + "balance_loss_mlp": 1.02880442, + "epoch": 0.8822624086186995, + "flos": 474153979392.0, + "grad_norm": 0.05196830736419867, + "language_loss": 0.79747665, + "learning_rate": 3.591434321288345e-05, + "loss": 0.80807984, + "num_input_tokens_seen": 379687792, + "router_z_loss_mlp": 0.31494141, + "step": 4586, + "time_per_iteration": 2.6248860359191895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059576, + "balance_loss_mlp": 1.02862895, + "epoch": 0.882454790303963, + "flos": 653725057536.0, + "grad_norm": 0.06065088286401367, + "language_loss": 0.81563091, + "learning_rate": 3.579849183630485e-05, + "loss": 0.82622671, + "num_input_tokens_seen": 379761120, + "router_z_loss_mlp": 0.30932617, + "step": 4587, + "time_per_iteration": 2.7769362926483154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061148, + "balance_loss_mlp": 1.02922332, + "epoch": 0.8826471719892266, + "flos": 470081498112.0, + "grad_norm": 0.04739147053606439, + "language_loss": 0.78241807, + "learning_rate": 3.568282067873468e-05, + "loss": 0.79302955, + "num_input_tokens_seen": 379829008, + "router_z_loss_mlp": 0.3190918, + "step": 4588, + "time_per_iteration": 2.570162773132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061217, + "balance_loss_mlp": 1.02879214, + "epoch": 0.8828395536744902, + "flos": 468501373440.0, + "grad_norm": 0.047057706534628874, + "language_loss": 0.83742458, + "learning_rate": 3.556732978508048e-05, + "loss": 0.84803677, + "num_input_tokens_seen": 379899584, + "router_z_loss_mlp": 0.32421875, + "step": 4589, + "time_per_iteration": 2.686675786972046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105933, + "balance_loss_mlp": 1.02783465, + "epoch": 0.8830319353597538, + "flos": 721044177408.0, + "grad_norm": 0.04950856406845548, + "language_loss": 0.81078488, + "learning_rate": 3.545201920017971e-05, + "loss": 0.82137823, + "num_input_tokens_seen": 379979440, + "router_z_loss_mlp": 0.31469727, + "step": 4590, + "time_per_iteration": 2.9431474208831787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064053, + "balance_loss_mlp": 1.03282046, + "epoch": 0.8832243170450174, + "flos": 443049384960.0, + "grad_norm": 0.10786996917657111, + "language_loss": 0.81250805, + "learning_rate": 3.5336888968799996e-05, + "loss": 0.82314861, + "num_input_tokens_seen": 380046944, + "router_z_loss_mlp": 0.31201172, + "step": 4591, + "time_per_iteration": 2.5478732585906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061662, + "balance_loss_mlp": 1.02983332, + "epoch": 0.8834166987302808, + "flos": 566293662720.0, + "grad_norm": 0.05506928252909138, + "language_loss": 0.82108343, + "learning_rate": 3.5221939135638756e-05, + "loss": 0.83170003, + "num_input_tokens_seen": 380118048, + "router_z_loss_mlp": 0.31811523, + "step": 4592, + "time_per_iteration": 2.7421905994415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059531, + "balance_loss_mlp": 1.02796483, + "epoch": 0.8836090804155444, + "flos": 609022153728.0, + "grad_norm": 0.0589985976139412, + "language_loss": 0.81940699, + "learning_rate": 3.510716974532352e-05, + "loss": 0.83000231, + "num_input_tokens_seen": 380192416, + "router_z_loss_mlp": 0.31542969, + "step": 4593, + "time_per_iteration": 2.7895593643188477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061044, + "balance_loss_mlp": 1.02964461, + "epoch": 0.883801462100808, + "flos": 556804302336.0, + "grad_norm": 0.05233139357243583, + "language_loss": 0.80372763, + "learning_rate": 3.4992580842411745e-05, + "loss": 0.81433809, + "num_input_tokens_seen": 380264432, + "router_z_loss_mlp": 0.3137207, + "step": 4594, + "time_per_iteration": 2.6803102493286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060001, + "balance_loss_mlp": 1.02843499, + "epoch": 0.8839938437860716, + "flos": 515936742912.0, + "grad_norm": 0.07064799718920817, + "language_loss": 0.77353942, + "learning_rate": 3.487817247139064e-05, + "loss": 0.78413939, + "num_input_tokens_seen": 380334192, + "router_z_loss_mlp": 0.31542969, + "step": 4595, + "time_per_iteration": 2.6111834049224854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106149, + "balance_loss_mlp": 1.02989948, + "epoch": 0.8841862254713351, + "flos": 713386635264.0, + "grad_norm": 0.09630405385113956, + "language_loss": 0.7850247, + "learning_rate": 3.47639446766777e-05, + "loss": 0.79563963, + "num_input_tokens_seen": 380407504, + "router_z_loss_mlp": 0.31567383, + "step": 4596, + "time_per_iteration": 2.863654375076294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061691, + "balance_loss_mlp": 1.03012478, + "epoch": 0.8843786071565987, + "flos": 833626404864.0, + "grad_norm": 0.05760354635847824, + "language_loss": 0.82457066, + "learning_rate": 3.4649897502620095e-05, + "loss": 0.83518755, + "num_input_tokens_seen": 380486272, + "router_z_loss_mlp": 0.31542969, + "step": 4597, + "time_per_iteration": 2.9976413249969482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062715, + "balance_loss_mlp": 1.03188777, + "epoch": 0.8845709888418622, + "flos": 656562240000.0, + "grad_norm": 0.048945613215712114, + "language_loss": 0.82873589, + "learning_rate": 3.453603099349462e-05, + "loss": 0.83936304, + "num_input_tokens_seen": 380568480, + "router_z_loss_mlp": 0.30810547, + "step": 4598, + "time_per_iteration": 2.8781182765960693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061116, + "balance_loss_mlp": 1.03000224, + "epoch": 0.8847633705271258, + "flos": 523038463488.0, + "grad_norm": 0.0499308034693015, + "language_loss": 0.80891055, + "learning_rate": 3.442234519350823e-05, + "loss": 0.81952167, + "num_input_tokens_seen": 380643088, + "router_z_loss_mlp": 0.31079102, + "step": 4599, + "time_per_iteration": 2.723100423812866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063587, + "balance_loss_mlp": 1.03261626, + "epoch": 0.8849557522123894, + "flos": 548330480640.0, + "grad_norm": 0.05641788542621963, + "language_loss": 0.84390503, + "learning_rate": 3.430884014679786e-05, + "loss": 0.85454094, + "num_input_tokens_seen": 380714512, + "router_z_loss_mlp": 0.30932617, + "step": 4600, + "time_per_iteration": 2.6754045486450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062521, + "balance_loss_mlp": 1.03088272, + "epoch": 0.8851481338976529, + "flos": 622070433792.0, + "grad_norm": 0.0608435446742517, + "language_loss": 0.83655906, + "learning_rate": 3.4195515897429974e-05, + "loss": 0.84718424, + "num_input_tokens_seen": 380789168, + "router_z_loss_mlp": 0.31616211, + "step": 4601, + "time_per_iteration": 2.7671477794647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106026, + "balance_loss_mlp": 1.02902758, + "epoch": 0.8853405155829165, + "flos": 444123150336.0, + "grad_norm": 0.05872328742975128, + "language_loss": 0.80575866, + "learning_rate": 3.408237248940088e-05, + "loss": 0.81636125, + "num_input_tokens_seen": 380856992, + "router_z_loss_mlp": 0.31201172, + "step": 4602, + "time_per_iteration": 2.560318946838379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062734, + "balance_loss_mlp": 1.03095329, + "epoch": 0.8855328972681801, + "flos": 730152396288.0, + "grad_norm": 0.056216541125300654, + "language_loss": 0.77893907, + "learning_rate": 3.396940996663683e-05, + "loss": 0.7895664, + "num_input_tokens_seen": 380930480, + "router_z_loss_mlp": 0.31762695, + "step": 4603, + "time_per_iteration": 2.8867790699005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063127, + "balance_loss_mlp": 1.03201365, + "epoch": 0.8857252789534437, + "flos": 487132448256.0, + "grad_norm": 0.07079921333147207, + "language_loss": 0.78746498, + "learning_rate": 3.385662837299375e-05, + "loss": 0.7980963, + "num_input_tokens_seen": 380994192, + "router_z_loss_mlp": 0.31079102, + "step": 4604, + "time_per_iteration": 2.5524046421051025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062191, + "balance_loss_mlp": 1.03117263, + "epoch": 0.8859176606387072, + "flos": 508290785280.0, + "grad_norm": 0.05238353409776557, + "language_loss": 0.81618583, + "learning_rate": 3.374402775225727e-05, + "loss": 0.82680774, + "num_input_tokens_seen": 381066848, + "router_z_loss_mlp": 0.31005859, + "step": 4605, + "time_per_iteration": 2.6777870655059814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062144, + "balance_loss_mlp": 1.03043461, + "epoch": 0.8861100423239707, + "flos": 516370318848.0, + "grad_norm": 0.055497975758408605, + "language_loss": 0.85710311, + "learning_rate": 3.3631608148142925e-05, + "loss": 0.8677246, + "num_input_tokens_seen": 381138816, + "router_z_loss_mlp": 0.31689453, + "step": 4606, + "time_per_iteration": 2.6625237464904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064071, + "balance_loss_mlp": 1.03319585, + "epoch": 0.8863024240092343, + "flos": 626692944384.0, + "grad_norm": 0.05509705271526416, + "language_loss": 0.79623628, + "learning_rate": 3.3519369604295746e-05, + "loss": 0.80687696, + "num_input_tokens_seen": 381208448, + "router_z_loss_mlp": 0.30834961, + "step": 4607, + "time_per_iteration": 2.7269294261932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106404, + "balance_loss_mlp": 1.03230667, + "epoch": 0.8864948056944979, + "flos": 766564770816.0, + "grad_norm": 0.10040451996396124, + "language_loss": 0.83269691, + "learning_rate": 3.340731216429083e-05, + "loss": 0.8433373, + "num_input_tokens_seen": 381289712, + "router_z_loss_mlp": 0.31713867, + "step": 4608, + "time_per_iteration": 2.991093397140503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018938, + "balance_loss_mlp": 1.01102269, + "epoch": 0.8866871873797615, + "flos": 1501500907008.0, + "grad_norm": 0.009535247872241597, + "language_loss": 0.78830957, + "learning_rate": 3.329543587163253e-05, + "loss": 0.79849893, + "num_input_tokens_seen": 381520848, + "router_z_loss_mlp": 0.07910156, + "step": 4609, + "time_per_iteration": 4.8284571170806885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061082, + "balance_loss_mlp": 1.02837062, + "epoch": 0.886879569065025, + "flos": 811164367872.0, + "grad_norm": 0.13586161840975353, + "language_loss": 0.81234121, + "learning_rate": 3.3183740769755e-05, + "loss": 0.82295209, + "num_input_tokens_seen": 381603008, + "router_z_loss_mlp": 0.32714844, + "step": 4610, + "time_per_iteration": 3.0232110023498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018727, + "balance_loss_mlp": 1.01081121, + "epoch": 0.8870719507502886, + "flos": 1581994934784.0, + "grad_norm": 0.009521282732020938, + "language_loss": 0.7691083, + "learning_rate": 3.307222690202238e-05, + "loss": 0.77929556, + "num_input_tokens_seen": 381844336, + "router_z_loss_mlp": 0.07910156, + "step": 4611, + "time_per_iteration": 5.034501552581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106293, + "balance_loss_mlp": 1.03231668, + "epoch": 0.8872643324355521, + "flos": 633743792640.0, + "grad_norm": 0.05784261037220574, + "language_loss": 0.74835932, + "learning_rate": 3.296089431172811e-05, + "loss": 0.75898862, + "num_input_tokens_seen": 381918576, + "router_z_loss_mlp": 0.30566406, + "step": 4612, + "time_per_iteration": 2.8261477947235107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062432, + "balance_loss_mlp": 1.031461, + "epoch": 0.8874567141208157, + "flos": 535498988544.0, + "grad_norm": 0.0754643632292133, + "language_loss": 0.8301453, + "learning_rate": 3.284974304209532e-05, + "loss": 0.84076959, + "num_input_tokens_seen": 381987296, + "router_z_loss_mlp": 0.30932617, + "step": 4613, + "time_per_iteration": 2.6077656745910645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058933, + "balance_loss_mlp": 1.02801013, + "epoch": 0.8876490958060793, + "flos": 1565700931584.0, + "grad_norm": 0.05499745508093668, + "language_loss": 0.79193819, + "learning_rate": 3.27387731362766e-05, + "loss": 0.80252743, + "num_input_tokens_seen": 382091744, + "router_z_loss_mlp": 0.30883789, + "step": 4614, + "time_per_iteration": 3.8746235370635986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064375, + "balance_loss_mlp": 1.03335643, + "epoch": 0.8878414774913428, + "flos": 636343838208.0, + "grad_norm": 0.05793142822201318, + "language_loss": 0.84617949, + "learning_rate": 3.2627984637354444e-05, + "loss": 0.85682321, + "num_input_tokens_seen": 382169600, + "router_z_loss_mlp": 0.30981445, + "step": 4615, + "time_per_iteration": 2.779799461364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061212, + "balance_loss_mlp": 1.03014576, + "epoch": 0.8880338591766064, + "flos": 496182440448.0, + "grad_norm": 0.06017785119690372, + "language_loss": 0.81558824, + "learning_rate": 3.251737758834084e-05, + "loss": 0.82620031, + "num_input_tokens_seen": 382238336, + "router_z_loss_mlp": 0.31030273, + "step": 4616, + "time_per_iteration": 2.609734058380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063093, + "balance_loss_mlp": 1.03126431, + "epoch": 0.88822624086187, + "flos": 542599299072.0, + "grad_norm": 0.05444758565813165, + "language_loss": 0.79956746, + "learning_rate": 3.2406952032177086e-05, + "loss": 0.81019837, + "num_input_tokens_seen": 382308560, + "router_z_loss_mlp": 0.31811523, + "step": 4617, + "time_per_iteration": 2.63232684135437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061015, + "balance_loss_mlp": 1.02890027, + "epoch": 0.8884186225471336, + "flos": 551560541184.0, + "grad_norm": 0.06903875760224201, + "language_loss": 0.83818024, + "learning_rate": 3.229670801173418e-05, + "loss": 0.84879041, + "num_input_tokens_seen": 382377504, + "router_z_loss_mlp": 0.32104492, + "step": 4618, + "time_per_iteration": 2.589545488357544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013111, + "balance_loss_mlp": 1.00505221, + "epoch": 0.888611004232397, + "flos": 1564417276416.0, + "grad_norm": 0.0068369418927251785, + "language_loss": 0.78512192, + "learning_rate": 3.218664556981288e-05, + "loss": 0.79525304, + "num_input_tokens_seen": 382615728, + "router_z_loss_mlp": 0.08056641, + "step": 4619, + "time_per_iteration": 5.003114938735962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062648, + "balance_loss_mlp": 1.03170085, + "epoch": 0.8888033859176606, + "flos": 766678252032.0, + "grad_norm": 0.057281222385008684, + "language_loss": 0.82745749, + "learning_rate": 3.207676474914301e-05, + "loss": 0.83808392, + "num_input_tokens_seen": 382695552, + "router_z_loss_mlp": 0.30908203, + "step": 4620, + "time_per_iteration": 2.990114212036133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061842, + "balance_loss_mlp": 1.0310626, + "epoch": 0.8889957676029242, + "flos": 933727758336.0, + "grad_norm": 0.053752902191243575, + "language_loss": 0.84139264, + "learning_rate": 3.1967065592384105e-05, + "loss": 0.85201108, + "num_input_tokens_seen": 382775824, + "router_z_loss_mlp": 0.30761719, + "step": 4621, + "time_per_iteration": 3.1363883018493652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064357, + "balance_loss_mlp": 1.03295684, + "epoch": 0.8891881492881878, + "flos": 589317313536.0, + "grad_norm": 0.057360134783463114, + "language_loss": 0.81454372, + "learning_rate": 3.1857548142125104e-05, + "loss": 0.82518733, + "num_input_tokens_seen": 382854464, + "router_z_loss_mlp": 0.3137207, + "step": 4622, + "time_per_iteration": 2.7618589401245117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060903, + "balance_loss_mlp": 1.02976584, + "epoch": 0.8893805309734514, + "flos": 540438621696.0, + "grad_norm": 0.06850653634595572, + "language_loss": 0.82143193, + "learning_rate": 3.174821244088466e-05, + "loss": 0.83204097, + "num_input_tokens_seen": 382925088, + "router_z_loss_mlp": 0.3112793, + "step": 4623, + "time_per_iteration": 2.7498483657836914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061404, + "balance_loss_mlp": 1.02990842, + "epoch": 0.8895729126587149, + "flos": 559827749376.0, + "grad_norm": 0.17707667007827743, + "language_loss": 0.81648695, + "learning_rate": 3.163905853111054e-05, + "loss": 0.82710099, + "num_input_tokens_seen": 382998640, + "router_z_loss_mlp": 0.31469727, + "step": 4624, + "time_per_iteration": 2.650419235229492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106132, + "balance_loss_mlp": 1.03027821, + "epoch": 0.8897652943439784, + "flos": 609873338880.0, + "grad_norm": 0.04962289154740808, + "language_loss": 0.81375515, + "learning_rate": 3.153008645517996e-05, + "loss": 0.82436836, + "num_input_tokens_seen": 383076000, + "router_z_loss_mlp": 0.31005859, + "step": 4625, + "time_per_iteration": 2.7451446056365967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062136, + "balance_loss_mlp": 1.03047383, + "epoch": 0.889957676029242, + "flos": 917455209984.0, + "grad_norm": 0.051652869550322736, + "language_loss": 0.77054471, + "learning_rate": 3.142129625539969e-05, + "loss": 0.78116608, + "num_input_tokens_seen": 383166640, + "router_z_loss_mlp": 0.31640625, + "step": 4626, + "time_per_iteration": 3.1661722660064697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063731, + "balance_loss_mlp": 1.03202164, + "epoch": 0.8901500577145056, + "flos": 488452114944.0, + "grad_norm": 0.056002822435115965, + "language_loss": 0.80131978, + "learning_rate": 3.131268797400588e-05, + "loss": 0.81195712, + "num_input_tokens_seen": 383232928, + "router_z_loss_mlp": 0.31689453, + "step": 4627, + "time_per_iteration": 2.566154718399048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062359, + "balance_loss_mlp": 1.03029203, + "epoch": 0.8903424393997691, + "flos": 733332994560.0, + "grad_norm": 0.06173508777641705, + "language_loss": 0.80719995, + "learning_rate": 3.120426165316398e-05, + "loss": 0.81782359, + "num_input_tokens_seen": 383314352, + "router_z_loss_mlp": 0.32055664, + "step": 4628, + "time_per_iteration": 2.9888558387756348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105903, + "balance_loss_mlp": 1.02784455, + "epoch": 0.8905348210850327, + "flos": 519546534912.0, + "grad_norm": 0.04930821670569134, + "language_loss": 0.81522822, + "learning_rate": 3.109601733496881e-05, + "loss": 0.82581854, + "num_input_tokens_seen": 383384848, + "router_z_loss_mlp": 0.31152344, + "step": 4629, + "time_per_iteration": 2.6437489986419678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060464, + "balance_loss_mlp": 1.02958894, + "epoch": 0.8907272027702963, + "flos": 578672640000.0, + "grad_norm": 0.05385866355437149, + "language_loss": 0.79690862, + "learning_rate": 3.098795506144458e-05, + "loss": 0.8075133, + "num_input_tokens_seen": 383463360, + "router_z_loss_mlp": 0.30834961, + "step": 4630, + "time_per_iteration": 2.810612916946411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061651, + "balance_loss_mlp": 1.03070378, + "epoch": 0.8909195844555599, + "flos": 893258869248.0, + "grad_norm": 0.052849257039567936, + "language_loss": 0.79265952, + "learning_rate": 3.088007487454475e-05, + "loss": 0.803276, + "num_input_tokens_seen": 383542080, + "router_z_loss_mlp": 0.30908203, + "step": 4631, + "time_per_iteration": 3.088334321975708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062189, + "balance_loss_mlp": 1.03069353, + "epoch": 0.8911119661408234, + "flos": 549596302848.0, + "grad_norm": 0.06712203160274297, + "language_loss": 0.84319258, + "learning_rate": 3.077237681615208e-05, + "loss": 0.85381448, + "num_input_tokens_seen": 383613056, + "router_z_loss_mlp": 0.31469727, + "step": 4632, + "time_per_iteration": 2.6473772525787354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061159, + "balance_loss_mlp": 1.02980685, + "epoch": 0.8913043478260869, + "flos": 480884732928.0, + "grad_norm": 0.07195593938803238, + "language_loss": 0.83490551, + "learning_rate": 3.066486092807874e-05, + "loss": 0.84551716, + "num_input_tokens_seen": 383683280, + "router_z_loss_mlp": 0.31323242, + "step": 4633, + "time_per_iteration": 2.620727777481079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062888, + "balance_loss_mlp": 1.03086805, + "epoch": 0.8914967295113505, + "flos": 484317024768.0, + "grad_norm": 0.04555285940128422, + "language_loss": 0.84773296, + "learning_rate": 3.055752725206601e-05, + "loss": 0.85836184, + "num_input_tokens_seen": 383754624, + "router_z_loss_mlp": 0.32006836, + "step": 4634, + "time_per_iteration": 2.618859052658081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060843, + "balance_loss_mlp": 1.02932405, + "epoch": 0.8916891111966141, + "flos": 445432642560.0, + "grad_norm": 0.0523806827340635, + "language_loss": 0.81158233, + "learning_rate": 3.0450375829784714e-05, + "loss": 0.82219076, + "num_input_tokens_seen": 383821984, + "router_z_loss_mlp": 0.31494141, + "step": 4635, + "time_per_iteration": 2.5323636531829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060849, + "balance_loss_mlp": 1.03002167, + "epoch": 0.8918814928818777, + "flos": 563751843840.0, + "grad_norm": 0.0513354141188765, + "language_loss": 0.78050125, + "learning_rate": 3.034340670283453e-05, + "loss": 0.79110974, + "num_input_tokens_seen": 383890880, + "router_z_loss_mlp": 0.30786133, + "step": 4636, + "time_per_iteration": 2.6924479007720947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060607, + "balance_loss_mlp": 1.03030384, + "epoch": 0.8920738745671412, + "flos": 575672514048.0, + "grad_norm": 0.04845445615899239, + "language_loss": 0.81120145, + "learning_rate": 3.0236619912744513e-05, + "loss": 0.8218075, + "num_input_tokens_seen": 383962480, + "router_z_loss_mlp": 0.30249023, + "step": 4637, + "time_per_iteration": 2.693192481994629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061314, + "balance_loss_mlp": 1.0303911, + "epoch": 0.8922662562524047, + "flos": 619898171904.0, + "grad_norm": 0.049196243556278496, + "language_loss": 0.84060216, + "learning_rate": 3.0130015500973163e-05, + "loss": 0.8512153, + "num_input_tokens_seen": 384033616, + "router_z_loss_mlp": 0.30883789, + "step": 4638, + "time_per_iteration": 2.7037692070007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060568, + "balance_loss_mlp": 1.02969277, + "epoch": 0.8924586379376683, + "flos": 583330056192.0, + "grad_norm": 0.05184193670463406, + "language_loss": 0.79242623, + "learning_rate": 3.0023593508907877e-05, + "loss": 0.80303186, + "num_input_tokens_seen": 384108848, + "router_z_loss_mlp": 0.30834961, + "step": 4639, + "time_per_iteration": 2.748689889907837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062785, + "balance_loss_mlp": 1.03164768, + "epoch": 0.8926510196229319, + "flos": 524922716160.0, + "grad_norm": 0.04558515504354127, + "language_loss": 0.8157109, + "learning_rate": 2.991735397786538e-05, + "loss": 0.82633877, + "num_input_tokens_seen": 384185728, + "router_z_loss_mlp": 0.31103516, + "step": 4640, + "time_per_iteration": 2.780665874481201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064798, + "balance_loss_mlp": 1.03318334, + "epoch": 0.8928434013081955, + "flos": 486428239872.0, + "grad_norm": 0.05672028214333359, + "language_loss": 0.80730885, + "learning_rate": 2.981129694909146e-05, + "loss": 0.81795681, + "num_input_tokens_seen": 384251552, + "router_z_loss_mlp": 0.31591797, + "step": 4641, + "time_per_iteration": 2.545320749282837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012709, + "balance_loss_mlp": 1.00441241, + "epoch": 0.893035782993459, + "flos": 1447580837376.0, + "grad_norm": 0.005693234754152928, + "language_loss": 0.80330861, + "learning_rate": 2.970542246376118e-05, + "loss": 0.81343567, + "num_input_tokens_seen": 384472176, + "router_z_loss_mlp": 0.08300781, + "step": 4642, + "time_per_iteration": 4.690560817718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061836, + "balance_loss_mlp": 1.03088951, + "epoch": 0.8932281646787226, + "flos": 611040236544.0, + "grad_norm": 0.05793428976399419, + "language_loss": 0.8072226, + "learning_rate": 2.95997305629786e-05, + "loss": 0.81784093, + "num_input_tokens_seen": 384544224, + "router_z_loss_mlp": 0.30908203, + "step": 4643, + "time_per_iteration": 2.758070945739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062801, + "balance_loss_mlp": 1.03104377, + "epoch": 0.8934205463639862, + "flos": 565494912000.0, + "grad_norm": 0.04973706186555829, + "language_loss": 0.84834957, + "learning_rate": 2.9494221287776957e-05, + "loss": 0.85897756, + "num_input_tokens_seen": 384611728, + "router_z_loss_mlp": 0.31738281, + "step": 4644, + "time_per_iteration": 2.6707870960235596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063096, + "balance_loss_mlp": 1.03217363, + "epoch": 0.8936129280492497, + "flos": 488181482496.0, + "grad_norm": 0.09316028593492325, + "language_loss": 0.77998525, + "learning_rate": 2.9388894679118484e-05, + "loss": 0.79061615, + "num_input_tokens_seen": 384678048, + "router_z_loss_mlp": 0.30883789, + "step": 4645, + "time_per_iteration": 2.5601553916931152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063396, + "balance_loss_mlp": 1.03168607, + "epoch": 0.8938053097345132, + "flos": 886095949824.0, + "grad_norm": 0.05248493446128753, + "language_loss": 0.8068549, + "learning_rate": 2.9283750777894912e-05, + "loss": 0.81748885, + "num_input_tokens_seen": 384766768, + "router_z_loss_mlp": 0.31689453, + "step": 4646, + "time_per_iteration": 3.2007439136505127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060405, + "balance_loss_mlp": 1.02924371, + "epoch": 0.8939976914197768, + "flos": 592999888896.0, + "grad_norm": 0.05511284894633522, + "language_loss": 0.83739501, + "learning_rate": 2.9178789624926427e-05, + "loss": 0.8479991, + "num_input_tokens_seen": 384842352, + "router_z_loss_mlp": 0.3112793, + "step": 4647, + "time_per_iteration": 2.709075927734375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065548, + "balance_loss_mlp": 1.03357601, + "epoch": 0.8941900731050404, + "flos": 522983208960.0, + "grad_norm": 0.056894932724212664, + "language_loss": 0.80778831, + "learning_rate": 2.9074011260962706e-05, + "loss": 0.81844378, + "num_input_tokens_seen": 384912048, + "router_z_loss_mlp": 0.31958008, + "step": 4648, + "time_per_iteration": 2.6082539558410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061533, + "balance_loss_mlp": 1.03051448, + "epoch": 0.894382454790304, + "flos": 800247651840.0, + "grad_norm": 0.04566115166749404, + "language_loss": 0.80567217, + "learning_rate": 2.8969415726682158e-05, + "loss": 0.8162874, + "num_input_tokens_seen": 384986560, + "router_z_loss_mlp": 0.30981445, + "step": 4649, + "time_per_iteration": 2.979668140411377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106435, + "balance_loss_mlp": 1.03302193, + "epoch": 0.8945748364755676, + "flos": 478782282240.0, + "grad_norm": 0.05084175765827824, + "language_loss": 0.84974194, + "learning_rate": 2.8865003062692517e-05, + "loss": 0.86038542, + "num_input_tokens_seen": 385057376, + "router_z_loss_mlp": 0.31298828, + "step": 4650, + "time_per_iteration": 2.5919971466064453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061126, + "balance_loss_mlp": 1.03068006, + "epoch": 0.894767218160831, + "flos": 508507573248.0, + "grad_norm": 0.050809321075872965, + "language_loss": 0.82988006, + "learning_rate": 2.876077330953042e-05, + "loss": 0.84049129, + "num_input_tokens_seen": 385130880, + "router_z_loss_mlp": 0.30395508, + "step": 4651, + "time_per_iteration": 2.7233057022094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058609, + "balance_loss_mlp": 1.02763844, + "epoch": 0.8949595998460946, + "flos": 685557181440.0, + "grad_norm": 0.06487677306684464, + "language_loss": 0.81605327, + "learning_rate": 2.8656726507661378e-05, + "loss": 0.82663941, + "num_input_tokens_seen": 385205808, + "router_z_loss_mlp": 0.30932617, + "step": 4652, + "time_per_iteration": 2.82380747795105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061293, + "balance_loss_mlp": 1.02941608, + "epoch": 0.8951519815313582, + "flos": 799578349056.0, + "grad_norm": 0.05081684186853934, + "language_loss": 0.7694239, + "learning_rate": 2.855286269747981e-05, + "loss": 0.78003681, + "num_input_tokens_seen": 385283616, + "router_z_loss_mlp": 0.31860352, + "step": 4653, + "time_per_iteration": 2.9739062786102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059724, + "balance_loss_mlp": 1.02849102, + "epoch": 0.8953443632166218, + "flos": 666443068416.0, + "grad_norm": 0.06375205061358989, + "language_loss": 0.85606253, + "learning_rate": 2.8449181919309398e-05, + "loss": 0.86665976, + "num_input_tokens_seen": 385357488, + "router_z_loss_mlp": 0.31201172, + "step": 4654, + "time_per_iteration": 2.8078479766845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057999, + "balance_loss_mlp": 1.02690959, + "epoch": 0.8955367449018854, + "flos": 644670683136.0, + "grad_norm": 0.04984422394174067, + "language_loss": 0.83020389, + "learning_rate": 2.8345684213402556e-05, + "loss": 0.84078383, + "num_input_tokens_seen": 385431280, + "router_z_loss_mlp": 0.31054688, + "step": 4655, + "time_per_iteration": 2.814558506011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062912, + "balance_loss_mlp": 1.03113103, + "epoch": 0.8957291265871489, + "flos": 808353326592.0, + "grad_norm": 0.053021459210243815, + "language_loss": 0.77264309, + "learning_rate": 2.8242369619940644e-05, + "loss": 0.78327227, + "num_input_tokens_seen": 385509840, + "router_z_loss_mlp": 0.31762695, + "step": 4656, + "time_per_iteration": 3.0364105701446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062996, + "balance_loss_mlp": 1.03104842, + "epoch": 0.8959215082724125, + "flos": 518664826368.0, + "grad_norm": 0.06969643798779511, + "language_loss": 0.77000499, + "learning_rate": 2.813923817903391e-05, + "loss": 0.78063488, + "num_input_tokens_seen": 385580384, + "router_z_loss_mlp": 0.31933594, + "step": 4657, + "time_per_iteration": 2.6151626110076904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106056, + "balance_loss_mlp": 1.02889752, + "epoch": 0.896113889957676, + "flos": 476669657088.0, + "grad_norm": 0.04964287244699384, + "language_loss": 0.76999301, + "learning_rate": 2.8036289930721603e-05, + "loss": 0.78059864, + "num_input_tokens_seen": 385649184, + "router_z_loss_mlp": 0.31640625, + "step": 4658, + "time_per_iteration": 2.5889346599578857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062157, + "balance_loss_mlp": 1.03051877, + "epoch": 0.8963062716429396, + "flos": 517911155712.0, + "grad_norm": 0.05573202137448351, + "language_loss": 0.82991636, + "learning_rate": 2.7933524914971697e-05, + "loss": 0.84053797, + "num_input_tokens_seen": 385717072, + "router_z_loss_mlp": 0.31616211, + "step": 4659, + "time_per_iteration": 2.6229076385498047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059463, + "balance_loss_mlp": 1.02868307, + "epoch": 0.8964986533282031, + "flos": 508231148544.0, + "grad_norm": 0.05335291293119473, + "language_loss": 0.81595254, + "learning_rate": 2.7830943171681113e-05, + "loss": 0.82654721, + "num_input_tokens_seen": 385788880, + "router_z_loss_mlp": 0.30737305, + "step": 4660, + "time_per_iteration": 2.6678032875061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061446, + "balance_loss_mlp": 1.03007066, + "epoch": 0.8966910350134667, + "flos": 535819083264.0, + "grad_norm": 0.05953051105641742, + "language_loss": 0.81392318, + "learning_rate": 2.77285447406756e-05, + "loss": 0.82453763, + "num_input_tokens_seen": 385854240, + "router_z_loss_mlp": 0.31347656, + "step": 4661, + "time_per_iteration": 2.5935118198394775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061865, + "balance_loss_mlp": 1.03110909, + "epoch": 0.8968834166987303, + "flos": 722909491200.0, + "grad_norm": 0.053847981098818644, + "language_loss": 0.83905041, + "learning_rate": 2.7626329661709914e-05, + "loss": 0.8496691, + "num_input_tokens_seen": 385926080, + "router_z_loss_mlp": 0.30712891, + "step": 4662, + "time_per_iteration": 2.8665292263031006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064722, + "balance_loss_mlp": 1.0333935, + "epoch": 0.8970757983839939, + "flos": 681372628992.0, + "grad_norm": 0.04437914022124262, + "language_loss": 0.83813488, + "learning_rate": 2.7524297974467372e-05, + "loss": 0.84878206, + "num_input_tokens_seen": 386005696, + "router_z_loss_mlp": 0.31298828, + "step": 4663, + "time_per_iteration": 2.8976876735687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059748, + "balance_loss_mlp": 1.02832484, + "epoch": 0.8972681800692575, + "flos": 612758573568.0, + "grad_norm": 0.06417585918010674, + "language_loss": 0.75612116, + "learning_rate": 2.742244971856006e-05, + "loss": 0.76671863, + "num_input_tokens_seen": 386073248, + "router_z_loss_mlp": 0.31396484, + "step": 4664, + "time_per_iteration": 2.703761577606201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061274, + "balance_loss_mlp": 1.0300653, + "epoch": 0.8974605617545209, + "flos": 572064132096.0, + "grad_norm": 0.05003512602131667, + "language_loss": 0.83072126, + "learning_rate": 2.732078493352913e-05, + "loss": 0.84133399, + "num_input_tokens_seen": 386148528, + "router_z_loss_mlp": 0.31176758, + "step": 4665, + "time_per_iteration": 2.7152178287506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062922, + "balance_loss_mlp": 1.03080714, + "epoch": 0.8976529434397845, + "flos": 520147436544.0, + "grad_norm": 0.04771841444398887, + "language_loss": 0.87391418, + "learning_rate": 2.721930365884434e-05, + "loss": 0.88454342, + "num_input_tokens_seen": 386218528, + "router_z_loss_mlp": 0.32104492, + "step": 4666, + "time_per_iteration": 2.6735920906066895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057794, + "balance_loss_mlp": 1.02670431, + "epoch": 0.8978453251250481, + "flos": 471124740096.0, + "grad_norm": 0.0485725814683155, + "language_loss": 0.82510161, + "learning_rate": 2.7118005933904176e-05, + "loss": 0.83567965, + "num_input_tokens_seen": 386284704, + "router_z_loss_mlp": 0.31054688, + "step": 4667, + "time_per_iteration": 2.604840040206909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058903, + "balance_loss_mlp": 1.02862406, + "epoch": 0.8980377068103117, + "flos": 591370301952.0, + "grad_norm": 0.051378272948665586, + "language_loss": 0.81776893, + "learning_rate": 2.7016891798035904e-05, + "loss": 0.82835793, + "num_input_tokens_seen": 386356128, + "router_z_loss_mlp": 0.30249023, + "step": 4668, + "time_per_iteration": 2.750239372253418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063468, + "balance_loss_mlp": 1.0324496, + "epoch": 0.8982300884955752, + "flos": 767287918080.0, + "grad_norm": 0.06911870880947439, + "language_loss": 0.82571542, + "learning_rate": 2.691596129049556e-05, + "loss": 0.83635008, + "num_input_tokens_seen": 386434048, + "router_z_loss_mlp": 0.31005859, + "step": 4669, + "time_per_iteration": 2.945383071899414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106161, + "balance_loss_mlp": 1.03099728, + "epoch": 0.8984224701808388, + "flos": 844189530624.0, + "grad_norm": 0.06637017917952584, + "language_loss": 0.7722441, + "learning_rate": 2.681521445046775e-05, + "loss": 0.78286028, + "num_input_tokens_seen": 386532384, + "router_z_loss_mlp": 0.30566406, + "step": 4670, + "time_per_iteration": 3.198310613632202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062976, + "balance_loss_mlp": 1.03155208, + "epoch": 0.8986148518661023, + "flos": 757303782912.0, + "grad_norm": 0.07008375711524328, + "language_loss": 0.75845528, + "learning_rate": 2.6714651317065963e-05, + "loss": 0.76908505, + "num_input_tokens_seen": 386627120, + "router_z_loss_mlp": 0.31396484, + "step": 4671, + "time_per_iteration": 3.1156165599823 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061856, + "balance_loss_mlp": 1.03047979, + "epoch": 0.8988072335513659, + "flos": 562801734144.0, + "grad_norm": 0.05163883650190103, + "language_loss": 0.76486373, + "learning_rate": 2.6614271929332133e-05, + "loss": 0.7754823, + "num_input_tokens_seen": 386700192, + "router_z_loss_mlp": 0.31347656, + "step": 4672, + "time_per_iteration": 2.671839475631714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062538, + "balance_loss_mlp": 1.03097177, + "epoch": 0.8989996152366295, + "flos": 492440228352.0, + "grad_norm": 0.05196577286527717, + "language_loss": 0.86882824, + "learning_rate": 2.6514076326237147e-05, + "loss": 0.87945366, + "num_input_tokens_seen": 386764256, + "router_z_loss_mlp": 0.31542969, + "step": 4673, + "time_per_iteration": 2.5203633308410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060918, + "balance_loss_mlp": 1.02963722, + "epoch": 0.899191996921893, + "flos": 542303935488.0, + "grad_norm": 0.06061502607868415, + "language_loss": 0.758295, + "learning_rate": 2.6414064546680438e-05, + "loss": 0.76890421, + "num_input_tokens_seen": 386835792, + "router_z_loss_mlp": 0.3125, + "step": 4674, + "time_per_iteration": 2.6241261959075928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060398, + "balance_loss_mlp": 1.02914178, + "epoch": 0.8993843786071566, + "flos": 471081070080.0, + "grad_norm": 0.052429553353469285, + "language_loss": 0.80238754, + "learning_rate": 2.631423662948984e-05, + "loss": 0.8129915, + "num_input_tokens_seen": 386904368, + "router_z_loss_mlp": 0.31225586, + "step": 4675, + "time_per_iteration": 2.5443856716156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062814, + "balance_loss_mlp": 1.03170013, + "epoch": 0.8995767602924202, + "flos": 526454788608.0, + "grad_norm": 0.04980258004254359, + "language_loss": 0.82579398, + "learning_rate": 2.621459261342196e-05, + "loss": 0.83642209, + "num_input_tokens_seen": 386977872, + "router_z_loss_mlp": 0.31079102, + "step": 4676, + "time_per_iteration": 2.721583127975464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061451, + "balance_loss_mlp": 1.02997994, + "epoch": 0.8997691419776838, + "flos": 557365916160.0, + "grad_norm": 0.08112559791576887, + "language_loss": 0.84614646, + "learning_rate": 2.6115132537162245e-05, + "loss": 0.85676098, + "num_input_tokens_seen": 387052080, + "router_z_loss_mlp": 0.31445312, + "step": 4677, + "time_per_iteration": 2.678091049194336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064491, + "balance_loss_mlp": 1.03287649, + "epoch": 0.8999615236629472, + "flos": 638722713600.0, + "grad_norm": 0.05386852580878479, + "language_loss": 0.8060981, + "learning_rate": 2.601585643932436e-05, + "loss": 0.81674302, + "num_input_tokens_seen": 387129712, + "router_z_loss_mlp": 0.31591797, + "step": 4678, + "time_per_iteration": 2.8065719604492188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008515, + "balance_loss_mlp": 1.00031304, + "epoch": 0.9001539053482108, + "flos": 1430743703040.0, + "grad_norm": 0.0043436947203847566, + "language_loss": 0.85784018, + "learning_rate": 2.5916764358450862e-05, + "loss": 0.86792541, + "num_input_tokens_seen": 387356560, + "router_z_loss_mlp": 0.08203125, + "step": 4679, + "time_per_iteration": 4.774789810180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063627, + "balance_loss_mlp": 1.03268027, + "epoch": 0.9003462870334744, + "flos": 566589026304.0, + "grad_norm": 0.06797302187822865, + "language_loss": 0.79665619, + "learning_rate": 2.5817856333012425e-05, + "loss": 0.80729246, + "num_input_tokens_seen": 387438640, + "router_z_loss_mlp": 0.30908203, + "step": 4680, + "time_per_iteration": 2.839365243911743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063395, + "balance_loss_mlp": 1.03173351, + "epoch": 0.900538668718738, + "flos": 538394397696.0, + "grad_norm": 0.051508441084865235, + "language_loss": 0.78311312, + "learning_rate": 2.5719132401408883e-05, + "loss": 0.79374701, + "num_input_tokens_seen": 387507088, + "router_z_loss_mlp": 0.31640625, + "step": 4681, + "time_per_iteration": 2.6403775215148926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062917, + "balance_loss_mlp": 1.03225613, + "epoch": 0.9007310504040016, + "flos": 488146576896.0, + "grad_norm": 0.06865687057695076, + "language_loss": 0.85842234, + "learning_rate": 2.5620592601968028e-05, + "loss": 0.86905152, + "num_input_tokens_seen": 387574160, + "router_z_loss_mlp": 0.30615234, + "step": 4682, + "time_per_iteration": 2.5301215648651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061072, + "balance_loss_mlp": 1.02983928, + "epoch": 0.9009234320892651, + "flos": 652593065472.0, + "grad_norm": 0.06244574695746065, + "language_loss": 0.78513706, + "learning_rate": 2.5522236972946532e-05, + "loss": 0.79574782, + "num_input_tokens_seen": 387652528, + "router_z_loss_mlp": 0.31201172, + "step": 4683, + "time_per_iteration": 2.8237221240997314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061087, + "balance_loss_mlp": 1.02878177, + "epoch": 0.9011158137745287, + "flos": 545302651392.0, + "grad_norm": 0.04806790950859059, + "language_loss": 0.85241842, + "learning_rate": 2.5424065552529295e-05, + "loss": 0.86302924, + "num_input_tokens_seen": 387723520, + "router_z_loss_mlp": 0.32299805, + "step": 4684, + "time_per_iteration": 2.6552274227142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059354, + "balance_loss_mlp": 1.02890825, + "epoch": 0.9013081954597922, + "flos": 559429079040.0, + "grad_norm": 0.06508023738808166, + "language_loss": 0.82589149, + "learning_rate": 2.532607837883011e-05, + "loss": 0.83648503, + "num_input_tokens_seen": 387793664, + "router_z_loss_mlp": 0.30395508, + "step": 4685, + "time_per_iteration": 2.766566753387451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060398, + "balance_loss_mlp": 1.02921259, + "epoch": 0.9015005771450558, + "flos": 728330752512.0, + "grad_norm": 0.05594908680106596, + "language_loss": 0.81363046, + "learning_rate": 2.5228275489890706e-05, + "loss": 0.82423443, + "num_input_tokens_seen": 387871008, + "router_z_loss_mlp": 0.31176758, + "step": 4686, + "time_per_iteration": 2.903522491455078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060619, + "balance_loss_mlp": 1.02867079, + "epoch": 0.9016929588303193, + "flos": 517148720640.0, + "grad_norm": 0.04850157903091311, + "language_loss": 0.80952024, + "learning_rate": 2.5130656923681605e-05, + "loss": 0.82012641, + "num_input_tokens_seen": 387950832, + "router_z_loss_mlp": 0.31933594, + "step": 4687, + "time_per_iteration": 2.770630121231079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060384, + "balance_loss_mlp": 1.02941346, + "epoch": 0.9018853405155829, + "flos": 622031145984.0, + "grad_norm": 0.05618517422813967, + "language_loss": 0.8593204, + "learning_rate": 2.503322271810171e-05, + "loss": 0.86992431, + "num_input_tokens_seen": 388029792, + "router_z_loss_mlp": 0.30932617, + "step": 4688, + "time_per_iteration": 2.8023810386657715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106025, + "balance_loss_mlp": 1.02908909, + "epoch": 0.9020777222008465, + "flos": 523022496768.0, + "grad_norm": 0.05048030300979413, + "language_loss": 0.77799124, + "learning_rate": 2.4935972910978378e-05, + "loss": 0.78859371, + "num_input_tokens_seen": 388095872, + "router_z_loss_mlp": 0.3112793, + "step": 4689, + "time_per_iteration": 2.626427412033081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059848, + "balance_loss_mlp": 1.02890086, + "epoch": 0.9022701038861101, + "flos": 633419315712.0, + "grad_norm": 0.05593641687528262, + "language_loss": 0.81798267, + "learning_rate": 2.4838907540067346e-05, + "loss": 0.82858115, + "num_input_tokens_seen": 388171632, + "router_z_loss_mlp": 0.30908203, + "step": 4690, + "time_per_iteration": 2.8088419437408447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065299, + "balance_loss_mlp": 1.03471041, + "epoch": 0.9024624855713737, + "flos": 513036951552.0, + "grad_norm": 0.04860641304257661, + "language_loss": 0.84015805, + "learning_rate": 2.474202664305253e-05, + "loss": 0.850811, + "num_input_tokens_seen": 388242240, + "router_z_loss_mlp": 0.30541992, + "step": 4691, + "time_per_iteration": 2.6090428829193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060526, + "balance_loss_mlp": 1.02977014, + "epoch": 0.9026548672566371, + "flos": 477152695296.0, + "grad_norm": 0.07265058382258091, + "language_loss": 0.86403483, + "learning_rate": 2.464533025754673e-05, + "loss": 0.87464011, + "num_input_tokens_seen": 388310960, + "router_z_loss_mlp": 0.30712891, + "step": 4692, + "time_per_iteration": 2.6414620876312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063388, + "balance_loss_mlp": 1.03158331, + "epoch": 0.9028472489419007, + "flos": 661701284352.0, + "grad_norm": 0.050677487333482145, + "language_loss": 0.73312789, + "learning_rate": 2.454881842109058e-05, + "loss": 0.74376178, + "num_input_tokens_seen": 388387280, + "router_z_loss_mlp": 0.31787109, + "step": 4693, + "time_per_iteration": 2.8417153358459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106063, + "balance_loss_mlp": 1.02927816, + "epoch": 0.9030396306271643, + "flos": 534332090880.0, + "grad_norm": 0.052476495010180334, + "language_loss": 0.8169986, + "learning_rate": 2.4452491171153445e-05, + "loss": 0.82760489, + "num_input_tokens_seen": 388456992, + "router_z_loss_mlp": 0.31323242, + "step": 4694, + "time_per_iteration": 2.6444764137268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062559, + "balance_loss_mlp": 1.03151679, + "epoch": 0.9032320123124279, + "flos": 800695784448.0, + "grad_norm": 0.05285845656928848, + "language_loss": 0.82164681, + "learning_rate": 2.43563485451328e-05, + "loss": 0.83227229, + "num_input_tokens_seen": 388534896, + "router_z_loss_mlp": 0.31005859, + "step": 4695, + "time_per_iteration": 2.9648303985595703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105879, + "balance_loss_mlp": 1.02741396, + "epoch": 0.9034243939976914, + "flos": 553673166336.0, + "grad_norm": 0.06654317576841562, + "language_loss": 0.76353633, + "learning_rate": 2.426039058035451e-05, + "loss": 0.77412426, + "num_input_tokens_seen": 388606640, + "router_z_loss_mlp": 0.31347656, + "step": 4696, + "time_per_iteration": 2.6323938369750977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061699, + "balance_loss_mlp": 1.03065646, + "epoch": 0.903616775682955, + "flos": 503656690176.0, + "grad_norm": 0.05234220926358092, + "language_loss": 0.82479656, + "learning_rate": 2.4164617314072823e-05, + "loss": 0.83541358, + "num_input_tokens_seen": 388675920, + "router_z_loss_mlp": 0.31005859, + "step": 4697, + "time_per_iteration": 2.598928928375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060807, + "balance_loss_mlp": 1.02962184, + "epoch": 0.9038091573682185, + "flos": 436058173440.0, + "grad_norm": 0.05082677150360358, + "language_loss": 0.7861774, + "learning_rate": 2.406902878347017e-05, + "loss": 0.79678547, + "num_input_tokens_seen": 388743968, + "router_z_loss_mlp": 0.31176758, + "step": 4698, + "time_per_iteration": 2.60606050491333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059475, + "balance_loss_mlp": 1.0276463, + "epoch": 0.9040015390534821, + "flos": 532648659456.0, + "grad_norm": 0.06214700158023469, + "language_loss": 0.81251138, + "learning_rate": 2.3973625025657253e-05, + "loss": 0.82310611, + "num_input_tokens_seen": 388810784, + "router_z_loss_mlp": 0.31811523, + "step": 4699, + "time_per_iteration": 2.6206655502319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057598, + "balance_loss_mlp": 1.02665126, + "epoch": 0.9041939207387457, + "flos": 564028268544.0, + "grad_norm": 0.06726582605850791, + "language_loss": 0.80017805, + "learning_rate": 2.3878406077673275e-05, + "loss": 0.810754, + "num_input_tokens_seen": 388885072, + "router_z_loss_mlp": 0.30908203, + "step": 4700, + "time_per_iteration": 2.755746364593506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059864, + "balance_loss_mlp": 1.02913213, + "epoch": 0.9043863024240092, + "flos": 515257265664.0, + "grad_norm": 0.06510896632722754, + "language_loss": 0.77814531, + "learning_rate": 2.3783371976485447e-05, + "loss": 0.78874397, + "num_input_tokens_seen": 388951184, + "router_z_loss_mlp": 0.30688477, + "step": 4701, + "time_per_iteration": 2.5619757175445557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008403, + "balance_loss_mlp": 1.00024879, + "epoch": 0.9045786841092728, + "flos": 1277243043840.0, + "grad_norm": 0.003958799533886951, + "language_loss": 0.72929788, + "learning_rate": 2.368852275898914e-05, + "loss": 0.73938191, + "num_input_tokens_seen": 389170752, + "router_z_loss_mlp": 0.08154297, + "step": 4702, + "time_per_iteration": 4.942458152770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062191, + "balance_loss_mlp": 1.03133917, + "epoch": 0.9047710657945364, + "flos": 585569309184.0, + "grad_norm": 0.05494675450974493, + "language_loss": 0.82736337, + "learning_rate": 2.3593858462008178e-05, + "loss": 0.83798528, + "num_input_tokens_seen": 389239600, + "router_z_loss_mlp": 0.30810547, + "step": 4703, + "time_per_iteration": 2.676253080368042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061667, + "balance_loss_mlp": 1.03017187, + "epoch": 0.9049634474798, + "flos": 571655287296.0, + "grad_norm": 0.05299767963476469, + "language_loss": 0.79625463, + "learning_rate": 2.3499379122294495e-05, + "loss": 0.80687135, + "num_input_tokens_seen": 389316032, + "router_z_loss_mlp": 0.31469727, + "step": 4704, + "time_per_iteration": 2.7089710235595703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061381, + "balance_loss_mlp": 1.03081548, + "epoch": 0.9051558291650635, + "flos": 572353703424.0, + "grad_norm": 0.06198220417102737, + "language_loss": 0.74331594, + "learning_rate": 2.3405084776528307e-05, + "loss": 0.75392973, + "num_input_tokens_seen": 389383504, + "router_z_loss_mlp": 0.30517578, + "step": 4705, + "time_per_iteration": 2.657379388809204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061059, + "balance_loss_mlp": 1.03013611, + "epoch": 0.905348210850327, + "flos": 540280060416.0, + "grad_norm": 0.07947376611636264, + "language_loss": 0.79365158, + "learning_rate": 2.331097546131783e-05, + "loss": 0.80426216, + "num_input_tokens_seen": 389454592, + "router_z_loss_mlp": 0.30883789, + "step": 4706, + "time_per_iteration": 2.6540727615356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064249, + "balance_loss_mlp": 1.03370762, + "epoch": 0.9055405925355906, + "flos": 516128799744.0, + "grad_norm": 0.057615129617973604, + "language_loss": 0.81330758, + "learning_rate": 2.321705121319956e-05, + "loss": 0.82395005, + "num_input_tokens_seen": 389519696, + "router_z_loss_mlp": 0.30493164, + "step": 4707, + "time_per_iteration": 2.5897743701934814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056914, + "balance_loss_mlp": 1.02506149, + "epoch": 0.9057329742208542, + "flos": 914249880576.0, + "grad_norm": 0.04603068937294546, + "language_loss": 0.84972519, + "learning_rate": 2.3123312068638104e-05, + "loss": 0.86029434, + "num_input_tokens_seen": 389603568, + "router_z_loss_mlp": 0.31835938, + "step": 4708, + "time_per_iteration": 3.160703420639038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010632, + "balance_loss_mlp": 1.03206229, + "epoch": 0.9059253559061178, + "flos": 904884175872.0, + "grad_norm": 0.16660465263722607, + "language_loss": 0.82760024, + "learning_rate": 2.3029758064026295e-05, + "loss": 0.83823222, + "num_input_tokens_seen": 389687504, + "router_z_loss_mlp": 0.31103516, + "step": 4709, + "time_per_iteration": 3.179295301437378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059241, + "balance_loss_mlp": 1.02836561, + "epoch": 0.9061177375913813, + "flos": 664218372096.0, + "grad_norm": 0.06166960355776129, + "language_loss": 0.77393854, + "learning_rate": 2.2936389235684918e-05, + "loss": 0.78453094, + "num_input_tokens_seen": 389764880, + "router_z_loss_mlp": 0.30834961, + "step": 4710, + "time_per_iteration": 2.8492090702056885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063495, + "balance_loss_mlp": 1.03223789, + "epoch": 0.9063101192766448, + "flos": 565318821888.0, + "grad_norm": 0.05907794054329625, + "language_loss": 0.82644868, + "learning_rate": 2.2843205619862972e-05, + "loss": 0.8370837, + "num_input_tokens_seen": 389838304, + "router_z_loss_mlp": 0.31225586, + "step": 4711, + "time_per_iteration": 2.7433969974517822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106258, + "balance_loss_mlp": 1.03234863, + "epoch": 0.9065025009619084, + "flos": 727064930304.0, + "grad_norm": 0.06819697260441993, + "language_loss": 0.78757668, + "learning_rate": 2.2750207252737742e-05, + "loss": 0.79820251, + "num_input_tokens_seen": 389908592, + "router_z_loss_mlp": 0.30175781, + "step": 4712, + "time_per_iteration": 2.885631799697876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061608, + "balance_loss_mlp": 1.03123391, + "epoch": 0.906694882647172, + "flos": 531254799360.0, + "grad_norm": 0.06086888254866861, + "language_loss": 0.79970586, + "learning_rate": 2.265739417041418e-05, + "loss": 0.81032193, + "num_input_tokens_seen": 389979040, + "router_z_loss_mlp": 0.30322266, + "step": 4713, + "time_per_iteration": 2.6492934226989746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065061, + "balance_loss_mlp": 1.03363752, + "epoch": 0.9068872643324356, + "flos": 429563146752.0, + "grad_norm": 0.05345280060341207, + "language_loss": 0.84838974, + "learning_rate": 2.2564766408925574e-05, + "loss": 0.85904038, + "num_input_tokens_seen": 390046080, + "router_z_loss_mlp": 0.31396484, + "step": 4714, + "time_per_iteration": 2.578385591506958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061674, + "balance_loss_mlp": 1.03034616, + "epoch": 0.9070796460176991, + "flos": 588095161344.0, + "grad_norm": 0.054473857957546834, + "language_loss": 0.79786414, + "learning_rate": 2.2472324004233214e-05, + "loss": 0.80848086, + "num_input_tokens_seen": 390122176, + "router_z_loss_mlp": 0.31298828, + "step": 4715, + "time_per_iteration": 2.7411398887634277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062125, + "balance_loss_mlp": 1.03082108, + "epoch": 0.9072720277029627, + "flos": 571314843648.0, + "grad_norm": 0.06136918280584941, + "language_loss": 0.7556839, + "learning_rate": 2.2380066992226446e-05, + "loss": 0.76630509, + "num_input_tokens_seen": 390195216, + "router_z_loss_mlp": 0.31274414, + "step": 4716, + "time_per_iteration": 2.717400550842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065363, + "balance_loss_mlp": 1.03432047, + "epoch": 0.9074644093882263, + "flos": 555534097920.0, + "grad_norm": 0.05054647084062828, + "language_loss": 0.88467407, + "learning_rate": 2.2287995408722617e-05, + "loss": 0.89532775, + "num_input_tokens_seen": 390263216, + "router_z_loss_mlp": 0.31005859, + "step": 4717, + "time_per_iteration": 2.626262664794922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065114, + "balance_loss_mlp": 1.0337857, + "epoch": 0.9076567910734898, + "flos": 640701508608.0, + "grad_norm": 0.05014211489878531, + "language_loss": 0.82399035, + "learning_rate": 2.2196109289467083e-05, + "loss": 0.83464146, + "num_input_tokens_seen": 390337360, + "router_z_loss_mlp": 0.31298828, + "step": 4718, + "time_per_iteration": 2.8218960762023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063784, + "balance_loss_mlp": 1.03300405, + "epoch": 0.9078491727587533, + "flos": 733635560448.0, + "grad_norm": 0.05294662816605839, + "language_loss": 0.81557, + "learning_rate": 2.2104408670133193e-05, + "loss": 0.82620788, + "num_input_tokens_seen": 390427728, + "router_z_loss_mlp": 0.30737305, + "step": 4719, + "time_per_iteration": 3.107689142227173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063177, + "balance_loss_mlp": 1.0316577, + "epoch": 0.9080415544440169, + "flos": 654464171520.0, + "grad_norm": 0.05391534489649744, + "language_loss": 0.86544436, + "learning_rate": 2.2012893586322245e-05, + "loss": 0.8760761, + "num_input_tokens_seen": 390504736, + "router_z_loss_mlp": 0.31494141, + "step": 4720, + "time_per_iteration": 2.8423755168914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060877, + "balance_loss_mlp": 1.02964377, + "epoch": 0.9082339361292805, + "flos": 597180059136.0, + "grad_norm": 0.051732261345012694, + "language_loss": 0.79402268, + "learning_rate": 2.1921564073563604e-05, + "loss": 0.80463141, + "num_input_tokens_seen": 390582048, + "router_z_loss_mlp": 0.31201172, + "step": 4721, + "time_per_iteration": 2.7443206310272217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063085, + "balance_loss_mlp": 1.03211474, + "epoch": 0.9084263178145441, + "flos": 504154285056.0, + "grad_norm": 0.05049795376918643, + "language_loss": 0.84334135, + "learning_rate": 2.183042016731457e-05, + "loss": 0.8539722, + "num_input_tokens_seen": 390652976, + "router_z_loss_mlp": 0.30932617, + "step": 4722, + "time_per_iteration": 2.6413490772247314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063234, + "balance_loss_mlp": 1.03235853, + "epoch": 0.9086186994998077, + "flos": 549763628544.0, + "grad_norm": 0.052887401454076326, + "language_loss": 0.8025831, + "learning_rate": 2.1739461902960223e-05, + "loss": 0.81321543, + "num_input_tokens_seen": 390726832, + "router_z_loss_mlp": 0.30834961, + "step": 4723, + "time_per_iteration": 2.7238101959228516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059634, + "balance_loss_mlp": 1.02861619, + "epoch": 0.9088110811850711, + "flos": 1133620545024.0, + "grad_norm": 0.049238077529050184, + "language_loss": 0.75059247, + "learning_rate": 2.1648689315813763e-05, + "loss": 0.76118881, + "num_input_tokens_seen": 390824480, + "router_z_loss_mlp": 0.30981445, + "step": 4724, + "time_per_iteration": 3.576720952987671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063217, + "balance_loss_mlp": 1.03165007, + "epoch": 0.9090034628703347, + "flos": 556725726720.0, + "grad_norm": 0.0503925655199207, + "language_loss": 0.76640475, + "learning_rate": 2.155810244111628e-05, + "loss": 0.77703691, + "num_input_tokens_seen": 390897552, + "router_z_loss_mlp": 0.31542969, + "step": 4725, + "time_per_iteration": 2.6426239013671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061991, + "balance_loss_mlp": 1.03090096, + "epoch": 0.9091958445555983, + "flos": 543697795584.0, + "grad_norm": 0.05168476660108364, + "language_loss": 0.84019625, + "learning_rate": 2.146770131403658e-05, + "loss": 0.85081613, + "num_input_tokens_seen": 390969008, + "router_z_loss_mlp": 0.31054688, + "step": 4726, + "time_per_iteration": 2.671800374984741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062231, + "balance_loss_mlp": 1.03128409, + "epoch": 0.9093882262408619, + "flos": 525858269184.0, + "grad_norm": 0.057180053188060825, + "language_loss": 0.81223357, + "learning_rate": 2.1377485969671594e-05, + "loss": 0.82285595, + "num_input_tokens_seen": 391038880, + "router_z_loss_mlp": 0.30908203, + "step": 4727, + "time_per_iteration": 2.626508951187134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059711, + "balance_loss_mlp": 1.02893078, + "epoch": 0.9095806079261254, + "flos": 548266461696.0, + "grad_norm": 0.059368213087244666, + "language_loss": 0.81565529, + "learning_rate": 2.1287456443046084e-05, + "loss": 0.82625234, + "num_input_tokens_seen": 391106720, + "router_z_loss_mlp": 0.30737305, + "step": 4728, + "time_per_iteration": 2.679184913635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062462, + "balance_loss_mlp": 1.03063333, + "epoch": 0.909772989611389, + "flos": 572260571136.0, + "grad_norm": 0.11685858483587666, + "language_loss": 0.8463881, + "learning_rate": 2.1197612769112528e-05, + "loss": 0.85701275, + "num_input_tokens_seen": 391178128, + "router_z_loss_mlp": 0.31811523, + "step": 4729, + "time_per_iteration": 2.692808151245117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062924, + "balance_loss_mlp": 1.03154778, + "epoch": 0.9099653712966526, + "flos": 561546086400.0, + "grad_norm": 0.06067152965418052, + "language_loss": 0.79611409, + "learning_rate": 2.1107954982751254e-05, + "loss": 0.80674326, + "num_input_tokens_seen": 391248848, + "router_z_loss_mlp": 0.31347656, + "step": 4730, + "time_per_iteration": 2.662307024002075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059671, + "balance_loss_mlp": 1.02843797, + "epoch": 0.9101577529819161, + "flos": 1093377208320.0, + "grad_norm": 0.05540170289696782, + "language_loss": 0.79978657, + "learning_rate": 2.101848311877069e-05, + "loss": 0.81038332, + "num_input_tokens_seen": 391328000, + "router_z_loss_mlp": 0.31201172, + "step": 4731, + "time_per_iteration": 3.3738834857940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063156, + "balance_loss_mlp": 1.03147006, + "epoch": 0.9103501346671797, + "flos": 445215854592.0, + "grad_norm": 0.05697116916892201, + "language_loss": 0.81553221, + "learning_rate": 2.092919721190678e-05, + "loss": 0.82616377, + "num_input_tokens_seen": 391391616, + "router_z_loss_mlp": 0.31665039, + "step": 4732, + "time_per_iteration": 2.5527071952819824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062088, + "balance_loss_mlp": 1.03056908, + "epoch": 0.9105425163524432, + "flos": 500510997504.0, + "grad_norm": 0.06484045285144836, + "language_loss": 0.7739293, + "learning_rate": 2.0840097296823346e-05, + "loss": 0.78455019, + "num_input_tokens_seen": 391461312, + "router_z_loss_mlp": 0.31494141, + "step": 4733, + "time_per_iteration": 2.650042772293091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062465, + "balance_loss_mlp": 1.03080285, + "epoch": 0.9107348980377068, + "flos": 657206811648.0, + "grad_norm": 0.04976495879335239, + "language_loss": 0.83918369, + "learning_rate": 2.0751183408112162e-05, + "loss": 0.84980834, + "num_input_tokens_seen": 391542192, + "router_z_loss_mlp": 0.31640625, + "step": 4734, + "time_per_iteration": 2.8551266193389893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065441, + "balance_loss_mlp": 1.03442264, + "epoch": 0.9109272797229704, + "flos": 553406916096.0, + "grad_norm": 0.06703437522614884, + "language_loss": 0.84643781, + "learning_rate": 2.066245558029256e-05, + "loss": 0.85709226, + "num_input_tokens_seen": 391609968, + "router_z_loss_mlp": 0.30981445, + "step": 4735, + "time_per_iteration": 2.6464221477508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060664, + "balance_loss_mlp": 1.03045595, + "epoch": 0.911119661408234, + "flos": 518757958656.0, + "grad_norm": 0.05362209566963938, + "language_loss": 0.84261322, + "learning_rate": 2.057391384781182e-05, + "loss": 0.85321987, + "num_input_tokens_seen": 391681264, + "router_z_loss_mlp": 0.30175781, + "step": 4736, + "time_per_iteration": 2.6503520011901855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060922, + "balance_loss_mlp": 1.02937949, + "epoch": 0.9113120430934974, + "flos": 554111124480.0, + "grad_norm": 0.056779441339490845, + "language_loss": 0.83084607, + "learning_rate": 2.0485558245044834e-05, + "loss": 0.84145528, + "num_input_tokens_seen": 391751392, + "router_z_loss_mlp": 0.31518555, + "step": 4737, + "time_per_iteration": 2.6577727794647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062197, + "balance_loss_mlp": 1.03105998, + "epoch": 0.911504424778761, + "flos": 501624050688.0, + "grad_norm": 0.056537694741311456, + "language_loss": 0.81219387, + "learning_rate": 2.0397388806294216e-05, + "loss": 0.82281584, + "num_input_tokens_seen": 391823952, + "router_z_loss_mlp": 0.31103516, + "step": 4738, + "time_per_iteration": 2.62200927734375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063366, + "balance_loss_mlp": 1.03237128, + "epoch": 0.9116968064640246, + "flos": 610823448576.0, + "grad_norm": 0.05036513509417674, + "language_loss": 0.82349581, + "learning_rate": 2.0309405565790527e-05, + "loss": 0.83412945, + "num_input_tokens_seen": 391895264, + "router_z_loss_mlp": 0.30957031, + "step": 4739, + "time_per_iteration": 2.7241289615631104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061012, + "balance_loss_mlp": 1.02968431, + "epoch": 0.9118891881492882, + "flos": 572625745920.0, + "grad_norm": 0.05909426557587561, + "language_loss": 0.82400405, + "learning_rate": 2.0221608557691895e-05, + "loss": 0.83461416, + "num_input_tokens_seen": 391973040, + "router_z_loss_mlp": 0.31298828, + "step": 4740, + "time_per_iteration": 2.800021171569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059842, + "balance_loss_mlp": 1.02877665, + "epoch": 0.9120815698345518, + "flos": 635659978752.0, + "grad_norm": 0.06447832705175557, + "language_loss": 0.77531219, + "learning_rate": 2.0133997816083992e-05, + "loss": 0.78591061, + "num_input_tokens_seen": 392048160, + "router_z_loss_mlp": 0.31030273, + "step": 4741, + "time_per_iteration": 2.816603183746338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062793, + "balance_loss_mlp": 1.03186965, + "epoch": 0.9122739515198153, + "flos": 701988291072.0, + "grad_norm": 0.05201992294054252, + "language_loss": 0.85963714, + "learning_rate": 2.0046573374980447e-05, + "loss": 0.87026513, + "num_input_tokens_seen": 392128960, + "router_z_loss_mlp": 0.30883789, + "step": 4742, + "time_per_iteration": 2.8994803428649902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065258, + "balance_loss_mlp": 1.03369117, + "epoch": 0.9124663332050789, + "flos": 524435295744.0, + "grad_norm": 0.05856400150605942, + "language_loss": 0.87501878, + "learning_rate": 1.995933526832239e-05, + "loss": 0.88567138, + "num_input_tokens_seen": 392195008, + "router_z_loss_mlp": 0.31542969, + "step": 4743, + "time_per_iteration": 2.594181776046753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061888, + "balance_loss_mlp": 1.03077435, + "epoch": 0.9126587148903424, + "flos": 563033078784.0, + "grad_norm": 0.05280528716664947, + "language_loss": 0.8234272, + "learning_rate": 1.9872283529978662e-05, + "loss": 0.83404607, + "num_input_tokens_seen": 392265168, + "router_z_loss_mlp": 0.31079102, + "step": 4744, + "time_per_iteration": 2.6380250453948975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060202, + "balance_loss_mlp": 1.02942252, + "epoch": 0.912851096575606, + "flos": 505695121920.0, + "grad_norm": 0.05435021172866501, + "language_loss": 0.79992861, + "learning_rate": 1.978541819374574e-05, + "loss": 0.8105306, + "num_input_tokens_seen": 392329456, + "router_z_loss_mlp": 0.30737305, + "step": 4745, + "time_per_iteration": 2.591541290283203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060289, + "balance_loss_mlp": 1.02939034, + "epoch": 0.9130434782608695, + "flos": 550472219136.0, + "grad_norm": 0.0649422024104131, + "language_loss": 0.82114339, + "learning_rate": 1.9698739293347755e-05, + "loss": 0.83174634, + "num_input_tokens_seen": 392397792, + "router_z_loss_mlp": 0.30859375, + "step": 4746, + "time_per_iteration": 2.655029773712158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066365, + "balance_loss_mlp": 1.03584766, + "epoch": 0.9132358599461331, + "flos": 468737100288.0, + "grad_norm": 0.12332969566626222, + "language_loss": 0.83492082, + "learning_rate": 1.9612246862436456e-05, + "loss": 0.84558451, + "num_input_tokens_seen": 392462928, + "router_z_loss_mlp": 0.3046875, + "step": 4747, + "time_per_iteration": 2.555858850479126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062525, + "balance_loss_mlp": 1.03150725, + "epoch": 0.9134282416313967, + "flos": 505847890944.0, + "grad_norm": 0.05396566993602361, + "language_loss": 0.79646921, + "learning_rate": 1.9525940934591148e-05, + "loss": 0.80709445, + "num_input_tokens_seen": 392531840, + "router_z_loss_mlp": 0.30981445, + "step": 4748, + "time_per_iteration": 2.614349365234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063896, + "balance_loss_mlp": 1.03297329, + "epoch": 0.9136206233166603, + "flos": 604540827648.0, + "grad_norm": 0.0546299136084745, + "language_loss": 0.8396163, + "learning_rate": 1.9439821543318748e-05, + "loss": 0.85025525, + "num_input_tokens_seen": 392602464, + "router_z_loss_mlp": 0.30883789, + "step": 4749, + "time_per_iteration": 2.7605695724487305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060881, + "balance_loss_mlp": 1.0290997, + "epoch": 0.9138130050019239, + "flos": 561467510784.0, + "grad_norm": 0.05257527121038526, + "language_loss": 0.82906801, + "learning_rate": 1.9353888722053793e-05, + "loss": 0.8396768, + "num_input_tokens_seen": 392669872, + "router_z_loss_mlp": 0.31762695, + "step": 4750, + "time_per_iteration": 2.6780149936676025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066432, + "balance_loss_mlp": 1.03536606, + "epoch": 0.9140053866871873, + "flos": 689811545088.0, + "grad_norm": 0.12355226926767966, + "language_loss": 0.89985728, + "learning_rate": 1.9268142504158426e-05, + "loss": 0.91052163, + "num_input_tokens_seen": 392744256, + "router_z_loss_mlp": 0.31030273, + "step": 4751, + "time_per_iteration": 2.8180720806121826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059231, + "balance_loss_mlp": 1.02840388, + "epoch": 0.9141977683724509, + "flos": 550734087168.0, + "grad_norm": 0.05041700860442144, + "language_loss": 0.84207261, + "learning_rate": 1.9182582922922186e-05, + "loss": 0.85266495, + "num_input_tokens_seen": 392816832, + "router_z_loss_mlp": 0.30786133, + "step": 4752, + "time_per_iteration": 2.6917872428894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067812, + "balance_loss_mlp": 1.03634083, + "epoch": 0.9143901500577145, + "flos": 539831927808.0, + "grad_norm": 0.04860414083954547, + "language_loss": 0.75207782, + "learning_rate": 1.9097210011562228e-05, + "loss": 0.76275599, + "num_input_tokens_seen": 392886304, + "router_z_loss_mlp": 0.31445312, + "step": 4753, + "time_per_iteration": 2.653679370880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106309, + "balance_loss_mlp": 1.03102279, + "epoch": 0.9145825317429781, + "flos": 528512159232.0, + "grad_norm": 0.05770192006998304, + "language_loss": 0.80789167, + "learning_rate": 1.9012023803223366e-05, + "loss": 0.81852257, + "num_input_tokens_seen": 392955872, + "router_z_loss_mlp": 0.32055664, + "step": 4754, + "time_per_iteration": 2.645815849304199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066571, + "balance_loss_mlp": 1.03557611, + "epoch": 0.9147749134282416, + "flos": 514538500608.0, + "grad_norm": 0.05242685059037384, + "language_loss": 0.79065865, + "learning_rate": 1.892702433097776e-05, + "loss": 0.80132431, + "num_input_tokens_seen": 393025776, + "router_z_loss_mlp": 0.30957031, + "step": 4755, + "time_per_iteration": 2.668991804122925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062006, + "balance_loss_mlp": 1.03136897, + "epoch": 0.9149672951135052, + "flos": 514174735872.0, + "grad_norm": 0.0565157200230722, + "language_loss": 0.85695755, + "learning_rate": 1.8842211627825233e-05, + "loss": 0.86757755, + "num_input_tokens_seen": 393095936, + "router_z_loss_mlp": 0.3059082, + "step": 4756, + "time_per_iteration": 2.7136027812957764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061248, + "balance_loss_mlp": 1.02951407, + "epoch": 0.9151596767987688, + "flos": 576781185024.0, + "grad_norm": 0.07352542931591961, + "language_loss": 0.80928689, + "learning_rate": 1.8757585726692727e-05, + "loss": 0.81989938, + "num_input_tokens_seen": 393166816, + "router_z_loss_mlp": 0.31713867, + "step": 4757, + "time_per_iteration": 2.7354605197906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060549, + "balance_loss_mlp": 1.02974486, + "epoch": 0.9153520584840323, + "flos": 619051368960.0, + "grad_norm": 0.044801284055131146, + "language_loss": 0.82543564, + "learning_rate": 1.8673146660435182e-05, + "loss": 0.83604121, + "num_input_tokens_seen": 393242176, + "router_z_loss_mlp": 0.30761719, + "step": 4758, + "time_per_iteration": 2.726820707321167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065566, + "balance_loss_mlp": 1.03359389, + "epoch": 0.9155444401692959, + "flos": 468687638016.0, + "grad_norm": 0.05141972147747453, + "language_loss": 0.82493746, + "learning_rate": 1.8588894461834704e-05, + "loss": 0.8355931, + "num_input_tokens_seen": 393311792, + "router_z_loss_mlp": 0.31958008, + "step": 4759, + "time_per_iteration": 2.5751149654388428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101691, + "balance_loss_mlp": 1.00904226, + "epoch": 0.9157368218545594, + "flos": 1409931601920.0, + "grad_norm": 0.008900792110931678, + "language_loss": 0.7481907, + "learning_rate": 1.8504829163600855e-05, + "loss": 0.75835979, + "num_input_tokens_seen": 393535648, + "router_z_loss_mlp": 0.07861328, + "step": 4760, + "time_per_iteration": 4.846553325653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016898, + "balance_loss_mlp": 1.00903058, + "epoch": 0.915929203539823, + "flos": 1521195572736.0, + "grad_norm": 0.008902417917998095, + "language_loss": 0.79576051, + "learning_rate": 1.8420950798370584e-05, + "loss": 0.80592954, + "num_input_tokens_seen": 393767040, + "router_z_loss_mlp": 0.07861328, + "step": 4761, + "time_per_iteration": 4.906817674636841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066282, + "balance_loss_mlp": 1.03528786, + "epoch": 0.9161215852250866, + "flos": 535480049664.0, + "grad_norm": 0.061604051041974375, + "language_loss": 0.80440938, + "learning_rate": 1.8337259398708616e-05, + "loss": 0.81507224, + "num_input_tokens_seen": 393841232, + "router_z_loss_mlp": 0.30957031, + "step": 4762, + "time_per_iteration": 2.75858473777771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063073, + "balance_loss_mlp": 1.03234076, + "epoch": 0.9163139669103502, + "flos": 590350381056.0, + "grad_norm": 0.050240655434021286, + "language_loss": 0.80299342, + "learning_rate": 1.8253754997106632e-05, + "loss": 0.81362408, + "num_input_tokens_seen": 393910512, + "router_z_loss_mlp": 0.30688477, + "step": 4763, + "time_per_iteration": 2.6782495975494385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061226, + "balance_loss_mlp": 1.03011227, + "epoch": 0.9165063485956138, + "flos": 821627159040.0, + "grad_norm": 0.07609877775920502, + "language_loss": 0.84720802, + "learning_rate": 1.817043762598397e-05, + "loss": 0.85782027, + "num_input_tokens_seen": 393988624, + "router_z_loss_mlp": 0.31079102, + "step": 4764, + "time_per_iteration": 3.0433642864227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061144, + "balance_loss_mlp": 1.03069818, + "epoch": 0.9166987302808772, + "flos": 524932890624.0, + "grad_norm": 0.05222854338861463, + "language_loss": 0.82242793, + "learning_rate": 1.8087307317687264e-05, + "loss": 0.83303934, + "num_input_tokens_seen": 394059184, + "router_z_loss_mlp": 0.30395508, + "step": 4765, + "time_per_iteration": 2.6640570163726807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060211, + "balance_loss_mlp": 1.02890635, + "epoch": 0.9168911119661408, + "flos": 654784266240.0, + "grad_norm": 0.07195922114466717, + "language_loss": 0.84169734, + "learning_rate": 1.800436410449058e-05, + "loss": 0.85229945, + "num_input_tokens_seen": 394142160, + "router_z_loss_mlp": 0.31274414, + "step": 4766, + "time_per_iteration": 2.899909257888794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064104, + "balance_loss_mlp": 1.03327632, + "epoch": 0.9170834936514044, + "flos": 491504675328.0, + "grad_norm": 0.07194392234567955, + "language_loss": 0.84633625, + "learning_rate": 1.7921608018595436e-05, + "loss": 0.85697722, + "num_input_tokens_seen": 394207056, + "router_z_loss_mlp": 0.30786133, + "step": 4767, + "time_per_iteration": 2.571272611618042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061618, + "balance_loss_mlp": 1.03043294, + "epoch": 0.917275875336668, + "flos": 627756535296.0, + "grad_norm": 0.057558765907327766, + "language_loss": 0.80572951, + "learning_rate": 1.7839039092130415e-05, + "loss": 0.81634569, + "num_input_tokens_seen": 394275456, + "router_z_loss_mlp": 0.31152344, + "step": 4768, + "time_per_iteration": 2.8055806159973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017406, + "balance_loss_mlp": 1.00949097, + "epoch": 0.9174682570219315, + "flos": 1517176935936.0, + "grad_norm": 0.0087369718613956, + "language_loss": 0.78180236, + "learning_rate": 1.7756657357151762e-05, + "loss": 0.79197639, + "num_input_tokens_seen": 394503808, + "router_z_loss_mlp": 0.07910156, + "step": 4769, + "time_per_iteration": 4.936990976333618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060292, + "balance_loss_mlp": 1.02846277, + "epoch": 0.917660638707195, + "flos": 559749173760.0, + "grad_norm": 0.05006448592361951, + "language_loss": 0.84848541, + "learning_rate": 1.7674462845642835e-05, + "loss": 0.8590883, + "num_input_tokens_seen": 394573776, + "router_z_loss_mlp": 0.31811523, + "step": 4770, + "time_per_iteration": 2.677330255508423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061055, + "balance_loss_mlp": 1.03022778, + "epoch": 0.9178530203924586, + "flos": 447022941696.0, + "grad_norm": 0.0519833907610009, + "language_loss": 0.84258509, + "learning_rate": 1.7592455589514387e-05, + "loss": 0.85319561, + "num_input_tokens_seen": 394637600, + "router_z_loss_mlp": 0.30786133, + "step": 4771, + "time_per_iteration": 2.495462656021118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063599, + "balance_loss_mlp": 1.03255713, + "epoch": 0.9180454020777222, + "flos": 465734002176.0, + "grad_norm": 0.04964919418416434, + "language_loss": 0.80612022, + "learning_rate": 1.7510635620604453e-05, + "loss": 0.81675619, + "num_input_tokens_seen": 394707344, + "router_z_loss_mlp": 0.31005859, + "step": 4772, + "time_per_iteration": 2.5905964374542236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057828, + "balance_loss_mlp": 1.02702451, + "epoch": 0.9182377837629858, + "flos": 596023335936.0, + "grad_norm": 0.05330480791985852, + "language_loss": 0.87082404, + "learning_rate": 1.74290029706784e-05, + "loss": 0.88140237, + "num_input_tokens_seen": 394786368, + "router_z_loss_mlp": 0.30761719, + "step": 4773, + "time_per_iteration": 2.7483558654785156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061724, + "balance_loss_mlp": 1.03030062, + "epoch": 0.9184301654482493, + "flos": 996251249664.0, + "grad_norm": 0.04999010093437848, + "language_loss": 0.82507402, + "learning_rate": 1.734755767142876e-05, + "loss": 0.83569121, + "num_input_tokens_seen": 394876976, + "router_z_loss_mlp": 0.31396484, + "step": 4774, + "time_per_iteration": 3.3252460956573486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061886, + "balance_loss_mlp": 1.03043866, + "epoch": 0.9186225471335129, + "flos": 508600705536.0, + "grad_norm": 0.043103948269501015, + "language_loss": 0.84609812, + "learning_rate": 1.7266299754475467e-05, + "loss": 0.85671699, + "num_input_tokens_seen": 394949024, + "router_z_loss_mlp": 0.31420898, + "step": 4775, + "time_per_iteration": 2.6413958072662354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063705, + "balance_loss_mlp": 1.03211498, + "epoch": 0.9188149288187765, + "flos": 940011789312.0, + "grad_norm": 0.05598618240977498, + "language_loss": 0.78646922, + "learning_rate": 1.718522925136551e-05, + "loss": 0.79710621, + "num_input_tokens_seen": 395044352, + "router_z_loss_mlp": 0.31567383, + "step": 4776, + "time_per_iteration": 3.2665579319000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064215, + "balance_loss_mlp": 1.03322053, + "epoch": 0.91900731050404, + "flos": 583402839552.0, + "grad_norm": 0.0464124427178186, + "language_loss": 0.84131777, + "learning_rate": 1.7104346193573484e-05, + "loss": 0.85195988, + "num_input_tokens_seen": 395113824, + "router_z_loss_mlp": 0.30981445, + "step": 4777, + "time_per_iteration": 2.707064151763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062419, + "balance_loss_mlp": 1.03173482, + "epoch": 0.9191996921893035, + "flos": 580941006336.0, + "grad_norm": 0.06415459977537942, + "language_loss": 0.79562324, + "learning_rate": 1.7023650612500828e-05, + "loss": 0.80624747, + "num_input_tokens_seen": 395184496, + "router_z_loss_mlp": 0.30639648, + "step": 4778, + "time_per_iteration": 2.6951544284820557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059351, + "balance_loss_mlp": 1.02845156, + "epoch": 0.9193920738745671, + "flos": 908566751232.0, + "grad_norm": 0.05239795011711653, + "language_loss": 0.79845613, + "learning_rate": 1.6943142539476374e-05, + "loss": 0.80904967, + "num_input_tokens_seen": 395263760, + "router_z_loss_mlp": 0.30859375, + "step": 4779, + "time_per_iteration": 3.128244638442993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014928, + "balance_loss_mlp": 1.0069648, + "epoch": 0.9195844555598307, + "flos": 1557557074944.0, + "grad_norm": 0.006881946591681044, + "language_loss": 0.79795396, + "learning_rate": 1.686282200575606e-05, + "loss": 0.8081032, + "num_input_tokens_seen": 395482384, + "router_z_loss_mlp": 0.07958984, + "step": 4780, + "time_per_iteration": 4.738587379455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060578, + "balance_loss_mlp": 1.02970314, + "epoch": 0.9197768372450943, + "flos": 473813535744.0, + "grad_norm": 0.05955041173862442, + "language_loss": 0.7853713, + "learning_rate": 1.678268904252317e-05, + "loss": 0.79597706, + "num_input_tokens_seen": 395550384, + "router_z_loss_mlp": 0.30834961, + "step": 4781, + "time_per_iteration": 2.539076805114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063538, + "balance_loss_mlp": 1.03287697, + "epoch": 0.9199692189303579, + "flos": 856622352384.0, + "grad_norm": 0.0707934897138979, + "language_loss": 0.83959591, + "learning_rate": 1.6702743680888088e-05, + "loss": 0.85023129, + "num_input_tokens_seen": 395632320, + "router_z_loss_mlp": 0.30615234, + "step": 4782, + "time_per_iteration": 3.2116215229034424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064036, + "balance_loss_mlp": 1.03299415, + "epoch": 0.9201616006156214, + "flos": 504144110592.0, + "grad_norm": 0.06738288492919854, + "language_loss": 0.77458489, + "learning_rate": 1.6622985951888327e-05, + "loss": 0.78522527, + "num_input_tokens_seen": 395703856, + "router_z_loss_mlp": 0.31005859, + "step": 4783, + "time_per_iteration": 2.632368803024292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058894, + "balance_loss_mlp": 1.02828109, + "epoch": 0.9203539823008849, + "flos": 548503598592.0, + "grad_norm": 0.09947574411564165, + "language_loss": 0.85094196, + "learning_rate": 1.6543415886488554e-05, + "loss": 0.8615309, + "num_input_tokens_seen": 395779456, + "router_z_loss_mlp": 0.30566406, + "step": 4784, + "time_per_iteration": 2.7621445655822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059931, + "balance_loss_mlp": 1.02903175, + "epoch": 0.9205463639861485, + "flos": 539738795520.0, + "grad_norm": 0.05215962907165292, + "language_loss": 0.82254821, + "learning_rate": 1.6464033515580624e-05, + "loss": 0.83314753, + "num_input_tokens_seen": 395849584, + "router_z_loss_mlp": 0.30859375, + "step": 4785, + "time_per_iteration": 2.6422579288482666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061681, + "balance_loss_mlp": 1.03109241, + "epoch": 0.9207387456714121, + "flos": 799367353344.0, + "grad_norm": 0.05717975074851554, + "language_loss": 0.78002059, + "learning_rate": 1.6384838869983488e-05, + "loss": 0.79063737, + "num_input_tokens_seen": 395943712, + "router_z_loss_mlp": 0.30541992, + "step": 4786, + "time_per_iteration": 3.083732843399048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061487, + "balance_loss_mlp": 1.0302304, + "epoch": 0.9209311273566756, + "flos": 502607655936.0, + "grad_norm": 0.057025299382847054, + "language_loss": 0.78579599, + "learning_rate": 1.630583198044333e-05, + "loss": 0.79641086, + "num_input_tokens_seen": 396013168, + "router_z_loss_mlp": 0.31225586, + "step": 4787, + "time_per_iteration": 2.648620367050171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059126, + "balance_loss_mlp": 1.02777362, + "epoch": 0.9211235090419392, + "flos": 569059623936.0, + "grad_norm": 0.06012963691374042, + "language_loss": 0.82516944, + "learning_rate": 1.6227012877633173e-05, + "loss": 0.83576071, + "num_input_tokens_seen": 396082032, + "router_z_loss_mlp": 0.31323242, + "step": 4788, + "time_per_iteration": 2.6646766662597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063827, + "balance_loss_mlp": 1.03249896, + "epoch": 0.9213158907272028, + "flos": 806205795840.0, + "grad_norm": 0.06327373467774539, + "language_loss": 0.82420582, + "learning_rate": 1.6148381592153538e-05, + "loss": 0.83484411, + "num_input_tokens_seen": 396157984, + "router_z_loss_mlp": 0.31298828, + "step": 4789, + "time_per_iteration": 2.979316473007202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062579, + "balance_loss_mlp": 1.03115511, + "epoch": 0.9215082724124664, + "flos": 490441084416.0, + "grad_norm": 0.09581366649695534, + "language_loss": 0.76114941, + "learning_rate": 1.6069938154531618e-05, + "loss": 0.77177519, + "num_input_tokens_seen": 396223840, + "router_z_loss_mlp": 0.31396484, + "step": 4790, + "time_per_iteration": 2.5435032844543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010433, + "balance_loss_mlp": 1.00242269, + "epoch": 0.9217006540977299, + "flos": 1513648539648.0, + "grad_norm": 0.004451009217126261, + "language_loss": 0.77070266, + "learning_rate": 1.599168259522188e-05, + "loss": 0.78080696, + "num_input_tokens_seen": 396458288, + "router_z_loss_mlp": 0.08007812, + "step": 4791, + "time_per_iteration": 5.021566867828369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059671, + "balance_loss_mlp": 1.02851009, + "epoch": 0.9218930357829934, + "flos": 743471308800.0, + "grad_norm": 0.04726382939179337, + "language_loss": 0.76547706, + "learning_rate": 1.5913614944605804e-05, + "loss": 0.77607369, + "num_input_tokens_seen": 396536208, + "router_z_loss_mlp": 0.3112793, + "step": 4792, + "time_per_iteration": 2.9518425464630127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059575, + "balance_loss_mlp": 1.02936769, + "epoch": 0.922085417468257, + "flos": 452803585536.0, + "grad_norm": 0.06742500071670546, + "language_loss": 0.8039397, + "learning_rate": 1.5835735232992032e-05, + "loss": 0.81453544, + "num_input_tokens_seen": 396599984, + "router_z_loss_mlp": 0.30151367, + "step": 4793, + "time_per_iteration": 2.4930520057678223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106032, + "balance_loss_mlp": 1.02949262, + "epoch": 0.9222777991535206, + "flos": 500003228160.0, + "grad_norm": 0.06238609397119238, + "language_loss": 0.84686369, + "learning_rate": 1.575804349061616e-05, + "loss": 0.85746688, + "num_input_tokens_seen": 396664592, + "router_z_loss_mlp": 0.30810547, + "step": 4794, + "time_per_iteration": 2.6074256896972656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061377, + "balance_loss_mlp": 1.02976298, + "epoch": 0.9224701808387842, + "flos": 527704644096.0, + "grad_norm": 0.053967741899602094, + "language_loss": 0.78791153, + "learning_rate": 1.5680539747640722e-05, + "loss": 0.79852533, + "num_input_tokens_seen": 396729472, + "router_z_loss_mlp": 0.31591797, + "step": 4795, + "time_per_iteration": 2.697005033493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063499, + "balance_loss_mlp": 1.03229022, + "epoch": 0.9226625625240477, + "flos": 874272794112.0, + "grad_norm": 0.04984598897704024, + "language_loss": 0.75265729, + "learning_rate": 1.5603224034155315e-05, + "loss": 0.76329225, + "num_input_tokens_seen": 396810384, + "router_z_loss_mlp": 0.31176758, + "step": 4796, + "time_per_iteration": 3.1383020877838135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064444, + "balance_loss_mlp": 1.03321099, + "epoch": 0.9228549442093112, + "flos": 502529080320.0, + "grad_norm": 0.06658857929604714, + "language_loss": 0.87877327, + "learning_rate": 1.5526096380176657e-05, + "loss": 0.88941771, + "num_input_tokens_seen": 396875472, + "router_z_loss_mlp": 0.31201172, + "step": 4797, + "time_per_iteration": 2.576430559158325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106157, + "balance_loss_mlp": 1.0299077, + "epoch": 0.9230473258945748, + "flos": 599705911296.0, + "grad_norm": 0.04563191808794579, + "language_loss": 0.84813899, + "learning_rate": 1.544915681564829e-05, + "loss": 0.85875475, + "num_input_tokens_seen": 396949888, + "router_z_loss_mlp": 0.31640625, + "step": 4798, + "time_per_iteration": 2.83512544631958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061344, + "balance_loss_mlp": 1.03018308, + "epoch": 0.9232397075798384, + "flos": 822168423936.0, + "grad_norm": 0.05160964536593656, + "language_loss": 0.7911216, + "learning_rate": 1.5372405370440822e-05, + "loss": 0.80173504, + "num_input_tokens_seen": 397027504, + "router_z_loss_mlp": 0.3112793, + "step": 4799, + "time_per_iteration": 3.1468732357025146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059473, + "balance_loss_mlp": 1.02850246, + "epoch": 0.923432089265102, + "flos": 706719900672.0, + "grad_norm": 0.05428627979787911, + "language_loss": 0.8464976, + "learning_rate": 1.5295842074351805e-05, + "loss": 0.85709232, + "num_input_tokens_seen": 397101600, + "router_z_loss_mlp": 0.30932617, + "step": 4800, + "time_per_iteration": 2.8784780502319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060913, + "balance_loss_mlp": 1.02951312, + "epoch": 0.9236244709503655, + "flos": 701554715136.0, + "grad_norm": 0.06190533279611968, + "language_loss": 0.76700497, + "learning_rate": 1.5219466957105798e-05, + "loss": 0.77761406, + "num_input_tokens_seen": 397170880, + "router_z_loss_mlp": 0.3137207, + "step": 4801, + "time_per_iteration": 2.8618581295013428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065155, + "balance_loss_mlp": 1.03466153, + "epoch": 0.9238168526356291, + "flos": 514780019712.0, + "grad_norm": 0.05113589786176994, + "language_loss": 0.83695769, + "learning_rate": 1.5143280048354136e-05, + "loss": 0.84760928, + "num_input_tokens_seen": 397242272, + "router_z_loss_mlp": 0.30444336, + "step": 4802, + "time_per_iteration": 2.5920772552490234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059251, + "balance_loss_mlp": 1.02842343, + "epoch": 0.9240092343208927, + "flos": 491789864448.0, + "grad_norm": 0.061659751026456815, + "language_loss": 0.8127811, + "learning_rate": 1.5067281377675213e-05, + "loss": 0.82337356, + "num_input_tokens_seen": 397308032, + "router_z_loss_mlp": 0.30786133, + "step": 4803, + "time_per_iteration": 2.563819646835327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058284, + "balance_loss_mlp": 1.02755177, + "epoch": 0.9242016160061562, + "flos": 646915728384.0, + "grad_norm": 0.05582004577056025, + "language_loss": 0.73311841, + "learning_rate": 1.4991470974574484e-05, + "loss": 0.74370122, + "num_input_tokens_seen": 397390944, + "router_z_loss_mlp": 0.30688477, + "step": 4804, + "time_per_iteration": 2.8679351806640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061479, + "balance_loss_mlp": 1.0312233, + "epoch": 0.9243939976914197, + "flos": 729094597632.0, + "grad_norm": 0.056639828384697895, + "language_loss": 0.78709513, + "learning_rate": 1.4915848868484016e-05, + "loss": 0.79770994, + "num_input_tokens_seen": 397468128, + "router_z_loss_mlp": 0.30200195, + "step": 4805, + "time_per_iteration": 3.022678852081299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060975, + "balance_loss_mlp": 1.03014719, + "epoch": 0.9245863793766833, + "flos": 452006244864.0, + "grad_norm": 0.0445420112549805, + "language_loss": 0.90410256, + "learning_rate": 1.4840415088763048e-05, + "loss": 0.91471231, + "num_input_tokens_seen": 397538976, + "router_z_loss_mlp": 0.30786133, + "step": 4806, + "time_per_iteration": 2.6259498596191406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063869, + "balance_loss_mlp": 1.03213537, + "epoch": 0.9247787610619469, + "flos": 754697945088.0, + "grad_norm": 0.052517724780417725, + "language_loss": 0.76738948, + "learning_rate": 1.476516966469732e-05, + "loss": 0.77802819, + "num_input_tokens_seen": 397612944, + "router_z_loss_mlp": 0.31713867, + "step": 4807, + "time_per_iteration": 2.9332311153411865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062399, + "balance_loss_mlp": 1.03049862, + "epoch": 0.9249711427472105, + "flos": 561640628736.0, + "grad_norm": 0.044414575006585695, + "language_loss": 0.84793067, + "learning_rate": 1.4690112625499908e-05, + "loss": 0.85855472, + "num_input_tokens_seen": 397690848, + "router_z_loss_mlp": 0.31884766, + "step": 4808, + "time_per_iteration": 2.7425179481506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062085, + "balance_loss_mlp": 1.02982748, + "epoch": 0.9251635244324741, + "flos": 526430057472.0, + "grad_norm": 0.052440534962070226, + "language_loss": 0.85194021, + "learning_rate": 1.4615244000310501e-05, + "loss": 0.86256105, + "num_input_tokens_seen": 397761008, + "router_z_loss_mlp": 0.32250977, + "step": 4809, + "time_per_iteration": 2.6689164638519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062775, + "balance_loss_mlp": 1.03118443, + "epoch": 0.9253559061177375, + "flos": 610982009856.0, + "grad_norm": 0.057022740257233, + "language_loss": 0.79165608, + "learning_rate": 1.4540563818195685e-05, + "loss": 0.80228388, + "num_input_tokens_seen": 397840640, + "router_z_loss_mlp": 0.31567383, + "step": 4810, + "time_per_iteration": 2.81392240524292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01005244, + "balance_loss_mlp": 0.99713796, + "epoch": 0.9255482878030011, + "flos": 1550461146624.0, + "grad_norm": 0.004507621566339502, + "language_loss": 0.76925391, + "learning_rate": 1.446607210814882e-05, + "loss": 0.77930635, + "num_input_tokens_seen": 398060096, + "router_z_loss_mlp": 0.08105469, + "step": 4811, + "time_per_iteration": 4.72790789604187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062769, + "balance_loss_mlp": 1.03141689, + "epoch": 0.9257406694882647, + "flos": 766008949248.0, + "grad_norm": 0.06092581196020588, + "language_loss": 0.8103286, + "learning_rate": 1.4391768899090219e-05, + "loss": 0.82095635, + "num_input_tokens_seen": 398143680, + "router_z_loss_mlp": 0.31323242, + "step": 4812, + "time_per_iteration": 3.064039707183838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064787, + "balance_loss_mlp": 1.03302956, + "epoch": 0.9259330511735283, + "flos": 497748008448.0, + "grad_norm": 0.053196549248037406, + "language_loss": 0.83248472, + "learning_rate": 1.431765421986686e-05, + "loss": 0.84313262, + "num_input_tokens_seen": 398207056, + "router_z_loss_mlp": 0.31738281, + "step": 4813, + "time_per_iteration": 2.5401344299316406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060071, + "balance_loss_mlp": 1.02910006, + "epoch": 0.9261254328587919, + "flos": 626589637632.0, + "grad_norm": 0.0906762335238156, + "language_loss": 0.78651297, + "learning_rate": 1.424372809925273e-05, + "loss": 0.79711372, + "num_input_tokens_seen": 398277472, + "router_z_loss_mlp": 0.30932617, + "step": 4814, + "time_per_iteration": 2.7242367267608643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105935, + "balance_loss_mlp": 1.02818894, + "epoch": 0.9263178145440554, + "flos": 597105865728.0, + "grad_norm": 0.05489571993390207, + "language_loss": 0.85417783, + "learning_rate": 1.416999056594831e-05, + "loss": 0.86477137, + "num_input_tokens_seen": 398346544, + "router_z_loss_mlp": 0.3112793, + "step": 4815, + "time_per_iteration": 2.7542569637298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059465, + "balance_loss_mlp": 1.02861321, + "epoch": 0.926510196229319, + "flos": 388350761472.0, + "grad_norm": 0.05346963676557139, + "language_loss": 0.83451992, + "learning_rate": 1.4096441648581259e-05, + "loss": 0.84511459, + "num_input_tokens_seen": 398409344, + "router_z_loss_mlp": 0.30810547, + "step": 4816, + "time_per_iteration": 2.5149407386779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060992, + "balance_loss_mlp": 1.02940106, + "epoch": 0.9267025779145825, + "flos": 545533996032.0, + "grad_norm": 0.056029117232846586, + "language_loss": 0.84429115, + "learning_rate": 1.4023081375705737e-05, + "loss": 0.85490108, + "num_input_tokens_seen": 398478816, + "router_z_loss_mlp": 0.31567383, + "step": 4817, + "time_per_iteration": 2.630242109298706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061904, + "balance_loss_mlp": 1.03036106, + "epoch": 0.9268949595998461, + "flos": 499540538880.0, + "grad_norm": 0.05209040991874763, + "language_loss": 0.81881189, + "learning_rate": 1.3949909775802682e-05, + "loss": 0.82943094, + "num_input_tokens_seen": 398550384, + "router_z_loss_mlp": 0.31518555, + "step": 4818, + "time_per_iteration": 2.7070581912994385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063037, + "balance_loss_mlp": 1.03037405, + "epoch": 0.9270873412851096, + "flos": 432601150464.0, + "grad_norm": 0.07453857754542507, + "language_loss": 0.82538891, + "learning_rate": 1.3876926877279817e-05, + "loss": 0.83601934, + "num_input_tokens_seen": 398620832, + "router_z_loss_mlp": 0.32666016, + "step": 4819, + "time_per_iteration": 2.63124942779541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057056, + "balance_loss_mlp": 1.02644277, + "epoch": 0.9272797229703732, + "flos": 466512403968.0, + "grad_norm": 0.05984200943619328, + "language_loss": 0.86118358, + "learning_rate": 1.380413270847164e-05, + "loss": 0.87175417, + "num_input_tokens_seen": 398689776, + "router_z_loss_mlp": 0.30566406, + "step": 4820, + "time_per_iteration": 2.6563737392425537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106259, + "balance_loss_mlp": 1.0314765, + "epoch": 0.9274721046556368, + "flos": 704486439936.0, + "grad_norm": 0.05327632622082771, + "language_loss": 0.78716862, + "learning_rate": 1.373152729763938e-05, + "loss": 0.79779452, + "num_input_tokens_seen": 398775072, + "router_z_loss_mlp": 0.31079102, + "step": 4821, + "time_per_iteration": 3.0101308822631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01005202, + "balance_loss_mlp": 0.99709588, + "epoch": 0.9276644863409004, + "flos": 1401486893568.0, + "grad_norm": 0.004162007556462311, + "language_loss": 0.82380462, + "learning_rate": 1.3659110672970931e-05, + "loss": 0.83385664, + "num_input_tokens_seen": 399002016, + "router_z_loss_mlp": 0.08105469, + "step": 4822, + "time_per_iteration": 4.932299375534058 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061034, + "balance_loss_mlp": 1.02965784, + "epoch": 0.927856868026164, + "flos": 741370268160.0, + "grad_norm": 0.044770471632053395, + "language_loss": 0.79991037, + "learning_rate": 1.3586882862580917e-05, + "loss": 0.81052071, + "num_input_tokens_seen": 399085808, + "router_z_loss_mlp": 0.31347656, + "step": 4823, + "time_per_iteration": 3.075979709625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062708, + "balance_loss_mlp": 1.03152299, + "epoch": 0.9280492497114274, + "flos": 412000045056.0, + "grad_norm": 0.05794564929676867, + "language_loss": 0.73926902, + "learning_rate": 1.3514843894510686e-05, + "loss": 0.74989611, + "num_input_tokens_seen": 399146768, + "router_z_loss_mlp": 0.31152344, + "step": 4824, + "time_per_iteration": 2.475565195083618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061263, + "balance_loss_mlp": 1.03003049, + "epoch": 0.928241631396691, + "flos": 646215902208.0, + "grad_norm": 0.05880246406469922, + "language_loss": 0.84044743, + "learning_rate": 1.3442993796728254e-05, + "loss": 0.85106003, + "num_input_tokens_seen": 399220192, + "router_z_loss_mlp": 0.31201172, + "step": 4825, + "time_per_iteration": 2.7478461265563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060328, + "balance_loss_mlp": 1.02926219, + "epoch": 0.9284340130819546, + "flos": 696537916416.0, + "grad_norm": 0.052202361516365085, + "language_loss": 0.80711192, + "learning_rate": 1.3371332597128249e-05, + "loss": 0.81771523, + "num_input_tokens_seen": 399300064, + "router_z_loss_mlp": 0.31030273, + "step": 4826, + "time_per_iteration": 2.9145355224609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062726, + "balance_loss_mlp": 1.03082526, + "epoch": 0.9286263947672182, + "flos": 758780600832.0, + "grad_norm": 0.04756980301049446, + "language_loss": 0.83686376, + "learning_rate": 1.3299860323532032e-05, + "loss": 0.84749097, + "num_input_tokens_seen": 399383200, + "router_z_loss_mlp": 0.31884766, + "step": 4827, + "time_per_iteration": 3.0382120609283447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061304, + "balance_loss_mlp": 1.03050017, + "epoch": 0.9288187764524817, + "flos": 672495754752.0, + "grad_norm": 0.05853151917870524, + "language_loss": 0.80073225, + "learning_rate": 1.3228577003687681e-05, + "loss": 0.81134522, + "num_input_tokens_seen": 399466400, + "router_z_loss_mlp": 0.30761719, + "step": 4828, + "time_per_iteration": 2.977632761001587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059808, + "balance_loss_mlp": 1.02857471, + "epoch": 0.9290111581377453, + "flos": 500220016128.0, + "grad_norm": 0.05187531918585966, + "language_loss": 0.83971095, + "learning_rate": 1.3157482665269727e-05, + "loss": 0.85030901, + "num_input_tokens_seen": 399533504, + "router_z_loss_mlp": 0.31201172, + "step": 4829, + "time_per_iteration": 2.5926513671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01005303, + "balance_loss_mlp": 0.99719697, + "epoch": 0.9292035398230089, + "flos": 1562773132800.0, + "grad_norm": 0.004143837665711613, + "language_loss": 0.72122061, + "learning_rate": 1.3086577335879424e-05, + "loss": 0.73127365, + "num_input_tokens_seen": 399769872, + "router_z_loss_mlp": 0.08105469, + "step": 4830, + "time_per_iteration": 4.936404228210449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01005303, + "balance_loss_mlp": 0.99719697, + "epoch": 0.9293959215082724, + "flos": 1517828709888.0, + "grad_norm": 0.004144572143444307, + "language_loss": 0.79511833, + "learning_rate": 1.3015861043044753e-05, + "loss": 0.80517137, + "num_input_tokens_seen": 399997760, + "router_z_loss_mlp": 0.08105469, + "step": 4831, + "time_per_iteration": 4.858310222625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061504, + "balance_loss_mlp": 1.03012788, + "epoch": 0.929588303193536, + "flos": 557572529664.0, + "grad_norm": 0.06711772734465098, + "language_loss": 0.84252775, + "learning_rate": 1.2945333814220195e-05, + "loss": 0.8531428, + "num_input_tokens_seen": 400063872, + "router_z_loss_mlp": 0.31347656, + "step": 4832, + "time_per_iteration": 2.642258644104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062693, + "balance_loss_mlp": 1.03134084, + "epoch": 0.9297806848787995, + "flos": 478338531840.0, + "grad_norm": 0.06848432277172195, + "language_loss": 0.80321014, + "learning_rate": 1.2874995676786905e-05, + "loss": 0.81383705, + "num_input_tokens_seen": 400126064, + "router_z_loss_mlp": 0.31323242, + "step": 4833, + "time_per_iteration": 2.535723924636841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064305, + "balance_loss_mlp": 1.03290522, + "epoch": 0.9299730665640631, + "flos": 564259613184.0, + "grad_norm": 0.04598376752179828, + "language_loss": 0.80186009, + "learning_rate": 1.2804846658052372e-05, + "loss": 0.8125031, + "num_input_tokens_seen": 400201776, + "router_z_loss_mlp": 0.3137207, + "step": 4834, + "time_per_iteration": 2.8007967472076416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061052, + "balance_loss_mlp": 1.03041565, + "epoch": 0.9301654482493267, + "flos": 559883003904.0, + "grad_norm": 0.050887682082653, + "language_loss": 0.82550877, + "learning_rate": 1.2734886785251032e-05, + "loss": 0.83611929, + "num_input_tokens_seen": 400279504, + "router_z_loss_mlp": 0.3059082, + "step": 4835, + "time_per_iteration": 2.780515193939209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004579, + "balance_loss_mlp": 0.9964726, + "epoch": 0.9303578299345903, + "flos": 1519251683328.0, + "grad_norm": 0.004150284998796472, + "language_loss": 0.76852441, + "learning_rate": 1.2665116085543715e-05, + "loss": 0.77857018, + "num_input_tokens_seen": 400514800, + "router_z_loss_mlp": 0.08105469, + "step": 4836, + "time_per_iteration": 4.956341981887817 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106311, + "balance_loss_mlp": 1.03147149, + "epoch": 0.9305502116198537, + "flos": 530589878784.0, + "grad_norm": 0.05359665120507271, + "language_loss": 0.82833552, + "learning_rate": 1.2595534586017698e-05, + "loss": 0.83896661, + "num_input_tokens_seen": 400582640, + "router_z_loss_mlp": 0.31640625, + "step": 4837, + "time_per_iteration": 2.6186673641204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106216, + "balance_loss_mlp": 1.03049755, + "epoch": 0.9307425933051173, + "flos": 474660338688.0, + "grad_norm": 0.06180694800901607, + "language_loss": 0.81545842, + "learning_rate": 1.2526142313686983e-05, + "loss": 0.82607996, + "num_input_tokens_seen": 400646912, + "router_z_loss_mlp": 0.31640625, + "step": 4838, + "time_per_iteration": 2.63519549369812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064317, + "balance_loss_mlp": 1.03258371, + "epoch": 0.9309349749903809, + "flos": 584600260608.0, + "grad_norm": 0.1559896918594763, + "language_loss": 0.86706674, + "learning_rate": 1.245693929549213e-05, + "loss": 0.87770993, + "num_input_tokens_seen": 400722128, + "router_z_loss_mlp": 0.31713867, + "step": 4839, + "time_per_iteration": 2.7265617847442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105852, + "balance_loss_mlp": 1.02771592, + "epoch": 0.9311273566756445, + "flos": 861298707456.0, + "grad_norm": 0.0464248741671958, + "language_loss": 0.76823103, + "learning_rate": 1.2387925558299984e-05, + "loss": 0.77881616, + "num_input_tokens_seen": 400801440, + "router_z_loss_mlp": 0.30761719, + "step": 4840, + "time_per_iteration": 3.1054060459136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062325, + "balance_loss_mlp": 1.0304966, + "epoch": 0.9313197383609081, + "flos": 547828503552.0, + "grad_norm": 0.05682803021706539, + "language_loss": 0.82184482, + "learning_rate": 1.231910112890411e-05, + "loss": 0.83246803, + "num_input_tokens_seen": 400873008, + "router_z_loss_mlp": 0.31835938, + "step": 4841, + "time_per_iteration": 2.6747379302978516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060001, + "balance_loss_mlp": 1.02833903, + "epoch": 0.9315121200461716, + "flos": 468520312320.0, + "grad_norm": 0.0713585448689604, + "language_loss": 0.81151795, + "learning_rate": 1.2250466034024522e-05, + "loss": 0.82211792, + "num_input_tokens_seen": 400935328, + "router_z_loss_mlp": 0.31640625, + "step": 4842, + "time_per_iteration": 2.541785955429077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061026, + "balance_loss_mlp": 1.03003192, + "epoch": 0.9317045017314352, + "flos": 417435863040.0, + "grad_norm": 0.05813435250991612, + "language_loss": 0.77865148, + "learning_rate": 1.2182020300307684e-05, + "loss": 0.7892617, + "num_input_tokens_seen": 401000720, + "router_z_loss_mlp": 0.30957031, + "step": 4843, + "time_per_iteration": 2.506622552871704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061434, + "balance_loss_mlp": 1.03063035, + "epoch": 0.9318968834166987, + "flos": 540207277056.0, + "grad_norm": 0.0508930555691298, + "language_loss": 0.76882333, + "learning_rate": 1.2113763954326729e-05, + "loss": 0.77943766, + "num_input_tokens_seen": 401079664, + "router_z_loss_mlp": 0.30761719, + "step": 4844, + "time_per_iteration": 2.7364871501922607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060895, + "balance_loss_mlp": 1.02944803, + "epoch": 0.9320892651019623, + "flos": 521077197312.0, + "grad_norm": 0.06090788976916376, + "language_loss": 0.80515504, + "learning_rate": 1.2045697022581015e-05, + "loss": 0.81576395, + "num_input_tokens_seen": 401146160, + "router_z_loss_mlp": 0.31420898, + "step": 4845, + "time_per_iteration": 2.6302125453948975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067349, + "balance_loss_mlp": 1.03566349, + "epoch": 0.9322816467872258, + "flos": 581779044864.0, + "grad_norm": 0.05303023243065918, + "language_loss": 0.80538929, + "learning_rate": 1.1977819531496348e-05, + "loss": 0.81606281, + "num_input_tokens_seen": 401223264, + "router_z_loss_mlp": 0.31665039, + "step": 4846, + "time_per_iteration": 2.740966796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057851, + "balance_loss_mlp": 1.02647471, + "epoch": 0.9324740284724894, + "flos": 484484350464.0, + "grad_norm": 0.06270216233520148, + "language_loss": 0.82024521, + "learning_rate": 1.191013150742537e-05, + "loss": 0.83082366, + "num_input_tokens_seen": 401296368, + "router_z_loss_mlp": 0.31347656, + "step": 4847, + "time_per_iteration": 2.6899847984313965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059256, + "balance_loss_mlp": 1.02733231, + "epoch": 0.932666410157753, + "flos": 732227143680.0, + "grad_norm": 0.056276578673258616, + "language_loss": 0.82572961, + "learning_rate": 1.1842632976646672e-05, + "loss": 0.83632219, + "num_input_tokens_seen": 401383936, + "router_z_loss_mlp": 0.3190918, + "step": 4848, + "time_per_iteration": 3.029046058654785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063015, + "balance_loss_mlp": 1.03149569, + "epoch": 0.9328587918430166, + "flos": 965127716352.0, + "grad_norm": 0.055194771265743715, + "language_loss": 0.78700304, + "learning_rate": 1.1775323965365681e-05, + "loss": 0.79763317, + "num_input_tokens_seen": 401468784, + "router_z_loss_mlp": 0.31494141, + "step": 4849, + "time_per_iteration": 3.231687545776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060415, + "balance_loss_mlp": 1.02865744, + "epoch": 0.9330511735282802, + "flos": 614270297088.0, + "grad_norm": 0.052004387996905495, + "language_loss": 0.80041909, + "learning_rate": 1.1708204499713936e-05, + "loss": 0.81102324, + "num_input_tokens_seen": 401539712, + "router_z_loss_mlp": 0.31738281, + "step": 4850, + "time_per_iteration": 2.6882708072662354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066853, + "balance_loss_mlp": 1.03516674, + "epoch": 0.9332435552135436, + "flos": 558823795200.0, + "grad_norm": 0.048091399288315254, + "language_loss": 0.8570627, + "learning_rate": 1.1641274605749653e-05, + "loss": 0.86773121, + "num_input_tokens_seen": 401610432, + "router_z_loss_mlp": 0.31665039, + "step": 4851, + "time_per_iteration": 2.7675979137420654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106148, + "balance_loss_mlp": 1.02984154, + "epoch": 0.9334359368988072, + "flos": 515281996800.0, + "grad_norm": 0.053519206023987825, + "language_loss": 0.82029992, + "learning_rate": 1.1574534309457208e-05, + "loss": 0.83091474, + "num_input_tokens_seen": 401677344, + "router_z_loss_mlp": 0.31616211, + "step": 4852, + "time_per_iteration": 2.5811235904693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061165, + "balance_loss_mlp": 1.03019428, + "epoch": 0.9336283185840708, + "flos": 539527799808.0, + "grad_norm": 0.043954400636649044, + "language_loss": 0.82742274, + "learning_rate": 1.1507983636747488e-05, + "loss": 0.83803439, + "num_input_tokens_seen": 401756864, + "router_z_loss_mlp": 0.30932617, + "step": 4853, + "time_per_iteration": 2.8195674419403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01005382, + "balance_loss_mlp": 0.99732333, + "epoch": 0.9338207002693344, + "flos": 1562003495424.0, + "grad_norm": 0.004487342279678937, + "language_loss": 0.78455019, + "learning_rate": 1.1441622613457824e-05, + "loss": 0.79460394, + "num_input_tokens_seen": 401983664, + "router_z_loss_mlp": 0.08056641, + "step": 4854, + "time_per_iteration": 4.930206298828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060266, + "balance_loss_mlp": 1.02898586, + "epoch": 0.9340130819545979, + "flos": 644951490048.0, + "grad_norm": 0.05547450276969274, + "language_loss": 0.81409979, + "learning_rate": 1.1375451265351833e-05, + "loss": 0.8247025, + "num_input_tokens_seen": 402065744, + "router_z_loss_mlp": 0.3125, + "step": 4855, + "time_per_iteration": 2.8930420875549316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063067, + "balance_loss_mlp": 1.03185797, + "epoch": 0.9342054636398615, + "flos": 503175062016.0, + "grad_norm": 0.05200041653829253, + "language_loss": 0.76766109, + "learning_rate": 1.1309469618119516e-05, + "loss": 0.77829176, + "num_input_tokens_seen": 402137728, + "router_z_loss_mlp": 0.31176758, + "step": 4856, + "time_per_iteration": 2.6574809551239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061148, + "balance_loss_mlp": 1.02962923, + "epoch": 0.934397845325125, + "flos": 592724874240.0, + "grad_norm": 0.07486950539873422, + "language_loss": 0.84321606, + "learning_rate": 1.1243677697377109e-05, + "loss": 0.8538276, + "num_input_tokens_seen": 402220160, + "router_z_loss_mlp": 0.31494141, + "step": 4857, + "time_per_iteration": 2.8692033290863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062363, + "balance_loss_mlp": 1.03198814, + "epoch": 0.9345902270103886, + "flos": 499643845632.0, + "grad_norm": 0.052677646415496285, + "language_loss": 0.80566096, + "learning_rate": 1.1178075528667453e-05, + "loss": 0.8162846, + "num_input_tokens_seen": 402285168, + "router_z_loss_mlp": 0.3034668, + "step": 4858, + "time_per_iteration": 2.6162071228027344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01005374, + "balance_loss_mlp": 0.99731594, + "epoch": 0.9347826086956522, + "flos": 1519563165696.0, + "grad_norm": 0.004492710484548213, + "language_loss": 0.7598772, + "learning_rate": 1.1112663137459566e-05, + "loss": 0.76993096, + "num_input_tokens_seen": 402504912, + "router_z_loss_mlp": 0.08056641, + "step": 4859, + "time_per_iteration": 4.6757166385650635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061693, + "balance_loss_mlp": 1.0306747, + "epoch": 0.9349749903809157, + "flos": 504273558528.0, + "grad_norm": 0.05481870385966651, + "language_loss": 0.81309426, + "learning_rate": 1.1047440549148636e-05, + "loss": 0.82371128, + "num_input_tokens_seen": 402582032, + "router_z_loss_mlp": 0.30981445, + "step": 4860, + "time_per_iteration": 2.792273998260498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059343, + "balance_loss_mlp": 1.02822995, + "epoch": 0.9351673720661793, + "flos": 568636222464.0, + "grad_norm": 0.06410233006416319, + "language_loss": 0.77857924, + "learning_rate": 1.0982407789056514e-05, + "loss": 0.78917265, + "num_input_tokens_seen": 402650144, + "router_z_loss_mlp": 0.31079102, + "step": 4861, + "time_per_iteration": 2.6576950550079346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058849, + "balance_loss_mlp": 1.02806914, + "epoch": 0.9353597537514429, + "flos": 544342367232.0, + "grad_norm": 0.05887078488817451, + "language_loss": 0.86108792, + "learning_rate": 1.0917564882430952e-05, + "loss": 0.87167645, + "num_input_tokens_seen": 402720368, + "router_z_loss_mlp": 0.30737305, + "step": 4862, + "time_per_iteration": 2.635856866836548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062763, + "balance_loss_mlp": 1.0311482, + "epoch": 0.9355521354367065, + "flos": 518743401984.0, + "grad_norm": 0.05060577749050637, + "language_loss": 0.84681392, + "learning_rate": 1.0852911854446368e-05, + "loss": 0.85744154, + "num_input_tokens_seen": 402795568, + "router_z_loss_mlp": 0.31591797, + "step": 4863, + "time_per_iteration": 2.744649887084961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060564, + "balance_loss_mlp": 1.02952147, + "epoch": 0.93574451712197, + "flos": 446087388672.0, + "grad_norm": 0.05386717349507103, + "language_loss": 0.78386593, + "learning_rate": 1.0788448730203237e-05, + "loss": 0.79447162, + "num_input_tokens_seen": 402858784, + "router_z_loss_mlp": 0.31005859, + "step": 4864, + "time_per_iteration": 2.4612977504730225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058845, + "balance_loss_mlp": 1.02804136, + "epoch": 0.9359368988072335, + "flos": 480273656832.0, + "grad_norm": 0.07391698555826227, + "language_loss": 0.77168214, + "learning_rate": 1.072417553472832e-05, + "loss": 0.78227055, + "num_input_tokens_seen": 402924144, + "router_z_loss_mlp": 0.30761719, + "step": 4865, + "time_per_iteration": 2.5211689472198486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062281, + "balance_loss_mlp": 1.03181124, + "epoch": 0.9361292804924971, + "flos": 496876474368.0, + "grad_norm": 0.058428574526624755, + "language_loss": 0.85151851, + "learning_rate": 1.0660092292974766e-05, + "loss": 0.86214131, + "num_input_tokens_seen": 402987488, + "router_z_loss_mlp": 0.30419922, + "step": 4866, + "time_per_iteration": 2.622624635696411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060087, + "balance_loss_mlp": 1.0294745, + "epoch": 0.9363216621777607, + "flos": 617830626816.0, + "grad_norm": 0.055760701356742395, + "language_loss": 0.84262055, + "learning_rate": 1.059619902982184e-05, + "loss": 0.85322142, + "num_input_tokens_seen": 403058224, + "router_z_loss_mlp": 0.30566406, + "step": 4867, + "time_per_iteration": 2.7364232540130615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01005383, + "balance_loss_mlp": 0.99727678, + "epoch": 0.9365140438630243, + "flos": 1415169570816.0, + "grad_norm": 0.004497337042276508, + "language_loss": 0.79203337, + "learning_rate": 1.053249577007509e-05, + "loss": 0.80208719, + "num_input_tokens_seen": 403289072, + "router_z_loss_mlp": 0.08105469, + "step": 4868, + "time_per_iteration": 4.865636110305786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062556, + "balance_loss_mlp": 1.03108454, + "epoch": 0.9367064255482878, + "flos": 590217960960.0, + "grad_norm": 0.04987850830161197, + "language_loss": 0.81500798, + "learning_rate": 1.0468982538466287e-05, + "loss": 0.82563359, + "num_input_tokens_seen": 403361728, + "router_z_loss_mlp": 0.31445312, + "step": 4869, + "time_per_iteration": 2.699848175048828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058901, + "balance_loss_mlp": 1.02764463, + "epoch": 0.9368988072335513, + "flos": 526384977408.0, + "grad_norm": 0.05441615624063978, + "language_loss": 0.81575727, + "learning_rate": 1.0405659359653597e-05, + "loss": 0.82634622, + "num_input_tokens_seen": 403431536, + "router_z_loss_mlp": 0.31225586, + "step": 4870, + "time_per_iteration": 2.6536993980407715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063994, + "balance_loss_mlp": 1.03202248, + "epoch": 0.9370911889188149, + "flos": 742880581632.0, + "grad_norm": 0.05850362106366467, + "language_loss": 0.78898335, + "learning_rate": 1.034252625822113e-05, + "loss": 0.79962337, + "num_input_tokens_seen": 403504768, + "router_z_loss_mlp": 0.31958008, + "step": 4871, + "time_per_iteration": 2.871039867401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062533, + "balance_loss_mlp": 1.03160965, + "epoch": 0.9372835706040785, + "flos": 545779897344.0, + "grad_norm": 0.051618847830223594, + "language_loss": 0.78720641, + "learning_rate": 1.0279583258679448e-05, + "loss": 0.79783177, + "num_input_tokens_seen": 403575584, + "router_z_loss_mlp": 0.30883789, + "step": 4872, + "time_per_iteration": 2.647601842880249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106415, + "balance_loss_mlp": 1.03224909, + "epoch": 0.9374759522893421, + "flos": 491367873024.0, + "grad_norm": 0.055623553349456685, + "language_loss": 0.81515515, + "learning_rate": 1.0216830385465003e-05, + "loss": 0.82579672, + "num_input_tokens_seen": 403648720, + "router_z_loss_mlp": 0.31884766, + "step": 4873, + "time_per_iteration": 2.6937618255615234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060458, + "balance_loss_mlp": 1.02903473, + "epoch": 0.9376683339746056, + "flos": 578144521728.0, + "grad_norm": 0.055922001744963604, + "language_loss": 0.82857215, + "learning_rate": 1.0154267662940809e-05, + "loss": 0.83917665, + "num_input_tokens_seen": 403721392, + "router_z_loss_mlp": 0.31396484, + "step": 4874, + "time_per_iteration": 2.6639533042907715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060789, + "balance_loss_mlp": 1.02912736, + "epoch": 0.9378607156598692, + "flos": 506039947776.0, + "grad_norm": 0.05510447696512615, + "language_loss": 0.80243266, + "learning_rate": 1.0091895115395766e-05, + "loss": 0.81304049, + "num_input_tokens_seen": 403792112, + "router_z_loss_mlp": 0.31640625, + "step": 4875, + "time_per_iteration": 2.614619016647339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059994, + "balance_loss_mlp": 1.02852273, + "epoch": 0.9380530973451328, + "flos": 519753148416.0, + "grad_norm": 0.07676786722670897, + "language_loss": 0.77718991, + "learning_rate": 1.0029712767045062e-05, + "loss": 0.78778982, + "num_input_tokens_seen": 403860928, + "router_z_loss_mlp": 0.31445312, + "step": 4876, + "time_per_iteration": 2.632483720779419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060472, + "balance_loss_mlp": 1.02964473, + "epoch": 0.9382454790303963, + "flos": 557533241856.0, + "grad_norm": 0.09739330390757249, + "language_loss": 0.84747428, + "learning_rate": 9.967720642029999e-06, + "loss": 0.85807896, + "num_input_tokens_seen": 403928240, + "router_z_loss_mlp": 0.30786133, + "step": 4877, + "time_per_iteration": 2.667158365249634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105889, + "balance_loss_mlp": 1.02815771, + "epoch": 0.9384378607156598, + "flos": 695149848576.0, + "grad_norm": 0.38308809515223985, + "language_loss": 0.81761467, + "learning_rate": 9.905918764418153e-06, + "loss": 0.82820356, + "num_input_tokens_seen": 404004320, + "router_z_loss_mlp": 0.30688477, + "step": 4878, + "time_per_iteration": 2.888810873031616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059406, + "balance_loss_mlp": 1.0284121, + "epoch": 0.9386302424009234, + "flos": 554480681472.0, + "grad_norm": 0.0575140000640527, + "language_loss": 0.80870175, + "learning_rate": 9.844307158203058e-06, + "loss": 0.81929588, + "num_input_tokens_seen": 404077040, + "router_z_loss_mlp": 0.30957031, + "step": 4879, + "time_per_iteration": 2.649317979812622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063144, + "balance_loss_mlp": 1.03210211, + "epoch": 0.938822624086187, + "flos": 566711271936.0, + "grad_norm": 0.05421321108233327, + "language_loss": 0.79483026, + "learning_rate": 9.782885847304469e-06, + "loss": 0.80546176, + "num_input_tokens_seen": 404145248, + "router_z_loss_mlp": 0.31005859, + "step": 4880, + "time_per_iteration": 2.6381187438964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060611, + "balance_loss_mlp": 1.02975965, + "epoch": 0.9390150057714506, + "flos": 417367461888.0, + "grad_norm": 0.05143950105590942, + "language_loss": 0.8027178, + "learning_rate": 9.721654855568196e-06, + "loss": 0.81332386, + "num_input_tokens_seen": 404212000, + "router_z_loss_mlp": 0.30810547, + "step": 4881, + "time_per_iteration": 2.5627872943878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059492, + "balance_loss_mlp": 1.02883101, + "epoch": 0.9392073874567142, + "flos": 1553281256448.0, + "grad_norm": 0.05634975894400428, + "language_loss": 0.76259017, + "learning_rate": 9.660614206766394e-06, + "loss": 0.77318507, + "num_input_tokens_seen": 404305408, + "router_z_loss_mlp": 0.30615234, + "step": 4882, + "time_per_iteration": 3.692448854446411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061611, + "balance_loss_mlp": 1.03099859, + "epoch": 0.9393997691419776, + "flos": 652238065152.0, + "grad_norm": 0.06867303852107298, + "language_loss": 0.77672702, + "learning_rate": 9.59976392459705e-06, + "loss": 0.78734314, + "num_input_tokens_seen": 404383248, + "router_z_loss_mlp": 0.30566406, + "step": 4883, + "time_per_iteration": 2.7691049575805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009065, + "balance_loss_mlp": 1.00091124, + "epoch": 0.9395921508272412, + "flos": 1552480639488.0, + "grad_norm": 0.003319864834589177, + "language_loss": 0.78170681, + "learning_rate": 9.539104032684209e-06, + "loss": 0.7917974, + "num_input_tokens_seen": 404615264, + "router_z_loss_mlp": 0.08154297, + "step": 4884, + "time_per_iteration": 4.804852247238159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062685, + "balance_loss_mlp": 1.03123808, + "epoch": 0.9397845325125048, + "flos": 497881838592.0, + "grad_norm": 0.054651008950097155, + "language_loss": 0.782938, + "learning_rate": 9.478634554578314e-06, + "loss": 0.7935648, + "num_input_tokens_seen": 404684656, + "router_z_loss_mlp": 0.31420898, + "step": 4885, + "time_per_iteration": 2.6088409423828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060175, + "balance_loss_mlp": 1.02944279, + "epoch": 0.9399769141977684, + "flos": 498348910080.0, + "grad_norm": 0.05219236638184246, + "language_loss": 0.83581132, + "learning_rate": 9.418355513755638e-06, + "loss": 0.84641308, + "num_input_tokens_seen": 404752096, + "router_z_loss_mlp": 0.30688477, + "step": 4886, + "time_per_iteration": 2.6242825984954834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009087, + "balance_loss_mlp": 1.00093293, + "epoch": 0.9401692958830319, + "flos": 1401709473792.0, + "grad_norm": 0.0033293422286518343, + "language_loss": 0.79332191, + "learning_rate": 9.358266933618575e-06, + "loss": 0.8034128, + "num_input_tokens_seen": 404980944, + "router_z_loss_mlp": 0.08154297, + "step": 4887, + "time_per_iteration": 4.809056997299194 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061863, + "balance_loss_mlp": 1.03063035, + "epoch": 0.9403616775682955, + "flos": 539852276736.0, + "grad_norm": 0.04229008056200792, + "language_loss": 0.85158825, + "learning_rate": 9.298368837495575e-06, + "loss": 0.86220682, + "num_input_tokens_seen": 405056688, + "router_z_loss_mlp": 0.31201172, + "step": 4888, + "time_per_iteration": 2.714353084564209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010096, + "balance_loss_mlp": 1.00144565, + "epoch": 0.9405540592535591, + "flos": 1321340663808.0, + "grad_norm": 0.004028749544672952, + "language_loss": 0.75169432, + "learning_rate": 9.238661248641089e-06, + "loss": 0.76179039, + "num_input_tokens_seen": 405284656, + "router_z_loss_mlp": 0.08154297, + "step": 4889, + "time_per_iteration": 4.866438388824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060754, + "balance_loss_mlp": 1.02985442, + "epoch": 0.9407464409388226, + "flos": 572097627648.0, + "grad_norm": 0.05566218913854017, + "language_loss": 0.82558531, + "learning_rate": 9.179144190235799e-06, + "loss": 0.83619285, + "num_input_tokens_seen": 405351584, + "router_z_loss_mlp": 0.30859375, + "step": 4890, + "time_per_iteration": 2.618964195251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064127, + "balance_loss_mlp": 1.03220248, + "epoch": 0.9409388226240862, + "flos": 510994137600.0, + "grad_norm": 0.04766222233300551, + "language_loss": 0.7677294, + "learning_rate": 9.119817685386112e-06, + "loss": 0.77837062, + "num_input_tokens_seen": 405425712, + "router_z_loss_mlp": 0.3190918, + "step": 4891, + "time_per_iteration": 2.7037875652313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009613, + "balance_loss_mlp": 1.00145948, + "epoch": 0.9411312043093497, + "flos": 1569060135936.0, + "grad_norm": 0.0040360903132469, + "language_loss": 0.80241883, + "learning_rate": 9.06068175712471e-06, + "loss": 0.81251502, + "num_input_tokens_seen": 405655760, + "router_z_loss_mlp": 0.08154297, + "step": 4892, + "time_per_iteration": 4.862957715988159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061584, + "balance_loss_mlp": 1.03099477, + "epoch": 0.9413235859946133, + "flos": 569197836288.0, + "grad_norm": 0.06563594953788358, + "language_loss": 0.78013027, + "learning_rate": 9.001736428410234e-06, + "loss": 0.79074609, + "num_input_tokens_seen": 405731664, + "router_z_loss_mlp": 0.30541992, + "step": 4893, + "time_per_iteration": 2.72495436668396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060652, + "balance_loss_mlp": 1.02989554, + "epoch": 0.9415159676798769, + "flos": 781567114752.0, + "grad_norm": 0.07880011282868978, + "language_loss": 0.80272818, + "learning_rate": 8.942981722127263e-06, + "loss": 0.8133347, + "num_input_tokens_seen": 405808128, + "router_z_loss_mlp": 0.30712891, + "step": 4894, + "time_per_iteration": 3.00886869430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064323, + "balance_loss_mlp": 1.03375769, + "epoch": 0.9417083493651405, + "flos": 848960428032.0, + "grad_norm": 0.06735892348971287, + "language_loss": 0.80011809, + "learning_rate": 8.884417661086331e-06, + "loss": 0.81076133, + "num_input_tokens_seen": 405892448, + "router_z_loss_mlp": 0.30517578, + "step": 4895, + "time_per_iteration": 3.1446125507354736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058314, + "balance_loss_mlp": 1.02741504, + "epoch": 0.941900731050404, + "flos": 529054834176.0, + "grad_norm": 0.05884920935493865, + "language_loss": 0.85655093, + "learning_rate": 8.826044268024025e-06, + "loss": 0.86713409, + "num_input_tokens_seen": 405966736, + "router_z_loss_mlp": 0.30859375, + "step": 4896, + "time_per_iteration": 2.6839241981506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061174, + "balance_loss_mlp": 1.02986979, + "epoch": 0.9420931127356675, + "flos": 556799920128.0, + "grad_norm": 0.05000327104616657, + "language_loss": 0.80053771, + "learning_rate": 8.767861565602997e-06, + "loss": 0.81114948, + "num_input_tokens_seen": 406043264, + "router_z_loss_mlp": 0.31274414, + "step": 4897, + "time_per_iteration": 2.7563629150390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061483, + "balance_loss_mlp": 1.03072667, + "epoch": 0.9422854944209311, + "flos": 652233682944.0, + "grad_norm": 0.05764227997043217, + "language_loss": 0.86452037, + "learning_rate": 8.709869576411733e-06, + "loss": 0.87513518, + "num_input_tokens_seen": 406119552, + "router_z_loss_mlp": 0.30712891, + "step": 4898, + "time_per_iteration": 2.875542640686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058844, + "balance_loss_mlp": 1.02777863, + "epoch": 0.9424778761061947, + "flos": 553417090560.0, + "grad_norm": 0.054125332002499735, + "language_loss": 0.83954608, + "learning_rate": 8.65206832296478e-06, + "loss": 0.85013449, + "num_input_tokens_seen": 406192464, + "router_z_loss_mlp": 0.31030273, + "step": 4899, + "time_per_iteration": 2.676485300064087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060281, + "balance_loss_mlp": 1.02921546, + "epoch": 0.9426702577914583, + "flos": 588287218176.0, + "grad_norm": 0.05730536240438454, + "language_loss": 0.79517835, + "learning_rate": 8.594457827702406e-06, + "loss": 0.80578119, + "num_input_tokens_seen": 406262640, + "router_z_loss_mlp": 0.31030273, + "step": 4900, + "time_per_iteration": 2.698810338973999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061683, + "balance_loss_mlp": 1.03064072, + "epoch": 0.9428626394767218, + "flos": 616329077760.0, + "grad_norm": 0.07093887274859992, + "language_loss": 0.78249103, + "learning_rate": 8.537038112991114e-06, + "loss": 0.79310787, + "num_input_tokens_seen": 406341328, + "router_z_loss_mlp": 0.31005859, + "step": 4901, + "time_per_iteration": 2.7636358737945557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061896, + "balance_loss_mlp": 1.03025842, + "epoch": 0.9430550211619854, + "flos": 610129414656.0, + "grad_norm": 0.057626683505121824, + "language_loss": 0.8184545, + "learning_rate": 8.479809201123178e-06, + "loss": 0.82907343, + "num_input_tokens_seen": 406418864, + "router_z_loss_mlp": 0.31616211, + "step": 4902, + "time_per_iteration": 2.6953253746032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061661, + "balance_loss_mlp": 1.03064311, + "epoch": 0.943247402847249, + "flos": 565726256640.0, + "grad_norm": 0.056063093918735984, + "language_loss": 0.78086585, + "learning_rate": 8.422771114316885e-06, + "loss": 0.79148251, + "num_input_tokens_seen": 406492320, + "router_z_loss_mlp": 0.30981445, + "step": 4903, + "time_per_iteration": 2.6811859607696533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060874, + "balance_loss_mlp": 1.03061843, + "epoch": 0.9434397845325125, + "flos": 526779265536.0, + "grad_norm": 0.05832485336087126, + "language_loss": 0.81253076, + "learning_rate": 8.365923874716297e-06, + "loss": 0.82313943, + "num_input_tokens_seen": 406560448, + "router_z_loss_mlp": 0.30200195, + "step": 4904, + "time_per_iteration": 2.5902273654937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064715, + "balance_loss_mlp": 1.03333879, + "epoch": 0.943632166217776, + "flos": 593167214592.0, + "grad_norm": 0.0538585081715504, + "language_loss": 0.82290983, + "learning_rate": 8.309267504391593e-06, + "loss": 0.83355695, + "num_input_tokens_seen": 406631376, + "router_z_loss_mlp": 0.31347656, + "step": 4905, + "time_per_iteration": 2.7094435691833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061362, + "balance_loss_mlp": 1.03051043, + "epoch": 0.9438245479030396, + "flos": 572468594688.0, + "grad_norm": 0.04298887979827658, + "language_loss": 0.85656631, + "learning_rate": 8.252802025338623e-06, + "loss": 0.86717987, + "num_input_tokens_seen": 406713728, + "router_z_loss_mlp": 0.30810547, + "step": 4906, + "time_per_iteration": 2.802527904510498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060239, + "balance_loss_mlp": 1.02919722, + "epoch": 0.9440169295883032, + "flos": 488018539008.0, + "grad_norm": 0.05907547898577151, + "language_loss": 0.81667566, + "learning_rate": 8.196527459479242e-06, + "loss": 0.82727802, + "num_input_tokens_seen": 406779168, + "router_z_loss_mlp": 0.31005859, + "step": 4907, + "time_per_iteration": 2.5595598220825195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061232, + "balance_loss_mlp": 1.02968919, + "epoch": 0.9442093112735668, + "flos": 731399279616.0, + "grad_norm": 0.05714065590928067, + "language_loss": 0.73537469, + "learning_rate": 8.140443828661137e-06, + "loss": 0.74598706, + "num_input_tokens_seen": 406860816, + "router_z_loss_mlp": 0.31518555, + "step": 4908, + "time_per_iteration": 3.017683267593384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065511, + "balance_loss_mlp": 1.03446937, + "epoch": 0.9444016929588304, + "flos": 570763404288.0, + "grad_norm": 0.06175757947134421, + "language_loss": 0.82048225, + "learning_rate": 8.084551154658004e-06, + "loss": 0.83113736, + "num_input_tokens_seen": 406929888, + "router_z_loss_mlp": 0.31005859, + "step": 4909, + "time_per_iteration": 2.673168659210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062254, + "balance_loss_mlp": 1.03032982, + "epoch": 0.9445940746440938, + "flos": 509038663680.0, + "grad_norm": 0.0760284691333908, + "language_loss": 0.85880816, + "learning_rate": 8.028849459169318e-06, + "loss": 0.86943078, + "num_input_tokens_seen": 406998224, + "router_z_loss_mlp": 0.3190918, + "step": 4910, + "time_per_iteration": 2.5887327194213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060208, + "balance_loss_mlp": 1.02926159, + "epoch": 0.9447864563293574, + "flos": 624247077888.0, + "grad_norm": 0.0535010023544938, + "language_loss": 0.80667341, + "learning_rate": 7.97333876382028e-06, + "loss": 0.81727552, + "num_input_tokens_seen": 407075088, + "router_z_loss_mlp": 0.30908203, + "step": 4911, + "time_per_iteration": 2.823601245880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059967, + "balance_loss_mlp": 1.02799463, + "epoch": 0.944978838014621, + "flos": 505011262464.0, + "grad_norm": 0.06633809570148991, + "language_loss": 0.8041541, + "learning_rate": 7.918019090162098e-06, + "loss": 0.81475377, + "num_input_tokens_seen": 407147792, + "router_z_loss_mlp": 0.31958008, + "step": 4912, + "time_per_iteration": 2.7652816772460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009718, + "balance_loss_mlp": 1.00156379, + "epoch": 0.9451712196998846, + "flos": 1483371809280.0, + "grad_norm": 0.00406090983810477, + "language_loss": 0.78287339, + "learning_rate": 7.862890459671812e-06, + "loss": 0.7929706, + "num_input_tokens_seen": 407387216, + "router_z_loss_mlp": 0.08154297, + "step": 4913, + "time_per_iteration": 4.969675779342651 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058714, + "balance_loss_mlp": 1.02781546, + "epoch": 0.9453636013851482, + "flos": 520885140480.0, + "grad_norm": 0.0550476227772574, + "language_loss": 0.9011662, + "learning_rate": 7.80795289375219e-06, + "loss": 0.9117533, + "num_input_tokens_seen": 407457664, + "router_z_loss_mlp": 0.30859375, + "step": 4914, + "time_per_iteration": 2.6254100799560547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009716, + "balance_loss_mlp": 1.00156236, + "epoch": 0.9455559830704117, + "flos": 1496060706816.0, + "grad_norm": 0.004058659107795025, + "language_loss": 0.8356235, + "learning_rate": 7.75320641373195e-06, + "loss": 0.84572065, + "num_input_tokens_seen": 407700256, + "router_z_loss_mlp": 0.08154297, + "step": 4915, + "time_per_iteration": 4.945310831069946 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062356, + "balance_loss_mlp": 1.03174341, + "epoch": 0.9457483647556753, + "flos": 497871664128.0, + "grad_norm": 0.05064963619563798, + "language_loss": 0.81528383, + "learning_rate": 7.698651040865534e-06, + "loss": 0.82590735, + "num_input_tokens_seen": 407770080, + "router_z_loss_mlp": 0.30566406, + "step": 4916, + "time_per_iteration": 2.631326913833618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065049, + "balance_loss_mlp": 1.03434074, + "epoch": 0.9459407464409388, + "flos": 1018979536896.0, + "grad_norm": 0.047302370588053054, + "language_loss": 0.82041919, + "learning_rate": 7.644286796333222e-06, + "loss": 0.83106971, + "num_input_tokens_seen": 407854640, + "router_z_loss_mlp": 0.30664062, + "step": 4917, + "time_per_iteration": 3.3984169960021973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065516, + "balance_loss_mlp": 1.0340929, + "epoch": 0.9461331281262024, + "flos": 513332315136.0, + "grad_norm": 0.06405124336828617, + "language_loss": 0.81178218, + "learning_rate": 7.590113701241075e-06, + "loss": 0.82243741, + "num_input_tokens_seen": 407922704, + "router_z_loss_mlp": 0.31396484, + "step": 4918, + "time_per_iteration": 2.6147992610931396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061487, + "balance_loss_mlp": 1.03058767, + "epoch": 0.9463255098114659, + "flos": 527768663040.0, + "grad_norm": 0.061511478598610246, + "language_loss": 0.77884614, + "learning_rate": 7.536131776620936e-06, + "loss": 0.78946102, + "num_input_tokens_seen": 407991136, + "router_z_loss_mlp": 0.30859375, + "step": 4919, + "time_per_iteration": 2.567692756652832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060745, + "balance_loss_mlp": 1.02972698, + "epoch": 0.9465178914967295, + "flos": 505798428672.0, + "grad_norm": 0.06093265985390115, + "language_loss": 0.83455086, + "learning_rate": 7.482341043430485e-06, + "loss": 0.84515834, + "num_input_tokens_seen": 408056576, + "router_z_loss_mlp": 0.30981445, + "step": 4920, + "time_per_iteration": 2.5662806034088135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058812, + "balance_loss_mlp": 1.02672136, + "epoch": 0.9467102731819931, + "flos": 659934895104.0, + "grad_norm": 0.05112335734363304, + "language_loss": 0.85568339, + "learning_rate": 7.428741522553184e-06, + "loss": 0.8662715, + "num_input_tokens_seen": 408136960, + "router_z_loss_mlp": 0.32080078, + "step": 4921, + "time_per_iteration": 2.886445999145508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059962, + "balance_loss_mlp": 1.02877688, + "epoch": 0.9469026548672567, + "flos": 674854281216.0, + "grad_norm": 0.04908759273743658, + "language_loss": 0.89305884, + "learning_rate": 7.375333234798054e-06, + "loss": 0.90365845, + "num_input_tokens_seen": 408218304, + "router_z_loss_mlp": 0.31152344, + "step": 4922, + "time_per_iteration": 2.920933961868286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061885, + "balance_loss_mlp": 1.03074789, + "epoch": 0.9470950365525203, + "flos": 513701872128.0, + "grad_norm": 0.06478709373660563, + "language_loss": 0.79722393, + "learning_rate": 7.32211620090012e-06, + "loss": 0.80784273, + "num_input_tokens_seen": 408287936, + "router_z_loss_mlp": 0.31103516, + "step": 4923, + "time_per_iteration": 2.6531217098236084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106274, + "balance_loss_mlp": 1.03122091, + "epoch": 0.9472874182377837, + "flos": 549823265280.0, + "grad_norm": 0.05014649935871143, + "language_loss": 0.81039178, + "learning_rate": 7.269090441520132e-06, + "loss": 0.82101917, + "num_input_tokens_seen": 408365568, + "router_z_loss_mlp": 0.31494141, + "step": 4924, + "time_per_iteration": 2.7968811988830566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060368, + "balance_loss_mlp": 1.0297308, + "epoch": 0.9474797999230473, + "flos": 542510548992.0, + "grad_norm": 0.05010652307615933, + "language_loss": 0.80013597, + "learning_rate": 7.216255977244457e-06, + "loss": 0.81073964, + "num_input_tokens_seen": 408431248, + "router_z_loss_mlp": 0.3059082, + "step": 4925, + "time_per_iteration": 2.610203266143799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106326, + "balance_loss_mlp": 1.03155029, + "epoch": 0.9476721816083109, + "flos": 844291427328.0, + "grad_norm": 0.056324444552239616, + "language_loss": 0.85505098, + "learning_rate": 7.163612828585242e-06, + "loss": 0.86568356, + "num_input_tokens_seen": 408514112, + "router_z_loss_mlp": 0.31689453, + "step": 4926, + "time_per_iteration": 3.0810704231262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061593, + "balance_loss_mlp": 1.03052688, + "epoch": 0.9478645632935745, + "flos": 637717349376.0, + "grad_norm": 0.05782852110229401, + "language_loss": 0.79276693, + "learning_rate": 7.1111610159803605e-06, + "loss": 0.80338287, + "num_input_tokens_seen": 408585968, + "router_z_loss_mlp": 0.31030273, + "step": 4927, + "time_per_iteration": 2.7429778575897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060754, + "balance_loss_mlp": 1.02968776, + "epoch": 0.948056944978838, + "flos": 656531716608.0, + "grad_norm": 0.05213979762076608, + "language_loss": 0.75814188, + "learning_rate": 7.058900559793469e-06, + "loss": 0.76874942, + "num_input_tokens_seen": 408665456, + "router_z_loss_mlp": 0.31030273, + "step": 4928, + "time_per_iteration": 2.811953067779541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010646, + "balance_loss_mlp": 1.03343904, + "epoch": 0.9482493266641016, + "flos": 440676301824.0, + "grad_norm": 0.06159460987031525, + "language_loss": 0.83276188, + "learning_rate": 7.00683148031378e-06, + "loss": 0.84340787, + "num_input_tokens_seen": 408730192, + "router_z_loss_mlp": 0.3112793, + "step": 4929, + "time_per_iteration": 2.560638666152954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059451, + "balance_loss_mlp": 1.02864742, + "epoch": 0.9484417083493651, + "flos": 545707113984.0, + "grad_norm": 0.05635941131383605, + "language_loss": 0.77704895, + "learning_rate": 6.9549537977564024e-06, + "loss": 0.78764343, + "num_input_tokens_seen": 408807616, + "router_z_loss_mlp": 0.30761719, + "step": 4930, + "time_per_iteration": 2.7607994079589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061911, + "balance_loss_mlp": 1.03070188, + "epoch": 0.9486340900346287, + "flos": 538325996544.0, + "grad_norm": 0.08183540435838546, + "language_loss": 0.7971375, + "learning_rate": 6.903267532262003e-06, + "loss": 0.80775654, + "num_input_tokens_seen": 408883552, + "router_z_loss_mlp": 0.31176758, + "step": 4931, + "time_per_iteration": 2.6748297214508057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060277, + "balance_loss_mlp": 1.02899647, + "epoch": 0.9488264717198923, + "flos": 681362454528.0, + "grad_norm": 0.05258246646499724, + "language_loss": 0.85742575, + "learning_rate": 6.851772703896975e-06, + "loss": 0.86802852, + "num_input_tokens_seen": 408956400, + "router_z_loss_mlp": 0.3125, + "step": 4932, + "time_per_iteration": 2.8221163749694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061977, + "balance_loss_mlp": 1.03086352, + "epoch": 0.9490188534051558, + "flos": 462365729280.0, + "grad_norm": 0.060862795367112775, + "language_loss": 0.88211328, + "learning_rate": 6.8004693326533805e-06, + "loss": 0.8927331, + "num_input_tokens_seen": 409019904, + "router_z_loss_mlp": 0.31079102, + "step": 4933, + "time_per_iteration": 2.5040442943573 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059018, + "balance_loss_mlp": 1.02904928, + "epoch": 0.9492112350904194, + "flos": 542865549312.0, + "grad_norm": 0.05128444619283133, + "language_loss": 0.82685566, + "learning_rate": 6.7493574384489e-06, + "loss": 0.83744586, + "num_input_tokens_seen": 409094288, + "router_z_loss_mlp": 0.29907227, + "step": 4934, + "time_per_iteration": 2.680206537246704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060329, + "balance_loss_mlp": 1.02938271, + "epoch": 0.949403616775683, + "flos": 550040053248.0, + "grad_norm": 0.05239087368411042, + "language_loss": 0.84056008, + "learning_rate": 6.698437041126992e-06, + "loss": 0.85116339, + "num_input_tokens_seen": 409169120, + "router_z_loss_mlp": 0.30908203, + "step": 4935, + "time_per_iteration": 2.693420648574829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061379, + "balance_loss_mlp": 1.03074229, + "epoch": 0.9495959984609466, + "flos": 598105437696.0, + "grad_norm": 0.04662425876845386, + "language_loss": 0.82682049, + "learning_rate": 6.647708160456678e-06, + "loss": 0.83743429, + "num_input_tokens_seen": 409243200, + "router_z_loss_mlp": 0.3059082, + "step": 4936, + "time_per_iteration": 2.7124640941619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063372, + "balance_loss_mlp": 1.03273535, + "epoch": 0.94978838014621, + "flos": 608130270720.0, + "grad_norm": 0.07024683277055332, + "language_loss": 0.82249677, + "learning_rate": 6.597170816132702e-06, + "loss": 0.8331306, + "num_input_tokens_seen": 409319264, + "router_z_loss_mlp": 0.3059082, + "step": 4937, + "time_per_iteration": 2.810114622116089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063007, + "balance_loss_mlp": 1.03132105, + "epoch": 0.9499807618314736, + "flos": 540575424000.0, + "grad_norm": 0.04925833827066514, + "language_loss": 0.8649928, + "learning_rate": 6.546825027775427e-06, + "loss": 0.87562287, + "num_input_tokens_seen": 409389840, + "router_z_loss_mlp": 0.31665039, + "step": 4938, + "time_per_iteration": 2.6485135555267334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065564, + "balance_loss_mlp": 1.03521299, + "epoch": 0.9501731435167372, + "flos": 594323937792.0, + "grad_norm": 0.046317056563734006, + "language_loss": 0.82939029, + "learning_rate": 6.496670814930717e-06, + "loss": 0.84004581, + "num_input_tokens_seen": 409458752, + "router_z_loss_mlp": 0.30297852, + "step": 4939, + "time_per_iteration": 2.687056303024292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063883, + "balance_loss_mlp": 1.03310299, + "epoch": 0.9503655252020008, + "flos": 453906464256.0, + "grad_norm": 0.06011478879513061, + "language_loss": 0.80201852, + "learning_rate": 6.446708197070161e-06, + "loss": 0.81265736, + "num_input_tokens_seen": 409525008, + "router_z_loss_mlp": 0.30737305, + "step": 4940, + "time_per_iteration": 2.528292179107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061453, + "balance_loss_mlp": 1.03041101, + "epoch": 0.9505579068872644, + "flos": 667649253888.0, + "grad_norm": 0.055113851279690214, + "language_loss": 0.846946, + "learning_rate": 6.396937193591079e-06, + "loss": 0.85756052, + "num_input_tokens_seen": 409603376, + "router_z_loss_mlp": 0.31005859, + "step": 4941, + "time_per_iteration": 2.823488235473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061943, + "balance_loss_mlp": 1.03085279, + "epoch": 0.9507502885725279, + "flos": 401989768704.0, + "grad_norm": 0.05825643192840984, + "language_loss": 0.81586736, + "learning_rate": 6.347357823816235e-06, + "loss": 0.82648677, + "num_input_tokens_seen": 409667168, + "router_z_loss_mlp": 0.31054688, + "step": 4942, + "time_per_iteration": 2.495529890060425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058501, + "balance_loss_mlp": 1.02719688, + "epoch": 0.9509426702577914, + "flos": 700015288320.0, + "grad_norm": 0.051863922569766165, + "language_loss": 0.79421437, + "learning_rate": 6.297970106994011e-06, + "loss": 0.80479932, + "num_input_tokens_seen": 409746832, + "router_z_loss_mlp": 0.31274414, + "step": 4943, + "time_per_iteration": 2.978654384613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058704, + "balance_loss_mlp": 1.02778077, + "epoch": 0.951135051943055, + "flos": 501170125824.0, + "grad_norm": 0.057314237178262395, + "language_loss": 0.82580554, + "learning_rate": 6.2487740622985126e-06, + "loss": 0.83639264, + "num_input_tokens_seen": 409813792, + "router_z_loss_mlp": 0.30883789, + "step": 4944, + "time_per_iteration": 2.5696232318878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059453, + "balance_loss_mlp": 1.02831542, + "epoch": 0.9513274336283186, + "flos": 614310994944.0, + "grad_norm": 0.06995172115852209, + "language_loss": 0.81611979, + "learning_rate": 6.1997697088292395e-06, + "loss": 0.82671428, + "num_input_tokens_seen": 409898848, + "router_z_loss_mlp": 0.31103516, + "step": 4945, + "time_per_iteration": 2.8979203701019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062862, + "balance_loss_mlp": 1.03184378, + "epoch": 0.9515198153135821, + "flos": 519334129152.0, + "grad_norm": 0.06029032553674963, + "language_loss": 0.81646979, + "learning_rate": 6.150957065611363e-06, + "loss": 0.82709849, + "num_input_tokens_seen": 409966368, + "router_z_loss_mlp": 0.30981445, + "step": 4946, + "time_per_iteration": 2.574259042739868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010644, + "balance_loss_mlp": 1.03381109, + "epoch": 0.9517121969988457, + "flos": 664622834688.0, + "grad_norm": 0.054183669348516254, + "language_loss": 0.76488018, + "learning_rate": 6.102336151595667e-06, + "loss": 0.7755242, + "num_input_tokens_seen": 410048496, + "router_z_loss_mlp": 0.30566406, + "step": 4947, + "time_per_iteration": 2.927349805831909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059457, + "balance_loss_mlp": 1.02791381, + "epoch": 0.9519045786841093, + "flos": 676108518912.0, + "grad_norm": 0.05907314462519681, + "language_loss": 0.75945526, + "learning_rate": 6.053906985658553e-06, + "loss": 0.77004981, + "num_input_tokens_seen": 410121840, + "router_z_loss_mlp": 0.31518555, + "step": 4948, + "time_per_iteration": 2.787550210952759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059452, + "balance_loss_mlp": 1.02836227, + "epoch": 0.9520969603693729, + "flos": 652593065472.0, + "grad_norm": 0.05009872847852139, + "language_loss": 0.80296874, + "learning_rate": 6.005669586601814e-06, + "loss": 0.81356323, + "num_input_tokens_seen": 410199152, + "router_z_loss_mlp": 0.31054688, + "step": 4949, + "time_per_iteration": 2.8136212825775146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062928, + "balance_loss_mlp": 1.03183794, + "epoch": 0.9522893420546364, + "flos": 742935836160.0, + "grad_norm": 0.046449409854488935, + "language_loss": 0.8303045, + "learning_rate": 5.957623973152748e-06, + "loss": 0.8409338, + "num_input_tokens_seen": 410285392, + "router_z_loss_mlp": 0.31054688, + "step": 4950, + "time_per_iteration": 3.0413155555725098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061178, + "balance_loss_mlp": 1.03013575, + "epoch": 0.9524817237398999, + "flos": 761364679680.0, + "grad_norm": 0.06237526863901178, + "language_loss": 0.80663252, + "learning_rate": 5.909770163964545e-06, + "loss": 0.81724423, + "num_input_tokens_seen": 410359872, + "router_z_loss_mlp": 0.31005859, + "step": 4951, + "time_per_iteration": 2.926941156387329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060308, + "balance_loss_mlp": 1.02936172, + "epoch": 0.9526741054251635, + "flos": 528871541760.0, + "grad_norm": 0.05333292784304159, + "language_loss": 0.82116485, + "learning_rate": 5.8621081776155105e-06, + "loss": 0.83176786, + "num_input_tokens_seen": 410425728, + "router_z_loss_mlp": 0.30908203, + "step": 4952, + "time_per_iteration": 2.572510004043579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060149, + "balance_loss_mlp": 1.02896416, + "epoch": 0.9528664871104271, + "flos": 488196039168.0, + "grad_norm": 0.06449124257612378, + "language_loss": 0.80692679, + "learning_rate": 5.814638032609787e-06, + "loss": 0.81752825, + "num_input_tokens_seen": 410496080, + "router_z_loss_mlp": 0.31152344, + "step": 4953, + "time_per_iteration": 2.5699658393859863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062492, + "balance_loss_mlp": 1.03142655, + "epoch": 0.9530588687956907, + "flos": 517464433152.0, + "grad_norm": 0.04487824642282098, + "language_loss": 0.8520484, + "learning_rate": 5.76735974737691e-06, + "loss": 0.86267328, + "num_input_tokens_seen": 410576448, + "router_z_loss_mlp": 0.31030273, + "step": 4954, + "time_per_iteration": 2.7860260009765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059671, + "balance_loss_mlp": 1.02831888, + "epoch": 0.9532512504809542, + "flos": 674833932288.0, + "grad_norm": 0.06088985073975803, + "language_loss": 0.80344075, + "learning_rate": 5.720273340271864e-06, + "loss": 0.81403744, + "num_input_tokens_seen": 410655792, + "router_z_loss_mlp": 0.31323242, + "step": 4955, + "time_per_iteration": 2.83512544631958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106091, + "balance_loss_mlp": 1.02960563, + "epoch": 0.9534436321662177, + "flos": 489269804544.0, + "grad_norm": 0.05399623732044483, + "language_loss": 0.83765268, + "learning_rate": 5.673378829575249e-06, + "loss": 0.84826177, + "num_input_tokens_seen": 410725440, + "router_z_loss_mlp": 0.31274414, + "step": 4956, + "time_per_iteration": 2.5622565746307373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064731, + "balance_loss_mlp": 1.03325951, + "epoch": 0.9536360138514813, + "flos": 496335209472.0, + "grad_norm": 0.05639798333533893, + "language_loss": 0.82038826, + "learning_rate": 5.626676233493167e-06, + "loss": 0.83103555, + "num_input_tokens_seen": 410797552, + "router_z_loss_mlp": 0.31445312, + "step": 4957, + "time_per_iteration": 2.6522533893585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106195, + "balance_loss_mlp": 1.03119373, + "epoch": 0.9538283955367449, + "flos": 801114803712.0, + "grad_norm": 0.052998960329489544, + "language_loss": 0.8405599, + "learning_rate": 5.580165570157114e-06, + "loss": 0.85117936, + "num_input_tokens_seen": 410876736, + "router_z_loss_mlp": 0.30712891, + "step": 4958, + "time_per_iteration": 3.0696020126342773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061291, + "balance_loss_mlp": 1.03003478, + "epoch": 0.9540207772220085, + "flos": 556386693120.0, + "grad_norm": 0.050926837280592614, + "language_loss": 0.79829109, + "learning_rate": 5.533846857624203e-06, + "loss": 0.80890399, + "num_input_tokens_seen": 410955632, + "router_z_loss_mlp": 0.31225586, + "step": 4959, + "time_per_iteration": 2.758847951889038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061461, + "balance_loss_mlp": 1.03044283, + "epoch": 0.954213158907272, + "flos": 684193844736.0, + "grad_norm": 0.056254773205571866, + "language_loss": 0.81409031, + "learning_rate": 5.487720113876882e-06, + "loss": 0.82470489, + "num_input_tokens_seen": 411038480, + "router_z_loss_mlp": 0.30981445, + "step": 4960, + "time_per_iteration": 2.8848278522491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059866, + "balance_loss_mlp": 1.02913427, + "epoch": 0.9544055405925356, + "flos": 535480049664.0, + "grad_norm": 0.06620853899260781, + "language_loss": 0.8256973, + "learning_rate": 5.441785356823214e-06, + "loss": 0.83629596, + "num_input_tokens_seen": 411109744, + "router_z_loss_mlp": 0.30688477, + "step": 4961, + "time_per_iteration": 2.7178971767425537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063811, + "balance_loss_mlp": 1.03272176, + "epoch": 0.9545979222777992, + "flos": 825025955328.0, + "grad_norm": 0.06182369055058564, + "language_loss": 0.80590069, + "learning_rate": 5.3960426042965476e-06, + "loss": 0.81653881, + "num_input_tokens_seen": 411202192, + "router_z_loss_mlp": 0.31054688, + "step": 4962, + "time_per_iteration": 3.109530448913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063297, + "balance_loss_mlp": 1.03232658, + "epoch": 0.9547903039630627, + "flos": 761326801920.0, + "grad_norm": 0.052315265148481546, + "language_loss": 0.77177644, + "learning_rate": 5.3504918740558405e-06, + "loss": 0.78240943, + "num_input_tokens_seen": 411289248, + "router_z_loss_mlp": 0.30932617, + "step": 4963, + "time_per_iteration": 3.0747199058532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064785, + "balance_loss_mlp": 1.03367114, + "epoch": 0.9549826856483262, + "flos": 515050652160.0, + "grad_norm": 0.05862888222606825, + "language_loss": 0.82517004, + "learning_rate": 5.3051331837855045e-06, + "loss": 0.83581787, + "num_input_tokens_seen": 411355232, + "router_z_loss_mlp": 0.31079102, + "step": 4964, + "time_per_iteration": 2.5942893028259277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058928, + "balance_loss_mlp": 1.02857697, + "epoch": 0.9551750673335898, + "flos": 642818515968.0, + "grad_norm": 0.08814451705350242, + "language_loss": 0.82504213, + "learning_rate": 5.259966551095341e-06, + "loss": 0.83563137, + "num_input_tokens_seen": 411432288, + "router_z_loss_mlp": 0.30297852, + "step": 4965, + "time_per_iteration": 2.8214685916900635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106332, + "balance_loss_mlp": 1.03199232, + "epoch": 0.9553674490188534, + "flos": 471967160832.0, + "grad_norm": 0.06036166806665559, + "language_loss": 0.82877362, + "learning_rate": 5.214991993520546e-06, + "loss": 0.83940685, + "num_input_tokens_seen": 411499376, + "router_z_loss_mlp": 0.31298828, + "step": 4966, + "time_per_iteration": 2.5931785106658936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063165, + "balance_loss_mlp": 1.03209853, + "epoch": 0.955559830704117, + "flos": 528064026624.0, + "grad_norm": 0.057495617143447204, + "language_loss": 0.81656486, + "learning_rate": 5.170209528521763e-06, + "loss": 0.82719648, + "num_input_tokens_seen": 411564976, + "router_z_loss_mlp": 0.31030273, + "step": 4967, + "time_per_iteration": 2.5922937393188477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062233, + "balance_loss_mlp": 1.03064275, + "epoch": 0.9557522123893806, + "flos": 547907079168.0, + "grad_norm": 0.05879724431171985, + "language_loss": 0.83928549, + "learning_rate": 5.125619173485196e-06, + "loss": 0.84990788, + "num_input_tokens_seen": 411636464, + "router_z_loss_mlp": 0.31567383, + "step": 4968, + "time_per_iteration": 2.5986125469207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059963, + "balance_loss_mlp": 1.02887332, + "epoch": 0.955944594074644, + "flos": 509201607168.0, + "grad_norm": 0.04811907274650132, + "language_loss": 0.81663108, + "learning_rate": 5.08122094572222e-06, + "loss": 0.82723069, + "num_input_tokens_seen": 411710672, + "router_z_loss_mlp": 0.31054688, + "step": 4969, + "time_per_iteration": 2.6879472732543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062537, + "balance_loss_mlp": 1.03163767, + "epoch": 0.9561369757599076, + "flos": 527297209344.0, + "grad_norm": 0.0632836097408621, + "language_loss": 0.7964825, + "learning_rate": 5.037014862469824e-06, + "loss": 0.80710787, + "num_input_tokens_seen": 411785616, + "router_z_loss_mlp": 0.30859375, + "step": 4970, + "time_per_iteration": 2.7789974212646484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064602, + "balance_loss_mlp": 1.03363097, + "epoch": 0.9563293574451712, + "flos": 497950239744.0, + "grad_norm": 0.05808374909339728, + "language_loss": 0.79714704, + "learning_rate": 4.993000940890391e-06, + "loss": 0.80779302, + "num_input_tokens_seen": 411854832, + "router_z_loss_mlp": 0.30932617, + "step": 4971, + "time_per_iteration": 2.5803020000457764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010642, + "balance_loss_mlp": 1.00253558, + "epoch": 0.9565217391304348, + "flos": 1408160982528.0, + "grad_norm": 0.0036852795244430637, + "language_loss": 0.81773561, + "learning_rate": 4.949179198071585e-06, + "loss": 0.82784206, + "num_input_tokens_seen": 412081856, + "router_z_loss_mlp": 0.08105469, + "step": 4972, + "time_per_iteration": 4.861463785171509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060113, + "balance_loss_mlp": 1.02949977, + "epoch": 0.9567141208156984, + "flos": 503588289024.0, + "grad_norm": 0.0458139454633811, + "language_loss": 0.78067911, + "learning_rate": 4.905549651026464e-06, + "loss": 0.79128021, + "num_input_tokens_seen": 412155600, + "router_z_loss_mlp": 0.30566406, + "step": 4973, + "time_per_iteration": 2.7627005577087402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062012, + "balance_loss_mlp": 1.03063631, + "epoch": 0.9569065025009619, + "flos": 432985264128.0, + "grad_norm": 0.0685337363619886, + "language_loss": 0.79855549, + "learning_rate": 4.86211231669359e-06, + "loss": 0.80917561, + "num_input_tokens_seen": 412219584, + "router_z_loss_mlp": 0.31347656, + "step": 4974, + "time_per_iteration": 2.4670193195343018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061097, + "balance_loss_mlp": 1.03055596, + "epoch": 0.9570988841862255, + "flos": 589662139392.0, + "grad_norm": 0.08497367515573702, + "language_loss": 0.78133345, + "learning_rate": 4.818867211936806e-06, + "loss": 0.79194438, + "num_input_tokens_seen": 412295088, + "router_z_loss_mlp": 0.30493164, + "step": 4975, + "time_per_iteration": 2.7725143432617188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106144, + "balance_loss_mlp": 1.03037453, + "epoch": 0.957291265871489, + "flos": 766938710016.0, + "grad_norm": 0.09684450321681563, + "language_loss": 0.78521204, + "learning_rate": 4.7758143535454045e-06, + "loss": 0.79582649, + "num_input_tokens_seen": 412376992, + "router_z_loss_mlp": 0.31030273, + "step": 4976, + "time_per_iteration": 3.0132195949554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062638, + "balance_loss_mlp": 1.031739, + "epoch": 0.9574836475567526, + "flos": 638820228096.0, + "grad_norm": 0.058082259437561116, + "language_loss": 0.84378779, + "learning_rate": 4.732953758233849e-06, + "loss": 0.85441422, + "num_input_tokens_seen": 412450064, + "router_z_loss_mlp": 0.30883789, + "step": 4977, + "time_per_iteration": 2.775636672973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010609, + "balance_loss_mlp": 1.00250316, + "epoch": 0.9576760292420161, + "flos": 1575077916672.0, + "grad_norm": 0.0036801546071174532, + "language_loss": 0.78607261, + "learning_rate": 4.690285442642272e-06, + "loss": 0.7961787, + "num_input_tokens_seen": 412676896, + "router_z_loss_mlp": 0.08105469, + "step": 4978, + "time_per_iteration": 4.957335710525513 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064079, + "balance_loss_mlp": 1.03224993, + "epoch": 0.9578684109272797, + "flos": 496089308160.0, + "grad_norm": 0.05585876202637502, + "language_loss": 0.86942095, + "learning_rate": 4.6478094233358695e-06, + "loss": 0.88006175, + "num_input_tokens_seen": 412746848, + "router_z_loss_mlp": 0.31811523, + "step": 4979, + "time_per_iteration": 2.6361520290374756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063178, + "balance_loss_mlp": 1.03173101, + "epoch": 0.9580607926125433, + "flos": 429730472448.0, + "grad_norm": 0.06248863095156401, + "language_loss": 0.84962738, + "learning_rate": 4.605525716805337e-06, + "loss": 0.86025918, + "num_input_tokens_seen": 412810144, + "router_z_loss_mlp": 0.31420898, + "step": 4980, + "time_per_iteration": 2.473877191543579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062381, + "balance_loss_mlp": 1.03129125, + "epoch": 0.9582531742978069, + "flos": 1126796659200.0, + "grad_norm": 0.05496423205034748, + "language_loss": 0.79914278, + "learning_rate": 4.563434339466599e-06, + "loss": 0.80976653, + "num_input_tokens_seen": 412904768, + "router_z_loss_mlp": 0.31054688, + "step": 4981, + "time_per_iteration": 3.5336828231811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062471, + "balance_loss_mlp": 1.03061819, + "epoch": 0.9584455559830705, + "flos": 524185012224.0, + "grad_norm": 0.049558760655255135, + "language_loss": 0.78986633, + "learning_rate": 4.521535307661085e-06, + "loss": 0.80049098, + "num_input_tokens_seen": 412974592, + "router_z_loss_mlp": 0.31835938, + "step": 4982, + "time_per_iteration": 2.6623659133911133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063285, + "balance_loss_mlp": 1.03198099, + "epoch": 0.9586379376683339, + "flos": 633873240576.0, + "grad_norm": 0.05452483002568014, + "language_loss": 0.80709702, + "learning_rate": 4.479828637655392e-06, + "loss": 0.81772989, + "num_input_tokens_seen": 413052848, + "router_z_loss_mlp": 0.31274414, + "step": 4983, + "time_per_iteration": 2.8733677864074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106139, + "balance_loss_mlp": 1.03025281, + "epoch": 0.9588303193535975, + "flos": 415831007232.0, + "grad_norm": 0.05618867456801452, + "language_loss": 0.83768463, + "learning_rate": 4.438314345641459e-06, + "loss": 0.84829855, + "num_input_tokens_seen": 413118000, + "router_z_loss_mlp": 0.31103516, + "step": 4984, + "time_per_iteration": 2.485130548477173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061229, + "balance_loss_mlp": 1.03006768, + "epoch": 0.9590227010388611, + "flos": 481440554496.0, + "grad_norm": 0.05863368736289354, + "language_loss": 0.78077298, + "learning_rate": 4.3969924477365585e-06, + "loss": 0.79138523, + "num_input_tokens_seen": 413185616, + "router_z_loss_mlp": 0.3112793, + "step": 4985, + "time_per_iteration": 2.5876057147979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060016, + "balance_loss_mlp": 1.02904499, + "epoch": 0.9592150827241247, + "flos": 684214193664.0, + "grad_norm": 0.05594536844017473, + "language_loss": 0.80234873, + "learning_rate": 4.355862959983359e-06, + "loss": 0.81294882, + "num_input_tokens_seen": 413265616, + "router_z_loss_mlp": 0.30932617, + "step": 4986, + "time_per_iteration": 2.9769937992095947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058834, + "balance_loss_mlp": 1.02800655, + "epoch": 0.9594074644093882, + "flos": 574205870592.0, + "grad_norm": 0.05549029707808997, + "language_loss": 0.70972621, + "learning_rate": 4.314925898349642e-06, + "loss": 0.72031456, + "num_input_tokens_seen": 413341248, + "router_z_loss_mlp": 0.30786133, + "step": 4987, + "time_per_iteration": 2.7206904888153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059627, + "balance_loss_mlp": 1.02856088, + "epoch": 0.9595998460946518, + "flos": 546593204736.0, + "grad_norm": 0.06100041490887349, + "language_loss": 0.7789138, + "learning_rate": 4.2741812787286395e-06, + "loss": 0.78951007, + "num_input_tokens_seen": 413416080, + "router_z_loss_mlp": 0.31030273, + "step": 4988, + "time_per_iteration": 2.7511510848999023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061435, + "balance_loss_mlp": 1.02998781, + "epoch": 0.9597922277799154, + "flos": 473798979072.0, + "grad_norm": 0.06528513984211147, + "language_loss": 0.78195035, + "learning_rate": 4.233629116938809e-06, + "loss": 0.79256475, + "num_input_tokens_seen": 413482336, + "router_z_loss_mlp": 0.31420898, + "step": 4989, + "time_per_iteration": 2.5284314155578613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106279, + "balance_loss_mlp": 1.03122306, + "epoch": 0.9599846094651789, + "flos": 514435193856.0, + "grad_norm": 0.0526648321296654, + "language_loss": 0.85647953, + "learning_rate": 4.193269428723889e-06, + "loss": 0.86710739, + "num_input_tokens_seen": 413553248, + "router_z_loss_mlp": 0.31542969, + "step": 4990, + "time_per_iteration": 2.629483938217163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063356, + "balance_loss_mlp": 1.03233767, + "epoch": 0.9601769911504425, + "flos": 594689112576.0, + "grad_norm": 0.06076750451887931, + "language_loss": 0.7845335, + "learning_rate": 4.1531022297529035e-06, + "loss": 0.79516703, + "num_input_tokens_seen": 413625776, + "router_z_loss_mlp": 0.30981445, + "step": 4991, + "time_per_iteration": 2.767305850982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064329, + "balance_loss_mlp": 1.03381109, + "epoch": 0.960369372835706, + "flos": 492755940864.0, + "grad_norm": 0.043940554516399624, + "language_loss": 0.7891221, + "learning_rate": 4.1131275356201536e-06, + "loss": 0.79976535, + "num_input_tokens_seen": 413693056, + "router_z_loss_mlp": 0.3046875, + "step": 4992, + "time_per_iteration": 2.632108211517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059952, + "balance_loss_mlp": 1.02862382, + "epoch": 0.9605617545209696, + "flos": 579016055808.0, + "grad_norm": 0.05380542544477012, + "language_loss": 0.82685798, + "learning_rate": 4.073345361845171e-06, + "loss": 0.83745754, + "num_input_tokens_seen": 413765616, + "router_z_loss_mlp": 0.31298828, + "step": 4993, + "time_per_iteration": 2.7149124145507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059008, + "balance_loss_mlp": 1.0279901, + "epoch": 0.9607541362062332, + "flos": 927312717312.0, + "grad_norm": 0.052236302750863224, + "language_loss": 0.86238164, + "learning_rate": 4.033755723872767e-06, + "loss": 0.87297165, + "num_input_tokens_seen": 413850976, + "router_z_loss_mlp": 0.30981445, + "step": 4994, + "time_per_iteration": 3.2594830989837646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064119, + "balance_loss_mlp": 1.03255212, + "epoch": 0.9609465178914968, + "flos": 572832359424.0, + "grad_norm": 0.052246235549541574, + "language_loss": 0.75361091, + "learning_rate": 3.994358637073036e-06, + "loss": 0.76425207, + "num_input_tokens_seen": 413931648, + "router_z_loss_mlp": 0.31542969, + "step": 4995, + "time_per_iteration": 2.821571111679077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058679, + "balance_loss_mlp": 1.02792275, + "epoch": 0.9611388995767602, + "flos": 530585496576.0, + "grad_norm": 0.05476477352747421, + "language_loss": 0.85442054, + "learning_rate": 3.955154116741244e-06, + "loss": 0.86500728, + "num_input_tokens_seen": 414003216, + "router_z_loss_mlp": 0.30712891, + "step": 4996, + "time_per_iteration": 2.6323540210723877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057031, + "balance_loss_mlp": 1.0260129, + "epoch": 0.9613312812620238, + "flos": 645959826432.0, + "grad_norm": 0.05351133483249604, + "language_loss": 0.81781733, + "learning_rate": 3.916142178097881e-06, + "loss": 0.82838762, + "num_input_tokens_seen": 414077072, + "router_z_loss_mlp": 0.30981445, + "step": 4997, + "time_per_iteration": 2.7790870666503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106076, + "balance_loss_mlp": 1.03000379, + "epoch": 0.9615236629472874, + "flos": 495897251328.0, + "grad_norm": 0.05497898796109047, + "language_loss": 0.77663916, + "learning_rate": 3.877322836288888e-06, + "loss": 0.78724676, + "num_input_tokens_seen": 414157600, + "router_z_loss_mlp": 0.30712891, + "step": 4998, + "time_per_iteration": 2.888197183609009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063994, + "balance_loss_mlp": 1.03237975, + "epoch": 0.961716044632551, + "flos": 512716856832.0, + "grad_norm": 0.050530888251375694, + "language_loss": 0.75301838, + "learning_rate": 3.838696106385153e-06, + "loss": 0.76365829, + "num_input_tokens_seen": 414224880, + "router_z_loss_mlp": 0.31591797, + "step": 4999, + "time_per_iteration": 2.595407009124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065785, + "balance_loss_mlp": 1.03431368, + "epoch": 0.9619084263178146, + "flos": 500835474432.0, + "grad_norm": 0.07679880414039399, + "language_loss": 0.80363154, + "learning_rate": 3.800262003382904e-06, + "loss": 0.81428945, + "num_input_tokens_seen": 414291728, + "router_z_loss_mlp": 0.31445312, + "step": 5000, + "time_per_iteration": 2.579832077026367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106128, + "balance_loss_mlp": 1.02935529, + "epoch": 0.9621008080030781, + "flos": 595343858688.0, + "grad_norm": 0.060590971935696514, + "language_loss": 0.74907798, + "learning_rate": 3.7620205422035923e-06, + "loss": 0.75969076, + "num_input_tokens_seen": 414369568, + "router_z_loss_mlp": 0.3190918, + "step": 5001, + "time_per_iteration": 2.754000425338745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061748, + "balance_loss_mlp": 1.0305872, + "epoch": 0.9622931896883417, + "flos": 502002372096.0, + "grad_norm": 0.05936854573228492, + "language_loss": 0.8188566, + "learning_rate": 3.723971737693899e-06, + "loss": 0.82947409, + "num_input_tokens_seen": 414441424, + "router_z_loss_mlp": 0.3112793, + "step": 5002, + "time_per_iteration": 2.609647274017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059702, + "balance_loss_mlp": 1.02827847, + "epoch": 0.9624855713736052, + "flos": 606998278656.0, + "grad_norm": 0.05580409062839421, + "language_loss": 0.80881554, + "learning_rate": 3.6861156046256728e-06, + "loss": 0.81941253, + "num_input_tokens_seen": 414512960, + "router_z_loss_mlp": 0.31396484, + "step": 5003, + "time_per_iteration": 2.761650800704956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061746, + "balance_loss_mlp": 1.03075206, + "epoch": 0.9626779530588688, + "flos": 510461637120.0, + "grad_norm": 0.0549718452181321, + "language_loss": 0.8447454, + "learning_rate": 3.648452157695936e-06, + "loss": 0.85536283, + "num_input_tokens_seen": 414577392, + "router_z_loss_mlp": 0.30957031, + "step": 5004, + "time_per_iteration": 2.553931951522827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060462, + "balance_loss_mlp": 1.02946782, + "epoch": 0.9628703347441323, + "flos": 626994100224.0, + "grad_norm": 0.05616371519000272, + "language_loss": 0.8239938, + "learning_rate": 3.610981411526937e-06, + "loss": 0.83459842, + "num_input_tokens_seen": 414655152, + "router_z_loss_mlp": 0.30957031, + "step": 5005, + "time_per_iteration": 2.8171026706695557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060597, + "balance_loss_mlp": 1.02938795, + "epoch": 0.9630627164293959, + "flos": 630474444288.0, + "grad_norm": 0.057826844797486406, + "language_loss": 0.77271199, + "learning_rate": 3.573703380666149e-06, + "loss": 0.78331804, + "num_input_tokens_seen": 414730432, + "router_z_loss_mlp": 0.31176758, + "step": 5006, + "time_per_iteration": 2.7363760471343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064818, + "balance_loss_mlp": 1.03368068, + "epoch": 0.9632550981146595, + "flos": 570267219456.0, + "grad_norm": 0.049148732947391416, + "language_loss": 0.78805315, + "learning_rate": 3.5366180795861622e-06, + "loss": 0.79870129, + "num_input_tokens_seen": 414810688, + "router_z_loss_mlp": 0.31103516, + "step": 5007, + "time_per_iteration": 2.798482894897461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062148, + "balance_loss_mlp": 1.03000939, + "epoch": 0.9634474797999231, + "flos": 465857657856.0, + "grad_norm": 0.061293995921825266, + "language_loss": 0.80628145, + "learning_rate": 3.4997255226847937e-06, + "loss": 0.81690294, + "num_input_tokens_seen": 414880544, + "router_z_loss_mlp": 0.32128906, + "step": 5008, + "time_per_iteration": 2.642397403717041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106517, + "balance_loss_mlp": 1.03365088, + "epoch": 0.9636398614851867, + "flos": 526345689600.0, + "grad_norm": 0.06183115681843797, + "language_loss": 0.85305095, + "learning_rate": 3.463025724284974e-06, + "loss": 0.86370265, + "num_input_tokens_seen": 414949920, + "router_z_loss_mlp": 0.31494141, + "step": 5009, + "time_per_iteration": 4.074778079986572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060705, + "balance_loss_mlp": 1.03002095, + "epoch": 0.9638322431704501, + "flos": 564554976768.0, + "grad_norm": 0.05273989252047438, + "language_loss": 0.75239956, + "learning_rate": 3.4265186986348618e-06, + "loss": 0.76300663, + "num_input_tokens_seen": 415024288, + "router_z_loss_mlp": 0.30639648, + "step": 5010, + "time_per_iteration": 2.758338689804077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062801, + "balance_loss_mlp": 1.03113854, + "epoch": 0.9640246248557137, + "flos": 477531016704.0, + "grad_norm": 0.05564071805678616, + "language_loss": 0.84424335, + "learning_rate": 3.3902044599076754e-06, + "loss": 0.85487133, + "num_input_tokens_seen": 415092032, + "router_z_loss_mlp": 0.31640625, + "step": 5011, + "time_per_iteration": 2.572166919708252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061468, + "balance_loss_mlp": 1.03083074, + "epoch": 0.9642170065409773, + "flos": 539063700480.0, + "grad_norm": 0.05146867217669957, + "language_loss": 0.88495445, + "learning_rate": 3.354083022201859e-06, + "loss": 0.89556915, + "num_input_tokens_seen": 415158544, + "router_z_loss_mlp": 0.3059082, + "step": 5012, + "time_per_iteration": 2.6278939247131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060669, + "balance_loss_mlp": 1.02934122, + "epoch": 0.9644093882262409, + "flos": 523499742720.0, + "grad_norm": 0.053579427527373706, + "language_loss": 0.8370012, + "learning_rate": 3.3181543995410843e-06, + "loss": 0.84760791, + "num_input_tokens_seen": 415225088, + "router_z_loss_mlp": 0.31298828, + "step": 5013, + "time_per_iteration": 2.619873523712158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061188, + "balance_loss_mlp": 1.0306704, + "epoch": 0.9646017699115044, + "flos": 574018195968.0, + "grad_norm": 0.05027392216499071, + "language_loss": 0.78493142, + "learning_rate": 3.2824186058740268e-06, + "loss": 0.79554331, + "num_input_tokens_seen": 415300224, + "router_z_loss_mlp": 0.30493164, + "step": 5014, + "time_per_iteration": 2.701974630355835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105829, + "balance_loss_mlp": 1.02779675, + "epoch": 0.964794151596768, + "flos": 636511163904.0, + "grad_norm": 0.05861201015581005, + "language_loss": 0.84411347, + "learning_rate": 3.246875655074588e-06, + "loss": 0.85469639, + "num_input_tokens_seen": 415368784, + "router_z_loss_mlp": 0.30444336, + "step": 5015, + "time_per_iteration": 2.7228355407714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062943, + "balance_loss_mlp": 1.03218722, + "epoch": 0.9649865332820315, + "flos": 617155531776.0, + "grad_norm": 0.06123490133092928, + "language_loss": 0.86001122, + "learning_rate": 3.211525560941675e-06, + "loss": 0.87064075, + "num_input_tokens_seen": 415440752, + "router_z_loss_mlp": 0.30712891, + "step": 5016, + "time_per_iteration": 2.718139171600342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106481, + "balance_loss_mlp": 1.03317225, + "epoch": 0.9651789149672951, + "flos": 515898865152.0, + "grad_norm": 0.05141317284262244, + "language_loss": 0.80746591, + "learning_rate": 3.1763683371994754e-06, + "loss": 0.81811404, + "num_input_tokens_seen": 415516128, + "router_z_loss_mlp": 0.31616211, + "step": 5017, + "time_per_iteration": 2.7883141040802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059591, + "balance_loss_mlp": 1.0283339, + "epoch": 0.9653712966525587, + "flos": 492696304128.0, + "grad_norm": 0.05540130147565109, + "language_loss": 0.79782176, + "learning_rate": 3.1414039974972385e-06, + "loss": 0.80841768, + "num_input_tokens_seen": 415583744, + "router_z_loss_mlp": 0.31225586, + "step": 5018, + "time_per_iteration": 2.5674331188201904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059766, + "balance_loss_mlp": 1.02824712, + "epoch": 0.9655636783378222, + "flos": 536287564800.0, + "grad_norm": 0.0402183906982743, + "language_loss": 0.82745731, + "learning_rate": 3.106632555409328e-06, + "loss": 0.83805501, + "num_input_tokens_seen": 415659856, + "router_z_loss_mlp": 0.31494141, + "step": 5019, + "time_per_iteration": 2.816701650619507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057217, + "balance_loss_mlp": 1.02615166, + "epoch": 0.9657560600230858, + "flos": 458790842880.0, + "grad_norm": 0.05354585998337552, + "language_loss": 0.82026023, + "learning_rate": 3.072054024435167e-06, + "loss": 0.83083236, + "num_input_tokens_seen": 415731792, + "router_z_loss_mlp": 0.31030273, + "step": 5020, + "time_per_iteration": 2.6295535564422607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059918, + "balance_loss_mlp": 1.02970994, + "epoch": 0.9659484417083494, + "flos": 685877276160.0, + "grad_norm": 0.06971172530491482, + "language_loss": 0.83589661, + "learning_rate": 3.0376684179994064e-06, + "loss": 0.84649581, + "num_input_tokens_seen": 415809536, + "router_z_loss_mlp": 0.30151367, + "step": 5021, + "time_per_iteration": 2.790837049484253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101064, + "balance_loss_mlp": 1.00253367, + "epoch": 0.966140823393613, + "flos": 1501503879168.0, + "grad_norm": 0.0036733761011580493, + "language_loss": 0.80694246, + "learning_rate": 3.0034757494516453e-06, + "loss": 0.81704885, + "num_input_tokens_seen": 416027600, + "router_z_loss_mlp": 0.08105469, + "step": 5022, + "time_per_iteration": 4.662962198257446 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061518, + "balance_loss_mlp": 1.03085709, + "epoch": 0.9663332050788765, + "flos": 464660236800.0, + "grad_norm": 0.07115272426240939, + "language_loss": 0.80814624, + "learning_rate": 2.9694760320667093e-06, + "loss": 0.81876141, + "num_input_tokens_seen": 416096128, + "router_z_loss_mlp": 0.30615234, + "step": 5023, + "time_per_iteration": 2.6125125885009766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058747, + "balance_loss_mlp": 1.02787185, + "epoch": 0.96652558676414, + "flos": 500575016448.0, + "grad_norm": 0.05166050311684865, + "language_loss": 0.85474747, + "learning_rate": 2.9356692790444283e-06, + "loss": 0.86533493, + "num_input_tokens_seen": 416164256, + "router_z_loss_mlp": 0.30834961, + "step": 5024, + "time_per_iteration": 2.638561487197876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062477, + "balance_loss_mlp": 1.03222132, + "epoch": 0.9667179684494036, + "flos": 424614749184.0, + "grad_norm": 0.10681545927260369, + "language_loss": 0.82711923, + "learning_rate": 2.9020555035097484e-06, + "loss": 0.837744, + "num_input_tokens_seen": 416227296, + "router_z_loss_mlp": 0.30200195, + "step": 5025, + "time_per_iteration": 2.499992609024048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060102, + "balance_loss_mlp": 1.02946556, + "epoch": 0.9669103501346672, + "flos": 516744258048.0, + "grad_norm": 0.047288791646070916, + "language_loss": 0.8577379, + "learning_rate": 2.8686347185127305e-06, + "loss": 0.86833894, + "num_input_tokens_seen": 416297184, + "router_z_loss_mlp": 0.3059082, + "step": 5026, + "time_per_iteration": 2.684466600418091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064062, + "balance_loss_mlp": 1.03242362, + "epoch": 0.9671027318199308, + "flos": 456008914944.0, + "grad_norm": 0.060249957140447924, + "language_loss": 0.75349987, + "learning_rate": 2.8354069370284396e-06, + "loss": 0.76414049, + "num_input_tokens_seen": 416363056, + "router_z_loss_mlp": 0.31616211, + "step": 5027, + "time_per_iteration": 2.581566572189331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062603, + "balance_loss_mlp": 1.03151333, + "epoch": 0.9672951135051943, + "flos": 524809234944.0, + "grad_norm": 0.07736308326368693, + "language_loss": 0.80242217, + "learning_rate": 2.802372171957057e-06, + "loss": 0.81304818, + "num_input_tokens_seen": 416430688, + "router_z_loss_mlp": 0.31054688, + "step": 5028, + "time_per_iteration": 2.620943546295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062615, + "balance_loss_mlp": 1.0313108, + "epoch": 0.9674874951904578, + "flos": 573708275712.0, + "grad_norm": 0.05924446990021998, + "language_loss": 0.79946339, + "learning_rate": 2.7695304361237682e-06, + "loss": 0.81008953, + "num_input_tokens_seen": 416505248, + "router_z_loss_mlp": 0.31274414, + "step": 5029, + "time_per_iteration": 2.7776339054107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106092, + "balance_loss_mlp": 1.02990174, + "epoch": 0.9676798768757214, + "flos": 628875380736.0, + "grad_norm": 0.0401751924772168, + "language_loss": 0.79843652, + "learning_rate": 2.7368817422789848e-06, + "loss": 0.80904567, + "num_input_tokens_seen": 416592640, + "router_z_loss_mlp": 0.30981445, + "step": 5030, + "time_per_iteration": 2.9464609622955322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011242, + "balance_loss_mlp": 1.00313604, + "epoch": 0.967872258560985, + "flos": 1463074831872.0, + "grad_norm": 0.0037536728734851557, + "language_loss": 0.75563359, + "learning_rate": 2.7044261030979566e-06, + "loss": 0.765746, + "num_input_tokens_seen": 416808560, + "router_z_loss_mlp": 0.08105469, + "step": 5031, + "time_per_iteration": 4.694348573684692 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106361, + "balance_loss_mlp": 1.03314054, + "epoch": 0.9680646402462486, + "flos": 565238836224.0, + "grad_norm": 0.06560457611419962, + "language_loss": 0.79323578, + "learning_rate": 2.672163531181049e-06, + "loss": 0.80387187, + "num_input_tokens_seen": 416878208, + "router_z_loss_mlp": 0.30444336, + "step": 5032, + "time_per_iteration": 2.663724184036255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101124, + "balance_loss_mlp": 1.00313365, + "epoch": 0.9682570219315121, + "flos": 1433669635584.0, + "grad_norm": 0.0037552357874484797, + "language_loss": 0.78074801, + "learning_rate": 2.6400940390537976e-06, + "loss": 0.79086041, + "num_input_tokens_seen": 417105968, + "router_z_loss_mlp": 0.08105469, + "step": 5033, + "time_per_iteration": 4.814163446426392 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106369, + "balance_loss_mlp": 1.03186107, + "epoch": 0.9684494036167757, + "flos": 584338392576.0, + "grad_norm": 0.07564514705526598, + "language_loss": 0.81710398, + "learning_rate": 2.608217639166688e-06, + "loss": 0.82774091, + "num_input_tokens_seen": 417175168, + "router_z_loss_mlp": 0.31811523, + "step": 5034, + "time_per_iteration": 2.738064765930176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059011, + "balance_loss_mlp": 1.02849364, + "epoch": 0.9686417853020393, + "flos": 558784507392.0, + "grad_norm": 0.051069776715125234, + "language_loss": 0.83945799, + "learning_rate": 2.5765343438950982e-06, + "loss": 0.85004807, + "num_input_tokens_seen": 417247760, + "router_z_loss_mlp": 0.3046875, + "step": 5035, + "time_per_iteration": 2.717803716659546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062645, + "balance_loss_mlp": 1.03143644, + "epoch": 0.9688341669873028, + "flos": 784594944000.0, + "grad_norm": 0.06911477440096066, + "language_loss": 0.83235759, + "learning_rate": 2.545044165539745e-06, + "loss": 0.84298402, + "num_input_tokens_seen": 417324080, + "router_z_loss_mlp": 0.31176758, + "step": 5036, + "time_per_iteration": 2.9943130016326904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106053, + "balance_loss_mlp": 1.02903473, + "epoch": 0.9690265486725663, + "flos": 395682416640.0, + "grad_norm": 0.0635388023408017, + "language_loss": 0.79152626, + "learning_rate": 2.513747116326126e-06, + "loss": 0.80213153, + "num_input_tokens_seen": 417386416, + "router_z_loss_mlp": 0.31469727, + "step": 5037, + "time_per_iteration": 2.5086729526519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061541, + "balance_loss_mlp": 1.03090429, + "epoch": 0.9692189303578299, + "flos": 476113835520.0, + "grad_norm": 0.05740046745509432, + "language_loss": 0.77356291, + "learning_rate": 2.4826432084048002e-06, + "loss": 0.78417832, + "num_input_tokens_seen": 417459648, + "router_z_loss_mlp": 0.3059082, + "step": 5038, + "time_per_iteration": 2.705591917037964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059038, + "balance_loss_mlp": 1.02885425, + "epoch": 0.9694113120430935, + "flos": 597297922560.0, + "grad_norm": 0.05798206678897628, + "language_loss": 0.78522074, + "learning_rate": 2.451732453851385e-06, + "loss": 0.79581112, + "num_input_tokens_seen": 417530512, + "router_z_loss_mlp": 0.30126953, + "step": 5039, + "time_per_iteration": 2.6998066902160645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059417, + "balance_loss_mlp": 1.02892351, + "epoch": 0.9696036937283571, + "flos": 500628860928.0, + "grad_norm": 0.04971001407251187, + "language_loss": 0.82436031, + "learning_rate": 2.4210148646665598e-06, + "loss": 0.8349545, + "num_input_tokens_seen": 417597600, + "router_z_loss_mlp": 0.30444336, + "step": 5040, + "time_per_iteration": 2.585838556289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062586, + "balance_loss_mlp": 1.0316397, + "epoch": 0.9697960754136207, + "flos": 432049711104.0, + "grad_norm": 0.06475972731655387, + "language_loss": 0.8689853, + "learning_rate": 2.3904904527758952e-06, + "loss": 0.87961119, + "num_input_tokens_seen": 417659616, + "router_z_loss_mlp": 0.30908203, + "step": 5041, + "time_per_iteration": 2.440291166305542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060775, + "balance_loss_mlp": 1.02982795, + "epoch": 0.9699884570988841, + "flos": 568257901056.0, + "grad_norm": 0.04793585880961143, + "language_loss": 0.85172904, + "learning_rate": 2.3601592300300235e-06, + "loss": 0.86233675, + "num_input_tokens_seen": 417730896, + "router_z_loss_mlp": 0.30908203, + "step": 5042, + "time_per_iteration": 2.706629991531372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064477, + "balance_loss_mlp": 1.03293455, + "epoch": 0.9701808387841477, + "flos": 515961474048.0, + "grad_norm": 0.054405736603249856, + "language_loss": 0.81407428, + "learning_rate": 2.33002120820458e-06, + "loss": 0.82471901, + "num_input_tokens_seen": 417803296, + "router_z_loss_mlp": 0.31518555, + "step": 5043, + "time_per_iteration": 2.6756155490875244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106029, + "balance_loss_mlp": 1.02970135, + "epoch": 0.9703732204694113, + "flos": 491273330688.0, + "grad_norm": 0.0672507732770155, + "language_loss": 0.76003706, + "learning_rate": 2.300076399000206e-06, + "loss": 0.77063996, + "num_input_tokens_seen": 417870208, + "router_z_loss_mlp": 0.30541992, + "step": 5044, + "time_per_iteration": 2.5917348861694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061641, + "balance_loss_mlp": 1.0302887, + "epoch": 0.9705656021546749, + "flos": 625831584768.0, + "grad_norm": 0.05433746286859287, + "language_loss": 0.80137366, + "learning_rate": 2.2703248140424348e-06, + "loss": 0.81199008, + "num_input_tokens_seen": 417944464, + "router_z_loss_mlp": 0.31323242, + "step": 5045, + "time_per_iteration": 2.808703899383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106209, + "balance_loss_mlp": 1.0320015, + "epoch": 0.9707579838399384, + "flos": 471198933504.0, + "grad_norm": 0.054204076995614914, + "language_loss": 0.82907468, + "learning_rate": 2.2407664648819715e-06, + "loss": 0.83969557, + "num_input_tokens_seen": 418010480, + "router_z_loss_mlp": 0.30029297, + "step": 5046, + "time_per_iteration": 2.595574140548706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063429, + "balance_loss_mlp": 1.03255379, + "epoch": 0.970950365525202, + "flos": 491845118976.0, + "grad_norm": 0.05790005154915511, + "language_loss": 0.80455661, + "learning_rate": 2.2114013629942475e-06, + "loss": 0.81519091, + "num_input_tokens_seen": 418083952, + "router_z_loss_mlp": 0.30834961, + "step": 5047, + "time_per_iteration": 2.6507327556610107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060135, + "balance_loss_mlp": 1.02894998, + "epoch": 0.9711427472104656, + "flos": 557060378112.0, + "grad_norm": 0.07608957719483044, + "language_loss": 0.80362004, + "learning_rate": 2.1822295197799213e-06, + "loss": 0.81422138, + "num_input_tokens_seen": 418156672, + "router_z_loss_mlp": 0.31152344, + "step": 5048, + "time_per_iteration": 2.72220778465271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059825, + "balance_loss_mlp": 1.02973652, + "epoch": 0.9713351288957291, + "flos": 625527456768.0, + "grad_norm": 0.04573208991369369, + "language_loss": 0.83717644, + "learning_rate": 2.153250946564489e-06, + "loss": 0.84777468, + "num_input_tokens_seen": 418242160, + "router_z_loss_mlp": 0.30029297, + "step": 5049, + "time_per_iteration": 2.9444804191589355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062024, + "balance_loss_mlp": 1.03064787, + "epoch": 0.9715275105809927, + "flos": 498821773824.0, + "grad_norm": 0.05121797008138963, + "language_loss": 0.80890942, + "learning_rate": 2.1244656545983397e-06, + "loss": 0.81952965, + "num_input_tokens_seen": 418316960, + "router_z_loss_mlp": 0.31347656, + "step": 5050, + "time_per_iteration": 2.765965461730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062944, + "balance_loss_mlp": 1.03149652, + "epoch": 0.9717198922662562, + "flos": 477274940928.0, + "grad_norm": 0.0759374256858869, + "language_loss": 0.77641714, + "learning_rate": 2.0958736550570345e-06, + "loss": 0.78704655, + "num_input_tokens_seen": 418383888, + "router_z_loss_mlp": 0.31420898, + "step": 5051, + "time_per_iteration": 2.534787178039551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058864, + "balance_loss_mlp": 1.02794087, + "epoch": 0.9719122739515198, + "flos": 553171189248.0, + "grad_norm": 0.04369700859439487, + "language_loss": 0.78430104, + "learning_rate": 2.067474959040916e-06, + "loss": 0.79488969, + "num_input_tokens_seen": 418453776, + "router_z_loss_mlp": 0.30883789, + "step": 5052, + "time_per_iteration": 2.6652493476867676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063725, + "balance_loss_mlp": 1.03237319, + "epoch": 0.9721046556367834, + "flos": 565583662080.0, + "grad_norm": 0.05627450979959505, + "language_loss": 0.80065435, + "learning_rate": 2.0392695775753312e-06, + "loss": 0.81129158, + "num_input_tokens_seen": 418521984, + "router_z_loss_mlp": 0.31323242, + "step": 5053, + "time_per_iteration": 2.645796537399292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061537, + "balance_loss_mlp": 1.03051889, + "epoch": 0.972297037322047, + "flos": 560044537344.0, + "grad_norm": 0.056018858365713145, + "language_loss": 0.78160405, + "learning_rate": 2.0112575216105766e-06, + "loss": 0.7922194, + "num_input_tokens_seen": 418598768, + "router_z_loss_mlp": 0.30981445, + "step": 5054, + "time_per_iteration": 2.7389419078826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063362, + "balance_loss_mlp": 1.0327971, + "epoch": 0.9724894190073105, + "flos": 512175591936.0, + "grad_norm": 0.056389892105133975, + "language_loss": 0.78999579, + "learning_rate": 1.9834388020218974e-06, + "loss": 0.80062938, + "num_input_tokens_seen": 418670064, + "router_z_loss_mlp": 0.30517578, + "step": 5055, + "time_per_iteration": 2.677356719970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061934, + "balance_loss_mlp": 1.0307492, + "epoch": 0.972681800692574, + "flos": 613532593152.0, + "grad_norm": 0.05719839880369088, + "language_loss": 0.80146527, + "learning_rate": 1.9558134296094875e-06, + "loss": 0.81208467, + "num_input_tokens_seen": 418745216, + "router_z_loss_mlp": 0.31152344, + "step": 5056, + "time_per_iteration": 2.780590295791626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059791, + "balance_loss_mlp": 1.02882087, + "epoch": 0.9728741823778376, + "flos": 833562385920.0, + "grad_norm": 0.04737798788694853, + "language_loss": 0.83823931, + "learning_rate": 1.92838141509849e-06, + "loss": 0.84883726, + "num_input_tokens_seen": 418824224, + "router_z_loss_mlp": 0.30932617, + "step": 5057, + "time_per_iteration": 3.111090660095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062986, + "balance_loss_mlp": 1.03118145, + "epoch": 0.9730665640631012, + "flos": 571167866880.0, + "grad_norm": 0.06372320617901352, + "language_loss": 0.84056395, + "learning_rate": 1.9011427691389415e-06, + "loss": 0.85119379, + "num_input_tokens_seen": 418899712, + "router_z_loss_mlp": 0.31787109, + "step": 5058, + "time_per_iteration": 2.716362237930298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062442, + "balance_loss_mlp": 1.03116202, + "epoch": 0.9732589457483648, + "flos": 506271292416.0, + "grad_norm": 0.049152595370916714, + "language_loss": 0.77131605, + "learning_rate": 1.8740975023057715e-06, + "loss": 0.78194046, + "num_input_tokens_seen": 418964912, + "router_z_loss_mlp": 0.31274414, + "step": 5059, + "time_per_iteration": 2.567103624343872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105856, + "balance_loss_mlp": 1.02799499, + "epoch": 0.9734513274336283, + "flos": 926602716672.0, + "grad_norm": 0.04959205985991603, + "language_loss": 0.80284786, + "learning_rate": 1.84724562509897e-06, + "loss": 0.81343341, + "num_input_tokens_seen": 419040032, + "router_z_loss_mlp": 0.30517578, + "step": 5060, + "time_per_iteration": 3.1037087440490723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061035, + "balance_loss_mlp": 1.03011179, + "epoch": 0.9736437091188919, + "flos": 491682175488.0, + "grad_norm": 0.053469499820537766, + "language_loss": 0.77895665, + "learning_rate": 1.8205871479433089e-06, + "loss": 0.78956699, + "num_input_tokens_seen": 419112672, + "router_z_loss_mlp": 0.30883789, + "step": 5061, + "time_per_iteration": 2.7472124099731445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106441, + "balance_loss_mlp": 1.03355861, + "epoch": 0.9738360908041555, + "flos": 613039380480.0, + "grad_norm": 0.05773032133011401, + "language_loss": 0.83614433, + "learning_rate": 1.7941220811885096e-06, + "loss": 0.84678841, + "num_input_tokens_seen": 419183408, + "router_z_loss_mlp": 0.30810547, + "step": 5062, + "time_per_iteration": 2.727924108505249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011241, + "balance_loss_mlp": 1.00313449, + "epoch": 0.974028472489419, + "flos": 1548771922944.0, + "grad_norm": 0.0037481101456261563, + "language_loss": 0.75992095, + "learning_rate": 1.7678504351092972e-06, + "loss": 0.77003336, + "num_input_tokens_seen": 419415472, + "router_z_loss_mlp": 0.08105469, + "step": 5063, + "time_per_iteration": 5.0528404712677 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011241, + "balance_loss_mlp": 1.0031352, + "epoch": 0.9742208541746825, + "flos": 1410403055616.0, + "grad_norm": 0.0037495136804295438, + "language_loss": 0.79677713, + "learning_rate": 1.7417722199051245e-06, + "loss": 0.80688953, + "num_input_tokens_seen": 419651840, + "router_z_loss_mlp": 0.08105469, + "step": 5064, + "time_per_iteration": 4.989039182662964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061659, + "balance_loss_mlp": 1.03114104, + "epoch": 0.9744132358599461, + "flos": 674582238720.0, + "grad_norm": 0.04691018591826747, + "language_loss": 0.76823747, + "learning_rate": 1.7158874457005592e-06, + "loss": 0.77885401, + "num_input_tokens_seen": 419729424, + "router_z_loss_mlp": 0.3046875, + "step": 5065, + "time_per_iteration": 2.84334659576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059101, + "balance_loss_mlp": 1.02741492, + "epoch": 0.9746056175452097, + "flos": 598111229952.0, + "grad_norm": 0.06794562847276414, + "language_loss": 0.77235073, + "learning_rate": 1.690196122544896e-06, + "loss": 0.78294176, + "num_input_tokens_seen": 419803616, + "router_z_loss_mlp": 0.31665039, + "step": 5066, + "time_per_iteration": 2.7670531272888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063312, + "balance_loss_mlp": 1.03234136, + "epoch": 0.9747979992304733, + "flos": 731837237760.0, + "grad_norm": 0.0506469462866736, + "language_loss": 0.82077444, + "learning_rate": 1.6646982604123784e-06, + "loss": 0.83140755, + "num_input_tokens_seen": 419883536, + "router_z_loss_mlp": 0.30932617, + "step": 5067, + "time_per_iteration": 2.989997148513794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010609, + "balance_loss_mlp": 1.02926183, + "epoch": 0.9749903809157369, + "flos": 616219978752.0, + "grad_norm": 0.06891073101993225, + "language_loss": 0.76269442, + "learning_rate": 1.6393938692022548e-06, + "loss": 0.77330339, + "num_input_tokens_seen": 419956816, + "router_z_loss_mlp": 0.31616211, + "step": 5068, + "time_per_iteration": 2.6933865547180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010603, + "balance_loss_mlp": 1.02963936, + "epoch": 0.9751827626010003, + "flos": 468160929792.0, + "grad_norm": 0.05062684211706446, + "language_loss": 0.83640307, + "learning_rate": 1.6142829587384443e-06, + "loss": 0.84700608, + "num_input_tokens_seen": 420022096, + "router_z_loss_mlp": 0.30615234, + "step": 5069, + "time_per_iteration": 2.548443078994751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106123, + "balance_loss_mlp": 1.03006864, + "epoch": 0.9753751442862639, + "flos": 598918745088.0, + "grad_norm": 0.08322455471388275, + "language_loss": 0.8529315, + "learning_rate": 1.5893655387698713e-06, + "loss": 0.86354387, + "num_input_tokens_seen": 420097008, + "router_z_loss_mlp": 0.3112793, + "step": 5070, + "time_per_iteration": 2.7602720260620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060615, + "balance_loss_mlp": 1.02931058, + "epoch": 0.9755675259715275, + "flos": 650486232576.0, + "grad_norm": 0.05115135160859089, + "language_loss": 0.82068843, + "learning_rate": 1.5646416189704637e-06, + "loss": 0.83129454, + "num_input_tokens_seen": 420174960, + "router_z_loss_mlp": 0.31274414, + "step": 5071, + "time_per_iteration": 2.878183126449585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061905, + "balance_loss_mlp": 1.03138733, + "epoch": 0.9757599076567911, + "flos": 563392461312.0, + "grad_norm": 0.056802057532983834, + "language_loss": 0.7912972, + "learning_rate": 1.5401112089387659e-06, + "loss": 0.80191624, + "num_input_tokens_seen": 420245248, + "router_z_loss_mlp": 0.3046875, + "step": 5072, + "time_per_iteration": 2.6892030239105225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106052, + "balance_loss_mlp": 1.02931106, + "epoch": 0.9759522893420547, + "flos": 504385629696.0, + "grad_norm": 0.06614272886181409, + "language_loss": 0.80103958, + "learning_rate": 1.5157743181983819e-06, + "loss": 0.81164479, + "num_input_tokens_seen": 420310688, + "router_z_loss_mlp": 0.31176758, + "step": 5073, + "time_per_iteration": 2.6456804275512695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058437, + "balance_loss_mlp": 1.02741873, + "epoch": 0.9761446710273182, + "flos": 583452301824.0, + "grad_norm": 0.05438660473134642, + "language_loss": 0.81815821, + "learning_rate": 1.4916309561976982e-06, + "loss": 0.82874256, + "num_input_tokens_seen": 420379008, + "router_z_loss_mlp": 0.30981445, + "step": 5074, + "time_per_iteration": 2.6796131134033203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063789, + "balance_loss_mlp": 1.03289032, + "epoch": 0.9763370527125818, + "flos": 481967262720.0, + "grad_norm": 0.062046674624534275, + "language_loss": 0.82287705, + "learning_rate": 1.4676811323099947e-06, + "loss": 0.83351487, + "num_input_tokens_seen": 420445504, + "router_z_loss_mlp": 0.30859375, + "step": 5075, + "time_per_iteration": 2.5883021354675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060183, + "balance_loss_mlp": 1.02911687, + "epoch": 0.9765294343978453, + "flos": 618706543104.0, + "grad_norm": 0.06359470146777534, + "language_loss": 0.78232706, + "learning_rate": 1.4439248558335561e-06, + "loss": 0.79292893, + "num_input_tokens_seen": 420520528, + "router_z_loss_mlp": 0.31030273, + "step": 5076, + "time_per_iteration": 2.7379183769226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058637, + "balance_loss_mlp": 1.02776134, + "epoch": 0.9767218160831089, + "flos": 526320958464.0, + "grad_norm": 0.06717350649320492, + "language_loss": 0.85320723, + "learning_rate": 1.4203621359911712e-06, + "loss": 0.86379361, + "num_input_tokens_seen": 420586224, + "router_z_loss_mlp": 0.30834961, + "step": 5077, + "time_per_iteration": 2.630486249923706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061717, + "balance_loss_mlp": 1.03119898, + "epoch": 0.9769141977683724, + "flos": 524932890624.0, + "grad_norm": 0.047528387695096014, + "language_loss": 0.84178358, + "learning_rate": 1.3969929819308557e-06, + "loss": 0.85240072, + "num_input_tokens_seen": 420655456, + "router_z_loss_mlp": 0.3046875, + "step": 5078, + "time_per_iteration": 2.630990505218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064302, + "balance_loss_mlp": 1.03373718, + "epoch": 0.977106579453636, + "flos": 457359105024.0, + "grad_norm": 0.05153436611336294, + "language_loss": 0.80598915, + "learning_rate": 1.3738174027252416e-06, + "loss": 0.81663209, + "num_input_tokens_seen": 420733216, + "router_z_loss_mlp": 0.30517578, + "step": 5079, + "time_per_iteration": 2.8142943382263184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063034, + "balance_loss_mlp": 1.03156233, + "epoch": 0.9772989611388996, + "flos": 531830969856.0, + "grad_norm": 0.06258721314145185, + "language_loss": 0.81607276, + "learning_rate": 1.3508354073719642e-06, + "loss": 0.82670313, + "num_input_tokens_seen": 420803376, + "router_z_loss_mlp": 0.31445312, + "step": 5080, + "time_per_iteration": 2.601149797439575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061495, + "balance_loss_mlp": 1.03030968, + "epoch": 0.9774913428241632, + "flos": 754999100928.0, + "grad_norm": 0.058179386830925724, + "language_loss": 0.86116189, + "learning_rate": 1.3280470047933313e-06, + "loss": 0.87177682, + "num_input_tokens_seen": 420886256, + "router_z_loss_mlp": 0.31152344, + "step": 5081, + "time_per_iteration": 2.989039182662964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101122, + "balance_loss_mlp": 1.00311351, + "epoch": 0.9776837245094268, + "flos": 1553486003712.0, + "grad_norm": 0.003746354895581978, + "language_loss": 0.78895497, + "learning_rate": 1.3054522038366544e-06, + "loss": 0.79906714, + "num_input_tokens_seen": 421123728, + "router_z_loss_mlp": 0.08105469, + "step": 5082, + "time_per_iteration": 4.968156576156616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062166, + "balance_loss_mlp": 1.0312196, + "epoch": 0.9778761061946902, + "flos": 592260774912.0, + "grad_norm": 0.06333002037000754, + "language_loss": 0.84055161, + "learning_rate": 1.2830510132739725e-06, + "loss": 0.85117328, + "num_input_tokens_seen": 421192576, + "router_z_loss_mlp": 0.30908203, + "step": 5083, + "time_per_iteration": 2.6840903759002686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063545, + "balance_loss_mlp": 1.032336, + "epoch": 0.9780684878799538, + "flos": 414732510720.0, + "grad_norm": 0.051194594259949974, + "language_loss": 0.81744003, + "learning_rate": 1.2608434418022175e-06, + "loss": 0.82807547, + "num_input_tokens_seen": 421256272, + "router_z_loss_mlp": 0.31176758, + "step": 5084, + "time_per_iteration": 2.4817535877227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063953, + "balance_loss_mlp": 1.03260052, + "epoch": 0.9782608695652174, + "flos": 568129863168.0, + "grad_norm": 0.05618141703355523, + "language_loss": 0.84910816, + "learning_rate": 1.2388294980431036e-06, + "loss": 0.85974771, + "num_input_tokens_seen": 421332880, + "router_z_loss_mlp": 0.31323242, + "step": 5085, + "time_per_iteration": 2.7052948474884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061201, + "balance_loss_mlp": 1.03020716, + "epoch": 0.978453251250481, + "flos": 690151988736.0, + "grad_norm": 0.06611452324560538, + "language_loss": 0.83097386, + "learning_rate": 1.217009190543239e-06, + "loss": 0.84158587, + "num_input_tokens_seen": 421406160, + "router_z_loss_mlp": 0.30957031, + "step": 5086, + "time_per_iteration": 2.8580446243286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057022, + "balance_loss_mlp": 1.02657628, + "epoch": 0.9786456329357445, + "flos": 502239508992.0, + "grad_norm": 0.04725270380406371, + "language_loss": 0.77273715, + "learning_rate": 1.1953825277740694e-06, + "loss": 0.78330743, + "num_input_tokens_seen": 421476208, + "router_z_loss_mlp": 0.30395508, + "step": 5087, + "time_per_iteration": 2.644757032394409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063526, + "balance_loss_mlp": 1.03222179, + "epoch": 0.9788380146210081, + "flos": 862829369856.0, + "grad_norm": 0.06946428392980135, + "language_loss": 0.80393237, + "learning_rate": 1.1739495181317117e-06, + "loss": 0.81456769, + "num_input_tokens_seen": 421549232, + "router_z_loss_mlp": 0.31274414, + "step": 5088, + "time_per_iteration": 3.011176109313965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062233, + "balance_loss_mlp": 1.03173923, + "epoch": 0.9790303963062716, + "flos": 512460781056.0, + "grad_norm": 0.05754819894183326, + "language_loss": 0.84162724, + "learning_rate": 1.1527101699371767e-06, + "loss": 0.85224962, + "num_input_tokens_seen": 421617056, + "router_z_loss_mlp": 0.30444336, + "step": 5089, + "time_per_iteration": 2.5700507164001465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062724, + "balance_loss_mlp": 1.03218281, + "epoch": 0.9792227779915352, + "flos": 494183296512.0, + "grad_norm": 0.07818701968274684, + "language_loss": 0.8623296, + "learning_rate": 1.1316644914364237e-06, + "loss": 0.87295687, + "num_input_tokens_seen": 421683424, + "router_z_loss_mlp": 0.30493164, + "step": 5090, + "time_per_iteration": 2.578331470489502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106283, + "balance_loss_mlp": 1.03171659, + "epoch": 0.9794151596767988, + "flos": 608037138432.0, + "grad_norm": 0.0640945477186255, + "language_loss": 0.81165767, + "learning_rate": 1.1108124908000838e-06, + "loss": 0.82228601, + "num_input_tokens_seen": 421761200, + "router_z_loss_mlp": 0.31079102, + "step": 5091, + "time_per_iteration": 2.8047142028808594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060042, + "balance_loss_mlp": 1.02835619, + "epoch": 0.9796075413620623, + "flos": 477979149312.0, + "grad_norm": 0.05635912773275719, + "language_loss": 0.86476392, + "learning_rate": 1.09015417612357e-06, + "loss": 0.8753643, + "num_input_tokens_seen": 421829600, + "router_z_loss_mlp": 0.31665039, + "step": 5092, + "time_per_iteration": 2.5596201419830322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010638, + "balance_loss_mlp": 1.03278232, + "epoch": 0.9797999230473259, + "flos": 591936297984.0, + "grad_norm": 0.05185035440216898, + "language_loss": 0.84320545, + "learning_rate": 1.0696895554271335e-06, + "loss": 0.85384345, + "num_input_tokens_seen": 421904928, + "router_z_loss_mlp": 0.30981445, + "step": 5093, + "time_per_iteration": 2.7417502403259277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060458, + "balance_loss_mlp": 1.02924931, + "epoch": 0.9799923047325895, + "flos": 556086947328.0, + "grad_norm": 0.057258912795892604, + "language_loss": 0.81443524, + "learning_rate": 1.049418636655919e-06, + "loss": 0.82503974, + "num_input_tokens_seen": 421989616, + "router_z_loss_mlp": 0.31176758, + "step": 5094, + "time_per_iteration": 2.9223530292510986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062101, + "balance_loss_mlp": 1.03101122, + "epoch": 0.9801846864178531, + "flos": 579164442624.0, + "grad_norm": 0.04579710583324431, + "language_loss": 0.84433246, + "learning_rate": 1.0293414276797974e-06, + "loss": 0.85495341, + "num_input_tokens_seen": 422067088, + "router_z_loss_mlp": 0.31054688, + "step": 5095, + "time_per_iteration": 2.7353591918945312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060539, + "balance_loss_mlp": 1.03030777, + "epoch": 0.9803770681031165, + "flos": 514825099776.0, + "grad_norm": 0.061566152326411605, + "language_loss": 0.79944533, + "learning_rate": 1.0094579362933677e-06, + "loss": 0.81005073, + "num_input_tokens_seen": 422141136, + "router_z_loss_mlp": 0.30175781, + "step": 5096, + "time_per_iteration": 2.65447998046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056617, + "balance_loss_mlp": 1.02643323, + "epoch": 0.9805694497883801, + "flos": 566706889728.0, + "grad_norm": 0.053549936488599306, + "language_loss": 0.77981234, + "learning_rate": 9.897681702160654e-07, + "loss": 0.79037857, + "num_input_tokens_seen": 422216400, + "router_z_loss_mlp": 0.30126953, + "step": 5097, + "time_per_iteration": 2.7695438861846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061184, + "balance_loss_mlp": 1.03038001, + "epoch": 0.9807618314736437, + "flos": 479106759168.0, + "grad_norm": 0.051835716035438115, + "language_loss": 0.73514438, + "learning_rate": 9.702721370922208e-07, + "loss": 0.74575627, + "num_input_tokens_seen": 422287664, + "router_z_loss_mlp": 0.30761719, + "step": 5098, + "time_per_iteration": 2.6515543460845947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064654, + "balance_loss_mlp": 1.03339684, + "epoch": 0.9809542131589073, + "flos": 545021844480.0, + "grad_norm": 0.05890860301588922, + "language_loss": 0.80092633, + "learning_rate": 9.509698444908344e-07, + "loss": 0.81157291, + "num_input_tokens_seen": 422357552, + "router_z_loss_mlp": 0.31225586, + "step": 5099, + "time_per_iteration": 2.622485876083374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058534, + "balance_loss_mlp": 1.02751541, + "epoch": 0.9811465948441709, + "flos": 520589776896.0, + "grad_norm": 0.05424341522111697, + "language_loss": 0.79649353, + "learning_rate": 9.318612999057452e-07, + "loss": 0.80707896, + "num_input_tokens_seen": 422425872, + "router_z_loss_mlp": 0.30981445, + "step": 5100, + "time_per_iteration": 2.591421127319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062161, + "balance_loss_mlp": 1.03142905, + "epoch": 0.9813389765294344, + "flos": 541023556608.0, + "grad_norm": 0.056964991867044526, + "language_loss": 0.79990375, + "learning_rate": 9.129465107554635e-07, + "loss": 0.81052536, + "num_input_tokens_seen": 422495760, + "router_z_loss_mlp": 0.30688477, + "step": 5101, + "time_per_iteration": 2.624356985092163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061299, + "balance_loss_mlp": 1.0304718, + "epoch": 0.981531358214698, + "flos": 567080828928.0, + "grad_norm": 0.06134435834417123, + "language_loss": 0.84231782, + "learning_rate": 8.942254843834485e-07, + "loss": 0.85293078, + "num_input_tokens_seen": 422568112, + "router_z_loss_mlp": 0.30786133, + "step": 5102, + "time_per_iteration": 2.723998546600342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060525, + "balance_loss_mlp": 1.02972126, + "epoch": 0.9817237398999615, + "flos": 576987798528.0, + "grad_norm": 0.049977089978911385, + "language_loss": 0.80795527, + "learning_rate": 8.756982280578307e-07, + "loss": 0.81856054, + "num_input_tokens_seen": 422641280, + "router_z_loss_mlp": 0.30786133, + "step": 5103, + "time_per_iteration": 2.7338523864746094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061515, + "balance_loss_mlp": 1.03006709, + "epoch": 0.9819161215852251, + "flos": 701172011520.0, + "grad_norm": 0.0557213877990239, + "language_loss": 0.81740284, + "learning_rate": 8.573647489714676e-07, + "loss": 0.82801795, + "num_input_tokens_seen": 422720416, + "router_z_loss_mlp": 0.31420898, + "step": 5104, + "time_per_iteration": 2.931447744369507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063553, + "balance_loss_mlp": 1.03308296, + "epoch": 0.9821085032704886, + "flos": 623873138688.0, + "grad_norm": 0.05444182607070333, + "language_loss": 0.84073544, + "learning_rate": 8.392250542421653e-07, + "loss": 0.85137099, + "num_input_tokens_seen": 422800384, + "router_z_loss_mlp": 0.30419922, + "step": 5105, + "time_per_iteration": 2.8544764518737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062382, + "balance_loss_mlp": 1.03167391, + "epoch": 0.9823008849557522, + "flos": 499259731968.0, + "grad_norm": 0.059283748900889124, + "language_loss": 0.81204158, + "learning_rate": 8.212791509122353e-07, + "loss": 0.82266545, + "num_input_tokens_seen": 422870768, + "router_z_loss_mlp": 0.30664062, + "step": 5106, + "time_per_iteration": 2.7119359970092773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062348, + "balance_loss_mlp": 1.03123415, + "epoch": 0.9824932666410158, + "flos": 523544822784.0, + "grad_norm": 0.05648226297161044, + "language_loss": 0.7276091, + "learning_rate": 8.035270459489929e-07, + "loss": 0.73823255, + "num_input_tokens_seen": 422942864, + "router_z_loss_mlp": 0.31079102, + "step": 5107, + "time_per_iteration": 2.7225635051727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063572, + "balance_loss_mlp": 1.0326972, + "epoch": 0.9826856483262794, + "flos": 502411216896.0, + "grad_norm": 0.05541444658999194, + "language_loss": 0.82263827, + "learning_rate": 7.859687462443698e-07, + "loss": 0.83327401, + "num_input_tokens_seen": 423013600, + "router_z_loss_mlp": 0.30834961, + "step": 5108, + "time_per_iteration": 2.6065311431884766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063107, + "balance_loss_mlp": 1.03189802, + "epoch": 0.982878030011543, + "flos": 561768666624.0, + "grad_norm": 0.05636875798625559, + "language_loss": 0.84198737, + "learning_rate": 7.686042586151354e-07, + "loss": 0.85261846, + "num_input_tokens_seen": 423093680, + "router_z_loss_mlp": 0.31176758, + "step": 5109, + "time_per_iteration": 2.8074042797088623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061414, + "balance_loss_mlp": 1.03025222, + "epoch": 0.9830704116968064, + "flos": 536824447488.0, + "grad_norm": 0.05411078449531097, + "language_loss": 0.82744133, + "learning_rate": 7.514335898027857e-07, + "loss": 0.83805549, + "num_input_tokens_seen": 423168608, + "router_z_loss_mlp": 0.3112793, + "step": 5110, + "time_per_iteration": 2.73994517326355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060884, + "balance_loss_mlp": 1.02977061, + "epoch": 0.98326279338207, + "flos": 458712267264.0, + "grad_norm": 0.060128702369139815, + "language_loss": 0.84042352, + "learning_rate": 7.344567464735441e-07, + "loss": 0.85103238, + "num_input_tokens_seen": 423233552, + "router_z_loss_mlp": 0.31079102, + "step": 5111, + "time_per_iteration": 2.48018479347229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060072, + "balance_loss_mlp": 1.02881503, + "epoch": 0.9834551750673336, + "flos": 640672395264.0, + "grad_norm": 0.05142672263893664, + "language_loss": 0.79408097, + "learning_rate": 7.17673735218416e-07, + "loss": 0.80468172, + "num_input_tokens_seen": 423307440, + "router_z_loss_mlp": 0.31225586, + "step": 5112, + "time_per_iteration": 2.826101541519165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057314, + "balance_loss_mlp": 1.02682042, + "epoch": 0.9836475567525972, + "flos": 1071373478400.0, + "grad_norm": 0.04969099377135597, + "language_loss": 0.79194742, + "learning_rate": 7.010845625530782e-07, + "loss": 0.80252051, + "num_input_tokens_seen": 423394880, + "router_z_loss_mlp": 0.3046875, + "step": 5113, + "time_per_iteration": 3.394486427307129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106507, + "balance_loss_mlp": 1.03347969, + "epoch": 0.9838399384378607, + "flos": 564943472640.0, + "grad_norm": 0.06818796043549862, + "language_loss": 0.75512731, + "learning_rate": 6.846892349181566e-07, + "loss": 0.76577806, + "num_input_tokens_seen": 423461792, + "router_z_loss_mlp": 0.31567383, + "step": 5114, + "time_per_iteration": 2.6842877864837646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064716, + "balance_loss_mlp": 1.03333998, + "epoch": 0.9840323201231242, + "flos": 772463278080.0, + "grad_norm": 0.1018443246698012, + "language_loss": 0.79624081, + "learning_rate": 6.684877586787819e-07, + "loss": 0.80688798, + "num_input_tokens_seen": 423539952, + "router_z_loss_mlp": 0.31347656, + "step": 5115, + "time_per_iteration": 2.9627039432525635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059546, + "balance_loss_mlp": 1.02867126, + "epoch": 0.9842247018083878, + "flos": 472016623104.0, + "grad_norm": 0.05701334916356296, + "language_loss": 0.85578704, + "learning_rate": 6.524801401249225e-07, + "loss": 0.86638254, + "num_input_tokens_seen": 423607184, + "router_z_loss_mlp": 0.30859375, + "step": 5116, + "time_per_iteration": 2.5376946926116943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065055, + "balance_loss_mlp": 1.0332495, + "epoch": 0.9844170834936514, + "flos": 524996909568.0, + "grad_norm": 0.05489154338763104, + "language_loss": 0.84577304, + "learning_rate": 6.366663854713295e-07, + "loss": 0.85642362, + "num_input_tokens_seen": 423676528, + "router_z_loss_mlp": 0.31787109, + "step": 5117, + "time_per_iteration": 2.6327760219573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011198, + "balance_loss_mlp": 1.00309229, + "epoch": 0.984609465178915, + "flos": 1566406245888.0, + "grad_norm": 0.003743704164174256, + "language_loss": 0.77162516, + "learning_rate": 6.210465008574251e-07, + "loss": 0.78173721, + "num_input_tokens_seen": 423905856, + "router_z_loss_mlp": 0.08105469, + "step": 5118, + "time_per_iteration": 4.938253164291382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065245, + "balance_loss_mlp": 1.03398824, + "epoch": 0.9848018468641785, + "flos": 519294841344.0, + "grad_norm": 0.061067822522050556, + "language_loss": 0.81750166, + "learning_rate": 6.056204923473584e-07, + "loss": 0.82815415, + "num_input_tokens_seen": 423972496, + "router_z_loss_mlp": 0.31225586, + "step": 5119, + "time_per_iteration": 2.6061348915100098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062892, + "balance_loss_mlp": 1.03201687, + "epoch": 0.9849942285494421, + "flos": 492760323072.0, + "grad_norm": 0.07845667393472046, + "language_loss": 0.82774782, + "learning_rate": 5.903883659301167e-07, + "loss": 0.8383767, + "num_input_tokens_seen": 424039968, + "router_z_loss_mlp": 0.30834961, + "step": 5120, + "time_per_iteration": 2.6440351009368896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064055, + "balance_loss_mlp": 1.03282189, + "epoch": 0.9851866102347057, + "flos": 545740609536.0, + "grad_norm": 0.056123337743530885, + "language_loss": 0.80794674, + "learning_rate": 5.753501275193029e-07, + "loss": 0.8185873, + "num_input_tokens_seen": 424108096, + "router_z_loss_mlp": 0.31225586, + "step": 5121, + "time_per_iteration": 2.649475574493408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059863, + "balance_loss_mlp": 1.02941656, + "epoch": 0.9853789919199692, + "flos": 476019293184.0, + "grad_norm": 0.0562432124087145, + "language_loss": 0.79757869, + "learning_rate": 5.605057829531912e-07, + "loss": 0.80817735, + "num_input_tokens_seen": 424172256, + "router_z_loss_mlp": 0.30395508, + "step": 5122, + "time_per_iteration": 2.521296262741089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062584, + "balance_loss_mlp": 1.03139853, + "epoch": 0.9855713736052328, + "flos": 1032199524864.0, + "grad_norm": 0.055038538436744985, + "language_loss": 0.75819677, + "learning_rate": 5.458553379950049e-07, + "loss": 0.76882255, + "num_input_tokens_seen": 424261088, + "router_z_loss_mlp": 0.31152344, + "step": 5123, + "time_per_iteration": 3.337373971939087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062929, + "balance_loss_mlp": 1.03174376, + "epoch": 0.9857637552904963, + "flos": 494794372608.0, + "grad_norm": 0.05369962620538531, + "language_loss": 0.82470608, + "learning_rate": 5.31398798332472e-07, + "loss": 0.83533537, + "num_input_tokens_seen": 424329168, + "router_z_loss_mlp": 0.31152344, + "step": 5124, + "time_per_iteration": 2.5722227096557617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062656, + "balance_loss_mlp": 1.03113651, + "epoch": 0.9859561369757599, + "flos": 591990142464.0, + "grad_norm": 0.06425183371235867, + "language_loss": 0.83396256, + "learning_rate": 5.17136169578103e-07, + "loss": 0.84458917, + "num_input_tokens_seen": 424399392, + "router_z_loss_mlp": 0.31494141, + "step": 5125, + "time_per_iteration": 2.689207077026367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060728, + "balance_loss_mlp": 1.03032947, + "epoch": 0.9861485186610235, + "flos": 486719221248.0, + "grad_norm": 0.06282802238411583, + "language_loss": 0.78441298, + "learning_rate": 5.030674572691907e-07, + "loss": 0.79502022, + "num_input_tokens_seen": 424470080, + "router_z_loss_mlp": 0.3034668, + "step": 5126, + "time_per_iteration": 2.6470096111297607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061426, + "balance_loss_mlp": 1.03024125, + "epoch": 0.9863409003462871, + "flos": 518536788480.0, + "grad_norm": 0.045592923518431534, + "language_loss": 0.82589722, + "learning_rate": 4.891926668676994e-07, + "loss": 0.83651149, + "num_input_tokens_seen": 424541824, + "router_z_loss_mlp": 0.31152344, + "step": 5127, + "time_per_iteration": 2.6396868228912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011205, + "balance_loss_mlp": 1.00309873, + "epoch": 0.9865332820315506, + "flos": 1485212391936.0, + "grad_norm": 0.003744860643622383, + "language_loss": 0.79182732, + "learning_rate": 4.755118037602646e-07, + "loss": 0.80193937, + "num_input_tokens_seen": 424773408, + "router_z_loss_mlp": 0.08105469, + "step": 5128, + "time_per_iteration": 4.885936260223389 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063316, + "balance_loss_mlp": 1.03232157, + "epoch": 0.9867256637168141, + "flos": 581837271552.0, + "grad_norm": 0.05639219580829708, + "language_loss": 0.79096341, + "learning_rate": 4.620248732582488e-07, + "loss": 0.80159652, + "num_input_tokens_seen": 424840608, + "router_z_loss_mlp": 0.30981445, + "step": 5129, + "time_per_iteration": 2.697075605392456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062742, + "balance_loss_mlp": 1.0312469, + "epoch": 0.9869180454020777, + "flos": 958898939904.0, + "grad_norm": 0.054185757090889033, + "language_loss": 0.86109281, + "learning_rate": 4.487318805977969e-07, + "loss": 0.87172019, + "num_input_tokens_seen": 424926128, + "router_z_loss_mlp": 0.31469727, + "step": 5130, + "time_per_iteration": 3.233060598373413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064125, + "balance_loss_mlp": 1.03336906, + "epoch": 0.9871104270873413, + "flos": 770385558528.0, + "grad_norm": 0.05107531469569112, + "language_loss": 0.82377338, + "learning_rate": 4.3563283093966954e-07, + "loss": 0.8344146, + "num_input_tokens_seen": 425005744, + "router_z_loss_mlp": 0.30712891, + "step": 5131, + "time_per_iteration": 2.976196765899658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062884, + "balance_loss_mlp": 1.03134108, + "epoch": 0.9873028087726049, + "flos": 446215426560.0, + "grad_norm": 0.06674879005758158, + "language_loss": 0.77993727, + "learning_rate": 4.2272772936940986e-07, + "loss": 0.79056621, + "num_input_tokens_seen": 425068112, + "router_z_loss_mlp": 0.31518555, + "step": 5132, + "time_per_iteration": 2.477193593978882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106004, + "balance_loss_mlp": 1.02866411, + "epoch": 0.9874951904578684, + "flos": 507359614464.0, + "grad_norm": 0.04705074666951951, + "language_loss": 0.86384636, + "learning_rate": 4.1001658089717676e-07, + "loss": 0.87444681, + "num_input_tokens_seen": 425137408, + "router_z_loss_mlp": 0.31347656, + "step": 5133, + "time_per_iteration": 2.5848257541656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062248, + "balance_loss_mlp": 1.03211176, + "epoch": 0.987687572143132, + "flos": 716420256768.0, + "grad_norm": 0.04984001642827075, + "language_loss": 0.82130331, + "learning_rate": 3.9749939045791164e-07, + "loss": 0.83192575, + "num_input_tokens_seen": 425213504, + "router_z_loss_mlp": 0.30078125, + "step": 5134, + "time_per_iteration": 2.890923023223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011205, + "balance_loss_mlp": 1.00309896, + "epoch": 0.9878799538283956, + "flos": 1537823121408.0, + "grad_norm": 0.0037470646793171538, + "language_loss": 0.79817951, + "learning_rate": 3.851761629111716e-07, + "loss": 0.80829155, + "num_input_tokens_seen": 425451296, + "router_z_loss_mlp": 0.08105469, + "step": 5135, + "time_per_iteration": 4.827297925949097 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062735, + "balance_loss_mlp": 1.03176403, + "epoch": 0.9880723355136591, + "flos": 721098021888.0, + "grad_norm": 0.09785276178071765, + "language_loss": 0.8140105, + "learning_rate": 3.730469030412964e-07, + "loss": 0.82463777, + "num_input_tokens_seen": 425527536, + "router_z_loss_mlp": 0.30932617, + "step": 5136, + "time_per_iteration": 2.8907034397125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061795, + "balance_loss_mlp": 1.0313009, + "epoch": 0.9882647171989226, + "flos": 557085109248.0, + "grad_norm": 0.08580715505102396, + "language_loss": 0.84046173, + "learning_rate": 3.611116155572969e-07, + "loss": 0.85107958, + "num_input_tokens_seen": 425596608, + "router_z_loss_mlp": 0.30444336, + "step": 5137, + "time_per_iteration": 2.7122488021850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066137, + "balance_loss_mlp": 1.03445137, + "epoch": 0.9884570988841862, + "flos": 562541276160.0, + "grad_norm": 0.0626014477527682, + "language_loss": 0.80571049, + "learning_rate": 3.493703050927999e-07, + "loss": 0.81637186, + "num_input_tokens_seen": 425667280, + "router_z_loss_mlp": 0.31665039, + "step": 5138, + "time_per_iteration": 2.686124801635742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106034, + "balance_loss_mlp": 1.02872539, + "epoch": 0.9886494805694498, + "flos": 431537559552.0, + "grad_norm": 0.05773726314876442, + "language_loss": 0.85940892, + "learning_rate": 3.378229762062146e-07, + "loss": 0.87001228, + "num_input_tokens_seen": 425730736, + "router_z_loss_mlp": 0.31591797, + "step": 5139, + "time_per_iteration": 2.464719295501709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106163, + "balance_loss_mlp": 1.03075433, + "epoch": 0.9888418622547134, + "flos": 591793703424.0, + "grad_norm": 0.04946155729695369, + "language_loss": 0.90478563, + "learning_rate": 3.264696333806771e-07, + "loss": 0.91540194, + "num_input_tokens_seen": 425807616, + "router_z_loss_mlp": 0.30834961, + "step": 5140, + "time_per_iteration": 2.7692387104034424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105981, + "balance_loss_mlp": 1.02912521, + "epoch": 0.989034243939977, + "flos": 1134526984704.0, + "grad_norm": 0.0539141258158205, + "language_loss": 0.80669558, + "learning_rate": 3.1531028102388394e-07, + "loss": 0.81729364, + "num_input_tokens_seen": 425900880, + "router_z_loss_mlp": 0.30639648, + "step": 5141, + "time_per_iteration": 3.5516679286956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062728, + "balance_loss_mlp": 1.03128052, + "epoch": 0.9892266256252404, + "flos": 566405733888.0, + "grad_norm": 0.0708287283243169, + "language_loss": 0.81880045, + "learning_rate": 3.0434492346825824e-07, + "loss": 0.82942772, + "num_input_tokens_seen": 425973632, + "router_z_loss_mlp": 0.31420898, + "step": 5142, + "time_per_iteration": 2.699986219406128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064078, + "balance_loss_mlp": 1.03193879, + "epoch": 0.989419007310504, + "flos": 640254786048.0, + "grad_norm": 0.047861078038832494, + "language_loss": 0.836555, + "learning_rate": 2.9357356497095033e-07, + "loss": 0.8471958, + "num_input_tokens_seen": 426057088, + "router_z_loss_mlp": 0.32128906, + "step": 5143, + "time_per_iteration": 2.894883394241333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062971, + "balance_loss_mlp": 1.03216708, + "epoch": 0.9896113889957676, + "flos": 455236305408.0, + "grad_norm": 0.05543458418629213, + "language_loss": 0.81691206, + "learning_rate": 2.829962097138372e-07, + "loss": 0.82754171, + "num_input_tokens_seen": 426124336, + "router_z_loss_mlp": 0.30761719, + "step": 5144, + "time_per_iteration": 2.607186794281006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058638, + "balance_loss_mlp": 1.0274049, + "epoch": 0.9898037706810312, + "flos": 567070654464.0, + "grad_norm": 0.06893052249390286, + "language_loss": 0.80264378, + "learning_rate": 2.726128618033008e-07, + "loss": 0.81323016, + "num_input_tokens_seen": 426191888, + "router_z_loss_mlp": 0.31201172, + "step": 5145, + "time_per_iteration": 2.634690999984741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011199, + "balance_loss_mlp": 1.00309277, + "epoch": 0.9899961523662947, + "flos": 1549476131328.0, + "grad_norm": 0.0037450033001579456, + "language_loss": 0.78146422, + "learning_rate": 2.624235252706164e-07, + "loss": 0.79157621, + "num_input_tokens_seen": 426425840, + "router_z_loss_mlp": 0.08105469, + "step": 5146, + "time_per_iteration": 4.934341192245483 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061804, + "balance_loss_mlp": 1.02999902, + "epoch": 0.9901885340515583, + "flos": 610401457152.0, + "grad_norm": 0.05110479894795738, + "language_loss": 0.85054564, + "learning_rate": 2.524282040715642e-07, + "loss": 0.86116374, + "num_input_tokens_seen": 426506080, + "router_z_loss_mlp": 0.31787109, + "step": 5147, + "time_per_iteration": 2.8900723457336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059898, + "balance_loss_mlp": 1.0296669, + "epoch": 0.9903809157368219, + "flos": 517231678464.0, + "grad_norm": 0.0532812715747954, + "language_loss": 0.82843816, + "learning_rate": 2.426269020866512e-07, + "loss": 0.83903718, + "num_input_tokens_seen": 426573936, + "router_z_loss_mlp": 0.30175781, + "step": 5148, + "time_per_iteration": 2.5548617839813232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062875, + "balance_loss_mlp": 1.03233337, + "epoch": 0.9905732974220854, + "flos": 1099985716224.0, + "grad_norm": 0.05366226804981356, + "language_loss": 0.80599821, + "learning_rate": 2.3301962312122226e-07, + "loss": 0.81662691, + "num_input_tokens_seen": 426657472, + "router_z_loss_mlp": 0.30493164, + "step": 5149, + "time_per_iteration": 3.392335891723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063813, + "balance_loss_mlp": 1.03241384, + "epoch": 0.990765679107349, + "flos": 857630688768.0, + "grad_norm": 0.059297430250845336, + "language_loss": 0.84505075, + "learning_rate": 2.2360637090496073e-07, + "loss": 0.85568881, + "num_input_tokens_seen": 426740560, + "router_z_loss_mlp": 0.3137207, + "step": 5150, + "time_per_iteration": 3.1182923316955566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060351, + "balance_loss_mlp": 1.02954686, + "epoch": 0.9909580607926125, + "flos": 491041986048.0, + "grad_norm": 0.07738057790240345, + "language_loss": 0.79969645, + "learning_rate": 2.143871490925542e-07, + "loss": 0.81029999, + "num_input_tokens_seen": 426809296, + "router_z_loss_mlp": 0.30761719, + "step": 5151, + "time_per_iteration": 2.6201224327087402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061417, + "balance_loss_mlp": 1.02987456, + "epoch": 0.9911504424778761, + "flos": 584786525184.0, + "grad_norm": 0.05591386255417787, + "language_loss": 0.79255068, + "learning_rate": 2.0536196126319519e-07, + "loss": 0.80316496, + "num_input_tokens_seen": 426881056, + "router_z_loss_mlp": 0.31518555, + "step": 5152, + "time_per_iteration": 2.6853623390197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057775, + "balance_loss_mlp": 1.02711439, + "epoch": 0.9913428241631397, + "flos": 569763832320.0, + "grad_norm": 0.08412789044536739, + "language_loss": 0.81331563, + "learning_rate": 1.9653081092074753e-07, + "loss": 0.82389343, + "num_input_tokens_seen": 426949664, + "router_z_loss_mlp": 0.30615234, + "step": 5153, + "time_per_iteration": 2.678966760635376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060701, + "balance_loss_mlp": 1.03011227, + "epoch": 0.9915352058484033, + "flos": 489505531392.0, + "grad_norm": 0.050282318122502674, + "language_loss": 0.86287737, + "learning_rate": 1.8789370149374652e-07, + "loss": 0.87348437, + "num_input_tokens_seen": 427018816, + "router_z_loss_mlp": 0.30541992, + "step": 5154, + "time_per_iteration": 2.6363751888275146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059754, + "balance_loss_mlp": 1.02909327, + "epoch": 0.9917275875336667, + "flos": 743708445696.0, + "grad_norm": 0.046515503690355224, + "language_loss": 0.82832515, + "learning_rate": 1.7945063633545423e-07, + "loss": 0.83892262, + "num_input_tokens_seen": 427097984, + "router_z_loss_mlp": 0.30615234, + "step": 5155, + "time_per_iteration": 2.938014507293701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060739, + "balance_loss_mlp": 1.02829003, + "epoch": 0.9919199692189303, + "flos": 508009978368.0, + "grad_norm": 0.05259363859514861, + "language_loss": 0.79867995, + "learning_rate": 1.7120161872380412e-07, + "loss": 0.80928731, + "num_input_tokens_seen": 427169280, + "router_z_loss_mlp": 0.32446289, + "step": 5156, + "time_per_iteration": 2.7240796089172363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060457, + "balance_loss_mlp": 1.02910459, + "epoch": 0.9921123509041939, + "flos": 543702177792.0, + "grad_norm": 0.05290112083375949, + "language_loss": 0.83877754, + "learning_rate": 1.6314665186123457e-07, + "loss": 0.8493821, + "num_input_tokens_seen": 427237312, + "router_z_loss_mlp": 0.31323242, + "step": 5157, + "time_per_iteration": 2.6792891025543213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062835, + "balance_loss_mlp": 1.03200781, + "epoch": 0.9923047325894575, + "flos": 671263428096.0, + "grad_norm": 0.059402742354966065, + "language_loss": 0.77112788, + "learning_rate": 1.5528573887507724e-07, + "loss": 0.78175628, + "num_input_tokens_seen": 427305008, + "router_z_loss_mlp": 0.30786133, + "step": 5158, + "time_per_iteration": 2.7650957107543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106199, + "balance_loss_mlp": 1.03047156, + "epoch": 0.9924971142747211, + "flos": 466291233792.0, + "grad_norm": 0.05256839185105993, + "language_loss": 0.80342734, + "learning_rate": 1.4761888281711322e-07, + "loss": 0.81404722, + "num_input_tokens_seen": 427377008, + "router_z_loss_mlp": 0.31494141, + "step": 5159, + "time_per_iteration": 2.69912052154541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063209, + "balance_loss_mlp": 1.03202367, + "epoch": 0.9926894959599846, + "flos": 491337349632.0, + "grad_norm": 0.0517555441430021, + "language_loss": 0.82689339, + "learning_rate": 1.4014608666390594e-07, + "loss": 0.83752549, + "num_input_tokens_seen": 427444528, + "router_z_loss_mlp": 0.31152344, + "step": 5160, + "time_per_iteration": 2.585045099258423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060997, + "balance_loss_mlp": 1.02993083, + "epoch": 0.9928818776452482, + "flos": 492144864768.0, + "grad_norm": 0.05816835563996184, + "language_loss": 0.81448108, + "learning_rate": 1.328673533166902e-07, + "loss": 0.825091, + "num_input_tokens_seen": 427509808, + "router_z_loss_mlp": 0.31054688, + "step": 5161, + "time_per_iteration": 2.5658082962036133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059734, + "balance_loss_mlp": 1.02869153, + "epoch": 0.9930742593305117, + "flos": 546081053184.0, + "grad_norm": 0.047207524561871925, + "language_loss": 0.84269929, + "learning_rate": 1.2578268560131666e-07, + "loss": 0.85329664, + "num_input_tokens_seen": 427587936, + "router_z_loss_mlp": 0.31005859, + "step": 5162, + "time_per_iteration": 2.8079018592834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059662, + "balance_loss_mlp": 1.02890635, + "epoch": 0.9932666410157753, + "flos": 585234657792.0, + "grad_norm": 0.047366061223404234, + "language_loss": 0.85871446, + "learning_rate": 1.1889208626825188e-07, + "loss": 0.86931109, + "num_input_tokens_seen": 427662224, + "router_z_loss_mlp": 0.30737305, + "step": 5163, + "time_per_iteration": 2.760917901992798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062808, + "balance_loss_mlp": 1.03226614, + "epoch": 0.9934590227010388, + "flos": 536833211904.0, + "grad_norm": 0.04936345876995348, + "language_loss": 0.83421499, + "learning_rate": 1.1219555799268921e-07, + "loss": 0.84484303, + "num_input_tokens_seen": 427730544, + "router_z_loss_mlp": 0.30517578, + "step": 5164, + "time_per_iteration": 2.614365577697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062983, + "balance_loss_mlp": 1.03117847, + "epoch": 0.9936514043863024, + "flos": 517754004480.0, + "grad_norm": 0.056445406823462094, + "language_loss": 0.86550891, + "learning_rate": 1.0569310337443794e-07, + "loss": 0.87613875, + "num_input_tokens_seen": 427799760, + "router_z_loss_mlp": 0.31787109, + "step": 5165, + "time_per_iteration": 2.622957944869995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062574, + "balance_loss_mlp": 1.03203213, + "epoch": 0.993843786071566, + "flos": 744284616192.0, + "grad_norm": 0.10034207773061687, + "language_loss": 0.80428374, + "learning_rate": 9.938472493803419e-08, + "loss": 0.81490946, + "num_input_tokens_seen": 427881936, + "router_z_loss_mlp": 0.30493164, + "step": 5166, + "time_per_iteration": 3.043248414993286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060393, + "balance_loss_mlp": 1.02982759, + "epoch": 0.9940361677568296, + "flos": 525647273472.0, + "grad_norm": 0.05669440201180295, + "language_loss": 0.8168757, + "learning_rate": 9.327042513251893e-08, + "loss": 0.82747972, + "num_input_tokens_seen": 427951648, + "router_z_loss_mlp": 0.30517578, + "step": 5167, + "time_per_iteration": 2.6479313373565674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058196, + "balance_loss_mlp": 1.02732062, + "epoch": 0.9942285494420932, + "flos": 555376946688.0, + "grad_norm": 0.05714439163256689, + "language_loss": 0.79955453, + "learning_rate": 8.735020633177104e-08, + "loss": 0.81013644, + "num_input_tokens_seen": 428031184, + "router_z_loss_mlp": 0.30834961, + "step": 5168, + "time_per_iteration": 2.742321729660034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061675, + "balance_loss_mlp": 1.0310148, + "epoch": 0.9944209311273566, + "flos": 585722078208.0, + "grad_norm": 0.04880112875214473, + "language_loss": 0.81882125, + "learning_rate": 8.162407083411872e-08, + "loss": 0.82943803, + "num_input_tokens_seen": 428107296, + "router_z_loss_mlp": 0.30615234, + "step": 5169, + "time_per_iteration": 2.7316317558288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062047, + "balance_loss_mlp": 1.03136218, + "epoch": 0.9946133128126202, + "flos": 735185161728.0, + "grad_norm": 0.05301980101308898, + "language_loss": 0.82062948, + "learning_rate": 7.609202086272804e-08, + "loss": 0.83124995, + "num_input_tokens_seen": 428187904, + "router_z_loss_mlp": 0.30639648, + "step": 5170, + "time_per_iteration": 2.9755966663360596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059155, + "balance_loss_mlp": 1.02768385, + "epoch": 0.9948056944978838, + "flos": 645728481792.0, + "grad_norm": 0.07007508791376361, + "language_loss": 0.82043839, + "learning_rate": 7.075405856526995e-08, + "loss": 0.83102989, + "num_input_tokens_seen": 428255856, + "router_z_loss_mlp": 0.31445312, + "step": 5171, + "time_per_iteration": 4.273667812347412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060839, + "balance_loss_mlp": 1.02970123, + "epoch": 0.9949980761831474, + "flos": 445610142720.0, + "grad_norm": 0.05905922669837915, + "language_loss": 0.86205649, + "learning_rate": 6.561018601414226e-08, + "loss": 0.87266487, + "num_input_tokens_seen": 428321872, + "router_z_loss_mlp": 0.31103516, + "step": 5172, + "time_per_iteration": 2.528289318084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060626, + "balance_loss_mlp": 1.02908325, + "epoch": 0.995190457868411, + "flos": 435407809536.0, + "grad_norm": 0.04822617192166719, + "language_loss": 0.85489011, + "learning_rate": 6.066040520641414e-08, + "loss": 0.86549646, + "num_input_tokens_seen": 428389232, + "router_z_loss_mlp": 0.31518555, + "step": 5173, + "time_per_iteration": 2.575528621673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065737, + "balance_loss_mlp": 1.03366995, + "epoch": 0.9953828395536745, + "flos": 513937598976.0, + "grad_norm": 0.04981117864172083, + "language_loss": 0.8126061, + "learning_rate": 5.590471806377062e-08, + "loss": 0.82326341, + "num_input_tokens_seen": 428456128, + "router_z_loss_mlp": 0.32055664, + "step": 5174, + "time_per_iteration": 2.5552728176116943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061578, + "balance_loss_mlp": 1.03008258, + "epoch": 0.995575221238938, + "flos": 479608736256.0, + "grad_norm": 0.05922732070451884, + "language_loss": 0.81865811, + "learning_rate": 5.134312643245709e-08, + "loss": 0.82927388, + "num_input_tokens_seen": 428523504, + "router_z_loss_mlp": 0.31469727, + "step": 5175, + "time_per_iteration": 2.5534262657165527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060563, + "balance_loss_mlp": 1.0288291, + "epoch": 0.9957676029242016, + "flos": 587500051968.0, + "grad_norm": 0.06112181675379925, + "language_loss": 0.76542944, + "learning_rate": 4.6975632083445793e-08, + "loss": 0.77603507, + "num_input_tokens_seen": 428596880, + "router_z_loss_mlp": 0.31713867, + "step": 5176, + "time_per_iteration": 2.716848850250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063795, + "balance_loss_mlp": 1.03210914, + "epoch": 0.9959599846094652, + "flos": 426244336128.0, + "grad_norm": 0.05881178966801391, + "language_loss": 0.80162942, + "learning_rate": 4.280223671243588e-08, + "loss": 0.8122673, + "num_input_tokens_seen": 428659472, + "router_z_loss_mlp": 0.31665039, + "step": 5177, + "time_per_iteration": 2.500436305999756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060368, + "balance_loss_mlp": 1.02934957, + "epoch": 0.9961523662947287, + "flos": 611312279040.0, + "grad_norm": 0.048622636366890376, + "language_loss": 0.80515933, + "learning_rate": 3.8822941939575804e-08, + "loss": 0.815763, + "num_input_tokens_seen": 428736704, + "router_z_loss_mlp": 0.30981445, + "step": 5178, + "time_per_iteration": 2.828415870666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061082, + "balance_loss_mlp": 1.02937198, + "epoch": 0.9963447479799923, + "flos": 550521681408.0, + "grad_norm": 0.06697428881430652, + "language_loss": 0.73867881, + "learning_rate": 3.5037749309851927e-08, + "loss": 0.74928963, + "num_input_tokens_seen": 428808560, + "router_z_loss_mlp": 0.31689453, + "step": 5179, + "time_per_iteration": 2.6863455772399902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063311, + "balance_loss_mlp": 1.03262651, + "epoch": 0.9965371296652559, + "flos": 625590065664.0, + "grad_norm": 0.05269572886478648, + "language_loss": 0.88772953, + "learning_rate": 3.1446660292755446e-08, + "loss": 0.89836264, + "num_input_tokens_seen": 428880688, + "router_z_loss_mlp": 0.30639648, + "step": 5180, + "time_per_iteration": 2.702698230743408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061004, + "balance_loss_mlp": 1.02950907, + "epoch": 0.9967295113505195, + "flos": 639205751808.0, + "grad_norm": 0.05458985414981575, + "language_loss": 0.81910491, + "learning_rate": 2.8049676282504433e-08, + "loss": 0.82971495, + "num_input_tokens_seen": 428960096, + "router_z_loss_mlp": 0.31469727, + "step": 5181, + "time_per_iteration": 2.9029834270477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062568, + "balance_loss_mlp": 1.03159761, + "epoch": 0.996921893035783, + "flos": 607101585408.0, + "grad_norm": 0.05702338558599981, + "language_loss": 0.76808369, + "learning_rate": 2.484679859793282e-08, + "loss": 0.77870935, + "num_input_tokens_seen": 429031296, + "router_z_loss_mlp": 0.30932617, + "step": 5182, + "time_per_iteration": 2.7194111347198486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061712, + "balance_loss_mlp": 1.02990723, + "epoch": 0.9971142747210465, + "flos": 643867550208.0, + "grad_norm": 0.06530592489398579, + "language_loss": 0.81833375, + "learning_rate": 2.183802848243488e-08, + "loss": 0.82895088, + "num_input_tokens_seen": 429103312, + "router_z_loss_mlp": 0.31787109, + "step": 5183, + "time_per_iteration": 2.7606563568115234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059793, + "balance_loss_mlp": 1.02877522, + "epoch": 0.9973066564063101, + "flos": 1040353251840.0, + "grad_norm": 0.04855677840468543, + "language_loss": 0.80962884, + "learning_rate": 1.9023367104187285e-08, + "loss": 0.82022679, + "num_input_tokens_seen": 429194896, + "router_z_loss_mlp": 0.30981445, + "step": 5184, + "time_per_iteration": 3.3332931995391846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062751, + "balance_loss_mlp": 1.03201878, + "epoch": 0.9974990380915737, + "flos": 664784368128.0, + "grad_norm": 0.0558770781712101, + "language_loss": 0.83045828, + "learning_rate": 1.640281555587153e-08, + "loss": 0.84108579, + "num_input_tokens_seen": 429267664, + "router_z_loss_mlp": 0.30688477, + "step": 5185, + "time_per_iteration": 2.8848559856414795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059374, + "balance_loss_mlp": 1.0282129, + "epoch": 0.9976914197768373, + "flos": 717808324608.0, + "grad_norm": 0.06124690550469667, + "language_loss": 0.77321869, + "learning_rate": 1.3976374855007024e-08, + "loss": 0.7838124, + "num_input_tokens_seen": 429343472, + "router_z_loss_mlp": 0.3112793, + "step": 5186, + "time_per_iteration": 2.8420236110687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060861, + "balance_loss_mlp": 1.02991474, + "epoch": 0.9978838014621008, + "flos": 518078481408.0, + "grad_norm": 0.05470349904981655, + "language_loss": 0.78897154, + "learning_rate": 1.1744045943451464e-08, + "loss": 0.79958016, + "num_input_tokens_seen": 429411472, + "router_z_loss_mlp": 0.30908203, + "step": 5187, + "time_per_iteration": 2.6191396713256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106246, + "balance_loss_mlp": 1.03110838, + "epoch": 0.9980761831473643, + "flos": 603138203136.0, + "grad_norm": 0.04602158415592968, + "language_loss": 0.83977401, + "learning_rate": 9.70582968801148e-09, + "loss": 0.8503986, + "num_input_tokens_seen": 429486704, + "router_z_loss_mlp": 0.31323242, + "step": 5188, + "time_per_iteration": 2.787299871444702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058324, + "balance_loss_mlp": 1.0267812, + "epoch": 0.9982685648326279, + "flos": 453291005952.0, + "grad_norm": 0.04960660439705578, + "language_loss": 0.89004576, + "learning_rate": 7.861726879943021e-09, + "loss": 0.90062904, + "num_input_tokens_seen": 429554736, + "router_z_loss_mlp": 0.31518555, + "step": 5189, + "time_per_iteration": 2.5919430255889893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057984, + "balance_loss_mlp": 1.02744257, + "epoch": 0.9984609465178915, + "flos": 481165539840.0, + "grad_norm": 0.06078734147645417, + "language_loss": 0.7877931, + "learning_rate": 6.211738235173403e-09, + "loss": 0.79837286, + "num_input_tokens_seen": 429623216, + "router_z_loss_mlp": 0.30493164, + "step": 5190, + "time_per_iteration": 2.6413121223449707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062175, + "balance_loss_mlp": 1.03120399, + "epoch": 0.9986533282031551, + "flos": 476675449344.0, + "grad_norm": 0.045308866149240234, + "language_loss": 0.84180099, + "learning_rate": 4.755864394301312e-09, + "loss": 0.85242271, + "num_input_tokens_seen": 429695808, + "router_z_loss_mlp": 0.30957031, + "step": 5191, + "time_per_iteration": 2.6326682567596436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062646, + "balance_loss_mlp": 1.03191352, + "epoch": 0.9988457098884186, + "flos": 641647236096.0, + "grad_norm": 0.0680731275372209, + "language_loss": 0.86416614, + "learning_rate": 3.494105922541291e-09, + "loss": 0.87479258, + "num_input_tokens_seen": 429774464, + "router_z_loss_mlp": 0.30688477, + "step": 5192, + "time_per_iteration": 2.7818009853363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064252, + "balance_loss_mlp": 1.03189921, + "epoch": 0.9990380915736822, + "flos": 396105818112.0, + "grad_norm": 0.057818769382579786, + "language_loss": 0.87863171, + "learning_rate": 2.4264633097237365e-09, + "loss": 0.88927424, + "num_input_tokens_seen": 429835872, + "router_z_loss_mlp": 0.32348633, + "step": 5193, + "time_per_iteration": 2.421710252761841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061611, + "balance_loss_mlp": 1.03099763, + "epoch": 0.9992304732589458, + "flos": 575831075328.0, + "grad_norm": 0.0533802000790901, + "language_loss": 0.84760511, + "learning_rate": 1.552936970405927e-09, + "loss": 0.85822117, + "num_input_tokens_seen": 429911440, + "router_z_loss_mlp": 0.3059082, + "step": 5194, + "time_per_iteration": 2.716845989227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059499, + "balance_loss_mlp": 1.02883863, + "epoch": 0.9994228549442093, + "flos": 544017890304.0, + "grad_norm": 0.060624333126130775, + "language_loss": 0.75329232, + "learning_rate": 8.735272437054853e-10, + "loss": 0.76388735, + "num_input_tokens_seen": 429982512, + "router_z_loss_mlp": 0.30615234, + "step": 5195, + "time_per_iteration": 2.649554967880249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106175, + "balance_loss_mlp": 1.03030252, + "epoch": 0.9996152366294728, + "flos": 1470777910272.0, + "grad_norm": 0.05155957143930501, + "language_loss": 0.80520171, + "learning_rate": 3.882343933003796e-10, + "loss": 0.8158192, + "num_input_tokens_seen": 430070944, + "router_z_loss_mlp": 0.31420898, + "step": 5196, + "time_per_iteration": 3.691424608230591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048694, + "balance_loss_mlp": 1.02315903, + "epoch": 0.9998076183147364, + "flos": 618667255296.0, + "grad_norm": 0.1131411540909355, + "language_loss": 0.70138329, + "learning_rate": 9.70586077619906e-11, + "loss": 0.71187025, + "num_input_tokens_seen": 430164864, + "router_z_loss_mlp": 0.25506592, + "step": 5197, + "time_per_iteration": 4.002978086471558 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01030251, + "balance_loss_mlp": 1.01343656, + "epoch": 1.0, + "flos": 1289959492608.0, + "grad_norm": 0.02359347763858496, + "language_loss": 0.84250998, + "learning_rate": 0.0, + "loss": 0.85281241, + "num_input_tokens_seen": 430340944, + "router_z_loss_mlp": 0.16827393, + "step": 5198, + "time_per_iteration": 5.848259449005127 + }, + { + "epoch": 1.0, + "num_input_tokens_seen": 430340944, + "step": 5198, + "total_flos": 1.1713320035811328e+16, + "train_loss": 0.0, + "train_runtime": 7.7371, + "train_samples_per_second": 85987.842, + "train_steps_per_second": 671.826 + } + ], + "logging_steps": 1.0, + "max_steps": 5198, + "num_input_tokens_seen": 430340944, + "num_train_epochs": 1, + "save_steps": 1040, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.1713320035811328e+16, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/sft_pretrain/Full_smoe_sharev3/training_args.bin b/sft_pretrain/Full_smoe_sharev3/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..8a873e0399667327da7b00584cd7e221fb1f0780 --- /dev/null +++ b/sft_pretrain/Full_smoe_sharev3/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92f5f084471f64073c680bb2e0004d52b897a017cb09f88dff1f3ea2097c1f07 +size 7992